diff --git a/src/java/org/jivesoftware/multiplexer/net/XMLLightweightParser.java b/src/java/org/jivesoftware/multiplexer/net/XMLLightweightParser.java
index 251c24c..14a0a7f 100644
--- a/src/java/org/jivesoftware/multiplexer/net/XMLLightweightParser.java
+++ b/src/java/org/jivesoftware/multiplexer/net/XMLLightweightParser.java
@@ -26,13 +26,13 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.mina.common.ByteBuffer;
import org.jivesoftware.util.JiveGlobals;
import org.jivesoftware.util.PropertyEventDispatcher;
import org.jivesoftware.util.PropertyEventListener;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
/**
* This is a Light-Weight XML Parser.
@@ -46,8 +46,8 @@
* @author Gaston Dombiak
*/
class XMLLightweightParser {
-
- private static final Logger Log = LoggerFactory.getLogger(XMLLightweightParser.class);
+
+ private static final Pattern XML_HAS_CHARREF = Pattern.compile("(0*([0-9]+)|[xX]0*([0-9a-fA-F]+));");
private static final String MAX_PROPERTY_NAME = "xmpp.parser.buffer.size";
private static int maxBufferSize;
@@ -112,11 +112,11 @@
PropertyEventDispatcher.addListener(new PropertyListener());
}
- public XMLLightweightParser(String charset) {
- encoder = Charset.forName(charset).newDecoder()
- .onMalformedInput(CodingErrorAction.REPORT)
- .onUnmappableCharacter(CodingErrorAction.REPORT);
- }
+ public XMLLightweightParser(String charset) {
+ encoder = Charset.forName(charset).newDecoder()
+ .onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE);
+ }
/*
* true if the parser has found some complete xml message.
@@ -155,9 +155,12 @@
/*
* Method that add a message to the list and reinit parser.
*/
- protected void foundMsg(String msg) {
+ protected void foundMsg(String msg) throws Exception {
// Add message to the complete message list
if (msg != null) {
+ if (hasIllegalCharacterReferences(msg)) {
+ throw new Exception("Illegal character reference found in: " + msg);
+ }
msgs.add(msg);
}
// Move the position into the buffer
@@ -180,39 +183,24 @@
if (buffer.length() > maxBufferSize) {
throw new Exception("Stopped parsing never ending stanza");
}
- CharBuffer charBuffer = encoder.decode(byteBuffer.buf());
- char[] buf = charBuffer.array();
- int readByte = charBuffer.remaining();
+ CharBuffer charBuffer = CharBuffer.allocate(byteBuffer.capacity());
+ encoder.reset();
+ encoder.decode(byteBuffer.buf(), charBuffer, false);
+ char[] buf = new char[charBuffer.position()];
+ charBuffer.flip();charBuffer.get(buf);
+ int readChar = buf.length;
// Just return if nothing was read
- if (readByte == 0) {
+ if (readChar == 0) {
return;
}
- // Verify if the last received byte is an incomplete double byte character
- char lastChar = buf[readByte-1];
- if (lastChar >= 0xfff0) {
- if (Log.isDebugEnabled()) {
- Log.debug("Waiting to get complete char: " + String.valueOf(buf));
- }
- // Rewind the position one place so the last byte stays in the buffer
- // The missing byte should arrive in the next iteration. Once we have both
- // of bytes we will have the correct character
- byteBuffer.position(byteBuffer.position()-1);
- // Decrease the number of bytes read by one
- readByte--;
- // Just return if nothing was read
- if (readByte == 0) {
- return;
- }
- }
-
- buffer.append(buf, 0, readByte);
+ buffer.append(buf);
// Robot.
char ch;
boolean isHighSurrogate = false;
- for (int i = 0; i < readByte; i++) {
+ for (int i = 0; i < readChar; i++) {
ch = buf[i];
if (ch < 0x20 && ch != 0x9 && ch != 0xA && ch != 0xD && ch != 0x0) {
//Unicode characters in the range 0x0000-0x001F other than 9, A, and D are not allowed in XML
@@ -243,7 +231,7 @@
if (tailCount == head.length()) {
// Close stanza found!
// Calculate the correct start,end position of the message into the buffer
- int end = buffer.length() - readByte + (i + 1);
+ int end = buffer.length() - readChar + (i + 1);
String msg = buffer.substring(startLastMsg, end);
// Add message to the list
foundMsg(msg);
@@ -282,7 +270,7 @@
status = XMLLightweightParser.OUTSIDE;
if (depth < 1) {
// Found a tag in the form
- int end = buffer.length() - readByte + (i + 1);
+ int end = buffer.length() - readChar + (i + 1);
String msg = buffer.substring(startLastMsg, end);
// Add message to the list
foundMsg(msg);
@@ -328,7 +316,7 @@
if (insideRootTag && ("stream:stream>".equals(head.toString()) ||
("?xml>".equals(head.toString())) || ("flash:stream>".equals(head.toString())))) {
// Found closing stream:stream
- int end = buffer.length() - readByte + (i + 1);
+ int end = buffer.length() - readChar + (i + 1);
// Skip LF, CR and other "weird" characters that could appear
while (startLastMsg < end && '<' != buffer.charAt(startLastMsg)) {
startLastMsg++;
@@ -382,6 +370,65 @@
}
}
+ /**
+ * This method verifies if the provided argument contains at least one numeric character reference (
+ * CharRef ::= '' [0-9]+ ';' | '' [0-9a-fA-F]+ ';
) for which the decimal or hexidecimal
+ * character value refers to an invalid XML 1.0 character.
+ *
+ * @param string
+ * The input string
+ * @return true if the input string contains an invalid numeric character reference, false
+ * otherwise.
+ * @see http://www.w3.org/TR/2008/REC-xml-20081126/#dt-charref
+ */
+ public static boolean hasIllegalCharacterReferences(String string) {
+ // If there's no character reference, don't bother to do more specific checking.
+ final Matcher matcher = XML_HAS_CHARREF.matcher(string);
+
+ while (matcher.find()) {
+ final String decValue = matcher.group(2);
+ if (decValue != null) {
+ final int value = Integer.parseInt(decValue);
+ if (!isLegalXmlCharacter(value)) {
+ return true;
+ } else {
+ continue;
+ }
+ }
+
+ final String hexValue = matcher.group(3);
+ if (hexValue != null) {
+ final int value = Integer.parseInt(hexValue, 16);
+ if (!isLegalXmlCharacter(value)) {
+ return true;
+ } else {
+ continue;
+ }
+ }
+
+ // This is bad. The XML_HAS_CHARREF expression should have a hit for either the decimal
+ // or the heximal notation.
+ throw new IllegalStateException(
+ "An error occurred while searching for illegal character references in the value [" + string + "].");
+ }
+
+ return false;
+ }
+
+ /**
+ * Verifies if the codepoint value represents a valid character as defined in paragraph 2.2 of
+ * "Extensible Markup Language (XML) 1.0 (Fifth Edition)"
+ *
+ * @param value
+ * the codepoint
+ * @return true if the codepoint is a valid charater per XML 1.0 definition, false otherwise.
+ * @see http://www.w3.org/TR/2008/REC-xml-20081126/#NT-Char
+ */
+ public static boolean isLegalXmlCharacter(int value) {
+ return value == 0x9 || value == 0xA || value == 0xD || (value >= 0x20 && value <= 0xD7FF)
+ || (value >= 0xE000 && value <= 0xFFFD) || (value >= 0x10000 && value <= 0x10FFFF);
+ }
+
private static class PropertyListener implements PropertyEventListener {
public void propertySet(String property, Map params) {
if (MAX_PROPERTY_NAME.equals(property)) {