Fix bug where end of stream is read incorrectly when parsing XHTML (+ test case)

This commit is contained in:
Grahame Grieve 2020-11-14 08:17:48 +11:00
parent c6a2fb375c
commit 4088dddaee
2 changed files with 23 additions and 15 deletions

View File

@ -55,6 +55,7 @@ import org.xmlpull.v1.XmlPullParserException;
public class XhtmlParser {
public static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
private static final char END_OF_CHARS = (char) -1;
public class NSMap {
private Map<String, String> nslist = new HashMap<String, String>();
@ -515,7 +516,7 @@ private boolean elementIsOk(String name) throws FHIRFormatError {
private void parseElementInner(XhtmlNode node, List<XhtmlNode> parents, NSMap nsm, boolean escaping) throws FHIRFormatError, IOException
{
StringBuilder s = new StringBuilder();
while (peekChar() != '\0' && !parents.contains(unwindPoint) && !(node == unwindPoint))
while (peekChar() != END_OF_CHARS && !parents.contains(unwindPoint) && !(node == unwindPoint))
{
if (peekChar() == '<')
{
@ -606,7 +607,7 @@ private boolean elementIsOk(String name) throws FHIRFormatError {
{
while (Character.isWhitespace(peekChar()))
readChar();
while (peekChar() != '>' && peekChar() != '/' && peekChar() != '\0')
while (peekChar() != '>' && peekChar() != '/' && peekChar() != END_OF_CHARS)
{
String name = readName();
if (name.length() == 0)
@ -630,7 +631,7 @@ private boolean elementIsOk(String name) throws FHIRFormatError {
if (peekChar() == '"' || peekChar() == '\'')
node.getAttributes().put(name, parseAttributeValue(readChar()));
else
node.getAttributes().put(name, parseAttributeValue('\0'));
node.getAttributes().put(name, parseAttributeValue(END_OF_CHARS));
}
while (Character.isWhitespace(peekChar()))
readChar();
@ -640,7 +641,7 @@ private boolean elementIsOk(String name) throws FHIRFormatError {
private String parseAttributeValue(char term) throws IOException, FHIRFormatError
{
StringBuilder b = new StringBuilder();
while (peekChar() != '\0' && peekChar() != '>' && (term != '\0' || peekChar() != '/') && peekChar() != term)
while (peekChar() != END_OF_CHARS && peekChar() != '>' && (term != END_OF_CHARS || peekChar() != '/') && peekChar() != term)
{
if (peekChar() == '&')
{
@ -704,15 +705,15 @@ private boolean elementIsOk(String name) throws FHIRFormatError {
if (cache.length() > 0)
return cache.charAt(0);
else if (!rdr.ready())
return '\0';
return END_OF_CHARS;
else
{
char c = (char)rdr.read();
if (c == (char)-1)
{
int i = rdr.read();
if (i == -1) {
cache = "";
return '\0';
return END_OF_CHARS;
}
char c = (char) i;
cache = Character.toString(c);
return c;
}
@ -727,7 +728,7 @@ private boolean elementIsOk(String name) throws FHIRFormatError {
cache = cache.length() == 1 ? "" : cache.substring(1);
}
else if (!rdr.ready())
c = '\0';
c = END_OF_CHARS;
else
c = (char)rdr.read();
if (c == '\r' || c == '\n') {
@ -744,9 +745,9 @@ private boolean elementIsOk(String name) throws FHIRFormatError {
private String readToTagEnd() throws IOException, FHIRFormatError
{
StringBuilder s = new StringBuilder();
while (peekChar() != '>' && peekChar() != '\0')
while (peekChar() != '>' && peekChar() != END_OF_CHARS)
s.append(readChar());
if (peekChar() != '\0')
if (peekChar() != END_OF_CHARS)
{
readChar();
skipWhiteSpace();
@ -765,7 +766,7 @@ private boolean elementIsOk(String name) throws FHIRFormatError {
if (c == '>') {
done = true;
readChar();
} else if (c != '\0')
} else if (c != END_OF_CHARS)
s.append(readChar());
else if (mustBeWellFormed)
throw new FHIRFormatError("Unexpected termination of html source"+descLoc());
@ -814,12 +815,12 @@ private boolean elementIsOk(String name) throws FHIRFormatError {
} else if (c == '[' && s.toString().startsWith("DOCTYPE ")) {
doctypeEntities = true;
s.append(readChar());
} else if (c != '\0')
} else if (c != END_OF_CHARS)
s.append(readChar());
else if (mustBeWellFormed)
throw new FHIRFormatError("Unexpected termination of html source"+descLoc());
}
if (peekChar() != '\0')
if (peekChar() != END_OF_CHARS)
{
readChar();
skipWhiteSpace();

View File

@ -1,6 +1,8 @@
package org.hl7.fhir.utilities.tests;
import org.hl7.fhir.exceptions.FHIRFormatError;
import org.hl7.fhir.utilities.xhtml.XhtmlNode;
import org.hl7.fhir.utilities.xhtml.XhtmlParser;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
@ -88,4 +90,9 @@ public class XhtmlNodeTest {
ObjectOutputStream oout = new ObjectOutputStream(bout);
oout.writeObject(node);
}
/**
 * Parses the "bad-chars.html" test resource (from the "xhtml" resource folder) with "div"
 * as the entry name.
 *
 * The test makes no assertions on the parsed result: it passes when the parse call
 * returns without throwing.
 * NOTE(review): presumably a regression guard for end-of-input handling in the parser -
 * confirm against the fixture's contents.
 *
 * @throws FHIRFormatError declared by the parse call
 * @throws IOException declared by the parse call / resource loading
 */
@Test
public void testParseBadChars() throws FHIRFormatError, IOException {
  // Build the parser first, then load the fixture inline, mirroring the original evaluation order.
  XhtmlParser parser = new XhtmlParser();
  XhtmlNode parsed = parser.parse(BaseTestingUtilities.loadTestResource("xhtml", "bad-chars.html"), "div");
}
}