Validate XML header and syntax issues

This commit is contained in:
Grahame Grieve 2020-03-19 06:17:03 +11:00
parent 849b65417a
commit a7cf428fef
4 changed files with 97 additions and 2 deletions

View File

@ -23,6 +23,7 @@ package org.hl7.fhir.r5.elementmodel;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
@ -71,6 +72,7 @@ import org.xml.sax.XMLReader;
public class XmlParser extends ParserBase {
private boolean allowXsiLocation;
private String version;
public XmlParser(IWorkerContext context) {
super(context);
@ -99,6 +101,13 @@ public class XmlParser extends ParserBase {
factory.setNamespaceAware(true);
if (policy == ValidationPolicy.EVERYTHING) {
// The SAX interface appears to not work when reporting the correct version/encoding.
// if we can, we'll inspect the header/encoding ourselves
if (stream.markSupported()) {
stream.mark(1024);
version = checkHeader(stream);
stream.reset();
}
// use a slower parser that keeps location data
TransformerFactory transformerFactory = TransformerFactory.newInstance();
Transformer nullTransformer = transformerFactory.newTransformer();
@ -135,6 +144,7 @@ public class XmlParser extends ParserBase {
return parse(doc);
}
private void checkForProcessingInstruction(Document document) throws FHIRFormatError {
if (policy == ValidationPolicy.EVERYTHING && FormatUtilities.FHIR_NS.equals(document.getDocumentElement().getNamespaceURI())) {
Node node = document.getFirstChild();
@ -264,6 +274,10 @@ public class XmlParser extends ParserBase {
for (int i = 0; i < node.getAttributes().getLength(); i++) {
Node attr = node.getAttributes().item(i);
String value = attr.getNodeValue();
if (!validAttrValue(value)) {
logError(line(node), col(node), path, IssueType.STRUCTURE, context.formatMessage(I18nConstants.XML_ATTR_VALUE_INVALID, attr.getNodeName()), IssueSeverity.ERROR);
}
if (!(attr.getNodeName().equals("xmlns") || attr.getNodeName().startsWith("xmlns:"))) {
Property property = getAttrProp(properties, attr.getNodeName());
if (property != null) {
@ -345,6 +359,23 @@ public class XmlParser extends ParserBase {
}
}
/**
 * Decides whether an attribute value is acceptable for the XML version held in the
 * {@code version} field.
 *
 * Only version "1.0" is actually checked: there, any character at or below 0x1F is
 * rejected unless it is a carriage return, line feed or tab. When no version has been
 * recorded, or the version is anything other than "1.0", every value is accepted.
 *
 * @param value the attribute value to inspect
 * @return {@code false} if the value contains a character rejected for XML 1.0,
 *         {@code true} otherwise
 */
private boolean validAttrValue(String value) {
  // covers both "no version known" (null) and any version other than 1.0
  if (!"1.0".equals(version)) {
    return true;
  }
  for (int idx = 0; idx < value.length(); idx++) {
    char current = value.charAt(idx);
    boolean inControlRange = current <= 0x1F;
    if (inControlRange && !Utilities.existsInList(current, '\r', '\n', '\t')) {
      // one offending character is enough to reject the whole value
      return false;
    }
  }
  return true;
}
private Property getElementProp(List<Property> properties, String nodeName) {
List<Property> propsSortedByLongestFirst = new ArrayList<Property>(properties);
// sort properties according to their name longest first, so .requestOrganizationReference comes first before .request[x]
@ -583,4 +614,60 @@ public class XmlParser extends ParserBase {
}
}
/**
 * Inspects the leading bytes of the stream to check the encoding and to find the XML
 * version stated in the XML declaration, if the document has one.
 *
 * <p>The bytes read here are consumed; the caller is expected to mark the stream before
 * the call and reset it afterwards. At most 1024 bytes are read, and reading stops at the
 * first {@code >} or at end of stream, whichever comes first.
 *
 * <p>Problems are reported through {@code logError} rather than thrown: a stream that
 * starts with neither a UTF-8 BOM nor {@code <}, a declaration stating an encoding other
 * than UTF-8, and read failures.
 *
 * @param stream the raw document bytes, positioned at the first byte of the document
 * @return the version stated in the XML declaration; {@code "1.0"} when the document has
 *         no XML declaration; {@code "??"} when the version could not be determined
 * @throws IOException declared for callers; read failures are logged as errors and
 *         {@code "??"} is returned instead
 */
private String checkHeader(InputStream stream) throws IOException {
  // The caller marks the stream with a 1024 byte read limit before calling this method;
  // reading beyond that would invalidate the mark and make the following reset() fail.
  final int maxBytes = 1024;
  try {
    // the stream will either start with the UTF-8 BOM or with <?xml
    int i0 = stream.read();
    int i1 = stream.read();
    int i2 = stream.read();
    StringBuilder b = new StringBuilder();
    if (i0 == 0xEF && i1 == 0xBB && i2 == 0xBF) {
      // ok, it's UTF-8 - the declaration, if there is one, follows the BOM
    } else if (i0 == 0x3C && i1 == 0x3F && i2 == 0x78) { // <?x
      b.append((char) i0);
      b.append((char) i1);
      b.append((char) i2);
    } else if (i0 == 0x3C) { // just plain old XML with no header
      return "1.0";
    } else {
      logError(0, 0, "XML", IssueType.INVALID, context.formatMessage(I18nConstants.XML_ENCODING_INVALID), IssueSeverity.ERROR);
      return "??";
    }

    // Collect everything up to (not including) the first '>'. Stop at end of stream and
    // at the read limit so that a truncated or unusual document cannot loop forever.
    int consumed = 3;
    while (consumed < maxBytes) {
      int next = stream.read();
      consumed++;
      if (next == -1 || next == 0x3E) {
        break;
      }
      b.append((char) next);
    }
    String header = b.toString();

    // An XML declaration is "<?xml" followed by whitespace. Anything else (an element, a
    // comment, another processing instruction) means there is no declaration, and a
    // document without a declaration is XML 1.0.
    boolean hasDeclaration = header.startsWith("<?xml") && header.length() > 5
        && Character.isWhitespace(header.charAt(5));
    if (!hasDeclaration) {
      return "1.0";
    }

    String encoding = readXmlDeclAttribute(header, "encoding");
    if (encoding != null && !"UTF-8".equalsIgnoreCase(encoding)) {
      logError(0, 0, "XML", IssueType.INVALID, context.formatMessage(I18nConstants.XML_STATED_ENCODING_INVALID), IssueSeverity.ERROR);
    }

    String declaredVersion = readXmlDeclAttribute(header, "version");
    return declaredVersion != null ? declaredVersion : "??";
  } catch (IOException e) {
    // Deliberately best effort: the header check must not abort parsing, so a read
    // failure is recorded as a validation error and the version is left unknown.
    logError(0, 0, "XML", IssueType.INVALID, e.getMessage(), IssueSeverity.ERROR);
    return "??";
  }
}

/**
 * Extracts the value of a pseudo-attribute ({@code name = "value"} or
 * {@code name = 'value'}) from the text of an XML declaration. Whitespace around the
 * {@code =} is tolerated, and the name must be preceded by whitespace.
 *
 * @param header the declaration text, without the closing {@code >}
 * @param name the pseudo-attribute name, e.g. {@code version} or {@code encoding}
 * @return the text between the quotes, or {@code null} if the pseudo-attribute is absent
 *         or its value is not terminated by a matching quote
 */
private static String readXmlDeclAttribute(String header, String name) {
  int length = header.length();
  int from = 0;
  while (true) {
    int at = header.indexOf(name, from);
    if (at < 0) {
      return null;
    }
    from = at + name.length();
    // the name has to start a token of its own, not be the tail of a longer name
    if (at == 0 || !Character.isWhitespace(header.charAt(at - 1))) {
      continue;
    }
    int pos = from;
    while (pos < length && Character.isWhitespace(header.charAt(pos))) {
      pos++;
    }
    if (pos >= length || header.charAt(pos) != '=') {
      continue;
    }
    pos++;
    while (pos < length && Character.isWhitespace(header.charAt(pos))) {
      pos++;
    }
    if (pos >= length) {
      return null;
    }
    char quote = header.charAt(pos);
    if (quote != '"' && quote != '\'') {
      continue;
    }
    int end = header.indexOf(quote, pos + 1);
    if (end < 0) {
      return null;
    }
    return header.substring(pos + 1, end);
  }
}
}

View File

@ -35,6 +35,7 @@ import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.Locator2;
import org.xml.sax.helpers.LocatorImpl;
import org.xml.sax.helpers.XMLFilterImpl;
@ -120,4 +121,7 @@ public class XmlLocationAnnotator extends XMLFilterImpl {
}
}
}
}

View File

@ -428,5 +428,8 @@ public class I18nConstants {
public final static String DOCUMENT = "documentmsg";
public final static String DOCUMENT_DATE_REQUIRED = "Bundle_Document_Date_Missing";
public final static String DOCUMENT_DATE_REQUIRED_HTML = "Bundle_Document_Date_Missing_html";
// Message keys for the XML header / attribute checks; each is the name of an entry in the
// messages properties file.
// An XML attribute value contains an illegal character ({0} is the attribute name).
public final static String XML_ATTR_VALUE_INVALID = "xml_attr_value_invalid";
// The XML encoding is invalid (it must be UTF-8).
public final static String XML_ENCODING_INVALID = "xml_encoding_invalid";
// The encoding stated in the XML header is invalid (it must be "UTF-8" if stated).
public final static String XML_STATED_ENCODING_INVALID = "xml_stated_encoding_invalid";
}

View File

@ -428,5 +428,6 @@ Unable_to_resolve_system__no_value_set = Unable to resolve system - no value set
This_base_property_must_be_an_Array_not_a_ = This base property must be an Array, not a {0}
This_property_must_be_an_Array_not_a_ = This property must be an Array, not a {0}
documentmsg = (document)
xml_attr_value_invalid = The XML Attribute {0} has an illegal character
xml_encoding_invalid = The XML encoding is invalid (must be UTF-8)
xml_stated_encoding_invalid = The XML encoding stated in the header is invalid (must be "UTF-8" if stated)