From a7cf428fefda248768a39cd9a60628508bc7e867 Mon Sep 17 00:00:00 2001 From: Grahame Grieve Date: Thu, 19 Mar 2020 06:17:03 +1100 Subject: [PATCH] Validate XML header and syntax issues --- .../hl7/fhir/r5/elementmodel/XmlParser.java | 87 +++++++++++++++++++ .../utils/formats/XmlLocationAnnotator.java | 4 + .../fhir/utilities/i18n/I18nConstants.java | 3 + .../src/main/resources/Messages.properties | 5 +- 4 files changed, 97 insertions(+), 2 deletions(-) diff --git a/org.hl7.fhir.r5/src/main/java/org/hl7/fhir/r5/elementmodel/XmlParser.java b/org.hl7.fhir.r5/src/main/java/org/hl7/fhir/r5/elementmodel/XmlParser.java index b72342b1b..f7aaca243 100644 --- a/org.hl7.fhir.r5/src/main/java/org/hl7/fhir/r5/elementmodel/XmlParser.java +++ b/org.hl7.fhir.r5/src/main/java/org/hl7/fhir/r5/elementmodel/XmlParser.java @@ -23,6 +23,7 @@ package org.hl7.fhir.r5.elementmodel; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -71,6 +72,7 @@ import org.xml.sax.XMLReader; public class XmlParser extends ParserBase { private boolean allowXsiLocation; + private String version; public XmlParser(IWorkerContext context) { super(context); @@ -99,6 +101,13 @@ public class XmlParser extends ParserBase { factory.setNamespaceAware(true); if (policy == ValidationPolicy.EVERYTHING) { + // The SAX interface appears to not work when reporting the correct version/encoding. + // if we can, we'll inspect the header/encoding ourselves + if (stream.markSupported()) { + stream.mark(1024); + version = checkHeader(stream); + stream.reset(); + } // use a slower parser that keeps location data TransformerFactory transformerFactory = TransformerFactory.newInstance(); Transformer nullTransformer = transformerFactory.newTransformer(); @@ -135,6 +144,7 @@ public class XmlParser extends ParserBase { return parse(doc); } + private void checkForProcessingInstruction(Document document) throws FHIRFormatError { if (policy == ValidationPolicy.EVERYTHING && FormatUtilities.FHIR_NS.equals(document.getDocumentElement().getNamespaceURI())) { Node node = document.getFirstChild(); @@ -264,6 +274,10 @@ public class XmlParser extends ParserBase { for (int i = 0; i < node.getAttributes().getLength(); i++) { Node attr = node.getAttributes().item(i); + String value = attr.getNodeValue(); + if (!validAttrValue(value)) { + logError(line(node), col(node), path, IssueType.STRUCTURE, context.formatMessage(I18nConstants.XML_ATTR_VALUE_INVALID, attr.getNodeName()), IssueSeverity.ERROR); + } if (!(attr.getNodeName().equals("xmlns") || attr.getNodeName().startsWith("xmlns:"))) { Property property = getAttrProp(properties, attr.getNodeName()); if (property != null) { @@ -345,6 +359,23 @@ public class XmlParser extends ParserBase { } } + private boolean validAttrValue(String value) { + if (version == null) { + return true; + } + if (version.equals("1.0")) { + boolean ok = true; + for (char ch : value.toCharArray()) { + if (ch <= 0x1F && !Utilities.existsInList(ch, '\r', '\n', '\t')) { + ok = false; + } + } + return ok; + } else + return true; + } + + private Property getElementProp(List properties, String nodeName) { List propsSortedByLongestFirst = new ArrayList(properties); // sort properties according to their name longest first, so .requestOrganizationReference comes first before .request[x] @@ -583,4 +614,60 @@ public class XmlParser extends ParserBase { } } + private String checkHeader(InputStream stream) throws IOException { + try { + // the stream will either start with the UTF-8 BOF or with -1) { + e = header.substring(i+10, i+15); + } else { + i = header.indexOf("encoding='"); + if (i > -1) { + e = header.substring(i+10, i+15); + } + } + if (e != null && !"UTF-8".equalsIgnoreCase(e)) { + logError(0, 0, "XML", IssueType.INVALID, context.formatMessage(I18nConstants.XML_ENCODING_INVALID), IssueSeverity.ERROR); + } + + i = header.indexOf("version=\""); + if (i > -1) { + return header.substring(i+9, i+12); + } else { + i = header.indexOf("version='"); + if (i > -1) { + return header.substring(i+9, i+12); + } + } + return "??"; + } catch (Exception e) { + // suppress this error + logError(0, 0, "XML", IssueType.INVALID, e.getMessage(), IssueSeverity.ERROR); + } + return "??"; + } + } diff --git a/org.hl7.fhir.r5/src/main/java/org/hl7/fhir/r5/utils/formats/XmlLocationAnnotator.java b/org.hl7.fhir.r5/src/main/java/org/hl7/fhir/r5/utils/formats/XmlLocationAnnotator.java index a8e5a97dc..c0e3c21ac 100644 --- a/org.hl7.fhir.r5/src/main/java/org/hl7/fhir/r5/utils/formats/XmlLocationAnnotator.java +++ b/org.hl7.fhir.r5/src/main/java/org/hl7/fhir/r5/utils/formats/XmlLocationAnnotator.java @@ -35,6 +35,7 @@ import org.xml.sax.Attributes; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; +import org.xml.sax.ext.Locator2; import org.xml.sax.helpers.LocatorImpl; import org.xml.sax.helpers.XMLFilterImpl; @@ -120,4 +121,7 @@ public class XmlLocationAnnotator extends XMLFilterImpl { } } } + + + } diff --git a/org.hl7.fhir.utilities/src/main/java/org/hl7/fhir/utilities/i18n/I18nConstants.java b/org.hl7.fhir.utilities/src/main/java/org/hl7/fhir/utilities/i18n/I18nConstants.java index 78da69902..10cde0e39 100644 --- a/org.hl7.fhir.utilities/src/main/java/org/hl7/fhir/utilities/i18n/I18nConstants.java +++ b/org.hl7.fhir.utilities/src/main/java/org/hl7/fhir/utilities/i18n/I18nConstants.java @@ -428,5 +428,8 @@ public class I18nConstants { public final static String DOCUMENT = "documentmsg"; public final static String DOCUMENT_DATE_REQUIRED = "Bundle_Document_Date_Missing"; public final static String DOCUMENT_DATE_REQUIRED_HTML = "Bundle_Document_Date_Missing_html"; + public final static String XML_ATTR_VALUE_INVALID = "xml_attr_value_invalid"; + public final static String XML_ENCODING_INVALID = "xml_encoding_invalid"; + public final static String XML_STATED_ENCODING_INVALID = "xml_stated_encoding_invalid"; } diff --git a/org.hl7.fhir.utilities/src/main/resources/Messages.properties b/org.hl7.fhir.utilities/src/main/resources/Messages.properties index d1efa98bb..da8c52ef0 100644 --- a/org.hl7.fhir.utilities/src/main/resources/Messages.properties +++ b/org.hl7.fhir.utilities/src/main/resources/Messages.properties @@ -428,5 +428,6 @@ Unable_to_resolve_system__no_value_set = Unable to resolve system - no value set This_base_property_must_be_an_Array_not_a_ = This base property must be an Array, not a {0} This_property_must_be_an_Array_not_a_ = This property must be an Array, not a {0} documentmsg = (document) - - +xml_attr_value_invalid = The XML Attribute {0} has an illegal character +xml_encoding_invalid = The XML encoding is invalid (must be UTF-8) +xml_stated_encoding_invalid = The XML encoding stated in the header is invalid (must be "UTF-8" if stated)