revise whitespace handling for unicode conformance in validator

This commit is contained in:
Grahame Grieve 2023-03-07 06:25:45 +11:00
parent 5eca02f879
commit 0c26f09721
17 changed files with 102 additions and 56 deletions

View File

@ -134,7 +134,7 @@ public class TestingUtilities {
}
private static Node skipBlankText(Node node) {
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isAllWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
node = node.getNextSibling();
return node;
}

View File

@ -134,7 +134,7 @@ public class TestingUtilities {
}
private static Node skipBlankText(Node node) {
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isAllWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
node = node.getNextSibling();
return node;
}

View File

@ -164,7 +164,7 @@ public class TestingUtilities extends BaseTestingUtilities {
}
private static Node skipBlankText(Node node) {
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isAllWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
node = node.getNextSibling();
return node;
}

View File

@ -242,7 +242,7 @@ public class TestingUtilities {
}
private static Node skipBlankText(Node node) {
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isAllWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
node = node.getNextSibling();
return node;
}

View File

@ -257,7 +257,7 @@ public class TestingUtilities extends BaseTestingUtilities {
}
private static Node skipBlankText(Node node) {
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isAllWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
node = node.getNextSibling();
return node;
}

View File

@ -919,7 +919,7 @@ public class ProfileUtilities extends TranslatingUtilities {
throw new FHIRException(context.formatMessage(I18nConstants.ILLEGAL_PATH__IN_DIFFERENTIAL_IN__NAME_PORTION_EXCEEDS_64_CHARS_IN_LENGTH, p, url));
}
for (char ch : pp.toCharArray()) {
if (Character.isWhitespace(ch)) {
if (Utilities.isWhitespace(ch)) {
throw new FHIRException(context.formatMessage(I18nConstants.ILLEGAL_PATH__IN_DIFFERENTIAL_IN__NO_UNICODE_WHITESPACE, p, url));
}
if (Utilities.existsInList(ch, ',', ':', ';', '\'', '"', '/', '|', '?', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '{', '}')) {

View File

@ -1,6 +1,7 @@
package org.hl7.fhir.r5.test.utils;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.StringUtils;
import org.hl7.fhir.utilities.CSFile;
import org.hl7.fhir.utilities.TextFile;
import org.hl7.fhir.utilities.ToolGlobalSettings;
@ -147,7 +148,7 @@ public class CompareUtilities extends BaseTestingUtilities {
}
private static Node skipBlankText(Node node) {
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && StringUtils.isWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
node = node.getNextSibling();
return node;
}

View File

@ -338,7 +338,7 @@ public class FHIRLexer {
comments.add(source.substring(start, cursor).trim());
cursor = cursor + 2;
}
} else if (Character.isWhitespace(source.charAt(cursor))) {
} else if (Utilities.isWhitespace(source.charAt(cursor))) {
last13 = currentLocation.checkChar(source.charAt(cursor), last13);
cursor++;
} else {

View File

@ -826,7 +826,7 @@ public class Utilities {
boolean isWhitespace = false;
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
if (!Character.isWhitespace(c)) {
if (!isWhitespace(c)) {
b.append(Character.toLowerCase(c));
isWhitespace = false;
} else if (!isWhitespace) {
@ -861,15 +861,6 @@ public class Utilities {
}
public static boolean isWhitespace(String s) {
boolean ok = true;
for (int i = 0; i < s.length(); i++)
ok = ok && Character.isWhitespace(s.charAt(i));
return ok;
}
public static String URLEncode(String string) {
try {
return URLEncoder.encode(string, "UTF-8");
@ -1002,7 +993,11 @@ public class Utilities {
b.append("\\\"");
else if (c == '\\')
b.append("\\\\");
else if (((int) c) < 32)
else if (c == ' ')
b.append(" ");
else if (isWhitespace(c)) {
b.append("\\u"+Integer.toHexString(c));
} else if (((int) c) < 32)
b.append("\\u" + Utilities.padLeft(String.valueOf((int) c), '0', 4));
else
b.append(c);
@ -1086,15 +1081,15 @@ public class Utilities {
int expectedByte = in1.read();
while (expectedByte != -1) {
boolean w1 = isWhitespace(expectedByte);
boolean w1 = Character.isWhitespace(expectedByte);
if (w1)
while (isWhitespace(expectedByte))
while (Character.isWhitespace(expectedByte))
expectedByte = in1.read();
int foundByte = in2.read();
if (w1) {
if (!isWhitespace(foundByte))
if (!Character.isWhitespace(foundByte))
return false;
while (isWhitespace(foundByte))
while (Character.isWhitespace(foundByte))
foundByte = in2.read();
}
if (expectedByte != foundByte)
@ -1121,10 +1116,6 @@ public class Utilities {
}
}
private static boolean isWhitespace(int b) {
return b == 9 || b == 10 || b == 13 || b == 32;
}
public static boolean compareIgnoreWhitespace(String fn1, String fn2) throws IOException {
return compareIgnoreWhitespace(new File(fn1), new File(fn2));
@ -1881,4 +1872,53 @@ public class Utilities {
return name != null && name.matches("[A-Z]([A-Za-z0-9_]){1,254}");
}
/**
 * Reports whether a string contains nothing but whitespace.
 *
 * A string for which {@code Utilities.noString(s)} is true is treated as all
 * whitespace. Any other string is examined one {@code char} at a time with
 * {@code isWhitespace}, stopping at the first character that fails that test.
 *
 * @param s the text to examine
 * @return {@code true} if {@code Utilities.noString(s)} holds or every char
 *         satisfies {@code isWhitespace}; {@code false} otherwise
 */
public static boolean isAllWhitespace(String s) {
  if (!Utilities.noString(s)) {
    for (int i = 0; i < s.length(); i++) {
      if (!isWhitespace(s.charAt(i))) {
        return false;
      }
    }
  }
  return true;
}
/**
 * Removes leading and trailing whitespace from a string, where "whitespace"
 * is whatever {@code isWhitespace} accepts (not the rule used by
 * {@code String.trim()}).
 *
 * @param s the text to trim
 * @return {@code s} itself when {@code Utilities.noString(s)} is true
 *         (presumably null or empty - confirm against noString); {@code ""}
 *         when every char is whitespace; otherwise the substring running from
 *         the first to the last non-whitespace char, interior whitespace kept
 */
public static String trimWS(String s) {
  if (Utilities.noString(s)) {
    return s;
  }
  int start = 0;
  while (start < s.length() && isWhitespace(s.charAt(start))) {
    start++;
  }
  if (start == s.length()) {
    // nothing but whitespace
    return "";
  }
  // s.charAt(start) is known to be non-whitespace here, so the backward scan
  // always stops at or after start: no lower-bound check and no
  // "start > end" fallback is needed.
  int end = s.length();
  while (isWhitespace(s.charAt(end - 1))) {
    end--;
  }
  return s.substring(start, end);
}
// from https://en.wikipedia.org/wiki/Whitespace_character#Unicode
/**
 * Tests a single code point against a fixed table of 25 whitespace code
 * points taken from the page cited above.
 *
 * Implemented as a switch so that the per-character callers do not allocate
 * a varargs array and scan it linearly on every call.
 *
 * @param ch the code point (or char widened to int) to test
 * @return {@code true} if {@code ch} is one of the listed code points,
 *         {@code false} for anything else, including negative values
 */
public static boolean isWhitespace(int ch) {
  switch (ch) {
    case 0x0009: // character tabulation
    case 0x000A: // line feed
    case 0x000B: // line tabulation
    case 0x000C: // form feed
    case 0x000D: // carriage return
    case 0x0020: // space
    case 0x0085: // next line
    case 0x00A0: // no-break space
    case 0x1680: // ogham space mark
    case 0x2000: // en quad
    case 0x2001: // em quad
    case 0x2002: // en space
    case 0x2003: // em space
    case 0x2004: // three-per-em space
    case 0x2005: // four-per-em space
    case 0x2006: // six-per-em space
    case 0x2007: // figure space
    case 0x2008: // punctuation space
    case 0x2009: // thin space
    case 0x200A: // hair space
    case 0x2028: // line separator
    case 0x2029: // paragraph separator
    case 0x202F: // narrow no-break space
    case 0x205F: // medium mathematical space
    case 0x3000: // ideographic space
      return true;
    default:
      return false;
  }
}
//public static boolean !isWhitespace(String s) {
//boolean ok = true;
//for (int i = 0; i < s.length(); i++)
// ok = ok && Character.isWhitespace(s.charAt(i));
//return ok;
//
//}
}

View File

@ -534,6 +534,7 @@ public class I18nConstants {
public static final String TYPE_SPECIFIC_CHECKS_DT_QTY_NO_ANNOTATIONS = "TYPE_SPECIFIC_CHECKS_DT_QTY_NO_ANNOTATIONS";
public static final String TYPE_SPECIFIC_CHECKS_DT_STRING_LENGTH = "Type_Specific_Checks_DT_String_Length";
public static final String TYPE_SPECIFIC_CHECKS_DT_STRING_WS = "Type_Specific_Checks_DT_String_WS";
public static final String TYPE_SPECIFIC_CHECKS_DT_STRING_WS_ALL = "Type_Specific_Checks_DT_String_WS_ALL";
public static final String TYPE_SPECIFIC_CHECKS_DT_TIME_VALID = "Type_Specific_Checks_DT_Time_Valid";
public static final String TYPE_SPECIFIC_CHECKS_DT_URI_OID = "Type_Specific_Checks_DT_URI_OID";
public static final String TYPE_SPECIFIC_CHECKS_DT_URI_UUID = "Type_Specific_Checks_DT_URI_UUID";

View File

@ -33,6 +33,7 @@ package org.hl7.fhir.utilities.xhtml;
import java.io.IOException;
import org.apache.commons.lang3.StringUtils;
import org.hl7.fhir.exceptions.FHIRException;
import org.hl7.fhir.utilities.Utilities;
import org.hl7.fhir.utilities.xml.IXMLWriter;
@ -81,7 +82,7 @@ public class CDANarrativeFormat {
xn.addComment(n.getTextContent());
return;
case Node.TEXT_NODE:
if (!Utilities.isWhitespace(n.getTextContent()))
if (!StringUtils.isWhitespace(n.getTextContent()))
xn.addText(n.getTextContent());
return;
case Node.ELEMENT_NODE:

View File

@ -77,6 +77,7 @@ import javax.imageio.ImageIO;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
import org.commonmark.renderer.html.HtmlRenderer;
@ -303,7 +304,7 @@ public class HierarchicalTableGenerator extends TranslatingUtilities {
myPieces.add(new Piece("br"));
}
if (c.getNodeType() == NodeType.Text) {
if (!Utilities.isWhitespace(c.getContent()))
if (!StringUtils.isWhitespace(c.getContent()))
addNode(myPieces, c, style);
} else if ("p".equals(c.getName())) {
for (XhtmlNode g : c.getChildNodes()) {

View File

@ -218,6 +218,7 @@ Type_Specific_Checks_DT_Primitive_ValueExt = Primitive types must have a value o
Type_Specific_Checks_DT_Primitive_WS = Primitive types should not only be whitespace
Type_Specific_Checks_DT_String_Length = value is longer than permitted maximum length of 1 MB (1048576 bytes)
Type_Specific_Checks_DT_String_WS = value should not start or finish with whitespace ''{0}''
Type_Specific_Checks_DT_String_WS_ALL = value should not be all whitespace ''{0}''
Type_Specific_Checks_DT_Time_Valid = Not a valid time ({0})
Type_Specific_Checks_DT_URI_OID = URI values cannot start with oid:
Type_Specific_Checks_DT_URI_UUID = URI values cannot start with uuid:

View File

@ -230,4 +230,16 @@ class UtilitiesTest {
}
// Exercises Utilities.trimWS on empty, whitespace-only, padded, and untouched inputs.
@Test
@DisplayName("trimWS tests")
void testTrimWS() {
// empty input comes back as an empty string
Assertions.assertEquals("", Utilities.trimWS(""));
// whitespace-only input collapses to an empty string
Assertions.assertEquals("", Utilities.trimWS(" "));
// whitespace on both sides of a single character is removed
Assertions.assertEquals("t", Utilities.trimWS(" t "));
// a leading carriage return is removed
Assertions.assertEquals(".", Utilities.trimWS("\r."));
// interior whitespace is kept when there is nothing to trim at either end
Assertions.assertEquals("# %", Utilities.trimWS("# %"));
// a string made up solely of the escaped whitespace characters below must trim to empty
Assertions.assertEquals("", Utilities.trimWS("\u0009\n\u000B\u000C\r\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"));
}
}

View File

@ -2258,7 +2258,7 @@ public class InstanceValidator extends BaseValidator implements IResourceValidat
ok = rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, e.hasChildren(), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_PRIMITIVE_VALUEEXT) && ok;
else if (e.primitiveValue().length() == 0)
ok = rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, e.hasChildren(), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_PRIMITIVE_NOTEMPTY) && ok;
else if (StringUtils.isWhitespace(e.primitiveValue()))
else if (Utilities.isAllWhitespace(e.primitiveValue()))
warning(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, e.hasChildren(), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_PRIMITIVE_WS);
if (context.hasBinding()) {
ok = rule(errors, NO_RULE_DATE, IssueType.CODEINVALID, e.line(), e.col(), path, context.getBinding().getStrength() != BindingStrength.REQUIRED, I18nConstants.Terminology_TX_Code_ValueSet_MISSING) && ok;
@ -2306,7 +2306,7 @@ public class InstanceValidator extends BaseValidator implements IResourceValidat
String url = e.primitiveValue();
ok = rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, !url.startsWith("oid:"), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_URI_OID) && ok;
ok = rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, !url.startsWith("uuid:"), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_URI_UUID) && ok;
ok = rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, url.equals(url.trim().replace(" ", ""))
ok = rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, url.equals(Utilities.trimWS(url).replace(" ", ""))
// work around an old invalid example in a core package
|| "http://www.acme.com/identifiers/patient or urn:ietf:rfc:3986 if the Identifier.value itself is a full uri".equals(url), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_URI_WS, url) && ok;
ok = rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, !context.hasMaxLength() || context.getMaxLength() == 0 || url.length() <= context.getMaxLength(), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_PRIMITIVE_LENGTH, context.getMaxLength()) && ok;
@ -2353,7 +2353,9 @@ public class InstanceValidator extends BaseValidator implements IResourceValidat
}
if (type.equalsIgnoreCase("string") && e.hasPrimitiveValue()) {
if (rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, e.primitiveValue() == null || e.primitiveValue().length() > 0, I18nConstants.TYPE_SPECIFIC_CHECKS_DT_PRIMITIVE_NOTEMPTY)) {
warning(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, e.primitiveValue() == null || e.primitiveValue().trim().equals(e.primitiveValue()), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_STRING_WS, prepWSPresentation(e.primitiveValue()));
if (warning(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, e.primitiveValue() == null || !Utilities.isAllWhitespace(e.primitiveValue()), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_STRING_WS_ALL, prepWSPresentation(e.primitiveValue()))) {
warning(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, e.primitiveValue() == null || Utilities.trimWS(e.primitiveValue()).equals(e.primitiveValue()), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_STRING_WS, prepWSPresentation(e.primitiveValue()));
}
if (rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, e.primitiveValue().length() <= 1048576, I18nConstants.TYPE_SPECIFIC_CHECKS_DT_STRING_LENGTH)) {
ok = rule(errors, NO_RULE_DATE, IssueType.INVALID, e.line(), e.col(), path, !context.hasMaxLength() || context.getMaxLength() == 0 || e.primitiveValue().length() <= context.getMaxLength(), I18nConstants.TYPE_SPECIFIC_CHECKS_DT_PRIMITIVE_LENGTH, context.getMaxLength()) && ok;
} else {
@ -2623,24 +2625,7 @@ public class InstanceValidator extends BaseValidator implements IResourceValidat
if (Utilities.noString(s)) {
return "";
}
if (!StringUtils.containsWhitespace(s.trim())) {
return s;
}
int b = 0;
while (Character.isWhitespace(s.charAt(b))) {
b++;
}
while (!Character.isWhitespace(s.charAt(b))) {
b++;
}
int e = s.length() - 1;
while (Character.isWhitespace(s.charAt(e))) {
e--;
}
while (!Character.isWhitespace(s.charAt(e))) {
e--;
}
return s.substring(0, b)+"..."+s.substring(e+1);
return Utilities.escapeJson(s);
}
public boolean validateReference(ValidatorHostContext hostContext, List<ValidationMessage> errors, String path, String type, ElementDefinition context, Element e, String url) {
@ -2803,7 +2788,7 @@ public class InstanceValidator extends BaseValidator implements IResourceValidat
boolean ok = true;
for (int i = 0; i < theEncoded.length(); i++) {
char nextChar = theEncoded.charAt(i);
if (Character.isWhitespace(nextChar)) {
if (Utilities.isWhitespace(nextChar)) {
continue;
}
if (Character.isLetterOrDigit(nextChar)) {
@ -2826,7 +2811,7 @@ public class InstanceValidator extends BaseValidator implements IResourceValidat
}
for (int i = 0; i < theEncoded.length(); i++) {
char nextChar = theEncoded.charAt(i);
if (Character.isWhitespace(nextChar)) {
if (Utilities.isWhitespace(nextChar)) {
return true;
}
}
@ -2930,7 +2915,7 @@ public class InstanceValidator extends BaseValidator implements IResourceValidat
return context.formatMessage(I18nConstants.XHTML_URL_DATA_DATA_INVALID, value);
} else {
if (p[0].startsWith(" ")) {
p[0] = p[0].trim();
p[0] = Utilities.trimWS(p[0]);
}
String mMsg = checkValidMimeType(p[0].substring(0, p[0].lastIndexOf(";")));
if (mMsg != null) {
@ -4039,7 +4024,7 @@ public class InstanceValidator extends BaseValidator implements IResourceValidat
}
private boolean passesCodeWhitespaceRules(String v) {
if (!v.trim().equals(v))
if (!Utilities.trimWS(v).equals(v))
return false;
boolean lastWasSpace = true;
for (char c : v.toCharArray()) {
@ -4048,7 +4033,7 @@ public class InstanceValidator extends BaseValidator implements IResourceValidat
return false;
else
lastWasSpace = true;
} else if (Character.isWhitespace(c) || c == '\u00A0')
} else if (Utilities.isWhitespace(c))
return false;
else
lastWasSpace = false;

View File

@ -134,6 +134,9 @@ public class FHIRPathExpressionFixer {
if (regex.equals("-?(0|[1-9][0-9]{0,17})(\\.[0-9]{1,17})?([eE][+-]?[0-9]{1,9}})?")) {
return "-?(0|[1-9][0-9]{0,17})(\\.[0-9]{1,17})?([eE](0|[+\\-]?[1-9][0-9]{0,9}))?";
}
if (regex.equals("[ \\r\\n\\t\\S]+")) {
return "^[\\s\\r\\n\\t\\S]+$";
}
return regex;
}

View File

@ -45,6 +45,7 @@ import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.fhir.ucum.UcumEssenceService;
import org.hl7.fhir.convertors.loaders.loaderR5.NullLoaderKnowledgeProviderR5;
import org.hl7.fhir.convertors.loaders.loaderR5.R2016MayToR5Loader;
@ -265,7 +266,7 @@ public class UtilitiesXTests {
}
private static Node skipBlankText(Node node) {
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && Utilities.isWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
while (node != null && (((node.getNodeType() == Node.TEXT_NODE) && StringUtils.isWhitespace(node.getTextContent())) || (node.getNodeType() == Node.COMMENT_NODE)))
node = node.getNextSibling();
return node;
}