Merge pull request #1247 from pieter-edelman-nictiz/issue-133

#133: Escape just embedded HTML tags, not autolinks in Markdown
This commit is contained in:
Grahame Grieve 2023-05-10 05:46:22 -05:00 committed by GitHub
commit 28d8491f05
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 55 additions and 45 deletions

View File

@ -33,6 +33,8 @@ package org.hl7.fhir.utilities;
import java.util.Collections;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.commonmark.Extension;
import org.commonmark.ext.gfm.tables.TablesExtension;
@ -157,7 +159,13 @@ public class MarkDownProcessor {
mid = -1;
}
}
return false;
// Detect autolinks, which should start with a scheme, followed by a colon, followed by some content. Whitespace
// is not allowed and for practical purposes, the scheme is considered to consist of lowercase ASCII characters
// only.
Pattern autolinkPattern = Pattern.compile("<[a-z]+:[^\\s]+>");
Matcher autolinkMatcher = autolinkPattern.matcher(s);
return autolinkMatcher.find();
}
@ -193,8 +201,8 @@ public class MarkDownProcessor {
* and the way commonmark specifies that < is handled in content. For control reasons, the FHIR specification does
* not allow raw html tags in the markdown
*
* This check finds any raw <[x] where [x] is any alpha character, and prepends \ to it so that it
* renders as a < (e.g. gets escaped in the output HTML)
* This check finds any raw HTML tag and prepends \ to it so that it renders as a literal < (i.e. gets escaped in the
* output HTML)
*
* This is public to enable testing (not for direct use otherwise)
*
@ -202,21 +210,16 @@ public class MarkDownProcessor {
* @return
*/
public static String preProcess(String source) {
StringBuilder b = new StringBuilder();
for (int i = 0; i < source.length(); i++) {
char last = i > 0 ? source.charAt(i-1) : 0;
char current = source.charAt(i);
char next = i < source.length() -1 ? source.charAt(i+1) : 0;
if (current == '<' && Character.isAlphabetic(next) && last != '\\') {
b.append('\\');
b.append(current);
} else {
b.append(current);
}
}
return b.toString();
}
// Escape all unescaped open and closing tags ('<' or '</', followed by an ASCII letter, followed by ASCII
// letters, digits and/or hyphens).
String processed = source.replaceAll("(?<!\\\\)<(\\/)?([A-Za-z][A-Za-z0-9-]*[\\s>])", "\\\\<$1$2");
// Escape all other HTML tags: HTML comments, processing instructions, declarations and CDATA sections --
// everything starting with '<?' or '<!'.
processed = processed.replaceAll("<(!|\\?)", "\\\\<$1");
return processed;
}
private String processCommonMark(String source) {
Set<Extension> extensions = Collections.singleton(TablesExtension.create());

View File

@ -25,6 +25,10 @@ class MarkdownTests {
testMarkdown("this [is(link)] a test string", false);
testMarkdown("this [is](link a test string", false);
testMarkdown("this [i]s] (link) a test string", false);
testMarkdown("this <https://hl7.org> is a test string", true);
testMarkdown("this < https://hl7.org> is a test string", false);
testMarkdown("this <mailto:info@hl7.org> is a test string", true);
testMarkdown("this <b>is</b> a test string", false);
testMarkdown("## heading", true);
testMarkdown("# heading", false);
testMarkdown("## heading", false);

View File

@ -18,6 +18,9 @@ public class MarkdownPreprocessorTesting {
public void testHTML() throws IOException {
  // assertEquals takes (expected, actual); keeping that order makes failure messages report the right values.

  // Raw opening and closing tags get a backslash prepended.
  assertEquals("\\<type>", MarkDownProcessor.preProcess("<type>"));
  assertEquals("\\</type>", MarkDownProcessor.preProcess("</type>"));
  // A tag that is already escaped is left untouched.
  assertEquals("\\<type>", MarkDownProcessor.preProcess("\\<type>"));
  // Autolinks are not HTML tags and must pass through unchanged, escaped or not.
  assertEquals("<http://hl7.org>", MarkDownProcessor.preProcess("<http://hl7.org>"));
  assertEquals("\\<http://hl7.org>", MarkDownProcessor.preProcess("\\<http://hl7.org>"));
}