More markdown detection work

This commit is contained in:
Grahame Grieve 2023-02-21 16:52:32 +11:00
parent bae27ef706
commit 1eeee429ce
2 changed files with 176 additions and 0 deletions

View File

@ -69,6 +69,125 @@ public class MarkDownProcessor {
}
}
/**
 * Guesses whether a string is intended to be processed as markdown.
 *
 * <p>This is a guess, based on textual analysis of the content. It works by looking for
 * character patterns that are unlikely to occur outside markdown - but they are still only
 * unlikely, so wrong answers in either direction are possible.
 *
 * <p>Uses of this routine:
 * <ul>
 * <li>Deciding whether to escape a string so that erroneous markdown processing doesn't munge
 *     characters: a plain string that is being put into something that's markdown should be
 *     escaped.</li>
 * <li>Deciding whether to process content as markdown when that isn't otherwise known.</li>
 * </ul>
 *
 * <p>The underlying problem is that markdown processing of plain strings is problematic because
 * some technical characters might get lost. So it's good to escape them... but if the content is
 * meant to be markdown, escaping trashes it.
 *
 * <p>The content is split into lines, and the first line that shows any of the following makes
 * the whole content count as markdown: a leading {@code "* "} or {@code "1. "}, leading
 * whitespace, a heading as recognised by {@code isHeading}, a run of three backticks, tildes or
 * opening square brackets, an inline link as recognised by {@code hasLink}, or emphasis with
 * {@code *} or {@code _} as recognised by {@code hasTextSpecial}.
 *
 * @param content the text to examine; {@code null} is reported as not markdown
 * @param mdIfParagrapghs when true, any content containing a line feed is reported as markdown
 *        without looking at the individual lines
 * @return true if the content is probably markdown, false if it looks like a plain string
 */
// todo: dialect dependency?
public boolean isProbablyMarkdown(String content, boolean mdIfParagrapghs) {
  // Nothing to analyse: treat a missing string as plain text rather than failing.
  if (content == null) {
    return false;
  }
  if (mdIfParagrapghs && content.contains("\n")) {
    return true;
  }
  for (String s : content.split("\\r?\\n")) {
    // Block-level markers: bullet, heading, numbered list, indented line.
    // NOTE(review): a CommonMark indented code block needs four leading spaces, but the literal
    // seen here is a single space - confirm no whitespace was lost from it.
    if (s.startsWith("* ") || isHeading(s) || s.startsWith("1. ") || s.startsWith(" ")) {
      return true;
    }
    // Fence markers. NOTE(review): "[[[" is not CommonMark; presumably a project-specific
    // link macro - confirm.
    if (s.contains("```") || s.contains("~~~") || s.contains("[[[")) {
      return true;
    }
    if (hasLink(s)) {
      return true;
    }
    if (hasTextSpecial(s, '*') || hasTextSpecial(s, '_')) {
      return true;
    }
  }
  return false;
}
/**
 * Reports whether a line looks like a level 2 to level 6 heading: two to six leading
 * {@code #} characters, then exactly one space, then a non-whitespace character.
 *
 * <p>A level 1 heading is deliberately not recognised: "# [string]" is something that could
 * easily arise in non-markdown, so its appearance isn't enough to call the text markdown.
 *
 * @param s a single line of text
 * @return true if the line has the heading shape described above
 */
private boolean isHeading(String s) {
  // Count the run of '#' at the start of the line.
  int hashes = 0;
  while (hashes < s.length() && s.charAt(hashes) == '#') {
    hashes++;
  }
  if (hashes < 2 || hashes > 6) {
    return false;
  }
  // The run must be followed by a space and then by real text.
  int textStart = hashes + 1;
  return s.length() > textStart
      && s.charAt(hashes) == ' '
      && !Character.isWhitespace(s.charAt(textStart));
}
/**
 * Reports whether a line contains something shaped like an inline link,
 * {@code [text](target)}: an opening {@code [}, later a {@code ]} immediately followed by
 * {@code (}, and later a {@code )}.
 *
 * <p>The scan abandons the current candidate when it meets a {@code ]} that is not directly
 * followed by {@code (}, or a {@code (} anywhere other than directly after the closing
 * bracket, so neither the link text nor the target may contain {@code (}.
 *
 * @param s a single line of text
 * @return true if a link-shaped sequence is found
 */
private boolean hasLink(String s) {
  int open = -1;   // index of the most recent '[' still being tracked, or -1
  int close = -1;  // index of the "](" that followed it, or -1
  final int last = s.length() - 1;
  for (int i = 0; i <= last; i++) {
    switch (s.charAt(i)) {
      case '[':
        // A new bracket always restarts the search from here.
        close = -1;
        open = i;
        break;
      case ']':
        if (open > -1 && i < last && s.charAt(i + 1) == '(') {
          close = i;
        } else if (open > -1) {
          // Bracket closed without a target: drop the candidate.
          open = -1;
        } else {
          open = -1;
          close = -1;
        }
        break;
      case ')':
        if (open > -1 && close > -1) {
          return true;
        }
        break;
      case '(':
        // Only the '(' directly after the recorded ']' is allowed.
        if (i > close + 1) {
          open = -1;
          close = -1;
        }
        break;
      default:
        break;
    }
  }
  return false;
}
/**
 * Reports whether a line contains a whitespace-bounded emphasis pair made with the given
 * delimiter, such as {@code *word*} or {@code _some words_}.
 *
 * <p>An opening delimiter is one preceded by whitespace (or the start of the line) and followed
 * by a letter, digit or punctuation mark. A closing delimiter is one preceded by a letter, digit
 * or punctuation mark and followed by whitespace (or the end of the line). Delimiters inside a
 * word, as in {@code strin*g}, match neither rule.
 *
 * <p>A delimiter that fails to close the pending opener is itself checked as an opener, so an
 * earlier unmatched delimiter does not hide a later well-formed pair: {@code "a *b *c* d"} is
 * detected just as {@code "a *c* d"} is.
 *
 * @param s a single line of text
 * @param c the emphasis delimiter to look for
 * @return true if an opening delimiter is followed later in the line by a closing one
 */
private boolean hasTextSpecial(String s, char c) {
  boolean open = false;
  for (int i = 0; i < s.length(); i++) {
    if (s.charAt(i) != c) {
      continue;
    }
    // The line boundaries behave like whitespace.
    char prev = i == 0 ? ' ' : s.charAt(i - 1);
    char next = i < s.length() - 1 ? s.charAt(i + 1) : ' ';
    if (open && Character.isWhitespace(next) && (isPunctation(prev) || Character.isLetterOrDigit(prev))) {
      return true;
    }
    // Whether or not an opener was pending, this delimiter becomes the pending opener
    // exactly when it has the opener shape.
    open = Character.isWhitespace(prev) && (isPunctation(next) || Character.isLetterOrDigit(next));
  }
  return false;
}
/**
 * Reports whether a character is one of the four punctuation marks {@code .}, {@code ,},
 * {@code !} or {@code ?}.
 *
 * @param ch the character to test
 * @return true for those four characters, false for anything else
 */
private boolean isPunctation(char ch) {
  switch (ch) {
    case '.':
    case ',':
    case '!':
    case '?':
      return true;
    default:
      return false;
  }
}
/**
* This deals with a painful problem created by the intersection of previous publishing processes
* and the way commonmark specifies that < is handled in content. For control reasons, the FHIR specification does

View File

@ -2,15 +2,72 @@ package org.hl7.fhir.utilities;
import static org.junit.jupiter.api.Assertions.*;
import org.hl7.fhir.utilities.MarkDownProcessor.Dialect;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
/**
 * Tests for {@code MarkDownProcessor}: the markdown detection heuristic
 * ({@code isProbablyMarkdown}) and the escaping of plain strings for use in markdown.
 */
class MarkdownTests {
/**
 * Checks {@code isProbablyMarkdown} against plain text, line breaks, emphasis, links and
 * headings.
 */
@Test
void testMarkdownDetection() {
// plain text; a line break only counts as markdown when the paragraphs flag is set
testMarkdown("this is a test string", false);
testMarkdown("this is a \r\ntest string", false);
testMarkdown("this is a \r\ntest string", true, true);
// emphasis: only detected when both asterisks sit on a word boundary
testMarkdown("this is a t*est strin*g", false);
testMarkdown("this is a *test strin*g", false);
testMarkdown("this is a *test string*", true);
testMarkdown("this is a *test *string", false);
testMarkdown("this is a *test* string", true);
// links: only a complete [text](target) with nothing between ']' and '(' is detected
testMarkdown("this [is] a test string", false);
testMarkdown("this [is](link) a test string", true);
testMarkdown("this [is](link a test string", false);
testMarkdown("this [is] (link) a test string", false);
testMarkdown("this [is(link)] a test string", false);
// same input as the unclosed-target case three lines above
testMarkdown("this [is](link a test string", false);
testMarkdown("this [i]s] (link) a test string", false);
// headings: level 2 is detected, level 1 is not
testMarkdown("## heading", true);
testMarkdown("# heading", false);
// NOTE(review): as seen here this input is identical to the "## heading" case above but
// expects the opposite result; presumably the original has two spaces after "##" - confirm
testMarkdown("## heading", false);
testMarkdown("###", false);
}
// Convenience overload: runs the detection check with the paragraphs flag off.
private void testMarkdown(String content, boolean isMD) {
testMarkdown(content, isMD, false);
}
// Asserts that isProbablyMarkdown(content, ifLines) on a COMMON_MARK processor returns isMD.
private void testMarkdown(String content, boolean isMD, boolean ifLines) {
boolean test = new MarkDownProcessor(Dialect.COMMON_MARK).isProbablyMarkdown(content, ifLines);
assertEquals(isMD, test);
}
/**
 * Shows why plain strings need escaping before markdown processing, then checks the
 * escaping helpers.
 */
@Test
void testStringToMarkdown() {
// first, we test the need for replacing
// text without a matched asterisk pair comes out as a paragraph with the text unchanged
Assertions.assertEquals("<p>This is a string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is a string", null).trim());
Assertions.assertEquals("<p>This is *a string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is *a string", null).trim());
// a matched pair does not survive processing verbatim
Assertions.assertNotEquals("<p>This is *a* string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is *a* string", null).trim());
Assertions.assertEquals("<p>This is *a *string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is *a *string", null).trim());
// technical text with two asterisks is altered by processing...
Assertions.assertNotEquals("<p>This genomic study analyzes CYP2D6*1 and CYP2D6*2</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This genomic study analyzes CYP2D6*1 and CYP2D6*2", null).trim());
// ...unless one of them is escaped with a backslash
Assertions.assertEquals("<p>This genomic study analyzes CYP2D6*1 and CYP2D6*2</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This genomic study analyzes CYP2D6*1 and CYP2D6\\*2", null).trim());
// makeStringSafeAsMarkdown is expected to backslash-escape asterisks, and
// makeMarkdownForString to turn the escaped form back into the plain string
Assertions.assertEquals("This is \\*a test\\*", MarkDownProcessor.makeStringSafeAsMarkdown("This is *a test*"));
Assertions.assertEquals("This is *a test*", MarkDownProcessor.makeMarkdownForString("This is \\*a test\\*"));
}
}
//
//case '*':
//case '&':
//case '#':
//case '[':
//case '>':
//case '<':
//case '`':
// -
// |
// :
// ~
// ^
// =