More markdown detection work
This commit is contained in:
parent
bae27ef706
commit
1eeee429ce
|
@ -69,6 +69,125 @@ public class MarkDownProcessor {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this is intended to be processed as markdown
|
||||
*
|
||||
* this is guess, based on textual analysis of the content.
|
||||
*
|
||||
* Uses of this routine:
|
||||
* In general, the main use of this is to decide to escape the string so erroneous markdown processing doesn't munge characters
|
||||
* If it's a plain string, and it's being put into something that's markdown, then you should escape the content
|
||||
* If it's markdown, but you're not sure whether to process it as markdown
|
||||
*
|
||||
* The underlying problem is that markdown processing plain strings is problematic because some technical characters might
|
||||
* get lost. So it's good to escape them... but if it's meant to be markdown, then it'll get trashed.
|
||||
*
|
||||
* This method works by looking for character patterns that are unlikely to occur outside markdown - but it's still only unlikely
|
||||
*
|
||||
* @param content
|
||||
* @return
|
||||
*/
|
||||
// todo: dialect dependency?
|
||||
public boolean isProbablyMarkdown(String content, boolean mdIfParagrapghs) {
|
||||
if (mdIfParagrapghs && content.contains("\n")) {
|
||||
return true;
|
||||
}
|
||||
String[] lines = content.split("\\r?\\n");
|
||||
for (String s : lines) {
|
||||
if (s.startsWith("* ") || isHeading(s) || s.startsWith("1. ") || s.startsWith(" ")) {
|
||||
return true;
|
||||
}
|
||||
if (s.contains("```") || s.contains("~~~") || s.contains("[[[")) {
|
||||
return true;
|
||||
}
|
||||
if (hasLink(s)) {
|
||||
return true;
|
||||
}
|
||||
if (hasTextSpecial(s, '*') || hasTextSpecial(s, '_') ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean isHeading(String s) {
|
||||
if (s.length() > 7 && s.startsWith("###### ") && !Character.isWhitespace(s.charAt(7))) {
|
||||
return true;
|
||||
}
|
||||
if (s.length() > 6 && s.startsWith("##### ") && !Character.isWhitespace(s.charAt(6))) {
|
||||
return true;
|
||||
}
|
||||
if (s.length() > 5 && s.startsWith("#### ") && !Character.isWhitespace(s.charAt(5))) {
|
||||
return true;
|
||||
}
|
||||
if (s.length() > 4 && s.startsWith("### ") && !Character.isWhitespace(s.charAt(4))) {
|
||||
return true;
|
||||
}
|
||||
if (s.length() > 3 && s.startsWith("## ") && !Character.isWhitespace(s.charAt(3))) {
|
||||
return true;
|
||||
}
|
||||
//
|
||||
// not sure about this one. # [string] is something that could easily arise in non-markdown,
|
||||
// so this appearing isn't enough to call it markdown
|
||||
//
|
||||
// if (s.length() > 2 && s.startsWith("# ") && !Character.isWhitespace(s.charAt(2))) {
|
||||
// return true;
|
||||
// }
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private boolean hasLink(String s) {
|
||||
int left = -1;
|
||||
int mid = -1;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char c = s.charAt(i);
|
||||
if (c == '[') {
|
||||
mid = -1;
|
||||
left = i;
|
||||
} else if (left > -1 && i < s.length()-1 && c == ']' && s.charAt(i+1) == '(') {
|
||||
mid = i;
|
||||
} else if (left > -1 && c == ']') {
|
||||
left = -1;
|
||||
} else if (left > -1 && mid > -1 && c == ')') {
|
||||
return true;
|
||||
} else if (mid > -1 && c == '[' || c == ']' || (c == '(' && i > mid+1)) {
|
||||
left = -1;
|
||||
mid = -1;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private boolean hasTextSpecial(String s, char c) {
|
||||
boolean second = false;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char prev = i == 0 ? ' ' : s.charAt(i-1);
|
||||
char next = i < s.length() - 1 ? s.charAt(i+1) : ' ';
|
||||
if (s.charAt(i) != c) {
|
||||
// nothing
|
||||
} else if (second) {
|
||||
if (Character.isWhitespace(next) && (isPunctation(prev) || Character.isLetterOrDigit(prev))) {
|
||||
return true;
|
||||
}
|
||||
second = false;
|
||||
} else {
|
||||
if (Character.isWhitespace(prev) && (isPunctation(next) || Character.isLetterOrDigit(next))) {
|
||||
second = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private boolean isPunctation(char ch) {
|
||||
return Utilities.existsInList(ch, '.', ',', '!', '?');
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This deals with a painful problem created by the intersection of previous publishing processes
|
||||
* and the way commonmark specifies that < is handled in content. For control reasons, the FHIR specification does
|
||||
|
|
|
@ -2,15 +2,72 @@ package org.hl7.fhir.utilities;
|
|||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import org.hl7.fhir.utilities.MarkDownProcessor.Dialect;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class MarkdownTests {
|
||||
|
||||
@Test
|
||||
void testMarkdownDetection() {
|
||||
testMarkdown("this is a test string", false);
|
||||
testMarkdown("this is a \r\ntest string", false);
|
||||
testMarkdown("this is a \r\ntest string", true, true);
|
||||
testMarkdown("this is a t*est strin*g", false);
|
||||
testMarkdown("this is a *test strin*g", false);
|
||||
testMarkdown("this is a *test string*", true);
|
||||
testMarkdown("this is a *test *string", false);
|
||||
testMarkdown("this is a *test* string", true);
|
||||
testMarkdown("this [is] a test string", false);
|
||||
testMarkdown("this [is](link) a test string", true);
|
||||
testMarkdown("this [is](link a test string", false);
|
||||
testMarkdown("this [is] (link) a test string", false);
|
||||
testMarkdown("this [is(link)] a test string", false);
|
||||
testMarkdown("this [is](link a test string", false);
|
||||
testMarkdown("this [i]s] (link) a test string", false);
|
||||
testMarkdown("## heading", true);
|
||||
testMarkdown("# heading", false);
|
||||
testMarkdown("## heading", false);
|
||||
testMarkdown("###", false);
|
||||
}
|
||||
|
||||
private void testMarkdown(String content, boolean isMD) {
|
||||
testMarkdown(content, isMD, false);
|
||||
}
|
||||
|
||||
private void testMarkdown(String content, boolean isMD, boolean ifLines) {
|
||||
boolean test = new MarkDownProcessor(Dialect.COMMON_MARK).isProbablyMarkdown(content, ifLines);
|
||||
assertEquals(isMD, test);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testStringToMarkdown() {
|
||||
// first, we test the need for replacing
|
||||
Assertions.assertEquals("<p>This is a string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is a string", null).trim());
|
||||
Assertions.assertEquals("<p>This is *a string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is *a string", null).trim());
|
||||
Assertions.assertNotEquals("<p>This is *a* string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is *a* string", null).trim());
|
||||
Assertions.assertEquals("<p>This is *a *string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is *a *string", null).trim());
|
||||
|
||||
Assertions.assertNotEquals("<p>This genomic study analyzes CYP2D6*1 and CYP2D6*2</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This genomic study analyzes CYP2D6*1 and CYP2D6*2", null).trim());
|
||||
Assertions.assertEquals("<p>This genomic study analyzes CYP2D6*1 and CYP2D6*2</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This genomic study analyzes CYP2D6*1 and CYP2D6\\*2", null).trim());
|
||||
|
||||
|
||||
Assertions.assertEquals("This is \\*a test\\*", MarkDownProcessor.makeStringSafeAsMarkdown("This is *a test*"));
|
||||
Assertions.assertEquals("This is *a test*", MarkDownProcessor.makeMarkdownForString("This is \\*a test\\*"));
|
||||
}
|
||||
|
||||
}
|
||||
//
|
||||
//case '*':
|
||||
//case '&':
|
||||
//case '#':
|
||||
//case '[':
|
||||
//case '>':
|
||||
//case '<':
|
||||
//case '`':
|
||||
// -
|
||||
// |
|
||||
// :
|
||||
// ~
|
||||
// ^
|
||||
// =
|
Loading…
Reference in New Issue