More markdown detection work
This commit is contained in:
parent
bae27ef706
commit
1eeee429ce
|
@ -69,6 +69,125 @@ public class MarkDownProcessor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if this is intended to be processed as markdown
|
||||||
|
*
|
||||||
|
* This is a guess, based on textual analysis of the content.
|
||||||
|
*
|
||||||
|
* Uses of this routine:
|
||||||
|
* In general, the main use of this is to decide to escape the string so erroneous markdown processing doesn't munge characters
|
||||||
|
* If it's a plain string, and it's being put into something that's markdown, then you should escape the content
|
||||||
|
* If it's markdown, but you're not sure whether to process it as markdown
|
||||||
|
*
|
||||||
|
* The underlying problem is that markdown processing plain strings is problematic because some technical characters might
|
||||||
|
* get lost. So it's good to escape them... but if it's meant to be markdown, then it'll get trashed.
|
||||||
|
*
|
||||||
|
* This method works by looking for character patterns that are unlikely to occur outside markdown - but it's still only unlikely
|
||||||
|
*
|
||||||
|
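* @param content the text to analyse
* @param mdIfParagrapghs when true, any content containing a line feed is reported as markdown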
|
||||||
|
* @return true if the content looks like markdown, false otherwise
|
||||||
|
*/
|
||||||
|
// todo: dialect dependency?
|
||||||
|
public boolean isProbablyMarkdown(String content, boolean mdIfParagrapghs) {
|
||||||
|
if (mdIfParagrapghs && content.contains("\n")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
String[] lines = content.split("\\r?\\n");
|
||||||
|
for (String s : lines) {
|
||||||
|
if (s.startsWith("* ") || isHeading(s) || s.startsWith("1. ") || s.startsWith(" ")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (s.contains("```") || s.contains("~~~") || s.contains("[[[")) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (hasLink(s)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (hasTextSpecial(s, '*') || hasTextSpecial(s, '_') ) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isHeading(String s) {
|
||||||
|
if (s.length() > 7 && s.startsWith("###### ") && !Character.isWhitespace(s.charAt(7))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (s.length() > 6 && s.startsWith("##### ") && !Character.isWhitespace(s.charAt(6))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (s.length() > 5 && s.startsWith("#### ") && !Character.isWhitespace(s.charAt(5))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (s.length() > 4 && s.startsWith("### ") && !Character.isWhitespace(s.charAt(4))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (s.length() > 3 && s.startsWith("## ") && !Character.isWhitespace(s.charAt(3))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
//
|
||||||
|
// not sure about this one. # [string] is something that could easily arise in non-markdown,
|
||||||
|
// so this appearing isn't enough to call it markdown
|
||||||
|
//
|
||||||
|
// if (s.length() > 2 && s.startsWith("# ") && !Character.isWhitespace(s.charAt(2))) {
|
||||||
|
// return true;
|
||||||
|
// }
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean hasLink(String s) {
|
||||||
|
int left = -1;
|
||||||
|
int mid = -1;
|
||||||
|
for (int i = 0; i < s.length(); i++) {
|
||||||
|
char c = s.charAt(i);
|
||||||
|
if (c == '[') {
|
||||||
|
mid = -1;
|
||||||
|
left = i;
|
||||||
|
} else if (left > -1 && i < s.length()-1 && c == ']' && s.charAt(i+1) == '(') {
|
||||||
|
mid = i;
|
||||||
|
} else if (left > -1 && c == ']') {
|
||||||
|
left = -1;
|
||||||
|
} else if (left > -1 && mid > -1 && c == ')') {
|
||||||
|
return true;
|
||||||
|
} else if (mid > -1 && c == '[' || c == ']' || (c == '(' && i > mid+1)) {
|
||||||
|
left = -1;
|
||||||
|
mid = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean hasTextSpecial(String s, char c) {
|
||||||
|
boolean second = false;
|
||||||
|
for (int i = 0; i < s.length(); i++) {
|
||||||
|
char prev = i == 0 ? ' ' : s.charAt(i-1);
|
||||||
|
char next = i < s.length() - 1 ? s.charAt(i+1) : ' ';
|
||||||
|
if (s.charAt(i) != c) {
|
||||||
|
// nothing
|
||||||
|
} else if (second) {
|
||||||
|
if (Character.isWhitespace(next) && (isPunctation(prev) || Character.isLetterOrDigit(prev))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
second = false;
|
||||||
|
} else {
|
||||||
|
if (Character.isWhitespace(prev) && (isPunctation(next) || Character.isLetterOrDigit(next))) {
|
||||||
|
second = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Checks a character against the punctuation marks '.', ',', '!' and '?'.
 *
 * Delegates to {@code Utilities.existsInList}; presumably that is a plain
 * membership test over the listed values - confirm against the helper.
 *
 * @param ch the character to check
 * @return the result of {@code Utilities.existsInList} for the four marks above
 */
private boolean isPunctation(char ch) {
  return Utilities.existsInList(ch, '.', ',', '!', '?');
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This deals with a painful problem created by the intersection of previous publishing processes
|
* This deals with a painful problem created by the intersection of previous publishing processes
|
||||||
* and the way commonmark specifies that < is handled in content. For control reasons, the FHIR specification does
|
* and the way commonmark specifies that < is handled in content. For control reasons, the FHIR specification does
|
||||||
|
|
|
@ -2,15 +2,72 @@ package org.hl7.fhir.utilities;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
import org.hl7.fhir.utilities.MarkDownProcessor.Dialect;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
class MarkdownTests {
|
class MarkdownTests {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testMarkdownDetection() {
|
||||||
|
testMarkdown("this is a test string", false);
|
||||||
|
testMarkdown("this is a \r\ntest string", false);
|
||||||
|
testMarkdown("this is a \r\ntest string", true, true);
|
||||||
|
testMarkdown("this is a t*est strin*g", false);
|
||||||
|
testMarkdown("this is a *test strin*g", false);
|
||||||
|
testMarkdown("this is a *test string*", true);
|
||||||
|
testMarkdown("this is a *test *string", false);
|
||||||
|
testMarkdown("this is a *test* string", true);
|
||||||
|
testMarkdown("this [is] a test string", false);
|
||||||
|
testMarkdown("this [is](link) a test string", true);
|
||||||
|
testMarkdown("this [is](link a test string", false);
|
||||||
|
testMarkdown("this [is] (link) a test string", false);
|
||||||
|
testMarkdown("this [is(link)] a test string", false);
|
||||||
|
testMarkdown("this [is](link a test string", false);
|
||||||
|
testMarkdown("this [i]s] (link) a test string", false);
|
||||||
|
testMarkdown("## heading", true);
|
||||||
|
testMarkdown("# heading", false);
|
||||||
|
testMarkdown("## heading", false);
|
||||||
|
testMarkdown("###", false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Checks detection for {@code content} with the "line breaks mean markdown" flag off.
 *
 * @param content the text to run through the detector
 * @param isMD the expected detection result
 */
private void testMarkdown(String content, boolean isMD) {
  // Delegates to the three-argument overload with ifLines = false.
  testMarkdown(content, isMD, false);
}
|
||||||
|
|
||||||
|
private void testMarkdown(String content, boolean isMD, boolean ifLines) {
|
||||||
|
boolean test = new MarkDownProcessor(Dialect.COMMON_MARK).isProbablyMarkdown(content, ifLines);
|
||||||
|
assertEquals(isMD, test);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testStringToMarkdown() {
|
void testStringToMarkdown() {
|
||||||
|
// first, we test the need for replacing
|
||||||
|
Assertions.assertEquals("<p>This is a string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is a string", null).trim());
|
||||||
|
Assertions.assertEquals("<p>This is *a string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is *a string", null).trim());
|
||||||
|
Assertions.assertNotEquals("<p>This is *a* string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is *a* string", null).trim());
|
||||||
|
Assertions.assertEquals("<p>This is *a *string</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This is *a *string", null).trim());
|
||||||
|
|
||||||
|
Assertions.assertNotEquals("<p>This genomic study analyzes CYP2D6*1 and CYP2D6*2</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This genomic study analyzes CYP2D6*1 and CYP2D6*2", null).trim());
|
||||||
|
Assertions.assertEquals("<p>This genomic study analyzes CYP2D6*1 and CYP2D6*2</p>", new MarkDownProcessor(Dialect.COMMON_MARK).process("This genomic study analyzes CYP2D6*1 and CYP2D6\\*2", null).trim());
|
||||||
|
|
||||||
|
|
||||||
Assertions.assertEquals("This is \\*a test\\*", MarkDownProcessor.makeStringSafeAsMarkdown("This is *a test*"));
|
Assertions.assertEquals("This is \\*a test\\*", MarkDownProcessor.makeStringSafeAsMarkdown("This is *a test*"));
|
||||||
Assertions.assertEquals("This is *a test*", MarkDownProcessor.makeMarkdownForString("This is \\*a test\\*"));
|
Assertions.assertEquals("This is *a test*", MarkDownProcessor.makeMarkdownForString("This is \\*a test\\*"));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
//
|
||||||
|
//case '*':
|
||||||
|
//case '&':
|
||||||
|
//case '#':
|
||||||
|
//case '[':
|
||||||
|
//case '>':
|
||||||
|
//case '<':
|
||||||
|
//case '`':
|
||||||
|
// -
|
||||||
|
// |
|
||||||
|
// :
|
||||||
|
// ~
|
||||||
|
// ^
|
||||||
|
// =
|
Loading…
Reference in New Issue