More markdown detection work

2023-02-21 16:52:32 +11:00 · 2023-02-21 16:52:32 +11:00 · 1eeee429ce
parent bae27ef706
commit 1eeee429ce
2 changed files with 176 additions and 0 deletions
--- a/org.hl7.fhir.utilities/src/main/java/org/hl7/fhir/utilities/MarkDownProcessor.java
+++ b/org.hl7.fhir.utilities/src/main/java/org/hl7/fhir/utilities/MarkDownProcessor.java
@ -69,6 +69,125 @@ public class MarkDownProcessor {
    }
  }
  /**
   * Guesses whether a string is intended to be processed as markdown.
   * 
   * This is only a guess, based on textual analysis of the content. 
   * 
   * Uses of this routine:
   *   In general, the main use of this is to decide whether to escape the string, so that erroneous markdown 
   *   processing doesn't munge characters:
   *   - if it's a plain string, and it's being put into something that's markdown, then you should escape the content
   *   - if it may be markdown, but you're not sure whether to process it as markdown, this helps to decide
   *   
   * The underlying problem is that markdown processing plain strings is problematic because some technical characters might 
   * get lost. So it's good to escape them... but if it's meant to be markdown, then it'll get trashed. 
   * 
   * This method works by looking, line by line, for character patterns that are unlikely to occur outside markdown - 
   * but they are still only unlikely, so both false positives and false negatives are possible. A line counts as 
   * markdown if it:
   *   - starts with "* ", "1. " or four spaces, or is a level 2-6 heading (see isHeading)
   *   - contains "```", "~~~" or "[[["
   *   - contains something shaped like an inline link (see hasLink)
   *   - contains a word or phrase wrapped in '*' or '_' (see hasTextSpecial)
   *  
   * @param content the text to examine; null is treated as "not markdown"
   * @param mdIfParagrapghs if true, any content that contains a line feed is reported as markdown without further analysis
   * @return true if the content looks like markdown, false otherwise
   */
  // todo: dialect dependency?
  public boolean isProbablyMarkdown(String content, boolean mdIfParagrapghs) {
    // nothing to analyse - and certainly nothing that needs markdown processing
    if (content == null) {
      return false;
    }
    if (mdIfParagrapghs && content.contains("\n")) {
      return true;
    }
    for (String line : content.split("\\r?\\n")) {
      // block level constructs: bullet, heading, numbered list, indented code
      boolean blockMarker = line.startsWith("* ") || isHeading(line) || line.startsWith("1. ") || line.startsWith("    ");
      // fences and triple-bracket markers anywhere in the line
      boolean fenceMarker = line.contains("```") || line.contains("~~~") || line.contains("[[[");
      if (blockMarker || fenceMarker || hasLink(line)) {
        return true;
      }
      // inline emphasis
      if (hasTextSpecial(line, '*') || hasTextSpecial(line, '_')) {
        return true;
      }
    }
    return false;
  }
  /**
   * Checks whether a line looks like a level 2 to level 6 heading: two to six '#' characters 
   * at the very start of the line, followed by exactly one space and then a non-whitespace character.
   * 
   * A single leading '#' is deliberately not accepted: "# [string]" is something that could easily 
   * arise in non-markdown, so that appearing isn't enough to call it markdown.
   * 
   * @param s a single line of text
   * @return true if the line has the shape of a level 2-6 heading
   */
  private boolean isHeading(String s) {
    // count the run of '#' that opens the line
    int hashes = 0;
    while (hashes < s.length() && s.charAt(hashes) == '#') {
      hashes++;
    }
    if (hashes < 2 || hashes > 6) {
      return false;
    }
    // the character after the run must be a plain space, and the one after that must start the heading text
    int textStart = hashes + 1;
    return s.length() > textStart
        && s.charAt(hashes) == ' '
        && !Character.isWhitespace(s.charAt(textStart));
  }
  /**
   * Checks whether a line contains something shaped like an inline markdown link: "[text](target)".
   * 
   * The scan tracks the most recent '[' and the "](" that closes the link text; a ')' seen while both 
   * are known completes the link. The "](" must be adjacent - "[text] (target)" is not a link.
   * 
   * Parentheses inside the link text are allowed ("[spec (draft)](url)"). A '(' inside the target 
   * abandons the candidate, so "[a](b(c))" is not reported as a link.
   * 
   * @param s a single line of text
   * @return true if the line contains a "[...](...)" sequence
   */
  private boolean hasLink(String s) {
    int left = -1; // index of the '[' that opened the current candidate, or -1
    int mid = -1;  // index of the ']' of the "](" that closed the link text, or -1
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      if (c == '[') {
        // a new '[' always starts a fresh candidate
        left = i;
        mid = -1;
      } else if (c == ']') {
        if (left > -1 && i < s.length() - 1 && s.charAt(i + 1) == '(') {
          mid = i;
        } else {
          // a ']' that is not immediately followed by '(' ends the candidate
          left = -1;
          mid = -1;
        }
      } else if (c == ')' && left > -1 && mid > -1) {
        return true;
      } else if (c == '(' && mid > -1 && i > mid + 1) {
        // a second '(' inside the target: give up on this candidate.
        // This only applies once "](" has been seen, so a '(' in the link text does not discard the '['
        left = -1;
        mid = -1;
      }
    }
    return false;
  }
  /**
   * Checks whether a line contains a run of text wrapped in the given marker character, 
   * the way markdown emphasis is written (e.g. "*text*" or "_text_").
   * 
   * An occurrence of the marker opens a run when it is preceded by whitespace (or is at the start of the 
   * line) and followed by a letter, digit or punctuation mark. The next occurrence of the marker closes 
   * the run when it is preceded by a letter, digit or punctuation mark and followed by whitespace (or is 
   * at the end of the line). If that next occurrence does not close the run, the run is abandoned and the 
   * scan continues with the following occurrence.
   * 
   * @param s a single line of text
   * @param c the marker character to look for
   * @return true if an opening marker is followed by a matching closing marker
   */
  private boolean hasTextSpecial(String s, char c) {
    boolean open = false;
    int last = s.length() - 1;
    for (int pos = 0; pos <= last; pos++) {
      if (s.charAt(pos) != c) {
        continue;
      }
      // the line boundaries count as whitespace
      char before = pos > 0 ? s.charAt(pos - 1) : ' ';
      char after = pos < last ? s.charAt(pos + 1) : ' ';
      if (open) {
        if (Character.isWhitespace(after) && (isPunctation(before) || Character.isLetterOrDigit(before))) {
          return true;
        }
        open = false;
      } else {
        open = Character.isWhitespace(before) && (isPunctation(after) || Character.isLetterOrDigit(after));
      }
    }
    return false;
  }
  /**
   * Reports whether a character counts as punctuation for the emphasis check in hasTextSpecial.
   * 
   * Delegates to Utilities.existsInList with the candidates '.', ',', '!' and '?' - presumably a 
   * simple membership test; confirm against Utilities.
   * 
   * @param ch the character to test
   * @return whatever Utilities.existsInList reports for ch against the four candidates
   */
  private boolean isPunctation(char ch) {
    final boolean listed = Utilities.existsInList(ch, '.', ',', '!', '?');
    return listed;
  }
  /**
   * This deals with a painful problem created by the intersection of previous publishing processes 
   * and the way commonmark specifies that < is handled in content. For control reasons, the FHIR specification does 
--- a/org.hl7.fhir.utilities/src/test/java/org/hl7/fhir/utilities/MarkdownTests.java
+++ b/org.hl7.fhir.utilities/src/test/java/org/hl7/fhir/utilities/MarkdownTests.java
@ -2,15 +2,72 @@ package org.hl7.fhir.utilities;
 import static org.junit.jupiter.api.Assertions.*;
 import org.hl7.fhir.utilities.MarkDownProcessor.Dialect;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 /**
  * Tests for the markdown detection heuristic and the markdown/string escaping helpers 
  * of MarkDownProcessor.
  */
 class MarkdownTests {

  /** Exercises isProbablyMarkdown with plain text, emphasis, link and heading shaped inputs. */
  @Test
  void testMarkdownDetection() {
    // plain text and line breaks
    testMarkdown("this is a test string", false);
    testMarkdown("this is a \r\ntest string", false);
    testMarkdown("this is a \r\ntest string", true, true);

    // emphasis markers
    testMarkdown("this is a t*est strin*g", false);
    testMarkdown("this is a *test strin*g", false);
    testMarkdown("this is a *test string*", true);
    testMarkdown("this is a *test *string", false);
    testMarkdown("this is a *test* string", true);

    // links
    testMarkdown("this [is] a test string", false);
    testMarkdown("this [is](link) a test string", true);
    testMarkdown("this [is](link a test string", false);
    testMarkdown("this [is] (link) a test string", false);
    testMarkdown("this [is(link)] a test string", false);
    testMarkdown("this [is](link a test string", false);
    testMarkdown("this [i]s] (link) a test string", false);

    // headings
    testMarkdown("## heading", true);
    testMarkdown("# heading", false);
    testMarkdown("##  heading", false);
    testMarkdown("###", false);
  }

  /** Checks detection of the content without the "line breaks imply markdown" option. */
  private void testMarkdown(String content, boolean isMD) {
    testMarkdown(content, isMD, false);
  }

  /** Checks that isProbablyMarkdown(content, ifLines) on a COMMON_MARK processor returns isMD. */
  private void testMarkdown(String content, boolean isMD, boolean ifLines) {
    MarkDownProcessor processor = new MarkDownProcessor(Dialect.COMMON_MARK);
    boolean detected = processor.isProbablyMarkdown(content, ifLines);
    assertEquals(isMD, detected);
  }

  /** Shows why plain strings need escaping before markdown processing, then checks the escaping helpers. */
  @Test
  void testStringToMarkdown() {
    // first, we test the need for replacing
    Assertions.assertEquals("<p>This is a string</p>", render("This is a string"));
    Assertions.assertEquals("<p>This is *a string</p>", render("This is *a string"));
    Assertions.assertNotEquals("<p>This is *a* string</p>", render("This is *a* string"));
    Assertions.assertEquals("<p>This is *a *string</p>", render("This is *a *string"));
    Assertions.assertNotEquals("<p>This genomic study analyzes CYP2D6*1 and CYP2D6*2</p>", render("This genomic study analyzes CYP2D6*1 and CYP2D6*2"));
    Assertions.assertEquals("<p>This genomic study analyzes CYP2D6*1 and CYP2D6*2</p>", render("This genomic study analyzes CYP2D6*1 and CYP2D6\\*2"));

    Assertions.assertEquals("This is \\*a test\\*", MarkDownProcessor.makeStringSafeAsMarkdown("This is *a test*"));
    Assertions.assertEquals("This is *a test*", MarkDownProcessor.makeMarkdownForString("This is \\*a test\\*"));
  }

  /** Runs the source through a fresh COMMON_MARK processor (null second argument) and trims the output. */
  private String render(String source) {
    return new MarkDownProcessor(Dialect.COMMON_MARK).process(source, null).trim();
  }
 }
 //
 //case '*':
 //case '&':
 //case '#':
 //case '[':
 //case '>':
 //case '<':
 //case '`':
 //  -
 //  |
 //  :
 //  ~
 //  ^
 //  =