SOLR-42: additional tweaks to get highlighting to work properly (contributed by Grant Ingersoll)

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@610526 13f79535-47bb-0310-9956-ffa450edef68
2008-01-09 19:41:19 +00:00 · 2008-01-09 19:41:19 +00:00 · 981d977eef
parent fffa39127c
commit 981d977eef
2 changed files with 244 additions and 126 deletions
--- a/src/java/org/apache/solr/analysis/HTMLStripReader.java
+++ b/src/java/org/apache/solr/analysis/HTMLStripReader.java
@ -24,7 +24,6 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.HashMap;
 import java.util.Set;
-import java.util.Collections;

 /**
 * A Reader that wraps another reader and attempts to strip out HTML constructs.
@ -35,20 +34,22 @@ import java.util.Collections;

 public class HTMLStripReader extends Reader {
  private final Reader in;
-  private final int READAHEAD=4096;
+  private int readAheadLimit = DEFAULT_READ_AHEAD;
+  private int readAheadLimitMinus1 = readAheadLimit -1;
  private int numWhitespace = 0;
  private int numRead = 0;
+  private int lastMark;
  private Set<String> escapedTags;

  // pushback buffer
  private final StringBuilder pushed = new StringBuilder();
-
  private static final int EOF=-1;
  private static final int MISMATCH=-2;
-  private static final int MATCH=-3;

+  private static final int MATCH=-3;
  // temporary buffer
  private final StringBuilder sb = new StringBuilder();
+  public static final int DEFAULT_READ_AHEAD = 8192;


  public static void main(String[] args) throws IOException {
@ -68,6 +69,16 @@ public class HTMLStripReader extends Reader {
    this.escapedTags = escapedTags;
  }

+  public HTMLStripReader(Reader source, Set<String> escapedTags, int readAheadLimit){
+    this(source);
+    this.escapedTags = escapedTags;
+    this.readAheadLimit = readAheadLimit;
+    readAheadLimitMinus1 = readAheadLimit - 1;
+  }
+
+  public int getReadAheadLimit() {
+    return readAheadLimit;
+  }

  private int next() throws IOException {
    int len = pushed.length();
@ -150,8 +161,10 @@ public class HTMLStripReader extends Reader {
    // return isAlpha(ch) || ch=='_' || Character.isLetter(ch);
  }

+
  private void saveState() throws IOException {
-    in.mark(READAHEAD);
+    lastMark = numRead;
+    in.mark(readAheadLimit);
  }

  private void restoreState() throws IOException {
@ -199,15 +212,21 @@ public class HTMLStripReader extends Reader {
    // In older HTML, an entity may not have always been terminated
    // with a semicolon.  We'll also treat EOF or whitespace as terminating
    // the entity.
-    if (ch==';' || ch==-1) {
-      return Integer.parseInt(sb.toString(), base);
-    }
+    try {
+      if (ch==';' || ch==-1) {
+        numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
+        return Integer.parseInt(sb.toString(), base);
+      }

-    // if whitespace terminated the entity, we need to return
-    // that whitespace on the next call to read().
-    if (isSpace(ch)) {
-      push(ch);
-      return Integer.parseInt(sb.toString(), base);
+      // if whitespace terminated the entity, we need to return
+      // that whitespace on the next call to read().
+      if (isSpace(ch)) {
+        push(ch);
+        numWhitespace = sb.length() + 2;// + 2 accounts for &, #, and ;, then, take away 1 for the fact that we do output a char
+        return Integer.parseInt(sb.toString(), base);
+      }
+    } catch (NumberFormatException e) {
+      return MISMATCH;
    }

    // Not an entity...
@ -230,7 +249,7 @@ public class HTMLStripReader extends Reader {
    sb.setLength(0);
    sb.append((char)ch);

-    for (int i=0; i<READAHEAD; i++) {
+    for (int i=0; i< readAheadLimitMinus1; i++) {
      ch=next();
      if (Character.isLetter(ch)) {
        sb.append((char)ch);
@ -243,6 +262,7 @@ public class HTMLStripReader extends Reader {
      String entity=sb.toString();
      Character entityChar = entityTable.get(entity);
      if (entityChar!=null) {
+        numWhitespace = entity.length() + 1 ;
        return entityChar.charValue();
      }
    }
@ -274,24 +294,29 @@ public class HTMLStripReader extends Reader {

  private int readBang(boolean inScript) throws IOException {
    // at this point, "<!" has been read
-
    int ret = readComment(inScript);
    if (ret==MATCH) return MATCH;

-    int ch = next();
-    if (ch=='>') return MATCH;
+    if ((numRead - lastMark) < readAheadLimitMinus1 || peek() == '>' ) {

-    // if it starts with <! and isn't a comment,
-    // simply read until ">"
-    while (true) {
-      ch = next();
-      if (ch=='>') {
-        return MATCH;
-      }
-      else if (ch<0) {
-        return MISMATCH;
+      int ch = next();
+      if (ch=='>') return MATCH;
+
+      // if it starts with <! and isn't a comment,
+      // simply read until ">"
+      //since we did readComment already, it may be the case that we are already deep into the read ahead buffer
+      //so, we may need to abort sooner
+      while ((numRead - lastMark) < readAheadLimitMinus1) {
+        ch = next();
+        if (ch=='>') {
+          return MATCH;
+        }
+        else if (ch<0) {
+          return MISMATCH;
+        }
      }
    }
+    return MISMATCH;
  }

  // tries to read comments the way browsers do, not
@ -317,8 +342,8 @@ public class HTMLStripReader extends Reader {
      push('-');
      return MISMATCH;
    }
-
-    while (true) {
+    /*two extra calls to next() here, so make sure we don't read past our mark*/
+    while ((numRead - lastMark) < readAheadLimitMinus1 -3 ) {
      ch = next();
      if (ch<0) return MISMATCH;
      if (ch=='-') {
@ -347,7 +372,10 @@ public class HTMLStripReader extends Reader {
      } else if (ch=='<') {
        eatSSI();
      }
+
    }
+    return MISMATCH;
+
  }


@ -362,8 +390,8 @@ public class HTMLStripReader extends Reader {

    sb.setLength(0);
    sb.append((char)ch);
+    while((numRead - lastMark) < readAheadLimitMinus1) {

-    while(true) {
      ch = next();
      if (isIdChar(ch)) {
        sb.append((char)ch);
@ -387,7 +415,7 @@ public class HTMLStripReader extends Reader {

    if (ch!='>') {
      // process attributes
-      while (true) {
+      while ((numRead - lastMark) < readAheadLimitMinus1) {
        ch=next();
        if (isSpace(ch)) {
          continue;
@ -403,6 +431,10 @@ public class HTMLStripReader extends Reader {
        } else {
          return MISMATCH;
        }
+
+      }
+      if ((numRead - lastMark) >= readAheadLimitMinus1){
+        return MISMATCH;//exit out if we exceeded the buffer
      }
    }

@ -441,7 +473,8 @@ public class HTMLStripReader extends Reader {
  // beware markup in script strings: </script>...document.write("</script>")foo</script>
  // TODO: do I need to worry about CDATA sections "<![CDATA["  ?
  int findEndTag() throws IOException {
-    while (true) {
+
+    while ((numRead - lastMark) < readAheadLimitMinus1) {
      int ch = next();
      if (ch=='<') {
        ch = next();
@ -474,7 +507,9 @@ public class HTMLStripReader extends Reader {
      } else if (ch<0) {
        return MISMATCH;
      }
+
    }
+    return MISMATCH;
  }


@ -482,7 +517,8 @@ public class HTMLStripReader extends Reader {
  private int readScriptString() throws IOException {
    int quoteChar = next();
    if (quoteChar!='\'' && quoteChar!='"') return MISMATCH;
-    while(true) {
+
+    while((numRead - lastMark) < readAheadLimitMinus1) {
      int ch = next();
      if (ch==quoteChar) return MATCH;
      else if (ch=='\\') {
@ -492,7 +528,9 @@ public class HTMLStripReader extends Reader {
      } else if (ch=='<') {
        eatSSI();
      }
+
    }
+    return MISMATCH;
  }


@ -526,105 +564,63 @@ public class HTMLStripReader extends Reader {
  <td id=msviGlobalToolbar height="22" nowrap align=left>

  ***/
-  private int readAttr() throws IOException {
-    int ch = read();
-    if (!isFirstIdChar(ch)) return MISMATCH;
-    ch = read();
-    while(isIdChar(ch)) ch=read();
-    if (isSpace(ch)) ch = nextSkipWS();
-
-    // attributes may not have a value at all!
-    // if (ch != '=') return MISMATCH;
-    if (ch != '=') {
-      push(ch);
-      return MATCH;
-    }
-
-    int quoteChar = nextSkipWS();
-
-    if (quoteChar=='"' || quoteChar=='\'') {
-      // TODO: should I set a max size to try and find the other
-      // quote?  Otherwise, I may read to much to restore
-      // the stream.
-      while (true) {
-        ch = next();
-        if (ch<0) return MISMATCH;
-        else if (ch==quoteChar) {
-          return MATCH;
-        //} else if (ch=='<') {
-        //  return MISMATCH;
-        }
-      }
-    } else {
-      // unquoted attribute
-      while (true) {
-        ch = next();
-        if (ch<0) return MISMATCH;
-        else if (isSpace(ch)) {
-          push(ch);
-          return MATCH;
-        } else if (ch=='>') {
-          push(ch);
-          return MATCH;
-        }
-      }
-    }
-
-  }

    // This reads attributes and attempts to handle any
    // embedded server side includes that would otherwise
    // mess up the quote handling.
    //  <a href="a/<!--#echo "path"-->">
    private int readAttr2() throws IOException {
-    int ch = read();
-    if (!isFirstIdChar(ch)) return MISMATCH;
-    ch = read();
-    while(isIdChar(ch)) ch=read();
-    if (isSpace(ch)) ch = nextSkipWS();
-
-    // attributes may not have a value at all!
-    // if (ch != '=') return MISMATCH;
-    if (ch != '=') {
-      push(ch);
-      return MATCH;
-    }
-
-    int quoteChar = nextSkipWS();
-
-    if (quoteChar=='"' || quoteChar=='\'') {
-      // TODO: should I set a max size to try and find the other
-      // quote?  Otherwise, I may read to much to restore
-      // the stream.
-      while (true) {
-        ch = next();
-        if (ch<0) return MISMATCH;
-        else if (ch=='<') {
-          eatSSI();
-        }
-        else if (ch==quoteChar) {
-          return MATCH;
-        //} else if (ch=='<') {
-        //  return MISMATCH;
-        }
+    if ((numRead - lastMark < readAheadLimitMinus1)) {
+      int ch = read();
+      if (!isFirstIdChar(ch)) return MISMATCH;
+      ch = read();
+      while(isIdChar(ch) && ((numRead - lastMark) < readAheadLimitMinus1 - 1)){
+        ch=read();
      }
-    } else {
-      // unquoted attribute
-      while (true) {
-        ch = next();
-        if (ch<0) return MISMATCH;
-        else if (isSpace(ch)) {
-          push(ch);
-          return MATCH;
-        } else if (ch=='>') {
-          push(ch);
-          return MATCH;
-        } else if (ch=='<') {
-          eatSSI();
+      if (isSpace(ch)) ch = nextSkipWS();
+
+      // attributes may not have a value at all!
+      // if (ch != '=') return MISMATCH;
+      if (ch != '=') {
+        push(ch);
+        return MATCH;
+      }
+
+      int quoteChar = nextSkipWS();
+
+      if (quoteChar=='"' || quoteChar=='\'') {
+        while ((numRead - lastMark) < readAheadLimitMinus1) {
+          ch = next();
+          if (ch<0) return MISMATCH;
+          else if (ch=='<') {
+            eatSSI();
+          }
+          else if (ch==quoteChar) {
+            return MATCH;
+          //} else if (ch=='<') {
+          //  return MISMATCH;
+          }
+
+        }
+      } else {
+        // unquoted attribute
+        while ((numRead - lastMark) < readAheadLimitMinus1) {
+          ch = next();
+          if (ch<0) return MISMATCH;
+          else if (isSpace(ch)) {
+            push(ch);
+            return MATCH;
+          } else if (ch=='>') {
+            push(ch);
+            return MATCH;
+          } else if (ch=='<') {
+            eatSSI();
+          }
+
        }
      }
    }
-
+    return MISMATCH;
  }

  // skip past server side include
@ -659,8 +655,7 @@ public class HTMLStripReader extends Reader {

  private int readProcessingInstruction() throws IOException {
    // "<?" has already been read
-
-    while (true) {
+    while ((numRead - lastMark) < readAheadLimitMinus1) {
      int ch = next();
      if (ch=='?' && peek()=='>') {
        next();
@ -668,7 +663,9 @@ public class HTMLStripReader extends Reader {
      } else if (ch==-1) {
        return MISMATCH;
      }
+
    }
+    return MISMATCH;
  }


@ -681,7 +678,7 @@ public class HTMLStripReader extends Reader {
      numWhitespace--;
      return ' ';
    }
-
+    //do not limit this one by the READAHEAD
    while(true) {
      int lastNumRead = numRead;
      int ch = next();
--- a/src/test/org/apache/solr/analysis/HTMLStripReaderTest.java
+++ b/src/test/org/apache/solr/analysis/HTMLStripReaderTest.java
@ -23,6 +23,7 @@ import java.io.IOException;
 import java.io.FileReader;
 import java.io.File;
 import java.io.Reader;
+import java.io.BufferedReader;
 import java.util.Set;
 import java.util.HashSet;
 import java.util.Collections;
@ -48,7 +49,7 @@ public class HTMLStripReaderTest extends TestCase {
            "This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is a comment -->";
    String gold = "                 this is some text       here is a                link     and " +
            "another                                     link    . " +
-            "This is an entity: & plus a <.  Here is an &.                      ";
+            "This is an entity: &     plus a <   .  Here is an &.                      ";
    HTMLStripReader reader = new HTMLStripReader(new StringReader(html));
    StringBuilder builder = new StringBuilder();
    int ch = -1;
@ -57,7 +58,8 @@ public class HTMLStripReaderTest extends TestCase {
    while ((ch = reader.read()) != -1){
      char theChar = (char) ch;
      builder.append(theChar);
-      assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position] + " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
+      assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position]
+              + " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
      position++;
    }
    assertTrue(gold + " is not equal to " + builder.toString(), gold.equals(builder.toString()) == true);
@ -81,6 +83,57 @@ public class HTMLStripReaderTest extends TestCase {
    
  }

+  public void testGamma() throws Exception {
+    String test = "&Gamma;";
+    String gold = "\u0393      ";
+    Set<String> set = new HashSet<String>();
+    set.add("reserved");
+    Reader reader = new HTMLStripReader(new StringReader(test), set);
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    String result = builder.toString();
+    System.out.println("Resu: " + result + "<EOL>");
+    System.out.println("Gold: " + gold + "<EOL>");
+    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+  }
+
+  public void testEntities() throws Exception {
+    String test = "&nbsp; &lt;foo&gt; &#61; &Gamma; bar &#x393;";
+    String gold = "       <   foo>    =     \u0393       bar \u0393     ";
+    Set<String> set = new HashSet<String>();
+    set.add("reserved");
+    Reader reader = new HTMLStripReader(new StringReader(test), set);
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    String result = builder.toString();
+    System.out.println("Resu: " + result + "<EOL>");
+    System.out.println("Gold: " + gold + "<EOL>");
+    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+  }
+
+  public void testMoreEntities() throws Exception {
+    String test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
+    String gold = "       <   junk/>           !     @     and ’      ";
+    Set<String> set = new HashSet<String>();
+    set.add("reserved");
+    Reader reader = new HTMLStripReader(new StringReader(test), set);
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    String result = builder.toString();
+    System.out.println("Resu: " + result + "<EOL>");
+    System.out.println("Gold: " + gold + "<EOL>");
+    assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
+  }
+
  public void testReserved() throws Exception {
    String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
    Set<String> set = new HashSet<String>();
@ -155,4 +208,72 @@ public class HTMLStripReaderTest extends TestCase {
      i++;
    }
  }
+
+  public void testBufferOverflow() throws Exception {
+    StringBuilder testBuilder = new StringBuilder(HTMLStripReader.DEFAULT_READ_AHEAD + 50);
+    testBuilder.append("ah<?> ");
+    appendChars(testBuilder, HTMLStripReader.DEFAULT_READ_AHEAD + 500);
+    processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
+
+    testBuilder.setLength(0);
+    testBuilder.append("<!--");//comments
+    appendChars(testBuilder, 3*HTMLStripReader.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
+
+    testBuilder.append("-->foo");
+    processBuffer(testBuilder.toString(), "Failed w/ comment");
+
+    testBuilder.setLength(0);
+    testBuilder.append("<?");
+    appendChars(testBuilder, HTMLStripReader.DEFAULT_READ_AHEAD + 500);
+    testBuilder.append("?>");
+    processBuffer(testBuilder.toString(), "Failed with proc. instr.");
+    
+    testBuilder.setLength(0);
+    testBuilder.append("<b ");
+    appendChars(testBuilder, HTMLStripReader.DEFAULT_READ_AHEAD + 500);
+    testBuilder.append("/>");
+    processBuffer(testBuilder.toString(), "Failed on tag");
+
+  }
+
+  private void appendChars(StringBuilder testBuilder, int numChars) {
+    int i1 = numChars / 2;
+    for (int i = 0; i < i1; i++){
+      testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes HTMLStripReader think it is a processing instruction
+    }
+  }  
+
+
+  private void processBuffer(String test, String assertMsg) throws IOException {
+    System.out.println("-------------------processBuffer----------");
+    Reader reader = new HTMLStripReader(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
+    }
+    assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
+  }
+
+  public void testComment() throws Exception {
+
+    String test = "<!--- three dashes, still a valid comment ---> ";
+    String gold = "                                               ";
+    Reader reader = new HTMLStripReader(new BufferedReader(new StringReader(test)));//force the use of BufferedReader
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      System.out.println("String: " + builder.toString());
+    }
+    assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
+  }
+
 }