LUCENE-3913: Fix HTMLStripCharFilter invalid final offset for input containing </br>

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304912 13f79535-47bb-0310-9956-ffa450edef68
2012-03-24 20:54:31 +00:00 · 2012-03-24 20:54:31 +00:00 · ada9780484
parent 1f7c31d711
commit ada9780484
5 changed files with 559 additions and 461 deletions
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@ -391,188 +391,194 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      }
-      if (VERBOSE) {
+      checkAnalysisConsistency(random, a, useCharFilter, text);
-        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+    }
  }
  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
    }
    int remainder = random.nextInt(10);
    Reader reader = new StringReader(text);
    TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
    assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
    PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
    PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
    TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
    List<String> tokens = new ArrayList<String>();
    List<String> types = new ArrayList<String>();
    List<Integer> positions = new ArrayList<Integer>();
    List<Integer> positionLengths = new ArrayList<Integer>();
    List<Integer> startOffsets = new ArrayList<Integer>();
    List<Integer> endOffsets = new ArrayList<Integer>();
    ts.reset();
    // First pass: save away "correct" tokens
    while (ts.incrementToken()) {
      tokens.add(termAtt.toString());
      if (typeAtt != null) types.add(typeAtt.type());
      if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
      if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
      if (offsetAtt != null) {
        startOffsets.add(offsetAtt.startOffset());
        endOffsets.add(offsetAtt.endOffset());
      }
    }
    ts.end();
    ts.close();
-      int remainder = random.nextInt(10);
+    // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
-      Reader reader = new StringReader(text);
+    if (!tokens.isEmpty()) {
      TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
      assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
      PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
      PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
      TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
      List<String> tokens = new ArrayList<String>();
      List<String> types = new ArrayList<String>();
      List<Integer> positions = new ArrayList<Integer>();
      List<Integer> positionLengths = new ArrayList<Integer>();
      List<Integer> startOffsets = new ArrayList<Integer>();
      List<Integer> endOffsets = new ArrayList<Integer>();
      ts.reset();
-      // First pass: save away "correct" tokens
+      // KWTokenizer (for example) can produce a token
-      while (ts.incrementToken()) {
+      // even when input is length 0:
-        tokens.add(termAtt.toString());
+      if (text.length() != 0) {
        if (typeAtt != null) types.add(typeAtt.type());
        if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
        if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
        if (offsetAtt != null) {
          startOffsets.add(offsetAtt.startOffset());
          endOffsets.add(offsetAtt.endOffset());
        }
      }
      ts.end();
      ts.close();
-      // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
+        // (Optional) second pass: do something evil:
-      if (!tokens.isEmpty()) {
+        final int evilness = random.nextInt(50);
-
+        if (evilness == 17) {
        // KWTokenizer (for example) can produce a token
        // even when input is length 0:
        if (text.length() != 0) {
          // (Optional) second pass: do something evil:
          final int evilness = random.nextInt(50);
          if (evilness == 17) {
            if (VERBOSE) {
              System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
            }
            // Throw an errant exception from the Reader:
            MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
            evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
            reader = evilReader;
            try {
              // NOTE: some Tokenizers go and read characters
              // when you call .setReader(Reader), eg
              // PatternTokenizer.  This is a bit
              // iffy... (really, they should only
              // pull from the Reader when you call
              // .incremenToken(), I think?), but we
              // currently allow it, so, we must call
              // a.tokenStream inside the try since we may
              // hit the exc on init:
              ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
              ts.reset();
              while (ts.incrementToken());
              fail("did not hit exception");
            } catch (RuntimeException re) {
              assertTrue(MockReaderWrapper.isMyEvilException(re));
            }
            try {
              ts.end();
            } catch (AssertionError ae) {
              // Catch & ignore MockTokenizer's
              // anger...
              if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
                // OK
              } else {
                throw ae;
              }
            }
            ts.close();
          } else if (evilness == 7) {
            // Only consume a subset of the tokens:
            final int numTokensToRead = random.nextInt(tokens.size());
            if (VERBOSE) {
              System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
            }
            reader = new StringReader(text);
            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
            ts.reset();
            for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
              assertTrue(ts.incrementToken());
            }
            try {
              ts.end();
            } catch (AssertionError ae) {
              // Catch & ignore MockTokenizer's
              // anger...
              if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
                // OK
              } else {
                throw ae;
              }
            }
            ts.close();
          }
        }
        // Final pass: verify clean tokenization matches
        // results from first pass:
        if (VERBOSE) {
          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
        }
        reader = new StringReader(text);
        if (random.nextInt(30) == 7) {
          if (VERBOSE) {
-            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
          }
          // Throw an errant exception from the Reader:
          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
          evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
          reader = evilReader;
          try {
            // NOTE: some Tokenizers go and read characters
            // when you call .setReader(Reader), eg
            // PatternTokenizer.  This is a bit
            // iffy... (really, they should only
            // pull from the Reader when you call
            // .incremenToken(), I think?), but we
            // currently allow it, so, we must call
            // a.tokenStream inside the try since we may
            // hit the exc on init:
            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
            ts.reset();
            while (ts.incrementToken());
            fail("did not hit exception");
          } catch (RuntimeException re) {
            assertTrue(MockReaderWrapper.isMyEvilException(re));
          }
          try {
            ts.end();
          } catch (AssertionError ae) {
            // Catch & ignore MockTokenizer's
            // anger...
            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
              // OK
            } else {
              throw ae;
            }
          }
          ts.close();
        } else if (evilness == 7) {
          // Only consume a subset of the tokens:
          final int numTokensToRead = random.nextInt(tokens.size());
          if (VERBOSE) {
            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
          }
-          reader = new MockReaderWrapper(random, reader);
+          reader = new StringReader(text);
          ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
          ts.reset();
          for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
            assertTrue(ts.incrementToken());
          }
          try {
            ts.end();
          } catch (AssertionError ae) {
            // Catch & ignore MockTokenizer's
            // anger...
            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
              // OK
            } else {
              throw ae;
            }
          }
          ts.close();
        }
-        
+      }
-        ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+
-        if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+      // Final pass: verify clean tokenization matches
-          // offset + pos + posLength + type
+      // results from first pass:
-          assertTokenStreamContents(ts, 
+
-            tokens.toArray(new String[tokens.size()]),
+      if (VERBOSE) {
-            toIntArray(startOffsets),
+        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
-            toIntArray(endOffsets),
+      }
-            types.toArray(new String[types.size()]),
+      reader = new StringReader(text);
-            toIntArray(positions),
+
-            toIntArray(positionLengths),
+      if (random.nextInt(30) == 7) {
-            text.length());
+        if (VERBOSE) {
-        } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
+          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
          // offset + pos + type
          assertTokenStreamContents(ts, 
            tokens.toArray(new String[tokens.size()]),
            toIntArray(startOffsets),
            toIntArray(endOffsets),
            types.toArray(new String[types.size()]),
            toIntArray(positions),
            null,
            text.length());
        } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
          // offset + pos + posLength
          assertTokenStreamContents(ts, 
              tokens.toArray(new String[tokens.size()]),
              toIntArray(startOffsets),
              toIntArray(endOffsets),
              null,
              toIntArray(positions),
              toIntArray(positionLengths),
              text.length());
        } else if (posIncAtt != null && offsetAtt != null) {
          // offset + pos
          assertTokenStreamContents(ts, 
              tokens.toArray(new String[tokens.size()]),
              toIntArray(startOffsets),
              toIntArray(endOffsets),
              null,
              toIntArray(positions),
              null,
              text.length());
        } else if (offsetAtt != null) {
          // offset
          assertTokenStreamContents(ts, 
              tokens.toArray(new String[tokens.size()]),
              toIntArray(startOffsets),
              toIntArray(endOffsets),
              null,
              null,
              null,
              text.length());
        } else {
          // terms only
          assertTokenStreamContents(ts, 
              tokens.toArray(new String[tokens.size()]));
        }
        reader = new MockReaderWrapper(random, reader);
      }
      ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
      if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
        // offset + pos + posLength + type
        assertTokenStreamContents(ts, 
                                  tokens.toArray(new String[tokens.size()]),
                                  toIntArray(startOffsets),
                                  toIntArray(endOffsets),
                                  types.toArray(new String[types.size()]),
                                  toIntArray(positions),
                                  toIntArray(positionLengths),
                                  text.length());
      } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
        // offset + pos + type
        assertTokenStreamContents(ts, 
                                  tokens.toArray(new String[tokens.size()]),
                                  toIntArray(startOffsets),
                                  toIntArray(endOffsets),
                                  types.toArray(new String[types.size()]),
                                  toIntArray(positions),
                                  null,
                                  text.length());
      } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
        // offset + pos + posLength
        assertTokenStreamContents(ts, 
                                  tokens.toArray(new String[tokens.size()]),
                                  toIntArray(startOffsets),
                                  toIntArray(endOffsets),
                                  null,
                                  toIntArray(positions),
                                  toIntArray(positionLengths),
                                  text.length());
      } else if (posIncAtt != null && offsetAtt != null) {
        // offset + pos
        assertTokenStreamContents(ts, 
                                  tokens.toArray(new String[tokens.size()]),
                                  toIntArray(startOffsets),
                                  toIntArray(endOffsets),
                                  null,
                                  toIntArray(positions),
                                  null,
                                  text.length());
      } else if (offsetAtt != null) {
        // offset
        assertTokenStreamContents(ts, 
                                  tokens.toArray(new String[tokens.size()]),
                                  toIntArray(startOffsets),
                                  toIntArray(endOffsets),
                                  null,
                                  null,
                                  null,
                                  text.length());
      } else {
        // terms only
        assertTokenStreamContents(ts, 
                                  tokens.toArray(new String[tokens.size()]));
      }
    }
  }
--- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
@ -27,10 +27,7 @@ import java.io.OutputStream;
 import java.io.PrintStream;
 import java.lang.reflect.Method;
 import java.nio.CharBuffer;
-import java.util.Enumeration;
+import java.util.*;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Random;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
@ -414,12 +411,51 @@ public class _TestUtil {
        case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
        case 21: sb.append("\n"); break;
        case 22: sb.append("          ".substring(nextInt(random, 0, 10))); break;
        case 23: {
          sb.append("<");
          if (0 == nextInt(random, 0, 3)) {
            sb.append("          ".substring(nextInt(random, 1, 10)));
          }
          if (0 == nextInt(random, 0, 1)) {
            sb.append("/");
            if (0 == nextInt(random, 0, 3)) {
              sb.append("          ".substring(nextInt(random, 1, 10)));
            }
          }
          switch (nextInt(random, 0, 3)) {
            case 0: sb.append(randomlyRecaseCodePoints(random, "script")); break;
            case 1: sb.append(randomlyRecaseCodePoints(random, "style")); break;
            case 2: sb.append(randomlyRecaseCodePoints(random, "br")); break;
            // default: append nothing
          }
          sb.append(">".substring(nextInt(random, 0, 1)));
          break;
        }
        default: sb.append(randomSimpleString(random));
      }
    }
    return sb.toString();
  }
  /**
   * Randomly upcases, downcases, or leaves intact each code point in the given string.
   * <p>
   * Case conversion is performed with {@link java.util.Locale#ROOT} so that the
   * result depends only on the supplied {@code random} and not on the JVM's
   * default locale (under a Turkish default locale, for example, "i" would
   * otherwise upcase to U+0130 instead of "I").
   *
   * @param random source of randomness; consulted once per code point
   * @param str the string to recase; it is walked by code point, so surrogate
   *            pairs are never split
   * @return a new string in which each code point of {@code str} has been
   *         independently upcased, downcased, or copied unchanged
   */
  public static String randomlyRecaseCodePoints(Random random, String str) {
    StringBuilder builder = new StringBuilder(str.length());
    int pos = 0;
    while (pos < str.length()) {
      int codePoint = str.codePointAt(pos);
      pos += Character.charCount(codePoint);
      // String-based conversion is kept (rather than Character.toUpperCase(int))
      // so that one-to-many case mappings behave exactly as before.
      String codePointSubstring = new String(new int[] { codePoint }, 0, 1);
      // NOTE(review): nextInt(random, 0, 2) is presumed to return 0, 1 or 2
      // (inclusive upper bound) -- confirm against the helper's definition.
      switch (nextInt(random, 0, 2)) {
        case 0: builder.append(codePointSubstring.toUpperCase(java.util.Locale.ROOT)); break;
        case 1: builder.append(codePointSubstring.toLowerCase(java.util.Locale.ROOT)); break;
        case 2: builder.append(codePointSubstring); // leave intact
      }
    }
    return builder.toString();
  }
  private static final int[] blockStarts = {
    0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400, 
    0x0500, 0x0530, 0x0590, 0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800, 
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/23/12 2:15 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/24/12 4:50 PM */
 package org.apache.lucene.analysis.charfilter;
@ -39,7 +39,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
 /**
 * This class is a scanner generated by 
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 1/23/12 2:15 AM from the specification file
+ * on 3/24/12 4:50 PM from the specification file
 * <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
 */
 public final class HTMLStripCharFilter extends BaseCharFilter {
@ -30967,7 +30967,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    case START_TAG_TAIL_EXCLUDE:
    case SERVER_SIDE_INCLUDE:
    case START_TAG_TAIL_SUBSTITUTE: { // Exclude
      // add (length of input that won't be output) [ - (substitution length) = 0 ]
      cumulativeDiff += yychar - inputStart;
      // position the correction at (already output length) [ + (substitution length) = 0 ]
      addOffCorrectMap(outputCharCount, cumulativeDiff);
      outputSegment.clear();
      eofReturnValue = -1;
@ -30975,7 +30977,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    }
    case CHARACTER_REFERENCE_TAIL: {        // Substitute
      // At end of file, allow char refs without semicolons
      // add (length of input that won't be output) - (substitution length)
      cumulativeDiff += inputSegment.length() - outputSegment.length();
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
      eofReturnValue = outputSegment.nextChar();
      break;
@ -31095,6 +31099,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
          { yybegin(STYLE);
          }
        case 55: break;
        case 27: 
          { // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
          }
        case 56: break;
        case 30: 
          { int length = yylength();
    inputSegment.write(zzBuffer, zzStartRead, length);
@ -31104,7 +31118,30 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    outputSegment = entitySegment;
    yybegin(CHARACTER_REFERENCE_TAIL);
          }
-        case 56: break;
+        case 57: break;
        case 48: 
          { inputSegment.clear();
    yybegin(YYINITIAL);
    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
    // position the offset correction at (already output length) -- substitution handled below
    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
      // add (substitution length)
      ++offsetCorrectionPos;
      returnValue = STYLE_REPLACEMENT;
    }
    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
          }
        case 58: break;
        case 8: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
@ -31114,71 +31151,75 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      yybegin(START_TAG_TAIL_SUBSTITUTE);
    }
          }
-        case 57: break;
+        case 59: break;
        case 26: 
          { cumulativeDiff += inputSegment.length() + yylength();
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
          }
        case 58: break;
        case 2: 
          { inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('<');
  yybegin(LEFT_ANGLE_BRACKET);
          }
        case 59: break;
        case 34: 
          { cumulativeDiff += yychar - inputStart + yylength();
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
          }
        case 60: break;
        case 47: 
          { cumulativeDiff += inputSegment.length() + yylength();
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(CDATA);
          }
        case 61: break;
        case 27: 
          { cumulativeDiff += inputSegment.length() + yylength() - 1;
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
          }
        case 62: break;
        case 44: 
          { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
-        case 63: break;
+        case 61: break;
        case 21: 
          { previousRestoreState = restoreState;
    restoreState = SERVER_SIDE_INCLUDE;
    yybegin(SINGLE_QUOTED_STRING);
          }
-        case 64: break;
+        case 62: break;
        case 11: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
          }
-        case 65: break;
+        case 63: break;
        case 35: 
          { yybegin(SCRIPT);
          }
-        case 66: break;
+        case 64: break;
        case 42: 
          { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
-        case 67: break;
+        case 65: break;
        case 10: 
          { inputSegment.append('!'); yybegin(BANG);
          }
-        case 68: break;
+        case 66: break;
        case 51: 
          { // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    char lowSurrogate = '\u0000';
    try {
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(2, 6) + "'";
    }
    try { // Low surrogates are in decimal range [56320, 57343]
      lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(9, 14) + "'";
    }
    if (Character.isLowSurrogate(lowSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      outputSegment.unsafeWrite(lowSurrogate);
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
        case 67: break;
        case 4: 
          { yypushback(1);
    outputSegment = inputSegment;
@ -31186,37 +31227,48 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
          }
-        case 69: break;
+        case 68: break;
        case 48: 
          { inputSegment.clear();
    yybegin(YYINITIAL);
    cumulativeDiff += yychar - inputStart;
    int outputEnd = outputCharCount;
    int returnValue;
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      cumulativeDiff += yylength() - 1;
      ++outputEnd;
      returnValue = STYLE_REPLACEMENT;
    }
    addOffCorrectMap(outputEnd, cumulativeDiff);
    return returnValue;
          }
        case 70: break;
        case 43: 
          { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
-        case 71: break;
+        case 69: break;
-        case 14: 
+        case 52: 
-          { cumulativeDiff += inputSegment.length() + yylength();
+          { // Handle paired UTF-16 surrogates.
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    String surrogatePair = yytext();
-    inputSegment.clear();
+    char highSurrogate = '\u0000';
-    yybegin(YYINITIAL);
+    try { // High surrogates are in decimal range [55296, 56319]
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(1, 6) + "'";
    }
    if (Character.isHighSurrogate(highSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      try {
        outputSegment.unsafeWrite
            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
      } catch(Exception e) { // should never happen
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(10, 14) + "'";
      }
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
-        case 72: break;
+        case 70: break;
        case 28: 
          { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
        case 71: break;
        case 50: 
          { // Handle paired UTF-16 surrogates.
    outputSegment = entitySegment;
@ -31236,49 +31288,63 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(10, 14) + "'";
    }
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 2;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return highSurrogate;
          }
-        case 73: break;
+        case 72: break;
        case 28: 
          { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
        case 74: break;
        case 49: 
          { inputSegment.clear();
    yybegin(YYINITIAL);
    cumulativeDiff += yychar - inputStart;
    int outputEnd = outputCharCount;
    int returnValue;
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      cumulativeDiff += yylength() - 1;
      ++outputEnd;
      returnValue = SCRIPT_REPLACEMENT;
    }
    addOffCorrectMap(outputEnd, cumulativeDiff);
    return returnValue;
          }
        case 75: break;
        case 16: 
          { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
-        case 76: break;
+        case 73: break;
        case 22: 
          { previousRestoreState = restoreState;
    restoreState = SERVER_SIDE_INCLUDE;
    yybegin(DOUBLE_QUOTED_STRING);
          }
-        case 77: break;
+        case 74: break;
        case 26: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
          }
        case 75: break;
        case 20: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
          }
        case 76: break;
        case 47: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(CDATA);
          }
        case 77: break;
        case 33: 
          { yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
    }
          }
        case 78: break;
        case 23: 
          { yybegin(restoreState); restoreState = previousRestoreState;
@ -31288,28 +31354,20 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
          { yybegin(COMMENT);
          }
        case 80: break;
        case 25: 
          { cumulativeDiff += inputSegment.length() + yylength() - 1;
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
          }
        case 81: break;
        case 24: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
     outputSegment = inputSegment;
     yybegin(YYINITIAL);
     return outputSegment.nextChar();
          }
-        case 82: break;
+        case 81: break;
        case 3: 
          { inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('&');
  yybegin(AMPERSAND);
          }
-        case 83: break;
+        case 82: break;
        case 46: 
          { yybegin(SCRIPT);
    if (escapeSCRIPT) {
@ -31319,6 +31377,15 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      return outputSegment.nextChar();
    }
          }
        case 83: break;
        case 14: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
          }
        case 84: break;
        case 6: 
          { int matchLength = yylength();
@ -31354,14 +31421,23 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    }
          }
        case 85: break;
        case 34: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
    cumulativeDiff += yychar - inputStart + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
          }
        case 86: break;
        case 5: 
          { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
          }
-        case 86: break;
+        case 87: break;
        case 13: 
          { inputSegment.append(zzBuffer[zzStartRead]);
          }
-        case 87: break;
+        case 88: break;
        case 18: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
@ -31369,93 +31445,25 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_SUBSTITUTE);
    }
          }
        case 88: break;
        case 36: 
          { yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      cumulativeDiff
          += inputSegment.length() + yylength() - outputSegment.length();
      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
      inputSegment.reset();
      return BR_END_TAG_REPLACEMENT;
    }
          }
        case 89: break;
        case 33: 
          { yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      cumulativeDiff
          += inputSegment.length() + yylength() - outputSegment.length();
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
    }
          }
        case 90: break;
        case 40: 
          { yybegin(SCRIPT_COMMENT);
          }
-        case 91: break;
+        case 90: break;
        case 37: 
-          { cumulativeDiff += yylength();
+          { // add (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    yybegin(YYINITIAL);
          }
-        case 92: break;
+        case 91: break;
        case 12: 
          { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
          }
-        case 93: break;
+        case 92: break;
        case 51: 
          { // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    char lowSurrogate = '\u0000';
    try {
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(2, 6) + "'";
    }
    try { // Low surrogates are in decimal range [56320, 57343]
      lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(9, 14) + "'";
    }
    if (Character.isLowSurrogate(lowSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      outputSegment.unsafeWrite(lowSurrogate);
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
        case 94: break;
        case 7: 
          { cumulativeDiff
        += inputSegment.length() + yylength() - outputSegment.length();
    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
          }
        case 95: break;
        case 9: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
@ -31465,15 +31473,38 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      yybegin(START_TAG_TAIL_EXCLUDE);
    }
          }
-        case 96: break;
+        case 93: break;
        case 49: 
          { inputSegment.clear();
    yybegin(YYINITIAL);
    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
    // position at (already output length) -- substitution handled below
    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
      // add (substitution length)
      ++offsetCorrectionPos;
      returnValue = SCRIPT_REPLACEMENT;
    }
    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
          }
        case 94: break;
        case 29: 
          { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
-        case 97: break;
+        case 95: break;
        case 17: 
          { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
-        case 98: break;
+        case 96: break;
        case 45: 
          { yybegin(STYLE);
    if (escapeSTYLE) {
@ -31483,7 +31514,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      return outputSegment.nextChar();
    }
          }
-        case 99: break;
+        case 97: break;
        case 7: 
          { // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
          }
        case 98: break;
        case 19: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
@ -31493,6 +31533,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      yybegin(END_TAG_TAIL_EXCLUDE);
    }
          }
        case 99: break;
        case 25: 
          { // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
          }
        case 100: break;
        case 31: 
          { int matchLength = yylength();
@ -31529,49 +31579,6 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    }
          }
        case 101: break;
        case 38: 
          { yybegin(restoreState);
          }
        case 102: break;
        case 41: 
          { yybegin(STYLE_COMMENT);
          }
        case 103: break;
        case 1: 
          { return zzBuffer[zzStartRead];
          }
        case 104: break;
        case 52: 
          { // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try { // High surrogates are in decimal range [55296, 56319]
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(1, 6) + "'";
    }
    if (Character.isHighSurrogate(highSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      try {
        outputSegment.unsafeWrite
            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
      } catch(Exception e) { // should never happen
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(10, 14) + "'";
      }
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
        case 105: break;
        case 53: 
          { // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
@ -31594,7 +31601,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
        outputSegment = entitySegment;
        outputSegment.clear();
        outputSegment.unsafeWrite(lowSurrogate);
        // add (previously matched input length) + (this match length) - (substitution length)
        cumulativeDiff += inputSegment.length() + yylength() - 2;
        // position the correction at (already output length) + (substitution length)
        addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
        inputSegment.clear();
        yybegin(YYINITIAL);
@ -31605,6 +31614,34 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
        case 102: break;
        case 36: 
          { yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_END_TAG_REPLACEMENT;
    }
          }
        case 103: break;
        case 38: 
          { yybegin(restoreState);
          }
        case 104: break;
        case 41: 
          { yybegin(STYLE_COMMENT);
          }
        case 105: break;
        case 1: 
          { return zzBuffer[zzStartRead];
          }
        case 106: break;
        default: 
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
@ -293,7 +293,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
    case START_TAG_TAIL_EXCLUDE:
    case SERVER_SIDE_INCLUDE:
    case START_TAG_TAIL_SUBSTITUTE: { // Exclude
      // add (length of input that won't be output) [ - (substitution length) = 0 ]
      cumulativeDiff += yychar - inputStart;
      // position the correction at (already output length) [ + (substitution length) = 0 ]
      addOffCorrectMap(outputCharCount, cumulativeDiff);
      outputSegment.clear();
      eofReturnValue = -1;
@ -301,7 +303,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
    }
    case CHARACTER_REFERENCE_TAIL: {        // Substitute
      // At end of file, allow char refs without semicolons
      // add (length of input that won't be output) - (substitution length)
      cumulativeDiff += inputSegment.length() - outputSegment.length();
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
      eofReturnValue = outputSegment.nextChar();
      break;
@ -374,7 +378,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(10, 14) + "'";
    }
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 2;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -403,7 +409,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(10, 14) + "'";
      }
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
@ -437,7 +445,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
      outputSegment = entitySegment;
      outputSegment.clear();
      outputSegment.unsafeWrite(lowSurrogate);
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
@ -472,7 +482,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
        outputSegment = entitySegment;
        outputSegment.clear();
        outputSegment.unsafeWrite(lowSurrogate);
        // add (previously matched input length) + (this match length) - (substitution length)
        cumulativeDiff += inputSegment.length() + yylength() - 2;
        // position the correction at (already output length) + (substitution length)
        addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
        inputSegment.clear();
        yybegin(YYINITIAL);
@ -557,8 +569,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <CHARACTER_REFERENCE_TAIL> {
  ";" {
-    cumulativeDiff
+    // add (previously matched input length) + (this match length) - (substitution length)
-        += inputSegment.length() + yylength() - outputSegment.length();
+    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
@ -574,9 +587,10 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
-      cumulativeDiff
+      // add (previously matched input length) + (this match length) - (substitution length)
-          += inputSegment.length() + yylength() - outputSegment.length();
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
-      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_END_TAG_REPLACEMENT;
    }
@ -612,7 +626,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <END_TAG_TAIL_EXCLUDE> {
  \s* ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -621,7 +637,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <END_TAG_TAIL_SUBSTITUTE> {
  \s* ">" {
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -637,7 +655,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
  }
  "?" [^>]* [/?] ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -649,8 +669,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
-      cumulativeDiff
+      // add (previously matched input length) + (this match length) - (substitution length)
-          += inputSegment.length() + yylength() - outputSegment.length();
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
@ -708,7 +729,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <START_TAG_TAIL_EXCLUDE> {
   ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
@ -718,7 +741,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <START_TAG_TAIL_SUBSTITUTE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s*  "/"? ">" {
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -729,7 +754,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <BANG> {
  "--" { yybegin(COMMENT); }
  ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -742,7 +769,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
  // [21] CDEnd   ::= ']]>'
  //
  "[CDATA[" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(CDATA);
@ -754,7 +783,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <CDATA> {
  "]]>" {
    // add (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    yybegin(YYINITIAL);
  }
@ -764,7 +795,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <COMMENT> {
  "<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
  "-->" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
    cumulativeDiff += yychar - inputStart + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -820,19 +853,23 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
  "</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
    // Action for a closing script tag. The input consumed since inputStart is
    // charged to cumulativeDiff in both branches; the tag itself is either
    // echoed (escapeSCRIPT) or replaced by the single SCRIPT_REPLACEMENT char.
    inputSegment.clear();
    yybegin(YYINITIAL);
    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
    // position at (already output length) -- substitution handled below
    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSCRIPT) {
      // Keep the tag: buffer the whole match and start emitting it.
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
      // add (substitution length)
      ++offsetCorrectionPos;
      returnValue = SCRIPT_REPLACEMENT;
    }
    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
  }
  [^] { }
@ -843,19 +880,23 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
  "</" \s* [sS][tT][yY][lL][eE] \s* ">" {
    // Action for a closing style tag. The input consumed since inputStart is
    // charged to cumulativeDiff in both branches; the tag itself is either
    // echoed (escapeSTYLE) or replaced by the single STYLE_REPLACEMENT char.
    inputSegment.clear();
    yybegin(YYINITIAL);
    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
    // position the offset correction at (already output length) -- substitution handled below
    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSTYLE) {
      // Keep the tag: buffer the whole match and start emitting it.
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
      // add (substitution length)
      ++offsetCorrectionPos;
      returnValue = STYLE_REPLACEMENT;
    }
    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
  }
  [^] { }
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
@ -36,6 +36,21 @@ import org.apache.lucene.util._TestUtil;
 public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
  /**
   * Creates the analyzer shared by the tests in this class.
   *
   * <p>The returned analyzer wraps every input reader in an
   * {@code HTMLStripCharFilter} and tokenizes the result with a
   * {@code MockTokenizer} configured with {@code MockTokenizer.WHITESPACE}.
   *
   * @return a new analyzer instance on every call
   */
  private static Analyzer newTestAnalyzer() {
    return new Analyzer() {
      @Override
      protected Reader initReader(Reader reader) {
        // Run the char filter under test in front of the tokenizer.
        return new HTMLStripCharFilter(CharReader.get(reader));
      }

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        // The same tokenizer instance is passed for both constructor arguments.
        return new TokenStreamComponents(source, source);
      }
    };
  }
  //this is some text  here is a  link  and another  link . This is an entity: & plus a <.  Here is an &
  //
  public void test() throws IOException {
@ -493,41 +508,17 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
  }
  public void testRandom() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
      @Override
      protected Reader initReader(Reader reader) {
        return new HTMLStripCharFilter(CharReader.get(reader));
      }
    };
    int numRounds = RANDOM_MULTIPLIER * 10000;
-    checkRandomData(random, analyzer, numRounds);
+    checkRandomData(random, newTestAnalyzer(), numRounds);
  }
  public void testRandomHugeStrings() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
      @Override
      protected Reader initReader(Reader reader) {
        return new HTMLStripCharFilter(CharReader.get(reader));
      }
    };
    int numRounds = RANDOM_MULTIPLIER * 200;
-    checkRandomData(random, analyzer, numRounds, 8192);
+    checkRandomData(random, newTestAnalyzer(), numRounds, 8192);
  }
  /**
   * Regression test for LUCENE-3913: checks analysis consistency on an input
   * containing {@code </br>}, with the extra char filter toggled at random.
   *
   * @throws Exception if the consistency check fails
   */
  public void testCloseBR() throws Exception {
    final String text = " Secretary)</br> [[M";
    checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), text);
  }
  public void testServerSideIncludes() throws Exception {
@ -797,9 +788,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
  public void testRandomBrokenHTML() throws Exception {
    int maxNumElements = 10000;
    String text = _TestUtil.randomHtmlishString(random, maxNumElements);
-    Reader reader = new HTMLStripCharFilter
+    checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), text);
        (CharReader.get(new StringReader(text)));
    while (reader.read() != -1);
  }
  public void testRandomText() throws Exception {
@ -838,18 +827,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
  }
  public void testUTF16Surrogates() throws Exception {
-    Analyzer analyzer = new Analyzer() {
+    Analyzer analyzer = newTestAnalyzer();
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
      @Override
      protected Reader initReader(Reader reader) {
        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
      }
    };
    // Paired surrogates
    assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
        new String[] { "one", "two", "\uD86C\uDC01three" } );