LUCENE-3913: Fix HTMLStripCharFilter invalid final offset for input containing </br>

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304912 13f79535-47bb-0310-9956-ffa450edef68
2012-03-24 20:54:31 +00:00 · 2012-03-24 20:54:31 +00:00 · ada9780484
parent 1f7c31d711
commit ada9780484
5 changed files with 559 additions and 461 deletions
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@ -391,188 +391,194 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      }


-      if (VERBOSE) {
-        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+      checkAnalysisConsistency(random, a, useCharFilter, text);
+    }
+  }
+
+  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
+
+    if (VERBOSE) {
+      System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+    }
+
+    int remainder = random.nextInt(10);
+    Reader reader = new StringReader(text);
+    TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+    assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
+    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+    OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
+    PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
+    PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
+    TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
+    List<String> tokens = new ArrayList<String>();
+    List<String> types = new ArrayList<String>();
+    List<Integer> positions = new ArrayList<Integer>();
+    List<Integer> positionLengths = new ArrayList<Integer>();
+    List<Integer> startOffsets = new ArrayList<Integer>();
+    List<Integer> endOffsets = new ArrayList<Integer>();
+    ts.reset();
+
+    // First pass: save away "correct" tokens
+    while (ts.incrementToken()) {
+      tokens.add(termAtt.toString());
+      if (typeAtt != null) types.add(typeAtt.type());
+      if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
+      if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
+      if (offsetAtt != null) {
+        startOffsets.add(offsetAtt.startOffset());
+        endOffsets.add(offsetAtt.endOffset());
      }
+    }
+    ts.end();
+    ts.close();

-      int remainder = random.nextInt(10);
-      Reader reader = new StringReader(text);
-      TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
-      assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
-      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
-      OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
-      PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
-      PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
-      TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
-      List<String> tokens = new ArrayList<String>();
-      List<String> types = new ArrayList<String>();
-      List<Integer> positions = new ArrayList<Integer>();
-      List<Integer> positionLengths = new ArrayList<Integer>();
-      List<Integer> startOffsets = new ArrayList<Integer>();
-      List<Integer> endOffsets = new ArrayList<Integer>();
-      ts.reset();
+    // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
+    if (!tokens.isEmpty()) {

-      // First pass: save away "correct" tokens
-      while (ts.incrementToken()) {
-        tokens.add(termAtt.toString());
-        if (typeAtt != null) types.add(typeAtt.type());
-        if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
-        if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
-        if (offsetAtt != null) {
-          startOffsets.add(offsetAtt.startOffset());
-          endOffsets.add(offsetAtt.endOffset());
-        }
-      }
-      ts.end();
-      ts.close();
+      // KWTokenizer (for example) can produce a token
+      // even when input is length 0:
+      if (text.length() != 0) {

-      // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
-      if (!tokens.isEmpty()) {
-
-        // KWTokenizer (for example) can produce a token
-        // even when input is length 0:
-        if (text.length() != 0) {
-
-          // (Optional) second pass: do something evil:
-          final int evilness = random.nextInt(50);
-          if (evilness == 17) {
-            if (VERBOSE) {
-              System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
-            }
-            // Throw an errant exception from the Reader:
-
-            MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
-            evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
-            reader = evilReader;
-
-            try {
-              // NOTE: some Tokenizers go and read characters
-              // when you call .setReader(Reader), eg
-              // PatternTokenizer.  This is a bit
-              // iffy... (really, they should only
-              // pull from the Reader when you call
-              // .incremenToken(), I think?), but we
-              // currently allow it, so, we must call
-              // a.tokenStream inside the try since we may
-              // hit the exc on init:
-              ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
-              ts.reset();
-              while (ts.incrementToken());
-              fail("did not hit exception");
-            } catch (RuntimeException re) {
-              assertTrue(MockReaderWrapper.isMyEvilException(re));
-            }
-            try {
-              ts.end();
-            } catch (AssertionError ae) {
-              // Catch & ignore MockTokenizer's
-              // anger...
-              if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
-                // OK
-              } else {
-                throw ae;
-              }
-            }
-            ts.close();
-          } else if (evilness == 7) {
-            // Only consume a subset of the tokens:
-            final int numTokensToRead = random.nextInt(tokens.size());
-            if (VERBOSE) {
-              System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
-            }
-
-            reader = new StringReader(text);
-            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
-            ts.reset();
-            for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
-              assertTrue(ts.incrementToken());
-            }
-            try {
-              ts.end();
-            } catch (AssertionError ae) {
-              // Catch & ignore MockTokenizer's
-              // anger...
-              if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
-                // OK
-              } else {
-                throw ae;
-              }
-            }
-            ts.close();
-          }
-        }
-
-        // Final pass: verify clean tokenization matches
-        // results from first pass:
-        if (VERBOSE) {
-          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
-        }
-        reader = new StringReader(text);
-
-        if (random.nextInt(30) == 7) {
+        // (Optional) second pass: do something evil:
+        final int evilness = random.nextInt(50);
+        if (evilness == 17) {
          if (VERBOSE) {
-            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
+          }
+          // Throw an errant exception from the Reader:
+
+          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
+          evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
+          reader = evilReader;
+
+          try {
+            // NOTE: some Tokenizers go and read characters
+            // when you call .setReader(Reader), eg
+            // PatternTokenizer.  This is a bit
+            // iffy... (really, they should only
+            // pull from the Reader when you call
+            // .incremenToken(), I think?), but we
+            // currently allow it, so, we must call
+            // a.tokenStream inside the try since we may
+            // hit the exc on init:
+            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
+            ts.reset();
+            while (ts.incrementToken());
+            fail("did not hit exception");
+          } catch (RuntimeException re) {
+            assertTrue(MockReaderWrapper.isMyEvilException(re));
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
+        } else if (evilness == 7) {
+          // Only consume a subset of the tokens:
+          final int numTokensToRead = random.nextInt(tokens.size());
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
          }

-          reader = new MockReaderWrapper(random, reader);
+          reader = new StringReader(text);
+          ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+          ts.reset();
+          for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
+            assertTrue(ts.incrementToken());
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
+            }
+          }
+          ts.close();
        }
-        
-        ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
-        if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
-          // offset + pos + posLength + type
-          assertTokenStreamContents(ts, 
-            tokens.toArray(new String[tokens.size()]),
-            toIntArray(startOffsets),
-            toIntArray(endOffsets),
-            types.toArray(new String[types.size()]),
-            toIntArray(positions),
-            toIntArray(positionLengths),
-            text.length());
-        } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
-          // offset + pos + type
-          assertTokenStreamContents(ts, 
-            tokens.toArray(new String[tokens.size()]),
-            toIntArray(startOffsets),
-            toIntArray(endOffsets),
-            types.toArray(new String[types.size()]),
-            toIntArray(positions),
-            null,
-            text.length());
-        } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
-          // offset + pos + posLength
-          assertTokenStreamContents(ts, 
-              tokens.toArray(new String[tokens.size()]),
-              toIntArray(startOffsets),
-              toIntArray(endOffsets),
-              null,
-              toIntArray(positions),
-              toIntArray(positionLengths),
-              text.length());
-        } else if (posIncAtt != null && offsetAtt != null) {
-          // offset + pos
-          assertTokenStreamContents(ts, 
-              tokens.toArray(new String[tokens.size()]),
-              toIntArray(startOffsets),
-              toIntArray(endOffsets),
-              null,
-              toIntArray(positions),
-              null,
-              text.length());
-        } else if (offsetAtt != null) {
-          // offset
-          assertTokenStreamContents(ts, 
-              tokens.toArray(new String[tokens.size()]),
-              toIntArray(startOffsets),
-              toIntArray(endOffsets),
-              null,
-              null,
-              null,
-              text.length());
-        } else {
-          // terms only
-          assertTokenStreamContents(ts, 
-              tokens.toArray(new String[tokens.size()]));
+      }
+
+      // Final pass: verify clean tokenization matches
+      // results from first pass:
+
+      if (VERBOSE) {
+        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
+      }
+      reader = new StringReader(text);
+
+      if (random.nextInt(30) == 7) {
+        if (VERBOSE) {
+          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
        }
+
+        reader = new MockReaderWrapper(random, reader);
+      }
+
+      ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+      if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+        // offset + pos + posLength + type
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  types.toArray(new String[types.size()]),
+                                  toIntArray(positions),
+                                  toIntArray(positionLengths),
+                                  text.length());
+      } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
+        // offset + pos + type
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  types.toArray(new String[types.size()]),
+                                  toIntArray(positions),
+                                  null,
+                                  text.length());
+      } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+        // offset + pos + posLength
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  null,
+                                  toIntArray(positions),
+                                  toIntArray(positionLengths),
+                                  text.length());
+      } else if (posIncAtt != null && offsetAtt != null) {
+        // offset + pos
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  null,
+                                  toIntArray(positions),
+                                  null,
+                                  text.length());
+      } else if (offsetAtt != null) {
+        // offset
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  null,
+                                  null,
+                                  null,
+                                  text.length());
+      } else {
+        // terms only
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]));
      }
    }
  }
--- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
@ -27,10 +27,7 @@ import java.io.OutputStream;
 import java.io.PrintStream;
 import java.lang.reflect.Method;
 import java.nio.CharBuffer;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Random;
+import java.util.*;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;

@ -414,12 +411,51 @@ public class _TestUtil {
        case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
        case 21: sb.append("\n"); break;
        case 22: sb.append("          ".substring(nextInt(random, 0, 10))); break;
+        case 23: {
+          sb.append("<");
+          if (0 == nextInt(random, 0, 3)) {
+            sb.append("          ".substring(nextInt(random, 1, 10)));
+          }
+          if (0 == nextInt(random, 0, 1)) {
+            sb.append("/");
+            if (0 == nextInt(random, 0, 3)) {
+              sb.append("          ".substring(nextInt(random, 1, 10)));
+            }
+          }
+          switch (nextInt(random, 0, 3)) {
+            case 0: sb.append(randomlyRecaseCodePoints(random, "script")); break;
+            case 1: sb.append(randomlyRecaseCodePoints(random, "style")); break;
+            case 2: sb.append(randomlyRecaseCodePoints(random, "br")); break;
+            // default: append nothing
+          }
+          sb.append(">".substring(nextInt(random, 0, 1)));
+          break;
+        }
        default: sb.append(randomSimpleString(random));
      }
    }
    return sb.toString();
  }

+  /**
+   * Randomly upcases, downcases, or leaves intact each code point in the given string
+   */
+  public static String randomlyRecaseCodePoints(Random random, String str) {
+    StringBuilder builder = new StringBuilder();
+    int pos = 0;
+    while (pos < str.length()) {
+      int codePoint = str.codePointAt(pos);
+      pos += Character.charCount(codePoint);
+      String codePointSubstring = new String(new int[] { codePoint }, 0, 1);
+      switch (nextInt(random, 0, 2)) {
+        case 0: builder.append(codePointSubstring.toUpperCase()); break;
+        case 1: builder.append(codePointSubstring.toLowerCase()); break;
+        case 2: builder.append(codePointSubstring); // leave intact
+      }
+    }
+    return builder.toString();
+  }
+
  private static final int[] blockStarts = {
    0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400, 
    0x0500, 0x0530, 0x0590, 0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800, 
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/23/12 2:15 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/24/12 4:50 PM */

 package org.apache.lucene.analysis.charfilter;

@ -39,7 +39,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
 /**
 * This class is a scanner generated by 
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 1/23/12 2:15 AM from the specification file
+ * on 3/24/12 4:50 PM from the specification file
 * <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
 */
 public final class HTMLStripCharFilter extends BaseCharFilter {
@ -30967,7 +30967,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    case START_TAG_TAIL_EXCLUDE:
    case SERVER_SIDE_INCLUDE:
    case START_TAG_TAIL_SUBSTITUTE: { // Exclude
+      // add (length of input that won't be output) [ - (substitution length) = 0 ]
      cumulativeDiff += yychar - inputStart;
+      // position the correction at (already output length) [ + (substitution length) = 0 ]
      addOffCorrectMap(outputCharCount, cumulativeDiff);
      outputSegment.clear();
      eofReturnValue = -1;
@ -30975,7 +30977,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    }
    case CHARACTER_REFERENCE_TAIL: {        // Substitute
      // At end of file, allow char refs without semicolons
+      // add (length of input that won't be output) - (substitution length)
      cumulativeDiff += inputSegment.length() - outputSegment.length();
+      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
      eofReturnValue = outputSegment.nextChar();
      break;
@ -31095,6 +31099,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
          { yybegin(STYLE);
          }
        case 55: break;
+        case 27: 
+          { // add (previously matched input length) + (this match length) - (substitution length)
+    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction at (already output length) + (substitution length)
+    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
+          }
+        case 56: break;
        case 30: 
          { int length = yylength();
    inputSegment.write(zzBuffer, zzStartRead, length);
@ -31104,7 +31118,30 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    outputSegment = entitySegment;
    yybegin(CHARACTER_REFERENCE_TAIL);
          }
-        case 56: break;
+        case 57: break;
+        case 48: 
+          { inputSegment.clear();
+    yybegin(YYINITIAL);
+    // add (previously matched input length) -- current match and substitution handled below
+    cumulativeDiff += yychar - inputStart;
+    // position the offset correction at (already output length) -- substitution handled below
+    int offsetCorrectionPos = outputCharCount;
+    int returnValue;
+    if (escapeSTYLE) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      returnValue = outputSegment.nextChar();
+    } else {
+      // add (this match length) - (substitution length)
+      cumulativeDiff += yylength() - 1;
+      // add (substitution length)
+      ++offsetCorrectionPos;
+      returnValue = STYLE_REPLACEMENT;
+    }
+    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
+    return returnValue;
+          }
+        case 58: break;
        case 8: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
@ -31114,71 +31151,75 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      yybegin(START_TAG_TAIL_SUBSTITUTE);
    }
          }
-        case 57: break;
-        case 26: 
-          { cumulativeDiff += inputSegment.length() + yylength();
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    outputSegment = inputSegment;
-    yybegin(YYINITIAL);
-          }
-        case 58: break;
+        case 59: break;
        case 2: 
          { inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('<');
  yybegin(LEFT_ANGLE_BRACKET);
          }
-        case 59: break;
-        case 34: 
-          { cumulativeDiff += yychar - inputStart + yylength();
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(YYINITIAL);
-          }
        case 60: break;
-        case 47: 
-          { cumulativeDiff += inputSegment.length() + yylength();
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(CDATA);
-          }
-        case 61: break;
-        case 27: 
-          { cumulativeDiff += inputSegment.length() + yylength() - 1;
-    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(YYINITIAL);
-    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
-          }
-        case 62: break;
        case 44: 
          { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
-        case 63: break;
+        case 61: break;
        case 21: 
          { previousRestoreState = restoreState;
    restoreState = SERVER_SIDE_INCLUDE;
    yybegin(SINGLE_QUOTED_STRING);
          }
-        case 64: break;
+        case 62: break;
        case 11: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
          }
-        case 65: break;
+        case 63: break;
        case 35: 
          { yybegin(SCRIPT);
          }
-        case 66: break;
+        case 64: break;
        case 42: 
          { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
-        case 67: break;
+        case 65: break;
        case 10: 
          { inputSegment.append('!'); yybegin(BANG);
          }
-        case 68: break;
+        case 66: break;
+        case 51: 
+          { // Handle paired UTF-16 surrogates.
+    String surrogatePair = yytext();
+    char highSurrogate = '\u0000';
+    char lowSurrogate = '\u0000';
+    try {
+      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing high surrogate '"
+                  + surrogatePair.substring(2, 6) + "'";
+    }
+    try { // Low surrogates are in decimal range [56320, 57343]
+      lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing low surrogate '"
+                  + surrogatePair.substring(9, 14) + "'";
+    }
+    if (Character.isLowSurrogate(lowSurrogate)) {
+      outputSegment = entitySegment;
+      outputSegment.clear();
+      outputSegment.unsafeWrite(lowSurrogate);
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 2;
+      // position the correction at (already output length) + (substitution length)
+      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+      inputSegment.clear();
+      yybegin(YYINITIAL);
+      return highSurrogate;
+    }
+    yypushback(surrogatePair.length() - 1); // Consume only '#'
+    inputSegment.append('#');
+    yybegin(NUMERIC_CHARACTER);
+          }
+        case 67: break;
        case 4: 
          { yypushback(1);
    outputSegment = inputSegment;
@ -31186,37 +31227,48 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
          }
-        case 69: break;
-        case 48: 
-          { inputSegment.clear();
-    yybegin(YYINITIAL);
-    cumulativeDiff += yychar - inputStart;
-    int outputEnd = outputCharCount;
-    int returnValue;
-    if (escapeSTYLE) {
-      inputSegment.write(zzBuffer, zzStartRead, yylength());
-      outputSegment = inputSegment;
-      returnValue = outputSegment.nextChar();
-    } else {
-      cumulativeDiff += yylength() - 1;
-      ++outputEnd;
-      returnValue = STYLE_REPLACEMENT;
-    }
-    addOffCorrectMap(outputEnd, cumulativeDiff);
-    return returnValue;
-          }
-        case 70: break;
+        case 68: break;
        case 43: 
          { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
-        case 71: break;
-        case 14: 
-          { cumulativeDiff += inputSegment.length() + yylength();
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(YYINITIAL);
+        case 69: break;
+        case 52: 
+          { // Handle paired UTF-16 surrogates.
+    String surrogatePair = yytext();
+    char highSurrogate = '\u0000';
+    try { // High surrogates are in decimal range [55296, 56319]
+      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing high surrogate '"
+                  + surrogatePair.substring(1, 6) + "'";
+    }
+    if (Character.isHighSurrogate(highSurrogate)) {
+      outputSegment = entitySegment;
+      outputSegment.clear();
+      try {
+        outputSegment.unsafeWrite
+            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
+      } catch(Exception e) { // should never happen
+        assert false: "Exception parsing low surrogate '"
+                    + surrogatePair.substring(10, 14) + "'";
+      }
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 2;
+      // position the correction at (already output length) + (substitution length)
+      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+      inputSegment.clear();
+      yybegin(YYINITIAL);
+      return highSurrogate;
+    }
+    yypushback(surrogatePair.length() - 1); // Consume only '#'
+    inputSegment.append('#');
+    yybegin(NUMERIC_CHARACTER);
          }
-        case 72: break;
+        case 70: break;
+        case 28: 
+          { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
+          }
+        case 71: break;
        case 50: 
          { // Handle paired UTF-16 surrogates.
    outputSegment = entitySegment;
@ -31236,49 +31288,63 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(10, 14) + "'";
    }
+    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 2;
+    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return highSurrogate;
          }
-        case 73: break;
-        case 28: 
-          { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
-          }
-        case 74: break;
-        case 49: 
-          { inputSegment.clear();
-    yybegin(YYINITIAL);
-    cumulativeDiff += yychar - inputStart;
-    int outputEnd = outputCharCount;
-    int returnValue;
-    if (escapeSCRIPT) {
-      inputSegment.write(zzBuffer, zzStartRead, yylength());
-      outputSegment = inputSegment;
-      returnValue = outputSegment.nextChar();
-    } else {
-      cumulativeDiff += yylength() - 1;
-      ++outputEnd;
-      returnValue = SCRIPT_REPLACEMENT;
-    }
-    addOffCorrectMap(outputEnd, cumulativeDiff);
-    return returnValue;
-          }
-        case 75: break;
+        case 72: break;
        case 16: 
          { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
-        case 76: break;
+        case 73: break;
        case 22: 
          { previousRestoreState = restoreState;
    restoreState = SERVER_SIDE_INCLUDE;
    yybegin(DOUBLE_QUOTED_STRING);
          }
-        case 77: break;
+        case 74: break;
+        case 26: 
+          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    outputSegment = inputSegment;
+    yybegin(YYINITIAL);
+          }
+        case 75: break;
        case 20: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
          }
+        case 76: break;
+        case 47: 
+          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(CDATA);
+          }
+        case 77: break;
+        case 33: 
+          { yybegin(YYINITIAL);
+    if (escapeBR) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      return outputSegment.nextChar();
+    } else {
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction at (already output length) + (substitution length)
+      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+      inputSegment.reset();
+      return BR_START_TAG_REPLACEMENT;
+    }
+          }
        case 78: break;
        case 23: 
          { yybegin(restoreState); restoreState = previousRestoreState;
@ -31288,28 +31354,20 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
          { yybegin(COMMENT);
          }
        case 80: break;
-        case 25: 
-          { cumulativeDiff += inputSegment.length() + yylength() - 1;
-    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(YYINITIAL);
-    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
-          }
-        case 81: break;
        case 24: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
     outputSegment = inputSegment;
     yybegin(YYINITIAL);
     return outputSegment.nextChar();
          }
-        case 82: break;
+        case 81: break;
        case 3: 
          { inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('&');
  yybegin(AMPERSAND);
          }
-        case 83: break;
+        case 82: break;
        case 46: 
          { yybegin(SCRIPT);
    if (escapeSCRIPT) {
@ -31319,6 +31377,15 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      return outputSegment.nextChar();
    }
          }
+        case 83: break;
+        case 14: 
+          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+          }
        case 84: break;
        case 6: 
          { int matchLength = yylength();
@ -31354,14 +31421,23 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    }
          }
        case 85: break;
+        case 34: 
+          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
+    cumulativeDiff += yychar - inputStart + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0]
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+          }
+        case 86: break;
        case 5: 
          { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
          }
-        case 86: break;
+        case 87: break;
        case 13: 
          { inputSegment.append(zzBuffer[zzStartRead]);
          }
-        case 87: break;
+        case 88: break;
        case 18: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
@ -31369,93 +31445,25 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_SUBSTITUTE);
-    }
-          }
-        case 88: break;
-        case 36: 
-          { yybegin(YYINITIAL);
-    if (escapeBR) {
-      inputSegment.write(zzBuffer, zzStartRead, yylength());
-      outputSegment = inputSegment;
-      return outputSegment.nextChar();
-    } else {
-      cumulativeDiff
-          += inputSegment.length() + yylength() - outputSegment.length();
-      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
-      inputSegment.reset();
-      return BR_END_TAG_REPLACEMENT;
    }
          }
        case 89: break;
-        case 33: 
-          { yybegin(YYINITIAL);
-    if (escapeBR) {
-      inputSegment.write(zzBuffer, zzStartRead, yylength());
-      outputSegment = inputSegment;
-      return outputSegment.nextChar();
-    } else {
-      cumulativeDiff
-          += inputSegment.length() + yylength() - outputSegment.length();
-      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
-      inputSegment.reset();
-      return BR_START_TAG_REPLACEMENT;
-    }
-          }
-        case 90: break;
        case 40: 
          { yybegin(SCRIPT_COMMENT);
          }
-        case 91: break;
+        case 90: break;
        case 37: 
-          { cumulativeDiff += yylength();
+          { // add (this match length) [ - (substitution length) = 0 ]
+    cumulativeDiff += yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    yybegin(YYINITIAL);
          }
-        case 92: break;
+        case 91: break;
        case 12: 
          { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
          }
-        case 93: break;
-        case 51: 
-          { // Handle paired UTF-16 surrogates.
-    String surrogatePair = yytext();
-    char highSurrogate = '\u0000';
-    char lowSurrogate = '\u0000';
-    try {
-      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
-    } catch(Exception e) { // should never happen
-      assert false: "Exception parsing high surrogate '"
-                  + surrogatePair.substring(2, 6) + "'";
-    }
-    try { // Low surrogates are in decimal range [56320, 57343]
-      lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
-    } catch(Exception e) { // should never happen
-      assert false: "Exception parsing low surrogate '"
-                  + surrogatePair.substring(9, 14) + "'";
-    }
-    if (Character.isLowSurrogate(lowSurrogate)) {
-      outputSegment = entitySegment;
-      outputSegment.clear();
-      outputSegment.unsafeWrite(lowSurrogate);
-      cumulativeDiff += inputSegment.length() + yylength() - 2;
-      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
-      inputSegment.clear();
-      yybegin(YYINITIAL);
-      return highSurrogate;
-    }
-    yypushback(surrogatePair.length() - 1); // Consume only '#'
-    inputSegment.append('#');
-    yybegin(NUMERIC_CHARACTER);
-          }
-        case 94: break;
-        case 7: 
-          { cumulativeDiff
-        += inputSegment.length() + yylength() - outputSegment.length();
-    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
-    yybegin(YYINITIAL);
-    return outputSegment.nextChar();
-          }
-        case 95: break;
+        case 92: break;
        case 9: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
@ -31465,15 +31473,38 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      yybegin(START_TAG_TAIL_EXCLUDE);
    }
          }
-        case 96: break;
+        case 93: break;
+        case 49: 
+          { inputSegment.clear();
+    yybegin(YYINITIAL);
+    // add (previously matched input length) -- current match and substitution handled below
+    cumulativeDiff += yychar - inputStart;
+    // position at (already output length) -- substitution handled below
+    int offsetCorrectionPos = outputCharCount;
+    int returnValue;
+    if (escapeSCRIPT) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      returnValue = outputSegment.nextChar();
+    } else {
+      // add (this match length) - (substitution length)
+      cumulativeDiff += yylength() - 1;
+      // add (substitution length)
+      ++offsetCorrectionPos;
+      returnValue = SCRIPT_REPLACEMENT;
+    }
+    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
+    return returnValue;
+          }
+        case 94: break;
        case 29: 
          { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
-        case 97: break;
+        case 95: break;
        case 17: 
          { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
-        case 98: break;
+        case 96: break;
        case 45: 
          { yybegin(STYLE);
    if (escapeSTYLE) {
@ -31483,7 +31514,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      return outputSegment.nextChar();
    }
          }
-        case 99: break;
+        case 97: break;
+        case 7: 
+          { // add (previously matched input length) + (this match length) - (substitution length)
+    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
+    // position the correction at (already output length) + (substitution length)
+    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+    yybegin(YYINITIAL);
+    return outputSegment.nextChar();
+          }
+        case 98: break;
        case 19: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
@ -31493,6 +31533,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      yybegin(END_TAG_TAIL_EXCLUDE);
    }
          }
+        case 99: break;
+        case 25: 
+          { // add (previously matched input length) + (this match length) - (substitution length)
+    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction at (already output length) + (substitution length)
+    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
+          }
        case 100: break;
        case 31: 
          { int matchLength = yylength();
@ -31529,49 +31579,6 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    }
          }
        case 101: break;
-        case 38: 
-          { yybegin(restoreState);
-          }
-        case 102: break;
-        case 41: 
-          { yybegin(STYLE_COMMENT);
-          }
-        case 103: break;
-        case 1: 
-          { return zzBuffer[zzStartRead];
-          }
-        case 104: break;
-        case 52: 
-          { // Handle paired UTF-16 surrogates.
-    String surrogatePair = yytext();
-    char highSurrogate = '\u0000';
-    try { // High surrogates are in decimal range [55296, 56319]
-      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
-    } catch(Exception e) { // should never happen
-      assert false: "Exception parsing high surrogate '"
-                  + surrogatePair.substring(1, 6) + "'";
-    }
-    if (Character.isHighSurrogate(highSurrogate)) {
-      outputSegment = entitySegment;
-      outputSegment.clear();
-      try {
-        outputSegment.unsafeWrite
-            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
-      } catch(Exception e) { // should never happen
-        assert false: "Exception parsing low surrogate '"
-                    + surrogatePair.substring(10, 14) + "'";
-      }
-      cumulativeDiff += inputSegment.length() + yylength() - 2;
-      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
-      inputSegment.clear();
-      yybegin(YYINITIAL);
-      return highSurrogate;
-    }
-    yypushback(surrogatePair.length() - 1); // Consume only '#'
-    inputSegment.append('#');
-    yybegin(NUMERIC_CHARACTER);
-          }
-        case 105: break;
        case 53: 
          { // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
@ -31594,7 +31601,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
        outputSegment = entitySegment;
        outputSegment.clear();
        outputSegment.unsafeWrite(lowSurrogate);
+        // add (previously matched input length) + (this match length) - (substitution length)
        cumulativeDiff += inputSegment.length() + yylength() - 2;
+        // position the correction at (already output length) + (substitution length)
        addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
        inputSegment.clear();
        yybegin(YYINITIAL);
@ -31605,6 +31614,34 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
+        case 102: break;
+        case 36: 
+          { yybegin(YYINITIAL);
+    if (escapeBR) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      return outputSegment.nextChar();
+    } else {
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction at (already output length) + (substitution length)
+      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+      inputSegment.reset();
+      return BR_END_TAG_REPLACEMENT;
+    }
+          }
+        case 103: break;
+        case 38: 
+          { yybegin(restoreState);
+          }
+        case 104: break;
+        case 41: 
+          { yybegin(STYLE_COMMENT);
+          }
+        case 105: break;
+        case 1: 
+          { return zzBuffer[zzStartRead];
+          }
        case 106: break;
        default: 
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
@ -293,7 +293,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
    case START_TAG_TAIL_EXCLUDE:
    case SERVER_SIDE_INCLUDE:
    case START_TAG_TAIL_SUBSTITUTE: { // Exclude
+      // add (length of input that won't be output) [ - (substitution length) = 0 ]
      cumulativeDiff += yychar - inputStart;
+      // position the correction at (already output length) [ + (substitution length) = 0 ]
      addOffCorrectMap(outputCharCount, cumulativeDiff);
      outputSegment.clear();
      eofReturnValue = -1;
@ -301,7 +303,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
    }
    case CHARACTER_REFERENCE_TAIL: {        // Substitute
      // At end of file, allow char refs without semicolons
+      // add (length of input that won't be output) - (substitution length)
      cumulativeDiff += inputSegment.length() - outputSegment.length();
+      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
      eofReturnValue = outputSegment.nextChar();
      break;
@ -374,7 +378,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(10, 14) + "'";
    }
+    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 2;
+    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -403,7 +409,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(10, 14) + "'";
      }
+      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
+      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
@ -437,7 +445,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
      outputSegment = entitySegment;
      outputSegment.clear();
      outputSegment.unsafeWrite(lowSurrogate);
+      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
+      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
@ -472,7 +482,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
        outputSegment = entitySegment;
        outputSegment.clear();
        outputSegment.unsafeWrite(lowSurrogate);
+        // add (previously matched input length) + (this match length) - (substitution length)
        cumulativeDiff += inputSegment.length() + yylength() - 2;
+        // position the correction at (already output length) + (substitution length)
        addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
        inputSegment.clear();
        yybegin(YYINITIAL);
@ -557,8 +569,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |

 <CHARACTER_REFERENCE_TAIL> {
  ";" {
-    cumulativeDiff
-        += inputSegment.length() + yylength() - outputSegment.length();
+    // add (previously matched input length) + (this match length) - (substitution length)
+    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
+    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
@ -574,9 +587,10 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
-      cumulativeDiff
-          += inputSegment.length() + yylength() - outputSegment.length();
-      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction at (already output length) + (substitution length)
+      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_END_TAG_REPLACEMENT;
    }
@ -612,7 +626,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |

 <END_TAG_TAIL_EXCLUDE> {
  \s* ">" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -621,7 +637,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |

 <END_TAG_TAIL_SUBSTITUTE> {
  \s* ">" {
+    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -637,7 +655,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
  }
  "?" [^>]* [/?] ">" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -649,8 +669,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
-      cumulativeDiff
-          += inputSegment.length() + yylength() - outputSegment.length();
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
@ -708,7 +729,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |

 <START_TAG_TAIL_EXCLUDE> {
   ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
@ -718,7 +741,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |

 <START_TAG_TAIL_SUBSTITUTE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s*  "/"? ">" {
+    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -729,7 +754,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <BANG> {
  "--" { yybegin(COMMENT); }
  ">" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -742,7 +769,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
  // [21] CDEnd   ::= ']]>'
  //
  "[CDATA[" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(CDATA);
@ -754,7 +783,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |

 <CDATA> {
  "]]>" {
+    // add (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    yybegin(YYINITIAL);
  }
@ -764,7 +795,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
 <COMMENT> {
  "<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
  "-->" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
    cumulativeDiff += yychar - inputStart + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
@ -820,19 +853,23 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
  "</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
    inputSegment.clear();
    yybegin(YYINITIAL);
+    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
-    int outputEnd = outputCharCount;
+    // position at (already output length) -- substitution handled below
+    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
+      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
-      ++outputEnd;
+      // add (substitution length)
+      ++offsetCorrectionPos;
      returnValue = SCRIPT_REPLACEMENT;
    }
-    addOffCorrectMap(outputEnd, cumulativeDiff);
+    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
  }
  [^] { }
@ -843,19 +880,23 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
  "</" \s* [sS][tT][yY][lL][eE] \s* ">" {
    inputSegment.clear();
    yybegin(YYINITIAL);
+    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
-    int outputEnd = outputCharCount;
+    // position the offset correction at (already output length) -- substitution handled below
+    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
+      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
-      ++outputEnd;
+      // add (substitution length)
+      ++offsetCorrectionPos;
      returnValue = STYLE_REPLACEMENT;
    }
-    addOffCorrectMap(outputEnd, cumulativeDiff);
+    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
  }
  [^] { }
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
@ -36,6 +36,21 @@ import org.apache.lucene.util._TestUtil;

 public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {

+  static private Analyzer newTestAnalyzer() {
+    return new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new HTMLStripCharFilter(CharReader.get(reader));
+      }
+    };
+  }
+
  //this is some text  here is a  link  and another  link . This is an entity: & plus a <.  Here is an &
  //
  public void test() throws IOException {
@ -493,41 +508,17 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
  }

  public void testRandom() throws Exception {
-    Analyzer analyzer = new Analyzer() {
-
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-
-      @Override
-      protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(reader));
-      }
-    };
-    
    int numRounds = RANDOM_MULTIPLIER * 10000;
-    checkRandomData(random, analyzer, numRounds);
+    checkRandomData(random, newTestAnalyzer(), numRounds);
  }
  
  public void testRandomHugeStrings() throws Exception {
-    Analyzer analyzer = new Analyzer() {
-
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-
-      @Override
-      protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(reader));
-      }
-    };
-    
    int numRounds = RANDOM_MULTIPLIER * 200;
-    checkRandomData(random, analyzer, numRounds, 8192);
+    checkRandomData(random, newTestAnalyzer(), numRounds, 8192);
+  }
+
+  public void testCloseBR() throws Exception {
+    checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), " Secretary)</br> [[M");
  }
  
  public void testServerSideIncludes() throws Exception {
@ -797,9 +788,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
  public void testRandomBrokenHTML() throws Exception {
    int maxNumElements = 10000;
    String text = _TestUtil.randomHtmlishString(random, maxNumElements);
-    Reader reader = new HTMLStripCharFilter
-        (CharReader.get(new StringReader(text)));
-    while (reader.read() != -1);
+    checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), text);
  }

  public void testRandomText() throws Exception {
@ -838,18 +827,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
  }

  public void testUTF16Surrogates() throws Exception {
-    Analyzer analyzer = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-
-      @Override
-      protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
-      }
-    };
+    Analyzer analyzer = newTestAnalyzer();
    // Paired surrogates
    assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
        new String[] { "one", "two", "\uD86C\uDC01three" } );