LUCENE-4291: reduce jflex buffer sizes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1369883 13f79535-47bb-0310-9956-ffa450edef68
2012-08-06 17:16:47 +00:00 · 2012-08-06 17:16:47 +00:00 · 9898822e28
parent 3930247e77
commit 9898822e28
12 changed files with 443 additions and 435 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -120,6 +120,10 @@ Optimizations
  making them substantially more lightweight. Behavior is unchanged. 
  (Robert Muir)
 * LUCENE-4291: Reduced internal buffer size for JFlex-based tokenizers
  such as StandardTokenizer from 32KB to 8KB.
  (Raintung Li, Steven Rowe, Robert Muir)
 Bug Fixes
 * LUCENE-4109: BooleanQueries are not parsed correctly with the 
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/26/12 6:22 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
 package org.apache.lucene.analysis.charfilter;
@ -40,8 +40,8 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
 /**
 * This class is a scanner generated by 
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 7/26/12 6:22 PM from the specification file
+ * on 8/6/12 11:57 AM from the specification file
- * <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
+ * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
 */
 public final class HTMLStripCharFilter extends BaseCharFilter {
@ -31255,6 +31255,93 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
          { yybegin(STYLE);
          }
        case 55: break;
        case 27: 
          { // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
          }
        case 56: break;
        case 30: 
          { int length = yylength();
    inputSegment.write(zzBuffer, zzStartRead, length);
    entitySegment.clear();
    char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
    entitySegment.append(ch);
    outputSegment = entitySegment;
    yybegin(CHARACTER_REFERENCE_TAIL);
          }
        case 57: break;
        case 48: 
          { inputSegment.clear();
    yybegin(YYINITIAL);
    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
    // position the offset correction at (already output length) -- substitution handled below
    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
      // add (substitution length)
      ++offsetCorrectionPos;
      returnValue = STYLE_REPLACEMENT;
    }
    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
          }
        case 58: break;
        case 8: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(START_TAG_TAIL_INCLUDE);
    } else {
      yybegin(START_TAG_TAIL_SUBSTITUTE);
    }
          }
        case 59: break;
        case 2: 
          { inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('<');
  yybegin(LEFT_ANGLE_BRACKET);
          }
        case 60: break;
        case 44: 
          { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
        case 61: break;
        case 21: 
          { previousRestoreState = restoreState;
    restoreState = SERVER_SIDE_INCLUDE;
    yybegin(SINGLE_QUOTED_STRING);
          }
        case 62: break;
        case 11: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
          }
        case 63: break;
        case 35: 
          { yybegin(SCRIPT);
          }
        case 64: break;
        case 42: 
          { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
        case 65: break;
        case 10: 
          { inputSegment.append('!'); yybegin(BANG);
          }
        case 66: break;
        case 51: 
          { // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
@ -31288,13 +31375,331 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
-        case 56: break;
+        case 67: break;
-        case 21: 
+        case 4: 
          { yypushback(1);
    outputSegment = inputSegment;
    outputSegment.restart();
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
          }
        case 68: break;
        case 43: 
          { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
        case 69: break;
        case 52: 
          { // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try { // High surrogates are in decimal range [55296, 56319]
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(1, 6) + "'";
    }
    if (Character.isHighSurrogate(highSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      try {
        outputSegment.unsafeWrite
            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
      } catch(Exception e) { // should never happen
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(10, 14) + "'";
      }
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
        case 70: break;
        case 28: 
          { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
        case 71: break;
        case 50: 
          { // Handle paired UTF-16 surrogates.
    outputSegment = entitySegment;
    outputSegment.clear();
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try {
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(2, 6) + "'";
    }
    try {
      outputSegment.unsafeWrite
          ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(10, 14) + "'";
    }
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 2;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return highSurrogate;
          }
        case 72: break;
        case 16: 
          { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
        case 73: break;
        case 22: 
          { previousRestoreState = restoreState;
    restoreState = SERVER_SIDE_INCLUDE;
-    yybegin(SINGLE_QUOTED_STRING);
+    yybegin(DOUBLE_QUOTED_STRING);
          }
-        case 57: break;
+        case 74: break;
        case 26: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
          }
        case 75: break;
        case 20: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
          }
        case 76: break;
        case 47: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(CDATA);
          }
        case 77: break;
        case 33: 
          { yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
    }
          }
        case 78: break;
        case 23: 
          { yybegin(restoreState); restoreState = previousRestoreState;
          }
        case 79: break;
        case 32: 
          { yybegin(COMMENT);
          }
        case 80: break;
        case 24: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
     outputSegment = inputSegment;
     yybegin(YYINITIAL);
     return outputSegment.nextChar();
          }
        case 81: break;
        case 3: 
          { inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('&');
  yybegin(AMPERSAND);
          }
        case 82: break;
        case 46: 
          { yybegin(SCRIPT);
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
          }
        case 83: break;
        case 14: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
          }
        case 84: break;
        case 6: 
          { int matchLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, matchLength);
    if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
      String decimalCharRef = yytext();
      int codePoint = 0;
      try {
        codePoint = Integer.parseInt(decimalCharRef);
      } catch(Exception e) {
        assert false: "Exception parsing code point '" + decimalCharRef + "'";
      }
      if (codePoint <= 0x10FFFF) {
        outputSegment = entitySegment;
        outputSegment.clear();
        if (codePoint >= Character.MIN_SURROGATE
            && codePoint <= Character.MAX_SURROGATE) {
          outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
        } else {
          outputSegment.setLength
              (Character.toChars(codePoint, outputSegment.getArray(), 0));
        }
        yybegin(CHARACTER_REFERENCE_TAIL);
      } else {
        outputSegment = inputSegment;
        yybegin(YYINITIAL);
        return outputSegment.nextChar();
      }
    } else {
      outputSegment = inputSegment;
      yybegin(YYINITIAL);
      return outputSegment.nextChar();
    }
          }
        case 85: break;
        case 34: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
    cumulativeDiff += yychar - inputStart + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
          }
        case 86: break;
        case 5: 
          { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
          }
        case 87: break;
        case 13: 
          { inputSegment.append(zzBuffer[zzStartRead]);
          }
        case 88: break;
        case 18: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_SUBSTITUTE);
    }
          }
        case 89: break;
        case 40: 
          { yybegin(SCRIPT_COMMENT);
          }
        case 90: break;
        case 37: 
          { // add (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    yybegin(YYINITIAL);
          }
        case 91: break;
        case 12: 
          { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
          }
        case 92: break;
        case 9: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(START_TAG_TAIL_INCLUDE);
    } else {
      yybegin(START_TAG_TAIL_EXCLUDE);
    }
          }
        case 93: break;
        case 49: 
          { inputSegment.clear();
    yybegin(YYINITIAL);
    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
    // position at (already output length) -- substitution handled below
    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
      // add (substitution length)
      ++offsetCorrectionPos;
      returnValue = SCRIPT_REPLACEMENT;
    }
    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
          }
        case 94: break;
        case 29: 
          { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
        case 95: break;
        case 17: 
          { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
        case 96: break;
        case 45: 
          { yybegin(STYLE);
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
          }
        case 97: break;
        case 7: 
          { // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
          }
        case 98: break;
        case 19: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_EXCLUDE);
    }
          }
        case 99: break;
        case 25: 
          { // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
          }
        case 100: break;
        case 31: 
          { int matchLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, matchLength);
@ -31329,66 +31734,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      return outputSegment.nextChar();
    }
          }
-        case 58: break;
+        case 101: break;
        case 19: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_EXCLUDE);
    }
          }
        case 59: break;
        case 2: 
          { inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('<');
  yybegin(LEFT_ANGLE_BRACKET);
          }
        case 60: break;
        case 27: 
          { // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
          }
        case 61: break;
        case 44: 
          { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
        case 62: break;
        case 35: 
          { yybegin(SCRIPT);
          }
        case 63: break;
        case 42: 
          { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
        case 64: break;
        case 10: 
          { inputSegment.append('!'); yybegin(BANG);
          }
        case 65: break;
        case 33: 
          { yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
    }
          }
        case 66: break;
        case 53: 
          { // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
@ -31424,288 +31770,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
-        case 67: break;
+        case 102: break;
        case 43: 
          { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
        case 68: break;
        case 30: 
          { int length = yylength();
    inputSegment.write(zzBuffer, zzStartRead, length);
    entitySegment.clear();
    char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
    entitySegment.append(ch);
    outputSegment = entitySegment;
    yybegin(CHARACTER_REFERENCE_TAIL);
          }
        case 69: break;
        case 28: 
          { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
        case 70: break;
        case 3: 
          { inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('&');
  yybegin(AMPERSAND);
          }
        case 71: break;
        case 16: 
          { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
        case 72: break;
        case 52: 
          { // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try { // High surrogates are in decimal range [55296, 56319]
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(1, 6) + "'";
    }
    if (Character.isHighSurrogate(highSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      try {
        outputSegment.unsafeWrite
            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
      } catch(Exception e) { // should never happen
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(10, 14) + "'";
      }
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
          }
        case 73: break;
        case 6: 
          { int matchLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, matchLength);
    if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
      String decimalCharRef = yytext();
      int codePoint = 0;
      try {
        codePoint = Integer.parseInt(decimalCharRef);
      } catch(Exception e) {
        assert false: "Exception parsing code point '" + decimalCharRef + "'";
      }
      if (codePoint <= 0x10FFFF) {
        outputSegment = entitySegment;
        outputSegment.clear();
        if (codePoint >= Character.MIN_SURROGATE
            && codePoint <= Character.MAX_SURROGATE) {
          outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
        } else {
          outputSegment.setLength
              (Character.toChars(codePoint, outputSegment.getArray(), 0));
        }
        yybegin(CHARACTER_REFERENCE_TAIL);
      } else {
        outputSegment = inputSegment;
        yybegin(YYINITIAL);
        return outputSegment.nextChar();
      }
    } else {
      outputSegment = inputSegment;
      yybegin(YYINITIAL);
      return outputSegment.nextChar();
    }
          }
        case 74: break;
        case 37: 
          { // add (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    yybegin(YYINITIAL);
          }
        case 75: break;
        case 8: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(START_TAG_TAIL_INCLUDE);
    } else {
      yybegin(START_TAG_TAIL_SUBSTITUTE);
    }
          }
        case 76: break;
        case 46: 
          { yybegin(SCRIPT);
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
          }
        case 77: break;
        case 11: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
          }
        case 78: break;
        case 20: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
          }
        case 79: break;
        case 34: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
    cumulativeDiff += yychar - inputStart + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
          }
        case 80: break;
        case 23: 
          { yybegin(restoreState); restoreState = previousRestoreState;
          }
        case 81: break;
        case 32: 
          { yybegin(COMMENT);
          }
        case 82: break;
        case 14: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
          }
        case 83: break;
        case 18: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_SUBSTITUTE);
    }
          }
        case 84: break;
        case 25: 
          { // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
          }
        case 85: break;
        case 7: 
          { // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
          }
        case 86: break;
        case 48: 
          { inputSegment.clear();
    yybegin(YYINITIAL);
    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
    // position the offset correction at (already output length) -- substitution handled below
    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
      // add (substitution length)
      ++offsetCorrectionPos;
      returnValue = STYLE_REPLACEMENT;
    }
    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
          }
        case 87: break;
        case 5: 
          { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
          }
        case 88: break;
        case 26: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
          }
        case 89: break;
        case 13: 
          { inputSegment.append(zzBuffer[zzStartRead]);
          }
        case 90: break;
        case 50: 
          { // Handle paired UTF-16 surrogates.
    outputSegment = entitySegment;
    outputSegment.clear();
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try {
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(2, 6) + "'";
    }
    try {
      outputSegment.unsafeWrite
          ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(10, 14) + "'";
    }
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 2;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return highSurrogate;
          }
        case 91: break;
        case 40: 
          { yybegin(SCRIPT_COMMENT);
          }
        case 92: break;
        case 45: 
          { yybegin(STYLE);
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
          }
        case 93: break;
        case 22: 
          { previousRestoreState = restoreState;
    restoreState = SERVER_SIDE_INCLUDE;
    yybegin(DOUBLE_QUOTED_STRING);
          }
        case 94: break;
        case 12: 
          { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
          }
        case 95: break;
        case 36: 
          { yybegin(YYINITIAL);
    if (escapeBR) {
@ -31721,83 +31786,18 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      return BR_END_TAG_REPLACEMENT;
    }
          }
-        case 96: break;
+        case 103: break;
        case 24: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
     outputSegment = inputSegment;
     yybegin(YYINITIAL);
     return outputSegment.nextChar();
          }
        case 97: break;
        case 47: 
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(CDATA);
          }
        case 98: break;
        case 29: 
          { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
        case 99: break;
        case 17: 
          { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
        case 100: break;
        case 9: 
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(START_TAG_TAIL_INCLUDE);
    } else {
      yybegin(START_TAG_TAIL_EXCLUDE);
    }
          }
        case 101: break;
        case 49: 
          { inputSegment.clear();
    yybegin(YYINITIAL);
    // add (previously matched input length) -- current match and substitution handled below
    cumulativeDiff += yychar - inputStart;
    // position at (already output length) -- substitution handled below
    int offsetCorrectionPos = outputCharCount;
    int returnValue;
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      returnValue = outputSegment.nextChar();
    } else {
      // add (this match length) - (substitution length)
      cumulativeDiff += yylength() - 1;
      // add (substitution length)
      ++offsetCorrectionPos;
      returnValue = SCRIPT_REPLACEMENT;
    }
    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
    return returnValue;
          }
        case 102: break;
        case 38: 
          { yybegin(restoreState);
          }
-        case 103: break;
+        case 104: break;
        case 41: 
          { yybegin(STYLE_COMMENT);
          }
-        case 104: break;
+        case 105: break;
        case 1: 
          { return zzBuffer[zzStartRead];
          }
        case 105: break;
        case 4: 
          { yypushback(1);
    outputSegment = inputSegment;
    outputSegment.restart();
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
          }
        case 106: break;
        default: 
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
@ -141,9 +141,9 @@ InlineElment = ( [aAbBiIqQsSuU]                   |
                 [vV][aA][rR]                     )
-%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
+%include HTMLCharacterEntities.jflex
-%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
+%include HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
 %{
  private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java
@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
 package org.apache.lucene.analysis.standard;
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * This class is a scanner generated by 
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 7/15/12 1:57 AM from the specification file
+ * on 8/6/12 11:57 AM from the specification file
- * <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
 */
 class ClassicTokenizerImpl implements StandardTokenizerInterface {
@ -42,7 +42,7 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
  public static final int YYEOF = -1;
  /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 16384;
+  private static final int ZZ_BUFFERSIZE = 4096;
  /** lexical states */
  public static final int YYINITIAL = 0;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
@ -36,6 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 %function getNextToken
 %pack
 %char
+%buffer 4096
 %{
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
@ -14,7 +14,7 @@
 * limitations under the License.
 */
-// Generated using ICU4J 49.1.0.0 on Thursday, July 26, 2012 10:22:01 PM UTC
+// Generated using ICU4J 49.1.0.0 on Monday, August 6, 2012 3:57:23 PM UTC
 // by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/26/12 6:22 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
 package org.apache.lucene.analysis.standard;
@ -43,7 +43,7 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
  public static final int YYEOF = -1;
  /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 16384;
+  private static final int ZZ_BUFFERSIZE = 4096;
  /** lexical states */
  public static final int YYINITIAL = 0;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@ -44,8 +44,9 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 %implements StandardTokenizerInterface
 %function getNextToken
 %char
+%buffer 4096
-%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+%include SUPPLEMENTARY.jflex-macro
 ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
 Format =  ([\p{WB:Format}] | {FormatSupp})
 Numeric = ([\p{WB:Numeric}] | {NumericSupp})
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.java
@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/26/12 6:22 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
 package org.apache.lucene.analysis.standard;
@ -46,7 +46,7 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
  public static final int YYEOF = -1;
  /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 16384;
+  private static final int ZZ_BUFFERSIZE = 4096;
  /** lexical states */
  public static final int YYINITIAL = 0;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
@ -47,8 +47,9 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 %implements StandardTokenizerInterface
 %function getNextToken
 %char
+%buffer 4096
-%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
+%include SUPPLEMENTARY.jflex-macro
 ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
 Format =  ([\p{WB:Format}] | {FormatSupp})
 Numeric = ([\p{WB:Numeric}] | {NumericSupp})
@ -88,7 +89,7 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
 //     RFC-5321: Simple Mail Transfer Protocol
 //     RFC-5322: Internet Message Format
-%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
+%include ASCIITLD.jflex-macro
 DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
 DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
 package org.apache.lucene.analysis.wikipedia;
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * This class is a scanner generated by 
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 7/15/12 1:57 AM from the specification file
+ * on 8/6/12 11:57 AM from the specification file
- * <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
+ * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
 */
 class WikipediaTokenizerImpl {
@ -34,7 +34,7 @@ class WikipediaTokenizerImpl {
  public static final int YYEOF = -1;
  /** initial size of the lookahead buffer */
-  private static final int ZZ_BUFFERSIZE = 16384;
+  private static final int ZZ_BUFFERSIZE = 4096;
  /** lexical states */
  public static final int THREE_SINGLE_QUOTES_STATE = 10;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 %function getNextToken
 %pack
 %char
+%buffer 4096
 %{