LUCENE-3913: Fix HTMLStripCharFilter invalid final offset for input containing </br>

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304912 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2012-03-24 20:54:31 +00:00
parent 1f7c31d711
commit ada9780484
5 changed files with 559 additions and 461 deletions

View File

@ -391,188 +391,194 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
checkAnalysisConsistency(random, a, useCharFilter, text);
}
}
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
}
int remainder = random.nextInt(10);
Reader reader = new StringReader(text);
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
List<String> tokens = new ArrayList<String>();
List<String> types = new ArrayList<String>();
List<Integer> positions = new ArrayList<Integer>();
List<Integer> positionLengths = new ArrayList<Integer>();
List<Integer> startOffsets = new ArrayList<Integer>();
List<Integer> endOffsets = new ArrayList<Integer>();
ts.reset();
// First pass: save away "correct" tokens
while (ts.incrementToken()) {
tokens.add(termAtt.toString());
if (typeAtt != null) types.add(typeAtt.type());
if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
if (offsetAtt != null) {
startOffsets.add(offsetAtt.startOffset());
endOffsets.add(offsetAtt.endOffset());
}
}
ts.end();
ts.close();
int remainder = random.nextInt(10);
Reader reader = new StringReader(text);
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
List<String> tokens = new ArrayList<String>();
List<String> types = new ArrayList<String>();
List<Integer> positions = new ArrayList<Integer>();
List<Integer> positionLengths = new ArrayList<Integer>();
List<Integer> startOffsets = new ArrayList<Integer>();
List<Integer> endOffsets = new ArrayList<Integer>();
ts.reset();
// verify reusing is "reproducable" and also get the normal tokenstream sanity checks
if (!tokens.isEmpty()) {
// First pass: save away "correct" tokens
while (ts.incrementToken()) {
tokens.add(termAtt.toString());
if (typeAtt != null) types.add(typeAtt.type());
if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
if (offsetAtt != null) {
startOffsets.add(offsetAtt.startOffset());
endOffsets.add(offsetAtt.endOffset());
}
}
ts.end();
ts.close();
// KWTokenizer (for example) can produce a token
// even when input is length 0:
if (text.length() != 0) {
// verify reusing is "reproducable" and also get the normal tokenstream sanity checks
if (!tokens.isEmpty()) {
// KWTokenizer (for example) can produce a token
// even when input is length 0:
if (text.length() != 0) {
// (Optional) second pass: do something evil:
final int evilness = random.nextInt(50);
if (evilness == 17) {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
}
// Throw an errant exception from the Reader:
MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
reader = evilReader;
try {
// NOTE: some Tokenizers go and read characters
// when you call .setReader(Reader), eg
// PatternTokenizer. This is a bit
// iffy... (really, they should only
// pull from the Reader when you call
// .incremenToken(), I think?), but we
// currently allow it, so, we must call
// a.tokenStream inside the try since we may
// hit the exc on init:
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
ts.reset();
while (ts.incrementToken());
fail("did not hit exception");
} catch (RuntimeException re) {
assertTrue(MockReaderWrapper.isMyEvilException(re));
}
try {
ts.end();
} catch (AssertionError ae) {
// Catch & ignore MockTokenizer's
// anger...
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
// OK
} else {
throw ae;
}
}
ts.close();
} else if (evilness == 7) {
// Only consume a subset of the tokens:
final int numTokensToRead = random.nextInt(tokens.size());
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
}
reader = new StringReader(text);
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
ts.reset();
for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
assertTrue(ts.incrementToken());
}
try {
ts.end();
} catch (AssertionError ae) {
// Catch & ignore MockTokenizer's
// anger...
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
// OK
} else {
throw ae;
}
}
ts.close();
}
}
// Final pass: verify clean tokenization matches
// results from first pass:
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
}
reader = new StringReader(text);
if (random.nextInt(30) == 7) {
// (Optional) second pass: do something evil:
final int evilness = random.nextInt(50);
if (evilness == 17) {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
}
// Throw an errant exception from the Reader:
MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
reader = evilReader;
try {
// NOTE: some Tokenizers go and read characters
// when you call .setReader(Reader), eg
// PatternTokenizer. This is a bit
// iffy... (really, they should only
// pull from the Reader when you call
// .incremenToken(), I think?), but we
// currently allow it, so, we must call
// a.tokenStream inside the try since we may
// hit the exc on init:
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
ts.reset();
while (ts.incrementToken());
fail("did not hit exception");
} catch (RuntimeException re) {
assertTrue(MockReaderWrapper.isMyEvilException(re));
}
try {
ts.end();
} catch (AssertionError ae) {
// Catch & ignore MockTokenizer's
// anger...
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
// OK
} else {
throw ae;
}
}
ts.close();
} else if (evilness == 7) {
// Only consume a subset of the tokens:
final int numTokensToRead = random.nextInt(tokens.size());
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
}
reader = new MockReaderWrapper(random, reader);
reader = new StringReader(text);
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
ts.reset();
for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
assertTrue(ts.incrementToken());
}
try {
ts.end();
} catch (AssertionError ae) {
// Catch & ignore MockTokenizer's
// anger...
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
// OK
} else {
throw ae;
}
}
ts.close();
}
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
null,
text.length());
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
null,
text.length());
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
null,
null,
text.length());
} else {
// terms only
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
// Final pass: verify clean tokenization matches
// results from first pass:
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
}
reader = new StringReader(text);
if (random.nextInt(30) == 7) {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
}
reader = new MockReaderWrapper(random, reader);
}
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
null,
text.length());
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
null,
text.length());
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
null,
null,
text.length());
} else {
// terms only
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
}
}

View File

@ -27,10 +27,7 @@ import java.io.OutputStream;
import java.io.PrintStream;
import java.lang.reflect.Method;
import java.nio.CharBuffer;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@ -414,12 +411,51 @@ public class _TestUtil {
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
case 21: sb.append("\n"); break;
case 22: sb.append(" ".substring(nextInt(random, 0, 10))); break;
case 23: {
sb.append("<");
if (0 == nextInt(random, 0, 3)) {
sb.append(" ".substring(nextInt(random, 1, 10)));
}
if (0 == nextInt(random, 0, 1)) {
sb.append("/");
if (0 == nextInt(random, 0, 3)) {
sb.append(" ".substring(nextInt(random, 1, 10)));
}
}
switch (nextInt(random, 0, 3)) {
case 0: sb.append(randomlyRecaseCodePoints(random, "script")); break;
case 1: sb.append(randomlyRecaseCodePoints(random, "style")); break;
case 2: sb.append(randomlyRecaseCodePoints(random, "br")); break;
// default: append nothing
}
sb.append(">".substring(nextInt(random, 0, 1)));
break;
}
default: sb.append(randomSimpleString(random));
}
}
return sb.toString();
}
/**
* Randomly upcases, downcases, or leaves intact each code point in the given string
*/
public static String randomlyRecaseCodePoints(Random random, String str) {
StringBuilder builder = new StringBuilder();
int pos = 0;
while (pos < str.length()) {
int codePoint = str.codePointAt(pos);
pos += Character.charCount(codePoint);
String codePointSubstring = new String(new int[] { codePoint }, 0, 1);
switch (nextInt(random, 0, 2)) {
case 0: builder.append(codePointSubstring.toUpperCase()); break;
case 1: builder.append(codePointSubstring.toLowerCase()); break;
case 2: builder.append(codePointSubstring); // leave intact
}
}
return builder.toString();
}
private static final int[] blockStarts = {
0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400,
0x0500, 0x0530, 0x0590, 0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800,

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/23/12 2:15 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/24/12 4:50 PM */
package org.apache.lucene.analysis.charfilter;
@ -39,7 +39,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 1/23/12 2:15 AM from the specification file
* on 3/24/12 4:50 PM from the specification file
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
*/
public final class HTMLStripCharFilter extends BaseCharFilter {
@ -30967,7 +30967,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
case START_TAG_TAIL_EXCLUDE:
case SERVER_SIDE_INCLUDE:
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
// add (length of input that won't be output) [ - (substitution length) = 0 ]
cumulativeDiff += yychar - inputStart;
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
outputSegment.clear();
eofReturnValue = -1;
@ -30975,7 +30977,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
case CHARACTER_REFERENCE_TAIL: { // Substitute
// At end of file, allow char refs without semicolons
// add (length of input that won't be output) - (substitution length)
cumulativeDiff += inputSegment.length() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
break;
@ -31095,6 +31099,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
{ yybegin(STYLE);
}
case 55: break;
case 27:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
}
case 56: break;
case 30:
{ int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
@ -31104,7 +31118,30 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
case 56: break;
case 57: break;
case 48:
{ inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position the offset correction at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 58: break;
case 8:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
@ -31114,71 +31151,75 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
}
case 57: break;
case 26:
{ cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
}
case 58: break;
case 59: break;
case 2:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
}
case 59: break;
case 34:
{ cumulativeDiff += yychar - inputStart + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
case 60: break;
case 47:
{ cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
case 61: break;
case 27:
{ cumulativeDiff += inputSegment.length() + yylength() - 1;
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
}
case 62: break;
case 44:
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 63: break;
case 61: break;
case 21:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
}
case 64: break;
case 62: break;
case 11:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
case 65: break;
case 63: break;
case 35:
{ yybegin(SCRIPT);
}
case 66: break;
case 64: break;
case 42:
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 67: break;
case 65: break;
case 10:
{ inputSegment.append('!'); yybegin(BANG);
}
case 68: break;
case 66: break;
case 51:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
char lowSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 67: break;
case 4:
{ yypushback(1);
outputSegment = inputSegment;
@ -31186,37 +31227,48 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 69: break;
case 48:
{ inputSegment.clear();
yybegin(YYINITIAL);
cumulativeDiff += yychar - inputStart;
int outputEnd = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
cumulativeDiff += yylength() - 1;
++outputEnd;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(outputEnd, cumulativeDiff);
return returnValue;
}
case 70: break;
case 68: break;
case 43:
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 71: break;
case 14:
{ cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
case 69: break;
case 52:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 72: break;
case 70: break;
case 28:
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 71: break;
case 50:
{ // Handle paired UTF-16 surrogates.
outputSegment = entitySegment;
@ -31236,49 +31288,63 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
case 73: break;
case 28:
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 74: break;
case 49:
{ inputSegment.clear();
yybegin(YYINITIAL);
cumulativeDiff += yychar - inputStart;
int outputEnd = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
cumulativeDiff += yylength() - 1;
++outputEnd;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(outputEnd, cumulativeDiff);
return returnValue;
}
case 75: break;
case 72: break;
case 16:
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 76: break;
case 73: break;
case 22:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
}
case 77: break;
case 74: break;
case 26:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
}
case 75: break;
case 20:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
}
case 76: break;
case 47:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
case 77: break;
case 33:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
case 78: break;
case 23:
{ yybegin(restoreState); restoreState = previousRestoreState;
@ -31288,28 +31354,20 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
{ yybegin(COMMENT);
}
case 80: break;
case 25:
{ cumulativeDiff += inputSegment.length() + yylength() - 1;
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
}
case 81: break;
case 24:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 82: break;
case 81: break;
case 3:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
case 83: break;
case 82: break;
case 46:
{ yybegin(SCRIPT);
if (escapeSCRIPT) {
@ -31319,6 +31377,15 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
return outputSegment.nextChar();
}
}
case 83: break;
case 14:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
case 84: break;
case 6:
{ int matchLength = yylength();
@ -31354,14 +31421,23 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
}
case 85: break;
case 34:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
cumulativeDiff += yychar - inputStart + yylength();
// position the correction at (already output length) [ + (substitution length) = 0]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
case 86: break;
case 5:
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
}
case 86: break;
case 87: break;
case 13:
{ inputSegment.append(zzBuffer[zzStartRead]);
}
case 87: break;
case 88: break;
case 18:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
@ -31369,93 +31445,25 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_SUBSTITUTE);
}
}
case 88: break;
case 36:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
}
case 89: break;
case 33:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
case 90: break;
case 40:
{ yybegin(SCRIPT_COMMENT);
}
case 91: break;
case 90: break;
case 37:
{ cumulativeDiff += yylength();
{ // add (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
case 92: break;
case 91: break;
case 12:
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
}
case 93: break;
case 51:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
char lowSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 94: break;
case 7:
{ cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 95: break;
case 92: break;
case 9:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
@ -31465,15 +31473,38 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(START_TAG_TAIL_EXCLUDE);
}
}
case 96: break;
case 93: break;
case 49:
{ inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 94: break;
case 29:
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
case 97: break;
case 95: break;
case 17:
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
case 98: break;
case 96: break;
case 45:
{ yybegin(STYLE);
if (escapeSTYLE) {
@ -31483,7 +31514,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
return outputSegment.nextChar();
}
}
case 99: break;
case 97: break;
case 7:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 98: break;
case 19:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
@ -31493,6 +31533,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(END_TAG_TAIL_EXCLUDE);
}
}
case 99: break;
case 25:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
}
case 100: break;
case 31:
{ int matchLength = yylength();
@ -31529,49 +31579,6 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
}
case 101: break;
case 38:
{ yybegin(restoreState);
}
case 102: break;
case 41:
{ yybegin(STYLE_COMMENT);
}
case 103: break;
case 1:
{ return zzBuffer[zzStartRead];
}
case 104: break;
case 52:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 105: break;
case 53:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
@ -31594,7 +31601,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -31605,6 +31614,34 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 102: break;
case 36:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
}
case 103: break;
case 38:
{ yybegin(restoreState);
}
case 104: break;
case 41:
{ yybegin(STYLE_COMMENT);
}
case 105: break;
case 1:
{ return zzBuffer[zzStartRead];
}
case 106: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {

View File

@ -293,7 +293,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
case START_TAG_TAIL_EXCLUDE:
case SERVER_SIDE_INCLUDE:
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
// add (length of input that won't be output) [ - (substitution length) = 0 ]
cumulativeDiff += yychar - inputStart;
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
outputSegment.clear();
eofReturnValue = -1;
@ -301,7 +303,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
}
case CHARACTER_REFERENCE_TAIL: { // Substitute
// At end of file, allow char refs without semicolons
// add (length of input that won't be output) - (substitution length)
cumulativeDiff += inputSegment.length() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
break;
@ -374,7 +378,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -403,7 +409,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -437,7 +445,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -472,7 +482,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -557,8 +569,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
<CHARACTER_REFERENCE_TAIL> {
";" {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
@ -574,9 +587,10 @@ InlineElment = ( [aAbBiIqQsSuU] |
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
@ -612,7 +626,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
<END_TAG_TAIL_EXCLUDE> {
\s* ">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -621,7 +637,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
<END_TAG_TAIL_SUBSTITUTE> {
\s* ">" {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -637,7 +655,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
"?" [^>]* [/?] ">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -649,8 +669,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
@ -708,7 +729,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
<START_TAG_TAIL_EXCLUDE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
@ -718,7 +741,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
<START_TAG_TAIL_SUBSTITUTE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -729,7 +754,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
<BANG> {
"--" { yybegin(COMMENT); }
">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -742,7 +769,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
// [21] CDEnd ::= ']]>'
//
"[CDATA[" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
@ -754,7 +783,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
<CDATA> {
"]]>" {
// add (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
@ -764,7 +795,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
<COMMENT> {
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"-->" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0]
cumulativeDiff += yychar - inputStart + yylength();
// position the correction at (already output length) [ + (substitution length) = 0]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@ -820,19 +853,23 @@ InlineElment = ( [aAbBiIqQsSuU] |
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
int outputEnd = outputCharCount;
// position at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
++outputEnd;
// add (substitution length)
++offsetCorrectionPos;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(outputEnd, cumulativeDiff);
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
[^] { }
@ -843,19 +880,23 @@ InlineElment = ( [aAbBiIqQsSuU] |
"</" \s* [sS][tT][yY][lL][eE] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
int outputEnd = outputCharCount;
// position the offset correction at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
++outputEnd;
// add (substitution length)
++offsetCorrectionPos;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(outputEnd, cumulativeDiff);
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
[^] { }

View File

@ -36,6 +36,21 @@ import org.apache.lucene.util._TestUtil;
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
static private Analyzer newTestAnalyzer() {
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new HTMLStripCharFilter(CharReader.get(reader));
}
};
}
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
//
public void test() throws IOException {
@ -493,41 +508,17 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testRandom() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new HTMLStripCharFilter(CharReader.get(reader));
}
};
int numRounds = RANDOM_MULTIPLIER * 10000;
checkRandomData(random, analyzer, numRounds);
checkRandomData(random, newTestAnalyzer(), numRounds);
}
public void testRandomHugeStrings() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new HTMLStripCharFilter(CharReader.get(reader));
}
};
int numRounds = RANDOM_MULTIPLIER * 200;
checkRandomData(random, analyzer, numRounds, 8192);
checkRandomData(random, newTestAnalyzer(), numRounds, 8192);
}
public void testCloseBR() throws Exception {
checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), " Secretary)</br> [[M");
}
public void testServerSideIncludes() throws Exception {
@ -797,9 +788,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
public void testRandomBrokenHTML() throws Exception {
int maxNumElements = 10000;
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(text)));
while (reader.read() != -1);
checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), text);
}
public void testRandomText() throws Exception {
@ -838,18 +827,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testUTF16Surrogates() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
}
};
Analyzer analyzer = newTestAnalyzer();
// Paired surrogates
assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
new String[] { "one", "two", "\uD86C\uDC01three" } );