mirror of https://github.com/apache/lucene.git
LUCENE-3913: Fix HTMLStripCharFilter invalid final offset for input containing </br>
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304912 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1f7c31d711
commit
ada9780484
|
@ -391,188 +391,194 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (VERBOSE) {
|
checkAnalysisConsistency(random, a, useCharFilter, text);
|
||||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
|
||||||
|
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
|
||||||
|
}
|
||||||
|
|
||||||
|
int remainder = random.nextInt(10);
|
||||||
|
Reader reader = new StringReader(text);
|
||||||
|
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||||
|
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
|
||||||
|
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
|
||||||
|
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
|
||||||
|
PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
|
||||||
|
PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
|
||||||
|
TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
|
||||||
|
List<String> tokens = new ArrayList<String>();
|
||||||
|
List<String> types = new ArrayList<String>();
|
||||||
|
List<Integer> positions = new ArrayList<Integer>();
|
||||||
|
List<Integer> positionLengths = new ArrayList<Integer>();
|
||||||
|
List<Integer> startOffsets = new ArrayList<Integer>();
|
||||||
|
List<Integer> endOffsets = new ArrayList<Integer>();
|
||||||
|
ts.reset();
|
||||||
|
|
||||||
|
// First pass: save away "correct" tokens
|
||||||
|
while (ts.incrementToken()) {
|
||||||
|
tokens.add(termAtt.toString());
|
||||||
|
if (typeAtt != null) types.add(typeAtt.type());
|
||||||
|
if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
|
||||||
|
if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
|
||||||
|
if (offsetAtt != null) {
|
||||||
|
startOffsets.add(offsetAtt.startOffset());
|
||||||
|
endOffsets.add(offsetAtt.endOffset());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
ts.end();
|
||||||
|
ts.close();
|
||||||
|
|
||||||
int remainder = random.nextInt(10);
|
// verify reusing is "reproducable" and also get the normal tokenstream sanity checks
|
||||||
Reader reader = new StringReader(text);
|
if (!tokens.isEmpty()) {
|
||||||
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
|
||||||
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
|
|
||||||
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
|
|
||||||
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
|
|
||||||
PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
|
|
||||||
PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
|
|
||||||
TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
|
|
||||||
List<String> tokens = new ArrayList<String>();
|
|
||||||
List<String> types = new ArrayList<String>();
|
|
||||||
List<Integer> positions = new ArrayList<Integer>();
|
|
||||||
List<Integer> positionLengths = new ArrayList<Integer>();
|
|
||||||
List<Integer> startOffsets = new ArrayList<Integer>();
|
|
||||||
List<Integer> endOffsets = new ArrayList<Integer>();
|
|
||||||
ts.reset();
|
|
||||||
|
|
||||||
// First pass: save away "correct" tokens
|
// KWTokenizer (for example) can produce a token
|
||||||
while (ts.incrementToken()) {
|
// even when input is length 0:
|
||||||
tokens.add(termAtt.toString());
|
if (text.length() != 0) {
|
||||||
if (typeAtt != null) types.add(typeAtt.type());
|
|
||||||
if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
|
|
||||||
if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
|
|
||||||
if (offsetAtt != null) {
|
|
||||||
startOffsets.add(offsetAtt.startOffset());
|
|
||||||
endOffsets.add(offsetAtt.endOffset());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ts.end();
|
|
||||||
ts.close();
|
|
||||||
|
|
||||||
// verify reusing is "reproducable" and also get the normal tokenstream sanity checks
|
// (Optional) second pass: do something evil:
|
||||||
if (!tokens.isEmpty()) {
|
final int evilness = random.nextInt(50);
|
||||||
|
if (evilness == 17) {
|
||||||
// KWTokenizer (for example) can produce a token
|
|
||||||
// even when input is length 0:
|
|
||||||
if (text.length() != 0) {
|
|
||||||
|
|
||||||
// (Optional) second pass: do something evil:
|
|
||||||
final int evilness = random.nextInt(50);
|
|
||||||
if (evilness == 17) {
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
|
|
||||||
}
|
|
||||||
// Throw an errant exception from the Reader:
|
|
||||||
|
|
||||||
MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
|
|
||||||
evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
|
|
||||||
reader = evilReader;
|
|
||||||
|
|
||||||
try {
|
|
||||||
// NOTE: some Tokenizers go and read characters
|
|
||||||
// when you call .setReader(Reader), eg
|
|
||||||
// PatternTokenizer. This is a bit
|
|
||||||
// iffy... (really, they should only
|
|
||||||
// pull from the Reader when you call
|
|
||||||
// .incremenToken(), I think?), but we
|
|
||||||
// currently allow it, so, we must call
|
|
||||||
// a.tokenStream inside the try since we may
|
|
||||||
// hit the exc on init:
|
|
||||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
|
|
||||||
ts.reset();
|
|
||||||
while (ts.incrementToken());
|
|
||||||
fail("did not hit exception");
|
|
||||||
} catch (RuntimeException re) {
|
|
||||||
assertTrue(MockReaderWrapper.isMyEvilException(re));
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
ts.end();
|
|
||||||
} catch (AssertionError ae) {
|
|
||||||
// Catch & ignore MockTokenizer's
|
|
||||||
// anger...
|
|
||||||
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
|
|
||||||
// OK
|
|
||||||
} else {
|
|
||||||
throw ae;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ts.close();
|
|
||||||
} else if (evilness == 7) {
|
|
||||||
// Only consume a subset of the tokens:
|
|
||||||
final int numTokensToRead = random.nextInt(tokens.size());
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
|
|
||||||
}
|
|
||||||
|
|
||||||
reader = new StringReader(text);
|
|
||||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
|
||||||
ts.reset();
|
|
||||||
for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
|
|
||||||
assertTrue(ts.incrementToken());
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
ts.end();
|
|
||||||
} catch (AssertionError ae) {
|
|
||||||
// Catch & ignore MockTokenizer's
|
|
||||||
// anger...
|
|
||||||
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
|
|
||||||
// OK
|
|
||||||
} else {
|
|
||||||
throw ae;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ts.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Final pass: verify clean tokenization matches
|
|
||||||
// results from first pass:
|
|
||||||
if (VERBOSE) {
|
|
||||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
|
|
||||||
}
|
|
||||||
reader = new StringReader(text);
|
|
||||||
|
|
||||||
if (random.nextInt(30) == 7) {
|
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
|
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
|
||||||
|
}
|
||||||
|
// Throw an errant exception from the Reader:
|
||||||
|
|
||||||
|
MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
|
||||||
|
evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
|
||||||
|
reader = evilReader;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// NOTE: some Tokenizers go and read characters
|
||||||
|
// when you call .setReader(Reader), eg
|
||||||
|
// PatternTokenizer. This is a bit
|
||||||
|
// iffy... (really, they should only
|
||||||
|
// pull from the Reader when you call
|
||||||
|
// .incremenToken(), I think?), but we
|
||||||
|
// currently allow it, so, we must call
|
||||||
|
// a.tokenStream inside the try since we may
|
||||||
|
// hit the exc on init:
|
||||||
|
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
|
||||||
|
ts.reset();
|
||||||
|
while (ts.incrementToken());
|
||||||
|
fail("did not hit exception");
|
||||||
|
} catch (RuntimeException re) {
|
||||||
|
assertTrue(MockReaderWrapper.isMyEvilException(re));
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
ts.end();
|
||||||
|
} catch (AssertionError ae) {
|
||||||
|
// Catch & ignore MockTokenizer's
|
||||||
|
// anger...
|
||||||
|
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
|
||||||
|
// OK
|
||||||
|
} else {
|
||||||
|
throw ae;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ts.close();
|
||||||
|
} else if (evilness == 7) {
|
||||||
|
// Only consume a subset of the tokens:
|
||||||
|
final int numTokensToRead = random.nextInt(tokens.size());
|
||||||
|
if (VERBOSE) {
|
||||||
|
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
|
||||||
}
|
}
|
||||||
|
|
||||||
reader = new MockReaderWrapper(random, reader);
|
reader = new StringReader(text);
|
||||||
|
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||||
|
ts.reset();
|
||||||
|
for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
|
||||||
|
assertTrue(ts.incrementToken());
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
ts.end();
|
||||||
|
} catch (AssertionError ae) {
|
||||||
|
// Catch & ignore MockTokenizer's
|
||||||
|
// anger...
|
||||||
|
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
|
||||||
|
// OK
|
||||||
|
} else {
|
||||||
|
throw ae;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ts.close();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
|
||||||
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
// Final pass: verify clean tokenization matches
|
||||||
// offset + pos + posLength + type
|
// results from first pass:
|
||||||
assertTokenStreamContents(ts,
|
|
||||||
tokens.toArray(new String[tokens.size()]),
|
if (VERBOSE) {
|
||||||
toIntArray(startOffsets),
|
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
|
||||||
toIntArray(endOffsets),
|
}
|
||||||
types.toArray(new String[types.size()]),
|
reader = new StringReader(text);
|
||||||
toIntArray(positions),
|
|
||||||
toIntArray(positionLengths),
|
if (random.nextInt(30) == 7) {
|
||||||
text.length());
|
if (VERBOSE) {
|
||||||
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
|
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
|
||||||
// offset + pos + type
|
|
||||||
assertTokenStreamContents(ts,
|
|
||||||
tokens.toArray(new String[tokens.size()]),
|
|
||||||
toIntArray(startOffsets),
|
|
||||||
toIntArray(endOffsets),
|
|
||||||
types.toArray(new String[types.size()]),
|
|
||||||
toIntArray(positions),
|
|
||||||
null,
|
|
||||||
text.length());
|
|
||||||
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
|
||||||
// offset + pos + posLength
|
|
||||||
assertTokenStreamContents(ts,
|
|
||||||
tokens.toArray(new String[tokens.size()]),
|
|
||||||
toIntArray(startOffsets),
|
|
||||||
toIntArray(endOffsets),
|
|
||||||
null,
|
|
||||||
toIntArray(positions),
|
|
||||||
toIntArray(positionLengths),
|
|
||||||
text.length());
|
|
||||||
} else if (posIncAtt != null && offsetAtt != null) {
|
|
||||||
// offset + pos
|
|
||||||
assertTokenStreamContents(ts,
|
|
||||||
tokens.toArray(new String[tokens.size()]),
|
|
||||||
toIntArray(startOffsets),
|
|
||||||
toIntArray(endOffsets),
|
|
||||||
null,
|
|
||||||
toIntArray(positions),
|
|
||||||
null,
|
|
||||||
text.length());
|
|
||||||
} else if (offsetAtt != null) {
|
|
||||||
// offset
|
|
||||||
assertTokenStreamContents(ts,
|
|
||||||
tokens.toArray(new String[tokens.size()]),
|
|
||||||
toIntArray(startOffsets),
|
|
||||||
toIntArray(endOffsets),
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
text.length());
|
|
||||||
} else {
|
|
||||||
// terms only
|
|
||||||
assertTokenStreamContents(ts,
|
|
||||||
tokens.toArray(new String[tokens.size()]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
reader = new MockReaderWrapper(random, reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||||
|
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
||||||
|
// offset + pos + posLength + type
|
||||||
|
assertTokenStreamContents(ts,
|
||||||
|
tokens.toArray(new String[tokens.size()]),
|
||||||
|
toIntArray(startOffsets),
|
||||||
|
toIntArray(endOffsets),
|
||||||
|
types.toArray(new String[types.size()]),
|
||||||
|
toIntArray(positions),
|
||||||
|
toIntArray(positionLengths),
|
||||||
|
text.length());
|
||||||
|
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
|
||||||
|
// offset + pos + type
|
||||||
|
assertTokenStreamContents(ts,
|
||||||
|
tokens.toArray(new String[tokens.size()]),
|
||||||
|
toIntArray(startOffsets),
|
||||||
|
toIntArray(endOffsets),
|
||||||
|
types.toArray(new String[types.size()]),
|
||||||
|
toIntArray(positions),
|
||||||
|
null,
|
||||||
|
text.length());
|
||||||
|
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
||||||
|
// offset + pos + posLength
|
||||||
|
assertTokenStreamContents(ts,
|
||||||
|
tokens.toArray(new String[tokens.size()]),
|
||||||
|
toIntArray(startOffsets),
|
||||||
|
toIntArray(endOffsets),
|
||||||
|
null,
|
||||||
|
toIntArray(positions),
|
||||||
|
toIntArray(positionLengths),
|
||||||
|
text.length());
|
||||||
|
} else if (posIncAtt != null && offsetAtt != null) {
|
||||||
|
// offset + pos
|
||||||
|
assertTokenStreamContents(ts,
|
||||||
|
tokens.toArray(new String[tokens.size()]),
|
||||||
|
toIntArray(startOffsets),
|
||||||
|
toIntArray(endOffsets),
|
||||||
|
null,
|
||||||
|
toIntArray(positions),
|
||||||
|
null,
|
||||||
|
text.length());
|
||||||
|
} else if (offsetAtt != null) {
|
||||||
|
// offset
|
||||||
|
assertTokenStreamContents(ts,
|
||||||
|
tokens.toArray(new String[tokens.size()]),
|
||||||
|
toIntArray(startOffsets),
|
||||||
|
toIntArray(endOffsets),
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
text.length());
|
||||||
|
} else {
|
||||||
|
// terms only
|
||||||
|
assertTokenStreamContents(ts,
|
||||||
|
tokens.toArray(new String[tokens.size()]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,10 +27,7 @@ import java.io.OutputStream;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.lang.reflect.Method;
|
import java.lang.reflect.Method;
|
||||||
import java.nio.CharBuffer;
|
import java.nio.CharBuffer;
|
||||||
import java.util.Enumeration;
|
import java.util.*;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Random;
|
|
||||||
import java.util.zip.ZipEntry;
|
import java.util.zip.ZipEntry;
|
||||||
import java.util.zip.ZipFile;
|
import java.util.zip.ZipFile;
|
||||||
|
|
||||||
|
@ -414,12 +411,51 @@ public class _TestUtil {
|
||||||
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
|
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
|
||||||
case 21: sb.append("\n"); break;
|
case 21: sb.append("\n"); break;
|
||||||
case 22: sb.append(" ".substring(nextInt(random, 0, 10))); break;
|
case 22: sb.append(" ".substring(nextInt(random, 0, 10))); break;
|
||||||
|
case 23: {
|
||||||
|
sb.append("<");
|
||||||
|
if (0 == nextInt(random, 0, 3)) {
|
||||||
|
sb.append(" ".substring(nextInt(random, 1, 10)));
|
||||||
|
}
|
||||||
|
if (0 == nextInt(random, 0, 1)) {
|
||||||
|
sb.append("/");
|
||||||
|
if (0 == nextInt(random, 0, 3)) {
|
||||||
|
sb.append(" ".substring(nextInt(random, 1, 10)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
switch (nextInt(random, 0, 3)) {
|
||||||
|
case 0: sb.append(randomlyRecaseCodePoints(random, "script")); break;
|
||||||
|
case 1: sb.append(randomlyRecaseCodePoints(random, "style")); break;
|
||||||
|
case 2: sb.append(randomlyRecaseCodePoints(random, "br")); break;
|
||||||
|
// default: append nothing
|
||||||
|
}
|
||||||
|
sb.append(">".substring(nextInt(random, 0, 1)));
|
||||||
|
break;
|
||||||
|
}
|
||||||
default: sb.append(randomSimpleString(random));
|
default: sb.append(randomSimpleString(random));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Randomly upcases, downcases, or leaves intact each code point in the given string
|
||||||
|
*/
|
||||||
|
public static String randomlyRecaseCodePoints(Random random, String str) {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
int pos = 0;
|
||||||
|
while (pos < str.length()) {
|
||||||
|
int codePoint = str.codePointAt(pos);
|
||||||
|
pos += Character.charCount(codePoint);
|
||||||
|
String codePointSubstring = new String(new int[] { codePoint }, 0, 1);
|
||||||
|
switch (nextInt(random, 0, 2)) {
|
||||||
|
case 0: builder.append(codePointSubstring.toUpperCase()); break;
|
||||||
|
case 1: builder.append(codePointSubstring.toLowerCase()); break;
|
||||||
|
case 2: builder.append(codePointSubstring); // leave intact
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
private static final int[] blockStarts = {
|
private static final int[] blockStarts = {
|
||||||
0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400,
|
0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400,
|
||||||
0x0500, 0x0530, 0x0590, 0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800,
|
0x0500, 0x0530, 0x0590, 0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800,
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/23/12 2:15 AM */
|
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/24/12 4:50 PM */
|
||||||
|
|
||||||
package org.apache.lucene.analysis.charfilter;
|
package org.apache.lucene.analysis.charfilter;
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
|
||||||
/**
|
/**
|
||||||
* This class is a scanner generated by
|
* This class is a scanner generated by
|
||||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||||
* on 1/23/12 2:15 AM from the specification file
|
* on 3/24/12 4:50 PM from the specification file
|
||||||
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
|
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
|
||||||
*/
|
*/
|
||||||
public final class HTMLStripCharFilter extends BaseCharFilter {
|
public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
|
@ -30967,7 +30967,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
case START_TAG_TAIL_EXCLUDE:
|
case START_TAG_TAIL_EXCLUDE:
|
||||||
case SERVER_SIDE_INCLUDE:
|
case SERVER_SIDE_INCLUDE:
|
||||||
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
|
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
|
||||||
|
// add (length of input that won't be output) [ - (substitution length) = 0 ]
|
||||||
cumulativeDiff += yychar - inputStart;
|
cumulativeDiff += yychar - inputStart;
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
outputSegment.clear();
|
outputSegment.clear();
|
||||||
eofReturnValue = -1;
|
eofReturnValue = -1;
|
||||||
|
@ -30975,7 +30977,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
}
|
}
|
||||||
case CHARACTER_REFERENCE_TAIL: { // Substitute
|
case CHARACTER_REFERENCE_TAIL: { // Substitute
|
||||||
// At end of file, allow char refs without semicolons
|
// At end of file, allow char refs without semicolons
|
||||||
|
// add (length of input that won't be output) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||||
eofReturnValue = outputSegment.nextChar();
|
eofReturnValue = outputSegment.nextChar();
|
||||||
break;
|
break;
|
||||||
|
@ -31095,6 +31099,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
{ yybegin(STYLE);
|
{ yybegin(STYLE);
|
||||||
}
|
}
|
||||||
case 55: break;
|
case 55: break;
|
||||||
|
case 27:
|
||||||
|
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
|
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
|
||||||
|
}
|
||||||
|
case 56: break;
|
||||||
case 30:
|
case 30:
|
||||||
{ int length = yylength();
|
{ int length = yylength();
|
||||||
inputSegment.write(zzBuffer, zzStartRead, length);
|
inputSegment.write(zzBuffer, zzStartRead, length);
|
||||||
|
@ -31104,7 +31118,30 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
outputSegment = entitySegment;
|
outputSegment = entitySegment;
|
||||||
yybegin(CHARACTER_REFERENCE_TAIL);
|
yybegin(CHARACTER_REFERENCE_TAIL);
|
||||||
}
|
}
|
||||||
case 56: break;
|
case 57: break;
|
||||||
|
case 48:
|
||||||
|
{ inputSegment.clear();
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
// add (previously matched input length) -- current match and substitution handled below
|
||||||
|
cumulativeDiff += yychar - inputStart;
|
||||||
|
// position the offset correction at (already output length) -- substitution handled below
|
||||||
|
int offsetCorrectionPos = outputCharCount;
|
||||||
|
int returnValue;
|
||||||
|
if (escapeSTYLE) {
|
||||||
|
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
|
outputSegment = inputSegment;
|
||||||
|
returnValue = outputSegment.nextChar();
|
||||||
|
} else {
|
||||||
|
// add (this match length) - (substitution length)
|
||||||
|
cumulativeDiff += yylength() - 1;
|
||||||
|
// add (substitution length)
|
||||||
|
++offsetCorrectionPos;
|
||||||
|
returnValue = STYLE_REPLACEMENT;
|
||||||
|
}
|
||||||
|
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||||
|
return returnValue;
|
||||||
|
}
|
||||||
|
case 58: break;
|
||||||
case 8:
|
case 8:
|
||||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
if (null != escapedTags
|
if (null != escapedTags
|
||||||
|
@ -31114,71 +31151,75 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
yybegin(START_TAG_TAIL_SUBSTITUTE);
|
yybegin(START_TAG_TAIL_SUBSTITUTE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case 57: break;
|
case 59: break;
|
||||||
case 26:
|
|
||||||
{ cumulativeDiff += inputSegment.length() + yylength();
|
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
|
||||||
inputSegment.clear();
|
|
||||||
outputSegment = inputSegment;
|
|
||||||
yybegin(YYINITIAL);
|
|
||||||
}
|
|
||||||
case 58: break;
|
|
||||||
case 2:
|
case 2:
|
||||||
{ inputStart = yychar;
|
{ inputStart = yychar;
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
inputSegment.append('<');
|
inputSegment.append('<');
|
||||||
yybegin(LEFT_ANGLE_BRACKET);
|
yybegin(LEFT_ANGLE_BRACKET);
|
||||||
}
|
}
|
||||||
case 59: break;
|
|
||||||
case 34:
|
|
||||||
{ cumulativeDiff += yychar - inputStart + yylength();
|
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
|
||||||
inputSegment.clear();
|
|
||||||
yybegin(YYINITIAL);
|
|
||||||
}
|
|
||||||
case 60: break;
|
case 60: break;
|
||||||
case 47:
|
|
||||||
{ cumulativeDiff += inputSegment.length() + yylength();
|
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
|
||||||
inputSegment.clear();
|
|
||||||
yybegin(CDATA);
|
|
||||||
}
|
|
||||||
case 61: break;
|
|
||||||
case 27:
|
|
||||||
{ cumulativeDiff += inputSegment.length() + yylength() - 1;
|
|
||||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
|
||||||
inputSegment.clear();
|
|
||||||
yybegin(YYINITIAL);
|
|
||||||
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
|
|
||||||
}
|
|
||||||
case 62: break;
|
|
||||||
case 44:
|
case 44:
|
||||||
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||||
}
|
}
|
||||||
case 63: break;
|
case 61: break;
|
||||||
case 21:
|
case 21:
|
||||||
{ previousRestoreState = restoreState;
|
{ previousRestoreState = restoreState;
|
||||||
restoreState = SERVER_SIDE_INCLUDE;
|
restoreState = SERVER_SIDE_INCLUDE;
|
||||||
yybegin(SINGLE_QUOTED_STRING);
|
yybegin(SINGLE_QUOTED_STRING);
|
||||||
}
|
}
|
||||||
case 64: break;
|
case 62: break;
|
||||||
case 11:
|
case 11:
|
||||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
yybegin(LEFT_ANGLE_BRACKET_SPACE);
|
yybegin(LEFT_ANGLE_BRACKET_SPACE);
|
||||||
}
|
}
|
||||||
case 65: break;
|
case 63: break;
|
||||||
case 35:
|
case 35:
|
||||||
{ yybegin(SCRIPT);
|
{ yybegin(SCRIPT);
|
||||||
}
|
}
|
||||||
case 66: break;
|
case 64: break;
|
||||||
case 42:
|
case 42:
|
||||||
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||||
}
|
}
|
||||||
case 67: break;
|
case 65: break;
|
||||||
case 10:
|
case 10:
|
||||||
{ inputSegment.append('!'); yybegin(BANG);
|
{ inputSegment.append('!'); yybegin(BANG);
|
||||||
}
|
}
|
||||||
case 68: break;
|
case 66: break;
|
||||||
|
case 51:
|
||||||
|
{ // Handle paired UTF-16 surrogates.
|
||||||
|
String surrogatePair = yytext();
|
||||||
|
char highSurrogate = '\u0000';
|
||||||
|
char lowSurrogate = '\u0000';
|
||||||
|
try {
|
||||||
|
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
|
||||||
|
} catch(Exception e) { // should never happen
|
||||||
|
assert false: "Exception parsing high surrogate '"
|
||||||
|
+ surrogatePair.substring(2, 6) + "'";
|
||||||
|
}
|
||||||
|
try { // Low surrogates are in decimal range [56320, 57343]
|
||||||
|
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
|
||||||
|
} catch(Exception e) { // should never happen
|
||||||
|
assert false: "Exception parsing low surrogate '"
|
||||||
|
+ surrogatePair.substring(9, 14) + "'";
|
||||||
|
}
|
||||||
|
if (Character.isLowSurrogate(lowSurrogate)) {
|
||||||
|
outputSegment = entitySegment;
|
||||||
|
outputSegment.clear();
|
||||||
|
outputSegment.unsafeWrite(lowSurrogate);
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
|
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
return highSurrogate;
|
||||||
|
}
|
||||||
|
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||||
|
inputSegment.append('#');
|
||||||
|
yybegin(NUMERIC_CHARACTER);
|
||||||
|
}
|
||||||
|
case 67: break;
|
||||||
case 4:
|
case 4:
|
||||||
{ yypushback(1);
|
{ yypushback(1);
|
||||||
outputSegment = inputSegment;
|
outputSegment = inputSegment;
|
||||||
|
@ -31186,37 +31227,48 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
return outputSegment.nextChar();
|
return outputSegment.nextChar();
|
||||||
}
|
}
|
||||||
case 69: break;
|
case 68: break;
|
||||||
case 48:
|
|
||||||
{ inputSegment.clear();
|
|
||||||
yybegin(YYINITIAL);
|
|
||||||
cumulativeDiff += yychar - inputStart;
|
|
||||||
int outputEnd = outputCharCount;
|
|
||||||
int returnValue;
|
|
||||||
if (escapeSTYLE) {
|
|
||||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
|
||||||
outputSegment = inputSegment;
|
|
||||||
returnValue = outputSegment.nextChar();
|
|
||||||
} else {
|
|
||||||
cumulativeDiff += yylength() - 1;
|
|
||||||
++outputEnd;
|
|
||||||
returnValue = STYLE_REPLACEMENT;
|
|
||||||
}
|
|
||||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
|
||||||
return returnValue;
|
|
||||||
}
|
|
||||||
case 70: break;
|
|
||||||
case 43:
|
case 43:
|
||||||
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||||
}
|
}
|
||||||
case 71: break;
|
case 69: break;
|
||||||
case 14:
|
case 52:
|
||||||
{ cumulativeDiff += inputSegment.length() + yylength();
|
{ // Handle paired UTF-16 surrogates.
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
String surrogatePair = yytext();
|
||||||
inputSegment.clear();
|
char highSurrogate = '\u0000';
|
||||||
yybegin(YYINITIAL);
|
try { // High surrogates are in decimal range [55296, 56319]
|
||||||
|
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
|
||||||
|
} catch(Exception e) { // should never happen
|
||||||
|
assert false: "Exception parsing high surrogate '"
|
||||||
|
+ surrogatePair.substring(1, 6) + "'";
|
||||||
|
}
|
||||||
|
if (Character.isHighSurrogate(highSurrogate)) {
|
||||||
|
outputSegment = entitySegment;
|
||||||
|
outputSegment.clear();
|
||||||
|
try {
|
||||||
|
outputSegment.unsafeWrite
|
||||||
|
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
||||||
|
} catch(Exception e) { // should never happen
|
||||||
|
assert false: "Exception parsing low surrogate '"
|
||||||
|
+ surrogatePair.substring(10, 14) + "'";
|
||||||
|
}
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
|
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
return highSurrogate;
|
||||||
|
}
|
||||||
|
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||||
|
inputSegment.append('#');
|
||||||
|
yybegin(NUMERIC_CHARACTER);
|
||||||
}
|
}
|
||||||
case 72: break;
|
case 70: break;
|
||||||
|
case 28:
|
||||||
|
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
||||||
|
}
|
||||||
|
case 71: break;
|
||||||
case 50:
|
case 50:
|
||||||
{ // Handle paired UTF-16 surrogates.
|
{ // Handle paired UTF-16 surrogates.
|
||||||
outputSegment = entitySegment;
|
outputSegment = entitySegment;
|
||||||
|
@ -31236,49 +31288,63 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
assert false: "Exception parsing low surrogate '"
|
assert false: "Exception parsing low surrogate '"
|
||||||
+ surrogatePair.substring(10, 14) + "'";
|
+ surrogatePair.substring(10, 14) + "'";
|
||||||
}
|
}
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
return highSurrogate;
|
return highSurrogate;
|
||||||
}
|
}
|
||||||
case 73: break;
|
case 72: break;
|
||||||
case 28:
|
|
||||||
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
|
||||||
}
|
|
||||||
case 74: break;
|
|
||||||
case 49:
|
|
||||||
{ inputSegment.clear();
|
|
||||||
yybegin(YYINITIAL);
|
|
||||||
cumulativeDiff += yychar - inputStart;
|
|
||||||
int outputEnd = outputCharCount;
|
|
||||||
int returnValue;
|
|
||||||
if (escapeSCRIPT) {
|
|
||||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
|
||||||
outputSegment = inputSegment;
|
|
||||||
returnValue = outputSegment.nextChar();
|
|
||||||
} else {
|
|
||||||
cumulativeDiff += yylength() - 1;
|
|
||||||
++outputEnd;
|
|
||||||
returnValue = SCRIPT_REPLACEMENT;
|
|
||||||
}
|
|
||||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
|
||||||
return returnValue;
|
|
||||||
}
|
|
||||||
case 75: break;
|
|
||||||
case 16:
|
case 16:
|
||||||
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
||||||
}
|
}
|
||||||
case 76: break;
|
case 73: break;
|
||||||
case 22:
|
case 22:
|
||||||
{ previousRestoreState = restoreState;
|
{ previousRestoreState = restoreState;
|
||||||
restoreState = SERVER_SIDE_INCLUDE;
|
restoreState = SERVER_SIDE_INCLUDE;
|
||||||
yybegin(DOUBLE_QUOTED_STRING);
|
yybegin(DOUBLE_QUOTED_STRING);
|
||||||
}
|
}
|
||||||
case 77: break;
|
case 74: break;
|
||||||
|
case 26:
|
||||||
|
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
outputSegment = inputSegment;
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
}
|
||||||
|
case 75: break;
|
||||||
case 20:
|
case 20:
|
||||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
}
|
}
|
||||||
|
case 76: break;
|
||||||
|
case 47:
|
||||||
|
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
yybegin(CDATA);
|
||||||
|
}
|
||||||
|
case 77: break;
|
||||||
|
case 33:
|
||||||
|
{ yybegin(YYINITIAL);
|
||||||
|
if (escapeBR) {
|
||||||
|
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
|
outputSegment = inputSegment;
|
||||||
|
return outputSegment.nextChar();
|
||||||
|
} else {
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
|
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||||
|
inputSegment.reset();
|
||||||
|
return BR_START_TAG_REPLACEMENT;
|
||||||
|
}
|
||||||
|
}
|
||||||
case 78: break;
|
case 78: break;
|
||||||
case 23:
|
case 23:
|
||||||
{ yybegin(restoreState); restoreState = previousRestoreState;
|
{ yybegin(restoreState); restoreState = previousRestoreState;
|
||||||
|
@ -31288,28 +31354,20 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
{ yybegin(COMMENT);
|
{ yybegin(COMMENT);
|
||||||
}
|
}
|
||||||
case 80: break;
|
case 80: break;
|
||||||
case 25:
|
|
||||||
{ cumulativeDiff += inputSegment.length() + yylength() - 1;
|
|
||||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
|
||||||
inputSegment.clear();
|
|
||||||
yybegin(YYINITIAL);
|
|
||||||
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
|
|
||||||
}
|
|
||||||
case 81: break;
|
|
||||||
case 24:
|
case 24:
|
||||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
outputSegment = inputSegment;
|
outputSegment = inputSegment;
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
return outputSegment.nextChar();
|
return outputSegment.nextChar();
|
||||||
}
|
}
|
||||||
case 82: break;
|
case 81: break;
|
||||||
case 3:
|
case 3:
|
||||||
{ inputStart = yychar;
|
{ inputStart = yychar;
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
inputSegment.append('&');
|
inputSegment.append('&');
|
||||||
yybegin(AMPERSAND);
|
yybegin(AMPERSAND);
|
||||||
}
|
}
|
||||||
case 83: break;
|
case 82: break;
|
||||||
case 46:
|
case 46:
|
||||||
{ yybegin(SCRIPT);
|
{ yybegin(SCRIPT);
|
||||||
if (escapeSCRIPT) {
|
if (escapeSCRIPT) {
|
||||||
|
@ -31319,6 +31377,15 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
return outputSegment.nextChar();
|
return outputSegment.nextChar();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
case 83: break;
|
||||||
|
case 14:
|
||||||
|
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
}
|
||||||
case 84: break;
|
case 84: break;
|
||||||
case 6:
|
case 6:
|
||||||
{ int matchLength = yylength();
|
{ int matchLength = yylength();
|
||||||
|
@ -31354,14 +31421,23 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case 85: break;
|
case 85: break;
|
||||||
|
case 34:
|
||||||
|
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
|
||||||
|
cumulativeDiff += yychar - inputStart + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0]
|
||||||
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
}
|
||||||
|
case 86: break;
|
||||||
case 5:
|
case 5:
|
||||||
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
|
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
|
||||||
}
|
}
|
||||||
case 86: break;
|
case 87: break;
|
||||||
case 13:
|
case 13:
|
||||||
{ inputSegment.append(zzBuffer[zzStartRead]);
|
{ inputSegment.append(zzBuffer[zzStartRead]);
|
||||||
}
|
}
|
||||||
case 87: break;
|
case 88: break;
|
||||||
case 18:
|
case 18:
|
||||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
if (null != escapedTags
|
if (null != escapedTags
|
||||||
|
@ -31369,93 +31445,25 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
yybegin(END_TAG_TAIL_INCLUDE);
|
yybegin(END_TAG_TAIL_INCLUDE);
|
||||||
} else {
|
} else {
|
||||||
yybegin(END_TAG_TAIL_SUBSTITUTE);
|
yybegin(END_TAG_TAIL_SUBSTITUTE);
|
||||||
}
|
|
||||||
}
|
|
||||||
case 88: break;
|
|
||||||
case 36:
|
|
||||||
{ yybegin(YYINITIAL);
|
|
||||||
if (escapeBR) {
|
|
||||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
|
||||||
outputSegment = inputSegment;
|
|
||||||
return outputSegment.nextChar();
|
|
||||||
} else {
|
|
||||||
cumulativeDiff
|
|
||||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
|
||||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
|
||||||
inputSegment.reset();
|
|
||||||
return BR_END_TAG_REPLACEMENT;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case 89: break;
|
case 89: break;
|
||||||
case 33:
|
|
||||||
{ yybegin(YYINITIAL);
|
|
||||||
if (escapeBR) {
|
|
||||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
|
||||||
outputSegment = inputSegment;
|
|
||||||
return outputSegment.nextChar();
|
|
||||||
} else {
|
|
||||||
cumulativeDiff
|
|
||||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
|
||||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
|
||||||
inputSegment.reset();
|
|
||||||
return BR_START_TAG_REPLACEMENT;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
case 90: break;
|
|
||||||
case 40:
|
case 40:
|
||||||
{ yybegin(SCRIPT_COMMENT);
|
{ yybegin(SCRIPT_COMMENT);
|
||||||
}
|
}
|
||||||
case 91: break;
|
case 90: break;
|
||||||
case 37:
|
case 37:
|
||||||
{ cumulativeDiff += yylength();
|
{ // add (this match length) [ - (substitution length) = 0 ]
|
||||||
|
cumulativeDiff += yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
}
|
}
|
||||||
case 92: break;
|
case 91: break;
|
||||||
case 12:
|
case 12:
|
||||||
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
|
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
|
||||||
}
|
}
|
||||||
case 93: break;
|
case 92: break;
|
||||||
case 51:
|
|
||||||
{ // Handle paired UTF-16 surrogates.
|
|
||||||
String surrogatePair = yytext();
|
|
||||||
char highSurrogate = '\u0000';
|
|
||||||
char lowSurrogate = '\u0000';
|
|
||||||
try {
|
|
||||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
|
|
||||||
} catch(Exception e) { // should never happen
|
|
||||||
assert false: "Exception parsing high surrogate '"
|
|
||||||
+ surrogatePair.substring(2, 6) + "'";
|
|
||||||
}
|
|
||||||
try { // Low surrogates are in decimal range [56320, 57343]
|
|
||||||
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
|
|
||||||
} catch(Exception e) { // should never happen
|
|
||||||
assert false: "Exception parsing low surrogate '"
|
|
||||||
+ surrogatePair.substring(9, 14) + "'";
|
|
||||||
}
|
|
||||||
if (Character.isLowSurrogate(lowSurrogate)) {
|
|
||||||
outputSegment = entitySegment;
|
|
||||||
outputSegment.clear();
|
|
||||||
outputSegment.unsafeWrite(lowSurrogate);
|
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
|
||||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
|
||||||
inputSegment.clear();
|
|
||||||
yybegin(YYINITIAL);
|
|
||||||
return highSurrogate;
|
|
||||||
}
|
|
||||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
|
||||||
inputSegment.append('#');
|
|
||||||
yybegin(NUMERIC_CHARACTER);
|
|
||||||
}
|
|
||||||
case 94: break;
|
|
||||||
case 7:
|
|
||||||
{ cumulativeDiff
|
|
||||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
|
||||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
|
||||||
yybegin(YYINITIAL);
|
|
||||||
return outputSegment.nextChar();
|
|
||||||
}
|
|
||||||
case 95: break;
|
|
||||||
case 9:
|
case 9:
|
||||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
if (null != escapedTags
|
if (null != escapedTags
|
||||||
|
@ -31465,15 +31473,38 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
yybegin(START_TAG_TAIL_EXCLUDE);
|
yybegin(START_TAG_TAIL_EXCLUDE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case 96: break;
|
case 93: break;
|
||||||
|
case 49:
|
||||||
|
{ inputSegment.clear();
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
// add (previously matched input length) -- current match and substitution handled below
|
||||||
|
cumulativeDiff += yychar - inputStart;
|
||||||
|
// position at (already output length) -- substitution handled below
|
||||||
|
int offsetCorrectionPos = outputCharCount;
|
||||||
|
int returnValue;
|
||||||
|
if (escapeSCRIPT) {
|
||||||
|
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
|
outputSegment = inputSegment;
|
||||||
|
returnValue = outputSegment.nextChar();
|
||||||
|
} else {
|
||||||
|
// add (this match length) - (substitution length)
|
||||||
|
cumulativeDiff += yylength() - 1;
|
||||||
|
// add (substitution length)
|
||||||
|
++offsetCorrectionPos;
|
||||||
|
returnValue = SCRIPT_REPLACEMENT;
|
||||||
|
}
|
||||||
|
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||||
|
return returnValue;
|
||||||
|
}
|
||||||
|
case 94: break;
|
||||||
case 29:
|
case 29:
|
||||||
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
||||||
}
|
}
|
||||||
case 97: break;
|
case 95: break;
|
||||||
case 17:
|
case 17:
|
||||||
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
||||||
}
|
}
|
||||||
case 98: break;
|
case 96: break;
|
||||||
case 45:
|
case 45:
|
||||||
{ yybegin(STYLE);
|
{ yybegin(STYLE);
|
||||||
if (escapeSTYLE) {
|
if (escapeSTYLE) {
|
||||||
|
@ -31483,7 +31514,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
return outputSegment.nextChar();
|
return outputSegment.nextChar();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case 99: break;
|
case 97: break;
|
||||||
|
case 7:
|
||||||
|
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
|
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
return outputSegment.nextChar();
|
||||||
|
}
|
||||||
|
case 98: break;
|
||||||
case 19:
|
case 19:
|
||||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
if (null != escapedTags
|
if (null != escapedTags
|
||||||
|
@ -31493,6 +31533,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
yybegin(END_TAG_TAIL_EXCLUDE);
|
yybegin(END_TAG_TAIL_EXCLUDE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
case 99: break;
|
||||||
|
case 25:
|
||||||
|
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
|
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||||
|
inputSegment.clear();
|
||||||
|
yybegin(YYINITIAL);
|
||||||
|
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
|
||||||
|
}
|
||||||
case 100: break;
|
case 100: break;
|
||||||
case 31:
|
case 31:
|
||||||
{ int matchLength = yylength();
|
{ int matchLength = yylength();
|
||||||
|
@ -31529,49 +31579,6 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case 101: break;
|
case 101: break;
|
||||||
case 38:
|
|
||||||
{ yybegin(restoreState);
|
|
||||||
}
|
|
||||||
case 102: break;
|
|
||||||
case 41:
|
|
||||||
{ yybegin(STYLE_COMMENT);
|
|
||||||
}
|
|
||||||
case 103: break;
|
|
||||||
case 1:
|
|
||||||
{ return zzBuffer[zzStartRead];
|
|
||||||
}
|
|
||||||
case 104: break;
|
|
||||||
case 52:
|
|
||||||
{ // Handle paired UTF-16 surrogates.
|
|
||||||
String surrogatePair = yytext();
|
|
||||||
char highSurrogate = '\u0000';
|
|
||||||
try { // High surrogates are in decimal range [55296, 56319]
|
|
||||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
|
|
||||||
} catch(Exception e) { // should never happen
|
|
||||||
assert false: "Exception parsing high surrogate '"
|
|
||||||
+ surrogatePair.substring(1, 6) + "'";
|
|
||||||
}
|
|
||||||
if (Character.isHighSurrogate(highSurrogate)) {
|
|
||||||
outputSegment = entitySegment;
|
|
||||||
outputSegment.clear();
|
|
||||||
try {
|
|
||||||
outputSegment.unsafeWrite
|
|
||||||
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
|
||||||
} catch(Exception e) { // should never happen
|
|
||||||
assert false: "Exception parsing low surrogate '"
|
|
||||||
+ surrogatePair.substring(10, 14) + "'";
|
|
||||||
}
|
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
|
||||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
|
||||||
inputSegment.clear();
|
|
||||||
yybegin(YYINITIAL);
|
|
||||||
return highSurrogate;
|
|
||||||
}
|
|
||||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
|
||||||
inputSegment.append('#');
|
|
||||||
yybegin(NUMERIC_CHARACTER);
|
|
||||||
}
|
|
||||||
case 105: break;
|
|
||||||
case 53:
|
case 53:
|
||||||
{ // Handle paired UTF-16 surrogates.
|
{ // Handle paired UTF-16 surrogates.
|
||||||
String surrogatePair = yytext();
|
String surrogatePair = yytext();
|
||||||
|
@ -31594,7 +31601,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
outputSegment = entitySegment;
|
outputSegment = entitySegment;
|
||||||
outputSegment.clear();
|
outputSegment.clear();
|
||||||
outputSegment.unsafeWrite(lowSurrogate);
|
outputSegment.unsafeWrite(lowSurrogate);
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -31605,6 +31614,34 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||||
inputSegment.append('#');
|
inputSegment.append('#');
|
||||||
yybegin(NUMERIC_CHARACTER);
|
yybegin(NUMERIC_CHARACTER);
|
||||||
}
|
}
|
||||||
|
case 102: break;
|
||||||
|
case 36:
|
||||||
|
{ yybegin(YYINITIAL);
|
||||||
|
if (escapeBR) {
|
||||||
|
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
|
outputSegment = inputSegment;
|
||||||
|
return outputSegment.nextChar();
|
||||||
|
} else {
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
|
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
|
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||||
|
inputSegment.reset();
|
||||||
|
return BR_END_TAG_REPLACEMENT;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case 103: break;
|
||||||
|
case 38:
|
||||||
|
{ yybegin(restoreState);
|
||||||
|
}
|
||||||
|
case 104: break;
|
||||||
|
case 41:
|
||||||
|
{ yybegin(STYLE_COMMENT);
|
||||||
|
}
|
||||||
|
case 105: break;
|
||||||
|
case 1:
|
||||||
|
{ return zzBuffer[zzStartRead];
|
||||||
|
}
|
||||||
case 106: break;
|
case 106: break;
|
||||||
default:
|
default:
|
||||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||||
|
|
|
@ -293,7 +293,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
case START_TAG_TAIL_EXCLUDE:
|
case START_TAG_TAIL_EXCLUDE:
|
||||||
case SERVER_SIDE_INCLUDE:
|
case SERVER_SIDE_INCLUDE:
|
||||||
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
|
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
|
||||||
|
// add (length of input that won't be output) [ - (substitution length) = 0 ]
|
||||||
cumulativeDiff += yychar - inputStart;
|
cumulativeDiff += yychar - inputStart;
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
outputSegment.clear();
|
outputSegment.clear();
|
||||||
eofReturnValue = -1;
|
eofReturnValue = -1;
|
||||||
|
@ -301,7 +303,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
}
|
}
|
||||||
case CHARACTER_REFERENCE_TAIL: { // Substitute
|
case CHARACTER_REFERENCE_TAIL: { // Substitute
|
||||||
// At end of file, allow char refs without semicolons
|
// At end of file, allow char refs without semicolons
|
||||||
|
// add (length of input that won't be output) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||||
eofReturnValue = outputSegment.nextChar();
|
eofReturnValue = outputSegment.nextChar();
|
||||||
break;
|
break;
|
||||||
|
@ -374,7 +378,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
assert false: "Exception parsing low surrogate '"
|
assert false: "Exception parsing low surrogate '"
|
||||||
+ surrogatePair.substring(10, 14) + "'";
|
+ surrogatePair.substring(10, 14) + "'";
|
||||||
}
|
}
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -403,7 +409,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
assert false: "Exception parsing low surrogate '"
|
assert false: "Exception parsing low surrogate '"
|
||||||
+ surrogatePair.substring(10, 14) + "'";
|
+ surrogatePair.substring(10, 14) + "'";
|
||||||
}
|
}
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -437,7 +445,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
outputSegment = entitySegment;
|
outputSegment = entitySegment;
|
||||||
outputSegment.clear();
|
outputSegment.clear();
|
||||||
outputSegment.unsafeWrite(lowSurrogate);
|
outputSegment.unsafeWrite(lowSurrogate);
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -472,7 +482,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
outputSegment = entitySegment;
|
outputSegment = entitySegment;
|
||||||
outputSegment.clear();
|
outputSegment.clear();
|
||||||
outputSegment.unsafeWrite(lowSurrogate);
|
outputSegment.unsafeWrite(lowSurrogate);
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -557,8 +569,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
|
|
||||||
<CHARACTER_REFERENCE_TAIL> {
|
<CHARACTER_REFERENCE_TAIL> {
|
||||||
";" {
|
";" {
|
||||||
cumulativeDiff
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
return outputSegment.nextChar();
|
return outputSegment.nextChar();
|
||||||
|
@ -574,9 +587,10 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
outputSegment = inputSegment;
|
outputSegment = inputSegment;
|
||||||
return outputSegment.nextChar();
|
return outputSegment.nextChar();
|
||||||
} else {
|
} else {
|
||||||
cumulativeDiff
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
// position the correction at (already output length) + (substitution length)
|
||||||
|
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||||
inputSegment.reset();
|
inputSegment.reset();
|
||||||
return BR_END_TAG_REPLACEMENT;
|
return BR_END_TAG_REPLACEMENT;
|
||||||
}
|
}
|
||||||
|
@ -612,7 +626,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
|
|
||||||
<END_TAG_TAIL_EXCLUDE> {
|
<END_TAG_TAIL_EXCLUDE> {
|
||||||
\s* ">" {
|
\s* ">" {
|
||||||
|
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
cumulativeDiff += inputSegment.length() + yylength();
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -621,7 +637,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
|
|
||||||
<END_TAG_TAIL_SUBSTITUTE> {
|
<END_TAG_TAIL_SUBSTITUTE> {
|
||||||
\s* ">" {
|
\s* ">" {
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -637,7 +655,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
yybegin(LEFT_ANGLE_BRACKET_SPACE);
|
yybegin(LEFT_ANGLE_BRACKET_SPACE);
|
||||||
}
|
}
|
||||||
"?" [^>]* [/?] ">" {
|
"?" [^>]* [/?] ">" {
|
||||||
|
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
cumulativeDiff += inputSegment.length() + yylength();
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -649,8 +669,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
outputSegment = inputSegment;
|
outputSegment = inputSegment;
|
||||||
return outputSegment.nextChar();
|
return outputSegment.nextChar();
|
||||||
} else {
|
} else {
|
||||||
cumulativeDiff
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||||
inputSegment.reset();
|
inputSegment.reset();
|
||||||
return BR_START_TAG_REPLACEMENT;
|
return BR_START_TAG_REPLACEMENT;
|
||||||
|
@ -708,7 +729,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
|
|
||||||
<START_TAG_TAIL_EXCLUDE> {
|
<START_TAG_TAIL_EXCLUDE> {
|
||||||
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
||||||
|
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
cumulativeDiff += inputSegment.length() + yylength();
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
outputSegment = inputSegment;
|
outputSegment = inputSegment;
|
||||||
|
@ -718,7 +741,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
|
|
||||||
<START_TAG_TAIL_SUBSTITUTE> {
|
<START_TAG_TAIL_SUBSTITUTE> {
|
||||||
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
||||||
|
// add (previously matched input length) + (this match length) - (substitution length)
|
||||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||||
|
// position the correction at (already output length) + (substitution length)
|
||||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -729,7 +754,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
<BANG> {
|
<BANG> {
|
||||||
"--" { yybegin(COMMENT); }
|
"--" { yybegin(COMMENT); }
|
||||||
">" {
|
">" {
|
||||||
|
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
cumulativeDiff += inputSegment.length() + yylength();
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -742,7 +769,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
// [21] CDEnd ::= ']]>'
|
// [21] CDEnd ::= ']]>'
|
||||||
//
|
//
|
||||||
"[CDATA[" {
|
"[CDATA[" {
|
||||||
|
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||||
cumulativeDiff += inputSegment.length() + yylength();
|
cumulativeDiff += inputSegment.length() + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(CDATA);
|
yybegin(CDATA);
|
||||||
|
@ -754,7 +783,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
|
|
||||||
<CDATA> {
|
<CDATA> {
|
||||||
"]]>" {
|
"]]>" {
|
||||||
|
// add (this match length) [ - (substitution length) = 0 ]
|
||||||
cumulativeDiff += yylength();
|
cumulativeDiff += yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
}
|
}
|
||||||
|
@ -764,7 +795,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
<COMMENT> {
|
<COMMENT> {
|
||||||
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
|
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
|
||||||
"-->" {
|
"-->" {
|
||||||
|
// add (previously matched input length) + (this match length) [ - (substitution length) = 0]
|
||||||
cumulativeDiff += yychar - inputStart + yylength();
|
cumulativeDiff += yychar - inputStart + yylength();
|
||||||
|
// position the correction at (already output length) [ + (substitution length) = 0]
|
||||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
@ -820,19 +853,23 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
|
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
// add (previously matched input length) -- current match and substitution handled below
|
||||||
cumulativeDiff += yychar - inputStart;
|
cumulativeDiff += yychar - inputStart;
|
||||||
int outputEnd = outputCharCount;
|
// position at (already output length) -- substitution handled below
|
||||||
|
int offsetCorrectionPos = outputCharCount;
|
||||||
int returnValue;
|
int returnValue;
|
||||||
if (escapeSCRIPT) {
|
if (escapeSCRIPT) {
|
||||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
outputSegment = inputSegment;
|
outputSegment = inputSegment;
|
||||||
returnValue = outputSegment.nextChar();
|
returnValue = outputSegment.nextChar();
|
||||||
} else {
|
} else {
|
||||||
|
// add (this match length) - (substitution length)
|
||||||
cumulativeDiff += yylength() - 1;
|
cumulativeDiff += yylength() - 1;
|
||||||
++outputEnd;
|
// add (substitution length)
|
||||||
|
++offsetCorrectionPos;
|
||||||
returnValue = SCRIPT_REPLACEMENT;
|
returnValue = SCRIPT_REPLACEMENT;
|
||||||
}
|
}
|
||||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||||
return returnValue;
|
return returnValue;
|
||||||
}
|
}
|
||||||
[^] { }
|
[^] { }
|
||||||
|
@ -843,19 +880,23 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
||||||
"</" \s* [sS][tT][yY][lL][eE] \s* ">" {
|
"</" \s* [sS][tT][yY][lL][eE] \s* ">" {
|
||||||
inputSegment.clear();
|
inputSegment.clear();
|
||||||
yybegin(YYINITIAL);
|
yybegin(YYINITIAL);
|
||||||
|
// add (previously matched input length) -- current match and substitution handled below
|
||||||
cumulativeDiff += yychar - inputStart;
|
cumulativeDiff += yychar - inputStart;
|
||||||
int outputEnd = outputCharCount;
|
// position the offset correction at (already output length) -- substitution handled below
|
||||||
|
int offsetCorrectionPos = outputCharCount;
|
||||||
int returnValue;
|
int returnValue;
|
||||||
if (escapeSTYLE) {
|
if (escapeSTYLE) {
|
||||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||||
outputSegment = inputSegment;
|
outputSegment = inputSegment;
|
||||||
returnValue = outputSegment.nextChar();
|
returnValue = outputSegment.nextChar();
|
||||||
} else {
|
} else {
|
||||||
|
// add (this match length) - (substitution length)
|
||||||
cumulativeDiff += yylength() - 1;
|
cumulativeDiff += yylength() - 1;
|
||||||
++outputEnd;
|
// add (substitution length)
|
||||||
|
++offsetCorrectionPos;
|
||||||
returnValue = STYLE_REPLACEMENT;
|
returnValue = STYLE_REPLACEMENT;
|
||||||
}
|
}
|
||||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||||
return returnValue;
|
return returnValue;
|
||||||
}
|
}
|
||||||
[^] { }
|
[^] { }
|
||||||
|
|
|
@ -36,6 +36,21 @@ import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
static private Analyzer newTestAnalyzer() {
|
||||||
|
return new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Reader initReader(Reader reader) {
|
||||||
|
return new HTMLStripCharFilter(CharReader.get(reader));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
|
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
|
||||||
//
|
//
|
||||||
public void test() throws IOException {
|
public void test() throws IOException {
|
||||||
|
@ -493,41 +508,17 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRandom() throws Exception {
|
public void testRandom() throws Exception {
|
||||||
Analyzer analyzer = new Analyzer() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
|
||||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected Reader initReader(Reader reader) {
|
|
||||||
return new HTMLStripCharFilter(CharReader.get(reader));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
int numRounds = RANDOM_MULTIPLIER * 10000;
|
int numRounds = RANDOM_MULTIPLIER * 10000;
|
||||||
checkRandomData(random, analyzer, numRounds);
|
checkRandomData(random, newTestAnalyzer(), numRounds);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRandomHugeStrings() throws Exception {
|
public void testRandomHugeStrings() throws Exception {
|
||||||
Analyzer analyzer = new Analyzer() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
|
||||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected Reader initReader(Reader reader) {
|
|
||||||
return new HTMLStripCharFilter(CharReader.get(reader));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
int numRounds = RANDOM_MULTIPLIER * 200;
|
int numRounds = RANDOM_MULTIPLIER * 200;
|
||||||
checkRandomData(random, analyzer, numRounds, 8192);
|
checkRandomData(random, newTestAnalyzer(), numRounds, 8192);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCloseBR() throws Exception {
|
||||||
|
checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), " Secretary)</br> [[M");
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testServerSideIncludes() throws Exception {
|
public void testServerSideIncludes() throws Exception {
|
||||||
|
@ -797,9 +788,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
public void testRandomBrokenHTML() throws Exception {
|
public void testRandomBrokenHTML() throws Exception {
|
||||||
int maxNumElements = 10000;
|
int maxNumElements = 10000;
|
||||||
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
|
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
|
||||||
Reader reader = new HTMLStripCharFilter
|
checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), text);
|
||||||
(CharReader.get(new StringReader(text)));
|
|
||||||
while (reader.read() != -1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRandomText() throws Exception {
|
public void testRandomText() throws Exception {
|
||||||
|
@ -838,18 +827,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testUTF16Surrogates() throws Exception {
|
public void testUTF16Surrogates() throws Exception {
|
||||||
Analyzer analyzer = new Analyzer() {
|
Analyzer analyzer = newTestAnalyzer();
|
||||||
@Override
|
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
|
||||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected Reader initReader(Reader reader) {
|
|
||||||
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
// Paired surrogates
|
// Paired surrogates
|
||||||
assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
|
assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
|
||||||
new String[] { "one", "two", "\uD86C\uDC01three" } );
|
new String[] { "one", "two", "\uD86C\uDC01three" } );
|
||||||
|
|
Loading…
Reference in New Issue