mirror of https://github.com/apache/lucene.git
LUCENE-3913: Fix HTMLStripCharFilter invalid final offset for input containing </br>
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304912 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1f7c31d711
commit
ada9780484
|
@ -391,188 +391,194 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
|
||||
checkAnalysisConsistency(random, a, useCharFilter, text);
|
||||
}
|
||||
}
|
||||
|
||||
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
|
||||
}
|
||||
|
||||
int remainder = random.nextInt(10);
|
||||
Reader reader = new StringReader(text);
|
||||
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
|
||||
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
|
||||
PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
|
||||
PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
|
||||
TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
|
||||
List<String> tokens = new ArrayList<String>();
|
||||
List<String> types = new ArrayList<String>();
|
||||
List<Integer> positions = new ArrayList<Integer>();
|
||||
List<Integer> positionLengths = new ArrayList<Integer>();
|
||||
List<Integer> startOffsets = new ArrayList<Integer>();
|
||||
List<Integer> endOffsets = new ArrayList<Integer>();
|
||||
ts.reset();
|
||||
|
||||
// First pass: save away "correct" tokens
|
||||
while (ts.incrementToken()) {
|
||||
tokens.add(termAtt.toString());
|
||||
if (typeAtt != null) types.add(typeAtt.type());
|
||||
if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
|
||||
if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
|
||||
if (offsetAtt != null) {
|
||||
startOffsets.add(offsetAtt.startOffset());
|
||||
endOffsets.add(offsetAtt.endOffset());
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
|
||||
int remainder = random.nextInt(10);
|
||||
Reader reader = new StringReader(text);
|
||||
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
|
||||
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
|
||||
PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
|
||||
PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
|
||||
TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
|
||||
List<String> tokens = new ArrayList<String>();
|
||||
List<String> types = new ArrayList<String>();
|
||||
List<Integer> positions = new ArrayList<Integer>();
|
||||
List<Integer> positionLengths = new ArrayList<Integer>();
|
||||
List<Integer> startOffsets = new ArrayList<Integer>();
|
||||
List<Integer> endOffsets = new ArrayList<Integer>();
|
||||
ts.reset();
|
||||
// verify reusing is "reproducible" and also get the normal tokenstream sanity checks
|
||||
if (!tokens.isEmpty()) {
|
||||
|
||||
// First pass: save away "correct" tokens
|
||||
while (ts.incrementToken()) {
|
||||
tokens.add(termAtt.toString());
|
||||
if (typeAtt != null) types.add(typeAtt.type());
|
||||
if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
|
||||
if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
|
||||
if (offsetAtt != null) {
|
||||
startOffsets.add(offsetAtt.startOffset());
|
||||
endOffsets.add(offsetAtt.endOffset());
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
// KWTokenizer (for example) can produce a token
|
||||
// even when input is length 0:
|
||||
if (text.length() != 0) {
|
||||
|
||||
// verify reusing is "reproducible" and also get the normal tokenstream sanity checks
|
||||
if (!tokens.isEmpty()) {
|
||||
|
||||
// KWTokenizer (for example) can produce a token
|
||||
// even when input is length 0:
|
||||
if (text.length() != 0) {
|
||||
|
||||
// (Optional) second pass: do something evil:
|
||||
final int evilness = random.nextInt(50);
|
||||
if (evilness == 17) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
|
||||
}
|
||||
// Throw an errant exception from the Reader:
|
||||
|
||||
MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
|
||||
evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
|
||||
reader = evilReader;
|
||||
|
||||
try {
|
||||
// NOTE: some Tokenizers go and read characters
|
||||
// when you call .setReader(Reader), eg
|
||||
// PatternTokenizer. This is a bit
|
||||
// iffy... (really, they should only
|
||||
// pull from the Reader when you call
|
||||
// .incrementToken(), I think?), but we
|
||||
// currently allow it, so, we must call
|
||||
// a.tokenStream inside the try since we may
|
||||
// hit the exc on init:
|
||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
|
||||
ts.reset();
|
||||
while (ts.incrementToken());
|
||||
fail("did not hit exception");
|
||||
} catch (RuntimeException re) {
|
||||
assertTrue(MockReaderWrapper.isMyEvilException(re));
|
||||
}
|
||||
try {
|
||||
ts.end();
|
||||
} catch (AssertionError ae) {
|
||||
// Catch & ignore MockTokenizer's
|
||||
// anger...
|
||||
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
|
||||
// OK
|
||||
} else {
|
||||
throw ae;
|
||||
}
|
||||
}
|
||||
ts.close();
|
||||
} else if (evilness == 7) {
|
||||
// Only consume a subset of the tokens:
|
||||
final int numTokensToRead = random.nextInt(tokens.size());
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
|
||||
}
|
||||
|
||||
reader = new StringReader(text);
|
||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||
ts.reset();
|
||||
for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
}
|
||||
try {
|
||||
ts.end();
|
||||
} catch (AssertionError ae) {
|
||||
// Catch & ignore MockTokenizer's
|
||||
// anger...
|
||||
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
|
||||
// OK
|
||||
} else {
|
||||
throw ae;
|
||||
}
|
||||
}
|
||||
ts.close();
|
||||
}
|
||||
}
|
||||
|
||||
// Final pass: verify clean tokenization matches
|
||||
// results from first pass:
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
|
||||
}
|
||||
reader = new StringReader(text);
|
||||
|
||||
if (random.nextInt(30) == 7) {
|
||||
// (Optional) second pass: do something evil:
|
||||
final int evilness = random.nextInt(50);
|
||||
if (evilness == 17) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
|
||||
}
|
||||
// Throw an errant exception from the Reader:
|
||||
|
||||
MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
|
||||
evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
|
||||
reader = evilReader;
|
||||
|
||||
try {
|
||||
// NOTE: some Tokenizers go and read characters
|
||||
// when you call .setReader(Reader), eg
|
||||
// PatternTokenizer. This is a bit
|
||||
// iffy... (really, they should only
|
||||
// pull from the Reader when you call
|
||||
// .incrementToken(), I think?), but we
|
||||
// currently allow it, so, we must call
|
||||
// a.tokenStream inside the try since we may
|
||||
// hit the exc on init:
|
||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
|
||||
ts.reset();
|
||||
while (ts.incrementToken());
|
||||
fail("did not hit exception");
|
||||
} catch (RuntimeException re) {
|
||||
assertTrue(MockReaderWrapper.isMyEvilException(re));
|
||||
}
|
||||
try {
|
||||
ts.end();
|
||||
} catch (AssertionError ae) {
|
||||
// Catch & ignore MockTokenizer's
|
||||
// anger...
|
||||
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
|
||||
// OK
|
||||
} else {
|
||||
throw ae;
|
||||
}
|
||||
}
|
||||
ts.close();
|
||||
} else if (evilness == 7) {
|
||||
// Only consume a subset of the tokens:
|
||||
final int numTokensToRead = random.nextInt(tokens.size());
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
|
||||
}
|
||||
|
||||
reader = new MockReaderWrapper(random, reader);
|
||||
reader = new StringReader(text);
|
||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||
ts.reset();
|
||||
for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
}
|
||||
try {
|
||||
ts.end();
|
||||
} catch (AssertionError ae) {
|
||||
// Catch & ignore MockTokenizer's
|
||||
// anger...
|
||||
if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
|
||||
// OK
|
||||
} else {
|
||||
throw ae;
|
||||
}
|
||||
}
|
||||
ts.close();
|
||||
}
|
||||
|
||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
||||
// offset + pos + posLength + type
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
types.toArray(new String[types.size()]),
|
||||
toIntArray(positions),
|
||||
toIntArray(positionLengths),
|
||||
text.length());
|
||||
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos + type
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
types.toArray(new String[types.size()]),
|
||||
toIntArray(positions),
|
||||
null,
|
||||
text.length());
|
||||
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
||||
// offset + pos + posLength
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
null,
|
||||
toIntArray(positions),
|
||||
toIntArray(positionLengths),
|
||||
text.length());
|
||||
} else if (posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
null,
|
||||
toIntArray(positions),
|
||||
null,
|
||||
text.length());
|
||||
} else if (offsetAtt != null) {
|
||||
// offset
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
text.length());
|
||||
} else {
|
||||
// terms only
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]));
|
||||
}
|
||||
|
||||
// Final pass: verify clean tokenization matches
|
||||
// results from first pass:
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
|
||||
}
|
||||
reader = new StringReader(text);
|
||||
|
||||
if (random.nextInt(30) == 7) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
|
||||
}
|
||||
|
||||
reader = new MockReaderWrapper(random, reader);
|
||||
}
|
||||
|
||||
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
|
||||
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
||||
// offset + pos + posLength + type
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
types.toArray(new String[types.size()]),
|
||||
toIntArray(positions),
|
||||
toIntArray(positionLengths),
|
||||
text.length());
|
||||
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos + type
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
types.toArray(new String[types.size()]),
|
||||
toIntArray(positions),
|
||||
null,
|
||||
text.length());
|
||||
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
||||
// offset + pos + posLength
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
null,
|
||||
toIntArray(positions),
|
||||
toIntArray(positionLengths),
|
||||
text.length());
|
||||
} else if (posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
null,
|
||||
toIntArray(positions),
|
||||
null,
|
||||
text.length());
|
||||
} else if (offsetAtt != null) {
|
||||
// offset
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]),
|
||||
toIntArray(startOffsets),
|
||||
toIntArray(endOffsets),
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
text.length());
|
||||
} else {
|
||||
// terms only
|
||||
assertTokenStreamContents(ts,
|
||||
tokens.toArray(new String[tokens.size()]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,10 +27,7 @@ import java.io.OutputStream;
|
|||
import java.io.PrintStream;
|
||||
import java.lang.reflect.Method;
|
||||
import java.nio.CharBuffer;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.*;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
|
@ -414,12 +411,51 @@ public class _TestUtil {
|
|||
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
|
||||
case 21: sb.append("\n"); break;
|
||||
case 22: sb.append(" ".substring(nextInt(random, 0, 10))); break;
|
||||
case 23: {
|
||||
sb.append("<");
|
||||
if (0 == nextInt(random, 0, 3)) {
|
||||
sb.append(" ".substring(nextInt(random, 1, 10)));
|
||||
}
|
||||
if (0 == nextInt(random, 0, 1)) {
|
||||
sb.append("/");
|
||||
if (0 == nextInt(random, 0, 3)) {
|
||||
sb.append(" ".substring(nextInt(random, 1, 10)));
|
||||
}
|
||||
}
|
||||
switch (nextInt(random, 0, 3)) {
|
||||
case 0: sb.append(randomlyRecaseCodePoints(random, "script")); break;
|
||||
case 1: sb.append(randomlyRecaseCodePoints(random, "style")); break;
|
||||
case 2: sb.append(randomlyRecaseCodePoints(random, "br")); break;
|
||||
// default: append nothing
|
||||
}
|
||||
sb.append(">".substring(nextInt(random, 0, 1)));
|
||||
break;
|
||||
}
|
||||
default: sb.append(randomSimpleString(random));
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Randomly upcases, downcases, or leaves intact each code point in the given string
|
||||
*/
|
||||
public static String randomlyRecaseCodePoints(Random random, String str) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int pos = 0;
|
||||
while (pos < str.length()) {
|
||||
int codePoint = str.codePointAt(pos);
|
||||
pos += Character.charCount(codePoint);
|
||||
String codePointSubstring = new String(new int[] { codePoint }, 0, 1);
|
||||
switch (nextInt(random, 0, 2)) {
|
||||
case 0: builder.append(codePointSubstring.toUpperCase()); break;
|
||||
case 1: builder.append(codePointSubstring.toLowerCase()); break;
|
||||
case 2: builder.append(codePointSubstring); // leave intact
|
||||
}
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private static final int[] blockStarts = {
|
||||
0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400,
|
||||
0x0500, 0x0530, 0x0590, 0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800,
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/23/12 2:15 AM */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/24/12 4:50 PM */
|
||||
|
||||
package org.apache.lucene.analysis.charfilter;
|
||||
|
||||
|
@ -39,7 +39,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
|
|||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||
* on 1/23/12 2:15 AM from the specification file
|
||||
* on 3/24/12 4:50 PM from the specification file
|
||||
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
|
||||
*/
|
||||
public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||
|
@ -30967,7 +30967,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
case START_TAG_TAIL_EXCLUDE:
|
||||
case SERVER_SIDE_INCLUDE:
|
||||
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
|
||||
// add (length of input that won't be output) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
outputSegment.clear();
|
||||
eofReturnValue = -1;
|
||||
|
@ -30975,7 +30977,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
}
|
||||
case CHARACTER_REFERENCE_TAIL: { // Substitute
|
||||
// At end of file, allow char refs without semicolons
|
||||
// add (length of input that won't be output) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
eofReturnValue = outputSegment.nextChar();
|
||||
break;
|
||||
|
@ -31095,6 +31099,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
{ yybegin(STYLE);
|
||||
}
|
||||
case 55: break;
|
||||
case 27:
|
||||
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
|
||||
}
|
||||
case 56: break;
|
||||
case 30:
|
||||
{ int length = yylength();
|
||||
inputSegment.write(zzBuffer, zzStartRead, length);
|
||||
|
@ -31104,7 +31118,30 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
outputSegment = entitySegment;
|
||||
yybegin(CHARACTER_REFERENCE_TAIL);
|
||||
}
|
||||
case 56: break;
|
||||
case 57: break;
|
||||
case 48:
|
||||
{ inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
// add (previously matched input length) -- current match and substitution handled below
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
// position the offset correction at (already output length) -- substitution handled below
|
||||
int offsetCorrectionPos = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSTYLE) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
// add (this match length) - (substitution length)
|
||||
cumulativeDiff += yylength() - 1;
|
||||
// add (substitution length)
|
||||
++offsetCorrectionPos;
|
||||
returnValue = STYLE_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
case 58: break;
|
||||
case 8:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
|
@ -31114,71 +31151,75 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
yybegin(START_TAG_TAIL_SUBSTITUTE);
|
||||
}
|
||||
}
|
||||
case 57: break;
|
||||
case 26:
|
||||
{ cumulativeDiff += inputSegment.length() + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 58: break;
|
||||
case 59: break;
|
||||
case 2:
|
||||
{ inputStart = yychar;
|
||||
inputSegment.clear();
|
||||
inputSegment.append('<');
|
||||
yybegin(LEFT_ANGLE_BRACKET);
|
||||
}
|
||||
case 59: break;
|
||||
case 34:
|
||||
{ cumulativeDiff += yychar - inputStart + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 60: break;
|
||||
case 47:
|
||||
{ cumulativeDiff += inputSegment.length() + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(CDATA);
|
||||
}
|
||||
case 61: break;
|
||||
case 27:
|
||||
{ cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
|
||||
}
|
||||
case 62: break;
|
||||
case 44:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||
}
|
||||
case 63: break;
|
||||
case 61: break;
|
||||
case 21:
|
||||
{ previousRestoreState = restoreState;
|
||||
restoreState = SERVER_SIDE_INCLUDE;
|
||||
yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
case 64: break;
|
||||
case 62: break;
|
||||
case 11:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
yybegin(LEFT_ANGLE_BRACKET_SPACE);
|
||||
}
|
||||
case 65: break;
|
||||
case 63: break;
|
||||
case 35:
|
||||
{ yybegin(SCRIPT);
|
||||
}
|
||||
case 66: break;
|
||||
case 64: break;
|
||||
case 42:
|
||||
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||
}
|
||||
case 67: break;
|
||||
case 65: break;
|
||||
case 10:
|
||||
{ inputSegment.append('!'); yybegin(BANG);
|
||||
}
|
||||
case 68: break;
|
||||
case 66: break;
|
||||
case 51:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
char lowSurrogate = '\u0000';
|
||||
try {
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(2, 6) + "'";
|
||||
}
|
||||
try { // Low surrogates are in decimal range [56320, 57343]
|
||||
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(9, 14) + "'";
|
||||
}
|
||||
if (Character.isLowSurrogate(lowSurrogate)) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
outputSegment.unsafeWrite(lowSurrogate);
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 67: break;
|
||||
case 4:
|
||||
{ yypushback(1);
|
||||
outputSegment = inputSegment;
|
||||
|
@ -31186,37 +31227,48 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 69: break;
|
||||
case 48:
|
||||
{ inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
int outputEnd = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSTYLE) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff += yylength() - 1;
|
||||
++outputEnd;
|
||||
returnValue = STYLE_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
case 70: break;
|
||||
case 68: break;
|
||||
case 43:
|
||||
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||
}
|
||||
case 71: break;
|
||||
case 14:
|
||||
{ cumulativeDiff += inputSegment.length() + yylength();
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
case 69: break;
|
||||
case 52:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
try { // High surrogates are in decimal range [55296, 56319]
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(1, 6) + "'";
|
||||
}
|
||||
if (Character.isHighSurrogate(highSurrogate)) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
try {
|
||||
outputSegment.unsafeWrite
|
||||
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 72: break;
|
||||
case 70: break;
|
||||
case 28:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
case 71: break;
|
||||
case 50:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
outputSegment = entitySegment;
|
||||
|
@ -31236,49 +31288,63 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
case 73: break;
|
||||
case 28:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
case 74: break;
|
||||
case 49:
|
||||
{ inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
int outputEnd = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSCRIPT) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff += yylength() - 1;
|
||||
++outputEnd;
|
||||
returnValue = SCRIPT_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
case 75: break;
|
||||
case 72: break;
|
||||
case 16:
|
||||
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
case 76: break;
|
||||
case 73: break;
|
||||
case 22:
|
||||
{ previousRestoreState = restoreState;
|
||||
restoreState = SERVER_SIDE_INCLUDE;
|
||||
yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
case 77: break;
|
||||
case 74: break;
|
||||
case 26:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 75: break;
|
||||
case 20:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
}
|
||||
case 76: break;
|
||||
case 47:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(CDATA);
|
||||
}
|
||||
case 77: break;
|
||||
case 33:
|
||||
{ yybegin(YYINITIAL);
|
||||
if (escapeBR) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_START_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
case 78: break;
|
||||
case 23:
|
||||
{ yybegin(restoreState); restoreState = previousRestoreState;
|
||||
|
@ -31288,28 +31354,20 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
{ yybegin(COMMENT);
|
||||
}
|
||||
case 80: break;
|
||||
case 25:
|
||||
{ cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
|
||||
}
|
||||
case 81: break;
|
||||
case 24:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 82: break;
|
||||
case 81: break;
|
||||
case 3:
|
||||
{ inputStart = yychar;
|
||||
inputSegment.clear();
|
||||
inputSegment.append('&');
|
||||
yybegin(AMPERSAND);
|
||||
}
|
||||
case 83: break;
|
||||
case 82: break;
|
||||
case 46:
|
||||
{ yybegin(SCRIPT);
|
||||
if (escapeSCRIPT) {
|
||||
|
@ -31319,6 +31377,15 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
case 83: break;
|
||||
case 14:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 84: break;
|
||||
case 6:
|
||||
{ int matchLength = yylength();
|
||||
|
@ -31354,14 +31421,23 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
}
|
||||
}
|
||||
case 85: break;
|
||||
case 34:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
|
||||
cumulativeDiff += yychar - inputStart + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 86: break;
|
||||
case 5:
|
||||
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 86: break;
|
||||
case 87: break;
|
||||
case 13:
|
||||
{ inputSegment.append(zzBuffer[zzStartRead]);
|
||||
}
|
||||
case 87: break;
|
||||
case 88: break;
|
||||
case 18:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
|
@ -31369,93 +31445,25 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
yybegin(END_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(END_TAG_TAIL_SUBSTITUTE);
|
||||
}
|
||||
}
|
||||
case 88: break;
|
||||
case 36:
|
||||
{ yybegin(YYINITIAL);
|
||||
if (escapeBR) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff
|
||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_END_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
case 89: break;
|
||||
case 33:
|
||||
{ yybegin(YYINITIAL);
|
||||
if (escapeBR) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff
|
||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_START_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
case 90: break;
|
||||
case 40:
|
||||
{ yybegin(SCRIPT_COMMENT);
|
||||
}
|
||||
case 91: break;
|
||||
case 90: break;
|
||||
case 37:
|
||||
{ cumulativeDiff += yylength();
|
||||
{ // add (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 92: break;
|
||||
case 91: break;
|
||||
case 12:
|
||||
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
|
||||
}
|
||||
case 93: break;
|
||||
case 51:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
char lowSurrogate = '\u0000';
|
||||
try {
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(2, 6) + "'";
|
||||
}
|
||||
try { // Low surrogates are in decimal range [56320, 57343]
|
||||
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(9, 14) + "'";
|
||||
}
|
||||
if (Character.isLowSurrogate(lowSurrogate)) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
outputSegment.unsafeWrite(lowSurrogate);
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 94: break;
|
||||
case 7:
|
||||
{ cumulativeDiff
|
||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 95: break;
|
||||
case 92: break;
|
||||
case 9:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
|
@ -31465,15 +31473,38 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
yybegin(START_TAG_TAIL_EXCLUDE);
|
||||
}
|
||||
}
|
||||
case 96: break;
|
||||
case 93: break;
|
||||
case 49:
|
||||
{ inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
// add (previously matched input length) -- current match and substitution handled below
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
// position at (already output length) -- substitution handled below
|
||||
int offsetCorrectionPos = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSCRIPT) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
// add (this match length) - (substitution length)
|
||||
cumulativeDiff += yylength() - 1;
|
||||
// add (substitution length)
|
||||
++offsetCorrectionPos;
|
||||
returnValue = SCRIPT_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
case 94: break;
|
||||
case 29:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
case 97: break;
|
||||
case 95: break;
|
||||
case 17:
|
||||
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
case 98: break;
|
||||
case 96: break;
|
||||
case 45:
|
||||
{ yybegin(STYLE);
|
||||
if (escapeSTYLE) {
|
||||
|
@ -31483,7 +31514,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
case 99: break;
|
||||
case 97: break;
|
||||
case 7:
|
||||
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 98: break;
|
||||
case 19:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
|
@ -31493,6 +31533,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
yybegin(END_TAG_TAIL_EXCLUDE);
|
||||
}
|
||||
}
|
||||
case 99: break;
|
||||
case 25:
|
||||
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
|
||||
}
|
||||
case 100: break;
|
||||
case 31:
|
||||
{ int matchLength = yylength();
|
||||
|
@ -31529,49 +31579,6 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
}
|
||||
}
|
||||
case 101: break;
|
||||
case 38:
|
||||
{ yybegin(restoreState);
|
||||
}
|
||||
case 102: break;
|
||||
case 41:
|
||||
{ yybegin(STYLE_COMMENT);
|
||||
}
|
||||
case 103: break;
|
||||
case 1:
|
||||
{ return zzBuffer[zzStartRead];
|
||||
}
|
||||
case 104: break;
|
||||
case 52:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
try { // High surrogates are in decimal range [55296, 56319]
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(1, 6) + "'";
|
||||
}
|
||||
if (Character.isHighSurrogate(highSurrogate)) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
try {
|
||||
outputSegment.unsafeWrite
|
||||
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 105: break;
|
||||
case 53:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
|
@ -31594,7 +31601,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
outputSegment.unsafeWrite(lowSurrogate);
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -31605,6 +31614,34 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 102: break;
|
||||
case 36:
|
||||
{ yybegin(YYINITIAL);
|
||||
if (escapeBR) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_END_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
case 103: break;
|
||||
case 38:
|
||||
{ yybegin(restoreState);
|
||||
}
|
||||
case 104: break;
|
||||
case 41:
|
||||
{ yybegin(STYLE_COMMENT);
|
||||
}
|
||||
case 105: break;
|
||||
case 1:
|
||||
{ return zzBuffer[zzStartRead];
|
||||
}
|
||||
case 106: break;
|
||||
default:
|
||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||
|
|
|
@ -293,7 +293,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
case START_TAG_TAIL_EXCLUDE:
|
||||
case SERVER_SIDE_INCLUDE:
|
||||
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
|
||||
// add (length of input that won't be output) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
outputSegment.clear();
|
||||
eofReturnValue = -1;
|
||||
|
@ -301,7 +303,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
}
|
||||
case CHARACTER_REFERENCE_TAIL: { // Substitute
|
||||
// At end of file, allow char refs without semicolons
|
||||
// add (length of input that won't be output) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
eofReturnValue = outputSegment.nextChar();
|
||||
break;
|
||||
|
@ -374,7 +378,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -403,7 +409,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -437,7 +445,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
outputSegment.unsafeWrite(lowSurrogate);
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -472,7 +482,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
outputSegment.unsafeWrite(lowSurrogate);
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -557,8 +569,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
|
||||
<CHARACTER_REFERENCE_TAIL> {
|
||||
";" {
|
||||
cumulativeDiff
|
||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
|
@ -574,9 +587,10 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff
|
||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_END_TAG_REPLACEMENT;
|
||||
}
|
||||
|
@ -612,7 +626,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
|
||||
<END_TAG_TAIL_EXCLUDE> {
|
||||
\s* ">" {
|
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -621,7 +637,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
|
||||
<END_TAG_TAIL_SUBSTITUTE> {
|
||||
\s* ">" {
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -637,7 +655,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
yybegin(LEFT_ANGLE_BRACKET_SPACE);
|
||||
}
|
||||
"?" [^>]* [/?] ">" {
|
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -649,8 +669,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
cumulativeDiff
|
||||
+= inputSegment.length() + yylength() - outputSegment.length();
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_START_TAG_REPLACEMENT;
|
||||
|
@ -708,7 +729,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
|
||||
<START_TAG_TAIL_EXCLUDE> {
|
||||
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
outputSegment = inputSegment;
|
||||
|
@ -718,7 +741,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
|
||||
<START_TAG_TAIL_SUBSTITUTE> {
|
||||
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -729,7 +754,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
<BANG> {
|
||||
"--" { yybegin(COMMENT); }
|
||||
">" {
|
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -742,7 +769,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
// [21] CDEnd ::= ']]>'
|
||||
//
|
||||
"[CDATA[" {
|
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(CDATA);
|
||||
|
@ -754,7 +783,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
|
||||
<CDATA> {
|
||||
"]]>" {
|
||||
// add (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
|
@ -764,7 +795,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
<COMMENT> {
|
||||
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
|
||||
"-->" {
|
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0]
|
||||
cumulativeDiff += yychar - inputStart + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -820,19 +853,23 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
// add (previously matched input length) -- current match and substitution handled below
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
int outputEnd = outputCharCount;
|
||||
// position at (already output length) -- substitution handled below
|
||||
int offsetCorrectionPos = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSCRIPT) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
// add (this match length) - (substitution length)
|
||||
cumulativeDiff += yylength() - 1;
|
||||
++outputEnd;
|
||||
// add (substitution length)
|
||||
++offsetCorrectionPos;
|
||||
returnValue = SCRIPT_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
[^] { }
|
||||
|
@ -843,19 +880,23 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
"</" \s* [sS][tT][yY][lL][eE] \s* ">" {
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
// add (previously matched input length) -- current match and substitution handled below
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
int outputEnd = outputCharCount;
|
||||
// position the offset correction at (already output length) -- substitution handled below
|
||||
int offsetCorrectionPos = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSTYLE) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
// add (this match length) - (substitution length)
|
||||
cumulativeDiff += yylength() - 1;
|
||||
++outputEnd;
|
||||
// add (substitution length)
|
||||
++offsetCorrectionPos;
|
||||
returnValue = STYLE_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(outputEnd, cumulativeDiff);
|
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
[^] { }
|
||||
|
|
|
@ -36,6 +36,21 @@ import org.apache.lucene.util._TestUtil;
|
|||
|
||||
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
static private Analyzer newTestAnalyzer() {
|
||||
return new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new HTMLStripCharFilter(CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
|
||||
//
|
||||
public void test() throws IOException {
|
||||
|
@ -493,41 +508,17 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new HTMLStripCharFilter(CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
|
||||
int numRounds = RANDOM_MULTIPLIER * 10000;
|
||||
checkRandomData(random, analyzer, numRounds);
|
||||
checkRandomData(random, newTestAnalyzer(), numRounds);
|
||||
}
|
||||
|
||||
public void testRandomHugeStrings() throws Exception {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new HTMLStripCharFilter(CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
|
||||
int numRounds = RANDOM_MULTIPLIER * 200;
|
||||
checkRandomData(random, analyzer, numRounds, 8192);
|
||||
checkRandomData(random, newTestAnalyzer(), numRounds, 8192);
|
||||
}
|
||||
|
||||
public void testCloseBR() throws Exception {
|
||||
checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), " Secretary)</br> [[M");
|
||||
}
|
||||
|
||||
public void testServerSideIncludes() throws Exception {
|
||||
|
@ -797,9 +788,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
public void testRandomBrokenHTML() throws Exception {
|
||||
int maxNumElements = 10000;
|
||||
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(text)));
|
||||
while (reader.read() != -1);
|
||||
checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), text);
|
||||
}
|
||||
|
||||
public void testRandomText() throws Exception {
|
||||
|
@ -838,18 +827,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testUTF16Surrogates() throws Exception {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
|
||||
}
|
||||
};
|
||||
Analyzer analyzer = newTestAnalyzer();
|
||||
// Paired surrogates
|
||||
assertAnalyzesTo(analyzer, " one two &#xD86C;&#xDC01;three",
|
||||
new String[] { "one", "two", "\uD86C\uDC01three" } );
|
||||
|
|
Loading…
Reference in New Issue