mirror of https://github.com/apache/lucene.git
LUCENE-3969: make full offset checking optional and disable for the known (buggy) offenders
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311864 13f79535-47bb-0310-9956-ffa450edef68
parent 6563a58a2a
commit b67e7a0a9b
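The patch threads a new boolean, offsetsAreCorrect, through the BaseTokenStreamTestCase entry points (assertTokenStreamContents, assertAnalyzesTo, checkRandomData, checkAnalysisConsistency) so that tests for components with known-broken offsets can skip only the strict offset checks. A rough sketch of how a test would opt out, assuming a subclass of BaseTokenStreamTestCase; HypotheticalBuggyOffsetsFilter is a placeholder and not a class in this commit:

  // Sketch only: exercises an analyzer whose filter is known to corrupt offsets.
  public void testRandomStringsLenient() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        // HypotheticalBuggyOffsetsFilter stands in for a known offender
        // (e.g. WordDelimiterFilter or NGramTokenFilter in this commit).
        return new TokenStreamComponents(tokenizer, new HypotheticalBuggyOffsetsFilter(tokenizer));
      }
    };
    // final argument is the new offsetsAreCorrect flag; false disables the
    // "offsets never go backwards" and graph offset-consistency assertions only.
    checkRandomData(random, a, 10000 * RANDOM_MULTIPLIER, 20, false, false);
  }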
@@ -100,7 +100,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
   }
 
-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
+  // offsetsAreCorrect also validates:
+  //   - graph offsets are correct (all tokens leaving from
+  //     pos X have the same startOffset; all tokens
+  //     arriving to pos Y have the same endOffset)
+  //   - offsets only move forwards (startOffset >=
+  //     lastStartOffset)
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
+                                               boolean offsetsAreCorrect) throws IOException {
     assertNotNull(output);
     CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
 
@@ -137,6 +144,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
 
     ts.reset();
     int pos = -1;
+    int lastStartOffset = 0;
     for (int i = 0; i < output.length; i++) {
       // extra safety to enforce, that the state is not preserved and also assign bogus values
       ts.clearAttributes();
@@ -176,7 +184,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                    endOffset <= finalOffset.intValue());
       }
 
-      if (posLengthAtt != null && posIncrAtt != null) {
+      if (offsetsAreCorrect) {
+        assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
+        lastStartOffset = offsetAtt.startOffset();
+      }
+
+      if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
         // Validate offset consistency in the graph, ie
         // all tokens leaving from a certain pos have the
         // same startOffset, and all tokens arriving to a
@@ -233,6 +246,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     ts.close();
   }
 
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
+  }
+
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
     assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
   }
@@ -280,6 +297,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
     assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
   }
+
+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
+  }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
     assertAnalyzesTo(a, input, output, null, null, null, null, null);
@@ -342,12 +363,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
 
   /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
   public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
-    checkRandomData(random, a, iterations, 20, false);
+    checkRandomData(random, a, iterations, 20, false, true);
   }
 
   /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
-    checkRandomData(random, a, iterations, maxWordLength, false);
+    checkRandomData(random, a, iterations, maxWordLength, false, true);
   }
 
   /**
@@ -355,7 +376,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   * @param simple true if only ascii strings will be used (try to avoid)
   */
  public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
-    checkRandomData(random, a, iterations, 20, simple);
+    checkRandomData(random, a, iterations, 20, simple, true);
  }
 
  static class AnalysisThread extends Thread {
@@ -364,13 +385,15 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
    final Random random;
    final Analyzer a;
    final boolean simple;
+    final boolean offsetsAreCorrect;
 
-    AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) {
+    AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) {
      this.random = random;
      this.a = a;
      this.iterations = iterations;
      this.maxWordLength = maxWordLength;
      this.simple = simple;
+      this.offsetsAreCorrect = offsetsAreCorrect;
    }
 
    @Override
@@ -378,7 +401,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      try {
        // see the part in checkRandomData where it replays the same text again
        // to verify reproducability/reuse: hopefully this would catch thread hazards.
-        checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
+        checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect);
      } catch (IOException e) {
        Rethrow.rethrow(e);
      }
@@ -386,12 +409,16 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
  };
 
  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
-    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
+    checkRandomData(random, a, iterations, maxWordLength, simple, true);
+  }
+
+  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
+    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect);
    // now test with multiple threads
    int numThreads = _TestUtil.nextInt(random, 4, 8);
    Thread threads[] = new Thread[numThreads];
    for (int i = 0; i < threads.length; i++) {
-      threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple);
+      threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple, offsetsAreCorrect);
    }
    for (int i = 0; i < threads.length; i++) {
      threads[i].start();
@@ -405,7 +432,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
    }
  }
 
-  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
+  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) throws IOException {
 
    final LineFileDocs docs = new LineFileDocs(random);
 
@@ -437,7 +464,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      }
 
      try {
-        checkAnalysisConsistency(random, a, useCharFilter, text);
+        checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect);
      } catch (Throwable t) {
        // TODO: really we should pass a random seed to
        // checkAnalysisConsistency then print it here too:
@@ -477,6 +504,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
  }
 
  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
+    checkAnalysisConsistency(random, a, useCharFilter, text, true);
+  }
+
+  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
 
    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
@@ -616,7 +647,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                types.toArray(new String[types.size()]),
                                toIntArray(positions),
                                toIntArray(positionLengths),
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
    } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
      // offset + pos + type
      assertTokenStreamContents(ts,
@@ -626,7 +658,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                types.toArray(new String[types.size()]),
                                toIntArray(positions),
                                null,
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
    } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
      // offset + pos + posLength
      assertTokenStreamContents(ts,
@@ -636,7 +669,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                null,
                                toIntArray(positions),
                                toIntArray(positionLengths),
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
    } else if (posIncAtt != null && offsetAtt != null) {
      // offset + pos
      assertTokenStreamContents(ts,
@@ -646,7 +680,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                null,
                                toIntArray(positions),
                                null,
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
    } else if (offsetAtt != null) {
      // offset
      assertTokenStreamContents(ts,
@@ -656,7 +691,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                null,
                                null,
                                null,
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
    } else {
      // terms only
      assertTokenStreamContents(ts,
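The hunks above keep every pre-existing overload and forward it with offsetsAreCorrect=true, so current tests are unchanged. For a single assertion the opt-out looks roughly like the sketch below, mirroring the TestWordDelimiterFilter changes later in this commit; the analyzer argument and the expected values are placeholders:

  // Sketch: per-assertion opt-out via the new assertAnalyzesTo overload.
  private void checkWithLenientOffsets(Analyzer analyzer) throws IOException {
    assertAnalyzesTo(analyzer, "foo-bar", new String[] { "foo", "bar" },
                     new int[] { 0, 4 },   // startOffsets
                     new int[] { 3, 7 },   // endOffsets
                     null,                 // types
                     new int[] { 1, 1 },   // posIncrements
                     null,                 // posLengths
                     false);               // offsetsAreCorrect
  }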
@@ -27,7 +27,11 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.util.Attribute;
 
-// nocommit better name...?
+// nocommit rename to OffsetsXXXTF? ie we only validate
+// offsets (now anyway...)
+
+// TODO: also make a DebuggingTokenFilter, that just prints
+// all att values that come through it...
 
 // nocommit BTSTC should just append this to the chain
 // instead of checking itself:
@@ -37,6 +41,7 @@ import org.apache.lucene.util.Attribute;
 public final class ValidatingTokenFilter extends TokenFilter {
 
   private int pos;
+  private int lastStartOffset;
 
   // Maps position to the start/end offset:
   private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
@@ -46,6 +51,7 @@ public final class ValidatingTokenFilter extends TokenFilter {
   private final PositionLengthAttribute posLenAtt = getAttrIfExists(PositionLengthAttribute.class);
   private final OffsetAttribute offsetAtt = getAttrIfExists(OffsetAttribute.class);
   private final CharTermAttribute termAtt = getAttrIfExists(CharTermAttribute.class);
+  private final boolean offsetsAreCorrect;
 
   private final String name;
 
@@ -61,9 +67,10 @@ public final class ValidatingTokenFilter extends TokenFilter {
   /** The name arg is used to identify this stage when
    *  throwing exceptions (useful if you have more than one
    *  instance in your chain). */
-  public ValidatingTokenFilter(TokenStream in, String name) {
+  public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) {
     super(in);
     this.name = name;
+    this.offsetsAreCorrect = offsetsAreCorrect;
   }
 
   @Override
@@ -82,6 +89,8 @@ public final class ValidatingTokenFilter extends TokenFilter {
         throw new IllegalStateException("first posInc must be > 0");
       }
     }
 
+    // System.out.println("  got token=" + termAtt + " pos=" + pos);
+
     if (offsetAtt != null) {
       startOffset = offsetAtt.startOffset();
@@ -96,11 +105,15 @@ public final class ValidatingTokenFilter extends TokenFilter {
       if (endOffset < startOffset) {
         throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt);
       }
+      if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
+        throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
+      }
+      lastStartOffset = offsetAtt.startOffset();
     }
 
     posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength();
 
-    if (offsetAtt != null && posIncAtt != null) {
+    if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) {
 
       if (!posToStartOffset.containsKey(pos)) {
         // First time we've seen a token leaving from this position:
@@ -152,5 +165,6 @@ public final class ValidatingTokenFilter extends TokenFilter {
     pos = -1;
     posToStartOffset.clear();
     posToEndOffset.clear();
+    lastStartOffset = 0;
   }
 }
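The ValidatingTokenFilter hunks above give the per-stage validator the same flag plus a lastStartOffset check. A sketch of how a chain would be instrumented the way TestRandomChains does, so a failure names the stage that produced the bad token (FilterUnderTest is a placeholder for whatever component is being exercised):

  // Sketch: wrap each stage; offsetsAreCorrect=false keeps the start/end offset and
  // position-increment checks but skips the "offsets must not go backwards" check.
  TokenStream wrapWithValidation(Tokenizer tokenizer, boolean offsetsAreCorrect) {
    TokenStream stream = new ValidatingTokenFilter(tokenizer, "stage 0", offsetsAreCorrect);
    stream = new FilterUnderTest(stream);   // the component being exercised
    stream = new ValidatingTokenFilter(stream, "stage 1", offsetsAreCorrect);
    return stream;
  }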
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util._TestUtil;
+import org.junit.Ignore;
 
 public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
@@ -195,6 +196,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   }
 
   // nocommit: wrong final offset, fix this!
+  @Ignore
   public void testFinalOffsetSpecialCase() throws Exception {
     final NormalizeCharMap map = new NormalizeCharMap();
     map.add("t", "");
@@ -219,6 +221,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   }
 
   // nocommit: this is intended to fail until we fix bugs
+  @Ignore
   public void testRandomMaps() throws Exception {
     for (int i = 0; i < 100; i++) {
       final NormalizeCharMap map = randomMap();
@@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
 import org.apache.lucene.analysis.ValidatingTokenFilter;
 import org.apache.lucene.analysis.charfilter.CharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
@@ -63,6 +64,8 @@ import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.hunspell.HunspellDictionary;
 import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
 import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
+import org.apache.lucene.analysis.miscellaneous.TrimFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
@@ -91,42 +94,54 @@ import org.xml.sax.InputSource;
 
 /** tests random analysis chains */
 public class TestRandomChains extends BaseTokenStreamTestCase {
 
   static List<Constructor<? extends Tokenizer>> tokenizers;
   static List<Constructor<? extends TokenFilter>> tokenfilters;
   static List<Constructor<? extends CharStream>> charfilters;
 
   // TODO: fix those and remove
   private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
   static {
+    // nocommit can we promote some of these to be only
+    // offsets offenders?
     Collections.<Class<?>>addAll(brokenComponents,
       // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
       EmptyTokenizer.class,
       // doesn't actual reset itself!
       CachingTokenFilter.class,
-      // nocommit: corrumpts graphs (offset consistency check)
-      PositionFilter.class,
-      // doesn't consume whole stream!
-      LimitTokenCountFilter.class,
-      // broken!
-      NGramTokenizer.class,
-      // broken!
-      NGramTokenFilter.class,
-      // broken!
-      EdgeNGramTokenizer.class,
-      // broken!
-      EdgeNGramTokenFilter.class,
-      // fix these 4 to use 'real positions' and not stack the way they do:
-      // if you want that use positionfilter
-      PathHierarchyTokenizer.class,
-      ReversePathHierarchyTokenizer.class,
-      HyphenationCompoundWordTokenFilter.class,
-      DictionaryCompoundWordTokenFilter.class,
-      // Not broken: we forcefully add this, so we shouldn't
-      // also randomly pick it:
-      ValidatingTokenFilter.class
+      // doesn't consume whole stream!
+      LimitTokenCountFilter.class,
+      // Not broken: we forcefully add this, so we shouldn't
+      // also randomly pick it:
+      ValidatingTokenFilter.class,
+      // nocommit: randomly generate the Side enum param here; then promote to brokenOffsets?
+      EdgeNGramTokenizer.class,
+      // nocommit: randomly generate the Side enum param here; then promote to brokenOffsets?
+      EdgeNGramTokenFilter.class
     );
   }
 
+  // TODO: also fix these and remove (maybe):
+  // Classes that don't produce consistent graph offsets:
+  private static final Set<Class<?>> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+  static {
+    Collections.<Class<?>>addAll(brokenOffsetsComponents,
+      WordDelimiterFilter.class,
+      TrimFilter.class,
+      ReversePathHierarchyTokenizer.class,
+      PathHierarchyTokenizer.class,
+      HyphenationCompoundWordTokenFilter.class,
+      DictionaryCompoundWordTokenFilter.class,
+      // nocommit: corrumpts graphs (offset consistency check):
+      PositionFilter.class,
+      // broken!
+      NGramTokenizer.class,
+      // broken!
+      NGramTokenFilter.class,
+      // nocommit it seems to mess up offsets!?
+      WikipediaTokenizer.class
+    );
+  }
   @BeforeClass
   public static void beforeClass() throws Exception {
     List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
@@ -146,7 +161,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       ) {
         continue;
       }
-
       for (final Constructor<?> ctor : c.getConstructors()) {
         // don't test synthetic or deprecated ctors, they likely have known bugs:
         if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) {
@@ -154,22 +168,21 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         }
         if (Tokenizer.class.isAssignableFrom(c)) {
           assertTrue(ctor.toGenericString() + " has unsupported parameter types",
             allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
           tokenizers.add(castConstructor(Tokenizer.class, ctor));
         } else if (TokenFilter.class.isAssignableFrom(c)) {
           assertTrue(ctor.toGenericString() + " has unsupported parameter types",
             allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
           tokenfilters.add(castConstructor(TokenFilter.class, ctor));
         } else if (CharStream.class.isAssignableFrom(c)) {
           assertTrue(ctor.toGenericString() + " has unsupported parameter types",
             allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
           charfilters.add(castConstructor(CharStream.class, ctor));
         } else {
           fail("Cannot get here");
         }
       }
     }
-
     final Comparator<Constructor<?>> ctorComp = new Comparator<Constructor<?>>() {
       @Override
       public int compare(Constructor<?> arg0, Constructor<?> arg1) {
@@ -179,28 +192,24 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     Collections.sort(tokenizers, ctorComp);
     Collections.sort(tokenfilters, ctorComp);
     Collections.sort(charfilters, ctorComp);
-
     if (VERBOSE) {
       System.out.println("tokenizers = " + tokenizers);
       System.out.println("tokenfilters = " + tokenfilters);
       System.out.println("charfilters = " + charfilters);
     }
   }
-
   @AfterClass
   public static void afterClass() throws Exception {
     tokenizers = null;
     tokenfilters = null;
     charfilters = null;
   }
-
   /** Hack to work around the stupidness of Oracle's strict Java backwards compatibility.
    * {@code Class<T>#getConstructors()} should return unmodifiable {@code List<Constructor<T>>} not array! */
   @SuppressWarnings("unchecked")
   private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
     return (Constructor<T>) ctor;
   }
-
   private static void getClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
     final ClassLoader cld = TestRandomChains.class.getClassLoader();
     final String path = pckgname.replace('.', '/');
@@ -541,13 +550,21 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     MockRandomAnalyzer(long seed) {
       this.seed = seed;
     }
 
+    public boolean offsetsAreCorrect() {
+      // nocommit: can we not do the full chain here!?
+      Random random = new Random(seed);
+      TokenizerSpec tokenizerSpec = newTokenizer(random, new StringReader(""));
+      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      return filterSpec.offsetsAreCorrect;
+    }
+
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Random random = new Random(seed);
-      TokenizerSpec tokenizerspec = newTokenizer(random, reader);
-      TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer);
-      return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream);
+      TokenizerSpec tokenizerSpec = newTokenizer(random, reader);
+      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
     }
 
     @Override
@@ -561,19 +578,21 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     public String toString() {
       Random random = new Random(seed);
       StringBuilder sb = new StringBuilder();
-      CharFilterSpec charfilterSpec = newCharFilterChain(random, new StringReader(""));
+      CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
       sb.append("\ncharfilters=");
-      sb.append(charfilterSpec.toString);
+      sb.append(charFilterSpec.toString);
       // intentional: initReader gets its own separate random
       random = new Random(seed);
-      TokenizerSpec tokenizerSpec = newTokenizer(random, charfilterSpec.reader);
+      TokenizerSpec tokenizerSpec = newTokenizer(random, charFilterSpec.reader);
       sb.append("\n");
       sb.append("tokenizer=");
       sb.append(tokenizerSpec.toString);
-      TokenFilterSpec tokenfilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
+      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
       sb.append("\n");
       sb.append("filters=");
-      sb.append(tokenfilterSpec.toString);
+      sb.append(tokenFilterSpec.toString);
+      sb.append("\n");
+      sb.append("offsetsAreCorrect=" + tokenFilterSpec.offsetsAreCorrect);
       return sb.toString();
     }
 
@@ -620,6 +639,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         final CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader);
         final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes());
         spec.tokenizer = createComponent(ctor, args, descr);
+        if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
+          spec.offsetsAreCorrect = false;
+        }
         if (spec.tokenizer == null) {
           assertFalse(ctor.getDeclaringClass().getName() + " has read something in ctor but failed with UOE/IAE", wrapper.readSomething);
         }
@@ -648,8 +670,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       return spec;
     }
 
-    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
+    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
       TokenFilterSpec spec = new TokenFilterSpec();
+      spec.offsetsAreCorrect = offsetsAreCorrect;
       spec.stream = tokenizer;
       StringBuilder descr = new StringBuilder();
       int numFilters = random.nextInt(5);
@@ -658,13 +681,16 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         // Insert ValidatingTF after each stage so we can
         // catch problems right after the TF that "caused"
         // them:
-        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
+        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);
 
         while (true) {
           final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
           final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
           final TokenFilter flt = createComponent(ctor, args, descr);
           if (flt != null) {
+            if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
+              spec.offsetsAreCorrect = false;
+            }
             spec.stream = flt;
             break;
           }
@@ -674,7 +700,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       // Insert ValidatingTF after each stage so we can
       // catch problems right after the TF that "caused"
       // them:
-      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
+      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);
 
       spec.toString = descr.toString();
       return spec;
@@ -722,11 +748,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   static class TokenizerSpec {
     Tokenizer tokenizer;
     String toString;
+    boolean offsetsAreCorrect = true;
   }
 
   static class TokenFilterSpec {
     TokenStream stream;
     String toString;
+    boolean offsetsAreCorrect = true;
   }
 
   static class CharFilterSpec {
@@ -743,7 +771,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         System.out.println("Creating random analyzer:" + a);
       }
       try {
-        checkRandomData(random, a, 1000);
+        checkRandomData(random, a, 1000, 20, false,
+                        false /* We already validate our own offsets... */);
       } catch (Throwable e) {
         System.err.println("Exception from random analyzer: " + a);
         throw e;
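TestRandomChains now distinguishes brokenComponents (never picked at all) from brokenOffsetsComponents (still picked, but any chain containing one has its offsetsAreCorrect flag dropped to false before checkRandomData runs). If another offset offender turned up, registering it would presumably look like the following; SomeOtherBuggyFilter is hypothetical and not part of this patch:

  static {
    // Hypothetical follow-up only: add further known offenders here so random
    // chains containing them skip just the strict offset checks.
    Collections.<Class<?>>addAll(brokenOffsetsComponents,
      SomeOtherBuggyFilter.class);
  }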
@@ -65,7 +65,11 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
                     new String[] { "a", "b", "c", "" },
                     new int[] { 1, 0, 1, 3 },
                     new int[] { 2, 1, 2, 3 },
-                    new int[] { 1, 1, 1, 1 });
+                    null,
+                    new int[] { 1, 1, 1, 1 },
+                    null,
+                    null,
+                    false);
   }
 
   /**
@@ -72,14 +72,16 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     assertTokenStreamContents(wdf,
         new String[] { "foo", "bar", "foobar" },
         new int[] { 5, 9, 5 },
-        new int[] { 8, 12, 12 });
+        new int[] { 8, 12, 12 },
+        null, null, null, null, false);
 
     wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
 
     assertTokenStreamContents(wdf,
         new String[] { "foo", "bar", "foobar" },
         new int[] { 5, 5, 5 },
-        new int[] { 6, 6, 6 });
+        new int[] { 6, 6, 6 },
+        null, null, null, null, false);
   }
 
   @Test
@@ -123,7 +125,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     assertTokenStreamContents(wdf,
         new String[] { "foo", "bar", "foobar"},
         new int[] { 8, 12, 8 },
-        new int[] { 11, 15, 15 });
+        new int[] { 11, 15, 15 },
+        null, null, null, null, false);
   }
 
   public void doSplit(final String input, String... output) throws Exception {
@@ -230,18 +233,27 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
         new int[] { 0, 9 },
         new int[] { 6, 13 },
-        new int[] { 1, 1 });
+        null,
+        new int[] { 1, 1 },
+        null,
+        false);
 
     /* only in this case, posInc of 2 ?! */
     assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
         new int[] { 0, 9, 12, 9 },
         new int[] { 6, 12, 13, 13 },
-        new int[] { 1, 1, 1, 0 });
+        null,
+        new int[] { 1, 1, 1, 0 },
+        null,
+        false);
 
     assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
         new int[] { 0, 9, 15 },
         new int[] { 6, 14, 19 },
-        new int[] { 1, 1, 1 });
+        null,
+        new int[] { 1, 1, 1 },
+        null,
+        false);
 
     /* analyzer that will consume tokens with large position increments */
     Analyzer a2 = new Analyzer() {
@@ -258,24 +270,36 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
         new int[] { 0, 7, 16 },
         new int[] { 6, 15, 20 },
-        new int[] { 1, 10, 1 });
+        null,
+        new int[] { 1, 10, 1 },
+        null,
+        false);
 
     /* the "/" had a position increment of 10, where did it go?!?!! */
     assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
         new int[] { 0, 9 },
         new int[] { 6, 13 },
-        new int[] { 1, 11 });
+        null,
+        new int[] { 1, 11 },
+        null,
+        false);
 
     /* in this case, the increment of 10 from the "/" is carried over */
     assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
         new int[] { 0, 9, 12, 9 },
         new int[] { 6, 12, 13, 13 },
-        new int[] { 1, 11, 1, 0 });
+        null,
+        new int[] { 1, 11, 1, 0 },
+        null,
+        false);
 
     assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
         new int[] { 0, 9, 15 },
         new int[] { 6, 14, 19 },
-        new int[] { 1, 11, 1 });
+        null,
+        new int[] { 1, 11, 1 },
+        null,
+        false);
 
     Analyzer a3 = new Analyzer() {
       @Override
@@ -292,14 +316,20 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         new String[] { "lucene", "solr", "lucenesolr" },
         new int[] { 0, 7, 0 },
         new int[] { 6, 11, 11 },
-        new int[] { 1, 1, 0 });
+        null,
+        new int[] { 1, 1, 0 },
+        null,
+        false);
 
     /* the stopword should add a gap here */
     assertAnalyzesTo(a3, "the lucene.solr",
         new String[] { "lucene", "solr", "lucenesolr" },
         new int[] { 4, 11, 4 },
         new int[] { 10, 15, 15 },
-        new int[] { 2, 1, 0 });
+        null,
+        new int[] { 2, 1, 0 },
+        null,
+        false);
   }
 
   /** blast some random strings through the analyzer */
@@ -322,7 +352,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
       }
     };
-    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
   }
 }
 
@@ -94,7 +94,15 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
 
   public void testBackRangeOfNgrams() throws Exception {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5});
+    assertTokenStreamContents(tokenizer,
+                              new String[]{"e","de","cde"},
+                              new int[]{4,3,2},
+                              new int[]{5,5,5},
+                              null,
+                              null,
+                              null,
+                              null,
+                              false);
   }
 
   public void testSmallTokenInStream() throws Exception {
@@ -151,7 +159,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
             new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
       }
     };
-    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
   }
 
   public void testEmptyTerm() throws Exception {
@@ -90,7 +90,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
 
   public void testBackRangeOfNgrams() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, 5 /* abcde */);
+    assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, null, null, null, 5 /* abcde */, false);
   }
 
   public void testReset() throws Exception {
@@ -109,8 +109,8 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
 
     Analyzer b = new Analyzer() {
       @Override
@@ -119,7 +119,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
+    checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192, false, false);
   }
 }
@@ -77,7 +77,8 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
     assertTokenStreamContents(filter,
       new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
       new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
-      new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
+      new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+      null, null, null, null, false
     );
   }
 
@@ -130,7 +131,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
             new NGramTokenFilter(tokenizer, 2, 15));
       }
     };
-    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
   }
 
   public void testEmptyTerm() throws Exception {
@@ -73,7 +73,11 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
         new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
         new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
         new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
-        5 /* abcde */
+        null,
+        null,
+        null,
+        5 /* abcde */,
+        false
     );
   }
 
@@ -98,7 +102,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
     };
-    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
   }
 }