LUCENE-3969: Test all ctors in TestRandomChains and fix bugs discovered by the test

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1324960 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-04-11 19:54:09 +00:00
commit 16f5be0efb
29 changed files with 1237 additions and 284 deletions

View File

@ -100,7 +100,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
// offsetsAreCorrect also validates:
// - graph offsets are correct (all tokens leaving from
// pos X have the same startOffset; all tokens
// arriving to pos Y have the same endOffset)
// - offsets only move forwards (startOffset >=
// lastStartOffset)
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
boolean offsetsAreCorrect) throws IOException {
assertNotNull(output);
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@ -137,6 +144,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
ts.reset();
int pos = -1;
int lastStartOffset = 0;
for (int i = 0; i < output.length; i++) {
// extra safety to enforce that the state is not preserved, and also assign bogus values
ts.clearAttributes();
@ -176,7 +184,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
endOffset <= finalOffset.intValue());
}
if (posLengthAtt != null && posIncrAtt != null) {
if (offsetsAreCorrect) {
assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
lastStartOffset = offsetAtt.startOffset();
}
if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
// Validate offset consistency in the graph, ie
// all tokens leaving from a certain pos have the
// same startOffset, and all tokens arriving to a
@ -194,7 +207,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// We've seen a token leaving from this position
// before; verify the startOffset is the same:
//System.out.println(" + vs " + pos + " -> " + startOffset);
assertEquals(posToStartOffset.get(pos).intValue(), startOffset);
assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset);
}
final int endPos = pos + posLength;
@ -207,7 +220,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// We've seen a token arriving to this position
// before; verify the endOffset is the same:
//System.out.println(" + ve " + endPos + " -> " + endOffset);
assertEquals(posToEndOffset.get(endPos).intValue(), endOffset);
assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset);
}
}
}
@ -222,7 +235,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
}
}
assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
ts.end();
if (finalOffset != null) {
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@ -233,6 +246,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
ts.close();
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
}
@ -280,6 +297,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
assertAnalyzesTo(a, input, output, null, null, null, null, null);
@ -342,12 +363,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
/** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
checkRandomData(random, a, iterations, 20, false);
checkRandomData(random, a, iterations, 20, false, true);
}
/** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
checkRandomData(random, a, iterations, maxWordLength, false);
checkRandomData(random, a, iterations, maxWordLength, false, true);
}
/**
@ -355,43 +376,63 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
* @param simple true if only ascii strings will be used (try to avoid)
*/
public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
checkRandomData(random, a, iterations, 20, simple);
checkRandomData(random, a, iterations, 20, simple, true);
}
static class AnalysisThread extends Thread {
final int iterations;
final int maxWordLength;
final Random random;
final long seed;
final Analyzer a;
final boolean useCharFilter;
final boolean simple;
final boolean offsetsAreCorrect;
// NOTE: not volatile because we don't want the tests to
// add memory barriers (ie alter how threads
// interact)... so this is just "best effort":
public boolean failed;
AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) {
this.random = random;
AnalysisThread(long seed, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) {
this.seed = seed;
this.a = a;
this.iterations = iterations;
this.maxWordLength = maxWordLength;
this.useCharFilter = useCharFilter;
this.simple = simple;
this.offsetsAreCorrect = offsetsAreCorrect;
}
@Override
public void run() {
boolean success = false;
try {
// see the part in checkRandomData where it replays the same text again
// to verify reproducibility/reuse: hopefully this would catch thread hazards.
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
success = true;
} catch (IOException e) {
Rethrow.rethrow(e);
} finally {
failed = !success;
}
}
};
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
// now test with multiple threads
checkRandomData(random, a, iterations, maxWordLength, simple, true);
}
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
long seed = random.nextLong();
boolean useCharFilter = random.nextBoolean();
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
// now test with multiple threads: note we do the EXACT same thing we did before in each thread,
// so this should only really fail from another thread if it's an actual thread problem
int numThreads = _TestUtil.nextInt(random, 4, 8);
Thread threads[] = new Thread[numThreads];
AnalysisThread threads[] = new AnalysisThread[numThreads];
for (int i = 0; i < threads.length; i++) {
threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple);
threads[i] = new AnalysisThread(seed, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
}
for (int i = 0; i < threads.length; i++) {
threads[i].start();
@ -403,9 +444,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
throw new RuntimeException(e);
}
}
for (int i = 0; i < threads.length; i++) {
if (threads[i].failed) {
throw new RuntimeException("some thread(s) failed");
}
}
}
private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) throws IOException {
final LineFileDocs docs = new LineFileDocs(random);
@ -437,7 +483,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
try {
checkAnalysisConsistency(random, a, useCharFilter, text);
checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect);
} catch (Throwable t) {
// TODO: really we should pass a random seed to
// checkAnalysisConsistency then print it here too:
@ -477,6 +523,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
checkAnalysisConsistency(random, a, useCharFilter, text, true);
}
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
@ -616,7 +666,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
types.toArray(new String[types.size()]),
toIntArray(positions),
toIntArray(positionLengths),
text.length());
text.length(),
offsetsAreCorrect);
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
@ -626,7 +677,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
types.toArray(new String[types.size()]),
toIntArray(positions),
null,
text.length());
text.length(),
offsetsAreCorrect);
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
@ -636,7 +688,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
null,
toIntArray(positions),
toIntArray(positionLengths),
text.length());
text.length(),
offsetsAreCorrect);
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
@ -646,7 +699,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
null,
toIntArray(positions),
null,
text.length());
text.length(),
offsetsAreCorrect);
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
@ -656,7 +710,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
null,
null,
null,
text.length());
text.length(),
offsetsAreCorrect);
} else {
// terms only
assertTokenStreamContents(ts,
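
As a usage sketch of the new offsetsAreCorrect overload added above (hypothetical, assuming a test inside a BaseTokenStreamTestCase subclass with the inherited random field; the analyzer, input, and expected values are illustrative, not taken from the patch): a chain whose offsets are known to be unreliable can be checked in the relaxed mode by passing offsetsAreCorrect=false.

  public void testRelaxedOffsetChecks() throws Exception {
    // MockAnalyzer with a whitespace tokenizer and no lowercasing; the chain here is
    // illustrative -- in practice the flag is meant for chains with known offset bugs.
    Analyzer a = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
    assertAnalyzesTo(a, "some text",
                     new String[] { "some", "text" },
                     new int[]    { 0, 5 },   // startOffsets
                     new int[]    { 4, 9 },   // endOffsets
                     null,                    // types (not checked)
                     new int[]    { 1, 1 },   // posIncrements
                     null,                    // posLengths (not checked)
                     false);                  // offsetsAreCorrect: skip the strict offset checks
  }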

View File

@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
startPosData.startOffset = startOffset;
} else {
// Make sure our input isn't messing up offsets:
assert startPosData.startOffset == startOffset;
assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
}
final int endOffset = offsetAtt.endOffset();
@ -159,7 +159,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
endPosData.endOffset = endOffset;
} else {
// Make sure our input isn't messing up offsets:
assert endPosData.endOffset == endOffset;
assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
}
tokenPending = true;

View File

@ -76,7 +76,7 @@ public final class MockAnalyzer extends Analyzer {
* MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true}).
*/
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false);
this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
}
/**
@ -93,7 +93,8 @@ public final class MockAnalyzer extends Analyzer {
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
tokenizer.setEnableChecks(enableChecks);
TokenFilter filt = new MockTokenFilter(tokenizer, filter, enablePositionIncrements);
MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
filt.setEnablePositionIncrements(enablePositionIncrements);
return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
}
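
For callers of MockTokenFilter, the constructor change above moves the enablePositionIncrements flag from a constructor argument to a setter; a hedged migration sketch (tokenizer is assumed to be any existing TokenStream):

  // before (old three-argument constructor, removed by this patch):
  //   TokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET, false);
  // after: construct with two arguments, then use the setter if the new default (true) is not wanted
  MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
  filt.setEnablePositionIncrements(false);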

View File

@ -34,7 +34,9 @@ public class MockCharFilter extends CharStream {
// TODO: instead of fixed remainder... maybe a fixed
// random seed?
this.remainder = remainder;
assert remainder >= 0 && remainder < 10 : "invalid parameter";
if (remainder < 0 || remainder >= 10) {
throw new IllegalArgumentException("invalid remainder parameter (must be 0..10): " + remainder);
}
}
// for testing only, uses a remainder of 0

View File

@ -34,6 +34,9 @@ public final class MockFixedLengthPayloadFilter extends TokenFilter {
public MockFixedLengthPayloadFilter(Random random, TokenStream in, int length) {
super(in);
if (length < 0) {
throw new IllegalArgumentException("length must be >= 0");
}
this.random = random;
this.bytes = new byte[length];
this.payload = new Payload(bytes);

View File

@ -31,10 +31,12 @@ public final class MockRandomLookaheadTokenFilter extends LookaheadTokenFilter<L
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final Random random;
private final long seed;
public MockRandomLookaheadTokenFilter(Random random, TokenStream in) {
super(in);
this.random = random;
this.seed = random.nextLong();
this.random = new Random(seed);
}
@Override
@ -57,9 +59,6 @@ public final class MockRandomLookaheadTokenFilter extends LookaheadTokenFilter<L
if (!end) {
while (true) {
// We can use un-re-seeded random, because how far
// ahead we peek should never alter the resulting
// tokens as seen by the consumer:
if (random.nextInt(3) == 1) {
if (!peekToken()) {
if (DEBUG) {
@ -91,4 +90,10 @@ public final class MockRandomLookaheadTokenFilter extends LookaheadTokenFilter<L
}
return result;
}
@Override
public void reset() throws IOException {
super.reset();
random.setSeed(seed);
}
}

View File

@ -55,7 +55,7 @@ public final class MockTokenFilter extends TokenFilter {
makeString("with"))));
private final CharacterRunAutomaton filter;
private boolean enablePositionIncrements = false;
private boolean enablePositionIncrements = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@ -67,14 +67,16 @@ public final class MockTokenFilter extends TokenFilter {
* @param filter DFA representing the terms that should be removed.
* @param enablePositionIncrements true if the removal should accumulate position increments.
*/
public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter) {
super(input);
this.filter = filter;
this.enablePositionIncrements = enablePositionIncrements;
}
@Override
public boolean incrementToken() throws IOException {
// TODO: fix me when posInc=false, to work like FilteringTokenFilter in that case and not return
// initial token with posInc=0 ever
// return the first non-stop word found
int skippedPositions = 0;
while (input.incrementToken()) {

View File

@ -0,0 +1,170 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.Attribute;
// TODO: rename to OffsetsXXXTF? ie we only validate
// offsets (now anyway...)
// TODO: also make a DebuggingTokenFilter, that just prints
// all att values that come through it...
// TODO: BTSTC should just append this to the chain
// instead of checking itself:
/** A TokenFilter that checks consistency of the tokens (eg
* offsets are consistent with one another). */
public final class ValidatingTokenFilter extends TokenFilter {
private int pos;
private int lastStartOffset;
// Maps position to the start/end offset:
private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
private final PositionIncrementAttribute posIncAtt = getAttrIfExists(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = getAttrIfExists(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = getAttrIfExists(OffsetAttribute.class);
private final CharTermAttribute termAtt = getAttrIfExists(CharTermAttribute.class);
private final boolean offsetsAreCorrect;
private final String name;
// Returns null if the attr wasn't already added
private <A extends Attribute> A getAttrIfExists(Class<A> att) {
if (hasAttribute(att)) {
return getAttribute(att);
} else {
return null;
}
}
/** The name arg is used to identify this stage when
* throwing exceptions (useful if you have more than one
* instance in your chain). */
public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) {
super(in);
this.name = name;
this.offsetsAreCorrect = offsetsAreCorrect;
}
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) {
return false;
}
int startOffset = 0;
int endOffset = 0;
int posLen = 0;
if (posIncAtt != null) {
pos += posIncAtt.getPositionIncrement();
if (pos == -1) {
throw new IllegalStateException("first posInc must be > 0");
}
}
// System.out.println(" got token=" + termAtt + " pos=" + pos);
if (offsetAtt != null) {
startOffset = offsetAtt.startOffset();
endOffset = offsetAtt.endOffset();
if (startOffset < 0) {
throw new IllegalStateException(name + ": startOffset=" + startOffset + " is < 0");
}
if (endOffset < 0) {
throw new IllegalStateException(name + ": endOffset=" + endOffset + " is < 0");
}
if (endOffset < startOffset) {
throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt);
}
if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
}
lastStartOffset = offsetAtt.startOffset();
}
posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength();
if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) {
if (!posToStartOffset.containsKey(pos)) {
// First time we've seen a token leaving from this position:
posToStartOffset.put(pos, startOffset);
//System.out.println(" + s " + pos + " -> " + startOffset);
} else {
// We've seen a token leaving from this position
// before; verify the startOffset is the same:
//System.out.println(" + vs " + pos + " -> " + startOffset);
final int oldStartOffset = posToStartOffset.get(pos);
if (oldStartOffset != startOffset) {
throw new IllegalStateException(name + ": inconsistent startOffset at pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
}
}
final int endPos = pos + posLen;
if (!posToEndOffset.containsKey(endPos)) {
// First time we've seen a token arriving to this position:
posToEndOffset.put(endPos, endOffset);
//System.out.println(" + e " + endPos + " -> " + endOffset);
} else {
// We've seen a token arriving to this position
// before; verify the endOffset is the same:
//System.out.println(" + ve " + endPos + " -> " + endOffset);
final int oldEndOffset = posToEndOffset.get(endPos);
if (oldEndOffset != endOffset) {
throw new IllegalStateException(name + ": inconsistent endOffset at pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
}
}
}
return true;
}
@Override
public void end() throws IOException {
super.end();
// TODO: what else to validate
// TODO: check that endOffset is >= max(endOffset)
// we've seen
}
@Override
public void reset() throws IOException {
super.reset();
pos = -1;
posToStartOffset.clear();
posToEndOffset.clear();
lastStartOffset = 0;
}
}
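
A hedged sketch of how this new filter can be interleaved between stages so that a violation names the offending stage (the particular chain below is illustrative; reader is assumed to be some java.io.Reader):

  Tokenizer tok = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  TokenStream stage = new ValidatingTokenFilter(tok, "after MockTokenizer", true);
  stage = new MockTokenFilter(stage, MockTokenFilter.ENGLISH_STOPSET);
  stage = new ValidatingTokenFilter(stage, "after MockTokenFilter", true);
  // consuming 'stage' now throws IllegalStateException naming the stage whose
  // offsets or positions are inconsistent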

View File

@ -42,6 +42,16 @@ Bug fixes
* LUCENE-3820: PatternReplaceCharFilter could return invalid token positions.
(Dawid Weiss)
* LUCENE-3969: Throw IAE on bad arguments that could cause confusing errors in
CompoundWordTokenFilterBase, PatternTokenizer, PositionFilter,
SnowballFilter, PathHierarchyTokenizer, ReversePathHierarchyTokenizer,
WikipediaTokenizer, and KeywordTokenizer. ShingleFilter and
CommonGramsFilter now populate PositionLengthAttribute. Fixed
PathHierarchyTokenizer to reset() all state. Protect against AIOOBE in
ReversePathHierarchyTokenizer if skip is large. Fixed wrong final
offset calculation in PathHierarchyTokenizer.
(Mike McCandless, Uwe Schindler, Robert Muir)
New Features
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
@ -108,4 +118,4 @@ New Features
(Chris Male, Robert Muir)
* SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)

View File

@ -16,6 +16,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
@ -54,6 +55,7 @@ public final class CommonGramsFilter extends TokenFilter {
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
private int lastStartOffset;
private boolean lastWasCommon;
@ -166,6 +168,7 @@ public final class CommonGramsFilter extends TokenFilter {
buffer.getChars(0, length, termText, 0);
termAttribute.setLength(length);
posIncAttribute.setPositionIncrement(0);
posLenAttribute.setPositionLength(2); // bigram
offsetAttribute.setOffset(lastStartOffset, endOffset);
typeAttribute.setType(GRAM_TYPE);
buffer.setLength(0);

View File

@ -82,8 +82,17 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
super(input);
this.tokens=new LinkedList<CompoundToken>();
if (minWordSize < 0) {
throw new IllegalArgumentException("minWordSize cannot be negative");
}
this.minWordSize=minWordSize;
if (minSubwordSize < 0) {
throw new IllegalArgumentException("minSubwordSize cannot be negative");
}
this.minSubwordSize=minSubwordSize;
if (maxSubwordSize < 0) {
throw new IllegalArgumentException("maxSubwordSize cannot be negative");
}
this.maxSubwordSize=maxSubwordSize;
this.onlyLongestMatch=onlyLongestMatch;
this.dictionary = dictionary;

View File

@ -191,6 +191,8 @@ public class HyphenationCompoundWordTokenFilter extends
// we only put subwords to the token stream
// that are longer than minPartSize
if (partLength < this.minSubwordSize) {
// BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
// calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
continue;
}

View File

@ -43,16 +43,25 @@ public final class KeywordTokenizer extends Tokenizer {
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
}
termAtt.resizeBuffer(bufferSize);
}
public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
super(source, input);
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
}
termAtt.resizeBuffer(bufferSize);
}
public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
super(factory, input);
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
}
termAtt.resizeBuffer(bufferSize);
}
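
With these checks in place, an out-of-range bufferSize fails fast at construction time; a hedged sketch of the new behavior (assumed to run inside a JUnit-based test):

  try {
    new KeywordTokenizer(new StringReader("abc"), 0);   // bufferSize must be > 0
    fail("expected IllegalArgumentException");
  } catch (IllegalArgumentException expected) {
    // the new argument validation added by this patch
  }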

View File

@ -65,6 +65,12 @@ public class PathHierarchyTokenizer extends Tokenizer {
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
super(input);
if (bufferSize < 0) {
throw new IllegalArgumentException("bufferSize cannot be negative");
}
if (skip < 0) {
throw new IllegalArgumentException("skip cannot be negative");
}
termAtt.resizeBuffer(bufferSize);
this.delimiter = delimiter;
@ -85,10 +91,11 @@ public class PathHierarchyTokenizer extends Tokenizer {
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
private int startPosition = 0;
private int finalOffset = 0;
private int skipped = 0;
private boolean endDelimiter = false;
private StringBuilder resultToken;
private int charsRead = 0;
@Override
@ -112,12 +119,13 @@ public class PathHierarchyTokenizer extends Tokenizer {
while (true) {
int c = input.read();
if( c < 0 ){
if (c >= 0) {
charsRead++;
} else {
if( skipped > skip ) {
length += resultToken.length();
termAtt.setLength(length);
finalOffset = correctOffset(startPosition + length);
offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
if( added ){
resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length);
@ -125,7 +133,6 @@ public class PathHierarchyTokenizer extends Tokenizer {
return added;
}
else{
finalOffset = correctOffset(startPosition + length);
return false;
}
}
@ -168,8 +175,7 @@ public class PathHierarchyTokenizer extends Tokenizer {
}
length += resultToken.length();
termAtt.setLength(length);
finalOffset = correctOffset(startPosition + length);
offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition+length));
resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length);
return true;
@ -178,15 +184,17 @@ public class PathHierarchyTokenizer extends Tokenizer {
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(charsRead);
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
public void reset() throws IOException {
super.reset();
resultToken.setLength(0);
finalOffset = 0;
charsRead = 0;
endDelimiter = false;
skipped = 0;
startPosition = 0;
}
}

View File

@ -77,6 +77,12 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
super(input);
if (bufferSize < 0) {
throw new IllegalArgumentException("bufferSize cannot be negative");
}
if (skip < 0) {
throw new IllegalArgumentException("skip cannot be negative");
}
termAtt.resizeBuffer(bufferSize);
this.delimiter = delimiter;
this.replacement = replacement;
@ -137,7 +143,11 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
}
resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
resultToken.setLength(0);
endPosition = delimiterPositions.get(delimitersCount-1 - skip);
int idx = delimitersCount-1 - skip;
if (idx >= 0) {
// otherwise it's ok, because we will skip and return false
endPosition = delimiterPositions.get(idx);
}
finalOffset = correctOffset(length);
posAtt.setPositionIncrement(1);
}
@ -163,10 +173,11 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
public void reset() throws IOException {
super.reset();
resultToken.setLength(0);
finalOffset = 0;
endPosition = 0;
skipped = 0;
delimitersCount = -1;
delimiterPositions.clear();

View File

@ -69,8 +69,17 @@ public final class PatternTokenizer extends Tokenizer {
super(input);
this.pattern = pattern;
this.group = group;
// Use "" instead of str so don't consume chars
// (fillBuffer) from the input on throwing IAE below:
matcher = pattern.matcher("");
// confusingly, group count depends ENTIRELY on the pattern but is only accessible via matcher
if (group >= 0 && group > matcher.groupCount()) {
throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
}
fillBuffer(str, input);
matcher = pattern.matcher(str);
matcher.reset(str);
index = 0;
}
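
The added validation rejects a capturing-group index the pattern cannot provide, before any input is read; a hedged sketch (the three-argument constructor shape is assumed from the standard PatternTokenizer API):

  // the pattern has a single capturing group, so asking for group 2 now throws
  // IllegalArgumentException up front instead of failing later during tokenization
  try {
    new PatternTokenizer(new StringReader("ab ab"), Pattern.compile("(a)b"), 2);
    fail("expected IllegalArgumentException");
  } catch (IllegalArgumentException expected) {
    // invalid group specified
  }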

View File

@ -57,6 +57,9 @@ public final class PositionFilter extends TokenFilter {
*/
public PositionFilter(final TokenStream input, final int positionIncrement) {
super(input);
if (positionIncrement < 0) {
throw new IllegalArgumentException("positionIncrement may not be negative");
}
this.positionIncrement = positionIncrement;
}

View File

@ -23,9 +23,10 @@ import java.util.LinkedList;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
@ -150,6 +151,7 @@ public final class ShingleFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
@ -319,6 +321,7 @@ public final class ShingleFilter extends TokenFilter {
noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
posLenAtt.setPositionLength(builtGramSize);
isOutputHere = true;
gramSize.advance();
tokenAvailable = true;
@ -436,6 +439,8 @@ public final class ShingleFilter extends TokenFilter {
super.reset();
gramSize.reset();
inputWindow.clear();
nextInputStreamToken = null;
isNextInputStreamToken = false;
numFillerTokensToInsert = 0;
isOutputHere = false;
noShingleOutput = true;

View File

@ -67,7 +67,7 @@ public final class SnowballFilter extends TokenFilter {
Class.forName("org.tartarus.snowball.ext." + name + "Stemmer").asSubclass(SnowballProgram.class);
stemmer = stemClass.newInstance();
} catch (Exception e) {
throw new RuntimeException(e.toString());
throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e);
}
}

View File

@ -177,6 +177,12 @@ public final class WikipediaTokenizer extends Tokenizer {
}
private void init(int tokenOutput, Set<String> untokenizedTypes) {
// TODO: cutover to enum
if (tokenOutput != TOKENS_ONLY &&
tokenOutput != UNTOKENIZED_ONLY &&
tokenOutput != BOTH) {
throw new IllegalArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
}
this.tokenOutput = tokenOutput;
this.untokenizedTypes = untokenizedTypes;
}

View File

@ -19,6 +19,8 @@ package org.apache.lucene.analysis.charfilter;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -27,6 +29,8 @@ import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util._TestUtil;
import org.junit.Ignore;
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@ -190,4 +194,67 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
int numRounds = RANDOM_MULTIPLIER * 10000;
checkRandomData(random, analyzer, numRounds);
}
@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
public void testFinalOffsetSpecialCase() throws Exception {
final NormalizeCharMap map = new NormalizeCharMap();
map.add("t", "");
// even though the rule below has no effect, the test passes if you remove it!!
map.add("tmakdbl", "c");
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MappingCharFilter(map, CharReader.get(reader));
}
};
String text = "gzw f quaxot";
checkAnalysisConsistency(random, analyzer, false, text);
}
@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
public void testRandomMaps() throws Exception {
for (int i = 0; i < 100; i++) {
final NormalizeCharMap map = randomMap();
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MappingCharFilter(map, CharReader.get(reader));
}
};
int numRounds = RANDOM_MULTIPLIER * 100;
checkRandomData(random, analyzer, numRounds);
}
}
private NormalizeCharMap randomMap() {
NormalizeCharMap map = new NormalizeCharMap();
// we can't add duplicate keys, or NormalizeCharMap gets angry
Set<String> keys = new HashSet<String>();
int num = random.nextInt(5);
//System.out.println("NormalizeCharMap=");
for (int i = 0; i < num; i++) {
String key = _TestUtil.randomSimpleString(random);
if (!keys.contains(key)) {
String value = _TestUtil.randomSimpleString(random);
map.add(key, value);
keys.add(key);
//System.out.println("mapping: '" + key + "' => '" + value + "'");
}
}
return map;
}
}

View File

@ -18,17 +18,28 @@ package org.apache.lucene.analysis.core;
*/
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier;
import java.net.URL;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -36,67 +47,174 @@ import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.EmptyTokenizer;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.charfilter.CharFilter;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.payloads.IdentityEncoder;
import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.position.PositionFilter;
import org.apache.lucene.analysis.snowball.TestSnowball;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.th.ThaiWordFilter;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.tartarus.snowball.SnowballProgram;
import org.xml.sax.InputSource;
/** tests random analysis chains */
public class TestRandomChains extends BaseTokenStreamTestCase {
static List<Class<? extends Tokenizer>> tokenizers;
static List<Class<? extends TokenFilter>> tokenfilters;
static List<Class<? extends CharStream>> charfilters;
static List<Constructor<? extends Tokenizer>> tokenizers;
static List<Constructor<? extends TokenFilter>> tokenfilters;
static List<Constructor<? extends CharStream>> charfilters;
// TODO: fix those and remove
private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
static {
// TODO: can we promote some of these to be only
// offsets offenders?
Collections.<Class<?>>addAll(brokenComponents,
// TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
EmptyTokenizer.class,
// doesn't actually reset itself!
CachingTokenFilter.class,
// doesn't consume the whole stream!
LimitTokenCountFilter.class,
// Not broken: we forcefully add this, so we shouldn't
// also randomly pick it:
ValidatingTokenFilter.class,
// NOTE: these by themselves won't cause any 'basic assertions' to fail.
// but see https://issues.apache.org/jira/browse/LUCENE-3920, if any
// tokenfilter that combines words (e.g. shingles) comes after them,
// this will create bogus offsets because their 'offsets go backwards',
// causing shingle or whatever to make a single token with a
// startOffset that's > its endOffset
// (see LUCENE-3738 for a list of other offenders here)
// broken!
NGramTokenizer.class,
// broken!
NGramTokenFilter.class,
// broken!
EdgeNGramTokenizer.class,
// broken!
EdgeNGramTokenFilter.class,
// broken!
WordDelimiterFilter.class,
// broken!
TrimFilter.class,
// TODO: remove this class after we fix its finalOffset bug
MappingCharFilter.class
);
}
// TODO: also fix these and remove (maybe):
// Classes that don't produce consistent graph offsets:
private static final Set<Class<?>> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
static {
Collections.<Class<?>>addAll(brokenOffsetsComponents,
ReversePathHierarchyTokenizer.class,
PathHierarchyTokenizer.class,
HyphenationCompoundWordTokenFilter.class,
DictionaryCompoundWordTokenFilter.class,
// TODO: corrupts graphs (offset consistency check):
PositionFilter.class,
// TODO: it seems to mess up offsets!?
WikipediaTokenizer.class,
// TODO: doesn't handle graph inputs
ThaiWordFilter.class,
// TODO: doesn't handle graph inputs
CJKBigramFilter.class
);
}
@BeforeClass
public static void beforeClass() throws Exception {
List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
tokenizers = new ArrayList<Class<? extends Tokenizer>>();
tokenfilters = new ArrayList<Class<? extends TokenFilter>>();
charfilters = new ArrayList<Class<? extends CharStream>>();
for (Class<?> c : analysisClasses) {
// don't waste time with abstract classes or deprecated known-buggy ones
tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
charfilters = new ArrayList<Constructor<? extends CharStream>>();
for (final Class<?> c : analysisClasses) {
final int modifiers = c.getModifiers();
if (Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
|| c.getAnnotation(Deprecated.class) != null
|| c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
// TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
|| c.equals(EmptyTokenizer.class)
// doesn't actually reset itself!
|| c.equals(CachingTokenFilter.class)
// broken!
|| c.equals(NGramTokenizer.class)
// broken!
|| c.equals(NGramTokenFilter.class)
// broken!
|| c.equals(EdgeNGramTokenizer.class)
// broken!
|| c.equals(EdgeNGramTokenFilter.class)) {
if (
// don't waste time with abstract classes or deprecated known-buggy ones
Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
|| c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
|| brokenComponents.contains(c)
|| c.isAnnotationPresent(Deprecated.class)
|| !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c))
) {
continue;
}
if (Tokenizer.class.isAssignableFrom(c)) {
tokenizers.add(c.asSubclass(Tokenizer.class));
} else if (TokenFilter.class.isAssignableFrom(c)) {
tokenfilters.add(c.asSubclass(TokenFilter.class));
} else if (CharStream.class.isAssignableFrom(c)) {
charfilters.add(c.asSubclass(CharStream.class));
for (final Constructor<?> ctor : c.getConstructors()) {
// don't test synthetic or deprecated ctors, they likely have known bugs:
if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) {
continue;
}
if (Tokenizer.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
tokenizers.add(castConstructor(Tokenizer.class, ctor));
} else if (TokenFilter.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
tokenfilters.add(castConstructor(TokenFilter.class, ctor));
} else if (CharStream.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
charfilters.add(castConstructor(CharStream.class, ctor));
} else {
fail("Cannot get here");
}
}
}
final Comparator<Class<?>> classComp = new Comparator<Class<?>>() {
final Comparator<Constructor<?>> ctorComp = new Comparator<Constructor<?>>() {
@Override
public int compare(Class<?> arg0, Class<?> arg1) {
return arg0.getName().compareTo(arg1.getName());
public int compare(Constructor<?> arg0, Constructor<?> arg1) {
return arg0.toGenericString().compareTo(arg1.toGenericString());
}
};
Collections.sort(tokenizers, classComp);
Collections.sort(tokenfilters, classComp);
Collections.sort(charfilters, classComp);
Collections.sort(tokenizers, ctorComp);
Collections.sort(tokenfilters, ctorComp);
Collections.sort(charfilters, ctorComp);
if (VERBOSE) {
System.out.println("tokenizers = " + tokenizers);
System.out.println("tokenfilters = " + tokenfilters);
@ -111,170 +229,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
charfilters = null;
}
static class MockRandomAnalyzer extends Analyzer {
final long seed;
MockRandomAnalyzer(long seed) {
this.seed = seed;
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Random random = new Random(seed);
TokenizerSpec tokenizerspec = newTokenizer(random, reader);
TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer);
return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream);
}
@Override
protected Reader initReader(Reader reader) {
Random random = new Random(seed);
CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
return charfilterspec.reader;
}
@Override
public String toString() {
Random random = new Random(seed);
StringBuilder sb = new StringBuilder();
CharFilterSpec charfilterSpec = newCharFilterChain(random, new StringReader(""));
sb.append("\ncharfilters=");
sb.append(charfilterSpec.toString);
// intentional: initReader gets its own separate random
random = new Random(seed);
TokenizerSpec tokenizerSpec = newTokenizer(random, charfilterSpec.reader);
sb.append("\n");
sb.append("tokenizer=");
sb.append(tokenizerSpec.toString);
TokenFilterSpec tokenfilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
sb.append("\n");
sb.append("filters=");
sb.append(tokenfilterSpec.toString);
return sb.toString();
}
// create a new random tokenizer from classpath
private TokenizerSpec newTokenizer(Random random, Reader reader) {
TokenizerSpec spec = new TokenizerSpec();
boolean success = false;
while (!success) {
try {
// TODO: check Reader+Version,Version+Reader too
// also look for other variants and handle them special
int idx = random.nextInt(tokenizers.size());
try {
Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Version.class, Reader.class);
spec.tokenizer = c.newInstance(TEST_VERSION_CURRENT, reader);
} catch (NoSuchMethodException e) {
Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Reader.class);
spec.tokenizer = c.newInstance(reader);
}
spec.toString = tokenizers.get(idx).toString();
success = true;
} catch (Exception e) {
// ignore
}
}
return spec;
}
private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
CharFilterSpec spec = new CharFilterSpec();
spec.reader = reader;
StringBuilder descr = new StringBuilder();
int numFilters = random.nextInt(3);
for (int i = 0; i < numFilters; i++) {
boolean success = false;
while (!success) {
try {
// TODO: also look for other variants and handle them special
int idx = random.nextInt(charfilters.size());
try {
Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(Reader.class);
spec.reader = c.newInstance(spec.reader);
} catch (NoSuchMethodException e) {
Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(CharStream.class);
spec.reader = c.newInstance(CharReader.get(spec.reader));
}
if (descr.length() > 0) {
descr.append(",");
}
descr.append(charfilters.get(idx).toString());
success = true;
} catch (Exception e) {
// ignore
}
}
}
spec.toString = descr.toString();
return spec;
}
private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
TokenFilterSpec spec = new TokenFilterSpec();
spec.stream = tokenizer;
StringBuilder descr = new StringBuilder();
int numFilters = random.nextInt(5);
for (int i = 0; i < numFilters; i++) {
boolean success = false;
while (!success) {
try {
// TODO: also look for other variants and handle them special
int idx = random.nextInt(tokenfilters.size());
try {
Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(Version.class, TokenStream.class);
spec.stream = c.newInstance(TEST_VERSION_CURRENT, spec.stream);
} catch (NoSuchMethodException e) {
Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(TokenStream.class);
spec.stream = c.newInstance(spec.stream);
}
if (descr.length() > 0) {
descr.append(",");
}
descr.append(tokenfilters.get(idx).toString());
success = true;
} catch (Exception e) {
// ignore
}
}
}
spec.toString = descr.toString();
return spec;
}
/** Hack to work around the stupidness of Oracle's strict Java backwards compatibility.
* {@code Class<T>#getConstructors()} should return unmodifiable {@code List<Constructor<T>>} not array! */
@SuppressWarnings("unchecked")
private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
return (Constructor<T>) ctor;
}
static class TokenizerSpec {
Tokenizer tokenizer;
String toString;
}
static class TokenFilterSpec {
TokenStream stream;
String toString;
}
static class CharFilterSpec {
Reader reader;
String toString;
}
public void testRandomChains() throws Throwable {
int numIterations = atLeast(20);
for (int i = 0; i < numIterations; i++) {
MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
if (VERBOSE) {
System.out.println("Creating random analyzer:" + a);
}
try {
checkRandomData(random, a, 1000);
} catch (Throwable e) {
System.err.println("Exception from random analyzer: " + a);
throw e;
}
}
}
private static void getClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
final ClassLoader cld = TestRandomChains.class.getClassLoader();
final String path = pckgname.replace('.', '/');
@ -303,4 +263,568 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
}
}
}
private static interface ArgProducer {
Object create(Random random);
}
private static final Map<Class<?>,ArgProducer> argProducers = new IdentityHashMap<Class<?>,ArgProducer>() {{
put(int.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: could cause huge ram usage to use full int range for some filters
// (e.g. allocate enormous arrays)
// return Integer.valueOf(random.nextInt());
return Integer.valueOf(_TestUtil.nextInt(random, -100, 100));
}
});
put(char.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: fix any filters that care to throw IAE instead.
// also add a unicode validating filter to validate termAtt?
// return Character.valueOf((char)random.nextInt(65536));
while(true) {
char c = (char)random.nextInt(65536);
if (c < '\uD800' || c > '\uDFFF') {
return Character.valueOf(c);
}
}
}
});
put(float.class, new ArgProducer() {
@Override public Object create(Random random) {
return Float.valueOf(random.nextFloat());
}
});
put(boolean.class, new ArgProducer() {
@Override public Object create(Random random) {
return Boolean.valueOf(random.nextBoolean());
}
});
put(byte.class, new ArgProducer() {
@Override public Object create(Random random) {
// this wraps to negative when casting to byte
return Byte.valueOf((byte) random.nextInt(256));
}
});
put(byte[].class, new ArgProducer() {
@Override public Object create(Random random) {
byte bytes[] = new byte[random.nextInt(256)];
random.nextBytes(bytes);
return bytes;
}
});
put(Random.class, new ArgProducer() {
@Override public Object create(Random random) {
return new Random(random.nextLong());
}
});
put(Version.class, new ArgProducer() {
@Override public Object create(Random random) {
// we expect bugs in emulating old versions
return TEST_VERSION_CURRENT;
}
});
put(Set.class, new ArgProducer() {
@Override public Object create(Random random) {
// TypeTokenFilter
Set<String> set = new HashSet<String>();
int num = random.nextInt(5);
for (int i = 0; i < num; i++) {
set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
}
return set;
}
});
put(Collection.class, new ArgProducer() {
@Override public Object create(Random random) {
// CapitalizationFilter
Collection<char[]> col = new ArrayList<char[]>();
int num = random.nextInt(5);
for (int i = 0; i < num; i++) {
col.add(_TestUtil.randomSimpleString(random).toCharArray());
}
return col;
}
});
put(CharArraySet.class, new ArgProducer() {
@Override public Object create(Random random) {
int num = random.nextInt(10);
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, num, random.nextBoolean());
for (int i = 0; i < num; i++) {
// TODO: make nastier
set.add(_TestUtil.randomSimpleString(random));
}
return set;
}
});
put(Pattern.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: don't want to make the exponentially slow ones Dawid documents
// in TestPatternReplaceFilter, so don't use truly random patterns (for now)
return Pattern.compile("a");
}
});
put(PayloadEncoder.class, new ArgProducer() {
@Override public Object create(Random random) {
return new IdentityEncoder(); // the other encoders will throw exceptions if tokens aren't numbers?
}
});
put(HunspellDictionary.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: make nastier
InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
try {
return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
}
});
put(EdgeNGramTokenizer.Side.class, new ArgProducer() {
@Override public Object create(Random random) {
return random.nextBoolean()
? EdgeNGramTokenizer.Side.FRONT
: EdgeNGramTokenizer.Side.BACK;
}
});
put(EdgeNGramTokenFilter.Side.class, new ArgProducer() {
@Override public Object create(Random random) {
return random.nextBoolean()
? EdgeNGramTokenFilter.Side.FRONT
: EdgeNGramTokenFilter.Side.BACK;
}
});
put(HyphenationTree.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: make nastier
try {
InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
return hyphenator;
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
}
});
put(SnowballProgram.class, new ArgProducer() {
@Override public Object create(Random random) {
try {
String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)];
Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
return clazz.newInstance();
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
}
});
put(String.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: make nastier
if (random.nextBoolean()) {
// a token type
return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
} else {
return _TestUtil.randomSimpleString(random);
}
}
});
put(NormalizeCharMap.class, new ArgProducer() {
@Override public Object create(Random random) {
NormalizeCharMap map = new NormalizeCharMap();
// NormalizeCharMap does not allow duplicate keys, so track which keys we have already added
Set<String> keys = new HashSet<String>();
int num = random.nextInt(5);
//System.out.println("NormalizeCharMap=");
for (int i = 0; i < num; i++) {
String key = _TestUtil.randomSimpleString(random);
if (!keys.contains(key)) {
String value = _TestUtil.randomSimpleString(random);
map.add(key, value);
keys.add(key);
//System.out.println("mapping: '" + key + "' => '" + value + "'");
}
}
return map;
}
});
put(CharacterRunAutomaton.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: could probably use a purely random automaton
switch(random.nextInt(5)) {
case 0: return MockTokenizer.KEYWORD;
case 1: return MockTokenizer.SIMPLE;
case 2: return MockTokenizer.WHITESPACE;
case 3: return MockTokenFilter.EMPTY_STOPSET;
default: return MockTokenFilter.ENGLISH_STOPSET;
}
}
});
put(CharArrayMap.class, new ArgProducer() {
@Override public Object create(Random random) {
int num = random.nextInt(10);
CharArrayMap<String> map = new CharArrayMap<String>(TEST_VERSION_CURRENT, num, random.nextBoolean());
for (int i = 0; i < num; i++) {
// TODO: make nastier
map.put(_TestUtil.randomSimpleString(random), _TestUtil.randomSimpleString(random));
}
return map;
}
});
put(SynonymMap.class, new ArgProducer() {
@Override public Object create(Random random) {
SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
}
try {
return b.build();
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
}
private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
new CharsRef(output.replaceAll(" +", "\u0000")),
keepOrig);
}
private String randomNonEmptyString(Random random) {
while(true) {
final String s = _TestUtil.randomUnicodeString(random).trim();
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
return s;
}
}
}
});
}};
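// Ctor parameter types we can supply for each component kind: everything with a
// registered ArgProducer, plus the component-specific inputs added below.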
static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
static {
allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedTokenizerArgs.addAll(argProducers.keySet());
allowedTokenizerArgs.add(Reader.class);
allowedTokenizerArgs.add(AttributeFactory.class);
allowedTokenizerArgs.add(AttributeSource.class);
allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedTokenFilterArgs.addAll(argProducers.keySet());
allowedTokenFilterArgs.add(TokenStream.class);
// TODO: fix this one, that's broken:
allowedTokenFilterArgs.add(CommonGramsFilter.class);
allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedCharFilterArgs.addAll(argProducers.keySet());
allowedCharFilterArgs.add(Reader.class);
allowedCharFilterArgs.add(CharStream.class);
}
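// Creates a random value for the given ctor parameter type, failing the test if
// no ArgProducer is registered for that type.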
@SuppressWarnings("unchecked")
static <T> T newRandomArg(Random random, Class<T> paramType) {
final ArgProducer producer = argProducers.get(paramType);
assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
return (T) producer.create(random);
}
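// Builds ctor args for a tokenizer: the Reader parameter receives the supplied reader,
// AttributeFactory gets the default factory, AttributeSource is deliberately left null
// (forcing an IAE), and everything else comes from newRandomArg.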
static Object[] newTokenizerArgs(Random random, Reader reader, Class<?>[] paramTypes) {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == Reader.class) {
args[i] = reader;
} else if (paramType == AttributeFactory.class) {
// TODO: maybe the collator one...???
args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
} else if (paramType == AttributeSource.class) {
// TODO: args[i] = new AttributeSource();
// this is currently too scary to deal with!
args[i] = null; // force IAE
} else {
args[i] = newRandomArg(random, paramType);
}
}
return args;
}
static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == Reader.class) {
args[i] = reader;
} else if (paramType == CharStream.class) {
args[i] = CharReader.get(reader);
} else {
args[i] = newRandomArg(random, paramType);
}
}
return args;
}
static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == TokenStream.class) {
args[i] = stream;
} else if (paramType == CommonGramsFilter.class) {
// TODO: fix this one, that's broken: CommonGramsQueryFilter takes this one explicitly
args[i] = new CommonGramsFilter(TEST_VERSION_CURRENT, stream, newRandomArg(random, CharArraySet.class));
} else {
args[i] = newRandomArg(random, paramType);
}
}
return args;
}
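// Analyzer that assembles a random charfilter/tokenizer/tokenfilter chain from a seed;
// each method re-seeds its own Random, so the same seed always reproduces the same chain
// (and toString() can describe it).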
static class MockRandomAnalyzer extends Analyzer {
final long seed;
MockRandomAnalyzer(long seed) {
this.seed = seed;
}
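// Rebuilds the chain against an empty reader purely to find out whether any
// component with known-broken offsets was selected.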
public boolean offsetsAreCorrect() {
// TODO: can we not do the full chain here!?
Random random = new Random(seed);
TokenizerSpec tokenizerSpec = newTokenizer(random, new StringReader(""));
TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
return filterSpec.offsetsAreCorrect;
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Random random = new Random(seed);
TokenizerSpec tokenizerSpec = newTokenizer(random, reader);
TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
}
@Override
protected Reader initReader(Reader reader) {
Random random = new Random(seed);
CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
return charfilterspec.reader;
}
@Override
public String toString() {
Random random = new Random(seed);
StringBuilder sb = new StringBuilder();
CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
sb.append("\ncharfilters=");
sb.append(charFilterSpec.toString);
// intentional: initReader gets its own separate random
random = new Random(seed);
TokenizerSpec tokenizerSpec = newTokenizer(random, charFilterSpec.reader);
sb.append("\n");
sb.append("tokenizer=");
sb.append(tokenizerSpec.toString);
TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
sb.append("\n");
sb.append("filters=");
sb.append(tokenFilterSpec.toString);
sb.append("\n");
sb.append("offsetsAreCorrect=" + tokenFilterSpec.offsetsAreCorrect);
return sb.toString();
}
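// Instantiates a component via the given ctor and appends a description of the call
// to descr; returns null instead of throwing when the ctor rejects its args with IAE/UOE.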
private <T> T createComponent(Constructor<T> ctor, Object[] args, StringBuilder descr) {
try {
final T instance = ctor.newInstance(args);
/*
if (descr.length() > 0) {
descr.append(",");
}
*/
descr.append("\n ");
descr.append(ctor.getDeclaringClass().getName());
String params = Arrays.toString(args);
params = params.substring(1, params.length()-1);
descr.append("(").append(params).append(")");
return instance;
} catch (InvocationTargetException ite) {
final Throwable cause = ite.getCause();
if (cause instanceof IllegalArgumentException ||
cause instanceof UnsupportedOperationException) {
// that's OK, ignore
if (VERBOSE) {
System.err.println("Ignoring IAE/UOE from ctor:");
cause.printStackTrace(System.err);
}
} else {
Rethrow.rethrow(cause);
}
} catch (IllegalAccessException iae) {
Rethrow.rethrow(iae);
} catch (InstantiationException ie) {
Rethrow.rethrow(ie);
}
return null; // no success
}
// create a new random tokenizer from classpath
private TokenizerSpec newTokenizer(Random random, Reader reader) {
TokenizerSpec spec = new TokenizerSpec();
while (spec.tokenizer == null) {
final Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
final StringBuilder descr = new StringBuilder();
final CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader);
final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes());
spec.tokenizer = createComponent(ctor, args, descr);
if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
spec.offsetsAreCorrect = false;
}
if (spec.tokenizer == null) {
assertFalse(ctor.getDeclaringClass().getName() + " has read something in ctor but failed with UOE/IAE", wrapper.readSomething);
}
spec.toString = descr.toString();
}
return spec;
}
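// Wraps the reader in up to two random charfilters, re-drawing whenever a chosen
// ctor rejects its random args.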
private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
CharFilterSpec spec = new CharFilterSpec();
spec.reader = reader;
StringBuilder descr = new StringBuilder();
int numFilters = random.nextInt(3);
for (int i = 0; i < numFilters; i++) {
while (true) {
final Constructor<? extends CharStream> ctor = charfilters.get(random.nextInt(charfilters.size()));
final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
reader = createComponent(ctor, args, descr);
if (reader != null) {
spec.reader = reader;
break;
}
}
}
spec.toString = descr.toString();
return spec;
}
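// Stacks up to four random tokenfilters on the tokenizer, tracking whether offsets can
// still be trusted and interleaving ValidatingTokenFilters so failures point at the
// offending stage.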
private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
TokenFilterSpec spec = new TokenFilterSpec();
spec.offsetsAreCorrect = offsetsAreCorrect;
spec.stream = tokenizer;
StringBuilder descr = new StringBuilder();
int numFilters = random.nextInt(5);
for (int i = 0; i < numFilters; i++) {
// Insert ValidatingTF after each stage so we can
// catch problems right after the TF that "caused"
// them:
spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);
while (true) {
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
// hack: MockGraph/MockLookahead have assertions that will trip if they follow
// an offsets violator, so we can't use them after e.g. WikipediaTokenizer
if (!spec.offsetsAreCorrect &&
(ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
|| ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
continue;
}
final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
final TokenFilter flt = createComponent(ctor, args, descr);
if (flt != null) {
if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
spec.offsetsAreCorrect = false;
}
spec.stream = flt;
break;
}
}
}
// Insert a final ValidatingTF so we also catch problems
// introduced by the last filter in the chain:
spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);
spec.toString = descr.toString();
return spec;
}
}
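// Reader wrapper that records whether anything was read or skipped, so newTokenizer can
// assert that a ctor which failed with IAE/UOE did not consume input first.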
static final class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
boolean readSomething = false;
CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
super(CharReader.get(in));
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
readSomething = true;
return super.read(cbuf, off, len);
}
@Override
public int read() throws IOException {
readSomething = true;
return super.read();
}
@Override
public int read(CharBuffer target) throws IOException {
readSomething = true;
return super.read(target);
}
@Override
public int read(char[] cbuf) throws IOException {
readSomething = true;
return super.read(cbuf);
}
@Override
public long skip(long n) throws IOException {
readSomething = true;
return super.skip(n);
}
}
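// Simple holders for the randomly built components, their toString descriptions,
// and (where relevant) whether offsets are still trustworthy.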
static class TokenizerSpec {
Tokenizer tokenizer;
String toString;
boolean offsetsAreCorrect = true;
}
static class TokenFilterSpec {
TokenStream stream;
String toString;
boolean offsetsAreCorrect = true;
}
static class CharFilterSpec {
Reader reader;
String toString;
}
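// Each iteration builds a fresh random analyzer and blasts random text through it;
// checkRandomData's own offset validation is disabled because the ValidatingTokenFilters
// inside the chain already check offsets where the chain allows it.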
public void testRandomChains() throws Throwable {
int numIterations = atLeast(20);
for (int i = 0; i < numIterations; i++) {
MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
if (VERBOSE) {
System.out.println("Creating random analyzer:" + a);
}
try {
checkRandomData(random, a, 1000, 20, false,
false /* We already validate our own offsets... */);
} catch (Throwable e) {
System.err.println("Exception from random analyzer: " + a);
throw e;
}
}
}
}


@@ -65,7 +65,11 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
new String[] { "a", "b", "c", "" },
new int[] { 1, 0, 1, 3 },
new int[] { 2, 1, 2, 3 },
new int[] { 1, 1, 1, 1 });
null,
new int[] { 1, 1, 1, 1 },
null,
null,
false);
}
/**


@@ -72,14 +72,16 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
new int[] { 5, 9, 5 },
new int[] { 8, 12, 12 });
new int[] { 8, 12, 12 },
null, null, null, null, false);
wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
new int[] { 5, 5, 5 },
new int[] { 6, 6, 6 });
new int[] { 6, 6, 6 },
null, null, null, null, false);
}
@Test
@@ -123,7 +125,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar"},
new int[] { 8, 12, 8 },
new int[] { 11, 15, 15 });
new int[] { 11, 15, 15 },
null, null, null, null, false);
}
public void doSplit(final String input, String... output) throws Exception {
@@ -230,18 +233,27 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
new int[] { 1, 1 });
null,
new int[] { 1, 1 },
null,
false);
/* only in this case, posInc of 2 ?! */
assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
new int[] { 0, 9, 12, 9 },
new int[] { 6, 12, 13, 13 },
new int[] { 1, 1, 1, 0 });
null,
new int[] { 1, 1, 1, 0 },
null,
false);
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
new int[] { 1, 1, 1 });
null,
new int[] { 1, 1, 1 },
null,
false);
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
@@ -258,24 +270,36 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
new int[] { 0, 7, 16 },
new int[] { 6, 15, 20 },
new int[] { 1, 10, 1 });
null,
new int[] { 1, 10, 1 },
null,
false);
/* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
new int[] { 1, 11 });
null,
new int[] { 1, 11 },
null,
false);
/* in this case, the increment of 10 from the "/" is carried over */
assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
new int[] { 0, 9, 12, 9 },
new int[] { 6, 12, 13, 13 },
new int[] { 1, 11, 1, 0 });
null,
new int[] { 1, 11, 1, 0 },
null,
false);
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
new int[] { 1, 11, 1 });
null,
new int[] { 1, 11, 1 },
null,
false);
Analyzer a3 = new Analyzer() {
@Override
@@ -292,14 +316,20 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
new String[] { "lucene", "solr", "lucenesolr" },
new int[] { 0, 7, 0 },
new int[] { 6, 11, 11 },
new int[] { 1, 1, 0 });
null,
new int[] { 1, 1, 0 },
null,
false);
/* the stopword should add a gap here */
assertAnalyzesTo(a3, "the lucene.solr",
new String[] { "lucene", "solr", "lucenesolr" },
new int[] { 4, 11, 4 },
new int[] { 10, 15, 15 },
new int[] { 2, 1, 0 });
null,
new int[] { 2, 1, 0 },
null,
false);
}
/** blast some random strings through the analyzer */
@@ -322,7 +352,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
}
}


@@ -94,7 +94,15 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5});
assertTokenStreamContents(tokenizer,
new String[]{"e","de","cde"},
new int[]{4,3,2},
new int[]{5,5,5},
null,
null,
null,
null,
false);
}
public void testSmallTokenInStream() throws Exception {
@@ -151,7 +159,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
}
public void testEmptyTerm() throws Exception {


@@ -90,7 +90,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, 5 /* abcde */);
assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, null, null, null, 5 /* abcde */, false);
}
public void testReset() throws Exception {
@@ -109,8 +109,8 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
Analyzer b = new Analyzer() {
@Override
@@ -119,7 +119,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192, false, false);
}
}


@@ -77,7 +77,8 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
null, null, null, null, false
);
}
@@ -130,7 +131,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
new NGramTokenFilter(tokenizer, 2, 15));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
}
public void testEmptyTerm() throws Exception {


@@ -73,7 +73,11 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
5 /* abcde */
null,
null,
null,
5 /* abcde */,
false
);
}
@@ -98,7 +102,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
}
}


@@ -142,14 +142,16 @@ public class TestSnowball extends BaseTokenStreamTestCase {
}
}
/** for testing purposes ONLY */
public static String SNOWBALL_LANGS[] = {
"Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
"Finnish", "French", "German2", "German", "Hungarian", "Irish",
"Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
"Romanian", "Russian", "Spanish", "Swedish", "Turkish"
};
public void testEmptyTerm() throws IOException {
String langs[] = {
"Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
"Finnish", "French", "German2", "German", "Hungarian", "Irish",
"Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
"Romanian", "Russian", "Spanish", "Swedish", "Turkish"
};
for (final String lang : langs) {
for (final String lang : SNOWBALL_LANGS) {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {