mirror of https://github.com/apache/lucene.git

commit 16f5be0efb

LUCENE-3969: Test all ctors in TestRandomChains and fix bugs discovered by the test

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1324960 13f79535-47bb-0310-9956-ffa450edef68

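The TestRandomChains changes further down in this commit discover analysis components on the classpath and exercise every public, non-deprecated constructor with random arguments. As a minimal, self-contained sketch of only that discovery step (plain reflection; the class name CtorDiscoverySketch is hypothetical and not part of the commit):

import java.lang.reflect.Constructor;
import java.lang.reflect.Modifier;
import java.util.ArrayList;
import java.util.List;

// Sketch: collect every public, non-deprecated constructor of a component class
// so each one can later be exercised with random arguments, mirroring what the
// real TestRandomChains does below.
public class CtorDiscoverySketch {
  static List<Constructor<?>> usableCtors(Class<?> c) {
    List<Constructor<?>> result = new ArrayList<Constructor<?>>();
    int mods = c.getModifiers();
    if (Modifier.isAbstract(mods) || !Modifier.isPublic(mods)) {
      return result; // nothing to test on abstract/non-public classes
    }
    for (Constructor<?> ctor : c.getConstructors()) {
      // skip synthetic or deprecated ctors, as the real test does
      if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) {
        continue;
      }
      result.add(ctor);
    }
    return result;
  }
}
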
@ -100,7 +100,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
|
||||
// offsetsAreCorrect also validates:
|
||||
// - graph offsets are correct (all tokens leaving from
|
||||
// pos X have the same startOffset; all tokens
|
||||
// arriving to pos Y have the same endOffset)
|
||||
// - offsets only move forwards (startOffset >=
|
||||
// lastStartOffset)
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
|
||||
boolean offsetsAreCorrect) throws IOException {
|
||||
assertNotNull(output);
|
||||
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
|
||||
|
||||
|
@ -137,6 +144,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
|
||||
ts.reset();
|
||||
int pos = -1;
|
||||
int lastStartOffset = 0;
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
// extra safety to enforce, that the state is not preserved and also assign bogus values
|
||||
ts.clearAttributes();
|
||||
|
@ -176,7 +184,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
endOffset <= finalOffset.intValue());
|
||||
}
|
||||
|
||||
if (posLengthAtt != null && posIncrAtt != null) {
|
||||
if (offsetsAreCorrect) {
|
||||
assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
|
||||
lastStartOffset = offsetAtt.startOffset();
|
||||
}
|
||||
|
||||
if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
|
||||
// Validate offset consistency in the graph, ie
|
||||
// all tokens leaving from a certain pos have the
|
||||
// same startOffset, and all tokens arriving to a
|
||||
|
@ -194,7 +207,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
// We've seen a token leaving from this position
|
||||
// before; verify the startOffset is the same:
|
||||
//System.out.println(" + vs " + pos + " -> " + startOffset);
|
||||
assertEquals(posToStartOffset.get(pos).intValue(), startOffset);
|
||||
assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset);
|
||||
}
|
||||
|
||||
final int endPos = pos + posLength;
|
||||
|
@ -207,7 +220,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
// We've seen a token arriving to this position
|
||||
// before; verify the endOffset is the same:
|
||||
//System.out.println(" + ve " + endPos + " -> " + endOffset);
|
||||
assertEquals(posToEndOffset.get(endPos).intValue(), endOffset);
|
||||
assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -222,7 +235,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
|
||||
}
|
||||
}
|
||||
assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
|
||||
assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
|
||||
ts.end();
|
||||
if (finalOffset != null) {
|
||||
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
|
||||
|
@ -233,6 +246,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
ts.close();
|
||||
}
|
||||
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
|
||||
}
|
||||
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
|
||||
}
|
||||
|
@ -280,6 +297,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
|
||||
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
|
||||
}
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
|
||||
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
|
||||
}
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
|
||||
assertAnalyzesTo(a, input, output, null, null, null, null, null);
|
||||
|
@ -342,12 +363,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
|
||||
/** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
|
||||
checkRandomData(random, a, iterations, 20, false);
|
||||
checkRandomData(random, a, iterations, 20, false, true);
|
||||
}
|
||||
|
||||
|
||||
/** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
|
||||
checkRandomData(random, a, iterations, maxWordLength, false);
|
||||
checkRandomData(random, a, iterations, maxWordLength, false, true);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -355,43 +376,63 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
* @param simple true if only ascii strings will be used (try to avoid)
|
||||
*/
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
|
||||
checkRandomData(random, a, iterations, 20, simple);
|
||||
checkRandomData(random, a, iterations, 20, simple, true);
|
||||
}
|
||||
|
||||
static class AnalysisThread extends Thread {
|
||||
final int iterations;
|
||||
final int maxWordLength;
|
||||
final Random random;
|
||||
final long seed;
|
||||
final Analyzer a;
|
||||
final boolean useCharFilter;
|
||||
final boolean simple;
|
||||
final boolean offsetsAreCorrect;
|
||||
|
||||
// NOTE: not volatile because we don't want the tests to
|
||||
// add memory barriers (ie alter how threads
|
||||
// interact)... so this is just "best effort":
|
||||
public boolean failed;
|
||||
|
||||
AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) {
|
||||
this.random = random;
|
||||
AnalysisThread(long seed, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) {
|
||||
this.seed = seed;
|
||||
this.a = a;
|
||||
this.iterations = iterations;
|
||||
this.maxWordLength = maxWordLength;
|
||||
this.useCharFilter = useCharFilter;
|
||||
this.simple = simple;
|
||||
this.offsetsAreCorrect = offsetsAreCorrect;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
boolean success = false;
|
||||
try {
|
||||
// see the part in checkRandomData where it replays the same text again
|
||||
// to verify reproducability/reuse: hopefully this would catch thread hazards.
|
||||
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
|
||||
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
|
||||
success = true;
|
||||
} catch (IOException e) {
|
||||
Rethrow.rethrow(e);
|
||||
} finally {
|
||||
failed = !success;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
|
||||
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
|
||||
// now test with multiple threads
|
||||
checkRandomData(random, a, iterations, maxWordLength, simple, true);
|
||||
}
|
||||
|
||||
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
|
||||
long seed = random.nextLong();
|
||||
boolean useCharFilter = random.nextBoolean();
|
||||
checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
|
||||
// now test with multiple threads: note we do the EXACT same thing we did before in each thread,
|
||||
// so this should only really fail from another thread if its an actual thread problem
|
||||
int numThreads = _TestUtil.nextInt(random, 4, 8);
|
||||
Thread threads[] = new Thread[numThreads];
|
||||
AnalysisThread threads[] = new AnalysisThread[numThreads];
|
||||
for (int i = 0; i < threads.length; i++) {
|
||||
threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple);
|
||||
threads[i] = new AnalysisThread(seed, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
|
||||
}
|
||||
for (int i = 0; i < threads.length; i++) {
|
||||
threads[i].start();
|
||||
|
@ -403,9 +444,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < threads.length; i++) {
|
||||
if (threads[i].failed) {
|
||||
throw new RuntimeException("some thread(s) failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
|
||||
private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) throws IOException {
|
||||
|
||||
final LineFileDocs docs = new LineFileDocs(random);
|
||||
|
||||
|
@ -437,7 +483,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
|
||||
try {
|
||||
checkAnalysisConsistency(random, a, useCharFilter, text);
|
||||
checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect);
|
||||
} catch (Throwable t) {
|
||||
// TODO: really we should pass a random seed to
|
||||
// checkAnalysisConsistency then print it here too:
|
||||
|
@ -477,6 +523,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
|
||||
checkAnalysisConsistency(random, a, useCharFilter, text, true);
|
||||
}
|
||||
|
||||
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
|
||||
|
@ -616,7 +666,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
types.toArray(new String[types.size()]),
|
||||
toIntArray(positions),
|
||||
toIntArray(positionLengths),
|
||||
text.length());
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos + type
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -626,7 +677,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
types.toArray(new String[types.size()]),
|
||||
toIntArray(positions),
|
||||
null,
|
||||
text.length());
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
|
||||
// offset + pos + posLength
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -636,7 +688,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
null,
|
||||
toIntArray(positions),
|
||||
toIntArray(positionLengths),
|
||||
text.length());
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
} else if (posIncAtt != null && offsetAtt != null) {
|
||||
// offset + pos
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -646,7 +699,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
null,
|
||||
toIntArray(positions),
|
||||
null,
|
||||
text.length());
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
} else if (offsetAtt != null) {
|
||||
// offset
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -656,7 +710,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
null,
|
||||
null,
|
||||
null,
|
||||
text.length());
|
||||
text.length(),
|
||||
offsetsAreCorrect);
|
||||
} else {
|
||||
// terms only
|
||||
assertTokenStreamContents(ts,
|
||||
|
|
|
@@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
        startPosData.startOffset = startOffset;
      } else {
        // Make sure our input isn't messing up offsets:
        assert startPosData.startOffset == startOffset;
        assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
      }

      final int endOffset = offsetAtt.endOffset();

@@ -159,7 +159,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
        endPosData.endOffset = endOffset;
      } else {
        // Make sure our input isn't messing up offsets:
        assert endPosData.endOffset == endOffset;
        assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
      }

      tokenPending = true;

@@ -76,7 +76,7 @@ public final class MockAnalyzer extends Analyzer {
   * MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false}).
   */
  public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false);
    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
  }

  /**

@@ -93,7 +93,8 @@ public final class MockAnalyzer extends Analyzer {
  public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
    tokenizer.setEnableChecks(enableChecks);
    TokenFilter filt = new MockTokenFilter(tokenizer, filter, enablePositionIncrements);
    MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
    filt.setEnablePositionIncrements(enablePositionIncrements);
    return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
  }

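With this change MockTokenFilter no longer takes the enablePositionIncrements flag in its constructor; callers build the filter and then call the setter, as createComponents above now does. A hedged usage sketch based only on calls visible elsewhere in this diff (the class name and the choice of WHITESPACE/EMPTY_STOPSET are illustrative):

import java.io.Reader;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;

// Sketch of the updated wiring: build the tokenizer, wrap it with the
// two-argument MockTokenFilter constructor, then toggle position increments
// explicitly via the setter.
public class MockChainSketch {
  static TokenStream buildChain(Reader reader, boolean enablePositionIncrements) {
    MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
    filt.setEnablePositionIncrements(enablePositionIncrements);
    return filt;
  }
}
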
@@ -34,7 +34,9 @@ public class MockCharFilter extends CharStream {
    // TODO: instead of fixed remainder... maybe a fixed
    // random seed?
    this.remainder = remainder;
    assert remainder >= 0 && remainder < 10 : "invalid parameter";
    if (remainder < 0 || remainder >= 10) {
      throw new IllegalArgumentException("invalid remainder parameter (must be 0..10): " + remainder);
    }
  }

  // for testing only, uses a remainder of 0

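The pattern here, repeated in several analyzers further down, is to replace an assert on constructor arguments with an explicit IllegalArgumentException, so bad values fail fast even when assertions are disabled. A standalone sketch of the idiom (hypothetical class, same check as above):

// Sketch only: validate a constructor argument with an explicit exception
// instead of an assert, so the check also runs without -ea.
public class RangeCheckSketch {
  private final int remainder;

  public RangeCheckSketch(int remainder) {
    if (remainder < 0 || remainder >= 10) {
      throw new IllegalArgumentException("invalid remainder parameter (must be 0..10): " + remainder);
    }
    this.remainder = remainder;
  }

  public int remainder() {
    return remainder;
  }
}
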
@@ -34,6 +34,9 @@ public final class MockFixedLengthPayloadFilter extends TokenFilter {

  public MockFixedLengthPayloadFilter(Random random, TokenStream in, int length) {
    super(in);
    if (length < 0) {
      throw new IllegalArgumentException("length must be >= 0");
    }
    this.random = random;
    this.bytes = new byte[length];
    this.payload = new Payload(bytes);

@@ -31,10 +31,12 @@ public final class MockRandomLookaheadTokenFilter extends LookaheadTokenFilter<L

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final Random random;
  private final long seed;

  public MockRandomLookaheadTokenFilter(Random random, TokenStream in) {
    super(in);
    this.random = random;
    this.seed = random.nextLong();
    this.random = new Random(seed);
  }

  @Override

@@ -57,9 +59,6 @@ public final class MockRandomLookaheadTokenFilter extends LookaheadTokenFilter<L

    if (!end) {
      while (true) {
        // We can use un-re-seeded random, because how far
        // ahead we peek should never alter the resulting
        // tokens as seen by the consumer:
        if (random.nextInt(3) == 1) {
          if (!peekToken()) {
            if (DEBUG) {

@@ -91,4 +90,10 @@ public final class MockRandomLookaheadTokenFilter extends LookaheadTokenFilter<L
    }
    return result;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    random.setSeed(seed);
  }
}

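The filter now draws a seed once at construction time and re-seeds its Random in reset(), so replaying the same input reproduces the same lookahead decisions. A standalone sketch of that idiom outside of any Lucene types (class and method names are illustrative):

import java.util.Random;

// Sketch of the seed-capture idiom used above: remember a seed when the object
// is built and restore it on reset() so a replay of the same input yields the
// same sequence of random decisions.
public class ReproducibleRandomSketch {
  private final long seed;
  private final Random random;

  public ReproducibleRandomSketch(Random source) {
    this.seed = source.nextLong();
    this.random = new Random(seed);
  }

  public boolean nextDecision() {
    return random.nextInt(3) == 1; // same distribution as the filter's peek choice
  }

  public void reset() {
    random.setSeed(seed); // replay from the beginning
  }
}
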
@@ -55,7 +55,7 @@ public final class MockTokenFilter extends TokenFilter {
      makeString("with"))));

  private final CharacterRunAutomaton filter;
  private boolean enablePositionIncrements = false;
  private boolean enablePositionIncrements = true;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

@@ -67,14 +67,16 @@ public final class MockTokenFilter extends TokenFilter {
   * @param filter DFA representing the terms that should be removed.
   * @param enablePositionIncrements true if the removal should accumulate position increments.
   */
  public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
  public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter) {
    super(input);
    this.filter = filter;
    this.enablePositionIncrements = enablePositionIncrements;
  }

  @Override
  public boolean incrementToken() throws IOException {
    // TODO: fix me when posInc=false, to work like FilteringTokenFilter in that case and not return
    // initial token with posInc=0 ever

    // return the first non-stop word found
    int skippedPositions = 0;
    while (input.incrementToken()) {

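For context, enablePositionIncrements=true (now the default above) means a removed token's position increment is accumulated onto the next token that is kept. A hedged sketch of that behaviour as a tiny stand-alone filter (not MockTokenFilter itself; end-of-stream bookkeeping is omitted):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// Sketch: drop tokens equal to one literal stop word while accumulating their
// position increments onto the next kept token, which is the behaviour that
// enablePositionIncrements=true selects above.
public final class SimpleStopSketch extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final String stopWord;

  public SimpleStopSketch(TokenStream input, String stopWord) {
    super(input);
    this.stopWord = stopWord;
  }

  @Override
  public boolean incrementToken() throws IOException {
    int skippedPositions = 0;
    while (input.incrementToken()) {
      if (!stopWord.contentEquals(termAtt)) {
        // carry the increments of the skipped stop words forward
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
        return true;
      }
      skippedPositions += posIncrAtt.getPositionIncrement();
    }
    return false;
  }
}
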
@ -0,0 +1,170 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
// TODO: rename to OffsetsXXXTF? ie we only validate
|
||||
// offsets (now anyway...)
|
||||
|
||||
// TODO: also make a DebuggingTokenFilter, that just prints
|
||||
// all att values that come through it...
|
||||
|
||||
// TODO: BTSTC should just append this to the chain
|
||||
// instead of checking itself:
|
||||
|
||||
/** A TokenFilter that checks consistency of the tokens (eg
|
||||
* offsets are consistent with one another). */
|
||||
public final class ValidatingTokenFilter extends TokenFilter {
|
||||
|
||||
private int pos;
|
||||
private int lastStartOffset;
|
||||
|
||||
// Maps position to the start/end offset:
|
||||
private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
|
||||
private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
|
||||
|
||||
private final PositionIncrementAttribute posIncAtt = getAttrIfExists(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLenAtt = getAttrIfExists(PositionLengthAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = getAttrIfExists(OffsetAttribute.class);
|
||||
private final CharTermAttribute termAtt = getAttrIfExists(CharTermAttribute.class);
|
||||
private final boolean offsetsAreCorrect;
|
||||
|
||||
private final String name;
|
||||
|
||||
// Returns null if the attr wasn't already added
|
||||
private <A extends Attribute> A getAttrIfExists(Class<A> att) {
|
||||
if (hasAttribute(att)) {
|
||||
return getAttribute(att);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/** The name arg is used to identify this stage when
|
||||
* throwing exceptions (useful if you have more than one
|
||||
* instance in your chain). */
|
||||
public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) {
|
||||
super(in);
|
||||
this.name = name;
|
||||
this.offsetsAreCorrect = offsetsAreCorrect;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int startOffset = 0;
|
||||
int endOffset = 0;
|
||||
int posLen = 0;
|
||||
|
||||
if (posIncAtt != null) {
|
||||
pos += posIncAtt.getPositionIncrement();
|
||||
if (pos == -1) {
|
||||
throw new IllegalStateException("first posInc must be > 0");
|
||||
}
|
||||
}
|
||||
|
||||
// System.out.println(" got token=" + termAtt + " pos=" + pos);
|
||||
|
||||
if (offsetAtt != null) {
|
||||
startOffset = offsetAtt.startOffset();
|
||||
endOffset = offsetAtt.endOffset();
|
||||
|
||||
if (startOffset < 0) {
|
||||
throw new IllegalStateException(name + ": startOffset=" + startOffset + " is < 0");
|
||||
}
|
||||
if (endOffset < 0) {
|
||||
throw new IllegalStateException(name + ": endOffset=" + endOffset + " is < 0");
|
||||
}
|
||||
if (endOffset < startOffset) {
|
||||
throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt);
|
||||
}
|
||||
if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
|
||||
throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
|
||||
}
|
||||
lastStartOffset = offsetAtt.startOffset();
|
||||
}
|
||||
|
||||
posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength();
|
||||
|
||||
if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) {
|
||||
|
||||
if (!posToStartOffset.containsKey(pos)) {
|
||||
// First time we've seen a token leaving from this position:
|
||||
posToStartOffset.put(pos, startOffset);
|
||||
//System.out.println(" + s " + pos + " -> " + startOffset);
|
||||
} else {
|
||||
// We've seen a token leaving from this position
|
||||
// before; verify the startOffset is the same:
|
||||
//System.out.println(" + vs " + pos + " -> " + startOffset);
|
||||
final int oldStartOffset = posToStartOffset.get(pos);
|
||||
if (oldStartOffset != startOffset) {
|
||||
throw new IllegalStateException(name + ": inconsistent startOffset at pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
|
||||
}
|
||||
}
|
||||
|
||||
final int endPos = pos + posLen;
|
||||
|
||||
if (!posToEndOffset.containsKey(endPos)) {
|
||||
// First time we've seen a token arriving to this position:
|
||||
posToEndOffset.put(endPos, endOffset);
|
||||
//System.out.println(" + e " + endPos + " -> " + endOffset);
|
||||
} else {
|
||||
// We've seen a token arriving to this position
|
||||
// before; verify the endOffset is the same:
|
||||
//System.out.println(" + ve " + endPos + " -> " + endOffset);
|
||||
final int oldEndOffset = posToEndOffset.get(endPos);
|
||||
if (oldEndOffset != endOffset) {
|
||||
throw new IllegalStateException(name + ": inconsistent endOffset at pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
super.end();
|
||||
|
||||
// TODO: what else to validate
|
||||
|
||||
// TODO: check that endOffset is >= max(endOffset)
|
||||
// we've seen
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
pos = -1;
|
||||
posToStartOffset.clear();
|
||||
posToEndOffset.clear();
|
||||
lastStartOffset = 0;
|
||||
}
|
||||
}
|
|
@@ -42,6 +42,16 @@ Bug fixes
* LUCENE-3820: PatternReplaceCharFilter could return invalid token positions.
  (Dawid Weiss)

* LUCENE-3969: Throw IAE on bad arguments that could cause confusing errors in
  CompoundWordTokenFilterBase, PatternTokenizer, PositionFilter,
  SnowballFilter, PathHierarchyTokenizer, ReversePathHierarchyTokenizer,
  WikipediaTokenizer, and KeywordTokenizer. ShingleFilter and
  CommonGramsFilter now populate PositionLengthAttribute. Fixed
  PathHierarchyTokenizer to reset() all state. Protect against AIOOBE in
  ReversePathHierarchyTokenizer if skip is large. Fixed wrong final
  offset calculation in PathHierarchyTokenizer.
  (Mike McCandless, Uwe Schindler, Robert Muir)

New Features

* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer

@@ -108,4 +118,4 @@ New Features
  (Chris Male, Robert Muir)

* SOLR-2764: Create a NorwegianLightStemmer and NorwegianMinimalStemmer (janhoy)

@@ -16,6 +16,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

@@ -54,6 +55,7 @@ public final class CommonGramsFilter extends TokenFilter {
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);

  private int lastStartOffset;
  private boolean lastWasCommon;

@@ -166,6 +168,7 @@ public final class CommonGramsFilter extends TokenFilter {
    buffer.getChars(0, length, termText, 0);
    termAttribute.setLength(length);
    posIncAttribute.setPositionIncrement(0);
    posLenAttribute.setPositionLength(2); // bigram
    offsetAttribute.setOffset(lastStartOffset, endOffset);
    typeAttribute.setType(GRAM_TYPE);
    buffer.setLength(0);

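CommonGramsFilter (and ShingleFilter, per the CHANGES entry above) now records a position length of 2 for each emitted bigram, so the token graph survives in the stream. A hedged consumer-side sketch showing how a caller could read those attributes back; the analyzer instance and the field name are assumed to be supplied by the caller:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

// Sketch: walk a token stream and print position + position length, so a
// bigram emitted with posInc=0 and posLen=2 is visible as a graph edge
// spanning two positions.
public class PosLengthDumpSketch {
  public static void dump(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("field", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLen = ts.addAttribute(PositionLengthAttribute.class);
    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
      pos += posInc.getPositionIncrement();
      System.out.println(term.toString() + " pos=" + pos + " posLen=" + posLen.getPositionLength());
    }
    ts.end();
    ts.close();
  }
}
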
@@ -82,8 +82,17 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
    super(input);

    this.tokens=new LinkedList<CompoundToken>();
    if (minWordSize < 0) {
      throw new IllegalArgumentException("minWordSize cannot be negative");
    }
    this.minWordSize=minWordSize;
    if (minSubwordSize < 0) {
      throw new IllegalArgumentException("minSubwordSize cannot be negative");
    }
    this.minSubwordSize=minSubwordSize;
    if (maxSubwordSize < 0) {
      throw new IllegalArgumentException("maxSubwordSize cannot be negative");
    }
    this.maxSubwordSize=maxSubwordSize;
    this.onlyLongestMatch=onlyLongestMatch;
    this.dictionary = dictionary;

@@ -191,6 +191,8 @@ public class HyphenationCompoundWordTokenFilter extends
        // we only put subwords to the token stream
        // that are longer than minPartSize
        if (partLength < this.minSubwordSize) {
          // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
          // calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
          continue;
        }

@@ -43,16 +43,25 @@ public final class KeywordTokenizer extends Tokenizer {

  public KeywordTokenizer(Reader input, int bufferSize) {
    super(input);
    if (bufferSize <= 0) {
      throw new IllegalArgumentException("bufferSize must be > 0");
    }
    termAtt.resizeBuffer(bufferSize);
  }

  public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
    super(source, input);
    if (bufferSize <= 0) {
      throw new IllegalArgumentException("bufferSize must be > 0");
    }
    termAtt.resizeBuffer(bufferSize);
  }

  public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
    super(factory, input);
    if (bufferSize <= 0) {
      throw new IllegalArgumentException("bufferSize must be > 0");
    }
    termAtt.resizeBuffer(bufferSize);
  }

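The effect of the new checks is that a non-positive bufferSize is rejected up front rather than failing later inside resizeBuffer. A small usage sketch; the org.apache.lucene.analysis.core package for KeywordTokenizer is assumed here, since the hunk does not show it:

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.core.KeywordTokenizer; // assumed package

// Sketch: a bufferSize of 0 is now rejected immediately with
// IllegalArgumentException by the constructor.
public class KeywordTokenizerArgsSketch {
  public static void main(String[] args) {
    Reader reader = new StringReader("only one token here");
    try {
      new KeywordTokenizer(reader, 0); // rejected up front
    } catch (IllegalArgumentException expected) {
      System.out.println("rejected: " + expected.getMessage());
    }
  }
}
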
@ -65,6 +65,12 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
|||
|
||||
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
|
||||
super(input);
|
||||
if (bufferSize < 0) {
|
||||
throw new IllegalArgumentException("bufferSize cannot be negative");
|
||||
}
|
||||
if (skip < 0) {
|
||||
throw new IllegalArgumentException("skip cannot be negative");
|
||||
}
|
||||
termAtt.resizeBuffer(bufferSize);
|
||||
|
||||
this.delimiter = delimiter;
|
||||
|
@ -85,10 +91,11 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
|||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private int startPosition = 0;
|
||||
private int finalOffset = 0;
|
||||
private int skipped = 0;
|
||||
private boolean endDelimiter = false;
|
||||
private StringBuilder resultToken;
|
||||
|
||||
private int charsRead = 0;
|
||||
|
||||
|
||||
@Override
|
||||
|
@ -112,12 +119,13 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
|||
|
||||
while (true) {
|
||||
int c = input.read();
|
||||
if( c < 0 ){
|
||||
if (c >= 0) {
|
||||
charsRead++;
|
||||
} else {
|
||||
if( skipped > skip ) {
|
||||
length += resultToken.length();
|
||||
termAtt.setLength(length);
|
||||
finalOffset = correctOffset(startPosition + length);
|
||||
offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
|
||||
offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
|
||||
if( added ){
|
||||
resultToken.setLength(0);
|
||||
resultToken.append(termAtt.buffer(), 0, length);
|
||||
|
@ -125,7 +133,6 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
|||
return added;
|
||||
}
|
||||
else{
|
||||
finalOffset = correctOffset(startPosition + length);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -168,8 +175,7 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
|||
}
|
||||
length += resultToken.length();
|
||||
termAtt.setLength(length);
|
||||
finalOffset = correctOffset(startPosition + length);
|
||||
offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
|
||||
offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition+length));
|
||||
resultToken.setLength(0);
|
||||
resultToken.append(termAtt.buffer(), 0, length);
|
||||
return true;
|
||||
|
@ -178,15 +184,17 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
|||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
int finalOffset = correctOffset(charsRead);
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
resultToken.setLength(0);
|
||||
finalOffset = 0;
|
||||
charsRead = 0;
|
||||
endDelimiter = false;
|
||||
skipped = 0;
|
||||
startPosition = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -77,6 +77,12 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
|
|||
|
||||
public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
|
||||
super(input);
|
||||
if (bufferSize < 0) {
|
||||
throw new IllegalArgumentException("bufferSize cannot be negative");
|
||||
}
|
||||
if (skip < 0) {
|
||||
throw new IllegalArgumentException("skip cannot be negative");
|
||||
}
|
||||
termAtt.resizeBuffer(bufferSize);
|
||||
this.delimiter = delimiter;
|
||||
this.replacement = replacement;
|
||||
|
@ -137,7 +143,11 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
|
|||
}
|
||||
resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
|
||||
resultToken.setLength(0);
|
||||
endPosition = delimiterPositions.get(delimitersCount-1 - skip);
|
||||
int idx = delimitersCount-1 - skip;
|
||||
if (idx >= 0) {
|
||||
// otherwise its ok, because we will skip and return false
|
||||
endPosition = delimiterPositions.get(idx);
|
||||
}
|
||||
finalOffset = correctOffset(length);
|
||||
posAtt.setPositionIncrement(1);
|
||||
}
|
||||
|
@ -163,10 +173,11 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
resultToken.setLength(0);
|
||||
finalOffset = 0;
|
||||
endPosition = 0;
|
||||
skipped = 0;
|
||||
delimitersCount = -1;
|
||||
delimiterPositions.clear();
|
||||
|
|
|
@@ -69,8 +69,17 @@ public final class PatternTokenizer extends Tokenizer {
    super(input);
    this.pattern = pattern;
    this.group = group;

    // Use "" instead of str so don't consume chars
    // (fillBuffer) from the input on throwing IAE below:
    matcher = pattern.matcher("");

    // confusingly group count depends ENTIRELY on the pattern but is only accessible via matcher
    if (group >= 0 && group > matcher.groupCount()) {
      throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
    }
    fillBuffer(str, input);
    matcher = pattern.matcher(str);
    matcher.reset(str);
    index = 0;
  }

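Matcher.groupCount() depends only on the compiled pattern, so the check can run against an empty matcher before any input is consumed. A standalone sketch of the same validation using plain java.util.regex (class and method names are illustrative):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Sketch: validate a capturing-group index against the pattern alone, before
// touching any input, and reject an invalid group with IllegalArgumentException.
public class GroupCheckSketch {
  public static void requireValidGroup(Pattern pattern, int group) {
    Matcher matcher = pattern.matcher("");
    if (group >= 0 && group > matcher.groupCount()) {
      throw new IllegalArgumentException(
          "invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
    }
  }

  public static void main(String[] args) {
    requireValidGroup(Pattern.compile("(\\w+)-(\\w+)"), 2); // ok: two groups
    requireValidGroup(Pattern.compile("(\\w+)-(\\w+)"), 3); // throws
  }
}
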
@@ -57,6 +57,9 @@ public final class PositionFilter extends TokenFilter {
   */
  public PositionFilter(final TokenStream input, final int positionIncrement) {
    super(input);
    if (positionIncrement < 0) {
      throw new IllegalArgumentException("positionIncrement may not be negative");
    }
    this.positionIncrement = positionIncrement;
  }

@ -23,9 +23,10 @@ import java.util.LinkedList;
|
|||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
|
@ -150,6 +151,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
|
||||
|
||||
|
@ -319,6 +321,7 @@ public final class ShingleFilter extends TokenFilter {
|
|||
noShingleOutput = false;
|
||||
}
|
||||
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
|
||||
posLenAtt.setPositionLength(builtGramSize);
|
||||
isOutputHere = true;
|
||||
gramSize.advance();
|
||||
tokenAvailable = true;
|
||||
|
@ -436,6 +439,8 @@ public final class ShingleFilter extends TokenFilter {
|
|||
super.reset();
|
||||
gramSize.reset();
|
||||
inputWindow.clear();
|
||||
nextInputStreamToken = null;
|
||||
isNextInputStreamToken = false;
|
||||
numFillerTokensToInsert = 0;
|
||||
isOutputHere = false;
|
||||
noShingleOutput = true;
|
||||
|
|
|
@@ -67,7 +67,7 @@ public final class SnowballFilter extends TokenFilter {
        Class.forName("org.tartarus.snowball.ext." + name + "Stemmer").asSubclass(SnowballProgram.class);
      stemmer = stemClass.newInstance();
    } catch (Exception e) {
      throw new RuntimeException(e.toString());
      throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e);
    }
  }

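A reflective lookup failure for a caller-supplied stemmer name is an argument problem, so it is now rethrown as IllegalArgumentException with the original exception preserved as the cause. A standalone sketch of the idiom (hypothetical class; return type loosened to Object):

// Sketch: load a class derived from a user-supplied name and report failures
// as IllegalArgumentException, keeping the reflective exception as the cause.
public class StemmerLoadSketch {
  public static Object loadByName(String name) {
    try {
      Class<?> clazz = Class.forName("org.tartarus.snowball.ext." + name + "Stemmer");
      return clazz.newInstance();
    } catch (Exception e) {
      throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e);
    }
  }
}
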
@@ -177,6 +177,12 @@ public final class WikipediaTokenizer extends Tokenizer {
  }

  private void init(int tokenOutput, Set<String> untokenizedTypes) {
    // TODO: cutover to enum
    if (tokenOutput != TOKENS_ONLY &&
        tokenOutput != UNTOKENIZED_ONLY &&
        tokenOutput != BOTH) {
      throw new IllegalArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
    }
    this.tokenOutput = tokenOutput;
    this.untokenizedTypes = untokenizedTypes;
  }

@ -19,6 +19,8 @@ package org.apache.lucene.analysis.charfilter;
|
|||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -27,6 +29,8 @@ import org.apache.lucene.analysis.CharStream;
|
|||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.junit.Ignore;
|
||||
|
||||
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -190,4 +194,67 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
|
|||
int numRounds = RANDOM_MULTIPLIER * 10000;
|
||||
checkRandomData(random, analyzer, numRounds);
|
||||
}
|
||||
|
||||
@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
|
||||
public void testFinalOffsetSpecialCase() throws Exception {
|
||||
final NormalizeCharMap map = new NormalizeCharMap();
|
||||
map.add("t", "");
|
||||
// even though this below rule has no effect, the test passes if you remove it!!
|
||||
map.add("tmakdbl", "c");
|
||||
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new MappingCharFilter(map, CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
|
||||
String text = "gzw f quaxot";
|
||||
checkAnalysisConsistency(random, analyzer, false, text);
|
||||
}
|
||||
|
||||
@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
|
||||
public void testRandomMaps() throws Exception {
|
||||
for (int i = 0; i < 100; i++) {
|
||||
final NormalizeCharMap map = randomMap();
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new MappingCharFilter(map, CharReader.get(reader));
|
||||
}
|
||||
};
|
||||
int numRounds = RANDOM_MULTIPLIER * 100;
|
||||
checkRandomData(random, analyzer, numRounds);
|
||||
}
|
||||
}
|
||||
|
||||
private NormalizeCharMap randomMap() {
|
||||
NormalizeCharMap map = new NormalizeCharMap();
|
||||
// we can't add duplicate keys, or NormalizeCharMap gets angry
|
||||
Set<String> keys = new HashSet<String>();
|
||||
int num = random.nextInt(5);
|
||||
//System.out.println("NormalizeCharMap=");
|
||||
for (int i = 0; i < num; i++) {
|
||||
String key = _TestUtil.randomSimpleString(random);
|
||||
if (!keys.contains(key)) {
|
||||
String value = _TestUtil.randomSimpleString(random);
|
||||
map.add(key, value);
|
||||
keys.add(key);
|
||||
//System.out.println("mapping: '" + key + "' => '" + value + "'");
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,17 +18,28 @@ package org.apache.lucene.analysis.core;
|
|||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Modifier;
|
||||
import java.net.URL;
|
||||
import java.nio.CharBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashSet;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -36,67 +47,174 @@ import org.apache.lucene.analysis.CachingTokenFilter;
|
|||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.EmptyTokenizer;
|
||||
import org.apache.lucene.analysis.MockGraphTokenFilter;
|
||||
import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
|
||||
import org.apache.lucene.analysis.ValidatingTokenFilter;
|
||||
import org.apache.lucene.analysis.charfilter.CharFilter;
|
||||
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
|
||||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
||||
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
|
||||
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
|
||||
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
|
||||
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
|
||||
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
|
||||
import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
|
||||
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
|
||||
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
|
||||
import org.apache.lucene.analysis.payloads.IdentityEncoder;
|
||||
import org.apache.lucene.analysis.payloads.PayloadEncoder;
|
||||
import org.apache.lucene.analysis.position.PositionFilter;
|
||||
import org.apache.lucene.analysis.snowball.TestSnowball;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.analysis.th.ThaiWordFilter;
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.Rethrow;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
/** tests random analysis chains */
|
||||
public class TestRandomChains extends BaseTokenStreamTestCase {
|
||||
static List<Class<? extends Tokenizer>> tokenizers;
|
||||
static List<Class<? extends TokenFilter>> tokenfilters;
|
||||
static List<Class<? extends CharStream>> charfilters;
|
||||
|
||||
static List<Constructor<? extends Tokenizer>> tokenizers;
|
||||
static List<Constructor<? extends TokenFilter>> tokenfilters;
|
||||
static List<Constructor<? extends CharStream>> charfilters;
|
||||
|
||||
// TODO: fix those and remove
|
||||
private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
|
||||
static {
|
||||
// TODO: can we promote some of these to be only
|
||||
// offsets offenders?
|
||||
Collections.<Class<?>>addAll(brokenComponents,
|
||||
// TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
|
||||
EmptyTokenizer.class,
|
||||
// doesn't actual reset itself!
|
||||
CachingTokenFilter.class,
|
||||
// doesn't consume whole stream!
|
||||
LimitTokenCountFilter.class,
|
||||
// Not broken: we forcefully add this, so we shouldn't
|
||||
// also randomly pick it:
|
||||
ValidatingTokenFilter.class,
|
||||
// NOTE: these by themselves won't cause any 'basic assertions' to fail.
|
||||
// but see https://issues.apache.org/jira/browse/LUCENE-3920, if any
|
||||
// tokenfilter that combines words (e.g. shingles) comes after them,
|
||||
// this will create bogus offsets because their 'offsets go backwards',
|
||||
// causing shingle or whatever to make a single token with a
|
||||
// startOffset thats > its endOffset
|
||||
// (see LUCENE-3738 for a list of other offenders here)
|
||||
// broken!
|
||||
NGramTokenizer.class,
|
||||
// broken!
|
||||
NGramTokenFilter.class,
|
||||
// broken!
|
||||
EdgeNGramTokenizer.class,
|
||||
// broken!
|
||||
EdgeNGramTokenFilter.class,
|
||||
// broken!
|
||||
WordDelimiterFilter.class,
|
||||
// broken!
|
||||
TrimFilter.class,
|
||||
// TODO: remove this class after we fix its finalOffset bug
|
||||
MappingCharFilter.class
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: also fix these and remove (maybe):
|
||||
// Classes that don't produce consistent graph offsets:
|
||||
private static final Set<Class<?>> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
|
||||
static {
|
||||
Collections.<Class<?>>addAll(brokenOffsetsComponents,
|
||||
ReversePathHierarchyTokenizer.class,
|
||||
PathHierarchyTokenizer.class,
|
||||
HyphenationCompoundWordTokenFilter.class,
|
||||
DictionaryCompoundWordTokenFilter.class,
|
||||
// TODO: corrumpts graphs (offset consistency check):
|
||||
PositionFilter.class,
|
||||
// TODO: it seems to mess up offsets!?
|
||||
WikipediaTokenizer.class,
|
||||
// TODO: doesn't handle graph inputs
|
||||
ThaiWordFilter.class,
|
||||
// TODO: doesn't handle graph inputs
|
||||
CJKBigramFilter.class
|
||||
);
|
||||
}
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
|
||||
getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
|
||||
tokenizers = new ArrayList<Class<? extends Tokenizer>>();
|
||||
tokenfilters = new ArrayList<Class<? extends TokenFilter>>();
|
||||
charfilters = new ArrayList<Class<? extends CharStream>>();
|
||||
for (Class<?> c : analysisClasses) {
|
||||
// don't waste time with abstract classes or deprecated known-buggy ones
|
||||
tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
|
||||
tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
|
||||
charfilters = new ArrayList<Constructor<? extends CharStream>>();
|
||||
for (final Class<?> c : analysisClasses) {
|
||||
final int modifiers = c.getModifiers();
|
||||
if (Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
|
||||
|| c.getAnnotation(Deprecated.class) != null
|
||||
|| c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
|
||||
// TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
|
||||
|| c.equals(EmptyTokenizer.class)
|
||||
// doesn't actual reset itself!
|
||||
|| c.equals(CachingTokenFilter.class)
|
||||
// broken!
|
||||
|| c.equals(NGramTokenizer.class)
|
||||
// broken!
|
||||
|| c.equals(NGramTokenFilter.class)
|
||||
// broken!
|
||||
|| c.equals(EdgeNGramTokenizer.class)
|
||||
// broken!
|
||||
|| c.equals(EdgeNGramTokenFilter.class)) {
|
||||
if (
|
||||
// don't waste time with abstract classes or deprecated known-buggy ones
|
||||
Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
|
||||
|| c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
|
||||
|| brokenComponents.contains(c)
|
||||
|| c.isAnnotationPresent(Deprecated.class)
|
||||
|| !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c))
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
if (Tokenizer.class.isAssignableFrom(c)) {
|
||||
tokenizers.add(c.asSubclass(Tokenizer.class));
|
||||
} else if (TokenFilter.class.isAssignableFrom(c)) {
|
||||
tokenfilters.add(c.asSubclass(TokenFilter.class));
|
||||
} else if (CharStream.class.isAssignableFrom(c)) {
|
||||
charfilters.add(c.asSubclass(CharStream.class));
|
||||
|
||||
for (final Constructor<?> ctor : c.getConstructors()) {
|
||||
// don't test synthetic or deprecated ctors, they likely have known bugs:
|
||||
if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) {
|
||||
continue;
|
||||
}
|
||||
if (Tokenizer.class.isAssignableFrom(c)) {
|
||||
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
|
||||
allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
|
||||
tokenizers.add(castConstructor(Tokenizer.class, ctor));
|
||||
} else if (TokenFilter.class.isAssignableFrom(c)) {
|
||||
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
|
||||
allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
|
||||
tokenfilters.add(castConstructor(TokenFilter.class, ctor));
|
||||
} else if (CharStream.class.isAssignableFrom(c)) {
|
||||
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
|
||||
allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
|
||||
charfilters.add(castConstructor(CharStream.class, ctor));
|
||||
} else {
|
||||
fail("Cannot get here");
|
||||
}
|
||||
}
|
||||
}
|
||||
final Comparator<Class<?>> classComp = new Comparator<Class<?>>() {
|
||||
|
||||
final Comparator<Constructor<?>> ctorComp = new Comparator<Constructor<?>>() {
|
||||
@Override
|
||||
public int compare(Class<?> arg0, Class<?> arg1) {
|
||||
return arg0.getName().compareTo(arg1.getName());
|
||||
public int compare(Constructor<?> arg0, Constructor<?> arg1) {
|
||||
return arg0.toGenericString().compareTo(arg1.toGenericString());
|
||||
}
|
||||
};
|
||||
Collections.sort(tokenizers, classComp);
|
||||
Collections.sort(tokenfilters, classComp);
|
||||
Collections.sort(charfilters, classComp);
|
||||
Collections.sort(tokenizers, ctorComp);
|
||||
Collections.sort(tokenfilters, ctorComp);
|
||||
Collections.sort(charfilters, ctorComp);
|
||||
if (VERBOSE) {
|
||||
System.out.println("tokenizers = " + tokenizers);
|
||||
System.out.println("tokenfilters = " + tokenfilters);
|
||||
|
@ -111,170 +229,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
charfilters = null;
|
||||
}
|
||||
|
||||
static class MockRandomAnalyzer extends Analyzer {
|
||||
final long seed;
|
||||
|
||||
MockRandomAnalyzer(long seed) {
|
||||
this.seed = seed;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Random random = new Random(seed);
|
||||
TokenizerSpec tokenizerspec = newTokenizer(random, reader);
|
||||
TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer);
|
||||
return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
Random random = new Random(seed);
|
||||
CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
|
||||
return charfilterspec.reader;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
Random random = new Random(seed);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
CharFilterSpec charfilterSpec = newCharFilterChain(random, new StringReader(""));
|
||||
sb.append("\ncharfilters=");
|
||||
sb.append(charfilterSpec.toString);
|
||||
// intentional: initReader gets its own separate random
|
||||
random = new Random(seed);
|
||||
TokenizerSpec tokenizerSpec = newTokenizer(random, charfilterSpec.reader);
|
||||
sb.append("\n");
|
||||
sb.append("tokenizer=");
|
||||
sb.append(tokenizerSpec.toString);
|
||||
TokenFilterSpec tokenfilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
|
||||
sb.append("\n");
|
||||
sb.append("filters=");
|
||||
sb.append(tokenfilterSpec.toString);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
// create a new random tokenizer from classpath
|
||||
private TokenizerSpec newTokenizer(Random random, Reader reader) {
|
||||
TokenizerSpec spec = new TokenizerSpec();
|
||||
boolean success = false;
|
||||
while (!success) {
|
||||
try {
|
||||
// TODO: check Reader+Version,Version+Reader too
|
||||
// also look for other variants and handle them special
|
||||
int idx = random.nextInt(tokenizers.size());
|
||||
try {
|
||||
Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Version.class, Reader.class);
|
||||
spec.tokenizer = c.newInstance(TEST_VERSION_CURRENT, reader);
|
||||
} catch (NoSuchMethodException e) {
|
||||
Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Reader.class);
|
||||
spec.tokenizer = c.newInstance(reader);
|
||||
}
|
||||
spec.toString = tokenizers.get(idx).toString();
|
||||
success = true;
|
||||
} catch (Exception e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
return spec;
|
||||
}
|
||||
|
||||
private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
|
||||
CharFilterSpec spec = new CharFilterSpec();
|
||||
spec.reader = reader;
|
||||
StringBuilder descr = new StringBuilder();
|
||||
int numFilters = random.nextInt(3);
|
||||
for (int i = 0; i < numFilters; i++) {
|
||||
boolean success = false;
|
||||
while (!success) {
|
||||
try {
|
||||
// TODO: also look for other variants and handle them special
|
||||
int idx = random.nextInt(charfilters.size());
|
||||
try {
|
||||
Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(Reader.class);
|
||||
spec.reader = c.newInstance(spec.reader);
|
||||
} catch (NoSuchMethodException e) {
|
||||
Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(CharStream.class);
|
||||
spec.reader = c.newInstance(CharReader.get(spec.reader));
|
||||
}
|
||||
|
||||
if (descr.length() > 0) {
|
||||
descr.append(",");
|
||||
}
|
||||
descr.append(charfilters.get(idx).toString());
|
||||
success = true;
|
||||
} catch (Exception e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
}
|
||||
spec.toString = descr.toString();
|
||||
return spec;
|
||||
}
|
||||
|
||||
private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
|
||||
TokenFilterSpec spec = new TokenFilterSpec();
|
||||
spec.stream = tokenizer;
|
||||
StringBuilder descr = new StringBuilder();
|
||||
int numFilters = random.nextInt(5);
|
||||
for (int i = 0; i < numFilters; i++) {
|
||||
boolean success = false;
|
||||
while (!success) {
|
||||
try {
|
||||
// TODO: also look for other variants and handle them special
|
||||
int idx = random.nextInt(tokenfilters.size());
|
||||
try {
|
||||
Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(Version.class, TokenStream.class);
|
||||
spec.stream = c.newInstance(TEST_VERSION_CURRENT, spec.stream);
|
||||
} catch (NoSuchMethodException e) {
|
||||
Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(TokenStream.class);
|
||||
spec.stream = c.newInstance(spec.stream);
|
||||
}
|
||||
if (descr.length() > 0) {
|
||||
descr.append(",");
|
||||
}
|
||||
descr.append(tokenfilters.get(idx).toString());
|
||||
success = true;
|
||||
} catch (Exception e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
}
|
||||
spec.toString = descr.toString();
|
||||
return spec;
|
||||
}
|
||||
/** Hack to work around the stupidness of Oracle's strict Java backwards compatibility.
* {@code Class<T>#getConstructors()} should return an unmodifiable {@code List<Constructor<T>>}, not an array! */
|
||||
@SuppressWarnings("unchecked")
|
||||
private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
|
||||
return (Constructor<T>) ctor;
|
||||
}
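// Illustrative sketch (not part of the test flow; assumes WhitespaceTokenizer's (Version, Reader)
// ctor) of why the unchecked cast is needed: reflection on a Class<?> only yields Constructor<?>,
// but the tokenizers/tokenfilters/charfilters lists are typed.
//
//   Class<?> c = WhitespaceTokenizer.class;  // discovered via the classpath scan, so only Class<?>
//   Constructor<?> raw = c.getConstructor(Version.class, Reader.class);
//   Constructor<Tokenizer> typed = castConstructor(Tokenizer.class, raw);
//   tokenizers.add(typed);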
|
||||
|
||||
static class TokenizerSpec {
|
||||
Tokenizer tokenizer;
|
||||
String toString;
|
||||
}
|
||||
|
||||
static class TokenFilterSpec {
|
||||
TokenStream stream;
|
||||
String toString;
|
||||
}
|
||||
|
||||
static class CharFilterSpec {
|
||||
Reader reader;
|
||||
String toString;
|
||||
}
|
||||
|
||||
public void testRandomChains() throws Throwable {
|
||||
int numIterations = atLeast(20);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
|
||||
if (VERBOSE) {
|
||||
System.out.println("Creating random analyzer:" + a);
|
||||
}
|
||||
try {
|
||||
checkRandomData(random, a, 1000);
|
||||
} catch (Throwable e) {
|
||||
System.err.println("Exception from random analyzer: " + a);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void getClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
|
||||
final ClassLoader cld = TestRandomChains.class.getClassLoader();
|
||||
final String path = pckgname.replace('.', '/');
|
||||
|
@ -303,4 +263,568 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static interface ArgProducer {
|
||||
Object create(Random random);
|
||||
}
|
||||
|
||||
private static final Map<Class<?>,ArgProducer> argProducers = new IdentityHashMap<Class<?>,ArgProducer>() {{
|
||||
put(int.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// TODO: could cause huge ram usage to use full int range for some filters
|
||||
// (e.g. allocate enormous arrays)
|
||||
// return Integer.valueOf(random.nextInt());
|
||||
return Integer.valueOf(_TestUtil.nextInt(random, -100, 100));
|
||||
}
|
||||
});
|
||||
put(char.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// TODO: fix any filters that care to throw IAE instead.
|
||||
// also add a unicode validating filter to validate termAtt?
|
||||
// return Character.valueOf((char)random.nextInt(65536));
|
||||
while(true) {
|
||||
char c = (char)random.nextInt(65536);
|
||||
if (c < '\uD800' || c > '\uDFFF') {
|
||||
return Character.valueOf(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
put(float.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
return Float.valueOf(random.nextFloat());
|
||||
}
|
||||
});
|
||||
put(boolean.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
return Boolean.valueOf(random.nextBoolean());
|
||||
}
|
||||
});
|
||||
put(byte.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// this wraps to negative when casting to byte
|
||||
return Byte.valueOf((byte) random.nextInt(256));
|
||||
}
|
||||
});
|
||||
put(byte[].class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
byte bytes[] = new byte[random.nextInt(256)];
|
||||
random.nextBytes(bytes);
|
||||
return bytes;
|
||||
}
|
||||
});
|
||||
put(Random.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
return new Random(random.nextLong());
|
||||
}
|
||||
});
|
||||
put(Version.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// we expect bugs in emulating old versions
|
||||
return TEST_VERSION_CURRENT;
|
||||
}
|
||||
});
|
||||
put(Set.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// TypeTokenFilter
|
||||
Set<String> set = new HashSet<String>();
|
||||
int num = random.nextInt(5);
|
||||
for (int i = 0; i < num; i++) {
|
||||
set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
|
||||
}
|
||||
return set;
|
||||
}
|
||||
});
|
||||
put(Collection.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// CapitalizationFilter
|
||||
Collection<char[]> col = new ArrayList<char[]>();
|
||||
int num = random.nextInt(5);
|
||||
for (int i = 0; i < num; i++) {
|
||||
col.add(_TestUtil.randomSimpleString(random).toCharArray());
|
||||
}
|
||||
return col;
|
||||
}
|
||||
});
|
||||
put(CharArraySet.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
int num = random.nextInt(10);
|
||||
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, num, random.nextBoolean());
|
||||
for (int i = 0; i < num; i++) {
|
||||
// TODO: make nastier
|
||||
set.add(_TestUtil.randomSimpleString(random));
|
||||
}
|
||||
return set;
|
||||
}
|
||||
});
|
||||
put(Pattern.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// TODO: don't want to make the exponentially slow ones Dawid documents
|
||||
// in TestPatternReplaceFilter, so don't use truly random patterns (for now)
|
||||
return Pattern.compile("a");
|
||||
}
|
||||
});
|
||||
put(PayloadEncoder.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
return new IdentityEncoder(); // the other encoders will throw exceptions if tokens aren't numbers?
|
||||
}
|
||||
});
|
||||
put(HunspellDictionary.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// TODO: make nastier
|
||||
InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
|
||||
InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
|
||||
try {
|
||||
return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
|
||||
} catch (Exception ex) {
|
||||
Rethrow.rethrow(ex);
|
||||
return null; // unreachable code
|
||||
}
|
||||
}
|
||||
});
|
||||
put(EdgeNGramTokenizer.Side.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
return random.nextBoolean()
|
||||
? EdgeNGramTokenizer.Side.FRONT
|
||||
: EdgeNGramTokenizer.Side.BACK;
|
||||
}
|
||||
});
|
||||
put(EdgeNGramTokenFilter.Side.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
return random.nextBoolean()
|
||||
? EdgeNGramTokenFilter.Side.FRONT
|
||||
: EdgeNGramTokenFilter.Side.BACK;
|
||||
}
|
||||
});
|
||||
put(HyphenationTree.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// TODO: make nastier
|
||||
try {
|
||||
InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm());
|
||||
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
|
||||
return hyphenator;
|
||||
} catch (Exception ex) {
|
||||
Rethrow.rethrow(ex);
|
||||
return null; // unreachable code
|
||||
}
|
||||
}
|
||||
});
|
||||
put(SnowballProgram.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
try {
|
||||
String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)];
|
||||
Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
|
||||
return clazz.newInstance();
|
||||
} catch (Exception ex) {
|
||||
Rethrow.rethrow(ex);
|
||||
return null; // unreachable code
|
||||
}
|
||||
}
|
||||
});
|
||||
put(String.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// TODO: make nastier
|
||||
if (random.nextBoolean()) {
|
||||
// a token type
|
||||
return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
|
||||
} else {
|
||||
return _TestUtil.randomSimpleString(random);
|
||||
}
|
||||
}
|
||||
});
|
||||
put(NormalizeCharMap.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
NormalizeCharMap map = new NormalizeCharMap();
|
||||
// we can't add duplicate keys, or NormalizeCharMap gets angry
|
||||
Set<String> keys = new HashSet<String>();
|
||||
int num = random.nextInt(5);
|
||||
//System.out.println("NormalizeCharMap=");
|
||||
for (int i = 0; i < num; i++) {
|
||||
String key = _TestUtil.randomSimpleString(random);
|
||||
if (!keys.contains(key)) {
|
||||
String value = _TestUtil.randomSimpleString(random);
|
||||
map.add(key, value);
|
||||
keys.add(key);
|
||||
//System.out.println("mapping: '" + key + "' => '" + value + "'");
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
});
|
||||
put(CharacterRunAutomaton.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
// TODO: could probably use a purely random automaton
|
||||
switch(random.nextInt(5)) {
|
||||
case 0: return MockTokenizer.KEYWORD;
|
||||
case 1: return MockTokenizer.SIMPLE;
|
||||
case 2: return MockTokenizer.WHITESPACE;
|
||||
case 3: return MockTokenFilter.EMPTY_STOPSET;
|
||||
default: return MockTokenFilter.ENGLISH_STOPSET;
|
||||
}
|
||||
}
|
||||
});
|
||||
put(CharArrayMap.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
int num = random.nextInt(10);
|
||||
CharArrayMap<String> map = new CharArrayMap<String>(TEST_VERSION_CURRENT, num, random.nextBoolean());
|
||||
for (int i = 0; i < num; i++) {
|
||||
// TODO: make nastier
|
||||
map.put(_TestUtil.randomSimpleString(random), _TestUtil.randomSimpleString(random));
|
||||
}
|
||||
return map;
|
||||
}
|
||||
});
|
||||
put(SynonymMap.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
|
||||
final int numEntries = atLeast(10);
|
||||
for (int j = 0; j < numEntries; j++) {
|
||||
addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
|
||||
}
|
||||
try {
|
||||
return b.build();
|
||||
} catch (Exception ex) {
|
||||
Rethrow.rethrow(ex);
|
||||
return null; // unreachable code
|
||||
}
|
||||
}
|
||||
|
||||
private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
|
||||
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
|
||||
new CharsRef(output.replaceAll(" +", "\u0000")),
|
||||
keepOrig);
|
||||
}
|
||||
|
||||
private String randomNonEmptyString(Random random) {
|
||||
while(true) {
|
||||
final String s = _TestUtil.randomUnicodeString(random).trim();
|
||||
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}};
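// The "{{ ... }}" above is the double-brace idiom: an anonymous IdentityHashMap subclass whose
// instance initializer runs all the put() calls at construction time. A minimal sketch of the
// same idiom with a hypothetical producer (long.class is not actually registered here):
//
//   put(long.class, new ArgProducer() {
//     @Override public Object create(Random random) {
//       return Long.valueOf(random.nextLong());
//     }
//   });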
|
||||
|
||||
static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
|
||||
static {
|
||||
allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
|
||||
allowedTokenizerArgs.addAll(argProducers.keySet());
|
||||
allowedTokenizerArgs.add(Reader.class);
|
||||
allowedTokenizerArgs.add(AttributeFactory.class);
|
||||
allowedTokenizerArgs.add(AttributeSource.class);
|
||||
|
||||
allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
|
||||
allowedTokenFilterArgs.addAll(argProducers.keySet());
|
||||
allowedTokenFilterArgs.add(TokenStream.class);
|
||||
// TODO: fix this one, that's broken:
|
||||
allowedTokenFilterArgs.add(CommonGramsFilter.class);
|
||||
|
||||
allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
|
||||
allowedCharFilterArgs.addAll(argProducers.keySet());
|
||||
allowedCharFilterArgs.add(Reader.class);
|
||||
allowedCharFilterArgs.add(CharStream.class);
|
||||
}
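// Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>()) yields a Set whose
// membership test uses reference identity (==), which is safe here because Class objects are
// canonical per class loader. Minimal sketch of the semantics being relied on:
//
//   Set<Class<?>> ids = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
//   ids.add(Reader.class);
//   boolean allowed = ids.contains(Reader.class);  // true: same Class instance every time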
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
static <T> T newRandomArg(Random random, Class<T> paramType) {
|
||||
final ArgProducer producer = argProducers.get(paramType);
|
||||
assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
|
||||
return (T) producer.create(random);
|
||||
}
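// Hedged usage sketch: any type registered in argProducers can be requested directly; an
// unregistered type trips the assertNotNull above.
//
//   CharArraySet stopWords = newRandomArg(random, CharArraySet.class);
//   int maxLen = newRandomArg(random, int.class);  // auto-unboxed from the produced Integer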
|
||||
|
||||
static Object[] newTokenizerArgs(Random random, Reader reader, Class<?>[] paramTypes) {
|
||||
Object[] args = new Object[paramTypes.length];
|
||||
for (int i = 0; i < args.length; i++) {
|
||||
Class<?> paramType = paramTypes[i];
|
||||
if (paramType == Reader.class) {
|
||||
args[i] = reader;
|
||||
} else if (paramType == AttributeFactory.class) {
|
||||
// TODO: maybe the collator one...???
|
||||
args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
|
||||
} else if (paramType == AttributeSource.class) {
|
||||
// TODO: args[i] = new AttributeSource();
|
||||
// this is currently too scary to deal with!
|
||||
args[i] = null; // force IAE
|
||||
} else {
|
||||
args[i] = newRandomArg(random, paramType);
|
||||
}
|
||||
}
|
||||
return args;
|
||||
}
|
||||
|
||||
static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
|
||||
Object[] args = new Object[paramTypes.length];
|
||||
for (int i = 0; i < args.length; i++) {
|
||||
Class<?> paramType = paramTypes[i];
|
||||
if (paramType == Reader.class) {
|
||||
args[i] = reader;
|
||||
} else if (paramType == CharStream.class) {
|
||||
args[i] = CharReader.get(reader);
|
||||
} else {
|
||||
args[i] = newRandomArg(random, paramType);
|
||||
}
|
||||
}
|
||||
return args;
|
||||
}
|
||||
|
||||
static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
|
||||
Object[] args = new Object[paramTypes.length];
|
||||
for (int i = 0; i < args.length; i++) {
|
||||
Class<?> paramType = paramTypes[i];
|
||||
if (paramType == TokenStream.class) {
|
||||
args[i] = stream;
|
||||
} else if (paramType == CommonGramsFilter.class) {
|
||||
// TODO: fix this one, that's broken: CommonGramsQueryFilter takes this one explicitly
|
||||
args[i] = new CommonGramsFilter(TEST_VERSION_CURRENT, stream, newRandomArg(random, CharArraySet.class));
|
||||
} else {
|
||||
args[i] = newRandomArg(random, paramType);
|
||||
}
|
||||
}
|
||||
return args;
|
||||
}
|
||||
|
||||
static class MockRandomAnalyzer extends Analyzer {
|
||||
final long seed;
|
||||
|
||||
MockRandomAnalyzer(long seed) {
|
||||
this.seed = seed;
|
||||
}
|
||||
|
||||
public boolean offsetsAreCorrect() {
|
||||
// TODO: can we not do the full chain here!?
|
||||
Random random = new Random(seed);
|
||||
TokenizerSpec tokenizerSpec = newTokenizer(random, new StringReader(""));
|
||||
TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
|
||||
return filterSpec.offsetsAreCorrect;
|
||||
}
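// Every method re-seeds with new Random(seed), so the same seed always rebuilds the same chain.
// Hedged sketch: two analyzers constructed from one seed should describe identical chains.
//
//   long seed = random.nextLong();
//   MockRandomAnalyzer a1 = new MockRandomAnalyzer(seed);
//   MockRandomAnalyzer a2 = new MockRandomAnalyzer(seed);
//   assertEquals(a1.toString(), a2.toString());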
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Random random = new Random(seed);
|
||||
TokenizerSpec tokenizerSpec = newTokenizer(random, reader);
|
||||
TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
|
||||
return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
Random random = new Random(seed);
|
||||
CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
|
||||
return charfilterspec.reader;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
Random random = new Random(seed);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
|
||||
sb.append("\ncharfilters=");
|
||||
sb.append(charFilterSpec.toString);
|
||||
// intentional: initReader gets its own separate random
|
||||
random = new Random(seed);
|
||||
TokenizerSpec tokenizerSpec = newTokenizer(random, charFilterSpec.reader);
|
||||
sb.append("\n");
|
||||
sb.append("tokenizer=");
|
||||
sb.append(tokenizerSpec.toString);
|
||||
TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
|
||||
sb.append("\n");
|
||||
sb.append("filters=");
|
||||
sb.append(tokenFilterSpec.toString);
|
||||
sb.append("\n");
|
||||
sb.append("offsetsAreCorrect=" + tokenFilterSpec.offsetsAreCorrect);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private <T> T createComponent(Constructor<T> ctor, Object[] args, StringBuilder descr) {
|
||||
try {
|
||||
final T instance = ctor.newInstance(args);
|
||||
/*
|
||||
if (descr.length() > 0) {
|
||||
descr.append(",");
|
||||
}
|
||||
*/
|
||||
descr.append("\n ");
|
||||
descr.append(ctor.getDeclaringClass().getName());
|
||||
String params = Arrays.toString(args);
|
||||
params = params.substring(1, params.length()-1);
|
||||
descr.append("(").append(params).append(")");
|
||||
return instance;
|
||||
} catch (InvocationTargetException ite) {
|
||||
final Throwable cause = ite.getCause();
|
||||
if (cause instanceof IllegalArgumentException ||
|
||||
cause instanceof UnsupportedOperationException) {
|
||||
// that's ok, ignore
|
||||
if (VERBOSE) {
|
||||
System.err.println("Ignoring IAE/UOE from ctor:");
|
||||
cause.printStackTrace(System.err);
|
||||
}
|
||||
} else {
|
||||
Rethrow.rethrow(cause);
|
||||
}
|
||||
} catch (IllegalAccessException iae) {
|
||||
Rethrow.rethrow(iae);
|
||||
} catch (InstantiationException ie) {
|
||||
Rethrow.rethrow(ie);
|
||||
}
|
||||
return null; // no success
|
||||
}
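// createComponent returns null when a ctor rejects its random args with IAE/UOE, so callers just
// draw another ctor and try again. Hedged sketch of that retry pattern (mirrors newFilterChain
// below):
//
//   TokenFilter flt = null;
//   while (flt == null) {
//     Constructor<? extends TokenFilter> c = tokenfilters.get(random.nextInt(tokenfilters.size()));
//     flt = createComponent(c, newFilterArgs(random, stream, c.getParameterTypes()), descr);
//   }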
|
||||
|
||||
// create a new random tokenizer from classpath
|
||||
private TokenizerSpec newTokenizer(Random random, Reader reader) {
|
||||
TokenizerSpec spec = new TokenizerSpec();
|
||||
while (spec.tokenizer == null) {
|
||||
final Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
|
||||
final StringBuilder descr = new StringBuilder();
|
||||
final CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader);
|
||||
final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes());
|
||||
spec.tokenizer = createComponent(ctor, args, descr);
|
||||
if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
|
||||
spec.offsetsAreCorrect = false;
|
||||
}
|
||||
if (spec.tokenizer == null) {
|
||||
assertFalse(ctor.getDeclaringClass().getName() + " has read something in ctor but failed with UOE/IAE", wrapper.readSomething);
|
||||
}
|
||||
spec.toString = descr.toString();
|
||||
}
|
||||
return spec;
|
||||
}
|
||||
|
||||
private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
|
||||
CharFilterSpec spec = new CharFilterSpec();
|
||||
spec.reader = reader;
|
||||
StringBuilder descr = new StringBuilder();
|
||||
int numFilters = random.nextInt(3);
|
||||
for (int i = 0; i < numFilters; i++) {
|
||||
while (true) {
|
||||
final Constructor<? extends CharStream> ctor = charfilters.get(random.nextInt(charfilters.size()));
|
||||
final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
|
||||
reader = createComponent(ctor, args, descr);
|
||||
if (reader != null) {
|
||||
spec.reader = reader;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
spec.toString = descr.toString();
|
||||
return spec;
|
||||
}
|
||||
|
||||
private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
|
||||
TokenFilterSpec spec = new TokenFilterSpec();
|
||||
spec.offsetsAreCorrect = offsetsAreCorrect;
|
||||
spec.stream = tokenizer;
|
||||
StringBuilder descr = new StringBuilder();
|
||||
int numFilters = random.nextInt(5);
|
||||
for (int i = 0; i < numFilters; i++) {
|
||||
|
||||
// Insert ValidatingTF after each stage so we can
|
||||
// catch problems right after the TF that "caused"
|
||||
// them:
|
||||
spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);
|
||||
|
||||
while (true) {
|
||||
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
|
||||
|
||||
// hack: MockGraph/MockLookahead have assertions that will trip if they follow
// an offsets violator, so we can't use them after e.g. WikipediaTokenizer
|
||||
if (!spec.offsetsAreCorrect &&
|
||||
(ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
|
||||
|| ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
|
||||
final TokenFilter flt = createComponent(ctor, args, descr);
|
||||
if (flt != null) {
|
||||
if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
|
||||
spec.offsetsAreCorrect = false;
|
||||
}
|
||||
spec.stream = flt;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Insert ValidatingTF after each stage so we can
|
||||
// catch problems right after the TF that "caused"
|
||||
// them:
|
||||
spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);
|
||||
|
||||
spec.toString = descr.toString();
|
||||
return spec;
|
||||
}
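// Hedged sketch of the kind of chain this builds (hand-written stand-in for one random draw;
// LowerCaseFilter here is just an example of a randomly picked filter):
//
//   Tokenizer tok = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
//   TokenStream ts = new ValidatingTokenFilter(tok, "stage 0", true);
//   ts = new LowerCaseFilter(TEST_VERSION_CURRENT, ts);
//   ts = new ValidatingTokenFilter(ts, "last stage", true);
//
// so any offset/position violation surfaces right after the filter that introduced it.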
|
||||
}
|
||||
|
||||
static final class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter {
|
||||
boolean readSomething = false;
|
||||
|
||||
CheckThatYouDidntReadAnythingReaderWrapper(Reader in) {
|
||||
super(CharReader.get(in));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
readSomething = true;
|
||||
return super.read(cbuf, off, len);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
readSomething = true;
|
||||
return super.read();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(CharBuffer target) throws IOException {
|
||||
readSomething = true;
|
||||
return super.read(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf) throws IOException {
|
||||
readSomething = true;
|
||||
return super.read(cbuf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long skip(long n) throws IOException {
|
||||
readSomething = true;
|
||||
return super.skip(n);
|
||||
}
|
||||
}
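// Hedged sketch of how the wrapper is used (mirrors newTokenizer above): a ctor that fails with
// IAE/UOE must not have consumed the Reader first, otherwise retrying would hand later ctors a
// partially drained input.
//
//   CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader);
//   Tokenizer tok = createComponent(ctor, newTokenizerArgs(random, wrapper, ctor.getParameterTypes()), descr);
//   if (tok == null) {
//     assertFalse("ctor read from the Reader before failing", wrapper.readSomething);
//   }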
|
||||
|
||||
static class TokenizerSpec {
|
||||
Tokenizer tokenizer;
|
||||
String toString;
|
||||
boolean offsetsAreCorrect = true;
|
||||
}
|
||||
|
||||
static class TokenFilterSpec {
|
||||
TokenStream stream;
|
||||
String toString;
|
||||
boolean offsetsAreCorrect = true;
|
||||
}
|
||||
|
||||
static class CharFilterSpec {
|
||||
Reader reader;
|
||||
String toString;
|
||||
}
|
||||
|
||||
public void testRandomChains() throws Throwable {
|
||||
int numIterations = atLeast(20);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
|
||||
if (VERBOSE) {
|
||||
System.out.println("Creating random analyzer:" + a);
|
||||
}
|
||||
try {
|
||||
checkRandomData(random, a, 1000, 20, false,
|
||||
false /* We already validate our own offsets... */);
|
||||
} catch (Throwable e) {
|
||||
System.err.println("Exception from random analyzer: " + a);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -65,7 +65,11 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
|
|||
new String[] { "a", "b", "c", "" },
|
||||
new int[] { 1, 0, 1, 3 },
|
||||
new int[] { 2, 1, 2, 3 },
|
||||
new int[] { 1, 1, 1, 1 });
|
||||
null,
|
||||
new int[] { 1, 1, 1, 1 },
|
||||
null,
|
||||
null,
|
||||
false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -72,14 +72,16 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
assertTokenStreamContents(wdf,
|
||||
new String[] { "foo", "bar", "foobar" },
|
||||
new int[] { 5, 9, 5 },
|
||||
new int[] { 8, 12, 12 });
|
||||
new int[] { 8, 12, 12 },
|
||||
null, null, null, null, false);
|
||||
|
||||
wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
|
||||
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "foo", "bar", "foobar" },
|
||||
new int[] { 5, 5, 5 },
|
||||
new int[] { 6, 6, 6 });
|
||||
new int[] { 6, 6, 6 },
|
||||
null, null, null, null, false);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -123,7 +125,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
assertTokenStreamContents(wdf,
|
||||
new String[] { "foo", "bar", "foobar"},
|
||||
new int[] { 8, 12, 8 },
|
||||
new int[] { 11, 15, 15 });
|
||||
new int[] { 11, 15, 15 },
|
||||
null, null, null, null, false);
|
||||
}
|
||||
|
||||
public void doSplit(final String input, String... output) throws Exception {
|
||||
|
@ -230,18 +233,27 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
|
||||
new int[] { 0, 9 },
|
||||
new int[] { 6, 13 },
|
||||
new int[] { 1, 1 });
|
||||
null,
|
||||
new int[] { 1, 1 },
|
||||
null,
|
||||
false);
|
||||
|
||||
/* only in this case, posInc of 2 ?! */
|
||||
assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
|
||||
new int[] { 0, 9, 12, 9 },
|
||||
new int[] { 6, 12, 13, 13 },
|
||||
new int[] { 1, 1, 1, 0 });
|
||||
null,
|
||||
new int[] { 1, 1, 1, 0 },
|
||||
null,
|
||||
false);
|
||||
|
||||
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
|
||||
new int[] { 0, 9, 15 },
|
||||
new int[] { 6, 14, 19 },
|
||||
new int[] { 1, 1, 1 });
|
||||
null,
|
||||
new int[] { 1, 1, 1 },
|
||||
null,
|
||||
false);
|
||||
|
||||
/* analyzer that will consume tokens with large position increments */
|
||||
Analyzer a2 = new Analyzer() {
|
||||
|
@ -258,24 +270,36 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
|
||||
new int[] { 0, 7, 16 },
|
||||
new int[] { 6, 15, 20 },
|
||||
new int[] { 1, 10, 1 });
|
||||
null,
|
||||
new int[] { 1, 10, 1 },
|
||||
null,
|
||||
false);
|
||||
|
||||
/* the "/" had a position increment of 10, where did it go?!?!! */
|
||||
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
|
||||
new int[] { 0, 9 },
|
||||
new int[] { 6, 13 },
|
||||
new int[] { 1, 11 });
|
||||
null,
|
||||
new int[] { 1, 11 },
|
||||
null,
|
||||
false);
|
||||
|
||||
/* in this case, the increment of 10 from the "/" is carried over */
|
||||
assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
|
||||
new int[] { 0, 9, 12, 9 },
|
||||
new int[] { 6, 12, 13, 13 },
|
||||
new int[] { 1, 11, 1, 0 });
|
||||
null,
|
||||
new int[] { 1, 11, 1, 0 },
|
||||
null,
|
||||
false);
|
||||
|
||||
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
|
||||
new int[] { 0, 9, 15 },
|
||||
new int[] { 6, 14, 19 },
|
||||
new int[] { 1, 11, 1 });
|
||||
null,
|
||||
new int[] { 1, 11, 1 },
|
||||
null,
|
||||
false);
|
||||
|
||||
Analyzer a3 = new Analyzer() {
|
||||
@Override
|
||||
|
@ -292,14 +316,20 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
new String[] { "lucene", "solr", "lucenesolr" },
|
||||
new int[] { 0, 7, 0 },
|
||||
new int[] { 6, 11, 11 },
|
||||
new int[] { 1, 1, 0 });
|
||||
null,
|
||||
new int[] { 1, 1, 0 },
|
||||
null,
|
||||
false);
|
||||
|
||||
/* the stopword should add a gap here */
|
||||
assertAnalyzesTo(a3, "the lucene.solr",
|
||||
new String[] { "lucene", "solr", "lucenesolr" },
|
||||
new int[] { 4, 11, 4 },
|
||||
new int[] { 10, 15, 15 },
|
||||
new int[] { 2, 1, 0 });
|
||||
null,
|
||||
new int[] { 2, 1, 0 },
|
||||
null,
|
||||
false);
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
|
@ -322,7 +352,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
|||
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -94,7 +94,15 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testBackRangeOfNgrams() throws Exception {
|
||||
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5});
|
||||
assertTokenStreamContents(tokenizer,
|
||||
new String[]{"e","de","cde"},
|
||||
new int[]{4,3,2},
|
||||
new int[]{5,5,5},
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
false);
|
||||
}
|
||||
|
||||
public void testSmallTokenInStream() throws Exception {
|
||||
|
@ -151,7 +159,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
|
|||
new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
|
||||
}
|
||||
};
|
||||
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
|
||||
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
|
||||
}
|
||||
|
||||
public void testEmptyTerm() throws Exception {
|
||||
|
|
|
@ -90,7 +90,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testBackRangeOfNgrams() throws Exception {
|
||||
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, 5 /* abcde */);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, null, null, null, 5 /* abcde */, false);
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
|
@ -109,8 +109,8 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
|
|||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
|
||||
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
|
||||
|
||||
Analyzer b = new Analyzer() {
|
||||
@Override
|
||||
|
@ -119,7 +119,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
|
|||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
|
||||
checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
|
||||
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
|
||||
checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192, false, false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -77,7 +77,8 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
|||
assertTokenStreamContents(filter,
|
||||
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
|
||||
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
|
||||
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
|
||||
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
|
||||
null, null, null, null, false
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -130,7 +131,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
|
|||
new NGramTokenFilter(tokenizer, 2, 15));
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
|
||||
}
|
||||
|
||||
public void testEmptyTerm() throws Exception {
|
||||
|
|
|
@ -73,7 +73,11 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
|
|||
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
|
||||
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
|
||||
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
|
||||
5 /* abcde */
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
5 /* abcde */,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -98,7 +102,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
|
|||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
|
||||
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
|
||||
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
|
||||
checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -142,14 +142,16 @@ public class TestSnowball extends BaseTokenStreamTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
/** for testing purposes ONLY */
|
||||
public static String SNOWBALL_LANGS[] = {
|
||||
"Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
|
||||
"Finnish", "French", "German2", "German", "Hungarian", "Irish",
|
||||
"Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
|
||||
"Romanian", "Russian", "Spanish", "Swedish", "Turkish"
|
||||
};
|
||||
|
||||
public void testEmptyTerm() throws IOException {
|
||||
String langs[] = {
|
||||
"Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
|
||||
"Finnish", "French", "German2", "German", "Hungarian", "Irish",
|
||||
"Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
|
||||
"Romanian", "Russian", "Spanish", "Swedish", "Turkish"
|
||||
};
|
||||
for (final String lang : langs) {
|
||||
for (final String lang : SNOWBALL_LANGS) {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
|
|