LUCENE-3969: commit current state

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311220 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-04-09 13:25:28 +00:00
parent d55447b25e
commit 6311f71de6
12 changed files with 526 additions and 100 deletions

View File

@ -76,7 +76,7 @@ public final class MockAnalyzer extends Analyzer {
* MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false).
*/
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false);
this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
}
/**
@ -93,7 +93,8 @@ public final class MockAnalyzer extends Analyzer {
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
tokenizer.setEnableChecks(enableChecks);
TokenFilter filt = new MockTokenFilter(tokenizer, filter, enablePositionIncrements);
MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
filt.setEnablePositionIncrements(enablePositionIncrements);
return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
}

View File

@ -34,7 +34,9 @@ public class MockCharFilter extends CharStream {
// TODO: instead of fixed remainder... maybe a fixed
// random seed?
this.remainder = remainder;
assert remainder >= 0 && remainder < 10 : "invalid parameter";
if (remainder < 0 || remainder >= 10) {
throw new IllegalArgumentException("invalid remainder parameter (must be 0..9): " + remainder);
}
}
// for testing only, uses a remainder of 0

View File

@ -34,6 +34,9 @@ public final class MockFixedLengthPayloadFilter extends TokenFilter {
public MockFixedLengthPayloadFilter(Random random, TokenStream in, int length) {
super(in);
if (length < 0) {
throw new IllegalArgumentException("length must be >= 0");
}
this.random = random;
this.bytes = new byte[length];
this.payload = new Payload(bytes);

View File

@ -55,7 +55,7 @@ public final class MockTokenFilter extends TokenFilter {
makeString("with"))));
private final CharacterRunAutomaton filter;
private boolean enablePositionIncrements = false;
private boolean enablePositionIncrements = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@ -67,14 +67,16 @@ public final class MockTokenFilter extends TokenFilter {
* @param filter DFA representing the terms that should be removed.
* @param enablePositionIncrements true if the removal should accumulate position increments.
*/
public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter) {
super(input);
this.filter = filter;
this.enablePositionIncrements = enablePositionIncrements;
}
@Override
public boolean incrementToken() throws IOException {
// TODO: fix me when posInc=false, to work like FilteringTokenFilter in that case and not return
// initial token with posInc=0 ever
// return the first non-stop word found
int skippedPositions = 0;
while (input.incrementToken()) {

View File

@ -43,16 +43,25 @@ public final class KeywordTokenizer extends Tokenizer {
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
}
termAtt.resizeBuffer(bufferSize);
}
public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) {
super(source, input);
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
}
termAtt.resizeBuffer(bufferSize);
}
public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
super(factory, input);
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
}
termAtt.resizeBuffer(bufferSize);
}

View File

@ -65,6 +65,12 @@ public class PathHierarchyTokenizer extends Tokenizer {
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
super(input);
if (bufferSize < 0) {
throw new IllegalArgumentException("bufferSize cannot be negative");
}
if (skip < 0) {
throw new IllegalArgumentException("skip cannot be negative");
}
termAtt.resizeBuffer(bufferSize);
this.delimiter = delimiter;
@ -85,10 +91,11 @@ public class PathHierarchyTokenizer extends Tokenizer {
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
private int startPosition = 0;
private int finalOffset = 0;
private int skipped = 0;
private boolean endDelimiter = false;
private StringBuilder resultToken;
private int charsRead = 0;
@Override
@ -112,12 +119,13 @@ public class PathHierarchyTokenizer extends Tokenizer {
while (true) {
int c = input.read();
if( c < 0 ){
if (c >= 0) {
charsRead++;
} else {
if( skipped > skip ) {
length += resultToken.length();
termAtt.setLength(length);
finalOffset = correctOffset(startPosition + length);
offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length));
if( added ){
resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length);
@ -125,7 +133,6 @@ public class PathHierarchyTokenizer extends Tokenizer {
return added;
}
else{
finalOffset = correctOffset(startPosition + length);
return false;
}
}
@ -168,8 +175,7 @@ public class PathHierarchyTokenizer extends Tokenizer {
}
length += resultToken.length();
termAtt.setLength(length);
finalOffset = correctOffset(startPosition + length);
offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition+length));
resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length);
return true;
@ -178,14 +184,15 @@ public class PathHierarchyTokenizer extends Tokenizer {
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(charsRead);
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
public void reset() throws IOException {
super.reset();
resultToken.setLength(0);
finalOffset = 0;
charsRead = 0;
endDelimiter = false;
skipped = 0;
}

View File

@ -77,6 +77,13 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
super(input);
if (bufferSize < 0) {
throw new IllegalArgumentException("bufferSize cannot be negative");
}
if (skip < 0) {
// nocommit: not quite right here: see line 84... if skip > numTokensFound we always get a NegativeArrayException? needs fixing!
throw new IllegalArgumentException("skip cannot be negative");
}
termAtt.resizeBuffer(bufferSize);
this.delimiter = delimiter;
this.replacement = replacement;
@ -137,7 +144,11 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
}
resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
resultToken.setLength(0);
endPosition = delimiterPositions.get(delimitersCount-1 - skip);
int idx = delimitersCount-1 - skip;
if (idx >= 0) {
// otherwise it's ok, because we will skip and return false
endPosition = delimiterPositions.get(idx);
}
finalOffset = correctOffset(length);
posAtt.setPositionIncrement(1);
}
@ -163,10 +174,11 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
public void reset() throws IOException {
super.reset();
resultToken.setLength(0);
finalOffset = 0;
endPosition = 0;
skipped = 0;
delimitersCount = -1;
delimiterPositions.clear();

View File

@ -71,6 +71,10 @@ public final class PatternTokenizer extends Tokenizer {
this.group = group;
fillBuffer(str, input);
matcher = pattern.matcher(str);
// confusingly group count depends ENTIRELY on the pattern but is only accessible via matcher
if (group >= 0 && group > matcher.groupCount()) {
throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
}
index = 0;
}

View File

@ -57,6 +57,9 @@ public final class PositionFilter extends TokenFilter {
*/
public PositionFilter(final TokenStream input, final int positionIncrement) {
super(input);
if (positionIncrement < 0) {
throw new IllegalArgumentException("positionIncrement may not be negative");
}
this.positionIncrement = positionIncrement;
}

View File

@ -67,7 +67,7 @@ public final class SnowballFilter extends TokenFilter {
Class.forName("org.tartarus.snowball.ext." + name + "Stemmer").asSubclass(SnowballProgram.class);
stemmer = stemClass.newInstance();
} catch (Exception e) {
throw new RuntimeException(e.toString());
throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e);
}
}

View File

@ -18,17 +18,26 @@ package org.apache.lucene.analysis.core;
*/
import java.io.File;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.Map;
import java.util.IdentityHashMap;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -36,67 +45,113 @@ import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.EmptyTokenizer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.payloads.IdentityEncoder;
import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.snowball.TestSnowball;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.tartarus.snowball.SnowballProgram;
import org.xml.sax.InputSource;
/** tests random analysis chains */
public class TestRandomChains extends BaseTokenStreamTestCase {
static List<Class<? extends Tokenizer>> tokenizers;
static List<Class<? extends TokenFilter>> tokenfilters;
static List<Class<? extends CharStream>> charfilters;
static List<Constructor<? extends Tokenizer>> tokenizers;
static List<Constructor<? extends TokenFilter>> tokenfilters;
static List<Constructor<? extends CharStream>> charfilters;
@BeforeClass
public static void beforeClass() throws Exception {
List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
tokenizers = new ArrayList<Class<? extends Tokenizer>>();
tokenfilters = new ArrayList<Class<? extends TokenFilter>>();
charfilters = new ArrayList<Class<? extends CharStream>>();
for (Class<?> c : analysisClasses) {
// don't waste time with abstract classes or deprecated known-buggy ones
tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
charfilters = new ArrayList<Constructor<? extends CharStream>>();
for (final Class<?> c : analysisClasses) {
final int modifiers = c.getModifiers();
if (Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
|| c.getAnnotation(Deprecated.class) != null
|| c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
// TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
|| c.equals(EmptyTokenizer.class)
// doesn't actually reset itself!
|| c.equals(CachingTokenFilter.class)
// broken!
|| c.equals(NGramTokenizer.class)
// broken!
|| c.equals(NGramTokenFilter.class)
// broken!
|| c.equals(EdgeNGramTokenizer.class)
// broken!
|| c.equals(EdgeNGramTokenFilter.class)) {
if (
// don't waste time with abstract classes or deprecated known-buggy ones
Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
|| c.isAnnotationPresent(Deprecated.class)
|| c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
|| !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c))
// TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
|| c == EmptyTokenizer.class
// doesn't actually reset itself!
|| c == CachingTokenFilter.class
// doesn't consume whole stream!
|| c == LimitTokenCountFilter.class
// broken!
|| c == NGramTokenizer.class
// broken!
|| c == NGramTokenFilter.class
// broken!
|| c == EdgeNGramTokenizer.class
// broken!
|| c == EdgeNGramTokenFilter.class
) {
continue;
}
if (Tokenizer.class.isAssignableFrom(c)) {
tokenizers.add(c.asSubclass(Tokenizer.class));
} else if (TokenFilter.class.isAssignableFrom(c)) {
tokenfilters.add(c.asSubclass(TokenFilter.class));
} else if (CharStream.class.isAssignableFrom(c)) {
charfilters.add(c.asSubclass(CharStream.class));
for (final Constructor<?> ctor : c.getConstructors()) {
// don't test deprecated ctors, they likely have known bugs:
if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
continue;
}
if (Tokenizer.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
tokenizers.add(castConstructor(Tokenizer.class, ctor));
} else if (TokenFilter.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
tokenfilters.add(castConstructor(TokenFilter.class, ctor));
} else if (CharStream.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
charfilters.add(castConstructor(CharStream.class, ctor));
} else {
fail("Cannot get here");
}
}
}
final Comparator<Class<?>> classComp = new Comparator<Class<?>>() {
final Comparator<Constructor<?>> ctorComp = new Comparator<Constructor<?>>() {
@Override
public int compare(Class<?> arg0, Class<?> arg1) {
return arg0.getName().compareTo(arg1.getName());
public int compare(Constructor<?> arg0, Constructor<?> arg1) {
return arg0.toGenericString().compareTo(arg1.toGenericString());
}
};
Collections.sort(tokenizers, classComp);
Collections.sort(tokenfilters, classComp);
Collections.sort(charfilters, classComp);
Collections.sort(tokenizers, ctorComp);
Collections.sort(tokenfilters, ctorComp);
Collections.sort(charfilters, ctorComp);
if (VERBOSE) {
System.out.println("tokenizers = " + tokenizers);
System.out.println("tokenfilters = " + tokenfilters);
@ -111,6 +166,304 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
charfilters = null;
}
/** Hack to work around the stupidness of Oracle's strict Java backwards compatibility.
* {@code Class<T>#getConstructors()} should return unmodifiable {@code List<Constructor<T>>} not array! */
@SuppressWarnings("unchecked")
private static <T> Constructor<? extends T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
return (Constructor<? extends T>) ctor;
}
private static interface ArgProducer {
Object create(Random random);
}
private static final Map<Class<?>,ArgProducer> argProducers = new IdentityHashMap<Class<?>,ArgProducer>() {{
put(int.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: could cause huge ram usage to use full int range for some filters
// (e.g. allocate enormous arrays)
// return Integer.valueOf(random.nextInt());
return Integer.valueOf(_TestUtil.nextInt(random, -100, 100));
}
});
put(char.class, new ArgProducer() {
@Override public Object create(Random random) {
return Character.valueOf((char)random.nextInt(65536));
}
});
put(float.class, new ArgProducer() {
@Override public Object create(Random random) {
return Float.valueOf(random.nextFloat());
}
});
put(boolean.class, new ArgProducer() {
@Override public Object create(Random random) {
return Boolean.valueOf(random.nextBoolean());
}
});
put(byte.class, new ArgProducer() {
@Override public Object create(Random random) {
byte bytes[] = new byte[1];
random.nextBytes(bytes);
return Byte.valueOf(bytes[0]);
}
});
put(byte[].class, new ArgProducer() {
@Override public Object create(Random random) {
byte bytes[] = new byte[random.nextInt(256)];
random.nextBytes(bytes);
return bytes;
}
});
put(Random.class, new ArgProducer() {
@Override public Object create(Random random) {
return new Random(random.nextLong());
}
});
put(Version.class, new ArgProducer() {
@Override public Object create(Random random) {
// we expect bugs in emulating old versions
return TEST_VERSION_CURRENT;
}
});
put(Set.class, new ArgProducer() {
@Override public Object create(Random random) {
// TypeTokenFilter
Set<String> set = new HashSet<String>();
int num = random.nextInt(5);
for (int i = 0; i < num; i++) {
set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
}
return set;
}
});
put(Collection.class, new ArgProducer() {
@Override public Object create(Random random) {
// CapitalizationFilter
Collection<char[]> col = new ArrayList<char[]>();
int num = random.nextInt(5);
for (int i = 0; i < num; i++) {
col.add(_TestUtil.randomSimpleString(random).toCharArray());
}
return col;
}
});
put(CharArraySet.class, new ArgProducer() {
@Override public Object create(Random random) {
int num = random.nextInt(10);
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, num, random.nextBoolean());
for (int i = 0; i < num; i++) {
// TODO: make nastier
set.add(_TestUtil.randomSimpleString(random));
}
return set;
}
});
put(Pattern.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: don't want to make the exponentially slow ones Dawid documents
// in TestPatternReplaceFilter, so dont use truly random patterns (for now)
return Pattern.compile("a");
}
});
put(PayloadEncoder.class, new ArgProducer() {
@Override public Object create(Random random) {
return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers?
}
});
put(HunspellDictionary.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: make nastier
InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
try {
return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
});
put(HyphenationTree.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: make nastier
try {
InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
return hyphenator;
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
}
});
put(SnowballProgram.class, new ArgProducer() {
@Override public Object create(Random random) {
try {
String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)];
Class<? extends SnowballProgram> clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class);
return clazz.newInstance();
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
}
}
});
put(String.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: make nastier
if (random.nextBoolean()) {
// a token type
return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
} else {
return _TestUtil.randomSimpleString(random);
}
}
});
put(NormalizeCharMap.class, new ArgProducer() {
@Override public Object create(Random random) {
NormalizeCharMap map = new NormalizeCharMap();
// we can't add duplicate keys, or NormalizeCharMap gets angry
Set<String> keys = new HashSet<String>();
int num = random.nextInt(5);
for (int i = 0; i < num; i++) {
String key = _TestUtil.randomSimpleString(random);
if (!keys.contains(key)) {
map.add(key,_TestUtil.randomSimpleString(random));
keys.add(key);
}
}
return map;
}
});
put(CharacterRunAutomaton.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: could probably use a purely random automaton
switch(random.nextInt(5)) {
case 0: return MockTokenizer.KEYWORD;
case 1: return MockTokenizer.SIMPLE;
case 2: return MockTokenizer.WHITESPACE;
case 3: return MockTokenFilter.EMPTY_STOPSET;
default: return MockTokenFilter.ENGLISH_STOPSET;
}
}
});
put(CharArrayMap.class, new ArgProducer() {
@Override public Object create(Random random) {
int num = random.nextInt(10);
CharArrayMap<String> map = new CharArrayMap<String>(TEST_VERSION_CURRENT, num, random.nextBoolean());
for (int i = 0; i < num; i++) {
// TODO: make nastier
map.put(_TestUtil.randomSimpleString(random), _TestUtil.randomSimpleString(random));
}
return map;
}
});
put(SynonymMap.class, new ArgProducer() {
@Override public Object create(Random random) {
SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
}
try {
return b.build();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
new CharsRef(output.replaceAll(" +", "\u0000")),
keepOrig);
}
private String randomNonEmptyString(Random random) {
while(true) {
final String s = _TestUtil.randomUnicodeString(random).trim();
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
return s;
}
}
}
});
}};
static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
static {
allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedTokenizerArgs.addAll(argProducers.keySet());
allowedTokenizerArgs.add(Reader.class);
allowedTokenizerArgs.add(AttributeFactory.class);
allowedTokenizerArgs.add(AttributeSource.class);
allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedTokenFilterArgs.addAll(argProducers.keySet());
allowedTokenFilterArgs.add(TokenStream.class);
allowedTokenFilterArgs.add(CommonGramsFilter.class);
allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedCharFilterArgs.addAll(argProducers.keySet());
allowedCharFilterArgs.add(Reader.class);
allowedCharFilterArgs.add(CharStream.class);
}
@SuppressWarnings("unchecked")
static <T> T createRandomArg(Random random, Class<T> paramType) {
final ArgProducer producer = argProducers.get(paramType);
assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer);
return (T) producer.create(random);
}
static Object[] newTokenizerArgs(Random random, Reader reader, Class<?>[] paramTypes) {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == Reader.class) {
args[i] = reader;
} else if (paramType == AttributeFactory.class) {
// TODO: maybe the collator one...???
args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
} else if (paramType == AttributeSource.class) {
args[i] = null; // this always gives IAE: fine
} else {
args[i] = createRandomArg(random, paramType);
}
}
return args;
}
static Object[] newCharFilterArgs(Random random, Reader reader, Class<?>[] paramTypes) {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == Reader.class) {
args[i] = reader;
} else if (paramType == CharStream.class) {
args[i] = CharReader.get(reader);
} else {
args[i] = createRandomArg(random, paramType);
}
}
return args;
}
static Object[] newFilterArgs(Random random, TokenStream stream, Class<?>[] paramTypes) {
Object[] args = new Object[paramTypes.length];
for (int i = 0; i < args.length; i++) {
Class<?> paramType = paramTypes[i];
if (paramType == TokenStream.class) {
args[i] = stream;
} else if (paramType == CommonGramsFilter.class) {
// CommonGramsQueryFilter takes this one explicitly
args[i] = new CommonGramsFilter(TEST_VERSION_CURRENT, stream, createRandomArg(random, CharArraySet.class));
} else {
args[i] = createRandomArg(random, paramType);
}
}
return args;
}
static class MockRandomAnalyzer extends Analyzer {
final long seed;
@ -123,6 +476,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
Random random = new Random(seed);
TokenizerSpec tokenizerspec = newTokenizer(random, reader);
TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer);
//System.out.println("seed=" + seed + ",tokenizerSpec=" + tokenizerspec.toString);
//System.out.println("seed=" + seed + ",tokenfilterSpec=" + filterspec.toString);
return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream);
}
@ -130,6 +485,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
protected Reader initReader(Reader reader) {
Random random = new Random(seed);
CharFilterSpec charfilterspec = newCharFilterChain(random, reader);
//System.out.println("seed=" + seed + ",charFilterSpec=" + charfilterspec.toString);
return charfilterspec.reader;
}
@ -159,20 +515,27 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
boolean success = false;
while (!success) {
try {
// TODO: check Reader+Version,Version+Reader too
// also look for other variants and handle them special
int idx = random.nextInt(tokenizers.size());
try {
Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Version.class, Reader.class);
spec.tokenizer = c.newInstance(TEST_VERSION_CURRENT, reader);
} catch (NoSuchMethodException e) {
Constructor<? extends Tokenizer> c = tokenizers.get(idx).getConstructor(Reader.class);
spec.tokenizer = c.newInstance(reader);
}
spec.toString = tokenizers.get(idx).toString();
final Constructor<? extends Tokenizer> ctor = tokenizers.get(random.nextInt(tokenizers.size()));
final Object args[] = newTokenizerArgs(random, reader, ctor.getParameterTypes());
spec.tokenizer = ctor.newInstance(args);
spec.toString = ctor.getDeclaringClass().getName() + ("(" + Arrays.toString(args) + ")");
success = true;
} catch (Exception e) {
// ignore
} catch (InvocationTargetException ite) {
final Throwable cause = ite.getCause();
if (cause instanceof IllegalArgumentException ||
cause instanceof UnsupportedOperationException) {
// that's ok, ignore
if (VERBOSE) {
System.err.println("Ignoring IAE/UOE from ctor:");
cause.printStackTrace(System.err);
}
} else {
Rethrow.rethrow(cause);
}
} catch (IllegalAccessException iae) {
Rethrow.rethrow(iae);
} catch (InstantiationException ie) {
Rethrow.rethrow(ie);
}
}
return spec;
@ -187,23 +550,32 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
boolean success = false;
while (!success) {
try {
// TODO: also look for other variants and handle them special
int idx = random.nextInt(charfilters.size());
try {
Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(Reader.class);
spec.reader = c.newInstance(spec.reader);
} catch (NoSuchMethodException e) {
Constructor<? extends CharStream> c = charfilters.get(idx).getConstructor(CharStream.class);
spec.reader = c.newInstance(CharReader.get(spec.reader));
}
final Constructor<? extends CharStream> ctor = charfilters.get(random.nextInt(charfilters.size()));
final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
spec.reader = ctor.newInstance(args);
if (descr.length() > 0) {
descr.append(",");
}
descr.append(charfilters.get(idx).toString());
descr.append(ctor.getDeclaringClass().getName());
descr.append("(" + Arrays.toString(args) + ")");
success = true;
} catch (Exception e) {
// ignore
} catch (InvocationTargetException ite) {
final Throwable cause = ite.getCause();
if (cause instanceof IllegalArgumentException ||
cause instanceof UnsupportedOperationException) {
// that's ok, ignore
if (VERBOSE) {
System.err.println("Ignoring IAE/UOE from ctor:");
cause.printStackTrace(System.err);
}
} else {
Rethrow.rethrow(cause);
}
} catch (IllegalAccessException iae) {
Rethrow.rethrow(iae);
} catch (InstantiationException ie) {
Rethrow.rethrow(ie);
}
}
}
@ -220,22 +592,31 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
boolean success = false;
while (!success) {
try {
// TODO: also look for other variants and handle them special
int idx = random.nextInt(tokenfilters.size());
try {
Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(Version.class, TokenStream.class);
spec.stream = c.newInstance(TEST_VERSION_CURRENT, spec.stream);
} catch (NoSuchMethodException e) {
Constructor<? extends TokenFilter> c = tokenfilters.get(idx).getConstructor(TokenStream.class);
spec.stream = c.newInstance(spec.stream);
}
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
spec.stream = ctor.newInstance(args);
if (descr.length() > 0) {
descr.append(",");
}
descr.append(tokenfilters.get(idx).toString());
descr.append(ctor.getDeclaringClass().getName());
descr.append("(" + Arrays.toString(args) + ")");
success = true;
} catch (Exception e) {
// ignore
} catch (InvocationTargetException ite) {
final Throwable cause = ite.getCause();
if (cause instanceof IllegalArgumentException ||
cause instanceof UnsupportedOperationException) {
// that's ok, ignore
if (VERBOSE) {
System.err.println("Ignoring IAE/UOE from ctor:");
cause.printStackTrace(System.err);
}
} else {
Rethrow.rethrow(cause);
}
} catch (IllegalAccessException iae) {
Rethrow.rethrow(iae);
} catch (InstantiationException ie) {
Rethrow.rethrow(ie);
}
}
}
@ -263,7 +644,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
int numIterations = atLeast(20);
for (int i = 0; i < numIterations; i++) {
MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
if (VERBOSE) {
if (true || VERBOSE) {
System.out.println("Creating random analyzer:" + a);
}
try {

View File

@ -142,14 +142,16 @@ public class TestSnowball extends BaseTokenStreamTestCase {
}
}
/** for testing purposes ONLY */
public static String SNOWBALL_LANGS[] = {
"Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
"Finnish", "French", "German2", "German", "Hungarian", "Irish",
"Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
"Romanian", "Russian", "Spanish", "Swedish", "Turkish"
};
public void testEmptyTerm() throws IOException {
String langs[] = {
"Armenian", "Basque", "Catalan", "Danish", "Dutch", "English",
"Finnish", "French", "German2", "German", "Hungarian", "Irish",
"Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese",
"Romanian", "Russian", "Spanish", "Swedish", "Turkish"
};
for (final String lang : langs) {
for (final String lang : SNOWBALL_LANGS) {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {