mirror of https://github.com/apache/lucene.git
LUCENE-5269: Fix NGramTokenFilter length filtering
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1531186 13f79535-47bb-0310-9956-ffa450edef68
parent 5c9cbe9847
commit d2d435ecf5
CHANGES.txt
@@ -131,6 +131,10 @@ Bug Fixes
   terms were present in the query and the high-frequent operator was set
   to SHOULD. (Simon Willnauer)
 
+* LUCENE-5269: Fix bug in NGramTokenFilter where it would sometimes count
+  unicode characters incorrectly. Adds CodepointCountFilter.
+  (Mike McCandless, Robert Muir)
+
 API Changes:
 
 * LUCENE-5222: Add SortField.needsScores(). Previously it was not possible
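The miscounting comes down to Java's char-based lengths: String.length() and CharTermAttribute.length() count UTF-16 code units, not Unicode code points, so any supplementary character (outside the Basic Multilingual Plane) counts twice. A minimal illustration, not part of the commit, in plain Java:

    public class CodepointVsCharCount {
      public static void main(String[] args) {
        String s = "\uD834\uDD1E"; // U+1D11E MUSICAL SYMBOL G CLEF: one code point, two chars
        System.out.println(s.length());                      // 2 (UTF-16 code units)
        System.out.println(s.codePointCount(0, s.length())); // 1 (Unicode code points)
      }
    }

NGramTokenFilter walks terms by code point (via CharacterUtils, as the diff below shows), but it pre-filtered short terms with LengthFilter, which compares char counts; the two disagree exactly on such characters.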
CodepointCountFilter.java (new file)
@@ -0,0 +1,69 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

/**
 * Removes words that are too long or too short from the stream.
 * <p>
 * Note: Length is calculated as the number of Unicode codepoints.
 * </p>
 */
public final class CodepointCountFilter extends FilteringTokenFilter {

  private final int min;
  private final int max;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Create a new {@link CodepointCountFilter}. This will filter out tokens whose
   * {@link CharTermAttribute} is either too short ({@link Character#codePointCount(char[], int, int)}
   * &lt; min) or too long ({@link Character#codePointCount(char[], int, int)} &gt; max).
   * @param version the Lucene match version
   * @param in      the {@link TokenStream} to consume
   * @param min     the minimum length
   * @param max     the maximum length
   */
  public CodepointCountFilter(Version version, TokenStream in, int min, int max) {
    super(version, in);
    this.min = min;
    this.max = max;
  }

  @Override
  public boolean accept() {
    // A code point occupies one or two chars in UTF-16, so the code point
    // count always lies between length()/2 and length(); these cheap bounds
    // let us skip the O(n) count for most terms.
    final int max32 = termAtt.length();
    final int min32 = max32 >> 1;
    if (min32 >= min && max32 <= max) {
      // definitely within range
      return true;
    } else if (min32 > max || max32 < min) {
      // definitely not
      return false;
    } else {
      // we must count to be sure
      int len = Character.codePointCount(termAtt.buffer(), 0, termAtt.length());
      return (len >= min && len <= max);
    }
  }
}
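A usage sketch showing the char/code-point difference in effect. This assumes Lucene 4.x analysis-common on the classpath; Version.LUCENE_46 is an illustrative constant for this era of trunk, not taken from the commit:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class CodepointCountFilterDemo {
      public static void main(String[] args) throws Exception {
        // "\uD834\uDD1E" is one code point but two chars: a LengthFilter with
        // min=2 would keep it, CodepointCountFilter(2, 3) drops it.
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_46,
            new StringReader("a ab \uD834\uDD1E abc"));
        ts = new CodepointCountFilter(Version.LUCENE_46, ts, 2, 3);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term); // prints "ab" then "abc"
        }
        ts.end();
        ts.close();
      }
    }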
CodepointCountFilterFactory.java (new file)
@@ -0,0 +1,55 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link CodepointCountFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *     &lt;filter class="solr.CodepointCountFilterFactory" min="0" max="1" /&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class CodepointCountFilterFactory extends TokenFilterFactory {
  final int min;
  final int max;
  public static final String MIN_KEY = "min";
  public static final String MAX_KEY = "max";

  /** Creates a new CodepointCountFilterFactory */
  public CodepointCountFilterFactory(Map<String, String> args) {
    super(args);
    min = requireInt(args, MIN_KEY);
    max = requireInt(args, MAX_KEY);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public CodepointCountFilter create(TokenStream input) {
    return new CodepointCountFilter(luceneMatchVersion, input, min, max);
  }
}
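Besides the Solr XML shown in the javadoc, the factory can be looked up programmatically once it is registered via SPI (the services-file change further below). A hedged sketch, assuming the Lucene 4.x factory API where forName takes the short name plus a mutable args map, and where version-aware factories read a "luceneMatchVersion" argument:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.util.TokenFilterFactory;

    public class FactoryLookupDemo {
      public static void main(String[] args) {
        Map<String, String> params = new HashMap<String, String>();
        params.put("luceneMatchVersion", "4.6"); // assumption: consumed by version-aware factories
        params.put("min", "2");
        params.put("max", "6");
        // "CodepointCount" = class simple name minus "FilterFactory", matched case-insensitively
        TokenFilterFactory factory = TokenFilterFactory.forName("CodepointCount", params);
        System.out.println(factory.getClass()); // CodepointCountFilterFactory
      }
    }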
NGramTokenFilter.java
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.LengthFilter;
+import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

@@ -81,7 +81,7 @@ public final class NGramTokenFilter extends TokenFilter {
    * @param maxGram the largest n-gram to generate
    */
   public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
-    super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
+    super(new CodepointCountFilter(version, input, minGram, Integer.MAX_VALUE));
     this.version = version;
     this.charUtils = version.onOrAfter(Version.LUCENE_44)
         ? CharacterUtils.getInstance(version)
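The one-line constructor change is the actual fix: the pre-filter that drops terms shorter than minGram now counts code points, matching how the gramming loop itself walks the term. Before the change, a single-supplementary-character token slipped past LengthFilter, illustrated here with hypothetical values:

    String token = "\uD834\uDD1E";                   // one code point, two chars
    int chars = token.length();                      // 2 -> old LengthFilter kept it for minGram = 2
    int codePoints = token.codePointCount(0, chars); // 1 -> no 2-gram of code points exists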
org.apache.lucene.analysis.util.TokenFilterFactory (SPI services file)
@@ -55,6 +55,7 @@ org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
 org.apache.lucene.analysis.lv.LatvianStemFilterFactory
 org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
+org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
 org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
TestBugInSomething.java
@@ -1,5 +1,6 @@
package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.CharBuffer;

@@ -11,10 +12,14 @@ import org.apache.lucene.analysis.MockCharFilter;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.util.CharArraySet;

/*

@@ -195,4 +200,58 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
       assertEquals("read(char[], int, int)", e.getMessage());
     }
   }
+
+  // todo: test framework?
+
+  static final class SopTokenFilter extends TokenFilter {
+
+    SopTokenFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        System.out.println(input.getClass().getSimpleName() + "->" + this.reflectAsString(false));
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void end() throws IOException {
+      super.end();
+      System.out.println(input.getClass().getSimpleName() + ".end()");
+    }
+
+    @Override
+    public void close() throws IOException {
+      super.close();
+      System.out.println(input.getClass().getSimpleName() + ".close()");
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      System.out.println(input.getClass().getSimpleName() + ".reset()");
+    }
+  }
+
+  // LUCENE-5269
+  public void testUnicodeShinglesAndNgrams() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
+        //TokenStream stream = new SopTokenFilter(tokenizer);
+        TokenStream stream = new ShingleFilter(tokenizer, 54);
+        //stream = new SopTokenFilter(stream);
+        stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
+        //stream = new SopTokenFilter(stream);
+        return new TokenStreamComponents(tokenizer, stream);
+      }
+    };
+    checkRandomData(random(), analyzer, 10);
+  }
 }
TestCodepointCountFilter.java (new file)
@@ -0,0 +1,69 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util._TestUtil;

public class TestCodepointCountFilter extends BaseTokenStreamTestCase {
  public void testFilterWithPosIncr() throws Exception {
    TokenStream stream = new MockTokenizer(
        new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
    CodepointCountFilter filter = new CodepointCountFilter(TEST_VERSION_CURRENT, stream, 2, 6);
    assertTokenStreamContents(filter,
        new String[]{"short", "ab", "foo"},
        new int[]{1, 4, 2}
    );
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new CodepointCountFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
      }
    };
    checkOneTerm(a, "", "");
  }

  public void testRandomStrings() throws IOException {
    for (int i = 0; i < 10000; i++) {
      String text = _TestUtil.randomUnicodeString(random(), 100);
      int min = _TestUtil.nextInt(random(), 0, 100);
      int max = _TestUtil.nextInt(random(), 0, 100);
      int count = text.codePointCount(0, text.length());
      boolean expected = count >= min && count <= max;
      TokenStream stream = new KeywordTokenizer(new StringReader(text));
      stream = new CodepointCountFilter(TEST_VERSION_CURRENT, stream, min, max);
      stream.reset();
      assertEquals(expected, stream.incrementToken());
      stream.end();
      stream.close();
    }
  }
}
TestCodepointCountFilterFactory.java (new file)
@@ -0,0 +1,50 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

public class TestCodepointCountFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void testPositionIncrements() throws Exception {
    Reader reader = new StringReader("foo foobar super-duper-trooper");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("CodepointCount",
        "min", "4",
        "max", "10").create(stream);
    assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("CodepointCount",
          "min", "4",
          "max", "5",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}
EdgeNGramTokenFilterTest.java
@@ -169,15 +169,20 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer,
-            new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer,
+              new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+        }
+      };
+      checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
+    }
   }
 
   public void testEmptyTerm() throws Exception {
EdgeNGramTokenizerTest.java
@@ -96,15 +96,20 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
-    checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+          return new TokenStreamComponents(tokenizer, tokenizer);
+        }
+      };
+      checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 20);
+      checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 8192);
+    }
   }
 
   public void testTokenizerPositions() throws Exception {
NGramTokenFilterTest.java
@@ -144,15 +144,19 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer,
-            new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer,
+              new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+        }
+      };
+      checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+    }
   }
 
   public void testEmptyTerm() throws Exception {
NGramTokenizerTest.java
@@ -107,15 +107,19 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
-    checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+          return new TokenStreamComponents(tokenizer, tokenizer);
+        }
+      };
+      checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+      checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 1027);
+    }
  }
 
   private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {