LUCENE-5269: Fix NGramTokenFilter length filtering

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1531186 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2013-10-11 03:53:42 +00:00
parent 5c9cbe9847
commit d2d435ecf5
12 changed files with 363 additions and 38 deletions

lucene/CHANGES.txt

@@ -131,6 +131,10 @@ Bug Fixes
   terms were present in the query and the high-frequent operator was set
   to SHOULD. (Simon Willnauer)
 
+* LUCENE-5269: Fix bug in NGramTokenFilter where it would sometimes count
+  unicode characters incorrectly. Adds CodepointCountFilter.
+  (Mike McCandless, Robert Muir)
+
 API Changes:
 
 * LUCENE-5222: Add SortField.needsScores(). Previously it was not possible

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java

@@ -0,0 +1,69 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
/**
* Removes words that are too long or too short from the stream.
* <p>
* Note: Length is calculated as the number of Unicode codepoints.
* </p>
*/
public final class CodepointCountFilter extends FilteringTokenFilter {
  private final int min;
  private final int max;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Create a new {@link CodepointCountFilter}. This will filter out tokens whose
   * {@link CharTermAttribute} is either too short ({@link Character#codePointCount(char[], int, int)}
   * &lt; min) or too long ({@link Character#codePointCount(char[], int, int)} &gt; max).
   * @param version the Lucene match version
   * @param in      the {@link TokenStream} to consume
   * @param min     the minimum length
   * @param max     the maximum length
   */
  public CodepointCountFilter(Version version, TokenStream in, int min, int max) {
    super(version, in);
    this.min = min;
    this.max = max;
  }

  @Override
  public boolean accept() {
    // a term of n UTF-16 chars holds at most n codepoints (all BMP) ...
    final int max32 = termAtt.length();
    // ... and at least n >> 1 (every codepoint could be a surrogate pair)
    final int min32 = max32 >> 1;
    if (min32 >= min && max32 <= max) {
      // definitely within range
      return true;
    } else if (min32 > max || max32 < min) {
      // definitely not
      return false;
    } else {
      // we must count to be sure
      int len = Character.codePointCount(termAtt.buffer(), 0, termAtt.length());
      return (len >= min && len <= max);
    }
  }
}
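The accept() fast path rests on a simple UTF-16 invariant: a term of n chars contains at most n codepoints (all BMP) and at least n >> 1 (every codepoint a surrogate pair), so most terms are accepted or rejected without an actual count. A minimal standalone sketch of that invariant (the class and string values here are illustrative, not part of the commit):

public class CodepointBoundsDemo {
  public static void main(String[] args) {
    String bmp = "abcd";                        // 4 chars, every codepoint is one char
    String astral = "\uD83D\uDE00\uD83D\uDE01"; // 4 chars, only 2 codepoints (two surrogate pairs)
    // upper bound max32: one codepoint per char
    System.out.println(bmp.codePointCount(0, bmp.length()));       // prints 4
    // lower bound min32 = length >> 1: two chars per codepoint
    System.out.println(astral.codePointCount(0, astral.length())); // prints 2
  }
}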

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilterFactory.java

@@ -0,0 +1,55 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link CodepointCountFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_length" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.CodepointCountFilterFactory" min="0" max="1" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class CodepointCountFilterFactory extends TokenFilterFactory {
  final int min;
  final int max;
  public static final String MIN_KEY = "min";
  public static final String MAX_KEY = "max";

  /** Creates a new CodepointCountFilterFactory */
  public CodepointCountFilterFactory(Map<String, String> args) {
    super(args);
    min = requireInt(args, MIN_KEY);
    max = requireInt(args, MAX_KEY);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public CodepointCountFilter create(TokenStream input) {
    return new CodepointCountFilter(luceneMatchVersion, input, min, max);
  }
}
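The factory consumes exactly the min/max args shown in its javadoc and rejects leftovers. A hedged wiring sketch for direct programmatic use (the class name, argument values, and the "4.5" match version string are illustrative, not part of the commit):

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory;
import org.apache.lucene.util.Version;

public class FactoryWiringDemo {
  public static void main(String[] args) {
    Map<String, String> params = new HashMap<String, String>();
    params.put("luceneMatchVersion", "4.5"); // consumed by the factory base class
    params.put("min", "2");
    params.put("max", "6");
    CodepointCountFilterFactory factory = new CodepointCountFilterFactory(params);
    // only terms with 2..6 codepoints survive the resulting filter
    TokenStream stream = factory.create(
        new WhitespaceTokenizer(Version.LUCENE_45, new StringReader("a ab abcdefg")));
  }
}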

lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java

@@ -21,7 +21,7 @@ import java.io.IOException
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.LengthFilter;
+import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -81,7 +81,7 @@ public final class NGramTokenFilter extends TokenFilter {
    * @param maxGram the largest n-gram to generate
    */
   public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
-    super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
+    super(new CodepointCountFilter(version, input, minGram, Integer.MAX_VALUE));
     this.version = version;
     this.charUtils = version.onOrAfter(Version.LUCENE_44)
         ? CharacterUtils.getInstance(version)
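This one-line swap is the heart of the fix: LengthFilter measures UTF-16 chars, while NGramTokenFilter (since 4.4) slices grams out of codepoints, so a term built from supplementary characters could pass the length pre-filter yet be too short to yield a single gram. A quick illustration of the mismatch (the demo class is not part of the commit):

public class CharVsCodepointDemo {
  public static void main(String[] args) {
    // one supplementary codepoint (DESERET CAPITAL LETTER LONG I) = two UTF-16 chars
    String term = "\uD801\uDC00";
    System.out.println(term.length());                          // 2: what LengthFilter counted
    System.out.println(term.codePointCount(0, term.length()));  // 1: what the n-gram code counts
  }
}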

lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory

@@ -55,6 +55,7 @@ org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
 org.apache.lucene.analysis.lv.LatvianStemFilterFactory
 org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
+org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
 org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
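With this SPI entry in place the factory also resolves by its short name ("CodepointCount", matched case-insensitively). A hedged lookup sketch, assuming the 4.x TokenFilterFactory.forName API (the class name and argument values are illustrative):

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class SpiLookupDemo {
  public static void main(String[] args) {
    Map<String, String> params = new HashMap<String, String>();
    params.put("luceneMatchVersion", "4.5"); // illustrative match version
    params.put("min", "2");
    params.put("max", "6");
    // resolves via the META-INF/services entry added above
    TokenFilterFactory factory = TokenFilterFactory.forName("CodepointCount", params);
    System.out.println(factory.getClass().getName());
  }
}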

lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java

@@ -1,5 +1,6 @@
 package org.apache.lucene.analysis.core;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.nio.CharBuffer;
@@ -11,10 +12,14 @@ import org.apache.lucene.analysis.MockCharFilter;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
+import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
/*
@@ -195,4 +200,58 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
       assertEquals("read(char[], int, int)", e.getMessage());
     }
   }
+  
+  // todo: test framework?
+  static final class SopTokenFilter extends TokenFilter {
+    SopTokenFilter(TokenStream input) {
+      super(input);
+    }
+    
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        System.out.println(input.getClass().getSimpleName() + "->" + this.reflectAsString(false));
+        return true;
+      } else {
+        return false;
+      }
+    }
+    
+    @Override
+    public void end() throws IOException {
+      super.end();
+      System.out.println(input.getClass().getSimpleName() + ".end()");
+    }
+    
+    @Override
+    public void close() throws IOException {
+      super.close();
+      System.out.println(input.getClass().getSimpleName() + ".close()");
+    }
+    
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      System.out.println(input.getClass().getSimpleName() + ".reset()");
+    }
+  }
+  
+  // LUCENE-5269
+  public void testUnicodeShinglesAndNgrams() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
+        //TokenStream stream = new SopTokenFilter(tokenizer);
+        TokenStream stream = new ShingleFilter(tokenizer, 54);
+        //stream = new SopTokenFilter(stream);
+        stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
+        //stream = new SopTokenFilter(stream);
+        return new TokenStreamComponents(tokenizer, stream);
+      }
+    };
+    checkRandomData(random(), analyzer, 10);
+  }
 }

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilter.java

@@ -0,0 +1,69 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util._TestUtil;
public class TestCodepointCountFilter extends BaseTokenStreamTestCase {
  public void testFilterWithPosIncr() throws Exception {
    TokenStream stream = new MockTokenizer(
        new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
    CodepointCountFilter filter = new CodepointCountFilter(TEST_VERSION_CURRENT, stream, 2, 6);
    assertTokenStreamContents(filter,
        new String[]{"short", "ab", "foo"},
        new int[]{1, 4, 2}
    );
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new CodepointCountFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
      }
    };
    checkOneTerm(a, "", "");
  }

  public void testRandomStrings() throws IOException {
    for (int i = 0; i < 10000; i++) {
      String text = _TestUtil.randomUnicodeString(random(), 100);
      int min = _TestUtil.nextInt(random(), 0, 100);
      int max = _TestUtil.nextInt(random(), 0, 100);
      int count = text.codePointCount(0, text.length());
      boolean expected = count >= min && count <= max;
      TokenStream stream = new KeywordTokenizer(new StringReader(text));
      stream = new CodepointCountFilter(TEST_VERSION_CURRENT, stream, min, max);
      stream.reset();
      assertEquals(expected, stream.incrementToken());
      stream.end();
      stream.close();
    }
  }
}

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilterFactory.java

@@ -0,0 +1,50 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
public class TestCodepointCountFilterFactory extends BaseTokenStreamFactoryTestCase {
  public void testPositionIncrements() throws Exception {
    Reader reader = new StringReader("foo foobar super-duper-trooper");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("CodepointCount",
        "min", "4",
        "max", "10").create(stream);
    assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("CodepointCount",
          "min", "4",
          "max", "5",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}

lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java

@@ -169,15 +169,20 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer,
-            new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer,
+              new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+        }
+      };
+      checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
+    }
   }
 
   public void testEmptyTerm() throws Exception {

lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java

@@ -96,15 +96,20 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
-    checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+          return new TokenStreamComponents(tokenizer, tokenizer);
+        }
+      };
+      checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 20);
+      checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 8192);
+    }
   }
 
   public void testTokenizerPositions() throws Exception {

lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java

@@ -144,15 +144,19 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer,
-            new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer,
+              new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+        }
+      };
+      checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+    }
   }
 
   public void testEmptyTerm() throws Exception {

lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java

@@ -107,15 +107,19 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
-    checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+          return new TokenStreamComponents(tokenizer, tokenizer);
+        }
+      };
+      checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+      checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 1027);
+    }
   }
 
   private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {