LUCENE-5269: Fix NGramTokenFilter length filtering

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1531186 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2013-10-11 03:53:42 +00:00
parent 5c9cbe9847
commit d2d435ecf5
12 changed files with 363 additions and 38 deletions

lucene/CHANGES.txt

@@ -131,6 +131,10 @@ Bug Fixes
   terms were present in the query and the high-frequent operator was set
   to SHOULD. (Simon Willnauer)
 
+* LUCENE-5269: Fix bug in NGramTokenFilter where it would sometimes count
+  unicode characters incorrectly. Adds CodepointCountFilter.
+  (Mike McCandless, Robert Muir)
+
 API Changes:
 
 * LUCENE-5222: Add SortField.needsScores(). Previously it was not possible

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java

@@ -0,0 +1,69 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
/**
* Removes words that are too long or too short from the stream.
* <p>
* Note: Length is calculated as the number of Unicode codepoints.
* </p>
*/
public final class CodepointCountFilter extends FilteringTokenFilter {
  private final int min;
  private final int max;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Create a new {@link CodepointCountFilter}. This will filter out tokens whose
   * {@link CharTermAttribute} is either too short ({@link Character#codePointCount(char[], int, int)}
   * &lt; min) or too long ({@link Character#codePointCount(char[], int, int)} &gt; max).
   * @param version the Lucene match version
   * @param in      the {@link TokenStream} to consume
   * @param min     the minimum length
   * @param max     the maximum length
   */
  public CodepointCountFilter(Version version, TokenStream in, int min, int max) {
    super(version, in);
    this.min = min;
    this.max = max;
  }

  @Override
  public boolean accept() {
    // a term of n UTF-16 chars holds at most n codepoints (all BMP) ...
    final int max32 = termAtt.length();
    // ... and at least n >> 1 (every codepoint could be a surrogate pair)
    final int min32 = max32 >> 1;
    if (min32 >= min && max32 <= max) {
      // definitely within range
      return true;
    } else if (min32 > max || max32 < min) {
      // definitely not
      return false;
    } else {
      // we must count to be sure
      int len = Character.codePointCount(termAtt.buffer(), 0, termAtt.length());
      return (len >= min && len <= max);
    }
  }
}
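The accept() fast path rests on a simple UTF-16 invariant: a term of n chars contains at most n codepoints (all BMP) and at least n >> 1 (every codepoint a surrogate pair), so most terms are accepted or rejected without an actual count. A minimal standalone sketch of that invariant (the class and string values here are illustrative, not part of the commit):

public class CodepointBoundsDemo {
  public static void main(String[] args) {
    String bmp = "abcd";                        // 4 chars, every codepoint is one char
    String astral = "\uD83D\uDE00\uD83D\uDE01"; // 4 chars, only 2 codepoints (two surrogate pairs)
    // upper bound max32: one codepoint per char
    System.out.println(bmp.codePointCount(0, bmp.length()));       // prints 4
    // lower bound min32 = length >> 1: two chars per codepoint
    System.out.println(astral.codePointCount(0, astral.length())); // prints 2
  }
}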

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilterFactory.java

@@ -0,0 +1,55 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link CodepointCountFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_length" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.CodepointCountFilterFactory" min="0" max="1" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class CodepointCountFilterFactory extends TokenFilterFactory {
  final int min;
  final int max;
  public static final String MIN_KEY = "min";
  public static final String MAX_KEY = "max";

  /** Creates a new CodepointCountFilterFactory */
  public CodepointCountFilterFactory(Map<String, String> args) {
    super(args);
    min = requireInt(args, MIN_KEY);
    max = requireInt(args, MAX_KEY);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public CodepointCountFilter create(TokenStream input) {
    return new CodepointCountFilter(luceneMatchVersion, input, min, max);
  }
}
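The factory consumes exactly the min/max args shown in its javadoc and rejects leftovers. A hedged wiring sketch for direct programmatic use (the class name, argument values, and the "4.5" match version string are illustrative, not part of the commit):

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory;
import org.apache.lucene.util.Version;

public class FactoryWiringDemo {
  public static void main(String[] args) {
    Map<String, String> params = new HashMap<String, String>();
    params.put("luceneMatchVersion", "4.5"); // consumed by the factory base class
    params.put("min", "2");
    params.put("max", "6");
    CodepointCountFilterFactory factory = new CodepointCountFilterFactory(params);
    // only terms with 2..6 codepoints survive the resulting filter
    TokenStream stream = factory.create(
        new WhitespaceTokenizer(Version.LUCENE_45, new StringReader("a ab abcdefg")));
  }
}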

lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java

@@ -21,7 +21,7 @@ import java.io.IOException
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.LengthFilter;
+import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -81,7 +81,7 @@ public final class NGramTokenFilter extends TokenFilter {
    * @param maxGram the largest n-gram to generate
    */
   public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
-    super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
+    super(new CodepointCountFilter(version, input, minGram, Integer.MAX_VALUE));
     this.version = version;
     this.charUtils = version.onOrAfter(Version.LUCENE_44)
         ? CharacterUtils.getInstance(version)
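This one-line swap is the heart of the fix: LengthFilter measures UTF-16 chars, while NGramTokenFilter (since 4.4) slices grams out of codepoints, so a term built from supplementary characters could pass the length pre-filter yet be too short to yield a single gram. A quick illustration of the mismatch (the demo class is not part of the commit):

public class CharVsCodepointDemo {
  public static void main(String[] args) {
    // one supplementary codepoint (DESERET CAPITAL LETTER LONG I) = two UTF-16 chars
    String term = "\uD801\uDC00";
    System.out.println(term.length());                          // 2: what LengthFilter counted
    System.out.println(term.codePointCount(0, term.length()));  // 1: what the n-gram code counts
  }
}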

lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory

@@ -55,6 +55,7 @@ org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
 org.apache.lucene.analysis.lv.LatvianStemFilterFactory
 org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
+org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
 org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
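With this SPI entry in place the factory also resolves by its short name ("CodepointCount", matched case-insensitively). A hedged lookup sketch, assuming the 4.x TokenFilterFactory.forName API (the class name and argument values are illustrative):

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class SpiLookupDemo {
  public static void main(String[] args) {
    Map<String, String> params = new HashMap<String, String>();
    params.put("luceneMatchVersion", "4.5"); // illustrative match version
    params.put("min", "2");
    params.put("max", "6");
    // resolves via the META-INF/services entry added above
    TokenFilterFactory factory = TokenFilterFactory.forName("CodepointCount", params);
    System.out.println(factory.getClass().getName());
  }
}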

lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java

@@ -1,5 +1,6 @@
 package org.apache.lucene.analysis.core;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.nio.CharBuffer;
@@ -11,10 +12,14 @@ import org.apache.lucene.analysis.MockCharFilter;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
+import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
/*
@@ -195,4 +200,58 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
       assertEquals("read(char[], int, int)", e.getMessage());
     }
   }
+  
+  // todo: test framework?
+  static final class SopTokenFilter extends TokenFilter {
+    SopTokenFilter(TokenStream input) {
+      super(input);
+    }
+    
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        System.out.println(input.getClass().getSimpleName() + "->" + this.reflectAsString(false));
+        return true;
+      } else {
+        return false;
+      }
+    }
+    
+    @Override
+    public void end() throws IOException {
+      super.end();
+      System.out.println(input.getClass().getSimpleName() + ".end()");
+    }
+    
+    @Override
+    public void close() throws IOException {
+      super.close();
+      System.out.println(input.getClass().getSimpleName() + ".close()");
+    }
+    
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      System.out.println(input.getClass().getSimpleName() + ".reset()");
+    }
+  }
+  
+  // LUCENE-5269
+  public void testUnicodeShinglesAndNgrams() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
+        //TokenStream stream = new SopTokenFilter(tokenizer);
+        TokenStream stream = new ShingleFilter(tokenizer, 54);
+        //stream = new SopTokenFilter(stream);
+        stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
+        //stream = new SopTokenFilter(stream);
+        return new TokenStreamComponents(tokenizer, stream);
+      }
+    };
+    checkRandomData(random(), analyzer, 10);
+  }
 }

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilter.java

@@ -0,0 +1,69 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util._TestUtil;
public class TestCodepointCountFilter extends BaseTokenStreamTestCase {
  public void testFilterWithPosIncr() throws Exception {
    TokenStream stream = new MockTokenizer(
        new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
    CodepointCountFilter filter = new CodepointCountFilter(TEST_VERSION_CURRENT, stream, 2, 6);
    assertTokenStreamContents(filter,
        new String[]{"short", "ab", "foo"},
        new int[]{1, 4, 2}
    );
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new CodepointCountFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
      }
    };
    checkOneTerm(a, "", "");
  }

  public void testRandomStrings() throws IOException {
    for (int i = 0; i < 10000; i++) {
      String text = _TestUtil.randomUnicodeString(random(), 100);
      int min = _TestUtil.nextInt(random(), 0, 100);
      int max = _TestUtil.nextInt(random(), 0, 100);
      int count = text.codePointCount(0, text.length());
      boolean expected = count >= min && count <= max;
      TokenStream stream = new KeywordTokenizer(new StringReader(text));
      stream = new CodepointCountFilter(TEST_VERSION_CURRENT, stream, min, max);
      stream.reset();
      assertEquals(expected, stream.incrementToken());
      stream.end();
      stream.close();
    }
  }
}

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilterFactory.java

@@ -0,0 +1,50 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
public class TestCodepointCountFilterFactory extends BaseTokenStreamFactoryTestCase {
  public void testPositionIncrements() throws Exception {
    Reader reader = new StringReader("foo foobar super-duper-trooper");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("CodepointCount",
        "min", "4",
        "max", "10").create(stream);
    assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("CodepointCount",
          "min", "4",
          "max", "5",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}

lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java

@@ -169,15 +169,20 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer,
-            new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer,
+              new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+        }
+      };
+      checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
+    }
   }
 
   public void testEmptyTerm() throws Exception {

lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java

@@ -96,15 +96,20 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
-    checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+          return new TokenStreamComponents(tokenizer, tokenizer);
+        }
+      };
+      checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 20);
+      checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 8192);
+    }
   }
 
   public void testTokenizerPositions() throws Exception {

lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java

@@ -144,15 +144,19 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer,
-            new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer,
+              new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+        }
+      };
+      checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+    }
   }
 
   public void testEmptyTerm() throws Exception {

lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java

@@ -107,15 +107,19 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
-    checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+          return new TokenStreamComponents(tokenizer, tokenizer);
+        }
+      };
+      checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+      checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 1027);
+    }
   }
 
   private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {