mirror of https://github.com/apache/lucene.git
- LUCENE-759: Two n-gram-producing TokenFilters (using them for the spellchecker in SOLR-81)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@513876 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 6636d88def
commit 534be1599d
CHANGES.txt
@@ -34,6 +34,11 @@ Bug fixes
     implementations to be specified via the System property
     org.apache.lucene.store.FSDirectoryLockFactoryClass. (Mike McCandless)
 
+New features
+
+ 1. LUCENE-759: Added two n-gram-producing TokenFilters.
+    (Otis Gospodnetic)
+
 Optimizations
 
 ======================= Release 2.1.0 2007-02-14 =======================
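The constructor signatures and the TokenStream.next() contract exercised by the new tests below are enough to show the filters in use. A minimal sketch (the NGramDemo class name is mine, not part of this commit; the expected outputs in the comments follow the ordering the tests assert):

package org.apache.lucene.analysis.ngram;

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class NGramDemo {
  public static void main(String[] args) throws Exception {
    // All 1- to 3-character grams of "lucene", smallest size first:
    // (l,0,1), (u,1,2), ..., (lu,0,2), ..., (ene,3,6)
    TokenStream grams = new NGramTokenFilter(
        new WhitespaceTokenizer(new StringReader("lucene")), 1, 3);
    for (Token t = grams.next(); t != null; t = grams.next()) {
      System.out.println(t);
    }

    // Leading-edge grams only: (l,0,1), (lu,0,2), (luc,0,3)
    TokenStream prefixes = new EdgeNGramTokenFilter(
        new WhitespaceTokenizer(new StringReader("lucene")),
        EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    for (Token t = prefixes.next(); t != null; t = prefixes.next()) {
      System.out.println(t);
    }
  }
}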
EdgeNGramTokenFilterTest.java (new file)
@@ -0,0 +1,119 @@
package org.apache.lucene.analysis.ngram;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

import java.io.StringReader;

import junit.framework.TestCase;

/**
 * Tests {@link EdgeNGramTokenFilter} for correctness.
 * @author Otis Gospodnetic
 */
public class EdgeNGramTokenFilterTest extends TestCase {
  private TokenStream input;

  public void setUp() {
    // a single whitespace-delimited token, "abcde", to slice into edge n-grams
    input = new WhitespaceTokenizer(new StringReader("abcde"));
  }

  public void testInvalidInput() throws Exception {
    // minGram and maxGram of 0 must be rejected
    boolean gotException = false;
    try {
      new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 0, 0);
    } catch (IllegalArgumentException e) {
      gotException = true;
    }
    assertTrue(gotException);
  }

  public void testInvalidInput2() throws Exception {
    // minGram greater than maxGram must be rejected
    boolean gotException = false;
    try {
      new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 2, 1);
    } catch (IllegalArgumentException e) {
      gotException = true;
    }
    assertTrue(gotException);
  }

  public void testInvalidInput3() throws Exception {
    // negative minGram must be rejected
    boolean gotException = false;
    try {
      new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, -1, 2);
    } catch (IllegalArgumentException e) {
      gotException = true;
    }
    assertTrue(gotException);
  }

  public void testFrontUnigram() throws Exception {
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
    Token token = filter.next();
    assertEquals("(a,0,1)", token.toString());
    token = filter.next();
    assertNull(token);
  }

  public void testBackUnigram() throws Exception {
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
    Token token = filter.next();
    assertEquals("(e,4,5)", token.toString());
    token = filter.next();
    assertNull(token);
  }

  public void testOversizedNgrams() throws Exception {
    // grams longer than the input token produce no output
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
    assertNull(filter.next());
  }

  public void testFrontRangeOfNgrams() throws Exception {
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    Token token = filter.next();
    assertEquals("(a,0,1)", token.toString());
    token = filter.next();
    assertEquals("(ab,0,2)", token.toString());
    token = filter.next();
    assertEquals("(abc,0,3)", token.toString());
    token = filter.next();
    assertNull(token);
  }

  public void testBackRangeOfNgrams() throws Exception {
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
    Token token = filter.next();
    assertEquals("(e,4,5)", token.toString());
    token = filter.next();
    assertEquals("(de,3,5)", token.toString());
    token = filter.next();
    assertEquals("(cde,2,5)", token.toString());
    token = filter.next();
    assertNull(token);
  }
}
NGramTokenFilterTest.java (new file)
@@ -0,0 +1,139 @@
package org.apache.lucene.analysis.ngram;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

import java.io.StringReader;
import java.util.ArrayList;

import junit.framework.TestCase;

/**
 * Tests {@link NGramTokenFilter} for correctness.
 * @author Otis Gospodnetic
 */
public class NGramTokenFilterTest extends TestCase {
  private TokenStream input;
  private ArrayList tokens = new ArrayList();

  public void setUp() {
    // a single whitespace-delimited token, "abcde", to slice into n-grams
    input = new WhitespaceTokenizer(new StringReader("abcde"));
  }

  public void testInvalidInput() throws Exception {
    // minGram greater than maxGram must be rejected
    boolean gotException = false;
    try {
      new NGramTokenFilter(input, 2, 1);
    } catch (IllegalArgumentException e) {
      gotException = true;
    }
    assertTrue(gotException);
  }

  public void testInvalidInput2() throws Exception {
    // minGram of 0 must be rejected
    boolean gotException = false;
    try {
      new NGramTokenFilter(input, 0, 1);
    } catch (IllegalArgumentException e) {
      gotException = true;
    }
    assertTrue(gotException);
  }

  public void testUnigrams() throws Exception {
    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);

    // drain the filter, collecting the string form of each token
    for (Token token = filter.next(); token != null; token = filter.next()) {
      tokens.add(token.toString());
    }

    assertEquals(5, tokens.size());
    ArrayList exp = new ArrayList();
    exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
    assertEquals(exp, tokens);
  }

  public void testBigrams() throws Exception {
    NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);

    for (Token token = filter.next(); token != null; token = filter.next()) {
      tokens.add(token.toString());
    }

    assertEquals(4, tokens.size());
    ArrayList exp = new ArrayList();
    exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
    assertEquals(exp, tokens);
  }

  public void testNgrams() throws Exception {
    NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);

    for (Token token = filter.next(); token != null; token = filter.next()) {
      tokens.add(token.toString());
    }

    // all grams of one size are emitted before the next size up
    assertEquals(12, tokens.size());
    ArrayList exp = new ArrayList();
    exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
    exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
    exp.add("(abc,0,3)"); exp.add("(bcd,1,4)"); exp.add("(cde,2,5)");
    assertEquals(exp, tokens);
  }

  public void testOversizedNgrams() throws Exception {
    // grams longer than the input token produce no output
    NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);

    for (Token token = filter.next(); token != null; token = filter.next()) {
      tokens.add(token.toString());
    }

    assertTrue(tokens.isEmpty());
  }
}
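On the SOLR-81 spellchecker use named in the commit message: the natural wiring is an Analyzer whose chain ends in one of these filters, so every indexed word is also indexed by its grams and misspellings that share most grams with a dictionary word still match. A hypothetical sketch (NGramAnalyzer and the 2/3 gram sizes are illustrative, not part of this commit), assuming the Lucene 2.x Analyzer.tokenStream(String, Reader) API:

package org.apache.lucene.analysis.ngram;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class NGramAnalyzer extends Analyzer {
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // index each whitespace-delimited word by its 2- and 3-character grams
    return new NGramTokenFilter(new WhitespaceTokenizer(reader), 2, 3);
  }
}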