mirror of https://github.com/apache/lucene.git

LUCENE-2413: consolidate WDF into contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940781 13f79535-47bb-0310-9956-ffa450edef68

parent a9ef636cb1
commit 9238c5f5ca
CHANGES.txt

@@ -161,6 +161,8 @@ New features
    and phrases.
 - o.a.l.analysis.charfilter.HTMLStripCharFilter: CharFilter that strips HTML
    constructs.
+- o.a.l.analysis.miscellaneous.WordDelimiterFilter: TokenFilter that splits words
+   into subwords and performs optional transformations on subword groups.
 (... in progress)
 
 Build
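A quick illustration of the new WordDelimiterFilter entry above (a sketch, not code from this commit): one token is run through the filter using the five-flag constructor that appears in doSplit() further down. The flag order (generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll) is an assumption for illustration; only the call shape is confirmed by this diff.

    import java.io.StringReader;

    import org.apache.lucene.analysis.KeywordTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class WdfSplitDemo {
      public static void main(String[] args) throws Exception {
        // KeywordTokenizer emits the whole input as a single token; the filter
        // then splits it into subwords at the intra-word delimiter.
        TokenStream ts = new WordDelimiterFilter(
            new KeywordTokenizer(new StringReader("Wi-Fi")), 1, 1, 0, 0, 0);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term); // expected: "Wi", then "Fi"
        }
        ts.close();
      }
    }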
WordDelimiterFilter.java

@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -58,10 +58,9 @@ import java.io.IOException;
  * in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
  * delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
  *
- * @version $Id$
  */
-final class WordDelimiterFilter extends TokenFilter {
+public final class WordDelimiterFilter extends TokenFilter {
 
   public static final int LOWER = 0x01;
   public static final int UPPER = 0x02;
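A hedged sketch of the chaining the javadoc above recommends: WordDelimiterFilter after WhitespaceTokenizer, so the intra-word delimiters survive tokenization (StandardTokenizer would already have removed many of them). Version.LUCENE_CURRENT and the five-flag constructor are assumptions for illustration; the tests below use TEST_VERSION_CURRENT and the longer constructor.

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class WdfChainDemo {
      public static void main(String[] args) throws Exception {
        // WhitespaceTokenizer leaves "SD500-IS" intact, so the filter still
        // sees the hyphen it is meant to split on.
        TokenStream ts = new WordDelimiterFilter(
            new WhitespaceTokenizer(Version.LUCENE_CURRENT,
                new StringReader("PowerShot SD500-IS")),
            1, 1, 0, 0, 0);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term); // subword tokens; exact splits depend on flag defaults
        }
        ts.close();
      }
    }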
WordDelimiterIterator.java

@@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,7 @@ package org.apache.solr.analysis;
  * limitations under the License.
  */
 
-import static org.apache.solr.analysis.WordDelimiterFilter.*;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
 
 /**
  * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
TestWordDelimiterFilter.java

@@ -15,9 +15,10 @@
  * limitations under the License.
  */
 
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.StopFilter;
@@ -29,12 +30,8 @@ import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.solr.SolrTestCaseJ4;
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import static org.apache.solr.analysis.BaseTokenTestCase.*;
-
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@@ -43,96 +40,10 @@ import java.util.HashSet;
 
 /**
  * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
  * TODO: should explicitly test things like protWords and not rely on
  * the factory tests in Solr.
  */
-public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
-
-  @BeforeClass
-  public static void beforeClass() throws Exception {
-    initCore("solrconfig.xml","schema.xml");
-  }
-
-  public void posTst(String v1, String v2, String s1, String s2) {
-    assertU(adoc("id", "42",
-        "subword", v1,
-        "subword", v2));
-    assertU(commit());
-
-    // there is a positionIncrementGap of 100 between field values, so
-    // we test if that was maintained.
-    assertQ("position increment lost",
-        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
-        ,"//result[@numFound=0]"
-    );
-    assertQ("position increment lost",
-        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
-        ,"//result[@numFound=1]"
-    );
-    clearIndex();
-  }
-
-  @Test
-  public void testRetainPositionIncrement() {
-    posTst("foo","bar","foo","bar");
-    posTst("-foo-","-bar-","foo","bar");
-    posTst("foo","bar","-foo-","-bar-");
-
-    posTst("123","456","123","456");
-    posTst("/123/","/456/","123","456");
-
-    posTst("/123/abc","qwe/456/","abc","qwe");
-
-    posTst("zoo-foo","bar-baz","foo","bar");
-    posTst("zoo-foo-123","456-bar-baz","foo","bar");
-  }
-
-  @Test
-  public void testNoGenerationEdgeCase() {
-    assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
-    clearIndex();
-  }
-
-  @Test
-  public void testIgnoreCaseChange() {
-
-    assertU(adoc("id", "43",
-        "wdf_nocase", "HellO WilliAM",
-        "subword", "GoodBye JonEs"));
-    assertU(commit());
-
-    assertQ("no case change",
-        req("wdf_nocase:(hell o am)")
-        ,"//result[@numFound=0]"
-    );
-    assertQ("case change",
-        req("subword:(good jon)")
-        ,"//result[@numFound=1]"
-    );
-    clearIndex();
-  }
-
-  @Test
-  public void testPreserveOrignalTrue() {
-
-    assertU(adoc("id", "144",
-        "wdf_preserve", "404-123"));
-    assertU(commit());
-
-    assertQ("preserving original word",
-        req("wdf_preserve:404")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("preserving original word",
-        req("wdf_preserve:123")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("preserving original word",
-        req("wdf_preserve:404-123*")
-        ,"//result[@numFound=1]"
-    );
-    clearIndex();
-  }
+public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
 
   /***
   public void testPerformance() throws IOException {
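The Solr assertU/assertQ tests removed above are replaced by direct token-stream assertions against BaseTokenStreamTestCase. A minimal sketch of that style (the method name is hypothetical; assertTokenStreamContents is the inherited helper the surviving doSplit() code below feeds into):

      public void testSplitSketch() throws Exception {
        // KeywordTokenizer + the five-flag constructor, as in doSplit() below.
        WordDelimiterFilter wdf = new WordDelimiterFilter(
            new KeywordTokenizer(new StringReader("foo-bar")), 1, 1, 0, 0, 0);
        assertTokenStreamContents(wdf, new String[] { "foo", "bar" });
      }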
@@ -232,59 +143,6 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
         new int[] { 11, 15, 15 });
   }
-
-  @Test
-  public void testAlphaNumericWords(){
-    assertU(adoc("id", "68","numericsubword","Java/J2SE"));
-    assertU(commit());
-
-    assertQ("j2se found",
-        req("numericsubword:(J2SE)")
-        ,"//result[@numFound=1]"
-    );
-    assertQ("no j2 or se",
-        req("numericsubword:(J2 OR SE)")
-        ,"//result[@numFound=0]"
-    );
-    clearIndex();
-  }
-
-  @Test
-  public void testProtectedWords(){
-    assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
-    assertU(commit());
-
-    assertQ("java found",
-        req("protectedsubword:(java)")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ(".net found",
-        req("protectedsubword:(.net)")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("c# found",
-        req("protectedsubword:(c#)")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("c++ found",
-        req("protectedsubword:(c++)")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("c found?",
-        req("protectedsubword:c")
-        ,"//result[@numFound=0]"
-    );
-    assertQ("net found?",
-        req("protectedsubword:net")
-        ,"//result[@numFound=0]"
-    );
-    clearIndex();
-  }
-
 
   public void doSplit(final String input, String... output) throws Exception {
     WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
         new StringReader(input)), 1, 1, 0, 0, 0);
@@ -368,13 +226,13 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
 
   @Test
   public void testPositionIncrements() throws Exception {
-    final CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, new HashSet<String>(Arrays.asList("NUTCH")), false);
+    final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("NUTCH")), false);
 
     /* analyzer that uses whitespace + wdf */
     Analyzer a = new Analyzer() {
       public TokenStream tokenStream(String field, Reader reader) {
         return new WordDelimiterFilter(
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader),
+            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
             1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
       }
     };
@@ -401,7 +259,7 @@
       public TokenStream tokenStream(String field, Reader reader) {
         return new WordDelimiterFilter(
             new LargePosIncTokenFilter(
-                new WhitespaceTokenizer(DEFAULT_VERSION, reader)),
+                new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader)),
             1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
       }
     };
@@ -431,8 +289,8 @@
 
     Analyzer a3 = new Analyzer() {
       public TokenStream tokenStream(String field, Reader reader) {
-        StopFilter filter = new StopFilter(DEFAULT_VERSION,
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader), StandardAnalyzer.STOP_WORDS_SET);
+        StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
+            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET);
         filter.setEnablePositionIncrements(true);
         return new WordDelimiterFilter(filter,
             1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
WordDelimiterFilterFactory.java

@@ -18,6 +18,7 @@
 package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.solr.common.ResourceLoader;
TestWordDelimiterFilterFactory.java (new file)

@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
+ */
+public class TestWordDelimiterFilterFactory extends SolrTestCaseJ4 {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig.xml","schema.xml");
+  }
+
+  public void posTst(String v1, String v2, String s1, String s2) {
+    assertU(adoc("id", "42",
+        "subword", v1,
+        "subword", v2));
+    assertU(commit());
+
+    // there is a positionIncrementGap of 100 between field values, so
+    // we test if that was maintained.
+    assertQ("position increment lost",
+        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
+        ,"//result[@numFound=0]"
+    );
+    assertQ("position increment lost",
+        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
+        ,"//result[@numFound=1]"
+    );
+    clearIndex();
+  }
+
+  @Test
+  public void testRetainPositionIncrement() {
+    posTst("foo","bar","foo","bar");
+    posTst("-foo-","-bar-","foo","bar");
+    posTst("foo","bar","-foo-","-bar-");
+
+    posTst("123","456","123","456");
+    posTst("/123/","/456/","123","456");
+
+    posTst("/123/abc","qwe/456/","abc","qwe");
+
+    posTst("zoo-foo","bar-baz","foo","bar");
+    posTst("zoo-foo-123","456-bar-baz","foo","bar");
+  }
+
+  @Test
+  public void testNoGenerationEdgeCase() {
+    assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
+    clearIndex();
+  }
+
+  @Test
+  public void testIgnoreCaseChange() {
+
+    assertU(adoc("id", "43",
+        "wdf_nocase", "HellO WilliAM",
+        "subword", "GoodBye JonEs"));
+    assertU(commit());
+
+    assertQ("no case change",
+        req("wdf_nocase:(hell o am)")
+        ,"//result[@numFound=0]"
+    );
+    assertQ("case change",
+        req("subword:(good jon)")
+        ,"//result[@numFound=1]"
+    );
+    clearIndex();
+  }
+
+  @Test
+  public void testPreserveOrignalTrue() {
+
+    assertU(adoc("id", "144",
+        "wdf_preserve", "404-123"));
+    assertU(commit());
+
+    assertQ("preserving original word",
+        req("wdf_preserve:404")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("preserving original word",
+        req("wdf_preserve:123")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("preserving original word",
+        req("wdf_preserve:404-123*")
+        ,"//result[@numFound=1]"
+    );
+    clearIndex();
+  }
+
+  /***
+  public void testPerformance() throws IOException {
+    String s = "now is the time-for all good men to come to-the aid of their country.";
+    Token tok = new Token();
+    long start = System.currentTimeMillis();
+    int ret=0;
+    for (int i=0; i<1000000; i++) {
+      StringReader r = new StringReader(s);
+      TokenStream ts = new WhitespaceTokenizer(r);
+      ts = new WordDelimiterFilter(ts, 1,1,1,1,0);
+
+      while (ts.next(tok) != null) ret++;
+    }
+
+    System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
+  }
+  ***/
+
+  @Test
+  public void testAlphaNumericWords(){
+    assertU(adoc("id", "68","numericsubword","Java/J2SE"));
+    assertU(commit());
+
+    assertQ("j2se found",
+        req("numericsubword:(J2SE)")
+        ,"//result[@numFound=1]"
+    );
+    assertQ("no j2 or se",
+        req("numericsubword:(J2 OR SE)")
+        ,"//result[@numFound=0]"
+    );
+    clearIndex();
+  }
+
+  @Test
+  public void testProtectedWords(){
+    assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
+    assertU(commit());
+
+    assertQ("java found",
+        req("protectedsubword:(java)")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ(".net found",
+        req("protectedsubword:(.net)")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("c# found",
+        req("protectedsubword:(c#)")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("c++ found",
+        req("protectedsubword:(c++)")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("c found?",
+        req("protectedsubword:c")
+        ,"//result[@numFound=0]"
+    );
+    assertQ("net found?",
+        req("protectedsubword:net")
+        ,"//result[@numFound=0]"
+    );
+    clearIndex();
+  }
+}