LUCENE-2413: consolidate commongrams into contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940761 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-05-04 07:50:41 +00:00
parent 435cc41758
commit 1d1331fa03
7 changed files with 48 additions and 16 deletions

View File

@@ -155,6 +155,12 @@ New features
of AttributeSource.cloneAttributes() instances and the new copyTo() method.
(Steven Rowe via Uwe Schindler)
* LUCENE-2413: Consolidated Solr analysis components into contrib/analyzers.
  New features from Solr now available to Lucene users include:
  - o.a.l.analysis.commongrams: Constructs n-grams for frequently occurring terms
    and phrases.
  (... in progress)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
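The o.a.l.analysis.commongrams entry above is the only prose description of the relocated module in this commit. A minimal index-side sketch, assuming a placeholder common-word list and Version.LUCENE_CURRENT as the matchVersion; the two-argument CommonGramsFilter constructor is the one exercised by the test changes further down:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CommonGramsSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical common-word list; real deployments would use a much larger one.
    String[] commonWords = { "the", "of" };
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader("the quick fox"));
    // CommonGramsFilter keeps the original unigrams and additionally emits a
    // bigram wherever a token is adjacent to a common word, joined with '_',
    // e.g. "the", "the_quick", "quick", "fox".
    ts = new CommonGramsFilter(ts, commonWords);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}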

View File

@@ -7,7 +7,7 @@
* See the License for the specific language governing permissions and limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.commongrams;
import java.io.IOException;
import java.util.Arrays;

View File

@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.commongrams;
import java.io.IOException;
@@ -22,7 +22,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import static org.apache.solr.analysis.CommonGramsFilter.GRAM_TYPE;
import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
/**
* Wrap a CommonGramsFilter optimizing phrase queries by only returning single
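The javadoc above (cut off by the diff context) describes CommonGramsQueryFilter's query-time role. A hedged sketch of a query-side analyzer, modeled on the anonymous Analyzer used later in CommonGramsFilterTest; the word list and Version constant are placeholders:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.lucene.util.Version;

public class CommonGramsQuerySketch {
  // Hypothetical list; it should match whatever the index-side chain used.
  private static final String[] COMMON_WORDS = { "the", "of" };

  // CommonGramsQueryFilter keeps a common-word bigram wherever one was formed
  // and drops the single words it covers, so a query like "the dog" analyzes
  // to the single token "the_dog" (matching the expectations in this commit's test).
  static final Analyzer QUERY_ANALYZER = new Analyzer() {
    @Override
    public TokenStream tokenStream(String field, Reader in) {
      return new CommonGramsQueryFilter(new CommonGramsFilter(
          new WhitespaceTokenizer(Version.LUCENE_CURRENT, in), COMMON_WORDS));
    }
  };
}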

View File

@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Construct n-grams for frequently occurring terms and phrases.
</body>
</html>
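The package description above is a single sentence; as a rough illustration of the alternate constructor the package also exposes, here is a sketch mirroring the testCaseSensitive setup further down (word list, input, and Version constant are placeholders):

import java.io.StringReader;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.util.Version;

public class CommonGramsCaseSketch {
  public static void main(String[] args) throws Exception {
    String[] commonWords = { "the", "of" };
    // makeCommonSet builds the lookup set used by the filter; the trailing
    // boolean of the constructor is the ignoreCase flag exercised by
    // testCaseSensitive (false here, so "The" is not matched as "the").
    Set common = CommonGramsFilter.makeCommonSet(commonWords);
    WhitespaceTokenizer wt = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader("The dog the cat"));
    TokenFilter cgf = new CommonGramsFilter(wt, common, false);
  }
}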

View File

@@ -14,28 +14,29 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.commongrams;
import java.io.Reader;
import java.io.StringReader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* Tests CommonGramsQueryFilter
* Tests CommonGrams(Query)Filter
*/
public class CommonGramsFilterTest extends BaseTokenTestCase {
public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
"of" };
public void testReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
@@ -56,7 +57,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
public void testQueryReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
@@ -88,7 +89,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsQueryFilter(new CommonGramsFilter(
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords));
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords));
}
};
@@ -157,7 +158,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsFilter(
new WhitespaceTokenizer(DEFAULT_VERSION, in), commonWords);
new WhitespaceTokenizer(TEST_VERSION_CURRENT, in), commonWords);
}
};
@@ -243,7 +244,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testCaseSensitive() throws Exception {
final String input = "How The s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
Set common = CommonGramsFilter.makeCommonSet(commonWords);
TokenFilter cgf = new CommonGramsFilter(wt, common, false);
assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
@@ -256,7 +257,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testLastWordisStopWord() throws Exception {
final String input = "dog the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "dog_the" });
@@ -267,7 +268,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testFirstWordisStopWord() throws Exception {
final String input = "the dog";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_dog" });
@@ -278,7 +279,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testOneWordQueryStopWord() throws Exception {
final String input = "the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the" });
@@ -289,7 +290,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void testOneWordQuery() throws Exception {
final String input = "monster";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "monster" });
@@ -300,7 +301,7 @@ public class CommonGramsFilterTest extends BaseTokenTestCase {
*/
public void TestFirstAndLastStopWord() throws Exception {
final String input = "the of";
WhitespaceTokenizer wt = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_of" });

View File

@@ -22,6 +22,7 @@ import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;

View File

@@ -23,6 +23,8 @@ import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;