LUCENE-2413: Consolidate KeepWords, HyphenatedWords, Trim filters to contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940962 13f79535-47bb-0310-9956-ffa450edef68
2010-05-04 17:07:28 +00:00 · 2010-05-04 17:07:28 +00:00 · 27502aa045
parent 1fdaf68d0d
commit 27502aa045
12 changed files with 127 additions and 110 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -57,6 +57,12 @@ New features
     into subwords and performs optional transformations on subword groups.
   - o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which 
     filters out Tokens at the same position and Term text as the previous token.
+   - o.a.l.analysis.miscellaneous.TrimFilter: Trims leading and trailing whitespace 
+     from Tokens in the stream.
+   - o.a.l.analysis.miscellaneous.KeepWordFilter: A TokenFilter that only keeps tokens 
+     with text contained in the required words (inverse of StopFilter).
+   - o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that rejoins 
+     hyphenated words that were split across two lines.
   - o.a.l.analysis.pattern: Package for pattern-based analysis, containing a 
     CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
   (... in progress)
--- a/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java
@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;

 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
--- a/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
@ -15,7 +15,7 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@ -29,14 +29,13 @@ import java.util.Set;
 * A TokenFilter that only keeps tokens with text contained in the
 * required words.  This filter behaves like the inverse of StopFilter.
 * 
- * @version $Id$
 * @since solr 1.3
 */
 public final class KeepWordFilter extends TokenFilter {
  private final CharArraySet words;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

-  /** @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead */
+  /** @deprecated Use {@link #KeepWordFilter(TokenStream, CharArraySet)} instead */
  @Deprecated
  public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase ) {
    this(in, new CharArraySet(words, ignoreCase));
--- a/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java
@ -15,7 +15,7 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@ -26,8 +26,6 @@ import java.io.IOException;

 /**
 * Trims leading and trailing whitespace from Tokens in the stream.
- *
- * @version $Id:$
 */
 public final class TrimFilter extends TokenFilter {

--- a/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java
+++ b/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java
@ -15,23 +15,23 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;

 import java.io.StringReader;

+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;

 /**
 * HyphenatedWordsFilter test
 */
-public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
+public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
 	public void testHyphenatedWords() throws Exception {
 		String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
 		// first test
-		TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
-		HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
-		ts = factory.create(ts);
+		TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+		ts = new HyphenatedWordsFilter(ts);
 		assertTokenStreamContents(ts, 
 		    new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
 	}
@ -42,9 +42,8 @@ public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
 	public void testHyphenAtEnd() throws Exception {
 	    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
 	    // first test
-	    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input));
-	    HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
-	    ts = factory.create(ts);
+	    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+	    ts = new HyphenatedWordsFilter(ts);
 	    assertTokenStreamContents(ts, 
 	        new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
 	  }
--- a/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
+++ b/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/** Test {@link KeepWordFilter} */
+public class TestKeepWordFilter extends BaseTokenStreamTestCase {
+  
+  public void testStopAndGo() throws Exception 
+  {  
+    Set<String> words = new HashSet<String>();
+    words.add( "aaa" );
+    words.add( "bbb" );
+    
+    String input = "aaa BBB ccc ddd EEE";
+    
+    // Test keep words with ignoreCase=true
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new KeepWordFilter(stream, words, true);
+    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
+       
+    // Now match case-sensitively (ignoreCase=false)
+    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new KeepWordFilter(stream, words, false);
+    assertTokenStreamContents(stream, new String[] { "aaa" });
+  }
+}
--- a/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
+++ b/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
@ -15,13 +15,12 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;

 import java.io.IOException;
 import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;

+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
@ -34,7 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 /**
 * @version $Id:$
 */
-public class TestTrimFilter extends BaseTokenTestCase {
+public class TestTrimFilter extends BaseTokenStreamTestCase {

  public void testTrim() throws Exception {
    char[] a = " a ".toCharArray();
@ -42,15 +41,13 @@ public class TestTrimFilter extends BaseTokenTestCase {
    char[] ccc = "cCc".toCharArray();
    char[] whitespace = "   ".toCharArray();
    char[] empty = "".toCharArray();
-    TrimFilterFactory factory = new TrimFilterFactory();
-    Map<String,String> args = new HashMap<String,String>();
-    args.put("updateOffsets", "false");
-    factory.init(args);
-    TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
+
+    TokenStream ts = new IterTokenStream(new Token(a, 0, a.length, 1, 5),
                    new Token(b, 0, b.length, 6, 10),
                    new Token(ccc, 0, ccc.length, 11, 15),
                    new Token(whitespace, 0, whitespace.length, 16, 20),
-                    new Token(empty, 0, empty.length, 21, 21)));
+                    new Token(empty, 0, empty.length, 21, 21));
+    ts = new TrimFilter(ts, false);

    assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});

@ -58,15 +55,12 @@ public class TestTrimFilter extends BaseTokenTestCase {
    b = "b ".toCharArray();
    ccc = " c ".toCharArray();
    whitespace = "   ".toCharArray();
-    factory = new TrimFilterFactory();
-    args = new HashMap<String,String>();
-    args.put("updateOffsets", "true");
-    factory.init(args);
-    ts = factory.create(new IterTokenStream(
+    ts = new IterTokenStream(
            new Token(a, 0, a.length, 0, 2),
            new Token(b, 0, b.length, 0, 2),
            new Token(ccc, 0, ccc.length, 0, 3),
-            new Token(whitespace, 0, whitespace.length, 0, 3)));
+            new Token(whitespace, 0, whitespace.length, 0, 3));
+    ts = new TrimFilter(ts, true);
    
    assertTokenStreamContents(ts, 
        new String[] { "a", "b", "c", "" },
--- a/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java
@ -18,6 +18,7 @@ package org.apache.solr.analysis;
 */

 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
 import org.apache.solr.analysis.BaseTokenFilterFactory;

 /**
--- a/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java
@ -21,6 +21,7 @@ import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;

 import java.util.Set;
 import java.io.IOException;
--- a/solr/src/java/org/apache/solr/analysis/TrimFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/TrimFilterFactory.java
@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 import java.util.Map;

 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.TrimFilter;
 import org.apache.solr.common.SolrException;

 /**
--- a/solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java
+++ b/solr/src/test/org/apache/solr/analysis/TestKeepWordFilter.java
@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.analysis;
-
-import java.io.StringReader;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.core.SolrResourceLoader;
-
-
-/**
- * @version $Id$
- */
-public class TestKeepWordFilter extends BaseTokenTestCase {
-  
-  public void testStopAndGo() throws Exception 
-  {  
-    Set<String> words = new HashSet<String>();
-    words.add( "aaa" );
-    words.add( "bbb" );
-    
-    String input = "aaa BBB ccc ddd EEE";
-    Map<String,String> args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
-    ResourceLoader loader = new SolrResourceLoader(null, null);
-    
-    // Test Stopwords
-    KeepWordFilterFactory factory = new KeepWordFilterFactory();
-    args.put( "ignoreCase", "true" );
-    factory.init( args );
-    factory.inform( loader );
-    factory.setWords( words );
-    assertTrue(factory.isIgnoreCase());
-    TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
-    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
-    
-    // Test Stopwords (ignoreCase via the setter instead)
-    factory = new KeepWordFilterFactory();
-    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
-    factory.init( args );
-    factory.inform( loader );
-    factory.setIgnoreCase(true);
-    factory.setWords( words );
-    assertTrue(factory.isIgnoreCase());
-    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
-    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
-    
-    // Now force case
-    factory = new KeepWordFilterFactory();
-    args = new HashMap<String, String>(DEFAULT_VERSION_PARAM);
-    args.put( "ignoreCase", "false" );
-    factory.init( args );
-    factory.inform( loader );
-    factory.setWords( words );    
-    assertFalse(factory.isIgnoreCase());
-    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
-    assertTokenStreamContents(stream, new String[] { "aaa" });
-  }
-}
--- a/solr/src/test/org/apache/solr/analysis/TestTrimFilterFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestTrimFilterFactory.java
@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+/**
+ * Simple tests to ensure this factory is working
+ */
+public class TestTrimFilterFactory extends BaseTokenTestCase {
+  public void testTrimming() throws Exception {
+    TrimFilterFactory factory = new TrimFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("updateOffsets", "false");
+    factory.init(args);
+    TokenStream ts = factory.create(new KeywordTokenizer(new StringReader("trim me    ")));
+    assertTokenStreamContents(ts, new String[] { "trim me" });
+  }
+}