LUCENE-1058: Added new buffering tokenizer/token filter for reducing redundant analysis

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@599478 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2007-11-29 15:18:08 +00:00
parent 12d0cc180e
commit 261382fd4e
4 changed files with 258 additions and 0 deletions

CHANGES.txt View File

@@ -524,6 +524,12 @@ New features
8. LUCENE-446: Added Solr's search.function for scores based on field
values, plus CustomScoreQuery for simple score (post) customization.
(Yonik Seeley, Doron Cohen)
9. LUCENE-1058: Added new TeeTokenFilter (like the UNIX 'tee' command) and SinkTokenizer, which can be used to share tokens
between two or more Fields so that the other Fields do not have to repeat the whole analysis process. For instance, if two
Fields share all the same analysis steps except that one lowercases tokens and the other does not, you can coordinate the
two using TeeTokenFilter and SinkTokenizer. See TeeSinkTokenTest.java, and the sketch below, for examples.
(Grant Ingersoll, Michael Busch, Yonik Seeley)
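
A minimal sketch (not part of this commit) of the lowercasing scenario described in this entry: the text is tokenized once, the TeeTokenFilter copies each token into the SinkTokenizer, and the second Field replays the buffered tokens through a LowerCaseFilter instead of re-analyzing the text. The class name, field names, and sample text are illustrative; the sketch sits in the org.apache.lucene.analysis package because the TeeTokenFilter constructor is protected in this change, and it assumes the source field is consumed before the sink field, the same ordering used in TeeTokenFilter's own javadoc example.

package org.apache.lucene.analysis;

import java.io.StringReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

// Illustrative only: share one tokenization pass between an original-case
// field and a lowercased field.
public class TeeSinkSketch {
  public static Document twoFieldsOneTokenization(String text) {
    // The sink buffers a copy of every token the tee sees.
    SinkTokenizer sink = new SinkTokenizer(null);
    TokenStream source = new TeeTokenFilter(
        new WhitespaceTokenizer(new StringReader(text)), sink);

    Document doc = new Document();
    // Added first so its tokens are tee'd into the sink when it is consumed.
    doc.add(new Field("exact", source));
    // Replays the buffered tokens, lowercased, without re-tokenizing the text.
    doc.add(new Field("lowered", new LowerCaseFilter(sink)));
    return doc;
  }
}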
Optimizations

src/java/org/apache/lucene/analysis/SinkTokenizer.java View File

@@ -0,0 +1,60 @@
package org.apache.lucene.analysis;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * A SinkTokenizer can be used to cache Tokens for use in an Analyzer.
 *
 * @see TeeTokenFilter
 **/
public class SinkTokenizer extends Tokenizer {
  protected List/*<Token>*/ lst = new ArrayList/*<Token>*/();
  protected Iterator/*<Token>*/ iter;

  public SinkTokenizer(List/*<Token>*/ input) {
    this.lst = input;
    if (this.lst == null) this.lst = new ArrayList/*<Token>*/();
  }

  /**
   * Only valid if the tokens have not yet been consumed,
   * i.e. if this tokenizer is not part of another TokenStream.
   *
   * @return A List of the {@link org.apache.lucene.analysis.Token}s cached so far
   */
  public List/*<Token>*/ getTokens() {
    return lst;
  }

  /**
   * Returns the next cached token; the input <code>result</code> Token is ignored.
   *
   * @param result Ignored
   * @return The next {@link org.apache.lucene.analysis.Token} in the Sink, or null when the cache is exhausted
   * @throws IOException
   */
  public Token next(Token result) throws IOException {
    if (iter == null) iter = lst.iterator();
    return iter.hasNext() ? (Token) iter.next() : null;
  }

  /**
   * Adds a clone of the Token to the cache. Override this method to cache only
   * certain tokens, or new tokens based on the old tokens.
   *
   * @param t The {@link org.apache.lucene.analysis.Token} to add to the sink
   */
  public void add(Token t) {
    if (t == null) return;
    lst.add((Token) t.clone());
  }

  /** Resets the iterator so the cached tokens can be consumed again. */
  public void reset() throws IOException {
    iter = lst.iterator();
  }
}
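
As the add() javadoc above suggests, a subclass can filter or transform what ends up in the sink. A minimal sketch (not part of this commit; the class name is illustrative) that caches only tokens starting with an upper-case letter, e.g. to feed a separate proper-noun field:

package org.apache.lucene.analysis;

import java.util.List;

// Illustrative only: keep just the capitalized tokens that flow past the tee.
public class CapitalizedSinkTokenizer extends SinkTokenizer {

  public CapitalizedSinkTokenizer(List/*<Token>*/ input) {
    super(input);
  }

  public void add(Token t) {
    if (t != null && t.termText().length() > 0
        && Character.isUpperCase(t.termText().charAt(0))) {
      super.add(t); // super.add() clones the token before caching it
    }
  }
}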

src/java/org/apache/lucene/analysis/TeeTokenFilter.java View File

@@ -0,0 +1,54 @@
package org.apache.lucene.analysis;

import java.io.IOException;

/**
 * Works in conjunction with the SinkTokenizer to provide the ability to set aside tokens
 * that have already been analyzed. This is useful in situations where multiple fields share
 * many common analysis steps and then go their separate ways.
 * <p/>
 * It is also useful for doing things like entity extraction or proper noun analysis as
 * part of the analysis workflow and saving off those tokens for use in another field.
 *
 * <pre>
SinkTokenizer sink1 = new SinkTokenizer(null);
SinkTokenizer sink2 = new SinkTokenizer(null);
TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);
TokenStream final1 = new LowerCaseFilter(source1);
TokenStream final2 = source2;
TokenStream final3 = new EntityDetect(sink1);
TokenStream final4 = new URLDetect(sink2);
d.add(new Field("f1", final1));
d.add(new Field("f2", final2));
d.add(new Field("f3", final3));
d.add(new Field("f4", final4));
 * </pre>
 * In this example, sink1 and sink2 each receive the tokens from both reader1 and reader2 after whitespace
 * tokenization. Any of these streams can then be wrapped in further analysis, and more "sources" can be
 * inserted if desired. Note that the EntityDetect and URLDetect TokenStreams exist only for the sake of the
 * example and are not part of Lucene.
 * <p/>
 * See http://issues.apache.org/jira/browse/LUCENE-1058
 *
 * @see SinkTokenizer
 **/
public class TeeTokenFilter extends TokenFilter {
  SinkTokenizer sink;

  protected TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
    super(input);
    this.sink = sink;
  }

  public Token next(Token result) throws IOException {
    // Pass every token through unchanged, handing a copy to the sink on the way by.
    Token t = input.next(result);
    sink.add(t);
    return t;
  }
}
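
A small, runnable illustration (not part of this commit; the class name and sample text are illustrative) of the ordering that matters when a sink is read directly: the sink only holds what the tee has already seen, so the source stream is drained before the sink is consumed. It sits in the analysis package because the TeeTokenFilter constructor above is protected.

package org.apache.lucene.analysis;

import java.io.StringReader;

// Illustrative only: tee tokens into a sink, then replay them.
public class TeeTokenFilterDemo {
  public static void main(String[] args) throws Exception {
    SinkTokenizer sink = new SinkTokenizer(null);
    TokenStream source = new TeeTokenFilter(
        new WhitespaceTokenizer(new StringReader("The quick Red Dogs")), sink);

    // Drain the source first; each token is copied into the sink as it goes by.
    for (Token t = source.next(); t != null; t = source.next()) {
      System.out.println("source: " + t.termText());
    }

    // The sink now replays the cached copies without re-tokenizing the text.
    for (Token t = sink.next(); t != null; t = sink.next()) {
      System.out.println("sink:   " + t.termText());
    }
  }
}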

src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java View File

@@ -0,0 +1,138 @@
package org.apache.lucene.analysis;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import java.io.StringReader;
import java.io.IOException;
/**
 * Tests for TeeTokenFilter and SinkTokenizer.
 */
public class TeeSinkTokenTest extends TestCase {
  protected StringBuffer buffer1;
  protected StringBuffer buffer2;
  protected String[] tokens1;
  protected String[] tokens2;

  public TeeSinkTokenTest(String s) {
    super(s);
  }

  protected void setUp() {
    tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
    tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
    buffer1 = new StringBuffer();
    for (int i = 0; i < tokens1.length; i++) {
      buffer1.append(tokens1[i]).append(' ');
    }
    buffer2 = new StringBuffer();
    for (int i = 0; i < tokens2.length; i++) {
      buffer2.append(tokens2[i]).append(' ');
    }
  }

  protected void tearDown() {
  }

  public void test() throws IOException {
    // sink1 caches only the tokens spelled "The" (ignoring case)
    SinkTokenizer sink1 = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.termText().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);

    // draining the source fills the sink as a side effect
    Token token = null;
    int i = 0;
    while ((token = source.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);

    i = 0;
    while ((token = sink1.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The"));
      i++;
    }
    assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
  }

  public void testMultipleSources() throws Exception {
    // theDetector caches every "The"/"the"; dogDetector caches every "Dogs"
    SinkTokenizer theDetector = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.termText().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    SinkTokenizer dogDetector = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.termText().equalsIgnoreCase("Dogs")) {
          super.add(t);
        }
      }
    };
    // both sources feed both sinks; source1 is additionally cached so it can be replayed below
    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);

    Token token = null;
    int i = 0;
    while ((token = source1.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);

    i = 0;
    while ((token = source2.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens2[i], token.termText().equals(tokens2[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);

    i = 0;
    while ((token = theDetector.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The"));
      i++;
    }
    assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());

    i = 0;
    while ((token = dogDetector.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + "Dogs", token.termText().equalsIgnoreCase("Dogs"));
      i++;
    }
    assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());

    // the CachingTokenFilter lets source1 be replayed, this time through a LowerCaseFilter
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    while ((token = lowerCasing.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens1[i].toLowerCase(), token.termText().equals(tokens1[i].toLowerCase()));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
  }
}