mirror of https://github.com/apache/lucene.git
LUCENE-1058: Added new buffering tokenizer/token filter for reducing redundant analysis
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@599478 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 12d0cc180e
commit 261382fd4e
@@ -525,6 +525,12 @@ New features

    values, plus CustomScoreQuery for simple score (post) customization.
    (Yonik Seeley, Doron Cohen)

 9. LUCENE-1058: Added new TeeTokenFilter (like the UNIX 'tee' command) and
    SinkTokenizer, which can be used to share tokens between two or more
    Fields such that the other Fields do not have to go through the whole
    analysis process over again. For instance, if you have two Fields that
    share all the same analysis steps except that one lowercases tokens and
    the other does not, you can coordinate the operations between the two
    using the TeeTokenFilter and the SinkTokenizer. See TeeSinkTokenTest.java
    for examples.
    (Grant Ingersoll, Michael Busch, Yonik Seeley)

Optimizations

 1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions
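A minimal sketch of the lowercasing scenario the CHANGES entry above describes: tokenize once, then feed one exact-case field and one lowercased field. This is illustrative usage of the API added in this commit, not part of the commit itself; the class, method, and field names are invented, and the Field(String, TokenStream) constructor is assumed from its use in the TeeTokenFilter Javadoc below.

import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.TeeTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class SharedAnalysisExample {
  public static Document makeDoc(String text) {
    SinkTokenizer sink = new SinkTokenizer(null);
    // The tee copies every token into the sink as the "exact" stream is consumed.
    TokenStream exact = new TeeTokenFilter(
        new WhitespaceTokenizer(new StringReader(text)), sink);
    // The sink replays the cached tokens; only the lowercasing step differs
    // between the two fields, so tokenization runs exactly once.
    TokenStream lower = new LowerCaseFilter(sink);

    Document doc = new Document();
    // Add the tee'd field first: the sink only holds tokens after the
    // source stream has been fully consumed by the indexer.
    doc.add(new Field("exact", exact));
    doc.add(new Field("lower", lower));
    return doc;
  }
}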
SinkTokenizer.java (new file)
@@ -0,0 +1,60 @@
package org.apache.lucene.analysis;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;


/**
 * A SinkTokenizer can be used to cache Tokens for use in an Analyzer.
 *
 * @see TeeTokenFilter
 **/
public class SinkTokenizer extends Tokenizer {
  protected List/*<Token>*/ lst = new ArrayList/*<Token>*/();
  protected Iterator/*<Token>*/ iter;

  public SinkTokenizer(List/*<Token>*/ input) {
    this.lst = input;
    if (this.lst == null) this.lst = new ArrayList/*<Token>*/();
  }

  /**
   * Only valid if the tokens have not yet been consumed,
   * i.e. if this tokenizer is not part of another TokenStream.
   *
   * @return A List of {@link org.apache.lucene.analysis.Token}s
   */
  public List/*<Token>*/ getTokens() {
    return lst;
  }

  /**
   * Ignores the input result Token and returns the next cached Token.
   *
   * @param result ignored; the sink replays its own cached Tokens
   * @return The next {@link org.apache.lucene.analysis.Token} in the sink, or null when the sink is exhausted
   * @throws IOException
   */
  public Token next(Token result) throws IOException {
    if (iter == null) iter = lst.iterator();
    return iter.hasNext() ? (Token) iter.next() : null;
  }

  /**
   * Override this method to cache only certain tokens, or new tokens based
   * on the old tokens.
   *
   * @param t The {@link org.apache.lucene.analysis.Token} to add to the sink
   */
  public void add(Token t) {
    if (t == null) return;
    // clone so later reuse of t by the source stream cannot corrupt the cache
    lst.add((Token) t.clone());
  }

  public void reset() throws IOException {
    iter = lst.iterator();
  }
}
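The add() hook above supports caching transformed tokens as well as filtering. A small sketch of the "new tokens based on the old tokens" case; it assumes the Token(String, int, int) constructor and the termText()/offset accessors of the Token class of this era, and the lowercasing transform itself is invented for the example:

SinkTokenizer lowercasedSink = new SinkTokenizer(null) {
  public void add(Token t) {
    if (t == null) return;
    // cache a new Token derived from the old one instead of a plain clone;
    // lst is protected in SinkTokenizer, so subclasses may append directly
    lst.add(new Token(t.termText().toLowerCase(),
                      t.startOffset(), t.endOffset()));
  }
};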
TeeTokenFilter.java (new file)
@@ -0,0 +1,54 @@
package org.apache.lucene.analysis;

import java.io.IOException;


/**
 * Works in conjunction with the SinkTokenizer to provide the ability to set aside tokens
 * that have already been analyzed. This is useful in situations where multiple fields share
 * many common analysis steps and then go their separate ways.
 * <p/>
 * It is also useful for doing things like entity extraction or proper noun analysis as
 * part of the analysis workflow and saving off those tokens for use in another field.
 *
 * <pre>
SinkTokenizer sink1 = new SinkTokenizer(null);
SinkTokenizer sink2 = new SinkTokenizer(null);

TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);

TokenStream final1 = new LowerCaseFilter(source1);
TokenStream final2 = source2;
TokenStream final3 = new EntityDetect(sink1);
TokenStream final4 = new URLDetect(sink2);

d.add(new Field("f1", final1));
d.add(new Field("f2", final2));
d.add(new Field("f3", final3));
d.add(new Field("f4", final4));
 * </pre>
 * In this example, sink1 and sink2 will both get tokens from both reader1 and reader2 after
 * the WhitespaceTokenizer, and now we can further wrap any of these in extra analysis, and
 * more "sources" can be inserted if desired. Note that the EntityDetect and URLDetect
 * TokenStreams are for the example only and do not currently exist in Lucene.
 * <p/>
 *
 * See http://issues.apache.org/jira/browse/LUCENE-1058
 * @see SinkTokenizer
 **/
public class TeeTokenFilter extends TokenFilter {
  SinkTokenizer sink;

  public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
    super(input);
    this.sink = sink;
  }

  public Token next(Token result) throws IOException {
    Token t = input.next(result);
    // offer every token to the sink; SinkTokenizer.add() ignores null
    sink.add(t);
    return t;
  }
}
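The EntityDetect and URLDetect streams in the Javadoc above are placeholders that do not exist in Lucene. A minimal stand-in for one of them, assuming nothing beyond the TokenFilter API used in this commit; the URL heuristic is invented for the example:

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

public class URLDetect extends TokenFilter {
  public URLDetect(TokenStream input) {
    super(input);
  }

  public Token next(Token result) throws IOException {
    // pass through only tokens that look like URLs; drop everything else
    for (Token t = input.next(result); t != null; t = input.next(result)) {
      if (t.termText().indexOf("://") != -1) {
        return t;
      }
    }
    return null;
  }
}

Wrapping a sink, as in the Javadoc example (new URLDetect(sink2)), works because SinkTokenizer is itself a TokenStream.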
TeeSinkTokenTest.java (new file)
@@ -0,0 +1,138 @@
package org.apache.lucene.analysis;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;

import java.io.IOException;
import java.io.StringReader;

/**
 * Tests for the TeeTokenFilter and SinkTokenizer
 */
public class TeeSinkTokenTest extends TestCase {
  protected StringBuffer buffer1;
  protected StringBuffer buffer2;
  protected String[] tokens1;
  protected String[] tokens2;

  public TeeSinkTokenTest(String s) {
    super(s);
  }

  protected void setUp() {
    tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
    tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
    buffer1 = new StringBuffer();
    for (int i = 0; i < tokens1.length; i++) {
      buffer1.append(tokens1[i]).append(' ');
    }
    buffer2 = new StringBuffer();
    for (int i = 0; i < tokens2.length; i++) {
      buffer2.append(tokens2[i]).append(' ');
    }
  }

  protected void tearDown() {
  }

  public void test() throws IOException {
    // cache only the occurrences of "The", in any case
    SinkTokenizer sink1 = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.termText().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
    Token token = null;
    int i = 0;
    while ((token = source.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
    i = 0;
    while ((token = sink1.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The"));
      i++;
    }
    assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
  }

  public void testMultipleSources() throws Exception {
    SinkTokenizer theDetector = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.termText().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    SinkTokenizer dogDetector = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.termText().equalsIgnoreCase("Dogs")) {
          super.add(t);
        }
      }
    };
    // source1 is cached so it can be reset and consumed a second time below
    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
    Token token = null;
    int i = 0;
    while ((token = source1.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
    i = 0;
    while ((token = source2.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens2[i], token.termText().equals(tokens2[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
    // both sinks have now seen the tokens from both sources
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
    i = 0;
    while ((token = theDetector.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The"));
      i++;
    }
    assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
    i = 0;
    while ((token = dogDetector.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + "Dogs", token.termText().equalsIgnoreCase("Dogs"));
      i++;
    }
    assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    while ((token = lowerCasing.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens1[i].toLowerCase(), token.termText().equals(tokens1[i].toLowerCase()));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
  }
}