SOLR-11 - BufferedTokenStream and RemoveDuplicatesTokenFilter from SOLR-11-BufferedTokenStream-RemoveDuplicatesTokenFilter.patch plus some additional tests and example config changes

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@419443 13f79535-47bb-0310-9956-ffa450edef68
2006-07-06 05:39:04 +00:00 · 2006-07-06 05:39:04 +00:00 · 9561be65e8
parent a35e30cb35
commit 9561be65e8
11 changed files with 466 additions and 3 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -20,7 +20,13 @@ New Features
 11. new DocSet.andNot(), DocSet.andNotSize() (yonik)
 12. Ability to store term vectors. (Note: standard request handler does
    not currently do anything with term vectors) (Mike Klaas via yonik, SOLR-23)
-     
+13. New abstract BufferedTokenStream for people who want to write
+    Tokenizers or TokenFilters that require arbitrary buffering of the
+    stream. (SOLR-11 / yonik, hossman)    
+14. New RemoveDuplicatesToken - useful in situations where
+    synonyms, stemming, or word-deliminater-ing produce identical tokens at
+    the same position. (SOLR-11 / yonik, hossman)
+
 Changes in runtime behavior
 1. classes reorganized into different packages, package names changed to Apache
 2. force read of document stored fields in QuerySenderListener
--- a/example/solr/conf/schema.xml
+++ b/example/solr/conf/schema.xml
@ -85,6 +85,7 @@
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.StopFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory"/>
+        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldtype>
    <!-- One could also specify an existing Analyzer implementation in Java 
@ -104,7 +105,10 @@
    <!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
        words on case-change, alpha numeric boundaries, and non-alphanumeric chars
        so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
-        Synonyms and stopwords are customized by external files, and stemming is enabled -->
+        Synonyms and stopwords are customized by external files, and stemming is enabled
+        Duplicate tokens at the same position (which may result from Stemmed Synonyms or
+        WordDelim parts) are removed.
+        -->
    <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@ -115,6 +119,7 @@
          <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
          <filter class="solr.LowerCaseFilterFactory"/>
          <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+          <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <analyzer type="query">
          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@ -123,6 +128,7 @@
          <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
          <filter class="solr.LowerCaseFilterFactory"/>
          <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+          <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldtype>

@ -137,6 +143,7 @@
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldtype>

--- a/example/solr/conf/synonyms.txt
+++ b/example/solr/conf/synonyms.txt
@ -16,3 +16,5 @@ MB,mib,megabyte,megabytes
 #spelling correction
 pixima => pixma

+Television, Televisions, TV, TVs
+
--- a/src/java/org/apache/solr/analysis/BufferedTokenStream.java
+++ b/src/java/org/apache/solr/analysis/BufferedTokenStream.java
@ -0,0 +1,144 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Handles input and output buffering of TokenStream
+ *
+ * <pre>
+ * // Example of a class implementing the rule "A" "B" => "Q" "B"
+ * class MyTokenStream extends BufferedTokenStream {
+ *   public MyTokenStream(TokenStream input) {super(input);}
+ *   protected Token process(Token t) throws IOException {
+ *     if ("A".equals(t.termText())) {
+ *       Token t2 = read();
+ *       if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
+ *       if (t2!=null) pushBack(t2);
+ *     }
+ *     return t;
+ *   }
+ * }
+ *
+ * // Example of a class implementing "A" "B" => "A" "A" "B"
+ * class MyTokenStream extends BufferedTokenStream {
+ *   public MyTokenStream(TokenStream input) {super(input);}
+ *   protected Token process(Token t) throws IOException {
+ *     if ("A".equals(t.termText()) && "B".equals(peek(1).termText()))
+ *       write(t);
+ *     return t;
+ *   }
+ * }
+ * </pre>
+ *
+ *
+ * @author yonik
+ * @version $Id$
+ */
+public abstract class BufferedTokenStream extends TokenStream {
+  // in the futute, might be faster if we implemented as an array based CircularQueue
+  private final LinkedList<Token> inQueue = new LinkedList<Token>();
+  private final LinkedList<Token> outQueue = new LinkedList<Token>();
+  private final TokenStream input;
+
+  public BufferedTokenStream(TokenStream input) {
+    this.input = input;
+  }
+
+  /**
+   * Process a token.  Subclasses may read more tokens from the input stream,
+   * write more tokens to the output stream, or simply return the next token
+   * to be output.  Subclasses may return null if the token is to be dropped.
+   * If a subclass writes tokens to the output stream and returns a
+   * non-null Token, the returned Token is considered to be at the head of
+   * the token output stream.
+   */
+  protected abstract Token process(Token t) throws IOException;
+
+  public final Token next() throws IOException {
+    while (true) {
+      if (!outQueue.isEmpty()) return outQueue.removeFirst();
+      Token t = read();
+      if (null == t) return null;
+      Token out = process(t);
+      if (null != out) return out;
+      // loop back to top in case process() put something on the output queue
+    }
+  }
+
+  /**
+   * Read a token from the buffered input stream.  
+   * @return null at EOS
+   */
+  protected Token read() throws IOException {
+    if (inQueue.isEmpty()) {
+      Token t = input.next();
+      return t;
+    }
+    return inQueue.removeFirst();
+  }
+
+  /**
+   * Push a token back into the buffered input stream, such that it will
+   * be returned by a future call to <code>read()</code>
+   */
+  protected void pushBack(Token t) {
+    inQueue.addFirst(t);
+  }
+
+  /**
+   * Peek n tokens ahead in the buffered input stream, without modifying
+   * the stream. 
+   * @param n Number of tokens into the input stream to peek, 1 based ...
+   *          0 is invalid
+   * @return a Token which exists in the input stream, any modifications
+   *         made to this Token will be "real" if/when the Token is
+   *         <code>read()</code> from the stream.
+   */
+  protected Token peek(int n) throws IOException {
+    int fillCount = n-inQueue.size();
+    for (int i=0; i < fillCount; i++) {
+      Token t = input.next();
+      if (null==t) return null;
+      inQueue.addLast(t);
+    }
+    return inQueue.get(n-1);
+  }
+
+  /**
+   * Write a token to the buffered output stream
+   */
+  protected void write(Token t) {
+    outQueue.addLast(t);
+  }
+
+  /**
+   * Provides direct Iterator access to the buffered output stream.
+   * Modifying any token in this Iterator will affect the resulting stream.
+   */
+  protected Iterable<Token> output() {
+    return outQueue;
+  }
+
+
+} 
--- a/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java
+++ b/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java
@ -0,0 +1,53 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * A TokenFilter which filters out Tokens at the same position and Term
+ * text as the previous token in the stream.
+ */
+public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
+  public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}
+  protected Token process(Token t) throws IOException {
+    Token tok = read();
+    OUT: while (tok != null && tok.getPositionIncrement()==0) {
+      if (null != t) {
+        write(t);
+        t = null;
+      }
+      boolean dup=false;
+      IN: for (Token outTok : output()) {
+        if (outTok.termText().equals(tok.termText())) {
+          dup=true;
+          break IN;
+        }
+      }
+      if (!dup)
+        write(tok);
+      tok = read();
+    }
+    if (tok != null) pushBack(tok);
+    return t;
+  }
+} 
--- a/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java
@ -0,0 +1,28 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * @version $Id:$
+ */
+public class RemoveDuplicatesTokenFilterFactory extends BaseTokenFilterFactory {
+  public TokenStream create(TokenStream input) {
+    return new RemoveDuplicatesTokenFilter(input);
+  }
+}
--- a/src/test/org/apache/solr/BasicFunctionalityTest.java
+++ b/src/test/org/apache/solr/BasicFunctionalityTest.java
@ -17,6 +17,9 @@
 package org.apache.solr;

 import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.solr.search.*;
 import org.apache.solr.request.*;
 import org.apache.solr.util.*;
 import org.apache.solr.schema.*;
@ -200,6 +203,16 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
            );
  }

+  /** @see TestRemoveDuplicatesTokenFilter */
+  public void testRemoveDuplicatesTokenFilter() {
+    Query q = QueryParsing.parseQuery("TV", "dedup",
+                                      h.getCore().getSchema());
+    assertTrue("not boolean?", q instanceof BooleanQuery);
+    assertEquals("unexpected number of stemmed synonym tokens",
+                 2, ((BooleanQuery) q).getClauses().length);
+  }
+
+  
  public void testTermVectorFields() {
    
    IndexSchema ischema = new IndexSchema(getSchemaFile());
--- a/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
+++ b/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
@ -0,0 +1,87 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Test that BufferedTokenStream behaves as advertized in subclasses.
+ */
+public class TestBufferedTokenStream extends TestCase {
+
+  /** Example of a class implementing the rule "A" "B" => "Q" "B" */
+  public static class AB_Q_Stream extends BufferedTokenStream {
+    public AB_Q_Stream(TokenStream input) {super(input);}
+    protected Token process(Token t) throws IOException {
+      if ("A".equals(t.termText())) {
+        Token t2 = read();
+        if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
+        if (t2!=null) pushBack(t2);
+      }
+      return t;
+    }
+  }
+
+  /** Example of a class implementing "A" "B" => "A" "A" "B" */
+  public static class AB_AAB_Stream extends BufferedTokenStream {
+    public AB_AAB_Stream(TokenStream input) {super(input);}
+    protected Token process(Token t) throws IOException {
+      if ("A".equals(t.termText()) && "B".equals(peek(1).termText()))
+        write(t);
+      return t;
+    }
+  }
+  
+  public static String tsToString(TokenStream in) throws IOException {
+    StringBuffer out = new StringBuffer();
+    Token t = in.next();
+    if (null != t)
+      out.append(t.termText());
+    
+    for (t = in.next(); null != t; t = in.next()) {
+      out.append(" ").append(t.termText());
+    }
+    in.close();
+    return out.toString();
+  }
+  
+  public void testABQ() throws Exception {
+    final String input = "How now A B brown A cow B like A B thing?";
+    final String expected = "How now Q B brown A cow B like Q B thing?";
+    TokenStream ts = new AB_Q_Stream
+      (new WhitespaceTokenizer(new StringReader(input)));
+    final String actual = tsToString(ts);
+    //System.out.println(actual);
+    assertEquals(expected, actual);
+  }
+  
+  public void testABAAB() throws Exception {
+    final String input = "How now A B brown A cow B like A B thing?";
+    final String expected = "How now A A B brown A cow B like A A B thing?";
+    TokenStream ts = new AB_AAB_Stream
+      (new WhitespaceTokenizer(new StringReader(input)));
+    final String actual = tsToString(ts);
+    //System.out.println(actual);
+    assertEquals(expected, actual);
+  }
+}
--- a/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java
+++ b/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java
@ -0,0 +1,107 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Arrays;
+
+public class TestRemoveDuplicatesTokenFilter extends TestCase {
+
+  public static Token tok(int pos, String t, int start, int end) {
+    Token tok = new Token(t,start,end);
+    tok.setPositionIncrement(pos);
+    return tok;
+  }
+  public static Token tok(int pos, String t) {
+    return tok(pos, t, 0,0);
+  }
+
+  public void testDups(final String expected, final Token... tokens)
+    throws Exception {
+
+    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
+    
+    final TokenStream ts = new RemoveDuplicatesTokenFilter
+      (new TokenStream() {
+          public Token next() { return toks.hasNext() ? toks.next() : null; }
+        });
+    
+    final String actual = TestBufferedTokenStream.tsToString(ts);
+    assertEquals(expected + " != " + actual, expected, actual);
+    
+  }
+  
+  public void testNoDups() throws Exception {
+
+    testDups("A B B C D E"
+             ,tok(1,"A", 0,  4)
+             ,tok(1,"B", 5, 10)
+             ,tok(1,"B",11, 15)
+             ,tok(1,"C",16, 20)
+             ,tok(0,"D",16, 20)
+             ,tok(1,"E",21, 25)
+             );
+    
+  }
+  
+
+  public void testSimpleDups() throws Exception {
+
+    testDups("A B C D E"
+             ,tok(1,"A", 0,  4)
+             ,tok(1,"B", 5, 10)
+             ,tok(0,"B",11, 15)
+             ,tok(1,"C",16, 20)
+             ,tok(0,"D",16, 20)
+             ,tok(1,"E",21, 25)
+             );
+    
+  }
+  
+  public void testComplexDups() throws Exception {
+
+    testDups("A B C D E F G H I J K"
+             ,tok(1,"A")
+             ,tok(1,"B")
+             ,tok(0,"B")
+             ,tok(1,"C")
+             ,tok(1,"D")
+             ,tok(0,"D")
+             ,tok(0,"D")
+             ,tok(1,"E")
+             ,tok(1,"F")
+             ,tok(0,"F")
+             ,tok(1,"G")
+             ,tok(0,"H")
+             ,tok(0,"H")
+             ,tok(1,"I")
+             ,tok(1,"J")
+             ,tok(0,"K")
+             ,tok(0,"J")
+             );
+             
+  }
+  
+  
+
+}
--- a/src/test/test-files/solr/conf/schema.xml
+++ b/src/test/test-files/solr/conf/schema.xml
@ -226,6 +226,19 @@
          <filter name="syn" class="solr.SynonymFilterFactory" synonyms="synonyms.txt"/>
      </analyzer>
    </fieldtype>
+    
+    <!-- Demonstrates How RemoveDuplicatesTokenFilter makes stemmed
+         synonyms "better"
+      -->
+    <fieldtype name="dedup" class="solr.TextField">
+      <analyzer>
+          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+          <filter class="solr.SynonymFilterFactory"
+                  synonyms="synonyms.txt" expand="true" />
+          <filter class="solr.EnglishPorterFilterFactory"/>
+          <filter class="solr.RemoveDuplicatesTokenFilterFactory" />
+      </analyzer>
+    </fieldtype>

    <fieldtype  name="unstored" class="solr.StrField" indexed="true" stored="false"/>

@ -296,6 +309,7 @@
   <field name="stopfilt" type="stopfilt" indexed="true" stored="true"/>
   <field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
   <field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
+   <field name="dedup" type="dedup" indexed="true" stored="true"/>


   <field name="subword" type="subword" indexed="true" stored="true"/>
--- a/src/test/test-files/solr/conf/synonyms.txt
+++ b/src/test/test-files/solr/conf/synonyms.txt
@ -3,4 +3,6 @@ b => b1 b2
 c => c1,c2
 a\=>a => b\=>b
 a\,a => b\,b
-foo,bar,baz
+foo,bar,baz
+
+Television,TV,Televisions