SOLR-11 - BufferedTokenStream and RemoveDuplicatesTokenFilter from SOLR-11-BufferedTokenStream-RemoveDuplicatesTokenFilter.patch plus some additional tests and example config changes

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@419443 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris M. Hostetter 2006-07-06 05:39:04 +00:00
parent a35e30cb35
commit 9561be65e8
11 changed files with 466 additions and 3 deletions

View File

@ -20,7 +20,13 @@ New Features
11. new DocSet.andNot(), DocSet.andNotSize() (yonik)
12. Ability to store term vectors. (Note: standard request handler does
not currently do anything with term vectors) (Mike Klaas via yonik, SOLR-23)
13. New abstract BufferedTokenStream for people who want to write
Tokenizers or TokenFilters that require arbitrary buffering of the
stream. (SOLR-11 / yonik, hossman)
14. New RemoveDuplicatesTokenFilter - useful in situations where
synonyms, stemming, or word-delimiter-ing produce identical tokens at
the same position. (SOLR-11 / yonik, hossman)
Changes in runtime behavior
1. classes reorganized into different packages, package names changed to Apache
2. force read of document stored fields in QuerySenderListener

View File

@ -85,6 +85,7 @@
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldtype>
<!-- One could also specify an existing Analyzer implementation in Java
@ -104,7 +105,10 @@
<!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and non-alphanumeric chars
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
Synonyms and stopwords are customized by external files, and stemming is enabled -->
Synonyms and stopwords are customized by external files, and stemming is enabled
Duplicate tokens at the same position (which may result from Stemmed Synonyms or
WordDelim parts) are removed.
-->
<fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@ -115,6 +119,7 @@
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@ -123,6 +128,7 @@
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldtype>
@ -137,6 +143,7 @@
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldtype>

View File

@ -16,3 +16,5 @@ MB,mib,megabyte,megabytes
#spelling correction
pixima => pixma
Television, Televisions, TV, TVs

View File

@ -0,0 +1,144 @@
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
/**
 * Handles input and output buffering of TokenStream.
 *
 * <p>Tokens pulled from the wrapped stream may be pushed back or peeked at
 * (the "input" buffer), and any number of tokens may be queued for emission
 * (the "output" buffer) before the token currently being processed.
 *
 * <pre>
 * // Example of a class implementing the rule "A" "B" => "Q" "B"
 * class MyTokenStream extends BufferedTokenStream {
 *   public MyTokenStream(TokenStream input) {super(input);}
 *   protected Token process(Token t) throws IOException {
 *     if ("A".equals(t.termText())) {
 *       Token t2 = read();
 *       if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
 *       if (t2!=null) pushBack(t2);
 *     }
 *     return t;
 *   }
 * }
 *
 * // Example of a class implementing "A" "B" => "A" "A" "B"
 * class MyTokenStream extends BufferedTokenStream {
 *   public MyTokenStream(TokenStream input) {super(input);}
 *   protected Token process(Token t) throws IOException {
 *     if ("A".equals(t.termText())) {
 *       Token t2 = peek(1);  // null when "A" is the last token
 *       if (t2!=null && "B".equals(t2.termText())) write(t);
 *     }
 *     return t;
 *   }
 * }
 * </pre>
 *
 *
 * @author yonik
 * @version $Id$
 */
public abstract class BufferedTokenStream extends TokenStream {
  // in the future, might be faster if we implemented as an array based CircularQueue
  private final LinkedList<Token> inQueue = new LinkedList<Token>();
  private final LinkedList<Token> outQueue = new LinkedList<Token>();
  private final TokenStream input;

  /**
   * @param input the stream whose tokens are buffered and handed to
   *              {@link #process(Token)}
   */
  public BufferedTokenStream(TokenStream input) {
    this.input = input;
  }

  /**
   * Process a token. Subclasses may read more tokens from the input stream,
   * write more tokens to the output stream, or simply return the next token
   * to be output. Subclasses may return null if the token is to be dropped.
   * If a subclass writes tokens to the output stream and returns a
   * non-null Token, the returned Token is considered to be at the head of
   * the token output stream.
   */
  protected abstract Token process(Token t) throws IOException;

  /**
   * Returns the next output token: anything already waiting in the output
   * buffer first, otherwise the result of processing the next input token.
   * @return null at EOS
   */
  public final Token next() throws IOException {
    while (true) {
      if (!outQueue.isEmpty()) return outQueue.removeFirst();
      Token t = read();
      if (null == t) return null;
      Token out = process(t);
      if (null != out) return out;
      // loop back to top in case process() put something on the output queue
    }
  }

  /**
   * Read a token from the buffered input stream.
   * @return null at EOS
   */
  protected Token read() throws IOException {
    if (inQueue.isEmpty()) {
      return input.next();
    }
    return inQueue.removeFirst();
  }

  /**
   * Push a token back into the buffered input stream, such that it will
   * be returned by a future call to <code>read()</code>
   */
  protected void pushBack(Token t) {
    inQueue.addFirst(t);
  }

  /**
   * Peek n tokens ahead in the buffered input stream, without modifying
   * the stream.
   * @param n Number of tokens into the input stream to peek, 1 based ...
   *          0 is invalid
   * @return a Token which exists in the input stream, any modifications
   *         made to this Token will be "real" if/when the Token is
   *         <code>read()</code> from the stream; or null if the input
   *         stream ends before n tokens are available. Callers must check
   *         for null before dereferencing the result.
   * @throws IndexOutOfBoundsException if n is less than 1
   */
  protected Token peek(int n) throws IOException {
    int fillCount = n-inQueue.size();
    for (int i=0; i < fillCount; i++) {
      Token t = input.next();
      // tokens buffered so far stay queued and are still returned by read()
      if (null==t) return null;
      inQueue.addLast(t);
    }
    return inQueue.get(n-1);
  }

  /**
   * Write a token to the buffered output stream
   */
  protected void write(Token t) {
    outQueue.addLast(t);
  }

  /**
   * Provides direct Iterator access to the buffered output stream.
   * Modifying any token in this Iterator will affect the resulting stream.
   */
  protected Iterable<Token> output() {
    return outQueue;
  }

  /**
   * Closes the wrapped input stream. Without this delegation, closing the
   * outermost stream of an analysis chain would never reach the underlying
   * tokenizer and whatever resources it holds.
   */
  public void close() throws IOException {
    input.close();
  }
}

View File

@ -0,0 +1,53 @@
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
/**
 * A TokenFilter which drops any Token whose term text has already been
 * seen among the Tokens stacked at the same position (i.e. following
 * Tokens with a position increment of 0).
 */
public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {

  public RemoveDuplicatesTokenFilter(TokenStream input) {
    super(input);
  }

  /**
   * Gathers every following Token that shares the position of <code>t</code>,
   * queueing each distinct term text once on the output stream.
   *
   * @return <code>t</code> itself when no Token shares its position;
   *         otherwise null, because <code>t</code> has already been written
   *         to the output stream ahead of the Tokens stacked on it.
   */
  protected Token process(Token t) throws IOException {
    Token head = t;
    Token candidate = read();
    while (candidate != null && candidate.getPositionIncrement() == 0) {
      // the head token must precede anything stacked on it, so queue it first
      if (head != null) {
        write(head);
        head = null;
      }
      if (!isBuffered(candidate)) {
        write(candidate);
      }
      candidate = read();
    }
    // the first Token of the next position belongs to a later process() call
    if (candidate != null) {
      pushBack(candidate);
    }
    return head;
  }

  /** Tells whether the output stream already holds a Token with the same term text. */
  private boolean isBuffered(Token candidate) {
    for (Token queued : output()) {
      if (queued.termText().equals(candidate.termText())) {
        return true;
      }
    }
    return false;
  }
}

View File

@ -0,0 +1,28 @@
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
/**
 * Factory producing {@link RemoveDuplicatesTokenFilter} instances, so the
 * filter can be referenced from an analyzer definition.
 *
 * @version $Id:$
 */
public class RemoveDuplicatesTokenFilterFactory extends BaseTokenFilterFactory {

  /**
   * Wraps the given stream in a duplicate-removing filter.
   *
   * @param input the stream to filter
   * @return a new RemoveDuplicatesTokenFilter reading from <code>input</code>
   */
  public TokenStream create(TokenStream input) {
    final TokenStream deduplicated = new RemoveDuplicatesTokenFilter(input);
    return deduplicated;
  }
}

View File

@ -17,6 +17,9 @@
package org.apache.solr;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanQuery;
import org.apache.solr.search.*;
import org.apache.solr.request.*;
import org.apache.solr.util.*;
import org.apache.solr.schema.*;
@ -200,6 +203,16 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
);
}
/**
 * Parses "TV" against the "dedup" field and checks that the resulting
 * query is a BooleanQuery holding exactly two clauses.
 *
 * @see TestRemoveDuplicatesTokenFilter
 */
public void testRemoveDuplicatesTokenFilter() {
  final Query parsed =
      QueryParsing.parseQuery("TV", "dedup", h.getCore().getSchema());
  assertTrue("not boolean?", parsed instanceof BooleanQuery);
  final BooleanQuery bool = (BooleanQuery) parsed;
  final int clauseCount = bool.getClauses().length;
  assertEquals("unexpected number of stemmed synonym tokens",
               2, clauseCount);
}
public void testTermVectorFields() {
IndexSchema ischema = new IndexSchema(getSchemaFile());

View File

@ -0,0 +1,87 @@
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.IOException;
import java.io.StringReader;
/**
 * Test that BufferedTokenStream behaves as advertised in subclasses.
 */
public class TestBufferedTokenStream extends TestCase {

  /** Example of a class implementing the rule "A" "B" => "Q" "B" */
  public static class AB_Q_Stream extends BufferedTokenStream {
    public AB_Q_Stream(TokenStream input) {super(input);}
    protected Token process(Token t) throws IOException {
      if ("A".equals(t.termText())) {
        Token t2 = read();
        if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
        if (t2!=null) pushBack(t2);
      }
      return t;
    }
  }

  /** Example of a class implementing "A" "B" => "A" "A" "B" */
  public static class AB_AAB_Stream extends BufferedTokenStream {
    public AB_AAB_Stream(TokenStream input) {super(input);}
    protected Token process(Token t) throws IOException {
      if ("A".equals(t.termText())) {
        // peek() returns null at end of stream, so an "A" that ends the
        // input must not be dereferenced
        Token following = peek(1);
        if (following != null && "B".equals(following.termText())) {
          write(t);
        }
      }
      return t;
    }
  }

  /**
   * Drains a stream into a single string of term texts separated by single
   * spaces, then closes the stream (also when reading fails).
   *
   * @param in the stream to consume
   * @return the joined term texts, or the empty string for an empty stream
   */
  public static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    try {
      Token t = in.next();
      if (null != t)
        out.append(t.termText());
      for (t = in.next(); null != t; t = in.next()) {
        out.append(" ").append(t.termText());
      }
    } finally {
      in.close();
    }
    return out.toString();
  }

  public void testABQ() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    final String expected = "How now Q B brown A cow B like Q B thing?";
    TokenStream ts = new AB_Q_Stream
      (new WhitespaceTokenizer(new StringReader(input)));
    final String actual = tsToString(ts);
    //System.out.println(actual);
    assertEquals(expected, actual);
  }

  public void testABAAB() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    final String expected = "How now A A B brown A cow B like A A B thing?";
    TokenStream ts = new AB_AAB_Stream
      (new WhitespaceTokenizer(new StringReader(input)));
    final String actual = tsToString(ts);
    //System.out.println(actual);
    assertEquals(expected, actual);
  }

  /** An "A" that ends the input has nothing to peek at and passes through unchanged. */
  public void testABAABTrailingA() throws Exception {
    final String input = "A B A";
    final String expected = "A A B A";
    TokenStream ts = new AB_AAB_Stream
      (new WhitespaceTokenizer(new StringReader(input)));
    final String actual = tsToString(ts);
    assertEquals(expected, actual);
  }
}

View File

@ -0,0 +1,107 @@
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.IOException;
import java.util.Iterator;
import java.util.Arrays;
/**
 * Feeds hand-built Token sequences through RemoveDuplicatesTokenFilter and
 * compares the surviving term texts with the expected output.
 */
public class TestRemoveDuplicatesTokenFilter extends TestCase {

  /** Builds a Token with the given position increment, term text and offsets. */
  public static Token tok(int pos, String t, int start, int end) {
    final Token built = new Token(t, start, end);
    built.setPositionIncrement(pos);
    return built;
  }

  /** Builds a Token with the given position increment and term text, using 0 for both offsets. */
  public static Token tok(int pos, String t) {
    return tok(pos, t, 0, 0);
  }

  /**
   * Runs the given tokens through the filter and asserts on the
   * space-separated term texts that come out.
   *
   * @param expected the expected space-separated term texts
   * @param tokens   the tokens to feed the filter, in stream order
   */
  public void testDups(final String expected, final Token... tokens)
    throws Exception {

    final Iterator<Token> remaining = Arrays.asList(tokens).iterator();
    final TokenStream source = new TokenStream() {
      public Token next() {
        if (remaining.hasNext()) {
          return remaining.next();
        }
        return null;
      }
    };
    final TokenStream filtered = new RemoveDuplicatesTokenFilter(source);
    final String actual = TestBufferedTokenStream.tsToString(filtered);
    assertEquals(expected + " != " + actual, expected, actual);
  }

  /** Equal terms at different positions are both kept. */
  public void testNoDups() throws Exception {
    testDups("A B B C D E",
             tok(1, "A", 0, 4),
             tok(1, "B", 5, 10),
             tok(1, "B", 11, 15),
             tok(1, "C", 16, 20),
             tok(0, "D", 16, 20),
             tok(1, "E", 21, 25));
  }

  /** A repeated term stacked on the same position is dropped. */
  public void testSimpleDups() throws Exception {
    testDups("A B C D E",
             tok(1, "A", 0, 4),
             tok(1, "B", 5, 10),
             tok(0, "B", 11, 15),
             tok(1, "C", 16, 20),
             tok(0, "D", 16, 20),
             tok(1, "E", 21, 25));
  }

  /** Several positions mixing repeated and distinct stacked terms. */
  public void testComplexDups() throws Exception {
    testDups("A B C D E F G H I J K",
             tok(1, "A"),
             tok(1, "B"),
             tok(0, "B"),
             tok(1, "C"),
             tok(1, "D"),
             tok(0, "D"),
             tok(0, "D"),
             tok(1, "E"),
             tok(1, "F"),
             tok(0, "F"),
             tok(1, "G"),
             tok(0, "H"),
             tok(0, "H"),
             tok(1, "I"),
             tok(1, "J"),
             tok(0, "K"),
             tok(0, "J"));
  }
}

View File

@ -226,6 +226,19 @@
<filter name="syn" class="solr.SynonymFilterFactory" synonyms="synonyms.txt"/>
</analyzer>
</fieldtype>
<!-- Demonstrates How RemoveDuplicatesTokenFilter makes stemmed
synonyms "better"
-->
<fieldtype name="dedup" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" expand="true" />
<filter class="solr.EnglishPorterFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
</analyzer>
</fieldtype>
<fieldtype name="unstored" class="solr.StrField" indexed="true" stored="false"/>
@ -296,6 +309,7 @@
<field name="stopfilt" type="stopfilt" indexed="true" stored="true"/>
<field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
<field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
<field name="dedup" type="dedup" indexed="true" stored="true"/>
<field name="subword" type="subword" indexed="true" stored="true"/>

View File

@ -3,4 +3,6 @@ b => b1 b2
c => c1,c2
a\=>a => b\=>b
a\,a => b\,b
foo,bar,baz
foo,bar,baz
Television,TV,Televisions