mirror of https://github.com/apache/lucene.git
SOLR-11 - BufferedTokenStream and RemoveDuplicatesTokenFilter from SOLR-11-BufferedTokenStream-RemoveDuplicatesTokenFilter.patch plus some additional tests and example config changes
git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@419443 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a35e30cb35
commit
9561be65e8
|
@ -20,7 +20,13 @@ New Features
|
|||
11. new DocSet.andNot(), DocSet.andNotSize() (yonik)
|
||||
12. Ability to store term vectors. (Note: standard request handler does
|
||||
not currently do anything with term vectors) (Mike Klaas via yonik, SOLR-23)
|
||||
|
||||
13. New abstract BufferedTokenStream for people who want to write
|
||||
Tokenizers or TokenFilters that require arbitrary buffering of the
|
||||
stream. (SOLR-11 / yonik, hossman)
|
||||
14. New RemoveDuplicatesToken - useful in situations where
|
||||
synonyms, stemming, or word-deliminater-ing produce identical tokens at
|
||||
the same position. (SOLR-11 / yonik, hossman)
|
||||
|
||||
Changes in runtime behavior
|
||||
1. classes reorganized into different packages, package names changed to Apache
|
||||
2. force read of document stored fields in QuerySenderListener
|
||||
|
|
|
@ -85,6 +85,7 @@
|
|||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
<!-- One could also specify an existing Analyzer implementation in Java
|
||||
|
@ -104,7 +105,10 @@
|
|||
<!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
|
||||
words on case-change, alpha numeric boundaries, and non-alphanumeric chars
|
||||
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
|
||||
Synonyms and stopwords are customized by external files, and stemming is enabled -->
|
||||
Synonyms and stopwords are customized by external files, and stemming is enabled
|
||||
Duplicate tokens at the same position (which may result from Stemmed Synonyms or
|
||||
WordDelim parts) are removed.
|
||||
-->
|
||||
<fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
|
@ -115,6 +119,7 @@
|
|||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
|
@ -123,6 +128,7 @@
|
|||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
|
@ -137,6 +143,7 @@
|
|||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
|
|
|
@ -16,3 +16,5 @@ MB,mib,megabyte,megabytes
|
|||
#spelling correction
|
||||
pixima => pixma
|
||||
|
||||
Television, Televisions, TV, TVs
|
||||
|
||||
|
|
|
@ -0,0 +1,144 @@
|
|||
/**
|
||||
* Copyright 2006 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Handles input and output buffering of TokenStream
|
||||
*
|
||||
* <pre>
|
||||
* // Example of a class implementing the rule "A" "B" => "Q" "B"
|
||||
* class MyTokenStream extends BufferedTokenStream {
|
||||
* public MyTokenStream(TokenStream input) {super(input);}
|
||||
* protected Token process(Token t) throws IOException {
|
||||
* if ("A".equals(t.termText())) {
|
||||
* Token t2 = read();
|
||||
* if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
|
||||
* if (t2!=null) pushBack(t2);
|
||||
* }
|
||||
* return t;
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* // Example of a class implementing "A" "B" => "A" "A" "B"
|
||||
* class MyTokenStream extends BufferedTokenStream {
|
||||
* public MyTokenStream(TokenStream input) {super(input);}
|
||||
* protected Token process(Token t) throws IOException {
|
||||
* if ("A".equals(t.termText()) && "B".equals(peek(1).termText()))
|
||||
* write(t);
|
||||
* return t;
|
||||
* }
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
*
|
||||
* @author yonik
|
||||
* @version $Id$
|
||||
*/
|
||||
public abstract class BufferedTokenStream extends TokenStream {
|
||||
// in the futute, might be faster if we implemented as an array based CircularQueue
|
||||
private final LinkedList<Token> inQueue = new LinkedList<Token>();
|
||||
private final LinkedList<Token> outQueue = new LinkedList<Token>();
|
||||
private final TokenStream input;
|
||||
|
||||
public BufferedTokenStream(TokenStream input) {
|
||||
this.input = input;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a token. Subclasses may read more tokens from the input stream,
|
||||
* write more tokens to the output stream, or simply return the next token
|
||||
* to be output. Subclasses may return null if the token is to be dropped.
|
||||
* If a subclass writes tokens to the output stream and returns a
|
||||
* non-null Token, the returned Token is considered to be at the head of
|
||||
* the token output stream.
|
||||
*/
|
||||
protected abstract Token process(Token t) throws IOException;
|
||||
|
||||
public final Token next() throws IOException {
|
||||
while (true) {
|
||||
if (!outQueue.isEmpty()) return outQueue.removeFirst();
|
||||
Token t = read();
|
||||
if (null == t) return null;
|
||||
Token out = process(t);
|
||||
if (null != out) return out;
|
||||
// loop back to top in case process() put something on the output queue
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a token from the buffered input stream.
|
||||
* @return null at EOS
|
||||
*/
|
||||
protected Token read() throws IOException {
|
||||
if (inQueue.isEmpty()) {
|
||||
Token t = input.next();
|
||||
return t;
|
||||
}
|
||||
return inQueue.removeFirst();
|
||||
}
|
||||
|
||||
/**
|
||||
* Push a token back into the buffered input stream, such that it will
|
||||
* be returned by a future call to <code>read()</code>
|
||||
*/
|
||||
protected void pushBack(Token t) {
|
||||
inQueue.addFirst(t);
|
||||
}
|
||||
|
||||
/**
|
||||
* Peek n tokens ahead in the buffered input stream, without modifying
|
||||
* the stream.
|
||||
* @param n Number of tokens into the input stream to peek, 1 based ...
|
||||
* 0 is invalid
|
||||
* @return a Token which exists in the input stream, any modifications
|
||||
* made to this Token will be "real" if/when the Token is
|
||||
* <code>read()</code> from the stream.
|
||||
*/
|
||||
protected Token peek(int n) throws IOException {
|
||||
int fillCount = n-inQueue.size();
|
||||
for (int i=0; i < fillCount; i++) {
|
||||
Token t = input.next();
|
||||
if (null==t) return null;
|
||||
inQueue.addLast(t);
|
||||
}
|
||||
return inQueue.get(n-1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a token to the buffered output stream
|
||||
*/
|
||||
protected void write(Token t) {
|
||||
outQueue.addLast(t);
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides direct Iterator access to the buffered output stream.
|
||||
* Modifying any token in this Iterator will affect the resulting stream.
|
||||
*/
|
||||
protected Iterable<Token> output() {
|
||||
return outQueue;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Copyright 2006 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A TokenFilter which filters out Tokens at the same position and Term
|
||||
* text as the previous token in the stream.
|
||||
*/
|
||||
public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
|
||||
public RemoveDuplicatesTokenFilter(TokenStream input) {super(input);}
|
||||
protected Token process(Token t) throws IOException {
|
||||
Token tok = read();
|
||||
OUT: while (tok != null && tok.getPositionIncrement()==0) {
|
||||
if (null != t) {
|
||||
write(t);
|
||||
t = null;
|
||||
}
|
||||
boolean dup=false;
|
||||
IN: for (Token outTok : output()) {
|
||||
if (outTok.termText().equals(tok.termText())) {
|
||||
dup=true;
|
||||
break IN;
|
||||
}
|
||||
}
|
||||
if (!dup)
|
||||
write(tok);
|
||||
tok = read();
|
||||
}
|
||||
if (tok != null) pushBack(tok);
|
||||
return t;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
/**
|
||||
* Copyright 2006 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class RemoveDuplicatesTokenFilterFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new RemoveDuplicatesTokenFilter(input);
|
||||
}
|
||||
}
|
|
@ -17,6 +17,9 @@
|
|||
package org.apache.solr;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.solr.search.*;
|
||||
import org.apache.solr.request.*;
|
||||
import org.apache.solr.util.*;
|
||||
import org.apache.solr.schema.*;
|
||||
|
@ -200,6 +203,16 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
|
|||
);
|
||||
}
|
||||
|
||||
/** @see TestRemoveDuplicatesTokenFilter */
|
||||
public void testRemoveDuplicatesTokenFilter() {
|
||||
Query q = QueryParsing.parseQuery("TV", "dedup",
|
||||
h.getCore().getSchema());
|
||||
assertTrue("not boolean?", q instanceof BooleanQuery);
|
||||
assertEquals("unexpected number of stemmed synonym tokens",
|
||||
2, ((BooleanQuery) q).getClauses().length);
|
||||
}
|
||||
|
||||
|
||||
public void testTermVectorFields() {
|
||||
|
||||
IndexSchema ischema = new IndexSchema(getSchemaFile());
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
/**
|
||||
* Copyright 2006 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
/**
|
||||
* Test that BufferedTokenStream behaves as advertized in subclasses.
|
||||
*/
|
||||
public class TestBufferedTokenStream extends TestCase {
|
||||
|
||||
/** Example of a class implementing the rule "A" "B" => "Q" "B" */
|
||||
public static class AB_Q_Stream extends BufferedTokenStream {
|
||||
public AB_Q_Stream(TokenStream input) {super(input);}
|
||||
protected Token process(Token t) throws IOException {
|
||||
if ("A".equals(t.termText())) {
|
||||
Token t2 = read();
|
||||
if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
|
||||
if (t2!=null) pushBack(t2);
|
||||
}
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
/** Example of a class implementing "A" "B" => "A" "A" "B" */
|
||||
public static class AB_AAB_Stream extends BufferedTokenStream {
|
||||
public AB_AAB_Stream(TokenStream input) {super(input);}
|
||||
protected Token process(Token t) throws IOException {
|
||||
if ("A".equals(t.termText()) && "B".equals(peek(1).termText()))
|
||||
write(t);
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
public static String tsToString(TokenStream in) throws IOException {
|
||||
StringBuffer out = new StringBuffer();
|
||||
Token t = in.next();
|
||||
if (null != t)
|
||||
out.append(t.termText());
|
||||
|
||||
for (t = in.next(); null != t; t = in.next()) {
|
||||
out.append(" ").append(t.termText());
|
||||
}
|
||||
in.close();
|
||||
return out.toString();
|
||||
}
|
||||
|
||||
public void testABQ() throws Exception {
|
||||
final String input = "How now A B brown A cow B like A B thing?";
|
||||
final String expected = "How now Q B brown A cow B like Q B thing?";
|
||||
TokenStream ts = new AB_Q_Stream
|
||||
(new WhitespaceTokenizer(new StringReader(input)));
|
||||
final String actual = tsToString(ts);
|
||||
//System.out.println(actual);
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
|
||||
public void testABAAB() throws Exception {
|
||||
final String input = "How now A B brown A cow B like A B thing?";
|
||||
final String expected = "How now A A B brown A cow B like A A B thing?";
|
||||
TokenStream ts = new AB_AAB_Stream
|
||||
(new WhitespaceTokenizer(new StringReader(input)));
|
||||
final String actual = tsToString(ts);
|
||||
//System.out.println(actual);
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
/**
|
||||
* Copyright 2006 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TestRemoveDuplicatesTokenFilter extends TestCase {
|
||||
|
||||
public static Token tok(int pos, String t, int start, int end) {
|
||||
Token tok = new Token(t,start,end);
|
||||
tok.setPositionIncrement(pos);
|
||||
return tok;
|
||||
}
|
||||
public static Token tok(int pos, String t) {
|
||||
return tok(pos, t, 0,0);
|
||||
}
|
||||
|
||||
public void testDups(final String expected, final Token... tokens)
|
||||
throws Exception {
|
||||
|
||||
final Iterator<Token> toks = Arrays.asList(tokens).iterator();
|
||||
|
||||
final TokenStream ts = new RemoveDuplicatesTokenFilter
|
||||
(new TokenStream() {
|
||||
public Token next() { return toks.hasNext() ? toks.next() : null; }
|
||||
});
|
||||
|
||||
final String actual = TestBufferedTokenStream.tsToString(ts);
|
||||
assertEquals(expected + " != " + actual, expected, actual);
|
||||
|
||||
}
|
||||
|
||||
public void testNoDups() throws Exception {
|
||||
|
||||
testDups("A B B C D E"
|
||||
,tok(1,"A", 0, 4)
|
||||
,tok(1,"B", 5, 10)
|
||||
,tok(1,"B",11, 15)
|
||||
,tok(1,"C",16, 20)
|
||||
,tok(0,"D",16, 20)
|
||||
,tok(1,"E",21, 25)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void testSimpleDups() throws Exception {
|
||||
|
||||
testDups("A B C D E"
|
||||
,tok(1,"A", 0, 4)
|
||||
,tok(1,"B", 5, 10)
|
||||
,tok(0,"B",11, 15)
|
||||
,tok(1,"C",16, 20)
|
||||
,tok(0,"D",16, 20)
|
||||
,tok(1,"E",21, 25)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
public void testComplexDups() throws Exception {
|
||||
|
||||
testDups("A B C D E F G H I J K"
|
||||
,tok(1,"A")
|
||||
,tok(1,"B")
|
||||
,tok(0,"B")
|
||||
,tok(1,"C")
|
||||
,tok(1,"D")
|
||||
,tok(0,"D")
|
||||
,tok(0,"D")
|
||||
,tok(1,"E")
|
||||
,tok(1,"F")
|
||||
,tok(0,"F")
|
||||
,tok(1,"G")
|
||||
,tok(0,"H")
|
||||
,tok(0,"H")
|
||||
,tok(1,"I")
|
||||
,tok(1,"J")
|
||||
,tok(0,"K")
|
||||
,tok(0,"J")
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -226,6 +226,19 @@
|
|||
<filter name="syn" class="solr.SynonymFilterFactory" synonyms="synonyms.txt"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<!-- Demonstrates How RemoveDuplicatesTokenFilter makes stemmed
|
||||
synonyms "better"
|
||||
-->
|
||||
<fieldtype name="dedup" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymFilterFactory"
|
||||
synonyms="synonyms.txt" expand="true" />
|
||||
<filter class="solr.EnglishPorterFilterFactory"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="unstored" class="solr.StrField" indexed="true" stored="false"/>
|
||||
|
||||
|
@ -296,6 +309,7 @@
|
|||
<field name="stopfilt" type="stopfilt" indexed="true" stored="true"/>
|
||||
<field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
|
||||
<field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
|
||||
<field name="dedup" type="dedup" indexed="true" stored="true"/>
|
||||
|
||||
|
||||
<field name="subword" type="subword" indexed="true" stored="true"/>
|
||||
|
|
|
@ -3,4 +3,6 @@ b => b1 b2
|
|||
c => c1,c2
|
||||
a\=>a => b\=>b
|
||||
a\,a => b\,b
|
||||
foo,bar,baz
|
||||
foo,bar,baz
|
||||
|
||||
Television,TV,Televisions
|
||||
|
|
Loading…
Reference in New Issue