mirror of https://github.com/apache/lucene.git
LUCENE-1058: Added new buffering tokenizer/token filter for reducing redundant analysis
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@599478 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 12d0cc180e
commit 261382fd4e
@@ -525,6 +525,12 @@ New features

    values, plus CustomScoreQuery for simple score (post) customization.
    (Yonik Seeley, Doron Cohen)

 9. LUCENE-1058: Added new TeeTokenFilter (like the UNIX 'tee' command) and
    SinkTokenizer, which can be used to share tokens between two or more
    Fields such that the other Fields do not have to go through the whole
    analysis process over again. For instance, if you have two Fields that
    share all the same analysis steps except that one lowercases tokens and
    the other does not, you can coordinate the operations between the two
    using the TeeTokenFilter and the SinkTokenizer. See TeeSinkTokenTest.java
    for examples.
    (Grant Ingersoll, Michael Busch, Yonik Seeley)

Optimizations

 1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions
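A minimal sketch of the lowercasing scenario the CHANGES entry above describes: tokenize once, then feed one exact-case field and one lowercased field. This is illustrative usage of the API added in this commit, not part of the commit itself; the class, method, and field names are invented, and the Field(String, TokenStream) constructor is assumed from its use in the TeeTokenFilter Javadoc below.

import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.TeeTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class SharedAnalysisExample {
  public static Document makeDoc(String text) {
    SinkTokenizer sink = new SinkTokenizer(null);
    // The tee copies every token into the sink as the "exact" stream is consumed.
    TokenStream exact = new TeeTokenFilter(
        new WhitespaceTokenizer(new StringReader(text)), sink);
    // The sink replays the cached tokens; only the lowercasing step differs
    // between the two fields, so tokenization runs exactly once.
    TokenStream lower = new LowerCaseFilter(sink);

    Document doc = new Document();
    // Add the tee'd field first: the sink only holds tokens after the
    // source stream has been fully consumed by the indexer.
    doc.add(new Field("exact", exact));
    doc.add(new Field("lower", lower));
    return doc;
  }
}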
SinkTokenizer.java (new file)
@@ -0,0 +1,60 @@
package org.apache.lucene.analysis;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;


/**
 * A SinkTokenizer can be used to cache Tokens for use in an Analyzer.
 *
 * @see TeeTokenFilter
 **/
public class SinkTokenizer extends Tokenizer {
  protected List/*<Token>*/ lst = new ArrayList/*<Token>*/();
  protected Iterator/*<Token>*/ iter;

  public SinkTokenizer(List/*<Token>*/ input) {
    this.lst = input;
    if (this.lst == null) this.lst = new ArrayList/*<Token>*/();
  }

  /**
   * Only valid if the tokens have not yet been consumed,
   * i.e. if this tokenizer is not part of another TokenStream.
   *
   * @return A List of {@link org.apache.lucene.analysis.Token}s
   */
  public List/*<Token>*/ getTokens() {
    return lst;
  }

  /**
   * Ignores the input result Token and returns the next cached Token.
   *
   * @param result ignored; the sink replays its own cached Tokens
   * @return The next {@link org.apache.lucene.analysis.Token} in the sink, or null when the sink is exhausted
   * @throws IOException
   */
  public Token next(Token result) throws IOException {
    if (iter == null) iter = lst.iterator();
    return iter.hasNext() ? (Token) iter.next() : null;
  }

  /**
   * Override this method to cache only certain tokens, or new tokens based
   * on the old tokens.
   *
   * @param t The {@link org.apache.lucene.analysis.Token} to add to the sink
   */
  public void add(Token t) {
    if (t == null) return;
    // clone so later reuse of t by the source stream cannot corrupt the cache
    lst.add((Token) t.clone());
  }

  public void reset() throws IOException {
    iter = lst.iterator();
  }
}
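The add() hook above supports caching transformed tokens as well as filtering. A small sketch of the "new tokens based on the old tokens" case; it assumes the Token(String, int, int) constructor and the termText()/offset accessors of the Token class of this era, and the lowercasing transform itself is invented for the example:

SinkTokenizer lowercasedSink = new SinkTokenizer(null) {
  public void add(Token t) {
    if (t == null) return;
    // cache a new Token derived from the old one instead of a plain clone;
    // lst is protected in SinkTokenizer, so subclasses may append directly
    lst.add(new Token(t.termText().toLowerCase(),
                      t.startOffset(), t.endOffset()));
  }
};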
TeeTokenFilter.java (new file)
@@ -0,0 +1,54 @@
package org.apache.lucene.analysis;

import java.io.IOException;


/**
 * Works in conjunction with the SinkTokenizer to provide the ability to set aside tokens
 * that have already been analyzed. This is useful in situations where multiple fields share
 * many common analysis steps and then go their separate ways.
 * <p/>
 * It is also useful for doing things like entity extraction or proper noun analysis as
 * part of the analysis workflow and saving off those tokens for use in another field.
 *
 * <pre>
SinkTokenizer sink1 = new SinkTokenizer(null);
SinkTokenizer sink2 = new SinkTokenizer(null);

TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);

TokenStream final1 = new LowerCaseFilter(source1);
TokenStream final2 = source2;
TokenStream final3 = new EntityDetect(sink1);
TokenStream final4 = new URLDetect(sink2);

d.add(new Field("f1", final1));
d.add(new Field("f2", final2));
d.add(new Field("f3", final3));
d.add(new Field("f4", final4));
 * </pre>
 * In this example, sink1 and sink2 will both get tokens from both reader1 and reader2 after
 * the WhitespaceTokenizer, and now we can further wrap any of these in extra analysis, and
 * more "sources" can be inserted if desired. Note that the EntityDetect and URLDetect
 * TokenStreams are for the example only and do not currently exist in Lucene.
 * <p/>
 *
 * See http://issues.apache.org/jira/browse/LUCENE-1058
 * @see SinkTokenizer
 **/
public class TeeTokenFilter extends TokenFilter {
  SinkTokenizer sink;

  public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
    super(input);
    this.sink = sink;
  }

  public Token next(Token result) throws IOException {
    Token t = input.next(result);
    // offer every token to the sink; SinkTokenizer.add() ignores null
    sink.add(t);
    return t;
  }
}
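The EntityDetect and URLDetect streams in the Javadoc above are placeholders that do not exist in Lucene. A minimal stand-in for one of them, assuming nothing beyond the TokenFilter API used in this commit; the URL heuristic is invented for the example:

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

public class URLDetect extends TokenFilter {
  public URLDetect(TokenStream input) {
    super(input);
  }

  public Token next(Token result) throws IOException {
    // pass through only tokens that look like URLs; drop everything else
    for (Token t = input.next(result); t != null; t = input.next(result)) {
      if (t.termText().indexOf("://") != -1) {
        return t;
      }
    }
    return null;
  }
}

Wrapping a sink, as in the Javadoc example (new URLDetect(sink2)), works because SinkTokenizer is itself a TokenStream.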
TeeSinkTokenTest.java (new file)
@@ -0,0 +1,138 @@
package org.apache.lucene.analysis;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;

import java.io.IOException;
import java.io.StringReader;

/**
 * Tests for the TeeTokenFilter and SinkTokenizer
 */
public class TeeSinkTokenTest extends TestCase {
  protected StringBuffer buffer1;
  protected StringBuffer buffer2;
  protected String[] tokens1;
  protected String[] tokens2;

  public TeeSinkTokenTest(String s) {
    super(s);
  }

  protected void setUp() {
    tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
    tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
    buffer1 = new StringBuffer();
    for (int i = 0; i < tokens1.length; i++) {
      buffer1.append(tokens1[i]).append(' ');
    }
    buffer2 = new StringBuffer();
    for (int i = 0; i < tokens2.length; i++) {
      buffer2.append(tokens2[i]).append(' ');
    }
  }

  protected void tearDown() {
  }

  public void test() throws IOException {
    // cache only the occurrences of "The", in any case
    SinkTokenizer sink1 = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.termText().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
    Token token = null;
    int i = 0;
    while ((token = source.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
    i = 0;
    while ((token = sink1.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The"));
      i++;
    }
    assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
  }

  public void testMultipleSources() throws Exception {
    SinkTokenizer theDetector = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.termText().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    SinkTokenizer dogDetector = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.termText().equalsIgnoreCase("Dogs")) {
          super.add(t);
        }
      }
    };
    // source1 is cached so it can be reset and consumed a second time below
    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
    Token token = null;
    int i = 0;
    while ((token = source1.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens1[i], token.termText().equals(tokens1[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
    i = 0;
    while ((token = source2.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens2[i], token.termText().equals(tokens2[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
    // both sinks have now seen the tokens from both sources
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
    i = 0;
    while ((token = theDetector.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + "The", token.termText().equalsIgnoreCase("The"));
      i++;
    }
    assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
    i = 0;
    while ((token = dogDetector.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + "Dogs", token.termText().equalsIgnoreCase("Dogs"));
      i++;
    }
    assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    while ((token = lowerCasing.next()) != null) {
      assertTrue(token.termText() + " is not equal to " + tokens1[i].toLowerCase(), token.termText().equals(tokens1[i].toLowerCase()));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
  }
}