mirror of https://github.com/apache/lucene.git
LUCENE-2413: consolidate TeeSinkTokenFilter to contrib/analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940633 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 5f68c89e2d
commit edcc0666c9
@@ -10,6 +10,7 @@ Changes in backwards compatibility policy
- o.a.l.analysis.ISOLatin1AccentFilter -> o.a.l.analysis.miscellaneous.ISOLatin1AccentFilter
- o.a.l.analysis.LengthFilter -> o.a.l.analysis.miscellaneous.LengthFilter
- o.a.l.analysis.PerFieldAnalyzerWrapper -> o.a.l.analysis.miscellaneous.PerFieldAnalyzerWrapper
- o.a.l.analysis.TeeSinkTokenFilter -> o.a.l.analysis.sinks.TeeSinkTokenFilter
... (in progress)

* LUCENE-1458, LUCENE-2111, LUCENE-2354: Changes from flexible indexing:
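For code that tracked this move, the class itself is unchanged; assuming callers reference it only through its fully qualified import, the migration is the one-line change sketched here (a sketch, not part of the commit itself):

    // before this commit: TeeSinkTokenFilter lived in Lucene core
    import org.apache.lucene.analysis.TeeSinkTokenFilter;

    // after this commit: it lives in contrib/analyzers under the sinks package
    import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;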
@@ -1,245 +0,0 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.lang.ref.WeakReference;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

/**
 * This TokenFilter provides the ability to set aside attribute states
 * that have already been analyzed.  This is useful in situations where multiple fields share
 * many common analysis steps and then go their separate ways.
 * <p/>
 * It is also useful for doing things like entity extraction or proper noun analysis as
 * part of the analysis workflow and saving off those tokens for use in another field.
 *
 * <pre>
TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();

TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
source2.addSinkTokenStream(sink1);
source2.addSinkTokenStream(sink2);

TokenStream final1 = new LowerCaseFilter(source1);
TokenStream final2 = source2;
TokenStream final3 = new EntityDetect(sink1);
TokenStream final4 = new URLDetect(sink2);

d.add(new Field("f1", final1));
d.add(new Field("f2", final2));
d.add(new Field("f3", final3));
d.add(new Field("f4", final4));
 * </pre>
 * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
 * <code>reader1</code> and <code>reader2</code> after the whitespace tokenizer,
 * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
 * It is important that tees are consumed before sinks (in the above example, the tee's field names must be
 * less than the sink's field names). If you are not sure which stream is consumed first, you can simply
 * add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}.
 * This TokenFilter is exhausted after that. In that case, change
 * the example above to:
 * <pre>
...
TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
TokenStream final2 = source2.newSinkTokenStream();
sink1.consumeAllTokens();
sink2.consumeAllTokens();
...
 * </pre>
 * In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
 * <p>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
 */
public final class TeeSinkTokenFilter extends TokenFilter {
  private final List<WeakReference<SinkTokenStream>> sinks = new LinkedList<WeakReference<SinkTokenStream>>();

  /**
   * Instantiates a new TeeSinkTokenFilter.
   */
  public TeeSinkTokenFilter(TokenStream input) {
    super(input);
  }

  /**
   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
   */
  public SinkTokenStream newSinkTokenStream() {
    return newSinkTokenStream(ACCEPT_ALL_FILTER);
  }

  /**
   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
   * that pass the supplied filter.
   * @see SinkFilter
   */
  public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
    SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
    this.sinks.add(new WeakReference<SinkTokenStream>(sink));
    return sink;
  }

  /**
   * Adds a {@link SinkTokenStream} created by another <code>TeeSinkTokenFilter</code>
   * to this one. The supplied stream will also receive all consumed tokens.
   * This method can be used to pass tokens from two different tees to one sink.
   */
  public void addSinkTokenStream(final SinkTokenStream sink) {
    // check that sink has correct factory
    if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
      throw new IllegalArgumentException("The supplied sink is not compatible to this tee");
    }
    // add eventually missing attribute impls to the existing sink
    for (Iterator<AttributeImpl> it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
      sink.addAttributeImpl(it.next());
    }
    this.sinks.add(new WeakReference<SinkTokenStream>(sink));
  }

  /**
   * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
   * when itself is consumed. To be sure that all tokens from the input
   * stream are passed to the sinks, you can call this method.
   * This instance is exhausted after this, but all sinks are instantly available.
   */
  public void consumeAllTokens() throws IOException {
    while (incrementToken());
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      // capture state lazily - maybe no SinkFilter accepts this state
      AttributeSource.State state = null;
      for (WeakReference<SinkTokenStream> ref : sinks) {
        final SinkTokenStream sink = ref.get();
        if (sink != null) {
          if (sink.accept(this)) {
            if (state == null) {
              state = this.captureState();
            }
            sink.addState(state);
          }
        }
      }
      return true;
    }

    return false;
  }

  @Override
  public final void end() throws IOException {
    super.end();
    AttributeSource.State finalState = captureState();
    for (WeakReference<SinkTokenStream> ref : sinks) {
      final SinkTokenStream sink = ref.get();
      if (sink != null) {
        sink.setFinalState(finalState);
      }
    }
  }

  /**
   * A filter that decides which {@link AttributeSource} states to store in the sink.
   */
  public static abstract class SinkFilter {
    /**
     * Returns true iff the current state of the passed-in {@link AttributeSource} shall be stored
     * in the sink.
     */
    public abstract boolean accept(AttributeSource source);

    /**
     * Called by {@link SinkTokenStream#reset()}. This method does nothing by default
     * and can optionally be overridden.
     */
    public void reset() throws IOException {
      // nothing to do; can be overridden
    }
  }

  public static final class SinkTokenStream extends TokenStream {
    private final List<AttributeSource.State> cachedStates = new LinkedList<AttributeSource.State>();
    private AttributeSource.State finalState;
    private Iterator<AttributeSource.State> it = null;
    private SinkFilter filter;

    private SinkTokenStream(AttributeSource source, SinkFilter filter) {
      super(source);
      this.filter = filter;
    }

    private boolean accept(AttributeSource source) {
      return filter.accept(source);
    }

    private void addState(AttributeSource.State state) {
      if (it != null) {
        throw new IllegalStateException("The tee must be consumed before sinks are consumed.");
      }
      cachedStates.add(state);
    }

    private void setFinalState(AttributeSource.State finalState) {
      this.finalState = finalState;
    }

    @Override
    public final boolean incrementToken() throws IOException {
      // lazy init the iterator
      if (it == null) {
        it = cachedStates.iterator();
      }

      if (!it.hasNext()) {
        return false;
      }

      AttributeSource.State state = it.next();
      restoreState(state);
      return true;
    }

    @Override
    public final void end() throws IOException {
      if (finalState != null) {
        restoreState(finalState);
      }
    }

    @Override
    public final void reset() {
      it = cachedStates.iterator();
    }
  }

  private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
    @Override
    public boolean accept(AttributeSource source) {
      return true;
    }
  };

}
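As a hedged sketch of the SinkFilter contract documented above, and using the post-move sinks package introduced by this commit, a filter that stores only states whose term matches a given word could look like the following (the WordSinkFilter name is illustrative and not part of the commit; the anonymous filters in the test file below follow the same pattern):

    import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.AttributeSource;

    // Illustrative only: store a state in the sink when its term equals the given word.
    class WordSinkFilter extends TeeSinkTokenFilter.SinkFilter {
      private final String word;

      WordSinkFilter(String word) {
        this.word = word;
      }

      @Override
      public boolean accept(AttributeSource source) {
        TermAttribute termAtt = source.getAttribute(TermAttribute.class);
        return termAtt.term().equalsIgnoreCase(word);
      }
    }

Wired up as tee.newSinkTokenStream(new WordSinkFilter("the")), the returned sink replays only the accepted states once the tee has been consumed.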
@@ -1,272 +0,0 @@
package org.apache.lucene.analysis;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.English;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Tests for TeeSinkTokenFilter.
 */
public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
  protected StringBuilder buffer1;
  protected StringBuilder buffer2;
  protected String[] tokens1;
  protected String[] tokens2;

  public TestTeeSinkTokenFilter(String s) {
    super(s);
  }

  @Override
  protected void setUp() throws Exception {
    super.setUp();
    tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
    tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
    buffer1 = new StringBuilder();

    for (int i = 0; i < tokens1.length; i++) {
      buffer1.append(tokens1[i]).append(' ');
    }
    buffer2 = new StringBuilder();
    for (int i = 0; i < tokens2.length; i++) {
      buffer2.append(tokens2[i]).append(' ');
    }
  }

  static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() {
    @Override
    public boolean accept(AttributeSource a) {
      TermAttribute termAtt = a.getAttribute(TermAttribute.class);
      return termAtt.term().equalsIgnoreCase("The");
    }
  };

  static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() {
    @Override
    public boolean accept(AttributeSource a) {
      TermAttribute termAtt = a.getAttribute(TermAttribute.class);
      return termAtt.term().equalsIgnoreCase("Dogs");
    }
  };

  public void testGeneral() throws IOException {
    final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())));
    final TokenStream sink1 = source.newSinkTokenStream();
    final TokenStream sink2 = source.newSinkTokenStream(theFilter);
    int i = 0;
    TermAttribute termAtt = source.getAttribute(TermAttribute.class);
    while (source.incrementToken()) {
      assertEquals(tokens1[i], termAtt.term());
      i++;
    }
    assertEquals(tokens1.length, i);

    i = 0;
    termAtt = sink1.getAttribute(TermAttribute.class);
    while (sink1.incrementToken()) {
      assertEquals(tokens1[i], termAtt.term());
      i++;
    }
    assertEquals(tokens1.length, i);

    i = 0;
    termAtt = sink2.getAttribute(TermAttribute.class);
    while (sink2.incrementToken()) {
      assertTrue(termAtt.term().equalsIgnoreCase("The"));
      i++;
    }
    assertEquals("there should be two times 'the' in the stream", 2, i);
  }

  public void testMultipleSources() throws Exception {
    final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())));
    final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
    final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
    final TokenStream source1 = new CachingTokenFilter(tee1);

    final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())));
    tee2.addSinkTokenStream(dogDetector);
    tee2.addSinkTokenStream(theDetector);
    final TokenStream source2 = tee2;

    int i = 0;
    TermAttribute termAtt = source1.getAttribute(TermAttribute.class);
    while (source1.incrementToken()) {
      assertEquals(tokens1[i], termAtt.term());
      i++;
    }
    assertEquals(tokens1.length, i);
    i = 0;
    termAtt = source2.getAttribute(TermAttribute.class);
    while (source2.incrementToken()) {
      assertEquals(tokens2[i], termAtt.term());
      i++;
    }
    assertEquals(tokens2.length, i);
    i = 0;
    termAtt = theDetector.getAttribute(TermAttribute.class);
    while (theDetector.incrementToken()) {
      assertTrue("'" + termAtt.term() + "' is not equal to 'The'", termAtt.term().equalsIgnoreCase("The"));
      i++;
    }
    assertEquals("there must be 4 times 'The' in the stream", 4, i);
    i = 0;
    termAtt = dogDetector.getAttribute(TermAttribute.class);
    while (dogDetector.incrementToken()) {
      assertTrue("'" + termAtt.term() + "' is not equal to 'Dogs'", termAtt.term().equalsIgnoreCase("Dogs"));
      i++;
    }
    assertEquals("there must be 2 times 'Dog' in the stream", 2, i);

    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    termAtt = lowerCasing.getAttribute(TermAttribute.class);
    while (lowerCasing.incrementToken()) {
      assertEquals(tokens1[i].toLowerCase(), termAtt.term());
      i++;
    }
    assertEquals(i, tokens1.length);
  }

  /**
   * Not an explicit test, just useful to print out some info on performance
   *
   * @throws Exception
   */
  public void performance() throws Exception {
    int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
    int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
    for (int k = 0; k < tokCount.length; k++) {
      StringBuilder buffer = new StringBuilder();
      System.out.println("-----Tokens: " + tokCount[k] + "-----");
      for (int i = 0; i < tokCount[k]; i++) {
        buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
      }
      //make sure we produce the same tokens
      TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(buffer.toString()))));
      TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
      teeStream.consumeAllTokens();
      TokenStream stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(buffer.toString()))), 100);
      TermAttribute tfTok = stream.addAttribute(TermAttribute.class);
      TermAttribute sinkTok = sink.addAttribute(TermAttribute.class);
      for (int i = 0; stream.incrementToken(); i++) {
        assertTrue(sink.incrementToken());
        assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
      }

      //simulate two fields, each being analyzed once, for 20 documents
      for (int j = 0; j < modCounts.length; j++) {
        int tfPos = 0;
        long start = System.currentTimeMillis();
        for (int i = 0; i < 20; i++) {
          stream = new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(buffer.toString())));
          PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            tfPos += posIncrAtt.getPositionIncrement();
          }
          stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(buffer.toString()))), modCounts[j]);
          posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            tfPos += posIncrAtt.getPositionIncrement();
          }
        }
        long finish = System.currentTimeMillis();
        System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
        int sinkPos = 0;
        //simulate one field with one sink
        start = System.currentTimeMillis();
        for (int i = 0; i < 20; i++) {
          teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(buffer.toString()))));
          sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j]));
          PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
          while (teeStream.incrementToken()) {
            sinkPos += posIncrAtt.getPositionIncrement();
          }
          //System.out.println("Modulo--------");
          posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class);
          while (sink.incrementToken()) {
            sinkPos += posIncrAtt.getPositionIncrement();
          }
        }
        finish = System.currentTimeMillis();
        System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
        assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);

      }
      System.out.println("- End Tokens: " + tokCount[k] + "-----");
    }
  }


  class ModuloTokenFilter extends TokenFilter {

    int modCount;

    ModuloTokenFilter(TokenStream input, int mc) {
      super(input);
      modCount = mc;
    }

    int count = 0;

    // return every modCount-th token
    @Override
    public boolean incrementToken() throws IOException {
      boolean hasNext;
      for (hasNext = input.incrementToken();
           hasNext && count % modCount != 0;
           hasNext = input.incrementToken()) {
        count++;
      }
      count++;
      return hasNext;
    }
  }

  class ModuloSinkFilter extends TeeSinkTokenFilter.SinkFilter {
    int count = 0;
    int modCount;

    ModuloSinkFilter(int mc) {
      modCount = mc;
    }

    @Override
    public boolean accept(AttributeSource a) {
      boolean b = (a != null && count % modCount == 0);
      count++;
      return b;
    }

  }
}
@@ -36,7 +36,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

@@ -4189,32 +4188,6 @@ public class TestIndexWriter extends LuceneTestCase {
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer();
    IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd ")));
    TokenStream sink = tee.newSinkTokenStream();
    Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStopFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
@@ -21,7 +21,6 @@ import java.text.DateFormat;
import java.text.ParseException;
import java.util.Date;

import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

@@ -31,7 +30,7 @@ import org.apache.lucene.util.AttributeSource;
 * <p/>
 *
 **/
public class DateRecognizerSinkFilter extends SinkFilter {
public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter {
  public static final String DATE_TYPE = "date";

  protected DateFormat dateFormat;
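A minimal usage sketch for the updated filter, assuming the same wiring that DateRecognizerSinkTokenizerTest uses later in this commit (the literal text and date format here are illustrative):

    DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.US));
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("released on 7/11/2006")));
    // Only tokens the filter recognizes as dates are replayed by this sink.
    TeeSinkTokenFilter.SinkTokenStream dates = tee.newSinkTokenStream(sinkFilter);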
@@ -1,4 +1,4 @@
package org.apache.lucene.analysis;
package org.apache.lucene.analysis.sinks;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -23,6 +23,8 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
@@ -19,14 +19,13 @@ package org.apache.lucene.analysis.sinks;

import java.io.IOException;

import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkFilter;
import org.apache.lucene.util.AttributeSource;

/**
 * Counts the tokens as they go by and saves to the internal list those between the range of lower and upper, exclusive of upper
 *
 **/
public class TokenRangeSinkFilter extends SinkFilter {
public class TokenRangeSinkFilter extends TeeSinkTokenFilter.SinkFilter {
  private int lower;
  private int upper;
  private int count;
@@ -17,11 +17,10 @@ package org.apache.lucene.analysis.sinks;
 * limitations under the License.
 */

import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkFilter;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

public class TokenTypeSinkFilter extends SinkFilter {
public class TokenTypeSinkFilter extends TeeSinkTokenFilter.SinkFilter {
  private String typeToMatch;
  private TypeAttribute typeAtt;
@@ -22,9 +22,7 @@ import java.text.SimpleDateFormat;
import java.util.Locale;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;

public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {

@@ -37,7 +35,7 @@ public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {
    DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.US));
    String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
    SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
    TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
    int count = 0;

    tee.reset();
@@ -1,4 +1,4 @@
package org.apache.lucene.analysis;
package org.apache.lucene.analysis.sinks;

/**
 * Copyright 2004 The Apache Software Foundation

@@ -16,10 +16,26 @@ package org.apache.lucene.analysis;
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.English;
import java.io.IOException;

@@ -72,6 +88,33 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
    }
  };

  // LUCENE-1448
  // TODO: instead of testing it this way, we can test
  // with BaseTokenStreamTestCase now...
  public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd ")));
    TokenStream sink = tee.newSinkTokenStream();
    Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  public void testGeneral() throws IOException {
    final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer1.toString())));
@@ -20,9 +20,7 @@ import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;

public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {

@@ -35,7 +33,7 @@ public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
    TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
    String test = "The quick red fox jumped over the lazy brown dogs";
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
    SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);
    TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);

    int count = 0;
    tee.reset();
@@ -20,11 +20,9 @@ import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

@@ -39,7 +37,7 @@ public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
    String test = "The quick red fox jumped over the lazy brown dogs";

    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
    SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);
    TeeSinkTokenFilter.SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);

    boolean seenDogs = false;
@@ -39,7 +39,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

@@ -4183,32 +4182,6 @@ public class TestIndexWriter extends LuceneTestCase {
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd ")));
    TokenStream sink = tee.newSinkTokenStream();
    Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStopFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();