mirror of https://github.com/apache/lucene.git
LUCENE-6973: Improve TeeSinkTokenFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1724789 13f79535-47bb-0310-9956-ffa450edef68
parent d804cc620d
commit 073dbdc17e
@@ -144,6 +144,16 @@ API Changes
  instead of extending DocIdSetIterator. asTwoPhaseIterator() has been renamed
  to twoPhaseIterator() for consistency. (Adrien Grand)

* LUCENE-6973: TeeSinkTokenFilter no longer accepts a SinkFilter (the latter
  has been removed). If you wish to filter the sinks, you can wrap them with
  any other TokenFilter (e.g. a FilteringTokenFilter). Also, you can no longer
  add a SinkTokenStream to an existing TeeSinkTokenFilter. If you need to
  share multiple streams with a single sink, chain them with multiple
  TeeSinkTokenFilters.
  DateRecognizerSinkFilter was renamed to DateRecognizerFilter and moved under
  analysis/common. TokenTypeSinkFilter was removed (use TypeTokenFilter instead).
  TokenRangeSinkFilter was removed. (Shai Erera, Uwe Schindler)

Optimizations

* LUCENE-6951: Improve GeoPointInPolygonQuery using point orientation based
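A minimal before/after sketch of the migration described in this entry (hedged: `tokenizer`, `someSinkFilter`, and `dateFormat` are illustrative placeholders, not from this commit):

    // Before (API removed by this change):
    //   TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenizer);
    //   TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream(someSinkFilter);

    // After: every sink receives all tokens; filter by wrapping the sink
    // in any ordinary TokenFilter, e.g. a FilteringTokenFilter subclass.
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenizer);
    TokenStream sink = new DateRecognizerFilter(tee.newSinkTokenStream(), dateFormat);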
@@ -0,0 +1,61 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.text.DateFormat;
import java.text.ParseException;
import java.util.Locale;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;

/** Filters all tokens that cannot be parsed to a date, using the provided {@link DateFormat}. */
public class DateRecognizerFilter extends FilteringTokenFilter {

  public static final String DATE_TYPE = "date";

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final DateFormat dateFormat;

  /**
   * Uses {@link DateFormat#DEFAULT} and {@link Locale#ENGLISH} to create a {@link DateFormat} instance.
   */
  public DateRecognizerFilter(TokenStream input) {
    this(input, null);
  }

  public DateRecognizerFilter(TokenStream input, DateFormat dateFormat) {
    super(input);
    this.dateFormat = dateFormat != null ? dateFormat : DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ENGLISH);
  }

  @Override
  public boolean accept() {
    try {
      // We don't care about the date, just that the term can be parsed to one.
      dateFormat.parse(termAtt.toString());
      return true;
    } catch (ParseException e) {
      // This term is not a date.
    }

    return false;
  }

}
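A minimal usage sketch of the new filter (assumptions: the no-argument WhitespaceTokenizer from analysis/common, plus java.io.StringReader and java.text.SimpleDateFormat; the input text and pattern are illustrative):

    Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("released on 7/11/2006 worldwide"));
    TokenStream ts = new DateRecognizerFilter(tok, new SimpleDateFormat("MM/dd/yyyy", Locale.ENGLISH));
    // ts emits only "7/11/2006"; terms the DateFormat cannot parse are dropped.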
@@ -0,0 +1,84 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link DateRecognizerFilter}.
 *
 * <pre class="prettyprint">
 * <fieldType name="text_filter_none_date" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.DateRecognizerFilterFactory" datePattern="yyyy/MM/dd" locale="en-US" />
 *   </analyzer>
 * </fieldType>
 * </pre>
 *
 * <p>
 * The {@code datePattern} is optional. If omitted, {@link DateRecognizerFilter} will be created with the default
 * {@link DateFormat} of the given locale. The {@code locale} is optional and if omitted the filter will be created
 * with {@link Locale#ENGLISH}.
 */
public class DateRecognizerFilterFactory extends TokenFilterFactory {

  public static final String DATE_PATTERN = "datePattern";
  public static final String LOCALE = "locale";

  private final DateFormat dateFormat;
  private final Locale locale;

  /** Creates a new DateRecognizerFilterFactory */
  public DateRecognizerFilterFactory(Map<String,String> args) {
    super(args);
    this.locale = getLocale(get(args, LOCALE));
    this.dateFormat = getDataFormat(get(args, DATE_PATTERN));
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new DateRecognizerFilter(input, dateFormat);
  }

  private Locale getLocale(String localeStr) {
    if (localeStr == null) {
      return Locale.ENGLISH;
    } else {
      return new Locale.Builder().setLanguageTag(localeStr).build();
    }
  }

  public DateFormat getDataFormat(String datePattern) {
    if (datePattern != null) {
      return new SimpleDateFormat(datePattern, locale);
    } else {
      return DateFormat.getDateInstance(DateFormat.DEFAULT, locale);
    }
  }

}
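For programmatic construction, a hedged sketch mirroring the factory test added later in this commit (`upstream` is an illustrative placeholder). Note the locale must be a BCP 47 language tag, so "en-US" works while "en_US" throws:

    Map<String,String> args = new HashMap<>();
    args.put(DateRecognizerFilterFactory.LOCALE, "en-US");
    args.put(DateRecognizerFilterFactory.DATE_PATTERN, "MM/dd/yyyy");
    DateRecognizerFilterFactory factory = new DateRecognizerFilterFactory(args); // consumes args; leftovers throw
    TokenStream filtered = factory.create(upstream); // upstream: any existing TokenStream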
@@ -1,69 +0,0 @@
package org.apache.lucene.analysis.sinks;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.text.DateFormat;
import java.text.ParseException;
import java.util.Date;
import java.util.Locale;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Attempts to parse the {@link CharTermAttribute#buffer()} as a Date using a {@link java.text.DateFormat}.
 * If the value is a Date, it will add it to the sink.
 */
public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter {
  public static final String DATE_TYPE = "date";

  protected DateFormat dateFormat;
  protected CharTermAttribute termAtt;

  /**
   * Uses {@link java.text.DateFormat#getDateInstance(int, Locale)
   * DateFormat#getDateInstance(DateFormat.DEFAULT, Locale.ROOT)} as
   * the {@link java.text.DateFormat} object.
   */
  public DateRecognizerSinkFilter() {
    this(DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT));
  }

  public DateRecognizerSinkFilter(DateFormat dateFormat) {
    this.dateFormat = dateFormat;
  }

  @Override
  public boolean accept(AttributeSource source) {
    if (termAtt == null) {
      termAtt = source.addAttribute(CharTermAttribute.class);
    }
    try {
      Date date = dateFormat.parse(termAtt.toString()); // We don't care about the date, just that we can parse it as a date
      if (date != null) {
        return true;
      }
    } catch (ParseException e) {

    }

    return false;
  }

}
@@ -18,233 +18,155 @@ package org.apache.lucene.analysis.sinks;
 */

import java.io.IOException;
import java.lang.ref.WeakReference;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

/**
 * This TokenFilter provides the ability to set aside attribute states
 * that have already been analyzed. This is useful in situations where multiple fields share
 * many common analysis steps and then go their separate ways.
 * This TokenFilter provides the ability to set aside attribute states that have already been analyzed. This is useful
 * in situations where multiple fields share many common analysis steps and then go their separate ways.
 *
 * <p>
 * It is also useful for doing things like entity extraction or proper noun analysis as
 * part of the analysis workflow and saving off those tokens for use in another field.
 * It is also useful for doing things like entity extraction or proper noun analysis as part of the analysis workflow
 * and saving off those tokens for use in another field.
 * </p>
 *
 * <pre class="prettyprint">
TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(version, reader1));
TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();

TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(version, reader2));
source2.addSinkTokenStream(sink1);
source2.addSinkTokenStream(sink2);

TokenStream final1 = new LowerCaseFilter(version, source1);
TokenStream final2 = source2;
TokenStream final3 = new EntityDetect(sink1);
TokenStream final4 = new URLDetect(sink2);

d.add(new TextField("f1", final1, Field.Store.NO));
d.add(new TextField("f2", final2, Field.Store.NO));
d.add(new TextField("f3", final3, Field.Store.NO));
d.add(new TextField("f4", final4, Field.Store.NO));
 * TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer());
 * TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
 * TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
 *
 * TokenStream final1 = new LowerCaseFilter(source1);
 * TokenStream final2 = new EntityDetect(sink1);
 * TokenStream final3 = new URLDetect(sink2);
 *
 * d.add(new TextField("f1", final1));
 * d.add(new TextField("f2", final2));
 * d.add(new TextField("f3", final3));
 * </pre>
 * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
 * <code>reader1</code> and <code>reader2</code> after the whitespace tokenizer,
 * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
 * It is important that tees are consumed before sinks (in the above example, the field names must be
 * less than the sink's field names). If you are not sure which stream is consumed first, you can simply
 * add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}.
 * This TokenFilter is exhausted after this. Change the example above to:
 * <pre class="prettyprint">
...
TokenStream final1 = new LowerCaseFilter(version, source1.newSinkTokenStream());
TokenStream final2 = source2.newSinkTokenStream();
sink1.consumeAllTokens();
sink2.consumeAllTokens();
...
 * </pre>
 * In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
 * <p>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
 *
 * <p>
 * In this example, {@code sink1} and {@code sink2} will both get tokens from {@code source1} after whitespace
 * tokenization, and will further do additional token filtering, e.g. detect entities and URLs.
 * </p>
 *
 * <p>
 * <b>NOTE</b>: it is important that tees are consumed before sinks; therefore you should add them to the document
 * before the sinks. In the above example, <i>f1</i> is added before the other fields, and so by the time they are
 * processed, it has already been consumed, which is the correct way to index the three streams. If for some reason you
 * cannot ensure that, you should call {@link #consumeAllTokens()} before adding the sinks to document fields.
 */
public final class TeeSinkTokenFilter extends TokenFilter {
  private final List<WeakReference<SinkTokenStream>> sinks = new LinkedList<>();

  /**
   * Instantiates a new TeeSinkTokenFilter.
   */

  private final States cachedStates = new States();

  public TeeSinkTokenFilter(TokenStream input) {
    super(input);
  }

  /**
   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
   */
  public SinkTokenStream newSinkTokenStream() {
    return newSinkTokenStream(ACCEPT_ALL_FILTER);
  /** Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream. */
  public TokenStream newSinkTokenStream() {
    return new SinkTokenStream(this.cloneAttributes(), cachedStates);
  }

  /**
   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
   * that pass the supplied filter.
   * @see SinkFilter
   */
  public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
    SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
    this.sinks.add(new WeakReference<>(sink));
    return sink;
  }

  /**
   * Adds a {@link SinkTokenStream} created by another <code>TeeSinkTokenFilter</code>
   * to this one. The supplied stream will also receive all consumed tokens.
   * This method can be used to pass tokens from two different tees to one sink.
   */
  public void addSinkTokenStream(final SinkTokenStream sink) {
    // check that sink has correct factory
    if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
      throw new IllegalArgumentException("The supplied sink is not compatible to this tee");
    }
    // add eventually missing attribute impls to the existing sink
    for (Iterator<AttributeImpl> it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
      sink.addAttributeImpl(it.next());
    }
    this.sinks.add(new WeakReference<>(sink));
  }

  /**
   * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
   * when itself is consumed. To be sure that all tokens from the input
   * stream are passed to the sinks, you can call this method.
   * This instance is exhausted after this, but all sinks are instantly available.
   * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks when itself is consumed. To be sure that all
   * tokens from the input stream are passed to the sinks, you can call this method. This instance is exhausted after
   * this method returns, but all sinks are instantly available.
   */
  public void consumeAllTokens() throws IOException {
    while (incrementToken()) {}
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      // capture state lazily - maybe no SinkFilter accepts this state
      AttributeSource.State state = null;
      for (WeakReference<SinkTokenStream> ref : sinks) {
        final SinkTokenStream sink = ref.get();
        if (sink != null) {
          if (sink.accept(this)) {
            if (state == null) {
              state = this.captureState();
            }
            sink.addState(state);
          }
        }
      }
      cachedStates.add(captureState());
      return true;
    }

    return false;
  }

  @Override
  public final void end() throws IOException {
    super.end();
    AttributeSource.State finalState = captureState();
    for (WeakReference<SinkTokenStream> ref : sinks) {
      final SinkTokenStream sink = ref.get();
      if (sink != null) {
        sink.setFinalState(finalState);
      }
    }
    cachedStates.setFinalState(captureState());
  }

  /**
   * A filter that decides which {@link AttributeSource} states to store in the sink.
   */
  public static abstract class SinkFilter {
    /**
     * Returns true iff the current state of the passed-in {@link AttributeSource} shall be stored
     * in the sink.
     */
    public abstract boolean accept(AttributeSource source);

    /**
     * Called by {@link SinkTokenStream#reset()}. This method does nothing by default
     * and can optionally be overridden.
     */
    public void reset() throws IOException {
      // nothing to do; can be overridden
    }

  @Override
  public void reset() throws IOException {
    cachedStates.reset();
    super.reset();
  }

  /**
   * TokenStream output from a tee with optional filtering.
   */

  /** TokenStream output from a tee. */
  public static final class SinkTokenStream extends TokenStream {
    private final List<AttributeSource.State> cachedStates = new LinkedList<>();
    private AttributeSource.State finalState;
    private final States cachedStates;
    private Iterator<AttributeSource.State> it = null;
    private SinkFilter filter;

    private SinkTokenStream(AttributeSource source, SinkFilter filter) {

    private SinkTokenStream(AttributeSource source, States cachedStates) {
      super(source);
      this.filter = filter;
      this.cachedStates = cachedStates;
    }

    private boolean accept(AttributeSource source) {
      return filter.accept(source);
    }

    private void addState(AttributeSource.State state) {
      if (it != null) {
        throw new IllegalStateException("The tee must be consumed before sinks are consumed.");
      }
      cachedStates.add(state);
    }

    private void setFinalState(AttributeSource.State finalState) {
      this.finalState = finalState;
    }

    @Override
    public final boolean incrementToken() {
      // lazy init the iterator
      if (it == null) {
        it = cachedStates.iterator();
      }

      if (!it.hasNext()) {
        return false;
      }

      AttributeSource.State state = it.next();
      restoreState(state);
      return true;
    }

    @Override
    public final void end() {
    public void end() throws IOException {
      State finalState = cachedStates.getFinalState();
      if (finalState != null) {
        restoreState(finalState);
      }
    }

    @Override
    public final void reset() {
      it = cachedStates.iterator();
      it = cachedStates.getStates();
    }
  }

  private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
    @Override
    public boolean accept(AttributeSource source) {
      return true;

  /** A convenience wrapper for storing the cached states as well as the final state of the stream. */
  private static final class States {

    private final List<State> states = new ArrayList<>();
    private State finalState;

    public States() {}

    void setFinalState(State finalState) {
      this.finalState = finalState;
    }
  };

    State getFinalState() {
      return finalState;
    }

    void add(State state) {
      states.add(state);
    }

    Iterator<State> getStates() {
      return states.iterator();
    }

    void reset() {
      finalState = null;
      states.clear();
    }
  }

}
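A short sketch of the consumption-order rule from the rewritten javadoc (hedged: `doc` is an org.apache.lucene.document.Document and the field names are illustrative):

    TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer());
    TokenStream sink1 = source.newSinkTokenStream();
    TokenStream sink2 = source.newSinkTokenStream();

    // Add the tee's field first so it is consumed before the sinks...
    doc.add(new TextField("f1", source));
    doc.add(new TextField("f2", sink1));
    doc.add(new TextField("f3", sink2));
    // ...or, if you cannot guarantee the order, exhaust the tee up front:
    // source.consumeAllTokens();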
@@ -1,61 +0,0 @@
package org.apache.lucene.analysis.sinks;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.util.AttributeSource;

/**
 * Counts the tokens as they go by and saves to the internal list those between the range of lower and upper, exclusive of upper
 */
public class TokenRangeSinkFilter extends TeeSinkTokenFilter.SinkFilter {
  private int lower;
  private int upper;
  private int count;

  public TokenRangeSinkFilter(int lower, int upper) {
    if (lower < 1) {
      throw new IllegalArgumentException("lower must be greater than zero");
    }
    if (lower > upper) {
      throw new IllegalArgumentException("lower must not be greater than upper");
    }
    this.lower = lower;
    this.upper = upper;
  }

  @Override
  public boolean accept(AttributeSource source) {
    try {
      if (count >= lower && count < upper) {
        return true;
      }
      return false;
    } finally {
      count++;
    }
  }

  @Override
  public void reset() throws IOException {
    count = 0;
  }
}
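TokenRangeSinkFilter is removed without a direct replacement; an equivalent can be sketched as a FilteringTokenFilter, following the same pattern the commit applies to ModuloSinkFilter in the tests below (hypothetical class, not part of the commit):

    // Hypothetical replacement: keeps tokens whose position in the stream falls in [lower, upper).
    class TokenRangeFilter extends FilteringTokenFilter {
      private final int lower, upper;
      private int count;

      TokenRangeFilter(TokenStream input, int lower, int upper) {
        super(input);
        this.lower = lower;
        this.upper = upper;
      }

      @Override
      protected boolean accept() throws IOException {
        try {
          return count >= lower && count < upper;
        } finally {
          count++;
        }
      }
    }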
@@ -1,44 +0,0 @@
package org.apache.lucene.analysis.sinks;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Adds a token to the sink if it has a specific type.
 */
public class TokenTypeSinkFilter extends TeeSinkTokenFilter.SinkFilter {
  private String typeToMatch;
  private TypeAttribute typeAtt;

  public TokenTypeSinkFilter(String typeToMatch) {
    this.typeToMatch = typeToMatch;
  }

  @Override
  public boolean accept(AttributeSource source) {
    if (typeAtt == null) {
      typeAtt = source.addAttribute(TypeAttribute.class);
    }

    //check to see if this is a Category
    return (typeToMatch.equals(typeAtt.type()));
  }

}
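Per the CHANGES entry above, the replacement for this filter is the existing TypeTokenFilter from analysis/common. A hedged one-liner (assuming TypeTokenFilter's (input, stopTypes, useWhiteList) constructor; `tee` is a TeeSinkTokenFilter as above):

    // Keep only tokens whose type attribute is "D", wrapped around a sink.
    TokenStream typed = new TypeTokenFilter(tee.newSinkTokenStream(), Collections.singleton("D"), true);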
@@ -16,8 +16,6 @@
 */

/**
 * {@link org.apache.lucene.analysis.sinks.TeeSinkTokenFilter} and implementations
 * of {@link org.apache.lucene.analysis.sinks.TeeSinkTokenFilter.SinkFilter} that
 * might be useful.
 * {@link org.apache.lucene.analysis.sinks.TeeSinkTokenFilter}.
 */
package org.apache.lucene.analysis.sinks;
@@ -61,6 +61,7 @@ org.apache.lucene.analysis.lv.LatvianStemFilterFactory
org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
@@ -31,6 +31,7 @@ import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -41,6 +42,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
@@ -574,6 +576,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
        }
      }
    });
    put(DateFormat.class, new ArgProducer() {
      @Override
      public Object create(Random random) {
        if (random.nextBoolean()) return null;
        return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random));
      }
    });
  }};

  static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
@@ -0,0 +1,43 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;

public class DateRecognizerFilterFactoryTest extends BaseTokenStreamTestCase {

  public void testBadLanguageTagThrowsException() {
    try {
      final Map<String,String> args = new HashMap<>();
      args.put(DateRecognizerFilterFactory.LOCALE, "en_US");
      new DateRecognizerFilterFactory(args);
      fail("Bad language tag should have thrown an exception");
    } catch (Exception e) {
      // expected
    }
  }

  public void testGoodLocaleParsesWell() {
    final Map<String,String> args = new HashMap<>();
    args.put(DateRecognizerFilterFactory.LOCALE, "en-US");
    new DateRecognizerFilterFactory(args);
  }

}
@@ -0,0 +1,37 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Locale;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;

public class DateRecognizerFilterTest extends BaseTokenStreamTestCase {

  public void test() throws IOException {
    final String test = "The red fox jumped over the lazy dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
    final DateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy", Locale.ENGLISH);
    try (TokenStream ts = new DateRecognizerFilter(whitespaceMockTokenizer(test), dateFormat)) {
      assertStreamHasNumberOfTokens(ts, 2);
    }
  }

}
@@ -1,52 +0,0 @@
package org.apache.lucene.analysis.sinks;

/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.text.SimpleDateFormat;
import java.util.Locale;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;

public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {

  public void test() throws IOException {
    DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.ROOT));
    String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
    final MockTokenizer input = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    input.setReader(new StringReader(test));
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(input);
    TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
    int count = 0;

    tee.reset();
    while (tee.incrementToken()) {
      count++;
    }
    assertTrue(count + " does not equal: " + 18, count == 18);

    int sinkCount = 0;
    sink.reset();
    while (sink.incrementToken()) {
      sinkCount++;
    }
    assertTrue("sink Size: " + sinkCount + " is not: " + 2, sinkCount == 2);

  }
}
@@ -20,28 +20,33 @@ import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.English;

/**
 * tests for the TestTeeSinkTokenFilter
 */
@@ -67,22 +72,6 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
    }
  }

  static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() {
    @Override
    public boolean accept(AttributeSource a) {
      CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
      return termAtt.toString().equalsIgnoreCase("The");
    }
  };

  static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() {
    @Override
    public boolean accept(AttributeSource a) {
      CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class);
      return termAtt.toString().equalsIgnoreCase("Dogs");
    }
  };

  // LUCENE-1448
  // TODO: instead of testing it this way, we can test
  // with BaseTokenStreamTestCase now...
@@ -128,41 +117,29 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {

  public void testGeneral() throws IOException {
    final TeeSinkTokenFilter source = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
    final TokenStream sink1 = source.newSinkTokenStream();
    final TokenStream sink2 = source.newSinkTokenStream(theFilter);
    final TokenStream sink = source.newSinkTokenStream();

    source.addAttribute(CheckClearAttributesAttribute.class);
    sink1.addAttribute(CheckClearAttributesAttribute.class);
    sink2.addAttribute(CheckClearAttributesAttribute.class);
    sink.addAttribute(CheckClearAttributesAttribute.class);

    assertTokenStreamContents(source, tokens1);
    assertTokenStreamContents(sink1, tokens1);
    assertTokenStreamContents(sink2, new String[]{"The", "the"});
    assertTokenStreamContents(sink, tokens1);
  }

  public void testMultipleSources() throws Exception {
    final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
    final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
    final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
    final TokenStream source1 = new CachingTokenFilter(tee1);

    tee1.addAttribute(CheckClearAttributesAttribute.class);
    dogDetector.addAttribute(CheckClearAttributesAttribute.class);
    theDetector.addAttribute(CheckClearAttributesAttribute.class);

    MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader(buffer2.toString()));
    final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer);
    tee2.addSinkTokenStream(dogDetector);
    tee2.addSinkTokenStream(theDetector);
    final TokenStream source2 = tee2;

    assertTokenStreamContents(source1, tokens1);
    assertTokenStreamContents(source2, tokens2);

    assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
    assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});

    TokenStream lowerCasing = new LowerCaseFilter(source1);
    String[] lowerCaseTokens = new String[tokens1.length];
    for (int i = 0; i < tokens1.length; i++)
@@ -170,7 +147,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
    assertTokenStreamContents(lowerCasing, lowerCaseTokens);
  }

  private StandardTokenizer standardTokenizer(StringBuilder builder) throws IOException {
  private StandardTokenizer standardTokenizer(StringBuilder builder) {
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(builder.toString()));
    return tokenizer;
@@ -179,6 +156,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
  /**
   * Not an explicit test, just useful to print out some info on performance
   */
  @SuppressWarnings("resource")
  public void performance() throws Exception {
    int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
    int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
@@ -190,7 +168,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
      }
      //make sure we produce the same tokens
      TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
      TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
      TokenStream sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), 100);
      teeStream.consumeAllTokens();
      TokenStream stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), 100);
      CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
@@ -223,7 +201,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
      start = System.currentTimeMillis();
      for (int i = 0; i < 20; i++) {
        teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
        sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j]));
        sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), modCounts[j]);
        PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
        while (teeStream.incrementToken()) {
          sinkPos += posIncrAtt.getPositionIncrement();
@@ -270,17 +248,18 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
    }
  }

class ModuloSinkFilter extends TeeSinkTokenFilter.SinkFilter {
class ModuloSinkFilter extends FilteringTokenFilter {
  int count = 0;
  int modCount;

  ModuloSinkFilter(int mc) {
  ModuloSinkFilter(TokenStream input, int mc) {
    super(input);
    modCount = mc;
  }

  @Override
  public boolean accept(AttributeSource a) {
    boolean b = (a != null && count % modCount == 0);
  protected boolean accept() throws IOException {
    boolean b = count % modCount == 0;
    count++;
    return b;
  }
@@ -1,55 +0,0 @@
package org.apache.lucene.analysis.sinks;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.junit.Test;

public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {

  public void test() throws IOException {
    TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
    String test = "The quick red fox jumped over the lazy brown dogs";
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(whitespaceMockTokenizer(test));
    TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);

    int count = 0;
    tee.reset();
    while (tee.incrementToken()) {
      count++;
    }

    int sinkCount = 0;
    rangeToks.reset();
    while (rangeToks.incrementToken()) {
      sinkCount++;
    }

    assertTrue(count + " does not equal: " + 10, count == 10);
    assertTrue("rangeToks Size: " + sinkCount + " is not: " + 2, sinkCount == 2);
  }

  @Test(expected = IllegalArgumentException.class)
  public void testIllegalArguments() throws Exception {
    new TokenRangeSinkFilter(4, 2);
  }
}
@@ -1,78 +0,0 @@
package org.apache.lucene.analysis.sinks;

/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {

  public void test() throws IOException {
    TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
    String test = "The quick red fox jumped over the lazy brown dogs";

    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(whitespaceMockTokenizer(test)));
    TeeSinkTokenFilter.SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);

    boolean seenDogs = false;

    CharTermAttribute termAtt = ttf.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ttf.addAttribute(TypeAttribute.class);
    ttf.reset();
    while (ttf.incrementToken()) {
      if (termAtt.toString().equals("dogs")) {
        seenDogs = true;
        assertTrue(typeAtt.type() + " is not equal to " + "D", typeAtt.type().equals("D") == true);
      } else {
        assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals("word"));
      }
    }
    assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);

    int sinkCount = 0;
    sink.reset();
    while (sink.incrementToken()) {
      sinkCount++;
    }

    assertTrue("sink Size: " + sinkCount + " is not: " + 1, sinkCount == 1);
  }

  private class WordTokenFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    private WordTokenFilter(TokenStream input) {
      super(input);
    }

    @Override
    public final boolean incrementToken() throws IOException {
      if (!input.incrementToken()) return false;

      if (termAtt.toString().equals("dogs")) {
        typeAtt.setType("D");
      }
      return true;
    }
  }
}
@@ -447,6 +447,17 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
    checkRandomData(random, a, iterations, 20, simple, true);
  }

  /** Asserts that the given stream has the expected number of tokens. */
  public static void assertStreamHasNumberOfTokens(TokenStream ts, int expectedCount) throws IOException {
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
      count++;
    }
    ts.end();
    assertEquals("wrong number of tokens", expectedCount, count);
  }

  static class AnalysisThread extends Thread {
    final int iterations;
    final int maxWordLength;