diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3b7a34867e3..35c4cba3e37 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -144,6 +144,16 @@ API Changes instead of extending DocIdSetIterator. asTwoPhaseIterator() has been renamed to twoPhaseIterator() for consistency. (Adrien Grand) +* LUCENE-6973: TeeSinkTokenFilter no longer accepts a SinkFilter (the latter + has been removed). If you wish to filter the sinks, you can wrap them with + any other TokenFilter (e.g. a FilteringTokenFilter). Also, you can no longer + add a SinkTokenStream to an existing TeeSinkTokenFilter. If you need to + share multiple streams with a single sink, chain them with multiple + TeeSinkTokenFilters. + DateRecognizerSinkFilter was renamed to DateRecognizerFilter and moved under + analysis/common. TokenTypeSinkFilter was removed (use TypeTokenFilter instead). + TokenRangeSinkFilter was removed. (Shai Erera, Uwe Schindler) + Optimizations * LUCENE-6951: Improve GeoPointInPolygonQuery using point orientation based diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilter.java new file mode 100644 index 00000000000..52763f45ecf --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilter.java @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.DateFormat; +import java.text.ParseException; +import java.util.Locale; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.FilteringTokenFilter; + +/** Filters all tokens that cannot be parsed to a date, using the provided {@link DateFormat}. */ +public class DateRecognizerFilter extends FilteringTokenFilter { + + public static final String DATE_TYPE = "date"; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final DateFormat dateFormat; + + /** + * Uses {@link DateFormat#DEFAULT} and {@link Locale#ENGLISH} to create a {@link DateFormat} instance. + */ + public DateRecognizerFilter(TokenStream input) { + this(input, null); + } + + public DateRecognizerFilter(TokenStream input, DateFormat dateFormat) { + super(input); + this.dateFormat = dateFormat != null ? dateFormat : DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ENGLISH); + } + + @Override + public boolean accept() { + try { + // We don't care about the date, just that the term can be parsed to one. + dateFormat.parse(termAtt.toString()); + return true; + } catch (ParseException e) { + // This term is not a date. 
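+ // Fall through: accept() returns false below, so FilteringTokenFilter drops this token.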
+ } + + return false; + } + +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilterFactory.java new file mode 100755 index 00000000000..7f02b8751b1 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilterFactory.java @@ -0,0 +1,84 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Locale; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link DateRecognizerFilter}. + * + *
+ * <fieldType name="text_filter_none_date" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.DateRecognizerFilterFactory" datePattern="yyyy/MM/dd" locale="en-US" />
+ *   </analyzer>
+ * </fieldType>
+ * 
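+ *
+ * A rough programmatic equivalent of the configuration above, as a sketch only (the date
+ * pattern, locale and sample text are illustrative, not required by the factory):
+ *
+ *   Tokenizer tok = new WhitespaceTokenizer();
+ *   tok.setReader(new StringReader("released on 2006/07/11"));
+ *   DateFormat df = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
+ *   TokenStream dates = new DateRecognizerFilter(tok, df); // keeps only tokens that parse as dates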
+ * + *

+ * The {@code datePattern} is optional. If omitted, {@link DateRecognizerFilter} will be created with the default date + * format of the system. The {@code locale} is optional and if omitted the filter will be created with + * {@link Locale#ENGLISH}. + */ +public class DateRecognizerFilterFactory extends TokenFilterFactory { + + public static final String DATE_PATTERN = "datePattern"; + public static final String LOCALE = "locale"; + + private final DateFormat dateFormat; + private final Locale locale; + + /** Creates a new FingerprintFilterFactory */ + public DateRecognizerFilterFactory(Map args) { + super(args); + this.locale = getLocale(get(args, LOCALE)); + this.dateFormat = getDataFormat(get(args, DATE_PATTERN)); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public TokenStream create(TokenStream input) { + return new DateRecognizerFilter(input, dateFormat); + } + + private Locale getLocale(String localeStr) { + if (localeStr == null) { + return Locale.ENGLISH; + } else { + return new Locale.Builder().setLanguageTag(localeStr).build(); + } + } + + public DateFormat getDataFormat(String datePattern) { + if (datePattern != null) { + return new SimpleDateFormat(datePattern, locale); + } else { + return DateFormat.getDateInstance(DateFormat.DEFAULT, locale); + } + } + +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkFilter.java deleted file mode 100644 index d73f86660be..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkFilter.java +++ /dev/null @@ -1,69 +0,0 @@ -package org.apache.lucene.analysis.sinks; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.text.DateFormat; -import java.text.ParseException; -import java.util.Date; -import java.util.Locale; - -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.AttributeSource; - -/** - * Attempts to parse the {@link CharTermAttribute#buffer()} as a Date using a {@link java.text.DateFormat}. - * If the value is a Date, it will add it to the sink. - * - **/ -public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter { - public static final String DATE_TYPE = "date"; - - protected DateFormat dateFormat; - protected CharTermAttribute termAtt; - - /** - * Uses {@link java.text.DateFormat#getDateInstance(int, Locale) - * DateFormat#getDateInstance(DateFormat.DEFAULT, Locale.ROOT)} as - * the {@link java.text.DateFormat} object. 
- */ - public DateRecognizerSinkFilter() { - this(DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT)); - } - - public DateRecognizerSinkFilter(DateFormat dateFormat) { - this.dateFormat = dateFormat; - } - - @Override - public boolean accept(AttributeSource source) { - if (termAtt == null) { - termAtt = source.addAttribute(CharTermAttribute.class); - } - try { - Date date = dateFormat.parse(termAtt.toString());//We don't care about the date, just that we can parse it as a date - if (date != null) { - return true; - } - } catch (ParseException e) { - - } - - return false; - } - -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TeeSinkTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TeeSinkTokenFilter.java index a6a9b579e9b..5c30c590cf6 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TeeSinkTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TeeSinkTokenFilter.java @@ -18,233 +18,155 @@ package org.apache.lucene.analysis.sinks; */ import java.io.IOException; -import java.lang.ref.WeakReference; +import java.util.ArrayList; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; /** - * This TokenFilter provides the ability to set aside attribute states - * that have already been analyzed. This is useful in situations where multiple fields share - * many common analysis steps and then go their separate ways. + * This TokenFilter provides the ability to set aside attribute states that have already been analyzed. This is useful + * in situations where multiple fields share many common analysis steps and then go their separate ways. + * *

- * It is also useful for doing things like entity extraction or proper noun analysis as - * part of the analysis workflow and saving off those tokens for use in another field. + * It is also useful for doing things like entity extraction or proper noun analysis as part of the analysis workflow + * and saving off those tokens for use in another field. + *

* *
-TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(version, reader1));
-TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
-TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
-
-TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(version, reader2));
-source2.addSinkTokenStream(sink1);
-source2.addSinkTokenStream(sink2);
-
-TokenStream final1 = new LowerCaseFilter(version, source1);
-TokenStream final2 = source2;
-TokenStream final3 = new EntityDetect(sink1);
-TokenStream final4 = new URLDetect(sink2);
-
-d.add(new TextField("f1", final1, Field.Store.NO));
-d.add(new TextField("f2", final2, Field.Store.NO));
-d.add(new TextField("f3", final3, Field.Store.NO));
-d.add(new TextField("f4", final4, Field.Store.NO));
+ * TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer());
+ * TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
+ * TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
+ *
+ * TokenStream final1 = new LowerCaseFilter(source1);
+ * TokenStream final2 = new EntityDetect(sink1);
+ * TokenStream final3 = new URLDetect(sink2);
+ *
+ * d.add(new TextField("f1", final1));
+ * d.add(new TextField("f2", final2));
+ * d.add(new TextField("f3", final3));
  * 
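+ *
+ * With SinkFilter removed, filtering what a sink emits is done by wrapping the sink in an
+ * ordinary TokenFilter (for instance a FilteringTokenFilter implementation). A minimal sketch,
+ * using DateRecognizerFilter purely as one concrete choice:
+ *
+ *   TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer());
+ *   TokenStream everyToken = tee;                                               // the tee itself yields all tokens
+ *   TokenStream onlyDates = new DateRecognizerFilter(tee.newSinkTokenStream()); // wrap the sink to filter it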
- * In this example, sink1 and sink2 will both get tokens from both - * reader1 and reader2 after whitespace tokenizer - * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired. - * It is important, that tees are consumed before sinks (in the above example, the field names must be - * less the sink's field names). If you are not sure, which stream is consumed first, you can simply - * add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}. - * This TokenFilter is exhausted after this. In the above example, change - * the example above to: - *
-...
-TokenStream final1 = new LowerCaseFilter(version, source1.newSinkTokenStream());
-TokenStream final2 = source2.newSinkTokenStream();
-sink1.consumeAllTokens();
-sink2.consumeAllTokens();
-...
- * 
- * In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready. - *

Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene. + * + *

+ * In this example, {@code sink1} and {@code sink2} will both get tokens from {@code source1} after whitespace + * tokenization, and each can then be wrapped with further token filters, e.g. to detect entities or URLs. + *

+ * + *

+ * NOTE: it is important that tees are consumed before sinks; therefore, you should add them to the document + * before the sinks. In the above example, f1 is added before the other fields, and so by the time they are + * processed, it has already been consumed, which is the correct way to index the three streams. If for some reason you + * cannot ensure that, you should call {@link #consumeAllTokens()} before adding the sinks to document fields. */ public final class TeeSinkTokenFilter extends TokenFilter { - private final List> sinks = new LinkedList<>(); - - /** - * Instantiates a new TeeSinkTokenFilter. - */ + + private final States cachedStates = new States(); + public TeeSinkTokenFilter(TokenStream input) { super(input); } - /** - * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream. - */ - public SinkTokenStream newSinkTokenStream() { - return newSinkTokenStream(ACCEPT_ALL_FILTER); + /** Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream. */ + public TokenStream newSinkTokenStream() { + return new SinkTokenStream(this.cloneAttributes(), cachedStates); } - + /** - * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream - * that pass the supplied filter. - * @see SinkFilter - */ - public SinkTokenStream newSinkTokenStream(SinkFilter filter) { - SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter); - this.sinks.add(new WeakReference<>(sink)); - return sink; - } - - /** - * Adds a {@link SinkTokenStream} created by another TeeSinkTokenFilter - * to this one. The supplied stream will also receive all consumed tokens. - * This method can be used to pass tokens from two different tees to one sink. - */ - public void addSinkTokenStream(final SinkTokenStream sink) { - // check that sink has correct factory - if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) { - throw new IllegalArgumentException("The supplied sink is not compatible to this tee"); - } - // add eventually missing attribute impls to the existing sink - for (Iterator it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) { - sink.addAttributeImpl(it.next()); - } - this.sinks.add(new WeakReference<>(sink)); - } - - /** - * TeeSinkTokenFilter passes all tokens to the added sinks - * when itself is consumed. To be sure, that all tokens from the input - * stream are passed to the sinks, you can call this methods. - * This instance is exhausted after this, but all sinks are instant available. + * TeeSinkTokenFilter passes all tokens to the added sinks when it is itself consumed. To be sure that all + * tokens from the input stream are passed to the sinks, you can call this method. This instance is exhausted after + * this method returns, but all sinks are instantly available. 
*/ public void consumeAllTokens() throws IOException { while (incrementToken()) {} } - + @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { - // capture state lazily - maybe no SinkFilter accepts this state - AttributeSource.State state = null; - for (WeakReference ref : sinks) { - final SinkTokenStream sink = ref.get(); - if (sink != null) { - if (sink.accept(this)) { - if (state == null) { - state = this.captureState(); - } - sink.addState(state); - } - } - } + cachedStates.add(captureState()); return true; } - + return false; } - + @Override public final void end() throws IOException { super.end(); - AttributeSource.State finalState = captureState(); - for (WeakReference ref : sinks) { - final SinkTokenStream sink = ref.get(); - if (sink != null) { - sink.setFinalState(finalState); - } - } + cachedStates.setFinalState(captureState()); } - - /** - * A filter that decides which {@link AttributeSource} states to store in the sink. - */ - public static abstract class SinkFilter { - /** - * Returns true, iff the current state of the passed-in {@link AttributeSource} shall be stored - * in the sink. - */ - public abstract boolean accept(AttributeSource source); - - /** - * Called by {@link SinkTokenStream#reset()}. This method does nothing by default - * and can optionally be overridden. - */ - public void reset() throws IOException { - // nothing to do; can be overridden - } + + @Override + public void reset() throws IOException { + cachedStates.reset(); + super.reset(); } - - /** - * TokenStream output from a tee with optional filtering. - */ + + /** TokenStream output from a tee. */ public static final class SinkTokenStream extends TokenStream { - private final List cachedStates = new LinkedList<>(); - private AttributeSource.State finalState; + private final States cachedStates; private Iterator it = null; - private SinkFilter filter; - - private SinkTokenStream(AttributeSource source, SinkFilter filter) { + + private SinkTokenStream(AttributeSource source, States cachedStates) { super(source); - this.filter = filter; + this.cachedStates = cachedStates; } - - private boolean accept(AttributeSource source) { - return filter.accept(source); - } - - private void addState(AttributeSource.State state) { - if (it != null) { - throw new IllegalStateException("The tee must be consumed before sinks are consumed."); - } - cachedStates.add(state); - } - - private void setFinalState(AttributeSource.State finalState) { - this.finalState = finalState; - } - + @Override public final boolean incrementToken() { - // lazy init the iterator - if (it == null) { - it = cachedStates.iterator(); - } - if (!it.hasNext()) { return false; } - + AttributeSource.State state = it.next(); restoreState(state); return true; } - + @Override - public final void end() { + public void end() throws IOException { + State finalState = cachedStates.getFinalState(); if (finalState != null) { restoreState(finalState); } } - + @Override public final void reset() { - it = cachedStates.iterator(); + it = cachedStates.getStates(); } } - - private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() { - @Override - public boolean accept(AttributeSource source) { - return true; + + /** A convenience wrapper for storing the cached states as well the final state of the stream. 
*/ + private static final class States { + + private final List states = new ArrayList<>(); + private State finalState; + + public States() {} + + void setFinalState(State finalState) { + this.finalState = finalState; } - }; - + + State getFinalState() { + return finalState; + } + + void add(State state) { + states.add(state); + } + + Iterator getStates() { + return states.iterator(); + } + + void reset() { + finalState = null; + states.clear(); + } + } + } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TokenRangeSinkFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TokenRangeSinkFilter.java deleted file mode 100644 index 4fd1945afde..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TokenRangeSinkFilter.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.apache.lucene.analysis.sinks; - -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -import java.io.IOException; - -import org.apache.lucene.util.AttributeSource; - -/** - * Counts the tokens as they go by and saves to the internal list those between the range of lower and upper, exclusive of upper - * - **/ -public class TokenRangeSinkFilter extends TeeSinkTokenFilter.SinkFilter { - private int lower; - private int upper; - private int count; - - public TokenRangeSinkFilter(int lower, int upper) { - if (lower < 1) { - throw new IllegalArgumentException("lower must be greater than zero"); - } - if (lower > upper) { - throw new IllegalArgumentException("lower must not be greater than upper"); - } - this.lower = lower; - this.upper = upper; - } - - - @Override - public boolean accept(AttributeSource source) { - try { - if (count >= lower && count < upper){ - return true; - } - return false; - } finally { - count++; - } - } - - @Override - public void reset() throws IOException { - count = 0; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkFilter.java deleted file mode 100644 index b10bd7841ae..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkFilter.java +++ /dev/null @@ -1,44 +0,0 @@ -package org.apache.lucene.analysis.sinks; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.AttributeSource; - -/** - * Adds a token to the sink if it has a specific type. - */ -public class TokenTypeSinkFilter extends TeeSinkTokenFilter.SinkFilter { - private String typeToMatch; - private TypeAttribute typeAtt; - - public TokenTypeSinkFilter(String typeToMatch) { - this.typeToMatch = typeToMatch; - } - - @Override - public boolean accept(AttributeSource source) { - if (typeAtt == null) { - typeAtt = source.addAttribute(TypeAttribute.class); - } - - //check to see if this is a Category - return (typeToMatch.equals(typeAtt.type())); - } - -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/package-info.java index c8d5e3af6da..964e5f43aaa 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/package-info.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sinks/package-info.java @@ -16,8 +16,6 @@ */ /** - * {@link org.apache.lucene.analysis.sinks.TeeSinkTokenFilter} and implementations - * of {@link org.apache.lucene.analysis.sinks.TeeSinkTokenFilter.SinkFilter} that - * might be useful. + * {@link org.apache.lucene.analysis.sinks.TeeSinkTokenFilter}. */ package org.apache.lucene.analysis.sinks; diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 5b3f5787671..8b19cdfc45f 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -61,6 +61,7 @@ org.apache.lucene.analysis.lv.LatvianStemFilterFactory org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory +org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index bd5dac5af43..d218d891abd 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -31,6 +31,7 @@ import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.text.DateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -41,6 +42,7 @@ import java.util.HashMap; import 
java.util.HashSet; import java.util.IdentityHashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Random; import java.util.Set; @@ -574,6 +576,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } } }); + put(DateFormat.class, new ArgProducer() { + @Override + public Object create(Random random) { + if (random.nextBoolean()) return null; + return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random)); + } + }); }}; static final Set> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilterFactoryTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilterFactoryTest.java new file mode 100755 index 00000000000..241a0e5cab4 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilterFactoryTest.java @@ -0,0 +1,43 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; + +public class DateRecognizerFilterFactoryTest extends BaseTokenStreamTestCase { + + public void testBadLanguageTagThrowsException() { + try { + final Map args = new HashMap<>(); + args.put(DateRecognizerFilterFactory.LOCALE, "en_US"); + new DateRecognizerFilterFactory(args); + fail("Bad language tag should have thrown an exception"); + } catch (Exception e) { + // expected; + } + } + + public void testGoodLocaleParsesWell() { + final Map args = new HashMap<>(); + args.put(DateRecognizerFilterFactory.LOCALE, "en-US"); + new DateRecognizerFilterFactory(args); + } + +} \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilterTest.java new file mode 100644 index 00000000000..6ae81e27d48 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DateRecognizerFilterTest.java @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Locale; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; + +public class DateRecognizerFilterTest extends BaseTokenStreamTestCase { + + public void test() throws IOException { + final String test = "The red fox jumped over the lazy dogs on 7/11/2006 The dogs finally reacted on 7/12/2006"; + final DateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy", Locale.ENGLISH); + try (TokenStream ts = new DateRecognizerFilter(whitespaceMockTokenizer(test), dateFormat);) { + assertStreamHasNumberOfTokens(ts, 2); + } + } + +} \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java deleted file mode 100644 index 39cca608445..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.apache.lucene.analysis.sinks; - -/* - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.StringReader; -import java.text.SimpleDateFormat; -import java.util.Locale; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; - -public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase { - - public void test() throws IOException { - DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.ROOT)); - String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006"; - final MockTokenizer input = new MockTokenizer(MockTokenizer.WHITESPACE, false); - input.setReader(new StringReader(test)); - TeeSinkTokenFilter tee = new TeeSinkTokenFilter(input); - TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter); - int count = 0; - - tee.reset(); - while (tee.incrementToken()) { - count++; - } - assertTrue(count + " does not equal: " + 18, count == 18); - - int sinkCount = 0; - sink.reset(); - while (sink.incrementToken()) { - sinkCount++; - } - assertTrue("sink Size: " + sinkCount + " is not: " + 2, sinkCount == 2); - - } -} \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java index a0e2636e2b6..4b2b8c4dfc8 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java @@ -20,28 +20,33 @@ import java.io.IOException; import java.io.StringReader; import java.util.Locale; -import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.English; - /** * tests for the TestTeeSinkTokenFilter */ @@ -67,22 +72,6 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase { } } - static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() { - @Override - public boolean accept(AttributeSource a) { - CharTermAttribute termAtt = 
a.getAttribute(CharTermAttribute.class); - return termAtt.toString().equalsIgnoreCase("The"); - } - }; - - static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() { - @Override - public boolean accept(AttributeSource a) { - CharTermAttribute termAtt = a.getAttribute(CharTermAttribute.class); - return termAtt.toString().equalsIgnoreCase("Dogs"); - } - }; - // LUCENE-1448 // TODO: instead of testing it this way, we can test // with BaseTokenStreamTestCase now... @@ -128,41 +117,29 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase { public void testGeneral() throws IOException { final TeeSinkTokenFilter source = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString())); - final TokenStream sink1 = source.newSinkTokenStream(); - final TokenStream sink2 = source.newSinkTokenStream(theFilter); + final TokenStream sink = source.newSinkTokenStream(); source.addAttribute(CheckClearAttributesAttribute.class); - sink1.addAttribute(CheckClearAttributesAttribute.class); - sink2.addAttribute(CheckClearAttributesAttribute.class); + sink.addAttribute(CheckClearAttributesAttribute.class); assertTokenStreamContents(source, tokens1); - assertTokenStreamContents(sink1, tokens1); - assertTokenStreamContents(sink2, new String[]{"The", "the"}); + assertTokenStreamContents(sink, tokens1); } public void testMultipleSources() throws Exception { final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString())); - final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter); - final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter); final TokenStream source1 = new CachingTokenFilter(tee1); tee1.addAttribute(CheckClearAttributesAttribute.class); - dogDetector.addAttribute(CheckClearAttributesAttribute.class); - theDetector.addAttribute(CheckClearAttributesAttribute.class); MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), MockTokenizer.WHITESPACE, false); tokenizer.setReader(new StringReader(buffer2.toString())); final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer); - tee2.addSinkTokenStream(dogDetector); - tee2.addSinkTokenStream(theDetector); final TokenStream source2 = tee2; assertTokenStreamContents(source1, tokens1); assertTokenStreamContents(source2, tokens2); - assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"}); - assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"}); - TokenStream lowerCasing = new LowerCaseFilter(source1); String[] lowerCaseTokens = new String[tokens1.length]; for (int i = 0; i < tokens1.length; i++) @@ -170,7 +147,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase { assertTokenStreamContents(lowerCasing, lowerCaseTokens); } - private StandardTokenizer standardTokenizer(StringBuilder builder) throws IOException { + private StandardTokenizer standardTokenizer(StringBuilder builder) { StandardTokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader(builder.toString())); return tokenizer; @@ -179,6 +156,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase { /** * Not an explicit test, just useful to print out some info on performance */ + @SuppressWarnings("resource") public void performance() throws Exception { int[] tokCount = {100, 500, 1000, 2000, 5000, 10000}; int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500}; @@ -190,7 +168,7 @@ public class TestTeeSinkTokenFilter extends 
BaseTokenStreamTestCase { } //make sure we produce the same tokens TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer))); - TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100)); + TokenStream sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), 100); teeStream.consumeAllTokens(); TokenStream stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), 100); CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class); @@ -223,7 +201,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase { start = System.currentTimeMillis(); for (int i = 0; i < 20; i++) { teeStream = new TeeSinkTokenFilter(new StandardFilter( standardTokenizer(buffer))); - sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j])); + sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), modCounts[j]); PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class); while (teeStream.incrementToken()) { sinkPos += posIncrAtt.getPositionIncrement(); @@ -270,17 +248,18 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase { } } - class ModuloSinkFilter extends TeeSinkTokenFilter.SinkFilter { + class ModuloSinkFilter extends FilteringTokenFilter { int count = 0; int modCount; - ModuloSinkFilter(int mc) { + ModuloSinkFilter(TokenStream input, int mc) { + super(input); modCount = mc; } @Override - public boolean accept(AttributeSource a) { - boolean b = (a != null && count % modCount == 0); + protected boolean accept() throws IOException { + boolean b = count % modCount == 0; count++; return b; } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java deleted file mode 100644 index b1acecd8a99..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java +++ /dev/null @@ -1,55 +0,0 @@ -package org.apache.lucene.analysis.sinks; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.junit.Test; - -public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase { - - public void test() throws IOException { - TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4); - String test = "The quick red fox jumped over the lazy brown dogs"; - TeeSinkTokenFilter tee = new TeeSinkTokenFilter(whitespaceMockTokenizer(test)); - TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter); - - int count = 0; - tee.reset(); - while(tee.incrementToken()) { - count++; - } - - int sinkCount = 0; - rangeToks.reset(); - while (rangeToks.incrementToken()) { - sinkCount++; - } - - assertTrue(count + " does not equal: " + 10, count == 10); - assertTrue("rangeToks Size: " + sinkCount + " is not: " + 2, sinkCount == 2); - } - - @Test(expected = IllegalArgumentException.class) - public void testIllegalArguments() throws Exception { - new TokenRangeSinkFilter(4, 2); - } -} \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java deleted file mode 100644 index d03ee35eef2..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java +++ /dev/null @@ -1,78 +0,0 @@ -package org.apache.lucene.analysis.sinks; - -/* - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; - -public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase { - - public void test() throws IOException { - TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D"); - String test = "The quick red fox jumped over the lazy brown dogs"; - - TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(whitespaceMockTokenizer(test))); - TeeSinkTokenFilter.SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter); - - boolean seenDogs = false; - - CharTermAttribute termAtt = ttf.addAttribute(CharTermAttribute.class); - TypeAttribute typeAtt = ttf.addAttribute(TypeAttribute.class); - ttf.reset(); - while (ttf.incrementToken()) { - if (termAtt.toString().equals("dogs")) { - seenDogs = true; - assertTrue(typeAtt.type() + " is not equal to " + "D", typeAtt.type().equals("D") == true); - } else { - assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals("word")); - } - } - assertTrue(seenDogs + " does not equal: " + true, seenDogs == true); - - int sinkCount = 0; - sink.reset(); - while (sink.incrementToken()) { - sinkCount++; - } - - assertTrue("sink Size: " + sinkCount + " is not: " + 1, sinkCount == 1); - } - - private class WordTokenFilter extends TokenFilter { - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - - private WordTokenFilter(TokenStream input) { - super(input); - } - - @Override - public final boolean incrementToken() throws IOException { - if (!input.incrementToken()) return false; - - if (termAtt.toString().equals("dogs")) { - typeAtt.setType("D"); - } - return true; - } - } -} \ No newline at end of file diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index 9afd785f884..183b900446a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -447,6 +447,17 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { checkRandomData(random, a, iterations, 20, simple, true); } + /** Asserts that the given stream has expected number of tokens. */ + public static void assertStreamHasNumberOfTokens(TokenStream ts, int expectedCount) throws IOException { + ts.reset(); + int count = 0; + while (ts.incrementToken()) { + count++; + } + ts.end(); + assertEquals("wrong number of tokens", expectedCount, count); + } + static class AnalysisThread extends Thread { final int iterations; final int maxWordLength;