From e7088279f754fe5809a78302eda2a0874a30dbf7 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Sun, 30 Jan 2011 18:30:34 +0000 Subject: [PATCH] LUCENE-1253: LengthFilter (and Solr's KeepWordTokenFilter) now require up front specification of enablePositionIncrement. Together with StopFilter they have a common base class (FilteringTokenFilter) that handles the position increments automatically. Implementors only need to override an accept() method that filters tokens git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1065343 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 6 ++ .../lucene/analysis/core/StopFilter.java | 55 +---------- .../miscellaneous/KeepWordFilter.java | 14 ++- .../analysis/miscellaneous/LengthFilter.java | 25 ++--- .../analysis/util/FilteringTokenFilter.java | 96 +++++++++++++++++++ .../miscellaneous/TestKeepWordFilter.java | 20 +++- .../miscellaneous/TestLengthFilter.java | 25 +++-- .../solr/analysis/KeepWordFilterFactory.java | 29 ++++-- .../solr/analysis/LengthFilterFactory.java | 5 +- .../solr/analysis/LengthFilterTest.java | 12 ++- 10 files changed, 186 insertions(+), 101 deletions(-) create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index d666278b314..79ded77817e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -643,6 +643,12 @@ API Changes deletes remain buffered so that the next time you open an NRT reader and pass true, all deletes will be a applied. (Mike McCandless) +* LUCENE-1253: LengthFilter (and Solr's KeepWordTokenFilter) now + require up front specification of enablePositionIncrement. Together with + StopFilter they have a common base class (FilteringTokenFilter) that handles + the position increments automatically. Implementors only need to override an + accept() method that filters tokens. 
(Uwe Schindler, Robert Muir) + Bug fixes * LUCENE-2249: ParallelMultiSearcher should shut down thread pool on diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java index 0aba57fd08e..45b847a833e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java @@ -22,10 +22,9 @@ import java.util.Arrays; import java.util.List; import java.util.Set; -import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.util.Version; @@ -42,14 +41,10 @@ import org.apache.lucene.util.Version; * increments are preserved * */ -public final class StopFilter extends TokenFilter { +public final class StopFilter extends FilteringTokenFilter { private final CharArraySet stopWords; - private boolean enablePositionIncrements = true; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - /** * Construct a token stream filtering the given input. If @@ -75,7 +70,7 @@ public final class StopFilter extends TokenFilter { */ public StopFilter(Version matchVersion, TokenStream input, Set stopWords, boolean ignoreCase) { - super(input); + super(true, input); this.stopWords = stopWords instanceof CharArraySet ? 
(CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase); } @@ -157,48 +152,8 @@ public final class StopFilter extends TokenFilter { * Returns the next input Token whose term() is not a stop word. */ @Override - public final boolean incrementToken() throws IOException { - // return the first non-stop word found - int skippedPositions = 0; - while (input.incrementToken()) { - if (!stopWords.contains(termAtt.buffer(), 0, termAtt.length())) { - if (enablePositionIncrements) { - posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); - } - return true; - } - skippedPositions += posIncrAtt.getPositionIncrement(); - } - // reached EOS -- return false - return false; + protected boolean accept() throws IOException { + return !stopWords.contains(termAtt.buffer(), 0, termAtt.length()); } - /** - * @see #setEnablePositionIncrements(boolean) - */ - public boolean getEnablePositionIncrements() { - return enablePositionIncrements; - } - - /** - * If true, this StopFilter will preserve - * positions of the incoming tokens (ie, accumulate and - * set position increments of the removed stop tokens). - * Generally, true is best as it does not - * lose information (positions of the original tokens) - * during indexing. - * - * Default is true. - * - *

When set, when a token is stopped - * (omitted), the position increment of the following - * token is incremented. - * - *

NOTE: be sure to also - * set {@link QueryParser#setEnablePositionIncrements} if - * you use QueryParser to create queries. - */ - public void setEnablePositionIncrements(boolean enable) { - this.enablePositionIncrements = enable; - } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java index e488fe4dd46..935c96f5bb7 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; @@ -30,22 +31,19 @@ import org.apache.lucene.analysis.util.CharArraySet; * * @since solr 1.3 */ -public final class KeepWordFilter extends TokenFilter { +public final class KeepWordFilter extends FilteringTokenFilter { private final CharArraySet words; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); /** The words set passed to this constructor will be directly used by this filter * and should not be modified, */ - public KeepWordFilter(TokenStream in, CharArraySet words) { - super(in); + public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) { + super(enablePositionIncrements, in); this.words = words; } @Override - public boolean incrementToken() throws IOException { - while (input.incrementToken()) { - if (words.contains(termAtt.buffer(), 0, termAtt.length())) return true; - } - return false; + public boolean accept() throws IOException { + return words.contains(termAtt.buffer(), 0, termAtt.length()); } } diff 
--git a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java index bfccddbeab4..3f36f2f48e2 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; /** @@ -29,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; * Note: Length is calculated as the number of UTF-16 code units. *

*/ -public final class LengthFilter extends TokenFilter { +public final class LengthFilter extends FilteringTokenFilter { private final int min; private final int max; @@ -40,27 +41,15 @@ public final class LengthFilter extends TokenFilter { * Build a filter that removes words that are too long or too * short from the text. */ - public LengthFilter(TokenStream in, int min, int max) - { - super(in); + public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) { + super(enablePositionIncrements, in); this.min = min; this.max = max; } - /** - * Returns the next input Token whose term() is the right len - */ @Override - public final boolean incrementToken() throws IOException { - // return the first non-stop word found - while (input.incrementToken()) { - int len = termAtt.length(); - if (len >= min && len <= max) { - return true; - } - // note: else we ignore it but should we index each part of it? - } - // reached EOS -- return false - return false; + public boolean accept() throws IOException { + final int len = termAtt.length(); + return (len >= min && len <= max); } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java new file mode 100644 index 00000000000..aa5d41fdc7c --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java @@ -0,0 +1,96 @@ +package org.apache.lucene.analysis.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.queryParser.QueryParser; // for javadoc + +/** + * Abstract base class for TokenFilters that may remove tokens. + * You have to implement {@link #accept} and return {@code true} if the current + * token should be preserved. {@link #incrementToken} uses this method + * to decide if a token should be passed to the caller. + */ +public abstract class FilteringTokenFilter extends TokenFilter { + + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value! + + public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){ + super(input); + this.enablePositionIncrements = enablePositionIncrements; + } + + /** Override this method and return {@code true} if the current input token should be returned by {@link #incrementToken}.
*/ + protected abstract boolean accept() throws IOException; + + @Override + public final boolean incrementToken() throws IOException { + if (enablePositionIncrements) { + int skippedPositions = 0; + while (input.incrementToken()) { + if (accept()) { + if (skippedPositions != 0) { + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); + } + return true; + } + skippedPositions += posIncrAtt.getPositionIncrement(); + } + } else { + while (input.incrementToken()) { + if (accept()) { + return true; + } + } + } + // reached EOS -- return false + return false; + } + + /** + * @see #setEnablePositionIncrements(boolean) + */ + public boolean getEnablePositionIncrements() { + return enablePositionIncrements; + } + + /** + * If true, this TokenFilter will preserve + * positions of the incoming tokens (ie, accumulate and + * set position increments of the removed tokens). + * Generally, true is best as it does not + * lose information (positions of the original tokens) + * during indexing. + * + *

When enabled, each time a token is removed + * (omitted), the position increment of the following + * token is increased by the increment of the removed token. + * + *

NOTE: be sure to also + * set {@link QueryParser#setEnablePositionIncrements} if + * you use QueryParser to create queries. + */ + public void setEnablePositionIncrements(boolean enable) { + this.enablePositionIncrements = enable; + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java index 5039b4bc47a..2ec9cb92872 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java @@ -35,16 +35,26 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase { words.add( "aaa" ); words.add( "bbb" ); - String input = "aaa BBB ccc ddd EEE"; + String input = "xxx yyy aaa zzz BBB ccc ddd EEE"; // Test Stopwords TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)); - stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT, words, true)); - assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }); + stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true)); + assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 }); // Now force case stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)); - stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT,words, false)); - assertTokenStreamContents(stream, new String[] { "aaa" }); + stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false)); + assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 }); + + // Test Stopwords + stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)); + stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, 
true)); + assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 }); + + // Now force case + stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)); + stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false)); + assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 }); } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java index de8b7311d19..070164c0161 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java @@ -24,19 +24,24 @@ import java.io.StringReader; public class TestLengthFilter extends BaseTokenStreamTestCase { - public void testFilter() throws Exception { + public void testFilterNoPosIncr() throws Exception { TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("short toolong evenmuchlongertext a ab toolong foo")); - LengthFilter filter = new LengthFilter(stream, 2, 6); - CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); + LengthFilter filter = new LengthFilter(false, stream, 2, 6); + assertTokenStreamContents(filter, + new String[]{"short", "ab", "foo"}, + new int[]{1, 1, 1} + ); + } - assertTrue(filter.incrementToken()); - assertEquals("short", termAtt.toString()); - assertTrue(filter.incrementToken()); - assertEquals("ab", termAtt.toString()); - assertTrue(filter.incrementToken()); - assertEquals("foo", termAtt.toString()); - assertFalse(filter.incrementToken()); + public void testFilterWithPosIncr() throws Exception { + TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader("short toolong evenmuchlongertext a ab toolong foo")); + LengthFilter filter = new 
LengthFilter(true, stream, 2, 6); + assertTokenStreamContents(filter, + new String[]{"short", "ab", "foo"}, + new int[]{1, 4, 2} + ); } } diff --git a/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java b/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java index eaff7c72341..d9b8ee90a88 100644 --- a/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java @@ -23,22 +23,27 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.KeepWordFilter; import org.apache.lucene.analysis.util.CharArraySet; +import java.util.Map; import java.util.Set; import java.io.IOException; /** * @version $Id$ - * @since solr 1.3 */ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { - private CharArraySet words; - private boolean ignoreCase; + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } public void inform(ResourceLoader loader) { String wordFiles = args.get("words"); ignoreCase = getBoolean("ignoreCase", false); - if (wordFiles != null) { + enablePositionIncrements = getBoolean("enablePositionIncrements",false); + + if (wordFiles != null) { try { words = getWordSet(loader, wordFiles, ignoreCase); } catch (IOException e) { @@ -47,6 +52,10 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res } } + private CharArraySet words; + private boolean ignoreCase; + private boolean enablePositionIncrements; + /** * Set the keep word list. 
* NOTE: if ignoreCase==true, the words are expected to be lowercase @@ -62,15 +71,19 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res this.ignoreCase = ignoreCase; } - public KeepWordFilter create(TokenStream input) { - return new KeepWordFilter(input, words); + public boolean isEnablePositionIncrements() { + return enablePositionIncrements; + } + + public boolean isIgnoreCase() { + return ignoreCase; } public CharArraySet getWords() { return words; } - public boolean isIgnoreCase() { - return ignoreCase; + public KeepWordFilter create(TokenStream input) { + return new KeepWordFilter(enablePositionIncrements, input, words); } } diff --git a/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java b/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java index f8105c77709..74d67422269 100644 --- a/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java @@ -27,6 +27,7 @@ import java.util.Map; */ public class LengthFilterFactory extends BaseTokenFilterFactory { int min,max; + boolean enablePositionIncrements; public static final String MIN_KEY = "min"; public static final String MAX_KEY = "max"; @@ -35,8 +36,10 @@ public class LengthFilterFactory extends BaseTokenFilterFactory { super.init(args); min=Integer.parseInt(args.get(MIN_KEY)); max=Integer.parseInt(args.get(MAX_KEY)); + enablePositionIncrements = getBoolean("enablePositionIncrements",false); } + public LengthFilter create(TokenStream input) { - return new LengthFilter(input,min,max); + return new LengthFilter(enablePositionIncrements, input,min,max); } } diff --git a/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java b/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java index 66ba3a89281..95f5dc1cf25 100644 --- a/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java +++ b/solr/src/test/org/apache/solr/analysis/LengthFilterTest.java @@ -31,9 +31,19 @@ public 
class LengthFilterTest extends BaseTokenTestCase { Map args = new HashMap(); args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4)); args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); + // default: args.put("enablePositionIncrements", "false"); factory.init(args); String test = "foo foobar super-duper-trooper"; TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test))); - assertTokenStreamContents(stream, new String[] { "foobar" }); + assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 }); + + factory = new LengthFilterFactory(); + args = new HashMap(); + args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4)); + args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); + args.put("enablePositionIncrements", "true"); + factory.init(args); + stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test))); + assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 }); } } \ No newline at end of file