diff --git a/modules/analysis/CHANGES.txt b/modules/analysis/CHANGES.txt index d051127dda4..24599552002 100644 --- a/modules/analysis/CHANGES.txt +++ b/modules/analysis/CHANGES.txt @@ -38,6 +38,10 @@ API Changes since they prevented reuse. Stopwords are now generated at instantiation through the Analyzer's constructors. (Chris Male) + * LUCENE-3434: Removed ShingleAnalyzerWrapper.set* and PerFieldAnalyzerWrapper.addAnalyzer + since they prevent reuse. Both Analyzers should be configured at instantiation. + (Chris Male) + New Features * LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java index 65f64c2072f..08ec36ac17a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.java @@ -23,21 +23,25 @@ import org.apache.lucene.index.IndexableField; import java.io.Reader; import java.io.IOException; +import java.util.Collections; import java.util.Map; import java.util.HashMap; /** * This analyzer is used to facilitate scenarios where different - * fields require different analysis techniques. Use {@link #addAnalyzer} - * to add a non-default analyzer on a field name basis. + * fields require different analysis techniques. Use the Map + * argument in {@link #PerFieldAnalyzerWrapper(Analyzer, java.util.Map)} + * to add non-default analyzers for fields. * *

Example usage: * *

+ *   Map<String,Analyzer> analyzerPerField = new HashMap<String,Analyzer>();
+ *   analyzerPerField.put("firstname", new KeywordAnalyzer());
+ *   analyzerPerField.put("lastname", new KeywordAnalyzer());
+ *
  *   PerFieldAnalyzerWrapper aWrapper =
- *      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
- *   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
- *   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
+ *      new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);
  * 
* *

In this example, StandardAnalyzer will be used for all fields except "firstname" @@ -47,9 +51,8 @@ import java.util.HashMap; * and query parsing. */ public final class PerFieldAnalyzerWrapper extends Analyzer { - private Analyzer defaultAnalyzer; - private Map analyzerMap = new HashMap(); - + private final Analyzer defaultAnalyzer; + private final Map fieldAnalyzers; /** * Constructs with default analyzer. @@ -70,28 +73,15 @@ public final class PerFieldAnalyzerWrapper extends Analyzer { * @param fieldAnalyzers a Map (String field name to the Analyzer) to be * used for those fields */ - public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer, + public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer, Map fieldAnalyzers) { this.defaultAnalyzer = defaultAnalyzer; - if (fieldAnalyzers != null) { - analyzerMap.putAll(fieldAnalyzers); - } - } - - - /** - * Defines an analyzer to use for the specified field. - * - * @param fieldName field name requiring a non-default analyzer - * @param analyzer non-default analyzer to use for field - */ - public void addAnalyzer(String fieldName, Analyzer analyzer) { - analyzerMap.put(fieldName, analyzer); + this.fieldAnalyzers = (fieldAnalyzers != null) ? 
fieldAnalyzers : Collections.emptyMap(); } @Override public TokenStream tokenStream(String fieldName, Reader reader) { - Analyzer analyzer = analyzerMap.get(fieldName); + Analyzer analyzer = fieldAnalyzers.get(fieldName); if (analyzer == null) { analyzer = defaultAnalyzer; } @@ -101,7 +91,7 @@ public final class PerFieldAnalyzerWrapper extends Analyzer { @Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - Analyzer analyzer = analyzerMap.get(fieldName); + Analyzer analyzer = fieldAnalyzers.get(fieldName); if (analyzer == null) analyzer = defaultAnalyzer; @@ -111,7 +101,7 @@ public final class PerFieldAnalyzerWrapper extends Analyzer { /** Return the positionIncrementGap from the analyzer assigned to fieldName */ @Override public int getPositionIncrementGap(String fieldName) { - Analyzer analyzer = analyzerMap.get(fieldName); + Analyzer analyzer = fieldAnalyzers.get(fieldName); if (analyzer == null) analyzer = defaultAnalyzer; return analyzer.getPositionIncrementGap(fieldName); @@ -120,7 +110,7 @@ public final class PerFieldAnalyzerWrapper extends Analyzer { /** Return the offsetGap from the analyzer assigned to field */ @Override public int getOffsetGap(IndexableField field) { - Analyzer analyzer = analyzerMap.get(field.name()); + Analyzer analyzer = fieldAnalyzers.get(field.name()); if (analyzer == null) { analyzer = defaultAnalyzer; } @@ -129,6 +119,6 @@ public final class PerFieldAnalyzerWrapper extends Analyzer { @Override public String toString() { - return "PerFieldAnalyzerWrapper(" + analyzerMap + ", default=" + defaultAnalyzer + ")"; + return "PerFieldAnalyzerWrapper(" + fieldAnalyzers + ", default=" + defaultAnalyzer + ")"; } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java index 2bccdf12e47..217a3622b51 100644 --- 
a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java @@ -34,43 +34,79 @@ import org.apache.lucene.util.Version; public final class ShingleAnalyzerWrapper extends Analyzer { private final Analyzer defaultAnalyzer; - private int maxShingleSize = ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE; - private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE; - private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR; - private boolean outputUnigrams = true; - private boolean outputUnigramsIfNoShingles = false; + private final int maxShingleSize; + private final int minShingleSize; + private final String tokenSeparator; + private final boolean outputUnigrams; + private final boolean outputUnigramsIfNoShingles; public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) { - super(); - this.defaultAnalyzer = defaultAnalyzer; + this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); } public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) { - this(defaultAnalyzer); - setMaxShingleSize(maxShingleSize); + this(defaultAnalyzer, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize); } public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) { - this(defaultAnalyzer); - setMaxShingleSize(maxShingleSize); - setMinShingleSize(minShingleSize); + this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false); + } + + /** + * Creates a new ShingleAnalyzerWrapper + * + * @param defaultAnalyzer Analyzer whose TokenStream is to be filtered + * @param minShingleSize Min shingle (token ngram) size + * @param maxShingleSize Max shingle size + * @param tokenSeparator Used to separate input stream tokens in output shingles + * @param outputUnigrams Whether or not the filter shall pass the original + * tokens to the output stream + * @param 
outputUnigramsIfNoShingles Overrides the behavior of outputUnigrams==false for those + * times when no shingles are available (because there are fewer than + * minShingleSize tokens in the input stream)? + * Note that if outputUnigrams==true, then unigrams are always output, + * regardless of whether any shingles are available. + */ + public ShingleAnalyzerWrapper( + Analyzer defaultAnalyzer, + int minShingleSize, + int maxShingleSize, + String tokenSeparator, + boolean outputUnigrams, + boolean outputUnigramsIfNoShingles) { + this.defaultAnalyzer = defaultAnalyzer; + + if (maxShingleSize < 2) { + throw new IllegalArgumentException("Max shingle size must be >= 2"); + } + this.maxShingleSize = maxShingleSize; + + if (minShingleSize < 2) { + throw new IllegalArgumentException("Min shingle size must be >= 2"); + } + if (minShingleSize > maxShingleSize) { + throw new IllegalArgumentException + ("Min shingle size must be <= max shingle size"); + } + this.minShingleSize = minShingleSize; + + this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator); + this.outputUnigrams = outputUnigrams; + this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; } /** * Wraps {@link StandardAnalyzer}. */ public ShingleAnalyzerWrapper(Version matchVersion) { - super(); - this.defaultAnalyzer = new StandardAnalyzer(matchVersion); + this(matchVersion, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); } /** * Wraps {@link StandardAnalyzer}. 
*/ public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) { - this(matchVersion); - setMaxShingleSize(maxShingleSize); - setMinShingleSize(minShingleSize); + this(new StandardAnalyzer(matchVersion), minShingleSize, maxShingleSize); } /** @@ -82,18 +118,6 @@ public final class ShingleAnalyzerWrapper extends Analyzer { return maxShingleSize; } - /** - * Set the maximum size of output shingles (default: 2) - * - * @param maxShingleSize max shingle size - */ - public void setMaxShingleSize(int maxShingleSize) { - if (maxShingleSize < 2) { - throw new IllegalArgumentException("Max shingle size must be >= 2"); - } - this.maxShingleSize = maxShingleSize; - } - /** * The min shingle (token ngram) size * @@ -103,69 +127,17 @@ public final class ShingleAnalyzerWrapper extends Analyzer { return minShingleSize; } - /** - *

Set the min shingle size (default: 2). - *

This method requires that the passed in minShingleSize is not greater - * than maxShingleSize, so make sure that maxShingleSize is set before - * calling this method. - * - * @param minShingleSize min size of output shingles - */ - public void setMinShingleSize(int minShingleSize) { - if (minShingleSize < 2) { - throw new IllegalArgumentException("Min shingle size must be >= 2"); - } - if (minShingleSize > maxShingleSize) { - throw new IllegalArgumentException - ("Min shingle size must be <= max shingle size"); - } - this.minShingleSize = minShingleSize; - } - public String getTokenSeparator() { return tokenSeparator; } - - /** - * Sets the string to use when joining adjacent tokens to form a shingle - * @param tokenSeparator used to separate input stream tokens in output shingles - */ - public void setTokenSeparator(String tokenSeparator) { - this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator); - } public boolean isOutputUnigrams() { return outputUnigrams; } - - /** - * Shall the filter pass the original tokens (the "unigrams") to the output - * stream? - * - * @param outputUnigrams Whether or not the filter shall pass the original - * tokens to the output stream - */ - public void setOutputUnigrams(boolean outputUnigrams) { - this.outputUnigrams = outputUnigrams; - } public boolean isOutputUnigramsIfNoShingles() { return outputUnigramsIfNoShingles; } - - /** - *

Shall we override the behavior of outputUnigrams==false for those - * times when no shingles are available (because there are fewer than - * minShingleSize tokens in the input stream)? (default: false.) - *

Note that if outputUnigrams==true, then unigrams are always output, - * regardless of whether any shingles are available. - * - * @param outputUnigramsIfNoShingles Whether or not to output a single - * unigram when no shingles are available. - */ - public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) { - this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles; - } @Override public TokenStream tokenStream(String fieldName, Reader reader) { diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPerFieldAnalzyerWrapper.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPerFieldAnalzyerWrapper.java index fa2c51d2d1a..6f9ad44d8c4 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPerFieldAnalzyerWrapper.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPerFieldAnalzyerWrapper.java @@ -1,6 +1,8 @@ package org.apache.lucene.analysis.miscellaneous; import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.core.SimpleAnalyzer; @@ -27,9 +29,12 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; public class TestPerFieldAnalzyerWrapper extends BaseTokenStreamTestCase { public void testPerField() throws Exception { String text = "Qwerty"; + + Map analyzerPerField = new HashMap(); + analyzerPerField.put("special", new SimpleAnalyzer(TEST_VERSION_CURRENT)); + PerFieldAnalyzerWrapper analyzer = - new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT)); - analyzer.addAnalyzer("special", new SimpleAnalyzer(TEST_VERSION_CURRENT)); + new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), analyzerPerField); TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text)); diff --git 
a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java index e4b4f35fe03..aaade9fd350 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.shingle; * limitations under the License. */ -import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; @@ -162,7 +161,9 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 }, new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 }); - analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false); assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles", new String[] { "please divide this", "please divide this sentence", "divide this sentence", "divide this sentence into", @@ -186,7 +187,9 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 }, new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 }, new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 }); - analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false); assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles", new String[] { "please divide this", "divide this sentence", @@ -198,9 +201,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { } public void 
testNoTokenSeparator() throws Exception { - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - analyzer.setTokenSeparator(""); + ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", true, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "please", "pleasedivide", "divide", "divideinto", @@ -209,7 +214,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); - analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", false, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "pleasedivide", "divideinto", @@ -220,9 +230,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { } public void testNullTokenSeparator() throws Exception { - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - analyzer.setTokenSeparator(null); + ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + null, true, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "please", "pleasedivide", "divide", "divideinto", @@ -231,7 +243,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 
1 }); - analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", false, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "pleasedivide", "divideinto", @@ -241,9 +258,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { new int[] { 1, 1, 1 }); } public void testAltTokenSeparator() throws Exception { - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - analyzer.setTokenSeparator(""); + ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", true, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "please", "pleasedivide", "divide", "divideinto", @@ -252,7 +271,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new int[] { 1, 0, 1, 0, 1, 0, 1 }); - analyzer.setOutputUnigrams(false); + + analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", false, false); assertAnalyzesToReuse(analyzer, "please divide into shingles", new String[] { "pleasedivide", "divideinto", @@ -263,10 +287,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { } public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception { - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - analyzer.setOutputUnigrams(false); - analyzer.setOutputUnigramsIfNoShingles(true); 
+ ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, + "", false, true); assertAnalyzesToReuse(analyzer, "please", new String[] { "please" }, new int[] { 0 }, diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java index a4af36ae12d..6dfd4cf0817 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java @@ -21,6 +21,7 @@ import java.util.StringTokenizer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper; +import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.benchmark.byTask.PerfRunData; /** @@ -64,9 +65,14 @@ public class NewShingleAnalyzerTask extends PerfTask { } wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName); } - ShingleAnalyzerWrapper analyzer - = new ShingleAnalyzerWrapper(wrappedAnalyzer, maxShingleSize); - analyzer.setOutputUnigrams(outputUnigrams); + + ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper( + wrappedAnalyzer, + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, + maxShingleSize, + ShingleFilter.TOKEN_SEPARATOR, + outputUnigrams, + false); getRunData().setAnalyzer(analyzer); }