LUCENE-3434: Removed state-changing setters in ShingleAnalyzerWrapper and PerFieldAnalyzerWrapper

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1170942 13f79535-47bb-0310-9956-ffa450edef68
Christopher John Male 2011-09-15 03:21:17 +00:00
parent 5f9c4235dc
commit 318911200d
6 changed files with 135 additions and 133 deletions

CHANGES.txt

@@ -38,6 +38,10 @@ API Changes
since they prevented reuse. Stopwords are now generated at instantiation through
the Analyzer's constructors. (Chris Male)
* LUCENE-3434: Removed ShingleAnalyzerWrapper.set* and PerFieldAnalyzerWrapper.addAnalyzer
since they prevent reuse. Both Analyzers should be configured at instantiation.
(Chris Male)
New Features
* LUCENE-2341: A new analyzer/filter: Morfologik - a dictionary-driven lemmatizer
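
For upgraders, a minimal before/after sketch of the ShingleAnalyzerWrapper side of this change; the Version constant and the shingle sizes here are illustrative, not taken from the commit:

// Before this commit: configuration through the now-removed setters
ShingleAnalyzerWrapper old = new ShingleAnalyzerWrapper(
    new StandardAnalyzer(Version.LUCENE_34));
old.setMaxShingleSize(3);       // removed by LUCENE-3434
old.setOutputUnigrams(false);   // removed by LUCENE-3434

// After this commit: every option is a constructor argument
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
    new StandardAnalyzer(Version.LUCENE_34),
    ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, // minShingleSize
    3,                                      // maxShingleSize
    ShingleFilter.TOKEN_SEPARATOR,          // tokenSeparator
    false,                                  // outputUnigrams
    false);                                 // outputUnigramsIfNoShingles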

PerFieldAnalyzerWrapper.java

@@ -23,21 +23,25 @@ import org.apache.lucene.index.IndexableField;
import java.io.Reader;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
/**
* This analyzer is used to facilitate scenarios where different
* fields require different analysis techniques. Use {@link #addAnalyzer}
* to add a non-default analyzer on a field name basis.
* fields require different analysis techniques. Use the Map
* argument in {@link #PerFieldAnalyzerWrapper(Analyzer, java.util.Map)}
* to add non-default analyzers for fields.
*
* <p>Example usage:
*
* <pre>
* Map analyzerPerField = new HashMap();
* analyzerPerField.put("firstname", new KeywordAnalyzer());
* analyzerPerField.put("lastname", new KeywordAnalyzer());
*
* PerFieldAnalyzerWrapper aWrapper =
* new PerFieldAnalyzerWrapper(new StandardAnalyzer());
* aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
* aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
* new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);
* </pre>
*
* <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
@@ -47,9 +51,8 @@ import java.util.HashMap;
* and query parsing.
*/
public final class PerFieldAnalyzerWrapper extends Analyzer {
private Analyzer defaultAnalyzer;
private Map<String,Analyzer> analyzerMap = new HashMap<String,Analyzer>();
private final Analyzer defaultAnalyzer;
private final Map<String, Analyzer> fieldAnalyzers;
/**
* Constructs with default analyzer.
@@ -70,28 +73,15 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
* @param fieldAnalyzers a Map (String field name to the Analyzer) to be
* used for those fields
*/
public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer,
Map<String,Analyzer> fieldAnalyzers) {
this.defaultAnalyzer = defaultAnalyzer;
if (fieldAnalyzers != null) {
analyzerMap.putAll(fieldAnalyzers);
}
}
/**
* Defines an analyzer to use for the specified field.
*
* @param fieldName field name requiring a non-default analyzer
* @param analyzer non-default analyzer to use for field
*/
public void addAnalyzer(String fieldName, Analyzer analyzer) {
analyzerMap.put(fieldName, analyzer);
this.fieldAnalyzers = (fieldAnalyzers != null) ? fieldAnalyzers : Collections.<String, Analyzer>emptyMap();
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
Analyzer analyzer = analyzerMap.get(fieldName);
Analyzer analyzer = fieldAnalyzers.get(fieldName);
if (analyzer == null) {
analyzer = defaultAnalyzer;
}
@@ -101,7 +91,7 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
Analyzer analyzer = analyzerMap.get(fieldName);
Analyzer analyzer = fieldAnalyzers.get(fieldName);
if (analyzer == null)
analyzer = defaultAnalyzer;
@@ -111,7 +101,7 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
/** Return the positionIncrementGap from the analyzer assigned to fieldName */
@Override
public int getPositionIncrementGap(String fieldName) {
Analyzer analyzer = analyzerMap.get(fieldName);
Analyzer analyzer = fieldAnalyzers.get(fieldName);
if (analyzer == null)
analyzer = defaultAnalyzer;
return analyzer.getPositionIncrementGap(fieldName);
@@ -120,7 +110,7 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
/** Return the offsetGap from the analyzer assigned to field */
@Override
public int getOffsetGap(IndexableField field) {
Analyzer analyzer = analyzerMap.get(field.name());
Analyzer analyzer = fieldAnalyzers.get(field.name());
if (analyzer == null) {
analyzer = defaultAnalyzer;
}
@@ -129,6 +119,6 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
@Override
public String toString() {
return "PerFieldAnalyzerWrapper(" + analyzerMap + ", default=" + defaultAnalyzer + ")";
return "PerFieldAnalyzerWrapper(" + fieldAnalyzers + ", default=" + defaultAnalyzer + ")";
}
}
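
A self-contained sketch of the new Map-based construction. Import paths assume the trunk module layout of the time, and the field name "partnum" is illustrative:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

public class PerFieldExample {
  public static void main(String[] args) {
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("partnum", new SimpleAnalyzer(Version.LUCENE_34));

    // All per-field analyzers are supplied up front; with addAnalyzer() gone,
    // the wrapper is effectively immutable and safe to share between threads.
    Analyzer analyzer = new PerFieldAnalyzerWrapper(
        new StandardAnalyzer(Version.LUCENE_34), analyzerPerField);
    System.out.println(analyzer); // PerFieldAnalyzerWrapper({partnum=...}, default=...)
  }
}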

ShingleAnalyzerWrapper.java

@@ -34,43 +34,79 @@ import org.apache.lucene.util.Version;
public final class ShingleAnalyzerWrapper extends Analyzer {
private final Analyzer defaultAnalyzer;
private int maxShingleSize = ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
private boolean outputUnigrams = true;
private boolean outputUnigramsIfNoShingles = false;
private final int maxShingleSize;
private final int minShingleSize;
private final String tokenSeparator;
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
super();
this.defaultAnalyzer = defaultAnalyzer;
this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
this(defaultAnalyzer);
setMaxShingleSize(maxShingleSize);
this(defaultAnalyzer, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
this(defaultAnalyzer);
setMaxShingleSize(maxShingleSize);
setMinShingleSize(minShingleSize);
this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false);
}
/**
* Creates a new ShingleAnalyzerWrapper
*
* @param defaultAnalyzer Analyzer whose TokenStream is to be filtered
* @param minShingleSize Min shingle (token ngram) size
* @param maxShingleSize Max shingle size
* @param tokenSeparator Used to separate input stream tokens in output shingles
* @param outputUnigrams Whether or not the filter shall pass the original
* tokens to the output stream
* @param outputUnigramsIfNoShingles Overrides the behavior of outputUnigrams==false for those
* times when no shingles are available (because there are fewer than
* minShingleSize tokens in the input stream).
* Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available.
*/
public ShingleAnalyzerWrapper(
Analyzer defaultAnalyzer,
int minShingleSize,
int maxShingleSize,
String tokenSeparator,
boolean outputUnigrams,
boolean outputUnigramsIfNoShingles) {
this.defaultAnalyzer = defaultAnalyzer;
if (maxShingleSize < 2) {
throw new IllegalArgumentException("Max shingle size must be >= 2");
}
this.maxShingleSize = maxShingleSize;
if (minShingleSize < 2) {
throw new IllegalArgumentException("Min shingle size must be >= 2");
}
if (minShingleSize > maxShingleSize) {
throw new IllegalArgumentException
("Min shingle size must be <= max shingle size");
}
this.minShingleSize = minShingleSize;
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
this.outputUnigrams = outputUnigrams;
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
}
/**
* Wraps {@link StandardAnalyzer}.
*/
public ShingleAnalyzerWrapper(Version matchVersion) {
super();
this.defaultAnalyzer = new StandardAnalyzer(matchVersion);
this(matchVersion, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
}
/**
* Wraps {@link StandardAnalyzer}.
*/
public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) {
this(matchVersion);
setMaxShingleSize(maxShingleSize);
setMinShingleSize(minShingleSize);
this(new StandardAnalyzer(matchVersion), minShingleSize, maxShingleSize);
}
/**
@@ -82,18 +118,6 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
return maxShingleSize;
}
/**
* Set the maximum size of output shingles (default: 2)
*
* @param maxShingleSize max shingle size
*/
public void setMaxShingleSize(int maxShingleSize) {
if (maxShingleSize < 2) {
throw new IllegalArgumentException("Max shingle size must be >= 2");
}
this.maxShingleSize = maxShingleSize;
}
/**
* The min shingle (token ngram) size
*
@@ -103,69 +127,17 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
return minShingleSize;
}
/**
* <p>Set the min shingle size (default: 2).
* <p>This method requires that the passed in minShingleSize is not greater
* than maxShingleSize, so make sure that maxShingleSize is set before
* calling this method.
*
* @param minShingleSize min size of output shingles
*/
public void setMinShingleSize(int minShingleSize) {
if (minShingleSize < 2) {
throw new IllegalArgumentException("Min shingle size must be >= 2");
}
if (minShingleSize > maxShingleSize) {
throw new IllegalArgumentException
("Min shingle size must be <= max shingle size");
}
this.minShingleSize = minShingleSize;
}
public String getTokenSeparator() {
return tokenSeparator;
}
/**
* Sets the string to use when joining adjacent tokens to form a shingle
* @param tokenSeparator used to separate input stream tokens in output shingles
*/
public void setTokenSeparator(String tokenSeparator) {
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
}
public boolean isOutputUnigrams() {
return outputUnigrams;
}
/**
* Shall the filter pass the original tokens (the "unigrams") to the output
* stream?
*
* @param outputUnigrams Whether or not the filter shall pass the original
* tokens to the output stream
*/
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
}
public boolean isOutputUnigramsIfNoShingles() {
return outputUnigramsIfNoShingles;
}
/**
* <p>Shall we override the behavior of outputUnigrams==false for those
* times when no shingles are available (because there are fewer than
* minShingleSize tokens in the input stream)? (default: false.)
* <p>Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available.
*
* @param outputUnigramsIfNoShingles Whether or not to output a single
* unigram when no shingles are available.
*/
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
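
A sketch of the full six-argument constructor with the validation it now performs at construction time; the separator and sizes are illustrative:

// Shingles of 2..3 tokens joined with "_", original unigrams suppressed.
// The constructor now throws IllegalArgumentException when maxShingleSize < 2,
// minShingleSize < 2, or minShingleSize > maxShingleSize.
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
    new StandardAnalyzer(Version.LUCENE_34),
    2,      // minShingleSize (must be >= 2)
    3,      // maxShingleSize (must be >= 2 and >= minShingleSize)
    "_",    // tokenSeparator (null is coerced to "")
    false,  // outputUnigrams
    false); // outputUnigramsIfNoShingles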

TestPerFieldAnalzyerWrapper.java

@@ -1,6 +1,8 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
@@ -27,9 +29,12 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class TestPerFieldAnalzyerWrapper extends BaseTokenStreamTestCase {
public void testPerField() throws Exception {
String text = "Qwerty";
Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
analyzerPerField.put("special", new SimpleAnalyzer(TEST_VERSION_CURRENT));
PerFieldAnalyzerWrapper analyzer =
new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
analyzer.addAnalyzer("special", new SimpleAnalyzer(TEST_VERSION_CURRENT));
new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), analyzerPerField);
TokenStream tokenStream = analyzer.tokenStream("field",
new StringReader(text));

ShingleAnalyzerWrapperTest.java

@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.shingle;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
@@ -162,7 +161,9 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
analyzer.setOutputUnigrams(false);
analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this", "please divide this sentence",
"divide this sentence", "divide this sentence into",
@@ -186,7 +187,9 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
analyzer.setOutputUnigrams(false);
analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this",
"divide this sentence",
@@ -198,9 +201,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
}
public void testNoTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setTokenSeparator("");
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", true, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@@ -209,7 +214,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
analyzer.setOutputUnigrams(false);
analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@@ -220,9 +230,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
}
public void testNullTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setTokenSeparator(null);
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
null, true, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@@ -231,7 +243,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
analyzer.setOutputUnigrams(false);
analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@@ -241,9 +258,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 1, 1, 1 });
}
public void testAltTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setTokenSeparator("<SEP>");
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"<SEP>", true, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please", "please<SEP>divide",
"divide", "divide<SEP>into",
@@ -252,7 +271,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 0, 0, 7, 7, 14, 14, 19 },
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
analyzer.setOutputUnigrams(false);
analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"<SEP>", false, false);
assertAnalyzesToReuse(analyzer, "please divide into shingles",
new String[] { "please<SEP>divide",
"divide<SEP>into",
@@ -263,10 +287,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
}
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
analyzer.setOutputUnigrams(false);
analyzer.setOutputUnigramsIfNoShingles(true);
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
"", false, true);
assertAnalyzesToReuse(analyzer, "please",
new String[] { "please" },
new int[] { 0 },

NewShingleAnalyzerTask.java

@@ -21,6 +21,7 @@ import java.util.StringTokenizer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.benchmark.byTask.PerfRunData;
/**
@@ -64,9 +65,14 @@ public class NewShingleAnalyzerTask extends PerfTask {
}
wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName);
}
ShingleAnalyzerWrapper analyzer
= new ShingleAnalyzerWrapper(wrappedAnalyzer, maxShingleSize);
analyzer.setOutputUnigrams(outputUnigrams);
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
wrappedAnalyzer,
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
maxShingleSize,
ShingleFilter.TOKEN_SEPARATOR,
outputUnigrams,
false);
getRunData().setAnalyzer(analyzer);
}
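
What immutability buys the benchmark task (and any other caller) is safe reuse: with no setters left, one wrapper instance can serve repeated analysis. A sketch reusing the task's variables from above; the field name and sample text are illustrative:

// wrappedAnalyzer, maxShingleSize and outputUnigrams as in the task above.
Analyzer analyzer = new ShingleAnalyzerWrapper(
    wrappedAnalyzer,
    ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
    maxShingleSize,
    ShingleFilter.TOKEN_SEPARATOR,
    outputUnigrams,
    false);
// The same instance can be handed to every subsequent task; reusableTokenStream()
// (declared to throw IOException) recycles the underlying stream components per field.
TokenStream stream = analyzer.reusableTokenStream("body", new StringReader("please divide this"));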