mirror of https://github.com/apache/lucene.git
LUCENE-3434: Removed state changing setters in ShingleAnalyzerWrapper and PerFieldAnalyzerWrapper
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1170942 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5f9c4235dc
commit
318911200d
|
@ -38,6 +38,10 @@ API Changes
|
|||
since they prevented reuse. Stopwords are now generated at instantiation through
|
||||
the Analyzer's constructors. (Chris Male)
|
||||
|
||||
* LUCENE-3434: Removed ShingleAnalyzerWrapper.set* and PerFieldAnalyzerWrapper.addAnalyzer
|
||||
since they prevent reuse. Both Analyzers should be configured at instantiation.
|
||||
(Chris Male)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
|
||||
|
|
|
@ -23,21 +23,25 @@ import org.apache.lucene.index.IndexableField;
|
|||
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* This analyzer is used to facilitate scenarios where different
|
||||
* fields require different analysis techniques. Use {@link #addAnalyzer}
|
||||
* to add a non-default analyzer on a field name basis.
|
||||
* fields require different analysis techniques. Use the Map
|
||||
* argument in {@link #PerFieldAnalyzerWrapper(Analyzer, java.util.Map)}
|
||||
* to add non-default analyzers for fields.
|
||||
*
|
||||
* <p>Example usage:
|
||||
*
|
||||
* <pre>
|
||||
* Map&lt;String,Analyzer&gt; analyzerPerField = new HashMap&lt;String,Analyzer&gt;();
|
||||
* analyzerPerField.put("firstname", new KeywordAnalyzer());
|
||||
* analyzerPerField.put("lastname", new KeywordAnalyzer());
|
||||
*
|
||||
* PerFieldAnalyzerWrapper aWrapper =
|
||||
* new PerFieldAnalyzerWrapper(new StandardAnalyzer());
|
||||
* aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
|
||||
* aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
|
||||
* new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);
|
||||
* </pre>
|
||||
*
|
||||
* <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
|
||||
|
@ -47,9 +51,8 @@ import java.util.HashMap;
|
|||
* and query parsing.
|
||||
*/
|
||||
public final class PerFieldAnalyzerWrapper extends Analyzer {
|
||||
private Analyzer defaultAnalyzer;
|
||||
private Map<String,Analyzer> analyzerMap = new HashMap<String,Analyzer>();
|
||||
|
||||
private final Analyzer defaultAnalyzer;
|
||||
private final Map<String, Analyzer> fieldAnalyzers;
|
||||
|
||||
/**
|
||||
* Constructs with default analyzer.
|
||||
|
@ -70,28 +73,15 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
|
|||
* @param fieldAnalyzers a Map (String field name to the Analyzer) to be
|
||||
* used for those fields
|
||||
*/
|
||||
public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer,
|
||||
public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer,
|
||||
Map<String,Analyzer> fieldAnalyzers) {
|
||||
this.defaultAnalyzer = defaultAnalyzer;
|
||||
if (fieldAnalyzers != null) {
|
||||
analyzerMap.putAll(fieldAnalyzers);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Defines an analyzer to use for the specified field.
|
||||
*
|
||||
* @param fieldName field name requiring a non-default analyzer
|
||||
* @param analyzer non-default analyzer to use for field
|
||||
*/
|
||||
public void addAnalyzer(String fieldName, Analyzer analyzer) {
|
||||
analyzerMap.put(fieldName, analyzer);
|
||||
this.fieldAnalyzers = (fieldAnalyzers != null) ? fieldAnalyzers : Collections.<String, Analyzer>emptyMap();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
Analyzer analyzer = analyzerMap.get(fieldName);
|
||||
Analyzer analyzer = fieldAnalyzers.get(fieldName);
|
||||
if (analyzer == null) {
|
||||
analyzer = defaultAnalyzer;
|
||||
}
|
||||
|
@ -101,7 +91,7 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
|
|||
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
Analyzer analyzer = analyzerMap.get(fieldName);
|
||||
Analyzer analyzer = fieldAnalyzers.get(fieldName);
|
||||
if (analyzer == null)
|
||||
analyzer = defaultAnalyzer;
|
||||
|
||||
|
@ -111,7 +101,7 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
|
|||
/** Return the positionIncrementGap from the analyzer assigned to fieldName */
|
||||
@Override
|
||||
public int getPositionIncrementGap(String fieldName) {
|
||||
Analyzer analyzer = analyzerMap.get(fieldName);
|
||||
Analyzer analyzer = fieldAnalyzers.get(fieldName);
|
||||
if (analyzer == null)
|
||||
analyzer = defaultAnalyzer;
|
||||
return analyzer.getPositionIncrementGap(fieldName);
|
||||
|
@ -120,7 +110,7 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
|
|||
/** Return the offsetGap from the analyzer assigned to field */
|
||||
@Override
|
||||
public int getOffsetGap(IndexableField field) {
|
||||
Analyzer analyzer = analyzerMap.get(field.name());
|
||||
Analyzer analyzer = fieldAnalyzers.get(field.name());
|
||||
if (analyzer == null) {
|
||||
analyzer = defaultAnalyzer;
|
||||
}
|
||||
|
@ -129,6 +119,6 @@ public final class PerFieldAnalyzerWrapper extends Analyzer {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "PerFieldAnalyzerWrapper(" + analyzerMap + ", default=" + defaultAnalyzer + ")";
|
||||
return "PerFieldAnalyzerWrapper(" + fieldAnalyzers + ", default=" + defaultAnalyzer + ")";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,43 +34,79 @@ import org.apache.lucene.util.Version;
|
|||
public final class ShingleAnalyzerWrapper extends Analyzer {
|
||||
|
||||
private final Analyzer defaultAnalyzer;
|
||||
private int maxShingleSize = ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
|
||||
private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
|
||||
private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
|
||||
private boolean outputUnigrams = true;
|
||||
private boolean outputUnigramsIfNoShingles = false;
|
||||
private final int maxShingleSize;
|
||||
private final int minShingleSize;
|
||||
private final String tokenSeparator;
|
||||
private final boolean outputUnigrams;
|
||||
private final boolean outputUnigramsIfNoShingles;
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
||||
super();
|
||||
this.defaultAnalyzer = defaultAnalyzer;
|
||||
this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||
}
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
|
||||
this(defaultAnalyzer);
|
||||
setMaxShingleSize(maxShingleSize);
|
||||
this(defaultAnalyzer, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
|
||||
}
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
|
||||
this(defaultAnalyzer);
|
||||
setMaxShingleSize(maxShingleSize);
|
||||
setMinShingleSize(minShingleSize);
|
||||
this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new ShingleAnalyzerWrapper
|
||||
*
|
||||
* @param defaultAnalyzer Analyzer whose TokenStream is to be filtered
|
||||
* @param minShingleSize Min shingle (token ngram) size
|
||||
* @param maxShingleSize Max shingle size
|
||||
* @param tokenSeparator Used to separate input stream tokens in output shingles
|
||||
* @param outputUnigrams Whether or not the filter shall pass the original
|
||||
* tokens to the output stream
|
||||
* @param outputUnigramsIfNoShingles Overrides the behavior of outputUnigrams==false for those
|
||||
* times when no shingles are available (because there are fewer than
|
||||
* minShingleSize tokens in the input stream).
|
||||
* Note that if outputUnigrams==true, then unigrams are always output,
|
||||
* regardless of whether any shingles are available.
|
||||
*/
|
||||
public ShingleAnalyzerWrapper(
|
||||
Analyzer defaultAnalyzer,
|
||||
int minShingleSize,
|
||||
int maxShingleSize,
|
||||
String tokenSeparator,
|
||||
boolean outputUnigrams,
|
||||
boolean outputUnigramsIfNoShingles) {
|
||||
this.defaultAnalyzer = defaultAnalyzer;
|
||||
|
||||
if (maxShingleSize < 2) {
|
||||
throw new IllegalArgumentException("Max shingle size must be >= 2");
|
||||
}
|
||||
this.maxShingleSize = maxShingleSize;
|
||||
|
||||
if (minShingleSize < 2) {
|
||||
throw new IllegalArgumentException("Min shingle size must be >= 2");
|
||||
}
|
||||
if (minShingleSize > maxShingleSize) {
|
||||
throw new IllegalArgumentException
|
||||
("Min shingle size must be <= max shingle size");
|
||||
}
|
||||
this.minShingleSize = minShingleSize;
|
||||
|
||||
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
|
||||
this.outputUnigrams = outputUnigrams;
|
||||
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wraps {@link StandardAnalyzer}.
|
||||
*/
|
||||
public ShingleAnalyzerWrapper(Version matchVersion) {
|
||||
super();
|
||||
this.defaultAnalyzer = new StandardAnalyzer(matchVersion);
|
||||
this(matchVersion, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Wraps {@link StandardAnalyzer}.
|
||||
*/
|
||||
public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) {
|
||||
this(matchVersion);
|
||||
setMaxShingleSize(maxShingleSize);
|
||||
setMinShingleSize(minShingleSize);
|
||||
this(new StandardAnalyzer(matchVersion), minShingleSize, maxShingleSize);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -82,18 +118,6 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
return maxShingleSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the maximum size of output shingles (default: 2)
|
||||
*
|
||||
* @param maxShingleSize max shingle size
|
||||
*/
|
||||
public void setMaxShingleSize(int maxShingleSize) {
|
||||
if (maxShingleSize < 2) {
|
||||
throw new IllegalArgumentException("Max shingle size must be >= 2");
|
||||
}
|
||||
this.maxShingleSize = maxShingleSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* The min shingle (token ngram) size
|
||||
*
|
||||
|
@ -103,69 +127,17 @@ public final class ShingleAnalyzerWrapper extends Analyzer {
|
|||
return minShingleSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Set the min shingle size (default: 2).
|
||||
* <p>This method requires that the passed in minShingleSize is not greater
|
||||
* than maxShingleSize, so make sure that maxShingleSize is set before
|
||||
* calling this method.
|
||||
*
|
||||
* @param minShingleSize min size of output shingles
|
||||
*/
|
||||
public void setMinShingleSize(int minShingleSize) {
|
||||
if (minShingleSize < 2) {
|
||||
throw new IllegalArgumentException("Min shingle size must be >= 2");
|
||||
}
|
||||
if (minShingleSize > maxShingleSize) {
|
||||
throw new IllegalArgumentException
|
||||
("Min shingle size must be <= max shingle size");
|
||||
}
|
||||
this.minShingleSize = minShingleSize;
|
||||
}
|
||||
|
||||
public String getTokenSeparator() {
|
||||
return tokenSeparator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the string to use when joining adjacent tokens to form a shingle
|
||||
* @param tokenSeparator used to separate input stream tokens in output shingles
|
||||
*/
|
||||
public void setTokenSeparator(String tokenSeparator) {
|
||||
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
|
||||
}
|
||||
|
||||
public boolean isOutputUnigrams() {
|
||||
return outputUnigrams;
|
||||
}
|
||||
|
||||
/**
|
||||
* Shall the filter pass the original tokens (the "unigrams") to the output
|
||||
* stream?
|
||||
*
|
||||
* @param outputUnigrams Whether or not the filter shall pass the original
|
||||
* tokens to the output stream
|
||||
*/
|
||||
public void setOutputUnigrams(boolean outputUnigrams) {
|
||||
this.outputUnigrams = outputUnigrams;
|
||||
}
|
||||
|
||||
public boolean isOutputUnigramsIfNoShingles() {
|
||||
return outputUnigramsIfNoShingles;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Shall we override the behavior of outputUnigrams==false for those
|
||||
* times when no shingles are available (because there are fewer than
|
||||
* minShingleSize tokens in the input stream)? (default: false.)
|
||||
* <p>Note that if outputUnigrams==true, then unigrams are always output,
|
||||
* regardless of whether any shingles are available.
|
||||
*
|
||||
* @param outputUnigramsIfNoShingles Whether or not to output a single
|
||||
* unigram when no shingles are available.
|
||||
*/
|
||||
public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
|
||||
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.core.SimpleAnalyzer;
|
||||
|
@ -27,9 +29,12 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
public class TestPerFieldAnalzyerWrapper extends BaseTokenStreamTestCase {
|
||||
public void testPerField() throws Exception {
|
||||
String text = "Qwerty";
|
||||
|
||||
Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
|
||||
analyzerPerField.put("special", new SimpleAnalyzer(TEST_VERSION_CURRENT));
|
||||
|
||||
PerFieldAnalyzerWrapper analyzer =
|
||||
new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
|
||||
analyzer.addAnalyzer("special", new SimpleAnalyzer(TEST_VERSION_CURRENT));
|
||||
new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), analyzerPerField);
|
||||
|
||||
TokenStream tokenStream = analyzer.tokenStream("field",
|
||||
new StringReader(text));
|
||||
|
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.analysis.shingle;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -162,7 +161,9 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
|
||||
new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
|
||||
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
|
||||
analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
|
||||
new String[] { "please divide this", "please divide this sentence",
|
||||
"divide this sentence", "divide this sentence into",
|
||||
|
@ -186,7 +187,9 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
|
||||
new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
|
||||
analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
|
||||
new String[] { "please divide this",
|
||||
"divide this sentence",
|
||||
|
@ -198,9 +201,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testNoTokenSeparator() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
|
||||
analyzer.setTokenSeparator("");
|
||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"", true, false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "pleasedivide",
|
||||
"divide", "divideinto",
|
||||
|
@ -209,7 +214,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
|
||||
analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"", false, false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "pleasedivide",
|
||||
"divideinto",
|
||||
|
@ -220,9 +230,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testNullTokenSeparator() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
|
||||
analyzer.setTokenSeparator(null);
|
||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
null, true, false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "pleasedivide",
|
||||
"divide", "divideinto",
|
||||
|
@ -231,7 +243,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
|
||||
analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"", false, false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "pleasedivide",
|
||||
"divideinto",
|
||||
|
@ -241,9 +258,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 1, 1, 1 });
|
||||
}
|
||||
public void testAltTokenSeparator() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
|
||||
analyzer.setTokenSeparator("<SEP>");
|
||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"<SEP>", true, false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "please", "please<SEP>divide",
|
||||
"divide", "divide<SEP>into",
|
||||
|
@ -252,7 +271,12 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
analyzer.setOutputUnigrams(false);
|
||||
|
||||
analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"<SEP>", false, false);
|
||||
assertAnalyzesToReuse(analyzer, "please divide into shingles",
|
||||
new String[] { "please<SEP>divide",
|
||||
"divide<SEP>into",
|
||||
|
@ -263,10 +287,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
|
||||
analyzer.setOutputUnigrams(false);
|
||||
analyzer.setOutputUnigramsIfNoShingles(true);
|
||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||
new MockAnalyzer(random, MockTokenizer.WHITESPACE, false),
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
|
||||
"", false, true);
|
||||
assertAnalyzesToReuse(analyzer, "please",
|
||||
new String[] { "please" },
|
||||
new int[] { 0 },
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.util.StringTokenizer;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
|
||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
|
||||
/**
|
||||
|
@ -64,9 +65,14 @@ public class NewShingleAnalyzerTask extends PerfTask {
|
|||
}
|
||||
wrappedAnalyzer = NewAnalyzerTask.createAnalyzer(analyzerClassName);
|
||||
}
|
||||
ShingleAnalyzerWrapper analyzer
|
||||
= new ShingleAnalyzerWrapper(wrappedAnalyzer, maxShingleSize);
|
||||
analyzer.setOutputUnigrams(outputUnigrams);
|
||||
|
||||
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
|
||||
wrappedAnalyzer,
|
||||
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
|
||||
maxShingleSize,
|
||||
ShingleFilter.TOKEN_SEPARATOR,
|
||||
outputUnigrams,
|
||||
false);
|
||||
getRunData().setAnalyzer(analyzer);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue