mirror of https://github.com/apache/lucene.git
LUCENE-8497: Replace MultiTermAwareComponent with normalize() method
This commit is contained in:
parent
759af0127c
commit
65486442c4
|
@ -87,6 +87,11 @@ API Changes
|
|||
* LUCENE-8513: MultiFields.getFields is now removed. Please avoid this class,
|
||||
and Fields in general, when possible. (David Smiley)
|
||||
|
||||
* LUCENE-8497: MultiTermAwareComponent has been removed, and in its place
|
||||
TokenFilterFactory and CharFilterFactory now expose type-safe normalize()
|
||||
methods. This decouples normalization from tokenization entirely.
|
||||
(Mayya Sharipova, Alan Woodward)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
||||
* LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of
|
||||
|
|
|
@ -145,3 +145,8 @@ use a TokenFilter chain as you would with any other Tokenizer.
|
|||
|
||||
Both Highlighter and FastVectorHighlighter need a custom WeightedSpanTermExtractor or FieldQuery respectively
|
||||
in order to support ToParent/ToChildBlockJoinQuery.
|
||||
|
||||
## MultiTermAwareComponent replaced by CharFilterFactory#normalize() and TokenFilterFactory#normalize() ##
|
||||
|
||||
Normalization is now type-safe, with CharFilterFactory#normalize() returning a Reader and
|
||||
TokenFilterFactory#normalize() returning a TokenStream.
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.ar;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -36,7 +34,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
*
|
||||
* @since 3.1
|
||||
*/
|
||||
public class ArabicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class ArabicNormalizationFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new ArabicNormalizationFilterFactory */
|
||||
public ArabicNormalizationFilterFactory(Map<String,String> args) {
|
||||
|
@ -47,12 +45,12 @@ public class ArabicNormalizationFilterFactory extends TokenFilterFactory impleme
|
|||
}
|
||||
|
||||
@Override
|
||||
public ArabicNormalizationFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new ArabicNormalizationFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,13 +17,11 @@
|
|||
package org.apache.lucene.analysis.bn;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link BengaliNormalizationFilter}.
|
||||
* <pre class="prettyprint">
|
||||
|
@ -35,7 +33,7 @@ import java.util.Map;
|
|||
* </fieldType></pre>
|
||||
* @since 7.1.0
|
||||
*/
|
||||
public class BengaliNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class BengaliNormalizationFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public BengaliNormalizationFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
|
@ -48,9 +46,9 @@ public class BengaliNormalizationFilterFactory extends TokenFilterFactory implem
|
|||
public TokenStream create(TokenStream input) {
|
||||
return new BengaliNormalizationFilter(input);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,9 +25,7 @@ import java.util.Map;
|
|||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
|
||||
|
@ -44,7 +42,7 @@ import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
|||
* @since Solr 1.4
|
||||
*/
|
||||
public class MappingCharFilterFactory extends CharFilterFactory implements
|
||||
ResourceLoaderAware, MultiTermAwareComponent {
|
||||
ResourceLoaderAware {
|
||||
|
||||
protected NormalizeCharMap normMap;
|
||||
private final String mapping;
|
||||
|
@ -86,6 +84,11 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
|
|||
return normMap == null ? input : new MappingCharFilter(normMap,input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader normalize(Reader input) {
|
||||
return create(input);
|
||||
}
|
||||
|
||||
// "source" => "target"
|
||||
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
|
||||
|
||||
|
@ -131,8 +134,4 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
|
|||
return new String( out, 0, writePos );
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.cjk;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -37,7 +35,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </fieldType></pre>
|
||||
* @since 3.6.0
|
||||
*/
|
||||
public class CJKWidthFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class CJKWidthFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new CJKWidthFilterFactory */
|
||||
public CJKWidthFilterFactory(Map<String,String> args) {
|
||||
|
@ -51,9 +49,9 @@ public class CJKWidthFilterFactory extends TokenFilterFactory implements MultiTe
|
|||
public TokenStream create(TokenStream input) {
|
||||
return new CJKWidthFilter(input);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.ckb;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </fieldType></pre>
|
||||
* @since 4.7.0
|
||||
*/
|
||||
public class SoraniNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class SoraniNormalizationFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new SoraniNormalizationFilterFactory */
|
||||
public SoraniNormalizationFilterFactory(Map<String,String> args) {
|
||||
|
@ -46,12 +44,12 @@ public class SoraniNormalizationFilterFactory extends TokenFilterFactory impleme
|
|||
}
|
||||
|
||||
@Override
|
||||
public SoraniNormalizationFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new SoraniNormalizationFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.core;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </fieldType></pre>
|
||||
* @since 5.4.0
|
||||
*/
|
||||
public class DecimalDigitFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class DecimalDigitFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new DecimalDigitFilterFactory */
|
||||
public DecimalDigitFilterFactory(Map<String,String> args) {
|
||||
|
@ -46,12 +44,12 @@ public class DecimalDigitFilterFactory extends TokenFilterFactory implements Mul
|
|||
}
|
||||
|
||||
@Override
|
||||
public DecimalDigitFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new DecimalDigitFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.core;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -36,7 +34,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
*
|
||||
* @since 3.1
|
||||
*/
|
||||
public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class LowerCaseFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new LowerCaseFilterFactory */
|
||||
public LowerCaseFilterFactory(Map<String,String> args) {
|
||||
|
@ -47,12 +45,12 @@ public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiT
|
|||
}
|
||||
|
||||
@Override
|
||||
public LowerCaseFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new LowerCaseFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.core;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -40,7 +38,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* general search matching
|
||||
* @since 4.7.0
|
||||
*/
|
||||
public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class UpperCaseFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new UpperCaseFilterFactory */
|
||||
public UpperCaseFilterFactory(Map<String,String> args) {
|
||||
|
@ -51,12 +49,12 @@ public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiT
|
|||
}
|
||||
|
||||
@Override
|
||||
public UpperCaseFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new UpperCaseFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,7 +40,6 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
|||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
@ -143,10 +142,7 @@ public final class CustomAnalyzer extends Analyzer {
|
|||
@Override
|
||||
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
|
||||
for (CharFilterFactory charFilter : charFilters) {
|
||||
if (charFilter instanceof MultiTermAwareComponent) {
|
||||
charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
|
||||
reader = charFilter.create(reader);
|
||||
}
|
||||
reader = charFilter.normalize(reader);
|
||||
}
|
||||
return reader;
|
||||
}
|
||||
|
@ -164,17 +160,8 @@ public final class CustomAnalyzer extends Analyzer {
|
|||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = in;
|
||||
// tokenizers can return a tokenfilter if the tokenizer does normalization,
|
||||
// although this is really bogus/abstraction violation...
|
||||
if (tokenizer instanceof MultiTermAwareComponent) {
|
||||
TokenFilterFactory filter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenizer).getMultiTermComponent();
|
||||
result = filter.create(result);
|
||||
}
|
||||
for (TokenFilterFactory filter : tokenFilters) {
|
||||
if (filter instanceof MultiTermAwareComponent) {
|
||||
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
|
||||
result = filter.create(result);
|
||||
}
|
||||
result = filter.normalize(result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.de;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -36,7 +34,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </fieldType></pre>
|
||||
* @since 3.6.0
|
||||
*/
|
||||
public class GermanNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class GermanNormalizationFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new GermanNormalizationFilterFactory */
|
||||
public GermanNormalizationFilterFactory(Map<String,String> args) {
|
||||
|
@ -50,9 +48,9 @@ public class GermanNormalizationFilterFactory extends TokenFilterFactory impleme
|
|||
public TokenStream create(TokenStream input) {
|
||||
return new GermanNormalizationFilter(input);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.el;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -36,7 +34,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
*
|
||||
* @since 3.1
|
||||
*/
|
||||
public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class GreekLowerCaseFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new GreekLowerCaseFilterFactory */
|
||||
public GreekLowerCaseFilterFactory(Map<String,String> args) {
|
||||
|
@ -47,13 +45,13 @@ public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements M
|
|||
}
|
||||
|
||||
@Override
|
||||
public GreekLowerCaseFilter create(TokenStream in) {
|
||||
public TokenStream create(TokenStream in) {
|
||||
return new GreekLowerCaseFilter(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -20,10 +20,7 @@ package org.apache.lucene.analysis.fa;
|
|||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.CharFilter;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
|
||||
/**
|
||||
* Factory for {@link PersianCharFilter}.
|
||||
|
@ -37,7 +34,7 @@ import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
|||
*
|
||||
* @since 3.1
|
||||
*/
|
||||
public class PersianCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
|
||||
public class PersianCharFilterFactory extends CharFilterFactory {
|
||||
|
||||
/** Creates a new PersianCharFilterFactory */
|
||||
public PersianCharFilterFactory(Map<String,String> args) {
|
||||
|
@ -48,12 +45,12 @@ public class PersianCharFilterFactory extends CharFilterFactory implements Multi
|
|||
}
|
||||
|
||||
@Override
|
||||
public CharFilter create(Reader input) {
|
||||
public Reader create(Reader input) {
|
||||
return new PersianCharFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public Reader normalize(Reader input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.fa;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -37,7 +35,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
*
|
||||
* @since 3.1
|
||||
*/
|
||||
public class PersianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class PersianNormalizationFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new PersianNormalizationFilterFactory */
|
||||
public PersianNormalizationFilterFactory(Map<String,String> args) {
|
||||
|
@ -48,13 +46,13 @@ public class PersianNormalizationFilterFactory extends TokenFilterFactory implem
|
|||
}
|
||||
|
||||
@Override
|
||||
public PersianNormalizationFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new PersianNormalizationFilter(input);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.ga;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </fieldType></pre>
|
||||
* @since 3.6.0
|
||||
*/
|
||||
public class IrishLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class IrishLowerCaseFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new IrishLowerCaseFilterFactory */
|
||||
public IrishLowerCaseFilterFactory(Map<String,String> args) {
|
||||
|
@ -52,7 +50,7 @@ public class IrishLowerCaseFilterFactory extends TokenFilterFactory implements M
|
|||
|
||||
// this will 'mostly work', except for special cases, just like most other filters
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.hi;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </fieldType></pre>
|
||||
* @since 3.1.0
|
||||
*/
|
||||
public class HindiNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class HindiNormalizationFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new HindiNormalizationFilterFactory */
|
||||
public HindiNormalizationFilterFactory(Map<String,String> args) {
|
||||
|
@ -49,9 +47,9 @@ public class HindiNormalizationFilterFactory extends TokenFilterFactory implemen
|
|||
public TokenStream create(TokenStream input) {
|
||||
return new HindiNormalizationFilter(input);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.in;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </fieldType></pre>
|
||||
* @since 3.1.0
|
||||
*/
|
||||
public class IndicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class IndicNormalizationFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new IndicNormalizationFilterFactory */
|
||||
public IndicNormalizationFilterFactory(Map<String,String> args) {
|
||||
|
@ -49,9 +47,9 @@ public class IndicNormalizationFilterFactory extends TokenFilterFactory implemen
|
|||
public TokenStream create(TokenStream input) {
|
||||
return new IndicNormalizationFilter(input);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,13 +17,10 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link ASCIIFoldingFilter}.
|
||||
|
@ -37,7 +34,7 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
*
|
||||
* @since 3.1
|
||||
*/
|
||||
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class ASCIIFoldingFilterFactory extends TokenFilterFactory {
|
||||
private static final String PRESERVE_ORIGINAL = "preserveOriginal";
|
||||
|
||||
private final boolean preserveOriginal;
|
||||
|
@ -52,23 +49,18 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul
|
|||
}
|
||||
|
||||
@Override
|
||||
public ASCIIFoldingFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new ASCIIFoldingFilter(input, preserveOriginal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
if (preserveOriginal) {
|
||||
// The main use-case for using preserveOriginal is to match regardless of
|
||||
// case but to give better scores to exact matches. Since most multi-term
|
||||
// queries return constant scores anyway, the multi-term component only
|
||||
// emits the folded token
|
||||
Map<String, String> args = new HashMap<>(getOriginalArgs());
|
||||
args.remove(PRESERVE_ORIGINAL);
|
||||
return new ASCIIFoldingFilterFactory(args);
|
||||
} else {
|
||||
return this;
|
||||
}
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
// The main use-case for using preserveOriginal is to match regardless of
|
||||
// diacritics and to give better scores to exact matches. Since most multi-term
|
||||
// queries return constant scores anyway, for normalization we
|
||||
// emit only the folded token
|
||||
return new ASCIIFoldingFilter(input, false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -17,13 +17,11 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link ScandinavianFoldingFilter}.
|
||||
* <pre class="prettyprint">
|
||||
|
@ -35,8 +33,7 @@ import java.util.Map;
|
|||
* </fieldType></pre>
|
||||
* @since 4.4.0
|
||||
*/
|
||||
public class ScandinavianFoldingFilterFactory extends TokenFilterFactory
|
||||
implements MultiTermAwareComponent {
|
||||
public class ScandinavianFoldingFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public ScandinavianFoldingFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
|
@ -46,12 +43,12 @@ public class ScandinavianFoldingFilterFactory extends TokenFilterFactory
|
|||
}
|
||||
|
||||
@Override
|
||||
public ScandinavianFoldingFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new ScandinavianFoldingFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,13 +17,11 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter}.
|
||||
* <pre class="prettyprint">
|
||||
|
@ -35,8 +33,7 @@ import java.util.Map;
|
|||
* </fieldType></pre>
|
||||
* @since 4.4.0
|
||||
*/
|
||||
public class ScandinavianNormalizationFilterFactory extends TokenFilterFactory
|
||||
implements MultiTermAwareComponent {
|
||||
public class ScandinavianNormalizationFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public ScandinavianNormalizationFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
|
@ -51,7 +48,7 @@ public class ScandinavianNormalizationFilterFactory extends TokenFilterFactory
|
|||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.miscellaneous;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -38,7 +36,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
*
|
||||
* @since 3.1
|
||||
*/
|
||||
public class TrimFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class TrimFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new TrimFilterFactory */
|
||||
public TrimFilterFactory(Map<String,String> args) {
|
||||
|
@ -49,12 +47,12 @@ public class TrimFilterFactory extends TokenFilterFactory implements MultiTermAw
|
|||
}
|
||||
|
||||
@Override
|
||||
public TrimFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new TrimFilter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,10 +21,7 @@ import java.io.Reader;
|
|||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.CharFilter;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
|
||||
/**
|
||||
* Factory for {@link PatternReplaceCharFilter}.
|
||||
|
@ -39,7 +36,7 @@ import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
|||
*
|
||||
* @since Solr 3.1
|
||||
*/
|
||||
public class PatternReplaceCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
|
||||
public class PatternReplaceCharFilterFactory extends CharFilterFactory {
|
||||
private final Pattern pattern;
|
||||
private final String replacement;
|
||||
|
||||
|
@ -54,12 +51,12 @@ public class PatternReplaceCharFilterFactory extends CharFilterFactory implement
|
|||
}
|
||||
|
||||
@Override
|
||||
public CharFilter create(Reader input) {
|
||||
public Reader create(Reader input) {
|
||||
return new PatternReplaceCharFilter(pattern, replacement, input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public Reader normalize(Reader input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,8 +21,6 @@ import java.util.Arrays;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -38,7 +36,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </fieldType></pre>
|
||||
* @since 5.0.0
|
||||
*/
|
||||
public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class SerbianNormalizationFilterFactory extends TokenFilterFactory {
|
||||
final String haircut;
|
||||
|
||||
/** Creates a new SerbianNormalizationFilterFactory */
|
||||
|
@ -61,8 +59,7 @@ public class SerbianNormalizationFilterFactory extends TokenFilterFactory implem
|
|||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.lucene.analysis.tr;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
|
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </fieldType></pre>
|
||||
* @since 3.1.0
|
||||
*/
|
||||
public class TurkishLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class TurkishLowerCaseFilterFactory extends TokenFilterFactory {
|
||||
|
||||
/** Creates a new TurkishLowerCaseFilterFactory */
|
||||
public TurkishLowerCaseFilterFactory(Map<String,String> args) {
|
||||
|
@ -51,7 +49,7 @@ public class TurkishLowerCaseFilterFactory extends TokenFilterFactory implement
|
|||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -73,4 +73,13 @@ public abstract class CharFilterFactory extends AbstractAnalysisFactory {
|
|||
|
||||
/** Wraps the given Reader with a CharFilter. */
|
||||
public abstract Reader create(Reader input);
|
||||
|
||||
/**
|
||||
* Normalize the specified input Reader.
|
||||
* While the default implementation returns input unchanged,
|
||||
* char filters that should be applied at normalization time can delegate to the {@code create} method.
|
||||
*/
|
||||
public Reader normalize(Reader input) {
|
||||
return input;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ import org.apache.lucene.analysis.fr.FrenchAnalyzer;
|
|||
*
|
||||
* @since 3.1
|
||||
*/
|
||||
public class ElisionFilterFactory extends TokenFilterFactory implements ResourceLoaderAware, MultiTermAwareComponent {
|
||||
public class ElisionFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
private final String articlesFile;
|
||||
private final boolean ignoreCase;
|
||||
private CharArraySet articles;
|
||||
|
@ -63,13 +63,13 @@ public class ElisionFilterFactory extends TokenFilterFactory implements Resource
|
|||
}
|
||||
|
||||
@Override
|
||||
public ElisionFilter create(TokenStream input) {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new ElisionFilter(input, articles);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,36 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.util;
|
||||
|
||||
|
||||
/** Add to any analysis factory component to allow returning an
|
||||
* analysis component factory for use with partial terms in prefix queries,
|
||||
* wildcard queries, range query endpoints, regex queries, etc.
|
||||
*
|
||||
* Components implementing this interface should not add or remove tokens from
|
||||
* the token stream, and should be able to deal with special characters
|
||||
* indicating that multi-term queries are required (eg slashes for regex, wildcard
|
||||
* characters, etc)
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public interface MultiTermAwareComponent {
|
||||
/** Returns an analysis component to handle analysis if multi-term queries.
|
||||
* The returned component must be a TokenizerFactory, TokenFilterFactory or CharFilterFactory.
|
||||
*/
|
||||
public AbstractAnalysisFactory getMultiTermComponent();
|
||||
}
|
|
@ -73,4 +73,13 @@ public abstract class TokenFilterFactory extends AbstractAnalysisFactory {
|
|||
|
||||
/** Transform the specified input TokenStream */
|
||||
public abstract TokenStream create(TokenStream input);
|
||||
|
||||
/**
|
||||
* Normalize the specified input TokenStream.
|
||||
* While the default implementation returns input unchanged,
|
||||
* filters that should be applied at normalization time can delegate to the {@code create} method.
|
||||
*/
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return input;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,7 +34,6 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.StringMockResourceLoader;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
@ -78,15 +77,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
|
|||
TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
|
||||
if (factory != null) {
|
||||
// we managed to fully create an instance. check a few more things:
|
||||
|
||||
// if it implements MultiTermAware, sanity check its impl
|
||||
if (factory instanceof MultiTermAwareComponent) {
|
||||
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
assertNotNull(mtc);
|
||||
// it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
|
||||
assertFalse(mtc instanceof CharFilterFactory);
|
||||
}
|
||||
|
||||
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
|
||||
// beast it just a little, it shouldnt throw exceptions:
|
||||
// (it should have thrown them in initialize)
|
||||
|
@ -102,15 +92,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
|
|||
TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz);
|
||||
if (factory != null) {
|
||||
// we managed to fully create an instance. check a few more things:
|
||||
|
||||
// if it implements MultiTermAware, sanity check its impl
|
||||
if (factory instanceof MultiTermAwareComponent) {
|
||||
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
assertNotNull(mtc);
|
||||
// it's not ok to return a charfilter or tokenizer here, this makes no sense
|
||||
assertTrue(mtc instanceof TokenFilterFactory);
|
||||
}
|
||||
|
||||
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
|
||||
// beast it just a little, it shouldnt throw exceptions:
|
||||
// (it should have thrown them in initialize)
|
||||
|
@ -126,15 +107,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
|
|||
CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz);
|
||||
if (factory != null) {
|
||||
// we managed to fully create an instance. check a few more things:
|
||||
|
||||
// if it implements MultiTermAware, sanity check its impl
|
||||
if (factory instanceof MultiTermAwareComponent) {
|
||||
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
assertNotNull(mtc);
|
||||
// it's not ok to return a tokenizer or tokenfilter here, this makes no sense
|
||||
assertTrue(mtc instanceof CharFilterFactory);
|
||||
}
|
||||
|
||||
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
|
||||
// beast it just a little, it shouldnt throw exceptions:
|
||||
// (it should have thrown them in initialize)
|
||||
|
|
|
@ -39,9 +39,7 @@ import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
|
|||
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
|
||||
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
@ -397,17 +395,16 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
}
|
||||
|
||||
public static class DummyMultiTermAwareCharFilterFactory extends DummyCharFilterFactory implements MultiTermAwareComponent {
|
||||
public static class DummyMultiTermAwareCharFilterFactory extends DummyCharFilterFactory {
|
||||
|
||||
public DummyMultiTermAwareCharFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return new DummyCharFilterFactory(Collections.emptyMap(), '0', '2');
|
||||
public Reader normalize(Reader input) {
|
||||
return create(input);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class DummyTokenizerFactory extends TokenizerFactory {
|
||||
|
@ -423,19 +420,6 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
}
|
||||
|
||||
public static class DummyMultiTermAwareTokenizerFactory extends DummyTokenizerFactory implements MultiTermAwareComponent {
|
||||
|
||||
public DummyMultiTermAwareTokenizerFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return new DummyTokenFilterFactory(Collections.emptyMap());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class DummyTokenFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public DummyTokenFilterFactory(Map<String,String> args) {
|
||||
|
@ -449,15 +433,15 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
}
|
||||
|
||||
public static class DummyMultiTermAwareTokenFilterFactory extends DummyTokenFilterFactory implements MultiTermAwareComponent {
|
||||
public static class DummyMultiTermAwareTokenFilterFactory extends DummyTokenFilterFactory {
|
||||
|
||||
public DummyMultiTermAwareTokenFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return new ASCIIFoldingFilterFactory(Collections.emptyMap());
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return new ASCIIFoldingFilterFactory(Collections.emptyMap()).normalize(input);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -472,12 +456,13 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
|
|||
assertEquals(new BytesRef("0À"), analyzer1.normalize("dummy", "0À"));
|
||||
|
||||
CustomAnalyzer analyzer2 = CustomAnalyzer.builder()
|
||||
// this component is not multi-term aware so it should not be applied
|
||||
.withTokenizer(DummyTokenizerFactory.class, Collections.emptyMap())
|
||||
// these components are multi-term aware so they should be applied
|
||||
.withTokenizer(DummyMultiTermAwareTokenizerFactory.class, Collections.emptyMap())
|
||||
.addCharFilter(DummyMultiTermAwareCharFilterFactory.class, Collections.emptyMap())
|
||||
.addTokenFilter(DummyMultiTermAwareTokenFilterFactory.class, Collections.emptyMap())
|
||||
.build();
|
||||
assertEquals(new BytesRef("2A"), analyzer2.normalize("dummy", "0À"));
|
||||
assertEquals(new BytesRef("1A"), analyzer2.normalize("dummy", "0À"));
|
||||
}
|
||||
|
||||
public void testNormalizationWithMultipleTokenFilters() throws IOException {
|
||||
|
|
|
@ -24,7 +24,6 @@ import org.apache.lucene.analysis.CannedTokenStream;
|
|||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
|
@ -35,9 +34,8 @@ public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCas
|
|||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Ete" });
|
||||
|
||||
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||
stream = factory.create(stream);
|
||||
stream = factory.normalize(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Ete" });
|
||||
|
||||
factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
|
||||
|
@ -45,9 +43,8 @@ public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCas
|
|||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
|
||||
|
||||
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
stream = new CannedTokenStream(new Token("Été", 0, 3));
|
||||
stream = factory.create(stream);
|
||||
stream = factory.normalize(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "Ete" });
|
||||
}
|
||||
|
||||
|
|
|
@ -19,14 +19,11 @@ package org.apache.lucene.analysis.icu;
|
|||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import com.ibm.icu.text.FilteredNormalizer2;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link ICUFoldingFilter}.
|
||||
|
@ -39,7 +36,7 @@ import com.ibm.icu.text.UnicodeSet;
|
|||
* </fieldType></pre>
|
||||
* @since 3.1.0
|
||||
*/
|
||||
public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class ICUFoldingFilterFactory extends TokenFilterFactory {
|
||||
private final Normalizer2 normalizer;
|
||||
|
||||
/** Creates a new ICUFoldingFilterFactory */
|
||||
|
@ -67,7 +64,7 @@ public class ICUFoldingFilterFactory extends TokenFilterFactory implements Multi
|
|||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,13 +21,10 @@ import java.io.Reader;
|
|||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
|
||||
import com.ibm.icu.text.FilteredNormalizer2;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link ICUNormalizer2CharFilter}
|
||||
|
@ -47,7 +44,7 @@ import com.ibm.icu.text.UnicodeSet;
|
|||
*
|
||||
* @since 4.10.0
|
||||
*/
|
||||
public class ICUNormalizer2CharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
|
||||
public class ICUNormalizer2CharFilterFactory extends CharFilterFactory {
|
||||
private final Normalizer2 normalizer;
|
||||
|
||||
/** Creates a new ICUNormalizer2CharFilterFactory */
|
||||
|
@ -78,8 +75,7 @@ public class ICUNormalizer2CharFilterFactory extends CharFilterFactory implement
|
|||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public Reader normalize(Reader input) {
|
||||
return create(input);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,14 +20,11 @@ package org.apache.lucene.analysis.icu;
|
|||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import com.ibm.icu.text.FilteredNormalizer2;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link ICUNormalizer2Filter}
|
||||
|
@ -46,7 +43,7 @@ import com.ibm.icu.text.UnicodeSet;
|
|||
* @see FilteredNormalizer2
|
||||
* @since 3.1.0
|
||||
*/
|
||||
public class ICUNormalizer2FilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class ICUNormalizer2FilterFactory extends TokenFilterFactory {
|
||||
private final Normalizer2 normalizer;
|
||||
|
||||
/** Creates a new ICUNormalizer2FilterFactory */
|
||||
|
@ -79,7 +76,7 @@ public class ICUNormalizer2FilterFactory extends TokenFilterFactory implements M
|
|||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,12 +20,9 @@ package org.apache.lucene.analysis.icu;
|
|||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link ICUTransformFilter}.
|
||||
|
@ -38,7 +35,7 @@ import com.ibm.icu.text.Transliterator;
|
|||
* @see Transliterator
|
||||
* @since 3.1.0
|
||||
*/
|
||||
public class ICUTransformFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
public class ICUTransformFilterFactory extends TokenFilterFactory {
|
||||
private final Transliterator transliterator;
|
||||
|
||||
// TODO: add support for custom rules
|
||||
|
@ -58,9 +55,9 @@ public class ICUTransformFilterFactory extends TokenFilterFactory implements Mul
|
|||
public TokenStream create(TokenStream input) {
|
||||
return new ICUTransformFilter(input, transliterator);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public TokenStream normalize(TokenStream input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,14 +17,11 @@
|
|||
package org.apache.lucene.analysis.ja;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.CharFilter;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter}.
|
||||
* <pre class="prettyprint">
|
||||
|
@ -37,7 +34,7 @@ import java.util.Map;
|
|||
*
|
||||
* @since 4.0.0
|
||||
*/
|
||||
public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
|
||||
public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory {
|
||||
|
||||
private static final String NORMALIZE_KANJI_PARAM = "normalizeKanji";
|
||||
private static final String NORMALIZE_KANA_PARAM = "normalizeKana";
|
||||
|
@ -56,12 +53,12 @@ public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory im
|
|||
}
|
||||
|
||||
@Override
|
||||
public CharFilter create(Reader input) {
|
||||
public Reader create(Reader input) {
|
||||
return new JapaneseIterationMarkCharFilter(input, normalizeKanji, normalizeKana);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return this;
|
||||
public Reader normalize(Reader input) {
|
||||
return create(input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,7 +34,6 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
|
@ -75,15 +74,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
|
|||
TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
|
||||
if (factory != null) {
|
||||
// we managed to fully create an instance. check a few more things:
|
||||
|
||||
// if it implements MultiTermAware, sanity check its impl
|
||||
if (factory instanceof MultiTermAwareComponent) {
|
||||
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
assertNotNull(mtc);
|
||||
// it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
|
||||
assertFalse(mtc instanceof CharFilterFactory);
|
||||
}
|
||||
|
||||
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
|
||||
// beast it just a little, it shouldnt throw exceptions:
|
||||
// (it should have thrown them in initialize)
|
||||
|
@ -99,15 +89,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
|
|||
TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz);
|
||||
if (factory != null) {
|
||||
// we managed to fully create an instance. check a few more things:
|
||||
|
||||
// if it implements MultiTermAware, sanity check its impl
|
||||
if (factory instanceof MultiTermAwareComponent) {
|
||||
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
assertNotNull(mtc);
|
||||
// it's not ok to return a charfilter or tokenizer here, this makes no sense
|
||||
assertTrue(mtc instanceof TokenFilterFactory);
|
||||
}
|
||||
|
||||
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
|
||||
// beast it just a little, it shouldnt throw exceptions:
|
||||
// (it should have thrown them in initialize)
|
||||
|
@ -123,15 +104,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
|
|||
CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz);
|
||||
if (factory != null) {
|
||||
// we managed to fully create an instance. check a few more things:
|
||||
|
||||
// if it implements MultiTermAware, sanity check its impl
|
||||
if (factory instanceof MultiTermAwareComponent) {
|
||||
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
|
||||
assertNotNull(mtc);
|
||||
// it's not ok to return a tokenizer or tokenfilter here, this makes no sense
|
||||
assertTrue(mtc instanceof CharFilterFactory);
|
||||
}
|
||||
|
||||
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
|
||||
// beast it just a little, it shouldnt throw exceptions:
|
||||
// (it should have thrown them in initialize)
|
||||
|
|
|
@ -18,12 +18,12 @@ package org.apache.lucene.analysis.ja;
|
|||
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharFilter;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
@ -36,7 +36,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
|
|||
public void testIterationMarksWithKeywordTokenizer() throws IOException {
|
||||
final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
|
||||
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());
|
||||
CharFilter filter = filterFactory.create(new StringReader(text));
|
||||
Reader filter = filterFactory.create(new StringReader(text));
|
||||
TokenStream tokenStream = new MockTokenizer(MockTokenizer.KEYWORD, false);
|
||||
((Tokenizer)tokenStream).setReader(filter);
|
||||
assertTokenStreamContents(tokenStream, new String[]{"時時馬鹿馬鹿しいところどころミスズ"});
|
||||
|
@ -47,7 +47,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
|
|||
tokenizerFactory.inform(new StringMockResourceLoader(""));
|
||||
|
||||
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());
|
||||
CharFilter filter = filterFactory.create(
|
||||
Reader filter = filterFactory.create(
|
||||
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
|
||||
);
|
||||
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
|
||||
|
@ -64,7 +64,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
|
|||
filterArgs.put("normalizeKana", "false");
|
||||
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
|
||||
|
||||
CharFilter filter = filterFactory.create(
|
||||
Reader filter = filterFactory.create(
|
||||
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
|
||||
);
|
||||
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
|
||||
|
@ -81,7 +81,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
|
|||
filterArgs.put("normalizeKana", "true");
|
||||
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
|
||||
|
||||
CharFilter filter = filterFactory.create(
|
||||
Reader filter = filterFactory.create(
|
||||
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
|
||||
);
|
||||
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
|
||||
|
|
|
@ -18,10 +18,11 @@ package org.apache.solr.analysis;
|
|||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
|
||||
|
@ -91,10 +92,7 @@ public final class TokenizerChain extends SolrAnalyzer {
|
|||
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
|
||||
if (charFilters != null && charFilters.length > 0) {
|
||||
for (CharFilterFactory charFilter : charFilters) {
|
||||
if (charFilter instanceof MultiTermAwareComponent) {
|
||||
charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
|
||||
reader = charFilter.create(reader);
|
||||
}
|
||||
reader = charFilter.normalize(reader);
|
||||
}
|
||||
}
|
||||
return reader;
|
||||
|
@ -114,10 +112,7 @@ public final class TokenizerChain extends SolrAnalyzer {
|
|||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = in;
|
||||
for (TokenFilterFactory filter : filters) {
|
||||
if (filter instanceof MultiTermAwareComponent) {
|
||||
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
|
||||
result = filter.create(result);
|
||||
}
|
||||
result = filter.normalize(result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -138,4 +133,30 @@ public final class TokenizerChain extends SolrAnalyzer {
|
|||
return sb.toString();
|
||||
}
|
||||
|
||||
public Analyzer getMultiTermAnalyzer() {
|
||||
return new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tk = new KeywordTokenizer();
|
||||
TokenStream ts = tk;
|
||||
for (TokenFilterFactory filter : filters) {
|
||||
ts = filter.normalize(ts);
|
||||
}
|
||||
return new TokenStreamComponents(tk, ts);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
if (charFilters != null && charFilters.length > 0) {
|
||||
Reader cs = reader;
|
||||
for (CharFilterFactory charFilter : charFilters) {
|
||||
cs = charFilter.normalize(cs);
|
||||
}
|
||||
reader = cs;
|
||||
}
|
||||
return reader;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,19 +20,14 @@ import javax.xml.xpath.XPath;
|
|||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import javax.xml.xpath.XPathFactory;
|
||||
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.KeywordAnalyzer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -186,61 +181,9 @@ public final class FieldTypePluginLoader
|
|||
return new KeywordAnalyzer();
|
||||
}
|
||||
|
||||
TokenizerChain tc = (TokenizerChain) queryAnalyzer;
|
||||
MultiTermChainBuilder builder = new MultiTermChainBuilder();
|
||||
|
||||
CharFilterFactory[] charFactories = tc.getCharFilterFactories();
|
||||
for (CharFilterFactory fact : charFactories) {
|
||||
builder.add(fact);
|
||||
}
|
||||
|
||||
builder.add(tc.getTokenizerFactory());
|
||||
|
||||
for (TokenFilterFactory fact : tc.getTokenFilterFactories()) {
|
||||
builder.add(fact);
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
return ((TokenizerChain) queryAnalyzer).getMultiTermAnalyzer();
|
||||
}
|
||||
|
||||
private static class MultiTermChainBuilder {
|
||||
static final KeywordTokenizerFactory keyFactory = new KeywordTokenizerFactory(new HashMap<String,String>());
|
||||
|
||||
ArrayList<CharFilterFactory> charFilters = null;
|
||||
ArrayList<TokenFilterFactory> filters = new ArrayList<>(2);
|
||||
TokenizerFactory tokenizer = keyFactory;
|
||||
|
||||
public void add(Object current) {
|
||||
if (!(current instanceof MultiTermAwareComponent)) return;
|
||||
AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent)current).getMultiTermComponent();
|
||||
if (newComponent instanceof TokenFilterFactory) {
|
||||
if (filters == null) {
|
||||
filters = new ArrayList<>(2);
|
||||
}
|
||||
filters.add((TokenFilterFactory)newComponent);
|
||||
} else if (newComponent instanceof TokenizerFactory) {
|
||||
tokenizer = (TokenizerFactory)newComponent;
|
||||
} else if (newComponent instanceof CharFilterFactory) {
|
||||
if (charFilters == null) {
|
||||
charFilters = new ArrayList<>(1);
|
||||
}
|
||||
charFilters.add( (CharFilterFactory)newComponent);
|
||||
|
||||
} else {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
|
||||
}
|
||||
}
|
||||
|
||||
public TokenizerChain build() {
|
||||
CharFilterFactory[] charFilterArr = charFilters == null ? null : charFilters.toArray(new CharFilterFactory[charFilters.size()]);
|
||||
TokenFilterFactory[] filterArr = filters == null ? new TokenFilterFactory[0] : filters.toArray(new TokenFilterFactory[filters.size()]);
|
||||
return new TokenizerChain(charFilterArr, tokenizer, filterArr);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// <analyzer><tokenizer class="...."/><tokenizer class="...." arg="....">
|
||||
//
|
||||
|
|
|
@ -1,93 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.schema;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.TrimFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.analysis.TokenizerChain;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
public class MultiTermTest extends SolrTestCaseJ4 {
|
||||
public String getCoreName() {
|
||||
return "basic";
|
||||
}
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeTests() throws Exception {
|
||||
initCore("solrconfig-basic.xml", "schema-folding.xml");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultiFound() {
|
||||
SchemaField field = h.getCore().getLatestSchema().getField("content_multi");
|
||||
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
|
||||
assertTrue(analyzer instanceof TokenizerChain);
|
||||
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
|
||||
TokenizerChain tc = (TokenizerChain) analyzer;
|
||||
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
|
||||
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
|
||||
}
|
||||
|
||||
analyzer = field.getType().getIndexAnalyzer();
|
||||
assertTrue(analyzer instanceof TokenizerChain);
|
||||
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
|
||||
tc = (TokenizerChain) analyzer;
|
||||
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
|
||||
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof TrimFilterFactory));
|
||||
}
|
||||
|
||||
assertTrue(tc.getCharFilterFactories().length == 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQueryCopiedToMulti() {
|
||||
SchemaField field = h.getCore().getLatestSchema().getField("content_charfilter");
|
||||
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
|
||||
assertTrue(analyzer instanceof TokenizerChain);
|
||||
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
|
||||
TokenizerChain tc = (TokenizerChain) analyzer;
|
||||
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
|
||||
assertTrue(factory instanceof LowerCaseFilterFactory);
|
||||
}
|
||||
|
||||
assertTrue(tc.getCharFilterFactories().length == 1);
|
||||
assertTrue(tc.getCharFilterFactories()[0] instanceof MappingCharFilterFactory);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDefaultCopiedToMulti() {
|
||||
SchemaField field = h.getCore().getLatestSchema().getField("content_ws");
|
||||
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
|
||||
assertTrue(analyzer instanceof TokenizerChain);
|
||||
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
|
||||
TokenizerChain tc = (TokenizerChain) analyzer;
|
||||
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
|
||||
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
|
||||
}
|
||||
|
||||
assertTrue(tc.getCharFilterFactories().length == 0);
|
||||
|
||||
}
|
||||
}
|
|
@ -170,18 +170,6 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLowerTokenizer() {
|
||||
// The lowercasetokenizer will remove the '1' from the index, but not from the query, thus the special test.
|
||||
assertQ(req("q", "content_lower_token:Á*C*"), "//result[@numFound='1']");
|
||||
assertQ(req("q", "content_lower_token:Á*C*1"), "//result[@numFound='0']");
|
||||
assertQ(req("q", "content_lower_token:h*1"), "//result[@numFound='0']");
|
||||
assertQ(req("q", "content_lower_token:H*1"), "//result[@numFound='0']");
|
||||
assertQ(req("q", "content_lower_token:*1"), "//result[@numFound='0']");
|
||||
assertQ(req("q", "content_lower_token:HÏ*l?*"), "//result[@numFound='1']");
|
||||
assertQ(req("q", "content_lower_token:hȉ*l?*"), "//result[@numFound='1']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFuzzy() throws Exception {
|
||||
assertQ(req("q", "content:ZiLLx~1"),
|
||||
|
|
Loading…
Reference in New Issue