LUCENE-8497: Replace MultiTermAwareComponent with normalize() method

Alan Woodward 2018-11-19 09:47:52 +00:00
parent 759af0127c
commit 65486442c4
43 changed files with 209 additions and 515 deletions


@ -87,6 +87,11 @@ API Changes
* LUCENE-8513: MultiFields.getFields is now removed. Please avoid this class,
and Fields in general, when possible. (David Smiley)
* LUCENE-8497: MultiTermAwareComponent has been removed, and in its place
TokenFilterFactory and CharFilterFactory now expose type-safe normalize()
methods. This decouples normalization from tokenization entirely.
(Mayya Sharipova, Alan Woodward)
Changes in Runtime Behavior
* LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of


@ -145,3 +145,8 @@ use a TokenFilter chain as you would with any other Tokenizer.
Both Highlighter and FastVectorHighlighter need a custom WeightedSpanTermExtractor or FieldQuery respectively
in order to support ToParent/ToChildBlockJoinQuery.
## MultiTermAwareComponent replaced by CharFilterFactory#normalize() and TokenFilterFactory#normalize() ##
Normalization is now type-safe, with CharFilterFactory#normalize() returning a Reader and
TokenFilterFactory#normalize() returning a TokenStream.
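As a before/after illustration, here is the token-filter loop from the CustomAnalyzer change in this commit. Code that previously had to unwrap MultiTermAwareComponent:

    for (TokenFilterFactory filter : tokenFilters) {
      if (filter instanceof MultiTermAwareComponent) {
        filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
        result = filter.create(result);
      }
    }

now simply calls the new method; factories that take no part in normalization inherit the default normalize(), which returns the stream unchanged:

    for (TokenFilterFactory filter : tokenFilters) {
      result = filter.normalize(result);
    }

The same pattern applies to char filters via CharFilterFactory#normalize(Reader).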


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.ar;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -36,7 +34,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
*
* @since 3.1
*/
public class ArabicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class ArabicNormalizationFilterFactory extends TokenFilterFactory {
/** Creates a new ArabicNormalizationFilterFactory */
public ArabicNormalizationFilterFactory(Map<String,String> args) {
@ -47,12 +45,12 @@ public class ArabicNormalizationFilterFactory extends TokenFilterFactory impleme
}
@Override
public ArabicNormalizationFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new ArabicNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -17,13 +17,11 @@
package org.apache.lucene.analysis.bn;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link BengaliNormalizationFilter}.
* <pre class="prettyprint">
@ -35,7 +33,7 @@ import java.util.Map;
* &lt;/fieldType&gt;</pre>
* @since 7.1.0
*/
public class BengaliNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class BengaliNormalizationFilterFactory extends TokenFilterFactory {
public BengaliNormalizationFilterFactory(Map<String,String> args) {
super(args);
@ -48,9 +46,9 @@ public class BengaliNormalizationFilterFactory extends TokenFilterFactory implem
public TokenStream create(TokenStream input) {
return new BengaliNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -25,9 +25,7 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
@ -44,7 +42,7 @@ import org.apache.lucene.analysis.util.ResourceLoaderAware;
* @since Solr 1.4
*/
public class MappingCharFilterFactory extends CharFilterFactory implements
ResourceLoaderAware, MultiTermAwareComponent {
ResourceLoaderAware {
protected NormalizeCharMap normMap;
private final String mapping;
@ -86,6 +84,11 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
return normMap == null ? input : new MappingCharFilter(normMap,input);
}
@Override
public Reader normalize(Reader input) {
return create(input);
}
// "source" => "target"
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
@ -131,8 +134,4 @@ public class MappingCharFilterFactory extends CharFilterFactory implements
return new String( out, 0, writePos );
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.cjk;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -37,7 +35,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/fieldType&gt;</pre>
* @since 3.6.0
*/
public class CJKWidthFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class CJKWidthFilterFactory extends TokenFilterFactory {
/** Creates a new CJKWidthFilterFactory */
public CJKWidthFilterFactory(Map<String,String> args) {
@ -51,9 +49,9 @@ public class CJKWidthFilterFactory extends TokenFilterFactory implements MultiTe
public TokenStream create(TokenStream input) {
return new CJKWidthFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.ckb;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/fieldType&gt;</pre>
* @since 4.7.0
*/
public class SoraniNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class SoraniNormalizationFilterFactory extends TokenFilterFactory {
/** Creates a new SoraniNormalizationFilterFactory */
public SoraniNormalizationFilterFactory(Map<String,String> args) {
@ -46,12 +44,12 @@ public class SoraniNormalizationFilterFactory extends TokenFilterFactory impleme
}
@Override
public SoraniNormalizationFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new SoraniNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.core;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/fieldType&gt;</pre>
* @since 5.4.0
*/
public class DecimalDigitFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class DecimalDigitFilterFactory extends TokenFilterFactory {
/** Creates a new DecimalDigitFilterFactory */
public DecimalDigitFilterFactory(Map<String,String> args) {
@ -46,12 +44,12 @@ public class DecimalDigitFilterFactory extends TokenFilterFactory implements Mul
}
@Override
public DecimalDigitFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new DecimalDigitFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.core;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -36,7 +34,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
*
* @since 3.1
*/
public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class LowerCaseFilterFactory extends TokenFilterFactory {
/** Creates a new LowerCaseFilterFactory */
public LowerCaseFilterFactory(Map<String,String> args) {
@ -47,12 +45,12 @@ public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiT
}
@Override
public LowerCaseFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new LowerCaseFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.core;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -40,7 +38,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* general search matching
* @since 4.7.0
*/
public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class UpperCaseFilterFactory extends TokenFilterFactory {
/** Creates a new UpperCaseFilterFactory */
public UpperCaseFilterFactory(Map<String,String> args) {
@ -51,12 +49,12 @@ public class UpperCaseFilterFactory extends TokenFilterFactory implements MultiT
}
@Override
public UpperCaseFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new UpperCaseFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -40,7 +40,6 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@ -143,10 +142,7 @@ public final class CustomAnalyzer extends Analyzer {
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
for (CharFilterFactory charFilter : charFilters) {
if (charFilter instanceof MultiTermAwareComponent) {
charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
reader = charFilter.create(reader);
}
reader = charFilter.normalize(reader);
}
return reader;
}
@ -164,17 +160,8 @@ public final class CustomAnalyzer extends Analyzer {
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = in;
// tokenizers can return a tokenfilter if the tokenizer does normalization,
// although this is really bogus/abstraction violation...
if (tokenizer instanceof MultiTermAwareComponent) {
TokenFilterFactory filter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenizer).getMultiTermComponent();
result = filter.create(result);
}
for (TokenFilterFactory filter : tokenFilters) {
if (filter instanceof MultiTermAwareComponent) {
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
result = filter.create(result);
}
result = filter.normalize(result);
}
return result;
}
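For context, a minimal usage sketch (not part of this commit; it assumes the standard CustomAnalyzer builder and the LowerCase/ASCIIFolding factories from analyzers-common) showing what the rewritten normalization chain does end to end:

    import java.io.IOException;
    import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;
    import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
    import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
    import org.apache.lucene.util.BytesRef;

    public class NormalizeExample {
      public static void main(String[] args) throws IOException {
        CustomAnalyzer analyzer = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class)
            .build();
        // Both filter factories override normalize(), so both run here;
        // the tokenizer is no longer consulted at normalization time.
        BytesRef normalized = analyzer.normalize("field", "Résumé");
        System.out.println(normalized.utf8ToString()); // prints "resume"
      }
    }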


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.de;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -36,7 +34,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/fieldType&gt;</pre>
* @since 3.6.0
*/
public class GermanNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class GermanNormalizationFilterFactory extends TokenFilterFactory {
/** Creates a new GermanNormalizationFilterFactory */
public GermanNormalizationFilterFactory(Map<String,String> args) {
@ -50,9 +48,9 @@ public class GermanNormalizationFilterFactory extends TokenFilterFactory impleme
public TokenStream create(TokenStream input) {
return new GermanNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.el;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -36,7 +34,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
*
* @since 3.1
*/
public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class GreekLowerCaseFilterFactory extends TokenFilterFactory {
/** Creates a new GreekLowerCaseFilterFactory */
public GreekLowerCaseFilterFactory(Map<String,String> args) {
@ -47,13 +45,13 @@ public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements M
}
@Override
public GreekLowerCaseFilter create(TokenStream in) {
public TokenStream create(TokenStream in) {
return new GreekLowerCaseFilter(in);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,10 +20,7 @@ package org.apache.lucene.analysis.fa;
import java.io.Reader;
import java.util.Map;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
/**
* Factory for {@link PersianCharFilter}.
@ -37,7 +34,7 @@ import org.apache.lucene.analysis.util.MultiTermAwareComponent;
*
* @since 3.1
*/
public class PersianCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
public class PersianCharFilterFactory extends CharFilterFactory {
/** Creates a new PersianCharFilterFactory */
public PersianCharFilterFactory(Map<String,String> args) {
@ -48,12 +45,12 @@ public class PersianCharFilterFactory extends CharFilterFactory implements Multi
}
@Override
public CharFilter create(Reader input) {
public Reader create(Reader input) {
return new PersianCharFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public Reader normalize(Reader input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.fa;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -37,7 +35,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
*
* @since 3.1
*/
public class PersianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class PersianNormalizationFilterFactory extends TokenFilterFactory {
/** Creates a new PersianNormalizationFilterFactory */
public PersianNormalizationFilterFactory(Map<String,String> args) {
@ -48,13 +46,13 @@ public class PersianNormalizationFilterFactory extends TokenFilterFactory implem
}
@Override
public PersianNormalizationFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new PersianNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.ga;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/fieldType&gt;</pre>
* @since 3.6.0
*/
public class IrishLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class IrishLowerCaseFilterFactory extends TokenFilterFactory {
/** Creates a new IrishLowerCaseFilterFactory */
public IrishLowerCaseFilterFactory(Map<String,String> args) {
@ -52,7 +50,7 @@ public class IrishLowerCaseFilterFactory extends TokenFilterFactory implements M
// this will 'mostly work', except for special cases, just like most other filters
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.hi;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/fieldType&gt;</pre>
* @since 3.1.0
*/
public class HindiNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class HindiNormalizationFilterFactory extends TokenFilterFactory {
/** Creates a new HindiNormalizationFilterFactory */
public HindiNormalizationFilterFactory(Map<String,String> args) {
@ -49,9 +47,9 @@ public class HindiNormalizationFilterFactory extends TokenFilterFactory implemen
public TokenStream create(TokenStream input) {
return new HindiNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.in;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/fieldType&gt;</pre>
* @since 3.1.0
*/
public class IndicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class IndicNormalizationFilterFactory extends TokenFilterFactory {
/** Creates a new IndicNormalizationFilterFactory */
public IndicNormalizationFilterFactory(Map<String,String> args) {
@ -49,9 +47,9 @@ public class IndicNormalizationFilterFactory extends TokenFilterFactory implemen
public TokenStream create(TokenStream input) {
return new IndicNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -17,13 +17,10 @@
package org.apache.lucene.analysis.miscellaneous;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link ASCIIFoldingFilter}.
@ -37,7 +34,7 @@ import org.apache.lucene.analysis.TokenStream;
*
* @since 3.1
*/
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class ASCIIFoldingFilterFactory extends TokenFilterFactory {
private static final String PRESERVE_ORIGINAL = "preserveOriginal";
private final boolean preserveOriginal;
@ -52,23 +49,18 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul
}
@Override
public ASCIIFoldingFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new ASCIIFoldingFilter(input, preserveOriginal);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
if (preserveOriginal) {
// The main use-case for using preserveOriginal is to match regardless of
// case but to give better scores to exact matches. Since most multi-term
// queries return constant scores anyway, the multi-term component only
// emits the folded token
Map<String, String> args = new HashMap<>(getOriginalArgs());
args.remove(PRESERVE_ORIGINAL);
return new ASCIIFoldingFilterFactory(args);
} else {
return this;
}
public TokenStream normalize(TokenStream input) {
// The main use-case for using preserveOriginal is to match regardless of
// case and to give better scores to exact matches. Since most multi-term
// queries return constant scores anyway, for normalization we
// emit only the folded token
return new ASCIIFoldingFilter(input, false);
}
}


@ -17,13 +17,11 @@
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link ScandinavianFoldingFilter}.
* <pre class="prettyprint">
@ -35,8 +33,7 @@ import java.util.Map;
* &lt;/fieldType&gt;</pre>
* @since 4.4.0
*/
public class ScandinavianFoldingFilterFactory extends TokenFilterFactory
implements MultiTermAwareComponent {
public class ScandinavianFoldingFilterFactory extends TokenFilterFactory {
public ScandinavianFoldingFilterFactory(Map<String,String> args) {
super(args);
@ -46,12 +43,12 @@ public class ScandinavianFoldingFilterFactory extends TokenFilterFactory
}
@Override
public ScandinavianFoldingFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new ScandinavianFoldingFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -17,13 +17,11 @@
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter}.
* <pre class="prettyprint">
@ -35,8 +33,7 @@ import java.util.Map;
* &lt;/fieldType&gt;</pre>
* @since 4.4.0
*/
public class ScandinavianNormalizationFilterFactory extends TokenFilterFactory
implements MultiTermAwareComponent {
public class ScandinavianNormalizationFilterFactory extends TokenFilterFactory {
public ScandinavianNormalizationFilterFactory(Map<String, String> args) {
super(args);
@ -51,7 +48,7 @@ public class ScandinavianNormalizationFilterFactory extends TokenFilterFactory
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.miscellaneous;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -38,7 +36,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
*
* @since 3.1
*/
public class TrimFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class TrimFilterFactory extends TokenFilterFactory {
/** Creates a new TrimFilterFactory */
public TrimFilterFactory(Map<String,String> args) {
@ -49,12 +47,12 @@ public class TrimFilterFactory extends TokenFilterFactory implements MultiTermAw
}
@Override
public TrimFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new TrimFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -21,10 +21,7 @@ import java.io.Reader;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
/**
* Factory for {@link PatternReplaceCharFilter}.
@ -39,7 +36,7 @@ import org.apache.lucene.analysis.util.MultiTermAwareComponent;
*
* @since Solr 3.1
*/
public class PatternReplaceCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
public class PatternReplaceCharFilterFactory extends CharFilterFactory {
private final Pattern pattern;
private final String replacement;
@ -54,12 +51,12 @@ public class PatternReplaceCharFilterFactory extends CharFilterFactory implement
}
@Override
public CharFilter create(Reader input) {
public Reader create(Reader input) {
return new PatternReplaceCharFilter(pattern, replacement, input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public Reader normalize(Reader input) {
return create(input);
}
}


@ -21,8 +21,6 @@ import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -38,7 +36,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/fieldType&gt;</pre>
* @since 5.0.0
*/
public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class SerbianNormalizationFilterFactory extends TokenFilterFactory {
final String haircut;
/** Creates a new SerbianNormalizationFilterFactory */
@ -61,8 +59,7 @@ public class SerbianNormalizationFilterFactory extends TokenFilterFactory implem
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,8 +20,6 @@ package org.apache.lucene.analysis.tr;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@ -35,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/fieldType&gt;</pre>
* @since 3.1.0
*/
public class TurkishLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class TurkishLowerCaseFilterFactory extends TokenFilterFactory {
/** Creates a new TurkishLowerCaseFilterFactory */
public TurkishLowerCaseFilterFactory(Map<String,String> args) {
@ -51,7 +49,7 @@ public class TurkishLowerCaseFilterFactory extends TokenFilterFactory implement
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -73,4 +73,13 @@ public abstract class CharFilterFactory extends AbstractAnalysisFactory {
/** Wraps the given Reader with a CharFilter. */
public abstract Reader create(Reader input);
/**
* Normalize the specified input Reader.
* The default implementation returns the input unchanged; char filters that should also
* be applied at normalization time can override this to delegate to the {@code create} method.
*/
public Reader normalize(Reader input) {
return input;
}
}


@ -38,7 +38,7 @@ import org.apache.lucene.analysis.fr.FrenchAnalyzer;
*
* @since 3.1
*/
public class ElisionFilterFactory extends TokenFilterFactory implements ResourceLoaderAware, MultiTermAwareComponent {
public class ElisionFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
private final String articlesFile;
private final boolean ignoreCase;
private CharArraySet articles;
@ -63,13 +63,13 @@ public class ElisionFilterFactory extends TokenFilterFactory implements Resource
}
@Override
public ElisionFilter create(TokenStream input) {
public TokenStream create(TokenStream input) {
return new ElisionFilter(input, articles);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -1,36 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.util;
/** Add to any analysis factory component to allow returning an
* analysis component factory for use with partial terms in prefix queries,
* wildcard queries, range query endpoints, regex queries, etc.
*
* Components implementing this interface should not add or remove tokens from
* the token stream, and should be able to deal with special characters
* indicating that multi-term queries are required (eg slashes for regex, wildcard
* characters, etc)
*
* @lucene.experimental
*/
public interface MultiTermAwareComponent {
/** Returns an analysis component to handle analysis of multi-term queries.
* The returned component must be a TokenizerFactory, TokenFilterFactory or CharFilterFactory.
*/
public AbstractAnalysisFactory getMultiTermComponent();
}


@ -73,4 +73,13 @@ public abstract class TokenFilterFactory extends AbstractAnalysisFactory {
/** Transform the specified input TokenStream */
public abstract TokenStream create(TokenStream input);
/**
* Normalize the specified input TokenStream.
* The default implementation returns the input unchanged; filters that should also
* be applied at normalization time can override this to delegate to the {@code create} method.
*/
public TokenStream normalize(TokenStream input) {
return input;
}
}
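To make the default concrete, here is a small hedged sketch (assuming KeywordTokenizer, LowerCaseFilterFactory and PorterStemFilterFactory from analyzers-common): the lowercase factory overrides normalize() and so participates in normalization, while the stemmer keeps the default above and is effectively skipped.

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.HashMap;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.KeywordTokenizer;
    import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
    import org.apache.lucene.analysis.en.PorterStemFilterFactory;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.TokenFilterFactory;

    public class NormalizeDefaultExample {
      public static void main(String[] args) throws IOException {
        TokenFilterFactory lower = new LowerCaseFilterFactory(new HashMap<>());
        TokenFilterFactory stem = new PorterStemFilterFactory(new HashMap<>());

        Tokenizer tok = new KeywordTokenizer();
        tok.setReader(new StringReader("Wildcards*"));

        // LowerCaseFilterFactory overrides normalize() and wraps the stream;
        // PorterStemFilterFactory keeps the default and returns it unchanged.
        TokenStream ts = stem.normalize(lower.normalize(tok));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()); // prints "wildcards*" - lowercased, not stemmed
        }
        ts.end();
        ts.close();
      }
    }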


@ -34,7 +34,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@ -78,15 +77,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
if (factory != null) {
// we managed to fully create an instance. check a few more things:
// if it implements MultiTermAware, sanity check its impl
if (factory instanceof MultiTermAwareComponent) {
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
assertNotNull(mtc);
// it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
assertFalse(mtc instanceof CharFilterFactory);
}
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
// beast it just a little, it shouldnt throw exceptions:
// (it should have thrown them in initialize)
@ -102,15 +92,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz);
if (factory != null) {
// we managed to fully create an instance. check a few more things:
// if it implements MultiTermAware, sanity check its impl
if (factory instanceof MultiTermAwareComponent) {
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
assertNotNull(mtc);
// it's not ok to return a charfilter or tokenizer here, this makes no sense
assertTrue(mtc instanceof TokenFilterFactory);
}
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
// beast it just a little, it shouldnt throw exceptions:
// (it should have thrown them in initialize)
@ -126,15 +107,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz);
if (factory != null) {
// we managed to fully create an instance. check a few more things:
// if it implements MultiTermAware, sanity check its impl
if (factory instanceof MultiTermAwareComponent) {
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
assertNotNull(mtc);
// it's not ok to return a tokenizer or tokenfilter here, this makes no sense
assertTrue(mtc instanceof CharFilterFactory);
}
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
// beast it just a little, it shouldnt throw exceptions:
// (it should have thrown them in initialize)


@ -39,9 +39,7 @@ import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
@ -397,17 +395,16 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
}
public static class DummyMultiTermAwareCharFilterFactory extends DummyCharFilterFactory implements MultiTermAwareComponent {
public static class DummyMultiTermAwareCharFilterFactory extends DummyCharFilterFactory {
public DummyMultiTermAwareCharFilterFactory(Map<String,String> args) {
super(args);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return new DummyCharFilterFactory(Collections.emptyMap(), '0', '2');
public Reader normalize(Reader input) {
return create(input);
}
}
public static class DummyTokenizerFactory extends TokenizerFactory {
@ -423,19 +420,6 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
}
public static class DummyMultiTermAwareTokenizerFactory extends DummyTokenizerFactory implements MultiTermAwareComponent {
public DummyMultiTermAwareTokenizerFactory(Map<String,String> args) {
super(args);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return new DummyTokenFilterFactory(Collections.emptyMap());
}
}
public static class DummyTokenFilterFactory extends TokenFilterFactory {
public DummyTokenFilterFactory(Map<String,String> args) {
@ -449,15 +433,15 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
}
public static class DummyMultiTermAwareTokenFilterFactory extends DummyTokenFilterFactory implements MultiTermAwareComponent {
public static class DummyMultiTermAwareTokenFilterFactory extends DummyTokenFilterFactory {
public DummyMultiTermAwareTokenFilterFactory(Map<String,String> args) {
super(args);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return new ASCIIFoldingFilterFactory(Collections.emptyMap());
public TokenStream normalize(TokenStream input) {
return new ASCIIFoldingFilterFactory(Collections.emptyMap()).normalize(input);
}
}
@ -472,12 +456,13 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
assertEquals(new BytesRef(""), analyzer1.normalize("dummy", ""));
CustomAnalyzer analyzer2 = CustomAnalyzer.builder()
// this component is not multi-term aware so it should not be applied
.withTokenizer(DummyTokenizerFactory.class, Collections.emptyMap())
// these components are multi-term aware so they should be applied
.withTokenizer(DummyMultiTermAwareTokenizerFactory.class, Collections.emptyMap())
.addCharFilter(DummyMultiTermAwareCharFilterFactory.class, Collections.emptyMap())
.addTokenFilter(DummyMultiTermAwareTokenFilterFactory.class, Collections.emptyMap())
.build();
assertEquals(new BytesRef("2A"), analyzer2.normalize("dummy", ""));
assertEquals(new BytesRef("1A"), analyzer2.normalize("dummy", ""));
}
public void testNormalizationWithMultipleTokenFilters() throws IOException {


@ -24,7 +24,6 @@ import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCase {
@ -35,9 +34,8 @@ public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCas
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "Ete" });
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
stream = new CannedTokenStream(new Token("Été", 0, 3));
stream = factory.create(stream);
stream = factory.normalize(stream);
assertTokenStreamContents(stream, new String[] { "Ete" });
factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
@ -45,9 +43,8 @@ public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCas
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
stream = new CannedTokenStream(new Token("Été", 0, 3));
stream = factory.create(stream);
stream = factory.normalize(stream);
assertTokenStreamContents(stream, new String[] { "Ete" });
}


@ -19,14 +19,11 @@ package org.apache.lucene.analysis.icu;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link ICUFoldingFilter}.
@ -39,7 +36,7 @@ import com.ibm.icu.text.UnicodeSet;
* &lt;/fieldType&gt;</pre>
* @since 3.1.0
*/
public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class ICUFoldingFilterFactory extends TokenFilterFactory {
private final Normalizer2 normalizer;
/** Creates a new ICUFoldingFilterFactory */
@ -67,7 +64,7 @@ public class ICUFoldingFilterFactory extends TokenFilterFactory implements Multi
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -21,13 +21,10 @@ import java.io.Reader;
import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
import org.apache.lucene.analysis.util.CharFilterFactory;
/**
* Factory for {@link ICUNormalizer2CharFilter}
@ -47,7 +44,7 @@ import com.ibm.icu.text.UnicodeSet;
*
* @since 4.10.0
*/
public class ICUNormalizer2CharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
public class ICUNormalizer2CharFilterFactory extends CharFilterFactory {
private final Normalizer2 normalizer;
/** Creates a new ICUNormalizer2CharFilterFactory */
@ -78,8 +75,7 @@ public class ICUNormalizer2CharFilterFactory extends CharFilterFactory implement
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public Reader normalize(Reader input) {
return create(input);
}
}


@ -20,14 +20,11 @@ package org.apache.lucene.analysis.icu;
import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link ICUNormalizer2Filter}
@ -46,7 +43,7 @@ import com.ibm.icu.text.UnicodeSet;
* @see FilteredNormalizer2
* @since 3.1.0
*/
public class ICUNormalizer2FilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class ICUNormalizer2FilterFactory extends TokenFilterFactory {
private final Normalizer2 normalizer;
/** Creates a new ICUNormalizer2FilterFactory */
@ -79,7 +76,7 @@ public class ICUNormalizer2FilterFactory extends TokenFilterFactory implements M
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -20,12 +20,9 @@ package org.apache.lucene.analysis.icu;
import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import com.ibm.icu.text.Transliterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link ICUTransformFilter}.
@ -38,7 +35,7 @@ import com.ibm.icu.text.Transliterator;
* @see Transliterator
* @since 3.1.0
*/
public class ICUTransformFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class ICUTransformFilterFactory extends TokenFilterFactory {
private final Transliterator transliterator;
// TODO: add support for custom rules
@ -58,9 +55,9 @@ public class ICUTransformFilterFactory extends TokenFilterFactory implements Mul
public TokenStream create(TokenStream input) {
return new ICUTransformFilter(input, transliterator);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public TokenStream normalize(TokenStream input) {
return create(input);
}
}


@ -17,14 +17,11 @@
package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import java.io.Reader;
import java.util.Map;
import org.apache.lucene.analysis.util.CharFilterFactory;
/**
* Factory for {@link org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter}.
* <pre class="prettyprint">
@ -37,7 +34,7 @@ import java.util.Map;
*
* @since 4.0.0
*/
public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory {
private static final String NORMALIZE_KANJI_PARAM = "normalizeKanji";
private static final String NORMALIZE_KANA_PARAM = "normalizeKana";
@ -56,12 +53,12 @@ public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory im
}
@Override
public CharFilter create(Reader input) {
public Reader create(Reader input) {
return new JapaneseIterationMarkCharFilter(input, normalizeKanji, normalizeKana);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
public Reader normalize(Reader input) {
return create(input);
}
}


@ -34,7 +34,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
@ -75,15 +74,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
if (factory != null) {
// we managed to fully create an instance. check a few more things:
// if it implements MultiTermAware, sanity check its impl
if (factory instanceof MultiTermAwareComponent) {
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
assertNotNull(mtc);
// it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
assertFalse(mtc instanceof CharFilterFactory);
}
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
// beast it just a little, it shouldnt throw exceptions:
// (it should have thrown them in initialize)
@ -99,15 +89,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz);
if (factory != null) {
// we managed to fully create an instance. check a few more things:
// if it implements MultiTermAware, sanity check its impl
if (factory instanceof MultiTermAwareComponent) {
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
assertNotNull(mtc);
// it's not ok to return a charfilter or tokenizer here, this makes no sense
assertTrue(mtc instanceof TokenFilterFactory);
}
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
// beast it just a little, it shouldnt throw exceptions:
// (it should have thrown them in initialize)
@ -123,15 +104,6 @@ public class TestFactories extends BaseTokenStreamTestCase {
CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz);
if (factory != null) {
// we managed to fully create an instance. check a few more things:
// if it implements MultiTermAware, sanity check its impl
if (factory instanceof MultiTermAwareComponent) {
AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
assertNotNull(mtc);
// it's not ok to return a tokenizer or tokenfilter here, this makes no sense
assertTrue(mtc instanceof CharFilterFactory);
}
if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
// beast it just a little, it shouldnt throw exceptions:
// (it should have thrown them in initialize)


@ -18,12 +18,12 @@ package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
@ -36,7 +36,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
public void testIterationMarksWithKeywordTokenizer() throws IOException {
final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());
CharFilter filter = filterFactory.create(new StringReader(text));
Reader filter = filterFactory.create(new StringReader(text));
TokenStream tokenStream = new MockTokenizer(MockTokenizer.KEYWORD, false);
((Tokenizer)tokenStream).setReader(filter);
assertTokenStreamContents(tokenStream, new String[]{"時時馬鹿馬鹿しいところどころミスズ"});
@ -47,7 +47,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
tokenizerFactory.inform(new StringMockResourceLoader(""));
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());
CharFilter filter = filterFactory.create(
Reader filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
@ -64,7 +64,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
filterArgs.put("normalizeKana", "false");
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
CharFilter filter = filterFactory.create(
Reader filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
@ -81,7 +81,7 @@ public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamT
filterArgs.put("normalizeKana", "true");
JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
CharFilter filter = filterFactory.create(
Reader filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());


@ -18,10 +18,11 @@ package org.apache.solr.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
@ -91,10 +92,7 @@ public final class TokenizerChain extends SolrAnalyzer {
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
if (charFilters != null && charFilters.length > 0) {
for (CharFilterFactory charFilter : charFilters) {
if (charFilter instanceof MultiTermAwareComponent) {
charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
reader = charFilter.create(reader);
}
reader = charFilter.normalize(reader);
}
}
return reader;
@ -114,10 +112,7 @@ public final class TokenizerChain extends SolrAnalyzer {
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = in;
for (TokenFilterFactory filter : filters) {
if (filter instanceof MultiTermAwareComponent) {
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
result = filter.create(result);
}
result = filter.normalize(result);
}
return result;
}
@ -138,4 +133,30 @@ public final class TokenizerChain extends SolrAnalyzer {
return sb.toString();
}
public Analyzer getMultiTermAnalyzer() {
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tk = new KeywordTokenizer();
TokenStream ts = tk;
for (TokenFilterFactory filter : filters) {
ts = filter.normalize(ts);
}
return new TokenStreamComponents(tk, ts);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
if (charFilters != null && charFilters.length > 0) {
Reader cs = reader;
for (CharFilterFactory charFilter : charFilters) {
cs = charFilter.normalize(cs);
}
reader = cs;
}
return reader;
}
};
}
}


@ -20,19 +20,14 @@ import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Version;
@ -186,61 +181,9 @@ public final class FieldTypePluginLoader
return new KeywordAnalyzer();
}
TokenizerChain tc = (TokenizerChain) queryAnalyzer;
MultiTermChainBuilder builder = new MultiTermChainBuilder();
CharFilterFactory[] charFactories = tc.getCharFilterFactories();
for (CharFilterFactory fact : charFactories) {
builder.add(fact);
}
builder.add(tc.getTokenizerFactory());
for (TokenFilterFactory fact : tc.getTokenFilterFactories()) {
builder.add(fact);
}
return builder.build();
return ((TokenizerChain) queryAnalyzer).getMultiTermAnalyzer();
}
private static class MultiTermChainBuilder {
static final KeywordTokenizerFactory keyFactory = new KeywordTokenizerFactory(new HashMap<String,String>());
ArrayList<CharFilterFactory> charFilters = null;
ArrayList<TokenFilterFactory> filters = new ArrayList<>(2);
TokenizerFactory tokenizer = keyFactory;
public void add(Object current) {
if (!(current instanceof MultiTermAwareComponent)) return;
AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent)current).getMultiTermComponent();
if (newComponent instanceof TokenFilterFactory) {
if (filters == null) {
filters = new ArrayList<>(2);
}
filters.add((TokenFilterFactory)newComponent);
} else if (newComponent instanceof TokenizerFactory) {
tokenizer = (TokenizerFactory)newComponent;
} else if (newComponent instanceof CharFilterFactory) {
if (charFilters == null) {
charFilters = new ArrayList<>(1);
}
charFilters.add( (CharFilterFactory)newComponent);
} else {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
}
}
public TokenizerChain build() {
CharFilterFactory[] charFilterArr = charFilters == null ? null : charFilters.toArray(new CharFilterFactory[charFilters.size()]);
TokenFilterFactory[] filterArr = filters == null ? new TokenFilterFactory[0] : filters.toArray(new TokenFilterFactory[filters.size()]);
return new TokenizerChain(charFilterArr, tokenizer, filterArr);
}
}
//
// <analyzer><tokenizer class="...."/><tokenizer class="...." arg="....">
//


@ -1,93 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.miscellaneous.TrimFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.analysis.TokenizerChain;
import org.junit.BeforeClass;
import org.junit.Test;
public class MultiTermTest extends SolrTestCaseJ4 {
public String getCoreName() {
return "basic";
}
@BeforeClass
public static void beforeTests() throws Exception {
initCore("solrconfig-basic.xml", "schema-folding.xml");
}
@Test
public void testMultiFound() {
SchemaField field = h.getCore().getLatestSchema().getField("content_multi");
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
}
analyzer = field.getType().getIndexAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof WhitespaceTokenizerFactory);
tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof TrimFilterFactory));
}
assertTrue(tc.getCharFilterFactories().length == 0);
}
@Test
public void testQueryCopiedToMulti() {
SchemaField field = h.getCore().getLatestSchema().getField("content_charfilter");
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue(factory instanceof LowerCaseFilterFactory);
}
assertTrue(tc.getCharFilterFactories().length == 1);
assertTrue(tc.getCharFilterFactories()[0] instanceof MappingCharFilterFactory);
}
@Test
public void testDefaultCopiedToMulti() {
SchemaField field = h.getCore().getLatestSchema().getField("content_ws");
Analyzer analyzer = ((TextField)field.getType()).getMultiTermAnalyzer();
assertTrue(analyzer instanceof TokenizerChain);
assertTrue(((TokenizerChain) analyzer).getTokenizerFactory() instanceof KeywordTokenizerFactory);
TokenizerChain tc = (TokenizerChain) analyzer;
for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
assertTrue((factory instanceof ASCIIFoldingFilterFactory) || (factory instanceof LowerCaseFilterFactory));
}
assertTrue(tc.getCharFilterFactories().length == 0);
}
}


@ -170,18 +170,6 @@ public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
}
}
@Test
public void testLowerTokenizer() {
// The lowercasetokenizer will remove the '1' from the index, but not from the query, thus the special test.
assertQ(req("q", "content_lower_token:Á*C*"), "//result[@numFound='1']");
assertQ(req("q", "content_lower_token:Á*C*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:h*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:H*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:*1"), "//result[@numFound='0']");
assertQ(req("q", "content_lower_token:HÏ*l?*"), "//result[@numFound='1']");
assertQ(req("q", "content_lower_token:hȉ*l?*"), "//result[@numFound='1']");
}
@Test
public void testFuzzy() throws Exception {
assertQ(req("q", "content:ZiLLx~1"),