mirror of https://github.com/apache/lucene.git
LUCENE-7355: Add Analyzer#normalize() and use it in query parsers.
parent ccd3bc8466
commit 7c2e7a0fb8
@@ -26,6 +26,9 @@ New Features
  methods Directory.rename and Directory.syncMetaData instead (Robert Muir,
  Uwe Schindler, Mike McCandless)

* LUCENE-7355: Added Analyzer#normalize(), which only applies normalization to
  an input string. (Adrien Grand)

Bug Fixes

* LUCENE-6662: Fixed potential resource leaks. (Rishabh Patel via Adrien Grand)

@@ -77,6 +80,10 @@ Improvements

* LUCENE-7276: MatchNoDocsQuery now includes an optional reason for
  why it was used (Jim Ferenczi via Mike McCandless)

* LUCENE-7355: AnalyzingQueryParser now only applies the subset of the analysis
  chain that is about normalization for range/fuzzy/wildcard queries.
  (Adrien Grand)

Optimizations

* LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)
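A minimal sketch of what the new Analyzer#normalize() API looks like from calling code (the field name "body" and the choice of StandardAnalyzer are illustrative assumptions, not part of this change):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.util.BytesRef;

    public class NormalizeDemo {
      public static void main(String[] args) throws Exception {
        try (Analyzer analyzer = new StandardAnalyzer()) {
          // normalize() applies only the normalization part of the analysis chain
          // (lowercasing for StandardAnalyzer); it never tokenizes or stems.
          BytesRef term = analyzer.normalize("body", "QuiCK");
          System.out.println(term.utf8ToString()); // quick
        }
      }
    }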
@@ -146,5 +146,13 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
    }
    return new TokenStreamComponents(source, new ArabicStemFilter(result));
  }

  @Override
  protected TokenStream normalize(String fieldName, TokenStream in) {
    TokenStream result = new LowerCaseFilter(in);
    result = new DecimalDigitFilter(result);
    result = new ArabicNormalizationFilter(result);
    return result;
  }
}
@ -126,4 +126,11 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new BulgarianStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -127,5 +127,12 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SetKeywordMarkerFilter(result, excltable);
|
||||
return new TokenStreamComponents(source, new BrazilianStemFilter(result));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -130,4 +130,12 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new CatalanStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -92,4 +92,11 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new CJKBigramFilter(result);
|
||||
return new TokenStreamComponents(source, new StopFilter(result, stopwords));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new CJKWidthFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -129,4 +129,13 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SoraniStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new SoraniNormalizationFilter(result);
|
||||
result = new LowerCaseFilter(result);
|
||||
result = new DecimalDigitFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.core;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/** An {@link Analyzer} that filters {@link LetterTokenizer}
|
||||
* with {@link LowerCaseFilter}
|
||||
|
@ -35,4 +36,9 @@ public final class SimpleAnalyzer extends Analyzer {
|
|||
protected TokenStreamComponents createComponents(final String fieldName) {
|
||||
return new TokenStreamComponents(new LowerCaseTokenizer());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
return new LowerCaseFilter(in);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.CharArraySet;
|
|||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
|
@ -79,5 +80,10 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
|
|||
final Tokenizer source = new LowerCaseTokenizer();
|
||||
return new TokenStreamComponents(source, new StopFilter(source, stopwords));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
return new LowerCaseFilter(in);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@@ -37,6 +37,7 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
 import org.apache.lucene.analysis.util.CharFilterFactory;
 import org.apache.lucene.analysis.util.ClasspathResourceLoader;
 import org.apache.lucene.analysis.util.FilesystemResourceLoader;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
 import org.apache.lucene.analysis.util.ResourceLoader;
 import org.apache.lucene.analysis.util.ResourceLoaderAware;
 import org.apache.lucene.analysis.util.TokenFilterFactory;

@@ -117,16 +118,39 @@ public final class CustomAnalyzer extends Analyzer {
     return reader;
   }
 
+  @Override
+  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+    for (CharFilterFactory charFilter : charFilters) {
+      if (charFilter instanceof MultiTermAwareComponent) {
+        charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
+        reader = charFilter.create(reader);
+      }
+    }
+    return reader;
+  }
+
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer tk = tokenizer.create();
+    final Tokenizer tk = tokenizer.create(attributeFactory());
     TokenStream ts = tk;
     for (final TokenFilterFactory filter : tokenFilters) {
       ts = filter.create(ts);
     }
     return new TokenStreamComponents(tk, ts);
   }
 
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = in;
+    for (TokenFilterFactory filter : tokenFilters) {
+      if (filter instanceof MultiTermAwareComponent) {
+        filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
+        result = filter.create(in);
+      }
+    }
+    return result;
+  }
+
   @Override
   public int getPositionIncrementGap(String fieldName) {
     // use default from Analyzer base class if null
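For CustomAnalyzer, only components whose factories implement MultiTermAwareComponent take part in normalization. A hedged sketch of the user-visible effect (factory choices are illustrative: LowerCaseFilterFactory is multi-term aware, PorterStemFilterFactory is not, so only the lowercasing applies during normalization):

    import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;
    import org.apache.lucene.analysis.en.PorterStemFilterFactory;
    import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
    import org.apache.lucene.util.BytesRef;

    public class CustomAnalyzerNormalizeDemo {
      public static void main(String[] args) throws Exception {
        CustomAnalyzer analyzer = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(LowerCaseFilterFactory.class)   // multi-term aware -> used by normalize()
            .addTokenFilter(PorterStemFilterFactory.class)  // not multi-term aware -> skipped
            .build();
        // Full analysis would lowercase AND stem; normalize() should only lowercase.
        BytesRef term = analyzer.normalize("body", "Walking");
        System.out.println(term.utf8ToString()); // walking (not "walk")
        analyzer.close();
      }
    }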
@ -125,5 +125,12 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new CzechStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -124,4 +124,11 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new DanishStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -139,4 +139,12 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new GermanLightStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
result = new GermanNormalizationFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -104,4 +104,11 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new GreekStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new GreekLowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -107,4 +107,11 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new PorterStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -123,4 +123,11 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SpanishLightStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -121,4 +121,11 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new BasqueStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -128,7 +129,18 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
|||
*/
|
||||
return new TokenStreamComponents(source, new StopFilter(result, stopwords));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
result = new DecimalDigitFilter(result);
|
||||
result = new ArabicNormalizationFilter(result);
|
||||
/* additional persian-specific normalization */
|
||||
result = new PersianNormalizationFilter(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wraps the Reader with {@link PersianCharFilter}
|
||||
*/
|
||||
|
|
|
@ -124,4 +124,11 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new FinnishStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -144,5 +144,13 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new FrenchLightStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -141,4 +141,12 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new IrishStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
||||
result = new IrishLowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -122,4 +122,11 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new GalicianStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -128,4 +129,14 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new HindiStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
result = new DecimalDigitFilter(result);
|
||||
result = new IndicNormalizationFilter(result);
|
||||
result = new HindiNormalizationFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -124,4 +124,11 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new HungarianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -121,4 +121,11 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new ArmenianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -119,4 +119,11 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
|
|||
}
|
||||
return new TokenStreamComponents(source, new IndonesianStemFilter(result));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -133,4 +133,12 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new ItalianLightStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -121,4 +121,11 @@ public final class LithuanianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new LithuanianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -122,4 +122,11 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new LatvianStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -159,4 +159,11 @@ public final class DutchAnalyzer extends Analyzer {
|
|||
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -124,5 +124,12 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new NorwegianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -123,4 +123,11 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new PortugueseLightStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -126,4 +126,11 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new RomanianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -121,4 +121,11 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -100,4 +100,9 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
|
|||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
return new LowerCaseFilter(in);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -97,4 +97,9 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
|
|||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
return new LowerCaseFilter(in);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -124,4 +124,11 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new SwedishStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -104,4 +104,11 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new StopFilter(result, stopwords);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new LowerCaseFilter(in);
|
||||
result = new DecimalDigitFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -127,4 +127,11 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new SnowballFilter(result, new TurkishStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new TurkishLowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,6 +20,8 @@ package org.apache.lucene.collation;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
||||
import java.text.Collator;
|
||||
|
||||
/**
|
||||
|
@ -82,6 +84,11 @@ public final class CollationKeyAnalyzer extends Analyzer {
|
|||
this.factory = new CollationAttributeFactory(collator);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AttributeFactory attributeFactory() {
|
||||
return factory;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
|
||||
|
|
|
@ -35,6 +35,7 @@ import org.apache.lucene.analysis.MockCharFilter;
|
|||
import org.apache.lucene.analysis.MockFixedLengthPayloadFilter;
|
||||
import org.apache.lucene.analysis.MockGraphTokenFilter;
|
||||
import org.apache.lucene.analysis.MockHoleInjectingTokenFilter;
|
||||
import org.apache.lucene.analysis.MockLowerCaseFilter;
|
||||
import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
|
||||
import org.apache.lucene.analysis.MockSynonymFilter;
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
|
@ -75,6 +76,7 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
|
|||
MockFixedLengthPayloadFilter.class,
|
||||
MockGraphTokenFilter.class,
|
||||
MockHoleInjectingTokenFilter.class,
|
||||
MockLowerCaseFilter.class,
|
||||
MockRandomLookaheadTokenFilter.class,
|
||||
MockSynonymFilter.class,
|
||||
MockTokenFilter.class,
|
||||
|
|
|
@ -52,6 +52,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
|
|||
new String[] { "b" });
|
||||
assertAnalyzesTo(a, "\"QUOTED\" word",
|
||||
new String[] { "quoted", "word" });
|
||||
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
|
||||
a.close();
|
||||
}
|
||||
|
||||
|
@ -73,6 +74,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
|
|||
new String[] { "2B" });
|
||||
assertAnalyzesTo(a, "\"QUOTED\" word",
|
||||
new String[] { "\"QUOTED\"", "word" });
|
||||
assertEquals(new BytesRef("\"\\À3[]()! Cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
|
||||
a.close();
|
||||
}
|
||||
|
||||
|
@ -82,6 +84,8 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
|
|||
new String[] { "foo", "bar", "foo", "bar" });
|
||||
assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
|
||||
new String[] { "foo", "bar", "foo", "bar" });
|
||||
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
|
||||
assertEquals(new BytesRef("the"), a.normalize("dummy", "the"));
|
||||
a.close();
|
||||
}
|
||||
|
||||
|
|
|
@ -928,6 +928,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
System.out.println("Creating random analyzer:" + a);
|
||||
}
|
||||
try {
|
||||
checkNormalize(a);
|
||||
checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
|
||||
false /* We already validate our own offsets... */);
|
||||
} catch (Throwable e) {
|
||||
|
@ -937,7 +938,14 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void checkNormalize(Analyzer a) {
|
||||
// normalization should not modify characters that may be used for wildcards
|
||||
// or regular expressions
|
||||
String s = "([0-9]+)?*";
|
||||
assertEquals(s, a.normalize("dummy", s).utf8ToString());
|
||||
}
|
||||
|
||||
// we might regret this decision...
|
||||
public void testRandomChainsWithLargeStrings() throws Throwable {
|
||||
int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
package org.apache.lucene.analysis.custom;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
|
@ -24,16 +26,25 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
|
||||
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.core.StopFilterFactory;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
|
||||
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SetOnce.AlreadySetException;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -336,4 +347,136 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
|
|||
});
|
||||
}
|
||||
|
||||
private static class DummyCharFilter extends CharFilter {
|
||||
|
||||
private final char match, repl;
|
||||
|
||||
public DummyCharFilter(Reader input, char match, char repl) {
|
||||
super(input);
|
||||
this.match = match;
|
||||
this.repl = repl;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int correct(int currentOff) {
|
||||
return currentOff;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
final int read = input.read(cbuf, off, len);
|
||||
for (int i = 0; i < read; ++i) {
|
||||
if (cbuf[off+i] == match) {
|
||||
cbuf[off+i] = repl;
|
||||
}
|
||||
}
|
||||
return read;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class DummyCharFilterFactory extends CharFilterFactory {
|
||||
|
||||
private final char match, repl;
|
||||
|
||||
public DummyCharFilterFactory(Map<String,String> args) {
|
||||
this(args, '0', '1');
|
||||
}
|
||||
|
||||
DummyCharFilterFactory(Map<String,String> args, char match, char repl) {
|
||||
super(args);
|
||||
this.match = match;
|
||||
this.repl = repl;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader create(Reader input) {
|
||||
return new DummyCharFilter(input, match, repl);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class DummyMultiTermAwareCharFilterFactory extends DummyCharFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public DummyMultiTermAwareCharFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return new DummyCharFilterFactory(Collections.emptyMap(), '0', '2');
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class DummyTokenizerFactory extends TokenizerFactory {
|
||||
|
||||
public DummyTokenizerFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(AttributeFactory factory) {
|
||||
return new LowerCaseTokenizer(factory);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class DummyMultiTermAwareTokenizerFactory extends DummyTokenizerFactory implements MultiTermAwareComponent {
|
||||
|
||||
public DummyMultiTermAwareTokenizerFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return new KeywordTokenizerFactory(getOriginalArgs());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class DummyTokenFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public DummyTokenFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return input;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static class DummyMultiTermAwareTokenFilterFactory extends DummyTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public DummyMultiTermAwareTokenFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return new ASCIIFoldingFilterFactory(Collections.emptyMap());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void testNormalization() throws IOException {
|
||||
CustomAnalyzer analyzer1 = CustomAnalyzer.builder()
|
||||
// none of these components are multi-term aware so they should not be applied
|
||||
.withTokenizer(DummyTokenizerFactory.class, Collections.emptyMap())
|
||||
.addCharFilter(DummyCharFilterFactory.class, Collections.emptyMap())
|
||||
.addTokenFilter(DummyTokenFilterFactory.class, Collections.emptyMap())
|
||||
.build();
|
||||
assertEquals(new BytesRef("0À"), analyzer1.normalize("dummy", "0À"));
|
||||
|
||||
CustomAnalyzer analyzer2 = CustomAnalyzer.builder()
|
||||
// these components are multi-term aware so they should be applied
|
||||
.withTokenizer(DummyMultiTermAwareTokenizerFactory.class, Collections.emptyMap())
|
||||
.addCharFilter(DummyMultiTermAwareCharFilterFactory.class, Collections.emptyMap())
|
||||
.addTokenFilter(DummyMultiTermAwareTokenFilterFactory.class, Collections.emptyMap())
|
||||
.build();
|
||||
assertEquals(new BytesRef("2A"), analyzer2.normalize("dummy", "0À"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -94,4 +94,11 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
|
|||
stream = new LowerCaseFilter(stream);
|
||||
return new TokenStreamComponents(tokenizer, stream);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new CJKWidthFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import morfologik.stemming.Dictionary;
|
|||
import morfologik.stemming.polish.PolishStemmer;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
@ -69,4 +70,9 @@ public class MorfologikAnalyzer extends Analyzer {
|
|||
src,
|
||||
new MorfologikFilter(new StandardFilter(src), dictionary));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
return new StandardFilter(in);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.util.Set;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
@ -139,4 +140,9 @@ public final class SmartChineseAnalyzer extends Analyzer {
|
|||
}
|
||||
return new TokenStreamComponents(tokenizer, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
return new LowerCaseFilter(in);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -146,4 +146,11 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new StempelFilter(result, new StempelStemmer(stemTable));
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -18,11 +18,18 @@ package org.apache.lucene.analysis;

 import java.io.Closeable;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;

 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CloseableThreadLocal;
 import org.apache.lucene.util.Version;

@@ -44,6 +51,12 @@ import org.apache.lucene.util.Version;
 *     filter = new BarFilter(filter);
 *     return new TokenStreamComponents(source, filter);
 *   }
 *   {@literal @Override}
 *   protected TokenStream normalize(String fieldName, TokenStream in) {
 *     // Assuming FooFilter is about normalization and BarFilter is about
 *     // stemming, only FooFilter should be applied
 *     return new FooFilter(in);
 *   }
 * };
 * </pre>
 * For more examples, see the {@link org.apache.lucene.analysis Analysis package documentation}.

@@ -107,6 +120,15 @@ public abstract class Analyzer implements Closeable {
   */
  protected abstract TokenStreamComponents createComponents(String fieldName);

  /**
   * Wrap the given {@link TokenStream} in order to apply normalization filters.
   * The default implementation returns the {@link TokenStream} as-is. This is
   * used by {@link #normalize(String, String)}.
   */
  protected TokenStream normalize(String fieldName, TokenStream in) {
    return in;
  }

  /**
   * Returns a TokenStream suitable for <code>fieldName</code>, tokenizing
   * the contents of <code>reader</code>.
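To make the contract concrete, here is a self-contained sketch of an Analyzer that keeps createComponents() and normalize() consistent. It mirrors the Foo/Bar javadoc example above with real classes; the specific tokenizer and filters are illustrative choices, not part of this patch:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.en.PorterStemFilter;

    public final class LowercasingStemmingAnalyzer extends Analyzer {

      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream filter = new LowerCaseFilter(source);
        filter = new PorterStemFilter(filter);   // stemming: indexing/search-time only
        return new TokenStreamComponents(source, filter);
      }

      @Override
      protected TokenStream normalize(String fieldName, TokenStream in) {
        // Only the normalization part of the chain: lowercase, but do not stem,
        // so wildcard/fuzzy/range terms stay comparable to what was indexed.
        return new LowerCaseFilter(in);
      }
    }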
@@ -181,7 +203,65 @@ public abstract class Analyzer implements Closeable {
    components.reusableStringReader = strReader;
    return components.getTokenStream();
  }

  /**
   * Normalize a string down to the representation that it would have in the
   * index.
   * <p>
   * This is typically used by query parsers in order to generate a query on
   * a given term, without tokenizing or stemming, which are undesirable if
   * the string to analyze is a partial word (e.g. in case of a wildcard or
   * fuzzy query).
   * <p>
   * This method uses {@link #initReaderForNormalization(String, Reader)} in
   * order to apply necessary character-level normalization and then
   * {@link #normalize(String, TokenStream)} in order to apply the normalizing
   * token filters.
   */
  public final BytesRef normalize(final String fieldName, final String text) {
    try {
      // apply char filters
      final String filteredText;
      try (Reader reader = new StringReader(text)) {
        Reader filterReader = initReaderForNormalization(fieldName, reader);
        char[] buffer = new char[64];
        StringBuilder builder = new StringBuilder();
        for (;;) {
          final int read = filterReader.read(buffer, 0, buffer.length);
          if (read == -1) {
            break;
          }
          builder.append(buffer, 0, read);
        }
        filteredText = builder.toString();
      } catch (IOException e) {
        throw new IllegalStateException("Normalization threw an unexpected exception", e);
      }

      final AttributeFactory attributeFactory = attributeFactory();
      try (TokenStream ts = normalize(fieldName,
          new StringTokenStream(attributeFactory, filteredText, text.length()))) {
        final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        ts.reset();
        if (ts.incrementToken() == false) {
          throw new IllegalStateException("The normalization token stream is "
              + "expected to produce exactly 1 token, but got 0 for analyzer "
              + this + " and input \"" + text + "\"");
        }
        final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
        if (ts.incrementToken()) {
          throw new IllegalStateException("The normalization token stream is "
              + "expected to produce exactly 1 token, but got 2+ for analyzer "
              + this + " and input \"" + text + "\"");
        }
        ts.end();
        return term;
      }
    } catch (IOException e) {
      throw new IllegalStateException("Normalization threw an unexpected exception", e);
    }
  }

  /**
   * Override this if you want to add a CharFilter chain.
   * <p>
@@ -196,6 +276,22 @@ public abstract class Analyzer implements Closeable {
    return reader;
  }

  /** Wrap the given {@link Reader} with {@link CharFilter}s that make sense
   *  for normalization. This is typically a subset of the {@link CharFilter}s
   *  that are applied in {@link #initReader(String, Reader)}. This is used by
   *  {@link #normalize(String, String)}. */
  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
    return reader;
  }

  /** Return the {@link AttributeFactory} to be used for
   *  {@link #tokenStream analysis} and
   *  {@link #normalize(String, String) normalization}. The default
   *  implementation returns {@link AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY}. */
  protected AttributeFactory attributeFactory() {
    return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
  }

  /**
   * Invoked before indexing a IndexableField instance if
   * terms have already been added to that field. This allows custom
@ -435,4 +531,41 @@ public abstract class Analyzer implements Closeable {
|
|||
}
|
||||
};
|
||||
|
||||
private static final class StringTokenStream extends TokenStream {
|
||||
|
||||
private final String value;
|
||||
private final int length;
|
||||
private boolean used = true;
|
||||
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
|
||||
|
||||
StringTokenStream(AttributeFactory attributeFactory, String value, int length) {
|
||||
super(attributeFactory);
|
||||
this.value = value;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
used = false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() {
|
||||
if (used) {
|
||||
return false;
|
||||
}
|
||||
clearAttributes();
|
||||
termAttribute.append(value);
|
||||
offsetAttribute.setOffset(0, length);
|
||||
used = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
super.end();
|
||||
offsetAttribute.setOffset(length, length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -112,4 +112,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
|
|||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new StandardFilter(in);
|
||||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|||
import org.apache.lucene.analysis.MockGraphTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||
|
@ -387,4 +388,9 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
|||
checkRandomData(random, analyzer, 100*RANDOM_MULTIPLIER, 8192);
|
||||
analyzer.close();
|
||||
}
|
||||
|
||||
public void testNormalize() {
|
||||
Analyzer a = new StandardAnalyzer();
|
||||
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,15 +16,15 @@
|
|||
*/
|
||||
package org.apache.lucene.queryparser.analyzing;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queryparser.classic.ParseException;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
|
||||
/**
|
||||
* Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
|
||||
|
@@ -39,7 +39,7 @@ import org.apache.lucene.search.Query;
  */
 public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser {
   // gobble escaped chars or find a wildcard character
-  private final Pattern wildcardPattern = Pattern.compile("(\\.)|([?*]+)");
+  private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");
   public AnalyzingQueryParser(String field, Analyzer analyzer) {
     super(field, analyzer);
     setAnalyzeRangeTerms(true);
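A small standalone illustration (not part of the patch) of what the new WILDCARD_PATTERN matches: group 1 grabs escaped characters so they are passed through verbatim, and group 2 grabs runs of wildcard characters so they are never sent through Analyzer#normalize.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class WildcardPatternDemo {
      public static void main(String[] args) {
        // Same regex as AnalyzingQueryParser's WILDCARD_PATTERN
        Pattern p = Pattern.compile("(\\\\.)|([?*]+)");
        Matcher m = p.matcher("Foo\\*Bar*Baz?");
        while (m.find()) {
          if (m.group(1) != null) {
            System.out.println("escaped char, kept verbatim: " + m.group(1)); // \*
          } else {
            System.out.println("wildcard run, not normalized: " + m.group(2)); // * then ?
          }
        }
      }
    }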
@ -65,42 +65,41 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
|
|||
*/
|
||||
@Override
|
||||
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
|
||||
if ("*".equals(field)) {
|
||||
if ("*".equals(termStr)) return newMatchAllDocsQuery();
|
||||
}
|
||||
if (getAllowLeadingWildcard() == false && (termStr.startsWith("*") || termStr.startsWith("?")))
|
||||
throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
|
||||
|
||||
if (termStr == null){
|
||||
//can't imagine this would ever happen
|
||||
throw new ParseException("Passed null value as term to getWildcardQuery");
|
||||
}
|
||||
if ( ! getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) {
|
||||
throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"
|
||||
+ " unless getAllowLeadingWildcard() returns true");
|
||||
}
|
||||
|
||||
Matcher wildcardMatcher = wildcardPattern.matcher(termStr);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
Term t = new Term(field, analyzeWildcard(field, termStr));
|
||||
return newWildcardQuery(t);
|
||||
}
|
||||
|
||||
private BytesRef analyzeWildcard(String field, String termStr) {
|
||||
// best effort to not pass the wildcard characters and escaped characters through #normalize
|
||||
Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termStr);
|
||||
BytesRefBuilder sb = new BytesRefBuilder();
|
||||
int last = 0;
|
||||
|
||||
|
||||
while (wildcardMatcher.find()){
|
||||
// continue if escaped char
|
||||
if (wildcardMatcher.group(1) != null){
|
||||
continue;
|
||||
}
|
||||
|
||||
if (wildcardMatcher.start() > 0){
|
||||
if (wildcardMatcher.start() > 0) {
|
||||
String chunk = termStr.substring(last, wildcardMatcher.start());
|
||||
String analyzed = analyzeSingleChunk(field, termStr, chunk);
|
||||
sb.append(analyzed);
|
||||
BytesRef normalized = getAnalyzer().normalize(field, chunk);
|
||||
sb.append(normalized);
|
||||
}
|
||||
//append the wildcard character
|
||||
sb.append(wildcardMatcher.group(2));
|
||||
|
||||
//append the matched group - without normalizing
|
||||
sb.append(new BytesRef(wildcardMatcher.group()));
|
||||
|
||||
last = wildcardMatcher.end();
|
||||
}
|
||||
if (last < termStr.length()){
|
||||
sb.append(analyzeSingleChunk(field, termStr, termStr.substring(last)));
|
||||
String chunk = termStr.substring(last);
|
||||
BytesRef normalized = getAnalyzer().normalize(field, chunk);
|
||||
sb.append(normalized);
|
||||
}
|
||||
return super.getWildcardQuery(field, sb.toString());
|
||||
return sb.toBytesRef();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Called when parser parses an input term
|
||||
* that uses prefix notation; that is, contains a single '*' wildcard
|
||||
|
@ -121,8 +120,14 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
|
|||
*/
|
||||
@Override
|
||||
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
|
||||
String analyzed = analyzeSingleChunk(field, termStr, termStr);
|
||||
return super.getPrefixQuery(field, analyzed);
|
||||
if (!getAllowLeadingWildcard() && termStr.startsWith("*"))
|
||||
throw new ParseException("'*' not allowed as first character in PrefixQuery");
|
||||
if (getLowercaseExpandedTerms()) {
|
||||
termStr = termStr.toLowerCase(getLocale());
|
||||
}
|
||||
BytesRef term = getAnalyzer().normalize(field, termStr);
|
||||
Term t = new Term(field, term);
|
||||
return newPrefixQuery(t);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -142,61 +147,9 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
|
|||
protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
|
||||
throws ParseException {
|
||||
|
||||
String analyzed = analyzeSingleChunk(field, termStr, termStr);
|
||||
return super.getFuzzyQuery(field, analyzed, minSimilarity);
|
||||
BytesRef term = getAnalyzer().normalize(field, termStr);
|
||||
Term t = new Term(field, term);
|
||||
return newFuzzyQuery(t, minSimilarity, getFuzzyPrefixLength());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the analyzed form for the given chunk
|
||||
*
|
||||
* If the analyzer produces more than one output token from the given chunk,
|
||||
* a ParseException is thrown.
|
||||
*
|
||||
* @param field The target field
|
||||
* @param termStr The full term from which the given chunk is excerpted
|
||||
* @param chunk The portion of the given termStr to be analyzed
|
||||
* @return The result of analyzing the given chunk
|
||||
* @throws ParseException when analysis returns other than one output token
|
||||
*/
|
||||
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{
|
||||
String analyzed = null;
|
||||
try (TokenStream stream = getAnalyzer().tokenStream(field, chunk)) {
|
||||
stream.reset();
|
||||
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
|
||||
// get first and hopefully only output token
|
||||
if (stream.incrementToken()) {
|
||||
analyzed = termAtt.toString();
|
||||
|
||||
// try to increment again, there should only be one output token
|
||||
StringBuilder multipleOutputs = null;
|
||||
while (stream.incrementToken()) {
|
||||
if (null == multipleOutputs) {
|
||||
multipleOutputs = new StringBuilder();
|
||||
multipleOutputs.append('"');
|
||||
multipleOutputs.append(analyzed);
|
||||
multipleOutputs.append('"');
|
||||
}
|
||||
multipleOutputs.append(',');
|
||||
multipleOutputs.append('"');
|
||||
multipleOutputs.append(termAtt.toString());
|
||||
multipleOutputs.append('"');
|
||||
}
|
||||
stream.end();
|
||||
if (null != multipleOutputs) {
|
||||
throw new ParseException(
|
||||
String.format(getLocale(),
|
||||
"Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
|
||||
}
|
||||
} else {
|
||||
// nothing returned by analyzer. Was it a stop word and the user accidentally
|
||||
// used an analyzer with stop words?
|
||||
stream.end();
|
||||
throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
|
||||
}
|
||||
} catch (IOException e){
|
||||
throw new ParseException(
|
||||
String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
|
||||
}
|
||||
return analyzed;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,14 +16,11 @@
|
|||
*/
|
||||
package org.apache.lucene.queryparser.classic;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||
import org.apache.lucene.document.DateTools;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
|
||||
|
@ -41,9 +38,6 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZ
|
|||
* and acts to separate the majority of the Java code from the .jj grammar file.
|
||||
*/
|
||||
public abstract class QueryParserBase extends QueryBuilder implements CommonQueryParserConfiguration {
|
||||
|
||||
/** Do not catch this exception in your code, it means you are using methods that you should no longer use. */
|
||||
public static class MethodRemovedUseAnother extends Throwable {}
|
||||
|
||||
static final int CONJ_NONE = 0;
|
||||
static final int CONJ_AND = 1;
|
||||
|
@ -640,31 +634,6 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
|
|||
return new FuzzyQuery(term,numEdits,prefixLength);
|
||||
}
|
||||
|
||||
// TODO: Should this be protected instead?
|
||||
private BytesRef analyzeMultitermTerm(String field, String part) {
|
||||
return analyzeMultitermTerm(field, part, getAnalyzer());
|
||||
}
|
||||
|
||||
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
|
||||
if (analyzerIn == null) analyzerIn = getAnalyzer();
|
||||
|
||||
try (TokenStream source = analyzerIn.tokenStream(field, part)) {
|
||||
source.reset();
|
||||
|
||||
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||
|
||||
if (!source.incrementToken())
|
||||
throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
|
||||
BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
|
||||
if (source.incrementToken())
|
||||
throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
|
||||
source.end();
|
||||
return bytes;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a new {@link TermRangeQuery} instance
|
||||
* @param field Field
|
||||
|
@@ -681,13 +650,13 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQueryParserConfiguration {
     if (part1 == null) {
       start = null;
     } else {
-      start = analyzeRangeTerms ? analyzeMultitermTerm(field, part1) : new BytesRef(part1);
+      start = analyzeRangeTerms ? getAnalyzer().normalize(field, part1) : new BytesRef(part1);
     }
 
     if (part2 == null) {
       end = null;
     } else {
-      end = analyzeRangeTerms ? analyzeMultitermTerm(field, part2) : new BytesRef(part2);
+      end = analyzeRangeTerms ? getAnalyzer().normalize(field, part2) : new BytesRef(part2);
     }
 
     final TermRangeQuery query = new TermRangeQuery(field, start, end, startInclusive, endInclusive);
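An illustrative sketch (not from the patch; the analyzer, field name, and query string are assumptions) of the range-query path that now goes through Analyzer#normalize when analyzeRangeTerms is enabled:

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.Query;

    public class RangeNormalizeDemo {
      public static void main(String[] args) throws Exception {
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
          QueryParser parser = new QueryParser("body", analyzer);
          parser.setAnalyzeRangeTerms(true); // route the range bounds through normalize()
          Query q = parser.parse("body:[Apple TO Orange]");
          System.out.println(q); // bounds should come out lowercased: body:[apple TO orange]
        }
      }
    }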
@@ -26,6 +26,7 @@ import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.QueryBuilder;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;

@@ -551,7 +552,9 @@ public class SimpleQueryParser extends QueryBuilder {
     BooleanQuery.Builder bq = new BooleanQuery.Builder();
     bq.setDisableCoord(true);
     for (Map.Entry<String,Float> entry : weights.entrySet()) {
-      Query q = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
+      final String fieldName = entry.getKey();
+      final BytesRef term = getAnalyzer().normalize(fieldName, text);
+      Query q = new FuzzyQuery(new Term(fieldName, term), fuzziness);
       float boost = entry.getValue();
       if (boost != 1f) {
         q = new BoostQuery(q, boost);

@@ -587,7 +590,9 @@ public class SimpleQueryParser extends QueryBuilder {
     BooleanQuery.Builder bq = new BooleanQuery.Builder();
     bq.setDisableCoord(true);
     for (Map.Entry<String,Float> entry : weights.entrySet()) {
-      Query q = new PrefixQuery(new Term(entry.getKey(), text));
+      final String fieldName = entry.getKey();
+      final BytesRef term = getAnalyzer().normalize(fieldName, text);
+      Query q = new PrefixQuery(new Term(fieldName, term));
       float boost = entry.getValue();
       if (boost != 1f) {
         q = new BoostQuery(q, boost);
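A quick sketch of the user-visible effect on SimpleQueryParser (illustrative; the field name "title" and StandardAnalyzer are assumptions): prefix and fuzzy terms now go through Analyzer#normalize, so a mixed-case prefix should match the lowercased terms in the index.

    import java.util.Collections;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.simple.SimpleQueryParser;
    import org.apache.lucene.search.Query;

    public class SimpleQueryParserNormalizeDemo {
      public static void main(String[] args) throws Exception {
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
          SimpleQueryParser parser =
              new SimpleQueryParser(analyzer, Collections.singletonMap("title", 1.0f));
          // With this change the prefix text "Luce" is normalized (lowercased) before
          // the PrefixQuery is built, so this should print something like title:luce*
          Query q = parser.parse("Luce*");
          System.out.println(q);
        }
      }
    }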
@ -21,9 +21,8 @@ import java.util.Map;
|
|||
import java.util.TreeMap;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.CharacterUtils;
|
||||
import org.apache.lucene.analysis.MockBytesAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@@ -100,24 +99,6 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
    a = new ASCIIAnalyzer();
  }

-  public void testSingleChunkExceptions() {
-    String termStr = "the*tre";
-
-    Analyzer stopsAnalyzer = new MockAnalyzer
-        (random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
-
-    ParseException expected = expectThrows(ParseException.class, () -> {
-      parseWithAnalyzingQueryParser(termStr, stopsAnalyzer, true);
-    });
-    assertTrue(expected.getMessage().contains("returned nothing"));
-
-    AnalyzingQueryParser qp = new AnalyzingQueryParser(FIELD, a);
-    expected = expectThrows(ParseException.class, () -> {
-      qp.analyzeSingleChunk(FIELD, "", "not a single chunk");
-    });
-    assertTrue(expected.getMessage().contains("multiple terms"));
-  }
-
  public void testWildcardAlone() throws ParseException {
    //seems like crazy edge case, but can be useful in concordance
@@ -221,12 +202,36 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
    }
  }

  final static class LowercaseFilter extends TokenFilter {

    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    LowercaseFilter(TokenStream input) {
      super(input);
    }

    @Override
    public final boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
        return true;
      } else
        return false;
    }
  }

  final static class ASCIIAnalyzer extends Analyzer {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer result = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      return new TokenStreamComponents(result, new FoldingFilter(result));
    }

    @Override
    protected TokenStream normalize(String fieldName, TokenStream in) {
      return new FoldingFilter(new LowercaseFilter(in));
    }
  }
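As a hedged, self-contained counterpart to the ASCIIAnalyzer above (which relies on the test's own FoldingFilter): a toy analyzer whose normalize() applies the same character-level steps as its indexing chain. WhitespaceTokenizer, LowerCaseFilter and ASCIIFoldingFilter from analyzers-common stand in here, and the class name is made up.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.util.BytesRef;

/** Minimal sketch of an analyzer whose normalize() mirrors its indexing chain. */
public final class FoldingLowercaseAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    TokenStream result = new LowerCaseFilter(source);
    result = new ASCIIFoldingFilter(result);
    return new TokenStreamComponents(source, result);
  }

  @Override
  protected TokenStream normalize(String fieldName, TokenStream in) {
    // same character-level steps as above, but no tokenization or stop filtering
    TokenStream result = new LowerCaseFilter(in);
    result = new ASCIIFoldingFilter(result);
    return result;
  }

  public static void main(String[] args) throws Exception {
    try (Analyzer a = new FoldingLowercaseAnalyzer()) {
      BytesRef term = a.normalize("body", "Müller");
      System.out.println(term.utf8ToString()); // expected: muller
    }
  }
}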
@@ -1169,6 +1169,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      return new TokenStreamComponents(tokenizer, new MockCollationFilter(tokenizer));
    }
    @Override
    protected TokenStream normalize(String fieldName, TokenStream in) {
      return new MockCollationFilter(in);
    }
  }

  public void testCollatedRange() throws Exception {
@@ -883,7 +883,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      assertTokenStreamContents(ts,
          tokens.toArray(new String[tokens.size()]));
    }

    a.normalize("dummy", text);
    // TODO: what can we do besides testing that the above method does not throw?

    if (field != null) {
      reader = new StringReader(text);
      random = new Random(seed);
@@ -92,7 +92,16 @@ public final class MockAnalyzer extends Analyzer {
    MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
    return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
  }

  @Override
  protected TokenStream normalize(String fieldName, TokenStream in) {
    TokenStream result = in;
    if (lowerCase) {
      result = new MockLowerCaseFilter(result);
    }
    return result;
  }

  private synchronized TokenFilter maybePayload(TokenFilter stream, String fieldName) {
    Integer val = previousMappings.get(fieldName);
    if (val == null) {
@@ -16,6 +16,8 @@
 */
package org.apache.lucene.analysis;

import org.apache.lucene.util.AttributeFactory;

/**
 * Analyzer for testing that encodes terms as UTF-16 bytes.
 */
@@ -26,4 +28,9 @@ public final class MockBytesAnalyzer extends Analyzer {
        MockTokenizer.KEYWORD, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
    return new TokenStreamComponents(t);
  }

  @Override
  protected AttributeFactory attributeFactory() {
    return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY;
  }
}
@@ -0,0 +1,40 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/** A lowercasing {@link TokenFilter}. */
public final class MockLowerCaseFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /** Sole constructor. */
  public MockLowerCaseFilter(TokenStream in) {
    super(in);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
      return true;
    } else
      return false;
  }
}
@@ -18,6 +18,7 @@ package org.apache.solr.analysis;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;

@@ -83,9 +84,22 @@ public final class TokenizerChain extends SolrAnalyzer {
    return reader;
  }

  @Override
  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
    if (charFilters != null && charFilters.length > 0) {
      for (CharFilterFactory charFilter : charFilters) {
        if (charFilter instanceof MultiTermAwareComponent) {
          charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
          reader = charFilter.create(reader);
        }
      }
    }
    return reader;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
-    Tokenizer tk = tokenizer.create();
+    Tokenizer tk = tokenizer.create(attributeFactory());
    TokenStream ts = tk;
    for (TokenFilterFactory filter : filters) {
      ts = filter.create(ts);

@@ -93,6 +107,18 @@ public final class TokenizerChain extends SolrAnalyzer {
    return new TokenStreamComponents(tk, ts);
  }

  @Override
  protected TokenStream normalize(String fieldName, TokenStream in) {
    TokenStream result = in;
    for (TokenFilterFactory filter : filters) {
      if (filter instanceof MultiTermAwareComponent) {
        filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
        result = filter.create(in);
      }
    }
    return result;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder("TokenizerChain(");
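A standalone sketch (not from the patch) of the selection idea used by TokenizerChain#normalize above: keep only the MultiTermAwareComponent variants of the filter factories and chain them over the input. The class and method names are illustrative, and each selected filter is chained onto the previous result.

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/** Sketch: derive the normalization subset of a filter chain and apply it. */
public final class NormalizationChainSketch {
  private NormalizationChainSketch() {}

  /** Returns the multi-term-aware (normalization-safe) variants of the given factories. */
  public static List<TokenFilterFactory> normalizationSubset(List<TokenFilterFactory> filters) {
    List<TokenFilterFactory> subset = new ArrayList<>();
    for (TokenFilterFactory filter : filters) {
      if (filter instanceof MultiTermAwareComponent) {
        subset.add((TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent());
      }
    }
    return subset;
  }

  /** Applies the normalization subset to the incoming token stream, filter by filter. */
  public static TokenStream apply(List<TokenFilterFactory> filters, TokenStream in) {
    TokenStream result = in;
    for (TokenFilterFactory filter : normalizationSubset(filters)) {
      result = filter.create(result);
    }
    return result;
  }
}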