LUCENE-7355: Add Analyzer#normalize() and use it in query parsers.

Adrien Grand, 2016-06-28 18:23:11 +02:00
commit 7c2e7a0fb8 (parent ccd3bc8466)
61 changed files with 808 additions and 150 deletions


@ -26,6 +26,9 @@ New Features
methods Directory.rename and Directory.syncMetaData instead (Robert Muir,
Uwe Schindler, Mike McCandless)
* LUCENE-7355: Added Analyzer#normalize(), which only applies normalization to
an input string. (Adrien Grand)
Bug Fixes
* LUCENE-6662: Fixed potential resource leaks. (Rishabh Patel via Adrien Grand)
@ -77,6 +80,10 @@ Improvements
* LUCENE-7276: MatchNoDocsQuery now includes an optional reason for
why it was used (Jim Ferenczi via Mike McCandless)
* LUCENE-7355: AnalyzingQueryParser now only applies the subset of the analysis
chain that is about normalization for range/fuzzy/wildcard queries.
(Adrien Grand)
Optimizations
* LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)
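The two LUCENE-7355 entries above both come down to the new entry point Analyzer#normalize(String, String). A minimal usage sketch follows; the analyzer, field name, and input are illustrative, not taken from this commit:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;

public class NormalizeExample {
  public static void main(String[] args) {
    Analyzer analyzer = new StandardAnalyzer();
    // Normalizes (here: lowercases) the fragment without tokenizing or stemming it,
    // which is what prefix/wildcard/fuzzy/range terms need.
    BytesRef term = analyzer.normalize("title", "Wildcar");
    Query query = new PrefixQuery(new Term("title", term));
    System.out.println(query); // title:wildcar*
    analyzer.close();
  }
}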


@ -146,5 +146,13 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
}
return new TokenStreamComponents(source, new ArabicStemFilter(result));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new LowerCaseFilter(in);
result = new DecimalDigitFilter(result);
result = new ArabicNormalizationFilter(result);
return result;
}
}


@ -126,4 +126,11 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
result = new BulgarianStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -127,5 +127,12 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
result = new SetKeywordMarkerFilter(result, excltable);
return new TokenStreamComponents(source, new BrazilianStemFilter(result));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -130,4 +130,12 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new CatalanStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(result);
return result;
}
}


@ -92,4 +92,11 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
result = new CJKBigramFilter(result);
return new TokenStreamComponents(source, new StopFilter(result, stopwords));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new CJKWidthFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -129,4 +129,13 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
result = new SoraniStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new SoraniNormalizationFilter(result);
result = new LowerCaseFilter(result);
result = new DecimalDigitFilter(result);
return result;
}
}


@ -19,6 +19,7 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter}
@ -35,4 +36,9 @@ public final class SimpleAnalyzer extends Analyzer {
protected TokenStreamComponents createComponents(final String fieldName) {
return new TokenStreamComponents(new LowerCaseTokenizer());
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
}


@ -25,6 +25,7 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@ -79,5 +80,10 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
final Tokenizer source = new LowerCaseTokenizer();
return new TokenStreamComponents(source, new StopFilter(source, stopwords));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
}


@ -37,6 +37,7 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@ -117,16 +118,39 @@ public final class CustomAnalyzer extends Analyzer {
return reader;
}
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
for (CharFilterFactory charFilter : charFilters) {
if (charFilter instanceof MultiTermAwareComponent) {
charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
reader = charFilter.create(reader);
}
}
return reader;
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tk = tokenizer.create();
final Tokenizer tk = tokenizer.create(attributeFactory());
TokenStream ts = tk;
for (final TokenFilterFactory filter : tokenFilters) {
ts = filter.create(ts);
}
return new TokenStreamComponents(tk, ts);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = in;
for (TokenFilterFactory filter : tokenFilters) {
if (filter instanceof MultiTermAwareComponent) {
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
result = filter.create(result);
}
}
return result;
}
@Override
public int getPositionIncrementGap(String fieldName) {
// use default from Analyzer base class if null

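As the hunks above show, CustomAnalyzer#normalize only re-applies token filters whose factories implement MultiTermAwareComponent, and initReaderForNormalization does the same for char filters. A filter therefore opts in to normalization roughly like the hypothetical factory below (the class name is made up; ASCIIFoldingFilter is just an example of a purely character-level filter):

import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class MyFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {

  public MyFoldingFilterFactory(Map<String,String> args) {
    super(args);
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new ASCIIFoldingFilter(input);
  }

  @Override
  public AbstractAnalysisFactory getMultiTermComponent() {
    // Returning this factory tells CustomAnalyzer#normalize to apply it as well.
    return this;
  }
}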

@ -125,5 +125,12 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
result = new CzechStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -124,4 +124,11 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new DanishStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -139,4 +139,12 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
result = new GermanLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
result = new GermanNormalizationFilter(result);
return result;
}
}


@ -104,4 +104,11 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
result = new GreekStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new GreekLowerCaseFilter(result);
return result;
}
}


@ -107,4 +107,11 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
result = new PorterStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -123,4 +123,11 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
result = new SpanishLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -121,4 +121,11 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new BasqueStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
@ -128,7 +129,18 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
*/
return new TokenStreamComponents(source, new StopFilter(result, stopwords));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
result = new DecimalDigitFilter(result);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
return result;
}
/**
* Wraps the Reader with {@link PersianCharFilter}
*/


@ -124,4 +124,11 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new FinnishStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -144,5 +144,13 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
result = new FrenchLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(result);
return result;
}
}


@ -141,4 +141,12 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new IrishStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new IrishLowerCaseFilter(result);
return result;
}
}


@ -122,4 +122,11 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
result = new GalicianStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
@ -128,4 +129,14 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
result = new HindiStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
result = new DecimalDigitFilter(result);
result = new IndicNormalizationFilter(result);
result = new HindiNormalizationFilter(result);
return result;
}
}


@ -124,4 +124,11 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new HungarianStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -121,4 +121,11 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new ArmenianStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -119,4 +119,11 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
}
return new TokenStreamComponents(source, new IndonesianStemFilter(result));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -133,4 +133,12 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
result = new ItalianLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new LowerCaseFilter(result);
return result;
}
}


@ -121,4 +121,11 @@ public final class LithuanianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new LithuanianStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -122,4 +122,11 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
result = new LatvianStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -159,4 +159,11 @@ public final class DutchAnalyzer extends Analyzer {
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -124,5 +124,12 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new NorwegianStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -123,4 +123,11 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
result = new PortugueseLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -126,4 +126,11 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new RomanianStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -121,4 +121,11 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -100,4 +100,9 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
}
};
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
}


@ -97,4 +97,9 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
}
};
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
}


@ -124,4 +124,11 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new SwedishStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -104,4 +104,11 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
result = new StopFilter(result, stopwords);
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new LowerCaseFilter(in);
result = new DecimalDigitFilter(result);
return result;
}
}


@ -127,4 +127,11 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new TurkishStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new TurkishLowerCaseFilter(result);
return result;
}
}


@ -20,6 +20,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.AttributeFactory;
import java.text.Collator;
/**
@ -82,6 +84,11 @@ public final class CollationKeyAnalyzer extends Analyzer {
this.factory = new CollationAttributeFactory(collator);
}
@Override
protected AttributeFactory attributeFactory() {
return factory;
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);


@ -35,6 +35,7 @@ import org.apache.lucene.analysis.MockCharFilter;
import org.apache.lucene.analysis.MockFixedLengthPayloadFilter;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.MockHoleInjectingTokenFilter;
import org.apache.lucene.analysis.MockLowerCaseFilter;
import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
import org.apache.lucene.analysis.MockSynonymFilter;
import org.apache.lucene.analysis.MockTokenFilter;
@ -75,6 +76,7 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
MockFixedLengthPayloadFilter.class,
MockGraphTokenFilter.class,
MockHoleInjectingTokenFilter.class,
MockLowerCaseFilter.class,
MockRandomLookaheadTokenFilter.class,
MockSynonymFilter.class,
MockTokenFilter.class,


@ -52,6 +52,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
new String[] { "b" });
assertAnalyzesTo(a, "\"QUOTED\" word",
new String[] { "quoted", "word" });
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
a.close();
}
@ -73,6 +74,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
new String[] { "2B" });
assertAnalyzesTo(a, "\"QUOTED\" word",
new String[] { "\"QUOTED\"", "word" });
assertEquals(new BytesRef("\"\\À3[]()! Cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
a.close();
}
@ -82,6 +84,8 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
new String[] { "foo", "bar", "foo", "bar" });
assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
new String[] { "foo", "bar", "foo", "bar" });
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
assertEquals(new BytesRef("the"), a.normalize("dummy", "the"));
a.close();
}


@ -928,6 +928,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
System.out.println("Creating random analyzer:" + a);
}
try {
checkNormalize(a);
checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
false /* We already validate our own offsets... */);
} catch (Throwable e) {
@ -937,7 +938,14 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
}
}
}
public void checkNormalize(Analyzer a) {
// normalization should not modify characters that may be used for wildcards
// or regular expressions
String s = "([0-9]+)?*";
assertEquals(s, a.normalize("dummy", s).utf8ToString());
}
// we might regret this decision...
public void testRandomChainsWithLargeStrings() throws Throwable {
int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;


@ -17,6 +17,8 @@
package org.apache.lucene.analysis.custom;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashMap;
@ -24,16 +26,25 @@ import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SetOnce.AlreadySetException;
import org.apache.lucene.util.Version;
@ -336,4 +347,136 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
});
}
private static class DummyCharFilter extends CharFilter {
private final char match, repl;
public DummyCharFilter(Reader input, char match, char repl) {
super(input);
this.match = match;
this.repl = repl;
}
@Override
protected int correct(int currentOff) {
return currentOff;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
final int read = input.read(cbuf, off, len);
for (int i = 0; i < read; ++i) {
if (cbuf[off+i] == match) {
cbuf[off+i] = repl;
}
}
return read;
}
}
public static class DummyCharFilterFactory extends CharFilterFactory {
private final char match, repl;
public DummyCharFilterFactory(Map<String,String> args) {
this(args, '0', '1');
}
DummyCharFilterFactory(Map<String,String> args, char match, char repl) {
super(args);
this.match = match;
this.repl = repl;
}
@Override
public Reader create(Reader input) {
return new DummyCharFilter(input, match, repl);
}
}
public static class DummyMultiTermAwareCharFilterFactory extends DummyCharFilterFactory implements MultiTermAwareComponent {
public DummyMultiTermAwareCharFilterFactory(Map<String,String> args) {
super(args);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return new DummyCharFilterFactory(Collections.emptyMap(), '0', '2');
}
}
public static class DummyTokenizerFactory extends TokenizerFactory {
public DummyTokenizerFactory(Map<String,String> args) {
super(args);
}
@Override
public Tokenizer create(AttributeFactory factory) {
return new LowerCaseTokenizer(factory);
}
}
public static class DummyMultiTermAwareTokenizerFactory extends DummyTokenizerFactory implements MultiTermAwareComponent {
public DummyMultiTermAwareTokenizerFactory(Map<String,String> args) {
super(args);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return new KeywordTokenizerFactory(getOriginalArgs());
}
}
public static class DummyTokenFilterFactory extends TokenFilterFactory {
public DummyTokenFilterFactory(Map<String,String> args) {
super(args);
}
@Override
public TokenStream create(TokenStream input) {
return input;
}
}
public static class DummyMultiTermAwareTokenFilterFactory extends DummyTokenFilterFactory implements MultiTermAwareComponent {
public DummyMultiTermAwareTokenFilterFactory(Map<String,String> args) {
super(args);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return new ASCIIFoldingFilterFactory(Collections.emptyMap());
}
}
public void testNormalization() throws IOException {
CustomAnalyzer analyzer1 = CustomAnalyzer.builder()
// none of these components are multi-term aware so they should not be applied
.withTokenizer(DummyTokenizerFactory.class, Collections.emptyMap())
.addCharFilter(DummyCharFilterFactory.class, Collections.emptyMap())
.addTokenFilter(DummyTokenFilterFactory.class, Collections.emptyMap())
.build();
assertEquals(new BytesRef("0À"), analyzer1.normalize("dummy", "0À"));
CustomAnalyzer analyzer2 = CustomAnalyzer.builder()
// these components are multi-term aware so they should be applied
.withTokenizer(DummyMultiTermAwareTokenizerFactory.class, Collections.emptyMap())
.addCharFilter(DummyMultiTermAwareCharFilterFactory.class, Collections.emptyMap())
.addTokenFilter(DummyMultiTermAwareTokenFilterFactory.class, Collections.emptyMap())
.build();
assertEquals(new BytesRef("2A"), analyzer2.normalize("dummy", "0À"));
}
}


@ -94,4 +94,11 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
stream = new LowerCaseFilter(stream);
return new TokenStreamComponents(tokenizer, stream);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new CJKWidthFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -23,6 +23,7 @@ import morfologik.stemming.Dictionary;
import morfologik.stemming.polish.PolishStemmer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@ -69,4 +70,9 @@ public class MorfologikAnalyzer extends Analyzer {
src,
new MorfologikFilter(new StandardFilter(src), dictionary));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new StandardFilter(in);
}
}


@ -22,6 +22,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@ -139,4 +140,9 @@ public final class SmartChineseAnalyzer extends Analyzer {
}
return new TokenStreamComponents(tokenizer, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
}


@ -146,4 +146,11 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
result = new StempelFilter(result, new StempelStemmer(stemTable));
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -18,11 +18,18 @@ package org.apache.lucene.analysis;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.Version;
@ -44,6 +51,12 @@ import org.apache.lucene.util.Version;
* filter = new BarFilter(filter);
* return new TokenStreamComponents(source, filter);
* }
* {@literal @Override}
* protected TokenStream normalize(String fieldName, TokenStream in) {
* // Assuming FooFilter is about normalization and BarFilter is about
* // stemming, only FooFilter should be applied
* return new FooFilter(in);
* }
* };
* </pre>
* For more examples, see the {@link org.apache.lucene.analysis Analysis package documentation}.
@ -107,6 +120,15 @@ public abstract class Analyzer implements Closeable {
*/
protected abstract TokenStreamComponents createComponents(String fieldName);
/**
* Wrap the given {@link TokenStream} in order to apply normalization filters.
* The default implementation returns the {@link TokenStream} as-is. This is
* used by {@link #normalize(String, String)}.
*/
protected TokenStream normalize(String fieldName, TokenStream in) {
return in;
}
/**
* Returns a TokenStream suitable for <code>fieldName</code>, tokenizing
* the contents of <code>reader</code>.
@ -181,7 +203,65 @@ public abstract class Analyzer implements Closeable {
components.reusableStringReader = strReader;
return components.getTokenStream();
}
/**
* Normalize a string down to the representation that it would have in the
* index.
* <p>
* This is typically used by query parsers in order to generate a query on
* a given term, without tokenizing or stemming, which are undesirable if
the string to analyze is a partial word (e.g. in the case of a wildcard or
* fuzzy query).
* <p>
* This method uses {@link #initReaderForNormalization(String, Reader)} in
* order to apply necessary character-level normalization and then
* {@link #normalize(String, TokenStream)} in order to apply the normalizing
* token filters.
*/
public final BytesRef normalize(final String fieldName, final String text) {
try {
// apply char filters
final String filteredText;
try (Reader reader = new StringReader(text)) {
Reader filterReader = initReaderForNormalization(fieldName, reader);
char[] buffer = new char[64];
StringBuilder builder = new StringBuilder();
for (;;) {
final int read = filterReader.read(buffer, 0, buffer.length);
if (read == -1) {
break;
}
builder.append(buffer, 0, read);
}
filteredText = builder.toString();
} catch (IOException e) {
throw new IllegalStateException("Normalization threw an unexpected exception", e);
}
final AttributeFactory attributeFactory = attributeFactory();
try (TokenStream ts = normalize(fieldName,
new StringTokenStream(attributeFactory, filteredText, text.length()))) {
final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
ts.reset();
if (ts.incrementToken() == false) {
throw new IllegalStateException("The normalization token stream is "
+ "expected to produce exactly 1 token, but got 0 for analyzer "
+ this + " and input \"" + text + "\"");
}
final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
if (ts.incrementToken()) {
throw new IllegalStateException("The normalization token stream is "
+ "expected to produce exactly 1 token, but got 2+ for analyzer "
+ this + " and input \"" + text + "\"");
}
ts.end();
return term;
}
} catch (IOException e) {
throw new IllegalStateException("Normalization threw an unexpected exception", e);
}
}
/**
* Override this if you want to add a CharFilter chain.
* <p>
@ -196,6 +276,22 @@ public abstract class Analyzer implements Closeable {
return reader;
}
/** Wrap the given {@link Reader} with {@link CharFilter}s that make sense
* for normalization. This is typically a subset of the {@link CharFilter}s
* that are applied in {@link #initReader(String, Reader)}. This is used by
* {@link #normalize(String, String)}. */
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
return reader;
}
/** Return the {@link AttributeFactory} to be used for
* {@link #tokenStream analysis} and
* {@link #normalize(String, String) normalization}. The default
* implementation returns {@link AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY}. */
protected AttributeFactory attributeFactory() {
return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
}
/**
* Invoked before indexing a IndexableField instance if
* terms have already been added to that field. This allows custom
@ -435,4 +531,41 @@ public abstract class Analyzer implements Closeable {
}
};
private static final class StringTokenStream extends TokenStream {
private final String value;
private final int length;
private boolean used = true;
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
StringTokenStream(AttributeFactory attributeFactory, String value, int length) {
super(attributeFactory);
this.value = value;
this.length = length;
}
@Override
public void reset() {
used = false;
}
@Override
public boolean incrementToken() {
if (used) {
return false;
}
clearAttributes();
termAttribute.append(value);
offsetAttribute.setOffset(0, length);
used = true;
return true;
}
@Override
public void end() throws IOException {
super.end();
offsetAttribute.setOffset(length, length);
}
}
}
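Taken together, a subclass now overrides createComponents for the full indexing chain and normalize for the normalization-only subset; normalize(String, String) then runs the char-filter subset plus that token-filter subset over a single synthetic token. A minimal sketch (the filter choices are illustrative, not prescribed by this commit):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.util.BytesRef;

public class NormalizeSubclassExample {
  public static void main(String[] args) {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream result = new LowerCaseFilter(source);
        result = new PorterStemFilter(result); // stemming belongs to indexing only
        return new TokenStreamComponents(source, result);
      }

      @Override
      protected TokenStream normalize(String fieldName, TokenStream in) {
        // Only the normalizing part of the chain: no tokenizer, no stemmer.
        return new LowerCaseFilter(in);
      }
    };
    BytesRef term = analyzer.normalize("body", "Wildcards");
    System.out.println(term.utf8ToString()); // wildcards (lowercased, not stemmed)
    analyzer.close();
  }
}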


@ -112,4 +112,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
}
};
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new StandardFilter(in);
result = new LowerCaseFilter(result);
return result;
}
}


@ -27,6 +27,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
@ -387,4 +388,9 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
checkRandomData(random, analyzer, 100*RANDOM_MULTIPLIER, 8192);
analyzer.close();
}
public void testNormalize() {
Analyzer a = new StandardAnalyzer();
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
}
}


@ -16,15 +16,15 @@
*/
package org.apache.lucene.queryparser.analyzing;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
/**
* Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
@ -39,7 +39,7 @@ import org.apache.lucene.search.Query;
*/
public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser {
// gobble escaped chars or find a wildcard character
private final Pattern wildcardPattern = Pattern.compile("(\\.)|([?*]+)");
private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");
public AnalyzingQueryParser(String field, Analyzer analyzer) {
super(field, analyzer);
setAnalyzeRangeTerms(true);
@ -65,42 +65,41 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
*/
@Override
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
if ("*".equals(field)) {
if ("*".equals(termStr)) return newMatchAllDocsQuery();
}
if (getAllowLeadingWildcard() == false && (termStr.startsWith("*") || termStr.startsWith("?")))
throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
if (termStr == null){
//can't imagine this would ever happen
throw new ParseException("Passed null value as term to getWildcardQuery");
}
if ( ! getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) {
throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"
+ " unless getAllowLeadingWildcard() returns true");
}
Matcher wildcardMatcher = wildcardPattern.matcher(termStr);
StringBuilder sb = new StringBuilder();
Term t = new Term(field, analyzeWildcard(field, termStr));
return newWildcardQuery(t);
}
private BytesRef analyzeWildcard(String field, String termStr) {
// best effort to not pass the wildcard characters and escaped characters through #normalize
Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termStr);
BytesRefBuilder sb = new BytesRefBuilder();
int last = 0;
while (wildcardMatcher.find()){
// continue if escaped char
if (wildcardMatcher.group(1) != null){
continue;
}
if (wildcardMatcher.start() > 0){
if (wildcardMatcher.start() > 0) {
String chunk = termStr.substring(last, wildcardMatcher.start());
String analyzed = analyzeSingleChunk(field, termStr, chunk);
sb.append(analyzed);
BytesRef normalized = getAnalyzer().normalize(field, chunk);
sb.append(normalized);
}
//append the wildcard character
sb.append(wildcardMatcher.group(2));
//append the matched group - without normalizing
sb.append(new BytesRef(wildcardMatcher.group()));
last = wildcardMatcher.end();
}
if (last < termStr.length()){
sb.append(analyzeSingleChunk(field, termStr, termStr.substring(last)));
String chunk = termStr.substring(last);
BytesRef normalized = getAnalyzer().normalize(field, chunk);
sb.append(normalized);
}
return super.getWildcardQuery(field, sb.toString());
return sb.toBytesRef();
}
/**
* Called when parser parses an input term
* that uses prefix notation; that is, contains a single '*' wildcard
@ -121,8 +120,14 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
*/
@Override
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
String analyzed = analyzeSingleChunk(field, termStr, termStr);
return super.getPrefixQuery(field, analyzed);
if (!getAllowLeadingWildcard() && termStr.startsWith("*"))
throw new ParseException("'*' not allowed as first character in PrefixQuery");
if (getLowercaseExpandedTerms()) {
termStr = termStr.toLowerCase(getLocale());
}
BytesRef term = getAnalyzer().normalize(field, termStr);
Term t = new Term(field, term);
return newPrefixQuery(t);
}
/**
@ -142,61 +147,9 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
throws ParseException {
String analyzed = analyzeSingleChunk(field, termStr, termStr);
return super.getFuzzyQuery(field, analyzed, minSimilarity);
BytesRef term = getAnalyzer().normalize(field, termStr);
Term t = new Term(field, term);
return newFuzzyQuery(t, minSimilarity, getFuzzyPrefixLength());
}
/**
* Returns the analyzed form for the given chunk
*
* If the analyzer produces more than one output token from the given chunk,
* a ParseException is thrown.
*
* @param field The target field
* @param termStr The full term from which the given chunk is excerpted
* @param chunk The portion of the given termStr to be analyzed
* @return The result of analyzing the given chunk
* @throws ParseException when analysis returns other than one output token
*/
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{
String analyzed = null;
try (TokenStream stream = getAnalyzer().tokenStream(field, chunk)) {
stream.reset();
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
// get first and hopefully only output token
if (stream.incrementToken()) {
analyzed = termAtt.toString();
// try to increment again, there should only be one output token
StringBuilder multipleOutputs = null;
while (stream.incrementToken()) {
if (null == multipleOutputs) {
multipleOutputs = new StringBuilder();
multipleOutputs.append('"');
multipleOutputs.append(analyzed);
multipleOutputs.append('"');
}
multipleOutputs.append(',');
multipleOutputs.append('"');
multipleOutputs.append(termAtt.toString());
multipleOutputs.append('"');
}
stream.end();
if (null != multipleOutputs) {
throw new ParseException(
String.format(getLocale(),
"Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
}
} else {
// nothing returned by analyzer. Was it a stop word and the user accidentally
// used an analyzer with stop words?
stream.end();
throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
}
} catch (IOException e){
throw new ParseException(
String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
}
return analyzed;
}
}
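The net effect of the rewritten getWildcardQuery/analyzeWildcard above is that only the literal chunks between wildcards go through Analyzer#normalize, while '*' and '?' (and escaped characters) are copied through untouched. A sketch of the observable behaviour, assuming StandardAnalyzer (whose normalize chain lowercases) and an arbitrary field name:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
import org.apache.lucene.search.Query;

public class WildcardNormalizeExample {
  public static void main(String[] args) throws Exception {
    AnalyzingQueryParser parser = new AnalyzingQueryParser("body", new StandardAnalyzer());
    // The chunks "Foo" and "Bar" are normalized (lowercased); the wildcards survive as-is.
    Query query = parser.parse("Foo*Bar?");
    System.out.println(query); // body:foo*bar?
  }
}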


@ -16,14 +16,11 @@
*/
package org.apache.lucene.queryparser.classic;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
@ -41,9 +38,6 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZ
* and acts to separate the majority of the Java code from the .jj grammar file.
*/
public abstract class QueryParserBase extends QueryBuilder implements CommonQueryParserConfiguration {
/** Do not catch this exception in your code, it means you are using methods that you should no longer use. */
public static class MethodRemovedUseAnother extends Throwable {}
static final int CONJ_NONE = 0;
static final int CONJ_AND = 1;
@ -640,31 +634,6 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
return new FuzzyQuery(term,numEdits,prefixLength);
}
// TODO: Should this be protected instead?
private BytesRef analyzeMultitermTerm(String field, String part) {
return analyzeMultitermTerm(field, part, getAnalyzer());
}
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
if (analyzerIn == null) analyzerIn = getAnalyzer();
try (TokenStream source = analyzerIn.tokenStream(field, part)) {
source.reset();
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
if (!source.incrementToken())
throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
if (source.incrementToken())
throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
source.end();
return bytes;
} catch (IOException e) {
throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
}
}
/**
* Builds a new {@link TermRangeQuery} instance
* @param field Field
@ -681,13 +650,13 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
if (part1 == null) {
start = null;
} else {
start = analyzeRangeTerms ? analyzeMultitermTerm(field, part1) : new BytesRef(part1);
start = analyzeRangeTerms ? getAnalyzer().normalize(field, part1) : new BytesRef(part1);
}
if (part2 == null) {
end = null;
} else {
end = analyzeRangeTerms ? analyzeMultitermTerm(field, part2) : new BytesRef(part2);
end = analyzeRangeTerms ? getAnalyzer().normalize(field, part2) : new BytesRef(part2);
}
final TermRangeQuery query = new TermRangeQuery(field, start, end, startInclusive, endInclusive);

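With the analyzeMultitermTerm helper gone, the classic QueryParser obtains both range endpoints from Analyzer#normalize when analyzeRangeTerms is enabled. A rough usage sketch (field, analyzer, and endpoints are arbitrary):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class RangeNormalizeExample {
  public static void main(String[] args) throws Exception {
    QueryParser parser = new QueryParser("body", new StandardAnalyzer());
    parser.setAnalyzeRangeTerms(true);
    Query query = parser.parse("body:[Apple TO Orange]");
    System.out.println(query); // endpoints come back normalized, e.g. body:[apple TO orange]
  }
}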

@ -26,6 +26,7 @@ import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@ -551,7 +552,9 @@ public class SimpleQueryParser extends QueryBuilder {
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.setDisableCoord(true);
for (Map.Entry<String,Float> entry : weights.entrySet()) {
Query q = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
final String fieldName = entry.getKey();
final BytesRef term = getAnalyzer().normalize(fieldName, text);
Query q = new FuzzyQuery(new Term(fieldName, term), fuzziness);
float boost = entry.getValue();
if (boost != 1f) {
q = new BoostQuery(q, boost);
@ -587,7 +590,9 @@ public class SimpleQueryParser extends QueryBuilder {
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.setDisableCoord(true);
for (Map.Entry<String,Float> entry : weights.entrySet()) {
Query q = new PrefixQuery(new Term(entry.getKey(), text));
final String fieldName = entry.getKey();
final BytesRef term = getAnalyzer().normalize(fieldName, text);
Query q = new PrefixQuery(new Term(fieldName, term));
float boost = entry.getValue();
if (boost != 1f) {
q = new BoostQuery(q, boost);

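SimpleQueryParser gets the same treatment above for fuzzy and prefix terms: the raw text is normalized per field before the Term is built. A sketch (the field weights and input are arbitrary):

import java.util.Collections;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.simple.SimpleQueryParser;
import org.apache.lucene.search.Query;

public class SimpleParserNormalizeExample {
  public static void main(String[] args) {
    SimpleQueryParser parser =
        new SimpleQueryParser(new StandardAnalyzer(), Collections.singletonMap("body", 1f));
    // "Wildcar*" becomes a PrefixQuery and "Fuzzzy~2" a FuzzyQuery, both on lowercased terms.
    Query query = parser.parse("Wildcar* Fuzzzy~2");
    System.out.println(query);
  }
}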

@ -21,9 +21,8 @@ import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.MockBytesAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@ -100,24 +99,6 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
a = new ASCIIAnalyzer();
}
public void testSingleChunkExceptions() {
String termStr = "the*tre";
Analyzer stopsAnalyzer = new MockAnalyzer
(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
ParseException expected = expectThrows(ParseException.class, () -> {
parseWithAnalyzingQueryParser(termStr, stopsAnalyzer, true);
});
assertTrue(expected.getMessage().contains("returned nothing"));
AnalyzingQueryParser qp = new AnalyzingQueryParser(FIELD, a);
expected = expectThrows(ParseException.class, () -> {
qp.analyzeSingleChunk(FIELD, "", "not a single chunk");
});
assertTrue(expected.getMessage().contains("multiple terms"));
}
public void testWildcardAlone() throws ParseException {
//seems like crazy edge case, but can be useful in concordance
@ -221,12 +202,36 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
}
}
final static class LowercaseFilter extends TokenFilter {
final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
LowercaseFilter(TokenStream input) {
super(input);
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
return true;
} else
return false;
}
}
final static class ASCIIAnalyzer extends Analyzer {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer result = new MockTokenizer(MockTokenizer.WHITESPACE, true);
return new TokenStreamComponents(result, new FoldingFilter(result));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new FoldingFilter(new LowercaseFilter(in));
}
}


@ -1169,6 +1169,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
return new TokenStreamComponents(tokenizer, new MockCollationFilter(tokenizer));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new MockCollationFilter(in);
}
}
public void testCollatedRange() throws Exception {


@ -883,7 +883,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
a.normalize("dummy", text);
// TODO: what can we do besides testing that the above method does not throw?
if (field != null) {
reader = new StringReader(text);
random = new Random(seed);


@ -92,7 +92,16 @@ public final class MockAnalyzer extends Analyzer {
MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = in;
if (lowerCase) {
result = new MockLowerCaseFilter(result);
}
return result;
}
private synchronized TokenFilter maybePayload(TokenFilter stream, String fieldName) {
Integer val = previousMappings.get(fieldName);
if (val == null) {


@ -16,6 +16,8 @@
*/
package org.apache.lucene.analysis;
import org.apache.lucene.util.AttributeFactory;
/**
* Analyzer for testing that encodes terms as UTF-16 bytes.
*/
@ -26,4 +28,9 @@ public final class MockBytesAnalyzer extends Analyzer {
MockTokenizer.KEYWORD, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
return new TokenStreamComponents(t);
}
@Override
protected AttributeFactory attributeFactory() {
return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY;
}
}


@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/** A lowercasing {@link TokenFilter}. */
public final class MockLowerCaseFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/** Sole constructor. */
public MockLowerCaseFilter(TokenStream in) {
super(in);
}
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
return true;
} else
return false;
}
}


@ -18,6 +18,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
@ -83,9 +84,22 @@ public final class TokenizerChain extends SolrAnalyzer {
return reader;
}
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
if (charFilters != null && charFilters.length > 0) {
for (CharFilterFactory charFilter : charFilters) {
if (charFilter instanceof MultiTermAwareComponent) {
charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
reader = charFilter.create(reader);
}
}
}
return reader;
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tk = tokenizer.create();
Tokenizer tk = tokenizer.create(attributeFactory());
TokenStream ts = tk;
for (TokenFilterFactory filter : filters) {
ts = filter.create(ts);
@ -93,6 +107,18 @@ public final class TokenizerChain extends SolrAnalyzer {
return new TokenStreamComponents(tk, ts);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = in;
for (TokenFilterFactory filter : filters) {
if (filter instanceof MultiTermAwareComponent) {
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
result = filter.create(result);
}
}
return result;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("TokenizerChain(");