mirror of https://github.com/apache/lucene.git
LUCENE-7429: AnalyzerWrapper can now wrap the normalization analysis chain too.
parent 6400b9c3cb
commit ed102d634a
@@ -56,6 +56,9 @@ Bug Fixes
   allTermsRequired is false and context filters are specified (Mike
   McCandless)
 
+* LUCENE-7429: AnalyzerWrapper can now modify the normalization chain too and
+  DelegatingAnalyzerWrapper does the right thing automatically. (Adrien Grand)
+
 Improvements
 
 * LUCENE-7439: FuzzyQuery now matches all terms within the specified
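A minimal sketch (not part of this commit) of what the new CHANGES entry describes, assuming Lucene 6.x with lucene-analyzers-common on the classpath: an AnalyzerWrapper now forwards Analyzer#normalize to the wrapped analyzer, so query-time normalization matches index-time analysis.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.util.BytesRef;

    public class NormalizingWrapperSketch {
      public static void main(String[] args) {
        Analyzer delegate = new StandardAnalyzer();
        Analyzer wrapper = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
          @Override
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate; // same delegate for every field in this sketch
          }
        };
        // Before this commit the wrapper fell back to the default (no-op)
        // normalization chain; now it delegates, so the term is lowercased
        // exactly as it would be at index time.
        BytesRef term = wrapper.normalize("body", "QUICK");
        System.out.println(term.utf8ToString()); // prints "quick"
      }
    }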
@@ -131,7 +131,7 @@ public final class CustomAnalyzer extends Analyzer {
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer tk = tokenizer.create(attributeFactory());
+    final Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
     TokenStream ts = tk;
     for (final TokenFilterFactory filter : tokenFilters) {
       ts = filter.create(ts);
@@ -85,7 +85,7 @@ public final class CollationKeyAnalyzer extends Analyzer {
   }
 
   @Override
-  protected AttributeFactory attributeFactory() {
+  protected AttributeFactory attributeFactory(String fieldName) {
     return factory;
   }
 
@@ -238,7 +238,7 @@ public abstract class Analyzer implements Closeable {
       throw new IllegalStateException("Normalization threw an unexpected exeption", e);
     }
 
-    final AttributeFactory attributeFactory = attributeFactory();
+    final AttributeFactory attributeFactory = attributeFactory(fieldName);
     try (TokenStream ts = normalize(fieldName,
         new StringTokenStream(attributeFactory, filteredText, text.length()))) {
       final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
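For context, a sketch (not from the patch, assuming Lucene 6.x): the hunk above sits inside Analyzer#normalize(String, String), which builds a one-token stream with the (now per-field) AttributeFactory and pipes it through the analyzer's normalization chain.

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.util.BytesRef;

    public class NormalizeCallSketch {
      public static void main(String[] args) {
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
          // Runs the code patched above: the raw query term becomes the
          // BytesRef that index-time analysis would have produced.
          BytesRef ref = analyzer.normalize("title", "Déjà");
          System.out.println(ref.utf8ToString()); // "déjà": lowercased, diacritics kept
        }
      }
    }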
@@ -286,9 +286,10 @@ public abstract class Analyzer implements Closeable {
 
   /** Return the {@link AttributeFactory} to be used for
    *  {@link #tokenStream analysis} and
-   *  {@link #normalize(String, String) normalization}. The default
-   *  implementation returns {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */
-  protected AttributeFactory attributeFactory() {
+   *  {@link #normalize(String, String) normalization} on the given
+   *  {@code FieldName}. The default implementation returns
+   *  {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */
+  protected AttributeFactory attributeFactory(String fieldName) {
     return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
   }
 
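A hedged sketch of what the new parameter enables (the analyzer class, the "id" field name, and the injected binaryFactory are hypothetical; WhitespaceTokenizer comes from lucene-analyzers-common): an Analyzer can now pick a different AttributeFactory per field, as the patched CustomAnalyzer and TokenizerChain below do.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.util.AttributeFactory;

    public class PerFieldFactoryAnalyzer extends Analyzer {
      private final AttributeFactory binaryFactory; // hypothetical custom factory

      public PerFieldFactoryAnalyzer(AttributeFactory binaryFactory) {
        this.binaryFactory = binaryFactory;
      }

      @Override
      protected AttributeFactory attributeFactory(String fieldName) {
        // The new fieldName parameter makes per-field decisions possible.
        return "id".equals(fieldName) ? binaryFactory : TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
      }

      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        // Hand the per-field factory down to the Tokenizer.
        Tokenizer source = new WhitespaceTokenizer(attributeFactory(fieldName));
        return new TokenStreamComponents(source);
      }
    }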
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
 
 import java.io.Reader;
 
+import org.apache.lucene.util.AttributeFactory;
+
 /**
  * Extension to {@link Analyzer} suitable for Analyzers which wrap
  * other Analyzers.
@@ -81,6 +83,22 @@ public abstract class AnalyzerWrapper extends Analyzer {
     return components;
   }
 
+  /**
+   * Wraps / alters the given TokenStream for normalization purposes, taken
+   * from the wrapped Analyzer, to form new components. It is through this
+   * method that new TokenFilters can be added by AnalyzerWrappers. By default,
+   * the given token stream is returned.
+   *
+   * @param fieldName
+   *          Name of the field which is to be analyzed
+   * @param in
+   *          TokenStream taken from the wrapped Analyzer
+   * @return Wrapped / altered TokenStream.
+   */
+  protected TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
+    return in;
+  }
+
   /**
    * Wraps / alters the given Reader. Through this method AnalyzerWrappers can
    * implement {@link #initReader(String, Reader)}. By default, the given reader
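A minimal sketch of the new hook in use (not from this commit; FoldingWrapper is a made-up name and ASCIIFoldingFilter comes from lucene-analyzers-common): a wrapper that appends one filter to whatever normalization the wrapped analyzer already does.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.AnalyzerWrapper;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;

    public class FoldingWrapper extends AnalyzerWrapper {
      private final Analyzer delegate;

      public FoldingWrapper(Analyzer delegate) {
        super(delegate.getReuseStrategy()); // same idiom as ShingleAnalyzerWrapper
        this.delegate = delegate;
      }

      @Override
      protected Analyzer getWrappedAnalyzer(String fieldName) {
        return delegate;
      }

      @Override
      protected TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
        // "in" is the wrapped analyzer's normalization chain; append one filter
        // so query terms are ASCII-folded the same way at normalization time.
        return new ASCIIFoldingFilter(in);
      }
    }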
@@ -95,12 +113,32 @@ public abstract class AnalyzerWrapper extends Analyzer {
   protected Reader wrapReader(String fieldName, Reader reader) {
     return reader;
   }
 
+  /**
+   * Wraps / alters the given Reader. Through this method AnalyzerWrappers can
+   * implement {@link #initReaderForNormalization(String, Reader)}. By default,
+   * the given reader is returned.
+   *
+   * @param fieldName
+   *          name of the field which is to be analyzed
+   * @param reader
+   *          the reader to wrap
+   * @return the wrapped reader
+   */
+  protected Reader wrapReaderForNormalization(String fieldName, Reader reader) {
+    return reader;
+  }
+
   @Override
   protected final TokenStreamComponents createComponents(String fieldName) {
     return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName));
   }
 
+  @Override
+  protected final TokenStream normalize(String fieldName, TokenStream in) {
+    return wrapTokenStreamForNormalization(fieldName, getWrappedAnalyzer(fieldName).normalize(fieldName, in));
+  }
+
   @Override
   public int getPositionIncrementGap(String fieldName) {
     return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName);
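The Reader-side hook mirrors wrapReader. A sketch under the same assumptions as above (CharMappingWrapper is a made-up name; MappingCharFilter and NormalizeCharMap come from lucene-analyzers-common):

    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.AnalyzerWrapper;
    import org.apache.lucene.analysis.charfilter.MappingCharFilter;
    import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

    public class CharMappingWrapper extends AnalyzerWrapper {
      private final Analyzer delegate;
      private final NormalizeCharMap map;

      public CharMappingWrapper(Analyzer delegate) {
        super(delegate.getReuseStrategy());
        this.delegate = delegate;
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        builder.add("ß", "ss"); // example mapping, applied before normalizing
        this.map = builder.build();
      }

      @Override
      protected Analyzer getWrappedAnalyzer(String fieldName) {
        return delegate;
      }

      @Override
      protected Reader wrapReaderForNormalization(String fieldName, Reader reader) {
        // Pre-process the normalization input, mirroring what wrapReader
        // does for the analysis chain.
        return new MappingCharFilter(map, reader);
      }
    }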
@@ -115,4 +153,14 @@ public abstract class AnalyzerWrapper extends Analyzer {
   public final Reader initReader(String fieldName, Reader reader) {
     return getWrappedAnalyzer(fieldName).initReader(fieldName, wrapReader(fieldName, reader));
   }
+
+  @Override
+  protected final Reader initReaderForNormalization(String fieldName, Reader reader) {
+    return getWrappedAnalyzer(fieldName).initReaderForNormalization(fieldName, wrapReaderForNormalization(fieldName, reader));
+  }
+
+  @Override
+  protected final AttributeFactory attributeFactory(String fieldName) {
+    return getWrappedAnalyzer(fieldName).attributeFactory(fieldName);
+  }
 }
@@ -54,12 +54,22 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper {
   protected final TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
     return super.wrapComponents(fieldName, components);
   }
 
+  @Override
+  protected final TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
+    return super.wrapTokenStreamForNormalization(fieldName, in);
+  }
+
   @Override
   protected final Reader wrapReader(String fieldName, Reader reader) {
     return super.wrapReader(fieldName, reader);
   }
 
+  @Override
+  protected final Reader wrapReaderForNormalization(String fieldName, Reader reader) {
+    return super.wrapReaderForNormalization(fieldName, reader);
+  }
+
   private static final class DelegatingReuseStrategy extends ReuseStrategy {
     DelegatingAnalyzerWrapper wrapper;
     private final ReuseStrategy fallbackStrategy;
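Design note: DelegatingAnalyzerWrapper seals these hooks as final no-ops because a purely delegating wrapper must not alter the wrapped chains; that promise is what makes its per-analyzer reuse strategy safe. The practical upshot, sketched below under the assumption of Lucene 6.x (PerFieldAnalyzerWrapper, from lucene-analyzers-common, extends DelegatingAnalyzerWrapper): per-field normalization now just works.

    import java.util.Collections;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.KeywordAnalyzer;
    import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.util.BytesRef;

    public class PerFieldNormalizeSketch {
      public static void main(String[] args) {
        Analyzer perField = new PerFieldAnalyzerWrapper(
            new StandardAnalyzer(),
            Collections.singletonMap("id", new KeywordAnalyzer()));
        // Normalization is forwarded to whichever analyzer handles the field.
        BytesRef body = perField.normalize("body", "FOO"); // StandardAnalyzer: lowercased
        BytesRef id = perField.normalize("id", "FOO");     // KeywordAnalyzer: unchanged
        System.out.println(body.utf8ToString() + " / " + id.utf8ToString()); // foo / FOO
      }
    }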
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDelegatingAnalyzerWrapper extends LuceneTestCase {
+
+  public void testDelegatesNormalization() {
+    Analyzer analyzer1 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+    DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer1;
+      }
+    };
+    assertEquals(new BytesRef("Ab C"), w1.normalize("foo", "Ab C"));
+
+    Analyzer analyzer2 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
+    DelegatingAnalyzerWrapper w2 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer2;
+      }
+    };
+    assertEquals(new BytesRef("ab c"), w2.normalize("foo", "Ab C"));
+  }
+
+  public void testDelegatesAttributeFactory() throws Exception {
+    Analyzer analyzer1 = new MockBytesAnalyzer();
+    DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer1;
+      }
+    };
+    assertEquals(new BytesRef("Ab C".getBytes(StandardCharsets.UTF_16LE)), w1.normalize("foo", "Ab C"));
+  }
+
+  public void testDelegatesCharFilter() throws Exception {
+    Analyzer analyzer1 = new Analyzer() {
+      @Override
+      protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+        return new DummyCharFilter(reader, 'b', 'z');
+      }
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(attributeFactory(fieldName));
+        return new TokenStreamComponents(tokenizer);
+      }
+    };
+    DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer1;
+      }
+    };
+    assertEquals(new BytesRef("az c"), w1.normalize("foo", "ab c"));
+  }
+
+  private static class DummyCharFilter extends CharFilter {
+
+    private final char match, repl;
+
+    public DummyCharFilter(Reader input, char match, char repl) {
+      super(input);
+      this.match = match;
+      this.repl = repl;
+    }
+
+    @Override
+    protected int correct(int currentOff) {
+      return currentOff;
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      final int read = input.read(cbuf, off, len);
+      for (int i = 0; i < read; ++i) {
+        if (cbuf[off+i] == match) {
+          cbuf[off+i] = repl;
+        }
+      }
+      return read;
+    }
+
+  }
+}
@@ -30,7 +30,7 @@ public final class MockBytesAnalyzer extends Analyzer {
   }
 
   @Override
-  protected AttributeFactory attributeFactory() {
+  protected AttributeFactory attributeFactory(String fieldName) {
    return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY;
   }
 }
@@ -99,7 +99,7 @@ public final class TokenizerChain extends SolrAnalyzer {
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    Tokenizer tk = tokenizer.create(attributeFactory());
+    Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
     TokenStream ts = tk;
     for (TokenFilterFactory filter : filters) {
       ts = filter.create(ts);