mirror of https://github.com/apache/lucene.git
LUCENE-7429: AnalyzerWrapper can now wrap the normalization analysis chain too.
parent 2172f3e008
commit af60048097
@@ -107,6 +107,9 @@ Bug Fixes
   allTermsRequired is false and context filters are specified (Mike
   McCandless)
 
+* LUCENE-7429: AnalyzerWrapper can now modify the normalization chain too and
+  DelegatingAnalyzerWrapper does the right thing automatically. (Adrien Grand)
+
 Improvements
 
 * LUCENE-7439: FuzzyQuery now matches all terms within the specified
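For orientation, not part of the diff: the entry above concerns Analyzer#normalize(String, String), the API that query parsers use to normalize query fragments that bypass tokenization (wildcard, fuzzy and prefix terms). A minimal usage sketch, assuming a Lucene 6.2-era classpath; the field name and the choice of StandardAnalyzer are arbitrary examples:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.BytesRef;

public class NormalizeDemo {
  public static void main(String[] args) {
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
      // Runs only the normalization chain (lower-casing for StandardAnalyzer),
      // not the tokenizer, and returns the term bytes to search for.
      BytesRef normalized = analyzer.normalize("body", "WildCard*Query");
      System.out.println(normalized.utf8ToString()); // wildcard*query
    }
  }
}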
@@ -131,7 +131,7 @@ public final class CustomAnalyzer extends Analyzer {
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer tk = tokenizer.create(attributeFactory());
+    final Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
     TokenStream ts = tk;
     for (final TokenFilterFactory filter : tokenFilters) {
       ts = filter.create(ts);
@@ -85,7 +85,7 @@ public final class CollationKeyAnalyzer extends Analyzer {
   }
 
   @Override
-  protected AttributeFactory attributeFactory() {
+  protected AttributeFactory attributeFactory(String fieldName) {
     return factory;
   }
 
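CollationKeyAnalyzer is exactly the kind of analyzer this hook exists for: its terms are binary collation keys produced through a custom AttributeFactory, so normalization must run through the same factory or query-time bytes will not match the index. A hedged sketch of what the delegation enables; locale and field name are chosen arbitrarily:

import java.text.Collator;
import java.util.Locale;

import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.util.BytesRef;

public class CollationNormalizeDemo {
  public static void main(String[] args) {
    Collator collator = Collator.getInstance(Locale.FRENCH);
    try (CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator)) {
      // The returned bytes are an opaque collation key, not UTF-8 text; they
      // match what the analyzer would index for the same string.
      BytesRef key = analyzer.normalize("title", "déjà");
      System.out.println(key); // hex form of the key; exact bytes depend on the JVM's collator
    }
  }
}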
@@ -238,7 +238,7 @@ public abstract class Analyzer implements Closeable {
       throw new IllegalStateException("Normalization threw an unexpected exeption", e);
     }
 
-    final AttributeFactory attributeFactory = attributeFactory();
+    final AttributeFactory attributeFactory = attributeFactory(fieldName);
     try (TokenStream ts = normalize(fieldName,
         new StringTokenStream(attributeFactory, filteredText, text.length()))) {
       final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
@@ -286,9 +286,10 @@ public abstract class Analyzer implements Closeable {
 
   /** Return the {@link AttributeFactory} to be used for
    *  {@link #tokenStream analysis} and
-   *  {@link #normalize(String, String) normalization}. The default
-   *  implementation returns {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */
-  protected AttributeFactory attributeFactory() {
+   *  {@link #normalize(String, String) normalization} on the given
+   *  {@code FieldName}. The default implementation returns
+   *  {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */
+  protected AttributeFactory attributeFactory(String fieldName) {
     return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
   }
 
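A sketch of what the new fieldName parameter permits: choosing the AttributeFactory per field. Everything here except the attributeFactory(String) override itself is invented for illustration:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.AttributeFactory;

public class PerFieldFactoryAnalyzer extends Analyzer {

  private final AttributeFactory idFactory; // hypothetical custom factory

  public PerFieldFactoryAnalyzer(AttributeFactory idFactory) {
    this.idFactory = idFactory;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    // Use the same per-field factory for indexing that normalize() now uses,
    // so both sides produce identically encoded terms.
    return new TokenStreamComponents(
        new KeywordTokenizer(attributeFactory(fieldName), KeywordTokenizer.DEFAULT_BUFFER_SIZE));
  }

  @Override
  protected AttributeFactory attributeFactory(String fieldName) {
    return "id".equals(fieldName) ? idFactory : TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
  }
}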
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
 
 import java.io.Reader;
 
+import org.apache.lucene.util.AttributeFactory;
+
 /**
  * Extension to {@link Analyzer} suitable for Analyzers which wrap
  * other Analyzers.
@@ -81,6 +83,22 @@ public abstract class AnalyzerWrapper extends Analyzer {
     return components;
   }
 
+  /**
+   * Wraps / alters the given TokenStream for normalization purposes, taken
+   * from the wrapped Analyzer, to form new components. It is through this
+   * method that new TokenFilters can be added by AnalyzerWrappers. By default,
+   * the given token stream is returned.
+   *
+   * @param fieldName
+   *          Name of the field which is to be analyzed
+   * @param in
+   *          TokenStream taken from the wrapped Analyzer
+   * @return Wrapped / altered TokenStream.
+   */
+  protected TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
+    return in;
+  }
+
   /**
    * Wraps / alters the given Reader. Through this method AnalyzerWrappers can
    * implement {@link #initReader(String, Reader)}. By default, the given reader
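A sketch of the new hook in use; the FoldingWrapper class and the ASCIIFoldingFilter choice are illustrative, not part of the commit. A production wrapper would normally apply the matching filter at index time as well (via wrapComponents) so that indexed and normalized terms stay consistent:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;

public class FoldingWrapper extends AnalyzerWrapper {

  private final Analyzer delegate;

  public FoldingWrapper(Analyzer delegate) {
    super(Analyzer.PER_FIELD_REUSE_STRATEGY);
    this.delegate = delegate;
  }

  @Override
  protected Analyzer getWrappedAnalyzer(String fieldName) {
    return delegate;
  }

  @Override
  protected TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
    // Appended after the wrapped analyzer's own normalize() chain.
    return new ASCIIFoldingFilter(in);
  }
}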
@@ -95,12 +113,32 @@ public abstract class AnalyzerWrapper extends Analyzer {
   protected Reader wrapReader(String fieldName, Reader reader) {
     return reader;
   }
 
+  /**
+   * Wraps / alters the given Reader. Through this method AnalyzerWrappers can
+   * implement {@link #initReaderForNormalization(String, Reader)}. By default,
+   * the given reader is returned.
+   *
+   * @param fieldName
+   *          name of the field which is to be analyzed
+   * @param reader
+   *          the reader to wrap
+   * @return the wrapped reader
+   */
+  protected Reader wrapReaderForNormalization(String fieldName, Reader reader) {
+    return reader;
+  }
+
   @Override
   protected final TokenStreamComponents createComponents(String fieldName) {
     return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName));
   }
 
+  @Override
+  protected final TokenStream normalize(String fieldName, TokenStream in) {
+    return wrapTokenStreamForNormalization(fieldName, getWrappedAnalyzer(fieldName).normalize(fieldName, in));
+  }
+
   @Override
   public int getPositionIncrementGap(String fieldName) {
     return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName);
@@ -115,4 +153,14 @@ public abstract class AnalyzerWrapper extends Analyzer {
   public final Reader initReader(String fieldName, Reader reader) {
     return getWrappedAnalyzer(fieldName).initReader(fieldName, wrapReader(fieldName, reader));
   }
+
+  @Override
+  protected final Reader initReaderForNormalization(String fieldName, Reader reader) {
+    return getWrappedAnalyzer(fieldName).initReaderForNormalization(fieldName, wrapReaderForNormalization(fieldName, reader));
+  }
+
+  @Override
+  protected final AttributeFactory attributeFactory(String fieldName) {
+    return getWrappedAnalyzer(fieldName).attributeFactory(fieldName);
+  }
 }
@@ -54,12 +54,22 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper {
   protected final TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
     return super.wrapComponents(fieldName, components);
   }
 
+  @Override
+  protected final TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
+    return super.wrapTokenStreamForNormalization(fieldName, in);
+  }
+
   @Override
   protected final Reader wrapReader(String fieldName, Reader reader) {
     return super.wrapReader(fieldName, reader);
   }
 
+  @Override
+  protected final Reader wrapReaderForNormalization(String fieldName, Reader reader) {
+    return super.wrapReaderForNormalization(fieldName, reader);
+  }
+
   private static final class DelegatingReuseStrategy extends ReuseStrategy {
     DelegatingAnalyzerWrapper wrapper;
     private final ReuseStrategy fallbackStrategy;
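The final overrides above pin DelegatingAnalyzerWrapper to pure delegation, which is what keeps its caching ReuseStrategy sound; subclasses only decide which analyzer to delegate to, and with this commit normalization follows along with no extra code. A hedged usage sketch; the field names and delegate analyzers are invented:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.BytesRef;

public class DelegationDemo {
  public static void main(String[] args) {
    Analyzer keyword = new KeywordAnalyzer();    // no normalization of its own
    Analyzer standard = new StandardAnalyzer();  // lower-cases on normalize
    DelegatingAnalyzerWrapper wrapper =
        new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
          @Override
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return "exact".equals(fieldName) ? keyword : standard;
          }
        };
    BytesRef a = wrapper.normalize("exact", "Ab C"); // stays "Ab C"
    BytesRef b = wrapper.normalize("body", "Ab C");  // becomes "ab c"
    System.out.println(a.utf8ToString() + " / " + b.utf8ToString());
  }
}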
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDelegatingAnalyzerWrapper extends LuceneTestCase {
+
+  public void testDelegatesNormalization() {
+    Analyzer analyzer1 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+    DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer1;
+      }
+    };
+    assertEquals(new BytesRef("Ab C"), w1.normalize("foo", "Ab C"));
+
+    Analyzer analyzer2 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
+    DelegatingAnalyzerWrapper w2 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer2;
+      }
+    };
+    assertEquals(new BytesRef("ab c"), w2.normalize("foo", "Ab C"));
+  }
+
+  public void testDelegatesAttributeFactory() throws Exception {
+    Analyzer analyzer1 = new MockBytesAnalyzer();
+    DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer1;
+      }
+    };
+    assertEquals(new BytesRef("Ab C".getBytes(StandardCharsets.UTF_16LE)), w1.normalize("foo", "Ab C"));
+  }
+
+  public void testDelegatesCharFilter() throws Exception {
+    Analyzer analyzer1 = new Analyzer() {
+      @Override
+      protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+        return new DummyCharFilter(reader, 'b', 'z');
+      }
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(attributeFactory(fieldName));
+        return new TokenStreamComponents(tokenizer);
+      }
+    };
+    DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer1;
+      }
+    };
+    assertEquals(new BytesRef("az c"), w1.normalize("foo", "ab c"));
+  }
+
+  private static class DummyCharFilter extends CharFilter {
+
+    private final char match, repl;
+
+    public DummyCharFilter(Reader input, char match, char repl) {
+      super(input);
+      this.match = match;
+      this.repl = repl;
+    }
+
+    @Override
+    protected int correct(int currentOff) {
+      return currentOff;
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      final int read = input.read(cbuf, off, len);
+      for (int i = 0; i < read; ++i) {
+        if (cbuf[off+i] == match) {
+          cbuf[off+i] = repl;
+        }
+      }
+      return read;
+    }
+
+  }
+}
@@ -30,7 +30,7 @@ public final class MockBytesAnalyzer extends Analyzer {
   }
 
   @Override
-  protected AttributeFactory attributeFactory() {
+  protected AttributeFactory attributeFactory(String fieldName) {
    return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY;
  }
}
@@ -99,7 +99,7 @@ public final class TokenizerChain extends SolrAnalyzer {
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    Tokenizer tk = tokenizer.create(attributeFactory());
+    Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
     TokenStream ts = tk;
     for (TokenFilterFactory filter : filters) {
       ts = filter.create(ts);