LUCENE-7429: AnalyzerWrapper can now wrap the normalization analysis chain too.

Adrien Grand 2016-10-27 16:27:45 +02:00
parent 2172f3e008
commit af60048097
9 changed files with 180 additions and 11 deletions

CHANGES.txt

@@ -107,6 +107,9 @@ Bug Fixes
allTermsRequired is false and context filters are specified (Mike
McCandless)
+ * LUCENE-7429: AnalyzerWrapper can now modify the normalization chain too and
+   DelegatingAnalyzerWrapper does the right thing automatically. (Adrien Grand)
Improvements
* LUCENE-7439: FuzzyQuery now matches all terms within the specified

CustomAnalyzer.java

@@ -131,7 +131,7 @@ public final class CustomAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer tk = tokenizer.create(attributeFactory());
+ final Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
TokenStream ts = tk;
for (final TokenFilterFactory filter : tokenFilters) {
ts = filter.create(ts);
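For illustration (not part of this commit): CustomAnalyzer already builds its normalization chain from the multi-term-aware factories registered on it (LUCENE-7355), so the change above only threads the field name through to the attribute factory. A minimal sketch of the resulting behavior, assuming the standard "whitespace" and "lowercase" SPI factory names from analyzers-common:

import java.io.IOException;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.util.BytesRef;

public class CustomAnalyzerNormalizeDemo {
  public static void main(String[] args) throws IOException {
    CustomAnalyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer("whitespace")   // tokenization is skipped when normalizing
        .addTokenFilter("lowercase")   // LowerCaseFilterFactory is multi-term aware
        .build();
    // normalize() runs only the multi-term-aware parts of the chain
    BytesRef norm = analyzer.normalize("body", "Ab C");
    System.out.println(norm.utf8ToString()); // expected: "ab c"
  }
}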

CollationKeyAnalyzer.java

@@ -85,7 +85,7 @@ public final class CollationKeyAnalyzer extends Analyzer {
}
@Override
- protected AttributeFactory attributeFactory() {
+ protected AttributeFactory attributeFactory(String fieldName) {
return factory;
}

Analyzer.java

@@ -238,7 +238,7 @@ public abstract class Analyzer implements Closeable {
throw new IllegalStateException("Normalization threw an unexpected exception", e);
}
- final AttributeFactory attributeFactory = attributeFactory();
+ final AttributeFactory attributeFactory = attributeFactory(fieldName);
try (TokenStream ts = normalize(fieldName,
new StringTokenStream(attributeFactory, filteredText, text.length()))) {
final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
@@ -286,9 +286,10 @@ public abstract class Analyzer implements Closeable {
/** Return the {@link AttributeFactory} to be used for
* {@link #tokenStream analysis} and
-  * {@link #normalize(String, String) normalization}. The default
-  * implementation returns {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */
- protected AttributeFactory attributeFactory() {
+  * {@link #normalize(String, String) normalization} on the given
+  * {@code fieldName}. The default implementation returns
+  * {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */
+ protected AttributeFactory attributeFactory(String fieldName) {
return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
}
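Not part of this commit, but a sketch of what the new fieldName parameter enables: an Analyzer that picks a different AttributeFactory for one field. The class name, the "binary" field name, and the factory wiring are illustrative assumptions; KeywordTokenizer is the analyzers-common one.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.AttributeFactory;

public class PerFieldFactoryAnalyzer extends Analyzer {
  private final AttributeFactory binaryFactory; // assumed: supplied by the caller

  public PerFieldFactoryAnalyzer(AttributeFactory binaryFactory) {
    this.binaryFactory = binaryFactory;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    // honor the per-field factory at tokenization time, as CustomAnalyzer now does
    return new TokenStreamComponents(new KeywordTokenizer(attributeFactory(fieldName), 256));
  }

  @Override
  protected AttributeFactory attributeFactory(String fieldName) {
    // "binary" is a hypothetical field name used for illustration
    return "binary".equals(fieldName) ? binaryFactory : TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
  }
}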

AnalyzerWrapper.java

@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
import java.io.Reader;
+ import org.apache.lucene.util.AttributeFactory;
/**
* Extension to {@link Analyzer} suitable for Analyzers which wrap
* other Analyzers.
@@ -81,6 +83,22 @@ public abstract class AnalyzerWrapper extends Analyzer {
return components;
}
+ /**
+  * Wraps / alters the given TokenStream for normalization purposes, taken
+  * from the wrapped Analyzer. It is through this method that new TokenFilters
+  * can be added by AnalyzerWrappers. By default, the given token stream is
+  * returned.
+  *
+  * @param fieldName
+  *          Name of the field which is to be analyzed
+  * @param in
+  *          TokenStream taken from the wrapped Analyzer
+  * @return Wrapped / altered TokenStream.
+  */
+ protected TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
+   return in;
+ }
/**
* Wraps / alters the given Reader. Through this method AnalyzerWrappers can
* implement {@link #initReader(String, Reader)}. By default, the given reader
@@ -96,11 +114,31 @@ public abstract class AnalyzerWrapper extends Analyzer {
return reader;
}
+ /**
+  * Wraps / alters the given Reader. Through this method AnalyzerWrappers can
+  * implement {@link #initReaderForNormalization(String, Reader)}. By default,
+  * the given reader is returned.
+  *
+  * @param fieldName
+  *          name of the field which is to be analyzed
+  * @param reader
+  *          the reader to wrap
+  * @return the wrapped reader
+  */
+ protected Reader wrapReaderForNormalization(String fieldName, Reader reader) {
+   return reader;
+ }
@Override
protected final TokenStreamComponents createComponents(String fieldName) {
return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName));
}
+ @Override
+ protected final TokenStream normalize(String fieldName, TokenStream in) {
+   return wrapTokenStreamForNormalization(fieldName, getWrappedAnalyzer(fieldName).normalize(fieldName, in));
+ }
@Override
public int getPositionIncrementGap(String fieldName) {
return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName);
@@ -115,4 +153,14 @@ public abstract class AnalyzerWrapper extends Analyzer {
public final Reader initReader(String fieldName, Reader reader) {
return getWrappedAnalyzer(fieldName).initReader(fieldName, wrapReader(fieldName, reader));
}
+ @Override
+ protected final Reader initReaderForNormalization(String fieldName, Reader reader) {
+   return getWrappedAnalyzer(fieldName).initReaderForNormalization(fieldName, wrapReaderForNormalization(fieldName, reader));
+ }
+
+ @Override
+ protected final AttributeFactory attributeFactory(String fieldName) {
+   return getWrappedAnalyzer(fieldName).attributeFactory(fieldName);
+ }
}
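A sketch (not from this commit) of a wrapper using the new hook so that its normalization chain stays consistent with its analysis chain. The LowerCaseFilter import assumes Lucene 6.x analyzers-common, getTokenizer() is the 6.x accessor, and the class name is illustrative.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;

public final class LowercasingWrapper extends AnalyzerWrapper {
  private final Analyzer delegate;

  public LowercasingWrapper(Analyzer delegate) {
    super(Analyzer.PER_FIELD_REUSE_STRATEGY);
    this.delegate = delegate;
  }

  @Override
  protected Analyzer getWrappedAnalyzer(String fieldName) {
    return delegate;
  }

  @Override
  protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
    // full analysis chain: delegate's chain plus lowercasing
    return new TokenStreamComponents(components.getTokenizer(),
        new LowerCaseFilter(components.getTokenStream()));
  }

  @Override
  protected TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
    // keep normalize() consistent with the wrapped analysis chain
    return new LowerCaseFilter(in);
  }
}

Before this change, such a wrapper would lowercase at index time while normalize() bypassed the added filter, so query-time terms could fail to match.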

DelegatingAnalyzerWrapper.java

@@ -55,11 +55,21 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper {
return super.wrapComponents(fieldName, components);
}
+ @Override
+ protected final TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
+   return super.wrapTokenStreamForNormalization(fieldName, in);
+ }
@Override
protected final Reader wrapReader(String fieldName, Reader reader) {
return super.wrapReader(fieldName, reader);
}
+ @Override
+ protected final Reader wrapReaderForNormalization(String fieldName, Reader reader) {
+   return super.wrapReaderForNormalization(fieldName, reader);
+ }
private static final class DelegatingReuseStrategy extends ReuseStrategy {
DelegatingAnalyzerWrapper wrapper;
private final ReuseStrategy fallbackStrategy;
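The new overrides are final no-ops by design, matching wrapComponents and wrapReader above: a delegating wrapper must not alter the wrapped analyzer's chains, which is what makes it safe to delegate the reuse strategy to the wrapped analyzer. For illustration (not part of the diff), a hypothetical per-field wrapper; the field-to-analyzer routing is an assumption:

import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;

public final class PerFieldDelegatingWrapper extends DelegatingAnalyzerWrapper {
  private final Map<String, Analyzer> perField; // assumed field-to-analyzer mapping
  private final Analyzer fallback;

  public PerFieldDelegatingWrapper(Map<String, Analyzer> perField, Analyzer fallback) {
    super(Analyzer.PER_FIELD_REUSE_STRATEGY);
    this.perField = perField;
    this.fallback = fallback;
  }

  @Override
  protected Analyzer getWrappedAnalyzer(String fieldName) {
    return perField.getOrDefault(fieldName, fallback);
  }
}

With this commit, wrapper.normalize("title", text) is automatically handled by whichever analyzer is registered for "title", including its attribute factory and normalization char filters, with no extra code in the wrapper.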

TestDelegatingAnalyzerWrapper.java

@@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
public class TestDelegatingAnalyzerWrapper extends LuceneTestCase {
public void testDelegatesNormalization() {
Analyzer analyzer1 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return analyzer1;
}
};
assertEquals(new BytesRef("Ab C"), w1.normalize("foo", "Ab C"));
Analyzer analyzer2 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
DelegatingAnalyzerWrapper w2 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return analyzer2;
}
};
assertEquals(new BytesRef("ab c"), w2.normalize("foo", "Ab C"));
}
public void testDelegatesAttributeFactory() throws Exception {
Analyzer analyzer1 = new MockBytesAnalyzer();
DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return analyzer1;
}
};
assertEquals(new BytesRef("Ab C".getBytes(StandardCharsets.UTF_16LE)), w1.normalize("foo", "Ab C"));
}
public void testDelegatesCharFilter() throws Exception {
Analyzer analyzer1 = new Analyzer() {
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
return new DummyCharFilter(reader, 'b', 'z');
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(attributeFactory(fieldName));
return new TokenStreamComponents(tokenizer);
}
};
DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return analyzer1;
}
};
assertEquals(new BytesRef("az c"), w1.normalize("foo", "ab c"));
}
private static class DummyCharFilter extends CharFilter {
private final char match, repl;
public DummyCharFilter(Reader input, char match, char repl) {
super(input);
this.match = match;
this.repl = repl;
}
@Override
protected int correct(int currentOff) {
return currentOff;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
final int read = input.read(cbuf, off, len);
for (int i = 0; i < read; ++i) {
if (cbuf[off+i] == match) {
cbuf[off+i] = repl;
}
}
return read;
}
}
}

MockBytesAnalyzer.java

@@ -30,7 +30,7 @@ public final class MockBytesAnalyzer extends Analyzer {
}
@Override
- protected AttributeFactory attributeFactory() {
+ protected AttributeFactory attributeFactory(String fieldName) {
return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY;
}
}

TokenizerChain.java

@@ -99,7 +99,7 @@ public final class TokenizerChain extends SolrAnalyzer {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tk = tokenizer.create(attributeFactory());
+ Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
TokenStream ts = tk;
for (TokenFilterFactory filter : filters) {
ts = filter.create(ts);