diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index d574a8a2861..5a6601b6048 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -107,6 +107,9 @@ Bug Fixes allTermsRequired is false and context filters are specified (Mike McCandless) +* LUCENE-7429: AnalyzerWrapper can now modify the normalization chain too and + DelegatingAnalyzerWrapper does the right thing automatically. (Adrien Grand) + Improvements * LUCENE-7439: FuzzyQuery now matches all terms within the specified diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java index b2de5e8b34b..466642c9f37 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java @@ -131,7 +131,7 @@ public final class CustomAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName) { - final Tokenizer tk = tokenizer.create(attributeFactory()); + final Tokenizer tk = tokenizer.create(attributeFactory(fieldName)); TokenStream ts = tk; for (final TokenFilterFactory filter : tokenFilters) { ts = filter.create(ts); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java index ea987315b07..4d0f03956f9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java @@ -85,7 +85,7 @@ public final class CollationKeyAnalyzer extends Analyzer { } @Override - protected AttributeFactory attributeFactory() { + protected AttributeFactory attributeFactory(String fieldName) { return factory; } diff --git a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java index aa4b42db6a0..3a5d41c999f 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java @@ -238,7 +238,7 @@ public abstract class Analyzer implements Closeable { throw new IllegalStateException("Normalization threw an unexpected exeption", e); } - final AttributeFactory attributeFactory = attributeFactory(); + final AttributeFactory attributeFactory = attributeFactory(fieldName); try (TokenStream ts = normalize(fieldName, new StringTokenStream(attributeFactory, filteredText, text.length()))) { final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); @@ -286,9 +286,10 @@ public abstract class Analyzer implements Closeable { /** Return the {@link AttributeFactory} to be used for * {@link #tokenStream analysis} and - * {@link #normalize(String, String) normalization}. The default - * implementation returns {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */ - protected AttributeFactory attributeFactory() { + * {@link #normalize(String, String) normalization} on the given + * {@code FieldName}. The default implementation returns + * {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */ + protected AttributeFactory attributeFactory(String fieldName) { return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY; } diff --git a/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java b/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java index 1e5640f71c0..d23d004d729 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java @@ -19,6 +19,8 @@ package org.apache.lucene.analysis; import java.io.Reader; +import org.apache.lucene.util.AttributeFactory; + /** * Extension to {@link Analyzer} suitable for Analyzers which wrap * other Analyzers. @@ -81,6 +83,22 @@ public abstract class AnalyzerWrapper extends Analyzer { return components; } + /** + * Wraps / alters the given TokenStream for normalization purposes, taken + * from the wrapped Analyzer, to form new components. It is through this + * method that new TokenFilters can be added by AnalyzerWrappers. By default, + * the given token stream are returned. + * + * @param fieldName + * Name of the field which is to be analyzed + * @param in + * TokenStream taken from the wrapped Analyzer + * @return Wrapped / altered TokenStreamComponents. + */ + protected TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) { + return in; + } + /** * Wraps / alters the given Reader. Through this method AnalyzerWrappers can * implement {@link #initReader(String, Reader)}. By default, the given reader @@ -95,12 +113,32 @@ public abstract class AnalyzerWrapper extends Analyzer { protected Reader wrapReader(String fieldName, Reader reader) { return reader; } - + + /** + * Wraps / alters the given Reader. Through this method AnalyzerWrappers can + * implement {@link #initReaderForNormalization(String, Reader)}. By default, + * the given reader is returned. + * + * @param fieldName + * name of the field which is to be analyzed + * @param reader + * the reader to wrap + * @return the wrapped reader + */ + protected Reader wrapReaderForNormalization(String fieldName, Reader reader) { + return reader; + } + @Override protected final TokenStreamComponents createComponents(String fieldName) { return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName)); } + @Override + protected final TokenStream normalize(String fieldName, TokenStream in) { + return wrapTokenStreamForNormalization(fieldName, getWrappedAnalyzer(fieldName).normalize(fieldName, in)); + } + @Override public int getPositionIncrementGap(String fieldName) { return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName); @@ -115,4 +153,14 @@ public abstract class AnalyzerWrapper extends Analyzer { public final Reader initReader(String fieldName, Reader reader) { return getWrappedAnalyzer(fieldName).initReader(fieldName, wrapReader(fieldName, reader)); } + + @Override + protected final Reader initReaderForNormalization(String fieldName, Reader reader) { + return getWrappedAnalyzer(fieldName).initReaderForNormalization(fieldName, wrapReaderForNormalization(fieldName, reader)); + } + + @Override + protected final AttributeFactory attributeFactory(String fieldName) { + return getWrappedAnalyzer(fieldName).attributeFactory(fieldName); + } } diff --git a/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java b/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java index 6f05d4d49d8..edf5b2b43ea 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java @@ -54,12 +54,22 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper { protected final TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { return super.wrapComponents(fieldName, components); } - + + @Override + protected final TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) { + return super.wrapTokenStreamForNormalization(fieldName, in); + } + @Override protected final Reader wrapReader(String fieldName, Reader reader) { return super.wrapReader(fieldName, reader); } - + + @Override + protected final Reader wrapReaderForNormalization(String fieldName, Reader reader) { + return super.wrapReaderForNormalization(fieldName, reader); + } + private static final class DelegatingReuseStrategy extends ReuseStrategy { DelegatingAnalyzerWrapper wrapper; private final ReuseStrategy fallbackStrategy; diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestDelegatingAnalyzerWrapper.java b/lucene/core/src/test/org/apache/lucene/analysis/TestDelegatingAnalyzerWrapper.java new file mode 100644 index 00000000000..1d6cf153aec --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestDelegatingAnalyzerWrapper.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis; + +import java.io.IOException; +import java.io.Reader; +import java.nio.charset.StandardCharsets; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; + +public class TestDelegatingAnalyzerWrapper extends LuceneTestCase { + + public void testDelegatesNormalization() { + Analyzer analyzer1 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) { + @Override + protected Analyzer getWrappedAnalyzer(String fieldName) { + return analyzer1; + } + }; + assertEquals(new BytesRef("Ab C"), w1.normalize("foo", "Ab C")); + + Analyzer analyzer2 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true); + DelegatingAnalyzerWrapper w2 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) { + @Override + protected Analyzer getWrappedAnalyzer(String fieldName) { + return analyzer2; + } + }; + assertEquals(new BytesRef("ab c"), w2.normalize("foo", "Ab C")); + } + + public void testDelegatesAttributeFactory() throws Exception { + Analyzer analyzer1 = new MockBytesAnalyzer(); + DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) { + @Override + protected Analyzer getWrappedAnalyzer(String fieldName) { + return analyzer1; + } + }; + assertEquals(new BytesRef("Ab C".getBytes(StandardCharsets.UTF_16LE)), w1.normalize("foo", "Ab C")); + } + + public void testDelegatesCharFilter() throws Exception { + Analyzer analyzer1 = new Analyzer() { + @Override + protected Reader initReaderForNormalization(String fieldName, Reader reader) { + return new DummyCharFilter(reader, 'b', 'z'); + } + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(attributeFactory(fieldName)); + return new TokenStreamComponents(tokenizer); + } + }; + DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) { + @Override + protected Analyzer getWrappedAnalyzer(String fieldName) { + return analyzer1; + } + }; + assertEquals(new BytesRef("az c"), w1.normalize("foo", "ab c")); + } + + private static class DummyCharFilter extends CharFilter { + + private final char match, repl; + + public DummyCharFilter(Reader input, char match, char repl) { + super(input); + this.match = match; + this.repl = repl; + } + + @Override + protected int correct(int currentOff) { + return currentOff; + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + final int read = input.read(cbuf, off, len); + for (int i = 0; i < read; ++i) { + if (cbuf[off+i] == match) { + cbuf[off+i] = repl; + } + } + return read; + } + + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java index b8cfc5be1d6..4d51717a0d2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java @@ -30,7 +30,7 @@ public final class MockBytesAnalyzer extends Analyzer { } @Override - protected AttributeFactory attributeFactory() { + protected AttributeFactory attributeFactory(String fieldName) { return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY; } } diff --git a/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java b/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java index a5afbeccd8e..ab5458c4a72 100644 --- a/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java +++ b/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java @@ -99,7 +99,7 @@ public final class TokenizerChain extends SolrAnalyzer { @Override protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tk = tokenizer.create(attributeFactory()); + Tokenizer tk = tokenizer.create(attributeFactory(fieldName)); TokenStream ts = tk; for (TokenFilterFactory filter : filters) { ts = filter.create(ts);