diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 56da726c03b..0142f465af4 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -83,6 +83,9 @@ API Changes * LUCENE-7868: IndexWriterConfig.setMaxBufferedDeleteTerms is removed. (Simon Willnauer, Mike McCandless) +* LUCENE-7877: PrefixAwareTokenFilter is replaced with ConcatenatingTokenStream + (Alan Woodward, Uwe Schindler, Adrien Grand) + Bug Fixes * LUCENE-7626: IndexWriter will no longer accept broken token offsets diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 10ce4bc5b38..089d196ba9a 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -131,3 +131,9 @@ they might have more than 2B matches in total. However TopDocs instances returned by IndexSearcher will still have a total number of hits which is less than 2B since Lucene indexes are still bound to at most 2B documents, so it can safely be casted to an int in that case. + +## PrefixAwareTokenFilter and PrefixAndSuffixAwareTokenFilter removed +(LUCENE-7877) + +Instead use ConcatenatingTokenStream, which will allow for the use of custom +attributes. diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java new file mode 100644 index 00000000000..960cae1876b --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.IOUtils; + +/** + * A TokenStream that takes an array of input TokenStreams as sources, and + * concatenates them together. + * + * Offsets from the second and subsequent sources are incremented to behave + * as if all the inputs were from a single source. + * + * All of the input TokenStreams must have the same attribute implementations + */ +public final class ConcatenatingTokenStream extends TokenStream { + + private final TokenStream[] sources; + private final OffsetAttribute[] sourceOffsets; + private final OffsetAttribute offsetAtt; + + private int currentSource; + private int offsetIncrement; + + /** + * Create a new ConcatenatingTokenStream from a set of inputs + * @param sources an array of TokenStream inputs to concatenate + */ + public ConcatenatingTokenStream(TokenStream... sources) { + super(combineSources(sources)); + this.sources = sources; + this.offsetAtt = addAttribute(OffsetAttribute.class); + this.sourceOffsets = new OffsetAttribute[sources.length]; + for (int i = 0; i < sources.length; i++) { + this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class); + } + } + + private static AttributeSource combineSources(TokenStream... 
sources) { + AttributeSource base = sources[0].cloneAttributes(); + try { + for (int i = 1; i < sources.length; i++) { + Iterator<Class<? extends Attribute>> it = sources[i].getAttributeClassesIterator(); + while (it.hasNext()) { + base.addAttribute(it.next()); + } + // check attributes can be captured + sources[i].copyTo(base); + } + return base; + } + catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Attempted to concatenate TokenStreams with different attribute types", e); + } + } + + @Override + public boolean incrementToken() throws IOException { + while (sources[currentSource].incrementToken() == false) { + if (currentSource >= sources.length - 1) + return false; + sources[currentSource].end(); + OffsetAttribute att = sourceOffsets[currentSource]; + if (att != null) + offsetIncrement += att.endOffset(); + currentSource++; + } + + clearAttributes(); + sources[currentSource].copyTo(this); + offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement); + + return true; + } + + @Override + public void end() throws IOException { + sources[currentSource].end(); + super.end(); + } + + @Override + public void reset() throws IOException { + for (TokenStream source : sources) { + source.reset(); + } + super.reset(); + } + + @Override + public void close() throws IOException { + try { + IOUtils.close(sources); + } + finally { + super.close(); + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java deleted file mode 100644 index ee669e02ac3..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.miscellaneous; - - -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; - -import java.io.IOException; - -/** - * Links two {@link PrefixAwareTokenFilter}. - *

- * NOTE: This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than - * the ones located in org.apache.lucene.analysis.tokenattributes. - */ -public class PrefixAndSuffixAwareTokenFilter extends TokenStream { - - private PrefixAwareTokenFilter suffix; - - public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) { - super(suffix); - prefix = new PrefixAwareTokenFilter(prefix, input) { - @Override - public Token updateSuffixToken(Token suffixToken, Token lastInputToken) { - return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken); - } - }; - this.suffix = new PrefixAwareTokenFilter(prefix, suffix) { - @Override - public Token updateSuffixToken(Token suffixToken, Token lastInputToken) { - return PrefixAndSuffixAwareTokenFilter.this.updateSuffixToken(suffixToken, lastInputToken); - } - }; - } - - public Token updateInputToken(Token inputToken, Token lastPrefixToken) { - inputToken.setOffset(lastPrefixToken.endOffset() + inputToken.startOffset(), - lastPrefixToken.endOffset() + inputToken.endOffset()); - return inputToken; - } - - public Token updateSuffixToken(Token suffixToken, Token lastInputToken) { - suffixToken.setOffset(lastInputToken.endOffset() + suffixToken.startOffset(), - lastInputToken.endOffset() + suffixToken.endOffset()); - return suffixToken; - } - - - @Override - public final boolean incrementToken() throws IOException { - return suffix.incrementToken(); - } - - @Override - public void reset() throws IOException { - suffix.reset(); - } - - - @Override - public void close() throws IOException { - suffix.close(); - } - - @Override - public void end() throws IOException { - suffix.end(); - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java deleted file mode 100644 index 
cb866bdd3db..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.miscellaneous; - - -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.BytesRef; - -import java.io.IOException; - - -/** - * Joins two token streams and leaves the last token of the first stream available - * to be used when updating the token values in the second stream based on that token. - * - * The default implementation adds last prefix token end offset to the suffix token start and end offsets. - *

- * NOTE: This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than - * the ones located in org.apache.lucene.analysis.tokenattributes. - */ -public class PrefixAwareTokenFilter extends TokenStream { - - private TokenStream prefix; - private TokenStream suffix; - - private CharTermAttribute termAtt; - private PositionIncrementAttribute posIncrAtt; - private PayloadAttribute payloadAtt; - private OffsetAttribute offsetAtt; - private TypeAttribute typeAtt; - private FlagsAttribute flagsAtt; - - private CharTermAttribute p_termAtt; - private PositionIncrementAttribute p_posIncrAtt; - private PayloadAttribute p_payloadAtt; - private OffsetAttribute p_offsetAtt; - private TypeAttribute p_typeAtt; - private FlagsAttribute p_flagsAtt; - - public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) { - super(suffix); - this.suffix = suffix; - this.prefix = prefix; - prefixExhausted = false; - - termAtt = addAttribute(CharTermAttribute.class); - posIncrAtt = addAttribute(PositionIncrementAttribute.class); - payloadAtt = addAttribute(PayloadAttribute.class); - offsetAtt = addAttribute(OffsetAttribute.class); - typeAtt = addAttribute(TypeAttribute.class); - flagsAtt = addAttribute(FlagsAttribute.class); - - p_termAtt = prefix.addAttribute(CharTermAttribute.class); - p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class); - p_payloadAtt = prefix.addAttribute(PayloadAttribute.class); - p_offsetAtt = prefix.addAttribute(OffsetAttribute.class); - p_typeAtt = prefix.addAttribute(TypeAttribute.class); - p_flagsAtt = prefix.addAttribute(FlagsAttribute.class); - } - - private Token previousPrefixToken = new Token(); - private Token reusableToken = new Token(); - - private boolean prefixExhausted; - - @Override - public final boolean incrementToken() throws IOException { - if (!prefixExhausted) { - Token nextToken = getNextPrefixInputToken(reusableToken); - if (nextToken == null) { - prefixExhausted = true; - } else { - 
previousPrefixToken.reinit(nextToken); - // Make it a deep copy - BytesRef p = previousPrefixToken.getPayload(); - if (p != null) { - previousPrefixToken.setPayload(p.clone()); - } - setCurrentToken(nextToken); - return true; - } - } - - Token nextToken = getNextSuffixInputToken(reusableToken); - if (nextToken == null) { - return false; - } - - nextToken = updateSuffixToken(nextToken, previousPrefixToken); - setCurrentToken(nextToken); - return true; - } - - private void setCurrentToken(Token token) { - if (token == null) return; - clearAttributes(); - termAtt.copyBuffer(token.buffer(), 0, token.length()); - posIncrAtt.setPositionIncrement(token.getPositionIncrement()); - flagsAtt.setFlags(token.getFlags()); - offsetAtt.setOffset(token.startOffset(), token.endOffset()); - typeAtt.setType(token.type()); - payloadAtt.setPayload(token.getPayload()); - } - - private Token getNextPrefixInputToken(Token token) throws IOException { - if (!prefix.incrementToken()) return null; - token.copyBuffer(p_termAtt.buffer(), 0, p_termAtt.length()); - token.setPositionIncrement(p_posIncrAtt.getPositionIncrement()); - token.setFlags(p_flagsAtt.getFlags()); - token.setOffset(p_offsetAtt.startOffset(), p_offsetAtt.endOffset()); - token.setType(p_typeAtt.type()); - token.setPayload(p_payloadAtt.getPayload()); - return token; - } - - private Token getNextSuffixInputToken(Token token) throws IOException { - if (!suffix.incrementToken()) return null; - token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); - token.setPositionIncrement(posIncrAtt.getPositionIncrement()); - token.setFlags(flagsAtt.getFlags()); - token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); - token.setType(typeAtt.type()); - token.setPayload(payloadAtt.getPayload()); - return token; - } - - /** - * The default implementation adds last prefix token end offset to the suffix token start and end offsets. 
- * - * @param suffixToken a token from the suffix stream - * @param lastPrefixToken the last token from the prefix stream - * @return consumer token - */ - public Token updateSuffixToken(Token suffixToken, Token lastPrefixToken) { - suffixToken.setOffset(lastPrefixToken.endOffset() + suffixToken.startOffset(), - lastPrefixToken.endOffset() + suffixToken.endOffset()); - return suffixToken; - } - - @Override - public void end() throws IOException { - prefix.end(); - suffix.end(); - } - - @Override - public void close() throws IOException { - prefix.close(); - suffix.close(); - } - - @Override - public void reset() throws IOException { - super.reset(); - if (prefix != null) { - prefixExhausted = false; - prefix.reset(); - } - if (suffix != null) { - suffix.reset(); - } - - - } - - public TokenStream getPrefix() { - return prefix; - } - - public void setPrefix(TokenStream prefix) { - this.prefix = prefix; - } - - public TokenStream getSuffix() { - return suffix; - } - - public void setSuffix(TokenStream suffix) { - this.suffix = suffix; - } -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java new file mode 100644 index 00000000000..258f9b8632f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.util.AttributeFactory; + +public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase { + + public void testBasic() throws IOException { + + AttributeFactory factory = newAttributeFactory(); + + final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false); + first.setReader(new StringReader("first words ")); + final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false); + second.setReader(new StringReader("second words")); + final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false); + third.setReader(new StringReader(" third words")); + + TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third); + assertTokenStreamContents(ts, + new String[] { "first", "words", "second", "words", "third", "words" }, + new int[]{ 0, 6, 12, 19, 25, 31 }, + new int[]{ 5, 11, 18, 24, 30, 36 }); + + } + + public void testInconsistentAttributes() throws IOException { + + AttributeFactory factory = newAttributeFactory(); + + final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, 
false); + first.setReader(new StringReader("first words ")); + first.addAttribute(PayloadAttribute.class); + final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false); + second.setReader(new StringReader("second words")); + second.addAttribute(FlagsAttribute.class); + + TokenStream ts = new ConcatenatingTokenStream(first, second); + assertTrue(ts.hasAttribute(FlagsAttribute.class)); + assertTrue(ts.hasAttribute(PayloadAttribute.class)); + + assertTokenStreamContents(ts, + new String[] { "first", "words", "second", "words" }, + new int[]{ 0, 6, 12, 19, }, + new int[]{ 5, 11, 18, 24, }); + + } + + public void testInconsistentAttributeFactories() throws IOException { + + final MockTokenizer first = new MockTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true); + final MockTokenizer second = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true); + + expectThrows(IllegalArgumentException.class, () -> new ConcatenatingTokenStream(first, second)); + + } + +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java deleted file mode 100644 index 0e6c61a7c01..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.miscellaneous; - - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CannedTokenStream; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.Token; - -import java.io.IOException; -import java.io.StringReader; - -public class TestPrefixAndSuffixAwareTokenFilter extends BaseTokenStreamTestCase { - - public void test() throws IOException { - - final MockTokenizer input = new MockTokenizer(MockTokenizer.WHITESPACE, false); - input.setReader(new StringReader("hello world")); - PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter( - new CannedTokenStream(createToken("^", 0, 0)), - input, - new CannedTokenStream(createToken("$", 0, 0))); - - assertTokenStreamContents(ts, - new String[] { "^", "hello", "world", "$" }, - new int[] { 0, 0, 6, 11 }, - new int[] { 0, 5, 11, 11 }); - } - - private static Token createToken(String term, int start, int offset) - { - return new Token(term, start, offset); - } -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java deleted file mode 100644 index c407c790817..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license 
agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.miscellaneous; - - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CannedTokenStream; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.Token; - -import java.io.IOException; -import java.io.StringReader; - -public class TestPrefixAwareTokenFilter extends BaseTokenStreamTestCase { - - public void test() throws IOException { - - PrefixAwareTokenFilter ts; - - ts = new PrefixAwareTokenFilter( - new CannedTokenStream(createToken("a", 0, 1)), - new CannedTokenStream(createToken("b", 0, 1))); - assertTokenStreamContents(ts, - new String[] { "a", "b" }, - new int[] { 0, 1 }, - new int[] { 1, 2 }); - - // prefix and suffix using 2x prefix - - final MockTokenizer suffix = new MockTokenizer(MockTokenizer.WHITESPACE, false); - suffix.setReader(new StringReader("hello world")); - ts = new PrefixAwareTokenFilter(new CannedTokenStream(createToken("^", 0, 0)), - suffix); - ts = new PrefixAwareTokenFilter(ts, new CannedTokenStream(createToken("$", 0, 0))); - - assertTokenStreamContents(ts, - new String[] { "^", "hello", "world", "$" }, - new int[] { 0, 0, 6, 11 }, - new int[] { 0, 5, 11, 11 }); - } - - private static Token createToken(String term, int start, int 
offset) - { - return new Token(term, start, offset); - } -}