diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java index bc5f4b825f5..65fcc45fe1a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java @@ -42,6 +42,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { protected final int minSubwordSize; protected final int maxSubwordSize; protected final boolean onlyLongestMatch; + protected final int subtokenPositionIncrement; protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); @@ -58,7 +59,8 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, - onlyLongestMatch); + onlyLongestMatch, + 0); } protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary) { @@ -68,7 +70,20 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); + } + + protected CompoundWordTokenFilterBase( + TokenStream input, CharArraySet dictionary, int subtokenPositionIncrement) { + this( + input, + dictionary, + DEFAULT_MIN_WORD_SIZE, + DEFAULT_MIN_SUBWORD_SIZE, + DEFAULT_MAX_SUBWORD_SIZE, + false, + subtokenPositionIncrement); } protected CompoundWordTokenFilterBase( @@ -77,7 +92,8 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { int minWordSize, int minSubwordSize, int maxSubwordSize, - boolean onlyLongestMatch) { + boolean onlyLongestMatch, + int subtokenPositionIncrement) { super(input); this.tokens = new LinkedList<>(); if (minWordSize < 
0) { @@ -91,6 +107,10 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { if (maxSubwordSize < 0) { throw new IllegalArgumentException("maxSubwordSize cannot be negative"); } + if (subtokenPositionIncrement != 0 && subtokenPositionIncrement != 1) { + throw new IllegalArgumentException("subtokenPositionIncrement must either be 0 or 1"); + } + this.subtokenPositionIncrement = subtokenPositionIncrement; this.maxSubwordSize = maxSubwordSize; this.onlyLongestMatch = onlyLongestMatch; this.dictionary = dictionary; @@ -98,31 +118,40 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { @Override public final boolean incrementToken() throws IOException { + if (!tokens.isEmpty()) { + return processSubtokens(); + } + current = null; // For safety + if (!input.incrementToken()) { + return false; + } + if (termAtt.length() >= minWordSize) { + decompose(); + if (!tokens.isEmpty()) { + current = captureState(); + if (subtokenPositionIncrement == 1) { + // provided that we have sub-tokens with increment one, + // we don't want to write the original token into the output + return processSubtokens(); + } + } else if (subtokenPositionIncrement == 1) { + current = captureState(); + } + } + return true; // Return original token + } + + private boolean processSubtokens() { if (!tokens.isEmpty()) { assert current != null; CompoundToken token = tokens.removeFirst(); restoreState(current); // keep all other attributes untouched termAtt.setEmpty().append(token.txt); offsetAtt.setOffset(token.startOffset, token.endOffset); - posIncAtt.setPositionIncrement(0); + posIncAtt.setPositionIncrement(this.subtokenPositionIncrement); return true; } - - current = null; // not really needed, but for safety - if (input.incrementToken()) { - // Only words longer than minWordSize get processed - if (termAtt.length() >= this.minWordSize) { - decompose(); - // only capture the state if we really need it for producing new tokens - if (!tokens.isEmpty()) { - current 
= captureState(); - } - } - // return original token: - return true; - } else { - return false; - } + return false; } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java index c6278a80a1f..7495bd643d0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java @@ -50,6 +50,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa * @param minSubwordSize only subwords longer than this get to the output stream * @param maxSubwordSize only subwords shorter than this get to the output stream * @param onlyLongestMatch Add only the longest matching subword to the stream + * @param subtokenPositionIncrement set a positional increment for subtokens to 0 or 1. 
*/ public DictionaryCompoundWordTokenFilter( TokenStream input, @@ -57,11 +58,16 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa int minWordSize, int minSubwordSize, int maxSubwordSize, - boolean onlyLongestMatch) { - super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); - if (dictionary == null) { - throw new IllegalArgumentException("dictionary must not be null"); - } + boolean onlyLongestMatch, + int subtokenPositionIncrement) { + super( + input, + dictionary, + minWordSize, + minSubwordSize, + maxSubwordSize, + onlyLongestMatch, + subtokenPositionIncrement); } @Override diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java index 69819736d69..97d5e73af83 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java @@ -51,7 +51,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory private final int minSubwordSize; private final int maxSubwordSize; private final boolean onlyLongestMatch; - + private final int subtokenPositionIncrement; /** Creates a new DictionaryCompoundWordTokenFilterFactory */ public DictionaryCompoundWordTokenFilterFactory(Map args) { super(args); @@ -62,6 +62,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true); + subtokenPositionIncrement = getInt(args, "subtokenPositionIncrement", 0); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ 
-84,6 +85,12 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory return input; } return new DictionaryCompoundWordTokenFilter( - input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); + input, + dictionary, + minWordSize, + minSubwordSize, + maxSubwordSize, + onlyLongestMatch, + subtokenPositionIncrement); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java index 254d6c61b48..1540c5e7210 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java @@ -113,7 +113,7 @@ public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterB boolean onlyLongestMatch, boolean noSubMatches, boolean noOverlappingMatches) { - super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); + super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch, 0); this.hyphenator = Objects.requireNonNull(hyphenator, "hyphenator"); this.noSubMatches = noSubMatches; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java index 446f25d8d1d..e9bca6dfca4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -251,7 +251,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - true); + true, + 0); assertTokenStreamContents( tf, @@ -275,7 +276,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); assertTokenStreamContents( tf, @@ -297,7 +299,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); // since "d" is shorter than the minimum subword size, it should not be added to the token // stream @@ -323,7 +326,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class); tf.reset(); @@ -351,7 +355,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class); stream.reset(); while (stream.incrementToken()) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java index cfe2f432b65..1ccc31d9fa1 100644 --- 
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java @@ -18,6 +18,8 @@ package org.apache.lucene.analysis.compound; import java.io.Reader; import java.io.StringReader; +import java.util.Arrays; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase; @@ -25,6 +27,11 @@ import org.apache.lucene.tests.analysis.MockTokenizer; /** Simple tests to ensure the Dictionary compound filter factory is working. */ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase { + + private static CharArraySet makeDictionary(String... dictionary) { + return new CharArraySet(Arrays.asList(dictionary), true); + } + /** Ensure the filter actually decompounds text. 
*/ public void testDecompounding() throws Exception { Reader reader = new StringReader("I like to play softball"); @@ -37,6 +44,25 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStrea stream, new String[] {"I", "like", "to", "play", "softball", "soft", "ball"}); } + /** Ensure subtokens can be emitted with a position increment of 1 */ + public void testDecompounderWithSubtokenIncrement() throws Exception { + CharArraySet dict = makeDictionary("læse", "hest"); + + DictionaryCompoundWordTokenFilter tf = + new DictionaryCompoundWordTokenFilter( + whitespaceMockTokenizer("min veninde som er lidt af en læsehest"), + dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + false, + 1); + assertTokenStreamContents( + tf, + new String[] {"min", "veninde", "som", "er", "lidt", "af", "en", "læse", "hest"}, + new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1}); + } + /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { IllegalArgumentException expected =