From 3af0c6872a631345d9d59ae5855aa219f20fe6c0 Mon Sep 17 00:00:00 2001 From: Matthias Osswald Date: Mon, 31 Jul 2023 14:47:26 +0200 Subject: [PATCH] Add Option to Set Subtoken Position Increment for Dictionary Decompounder This pull request adds a new feature to Lucene's DictionaryDecompounder: the position increment of subtokens can now be set to one. This feature is required for AND searches that involve subtokens. Currently, the DictionaryDecompounder always emits subtokens with a position increment of zero. With this update, users can set subtokenPositionIncrement to one, which changes the position increment of the subtokens to one. This means that if you use the AND operator in Elasticsearch match clauses to search for 'orangenschokolade', and 'orangen' and 'schokolade' are in your dictionary, it will correctly search for 'orangen AND schokolade'. By default, the DictionaryDecompounder emits the original compounded token; this behavior remains unchanged when the flag is set to zero. However, when the flag is set to one, the DictionaryDecompounder outputs only the individual subtokens, and the original compounded token is not emitted. 
--- .../compound/CompoundWordTokenFilterBase.java | 69 +++++++++++++------ .../DictionaryCompoundWordTokenFilter.java | 16 +++-- ...tionaryCompoundWordTokenFilterFactory.java | 11 ++- .../HyphenationCompoundWordTokenFilter.java | 2 +- .../compound/TestCompoundWordTokenFilter.java | 15 ++-- ...tionaryCompoundWordTokenFilterFactory.java | 26 +++++++ 6 files changed, 106 insertions(+), 33 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java index bc5f4b825f5..65fcc45fe1a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java @@ -42,6 +42,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { protected final int minSubwordSize; protected final int maxSubwordSize; protected final boolean onlyLongestMatch; + protected final int subtokenPositionIncrement; protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); @@ -58,7 +59,8 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, - onlyLongestMatch); + onlyLongestMatch, + 0); } protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary) { @@ -68,7 +70,20 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); + } + + protected CompoundWordTokenFilterBase( + TokenStream input, CharArraySet dictionary, int subtokenPositionIncrement) { + this( + input, + dictionary, + DEFAULT_MIN_WORD_SIZE, + DEFAULT_MIN_SUBWORD_SIZE, + 
DEFAULT_MAX_SUBWORD_SIZE, + false, + subtokenPositionIncrement); } protected CompoundWordTokenFilterBase( @@ -77,7 +92,8 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { int minWordSize, int minSubwordSize, int maxSubwordSize, - boolean onlyLongestMatch) { + boolean onlyLongestMatch, + int subtokenPositionIncrement) { super(input); this.tokens = new LinkedList<>(); if (minWordSize < 0) { @@ -91,6 +107,10 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { if (maxSubwordSize < 0) { throw new IllegalArgumentException("maxSubwordSize cannot be negative"); } + if (subtokenPositionIncrement != 0 && subtokenPositionIncrement != 1) { + throw new IllegalArgumentException("subtokenPositionIncrement must either be 0 or 1"); + } + this.subtokenPositionIncrement = subtokenPositionIncrement; this.maxSubwordSize = maxSubwordSize; this.onlyLongestMatch = onlyLongestMatch; this.dictionary = dictionary; @@ -98,31 +118,40 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { @Override public final boolean incrementToken() throws IOException { + if (!tokens.isEmpty()) { + return processSubtokens(); + } + current = null; // For safety + if (!input.incrementToken()) { + return false; + } + if (termAtt.length() >= minWordSize) { + decompose(); + if (!tokens.isEmpty()) { + current = captureState(); + if (subtokenPositionIncrement == 1) { + // provided that we have sub-tokens with increment one, + // we don't want to write the original token into the output + return processSubtokens(); + } + } else if (subtokenPositionIncrement == 1) { + current = captureState(); + } + } + return true; // Return original token + } + + private boolean processSubtokens() { if (!tokens.isEmpty()) { assert current != null; CompoundToken token = tokens.removeFirst(); restoreState(current); // keep all other attributes untouched termAtt.setEmpty().append(token.txt); offsetAtt.setOffset(token.startOffset, token.endOffset); - 
posIncAtt.setPositionIncrement(0); + posIncAtt.setPositionIncrement(this.subtokenPositionIncrement); return true; } - - current = null; // not really needed, but for safety - if (input.incrementToken()) { - // Only words longer than minWordSize get processed - if (termAtt.length() >= this.minWordSize) { - decompose(); - // only capture the state if we really need it for producing new tokens - if (!tokens.isEmpty()) { - current = captureState(); - } - } - // return original token: - return true; - } else { - return false; - } + return false; } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java index c6278a80a1f..7495bd643d0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java @@ -50,6 +50,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa * @param minSubwordSize only subwords longer than this get to the output stream * @param maxSubwordSize only subwords shorter than this get to the output stream * @param onlyLongestMatch Add only the longest matching subword to the stream + * @param subtokenPositionIncrement set a positional increment for subtokens to 0 or 1. 
*/ public DictionaryCompoundWordTokenFilter( TokenStream input, @@ -57,11 +58,16 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa int minWordSize, int minSubwordSize, int maxSubwordSize, - boolean onlyLongestMatch) { - super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); - if (dictionary == null) { - throw new IllegalArgumentException("dictionary must not be null"); - } + boolean onlyLongestMatch, + int subtokenPositionIncrement) { + super( + input, + dictionary, + minWordSize, + minSubwordSize, + maxSubwordSize, + onlyLongestMatch, + subtokenPositionIncrement); } @Override diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java index 69819736d69..97d5e73af83 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java @@ -51,7 +51,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory private final int minSubwordSize; private final int maxSubwordSize; private final boolean onlyLongestMatch; - + private final int subtokenPositionIncrement; /** Creates a new DictionaryCompoundWordTokenFilterFactory */ public DictionaryCompoundWordTokenFilterFactory(Map args) { super(args); @@ -62,6 +62,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory maxSubwordSize = getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true); + subtokenPositionIncrement = getInt(args, "subtokenPositionIncrement", 0); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ 
-84,6 +85,12 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory return input; } return new DictionaryCompoundWordTokenFilter( - input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); + input, + dictionary, + minWordSize, + minSubwordSize, + maxSubwordSize, + onlyLongestMatch, + subtokenPositionIncrement); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java index 254d6c61b48..1540c5e7210 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java @@ -113,7 +113,7 @@ public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterB boolean onlyLongestMatch, boolean noSubMatches, boolean noOverlappingMatches) { - super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); + super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch, 0); this.hyphenator = Objects.requireNonNull(hyphenator, "hyphenator"); this.noSubMatches = noSubMatches; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java index 446f25d8d1d..e9bca6dfca4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -251,7 +251,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - true); + true, + 0); assertTokenStreamContents( tf, @@ -275,7 +276,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); assertTokenStreamContents( tf, @@ -297,7 +299,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); // since "d" is shorter than the minimum subword size, it should not be added to the token // stream @@ -323,7 +326,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class); tf.reset(); @@ -351,7 +355,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, - false); + false, + 0); MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class); stream.reset(); while (stream.incrementToken()) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java index cfe2f432b65..1ccc31d9fa1 100644 --- 
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java @@ -18,6 +18,8 @@ package org.apache.lucene.analysis.compound; import java.io.Reader; import java.io.StringReader; +import java.util.Arrays; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase; @@ -25,6 +27,11 @@ import org.apache.lucene.tests.analysis.MockTokenizer; /** Simple tests to ensure the Dictionary compound filter factory is working. */ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase { + + private static CharArraySet makeDictionary(String... dictionary) { + return new CharArraySet(Arrays.asList(dictionary), true); + } + /** Ensure the filter actually decompounds text. 
*/ public void testDecompounding() throws Exception { Reader reader = new StringReader("I like to play softball"); @@ -37,6 +44,25 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStrea stream, new String[] {"I", "like", "to", "play", "softball", "soft", "ball"}); } + /** Ensure subtoken can be set with a positional increment of 1 * */ + public void testDecompounderWithSubtokenIncrement() throws Exception { + CharArraySet dict = makeDictionary("læse", "hest"); + + DictionaryCompoundWordTokenFilter tf = + new DictionaryCompoundWordTokenFilter( + whitespaceMockTokenizer("min veninde som er lidt af en læsehest"), + dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + false, + 1); + assertTokenStreamContents( + tf, + new String[] {"min", "veninde", "som", "er", "lidt", "af", "en", "læse", "hest"}, + new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1}); + } + /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { IllegalArgumentException expected =