Add Option to Set Subtoken Position Increment for Dictionary Decompounder

This pull request adds an option to Lucene's DictionaryCompoundWordTokenFilter (the dictionary decompounder) to set the position increment of emitted subtokens to one. This is needed for AND searches that involve subtokens.

Currently the filter always emits subtokens with a position increment of zero, i.e. at the same position as the original compound token. With this change, users can set subtokenPositionIncrement to one so that each subtoken occupies its own position. For example, an Elasticsearch match clause using the AND operator to search for 'orangenschokolade', with 'orangen' and 'schokolade' in the dictionary, then effectively searches for 'orangen AND schokolade'.

By default the filter emits the original compound token along with its subtokens; this behavior is unchanged when subtokenPositionIncrement is zero. When it is set to one, only the individual subtokens are emitted and the original compound token is dropped.
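
To make the new behavior concrete, here is a minimal sketch of driving the filter directly with the new parameter. It assumes this patch is applied (the seven-argument constructor only exists with this change); the class name SubtokenIncrementDemo and the sample dictionary are just for illustration.

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class SubtokenIncrementDemo {
  public static void main(String[] args) throws Exception {
    CharArraySet dict = new CharArraySet(Arrays.asList("orangen", "schokolade"), true);

    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("orangenschokolade"));

    // subtokenPositionIncrement = 1: emit only the subtokens, each at its own position
    TokenStream ts =
        new DictionaryCompoundWordTokenFilter(
            tokenizer,
            dict,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            false,
            1);

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
    }
    ts.end();
    ts.close();
  }
}

With subtokenPositionIncrement set to 1 this prints 'orangen' and 'schokolade', each with a position increment of 1; with the default of 0 it would print 'orangenschokolade' followed by the two subtokens at increment 0.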
Matthias Osswald 2023-07-31 14:47:26 +02:00
parent 5e725964a0
commit 3af0c6872a
6 changed files with 106 additions and 33 deletions

CompoundWordTokenFilterBase.java

@@ -42,6 +42,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
protected final int minSubwordSize;
protected final int maxSubwordSize;
protected final boolean onlyLongestMatch;
protected final int subtokenPositionIncrement;
protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -58,7 +59,8 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE,
DEFAULT_MAX_SUBWORD_SIZE,
onlyLongestMatch);
onlyLongestMatch,
0);
}
protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary) {
@@ -68,7 +70,20 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE,
DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
}
protected CompoundWordTokenFilterBase(
TokenStream input, CharArraySet dictionary, int subtokenPositionIncrement) {
this(
input,
dictionary,
DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE,
DEFAULT_MAX_SUBWORD_SIZE,
false,
subtokenPositionIncrement);
}
protected CompoundWordTokenFilterBase(
@@ -77,7 +92,8 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
boolean onlyLongestMatch) {
boolean onlyLongestMatch,
int subtokenPositionIncrement) {
super(input);
this.tokens = new LinkedList<>();
if (minWordSize < 0) {
@@ -91,6 +107,10 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
if (maxSubwordSize < 0) {
throw new IllegalArgumentException("maxSubwordSize cannot be negative");
}
if (subtokenPositionIncrement != 0 && subtokenPositionIncrement != 1) {
throw new IllegalArgumentException("subtokenPositionIncrement must either be 0 or 1");
}
this.subtokenPositionIncrement = subtokenPositionIncrement;
this.maxSubwordSize = maxSubwordSize;
this.onlyLongestMatch = onlyLongestMatch;
this.dictionary = dictionary;
@@ -98,31 +118,40 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
@Override
public final boolean incrementToken() throws IOException {
if (!tokens.isEmpty()) {
return processSubtokens();
}
current = null; // For safety
if (!input.incrementToken()) {
return false;
}
if (termAtt.length() >= minWordSize) {
decompose();
if (!tokens.isEmpty()) {
current = captureState();
if (subtokenPositionIncrement == 1) {
// provided that we have sub-tokens with increment one,
// we don't want to write the original token into the output
return processSubtokens();
}
} else if (subtokenPositionIncrement == 1) {
current = captureState();
}
}
return true; // Return original token
}
private boolean processSubtokens() {
if (!tokens.isEmpty()) {
assert current != null;
CompoundToken token = tokens.removeFirst();
restoreState(current); // keep all other attributes untouched
termAtt.setEmpty().append(token.txt);
offsetAtt.setOffset(token.startOffset, token.endOffset);
posIncAtt.setPositionIncrement(0);
posIncAtt.setPositionIncrement(this.subtokenPositionIncrement);
return true;
}
current = null; // not really needed, but for safety
if (input.incrementToken()) {
// Only words longer than minWordSize get processed
if (termAtt.length() >= this.minWordSize) {
decompose();
// only capture the state if we really need it for producing new tokens
if (!tokens.isEmpty()) {
current = captureState();
}
}
// return original token:
return true;
} else {
return false;
}
return false;
}
/**

DictionaryCompoundWordTokenFilter.java

@@ -50,6 +50,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase
* @param minSubwordSize only subwords longer than this get to the output stream
* @param maxSubwordSize only subwords shorter than this get to the output stream
* @param onlyLongestMatch Add only the longest matching subword to the stream
* @param subtokenPositionIncrement position increment applied to emitted subtokens; must be 0 or 1
*/
public DictionaryCompoundWordTokenFilter(
TokenStream input,
@@ -57,11 +58,16 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase
int minWordSize,
int minSubwordSize,
int maxSubwordSize,
boolean onlyLongestMatch) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
if (dictionary == null) {
throw new IllegalArgumentException("dictionary must not be null");
}
boolean onlyLongestMatch,
int subtokenPositionIncrement) {
super(
input,
dictionary,
minWordSize,
minSubwordSize,
maxSubwordSize,
onlyLongestMatch,
subtokenPositionIncrement);
}
@Override

DictionaryCompoundWordTokenFilterFactory.java

@@ -51,7 +51,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
private final int minSubwordSize;
private final int maxSubwordSize;
private final boolean onlyLongestMatch;
private final int subtokenPositionIncrement;
/** Creates a new DictionaryCompoundWordTokenFilterFactory */
public DictionaryCompoundWordTokenFilterFactory(Map<String, String> args) {
super(args);
@@ -62,6 +62,7 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
maxSubwordSize =
getInt(args, "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
onlyLongestMatch = getBoolean(args, "onlyLongestMatch", true);
subtokenPositionIncrement = getInt(args, "subtokenPositionIncrement", 0);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -84,6 +85,12 @@ public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory
return input;
}
return new DictionaryCompoundWordTokenFilter(
input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
input,
dictionary,
minWordSize,
minSubwordSize,
maxSubwordSize,
onlyLongestMatch,
subtokenPositionIncrement);
}
}
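
As a usage note, the factory change above means subtokenPositionIncrement can be passed like any other factory argument. Below is a hypothetical sketch via Lucene's CustomAnalyzer; it assumes this patch is applied, that the factory is registered under the SPI name dictionaryCompoundWord, and the config directory and compound-dict.txt dictionary file are placeholders.

import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DecompounderFactorySketch {
  public static void main(String[] args) throws Exception {
    // Placeholder config dir and dictionary file; the dictionary must be resolvable
    // by the resource loader so that inform() can load it before create() is called.
    Analyzer analyzer =
        CustomAnalyzer.builder(Paths.get("conf"))
            .withTokenizer("whitespace")
            .addTokenFilter(
                "dictionaryCompoundWord",
                "dictionary", "compound-dict.txt",
                "subtokenPositionIncrement", "1")
            .build();

    try (TokenStream ts = analyzer.tokenStream("field", "orangenschokolade")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // with increment 1, only the subtokens are emitted
      }
      ts.end();
    }
  }
}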

HyphenationCompoundWordTokenFilter.java

@@ -113,7 +113,7 @@ public class HyphenationCompoundWordTokenFilter extends CompoundWordTokenFilterBase
boolean onlyLongestMatch,
boolean noSubMatches,
boolean noOverlappingMatches) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch, 0);
this.hyphenator = Objects.requireNonNull(hyphenator, "hyphenator");
this.noSubMatches = noSubMatches;

TestCompoundWordTokenFilter.java

@@ -251,7 +251,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
true);
true,
0);
assertTokenStreamContents(
tf,
@@ -275,7 +276,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
assertTokenStreamContents(
tf,
@@ -297,7 +299,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
// since "d" is shorter than the minimum subword size, it should not be added to the token
// stream
@@ -323,7 +326,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
tf.reset();
@@ -351,7 +355,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
false,
0);
MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
stream.reset();
while (stream.incrementToken()) {

TestDictionaryCompoundWordTokenFilterFactory.java

@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.compound;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
@@ -25,6 +27,11 @@ import org.apache.lucene.tests.analysis.MockTokenizer;
/** Simple tests to ensure the Dictionary compound filter factory is working. */
public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
private static CharArraySet makeDictionary(String... dictionary) {
return new CharArraySet(Arrays.asList(dictionary), true);
}
/** Ensure the filter actually decompounds text. */
public void testDecompounding() throws Exception {
Reader reader = new StringReader("I like to play softball");
@@ -37,6 +44,25 @@ public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamFactoryTestCase
stream, new String[] {"I", "like", "to", "play", "softball", "soft", "ball"});
}
/** Ensure subtokens can be emitted with a position increment of 1 */
public void testDecompounderWithSubtokenIncrement() throws Exception {
CharArraySet dict = makeDictionary("læse", "hest");
DictionaryCompoundWordTokenFilter tf =
new DictionaryCompoundWordTokenFilter(
whitespaceMockTokenizer("min veninde som er lidt af en læsehest"),
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false,
1);
assertTokenStreamContents(
tf,
new String[] {"min", "veninde", "som", "er", "lidt", "af", "en", "læse", "hest"},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected =