LUCENE-9018: ConcatenateGraphFilter now has a configurable separator.

This commit is contained in:
David Smiley 2019-11-14 13:26:50 -05:00
parent d9f41f8a5a
commit e466d622c8
5 changed files with 144 additions and 22 deletions

View File

@ -85,6 +85,8 @@ Improvements
* LUCENE-9028: introducing Intervals.multiterm() (Mikhail Khludnev)
* LUCENE-9018: ConcatenateGraphFilter now has a configurable separator. (Stanislav Mikulchik, David Smiley)
Optimizations
* LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits

View File

@ -56,11 +56,11 @@ public final class ConcatenateGraphFilter extends TokenStream {
*/
/**
* Represents the separation between tokens, if
* <code>preserveSep</code> is <code>true</code>.
* Represents the default separator between tokens.
*/
public final static int SEP_LABEL = TokenStreamToAutomaton.POS_SEP;
public final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
public final static Character DEFAULT_TOKEN_SEPARATOR = SEP_LABEL;
public final static boolean DEFAULT_PRESERVE_SEP = true;
public final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
@ -69,7 +69,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TokenStream inputTokenStream;
private final boolean preserveSep;
private final Character tokenSeparator;
private final boolean preservePositionIncrements;
private final int maxGraphExpansions;
@ -85,7 +85,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
* This constructor uses the default settings of the constants in this class.
*/
public ConcatenateGraphFilter(TokenStream inputTokenStream) {
this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
this(inputTokenStream, DEFAULT_TOKEN_SEPARATOR, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
}
/**
@ -93,7 +93,8 @@ public final class ConcatenateGraphFilter extends TokenStream {
* of accepted strings by its token stream graph.
*
* @param inputTokenStream The input/incoming TokenStream
* @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token
* @param tokenSeparator Separator to use for concatenation. Can be null, in this case tokens will be concatenated
* without any separators.
* @param preservePositionIncrements Whether to add an empty token for missing positions.
* The effect is a consecutive {@link #SEP_LABEL}.
* When false, it's as if there were no missing positions
@ -105,15 +106,23 @@ public final class ConcatenateGraphFilter extends TokenStream {
* expansions
*
*/
public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
public ConcatenateGraphFilter(TokenStream inputTokenStream, Character tokenSeparator, boolean preservePositionIncrements, int maxGraphExpansions) {
// Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
// the input stream entirely in the first call to incrementToken
this.inputTokenStream = inputTokenStream;
this.preserveSep = preserveSep;
this.tokenSeparator = tokenSeparator;
this.preservePositionIncrements = preservePositionIncrements;
this.maxGraphExpansions = maxGraphExpansions;
}
/**
 * Calls {@link #ConcatenateGraphFilter(org.apache.lucene.analysis.TokenStream, java.lang.Character, boolean, int)}
 * passing either {@link #DEFAULT_TOKEN_SEPARATOR} or {@code null} (no separator) depending on
 * {@code preserveSep}. Retained for callers of the pre-8.4 boolean-based signature.
 *
 * @param inputTokenStream The input/incoming TokenStream
 * @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token
 * @param preservePositionIncrements Whether to add an empty token for missing positions
 * @param maxGraphExpansions Maximum number of graph paths to expand before failing -- TODO confirm against delegate javadoc
 */
public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
this(inputTokenStream, (preserveSep) ? DEFAULT_TOKEN_SEPARATOR : null, preservePositionIncrements, maxGraphExpansions);
}
@Override
public void reset() throws IOException {
super.reset();
@ -196,8 +205,8 @@ public final class ConcatenateGraphFilter extends TokenStream {
// from each analyzed token, with byte 0 used as
// separator between tokens:
final TokenStreamToAutomaton tsta;
if (preserveSep) {
tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL);
if (tokenSeparator != null) {
tsta = new EscapingTokenStreamToAutomaton(tokenSeparator);
} else {
// When we're not preserving sep, we don't steal 0xff
// byte, so we don't need to do any escaping:
@ -210,7 +219,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
// TODO: we can optimize this somewhat by determinizing
// while we convert
automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
automaton = replaceSep(automaton, tokenSeparator);
// This automaton should not blow up during determinize:
return Operations.determinize(automaton, maxGraphExpansions);
}
@ -249,7 +258,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
private static Automaton replaceSep(Automaton a, Character tokenSeparator) {
Automaton result = new Automaton();
@ -271,9 +280,9 @@ public final class ConcatenateGraphFilter extends TokenStream {
a.getNextTransition(t);
if (t.min == TokenStreamToAutomaton.POS_SEP) {
assert t.max == TokenStreamToAutomaton.POS_SEP;
if (preserveSep) {
// Remap to SEP_LABEL:
result.addTransition(state, t.dest, sepLabel);
if (tokenSeparator != null) {
// Remap to tokenSeparator:
result.addTransition(state, t.dest, tokenSeparator);
} else {
result.addEpsilon(state, t.dest);
}

View File

@ -20,6 +20,7 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
/**
@ -27,9 +28,15 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
*
* <ul>
* <li><tt>preserveSep</tt>:
* For lucene versions lesser than {@link org.apache.lucene.util.Version#LUCENE_8_4_0}
* Whether {@link ConcatenateGraphFilter#SEP_LABEL}
* should separate the input tokens in the concatenated token
* </li>
* <li><tt>tokenSeparator</tt>:
* Separator to use for concatenation. If not present,
* {@link ConcatenateGraphFilter#DEFAULT_TOKEN_SEPARATOR} will be used.
* If empty, tokens will be concatenated without any separators.
* </li>
* <li><tt>preservePositionIncrements</tt>:
* Whether to add an empty token for missing positions.
* The effect is a consecutive {@link ConcatenateGraphFilter#SEP_LABEL}.
@ -51,14 +58,19 @@ public class ConcatenateGraphFilterFactory extends TokenFilterFactory {
/** SPI name */
public static final String NAME = "concatenateGraph";
private boolean preserveSep;
private Character tokenSeparator;
private boolean preservePositionIncrements;
private int maxGraphExpansions;
public ConcatenateGraphFilterFactory(Map<String, String> args) {
super(args);
preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP);
Version luceneMatchVersion = getLuceneMatchVersion();
if (luceneMatchVersion.onOrAfter(Version.LUCENE_8_4_0)) {
tokenSeparator = getCharacter(args, "tokenSeparator", ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR);
} else {
boolean preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP);
tokenSeparator = (preserveSep) ? ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR : null;
}
preservePositionIncrements = getBoolean(args, "preservePositionIncrements", ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS);
maxGraphExpansions = getInt(args, "maxGraphExpansions", ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
@ -69,6 +81,21 @@ public class ConcatenateGraphFilterFactory extends TokenFilterFactory {
@Override
public TokenStream create(TokenStream input) {
return new ConcatenateGraphFilter(input, preserveSep, preservePositionIncrements, maxGraphExpansions);
return new ConcatenateGraphFilter(input, tokenSeparator, preservePositionIncrements, maxGraphExpansions);
}
/**
 * Reads a single-character argument out of {@code args} (removing it, per factory convention).
 *
 * @param args the factory argument map; the entry for {@code name} is consumed
 * @param name the argument key to look up
 * @param defaultVal returned when the argument is absent
 * @return the argument's single character; {@code defaultVal} when absent; {@code null} when
 *         explicitly set to the empty string (meaning: no separator)
 * @throws IllegalArgumentException if the value is longer than one character
 */
protected Character getCharacter(Map<String,String> args, String name, Character defaultVal) {
  String value = args.remove(name);
  if (value == null) {
    return defaultVal;
  }
  if (value.isEmpty()) {
    return null; // an explicit empty value disables the separator entirely
  }
  if (value.length() != 1) {
    throw new IllegalArgumentException(name + " should be a char. \"" + value + "\" is invalid");
  }
  return value.charAt(0);
}
}

View File

@ -23,8 +23,10 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
@ -48,7 +50,7 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
String input = "mykeyword another keyword";
tokenStream.setReader(new StringReader(input));
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, false, false, 100);
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, null, false, 100);
assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new int[] { 1 });
}
@ -86,7 +88,7 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
String input = "mykeyword another keyword";
tokenStream.setReader(new StringReader(input));
SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, true, false, 100);
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, SEP_LABEL, false, 100);
String[] expectedOutputs = new String[2];
CharsRefBuilder expectedOutput = new CharsRefBuilder();
expectedOutput.append("mykeyword");
@ -112,7 +114,7 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
tokenStream.setReader(new StringReader(input));
TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, true, preservePosInc, 10);
ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
CharsRefBuilder builder = new CharsRefBuilder();
if (preservePosInc) {
builder.append(SEP_LABEL);
@ -165,4 +167,52 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter, new String[0]);
}
@Test
public void testSeparator() throws IOException {
  // A space separator (rather than the default \u001F) should join the tokens visibly.
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
  tokenizer.setReader(new StringReader("...mykeyword.another.keyword."));
  ConcatenateGraphFilter concat = new ConcatenateGraphFilter(tokenizer, ' ', false, 100);
  assertTokenStreamContents(concat, new String[] {"mykeyword another keyword"}, null, null, new int[] {1});
}
@Test
public void testSeparatorWithStopWords() throws IOException {
  // Removed stop words leave no trace when position increments are not preserved.
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("A B C D E F J H"));
  TokenStream withoutStops = new StopFilter(tokenizer, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter concat = new ConcatenateGraphFilter(withoutStops, '-', false, 100);
  assertTokenStreamContents(concat, new String[] {"B-C-F-H"}, null, null, new int[] {1});
}
@Test
public void testSeparatorWithStopWordsAndPreservePositionIncrements() throws IOException {
  // With position increments preserved, every removed stop word shows up as an extra separator.
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("A B C D E F J H"));
  TokenStream withoutStops = new StopFilter(tokenizer, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter concat = new ConcatenateGraphFilter(withoutStops, '-', true, 100);
  assertTokenStreamContents(concat, new String[] {"-B-C---F--H"}, null, null, new int[] {1});
}
@Test
public void testSeparatorWithSynonyms() throws IOException {
  // Each synonym expansion of "mykeyword" yields its own concatenated output path.
  SynonymMap.Builder synonyms = new SynonymMap.Builder(true);
  synonyms.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
  synonyms.add(new CharsRef("mykeyword"), new CharsRef("three words synonym"), true);

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader(" mykeyword another keyword   "));
  SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, synonyms.build(), true);
  ConcatenateGraphFilter concat = new ConcatenateGraphFilter(synFilter, '-', false, 100);

  String[] expected = {
      "mykeyword-another-keyword",
      "mysynonym-another-keyword",
      "three words synonym-another-keyword"
  };
  assertTokenStreamContents(concat, expected, null, null, new int[] {1, 0, 0});
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.util.Version;
public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
public void test() throws Exception {
@ -34,11 +35,27 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
tokenizer.setReader(reader);
tokenizer.setEnableChecks(consumeAll);
TokenStream stream = tokenizer;
stream = tokenFilterFactory("ConcatenateGraph").create(stream);
stream = tokenFilterFactory("ConcatenateGraph",
"tokenSeparator", "\u001F"
).create(stream);
assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
}
}
public void testEmptyTokenSeparator() throws Exception {
  // An explicitly empty tokenSeparator concatenates the surviving tokens with nothing between them.
  final String input = "A1 B2 A1 D4 C3";
  final String output = "A1A1D4C3";
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("B2"));
  stream = tokenFilterFactory("ConcatenateGraph",
      "tokenSeparator", ""
  ).create(stream);
  assertTokenStreamContents(stream, new String[]{output});
}
public void testPreserveSep() throws Exception {
final String input = "A1 B2 A1 D4 C3";
final String output = "A1A1D4C3";
@ -48,6 +65,7 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
TokenStream stream = tokenizer;
stream = new StopFilter(stream, StopFilter.makeStopSet("B2"));
stream = tokenFilterFactory("ConcatenateGraph",
Version.LUCENE_8_0_0,
"preserveSep", "false"
).create(stream);
assertTokenStreamContents(stream, new String[]{output});
@ -62,6 +80,7 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
TokenStream stream = tokenizer;
stream = new StopFilter(stream, StopFilter.makeStopSet("B2"));
stream = tokenFilterFactory("ConcatenateGraph",
"tokenSeparator", "\u001F",
"preservePositionIncrements", "false"
).create(stream);
assertTokenStreamContents(stream, new String[]{output.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
@ -80,4 +99,19 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTes
tokenFilterFactory("ConcatenateGraph", "bogusArg", "bogusValue"));
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
public void testSeparator() throws Exception {
  // A custom one-character tokenSeparator ("-") is used between surviving tokens.
  final String input = "A B C D E F J H";
  final String output = "B-C-F-H";
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("A", "D", "E", "J"));
  stream = tokenFilterFactory("ConcatenateGraph",
      "tokenSeparator", "-",
      "preservePositionIncrements", "false"
  ).create(stream);
  assertTokenStreamContents(stream, new String[]{output});
}
}