mirror of https://github.com/apache/lucene.git
LUCENE-9018: ConcatenateGraphFilter now has a configurable separator.
This commit is contained in:
parent
d9f41f8a5a
commit
e466d622c8
|
@ -85,6 +85,8 @@ Improvements
|
|||
|
||||
* LUCENE-9028: introducing Intervals.multiterm() (Mikhail Khludnev)
|
||||
|
||||
* LUCENE-9018: ConcatenateGraphFilter now has a configurable separator. (Stanislav Mikulchik, David Smiley)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits
|
||||
|
|
|
@ -56,11 +56,11 @@ public final class ConcatenateGraphFilter extends TokenStream {
|
|||
*/
|
||||
|
||||
/**
|
||||
* Represents the separation between tokens, if
|
||||
* <code>preserveSep</code> is <code>true</code>.
|
||||
* Represents the default separator between tokens.
|
||||
*/
|
||||
public final static int SEP_LABEL = TokenStreamToAutomaton.POS_SEP;
|
||||
public final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||
public final static Character DEFAULT_TOKEN_SEPARATOR = SEP_LABEL;
|
||||
public final static boolean DEFAULT_PRESERVE_SEP = true;
|
||||
public final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
|
||||
|
||||
|
@ -69,7 +69,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
|
|||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final TokenStream inputTokenStream;
|
||||
private final boolean preserveSep;
|
||||
private final Character tokenSeparator;
|
||||
private final boolean preservePositionIncrements;
|
||||
private final int maxGraphExpansions;
|
||||
|
||||
|
@ -85,7 +85,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
|
|||
* This constructor uses the default settings of the constants in this class.
|
||||
*/
|
||||
public ConcatenateGraphFilter(TokenStream inputTokenStream) {
|
||||
this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
|
||||
this(inputTokenStream, DEFAULT_TOKEN_SEPARATOR, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -93,7 +93,8 @@ public final class ConcatenateGraphFilter extends TokenStream {
|
|||
* of accepted strings by its token stream graph.
|
||||
*
|
||||
* @param inputTokenStream The input/incoming TokenStream
|
||||
* @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token
|
||||
* @param tokenSeparator Separator to use for concatenation. Can be null, in this case tokens will be concatenated
|
||||
* without any separators.
|
||||
* @param preservePositionIncrements Whether to add an empty token for missing positions.
|
||||
* The effect is a consecutive {@link #SEP_LABEL}.
|
||||
* When false, it's as if there were no missing positions
|
||||
|
@ -105,15 +106,23 @@ public final class ConcatenateGraphFilter extends TokenStream {
|
|||
* expansions
|
||||
*
|
||||
*/
|
||||
public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
|
||||
public ConcatenateGraphFilter(TokenStream inputTokenStream, Character tokenSeparator, boolean preservePositionIncrements, int maxGraphExpansions) {
|
||||
// Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
|
||||
// the input stream entirely in the first call to incrementToken
|
||||
this.inputTokenStream = inputTokenStream;
|
||||
this.preserveSep = preserveSep;
|
||||
this.tokenSeparator = tokenSeparator;
|
||||
this.preservePositionIncrements = preservePositionIncrements;
|
||||
this.maxGraphExpansions = maxGraphExpansions;
|
||||
}
|
||||
|
||||
/**
   * Convenience constructor that maps the legacy {@code preserveSep} flag onto a token
   * separator and delegates to
   * {@link #ConcatenateGraphFilter(org.apache.lucene.analysis.TokenStream, java.lang.Character, boolean, int)}:
   * {@link #DEFAULT_TOKEN_SEPARATOR} when {@code preserveSep} is true, otherwise {@code null}
   * (tokens are concatenated with no separator).
   *
   * @param inputTokenStream the input/incoming TokenStream
   * @param preserveSep whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token
   * @param preservePositionIncrements whether to add an empty token for missing positions
   * @param maxGraphExpansions limit on graph expansions; see the primary constructor
   */
  public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
    Character separator = preserveSep ? DEFAULT_TOKEN_SEPARATOR : null;
    this(inputTokenStream, separator, preservePositionIncrements, maxGraphExpansions);
  }
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
|
@ -196,8 +205,8 @@ public final class ConcatenateGraphFilter extends TokenStream {
|
|||
// from each analyzed token, with byte 0 used as
|
||||
// separator between tokens:
|
||||
final TokenStreamToAutomaton tsta;
|
||||
if (preserveSep) {
|
||||
tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL);
|
||||
if (tokenSeparator != null) {
|
||||
tsta = new EscapingTokenStreamToAutomaton(tokenSeparator);
|
||||
} else {
|
||||
// When we're not preserving sep, we don't steal 0xff
|
||||
// byte, so we don't need to do any escaping:
|
||||
|
@ -210,7 +219,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
|
|||
|
||||
// TODO: we can optimize this somewhat by determinizing
|
||||
// while we convert
|
||||
automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
|
||||
automaton = replaceSep(automaton, tokenSeparator);
|
||||
// This automaton should not blow up during determinize:
|
||||
return Operations.determinize(automaton, maxGraphExpansions);
|
||||
}
|
||||
|
@ -249,7 +258,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
|
|||
|
||||
// Replaces SEP with epsilon or remaps them if
|
||||
// we were asked to preserve them:
|
||||
private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
|
||||
private static Automaton replaceSep(Automaton a, Character tokenSeparator) {
|
||||
|
||||
Automaton result = new Automaton();
|
||||
|
||||
|
@ -271,9 +280,9 @@ public final class ConcatenateGraphFilter extends TokenStream {
|
|||
a.getNextTransition(t);
|
||||
if (t.min == TokenStreamToAutomaton.POS_SEP) {
|
||||
assert t.max == TokenStreamToAutomaton.POS_SEP;
|
||||
if (preserveSep) {
|
||||
// Remap to SEP_LABEL:
|
||||
result.addTransition(state, t.dest, sepLabel);
|
||||
if (tokenSeparator != null) {
|
||||
// Remap to tokenSeparator:
|
||||
result.addTransition(state, t.dest, tokenSeparator);
|
||||
} else {
|
||||
result.addEpsilon(state, t.dest);
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ import java.util.Map;
|
|||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
|
||||
|
||||
/**
|
||||
|
@ -27,9 +28,15 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
|
|||
*
|
||||
* <ul>
|
||||
* <li><tt>preserveSep</tt>:
|
||||
* For lucene versions lesser than {@link org.apache.lucene.util.Version#LUCENE_8_4_0}
|
||||
* Whether {@link ConcatenateGraphFilter#SEP_LABEL}
|
||||
* should separate the input tokens in the concatenated token
|
||||
* </li>
|
||||
* <li><tt>tokenSeparator</tt>:
|
||||
* Separator to use for concatenation. If not present,
|
||||
* {@link ConcatenateGraphFilter#DEFAULT_TOKEN_SEPARATOR} will be used.
|
||||
* If empty, tokens will be concatenated without any separators.
|
||||
* </li>
|
||||
* <li><tt>preservePositionIncrements</tt>:
|
||||
* Whether to add an empty token for missing positions.
|
||||
* The effect is a consecutive {@link ConcatenateGraphFilter#SEP_LABEL}.
|
||||
|
@ -51,14 +58,19 @@ public class ConcatenateGraphFilterFactory extends TokenFilterFactory {
|
|||
/** SPI name */
|
||||
public static final String NAME = "concatenateGraph";
|
||||
|
||||
private boolean preserveSep;
|
||||
private Character tokenSeparator;
|
||||
private boolean preservePositionIncrements;
|
||||
private int maxGraphExpansions;
|
||||
|
||||
public ConcatenateGraphFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
|
||||
preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP);
|
||||
Version luceneMatchVersion = getLuceneMatchVersion();
|
||||
if (luceneMatchVersion.onOrAfter(Version.LUCENE_8_4_0)) {
|
||||
tokenSeparator = getCharacter(args, "tokenSeparator", ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR);
|
||||
} else {
|
||||
boolean preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP);
|
||||
tokenSeparator = (preserveSep) ? ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR : null;
|
||||
}
|
||||
preservePositionIncrements = getBoolean(args, "preservePositionIncrements", ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS);
|
||||
maxGraphExpansions = getInt(args, "maxGraphExpansions", ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
|
||||
|
||||
|
@ -69,6 +81,21 @@ public class ConcatenateGraphFilterFactory extends TokenFilterFactory {
|
|||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new ConcatenateGraphFilter(input, preserveSep, preservePositionIncrements, maxGraphExpansions);
|
||||
return new ConcatenateGraphFilter(input, tokenSeparator, preservePositionIncrements, maxGraphExpansions);
|
||||
}
|
||||
|
||||
protected Character getCharacter(Map<String,String> args, String name, Character defaultVal) {
|
||||
String s = args.remove(name);
|
||||
if (s == null) {
|
||||
return defaultVal;
|
||||
} else if (s.length() == 0) {
|
||||
return null;
|
||||
} else {
|
||||
if (s.length() != 1) {
|
||||
throw new IllegalArgumentException(name + " should be a char. \"" + s + "\" is invalid");
|
||||
} else {
|
||||
return s.charAt(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,8 +23,10 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
|
@ -48,7 +50,7 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
|
|||
Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
|
||||
String input = "mykeyword another keyword";
|
||||
tokenStream.setReader(new StringReader(input));
|
||||
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, false, false, 100);
|
||||
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, null, false, 100);
|
||||
assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new int[] { 1 });
|
||||
}
|
||||
|
||||
|
@ -86,7 +88,7 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
|
|||
String input = "mykeyword another keyword";
|
||||
tokenStream.setReader(new StringReader(input));
|
||||
SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
|
||||
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, true, false, 100);
|
||||
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, SEP_LABEL, false, 100);
|
||||
String[] expectedOutputs = new String[2];
|
||||
CharsRefBuilder expectedOutput = new CharsRefBuilder();
|
||||
expectedOutput.append("mykeyword");
|
||||
|
@ -112,7 +114,7 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
|
|||
String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
|
||||
tokenStream.setReader(new StringReader(input));
|
||||
TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
|
||||
ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, true, preservePosInc, 10);
|
||||
ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
|
||||
CharsRefBuilder builder = new CharsRefBuilder();
|
||||
if (preservePosInc) {
|
||||
builder.append(SEP_LABEL);
|
||||
|
@ -165,4 +167,52 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
|
|||
assertTokenStreamContents(filter, new String[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSeparator() throws IOException {
|
||||
Tokenizer tokenStream = new MockTokenizer(MockTokenizer.SIMPLE, true);
|
||||
String input = "...mykeyword.another.keyword.";
|
||||
tokenStream.setReader(new StringReader(input));
|
||||
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, ' ', false, 100); //not \u001F
|
||||
assertTokenStreamContents(stream, new String[] {"mykeyword another keyword"}, null, null, new int[] { 1 });
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSeparatorWithStopWords() throws IOException {
|
||||
Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
String input = "A B C D E F J H";
|
||||
tokenStream.setReader(new StringReader(input));
|
||||
TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
|
||||
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', false, 100);
|
||||
|
||||
assertTokenStreamContents(stream, new String[] {"B-C-F-H"}, null, null, new int[] { 1 });
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSeparatorWithStopWordsAndPreservePositionIncrements() throws IOException {
|
||||
Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
String input = "A B C D E F J H";
|
||||
tokenStream.setReader(new StringReader(input));
|
||||
TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
|
||||
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', true, 100);
|
||||
|
||||
assertTokenStreamContents(stream, new String[] {"-B-C---F--H"}, null, null, new int[] { 1 });
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSeparatorWithSynonyms() throws IOException {
|
||||
SynonymMap.Builder builder = new SynonymMap.Builder(true);
|
||||
builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
|
||||
builder.add(new CharsRef("mykeyword"), new CharsRef("three words synonym"), true);
|
||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
|
||||
String input = " mykeyword another keyword ";
|
||||
tokenizer.setReader(new StringReader(input));
|
||||
SynonymGraphFilter filter = new SynonymGraphFilter(tokenizer, builder.build(), true);
|
||||
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, '-', false, 100);
|
||||
assertTokenStreamContents(stream, new String[] {
|
||||
"mykeyword-another-keyword",
|
||||
"mysynonym-another-keyword",
|
||||
"three words synonym-another-keyword"
|
||||
}, null, null, new int[] { 1, 0 ,0});
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.analysis.MockTokenizer;
|
|||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
public void test() throws Exception {
|
||||
|
@ -34,11 +35,27 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTes
|
|||
tokenizer.setReader(reader);
|
||||
tokenizer.setEnableChecks(consumeAll);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = tokenFilterFactory("ConcatenateGraph").create(stream);
|
||||
stream = tokenFilterFactory("ConcatenateGraph",
|
||||
"tokenSeparator", "\u001F"
|
||||
).create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
|
||||
}
|
||||
}
|
||||
|
||||
public void testEmptyTokenSeparator() throws Exception {
|
||||
final String input = "A1 B2 A1 D4 C3";
|
||||
final String output = "A1A1D4C3";
|
||||
Reader reader = new StringReader(input);
|
||||
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
tokenizer.setReader(reader);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = new StopFilter(stream, StopFilter.makeStopSet("B2"));
|
||||
stream = tokenFilterFactory("ConcatenateGraph",
|
||||
"tokenSeparator", ""
|
||||
).create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{output});
|
||||
}
|
||||
|
||||
public void testPreserveSep() throws Exception {
|
||||
final String input = "A1 B2 A1 D4 C3";
|
||||
final String output = "A1A1D4C3";
|
||||
|
@ -48,6 +65,7 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTes
|
|||
TokenStream stream = tokenizer;
|
||||
stream = new StopFilter(stream, StopFilter.makeStopSet("B2"));
|
||||
stream = tokenFilterFactory("ConcatenateGraph",
|
||||
Version.LUCENE_8_0_0,
|
||||
"preserveSep", "false"
|
||||
).create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{output});
|
||||
|
@ -62,6 +80,7 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTes
|
|||
TokenStream stream = tokenizer;
|
||||
stream = new StopFilter(stream, StopFilter.makeStopSet("B2"));
|
||||
stream = tokenFilterFactory("ConcatenateGraph",
|
||||
"tokenSeparator", "\u001F",
|
||||
"preservePositionIncrements", "false"
|
||||
).create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{output.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
|
||||
|
@ -80,4 +99,19 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTes
|
|||
tokenFilterFactory("ConcatenateGraph", "bogusArg", "bogusValue"));
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
|
||||
public void testSeparator() throws Exception {
|
||||
final String input = "A B C D E F J H";
|
||||
final String output = "B-C-F-H";
|
||||
Reader reader = new StringReader(input);
|
||||
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
tokenizer.setReader(reader);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = new StopFilter(stream, StopFilter.makeStopSet("A", "D", "E", "J"));
|
||||
stream = tokenFilterFactory("ConcatenateGraph",
|
||||
"tokenSeparator", "-",
|
||||
"preservePositionIncrements", "false"
|
||||
).create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{output});
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue