LUCENE-9018: ConcatenateGraphFilter now has a configurable separator.

This commit is contained in:
David Smiley 2019-11-14 13:26:50 -05:00
parent d9f41f8a5a
commit e466d622c8
5 changed files with 144 additions and 22 deletions

View File

@ -85,6 +85,8 @@ Improvements
* LUCENE-9028: introducing Intervals.multiterm() (Mikhail Khludnev)
* LUCENE-9018: ConcatenateGraphFilter now has a configurable separator. (Stanislav Mikulchik, David Smiley)
Optimizations
* LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits

View File

@ -56,11 +56,11 @@ public final class ConcatenateGraphFilter extends TokenStream {
*/
/**
* Represents the separation between tokens, if
* <code>preserveSep</code> is <code>true</code>.
* Represents the default separator between tokens.
*/
public final static int SEP_LABEL = TokenStreamToAutomaton.POS_SEP;
public final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
public final static Character DEFAULT_TOKEN_SEPARATOR = SEP_LABEL;
public final static boolean DEFAULT_PRESERVE_SEP = true;
public final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
@ -69,7 +69,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TokenStream inputTokenStream;
private final boolean preserveSep;
private final Character tokenSeparator;
private final boolean preservePositionIncrements;
private final int maxGraphExpansions;
@ -85,7 +85,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
* This constructor uses the default settings of the constants in this class.
*/
public ConcatenateGraphFilter(TokenStream inputTokenStream) {
this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
this(inputTokenStream, DEFAULT_TOKEN_SEPARATOR, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
}
/**
@ -93,7 +93,8 @@ public final class ConcatenateGraphFilter extends TokenStream {
* of accepted strings by its token stream graph.
*
* @param inputTokenStream The input/incoming TokenStream
* @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token
* @param tokenSeparator Separator to use for concatenation. Can be null, in this case tokens will be concatenated
* without any separators.
* @param preservePositionIncrements Whether to add an empty token for missing positions.
* The effect is a consecutive {@link #SEP_LABEL}.
* When false, it's as if there were no missing positions
@ -105,15 +106,23 @@ public final class ConcatenateGraphFilter extends TokenStream {
* expansions
*
*/
public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
public ConcatenateGraphFilter(TokenStream inputTokenStream, Character tokenSeparator, boolean preservePositionIncrements, int maxGraphExpansions) {
// Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
// the input stream entirely in the first call to incrementToken
this.inputTokenStream = inputTokenStream;
this.preserveSep = preserveSep;
this.tokenSeparator = tokenSeparator;
this.preservePositionIncrements = preservePositionIncrements;
this.maxGraphExpansions = maxGraphExpansions;
}
/**
 * Calls {@link #ConcatenateGraphFilter(org.apache.lucene.analysis.TokenStream, java.lang.Character, boolean, int)}
 * passing either {@link #DEFAULT_TOKEN_SEPARATOR} or {@code null} (no separator) depending on
 * {@code preserveSep}. Retained for callers of the pre-8.4 boolean-based signature.
 *
 * @param inputTokenStream The input/incoming TokenStream
 * @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token
 * @param preservePositionIncrements Whether to add an empty token for missing positions
 * @param maxGraphExpansions Maximum number of graph paths to expand before failing -- TODO confirm against delegate javadoc
 */
public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
this(inputTokenStream, (preserveSep) ? DEFAULT_TOKEN_SEPARATOR : null, preservePositionIncrements, maxGraphExpansions);
}
@Override
public void reset() throws IOException {
super.reset();
@ -196,8 +205,8 @@ public final class ConcatenateGraphFilter extends TokenStream {
// from each analyzed token, with byte 0 used as
// separator between tokens:
final TokenStreamToAutomaton tsta;
if (preserveSep) {
tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL);
if (tokenSeparator != null) {
tsta = new EscapingTokenStreamToAutomaton(tokenSeparator);
} else {
// When we're not preserving sep, we don't steal 0xff
// byte, so we don't need to do any escaping:
@ -210,7 +219,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
// TODO: we can optimize this somewhat by determinizing
// while we convert
automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
automaton = replaceSep(automaton, tokenSeparator);
// This automaton should not blow up during determinize:
return Operations.determinize(automaton, maxGraphExpansions);
}
@ -249,7 +258,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
private static Automaton replaceSep(Automaton a, Character tokenSeparator) {
Automaton result = new Automaton();
@ -271,9 +280,9 @@ public final class ConcatenateGraphFilter extends TokenStream {
a.getNextTransition(t);
if (t.min == TokenStreamToAutomaton.POS_SEP) {
assert t.max == TokenStreamToAutomaton.POS_SEP;
if (preserveSep) {
// Remap to SEP_LABEL:
result.addTransition(state, t.dest, sepLabel);
if (tokenSeparator != null) {
// Remap to tokenSeparator:
result.addTransition(state, t.dest, tokenSeparator);
} else {
result.addEpsilon(state, t.dest);
}

View File

@ -20,6 +20,7 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
/**
@ -27,9 +28,15 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
*
* <ul>
* <li><tt>preserveSep</tt>:
* For lucene versions lesser than {@link org.apache.lucene.util.Version#LUCENE_8_4_0}
* Whether {@link ConcatenateGraphFilter#SEP_LABEL}
* should separate the input tokens in the concatenated token
* </li>
* <li><tt>tokenSeparator</tt>:
* Separator to use for concatenation. If not present,
* {@link ConcatenateGraphFilter#DEFAULT_TOKEN_SEPARATOR} will be used.
* If empty, tokens will be concatenated without any separators.
* </li>
* <li><tt>preservePositionIncrements</tt>:
* Whether to add an empty token for missing positions.
* The effect is a consecutive {@link ConcatenateGraphFilter#SEP_LABEL}.
@ -51,14 +58,19 @@ public class ConcatenateGraphFilterFactory extends TokenFilterFactory {
/** SPI name */
public static final String NAME = "concatenateGraph";
private boolean preserveSep;
private Character tokenSeparator;
private boolean preservePositionIncrements;
private int maxGraphExpansions;
public ConcatenateGraphFilterFactory(Map<String, String> args) {
super(args);
preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP);
Version luceneMatchVersion = getLuceneMatchVersion();
if (luceneMatchVersion.onOrAfter(Version.LUCENE_8_4_0)) {
tokenSeparator = getCharacter(args, "tokenSeparator", ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR);
} else {
boolean preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP);
tokenSeparator = (preserveSep) ? ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR : null;
}
preservePositionIncrements = getBoolean(args, "preservePositionIncrements", ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS);
maxGraphExpansions = getInt(args, "maxGraphExpansions", ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);
@ -69,6 +81,21 @@ public class ConcatenateGraphFilterFactory extends TokenFilterFactory {
@Override
public TokenStream create(TokenStream input) {
return new ConcatenateGraphFilter(input, preserveSep, preservePositionIncrements, maxGraphExpansions);
return new ConcatenateGraphFilter(input, tokenSeparator, preservePositionIncrements, maxGraphExpansions);
}
/**
 * Reads a single-character argument out of {@code args} (removing it, per factory convention).
 *
 * @param args the factory argument map; the entry for {@code name} is consumed
 * @param name the argument key to look up
 * @param defaultVal returned when the argument is absent
 * @return the argument's single character; {@code defaultVal} when absent; {@code null} when
 *         explicitly set to the empty string (meaning: no separator)
 * @throws IllegalArgumentException if the value is longer than one character
 */
protected Character getCharacter(Map<String,String> args, String name, Character defaultVal) {
  String value = args.remove(name);
  if (value == null) {
    return defaultVal;
  }
  if (value.isEmpty()) {
    return null; // an explicit empty value disables the separator entirely
  }
  if (value.length() != 1) {
    throw new IllegalArgumentException(name + " should be a char. \"" + value + "\" is invalid");
  }
  return value.charAt(0);
}
}

View File

@ -23,8 +23,10 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
@ -48,7 +50,7 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
String input = "mykeyword another keyword";
tokenStream.setReader(new StringReader(input));
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, false, false, 100);
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, null, false, 100);
assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new int[] { 1 });
}
@ -86,7 +88,7 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
String input = "mykeyword another keyword";
tokenStream.setReader(new StringReader(input));
SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, true, false, 100);
ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, SEP_LABEL, false, 100);
String[] expectedOutputs = new String[2];
CharsRefBuilder expectedOutput = new CharsRefBuilder();
expectedOutput.append("mykeyword");
@ -112,7 +114,7 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
tokenStream.setReader(new StringReader(input));
TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, true, preservePosInc, 10);
ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
CharsRefBuilder builder = new CharsRefBuilder();
if (preservePosInc) {
builder.append(SEP_LABEL);
@ -165,4 +167,52 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter, new String[0]);
}
@Test
public void testSeparator() throws IOException {
  // A space separator (rather than the default \u001F) should join the tokens visibly.
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
  tokenizer.setReader(new StringReader("...mykeyword.another.keyword."));
  ConcatenateGraphFilter concat = new ConcatenateGraphFilter(tokenizer, ' ', false, 100);
  assertTokenStreamContents(concat, new String[] {"mykeyword another keyword"}, null, null, new int[] {1});
}
@Test
public void testSeparatorWithStopWords() throws IOException {
  // Removed stop words leave no trace when position increments are not preserved.
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("A B C D E F J H"));
  TokenStream withoutStops = new StopFilter(tokenizer, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter concat = new ConcatenateGraphFilter(withoutStops, '-', false, 100);
  assertTokenStreamContents(concat, new String[] {"B-C-F-H"}, null, null, new int[] {1});
}
@Test
public void testSeparatorWithStopWordsAndPreservePositionIncrements() throws IOException {
  // With position increments preserved, every removed stop word shows up as an extra separator.
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("A B C D E F J H"));
  TokenStream withoutStops = new StopFilter(tokenizer, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter concat = new ConcatenateGraphFilter(withoutStops, '-', true, 100);
  assertTokenStreamContents(concat, new String[] {"-B-C---F--H"}, null, null, new int[] {1});
}
@Test
public void testSeparatorWithSynonyms() throws IOException {
  // Each synonym expansion of "mykeyword" yields its own concatenated output path.
  SynonymMap.Builder synonyms = new SynonymMap.Builder(true);
  synonyms.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
  synonyms.add(new CharsRef("mykeyword"), new CharsRef("three words synonym"), true);

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader(" mykeyword another keyword   "));
  SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, synonyms.build(), true);
  ConcatenateGraphFilter concat = new ConcatenateGraphFilter(synFilter, '-', false, 100);

  String[] expected = {
      "mykeyword-another-keyword",
      "mysynonym-another-keyword",
      "three words synonym-another-keyword"
  };
  assertTokenStreamContents(concat, expected, null, null, new int[] {1, 0, 0});
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.util.Version;
public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
public void test() throws Exception {
@ -34,11 +35,27 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
tokenizer.setReader(reader);
tokenizer.setEnableChecks(consumeAll);
TokenStream stream = tokenizer;
stream = tokenFilterFactory("ConcatenateGraph").create(stream);
stream = tokenFilterFactory("ConcatenateGraph",
"tokenSeparator", "\u001F"
).create(stream);
assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
}
}
public void testEmptyTokenSeparator() throws Exception {
  // An explicitly empty tokenSeparator concatenates the surviving tokens with nothing between them.
  final String input = "A1 B2 A1 D4 C3";
  final String output = "A1A1D4C3";
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("B2"));
  stream = tokenFilterFactory("ConcatenateGraph",
      "tokenSeparator", ""
  ).create(stream);
  assertTokenStreamContents(stream, new String[]{output});
}
public void testPreserveSep() throws Exception {
final String input = "A1 B2 A1 D4 C3";
final String output = "A1A1D4C3";
@ -48,6 +65,7 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
TokenStream stream = tokenizer;
stream = new StopFilter(stream, StopFilter.makeStopSet("B2"));
stream = tokenFilterFactory("ConcatenateGraph",
Version.LUCENE_8_0_0,
"preserveSep", "false"
).create(stream);
assertTokenStreamContents(stream, new String[]{output});
@ -62,6 +80,7 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase {
TokenStream stream = tokenizer;
stream = new StopFilter(stream, StopFilter.makeStopSet("B2"));
stream = tokenFilterFactory("ConcatenateGraph",
"tokenSeparator", "\u001F",
"preservePositionIncrements", "false"
).create(stream);
assertTokenStreamContents(stream, new String[]{output.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
@ -80,4 +99,19 @@ public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTes
tokenFilterFactory("ConcatenateGraph", "bogusArg", "bogusValue"));
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
public void testSeparator() throws Exception {
  // A custom one-character tokenSeparator ("-") is used between surviving tokens.
  final String input = "A B C D E F J H";
  final String output = "B-C-F-H";
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("A", "D", "E", "J"));
  stream = tokenFilterFactory("ConcatenateGraph",
      "tokenSeparator", "-",
      "preservePositionIncrements", "false"
  ).create(stream);
  assertTokenStreamContents(stream, new String[]{output});
}
}