LUCENE-8265: WordDelimiter*Filter ignores keywords

Michael Sokolov 2018-04-22 20:41:08 +00:00 committed by Mike McCandless
parent 4fba55c864
commit fc0878cc2f
4 changed files with 90 additions and 16 deletions
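
In short: both WordDelimiterFilter and WordDelimiterGraphFilter gain an IGNORE_KEYWORDS flag (512). When it is set, tokens already marked with KeywordAttribute#isKeyword()=true are passed through unchanged instead of being split. A minimal sketch of the new behavior, assuming a SetKeywordMarkerFilter ahead of the delimiter filter; the class name, keyword set, and sample text are illustrative, not part of this commit:

import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IgnoreKeywordsDemo {
  public static void main(String[] args) throws Exception {
    // Terms in this set get KeywordAttribute#isKeyword()=true; with
    // IGNORE_KEYWORDS set, the delimiter filter passes them through untouched.
    CharArraySet keywords = new CharArraySet(Arrays.asList("wi-fi"), true);
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        TokenStream ts = new SetKeywordMarkerFilter(tokenizer, keywords);
        ts = new WordDelimiterGraphFilter(ts,
            WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.IGNORE_KEYWORDS,
            null);
        return new TokenStreamComponents(tokenizer, ts);
      }
    };
    try (TokenStream ts = analyzer.tokenStream("field", "wi-fi router")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // "wi-fi", then "router"
      }
      ts.end();
    }
    analyzer.close();
  }
}

Without IGNORE_KEYWORDS, the same chain would emit "wi", "fi", "router".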

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java

@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@@ -164,7 +165,12 @@ public final class WordDelimiterFilter extends TokenFilter {
    * "O'Neil's" => "O", "Neil"
    */
   public static final int STEM_ENGLISH_POSSESSIVE = 256;
+
+  /**
+   * Suppresses processing of terms with {@link KeywordAttribute#isKeyword()}=true.
+   */
+  public static final int IGNORE_KEYWORDS = 512;
 
   /**
    * If not null is the set of tokens to protect from being delimited
    *
@@ -174,6 +180,7 @@ public final class WordDelimiterFilter extends TokenFilter {
   private final int flags;
   private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
   private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
   private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
@@ -243,7 +250,9 @@ public final class WordDelimiterFilter extends TokenFilter {
         if (!input.incrementToken()) {
           return false;
         }
+        if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
+          return true;
+        }
         int termLength = termAttribute.length();
         char[] termBuffer = termAttribute.buffer();

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -39,7 +40,7 @@ import org.apache.lucene.util.RamUsageEstimator;
  * work correctly when this filter is used in the search-time analyzer. Unlike
  * the deprecated {@link WordDelimiterFilter}, this token filter produces a
  * correct token graph as output. However, it cannot consume an input token
- * graph correctly.
+ * graph correctly. Processing of terms with {@link KeywordAttribute#isKeyword()}=true is suppressed when the {@link #IGNORE_KEYWORDS} flag is set.
  *
  * <p>
  * Words are split into subwords with the following rules:
@@ -156,7 +157,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
    * "O'Neil's" =&gt; "O", "Neil"
    */
   public static final int STEM_ENGLISH_POSSESSIVE = 256;
+
+  /**
+   * Suppresses processing of terms with {@link KeywordAttribute#isKeyword()}=true.
+   */
+  public static final int IGNORE_KEYWORDS = 512;
 
   /**
    * If not null is the set of tokens to protect from being delimited
    *
@@ -174,6 +180,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
   private char[][] bufferedTermParts = new char[4][];
   private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
   private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
   private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
@@ -225,7 +232,8 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
                    PRESERVE_ORIGINAL |
                    SPLIT_ON_CASE_CHANGE |
                    SPLIT_ON_NUMERICS |
-                   STEM_ENGLISH_POSSESSIVE)) != 0) {
+                   STEM_ENGLISH_POSSESSIVE |
+                   IGNORE_KEYWORDS)) != 0) {
       throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags);
     }
     this.flags = configurationFlags;
@@ -335,7 +343,9 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
       if (input.incrementToken() == false) {
         return false;
       }
+      if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
+        return true;
+      }
       int termLength = termAttribute.length();
       char[] termBuffer = termAttribute.buffer();

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java

@@ -27,7 +27,6 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.IOUtils;
-import org.junit.Test;
 
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
@@ -57,7 +56,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
   }
   ***/
 
-  @Test
   public void testOffsets() throws IOException {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     // test that subwords and catenated subwords have
@@ -77,7 +75,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         new int[] { 6, 6, 6 });
   }
 
-  @Test
   public void testOffsetChange() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -88,7 +85,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         new int[] { 15 });
   }
 
-  @Test
   public void testOffsetChange2() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -99,7 +95,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         new int[] { 17 });
   }
 
-  @Test
   public void testOffsetChange3() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -110,7 +105,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         new int[] { 16 });
   }
 
-  @Test
   public void testOffsetChange4() throws Exception {
     int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -129,7 +123,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     assertTokenStreamContents(wdf, output);
   }
 
-  @Test
   public void testSplits() throws Exception {
     doSplit("basic-split","basic","split");
     doSplit("camelCase","camel","Case");
@@ -175,7 +168,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
   /*
    * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
    */
-  @Test
   public void testPossessives() throws Exception {
     doSplitPossessive(1, "ra's", "ra");
     doSplitPossessive(0, "ra's", "ra", "s");
@@ -204,7 +196,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     }
   }
 
-  @Test
   public void testPositionIncrements() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
     final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
@@ -323,6 +314,38 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
     IOUtils.close(a, a2, a3);
   }
 
+  public void testKeywordFilter() throws Exception {
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS),
+        "abc-def klm-nop kpop",
+        new String[] {"abc", "def", "klm", "nop", "kpop"});
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
+        "abc-def klm-nop kpop",
+        new String[] {"abc", "def", "klm-nop", "kpop"},
+        new int[]{0, 4, 8, 16},
+        new int[]{3, 7, 15, 20},
+        null,
+        new int[]{1, 1, 1, 1},
+        null,
+        false);
+  }
+
+  private Analyzer keywordTestAnalyzer(int flags) throws Exception {
+    return new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) {
+          private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+          @Override public boolean isKeyword() {
+            // Marks terms starting with the letter 'k' as keywords
+            return term.toString().charAt(0) == 'k';
+          }
+        };
+        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(kFilter, flags, null));
+      }
+    };
+  }
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
@@ -346,7 +369,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
         false);
     a.close();
   }
 
   /** concat numbers + words + all + preserve original */
   public void testLotsOfConcatenating2() throws Exception {
     final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java

@@ -309,6 +309,38 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
     IOUtils.close(a, a2, a3);
   }
 
+  public void testKeywordFilter() throws Exception {
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS),
+        "abc-def klm-nop kpop",
+        new String[] {"abc", "def", "klm", "nop", "kpop"});
+    assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
+        "abc-def klm-nop kpop",
+        new String[] {"abc", "def", "klm-nop", "kpop"},
+        new int[]{0, 4, 8, 16},
+        new int[]{3, 7, 15, 20},
+        null,
+        new int[]{1, 1, 1, 1},
+        null,
+        false);
+  }
+
+  private Analyzer keywordTestAnalyzer(int flags) throws Exception {
+    return new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) {
+          private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+          @Override public boolean isKeyword() {
+            // Marks terms starting with the letter 'k' as keywords
+            return term.toString().charAt(0) == 'k';
+          }
+        };
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(kFilter, flags, null));
+      }
+    };
+  }
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
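
A usage note: the tests above mark keywords by subclassing KeywordMarkerFilter inline. In application code the stock markers, SetKeywordMarkerFilter for a fixed term set and PatternKeywordMarkerFilter for a regular expression, do the same job. Below is a sketch equivalent to the tests' "starts with 'k'" rule; the class and method names are illustrative only:

import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

public class KeywordMarkingSketch {
  // Marks every term matching the pattern (here: starting with 'k') as a
  // keyword, mirroring the anonymous KeywordMarkerFilter in the tests; with
  // IGNORE_KEYWORDS among the flags, WordDelimiterGraphFilter leaves those
  // terms intact.
  static TokenStream buildChain(Tokenizer tokenizer, int flags) {
    TokenStream marked = new PatternKeywordMarkerFilter(tokenizer, Pattern.compile("k.*"));
    return new WordDelimiterGraphFilter(marked, flags, null);
  }
}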