LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points.

2016-10-18 10:38:51 +02:00 · 2016-10-18 10:38:51 +02:00 · 9260a1378f
parent c22725f0b5
commit 9260a1378f
3 changed files with 140 additions and 16 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -86,6 +86,9 @@ Bug Fixes
  dimensional points could cause unexpected merge exceptions (Hans
  Lund, Mike McCandless)

+* LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points.
+  (Hossman)
+
 Improvements

 * LUCENE-7439: FuzzyQuery now matches all terms within the specified
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
@ -52,7 +52,7 @@ public final class DecimalDigitFilter extends TokenFilter {
          buffer[i] = (char) ('0' + Character.getNumericValue(ch));
          // if the original was supplementary, shrink the string
          if (ch > 0xFFFF) {
-            length = StemmerUtil.delete(buffer, ++i, length);
+            length = StemmerUtil.delete(buffer, i+1, length);
            termAtt.setLength(length);
          }
        }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
@ -21,8 +21,17 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SparseFixedBitSet;
 import org.apache.lucene.util.TestUtil;

+import java.util.Random;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
 /**
 * Tests for {@link DecimalDigitFilter}
 */
@ -30,6 +39,25 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
  private Analyzer tokenized;
  private Analyzer keyword;

+  private static SparseFixedBitSet DECIMAL_DIGIT_CODEPOINTS;
+
+  @BeforeClass // class-level setup: builds the shared bitset of decimal digit codepoints
+  public static void init_DECIMAL_DIGIT_CODEPOINTS() {
+    DECIMAL_DIGIT_CODEPOINTS = new SparseFixedBitSet(Character.MAX_CODE_POINT);
+    for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) { // NOTE(review): bound is exclusive, so Character.MAX_CODE_POINT itself is never examined
+      if (Character.isDigit(codepoint)) { // bit index == codepoint value
+        DECIMAL_DIGIT_CODEPOINTS.set(codepoint);
+      }
+    }
+    assert 0 < DECIMAL_DIGIT_CODEPOINTS.cardinality(); // sanity check: at least one digit codepoint was recorded
+  }
+  
+  @AfterClass // class-level teardown, counterpart of init_DECIMAL_DIGIT_CODEPOINTS
+  public static void destroy_DECIMAL_DIGIT_CODEPOINTS() {
+    DECIMAL_DIGIT_CODEPOINTS = null; // drop the static reference (presumably so the bitset is not retained after this class finishes)
+  }
+
+  
  @Override
  public void setUp() throws Exception {
    super.setUp();
@ -64,11 +92,62 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
  }
  
  /**
-   * test all digits in different locations of strings.
+   * test that double struck digits are normalized
+   */
+  public void testDoubleStruck() throws Exception {
+    // MATHEMATICAL DOUBLE-STRUCK DIGIT ONE, NINE, EIGHT, FOUR: supplementary code points, separated by spaces
+    final String input = "𝟙 𝟡 𝟠 𝟜";
+    final String expected = "1 9 8 4";
+    checkOneTerm(keyword, input, expected); // digits separated by whitespace
+    checkOneTerm(keyword, input.replaceAll("\\s",""), expected.replaceAll("\\s","")); // same digits with all whitespace removed, i.e. back-to-back supplementary digits
+  }
+
+  /**
+   * test strings holding many random decimal digit codepoints, interleaved with random simple strings
+   */
+  public void testRandomSequences() throws Exception {
+    
+    // build numIters random strings, each containing numDigits decimal digit codepoints
+    final int numIters = atLeast(5);
+    for (int iter = 0; iter < numIters; iter++) {
+      final int numDigits = atLeast(20);
+      final StringBuilder expected = new StringBuilder(); // what the filter should produce: ASCII digits
+      final StringBuilder actual = new StringBuilder(); // despite the name, this is the INPUT handed to the analyzer
+      for (int digitCounter = 0; digitCounter < numDigits; digitCounter++) {
+        
+        // coin flip: either no prefix at all (so digits end up adjacent) or a random simple string
+        final String prefix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
+        expected.append(prefix);
+        actual.append(prefix);
+        
+        int codepoint = getRandomDecimalDigit(random());
+
+        int value = Character.getNumericValue(codepoint);
+        assert value >= 0 && value <= 9;
+        expected.append(Integer.toString(value)); // the ASCII form of the digit
+        actual.appendCodePoint(codepoint); // the original (possibly supplementary) digit
+      }
+      // coin flip again: either no suffix or a random simple string after the last digit
+      final String suffix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
+      expected.append(suffix);
+      actual.append(suffix);
+      
+      checkOneTerm(keyword, actual.toString(), expected.toString()); // argument order: analyzer, input, expected output
+    }
+
+  }
+  
+  /**
+   * test each individual digit in different locations of strings.
   */
  public void testRandom() throws Exception {
-    for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
-      if (Character.isDigit(codepoint)) {
+    int numCodePointsChecked = 0; // sanity check
+    for (int codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(0);
+         codepoint != DocIdSetIterator.NO_MORE_DOCS;
+         codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(codepoint+1)) {
+      
+      assert Character.isDigit(codepoint);
+      
      // add some a-z before/after the string
      String prefix = TestUtil.randomSimpleString(random());
      String suffix = TestUtil.randomSimpleString(random());
@ -86,8 +165,10 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
      actual.append(suffix);
      
      checkOneTerm(keyword, actual.toString(), expected.toString());
+      
+      numCodePointsChecked++;
    }
-    }
+    assert DECIMAL_DIGIT_CODEPOINTS.cardinality() == numCodePointsChecked;
  }
  
  /**
@ -103,4 +184,44 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER);
  }
+
+  /**
+   * Returns a pseudo-random codepoint which is a Decimal Digit: a random index into
+   * DECIMAL_DIGIT_CODEPOINTS is chosen, and the set bit closest to that index is returned.
+   *
+   * @param r source of all randomness used here, including the coin flip that breaks ties
+   * @return a codepoint that is set in DECIMAL_DIGIT_CODEPOINTS
+   */
+  public static int getRandomDecimalDigit(Random r) {
+    final int approx = TestUtil.nextInt(r, 0, DECIMAL_DIGIT_CODEPOINTS.length()-1);
+    
+    if (DECIMAL_DIGIT_CODEPOINTS.get(approx)) { // lucky guess
+      assert Character.isDigit(approx);
+      return approx;
+    }
+    
+    // seek up and down for closest set bit
+    final int lower = DECIMAL_DIGIT_CODEPOINTS.prevSetBit(approx);
+    final int higher = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(approx);
+    
+    // sanity check edge cases: no set bit on one side means the other side wins outright
+    if (lower < 0) {
+      assert higher != DocIdSetIterator.NO_MORE_DOCS;
+      assert Character.isDigit(higher);
+      return higher;
+    }
+    if (higher == DocIdSetIterator.NO_MORE_DOCS) {
+      assert 0 <= lower;
+      assert Character.isDigit(lower);
+      return lower;
+    }
+    
+    // which is closer? if dead even, flip a coin -- using the caller's Random (not random())
+    // so that the result depends only on the Random that was passed in
+    final int cmp = Integer.compare(approx - lower, higher - approx);
+    final boolean pickLower = (0 == cmp) ? r.nextBoolean() : (cmp < 0);
+    final int result = pickLower ? lower : higher;
+    assert Character.isDigit(result);
+    return result;
+  }
 }