From 39ed4bdef64e5ac3ea528e0f98f57276fed03312 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 18 Oct 2016 10:38:51 +0200 Subject: [PATCH] LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points. --- lucene/CHANGES.txt | 3 + .../analysis/core/DecimalDigitFilter.java | 2 +- .../analysis/core/TestDecimalDigitFilter.java | 151 ++++++++++++++++-- 3 files changed, 140 insertions(+), 16 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5a6ce61b0a9..6cc10a2f54e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -43,6 +43,9 @@ Bug Fixes dimensional points could cause unexpected merge exceptions (Hans Lund, Mike McCandless) +* LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points. + (Hossman) + Improvements * LUCENE-7439: FuzzyQuery now matches all terms within the specified diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java index b81d42fedc9..de459cfdf55 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java @@ -52,7 +52,7 @@ public final class DecimalDigitFilter extends TokenFilter { buffer[i] = (char) ('0' + Character.getNumericValue(ch)); // if the original was supplementary, shrink the string if (ch > 0xFFFF) { - length = StemmerUtil.delete(buffer, ++i, length); + length = StemmerUtil.delete(buffer, i+1, length); termAtt.setLength(length); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java index ae251936a5a..e5e18ef448e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java @@ -21,14 +21,42 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.TestUtil; +import java.util.Random; + +import org.junit.AfterClass; +import org.junit.BeforeClass; + /** * Tests for {@link DecimalDigitFilter} */ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase { private Analyzer tokenized; private Analyzer keyword; + + private static SparseFixedBitSet DECIMAL_DIGIT_CODEPOINTS; + + @BeforeClass + public static void init_DECIMAL_DIGIT_CODEPOINTS() { + DECIMAL_DIGIT_CODEPOINTS = new SparseFixedBitSet(Character.MAX_CODE_POINT); + for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) { + if (Character.isDigit(codepoint)) { + DECIMAL_DIGIT_CODEPOINTS.set(codepoint); + } + } + assert 0 < DECIMAL_DIGIT_CODEPOINTS.cardinality(); + } + + @AfterClass + public static void destroy_DECIMAL_DIGIT_CODEPOINTS() { + DECIMAL_DIGIT_CODEPOINTS = null; + } + @Override public void setUp() throws Exception { @@ -64,30 +92,83 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase { } /** - * test all digits in different locations of strings. + * test that double struck digits are normalized */ - public void testRandom() throws Exception { - for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) { - if (Character.isDigit(codepoint)) { - // add some a-z before/after the string - String prefix = TestUtil.randomSimpleString(random()); - String suffix = TestUtil.randomSimpleString(random()); + public void testDoubleStruck() throws Exception { + // MATHEMATICAL DOUBLE-STRUCK DIGIT ... 1, 9, 8, 4 + final String input = "𝟙 𝟡 𝟠 𝟜"; + final String expected = "1 9 8 4"; + checkOneTerm(keyword, input, expected); + checkOneTerm(keyword, input.replaceAll("\\s",""), expected.replaceAll("\\s","")); + } + + /** + * test sequences of digits mixed with other random simple string data + */ + public void testRandomSequences() throws Exception { + + // test numIters random strings containing a sequence of numDigits codepoints + final int numIters = atLeast(5); + for (int iter = 0; iter < numIters; iter++) { + final int numDigits = atLeast(20); + final StringBuilder expected = new StringBuilder(); + final StringBuilder actual = new StringBuilder(); + for (int digitCounter = 0; digitCounter < numDigits; digitCounter++) { - StringBuilder expected = new StringBuilder(); + // increased odds of 0 length random string prefix + final String prefix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random()); expected.append(prefix); + actual.append(prefix); + + int codepoint = getRandomDecimalDigit(random()); + int value = Character.getNumericValue(codepoint); assert value >= 0 && value <= 9; expected.append(Integer.toString(value)); - expected.append(suffix); - - StringBuilder actual = new StringBuilder(); - actual.append(prefix); actual.appendCodePoint(codepoint); - actual.append(suffix); - - checkOneTerm(keyword, actual.toString(), expected.toString()); } + // occasional suffix, increased odds of 0 length random string + final String suffix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random()); + expected.append(suffix); + actual.append(suffix); + + checkOneTerm(keyword, actual.toString(), expected.toString()); } + + } + + /** + * test each individual digit in different locations of strings. + */ + public void testRandom() throws Exception { + int numCodePointsChecked = 0; // sanity check + for (int codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(0); + codepoint != DocIdSetIterator.NO_MORE_DOCS; + codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(codepoint+1)) { + + assert Character.isDigit(codepoint); + + // add some a-z before/after the string + String prefix = TestUtil.randomSimpleString(random()); + String suffix = TestUtil.randomSimpleString(random()); + + StringBuilder expected = new StringBuilder(); + expected.append(prefix); + int value = Character.getNumericValue(codepoint); + assert value >= 0 && value <= 9; + expected.append(Integer.toString(value)); + expected.append(suffix); + + StringBuilder actual = new StringBuilder(); + actual.append(prefix); + actual.appendCodePoint(codepoint); + actual.append(suffix); + + checkOneTerm(keyword, actual.toString(), expected.toString()); + + numCodePointsChecked++; + } + assert DECIMAL_DIGIT_CODEPOINTS.cardinality() == numCodePointsChecked; } /** @@ -103,4 +184,44 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase { public void testRandomStrings() throws Exception { checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER); } + + /** returns a psuedo-random codepoint which is a Decimal Digit */ + public static int getRandomDecimalDigit(Random r) { + final int aprox = TestUtil.nextInt(r, 0, DECIMAL_DIGIT_CODEPOINTS.length()-1); + + if (DECIMAL_DIGIT_CODEPOINTS.get(aprox)) { // lucky guess + assert Character.isDigit(aprox); + return aprox; + } + + // seek up and down for closest set bit + final int lower = DECIMAL_DIGIT_CODEPOINTS.prevSetBit(aprox); + final int higher = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(aprox); + + // sanity check edge cases + if (lower < 0) { + assert higher != DocIdSetIterator.NO_MORE_DOCS; + assert Character.isDigit(higher); + return higher; + } + if (higher == DocIdSetIterator.NO_MORE_DOCS) { + assert 0 <= lower; + assert Character.isDigit(lower); + return lower; + } + + // which is closer? + final int cmp = Integer.compare(aprox - lower, higher - aprox); + + if (0 == cmp) { + // dead even, flip a coin + final int result = random().nextBoolean() ? lower : higher; + assert Character.isDigit(result); + return result; + } + + final int result = (cmp < 0) ? lower : higher; + assert Character.isDigit(result); + return result; + } }