LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points.

This commit is contained in:
Adrien Grand 2016-10-18 10:38:51 +02:00
parent 30c9f4311a
commit 39ed4bdef6
3 changed files with 140 additions and 16 deletions

View File

@ -43,6 +43,9 @@ Bug Fixes
dimensional points could cause unexpected merge exceptions (Hans
Lund, Mike McCandless)
* LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points.
(Hossman)
Improvements
* LUCENE-7439: FuzzyQuery now matches all terms within the specified

View File

@ -52,7 +52,7 @@ public final class DecimalDigitFilter extends TokenFilter {
buffer[i] = (char) ('0' + Character.getNumericValue(ch));
// if the original was supplementary, shrink the string
if (ch > 0xFFFF) {
length = StemmerUtil.delete(buffer, ++i, length);
length = StemmerUtil.delete(buffer, i+1, length);
termAtt.setLength(length);
}
}

View File

@ -21,8 +21,17 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SparseFixedBitSet;
import org.apache.lucene.util.TestUtil;
import java.util.Random;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/**
* Tests for {@link DecimalDigitFilter}
*/
@ -30,6 +39,25 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
private Analyzer tokenized;
private Analyzer keyword;
private static SparseFixedBitSet DECIMAL_DIGIT_CODEPOINTS;
@BeforeClass
public static void init_DECIMAL_DIGIT_CODEPOINTS() {
DECIMAL_DIGIT_CODEPOINTS = new SparseFixedBitSet(Character.MAX_CODE_POINT);
for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
if (Character.isDigit(codepoint)) {
DECIMAL_DIGIT_CODEPOINTS.set(codepoint);
}
}
assert 0 < DECIMAL_DIGIT_CODEPOINTS.cardinality();
}
@AfterClass
public static void destroy_DECIMAL_DIGIT_CODEPOINTS() {
DECIMAL_DIGIT_CODEPOINTS = null;
}
@Override
public void setUp() throws Exception {
super.setUp();
@ -64,11 +92,62 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
}
/**
* test all digits in different locations of strings.
* test that double struck digits are normalized
*/
public void testDoubleStruck() throws Exception {
// MATHEMATICAL DOUBLE-STRUCK DIGIT ... 1, 9, 8, 4
final String input = "𝟙 𝟡 𝟠 𝟜";
final String expected = "1 9 8 4";
checkOneTerm(keyword, input, expected);
checkOneTerm(keyword, input.replaceAll("\\s",""), expected.replaceAll("\\s",""));
}
/**
* test sequences of digits mixed with other random simple string data
*/
public void testRandomSequences() throws Exception {
// test numIters random strings containing a sequence of numDigits codepoints
final int numIters = atLeast(5);
for (int iter = 0; iter < numIters; iter++) {
final int numDigits = atLeast(20);
final StringBuilder expected = new StringBuilder();
final StringBuilder actual = new StringBuilder();
for (int digitCounter = 0; digitCounter < numDigits; digitCounter++) {
// increased odds of 0 length random string prefix
final String prefix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
expected.append(prefix);
actual.append(prefix);
int codepoint = getRandomDecimalDigit(random());
int value = Character.getNumericValue(codepoint);
assert value >= 0 && value <= 9;
expected.append(Integer.toString(value));
actual.appendCodePoint(codepoint);
}
// occasional suffix, increased odds of 0 length random string
final String suffix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
expected.append(suffix);
actual.append(suffix);
checkOneTerm(keyword, actual.toString(), expected.toString());
}
}
/**
* test each individual digit in different locations of strings.
*/
public void testRandom() throws Exception {
for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
if (Character.isDigit(codepoint)) {
int numCodePointsChecked = 0; // sanity check
for (int codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(0);
codepoint != DocIdSetIterator.NO_MORE_DOCS;
codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(codepoint+1)) {
assert Character.isDigit(codepoint);
// add some a-z before/after the string
String prefix = TestUtil.randomSimpleString(random());
String suffix = TestUtil.randomSimpleString(random());
@ -86,8 +165,10 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
actual.append(suffix);
checkOneTerm(keyword, actual.toString(), expected.toString());
numCodePointsChecked++;
}
}
assert DECIMAL_DIGIT_CODEPOINTS.cardinality() == numCodePointsChecked;
}
/**
@ -103,4 +184,44 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER);
}
/** returns a psuedo-random codepoint which is a Decimal Digit */
public static int getRandomDecimalDigit(Random r) {
final int aprox = TestUtil.nextInt(r, 0, DECIMAL_DIGIT_CODEPOINTS.length()-1);
if (DECIMAL_DIGIT_CODEPOINTS.get(aprox)) { // lucky guess
assert Character.isDigit(aprox);
return aprox;
}
// seek up and down for closest set bit
final int lower = DECIMAL_DIGIT_CODEPOINTS.prevSetBit(aprox);
final int higher = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(aprox);
// sanity check edge cases
if (lower < 0) {
assert higher != DocIdSetIterator.NO_MORE_DOCS;
assert Character.isDigit(higher);
return higher;
}
if (higher == DocIdSetIterator.NO_MORE_DOCS) {
assert 0 <= lower;
assert Character.isDigit(lower);
return lower;
}
// which is closer?
final int cmp = Integer.compare(aprox - lower, higher - aprox);
if (0 == cmp) {
// dead even, flip a coin
final int result = random().nextBoolean() ? lower : higher;
assert Character.isDigit(result);
return result;
}
final int result = (cmp < 0) ? lower : higher;
assert Character.isDigit(result);
return result;
}
}