mirror of https://github.com/apache/lucene.git
LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points.
This commit is contained in:
parent
c22725f0b5
commit
9260a1378f
|
@ -86,6 +86,9 @@ Bug Fixes
|
|||
dimensional points could cause unexpected merge exceptions (Hans
|
||||
Lund, Mike McCandless)
|
||||
|
||||
* LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points.
|
||||
(Hossman)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
|
||||
|
|
|
@ -52,7 +52,7 @@ public final class DecimalDigitFilter extends TokenFilter {
|
|||
buffer[i] = (char) ('0' + Character.getNumericValue(ch));
|
||||
// if the original was supplementary, shrink the string
|
||||
if (ch > 0xFFFF) {
|
||||
length = StemmerUtil.delete(buffer, ++i, length);
|
||||
length = StemmerUtil.delete(buffer, i+1, length);
|
||||
termAtt.setLength(length);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,14 +21,42 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.SparseFixedBitSet;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
/**
|
||||
* Tests for {@link DecimalDigitFilter}
|
||||
*/
|
||||
public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer tokenized;
|
||||
private Analyzer keyword;
|
||||
|
||||
private static SparseFixedBitSet DECIMAL_DIGIT_CODEPOINTS;
|
||||
|
||||
@BeforeClass
|
||||
public static void init_DECIMAL_DIGIT_CODEPOINTS() {
|
||||
DECIMAL_DIGIT_CODEPOINTS = new SparseFixedBitSet(Character.MAX_CODE_POINT);
|
||||
for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
|
||||
if (Character.isDigit(codepoint)) {
|
||||
DECIMAL_DIGIT_CODEPOINTS.set(codepoint);
|
||||
}
|
||||
}
|
||||
assert 0 < DECIMAL_DIGIT_CODEPOINTS.cardinality();
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void destroy_DECIMAL_DIGIT_CODEPOINTS() {
|
||||
DECIMAL_DIGIT_CODEPOINTS = null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
|
@ -64,30 +92,83 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
/**
|
||||
* test all digits in different locations of strings.
|
||||
* test that double struck digits are normalized
|
||||
*/
|
||||
public void testRandom() throws Exception {
|
||||
for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
|
||||
if (Character.isDigit(codepoint)) {
|
||||
// add some a-z before/after the string
|
||||
String prefix = TestUtil.randomSimpleString(random());
|
||||
String suffix = TestUtil.randomSimpleString(random());
|
||||
public void testDoubleStruck() throws Exception {
|
||||
// MATHEMATICAL DOUBLE-STRUCK DIGIT ... 1, 9, 8, 4
|
||||
final String input = "𝟙 𝟡 𝟠 𝟜";
|
||||
final String expected = "1 9 8 4";
|
||||
checkOneTerm(keyword, input, expected);
|
||||
checkOneTerm(keyword, input.replaceAll("\\s",""), expected.replaceAll("\\s",""));
|
||||
}
|
||||
|
||||
/**
|
||||
* test sequences of digits mixed with other random simple string data
|
||||
*/
|
||||
public void testRandomSequences() throws Exception {
|
||||
|
||||
// test numIters random strings containing a sequence of numDigits codepoints
|
||||
final int numIters = atLeast(5);
|
||||
for (int iter = 0; iter < numIters; iter++) {
|
||||
final int numDigits = atLeast(20);
|
||||
final StringBuilder expected = new StringBuilder();
|
||||
final StringBuilder actual = new StringBuilder();
|
||||
for (int digitCounter = 0; digitCounter < numDigits; digitCounter++) {
|
||||
|
||||
StringBuilder expected = new StringBuilder();
|
||||
// increased odds of 0 length random string prefix
|
||||
final String prefix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
|
||||
expected.append(prefix);
|
||||
actual.append(prefix);
|
||||
|
||||
int codepoint = getRandomDecimalDigit(random());
|
||||
|
||||
int value = Character.getNumericValue(codepoint);
|
||||
assert value >= 0 && value <= 9;
|
||||
expected.append(Integer.toString(value));
|
||||
expected.append(suffix);
|
||||
|
||||
StringBuilder actual = new StringBuilder();
|
||||
actual.append(prefix);
|
||||
actual.appendCodePoint(codepoint);
|
||||
actual.append(suffix);
|
||||
|
||||
checkOneTerm(keyword, actual.toString(), expected.toString());
|
||||
}
|
||||
// occasional suffix, increased odds of 0 length random string
|
||||
final String suffix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
|
||||
expected.append(suffix);
|
||||
actual.append(suffix);
|
||||
|
||||
checkOneTerm(keyword, actual.toString(), expected.toString());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* test each individual digit in different locations of strings.
|
||||
*/
|
||||
public void testRandom() throws Exception {
|
||||
int numCodePointsChecked = 0; // sanity check
|
||||
for (int codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(0);
|
||||
codepoint != DocIdSetIterator.NO_MORE_DOCS;
|
||||
codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(codepoint+1)) {
|
||||
|
||||
assert Character.isDigit(codepoint);
|
||||
|
||||
// add some a-z before/after the string
|
||||
String prefix = TestUtil.randomSimpleString(random());
|
||||
String suffix = TestUtil.randomSimpleString(random());
|
||||
|
||||
StringBuilder expected = new StringBuilder();
|
||||
expected.append(prefix);
|
||||
int value = Character.getNumericValue(codepoint);
|
||||
assert value >= 0 && value <= 9;
|
||||
expected.append(Integer.toString(value));
|
||||
expected.append(suffix);
|
||||
|
||||
StringBuilder actual = new StringBuilder();
|
||||
actual.append(prefix);
|
||||
actual.appendCodePoint(codepoint);
|
||||
actual.append(suffix);
|
||||
|
||||
checkOneTerm(keyword, actual.toString(), expected.toString());
|
||||
|
||||
numCodePointsChecked++;
|
||||
}
|
||||
assert DECIMAL_DIGIT_CODEPOINTS.cardinality() == numCodePointsChecked;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -103,4 +184,44 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
|
|||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
/** returns a psuedo-random codepoint which is a Decimal Digit */
|
||||
public static int getRandomDecimalDigit(Random r) {
|
||||
final int aprox = TestUtil.nextInt(r, 0, DECIMAL_DIGIT_CODEPOINTS.length()-1);
|
||||
|
||||
if (DECIMAL_DIGIT_CODEPOINTS.get(aprox)) { // lucky guess
|
||||
assert Character.isDigit(aprox);
|
||||
return aprox;
|
||||
}
|
||||
|
||||
// seek up and down for closest set bit
|
||||
final int lower = DECIMAL_DIGIT_CODEPOINTS.prevSetBit(aprox);
|
||||
final int higher = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(aprox);
|
||||
|
||||
// sanity check edge cases
|
||||
if (lower < 0) {
|
||||
assert higher != DocIdSetIterator.NO_MORE_DOCS;
|
||||
assert Character.isDigit(higher);
|
||||
return higher;
|
||||
}
|
||||
if (higher == DocIdSetIterator.NO_MORE_DOCS) {
|
||||
assert 0 <= lower;
|
||||
assert Character.isDigit(lower);
|
||||
return lower;
|
||||
}
|
||||
|
||||
// which is closer?
|
||||
final int cmp = Integer.compare(aprox - lower, higher - aprox);
|
||||
|
||||
if (0 == cmp) {
|
||||
// dead even, flip a coin
|
||||
final int result = random().nextBoolean() ? lower : higher;
|
||||
assert Character.isDigit(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
final int result = (cmp < 0) ? lower : higher;
|
||||
assert Character.isDigit(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue