mirror of https://github.com/apache/lucene.git
LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points.
This commit is contained in:
parent
30c9f4311a
commit
39ed4bdef6
|
@ -43,6 +43,9 @@ Bug Fixes
|
||||||
dimensional points could cause unexpected merge exceptions (Hans
|
dimensional points could cause unexpected merge exceptions (Hans
|
||||||
Lund, Mike McCandless)
|
Lund, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-7363: Fixed DecimalDigitFilter in case of supplementary code points.
|
||||||
|
(Hossman)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
|
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
|
||||||
|
|
|
@ -52,7 +52,7 @@ public final class DecimalDigitFilter extends TokenFilter {
|
||||||
buffer[i] = (char) ('0' + Character.getNumericValue(ch));
|
buffer[i] = (char) ('0' + Character.getNumericValue(ch));
|
||||||
// if the original was supplementary, shrink the string
|
// if the original was supplementary, shrink the string
|
||||||
if (ch > 0xFFFF) {
|
if (ch > 0xFFFF) {
|
||||||
length = StemmerUtil.delete(buffer, ++i, length);
|
length = StemmerUtil.delete(buffer, i+1, length);
|
||||||
termAtt.setLength(length);
|
termAtt.setLength(length);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,14 +21,42 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.SparseFixedBitSet;
|
||||||
import org.apache.lucene.util.TestUtil;
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
import org.junit.AfterClass;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests for {@link DecimalDigitFilter}
|
* Tests for {@link DecimalDigitFilter}
|
||||||
*/
|
*/
|
||||||
public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
|
public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
|
||||||
private Analyzer tokenized;
|
private Analyzer tokenized;
|
||||||
private Analyzer keyword;
|
private Analyzer keyword;
|
||||||
|
|
||||||
|
private static SparseFixedBitSet DECIMAL_DIGIT_CODEPOINTS;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void init_DECIMAL_DIGIT_CODEPOINTS() {
|
||||||
|
DECIMAL_DIGIT_CODEPOINTS = new SparseFixedBitSet(Character.MAX_CODE_POINT);
|
||||||
|
for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
|
||||||
|
if (Character.isDigit(codepoint)) {
|
||||||
|
DECIMAL_DIGIT_CODEPOINTS.set(codepoint);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert 0 < DECIMAL_DIGIT_CODEPOINTS.cardinality();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterClass
|
||||||
|
public static void destroy_DECIMAL_DIGIT_CODEPOINTS() {
|
||||||
|
DECIMAL_DIGIT_CODEPOINTS = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
|
@ -64,30 +92,83 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* test all digits in different locations of strings.
|
* test that double struck digits are normalized
|
||||||
*/
|
*/
|
||||||
public void testRandom() throws Exception {
|
public void testDoubleStruck() throws Exception {
|
||||||
for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
|
// MATHEMATICAL DOUBLE-STRUCK DIGIT ... 1, 9, 8, 4
|
||||||
if (Character.isDigit(codepoint)) {
|
final String input = "𝟙 𝟡 𝟠 𝟜";
|
||||||
// add some a-z before/after the string
|
final String expected = "1 9 8 4";
|
||||||
String prefix = TestUtil.randomSimpleString(random());
|
checkOneTerm(keyword, input, expected);
|
||||||
String suffix = TestUtil.randomSimpleString(random());
|
checkOneTerm(keyword, input.replaceAll("\\s",""), expected.replaceAll("\\s",""));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test sequences of digits mixed with other random simple string data
|
||||||
|
*/
|
||||||
|
public void testRandomSequences() throws Exception {
|
||||||
|
|
||||||
|
// test numIters random strings containing a sequence of numDigits codepoints
|
||||||
|
final int numIters = atLeast(5);
|
||||||
|
for (int iter = 0; iter < numIters; iter++) {
|
||||||
|
final int numDigits = atLeast(20);
|
||||||
|
final StringBuilder expected = new StringBuilder();
|
||||||
|
final StringBuilder actual = new StringBuilder();
|
||||||
|
for (int digitCounter = 0; digitCounter < numDigits; digitCounter++) {
|
||||||
|
|
||||||
StringBuilder expected = new StringBuilder();
|
// increased odds of 0 length random string prefix
|
||||||
|
final String prefix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
|
||||||
expected.append(prefix);
|
expected.append(prefix);
|
||||||
|
actual.append(prefix);
|
||||||
|
|
||||||
|
int codepoint = getRandomDecimalDigit(random());
|
||||||
|
|
||||||
int value = Character.getNumericValue(codepoint);
|
int value = Character.getNumericValue(codepoint);
|
||||||
assert value >= 0 && value <= 9;
|
assert value >= 0 && value <= 9;
|
||||||
expected.append(Integer.toString(value));
|
expected.append(Integer.toString(value));
|
||||||
expected.append(suffix);
|
|
||||||
|
|
||||||
StringBuilder actual = new StringBuilder();
|
|
||||||
actual.append(prefix);
|
|
||||||
actual.appendCodePoint(codepoint);
|
actual.appendCodePoint(codepoint);
|
||||||
actual.append(suffix);
|
|
||||||
|
|
||||||
checkOneTerm(keyword, actual.toString(), expected.toString());
|
|
||||||
}
|
}
|
||||||
|
// occasional suffix, increased odds of 0 length random string
|
||||||
|
final String suffix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
|
||||||
|
expected.append(suffix);
|
||||||
|
actual.append(suffix);
|
||||||
|
|
||||||
|
checkOneTerm(keyword, actual.toString(), expected.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test each individual digit in different locations of strings.
|
||||||
|
*/
|
||||||
|
public void testRandom() throws Exception {
|
||||||
|
int numCodePointsChecked = 0; // sanity check
|
||||||
|
for (int codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(0);
|
||||||
|
codepoint != DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(codepoint+1)) {
|
||||||
|
|
||||||
|
assert Character.isDigit(codepoint);
|
||||||
|
|
||||||
|
// add some a-z before/after the string
|
||||||
|
String prefix = TestUtil.randomSimpleString(random());
|
||||||
|
String suffix = TestUtil.randomSimpleString(random());
|
||||||
|
|
||||||
|
StringBuilder expected = new StringBuilder();
|
||||||
|
expected.append(prefix);
|
||||||
|
int value = Character.getNumericValue(codepoint);
|
||||||
|
assert value >= 0 && value <= 9;
|
||||||
|
expected.append(Integer.toString(value));
|
||||||
|
expected.append(suffix);
|
||||||
|
|
||||||
|
StringBuilder actual = new StringBuilder();
|
||||||
|
actual.append(prefix);
|
||||||
|
actual.appendCodePoint(codepoint);
|
||||||
|
actual.append(suffix);
|
||||||
|
|
||||||
|
checkOneTerm(keyword, actual.toString(), expected.toString());
|
||||||
|
|
||||||
|
numCodePointsChecked++;
|
||||||
|
}
|
||||||
|
assert DECIMAL_DIGIT_CODEPOINTS.cardinality() == numCodePointsChecked;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -103,4 +184,44 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
|
||||||
public void testRandomStrings() throws Exception {
|
public void testRandomStrings() throws Exception {
|
||||||
checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER);
|
checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** returns a psuedo-random codepoint which is a Decimal Digit */
|
||||||
|
public static int getRandomDecimalDigit(Random r) {
|
||||||
|
final int aprox = TestUtil.nextInt(r, 0, DECIMAL_DIGIT_CODEPOINTS.length()-1);
|
||||||
|
|
||||||
|
if (DECIMAL_DIGIT_CODEPOINTS.get(aprox)) { // lucky guess
|
||||||
|
assert Character.isDigit(aprox);
|
||||||
|
return aprox;
|
||||||
|
}
|
||||||
|
|
||||||
|
// seek up and down for closest set bit
|
||||||
|
final int lower = DECIMAL_DIGIT_CODEPOINTS.prevSetBit(aprox);
|
||||||
|
final int higher = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(aprox);
|
||||||
|
|
||||||
|
// sanity check edge cases
|
||||||
|
if (lower < 0) {
|
||||||
|
assert higher != DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
assert Character.isDigit(higher);
|
||||||
|
return higher;
|
||||||
|
}
|
||||||
|
if (higher == DocIdSetIterator.NO_MORE_DOCS) {
|
||||||
|
assert 0 <= lower;
|
||||||
|
assert Character.isDigit(lower);
|
||||||
|
return lower;
|
||||||
|
}
|
||||||
|
|
||||||
|
// which is closer?
|
||||||
|
final int cmp = Integer.compare(aprox - lower, higher - aprox);
|
||||||
|
|
||||||
|
if (0 == cmp) {
|
||||||
|
// dead even, flip a coin
|
||||||
|
final int result = random().nextBoolean() ? lower : higher;
|
||||||
|
assert Character.isDigit(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
final int result = (cmp < 0) ? lower : higher;
|
||||||
|
assert Character.isDigit(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue