mirror of https://github.com/apache/lucene.git
LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when preserve original is on.
This commit is contained in:
parent
36b3b0884a
commit
28d187acd1
|
@ -58,6 +58,9 @@ Bug Fixes
|
||||||
* LUCENE-7456: PerFieldPostings/DocValues was failing to delegate the
|
* LUCENE-7456: PerFieldPostings/DocValues was failing to delegate the
|
||||||
merge method (Julien MASSENET via Mike McCandless)
|
merge method (Julien MASSENET via Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when
|
||||||
|
preserve original is on. (David Causse via Adrien Grand)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
|
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
|
||||||
|
|
|
@ -134,9 +134,6 @@ public final class ASCIIFoldingFilter extends TokenFilter {
|
||||||
*/
|
*/
|
||||||
public void foldToASCII(char[] input, int length)
|
public void foldToASCII(char[] input, int length)
|
||||||
{
|
{
|
||||||
if (preserveOriginal) {
|
|
||||||
state = captureState();
|
|
||||||
}
|
|
||||||
// Worst-case length required:
|
// Worst-case length required:
|
||||||
final int maxSizeNeeded = 4 * length;
|
final int maxSizeNeeded = 4 * length;
|
||||||
if (output.length < maxSizeNeeded) {
|
if (output.length < maxSizeNeeded) {
|
||||||
|
@ -144,6 +141,27 @@ public final class ASCIIFoldingFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
|
|
||||||
outputPos = foldToASCII(input, 0, output, 0, length);
|
outputPos = foldToASCII(input, 0, output, 0, length);
|
||||||
|
if (preserveOriginal && needToPreserve(input, length)) {
|
||||||
|
state = captureState();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if foldToASCII generated a different token.
|
||||||
|
* @param input original term
|
||||||
|
* @param inputLength length of the original term
|
||||||
|
* @return true if foldToASCII generated a different token
|
||||||
|
*/
|
||||||
|
private boolean needToPreserve(char[] input, int inputLength) {
|
||||||
|
if(inputLength != outputPos) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
for(int i = 0; i < inputLength; i++) {
|
||||||
|
if(input[i] != output[i]) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -131,6 +131,19 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
|
||||||
assertFalse(filter.incrementToken());
|
assertFalse(filter.incrementToken());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test that we do not emit duplicated tokens when preserve original is on
|
||||||
|
public void testUnmodifiedLetters() throws Exception {
|
||||||
|
TokenStream stream = whitespaceMockTokenizer("§ ¦ ¤ END");
|
||||||
|
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, true);
|
||||||
|
|
||||||
|
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
|
||||||
|
filter.reset();
|
||||||
|
assertNextTerms("§", "§", filter, termAtt);
|
||||||
|
assertNextTerms("¦", "¦", filter, termAtt);
|
||||||
|
assertNextTerms("¤", "¤", filter, termAtt);
|
||||||
|
assertNextTerms("END", "END", filter, termAtt);
|
||||||
|
assertFalse(filter.incrementToken());
|
||||||
|
}
|
||||||
|
|
||||||
// The following Perl script generated the foldings[] array automatically
|
// The following Perl script generated the foldings[] array automatically
|
||||||
// from ASCIIFoldingFilter.java:
|
// from ASCIIFoldingFilter.java:
|
||||||
|
|
Loading…
Reference in New Issue