LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when preserve original is on.

This commit is contained in:
Adrien Grand 2016-10-06 10:56:43 +02:00
parent 36b3b0884a
commit 28d187acd1
3 changed files with 37 additions and 3 deletions

View File

@ -58,6 +58,9 @@ Bug Fixes
* LUCENE-7456: PerFieldPostings/DocValues was failing to delegate the
merge method (Julien MASSENET via Mike McCandless)
* LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when
preserve original is on. (David Causse via Adrien Grand)
Improvements
* LUCENE-7439: FuzzyQuery now matches all terms within the specified

View File

@ -134,9 +134,6 @@ public final class ASCIIFoldingFilter extends TokenFilter {
*/ */
public void foldToASCII(char[] input, int length) public void foldToASCII(char[] input, int length)
{ {
if (preserveOriginal) {
state = captureState();
}
// Worst-case length required: // Worst-case length required:
final int maxSizeNeeded = 4 * length; final int maxSizeNeeded = 4 * length;
if (output.length < maxSizeNeeded) { if (output.length < maxSizeNeeded) {
@ -144,6 +141,27 @@ public final class ASCIIFoldingFilter extends TokenFilter {
} }
outputPos = foldToASCII(input, 0, output, 0, length); outputPos = foldToASCII(input, 0, output, 0, length);
if (preserveOriginal && needToPreserve(input, length)) {
state = captureState();
}
}
/**
* Check if foldToASCII generated a different token.
* @param input original term
* @param inputLength length of the original term
* @return true if foldToASCII generated a different token
*/
private boolean needToPreserve(char[] input, int inputLength) {
if(inputLength != outputPos) {
return true;
}
for(int i = 0; i < inputLength; i++) {
if(input[i] != output[i]) {
return true;
}
}
return false;
} }
/** /**

View File

@ -131,6 +131,19 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
assertFalse(filter.incrementToken()); assertFalse(filter.incrementToken());
} }
// Test that we do not emit duplicated tokens when preserve original is on.
// For every term below the expected folded form equals the original term.
public void testUnmodifiedLetters() throws Exception {
  final String[] unchangedTerms = {"§", "¦", "¤", "END"};
  ASCIIFoldingFilter filter =
      new ASCIIFoldingFilter(whitespaceMockTokenizer("§ ¦ ¤ END"), true);
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  filter.reset();
  for (String term : unchangedTerms) {
    assertNextTerms(term, term, filter, termAtt);
  }
  // The stream must be exhausted once the four terms have been consumed.
  assertFalse(filter.incrementToken());
}
// The following Perl script generated the foldings[] array automatically // The following Perl script generated the foldings[] array automatically
// from ASCIIFoldingFilter.java: // from ASCIIFoldingFilter.java: