mirror of https://github.com/apache/lucene.git
LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when preserve original is on.
This commit is contained in:
parent
59d83f57e1
commit
739c0a7bf2
|
@ -23,6 +23,9 @@ Bug Fixes
|
|||
* LUCENE-7456: PerFieldPostings/DocValues was failing to delegate the
|
||||
merge method (Julien MASSENET via Mike McCandless)
|
||||
|
||||
* LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when
|
||||
preserve original is on. (David Causse via Adrien Grand)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
|
||||
|
|
|
@ -134,9 +134,6 @@ public final class ASCIIFoldingFilter extends TokenFilter {
|
|||
*/
|
||||
public void foldToASCII(char[] input, int length)
|
||||
{
|
||||
if (preserveOriginal) {
|
||||
state = captureState();
|
||||
}
|
||||
// Worst-case length required:
|
||||
final int maxSizeNeeded = 4 * length;
|
||||
if (output.length < maxSizeNeeded) {
|
||||
|
@ -144,6 +141,27 @@ public final class ASCIIFoldingFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
outputPos = foldToASCII(input, 0, output, 0, length);
|
||||
if (preserveOriginal && needToPreserve(input, length)) {
|
||||
state = captureState();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if foldToASCII generated a different token.
|
||||
* @param input original term
|
||||
* @param inputLength length of the original term
|
||||
* @return true if foldToASCII generated a different token
|
||||
*/
|
||||
private boolean needToPreserve(char[] input, int inputLength) {
|
||||
if(inputLength != outputPos) {
|
||||
return true;
|
||||
}
|
||||
for(int i = 0; i < inputLength; i++) {
|
||||
if(input[i] != output[i]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -131,6 +131,19 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
|
|||
assertFalse(filter.incrementToken());
|
||||
}
|
||||
|
||||
// Test that we do not emit duplicated tokens when preserve original is on
|
||||
public void testUnmodifiedLetters() throws Exception {
|
||||
TokenStream stream = whitespaceMockTokenizer("§ ¦ ¤ END");
|
||||
ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, true);
|
||||
|
||||
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
|
||||
filter.reset();
|
||||
assertNextTerms("§", "§", filter, termAtt);
|
||||
assertNextTerms("¦", "¦", filter, termAtt);
|
||||
assertNextTerms("¤", "¤", filter, termAtt);
|
||||
assertNextTerms("END", "END", filter, termAtt);
|
||||
assertFalse(filter.incrementToken());
|
||||
}
|
||||
|
||||
// The following Perl script generated the foldings[] array automatically
|
||||
// from ASCIIFoldingFilter.java:
|
||||
|
|
Loading…
Reference in New Issue