SOLR-1593: fix reverse wildcard filter for surrogate pairs

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@883386 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2009-11-23 16:09:05 +00:00
parent 0d639dad03
commit dc18c42b6b
4 changed files with 89 additions and 17 deletions

View File

@ -74,6 +74,11 @@ Bug Fixes
fl=score to the parameter list instead of appending score to the
existing field list. (yonik)
* SOLR-1593: ReverseWildcardFilter didn't work for surrogate pairs
(i.e. code points outside of the BMP), resulting in incorrect
matching. This change requires reindexing for any content with
such characters. (Robert Muir, yonik)
Other Changes
----------------------

View File

@ -20,7 +20,6 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@ -73,13 +72,79 @@ public class ReversedWildcardFilter extends TokenFilter {
}
char [] buffer = termAtt.resizeTermBuffer(oldLen + 1);
buffer[oldLen] = markerChar;
//String reversed = reverseAndMark(value, markerChar);
ReverseStringFilter.reverse(buffer, oldLen + 1);
reverse(buffer, 0, oldLen + 1);
posAtt.setPositionIncrement(origOffset);
termAtt.setTermBuffer(buffer, 0, oldLen +1);
return true;
}
/**
* Partially reverses the given input buffer in-place from the given offset
* up to the given length, keeping surrogate pairs in the correct (non-reversed) order.
* @param buffer the input char array to reverse
* @param start the offset from where to reverse the buffer
* @param len the length in the buffer up to where the
* buffer should be reversed
*/
public static void reverse(final char[] buffer, final int start, final int len) {
/* modified version of Apache Harmony AbstractStringBuilder reverse0() */
if (len < 2)
return;
int end = (start + len) - 1;
char frontHigh = buffer[start];
char endLow = buffer[end];
boolean allowFrontSur = true, allowEndSur = true;
final int mid = start + (len >> 1);
for (int i = start; i < mid; ++i, --end) {
final char frontLow = buffer[i + 1];
final char endHigh = buffer[end - 1];
final boolean surAtFront = allowFrontSur
&& Character.isSurrogatePair(frontHigh, frontLow);
if (surAtFront && (len < 3)) {
// nothing to do since surAtFront is allowed and 1 char left
return;
}
final boolean surAtEnd = allowEndSur
&& Character.isSurrogatePair(endHigh, endLow);
allowFrontSur = allowEndSur = true;
if (surAtFront == surAtEnd) {
if (surAtFront) {
// both surrogates
buffer[end] = frontLow;
buffer[--end] = frontHigh;
buffer[i] = endHigh;
buffer[++i] = endLow;
frontHigh = buffer[i + 1];
endLow = buffer[end - 1];
} else {
// neither surrogates
buffer[end] = frontHigh;
buffer[i] = endLow;
frontHigh = frontLow;
endLow = endHigh;
}
} else {
if (surAtFront) {
// surrogate only at the front
buffer[end] = frontLow;
buffer[i] = endLow;
endLow = endHigh;
allowFrontSur = false;
} else {
// surrogate only at the end
buffer[end] = frontHigh;
buffer[i] = endHigh;
frontHigh = frontLow;
allowEndSur = false;
}
}
}
if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) {
// only if odd length
buffer[end] = allowFrontSur ? endLow : frontHigh;
}
}
}

View File

@ -27,10 +27,7 @@ import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.solr.analysis.ReversedWildcardFilter;
import org.apache.solr.analysis.ReversedWildcardFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.*;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
@ -193,7 +190,12 @@ public class SolrQueryParser extends QueryParser {
String type = schema.getFieldType(field).getTypeName();
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
if (factory != null && factory.shouldReverse(termStr)) {
termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar());
int len = termStr.length();
char[] chars = new char[len+1];
chars[0] = factory.getMarkerChar();
termStr.getChars(0, len, chars, 1);
ReversedWildcardFilter.reverse(chars, 1, len);
termStr = new String(chars);
}
Query q = super.getWildcardQuery(field, termStr);
if (q instanceof WildcardQuery) {

View File

@ -77,8 +77,8 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
public void testIndexingAnalysis() throws Exception {
Analyzer a = schema.getAnalyzer();
String text = "one two three";
String expected1 = "one \u0001eno two \u0001owt three \u0001eerht";
String text = "one two three si\uD834\uDD1Ex";
String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis";
List<Token> expectedTokens1 = getTokens(
new WhitespaceTokenizer(new StringReader(expected1)));
// set positionIncrements and offsets in expected tokens
@ -86,10 +86,10 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
Token t = expectedTokens1.get(i);
t.setPositionIncrement(0);
}
String expected2 = "\u0001eno \u0001owt \u0001eerht";
String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis";
List<Token> expectedTokens2 = getTokens(
new WhitespaceTokenizer(new StringReader(expected2)));
String expected3 = "one two three";
String expected3 = "one two three si\uD834\uDD1Ex";
List<Token> expectedTokens3 = getTokens(
new WhitespaceTokenizer(new StringReader(expected3)));
// field one
@ -116,10 +116,10 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
// XXX note: this should be false, but for now we return true for any field,
// XXX if at least one field uses the reversing
assertTrue(parserThree.getAllowLeadingWildcard());
String text = "one +two *hree f*ur fiv*";
String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*";
String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*";
String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*";
String text = "one +two *hree f*ur fiv* *si\uD834\uDD1Ex";
String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv* one:\u0001x\uD834\uDD1Eis*";
String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv* two:\u0001x\uD834\uDD1Eis*";
String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv* three:*si\uD834\uDD1Ex";
Query q = parserOne.parse(text);
assertEquals(expectedOne, q.toString());
q = parserTwo.parse(text);