mirror of https://github.com/apache/lucene.git
SOLR-1593: fix reverse wildcard filter for surrogate pairs
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@883386 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0d639dad03
commit
dc18c42b6b
|
@ -74,6 +74,11 @@ Bug Fixes
|
|||
fl=score to the parameter list instead of appending score to the
|
||||
existing field list. (yonik)
|
||||
|
||||
* SOLR-1593: ReverseWildcardFilter didn't work for surrogate pairs
|
||||
(i.e. code points outside of the BMP), resulting in incorrect
|
||||
matching. This change requires reindexing for any content with
|
||||
such characters. (Robert Muir, yonik)
|
||||
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
|
|
@ -20,7 +20,6 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
|
@ -73,13 +72,79 @@ public class ReversedWildcardFilter extends TokenFilter {
|
|||
}
|
||||
char [] buffer = termAtt.resizeTermBuffer(oldLen + 1);
|
||||
buffer[oldLen] = markerChar;
|
||||
//String reversed = reverseAndMark(value, markerChar);
|
||||
ReverseStringFilter.reverse(buffer, oldLen + 1);
|
||||
reverse(buffer, 0, oldLen + 1);
|
||||
|
||||
posAtt.setPositionIncrement(origOffset);
|
||||
termAtt.setTermBuffer(buffer, 0, oldLen +1);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Partially reverses the given input buffer in-place from the given offset
|
||||
* up to the given length, keeping surrogate pairs in the correct (non-reversed) order.
|
||||
* @param buffer the input char array to reverse
|
||||
* @param start the offset from where to reverse the buffer
|
||||
* @param len the length in the buffer up to where the
|
||||
* buffer should be reversed
|
||||
*/
|
||||
public static void reverse(final char[] buffer, final int start, final int len) {
|
||||
/* modified version of Apache Harmony AbstractStringBuilder reverse0() */
|
||||
if (len < 2)
|
||||
return;
|
||||
int end = (start + len) - 1;
|
||||
char frontHigh = buffer[start];
|
||||
char endLow = buffer[end];
|
||||
boolean allowFrontSur = true, allowEndSur = true;
|
||||
final int mid = start + (len >> 1);
|
||||
for (int i = start; i < mid; ++i, --end) {
|
||||
final char frontLow = buffer[i + 1];
|
||||
final char endHigh = buffer[end - 1];
|
||||
final boolean surAtFront = allowFrontSur
|
||||
&& Character.isSurrogatePair(frontHigh, frontLow);
|
||||
if (surAtFront && (len < 3)) {
|
||||
// nothing to do since surAtFront is allowed and 1 char left
|
||||
return;
|
||||
}
|
||||
final boolean surAtEnd = allowEndSur
|
||||
&& Character.isSurrogatePair(endHigh, endLow);
|
||||
allowFrontSur = allowEndSur = true;
|
||||
if (surAtFront == surAtEnd) {
|
||||
if (surAtFront) {
|
||||
// both surrogates
|
||||
buffer[end] = frontLow;
|
||||
buffer[--end] = frontHigh;
|
||||
buffer[i] = endHigh;
|
||||
buffer[++i] = endLow;
|
||||
frontHigh = buffer[i + 1];
|
||||
endLow = buffer[end - 1];
|
||||
} else {
|
||||
// neither surrogates
|
||||
buffer[end] = frontHigh;
|
||||
buffer[i] = endLow;
|
||||
frontHigh = frontLow;
|
||||
endLow = endHigh;
|
||||
}
|
||||
} else {
|
||||
if (surAtFront) {
|
||||
// surrogate only at the front
|
||||
buffer[end] = frontLow;
|
||||
buffer[i] = endLow;
|
||||
endLow = endHigh;
|
||||
allowFrontSur = false;
|
||||
} else {
|
||||
// surrogate only at the end
|
||||
buffer[end] = frontHigh;
|
||||
buffer[i] = endHigh;
|
||||
frontHigh = frontLow;
|
||||
allowEndSur = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) {
|
||||
// only if odd length
|
||||
buffer[end] = allowFrontSur ? endLow : frontHigh;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -27,10 +27,7 @@ import org.apache.lucene.queryParser.QueryParser;
|
|||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
||||
import org.apache.solr.analysis.ReversedWildcardFilter;
|
||||
import org.apache.solr.analysis.ReversedWildcardFilterFactory;
|
||||
import org.apache.solr.analysis.TokenFilterFactory;
|
||||
import org.apache.solr.analysis.TokenizerChain;
|
||||
import org.apache.solr.analysis.*;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
|
@ -193,7 +190,12 @@ public class SolrQueryParser extends QueryParser {
|
|||
String type = schema.getFieldType(field).getTypeName();
|
||||
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
|
||||
if (factory != null && factory.shouldReverse(termStr)) {
|
||||
termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar());
|
||||
int len = termStr.length();
|
||||
char[] chars = new char[len+1];
|
||||
chars[0] = factory.getMarkerChar();
|
||||
termStr.getChars(0, len, chars, 1);
|
||||
ReversedWildcardFilter.reverse(chars, 1, len);
|
||||
termStr = new String(chars);
|
||||
}
|
||||
Query q = super.getWildcardQuery(field, termStr);
|
||||
if (q instanceof WildcardQuery) {
|
||||
|
|
|
@ -77,8 +77,8 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
|
|||
|
||||
public void testIndexingAnalysis() throws Exception {
|
||||
Analyzer a = schema.getAnalyzer();
|
||||
String text = "one two three";
|
||||
String expected1 = "one \u0001eno two \u0001owt three \u0001eerht";
|
||||
String text = "one two three si\uD834\uDD1Ex";
|
||||
String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis";
|
||||
List<Token> expectedTokens1 = getTokens(
|
||||
new WhitespaceTokenizer(new StringReader(expected1)));
|
||||
// set positionIncrements and offsets in expected tokens
|
||||
|
@ -86,10 +86,10 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
|
|||
Token t = expectedTokens1.get(i);
|
||||
t.setPositionIncrement(0);
|
||||
}
|
||||
String expected2 = "\u0001eno \u0001owt \u0001eerht";
|
||||
String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis";
|
||||
List<Token> expectedTokens2 = getTokens(
|
||||
new WhitespaceTokenizer(new StringReader(expected2)));
|
||||
String expected3 = "one two three";
|
||||
String expected3 = "one two three si\uD834\uDD1Ex";
|
||||
List<Token> expectedTokens3 = getTokens(
|
||||
new WhitespaceTokenizer(new StringReader(expected3)));
|
||||
// field one
|
||||
|
@ -116,10 +116,10 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
|
|||
// XXX note: this should be false, but for now we return true for any field,
|
||||
// XXX if at least one field uses the reversing
|
||||
assertTrue(parserThree.getAllowLeadingWildcard());
|
||||
String text = "one +two *hree f*ur fiv*";
|
||||
String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*";
|
||||
String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*";
|
||||
String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*";
|
||||
String text = "one +two *hree f*ur fiv* *si\uD834\uDD1Ex";
|
||||
String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv* one:\u0001x\uD834\uDD1Eis*";
|
||||
String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv* two:\u0001x\uD834\uDD1Eis*";
|
||||
String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv* three:*si\uD834\uDD1Ex";
|
||||
Query q = parserOne.parse(text);
|
||||
assertEquals(expectedOne, q.toString());
|
||||
q = parserTwo.parse(text);
|
||||
|
|
Loading…
Reference in New Issue