mirror of https://github.com/apache/lucene.git
SOLR-1593: fix reverse wildcard filter for surrogate pairs
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@883386 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0d639dad03
commit
dc18c42b6b
|
@ -74,6 +74,11 @@ Bug Fixes
|
||||||
fl=score to the parameter list instead of appending score to the
|
fl=score to the parameter list instead of appending score to the
|
||||||
existing field list. (yonik)
|
existing field list. (yonik)
|
||||||
|
|
||||||
|
* SOLR-1593: ReversedWildcardFilter didn't work for surrogate pairs
|
||||||
|
(i.e. code points outside of the BMP), resulting in incorrect
|
||||||
|
matching. This change requires reindexing for any content with
|
||||||
|
such characters. (Robert Muir, yonik)
|
||||||
|
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
|
@ -20,7 +20,6 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
@ -73,13 +72,79 @@ public class ReversedWildcardFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
char [] buffer = termAtt.resizeTermBuffer(oldLen + 1);
|
char [] buffer = termAtt.resizeTermBuffer(oldLen + 1);
|
||||||
buffer[oldLen] = markerChar;
|
buffer[oldLen] = markerChar;
|
||||||
//String reversed = reverseAndMark(value, markerChar);
|
reverse(buffer, 0, oldLen + 1);
|
||||||
ReverseStringFilter.reverse(buffer, oldLen + 1);
|
|
||||||
|
|
||||||
posAtt.setPositionIncrement(origOffset);
|
posAtt.setPositionIncrement(origOffset);
|
||||||
termAtt.setTermBuffer(buffer, 0, oldLen +1);
|
termAtt.setTermBuffer(buffer, 0, oldLen +1);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverses a region of the given input buffer in place, beginning at the given offset
|
||||||
|
* and spanning the given length, keeping surrogate pairs in the correct (non-reversed) order.
* A pair is recognized with {@link Character#isSurrogatePair(char, char)}; a region
* shorter than two chars is left untouched.
|
||||||
|
* @param buffer the input char array to reverse; it is modified in place
|
||||||
|
* @param start the index of the first char of the region to reverse
|
||||||
|
* @param len the number of chars, counted from <code>start</code>, that form the
|
||||||
|
* region to reverse
|
||||||
|
*/
|
||||||
|
public static void reverse(final char[] buffer, final int start, final int len) {
|
||||||
|
/* modified version of Apache Harmony AbstractStringBuilder reverse0() */
|
||||||
|
// zero or one char: the region is already its own reverse
if (len < 2)
|
||||||
|
return;
|
||||||
|
// index of the last char of the region
int end = (start + len) - 1;
|
||||||
|
// frontHigh / endLow hold the chars read at the front and back cursors that
// still have to be written to the opposite side
char frontHigh = buffer[start];
|
||||||
|
char endLow = buffer[end];
|
||||||
|
// a flag is cleared for one pass after a one-sided pair was handled, so the
// carried-over half is not tested as part of a new pair
boolean allowFrontSur = true, allowEndSur = true;
|
||||||
|
final int mid = start + (len >> 1);
|
||||||
|
// i moves forward from start while end moves backward, until i reaches mid
for (int i = start; i < mid; ++i, --end) {
|
||||||
|
final char frontLow = buffer[i + 1];
|
||||||
|
final char endHigh = buffer[end - 1];
|
||||||
|
final boolean surAtFront = allowFrontSur
|
||||||
|
&& Character.isSurrogatePair(frontHigh, frontLow);
|
||||||
|
if (surAtFront && (len < 3)) {
|
||||||
|
// the whole region is one surrogate pair (len is 2 here): leave it untouched
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
final boolean surAtEnd = allowEndSur
|
||||||
|
&& Character.isSurrogatePair(endHigh, endLow);
|
||||||
|
// the flags only suppress pair detection for a single pass
allowFrontSur = allowEndSur = true;
|
||||||
|
if (surAtFront == surAtEnd) {
|
||||||
|
if (surAtFront) {
|
||||||
|
// surrogate pairs at both ends: swap them as two-char units, each kept in high/low order
|
||||||
|
buffer[end] = frontLow;
|
||||||
|
buffer[--end] = frontHigh;
|
||||||
|
buffer[i] = endHigh;
|
||||||
|
buffer[++i] = endLow;
|
||||||
|
frontHigh = buffer[i + 1];
|
||||||
|
endLow = buffer[end - 1];
|
||||||
|
} else {
|
||||||
|
// no surrogate pair at either end: plain swap of one char from each side
|
||||||
|
buffer[end] = frontHigh;
|
||||||
|
buffer[i] = endLow;
|
||||||
|
frontHigh = frontLow;
|
||||||
|
endLow = endHigh;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (surAtFront) {
|
||||||
|
// surrogate only at the front: write its low half now; the high half stays in frontHigh for the next pass
|
||||||
|
buffer[end] = frontLow;
|
||||||
|
buffer[i] = endLow;
|
||||||
|
endLow = endHigh;
|
||||||
|
allowFrontSur = false;
|
||||||
|
} else {
|
||||||
|
// surrogate only at the end: write its high half now; the low half stays in endLow for the next pass
|
||||||
|
buffer[end] = frontHigh;
|
||||||
|
buffer[i] = endHigh;
|
||||||
|
frontHigh = frontLow;
|
||||||
|
allowEndSur = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// a one-sided pair handled in the last pass leaves one carried-over char that
// belongs in the middle slot
if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) {
|
||||||
|
// only if odd length
|
||||||
|
buffer[end] = allowFrontSur ? endLow : frontHigh;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,10 +27,7 @@ import org.apache.lucene.queryParser.QueryParser;
|
||||||
import org.apache.lucene.search.*;
|
import org.apache.lucene.search.*;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
||||||
import org.apache.solr.analysis.ReversedWildcardFilter;
|
import org.apache.solr.analysis.*;
|
||||||
import org.apache.solr.analysis.ReversedWildcardFilterFactory;
|
|
||||||
import org.apache.solr.analysis.TokenFilterFactory;
|
|
||||||
import org.apache.solr.analysis.TokenizerChain;
|
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.schema.FieldType;
|
import org.apache.solr.schema.FieldType;
|
||||||
import org.apache.solr.schema.IndexSchema;
|
import org.apache.solr.schema.IndexSchema;
|
||||||
|
@ -193,7 +190,12 @@ public class SolrQueryParser extends QueryParser {
|
||||||
String type = schema.getFieldType(field).getTypeName();
|
String type = schema.getFieldType(field).getTypeName();
|
||||||
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
|
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
|
||||||
if (factory != null && factory.shouldReverse(termStr)) {
|
if (factory != null && factory.shouldReverse(termStr)) {
|
||||||
termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar());
|
int len = termStr.length();
|
||||||
|
char[] chars = new char[len+1];
|
||||||
|
chars[0] = factory.getMarkerChar();
|
||||||
|
termStr.getChars(0, len, chars, 1);
|
||||||
|
ReversedWildcardFilter.reverse(chars, 1, len);
|
||||||
|
termStr = new String(chars);
|
||||||
}
|
}
|
||||||
Query q = super.getWildcardQuery(field, termStr);
|
Query q = super.getWildcardQuery(field, termStr);
|
||||||
if (q instanceof WildcardQuery) {
|
if (q instanceof WildcardQuery) {
|
||||||
|
|
|
@ -77,8 +77,8 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
|
||||||
|
|
||||||
public void testIndexingAnalysis() throws Exception {
|
public void testIndexingAnalysis() throws Exception {
|
||||||
Analyzer a = schema.getAnalyzer();
|
Analyzer a = schema.getAnalyzer();
|
||||||
String text = "one two three";
|
String text = "one two three si\uD834\uDD1Ex";
|
||||||
String expected1 = "one \u0001eno two \u0001owt three \u0001eerht";
|
String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis";
|
||||||
List<Token> expectedTokens1 = getTokens(
|
List<Token> expectedTokens1 = getTokens(
|
||||||
new WhitespaceTokenizer(new StringReader(expected1)));
|
new WhitespaceTokenizer(new StringReader(expected1)));
|
||||||
// set positionIncrements and offsets in expected tokens
|
// set positionIncrements and offsets in expected tokens
|
||||||
|
@ -86,10 +86,10 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
|
||||||
Token t = expectedTokens1.get(i);
|
Token t = expectedTokens1.get(i);
|
||||||
t.setPositionIncrement(0);
|
t.setPositionIncrement(0);
|
||||||
}
|
}
|
||||||
String expected2 = "\u0001eno \u0001owt \u0001eerht";
|
String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis";
|
||||||
List<Token> expectedTokens2 = getTokens(
|
List<Token> expectedTokens2 = getTokens(
|
||||||
new WhitespaceTokenizer(new StringReader(expected2)));
|
new WhitespaceTokenizer(new StringReader(expected2)));
|
||||||
String expected3 = "one two three";
|
String expected3 = "one two three si\uD834\uDD1Ex";
|
||||||
List<Token> expectedTokens3 = getTokens(
|
List<Token> expectedTokens3 = getTokens(
|
||||||
new WhitespaceTokenizer(new StringReader(expected3)));
|
new WhitespaceTokenizer(new StringReader(expected3)));
|
||||||
// field one
|
// field one
|
||||||
|
@ -116,10 +116,10 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
|
||||||
// XXX note: this should be false, but for now we return true for any field,
|
// XXX note: this should be false, but for now we return true for any field,
|
||||||
// XXX if at least one field uses the reversing
|
// XXX if at least one field uses the reversing
|
||||||
assertTrue(parserThree.getAllowLeadingWildcard());
|
assertTrue(parserThree.getAllowLeadingWildcard());
|
||||||
String text = "one +two *hree f*ur fiv*";
|
String text = "one +two *hree f*ur fiv* *si\uD834\uDD1Ex";
|
||||||
String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*";
|
String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv* one:\u0001x\uD834\uDD1Eis*";
|
||||||
String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*";
|
String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv* two:\u0001x\uD834\uDD1Eis*";
|
||||||
String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*";
|
String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv* three:*si\uD834\uDD1Ex";
|
||||||
Query q = parserOne.parse(text);
|
Query q = parserOne.parse(text);
|
||||||
assertEquals(expectedOne, q.toString());
|
assertEquals(expectedOne, q.toString());
|
||||||
q = parserTwo.parse(text);
|
q = parserTwo.parse(text);
|
||||||
|
|
Loading…
Reference in New Issue