diff --git a/CHANGES.txt b/CHANGES.txt index 40d715af332..c16988179f6 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -74,6 +74,11 @@ Bug Fixes fl=score to the parameter list instead of appending score to the existing field list. (yonik) +* SOLR-1593: ReverseWildcardFilter didn't work for surrogate pairs + (i.e. code points outside of the BMP), resulting in incorrect + matching. This change requires reindexing for any content with + such characters. (Robert Muir, yonik) + Other Changes ---------------------- diff --git a/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java b/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java index 262bc90edd1..99911843b6f 100644 --- a/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java +++ b/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java @@ -20,7 +20,6 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; @@ -73,13 +72,79 @@ public class ReversedWildcardFilter extends TokenFilter { } char [] buffer = termAtt.resizeTermBuffer(oldLen + 1); buffer[oldLen] = markerChar; - //String reversed = reverseAndMark(value, markerChar); - ReverseStringFilter.reverse(buffer, oldLen + 1); + reverse(buffer, 0, oldLen + 1); posAtt.setPositionIncrement(origOffset); termAtt.setTermBuffer(buffer, 0, oldLen +1); return true; } - + + /** + * Partially reverses the given input buffer in-place from the given offset + * up to the given length, keeping surrogate pairs in the correct (non-reversed) order. + * @param buffer the input char array to reverse + * @param start the offset from where to reverse the buffer + * @param len the length in the buffer up to where the + * buffer should be reversed + */ + public static void reverse(final char[] buffer, final int start, final int len) { + /* modified version of Apache Harmony AbstractStringBuilder reverse0() */ + if (len < 2) + return; + int end = (start + len) - 1; + char frontHigh = buffer[start]; + char endLow = buffer[end]; + boolean allowFrontSur = true, allowEndSur = true; + final int mid = start + (len >> 1); + for (int i = start; i < mid; ++i, --end) { + final char frontLow = buffer[i + 1]; + final char endHigh = buffer[end - 1]; + final boolean surAtFront = allowFrontSur + && Character.isSurrogatePair(frontHigh, frontLow); + if (surAtFront && (len < 3)) { + // nothing to do since surAtFront is allowed and 1 char left + return; + } + final boolean surAtEnd = allowEndSur + && Character.isSurrogatePair(endHigh, endLow); + allowFrontSur = allowEndSur = true; + if (surAtFront == surAtEnd) { + if (surAtFront) { + // both surrogates + buffer[end] = frontLow; + buffer[--end] = frontHigh; + buffer[i] = endHigh; + buffer[++i] = endLow; + frontHigh = buffer[i + 1]; + endLow = buffer[end - 1]; + } else { + // neither surrogates + buffer[end] = frontHigh; + buffer[i] = endLow; + frontHigh = frontLow; + endLow = endHigh; + } + } else { + if (surAtFront) { + // surrogate only at the front + buffer[end] = frontLow; + buffer[i] = endLow; + endLow = endHigh; + allowFrontSur = false; + } else { + // surrogate only at the end + buffer[end] = frontHigh; + buffer[i] = endHigh; + frontHigh = frontLow; + allowEndSur = false; + } + } + } + if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) { + // only if odd length + buffer[end] = allowFrontSur ? endLow : frontHigh; + } + } + } diff --git a/src/java/org/apache/solr/search/SolrQueryParser.java b/src/java/org/apache/solr/search/SolrQueryParser.java index a180e3bb7fc..d55107f35a8 100644 --- a/src/java/org/apache/solr/search/SolrQueryParser.java +++ b/src/java/org/apache/solr/search/SolrQueryParser.java @@ -27,10 +27,7 @@ import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.*; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.reverse.ReverseStringFilter; -import org.apache.solr.analysis.ReversedWildcardFilter; -import org.apache.solr.analysis.ReversedWildcardFilterFactory; -import org.apache.solr.analysis.TokenFilterFactory; -import org.apache.solr.analysis.TokenizerChain; +import org.apache.solr.analysis.*; import org.apache.solr.common.SolrException; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; @@ -193,7 +190,12 @@ public class SolrQueryParser extends QueryParser { String type = schema.getFieldType(field).getTypeName(); ReversedWildcardFilterFactory factory = leadingWildcards.get(type); if (factory != null && factory.shouldReverse(termStr)) { - termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar()); + int len = termStr.length(); + char[] chars = new char[len+1]; + chars[0] = factory.getMarkerChar(); + termStr.getChars(0, len, chars, 1); + ReversedWildcardFilter.reverse(chars, 1, len); + termStr = new String(chars); } Query q = super.getWildcardQuery(field, termStr); if (q instanceof WildcardQuery) { diff --git a/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java b/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java index 04fe1c64544..2729862586c 100644 --- a/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java +++ b/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java @@ -77,8 +77,8 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase { public void testIndexingAnalysis() throws Exception { Analyzer a = schema.getAnalyzer(); - String text = "one two three"; - String expected1 = "one \u0001eno two \u0001owt three \u0001eerht"; + String text = "one two three si\uD834\uDD1Ex"; + String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis"; List expectedTokens1 = getTokens( new WhitespaceTokenizer(new StringReader(expected1))); // set positionIncrements and offsets in expected tokens @@ -86,10 +86,10 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase { Token t = expectedTokens1.get(i); t.setPositionIncrement(0); } - String expected2 = "\u0001eno \u0001owt \u0001eerht"; + String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis"; List expectedTokens2 = getTokens( new WhitespaceTokenizer(new StringReader(expected2))); - String expected3 = "one two three"; + String expected3 = "one two three si\uD834\uDD1Ex"; List expectedTokens3 = getTokens( new WhitespaceTokenizer(new StringReader(expected3))); // field one @@ -116,10 +116,10 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase { // XXX note: this should be false, but for now we return true for any field, // XXX if at least one field uses the reversing assertTrue(parserThree.getAllowLeadingWildcard()); - String text = "one +two *hree f*ur fiv*"; - String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*"; - String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*"; - String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*"; + String text = "one +two *hree f*ur fiv* *si\uD834\uDD1Ex"; + String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv* one:\u0001x\uD834\uDD1Eis*"; + String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv* two:\u0001x\uD834\uDD1Eis*"; + String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv* three:*si\uD834\uDD1Ex"; Query q = parserOne.parse(text); assertEquals(expectedOne, q.toString()); q = parserTwo.parse(text);