mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 05:19:17 +00:00
LUCENE-2068: Fixed ReverseStringFilter for Unicode 4.0. Reverse Supplementary Characters correctly.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@883149 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0808b44445
commit
e69141c51a
@ -2,6 +2,15 @@ Lucene contrib change Log
|
||||
|
||||
======================= Trunk (not yet released) =======================
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
|
||||
characters. During reverse the filter created unpaired surrogates, which
|
||||
will be replaced by U+FFFD by the indexer, but not at query time. The filter
|
||||
now reverses supplementary characters correctly if used with Version > 3.0.
|
||||
(Simon Willnauer, Robert Muir)
|
||||
|
||||
|
||||
======================= Release 3.0.0 2009-11-25 =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.reverse;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@ -31,11 +32,19 @@ import java.io.IOException;
|
||||
* "\u0001yrtnuoc". This is useful when implementing efficient leading
|
||||
* wildcards search.
|
||||
* </p>
|
||||
* <a name="version"/>
|
||||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating ReverseStringFilter, or when using any of
|
||||
* its static methods:
|
||||
* <ul>
|
||||
* <li> As of 3.1, supplementary characters are handled correctly
|
||||
* </ul>
|
||||
*/
|
||||
public final class ReverseStringFilter extends TokenFilter {
|
||||
|
||||
private TermAttribute termAtt;
|
||||
private final char marker;
|
||||
private final Version matchVersion;
|
||||
private static final char NOMARKER = '\uFFFF';
|
||||
|
||||
/**
|
||||
@ -66,6 +75,8 @@ public final class ReverseStringFilter extends TokenFilter {
|
||||
* </p>
|
||||
*
|
||||
* @param in {@link TokenStream} to filter
|
||||
* @deprecated use {@link #ReverseStringFilter(Version, TokenStream)}
|
||||
* instead. This constructor will be removed in Lucene 4.0
|
||||
*/
|
||||
public ReverseStringFilter(TokenStream in) {
|
||||
this(in, NOMARKER);
|
||||
@ -81,9 +92,42 @@ public final class ReverseStringFilter extends TokenFilter {
|
||||
*
|
||||
* @param in {@link TokenStream} to filter
|
||||
* @param marker A character used to mark reversed tokens
|
||||
* @deprecated use {@link #ReverseStringFilter(Version, TokenStream, char)}
|
||||
* instead. This constructor will be removed in Lucene 4.0
|
||||
*/
|
||||
public ReverseStringFilter(TokenStream in, char marker) {
|
||||
this(Version.LUCENE_30, in, marker);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ReverseStringFilter that reverses all tokens in the
|
||||
* supplied {@link TokenStream}.
|
||||
* <p>
|
||||
* The reversed tokens will not be marked.
|
||||
* </p>
|
||||
*
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param in {@link TokenStream} to filter
|
||||
*/
|
||||
public ReverseStringFilter(Version matchVersion, TokenStream in) {
|
||||
this(matchVersion, in, NOMARKER);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ReverseStringFilter that reverses and marks all tokens in the
|
||||
* supplied {@link TokenStream}.
|
||||
* <p>
|
||||
* The reversed tokens will be prepended (marked) by the <code>marker</code>
|
||||
* character.
|
||||
* </p>
|
||||
*
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param in {@link TokenStream} to filter
|
||||
* @param marker A character used to mark reversed tokens
|
||||
*/
|
||||
public ReverseStringFilter(Version matchVersion, TokenStream in, char marker) {
|
||||
super(in);
|
||||
this.matchVersion = matchVersion;
|
||||
this.marker = marker;
|
||||
termAtt = addAttribute(TermAttribute.class);
|
||||
}
|
||||
@ -97,7 +141,7 @@ public final class ReverseStringFilter extends TokenFilter {
|
||||
termAtt.resizeTermBuffer(len);
|
||||
termAtt.termBuffer()[len - 1] = marker;
|
||||
}
|
||||
reverse( termAtt.termBuffer(), len );
|
||||
reverse( matchVersion, termAtt.termBuffer(), 0, len );
|
||||
termAtt.setTermLength(len);
|
||||
return true;
|
||||
} else {
|
||||
@ -105,21 +149,94 @@ public final class ReverseStringFilter extends TokenFilter {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverses the given input string
|
||||
*
|
||||
* @param input the string to reverse
|
||||
* @return the given input string in reversed order
|
||||
* @deprecated use {@link #reverse(Version, String)} instead. This method
|
||||
* will be removed in Lucene 4.0
|
||||
*/
|
||||
public static String reverse( final String input ){
|
||||
char[] charInput = input.toCharArray();
|
||||
reverse( charInput );
|
||||
return reverse(Version.LUCENE_30, input);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverses the given input string
|
||||
*
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param input the string to reverse
|
||||
* @return the given input string in reversed order
|
||||
*/
|
||||
public static String reverse( Version matchVersion, final String input ){
|
||||
final char[] charInput = input.toCharArray();
|
||||
reverse( matchVersion, charInput, 0, charInput.length );
|
||||
return new String( charInput );
|
||||
}
|
||||
|
||||
public static void reverse( char[] buffer ){
|
||||
reverse( buffer, buffer.length );
|
||||
/**
|
||||
* Reverses the given input buffer in-place
|
||||
* @param buffer the input char array to reverse
|
||||
* @deprecated use {@link #reverse(Version, char[])} instead. This
|
||||
* method will be removed in Lucene 4.0
|
||||
*/
|
||||
public static void reverse( final char[] buffer ){
|
||||
reverse( buffer, 0, buffer.length );
|
||||
}
|
||||
|
||||
public static void reverse( char[] buffer, int len ){
|
||||
/**
|
||||
* Reverses the given input buffer in-place
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param buffer the input char array to reverse
|
||||
*/
|
||||
public static void reverse(Version matchVersion, final char[] buffer) {
|
||||
reverse(matchVersion, buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Partially reverses the given input buffer in-place from offset 0
|
||||
* up to the given length.
|
||||
* @param buffer the input char array to reverse
|
||||
* @param len the length in the buffer up to where the
|
||||
* buffer should be reversed
|
||||
* @deprecated use {@link #reverse(Version, char[], int)} instead. This
|
||||
* method will be removed in Lucene 4.0
|
||||
*/
|
||||
public static void reverse( final char[] buffer, final int len ){
|
||||
reverse( buffer, 0, len );
|
||||
}
|
||||
|
||||
public static void reverse( char[] buffer, int start, int len ){
|
||||
/**
|
||||
* Partially reverses the given input buffer in-place from offset 0
|
||||
* up to the given length.
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param buffer the input char array to reverse
|
||||
* @param len the length in the buffer up to where the
|
||||
* buffer should be reversed
|
||||
*/
|
||||
public static void reverse(Version matchVersion, final char[] buffer,
|
||||
final int len) {
|
||||
reverse( matchVersion, buffer, 0, len );
|
||||
}
|
||||
|
||||
/**
|
||||
* Partially reverses the given input buffer in-place from the given offset
|
||||
* up to the given length.
|
||||
* @param buffer the input char array to reverse
|
||||
* @param start the offset from where to reverse the buffer
|
||||
* @param len the length in the buffer up to where the
|
||||
* buffer should be reversed
|
||||
* @deprecated use {@link #reverse(Version, char[], int, int)} instead. This
|
||||
* method will be removed in Lucene 4.0
|
||||
*/
|
||||
public static void reverse(char[] buffer, int start, int len ) {
|
||||
reverseUnicode3(buffer, start, len);
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Remove this when support for 3.0 indexes is no longer needed.
|
||||
*/
|
||||
private static void reverseUnicode3( char[] buffer, int start, int len ){
|
||||
if( len <= 1 ) return;
|
||||
int num = len>>1;
|
||||
for( int i = start; i < ( start + num ); i++ ){
|
||||
@ -128,4 +245,77 @@ public final class ReverseStringFilter extends TokenFilter {
|
||||
buffer[start * 2 + len - i - 1] = c;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Partially reverses the given input buffer in-place from the given offset
|
||||
* up to the given length.
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param buffer the input char array to reverse
|
||||
* @param start the offset from where to reverse the buffer
|
||||
* @param len the length in the buffer up to where the
|
||||
* buffer should be reversed
|
||||
*/
|
||||
public static void reverse(Version matchVersion, final char[] buffer,
|
||||
final int start, final int len) {
|
||||
if (!matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||
reverseUnicode3(buffer, start, len);
|
||||
return;
|
||||
}
|
||||
/* modified version of Apache Harmony AbstractStringBuilder reverse0() */
|
||||
if (len < 2)
|
||||
return;
|
||||
int end = (start + len) - 1;
|
||||
char frontHigh = buffer[start];
|
||||
char endLow = buffer[end];
|
||||
boolean allowFrontSur = true, allowEndSur = true;
|
||||
final int mid = start + (len >> 1);
|
||||
for (int i = start; i < mid; ++i, --end) {
|
||||
final char frontLow = buffer[i + 1];
|
||||
final char endHigh = buffer[end - 1];
|
||||
final boolean surAtFront = allowFrontSur
|
||||
&& Character.isSurrogatePair(frontHigh, frontLow);
|
||||
if (surAtFront && (len < 3)) {
|
||||
// nothing to do since surAtFront is allowed and 1 char left
|
||||
return;
|
||||
}
|
||||
final boolean surAtEnd = allowEndSur
|
||||
&& Character.isSurrogatePair(endHigh, endLow);
|
||||
allowFrontSur = allowEndSur = true;
|
||||
if (surAtFront == surAtEnd) {
|
||||
if (surAtFront) {
|
||||
// both surrogates
|
||||
buffer[end] = frontLow;
|
||||
buffer[--end] = frontHigh;
|
||||
buffer[i] = endHigh;
|
||||
buffer[++i] = endLow;
|
||||
frontHigh = buffer[i + 1];
|
||||
endLow = buffer[end - 1];
|
||||
} else {
|
||||
// neither surrogates
|
||||
buffer[end] = frontHigh;
|
||||
buffer[i] = endLow;
|
||||
frontHigh = frontLow;
|
||||
endLow = endHigh;
|
||||
}
|
||||
} else {
|
||||
if (surAtFront) {
|
||||
// surrogate only at the front
|
||||
buffer[end] = frontLow;
|
||||
buffer[i] = endLow;
|
||||
endLow = endHigh;
|
||||
allowFrontSur = false;
|
||||
} else {
|
||||
// surrogate only at the end
|
||||
buffer[end] = frontHigh;
|
||||
buffer[i] = endHigh;
|
||||
frontHigh = frontLow;
|
||||
allowEndSur = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) {
|
||||
// only if odd length
|
||||
buffer[end] = allowFrontSur ? endLow : frontHigh;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestReverseStringFilter extends BaseTokenStreamTestCase {
|
||||
public void testFilter() throws Exception {
|
||||
@ -73,4 +74,47 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
|
||||
ReverseStringFilter.reverse( buffer, 2, 3 );
|
||||
assertEquals( "ABEDCF", new String( buffer ) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the broken 3.0 behavior, for back compat
|
||||
*/
|
||||
public void testBackCompat() throws Exception {
|
||||
assertEquals("\uDF05\uD866\uDF05\uD866", ReverseStringFilter.reverse("𩬅𩬅"));
|
||||
}
|
||||
|
||||
public void testReverseSupplementary() throws Exception {
|
||||
// supplementary at end
|
||||
assertEquals("𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "瀛愯䇹鍟艱𩬅"));
|
||||
// supplementary at end - 1
|
||||
assertEquals("a𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "瀛愯䇹鍟艱𩬅a"));
|
||||
// supplementary at start
|
||||
assertEquals("fedcba𩬅", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "𩬅abcdef"));
|
||||
// supplementary at start + 1
|
||||
assertEquals("fedcba𩬅z", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "z𩬅abcdef"));
|
||||
// supplementary medial
|
||||
assertEquals("gfe𩬅dcba", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "abcd𩬅efg"));
|
||||
}
|
||||
|
||||
public void testReverseSupplementaryChar() throws Exception {
|
||||
// supplementary at end
|
||||
char[] buffer = "abc瀛愯䇹鍟艱𩬅".toCharArray();
|
||||
ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
|
||||
assertEquals("abc𩬅艱鍟䇹愯瀛", new String(buffer));
|
||||
// supplementary at end - 1
|
||||
buffer = "abc瀛愯䇹鍟艱𩬅d".toCharArray();
|
||||
ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 8);
|
||||
assertEquals("abcd𩬅艱鍟䇹愯瀛", new String(buffer));
|
||||
// supplementary at start
|
||||
buffer = "abc𩬅瀛愯䇹鍟艱".toCharArray();
|
||||
ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
|
||||
assertEquals("abc艱鍟䇹愯瀛𩬅", new String(buffer));
|
||||
// supplementary at start + 1
|
||||
buffer = "abcd𩬅瀛愯䇹鍟艱".toCharArray();
|
||||
ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 8);
|
||||
assertEquals("abc艱鍟䇹愯瀛𩬅d", new String(buffer));
|
||||
// supplementary medial
|
||||
buffer = "abc瀛愯𩬅def".toCharArray();
|
||||
ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
|
||||
assertEquals("abcfed𩬅愯瀛", new String(buffer));
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user