LUCENE-4310: non-BMP characters were failing to match with MappingCharFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1374381 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-08-17 18:14:44 +00:00
parent 852291e84e
commit 9a00dbc928
4 changed files with 38 additions and 2 deletions

View File

@ -79,6 +79,10 @@ Bug Fixes
Also, ensure immutability and use only one instance of this table in RAM (lazy Also, ensure immutability and use only one instance of this table in RAM (lazy
loaded) since it's quite large. (sausarkar, Steven Rowe, Robert Muir) loaded) since it's quite large. (sausarkar, Steven Rowe, Robert Muir)
* LUCENE-4310: MappingCharFilter was failing to match input strings
containing non-BMP Unicode characters. (Dawid Weiss, Robert Muir,
Mike McCandless)
Build Build
* LUCENE-3985: Upgrade to randomizedtesting 2.0.0. Added support for * LUCENE-3985: Upgrade to randomizedtesting 2.0.0. Added support for

View File

@ -111,9 +111,8 @@ public class NormalizeCharMap {
final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs); final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
final IntsRef scratch = new IntsRef(); final IntsRef scratch = new IntsRef();
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
builder.add(Util.toUTF32(ent.getKey(), scratch), builder.add(Util.toUTF16(ent.getKey(), scratch),
new CharsRef(ent.getValue())); new CharsRef(ent.getValue()));
} }
map = builder.finish(); map = builder.finish();
pendingPairs.clear(); pendingPairs.clear();

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
public class TestMappingCharFilter extends BaseTokenStreamTestCase { public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@ -55,6 +56,11 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
builder.add( "empty", "" ); builder.add( "empty", "" );
// non-BMP (surrogate pair):
builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
builder.add("\uff01", "full-width-exclamation");
normMap = builder.build(); normMap = builder.build();
} }
@ -128,6 +134,18 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5); assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
} }
/**
 * Feeds a single supplementary code point (U+1D122, above U+FFFF) through the
 * mapping filter and expects the one token "fclef" spanning offsets 0 to 2.
 */
public void testNonBMPChar() throws Exception {
  final String input = UnicodeUtil.newString(new int[] {0x1D122}, 0, 1);
  final CharFilter filtered = new MappingCharFilter(normMap, new StringReader(input));
  final TokenStream stream = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
  final String[] expectedTerms = {"fclef"};
  final int[] expectedStartOffsets = {0};
  // End offset (and final offset) is 2: offsets are reported against the original input.
  final int[] expectedEndOffsets = {2};
  assertTokenStreamContents(stream, expectedTerms, expectedStartOffsets, expectedEndOffsets, 2);
}
/**
 * Feeds the single char U+FF01 through the mapping filter and expects the one
 * token "full-width-exclamation" spanning offsets 0 to 1.
 */
public void testFullWidthChar() throws Exception {
  final String input = "\uff01";
  final CharFilter filtered = new MappingCharFilter(normMap, new StringReader(input));
  final TokenStream stream = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
  final String[] expectedTerms = {"full-width-exclamation"};
  final int[] expectedStartOffsets = {0};
  // Offsets are reported against the one-char original input, not the longer replacement.
  final int[] expectedEndOffsets = {1};
  assertTokenStreamContents(stream, expectedTerms, expectedStartOffsets, expectedEndOffsets, 1);
}
// //
// 1111111111222 // 1111111111222
// 01234567890123456789012 // 01234567890123456789012

View File

@ -767,6 +767,21 @@ public final class Util {
} }
} }
/**
 * Copies each UTF-16 code unit (char) of {@code s} into {@code scratch}, one int per
 * char. Surrogate pairs are not combined: every char is stored as its own int.
 *
 * @param s the character sequence to copy
 * @param scratch reusable destination; {@code grow(s.length())} is called on it before
 *     its {@code ints} are overwritten starting at index 0
 * @return {@code scratch}, with {@code offset} set to 0 and {@code length} set to
 *     {@code s.length()}
 */
public static IntsRef toUTF16(CharSequence s, IntsRef scratch) {
  final int numChars = s.length();
  scratch.grow(numChars);
  for (int i = 0; i < numChars; i++) {
    // char widens to int implicitly; the value is the raw UTF-16 code unit.
    scratch.ints[i] = s.charAt(i);
  }
  scratch.offset = 0;
  scratch.length = numChars;
  return scratch;
}
/** Decodes the Unicode codepoints from the provided /** Decodes the Unicode codepoints from the provided
* CharSequence and places them in the provided scratch * CharSequence and places them in the provided scratch
* IntsRef, which must not be null, returning it. */ * IntsRef, which must not be null, returning it. */