mirror of https://github.com/apache/lucene.git
LUCENE-4310: non-BMP characters were failing to match with MappingCharFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1374381 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
852291e84e
commit
9a00dbc928
|
@ -79,6 +79,10 @@ Bug Fixes
|
||||||
Also, ensure immutability and use only one instance of this table in RAM (lazy
|
Also, ensure immutability and use only one instance of this table in RAM (lazy
|
||||||
loaded) since it's quite large. (sausarkar, Steven Rowe, Robert Muir)
|
loaded) since it's quite large. (sausarkar, Steven Rowe, Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-4310: MappingCharFilter was failing to match input strings
|
||||||
|
containing non-BMP Unicode characters. (Dawid Weiss, Robert Muir,
|
||||||
|
Mike McCandless)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-3985: Upgrade to randomizedtesting 2.0.0. Added support for
|
* LUCENE-3985: Upgrade to randomizedtesting 2.0.0. Added support for
|
||||||
|
|
|
@ -111,9 +111,8 @@ public class NormalizeCharMap {
|
||||||
final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
|
final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||||
final IntsRef scratch = new IntsRef();
|
final IntsRef scratch = new IntsRef();
|
||||||
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
|
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
|
||||||
builder.add(Util.toUTF32(ent.getKey(), scratch),
|
builder.add(Util.toUTF16(ent.getKey(), scratch),
|
||||||
new CharsRef(ent.getValue()));
|
new CharsRef(ent.getValue()));
|
||||||
|
|
||||||
}
|
}
|
||||||
map = builder.finish();
|
map = builder.finish();
|
||||||
pendingPairs.clear();
|
pendingPairs.clear();
|
||||||
|
|
|
@ -33,6 +33,7 @@ import org.apache.lucene.analysis.CharFilter;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
import org.apache.lucene.util._TestUtil;
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
|
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
|
||||||
|
@ -55,6 +56,11 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
builder.add( "empty", "" );
|
builder.add( "empty", "" );
|
||||||
|
|
||||||
|
// non-BMP (surrogate pair):
|
||||||
|
builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
|
||||||
|
|
||||||
|
builder.add("\uff01", "full-width-exclamation");
|
||||||
|
|
||||||
normMap = builder.build();
|
normMap = builder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -128,6 +134,18 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
|
||||||
assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
|
assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Feeds a single supplementary code point (U+1D122) through MappingCharFilter and
// expects the one token "fclef".
public void testNonBMPChar() throws Exception {
|
||||||
|
CharFilter cs = new MappingCharFilter( normMap, new StringReader( UnicodeUtil.newString(new int[] {0x1D122}, 0, 1) ) );
|
||||||
|
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||||
|
// U+1D122 is above U+FFFF, so it occupies two Java chars; hence the 2s below.
// NOTE(review): the int arguments appear to be start offsets, end offsets and
// final offset -- confirm against BaseTokenStreamTestCase.
assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Feeds the single BMP character U+FF01 through MappingCharFilter and expects
// the one token "full-width-exclamation".
public void testFullWidthChar() throws Exception {
|
||||||
|
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "\uff01") );
|
||||||
|
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||||
|
// The input is exactly one Java char, hence the 1s below.
// NOTE(review): the int arguments appear to be start offsets, end offsets and
// final offset -- confirm against BaseTokenStreamTestCase.
assertTokenStreamContents(ts, new String[]{"full-width-exclamation"}, new int[]{0}, new int[]{1}, 1);
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// 1111111111222
|
// 1111111111222
|
||||||
// 01234567890123456789012
|
// 01234567890123456789012
|
||||||
|
|
|
@ -767,6 +767,21 @@ public final class Util {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Copies each UTF-16 code unit (char) of {@code s} into one int of
|
||||||
|
* {@code scratch}, without combining surrogate pairs into code points. Sets {@code scratch.offset} to 0 and {@code scratch.length} to {@code s.length()}, then returns {@code scratch}. */
|
||||||
|
public static IntsRef toUTF16(CharSequence s, IntsRef scratch) {
|
||||||
|
final int charLimit = s.length();
|
||||||
|
scratch.grow(charLimit);
|
||||||
|
int idx = 0;
|
||||||
|
while(idx < charLimit) {
|
||||||
|
// One int per char: a surrogate pair stays as two separate ints.
scratch.ints[idx] = (int) s.charAt(idx);
|
||||||
|
idx++;
|
||||||
|
}
|
||||||
|
scratch.offset = 0;
|
||||||
|
scratch.length = idx;
|
||||||
|
return scratch;
|
||||||
|
}
|
||||||
|
|
||||||
/** Decodes the Unicode codepoints from the provided
|
/** Decodes the Unicode codepoints from the provided
|
||||||
* CharSequence and places them in the provided scratch
|
* CharSequence and places them in the provided scratch
|
||||||
* IntsRef, which must not be null, returning it. */
|
* IntsRef, which must not be null, returning it. */
|
||||||
|
|
Loading…
Reference in New Issue