diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a137ef2b67a..2d8d20e329e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -79,6 +79,10 @@ Bug Fixes Also, ensure immutability and use only one instance of this table in RAM (lazy loaded) since its quite large. (sausarkar, Steven Rowe, Robert Muir) +* LUCENE-4310: MappingCharFilter was failing to match input strings + containing non-BMP Unicode characters. (Dawid Weiss, Robert Muir, + Mike McCandless) + Build * LUCENE-3985: Upgrade to randomizedtesting 2.0.0. Added support for diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java index c6470611d2c..c22203a76a4 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java @@ -111,9 +111,8 @@ public class NormalizeCharMap { final org.apache.lucene.util.fst.Builder builder = new org.apache.lucene.util.fst.Builder(FST.INPUT_TYPE.BYTE2, outputs); final IntsRef scratch = new IntsRef(); for(Map.Entry ent : pendingPairs.entrySet()) { - builder.add(Util.toUTF32(ent.getKey(), scratch), + builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); - } map = builder.finish(); pendingPairs.clear(); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java index c4fab5519c0..d692a57c328 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; public class TestMappingCharFilter extends BaseTokenStreamTestCase { @@ -55,6 +56,11 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase { builder.add( "empty", "" ); + // BMP (surrogate pair): + builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef"); + + builder.add("\uff01", "full width exclamation"); + normMap = builder.build(); } @@ -128,6 +134,18 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase { assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5); } + public void testNonBMPChar() throws Exception { + CharFilter cs = new MappingCharFilter( normMap, new StringReader( UnicodeUtil.newString(new int[] {0x1D122}, 0, 1) ) ); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2); + } + + public void testFullWidthChar() throws Exception { + CharFilter cs = new MappingCharFilter( normMap, new StringReader( "\uff01") ); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, new String[]{"full width exclamation"}, new int[]{0}, new int[]{1}, 1); + } + // // 1111111111222 // 01234567890123456789012 diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index 55823a3f829..7a9685745ed 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -767,6 +767,21 @@ public final class Util { } } + /** Just maps each UTF16 unit (char) to the ints in an + * IntsRef. */ + public static IntsRef toUTF16(CharSequence s, IntsRef scratch) { + final int charLimit = s.length(); + scratch.grow(charLimit); + int idx = 0; + while(idx < charLimit) { + scratch.ints[idx] = (int) s.charAt(idx); + idx++; + } + scratch.offset = 0; + scratch.length = idx; + return scratch; + } + /** Decodes the Unicode codepoints from the provided * CharSequence and places them in the provided scratch * IntsRef, which must not be null, returning it. */