mirror of https://github.com/apache/lucene.git
LUCENE-4072: add ICUNormalizer2CharFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1579488 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent b569844e04
commit 9a1ab11007
@@ -112,6 +112,9 @@ New Features
  you update the value of a BinaryDocValuesField without reindexing the
  document(s). (Shai Erera)

* LUCENE-4072: Add ICUNormalizer2CharFilter, which lets you do unicode normalization
  with offset correction before the tokenizer. (David Goldfarb, Ippei UKAI via Robert Muir)

API Changes

* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
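For a sense of what the entry above adds: the filter wraps a Reader and normalizes text as it is read, before any tokenizer sees it. A minimal usage sketch against the API introduced in this commit (the demo class name and sample string are invented here, not part of the commit):

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;

// Hypothetical demo, not part of this commit.
public class NormalizeExample {
  public static void main(String[] args) throws Exception {
    // the one-arg constructor uses the nfkc_cf normalizer (NFKC + case folding)
    Reader normalized = new ICUNormalizer2CharFilter(new StringReader("℃ № ㈱"));
    StringBuilder out = new StringBuilder();
    char[] buf = new char[64];
    for (int n; (n = normalized.read(buf)) != -1; ) {
      out.append(buf, 0, n);
    }
    System.out.println(out); // should print "°c no (株)" under nfkc_cf
  }
}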
@@ -0,0 +1,206 @@
package org.apache.lucene.analysis.icu;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.charfilter.BaseCharFilter;

import com.ibm.icu.text.Normalizer2;

/**
 * Normalize token text with ICU's {@link Normalizer2}.
 */
public final class ICUNormalizer2CharFilter extends BaseCharFilter {

  private static final int IO_BUFFER_SIZE = 128;

  private final Normalizer2 normalizer;
  private final StringBuilder inputBuffer = new StringBuilder();
  private final StringBuilder resultBuffer = new StringBuilder();

  private boolean inputFinished;
  private boolean afterQuickCheckYes;
  private int checkedInputBoundary;
  private int charCount;

  /**
   * Create a new Normalizer2CharFilter that applies NFKC normalization, case
   * folding, and removal of default ignorables (NFKC_Casefold).
   */
  public ICUNormalizer2CharFilter(Reader in) {
    this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
  }

  /**
   * Create a new Normalizer2CharFilter with the specified Normalizer2.
   * @param in text
   * @param normalizer normalizer to use
   */
  public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) {
    super(in);
    if (normalizer == null) {
      throw new NullPointerException("normalizer == null");
    }
    this.normalizer = normalizer;
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    if (off < 0) throw new IllegalArgumentException("off < 0");
    if (off >= cbuf.length) throw new IllegalArgumentException("off >= cbuf.length");
    if (len <= 0) throw new IllegalArgumentException("len <= 0");

    while (!inputFinished || inputBuffer.length() > 0 || resultBuffer.length() > 0) {
      int retLen;

      if (resultBuffer.length() > 0) {
        retLen = outputFromResultBuffer(cbuf, off, len);
        if (retLen > 0) {
          return retLen;
        }
      }

      int resLen = readAndNormalizeFromInput();
      if (resLen > 0) {
        retLen = outputFromResultBuffer(cbuf, off, len);
        if (retLen > 0) {
          return retLen;
        }
      }

      readInputToBuffer();
    }

    return -1;
  }

  private final char[] tmpBuffer = new char[IO_BUFFER_SIZE];

  private int readInputToBuffer() throws IOException {
    final int len = input.read(tmpBuffer);
    if (len == -1) {
      inputFinished = true;
      return 0;
    }
    inputBuffer.append(tmpBuffer, 0, len);

    // if checkedInputBoundary was at the end of a buffer, we need to check that char again
    checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
    // keep reading while the buffer ends in a char that could still interact
    // with following text under normalization (or is half of a surrogate pair)
    if (normalizer.isInert(tmpBuffer[len - 1]) && !Character.isHighSurrogate(tmpBuffer[len - 1])) {
      return len;
    } else return len + readInputToBuffer();
  }

  private int readAndNormalizeFromInput() {
    if (inputBuffer.length() <= 0) {
      afterQuickCheckYes = false;
      return 0;
    }
    if (!afterQuickCheckYes) {
      // copy any leading run that is already normalized straight to the result
      int resLen = readFromInputWhileSpanQuickCheckYes();
      afterQuickCheckYes = true;
      if (resLen > 0) return resLen;
    }
    int resLen = readFromIoNormalizeUptoBoundary();
    if (resLen > 0) {
      afterQuickCheckYes = false;
    }
    return resLen;
  }

  private int readFromInputWhileSpanQuickCheckYes() {
    int end = normalizer.spanQuickCheckYes(inputBuffer);
    if (end > 0) {
      resultBuffer.append(inputBuffer.subSequence(0, end));
      inputBuffer.delete(0, end);
      checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
      charCount += end;
    }
    return end;
  }

  private int readFromIoNormalizeUptoBoundary() {
    // if there's no buffer to normalize, return 0
    if (inputBuffer.length() <= 0) {
      return 0;
    }

    boolean foundBoundary = false;
    final int bufLen = inputBuffer.length();

    // scan for a normalization boundary, so we never normalize across a point
    // where following input could still change the result
    while (checkedInputBoundary <= bufLen - 1) {
      int charLen = Character.charCount(inputBuffer.codePointAt(checkedInputBoundary));
      checkedInputBoundary += charLen;
      if (checkedInputBoundary < bufLen && normalizer.hasBoundaryBefore(inputBuffer
          .codePointAt(checkedInputBoundary))) {
        foundBoundary = true;
        break;
      }
    }
    if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) {
      // the input is exhausted, so the end of the buffer is a boundary
      foundBoundary = true;
      checkedInputBoundary = bufLen;
    }

    if (!foundBoundary) {
      return 0;
    }

    return normalizeInputUpto(checkedInputBoundary);
  }

  private int normalizeInputUpto(final int length) {
    final int destOrigLen = resultBuffer.length();
    normalizer.normalizeSecondAndAppend(resultBuffer,
        inputBuffer.subSequence(0, length));
    inputBuffer.delete(0, length);
    checkedInputBoundary = Math.max(checkedInputBoundary - length, 0);
    final int resultLength = resultBuffer.length() - destOrigLen;
    recordOffsetDiff(length, resultLength);
    return resultLength;
  }

  private void recordOffsetDiff(int inputLength, int outputLength) {
    if (inputLength == outputLength) {
      charCount += outputLength;
      return;
    }
    // normalization changed the length; record corrections so that
    // correctOffset() can map output offsets back to input offsets
    final int diff = inputLength - outputLength;
    final int cumuDiff = getLastCumulativeDiff();
    if (diff < 0) {
      for (int i = 1; i <= -diff; ++i) {
        addOffCorrectMap(charCount + i, cumuDiff - i);
      }
    } else {
      addOffCorrectMap(charCount + Math.min(1, outputLength), cumuDiff + diff);
    }
    charCount += outputLength;
  }

  private int outputFromResultBuffer(char[] cbuf, int begin, int len) {
    len = Math.min(resultBuffer.length(), len);
    resultBuffer.getChars(0, len, cbuf, begin);
    if (len > 0) {
      resultBuffer.delete(0, len);
    }
    return len;
  }
}
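The recordOffsetDiff/addOffCorrectMap bookkeeping above is what lets CharFilter.correctOffset map offsets in the normalized output back into the original input, so token offsets still point at the source text. A hedged sketch of that behavior (demo class invented here; it mirrors the testMassiveLigature case in the test file below, where the single char \uFDFA expands to a whole phrase):

import java.io.StringReader;

import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;

// Hypothetical demo, not part of this commit.
public class OffsetCorrectionDemo {
  public static void main(String[] args) throws Exception {
    // one input char that nfkc_cf expands to many output chars
    CharFilter filter = new ICUNormalizer2CharFilter(new StringReader("\uFDFA"));
    StringBuilder out = new StringBuilder();
    char[] buf = new char[64];
    for (int n; (n = filter.read(buf, 0, buf.length)) != -1; ) {
      out.append(buf, 0, n);
    }
    // every offset in the expanded output corrects back inside the
    // one-character input, i.e. to 0 or 1
    for (int i = 0; i <= out.length(); i++) {
      System.out.println(i + " -> " + filter.correctOffset(i));
    }
  }
}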
@@ -0,0 +1,225 @@
package org.apache.lucene.analysis.icu;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.util.TestUtil;

import com.ibm.icu.text.Normalizer2;

public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {

  public void testNormalization() throws IOException {
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
      int length = reader.read(tempBuff);
      if (length == -1) {
        break;
      }
      output.append(tempBuff, 0, length);
      // at every step, the output so far must equal the normalization of the
      // input prefix that the corrected offset points at
      assertEquals(output.toString(), normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
    }

    assertEquals(expectedOutput, output.toString());
  }

  public void testTokenStream() throws IOException {
    // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
    String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
        new int[] {0, 2, 4, 6, 8, 11, 14},
        new int[] {1, 3, 5, 7, 10, 13, 16},
        input.length());
  }

  public void testTokenStream2() throws IOException {
    // '㌰', '<<'゙, '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
    String input = "㌰゙5℃№㈱㌘ザゾ";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, 1, 1);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
        new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
        new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
        input.length()
    );
  }

  public void testMassiveLigature() throws IOException {
    String input = "\uFDFA";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"صلى", "الله", "عليه", "وسلم"},
        new int[]{0, 0, 0, 0},
        new int[]{0, 0, 0, 1},
        input.length()
    );
  }

  public void doTestMode(final Normalizer2 normalizer, int maxLength, int iterations) throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new MockTokenizer(MockTokenizer.KEYWORD, false));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new ICUNormalizer2CharFilter(reader, normalizer);
      }
    };

    for (int i = 0; i < iterations; i++) {
      String input = TestUtil.randomUnicodeString(random(), maxLength);
      if (input.length() == 0) {
        continue;
      }
      String normalized = normalizer.normalize(input);
      if (normalized.length() == 0) {
        continue; // MockTokenizer doesn't tokenize the empty string...
      }
      checkOneTerm(a, input, normalized);
    }
  }

  public void testNFC() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFCHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testNFD() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFDHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testNFKC() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFKCHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testNFKD() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFKDHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testNFKC_CF() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFKC_CFHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testRandomStrings() throws IOException {
    // nfkc_cf
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
      }
    };
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
    // huge strings
    checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);

    // nfkd
    a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE));
      }
    };
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
    // huge strings
    checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);
  }

  public void testCuriousString() throws Exception {
    String text = "\udb40\udc3d\uf273\ue960\u06c8\ud955\udc13\ub7fc\u0692 \u2089\u207b\u2073\u2075";
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
      }
    };
    for (int i = 0; i < 1000; i++) {
      checkAnalysisConsistency(random(), a, false, text);
    }
  }
}