LUCENE-4072: add ICUNormalizer2CharFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1579488 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-03-20 00:19:00 +00:00
parent b569844e04
commit 9a1ab11007
3 changed files with 434 additions and 0 deletions

View File

@ -112,6 +112,9 @@ New Features
you update the value of a BinaryDocValuesField without reindexing the
document(s). (Shai Erera)
* LUCENE-4072: Add ICUNormalizer2CharFilter, which lets you do unicode normalization
with offset correction before the tokenizer. (David Goldfarb, Ippei UKAI via Robert Muir)
API Changes
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues

View File

@ -0,0 +1,206 @@
package org.apache.lucene.analysis.icu;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.charfilter.BaseCharFilter;
import com.ibm.icu.text.Normalizer2;
/**
* Normalize token text with ICU's {@link Normalizer2}.
*/
public final class ICUNormalizer2CharFilter extends BaseCharFilter {
private static final int IO_BUFFER_SIZE = 128;
private final Normalizer2 normalizer;
private final StringBuilder inputBuffer = new StringBuilder();
private final StringBuilder resultBuffer = new StringBuilder();
private boolean inputFinished;
private boolean afterQuickCheckYes;
private int checkedInputBoundary;
private int charCount;
/**
* Create a new Normalizer2CharFilter that combines NFKC normalization, Case
* Folding, and removes Default Ignorables (NFKC_Casefold)
*/
public ICUNormalizer2CharFilter(Reader in) {
this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
}
/**
* Create a new Normalizer2CharFilter with the specified Normalizer2
* @param in text
* @param normalizer normalizer to use
*/
public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) {
super(in);
if (normalizer == null) {
throw new NullPointerException("normalizer == null");
}
this.normalizer = normalizer;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
if (off < 0) throw new IllegalArgumentException("off < 0");
if (off >= cbuf.length) throw new IllegalArgumentException("off >= cbuf.length");
if (len <= 0) throw new IllegalArgumentException("len <= 0");
while (!inputFinished || inputBuffer.length() > 0 || resultBuffer.length() > 0) {
int retLen;
if (resultBuffer.length() > 0) {
retLen = outputFromResultBuffer(cbuf, off, len);
if (retLen > 0) {
return retLen;
}
}
int resLen = readAndNormalizeFromInput();
if (resLen > 0) {
retLen = outputFromResultBuffer(cbuf, off, len);
if (retLen > 0) {
return retLen;
}
}
readInputToBuffer();
}
return -1;
}
private final char[] tmpBuffer = new char[IO_BUFFER_SIZE];
private int readInputToBuffer() throws IOException {
final int len = input.read(tmpBuffer);
if (len == -1) {
inputFinished = true;
return 0;
}
inputBuffer.append(tmpBuffer, 0, len);
// if checkedInputBoundary was at the end of a buffer, we need to check that char again
checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
if (normalizer.isInert(tmpBuffer[len - 1]) && !Character.isHighSurrogate(tmpBuffer[len-1])) {
return len;
} else return len + readInputToBuffer();
}
private int readAndNormalizeFromInput() {
if (inputBuffer.length() <= 0) {
afterQuickCheckYes = false;
return 0;
}
if (!afterQuickCheckYes) {
int resLen = readFromInputWhileSpanQuickCheckYes();
afterQuickCheckYes = true;
if (resLen > 0) return resLen;
}
int resLen = readFromIoNormalizeUptoBoundary();
if(resLen > 0){
afterQuickCheckYes = false;
}
return resLen;
}
private int readFromInputWhileSpanQuickCheckYes() {
int end = normalizer.spanQuickCheckYes(inputBuffer);
if (end > 0) {
resultBuffer.append(inputBuffer.subSequence(0, end));
inputBuffer.delete(0, end);
checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
charCount += end;
}
return end;
}
private int readFromIoNormalizeUptoBoundary() {
// if there's no buffer to normalize, return 0
if (inputBuffer.length() <= 0) {
return 0;
}
boolean foundBoundary = false;
final int bufLen = inputBuffer.length();
while (checkedInputBoundary <= bufLen - 1) {
int charLen = Character.charCount(inputBuffer.codePointAt(checkedInputBoundary));
checkedInputBoundary += charLen;
if (checkedInputBoundary < bufLen && normalizer.hasBoundaryBefore(inputBuffer
.codePointAt(checkedInputBoundary))) {
foundBoundary = true;
break;
}
}
if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) {
foundBoundary = true;
checkedInputBoundary = bufLen;
}
if (!foundBoundary) {
return 0;
}
return normalizeInputUpto(checkedInputBoundary);
}
private int normalizeInputUpto(final int length) {
final int destOrigLen = resultBuffer.length();
normalizer.normalizeSecondAndAppend(resultBuffer,
inputBuffer.subSequence(0, length));
inputBuffer.delete(0, length);
checkedInputBoundary = Math.max(checkedInputBoundary - length, 0);
final int resultLength = resultBuffer.length() - destOrigLen;
recordOffsetDiff(length, resultLength);
return resultLength;
}
private void recordOffsetDiff(int inputLength, int outputLength) {
if (inputLength == outputLength) {
charCount += outputLength;
return;
}
final int diff = inputLength - outputLength;
final int cumuDiff = getLastCumulativeDiff();
if (diff < 0) {
for (int i = 1; i <= -diff; ++i) {
addOffCorrectMap(charCount + i, cumuDiff - i);
}
} else {
addOffCorrectMap(charCount + Math.min(1, outputLength), cumuDiff + diff);
}
charCount += outputLength;
}
private int outputFromResultBuffer(char[] cbuf, int begin, int len) {
len = Math.min(resultBuffer.length(), len);
resultBuffer.getChars(0, len, cbuf, begin);
if (len > 0) {
resultBuffer.delete(0, len);
}
return len;
}
}

View File

@ -0,0 +1,225 @@
package org.apache.lucene.analysis.icu;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.util.TestUtil;
import com.ibm.icu.text.Normalizer2;
public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
public void testNormalization() throws IOException {
String input = "ʰ㌰゙5℃№㈱㌘バッファーの正規化のテスト㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
String expectedOutput = normalizer.normalize(input);
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
char[] tempBuff = new char[10];
StringBuilder output = new StringBuilder();
while (true) {
int length = reader.read(tempBuff);
if (length == -1) {
break;
}
output.append(tempBuff, 0, length);
assertEquals(output.toString(), normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
}
assertEquals(expectedOutput, output.toString());
}
public void testTokenStream() throws IOException {
// '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));
Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenStream.setReader(reader);
assertTokenStreamContents(tokenStream,
new String[] {"°C", "No", "(株)", "グラム", "", "", "ピゴ"},
new int[] {0, 2, 4, 6, 8, 11, 14},
new int[] {1, 3, 5, 7, 10, 13, 16},
input.length());
}
public void testTokenStream2() throws IOException {
// '㌰', '<<', '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
String input = "㌰゙5℃№㈱㌘ザゾ";
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, 1, 1);
tokenStream.setReader(reader);
assertTokenStreamContents(tokenStream,
new String[] {"", "", "5", "°", "c", "n", "o", "(", "", ")", "", "", "", "", ""},
new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
input.length()
);
}
public void testMassiveLigature() throws IOException {
String input = "\uFDFA";
CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenStream.setReader(reader);
assertTokenStreamContents(tokenStream,
new String[] {"صلى", "الله", "عليه", "وسلم"},
new int[]{0, 0, 0, 0},
new int[]{0, 0, 0, 1},
input.length()
);
}
public void doTestMode(final Normalizer2 normalizer, int maxLength, int iterations) throws IOException {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer(MockTokenizer.KEYWORD, false));
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new ICUNormalizer2CharFilter(reader, normalizer);
}
};
for (int i = 0; i < iterations; i++) {
String input = TestUtil.randomUnicodeString(random(), maxLength);
if (input.length() == 0) {
continue;
}
String normalized = normalizer.normalize(input);
if (normalized.length() == 0) {
continue; // MockTokenizer doesnt tokenize empty string...
}
checkOneTerm(a, input, normalized);
}
}
public void testNFC() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000);
}
public void testNFCHuge() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 8192, RANDOM_MULTIPLIER*100);
}
public void testNFD() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 20, RANDOM_MULTIPLIER*1000);
}
public void testNFDHuge() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 8192, RANDOM_MULTIPLIER*100);
}
public void testNFKC() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000);
}
public void testNFKCHuge() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 8192, RANDOM_MULTIPLIER*100);
}
public void testNFKD() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 20, RANDOM_MULTIPLIER*1000);
}
public void testNFKDHuge() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 8192, RANDOM_MULTIPLIER*100);
}
public void testNFKC_CF() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000);
}
public void testNFKC_CFHuge() throws Exception {
doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 8192, RANDOM_MULTIPLIER*100);
}
public void testRandomStrings() throws IOException {
// nfkc_cf
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// huge strings
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);
// nfkd
a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE));
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
// huge strings
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);
}
public void testCuriousString() throws Exception {
String text = "\udb40\udc3d\uf273\ue960\u06c8\ud955\udc13\ub7fc\u0692 \u2089\u207b\u2073\u2075";
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
}
};
for (int i = 0; i < 1000; i++) {
checkAnalysisConsistency(random(), a, false, text);
}
}
}