mirror of https://github.com/apache/lucene.git
LUCENE-4072: add ICUNormalizer2CharFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1579488 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent b569844e04
commit 9a1ab11007
@@ -112,6 +112,9 @@ New Features
  you update the value of a BinaryDocValuesField without reindexing the
  document(s). (Shai Erera)

* LUCENE-4072: Add ICUNormalizer2CharFilter, which lets you do unicode normalization
  with offset correction before the tokenizer. (David Goldfarb, Ippei UKAI via Robert Muir)

API Changes

* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
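For a sense of what the entry above adds: the filter wraps a Reader and normalizes text as it is read, before any tokenizer sees it. A minimal usage sketch against the API introduced in this commit (the demo class name and sample string are invented here, not part of the commit):

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;

// Hypothetical demo, not part of this commit.
public class NormalizeExample {
  public static void main(String[] args) throws Exception {
    // the one-arg constructor uses the nfkc_cf normalizer (NFKC + case folding)
    Reader normalized = new ICUNormalizer2CharFilter(new StringReader("℃ № ㈱"));
    StringBuilder out = new StringBuilder();
    char[] buf = new char[64];
    for (int n; (n = normalized.read(buf)) != -1; ) {
      out.append(buf, 0, n);
    }
    System.out.println(out); // should print "°c no (株)" under nfkc_cf
  }
}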
@@ -0,0 +1,206 @@
package org.apache.lucene.analysis.icu;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.charfilter.BaseCharFilter;

import com.ibm.icu.text.Normalizer2;

/**
 * Normalize token text with ICU's {@link Normalizer2}.
 */
public final class ICUNormalizer2CharFilter extends BaseCharFilter {

  private static final int IO_BUFFER_SIZE = 128;

  private final Normalizer2 normalizer;
  private final StringBuilder inputBuffer = new StringBuilder();
  private final StringBuilder resultBuffer = new StringBuilder();

  private boolean inputFinished;
  private boolean afterQuickCheckYes;
  private int checkedInputBoundary;
  private int charCount;

  /**
   * Create a new Normalizer2CharFilter that applies NFKC normalization, case
   * folding, and removal of default ignorables (NFKC_Casefold).
   */
  public ICUNormalizer2CharFilter(Reader in) {
    this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
  }

  /**
   * Create a new Normalizer2CharFilter with the specified Normalizer2.
   * @param in text
   * @param normalizer normalizer to use
   */
  public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) {
    super(in);
    if (normalizer == null) {
      throw new NullPointerException("normalizer == null");
    }
    this.normalizer = normalizer;
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    if (off < 0) throw new IllegalArgumentException("off < 0");
    if (off >= cbuf.length) throw new IllegalArgumentException("off >= cbuf.length");
    if (len <= 0) throw new IllegalArgumentException("len <= 0");

    while (!inputFinished || inputBuffer.length() > 0 || resultBuffer.length() > 0) {
      int retLen;

      if (resultBuffer.length() > 0) {
        retLen = outputFromResultBuffer(cbuf, off, len);
        if (retLen > 0) {
          return retLen;
        }
      }

      int resLen = readAndNormalizeFromInput();
      if (resLen > 0) {
        retLen = outputFromResultBuffer(cbuf, off, len);
        if (retLen > 0) {
          return retLen;
        }
      }

      readInputToBuffer();
    }

    return -1;
  }

  private final char[] tmpBuffer = new char[IO_BUFFER_SIZE];

  private int readInputToBuffer() throws IOException {
    final int len = input.read(tmpBuffer);
    if (len == -1) {
      inputFinished = true;
      return 0;
    }
    inputBuffer.append(tmpBuffer, 0, len);

    // if checkedInputBoundary was at the end of a buffer, we need to check that char again
    checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
    // keep reading while the buffer ends in a char that could still interact
    // with following text under normalization (or is half of a surrogate pair)
    if (normalizer.isInert(tmpBuffer[len - 1]) && !Character.isHighSurrogate(tmpBuffer[len - 1])) {
      return len;
    } else return len + readInputToBuffer();
  }

  private int readAndNormalizeFromInput() {
    if (inputBuffer.length() <= 0) {
      afterQuickCheckYes = false;
      return 0;
    }
    if (!afterQuickCheckYes) {
      // copy any leading run that is already normalized straight to the result
      int resLen = readFromInputWhileSpanQuickCheckYes();
      afterQuickCheckYes = true;
      if (resLen > 0) return resLen;
    }
    int resLen = readFromIoNormalizeUptoBoundary();
    if (resLen > 0) {
      afterQuickCheckYes = false;
    }
    return resLen;
  }

  private int readFromInputWhileSpanQuickCheckYes() {
    int end = normalizer.spanQuickCheckYes(inputBuffer);
    if (end > 0) {
      resultBuffer.append(inputBuffer.subSequence(0, end));
      inputBuffer.delete(0, end);
      checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
      charCount += end;
    }
    return end;
  }

  private int readFromIoNormalizeUptoBoundary() {
    // if there's no buffer to normalize, return 0
    if (inputBuffer.length() <= 0) {
      return 0;
    }

    boolean foundBoundary = false;
    final int bufLen = inputBuffer.length();

    // scan for a normalization boundary, so we never normalize across a point
    // where following input could still change the result
    while (checkedInputBoundary <= bufLen - 1) {
      int charLen = Character.charCount(inputBuffer.codePointAt(checkedInputBoundary));
      checkedInputBoundary += charLen;
      if (checkedInputBoundary < bufLen && normalizer.hasBoundaryBefore(inputBuffer
          .codePointAt(checkedInputBoundary))) {
        foundBoundary = true;
        break;
      }
    }
    if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) {
      // the input is exhausted, so the end of the buffer is a boundary
      foundBoundary = true;
      checkedInputBoundary = bufLen;
    }

    if (!foundBoundary) {
      return 0;
    }

    return normalizeInputUpto(checkedInputBoundary);
  }

  private int normalizeInputUpto(final int length) {
    final int destOrigLen = resultBuffer.length();
    normalizer.normalizeSecondAndAppend(resultBuffer,
        inputBuffer.subSequence(0, length));
    inputBuffer.delete(0, length);
    checkedInputBoundary = Math.max(checkedInputBoundary - length, 0);
    final int resultLength = resultBuffer.length() - destOrigLen;
    recordOffsetDiff(length, resultLength);
    return resultLength;
  }

  private void recordOffsetDiff(int inputLength, int outputLength) {
    if (inputLength == outputLength) {
      charCount += outputLength;
      return;
    }
    // normalization changed the length; record corrections so that
    // correctOffset() can map output offsets back to input offsets
    final int diff = inputLength - outputLength;
    final int cumuDiff = getLastCumulativeDiff();
    if (diff < 0) {
      for (int i = 1; i <= -diff; ++i) {
        addOffCorrectMap(charCount + i, cumuDiff - i);
      }
    } else {
      addOffCorrectMap(charCount + Math.min(1, outputLength), cumuDiff + diff);
    }
    charCount += outputLength;
  }

  private int outputFromResultBuffer(char[] cbuf, int begin, int len) {
    len = Math.min(resultBuffer.length(), len);
    resultBuffer.getChars(0, len, cbuf, begin);
    if (len > 0) {
      resultBuffer.delete(0, len);
    }
    return len;
  }
}
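The recordOffsetDiff/addOffCorrectMap bookkeeping above is what lets CharFilter.correctOffset map offsets in the normalized output back into the original input, so token offsets still point at the source text. A hedged sketch of that behavior (demo class invented here; it mirrors the testMassiveLigature case in the test file below, where the single char \uFDFA expands to a whole phrase):

import java.io.StringReader;

import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;

// Hypothetical demo, not part of this commit.
public class OffsetCorrectionDemo {
  public static void main(String[] args) throws Exception {
    // one input char that nfkc_cf expands to many output chars
    CharFilter filter = new ICUNormalizer2CharFilter(new StringReader("\uFDFA"));
    StringBuilder out = new StringBuilder();
    char[] buf = new char[64];
    for (int n; (n = filter.read(buf, 0, buf.length)) != -1; ) {
      out.append(buf, 0, n);
    }
    // every offset in the expanded output corrects back inside the
    // one-character input, i.e. to 0 or 1
    for (int i = 0; i <= out.length(); i++) {
      System.out.println(i + " -> " + filter.correctOffset(i));
    }
  }
}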
@@ -0,0 +1,225 @@
package org.apache.lucene.analysis.icu;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.util.TestUtil;

import com.ibm.icu.text.Normalizer2;

public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {

  public void testNormalization() throws IOException {
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
      int length = reader.read(tempBuff);
      if (length == -1) {
        break;
      }
      output.append(tempBuff, 0, length);
      // at every step, the output so far must equal the normalization of the
      // input prefix that the corrected offset points at
      assertEquals(output.toString(), normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
    }

    assertEquals(expectedOutput, output.toString());
  }

  public void testTokenStream() throws IOException {
    // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
    String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
        new int[] {0, 2, 4, 6, 8, 11, 14},
        new int[] {1, 3, 5, 7, 10, 13, 16},
        input.length());
  }

  public void testTokenStream2() throws IOException {
    // '㌰', '<<'゙, '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
    String input = "㌰゙5℃№㈱㌘ザゾ";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, 1, 1);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
        new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
        new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
        input.length()
    );
  }

  public void testMassiveLigature() throws IOException {
    String input = "\uFDFA";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"صلى", "الله", "عليه", "وسلم"},
        new int[]{0, 0, 0, 0},
        new int[]{0, 0, 0, 1},
        input.length()
    );
  }

  public void doTestMode(final Normalizer2 normalizer, int maxLength, int iterations) throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new MockTokenizer(MockTokenizer.KEYWORD, false));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new ICUNormalizer2CharFilter(reader, normalizer);
      }
    };

    for (int i = 0; i < iterations; i++) {
      String input = TestUtil.randomUnicodeString(random(), maxLength);
      if (input.length() == 0) {
        continue;
      }
      String normalized = normalizer.normalize(input);
      if (normalized.length() == 0) {
        continue; // MockTokenizer doesn't tokenize the empty string...
      }
      checkOneTerm(a, input, normalized);
    }
  }

  public void testNFC() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFCHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testNFD() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFDHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testNFKC() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFKCHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testNFKD() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFKDHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testNFKC_CF() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 20, RANDOM_MULTIPLIER*1000);
  }

  public void testNFKC_CFHuge() throws Exception {
    doTestMode(Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), 8192, RANDOM_MULTIPLIER*100);
  }

  public void testRandomStrings() throws IOException {
    // nfkc_cf
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
      }
    };
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
    // huge strings
    checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);

    // nfkd
    a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE));
      }
    };
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
    // huge strings
    checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);
  }

  public void testCuriousString() throws Exception {
    String text = "\udb40\udc3d\uf273\ue960\u06c8\ud955\udc13\ub7fc\u0692 \u2089\u207b\u2073\u2075";
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
      }
    };
    for (int i = 0; i < 1000; i++) {
      checkAnalysisConsistency(random(), a, false, text);
    }
  }
}