mirror of https://github.com/apache/lucene.git
LUCENE-2409: Add ICUTransformFilter to support ICU transforms
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@937039 13f79535-47bb-0310-9956-ffa450edef68
Parent: e5e05ae8ba
Commit: f1b979721a
@@ -133,6 +133,10 @@ New features
  does a more thorough job of normalizing unicode text for search.
  (Robert Haschart, Robert Muir)

* LUCENE-2409: Add ICUTransformFilter, which transforms text in a context
  sensitive way, either from ICU built-in rules (such as Traditional-Simplified),
  or from rules you write yourself.  (Robert Muir)

Build

* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
@@ -0,0 +1,184 @@
package org.apache.lucene.analysis.icu;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * A {@link TokenFilter} that transforms text with ICU.
 * <p>
 * ICU provides text-transformation functionality via its Transliteration API.
 * Although script conversion is its most common use, a Transliterator can
 * actually perform a more general class of tasks. In fact, Transliterator
 * defines a very general API which specifies only that a segment of the input
 * text is replaced by new text. The particulars of this conversion are
 * determined entirely by subclasses of Transliterator.
 * </p>
 * <p>
 * Some useful transformations for search are built-in:
 * <ul>
 * <li>Conversion from Traditional to Simplified Chinese characters
 * <li>Conversion from Hiragana to Katakana
 * <li>Conversion from Fullwidth to Halfwidth forms.
 * <li>Script conversions, for example Serbian Cyrillic to Latin
 * </ul>
 * </p>
 * <p>
 * Example usage: <blockquote>stream = new ICUTransformFilter(stream,
 * Transliterator.getInstance("Traditional-Simplified"));</blockquote>
 * </p>
 * For more details, see the <a
 * href="http://userguide.icu-project.org/transforms/general">ICU User
 * Guide</a>.
 */
public final class ICUTransformFilter extends TokenFilter {
  // Transliterator to transform the text
  private final Transliterator transform;

  // Reusable position object
  private final Transliterator.Position position = new Transliterator.Position();

  // term attribute, will be updated with transformed text.
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  // Wraps a termAttribute around the replaceable interface.
  private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();

  /**
   * Create a new ICUTransformFilter that transforms text on the given stream.
   *
   * @param input {@link TokenStream} to filter.
   * @param transform Transliterator to transform the text.
   */
  public ICUTransformFilter(TokenStream input, Transliterator transform) {
    super(input);
    this.transform = transform;

    /*
     * This is cheating, but speeds things up a lot.
     * If we wanted to use pkg-private APIs we could probably do better.
     */
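    // Installing the rule set's source set as the transliterator's filter lets
    // filteredTransliterate() skip runs of text that no rule can match, instead
    // of feeding them through the full Replaceable machinery on every token.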
    if (transform.getFilter() == null && transform instanceof com.ibm.icu.text.RuleBasedTransliterator) {
      final UnicodeSet sourceSet = transform.getSourceSet();
      if (sourceSet != null && !sourceSet.isEmpty())
        transform.setFilter(sourceSet);
    }
  }

  @Override
  public boolean incrementToken() throws IOException {
    /*
     * Wrap around replaceable. clear the positions, and transliterate.
     */
    if (input.incrementToken()) {
      replaceableAttribute.setText(termAtt);

      final int length = termAtt.length();
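      // The term is both the text to transliterate and its own context:
      // contextStart/contextLimit give the rules their context window, while
      // start/limit bound the range that may actually be modified.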
      position.start = 0;
      position.limit = length;
      position.contextStart = 0;
      position.contextLimit = length;

      transform.filteredTransliterate(replaceableAttribute, position, false);
      return true;
    } else {
      return false;
    }
  }

  /**
   * Wrap a {@link CharTermAttribute} with the Replaceable API.
   */
  final class ReplaceableTermAttribute implements Replaceable {
    private char buffer[];
    private int length;
    private CharTermAttribute token;

    void setText(final CharTermAttribute token) {
      this.token = token;
      this.buffer = token.buffer();
      this.length = token.length();
    }

    public int char32At(int pos) {
      return UTF16.charAt(buffer, 0, length, pos);
    }

    public char charAt(int pos) {
      return buffer[pos];
    }

    public void copy(int start, int limit, int dest) {
      char text[] = new char[limit - start];
      getChars(start, limit, text, 0);
      replace(dest, dest, text, 0, limit - start);
    }

    public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) {
      System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart);
    }

    public boolean hasMetaData() {
      return false;
    }

    public int length() {
      return length;
    }

    public void replace(int start, int limit, String text) {
      final int charsLen = text.length();
      final int newLength = shiftForReplace(start, limit, charsLen);
      // insert the replacement text
      text.getChars(0, charsLen, buffer, start);
      token.setLength(length = newLength);
    }

    public void replace(int start, int limit, char[] text, int charsStart,
        int charsLen) {
      // shift text if necessary for the replacement
      final int newLength = shiftForReplace(start, limit, charsLen);
      // insert the replacement text
      System.arraycopy(text, charsStart, buffer, start, charsLen);
      token.setLength(length = newLength);
    }

    /** shift text (if necessary) for a replacement operation */
    private int shiftForReplace(int start, int limit, int charsLen) {
      final int replacementLength = limit - start;
      final int newLength = length - replacementLength + charsLen;
      // resize if necessary
      if (newLength > length)
        buffer = token.resizeBuffer(newLength);
      // if the substring being replaced is longer or shorter than the
      // replacement, need to shift things around
      if (replacementLength != charsLen && limit < length)
        System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit);
      return newLength;
    }
  }
}
@@ -38,6 +38,8 @@ APIs. This module exposes the following functionality:
  Unicode's Default Caseless Matching algorithm.</li>
  <li><a href="#searchfolding">Search Term Folding</a>: Removes distinctions
  (such as accent marks) between similar characters for a loose or fuzzy search.</li>
  <li><a href="#transform">Text Transformation</a>: Transforms Unicode text in
  a context-sensitive fashion: e.g. mapping Traditional to Simplified Chinese</li>
</ul>
<hr/>
<h1><a name="collation">Collation</a></h1>
@@ -286,6 +288,42 @@ many character foldings recursively.
  TokenStream tokenstream = new ICUFoldingFilter(tokenizer);
</pre></code>
<hr/>
<h1><a name="transform">Text Transformation</a></h1>
<p>
ICU provides text-transformation functionality via its Transliteration API. This allows
you to transform text in a variety of ways, taking context into account.
</p>
<p>
For more information, see the
<a href="http://userguide.icu-project.org/transforms/general">User's Guide</a>
and
<a href="http://userguide.icu-project.org/transforms/general/rules">Rule Tutorial</a>.
</p>
<h2>Use Cases</h2>
<ul>
  <li>
    Convert Traditional to Simplified
  </li>
  <li>
    Transliterate between different writing systems: e.g. Romanization
  </li>
</ul>
<h2>Example Usages</h2>
<h3>Convert Traditional to Simplified</h3>
<code><pre>
  /**
   * This filter will map Traditional Chinese to Simplified Chinese
   */
  TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Traditional-Simplified"));
</pre></code>
<h3>Transliterate Serbian Cyrillic to Serbian Latin</h3>
<code><pre>
  /**
   * This filter will map Serbian Cyrillic to Serbian Latin according to BGN rules
   */
  TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Serbian-Latin/BGN"));
</pre></code>
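<h3>Apply a custom rule-based transform</h3>
<p>
Transforms built from rules you write yourself work the same way. The sketch below is
illustrative only: the rule string and the name "Custom" are examples, not something
shipped with this module.
</p>
<code><pre>
  /**
   * This filter will apply a user-written rule set, here folding the ligature "æ" to "ae"
   */
  Transliterator custom = Transliterator.createFromRules("Custom", "æ > ae;", Transliterator.FORWARD);
  TokenStream tokenstream = new ICUTransformFilter(tokenizer, custom);
</pre></code>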
<hr/>
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
@@ -0,0 +1,86 @@
package org.apache.lucene.analysis.icu;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;

import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;

/**
 * Test the ICUTransformFilter with some basic examples.
 */
public class TestICUTransformFilter extends BaseTokenStreamTestCase {

  public void testBasicFunctionality() throws Exception {
    checkToken(Transliterator.getInstance("Traditional-Simplified"),
        "簡化字", "简化字");
    checkToken(Transliterator.getInstance("Katakana-Hiragana"),
        "ヒラガナ", "ひらがな");
checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"),
|
||||
"アルアノリウ", "アルアノリウ");
    checkToken(Transliterator.getInstance("Any-Latin"),
        "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos");
    checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"),
        "Alphabētikós Katálogos", "Alphabetikos Katalogos");
    checkToken(Transliterator.getInstance("Han-Latin"),
        "中国", "zhōng guó");
  }

  public void testCustomFunctionality() throws Exception {
    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
    checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "abacadaba", "bcbcbdbcb");
  }

  public void testCustomFunctionality2() throws Exception {
String rules = "c { a > b; a > d;"; // convert a's to b's and b's to c's
|
||||
    checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "caa", "cbd");
  }

  public void testOptimizer() throws Exception {
    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]")));
  }

  public void testOptimizer2() throws Exception {
    checkToken(Transliterator.getInstance("Traditional-Simplified; CaseFold"),
        "ABCDE", "abcde");
  }

  public void testOptimizerSurrogate() throws Exception {
    String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]")));
  }

  private void checkToken(Transliterator transform, String input, String expected) throws IOException {
    TokenStream ts = new ICUTransformFilter(new KeywordTokenizer((new StringReader(input))), transform);
    assertTokenStreamContents(ts, new String[] { expected });
  }
}