LUCENE-2906: filter to process output of Standard/ICUTokenizer and create overlapping bigrams for CJK

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1225433 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-12-29 05:04:49 +00:00
parent a1a8eda2cd
commit b2970db4bc
15 changed files with 1202 additions and 12 deletions

View File

@@ -102,6 +102,12 @@ New Features
* SOLR-2982: Added phonetic encoders to contrib/analyzers/phonetic:
Metaphone, Soundex, Caverphone, Beider-Morse, etc. (Robert Muir)
* LUCENE-2906: Added CJKBigramFilter that forms bigrams from StandardTokenizer or
ICUTokenizer CJK tokens, and CJKWidthFilter that normalizes halfwidth/fullwidth.
This filter supports Unicode supplementary characters, and you can toggle whether
bigrams are formed for each of Han/Hiragana/Katakana/Hangul independently. Deprecates
CJKTokenizer. (Tom Burton-West, Robert Muir)
API Changes
* LUCENE-3596: DirectoryTaxonomyWriter.openIndexWriter() now takes an

View File

@@ -22,16 +22,19 @@ import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
/**
* An {@link Analyzer} that tokenizes text with {@link CJKTokenizer} and
* filters with {@link StopFilter}
*
* An {@link Analyzer} that tokenizes text with {@link StandardTokenizer},
* normalizes content with {@link CJKWidthFilter}, folds case with
* {@link LowerCaseFilter}, forms bigrams of CJK with {@link CJKBigramFilter},
* and filters stopwords with {@link StopFilter}
*/
public final class CJKAnalyzer extends StopwordAnalyzerBase {
/**
@@ -86,7 +89,16 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new CJKTokenizer(reader);
return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
// run the width filter first, before bigramming: it sometimes combines characters.
TokenStream result = new CJKWidthFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new CJKBigramFilter(result);
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
} else {
final Tokenizer source = new CJKTokenizer(reader);
return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
}
}
}
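For context, here is a minimal usage sketch (not part of this commit) of the new chain as a consumer would drive it; the demo class name and sample text are made up, and it assumes the usual TokenStream contract of this era (reset, incrementToken, end, close):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class CJKAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_36);
    // CJK runs come out as overlapping bigrams typed <DOUBLE>; non-CJK tokens pass through
    TokenStream ts = analyzer.tokenStream("field", new StringReader("かきくけこ abc"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString() + " " + type.type()); // かき, きく, くけ, けこ, abc
    }
    ts.end();
    ts.close();
  }
}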

View File

@@ -0,0 +1,307 @@
package org.apache.lucene.analysis.cjk;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
/**
* Forms bigrams of CJK terms that are generated from StandardTokenizer
* or ICUTokenizer.
* <p>
* CJK types are set by these tokenizers, but you can also use
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
* of the CJK scripts are turned into bigrams.
* <p>
* In all cases, all non-CJK input is passed through unmodified.
*/
public final class CJKBigramFilter extends TokenFilter {
// configuration
/** bigram flag for Han Ideographs */
public static final int HAN = 1;
/** bigram flag for Hiragana */
public static final int HIRAGANA = 2;
/** bigram flag for Katakana */
public static final int KATAKANA = 4;
/** bigram flag for Hangul */
public static final int HANGUL = 8;
/** when we emit a bigram, it is then marked as this type */
public static final String DOUBLE_TYPE = "<DOUBLE>";
/** when we emit a unigram, it is then marked as this type */
public static final String SINGLE_TYPE = "<SINGLE>";
// the types from standardtokenizer
private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
// sentinel value for ignoring a script
private static final Object NO = new Object();
// these are set to either their type or NO if we want to pass them thru
private final Object doHan;
private final Object doHiragana;
private final Object doKatakana;
private final Object doHangul;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
// buffers containing codepoint and offsets in parallel
int buffer[] = new int[8];
int startOffset[] = new int[8];
int endOffset[] = new int[8];
// length of valid buffer
int bufferLen;
// current buffer index
int index;
// the last end offset, to determine if we should bigram across tokens
int lastEndOffset;
private boolean exhausted;
/**
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
* CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)}
*/
public CJKBigramFilter(TokenStream in) {
this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
}
/**
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
*/
public CJKBigramFilter(TokenStream in, int flags) {
super(in);
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
}
/*
* Much of this complexity revolves around handling the special case of a
* "lone CJK character", where CJKTokenizer would output a unigram. This
* is also the only time we ever have to captureState.
*/
@Override
public boolean incrementToken() throws IOException {
while (true) {
if (hasBufferedBigram()) {
// case 1: we have multiple remaining codepoints buffered,
// so we can emit a bigram here.
flushBigram();
return true;
} else if (doNext()) {
// case 2: look at the token type. should we form any n-grams?
String type = typeAtt.type();
if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) {
// acceptable CJK type: we form n-grams from these.
// as long as the offsets are aligned, we just add these to our current buffer.
// otherwise, we clear the buffer and start over.
if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
if (hasBufferedUnigram()) {
// we have a buffered unigram, and we peeked ahead to see if we could form
// a bigram, but we can't, because the offsets are unaligned. capture the state
// of this peeked data to be revisited next time through the loop, and dump our unigram.
loneState = captureState();
flushUnigram();
return true;
}
index = 0;
bufferLen = 0;
}
refill();
} else {
// not a CJK type: we just return these as-is.
if (hasBufferedUnigram()) {
// we have a buffered unigram, and we peeked ahead to see if we could form
// a bigram, but we can't, because it's not a CJK type. capture the state
// of this peeked data to be revisited next time through the loop, and dump our unigram.
loneState = captureState();
flushUnigram();
return true;
}
return true;
}
} else {
// case 3: we have only zero or 1 codepoints buffered,
// so not enough to form a bigram. But, we also have no
// more input. So if we have a buffered codepoint, emit
// a unigram; otherwise, it's the end of the stream.
if (hasBufferedUnigram()) {
flushUnigram(); // flush our remaining unigram
return true;
}
return false;
}
}
}
private State loneState; // rarely used: only for "lone CJK characters", where we emit unigrams
/**
* looks at the next input token, returning false if none is available
*/
private boolean doNext() throws IOException {
if (loneState != null) {
restoreState(loneState);
loneState = null;
return true;
} else {
if (exhausted) {
return false;
} else if (input.incrementToken()) {
return true;
} else {
exhausted = true;
return false;
}
}
}
/**
* refills buffers with new data from the current token.
*/
private void refill() throws IOException {
// compact buffers to keep them smallish if they become large
// just a safety check, but technically we only need the last codepoint
if (bufferLen > 64) {
int last = bufferLen - 1;
buffer[0] = buffer[last];
startOffset[0] = startOffset[last];
endOffset[0] = endOffset[last];
bufferLen = 1;
index -= last;
}
char termBuffer[] = termAtt.buffer();
int len = termAtt.length();
int start = offsetAtt.startOffset();
int end = offsetAtt.endOffset();
int newSize = bufferLen + len;
buffer = ArrayUtil.grow(buffer, newSize);
startOffset = ArrayUtil.grow(startOffset, newSize);
endOffset = ArrayUtil.grow(endOffset, newSize);
lastEndOffset = end;
if (end - start != len) {
// crazy offsets (modified by synonym or charfilter): just preserve
for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) {
cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
startOffset[bufferLen] = start;
endOffset[bufferLen] = end;
bufferLen++;
}
} else {
// normal offsets
for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) {
cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
cpLen = Character.charCount(cp);
startOffset[bufferLen] = start;
start = endOffset[bufferLen] = start + cpLen;
bufferLen++;
}
}
}
/**
* Flushes a bigram token to output from our buffer
* This is the normal case, e.g. ABC -> AB BC
*/
private void flushBigram() {
clearAttributes();
char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
int len1 = Character.toChars(buffer[index], termBuffer, 0);
int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1);
termAtt.setLength(len2);
offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
typeAtt.setType(DOUBLE_TYPE);
index++;
}
/**
* Flushes a unigram token to output from our buffer.
* This happens when we encounter an isolated CJK character: either the whole
* CJK string is a single character, or a CJK character is surrounded by
* space, punctuation, English, etc., but is not beside any other CJK character.
*/
private void flushUnigram() {
clearAttributes();
char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
int len = Character.toChars(buffer[index], termBuffer, 0);
termAtt.setLength(len);
offsetAtt.setOffset(startOffset[index], endOffset[index]);
typeAtt.setType(SINGLE_TYPE);
index++;
}
/**
* True if we have multiple codepoints sitting in our buffer
*/
private boolean hasBufferedBigram() {
return bufferLen - index > 1;
}
/**
* True if we have a single codepoint sitting in our buffer, where its future
* (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
* inputs.
*/
private boolean hasBufferedUnigram() {
return bufferLen == 1 && index == 0;
}
@Override
public void reset() throws IOException {
super.reset();
bufferLen = 0;
index = 0;
lastEndOffset = 0;
loneState = null;
exhausted = false;
}
}
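As a usage note, a hedged sketch (not in this commit) of assembling the chain by hand with an explicit flag set, here Han-only bigrams; the class name is illustrative, and the sample sentence mirrors the factory test further down in this commit:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class HanOnlyBigramDemo {
  public static void main(String[] args) throws IOException {
    Tokenizer source = new StandardTokenizer(Version.LUCENE_36,
        new StringReader("多くの学生が試験に落ちた"));
    // width normalization runs before bigramming because it can combine characters
    TokenStream stream = new CJKWidthFilter(source);
    // only Han runs are bigrammed; Hiragana/Katakana/Hangul pass through unmodified
    stream = new CJKBigramFilter(stream, CJKBigramFilter.HAN);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString()); // 学生 and 試験 as bigrams, the rest as single tokens
    }
    stream.end();
    stream.close();
  }
}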

View File

@@ -44,7 +44,9 @@ import org.apache.lucene.util.AttributeSource;
* please search <a
* href="http://www.google.com/search?q=word+chinese+segment">google</a>
*
* @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.
*/
@Deprecated
public final class CJKTokenizer extends Tokenizer {
//~ Static fields/initializers ---------------------------------------------
/** Word token type */

View File

@@ -0,0 +1,95 @@
package org.apache.lucene.analysis.cjk;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.StemmerUtil;
/**
* A {@link TokenFilter} that normalizes CJK width differences:
* <ul>
* <li>Folds fullwidth ASCII variants into the equivalent basic latin
* <li>Folds halfwidth Katakana variants into the equivalent kana
* </ul>
* <p>
* NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
* Unicode normalization. See the normalization support in the ICU package
* for full normalization.
*/
public final class CJKWidthFilter extends TokenFilter {
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/* halfwidth kana mappings: 0xFF65-0xFF9F
*
* note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A
* as a fallback when they cannot properly combine with a preceding
* character into a composed form.
*/
private static final char KANA_NORM[] = new char[] {
0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
};
public CJKWidthFilter(TokenStream input) {
super(input);
}
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
char text[] = termAtt.buffer();
int length = termAtt.length();
for (int i = 0; i < length; i++) {
final char ch = text[i];
if (ch >= 0xFF01 && ch <= 0xFF5E) {
// Fullwidth ASCII variants
text[i] -= 0xFEE0;
} else if (ch >= 0xFF65 && ch <= 0xFF9F) {
// Halfwidth Katakana variants
if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, length, ch)) {
length = StemmerUtil.delete(text, i--, length);
} else {
text[i] = KANA_NORM[ch - 0xFF65];
}
}
}
termAtt.setLength(length);
return true;
} else {
return false;
}
}
/* kana combining diffs: 0x30A6-0x30FD */
private static final byte KANA_COMBINE_VOICED[] = new byte[] {
78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
};
private static final byte KANA_COMBINE_HALF_VOICED[] = new byte[] {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/** returns true if we successfully combined the voice mark */
private static boolean combine(char text[], int pos, int length, char ch) {
final char prev = text[pos-1];
if (prev >= 0x30A6 && prev <= 0x30FD) {
text[pos-1] += (ch == 0xFF9F)
? KANA_COMBINE_HALF_VOICED[prev - 0x30A6]
: KANA_COMBINE_VOICED[prev - 0x30A6];
return text[pos-1] != prev;
}
return false;
}
}
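A standalone sketch (not from the commit) of just the fullwidth-ASCII branch above: that fold is a constant offset of 0xFEE0, which is why no lookup table is needed for the 0xFF01-0xFF5E range. Halfwidth kana additionally require the KANA_NORM lookup and the voice-mark combination handled by combine().

public class WidthFoldSketch {
  public static void main(String[] args) {
    String fullwidth = "\uFF34\uFF45\uFF53\uFF54 \uFF11\uFF12\uFF13\uFF14"; // fullwidth "Test 1234"
    StringBuilder folded = new StringBuilder();
    for (char ch : fullwidth.toCharArray()) {
      if (ch >= 0xFF01 && ch <= 0xFF5E) {
        folded.append((char) (ch - 0xFEE0)); // e.g. U+FF34 - 0xFEE0 = U+0054 'T'
      } else {
        folded.append(ch);
      }
    }
    System.out.println(folded); // Test 1234
  }
}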

View File

@@ -0,0 +1,275 @@
package org.apache.lucene.analysis.cjk;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
/**
* Most tests adapted from TestCJKTokenizer
*/
public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
private Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
public void testJa1() throws IOException {
assertAnalyzesTo(analyzer, "一二三四五六七八九十",
new String[] { "一二", "二三", "三四", "四五", "五六", "六七", "七八", "八九", "九十" },
new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 },
new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testJa2() throws IOException {
assertAnalyzesTo(analyzer, "一 二三四 五六七八九 十",
new String[] { "一", "二三", "三四", "五六", "六七", "七八", "八九", "十" },
new int[] { 0, 2, 3, 6, 7, 8, 9, 12 },
new int[] { 1, 4, 5, 8, 9, 10, 11, 13 },
new String[] { "<SINGLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testC() throws IOException {
assertAnalyzesTo(analyzer, "abc defgh ijklmn opqrstu vwxy z",
new String[] { "abc", "defgh", "ijklmn", "opqrstu", "vwxy", "z" },
new int[] { 0, 4, 10, 17, 25, 30 },
new int[] { 3, 9, 16, 24, 29, 31 },
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" },
new int[] { 1, 1, 1, 1, 1, 1 });
}
/**
* LUCENE-2207: wrong offset calculated by end()
*/
public void testFinalOffset() throws IOException {
assertAnalyzesTo(analyzer, "あい",
new String[] { "あい" },
new int[] { 0 },
new int[] { 2 },
new String[] { "<DOUBLE>" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "あい ",
new String[] { "あい" },
new int[] { 0 },
new int[] { 2 },
new String[] { "<DOUBLE>" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "test",
new String[] { "test" },
new int[] { 0 },
new int[] { 4 },
new String[] { "<ALPHANUM>" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "test ",
new String[] { "test" },
new int[] { 0 },
new int[] { 4 },
new String[] { "<ALPHANUM>" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "あいtest",
new String[] { "あい", "test" },
new int[] { 0, 2 },
new int[] { 2, 6 },
new String[] { "<DOUBLE>", "<ALPHANUM>" },
new int[] { 1, 1 });
assertAnalyzesTo(analyzer, "testあい ",
new String[] { "test", "あい" },
new int[] { 0, 4 },
new int[] { 4, 6 },
new String[] { "<ALPHANUM>", "<DOUBLE>" },
new int[] { 1, 1 });
}
public void testMix() throws IOException {
assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1});
}
public void testMix2() throws IOException {
assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
/**
* Non-English text (outside of CJK) is treated normally, according to Unicode rules
*/
public void testNonIdeographic() throws IOException {
assertAnalyzesTo(analyzer, "一 روبرت موير",
new String[] { "一", "روبرت", "موير" },
new int[] { 0, 2, 8 },
new int[] { 1, 7, 12 },
new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
new int[] { 1, 1, 1 });
}
/**
* Same as the above, except with a nonspacing mark to show correctness.
*/
public void testNonIdeographicNonLetter() throws IOException {
assertAnalyzesTo(analyzer, "一 رُوبرت موير",
new String[] { "一", "رُوبرت", "موير" },
new int[] { 0, 2, 9 },
new int[] { 1, 8, 13 },
new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
new int[] { 1, 1, 1 });
}
public void testSurrogates() throws IOException {
assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
new String[] { "𩬅艱", "艱鍟", "鍟䇹", "䇹愯", "愯瀛" },
new int[] { 0, 2, 3, 4, 5 },
new int[] { 3, 4, 5, 6, 7 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1 });
}
public void testReusableTokenStream() throws IOException {
assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1});
assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testSingleChar() throws IOException {
assertAnalyzesTo(analyzer, "一",
new String[] { "一" },
new int[] { 0 },
new int[] { 1 },
new String[] { "<SINGLE>" },
new int[] { 1 });
}
public void testTokenStream() throws IOException {
assertAnalyzesTo(analyzer, "一丁丂",
new String[] { "一丁", "丁丂"},
new int[] { 0, 1 },
new int[] { 2, 3 },
new String[] { "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1 });
}
/** test that offsets are correct when mappingcharfilter is previously applied */
public void testChangedOffsets() throws IOException {
final NormalizeCharMap norm = new NormalizeCharMap();
norm.add("a", "一二");
norm.add("b", "二三");
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
}
@Override
protected Reader initReader(Reader reader) {
return new MappingCharFilter(norm, CharReader.get(reader));
}
};
assertAnalyzesTo(analyzer, "ab",
new String[] { "一二", "二二", "二三" },
new int[] { 0, 0, 1 },
new int[] { 1, 1, 2 });
// note: offsets are strange since this is how the charfilter maps them...
// before bigramming, the 4 tokens look like:
// { 0, 0, 1, 1 },
// { 0, 1, 1, 2 }
}
private static class FakeStandardTokenizer extends TokenFilter {
final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public FakeStandardTokenizer(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]);
return true;
} else {
return false;
}
}
}
public void testSingleChar2() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filter = new FakeStandardTokenizer(tokenizer);
filter = new StopFilter(TEST_VERSION_CURRENT, filter, CharArraySet.EMPTY_SET);
filter = new CJKBigramFilter(filter);
return new TokenStreamComponents(tokenizer, filter);
}
};
assertAnalyzesTo(analyzer, "一",
new String[] { "一" },
new int[] { 0 },
new int[] { 1 },
new String[] { "<SINGLE>" },
new int[] { 1 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -21,7 +21,10 @@ import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.Version;
/** @deprecated Remove when CJKTokenizer is removed (5.0) */
@Deprecated
public class TestCJKTokenizer extends BaseTokenStreamTestCase {
class TestToken {
@@ -41,7 +44,7 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
}
public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
String terms[] = new String[out_tokens.length];
int startOffsets[] = new int[out_tokens.length];
int endOffsets[] = new int[out_tokens.length];
@@ -56,7 +59,7 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
}
public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
String terms[] = new String[out_tokens.length];
int startOffsets[] = new int[out_tokens.length];
int endOffsets[] = new int[out_tokens.length];
@@ -212,13 +215,13 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
}
public void testTokenStream() throws Exception {
Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
}
public void testReusableTokenStream() throws Exception {
Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
TestToken[] out_tokens = {
@@ -273,6 +276,6 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
checkRandomData(random, new CJKAnalyzer(Version.LUCENE_30), 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -0,0 +1,67 @@
package org.apache.lucene.analysis.cjk;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
/**
* Tests for {@link CJKWidthFilter}
*/
public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new CJKWidthFilter(source));
}
};
/**
* Full-width ASCII forms normalized to half-width (basic latin)
*/
public void testFullWidthASCII() throws IOException {
assertAnalyzesTo(analyzer, "Ｔｅｓｔ １２３４",
new String[] { "Test", "1234" },
new int[] { 0, 5 },
new int[] { 4, 9 });
}
/**
* Half-width katakana forms normalized to standard katakana.
* A bit trickier in some cases, since half-width forms are decomposed
* and voice marks need to be recombined with a preceding base form.
*/
public void testHalfWidthKana() throws IOException {
assertAnalyzesTo(analyzer, "ｶﾀｶﾅ",
new String[] { "カタカナ" });
assertAnalyzesTo(analyzer, "ｳﾞｨｯﾂ",
new String[] { "ヴィッツ" });
assertAnalyzesTo(analyzer, "ﾊﾟﾅｿﾆｯｸ",
new String[] { "パナソニック" });
}
public void testRandomData() throws IOException {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -0,0 +1,226 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.util.CharArraySet;
/**
* Tests ICUTokenizer's ability to work with CJKBigramFilter.
* Most tests adapted from TestCJKTokenizer
*/
public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
/**
* ICUTokenizer+CJKBigramFilter
*/
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new ICUTokenizer(reader);
TokenStream result = new CJKBigramFilter(source);
return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
}
};
/**
* ICUTokenizer+ICUNormalizer2Filter+CJKBigramFilter.
*
* ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent
* superset of CJKWidthFilter's foldings.
*/
private Analyzer analyzer2 = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new ICUTokenizer(reader);
// we put this before the CJKBigramFilter, because the normalization might combine
// some halfwidth katakana forms, which will affect the bigramming.
TokenStream result = new ICUNormalizer2Filter(source);
result = new CJKBigramFilter(result);
return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
}
};
public void testJa1() throws IOException {
assertAnalyzesTo(analyzer, "一二三四五六七八九十",
new String[] { "一二", "二三", "三四", "四五", "五六", "六七", "七八", "八九", "九十" },
new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 },
new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testJa2() throws IOException {
assertAnalyzesTo(analyzer, "一 二三四 五六七八九 十",
new String[] { "一", "二三", "三四", "五六", "六七", "七八", "八九", "十" },
new int[] { 0, 2, 3, 6, 7, 8, 9, 12 },
new int[] { 1, 4, 5, 8, 9, 10, 11, 13 },
new String[] { "<SINGLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testC() throws IOException {
assertAnalyzesTo(analyzer, "abc defgh ijklmn opqrstu vwxy z",
new String[] { "abc", "defgh", "ijklmn", "opqrstu", "vwxy", "z" },
new int[] { 0, 4, 10, 17, 25, 30 },
new int[] { 3, 9, 16, 24, 29, 31 },
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" },
new int[] { 1, 1, 1, 1, 1, 1 });
}
/**
* LUCENE-2207: wrong offset calculated by end()
*/
public void testFinalOffset() throws IOException {
assertAnalyzesTo(analyzer, "あい",
new String[] { "あい" },
new int[] { 0 },
new int[] { 2 },
new String[] { "<DOUBLE>" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "あい ",
new String[] { "あい" },
new int[] { 0 },
new int[] { 2 },
new String[] { "<DOUBLE>" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "test",
new String[] { "test" },
new int[] { 0 },
new int[] { 4 },
new String[] { "<ALPHANUM>" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "test ",
new String[] { "test" },
new int[] { 0 },
new int[] { 4 },
new String[] { "<ALPHANUM>" },
new int[] { 1 });
assertAnalyzesTo(analyzer, "あいtest",
new String[] { "あい", "test" },
new int[] { 0, 2 },
new int[] { 2, 6 },
new String[] { "<DOUBLE>", "<ALPHANUM>" },
new int[] { 1, 1 });
assertAnalyzesTo(analyzer, "testあい ",
new String[] { "test", "あい" },
new int[] { 0, 4 },
new int[] { 4, 6 },
new String[] { "<ALPHANUM>", "<DOUBLE>" },
new int[] { 1, 1 });
}
public void testMix() throws IOException {
assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1});
}
public void testMix2() throws IOException {
assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
/**
* Non-English text (outside of CJK) is treated normally, according to Unicode rules
*/
public void testNonIdeographic() throws IOException {
assertAnalyzesTo(analyzer, "一 روبرت موير",
new String[] { "一", "روبرت", "موير" },
new int[] { 0, 2, 8 },
new int[] { 1, 7, 12 },
new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
new int[] { 1, 1, 1 });
}
/**
* Same as the above, except with a nonspacing mark to show correctness.
*/
public void testNonIdeographicNonLetter() throws IOException {
assertAnalyzesTo(analyzer, "一 رُوبرت موير",
new String[] { "一", "رُوبرت", "موير" },
new int[] { 0, 2, 9 },
new int[] { 1, 8, 13 },
new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
new int[] { 1, 1, 1 });
}
public void testSurrogates() throws IOException {
assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
new String[] { "𩬅艱", "艱鍟", "鍟䇹", "䇹愯", "愯瀛" },
new int[] { 0, 2, 3, 4, 5 },
new int[] { 3, 4, 5, 6, 7 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1 });
}
public void testReusableTokenStream() throws IOException {
assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1});
assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testSingleChar() throws IOException {
assertAnalyzesTo(analyzer, "一",
new String[] { "一" },
new int[] { 0 },
new int[] { 1 },
new String[] { "<SINGLE>" },
new int[] { 1 });
}
public void testTokenStream() throws IOException {
assertAnalyzesTo(analyzer, "一丁丂",
new String[] { "一丁", "丁丂"},
new int[] { 0, 1 },
new int[] { 2, 3 },
new String[] { "<DOUBLE>", "<DOUBLE>" },
new int[] { 1, 1 });
}
}

View File

@@ -0,0 +1,64 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
/**
* Factory for {@link CJKBigramFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.CJKBigramFilterFactory"
* han="true" hiragana="true"
* katakana="true" hangul="true" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class CJKBigramFilterFactory extends BaseTokenFilterFactory {
int flags;
@Override
public void init(Map<String,String> args) {
super.init(args);
flags = 0;
if (getBoolean("han", true)) {
flags |= CJKBigramFilter.HAN;
}
if (getBoolean("hiragana", true)) {
flags |= CJKBigramFilter.HIRAGANA;
}
if (getBoolean("katakana", true)) {
flags |= CJKBigramFilter.KATAKANA;
}
if (getBoolean("hangul", true)) {
flags |= CJKBigramFilter.HANGUL;
}
}
@Override
public TokenStream create(TokenStream input) {
return new CJKBigramFilter(input, flags);
}
}
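For completeness, a hedged sketch (not part of the commit) of driving the factory programmatically; the helper name is made up, and the init-args pattern mirrors the factory test further down. Unspecified scripts default to true, so a Han-only configuration disables the other three:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.CJKBigramFilterFactory;

public class HanOnlyFactoryConfig {
  /** wraps the input stream with a CJKBigramFilter configured for Han-only bigrams */
  public static TokenStream hanOnlyBigrams(TokenStream input) {
    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("hiragana", "false");
    args.put("katakana", "false");
    args.put("hangul", "false");
    factory.init(args);
    return factory.create(input);
  }
}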

View File

@@ -30,8 +30,9 @@ import java.io.Reader;
* &lt;tokenizer class="solr.CJKTokenizerFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* @deprecated
*/
@Deprecated
public class CJKTokenizerFactory extends BaseTokenizerFactory {
public CJKTokenizer create(Reader in) {
return new CJKTokenizer(in);

View File

@@ -0,0 +1,42 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
/**
* Factory for {@link CJKWidthFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.CJKBigramFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class CJKWidthFilterFactory extends BaseTokenFilterFactory {
@Override
public TokenStream create(TokenStream input) {
return new CJKWidthFilter(input);
}
}

View File

@@ -0,0 +1,52 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
* Simple tests to ensure the CJK bigram factory is working.
*/
public class TestCJKBigramFilterFactory extends BaseTokenTestCase {
public void testDefaults() throws Exception {
Reader reader = new StringReader("多くの学生が試験に落ちた。");
CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
assertTokenStreamContents(stream,
new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
}
public void testHanOnly() throws Exception {
Reader reader = new StringReader("多くの学生が試験に落ちた。");
CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("hiragana", "false");
factory.init(args);
TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
assertTokenStreamContents(stream,
new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
}
}

View File

@@ -24,7 +24,9 @@ import org.apache.lucene.analysis.TokenStream;
/**
* Simple tests to ensure the CJK tokenizer factory is working.
* @deprecated
*/
@Deprecated
public class TestCJKTokenizerFactory extends BaseTokenTestCase {
/**
* Ensure the tokenizer actually tokenizes CJK text correctly

View File

@@ -0,0 +1,36 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
/**
* Simple tests to ensure the CJKWidthFilterFactory is working
*/
public class TestCJKWidthFilterFactory extends BaseTokenTestCase {
public void test() throws Exception {
Reader reader = new StringReader("Ｔｅｓｔ １２３４");
CJKWidthFilterFactory factory = new CJKWidthFilterFactory();
TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
assertTokenStreamContents(stream, new String[] { "Test", "1234" });
}
}