mirror of https://github.com/apache/lucene.git
LUCENE-2906: filter to process output of Standard/ICUTokenizer and create overlapping bigrams for CJK
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1225433 13f79535-47bb-0310-9956-ffa450edef68
Parent: a1a8eda2cd
Commit: b2970db4bc
@@ -102,6 +102,12 @@ New Features

* SOLR-2982: Added phonetic encoders to contrib/analyzers/phonetic:
  Metaphone, Soundex, Caverphone, Beider-Morse, etc.  (Robert Muir)

* LUCENE-2906: Added CJKBigramFilter that forms bigrams from StandardTokenizer or
  ICUTokenizer CJK tokens, and CJKWidthFilter that normalizes halfwidth/fullwidth
  forms. The filter supports Unicode supplementary characters, and you can toggle
  whether bigrams are formed for each of Han/Hiragana/Katakana/Hangul independently.
  Deprecates CJKTokenizer.  (Tom Burton-West, Robert Muir)

API Changes

* LUCENE-3596: DirectoryTaxonomyWriter.openIndexWriter() now takes an
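For reference, a minimal sketch of the chain this entry describes, wired by hand from the classes added in this commit (the field of text and version constant are illustrative; CJKAnalyzer below packages the same chain):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class BigramChainDemo {
  public static void main(String[] args) throws Exception {
    // tokenize, fold widths (before bigramming), lowercase, then bigram the CJK runs
    Tokenizer source = new StandardTokenizer(Version.LUCENE_36, new StringReader("あいうえおabc"));
    TokenStream stream = new CJKWidthFilter(source);
    stream = new LowerCaseFilter(Version.LUCENE_36, stream);
    stream = new CJKBigramFilter(stream);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term); // あい, いう, うえ, えお, abc
    }
    stream.end();
    stream.close();
  }
}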
@@ -22,16 +22,19 @@ import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;


/**
- * An {@link Analyzer} that tokenizes text with {@link CJKTokenizer} and
- * filters with {@link StopFilter}
- *
+ * An {@link Analyzer} that tokenizes text with {@link StandardTokenizer},
+ * normalizes content with {@link CJKWidthFilter}, folds case with
+ * {@link LowerCaseFilter}, forms bigrams of CJK with {@link CJKBigramFilter},
+ * and filters stopwords with {@link StopFilter}
 */
public final class CJKAnalyzer extends StopwordAnalyzerBase {
  /**

@@ -86,7 +89,16 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
-    final Tokenizer source = new CJKTokenizer(reader);
-    return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
+    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+      // run the widthfilter first, before bigramming: it sometimes combines characters.
+      TokenStream result = new CJKWidthFilter(source);
+      result = new LowerCaseFilter(matchVersion, result);
+      result = new CJKBigramFilter(result);
+      return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
+    } else {
+      final Tokenizer source = new CJKTokenizer(reader);
+      return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
+    }
  }
}
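A hedged usage sketch (field name and sample text are illustrative) showing the matchVersion gate above in action; both chains bigram the sample, as the back-compat tests further down confirm:

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CJKAnalyzerVersionDemo {
  public static void main(String[] args) throws Exception {
    // LUCENE_36 selects the new StandardTokenizer+CJKWidthFilter+CJKBigramFilter chain;
    // LUCENE_30 falls back to the deprecated CJKTokenizer for index compatibility.
    for (Version v : new Version[] { Version.LUCENE_36, Version.LUCENE_30 }) {
      Analyzer analyzer = new CJKAnalyzer(v);
      TokenStream ts = analyzer.tokenStream("f", new StringReader("一二三"));
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.print(term + " "); // both versions print: 一二 二三
      }
      System.out.println();
      ts.end();
      ts.close();
    }
  }
}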
@@ -0,0 +1,307 @@
package org.apache.lucene.analysis.cjk;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;

/**
 * Forms bigrams of CJK terms that are generated from StandardTokenizer
 * or ICUTokenizer.
 * <p>
 * CJK types are set by these tokenizers, but you can also use
 * {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
 * of the CJK scripts are turned into bigrams.
 * <p>
 * In all cases, all non-CJK input is passed through unmodified.
 */
public final class CJKBigramFilter extends TokenFilter {
  // configuration
  /** bigram flag for Han Ideographs */
  public static final int HAN = 1;
  /** bigram flag for Hiragana */
  public static final int HIRAGANA = 2;
  /** bigram flag for Katakana */
  public static final int KATAKANA = 4;
  /** bigram flag for Hangul */
  public static final int HANGUL = 8;

  /** when we emit a bigram, it's marked as this type */
  public static final String DOUBLE_TYPE = "<DOUBLE>";
  /** when we emit a unigram, it's marked as this type */
  public static final String SINGLE_TYPE = "<SINGLE>";

  // the types from standardtokenizer
  private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
  private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
  private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
  private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];

  // sentinel value for ignoring a script
  private static final Object NO = new Object();

  // these are set to either their type or NO if we want to pass them thru
  private final Object doHan;
  private final Object doHiragana;
  private final Object doKatakana;
  private final Object doHangul;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  // buffers containing codepoints and offsets in parallel
  int buffer[] = new int[8];
  int startOffset[] = new int[8];
  int endOffset[] = new int[8];
  // length of valid buffer
  int bufferLen;
  // current buffer index
  int index;

  // the last end offset, to determine if we should bigram across tokens
  int lastEndOffset;

  private boolean exhausted;

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
   *       CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
   */
  public CJKBigramFilter(TokenStream in) {
    this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
  }

  /**
   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
   * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
   *        {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
   */
  public CJKBigramFilter(TokenStream in, int flags) {
    super(in);
    doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
    doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
    doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
    doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
  }

  /*
   * much of this complexity revolves around handling the special case of a
   * "lone cjk character" where cjktokenizer would output a unigram. this
   * is also the only time we ever have to captureState.
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      if (hasBufferedBigram()) {

        // case 1: we have multiple remaining codepoints buffered,
        // so we can emit a bigram here.

        flushBigram();
        return true;
      } else if (doNext()) {

        // case 2: look at the token type. should we form any n-grams?

        String type = typeAtt.type();
        if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) {

          // acceptable CJK type: we form n-grams from these.
          // as long as the offsets are aligned, we just add these to our current buffer.
          // otherwise, we clear the buffer and start over.

          if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
            if (hasBufferedUnigram()) {

              // we have a buffered unigram, and we peeked ahead to see if we could form
              // a bigram, but we can't, because the offsets are unaligned. capture the state
              // of this peeked data to be revisited next time thru the loop, and dump our unigram.

              loneState = captureState();
              flushUnigram();
              return true;
            }
            index = 0;
            bufferLen = 0;
          }
          refill();
        } else {

          // not a CJK type: we just return these as-is.

          if (hasBufferedUnigram()) {

            // we have a buffered unigram, and we peeked ahead to see if we could form
            // a bigram, but we can't, because it's not a CJK type. capture the state
            // of this peeked data to be revisited next time thru the loop, and dump our unigram.

            loneState = captureState();
            flushUnigram();
            return true;
          }
          return true;
        }
      } else {

        // case 3: we have only zero or 1 codepoints buffered,
        // so not enough to form a bigram. But, we also have no
        // more input. So if we have a buffered codepoint, emit
        // a unigram; otherwise, it's end of stream.

        if (hasBufferedUnigram()) {
          flushUnigram(); // flush our remaining unigram
          return true;
        }
        return false;
      }
    }
  }

  private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams

  /**
   * looks at next input token, returning false if none is available
   */
  private boolean doNext() throws IOException {
    if (loneState != null) {
      restoreState(loneState);
      loneState = null;
      return true;
    } else {
      if (exhausted) {
        return false;
      } else if (input.incrementToken()) {
        return true;
      } else {
        exhausted = true;
        return false;
      }
    }
  }

  /**
   * refills buffers with new data from the current token.
   */
  private void refill() throws IOException {
    // compact buffers to keep them smallish if they become large
    // just a safety check, but technically we only need the last codepoint
    if (bufferLen > 64) {
      int last = bufferLen - 1;
      buffer[0] = buffer[last];
      startOffset[0] = startOffset[last];
      endOffset[0] = endOffset[last];
      bufferLen = 1;
      index -= last;
    }

    char termBuffer[] = termAtt.buffer();
    int len = termAtt.length();
    int start = offsetAtt.startOffset();
    int end = offsetAtt.endOffset();

    int newSize = bufferLen + len;
    buffer = ArrayUtil.grow(buffer, newSize);
    startOffset = ArrayUtil.grow(startOffset, newSize);
    endOffset = ArrayUtil.grow(endOffset, newSize);
    lastEndOffset = end;

    if (end - start != len) {
      // crazy offsets (modified by synonym or charfilter): just preserve
      for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        startOffset[bufferLen] = start;
        endOffset[bufferLen] = end;
        bufferLen++;
      }
    } else {
      // normal offsets
      for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        cpLen = Character.charCount(cp);
        startOffset[bufferLen] = start;
        start = endOffset[bufferLen] = start + cpLen;
        bufferLen++;
      }
    }
  }

  /**
   * Flushes a bigram token to output from our buffer.
   * This is the normal case, e.g. ABC -> AB BC
   */
  private void flushBigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
    int len1 = Character.toChars(buffer[index], termBuffer, 0);
    int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1);
    termAtt.setLength(len2);
    offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
    typeAtt.setType(DOUBLE_TYPE);
    index++;
  }

  /**
   * Flushes a unigram token to output from our buffer.
   * This happens when we encounter isolated CJK characters: either the whole
   * CJK string is a single character, or we encounter a CJK character surrounded
   * by space, punctuation, English, etc., but not beside any other CJK.
   */
  private void flushUnigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
    int len = Character.toChars(buffer[index], termBuffer, 0);
    termAtt.setLength(len);
    offsetAtt.setOffset(startOffset[index], endOffset[index]);
    typeAtt.setType(SINGLE_TYPE);
    index++;
  }

  /**
   * True if we have multiple codepoints sitting in our buffer
   */
  private boolean hasBufferedBigram() {
    return bufferLen - index > 1;
  }

  /**
   * True if we have a single codepoint sitting in our buffer, where its future
   * (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
   * inputs.
   */
  private boolean hasBufferedUnigram() {
    return bufferLen == 1 && index == 0;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    bufferLen = 0;
    index = 0;
    lastEndOffset = 0;
    loneState = null;
    exhausted = false;
  }
}
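A sketch of the flags constructor in use, forming bigrams for Han only so hiragana passes through as unigrams; the flag set and expected output mirror the hiragana="false" factory test further down, and the sample text is borrowed from it:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class HanOnlyBigramDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer source = new StandardTokenizer(Version.LUCENE_36,
        new StringReader("多くの学生が試験に落ちた。"));
    // leave HIRAGANA out of the flag set: hiragana tokens are passed through as-is
    TokenStream stream = new CJKBigramFilter(source,
        CJKBigramFilter.HAN | CJKBigramFilter.KATAKANA | CJKBigramFilter.HANGUL);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.print(term + " "); // 多 く の 学生 が 試験 に 落 ち た
    }
    stream.end();
    stream.close();
  }
}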
@@ -44,7 +44,9 @@ import org.apache.lucene.util.AttributeSource;
 * please search <a
 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
 *
+ * @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.
 */
+@Deprecated
public final class CJKTokenizer extends Tokenizer {
  //~ Static fields/initializers ---------------------------------------------
  /** Word token type */
@@ -0,0 +1,95 @@
package org.apache.lucene.analysis.cjk;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.StemmerUtil;

/**
 * A {@link TokenFilter} that normalizes CJK width differences:
 * <ul>
 *   <li>Folds fullwidth ASCII variants into the equivalent Basic Latin
 *   <li>Folds halfwidth Katakana variants into the equivalent kana
 * </ul>
 * <p>
 * NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
 * Unicode normalization. See the normalization support in the ICU package
 * for full normalization.
 */
public final class CJKWidthFilter extends TokenFilter {
  private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /* halfwidth kana mappings: 0xFF65-0xFF9F
   *
   * note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A
   * as a fallback when they cannot properly combine with a preceding
   * character into a composed form.
   */
  private static final char KANA_NORM[] = new char[] {
    0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
    0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
    0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
    0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
    0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
    0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
    0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
  };

  public CJKWidthFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      char text[] = termAtt.buffer();
      int length = termAtt.length();
      for (int i = 0; i < length; i++) {
        final char ch = text[i];
        if (ch >= 0xFF01 && ch <= 0xFF5E) {
          // Fullwidth ASCII variants
          text[i] -= 0xFEE0;
        } else if (ch >= 0xFF65 && ch <= 0xFF9F) {
          // Halfwidth Katakana variants
          if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, length, ch)) {
            length = StemmerUtil.delete(text, i--, length);
          } else {
            text[i] = KANA_NORM[ch - 0xFF65];
          }
        }
      }
      termAtt.setLength(length);
      return true;
    } else {
      return false;
    }
  }

  /* kana combining diffs: 0x30A6-0x30FD */
  private static final byte KANA_COMBINE_VOICED[] = new byte[] {
    78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
    0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
    0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
  };

  private static final byte KANA_COMBINE_HALF_VOICED[] = new byte[] {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
    0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };

  /** returns true if we successfully combined the voice mark */
  private static boolean combine(char text[], int pos, int length, char ch) {
    final char prev = text[pos-1];
    if (prev >= 0x30A6 && prev <= 0x30FD) {
      text[pos-1] += (ch == 0xFF9F)
          ? KANA_COMBINE_HALF_VOICED[prev - 0x30A6]
          : KANA_COMBINE_VOICED[prev - 0x30A6];
      return text[pos-1] != prev;
    }
    return false;
  }
}
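The fullwidth fold above is pure arithmetic: the fullwidth forms U+FF01-U+FF5E sit exactly 0xFEE0 above their ASCII counterparts U+0021-U+007E, so a single subtraction recovers the Basic Latin character. A tiny standalone check of that offset (plain Java, no Lucene dependency):

public class WidthFoldCheck {
  public static void main(String[] args) {
    char fullwidthA = 0xFF21;                  // 'Ａ' FULLWIDTH LATIN CAPITAL LETTER A
    char folded = (char) (fullwidthA - 0xFEE0);
    System.out.println(folded);                // A (U+0041)
    // halfwidth katakana needs the KANA_NORM table instead, because the
    // halfwidth block is not a constant distance from the katakana block:
    System.out.println((char) 0xFF76 + " -> " + (char) 0x30AB); // ｶ -> カ (KANA_NORM[0xFF76 - 0xFF65])
  }
}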
@@ -0,0 +1,275 @@
package org.apache.lucene.analysis.cjk;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Most tests adopted from TestCJKTokenizer
 */
public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
  private Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);

  public void testJa1() throws IOException {
    assertAnalyzesTo(analyzer, "一二三四五六七八九十",
        new String[] { "一二", "二三", "三四", "四五", "五六", "六七", "七八", "八九", "九十" },
        new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 },
        new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testJa2() throws IOException {
    assertAnalyzesTo(analyzer, "一 二三四 五六七八九 十",
        new String[] { "一", "二三", "三四", "五六", "六七", "七八", "八九", "十" },
        new int[] { 0, 2, 3, 6, 7, 8, 9, 12 },
        new int[] { 1, 4, 5, 8, 9, 10, 11, 13 },
        new String[] { "<SINGLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testC() throws IOException {
    assertAnalyzesTo(analyzer, "abc defgh ijklmn opqrstu vwxy z",
        new String[] { "abc", "defgh", "ijklmn", "opqrstu", "vwxy", "z" },
        new int[] { 0, 4, 10, 17, 25, 30 },
        new int[] { 3, 9, 16, 24, 29, 31 },
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1, 1, 1, 1 });
  }

  /**
   * LUCENE-2207: wrong offset calculated by end()
   */
  public void testFinalOffset() throws IOException {
    assertAnalyzesTo(analyzer, "あい",
        new String[] { "あい" },
        new int[] { 0 },
        new int[] { 2 },
        new String[] { "<DOUBLE>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "あい   ",
        new String[] { "あい" },
        new int[] { 0 },
        new int[] { 2 },
        new String[] { "<DOUBLE>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "test",
        new String[] { "test" },
        new int[] { 0 },
        new int[] { 4 },
        new String[] { "<ALPHANUM>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "test   ",
        new String[] { "test" },
        new int[] { 0 },
        new int[] { 4 },
        new String[] { "<ALPHANUM>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "あいtest",
        new String[] { "あい", "test" },
        new int[] { 0, 2 },
        new int[] { 2, 6 },
        new String[] { "<DOUBLE>", "<ALPHANUM>" },
        new int[] { 1, 1 });

    assertAnalyzesTo(analyzer, "testあい    ",
        new String[] { "test", "あい" },
        new int[] { 0, 4 },
        new int[] { 4, 6 },
        new String[] { "<ALPHANUM>", "<DOUBLE>" },
        new int[] { 1, 1 });
  }

  public void testMix() throws IOException {
    assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
        new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
        new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
        new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testMix2() throws IOException {
    assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
        new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
        new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
        new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  /**
   * Non-English text (outside of CJK) is treated normally, according to Unicode rules
   */
  public void testNonIdeographic() throws IOException {
    assertAnalyzesTo(analyzer, "一 روبرت موير",
        new String[] { "一", "روبرت", "موير" },
        new int[] { 0, 2, 8 },
        new int[] { 1, 7, 12 },
        new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1 });
  }

  /**
   * Same as the above, except with a nonspacing mark to show correctness.
   */
  public void testNonIdeographicNonLetter() throws IOException {
    assertAnalyzesTo(analyzer, "一 رُوبرت موير",
        new String[] { "一", "رُوبرت", "موير" },
        new int[] { 0, 2, 9 },
        new int[] { 1, 8, 13 },
        new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1 });
  }

  public void testSurrogates() throws IOException {
    assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
        new String[] { "𩬅艱", "艱鍟", "鍟䇹", "䇹愯", "愯瀛" },
        new int[] { 0, 2, 3, 4, 5 },
        new int[] { 3, 4, 5, 6, 7 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1 });
  }

  public void testReusableTokenStream() throws IOException {
    assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
        new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
        new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
        new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });

    assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
        new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
        new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
        new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testSingleChar() throws IOException {
    assertAnalyzesTo(analyzer, "一",
        new String[] { "一" },
        new int[] { 0 },
        new int[] { 1 },
        new String[] { "<SINGLE>" },
        new int[] { 1 });
  }

  public void testTokenStream() throws IOException {
    assertAnalyzesTo(analyzer, "一丁丂",
        new String[] { "一丁", "丁丂" },
        new int[] { 0, 1 },
        new int[] { 2, 3 },
        new String[] { "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1 });
  }

  /** test that offsets are correct when mappingcharfilter is previously applied */
  public void testChangedOffsets() throws IOException {
    final NormalizeCharMap norm = new NormalizeCharMap();
    norm.add("a", "一二");
    norm.add("b", "二三");
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
      }

      @Override
      protected Reader initReader(Reader reader) {
        return new MappingCharFilter(norm, CharReader.get(reader));
      }
    };

    assertAnalyzesTo(analyzer, "ab",
        new String[] { "一二", "二二", "二三" },
        new int[] { 0, 0, 1 },
        new int[] { 1, 1, 2 });

    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    // { 0, 0, 1, 1 } (start offsets),
    // { 0, 1, 1, 2 } (end offsets)
  }

  private static class FakeStandardTokenizer extends TokenFilter {
    final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    public FakeStandardTokenizer(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]);
        return true;
      } else {
        return false;
      }
    }
  }

  public void testSingleChar2() throws Exception {
    Analyzer analyzer = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filter = new FakeStandardTokenizer(tokenizer);
        filter = new StopFilter(TEST_VERSION_CURRENT, filter, CharArraySet.EMPTY_SET);
        filter = new CJKBigramFilter(filter);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };

    assertAnalyzesTo(analyzer, "一",
        new String[] { "一" },
        new int[] { 0 },
        new int[] { 1 },
        new String[] { "<SINGLE>" },
        new int[] { 1 });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
  }
}
@@ -21,7 +21,10 @@ import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.Version;

+/** @deprecated Remove when CJKTokenizer is removed (5.0) */
+@Deprecated
public class TestCJKTokenizer extends BaseTokenStreamTestCase {

  class TestToken {

@@ -41,7 +44,7 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
  }

  public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    String terms[] = new String[out_tokens.length];
    int startOffsets[] = new int[out_tokens.length];
    int endOffsets[] = new int[out_tokens.length];

@@ -56,7 +59,7 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
  }

  public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    String terms[] = new String[out_tokens.length];
    int startOffsets[] = new int[out_tokens.length];
    int endOffsets[] = new int[out_tokens.length];

@@ -212,13 +215,13 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
  }

  public void testTokenStream() throws Exception {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
        new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
  }

  public void testReusableTokenStream() throws Exception {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

    TestToken[] out_tokens = {

@@ -273,6 +276,6 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
-    checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, new CJKAnalyzer(Version.LUCENE_30), 10000*RANDOM_MULTIPLIER);
  }
}
@@ -0,0 +1,67 @@
package org.apache.lucene.analysis.cjk;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tests for {@link CJKWidthFilter}
 */
public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new CJKWidthFilter(source));
    }
  };

  /**
   * Fullwidth ASCII forms normalized to halfwidth (Basic Latin)
   */
  public void testFullWidthASCII() throws IOException {
    assertAnalyzesTo(analyzer, "Ｔｅｓｔ １２３４",
        new String[] { "Test", "1234" },
        new int[] { 0, 5 },
        new int[] { 4, 9 });
  }

  /**
   * Halfwidth katakana forms normalized to standard katakana.
   * A bit trickier in some cases, since half-width forms are decomposed
   * and voice marks need to be recombined with a preceding base form.
   */
  public void testHalfWidthKana() throws IOException {
    assertAnalyzesTo(analyzer, "ｶﾀｶﾅ",
        new String[] { "カタカナ" });
    assertAnalyzesTo(analyzer, "ｳﾞｨｯﾂ",
        new String[] { "ヴィッツ" });
    assertAnalyzesTo(analyzer, "ﾊﾟﾅｿﾆｯｸ",
        new String[] { "パナソニック" });
  }

  public void testRandomData() throws IOException {
    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
  }
}
@@ -0,0 +1,226 @@
package org.apache.lucene.analysis.icu.segmentation;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Tests ICUTokenizer's ability to work with CJKBigramFilter.
 * Most tests adopted from TestCJKTokenizer
 */
public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {

  /**
   * ICUTokenizer+CJKBigramFilter
   */
  private Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new ICUTokenizer(reader);
      TokenStream result = new CJKBigramFilter(source);
      return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
    }
  };

  /**
   * ICUTokenizer+ICUNormalizer2Filter+CJKBigramFilter.
   *
   * ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent
   * superset of CJKWidthFilter's foldings.
   */
  private Analyzer analyzer2 = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new ICUTokenizer(reader);
      // we put this before the CJKBigramFilter, because the normalization might combine
      // some halfwidth katakana forms, which will affect the bigramming.
      TokenStream result = new ICUNormalizer2Filter(source);
      result = new CJKBigramFilter(result);
      return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
    }
  };

  public void testJa1() throws IOException {
    assertAnalyzesTo(analyzer, "一二三四五六七八九十",
        new String[] { "一二", "二三", "三四", "四五", "五六", "六七", "七八", "八九", "九十" },
        new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 },
        new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testJa2() throws IOException {
    assertAnalyzesTo(analyzer, "一 二三四 五六七八九 十",
        new String[] { "一", "二三", "三四", "五六", "六七", "七八", "八九", "十" },
        new int[] { 0, 2, 3, 6, 7, 8, 9, 12 },
        new int[] { 1, 4, 5, 8, 9, 10, 11, 13 },
        new String[] { "<SINGLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testC() throws IOException {
    assertAnalyzesTo(analyzer, "abc defgh ijklmn opqrstu vwxy z",
        new String[] { "abc", "defgh", "ijklmn", "opqrstu", "vwxy", "z" },
        new int[] { 0, 4, 10, 17, 25, 30 },
        new int[] { 3, 9, 16, 24, 29, 31 },
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1, 1, 1, 1 });
  }

  /**
   * LUCENE-2207: wrong offset calculated by end()
   */
  public void testFinalOffset() throws IOException {
    assertAnalyzesTo(analyzer, "あい",
        new String[] { "あい" },
        new int[] { 0 },
        new int[] { 2 },
        new String[] { "<DOUBLE>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "あい   ",
        new String[] { "あい" },
        new int[] { 0 },
        new int[] { 2 },
        new String[] { "<DOUBLE>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "test",
        new String[] { "test" },
        new int[] { 0 },
        new int[] { 4 },
        new String[] { "<ALPHANUM>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "test   ",
        new String[] { "test" },
        new int[] { 0 },
        new int[] { 4 },
        new String[] { "<ALPHANUM>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "あいtest",
        new String[] { "あい", "test" },
        new int[] { 0, 2 },
        new int[] { 2, 6 },
        new String[] { "<DOUBLE>", "<ALPHANUM>" },
        new int[] { 1, 1 });

    assertAnalyzesTo(analyzer, "testあい    ",
        new String[] { "test", "あい" },
        new int[] { 0, 4 },
        new int[] { 4, 6 },
        new String[] { "<ALPHANUM>", "<DOUBLE>" },
        new int[] { 1, 1 });
  }

  public void testMix() throws IOException {
    assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
        new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
        new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
        new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testMix2() throws IOException {
    assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
        new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
        new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
        new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  /**
   * Non-English text (outside of CJK) is treated normally, according to Unicode rules
   */
  public void testNonIdeographic() throws IOException {
    assertAnalyzesTo(analyzer, "一 روبرت موير",
        new String[] { "一", "روبرت", "موير" },
        new int[] { 0, 2, 8 },
        new int[] { 1, 7, 12 },
        new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1 });
  }

  /**
   * Same as the above, except with a nonspacing mark to show correctness.
   */
  public void testNonIdeographicNonLetter() throws IOException {
    assertAnalyzesTo(analyzer, "一 رُوبرت موير",
        new String[] { "一", "رُوبرت", "موير" },
        new int[] { 0, 2, 9 },
        new int[] { 1, 8, 13 },
        new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1 });
  }

  public void testSurrogates() throws IOException {
    assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
        new String[] { "𩬅艱", "艱鍟", "鍟䇹", "䇹愯", "愯瀛" },
        new int[] { 0, 2, 3, 4, 5 },
        new int[] { 3, 4, 5, 6, 7 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1 });
  }

  public void testReusableTokenStream() throws IOException {
    assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
        new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
        new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
        new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });

    assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
        new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
        new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
        new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testSingleChar() throws IOException {
    assertAnalyzesTo(analyzer, "一",
        new String[] { "一" },
        new int[] { 0 },
        new int[] { 1 },
        new String[] { "<SINGLE>" },
        new int[] { 1 });
  }

  public void testTokenStream() throws IOException {
    assertAnalyzesTo(analyzer, "一丁丂",
        new String[] { "一丁", "丁丂" },
        new int[] { 0, 1 },
        new int[] { 2, 3 },
        new String[] { "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1 });
  }
}
@@ -0,0 +1,64 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;

/**
 * Factory for {@link CJKBigramFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.CJKBigramFilterFactory"
 *       han="true" hiragana="true"
 *       katakana="true" hangul="true" /&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class CJKBigramFilterFactory extends BaseTokenFilterFactory {
  int flags;

  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    flags = 0;
    if (getBoolean("han", true)) {
      flags |= CJKBigramFilter.HAN;
    }
    if (getBoolean("hiragana", true)) {
      flags |= CJKBigramFilter.HIRAGANA;
    }
    if (getBoolean("katakana", true)) {
      flags |= CJKBigramFilter.KATAKANA;
    }
    if (getBoolean("hangul", true)) {
      flags |= CJKBigramFilter.HANGUL;
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new CJKBigramFilter(input, flags);
  }
}
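A sketch of driving the factory programmatically rather than from schema XML, as the factory tests below do (the argument keys come from init() above; the sample text is the one the tests use):

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.CJKBigramFilterFactory;

public class BigramFactoryDemo {
  public static void main(String[] args) {
    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
    Map<String,String> params = new HashMap<String,String>();
    params.put("hiragana", "false"); // han/katakana/hangul stay enabled by default
    factory.init(params);
    TokenStream stream = factory.create(
        new StandardTokenizer(Version.LUCENE_36, new StringReader("多くの学生が試験に落ちた。")));
    // consume the stream as usual; hiragana now passes through as unigrams
  }
}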
@@ -30,8 +30,9 @@ import java.io.Reader;
 *     &lt;tokenizer class="solr.CJKTokenizerFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
- *
+ * @deprecated
 */
+@Deprecated
public class CJKTokenizerFactory extends BaseTokenizerFactory {
  public CJKTokenizer create(Reader in) {
    return new CJKTokenizer(in);
@@ -0,0 +1,42 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;

/**
 * Factory for {@link CJKWidthFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.CJKBigramFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class CJKWidthFilterFactory extends BaseTokenFilterFactory {

  @Override
  public TokenStream create(TokenStream input) {
    return new CJKWidthFilter(input);
  }
}
@@ -0,0 +1,52 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Simple tests to ensure the CJK bigram factory is working.
 */
public class TestCJKBigramFilterFactory extends BaseTokenTestCase {
  public void testDefaults() throws Exception {
    Reader reader = new StringReader("多くの学生が試験に落ちた。");
    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
    assertTokenStreamContents(stream,
        new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
  }

  public void testHanOnly() throws Exception {
    Reader reader = new StringReader("多くの学生が試験に落ちた。");
    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("hiragana", "false");
    factory.init(args);
    TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
    assertTokenStreamContents(stream,
        new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
  }
}
@@ -24,7 +24,9 @@ import org.apache.lucene.analysis.TokenStream;

/**
 * Simple tests to ensure the CJK tokenizer factory is working.
+ * @deprecated
 */
+@Deprecated
public class TestCJKTokenizerFactory extends BaseTokenTestCase {
  /**
   * Ensure the tokenizer actually tokenizes CJK text correctly
@@ -0,0 +1,36 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;

/**
 * Simple tests to ensure the CJKWidthFilterFactory is working
 */
public class TestCJKWidthFilterFactory extends BaseTokenTestCase {
  public void test() throws Exception {
    Reader reader = new StringReader("Ｔｅｓｔ １２３４");
    CJKWidthFilterFactory factory = new CJKWidthFilterFactory();
    TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
    assertTokenStreamContents(stream, new String[] { "Test", "1234" });
  }
}