mirror of https://github.com/apache/lucene.git
LUCENE-4286: add unibigram option to CJKBigramFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1369502 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
41d4ef2b08
commit
b4540d6369
|
@ -47,6 +47,11 @@ New features
|
|||
int docID), to attempt deletion by docID as long as the provided
|
||||
reader is an NRT reader, and the segment has not yet been merged
|
||||
away (Mike McCandless).
|
||||
|
||||
* LUCENE-4286: Added option to CJKBigramFilter to always also output
|
||||
unigrams. This can be used for a unigram+bigram approach, or at
|
||||
index-time only for better support of short queries.
|
||||
(Tom Burton-West, Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
|
|
|
@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
|
@ -35,6 +37,12 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
|
||||
* of the CJK scripts are turned into bigrams.
|
||||
* <p>
|
||||
* By default, when a CJK character has no adjacent characters to form
|
||||
* a bigram, it is output in unigram form. If you want to always output
|
||||
* both unigrams and bigrams, set the <code>outputUnigrams</code>
|
||||
* flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
|
||||
* This can be used for a combined unigram+bigram approach.
|
||||
* <p>
|
||||
* In all cases, all non-CJK input is passed through unmodified.
|
||||
*/
|
||||
public final class CJKBigramFilter extends TokenFilter {
|
||||
|
@ -67,10 +75,16 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
private final Object doHiragana;
|
||||
private final Object doKatakana;
|
||||
private final Object doHangul;
|
||||
|
||||
// true if we should output unigram tokens always
|
||||
private final boolean outputUnigrams;
|
||||
private boolean ngramState; // false = output unigram, true = output bigram
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
|
||||
|
||||
// buffers containing codepoint and offsets in parallel
|
||||
int buffer[] = new int[8];
|
||||
|
@ -88,23 +102,36 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
|
||||
/**
|
||||
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
|
||||
* CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)}
|
||||
* CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
|
||||
*/
|
||||
public CJKBigramFilter(TokenStream in) {
|
||||
this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
|
||||
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
|
||||
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
|
||||
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
|
||||
* CJKBigramFilter(in, flags, false)}
|
||||
*/
|
||||
public CJKBigramFilter(TokenStream in, int flags) {
|
||||
this(in, flags, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
|
||||
* and whether or not unigrams should also be output.
|
||||
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
|
||||
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
|
||||
* @param outputUnigrams true if unigrams for the selected writing systems should also be output.
|
||||
* When this is false, unigrams are only output when there are no adjacent characters to form
|
||||
* a bigram.
|
||||
*/
|
||||
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
|
||||
super(in);
|
||||
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
|
||||
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
|
||||
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
|
||||
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
|
||||
this.outputUnigrams = outputUnigrams;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -120,7 +147,24 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
// case 1: we have multiple remaining codepoints buffered,
|
||||
// so we can emit a bigram here.
|
||||
|
||||
flushBigram();
|
||||
if (outputUnigrams) {
|
||||
|
||||
// when also outputting unigrams, we output the unigram first,
|
||||
// then rewind back to revisit the bigram.
|
||||
// so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
|
||||
// the logic in hasBufferedUnigram ensures we output the C,
|
||||
// even though it did actually have adjacent CJK characters.
|
||||
|
||||
if (ngramState) {
|
||||
flushBigram();
|
||||
} else {
|
||||
flushUnigram();
|
||||
index--;
|
||||
}
|
||||
ngramState = !ngramState;
|
||||
} else {
|
||||
flushBigram();
|
||||
}
|
||||
return true;
|
||||
} else if (doNext()) {
|
||||
|
||||
|
@ -260,6 +304,11 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
termAtt.setLength(len2);
|
||||
offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
|
||||
typeAtt.setType(DOUBLE_TYPE);
|
||||
// when outputting unigrams, all bigrams are synonyms that span two unigrams
|
||||
if (outputUnigrams) {
|
||||
posIncAtt.setPositionIncrement(0);
|
||||
posLengthAtt.setPositionLength(2);
|
||||
}
|
||||
index++;
|
||||
}
|
||||
|
||||
|
@ -292,7 +341,13 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
* inputs.
|
||||
*/
|
||||
private boolean hasBufferedUnigram() {
|
||||
return bufferLen == 1 && index == 0;
|
||||
if (outputUnigrams) {
|
||||
// when always outputting unigrams: true once exactly one buffered codepoint remains
|
||||
return bufferLen - index == 1;
|
||||
} else {
|
||||
// otherwise it's only when we have a lone CJK character
|
||||
return bufferLen == 1 && index == 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -303,5 +358,6 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
lastEndOffset = 0;
|
||||
loneState = null;
|
||||
exhausted = false;
|
||||
ngramState = false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,12 +33,13 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* <filter class="solr.LowerCaseFilterFactory"/>
|
||||
* <filter class="solr.CJKBigramFilterFactory"
|
||||
* han="true" hiragana="true"
|
||||
* katakana="true" hangul="true" />
|
||||
* katakana="true" hangul="true" outputUnigrams="false" />
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class CJKBigramFilterFactory extends TokenFilterFactory {
|
||||
int flags;
|
||||
boolean outputUnigrams;
|
||||
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
|
@ -56,10 +57,11 @@ public class CJKBigramFilterFactory extends TokenFilterFactory {
|
|||
if (getBoolean("hangul", true)) {
|
||||
flags |= CJKBigramFilter.HANGUL;
|
||||
}
|
||||
outputUnigrams = getBoolean("outputUnigrams", false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new CJKBigramFilter(input, flags);
|
||||
return new CJKBigramFilter(input, flags, outputUnigrams);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
|
|||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -33,6 +34,15 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
};
|
||||
|
||||
Analyzer unibiAnalyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
return new TokenStreamComponents(t,
|
||||
new CJKBigramFilter(t, 0xff, true));
|
||||
}
|
||||
};
|
||||
|
||||
public void testHuge() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
|
||||
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
|
||||
|
@ -62,6 +72,96 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
};
|
||||
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
|
||||
new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
|
||||
new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" },
|
||||
new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
|
||||
new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
|
||||
new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
|
||||
"<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
|
||||
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
|
||||
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
|
||||
}
|
||||
|
||||
public void testAllScripts() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
return new TokenStreamComponents(t,
|
||||
new CJKBigramFilter(t, 0xff, false));
|
||||
}
|
||||
};
|
||||
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
|
||||
new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
|
||||
}
|
||||
|
||||
public void testUnigramsAndBigramsAllScripts() throws Exception {
|
||||
assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた。",
|
||||
new String[] {
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生",
|
||||
"生が", "が", "が試", "試", "試験", "験", "験に", "に",
|
||||
"に落", "落", "落ち", "ち", "ちた", "た"
|
||||
},
|
||||
new int[] { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
|
||||
6, 7, 7, 8, 8, 9, 9, 10, 10, 11 },
|
||||
new int[] { 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
|
||||
8, 8, 9, 9, 10, 10, 11, 11, 12, 12 },
|
||||
new String[] { "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
|
||||
"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
|
||||
"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>" },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
|
||||
0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
|
||||
new int[] { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
|
||||
2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }
|
||||
);
|
||||
}
|
||||
|
||||
public void testUnigramsAndBigramsHanOnly() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
|
||||
}
|
||||
};
|
||||
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
|
||||
new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" },
|
||||
new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
|
||||
new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
|
||||
new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
|
||||
"<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
|
||||
"<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
|
||||
new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
|
||||
new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
|
||||
}
|
||||
|
||||
public void testUnigramsAndBigramsHuge() throws Exception {
|
||||
assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
|
||||
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
|
||||
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
|
||||
new String[] {
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た"
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomUnibiStrings() throws Exception {
|
||||
checkRandomData(random(), unibiAnalyzer, 1000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
/** blast some random huge strings through the analyzer */
|
||||
public void testRandomUnibiHugeStrings() throws Exception {
|
||||
Random random = random();
|
||||
checkRandomData(random, unibiAnalyzer, 100*RANDOM_MULTIPLIER, 8192);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -52,4 +52,16 @@ public class TestCJKBigramFilterFactory extends BaseTokenStreamTestCase {
|
|||
assertTokenStreamContents(stream,
|
||||
new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
|
||||
}
|
||||
|
||||
public void testHanOnlyUnigrams() throws Exception {
|
||||
Reader reader = new StringReader("多くの学生が試験に落ちた。");
|
||||
CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("hiragana", "false");
|
||||
args.put("outputUnigrams", "true");
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" });
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue