LUCENE-4286: add unibigram option to CJKBigramFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1369502 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-08-04 22:37:14 +00:00
parent 41d4ef2b08
commit b4540d6369
5 changed files with 184 additions and 9 deletions

View File

@ -47,6 +47,11 @@ New features
int docID), to attempt deletion by docID as long as the provided
reader is an NRT reader, and the segment has not yet been merged
away (Mike McCandless).
* LUCENE-4286: Added option to CJKBigramFilter to always also output
unigrams. This can be used for a unigram+bigram approach, or at
index-time only for better support of short queries.
(Tom Burton-West, Robert Muir)
API Changes

View File

@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
@ -35,6 +37,12 @@ import org.apache.lucene.util.ArrayUtil;
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
* of the CJK scripts are turned into bigrams.
* <p>
* By default, when a CJK character has no adjacent characters to form
* a bigram, it is output in unigram form. If you want to always output
* both unigrams and bigrams, set the <code>outputUnigrams</code>
* flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
* This can be used for a combined unigram+bigram approach.
* <p>
* In all cases, all non-CJK input is passed thru unmodified.
*/
public final class CJKBigramFilter extends TokenFilter {
@ -67,10 +75,16 @@ public final class CJKBigramFilter extends TokenFilter {
private final Object doHiragana;
private final Object doKatakana;
private final Object doHangul;
// true if we should output unigram tokens always
private final boolean outputUnigrams;
private boolean ngramState; // false = output unigram, true = output bigram
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
// buffers containing codepoint and offsets in parallel
int buffer[] = new int[8];
@ -88,23 +102,36 @@ public final class CJKBigramFilter extends TokenFilter {
/**
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
* CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)}
* CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
*/
public CJKBigramFilter(TokenStream in) {
this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
}
/**
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
* CJKBigramFilter(in, flags, false)}
*/
public CJKBigramFilter(TokenStream in, int flags) {
this(in, flags, false);
}
/**
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
* and whether or not unigrams should also be output.
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
* @param outputUnigrams true if unigrams for the selected writing systems should also be output.
* when this is false, this is only done when there are no adjacent characters to form
* a bigram.
*/
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
super(in);
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
this.outputUnigrams = outputUnigrams;
}
/*
@ -120,7 +147,24 @@ public final class CJKBigramFilter extends TokenFilter {
// case 1: we have multiple remaining codepoints buffered,
// so we can emit a bigram here.
flushBigram();
if (outputUnigrams) {
// when also outputting unigrams, we output the unigram first,
// then rewind back to revisit the bigram.
// so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
// the logic in hasBufferedUnigram ensures we output the C,
// even though it did actually have adjacent CJK characters.
if (ngramState) {
flushBigram();
} else {
flushUnigram();
index--;
}
ngramState = !ngramState;
} else {
flushBigram();
}
return true;
} else if (doNext()) {
@ -260,6 +304,11 @@ public final class CJKBigramFilter extends TokenFilter {
termAtt.setLength(len2);
offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
typeAtt.setType(DOUBLE_TYPE);
// when outputting unigrams, all bigrams are synonyms that span two unigrams
if (outputUnigrams) {
posIncAtt.setPositionIncrement(0);
posLengthAtt.setPositionLength(2);
}
index++;
}
@ -292,7 +341,13 @@ public final class CJKBigramFilter extends TokenFilter {
* inputs.
*/
private boolean hasBufferedUnigram() {
return bufferLen == 1 && index == 0;
if (outputUnigrams) {
// when outputting unigrams always
return bufferLen - index == 1;
} else {
// otherwise its only when we have a lone CJK character
return bufferLen == 1 && index == 0;
}
}
@Override
@ -303,5 +358,6 @@ public final class CJKBigramFilter extends TokenFilter {
lastEndOffset = 0;
loneState = null;
exhausted = false;
ngramState = false;
}
}

View File

@ -33,12 +33,13 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.CJKBigramFilterFactory"
* han="true" hiragana="true"
* katakana="true" hangul="true" /&gt;
* katakana="true" hangul="true" outputUnigrams="false" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class CJKBigramFilterFactory extends TokenFilterFactory {
int flags;
boolean outputUnigrams;
@Override
public void init(Map<String,String> args) {
@ -56,10 +57,11 @@ public class CJKBigramFilterFactory extends TokenFilterFactory {
if (getBoolean("hangul", true)) {
flags |= CJKBigramFilter.HANGUL;
}
outputUnigrams = getBoolean("outputUnigrams", false);
}
@Override
public TokenStream create(TokenStream input) {
return new CJKBigramFilter(input, flags);
return new CJKBigramFilter(input, flags, outputUnigrams);
}
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
*/
import java.io.Reader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -33,6 +34,15 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
}
};
Analyzer unibiAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t,
new CJKBigramFilter(t, 0xff, true));
}
};
public void testHuge() throws Exception {
assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
@ -62,6 +72,96 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
}
};
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
new String[] { "", "", "", "学生", "", "試験", "", "", "", "" });
new String[] { "", "", "", "学生", "", "試験", "", "", "", "" },
new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
"<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testAllScripts() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t,
new CJKBigramFilter(t, 0xff, false));
}
};
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
}
public void testUnigramsAndBigramsAllScripts() throws Exception {
assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた。",
new String[] {
"", "多く", "", "くの", "", "の学", "", "学生", "",
"生が", "", "が試", "", "試験", "", "験に", "",
"に落", "", "落ち", "", "ちた", ""
},
new int[] { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
6, 7, 7, 8, 8, 9, 9, 10, 10, 11 },
new int[] { 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
8, 8, 9, 9, 10, 10, 11, 11, 12, 12 },
new String[] { "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>" },
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
new int[] { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }
);
}
public void testUnigramsAndBigramsHanOnly() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
}
};
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
new String[] { "", "", "", "", "学生", "", "", "", "試験", "", "", "", "", "" },
new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
"<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
"<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
public void testUnigramsAndBigramsHuge() throws Exception {
assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
new String[] {
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", ""
}
);
}
/** blast some random strings through the analyzer */
public void testRandomUnibiStrings() throws Exception {
checkRandomData(random(), unibiAnalyzer, 1000*RANDOM_MULTIPLIER);
}
/** blast some random strings through the analyzer */
public void testRandomUnibiHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, unibiAnalyzer, 100*RANDOM_MULTIPLIER, 8192);
}
}

View File

@ -52,4 +52,16 @@ public class TestCJKBigramFilterFactory extends BaseTokenStreamTestCase {
assertTokenStreamContents(stream,
new String[] { "", "", "", "学生", "", "試験", "", "", "", "" });
}
public void testHanOnlyUnigrams() throws Exception {
Reader reader = new StringReader("多くの学生が試験に落ちた。");
CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("hiragana", "false");
args.put("outputUnigrams", "true");
factory.init(args);
TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
assertTokenStreamContents(stream,
new String[] { "", "", "", "", "学生", "", "", "", "試験", "", "", "", "", "" });
}
}