LUCENE-4286: add unibigram option to CJKBigramFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1369502 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-08-04 22:37:14 +00:00
parent 41d4ef2b08
commit b4540d6369
5 changed files with 184 additions and 9 deletions

View File

@ -48,6 +48,11 @@ New features
reader is an NRT reader, and the segment has not yet been merged reader is an NRT reader, and the segment has not yet been merged
away (Mike McCandless). away (Mike McCandless).
* LUCENE-4286: Added option to CJKBigramFilter to always also output
unigrams. This can be used for a unigram+bigram approach, or at
index-time only for better support of short queries.
(Tom Burton-West, Robert Muir)
API Changes API Changes
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3. * LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.

View File

@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
@ -35,6 +37,12 @@ import org.apache.lucene.util.ArrayUtil;
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which * {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
* of the CJK scripts are turned into bigrams. * of the CJK scripts are turned into bigrams.
* <p> * <p>
* By default, when a CJK character has no adjacent characters to form
* a bigram, it is output in unigram form. If you want to always output
* both unigrams and bigrams, set the <code>outputUnigrams</code>
* flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
* This can be used for a combined unigram+bigram approach.
* <p>
* In all cases, all non-CJK input is passed thru unmodified. * In all cases, all non-CJK input is passed thru unmodified.
*/ */
public final class CJKBigramFilter extends TokenFilter { public final class CJKBigramFilter extends TokenFilter {
@ -68,9 +76,15 @@ public final class CJKBigramFilter extends TokenFilter {
private final Object doKatakana; private final Object doKatakana;
private final Object doHangul; private final Object doHangul;
// true if we should output unigram tokens always
private final boolean outputUnigrams;
private boolean ngramState; // false = output unigram, true = output bigram
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
// buffers containing codepoint and offsets in parallel // buffers containing codepoint and offsets in parallel
int buffer[] = new int[8]; int buffer[] = new int[8];
@ -88,23 +102,36 @@ public final class CJKBigramFilter extends TokenFilter {
/**
 * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
 *       CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
 */
public CJKBigramFilter(TokenStream in) {
  // bigram every CJK writing system by default
  this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
}
/**
 * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
 *       CJKBigramFilter(in, flags, false)}
 */
public CJKBigramFilter(TokenStream in, int flags) {
  // unigrams are only emitted for lone CJK characters in this mode
  this(in, flags, false);
}
/**
 * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
 * and whether or not unigrams should also be output.
 * @param in input TokenStream
 * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
 *        {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
 * @param outputUnigrams true if unigrams for the selected writing systems should also be output.
 *        when this is false, this is only done when there are no adjacent characters to form
 *        a bigram.
 */
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
  super(in);
  // each do* field holds either the NO sentinel (script disabled) or the
  // token-type string emitted for bigrams of that script
  this.doHan      = (flags & HAN)      == 0 ? NO : HAN_TYPE;
  this.doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
  this.doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
  this.doHangul   = (flags & HANGUL)   == 0 ? NO : HANGUL_TYPE;
  this.outputUnigrams = outputUnigrams;
}
/* /*
@ -120,7 +147,24 @@ public final class CJKBigramFilter extends TokenFilter {
// case 1: we have multiple remaining codepoints buffered, // case 1: we have multiple remaining codepoints buffered,
// so we can emit a bigram here. // so we can emit a bigram here.
if (outputUnigrams) {
// when also outputting unigrams, we output the unigram first,
// then rewind back to revisit the bigram.
// so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
// the logic in hasBufferedUnigram ensures we output the C,
// even though it did actually have adjacent CJK characters.
if (ngramState) {
flushBigram(); flushBigram();
} else {
flushUnigram();
index--;
}
ngramState = !ngramState;
} else {
flushBigram();
}
return true; return true;
} else if (doNext()) { } else if (doNext()) {
@ -260,6 +304,11 @@ public final class CJKBigramFilter extends TokenFilter {
termAtt.setLength(len2); termAtt.setLength(len2);
offsetAtt.setOffset(startOffset[index], endOffset[index+1]); offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
typeAtt.setType(DOUBLE_TYPE); typeAtt.setType(DOUBLE_TYPE);
// when outputting unigrams, all bigrams are synonyms that span two unigrams
if (outputUnigrams) {
posIncAtt.setPositionIncrement(0);
posLengthAtt.setPositionLength(2);
}
index++; index++;
} }
@ -292,8 +341,14 @@ public final class CJKBigramFilter extends TokenFilter {
* inputs. * inputs.
*/ */
private boolean hasBufferedUnigram() { private boolean hasBufferedUnigram() {
if (outputUnigrams) {
// when outputting unigrams always
return bufferLen - index == 1;
} else {
// otherwise its only when we have a lone CJK character
return bufferLen == 1 && index == 0; return bufferLen == 1 && index == 0;
} }
}
@Override @Override
public void reset() throws IOException { public void reset() throws IOException {
@ -303,5 +358,6 @@ public final class CJKBigramFilter extends TokenFilter {
lastEndOffset = 0; lastEndOffset = 0;
loneState = null; loneState = null;
exhausted = false; exhausted = false;
ngramState = false;
} }
} }

View File

@ -33,12 +33,13 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt; * &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.CJKBigramFilterFactory" * &lt;filter class="solr.CJKBigramFilterFactory"
* han="true" hiragana="true" * han="true" hiragana="true"
* katakana="true" hangul="true" /&gt; * katakana="true" hangul="true" outputUnigrams="false" /&gt;
* &lt;/analyzer&gt; * &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
*/ */
public class CJKBigramFilterFactory extends TokenFilterFactory { public class CJKBigramFilterFactory extends TokenFilterFactory {
int flags; int flags;
boolean outputUnigrams;
@Override @Override
public void init(Map<String,String> args) { public void init(Map<String,String> args) {
@ -56,10 +57,11 @@ public class CJKBigramFilterFactory extends TokenFilterFactory {
if (getBoolean("hangul", true)) { if (getBoolean("hangul", true)) {
flags |= CJKBigramFilter.HANGUL; flags |= CJKBigramFilter.HANGUL;
} }
outputUnigrams = getBoolean("outputUnigrams", false);
} }
@Override @Override
public TokenStream create(TokenStream input) { public TokenStream create(TokenStream input) {
return new CJKBigramFilter(input, flags); return new CJKBigramFilter(input, flags, outputUnigrams);
} }
} }

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
*/ */
import java.io.Reader; import java.io.Reader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -33,6 +34,15 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
} }
}; };
/** Analyzer that bigrams all CJK scripts (flags 0xff) and also outputs unigrams. */
Analyzer unibiAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
    return new TokenStreamComponents(source, new CJKBigramFilter(source, 0xff, true));
  }
};
public void testHuge() throws Exception { public void testHuge() throws Exception {
assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
@ -62,6 +72,96 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
} }
}; };
assertAnalyzesTo(a, "多くの学生が試験に落ちた。", assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
new String[] { "", "", "", "学生", "", "試験", "", "", "", "" }); new String[] { "", "", "", "学生", "", "試験", "", "", "", "" },
new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
"<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
/** All scripts bigrammed, no unigrams: plain overlapping bigrams only. */
public void testAllScripts() throws Exception {
  Analyzer biOnly = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(source, new CJKBigramFilter(source, 0xff, false));
    }
  };
  assertAnalyzesTo(biOnly, "多くの学生が試験に落ちた。",
      new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
}
/**
 * All scripts bigrammed with outputUnigrams=true: input ABC yields
 * A, AB, B, BC, C — each bigram is a posInc=0/posLen=2 "synonym"
 * stacked on the unigram that starts it.
 * NOTE(review): the unigram string literals were garbled (rendered empty)
 * in this copy; reconstructed from the bigram sequence and the 23-element
 * offset arrays — confirm against upstream r1369502.
 */
public void testUnigramsAndBigramsAllScripts() throws Exception {
  assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた。",
      new String[] {
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生",
        "生が", "が", "が試", "試", "試験", "験", "験に", "に",
        "に落", "落", "落ち", "ち", "ちた", "た"
      },
      new int[] { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
          6, 7, 7, 8, 8, 9, 9, 10, 10, 11 },
      new int[] { 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
          8, 8, 9, 9, 10, 10, 11, 11, 12, 12 },
      new String[] { "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
          "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
          "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>" },
      new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
          0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
      new int[] { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
          2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }
  );
}
/**
 * Only Han is bigrammed (with unigrams stacked underneath); hiragana
 * passes through as ordinary unigram tokens.
 * Fixes in this copy: the Han/hiragana unigram literals were garbled
 * (rendered empty) — reconstructed from the offset arrays; the type array
 * had 15 entries against 14 tokens (spurious trailing "&lt;SINGLE&gt;") —
 * trimmed to 14 to match the 14-element offset/posInc/posLen arrays.
 */
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
          "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
          "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
/**
 * Unigram+bigram output over input long enough to force internal buffer
 * growth (11 concatenated copies of the sentence; bigram "た多" bridges
 * each pair of copies).
 * NOTE(review): the unigram literals were garbled (rendered empty) in this
 * copy; reconstructed from the bigram sequence — confirm against upstream.
 */
public void testUnigramsAndBigramsHuge() throws Exception {
  assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
      + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
      + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
      new String[] {
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
        "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た"
      }
  );
}
/** blast some random strings through the unigram+bigram analyzer */
public void testRandomUnibiStrings() throws Exception {
  checkRandomData(random(), unibiAnalyzer, 1000 * RANDOM_MULTIPLIER);
}
/** blast some random strings through the analyzer */
public void testRandomUnibiHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, unibiAnalyzer, 100*RANDOM_MULTIPLIER, 8192);
} }
} }

View File

@ -52,4 +52,16 @@ public class TestCJKBigramFilterFactory extends BaseTokenStreamTestCase {
assertTokenStreamContents(stream, assertTokenStreamContents(stream,
new String[] { "", "", "", "学生", "", "試験", "", "", "", "" }); new String[] { "", "", "", "学生", "", "試験", "", "", "", "" });
} }
/**
 * Factory test: hiragana bigramming disabled, outputUnigrams=true — Han
 * runs produce stacked unigrams+bigrams, hiragana passes through as unigrams.
 * NOTE(review): the expected single-character literals were garbled
 * (rendered empty) in this copy; reconstructed from the input sentence —
 * confirm against upstream r1369502.
 */
public void testHanOnlyUnigrams() throws Exception {
  Reader reader = new StringReader("多くの学生が試験に落ちた。");
  CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
  Map<String,String> args = new HashMap<String,String>();
  args.put("hiragana", "false");
  args.put("outputUnigrams", "true");
  factory.init(args);
  TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
  assertTokenStreamContents(stream,
      new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" });
}
} }