From b4540d63697890494663a9ba17c096b0b22d005d Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Sat, 4 Aug 2012 22:37:14 +0000
Subject: [PATCH] LUCENE-4286: add unibigram option to CJKBigramFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1369502 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                              |   5 +
 .../lucene/analysis/cjk/CJKBigramFilter.java    |  68 ++++++++++--
 .../analysis/cjk/CJKBigramFilterFactory.java    |   6 +-
 .../analysis/cjk/TestCJKBigramFilter.java       | 102 +++++++++++++++++-
 .../cjk/TestCJKBigramFilterFactory.java         |  12 +++
 5 files changed, 184 insertions(+), 9 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 2c458ef8844..f4027d5a252 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -47,6 +47,11 @@ New features
   int docID), to attempt deletion by docID as long as the provided reader
   is an NRT reader, and the segment has not yet been merged away (Mike
   McCandless).
+
+* LUCENE-4286: Added option to CJKBigramFilter to always also output
+  unigrams. This can be used for a unigram+bigram approach, or at
+  index-time only for better support of short queries.
+  (Tom Burton-West, Robert Muir)
 
 API Changes
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
index cdd5f73dbe6..dc98909e5a8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.ArrayUtil;
 
@@ -35,6 +37,12 @@ import org.apache.lucene.util.ArrayUtil;
  * {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
  * of the CJK scripts are turned into bigrams.
  * <p>
+ * By default, when a CJK character has no adjacent characters to form
+ * a bigram, it is output in unigram form. If you want to always output
+ * both unigrams and bigrams, set the outputUnigrams
+ * flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
+ * This can be used for a combined unigram+bigram approach.
+ * <p>
  * In all cases, all non-CJK input is passed thru unmodified.
  */
 public final class CJKBigramFilter extends TokenFilter {
@@ -67,10 +75,16 @@ public final class CJKBigramFilter extends TokenFilter {
   private final Object doHiragana;
   private final Object doKatakana;
   private final Object doHangul;
+
+  // true if we should output unigram tokens always
+  private final boolean outputUnigrams;
+  private boolean ngramState; // false = output unigram, true = output bigram
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
 
   // buffers containing codepoint and offsets in parallel
   int buffer[] = new int[8];
@@ -88,23 +102,36 @@ public final class CJKBigramFilter extends TokenFilter {
 
   /**
    * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
-   *       CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)}
+   *       CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
    */
   public CJKBigramFilter(TokenStream in) {
     this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
   }
 
   /**
-   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
-   * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
-   *        {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
+   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
+   *       CJKBigramFilter(in, flags, false)}
    */
   public CJKBigramFilter(TokenStream in, int flags) {
+    this(in, flags, false);
+  }
+
+  /**
+   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
+   * and whether or not unigrams should also be output.
+   * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
+   *        {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
+   * @param outputUnigrams true if unigrams for the selected writing systems should also be output.
+   *        When this is false, this is only done when there are no adjacent characters to form
+   *        a bigram.
+   */
+  public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
     super(in);
     doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
     doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
     doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
     doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
+    this.outputUnigrams = outputUnigrams;
   }
 
   /*
@@ -120,7 +147,24 @@ public final class CJKBigramFilter extends TokenFilter {
 
         // case 1: we have multiple remaining codepoints buffered,
         // so we can emit a bigram here.
 
-        flushBigram();
+        if (outputUnigrams) {
+
+          // when also outputting unigrams, we output the unigram first,
+          // then rewind back to revisit the bigram.
+          // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
+          // the logic in hasBufferedUnigram ensures we output the C,
+          // even though it did actually have adjacent CJK characters.
+
+          if (ngramState) {
+            flushBigram();
+          } else {
+            flushUnigram();
+            index--;
+          }
+          ngramState = !ngramState;
+        } else {
+          flushBigram();
+        }
         return true;
       } else if (doNext()) {
@@ -260,6 +304,11 @@ public final class CJKBigramFilter extends TokenFilter {
     termAtt.setLength(len2);
     offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
     typeAtt.setType(DOUBLE_TYPE);
+    // when outputting unigrams, all bigrams are synonyms that span two unigrams
+    if (outputUnigrams) {
+      posIncAtt.setPositionIncrement(0);
+      posLengthAtt.setPositionLength(2);
+    }
     index++;
   }
 
@@ -292,7 +341,13 @@ public final class CJKBigramFilter extends TokenFilter {
    * inputs.
    */
   private boolean hasBufferedUnigram() {
-    return bufferLen == 1 && index == 0;
+    if (outputUnigrams) {
+      // when outputting unigrams always
+      return bufferLen - index == 1;
+    } else {
+      // otherwise it's only when we have a lone CJK character
+      return bufferLen == 1 && index == 0;
+    }
   }
 
   @Override
@@ -303,5 +358,6 @@ public final class CJKBigramFilter extends TokenFilter {
     lastEndOffset = 0;
     loneState = null;
     exhausted = false;
+    ngramState = false;
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
index ca76956a81b..7675e5b16ed 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
@@ -33,12 +33,13 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  *     <filter class="solr.LowerCaseFilterFactory"/>
  *     <filter class="solr.CJKBigramFilterFactory"
  *       han="true" hiragana="true"
- *       katakana="true" hangul="true" />
+ *       katakana="true" hangul="true" outputUnigrams="false" />
  *   </analyzer>
  * </fieldType>
  */
 public class CJKBigramFilterFactory extends TokenFilterFactory {
   int flags;
+  boolean outputUnigrams;
 
   @Override
   public void init(Map<String,String> args) {
@@ -56,10 +57,11 @@ public class CJKBigramFilterFactory extends TokenFilterFactory {
     if (getBoolean("hangul", true)) {
       flags |= CJKBigramFilter.HANGUL;
     }
+    outputUnigrams = getBoolean("outputUnigrams", false);
   }
 
   @Override
   public TokenStream create(TokenStream input) {
-    return new CJKBigramFilter(input, flags);
+    return new CJKBigramFilter(input, flags, outputUnigrams);
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
index a859897037b..80c595856d2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
  */
 
 import java.io.Reader;
+import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -33,6 +34,15 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
     }
   };
 
+  Analyzer unibiAnalyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(t,
+          new CJKBigramFilter(t, 0xff, true));
+    }
+  };
+
   public void testHuge() throws Exception {
     assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" +
"多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" @@ -62,6 +72,96 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase { } }; assertAnalyzesTo(a, "多くの学生が試験に落ちた。", - new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" }); + new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" }, + new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 }, + new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 }, + new String[] { "", "", "", "", "", "", + "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }); + } + + public void testAllScripts() throws Exception { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(t, + new CJKBigramFilter(t, 0xff, false)); + } + }; + assertAnalyzesTo(a, "多くの学生が試験に落ちた。", + new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" }); + } + + public void testUnigramsAndBigramsAllScripts() throws Exception { + assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた。", + new String[] { + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", + "生が", "が", "が試", "試", "試験", "験", "験に", "に", + "に落", "落", "落ち", "ち", "ちた", "た" + }, + new int[] { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 }, + new int[] { 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 10, 11, 11, 12, 12 }, + new String[] { "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "" }, + new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, + new int[] { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, + 2, 1, 2, 1, 2, 1, 2, 1, 2, 1 } + ); + } + + public void testUnigramsAndBigramsHanOnly() throws Exception { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader); + return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true)); + } + }; + assertAnalyzesTo(a, "多くの学生が試験に落ちた。", + new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" }, + new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 }, + new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 }, + new String[] { "", "", "", "", "", + "", "", "", "", "", + "", "", "", "", "" }, + new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 }, + new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 }); + } + + public void testUnigramsAndBigramsHuge() throws Exception { + assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた", + new String[] { + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", 
"に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多", + "多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た" + } + ); + } + + /** blast some random strings through the analyzer */ + public void testRandomUnibiStrings() throws Exception { + checkRandomData(random(), unibiAnalyzer, 1000*RANDOM_MULTIPLIER); + } + + /** blast some random strings through the analyzer */ + public void testRandomUnibiHugeStrings() throws Exception { + Random random = random(); + checkRandomData(random, unibiAnalyzer, 100*RANDOM_MULTIPLIER, 8192); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java index 316a42a2fc8..8eb1092f721 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java @@ -52,4 +52,16 @@ public class TestCJKBigramFilterFactory extends BaseTokenStreamTestCase { assertTokenStreamContents(stream, new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" }); } + + public void testHanOnlyUnigrams() throws Exception { + Reader reader = new StringReader("多くの学生が試験に落ちた。"); + CJKBigramFilterFactory factory = new CJKBigramFilterFactory(); + Map args = new HashMap(); + args.put("hiragana", "false"); + args.put("outputUnigrams", "true"); + factory.init(args); + TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader)); + assertTokenStreamContents(stream, + new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" }); + } }