LUCENE-4991: QueryParser doesn't handle synonyms correctly for Chinese

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1481100 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-05-10 16:59:36 +00:00
parent afbca4dc8a
commit 2a0d8086df
4 changed files with 235 additions and 18 deletions

View File

@ -129,6 +129,11 @@ Bug Fixes
* LUCENE-4996: Ensure DocInverterPerField always includes field name
in exception messages. (Markus Jelsma via Robert Muir)
* LUCENE-4991: Fix handling of synonyms in classic QueryParser.getFieldQuery for
terms not separated by whitespace. PositionIncrementAttribute was ignored, so with
default AND synonyms wrongly became mandatory clauses, and with OR, the
coordination factor was wrong. (李威, Robert Muir)
Optimizations

View File

@ -572,24 +572,53 @@ public abstract class QueryParserBase implements CommonQueryParserConfiguration
if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) {
if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
// no phrase query:
BooleanQuery q = newBooleanQuery(positionCount == 1);
BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR ?
BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
for (int i = 0; i < numTokens; i++) {
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
termAtt.fillBytesRef();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
if (positionCount == 1) {
// simple case: only one position, with synonyms
BooleanQuery q = newBooleanQuery(true);
for (int i = 0; i < numTokens; i++) {
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
termAtt.fillBytesRef();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
Query currentQuery = newTermQuery(
new Term(field, BytesRef.deepCopyOf(bytes)));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
} else {
// multiple positions
BooleanQuery q = newBooleanQuery(false);
final BooleanClause.Occur occur = operator == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
Query currentQuery = null;
for (int i = 0; i < numTokens; i++) {
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
termAtt.fillBytesRef();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
if (!(currentQuery instanceof BooleanQuery)) {
Query t = currentQuery;
currentQuery = newBooleanQuery(true);
((BooleanQuery)currentQuery).add(t, BooleanClause.Occur.SHOULD);
}
((BooleanQuery)currentQuery).add(newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))), BooleanClause.Occur.SHOULD);
} else {
if (currentQuery != null) {
q.add(currentQuery, occur);
}
currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
}
}
Query currentQuery = newTermQuery(
new Term(field, BytesRef.deepCopyOf(bytes)));
q.add(currentQuery, occur);
return q;
}
return q;
}
else {
// phrase query:

View File

@ -17,9 +17,17 @@ package org.apache.lucene.queryparser.classic;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
@ -27,6 +35,7 @@ import org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfigur
import org.apache.lucene.queryparser.util.QueryParserTestBase;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
@ -307,4 +316,178 @@ public class TestQueryParser extends QueryParserTestBase {
assertEquals(unexpanded, smart.parse("\"dogs\""));
}
// TODO: fold these into QueryParserTestBase
/**
 * Analyzer that chains a {@link MockTokenizer} into a MockSynonymFilter,
 * i.e. adds synonym of "dog" for "dogs".
 */
static class MockSynonymAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // the tokenizer is both the source of the chain and the input of the synonym filter
    final MockTokenizer source = new MockTokenizer(reader);
    final TokenStream withSynonyms = new MockSynonymFilter(source);
    return new TokenStreamComponents(source, withSynonyms);
  }
}
/**
 * Simple synonyms test: a single term "dogs" (quoted or not, with either
 * default operator) must parse to an OR of "dogs" and its synonym "dog".
 */
public void testSynonyms() throws Exception {
  final TermQuery original = new TermQuery(new Term("field", "dogs"));
  final TermQuery synonym = new TermQuery(new Term("field", "dog"));
  final BooleanQuery want = new BooleanQuery(true);
  want.add(original, BooleanClause.Occur.SHOULD);
  want.add(synonym, BooleanClause.Occur.SHOULD);

  final QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "field", new MockSynonymAnalyzer());

  // parser's initial default operator
  for (String text : new String[] { "dogs", "\"dogs\"" }) {
    assertEquals(want, parser.parse(text));
  }

  // switching the default operator to AND must not change the result
  parser.setDefaultOperator(Operator.AND);
  for (String text : new String[] { "dogs", "\"dogs\"" }) {
    assertEquals(want, parser.parse(text));
  }

  // the boost applies to the whole synonym query
  want.setBoost(2.0f);
  for (String text : new String[] { "dogs^2", "\"dogs\"^2" }) {
    assertEquals(want, parser.parse(text));
  }
}
/**
 * A quoted phrase containing a term with a synonym forms a multiphrase query:
 * "old" followed by either "dogs" or "dog".
 */
public void testSynonymsPhrase() throws Exception {
  final Term[] dogVariants = { new Term("field", "dogs"), new Term("field", "dog") };
  final MultiPhraseQuery want = new MultiPhraseQuery();
  want.add(new Term("field", "old"));
  want.add(dogVariants);

  final QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "field", new MockSynonymAnalyzer());
  assertEquals(want, parser.parse("\"old dogs\""));

  // same expectation with AND as the default operator
  parser.setDefaultOperator(Operator.AND);
  assertEquals(want, parser.parse("\"old dogs\""));

  // boost, then slop, are applied cumulatively to the expected query
  want.setBoost(2.0f);
  assertEquals(want, parser.parse("\"old dogs\"^2"));
  want.setSlop(3);
  assertEquals(want, parser.parse("\"old dogs\"~3^2"));
}
/**
 * adds synonym of "國" for "国".
 * <p>
 * Whenever the wrapped stream emits the term "国" (U+56FD), the next call to
 * {@link #incrementToken()} emits "國" (U+570B) with a position increment of 0,
 * i.e. stacked on the same position as the original token.
 */
protected static class MockCJKSynonymFilter extends TokenFilter {
  CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  /** true when the previously returned token was "国" and its synonym is still pending */
  boolean addSynonym = false;

  public MockCJKSynonymFilter(TokenStream input) {
    super(input);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (addSynonym) { // inject our synonym
      clearAttributes();
      termAtt.setEmpty().append("國");
      // increment 0 marks the synonym as occupying the same position as "国"
      posIncAtt.setPositionIncrement(0);
      addSynonym = false;
      return true;
    }

    if (input.incrementToken()) {
      // remember to emit the synonym on the next call
      addSynonym = termAtt.toString().equals("国");
      return true;
    } else {
      return false;
    }
  }
}
/**
 * Analyzer that feeds the output of a SimpleCJKTokenizer through a
 * MockCJKSynonymFilter.
 */
static class MockCJKSynonymAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // the tokenizer is both the source of the chain and the input of the synonym filter
    final Tokenizer source = new SimpleCJKTokenizer(reader);
    final TokenStream withSynonyms = new MockCJKSynonymFilter(source);
    return new TokenStreamComponents(source, withSynonyms);
  }
}
/**
 * simple CJK synonym test: the single character "国" must parse to an OR of
 * "国" and its synonym "國", regardless of the default operator.
 */
public void testCJKSynonym() throws Exception {
  BooleanQuery expected = new BooleanQuery(true);
  // original term first, then the injected synonym
  expected.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
  expected.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
  QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
  assertEquals(expected, qp.parse("国"));
  // a single position with synonyms must not be affected by the AND default
  qp.setDefaultOperator(Operator.AND);
  assertEquals(expected, qp.parse("国"));
  expected.setBoost(2.0f);
  assertEquals(expected, qp.parse("国^2"));
}
/**
 * synonyms with default OR operator: "中国" must parse to
 * 中 OR (国 OR 國), with the synonym pair grouped in a nested query.
 */
public void testCJKSynonymsOR() throws Exception {
  BooleanQuery expected = new BooleanQuery();
  expected.add(new TermQuery(new Term("field", "中")), BooleanClause.Occur.SHOULD);
  // second position: original term plus its synonym
  BooleanQuery inner = new BooleanQuery(true);
  inner.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
  inner.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
  expected.add(inner, BooleanClause.Occur.SHOULD);
  QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
  assertEquals(expected, qp.parse("中国"));
  expected.setBoost(2.0f);
  assertEquals(expected, qp.parse("中国^2"));
}
/**
 * more complex synonyms with default OR operator: "中国国" must parse to
 * 中 OR (国 OR 國) OR (国 OR 國), one nested query per position that has a synonym.
 */
public void testCJKSynonymsOR2() throws Exception {
  BooleanQuery expected = new BooleanQuery();
  expected.add(new TermQuery(new Term("field", "中")), BooleanClause.Occur.SHOULD);
  // second position: original term plus its synonym
  BooleanQuery inner = new BooleanQuery(true);
  inner.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
  inner.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
  expected.add(inner, BooleanClause.Occur.SHOULD);
  // third position: same term and synonym again
  BooleanQuery inner2 = new BooleanQuery(true);
  inner2.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
  inner2.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
  expected.add(inner2, BooleanClause.Occur.SHOULD);
  QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
  assertEquals(expected, qp.parse("中国国"));
  expected.setBoost(2.0f);
  assertEquals(expected, qp.parse("中国国^2"));
}
/**
 * synonyms with default AND operator: "中国" must parse to
 * +中 +(国 OR 國); the synonym itself must not become a mandatory clause.
 */
public void testCJKSynonymsAND() throws Exception {
  BooleanQuery expected = new BooleanQuery();
  expected.add(new TermQuery(new Term("field", "中")), BooleanClause.Occur.MUST);
  // second position: the group is mandatory, its members are alternatives
  BooleanQuery inner = new BooleanQuery(true);
  inner.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
  inner.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
  expected.add(inner, BooleanClause.Occur.MUST);
  QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
  qp.setDefaultOperator(Operator.AND);
  assertEquals(expected, qp.parse("中国"));
  expected.setBoost(2.0f);
  assertEquals(expected, qp.parse("中国^2"));
}
/**
 * more complex synonyms with default AND operator: "中国国" must parse to
 * +中 +(国 OR 國) +(国 OR 國).
 */
public void testCJKSynonymsAND2() throws Exception {
  BooleanQuery expected = new BooleanQuery();
  expected.add(new TermQuery(new Term("field", "中")), BooleanClause.Occur.MUST);
  // second position: the group is mandatory, its members are alternatives
  BooleanQuery inner = new BooleanQuery(true);
  inner.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
  inner.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
  expected.add(inner, BooleanClause.Occur.MUST);
  // third position: same term and synonym again
  BooleanQuery inner2 = new BooleanQuery(true);
  inner2.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
  inner2.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
  expected.add(inner2, BooleanClause.Occur.MUST);
  QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
  qp.setDefaultOperator(Operator.AND);
  assertEquals(expected, qp.parse("中国国"));
  expected.setBoost(2.0f);
  assertEquals(expected, qp.parse("中国国^2"));
}
/**
 * forms multiphrase query: the quoted phrase "中国" must parse to
 * 中 followed by either 国 or 國.
 */
public void testCJKSynonymsPhrase() throws Exception {
  MultiPhraseQuery expected = new MultiPhraseQuery();
  expected.add(new Term("field", "中"));
  // second phrase position accepts the original term or its synonym
  expected.add(new Term[] { new Term("field", "国"), new Term("field", "國")});
  QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
  qp.setDefaultOperator(Operator.AND);
  assertEquals(expected, qp.parse("\"中国\""));
  expected.setBoost(2.0f);
  assertEquals(expected, qp.parse("\"中国\"^2"));
  expected.setSlop(3);
  assertEquals(expected, qp.parse("\"中国\"~3^2"));
}
}

View File

@ -236,7 +236,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
}
//individual CJK chars as terms, like StandardAnalyzer
private class SimpleCJKTokenizer extends Tokenizer {
protected static class SimpleCJKTokenizer extends Tokenizer {
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public SimpleCJKTokenizer(Reader input) {
@ -244,7 +244,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
}
@Override
public boolean incrementToken() throws IOException {
public final boolean incrementToken() throws IOException {
int ch = input.read();
if (ch < 0)
return false;
@ -1088,7 +1088,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
/**
* adds synonym of "dog" for "dogs".
*/
private class MockSynonymFilter extends TokenFilter {
protected static class MockSynonymFilter extends TokenFilter {
CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
boolean addSynonym = false;