From 17d113dac1e6081a48144679234b00a556210160 Mon Sep 17 00:00:00 2001 From: Steve Rowe Date: Tue, 5 Jul 2016 18:03:42 -0400 Subject: [PATCH] LUCENE-2605: Add classic QueryParser option setSplitOnWhitespace() to control whether to split on whitespace prior to text analysis. Default behavior remains unchanged: split-on-whitespace=true. --- lucene/CHANGES.txt | 4 + .../core/TestAllAnalyzersHaveFactories.java | 2 + .../lucene/analysis/TestStopFilter.java | 37 -- .../apache/lucene/util/TestQueryBuilder.java | 43 +- .../queryparser/classic/CharStream.java | 2 +- .../classic/MultiFieldQueryParser.java | 51 ++- .../queryparser/classic/ParseException.java | 2 +- .../queryparser/classic/QueryParser.java | 399 +++++++++++++----- .../lucene/queryparser/classic/QueryParser.jj | 265 +++++++----- .../queryparser/classic/QueryParserBase.java | 39 ++ .../classic/QueryParserTokenManager.java | 27 +- .../lucene/queryparser/classic/Token.java | 2 +- .../queryparser/classic/TokenMgrError.java | 2 +- .../queryparser/classic/TestQueryParser.java | 247 ++++++++++- .../ext/TestExtendableQueryParser.java | 1 + .../flexible/standard/TestStandardQP.java | 11 + .../queryparser/util/QueryParserTestBase.java | 49 +-- .../lucene/analysis/MockSynonymAnalyzer.java | 28 ++ .../lucene/analysis/MockSynonymFilter.java | 97 +++++ .../analysis/TestMockSynonymFilter.java | 151 +++++++ 20 files changed, 1117 insertions(+), 342 deletions(-) create mode 100644 lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymAnalyzer.java create mode 100644 lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java create mode 100644 lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a79d69eb047..50b5f3cf5b0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -74,6 +74,10 @@ Improvements ScandinavianNormalizationFilterFactory now implement MultiTermAwareComponent. 
(Adrien Grand) +* LUCENE-2605: Add classic QueryParser option setSplitOnWhitespace() to + control whether to split on whitespace prior to text analysis. Default + behavior remains unchanged: split-on-whitespace=true. (Steve Rowe) + Optimizations * LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java index f8874eb13e4..d826a60d677 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java @@ -36,6 +36,7 @@ import org.apache.lucene.analysis.MockFixedLengthPayloadFilter; import org.apache.lucene.analysis.MockGraphTokenFilter; import org.apache.lucene.analysis.MockHoleInjectingTokenFilter; import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter; +import org.apache.lucene.analysis.MockSynonymFilter; import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockVariableLengthPayloadFilter; @@ -75,6 +76,7 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase { MockGraphTokenFilter.class, MockHoleInjectingTokenFilter.class, MockRandomLookaheadTokenFilter.class, + MockSynonymFilter.class, MockTokenFilter.class, MockVariableLengthPayloadFilter.class, ValidatingTokenFilter.class, diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java index c2246823dcf..3e26965deee 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestStopFilter.java @@ -22,7 +22,6 @@ import java.util.ArrayList; import 
org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -137,40 +136,4 @@ public class TestStopFilter extends BaseTokenStreamTestCase { System.out.println(s); } } - - // stupid filter that inserts synonym of 'hte' for 'the' - private class MockSynonymFilter extends TokenFilter { - State bufferedState; - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - - MockSynonymFilter(TokenStream input) { - super(input); - } - - @Override - public boolean incrementToken() throws IOException { - if (bufferedState != null) { - restoreState(bufferedState); - posIncAtt.setPositionIncrement(0); - termAtt.setEmpty().append("hte"); - bufferedState = null; - return true; - } else if (input.incrementToken()) { - if (termAtt.toString().equals("the")) { - bufferedState = captureState(); - } - return true; - } else { - return false; - } - } - - @Override - public void reset() throws IOException { - super.reset(); - bufferedState = null; - } - } - } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java index 205fbab0981..d3019e3d077 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockSynonymFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; 
@@ -121,7 +122,7 @@ public class TestQueryBuilder extends LuceneTestCase { assertNull(builder.createBooleanQuery("field", "")); } - /** adds synonym of "dog" for "dogs". */ + /** adds synonym of "dog" for "dogs", and synonym of "cavy" for "guinea pig". */ static class MockSynonymAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName) { @@ -130,37 +131,6 @@ public class TestQueryBuilder extends LuceneTestCase { } } - /** - * adds synonym of "dog" for "dogs". - */ - protected static class MockSynonymFilter extends TokenFilter { - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - boolean addSynonym = false; - - public MockSynonymFilter(TokenStream input) { - super(input); - } - - @Override - public final boolean incrementToken() throws IOException { - if (addSynonym) { // inject our synonym - clearAttributes(); - termAtt.setEmpty().append("dog"); - posIncAtt.setPositionIncrement(0); - addSynonym = false; - return true; - } - - if (input.incrementToken()) { - addSynonym = termAtt.toString().equals("dogs"); - return true; - } else { - return false; - } - } - } - /** simple synonyms test */ public void testSynonyms() throws Exception { SynonymQuery expected = new SynonymQuery(new Term("field", "dogs"), new Term("field", "dog")); @@ -180,6 +150,15 @@ public class TestQueryBuilder extends LuceneTestCase { assertEquals(expectedBuilder.build(), builder.createPhraseQuery("field", "old dogs")); } + /** forms multiphrase query */ + public void testMultiWordSynonymsPhrase() throws Exception { + MultiPhraseQuery.Builder expectedBuilder = new MultiPhraseQuery.Builder(); + expectedBuilder.add(new Term[] { new Term("field", "guinea"), new Term("field", "cavy") }); + expectedBuilder.add(new Term("field", "pig")); + QueryBuilder queryBuilder = new QueryBuilder(new MockSynonymAnalyzer()); + assertEquals(expectedBuilder.build(), 
queryBuilder.createPhraseQuery("field", "guinea pig")); + } + protected static class SimpleCJKTokenizer extends Tokenizer { private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/CharStream.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/CharStream.java index 85b14614435..2c5fcbabde5 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/CharStream.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/CharStream.java @@ -112,4 +112,4 @@ interface CharStream { void Done(); } -/* JavaCC - OriginalChecksum=c847dd1920bf7901125a7244125682ad (do not edit this line) */ +/* JavaCC - OriginalChecksum=30b94cad7b10d0d81e3a59a1083939d0 (do not edit this line) */ diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java index b9963ec1bd5..69a7559b71a 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java @@ -27,6 +27,7 @@ import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; /** * A QueryParser which constructs queries to search multiple fields. 
@@ -148,18 +149,54 @@ public class MultiFieldQueryParser extends QueryParser protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { if (field == null) { List clauses = new ArrayList<>(); + Query[] fieldQueries = new Query[fields.length]; + int maxTerms = 0; for (int i = 0; i < fields.length; i++) { Query q = super.getFieldQuery(fields[i], queryText, quoted); if (q != null) { - //If the user passes a map of boosts - if (boosts != null) { - //Get the boost from the map and apply them - Float boost = boosts.get(fields[i]); - if (boost != null) { - q = new BoostQuery(q, boost.floatValue()); + if (q instanceof TermQuery) { + maxTerms = Math.max(1, maxTerms); + } else if (q instanceof BooleanQuery) { + maxTerms = Math.max(maxTerms, ((BooleanQuery)q).clauses().size()); + } + fieldQueries[i] = q; + } + } + for (int termNum = 0; termNum < maxTerms; termNum++) { + List termClauses = new ArrayList<>(); + for (int i = 0; i < fields.length; i++) { + if (fieldQueries[i] != null) { + Query q = null; + if (fieldQueries[i] instanceof BooleanQuery) { + List nestedClauses = ((BooleanQuery)fieldQueries[i]).clauses(); + if (termNum < nestedClauses.size()) { + q = nestedClauses.get(termNum).getQuery(); + } + } else if (termNum == 0) { // e.g. 
TermQuery-s + q = fieldQueries[i]; + } + if (q != null) { + if (boosts != null) { + //Get the boost from the map and apply them + Float boost = boosts.get(fields[i]); + if (boost != null) { + q = new BoostQuery(q, boost); + } + } + termClauses.add(q); } } - clauses.add(q); + } + if (maxTerms > 1) { + if (termClauses.size() > 0) { + BooleanQuery.Builder builder = newBooleanQuery(); + for (Query termClause : termClauses) { + builder.add(termClause, BooleanClause.Occur.SHOULD); + } + clauses.add(builder.build()); + } + } else { + clauses.addAll(termClauses); } } if (clauses.size() == 0) // happens for stopwords diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/ParseException.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/ParseException.java index a0ddab2d363..3c02be3f004 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/ParseException.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/ParseException.java @@ -184,4 +184,4 @@ public class ParseException extends Exception { } } -/* JavaCC - OriginalChecksum=61602edcb3a15810cbc58f5593eba40d (do not edit this line) */ +/* JavaCC - OriginalChecksum=b187d97d5bb75c3fc63d642c1c26ac6e (do not edit this line) */ diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.java index 08a477e79b4..c137d3043b7 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.java @@ -3,8 +3,11 @@ package org.apache.lucene.queryparser.classic; import java.io.StringReader; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import 
org.apache.lucene.document.DateTools; @@ -81,6 +84,9 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants */ static public enum Operator { OR, AND } + /** default split on whitespace behavior */ + public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = true; + /** Create a query parser. * @param f the default field for query terms. * @param a used to find terms in the query text. @@ -90,6 +96,28 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants init(f, a); } + /** + * @see #setSplitOnWhitespace(boolean) + */ + public boolean getSplitOnWhitespace() { + return splitOnWhitespace; + } + + /** + * Whether query text should be split on whitespace prior to analysis. + * Default is {@value #DEFAULT_SPLIT_ON_WHITESPACE}. + */ + public void setSplitOnWhitespace(boolean splitOnWhitespace) { + this.splitOnWhitespace = splitOnWhitespace; + } + + private boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE; + private static Set disallowedPostMultiTerm + = new HashSet(Arrays.asList(COLON, STAR, FUZZY_SLOP, CARAT, AND, OR)); + private static boolean allowedPostMultiTerm(int tokenKind) { + return disallowedPostMultiTerm.contains(tokenKind) == false; + } + // * Query ::= ( Clause )* // * Clause ::= ["+", "-"] [ ":"] ( | "(" Query ")" ) final public int Conjunction() throws ParseException { @@ -129,15 +157,15 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case PLUS: jj_consume_token(PLUS); - ret = MOD_REQ; + ret = MOD_REQ; break; case MINUS: jj_consume_token(MINUS); - ret = MOD_NOT; + ret = MOD_NOT; break; case NOT: jj_consume_token(NOT); - ret = MOD_NOT; + ret = MOD_NOT; break; default: jj_la1[2] = jj_gen; @@ -166,11 +194,37 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants List clauses = new ArrayList(); Query q, firstQuery=null; int conj, mods; - mods = Modifiers(); - q = Clause(field); - 
addClause(clauses, CONJ_NONE, mods, q); - if (mods == MOD_NONE) - firstQuery=q; + if (jj_2_1(2)) { + firstQuery = MultiTerm(field, clauses); + } else { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case NOT: + case PLUS: + case MINUS: + case BAREOPER: + case LPAREN: + case STAR: + case QUOTED: + case TERM: + case PREFIXTERM: + case WILDTERM: + case REGEXPTERM: + case RANGEIN_START: + case RANGEEX_START: + case NUMBER: + mods = Modifiers(); + q = Clause(field); + addClause(clauses, CONJ_NONE, mods, q); + if (mods == MOD_NONE) { + firstQuery = q; + } + break; + default: + jj_la1[4] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + } label_1: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { @@ -193,39 +247,66 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants ; break; default: - jj_la1[4] = jj_gen; + jj_la1[5] = jj_gen; break label_1; } - conj = Conjunction(); - mods = Modifiers(); - q = Clause(field); - addClause(clauses, conj, mods, q); - } - if (clauses.size() == 1 && firstQuery != null) - {if (true) return firstQuery;} - else { - {if (true) return getBooleanQuery(clauses);} + if (jj_2_2(2)) { + MultiTerm(field, clauses); + } else { + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case AND: + case OR: + case NOT: + case PLUS: + case MINUS: + case BAREOPER: + case LPAREN: + case STAR: + case QUOTED: + case TERM: + case PREFIXTERM: + case WILDTERM: + case REGEXPTERM: + case RANGEIN_START: + case RANGEEX_START: + case NUMBER: + conj = Conjunction(); + mods = Modifiers(); + q = Clause(field); + addClause(clauses, conj, mods, q); + break; + default: + jj_la1[6] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } } + } + if (clauses.size() == 1 && firstQuery != null) { + {if (true) return firstQuery;} + } else { + {if (true) return getBooleanQuery(clauses);} + } throw new Error("Missing return statement in function"); } final public Query Clause(String field) throws ParseException { Query q; Token 
fieldToken=null, boost=null; - if (jj_2_1(2)) { + if (jj_2_3(2)) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case TERM: fieldToken = jj_consume_token(TERM); jj_consume_token(COLON); - field=discardEscapeChar(fieldToken.image); + field=discardEscapeChar(fieldToken.image); break; case STAR: jj_consume_token(STAR); jj_consume_token(COLON); - field="*"; + field="*"; break; default: - jj_la1[5] = jj_gen; + jj_la1[7] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -255,16 +336,16 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants boost = jj_consume_token(NUMBER); break; default: - jj_la1[6] = jj_gen; + jj_la1[8] = jj_gen; ; } break; default: - jj_la1[7] = jj_gen; + jj_la1[9] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - {if (true) return handleBoost(q, boost);} + {if (true) return handleBoost(q, boost);} throw new Error("Missing return statement in function"); } @@ -291,73 +372,86 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants break; case STAR: term = jj_consume_token(STAR); - wildcard=true; + wildcard=true; break; case PREFIXTERM: term = jj_consume_token(PREFIXTERM); - prefix=true; + prefix=true; break; case WILDTERM: term = jj_consume_token(WILDTERM); - wildcard=true; + wildcard=true; break; case REGEXPTERM: term = jj_consume_token(REGEXPTERM); - regexp=true; + regexp=true; break; case NUMBER: term = jj_consume_token(NUMBER); break; case BAREOPER: term = jj_consume_token(BAREOPER); - term.image = term.image.substring(0,1); + term.image = term.image.substring(0,1); break; default: - jj_la1[8] = jj_gen; + jj_la1[10] = jj_gen; jj_consume_token(-1); throw new ParseException(); } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case FUZZY_SLOP: - fuzzySlop = jj_consume_token(FUZZY_SLOP); - fuzzy=true; - break; - default: - jj_la1[9] = jj_gen; - ; - } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case CARAT: - jj_consume_token(CARAT); - boost = jj_consume_token(NUMBER); + case 
FUZZY_SLOP: switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case CARAT: + jj_consume_token(CARAT); + boost = jj_consume_token(NUMBER); + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case FUZZY_SLOP: + fuzzySlop = jj_consume_token(FUZZY_SLOP); + fuzzy=true; + break; + default: + jj_la1[11] = jj_gen; + ; + } + break; case FUZZY_SLOP: fuzzySlop = jj_consume_token(FUZZY_SLOP); - fuzzy=true; + fuzzy=true; + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case CARAT: + jj_consume_token(CARAT); + boost = jj_consume_token(NUMBER); + break; + default: + jj_la1[12] = jj_gen; + ; + } break; default: - jj_la1[10] = jj_gen; - ; + jj_la1[13] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); } break; default: - jj_la1[11] = jj_gen; + jj_la1[14] = jj_gen; ; } - q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); + q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); break; case RANGEIN_START: case RANGEEX_START: switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case RANGEIN_START: jj_consume_token(RANGEIN_START); - startInc=true; + startInc = true; break; case RANGEEX_START: jj_consume_token(RANGEEX_START); break; default: - jj_la1[12] = jj_gen; + jj_la1[15] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -369,7 +463,7 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants goop1 = jj_consume_token(RANGE_QUOTED); break; default: - jj_la1[13] = jj_gen; + jj_la1[16] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -378,7 +472,7 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants jj_consume_token(RANGE_TO); break; default: - jj_la1[14] = jj_gen; + jj_la1[17] = jj_gen; ; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { @@ -389,20 +483,20 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants goop2 = jj_consume_token(RANGE_QUOTED); break; default: - jj_la1[15] = jj_gen; + jj_la1[18] = jj_gen; jj_consume_token(-1); throw new 
ParseException(); } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case RANGEIN_END: jj_consume_token(RANGEIN_END); - endInc=true; + endInc = true; break; case RANGEEX_END: jj_consume_token(RANGEEX_END); break; default: - jj_la1[16] = jj_gen; + jj_la1[19] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -412,46 +506,69 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants boost = jj_consume_token(NUMBER); break; default: - jj_la1[17] = jj_gen; + jj_la1[20] = jj_gen; ; } - boolean startOpen=false; - boolean endOpen=false; - if (goop1.kind == RANGE_QUOTED) { - goop1.image = goop1.image.substring(1, goop1.image.length()-1); - } else if ("*".equals(goop1.image)) { - startOpen=true; - } - if (goop2.kind == RANGE_QUOTED) { - goop2.image = goop2.image.substring(1, goop2.image.length()-1); - } else if ("*".equals(goop2.image)) { - endOpen=true; - } - q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? null : discardEscapeChar(goop2.image), startInc, endInc); + boolean startOpen=false; + boolean endOpen=false; + if (goop1.kind == RANGE_QUOTED) { + goop1.image = goop1.image.substring(1, goop1.image.length()-1); + } else if ("*".equals(goop1.image)) { + startOpen=true; + } + if (goop2.kind == RANGE_QUOTED) { + goop2.image = goop2.image.substring(1, goop2.image.length()-1); + } else if ("*".equals(goop2.image)) { + endOpen=true; + } + q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? 
null : discardEscapeChar(goop2.image), startInc, endInc); break; case QUOTED: term = jj_consume_token(QUOTED); switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case FUZZY_SLOP: - fuzzySlop = jj_consume_token(FUZZY_SLOP); - break; - default: - jj_la1[18] = jj_gen; - ; - } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case CARAT: - jj_consume_token(CARAT); - boost = jj_consume_token(NUMBER); + case FUZZY_SLOP: + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case CARAT: + jj_consume_token(CARAT); + boost = jj_consume_token(NUMBER); + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case FUZZY_SLOP: + fuzzySlop = jj_consume_token(FUZZY_SLOP); + fuzzy=true; + break; + default: + jj_la1[21] = jj_gen; + ; + } + break; + case FUZZY_SLOP: + fuzzySlop = jj_consume_token(FUZZY_SLOP); + fuzzy=true; + switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + case CARAT: + jj_consume_token(CARAT); + boost = jj_consume_token(NUMBER); + break; + default: + jj_la1[22] = jj_gen; + ; + } + break; + default: + jj_la1[23] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } break; default: - jj_la1[19] = jj_gen; + jj_la1[24] = jj_gen; ; } - q = handleQuotedTerm(field, term, fuzzySlop); + q = handleQuotedTerm(field, term, fuzzySlop); break; default: - jj_la1[20] = jj_gen; + jj_la1[25] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -459,6 +576,44 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants throw new Error("Missing return statement in function"); } +/** Returns the first query if splitOnWhitespace=true or otherwise the entire produced query */ + final public Query MultiTerm(String field, List clauses) throws ParseException { + Token text, whitespace, followingText; + Query firstQuery = null; + text = jj_consume_token(TERM); + if (splitOnWhitespace) { + firstQuery = getFieldQuery(field, discardEscapeChar(text.image), false); + addClause(clauses, CONJ_NONE, MOD_NONE, firstQuery); + } + if (getToken(1).kind == TERM && allowedPostMultiTerm(getToken(2).kind)) 
{ + + } else { + jj_consume_token(-1); + throw new ParseException(); + } + label_2: + while (true) { + followingText = jj_consume_token(TERM); + if (splitOnWhitespace) { + Query q = getFieldQuery(field, discardEscapeChar(followingText.image), false); + addClause(clauses, CONJ_NONE, MOD_NONE, q); + } else { // build up the text to send to analysis + text.image += " " + followingText.image; + } + if (getToken(1).kind == TERM && allowedPostMultiTerm(getToken(2).kind)) { + ; + } else { + break label_2; + } + } + if (splitOnWhitespace == false) { + firstQuery = getFieldQuery(field, discardEscapeChar(text.image), false); + addMultiTermClauses(clauses, firstQuery); + } + {if (true) return firstQuery;} + throw new Error("Missing return statement in function"); + } + private boolean jj_2_1(int xla) { jj_la = xla; jj_lastpos = jj_scanpos = token; try { return !jj_3_1(); } @@ -466,23 +621,71 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants finally { jj_save(0, xla); } } - private boolean jj_3R_2() { + private boolean jj_2_2(int xla) { + jj_la = xla; jj_lastpos = jj_scanpos = token; + try { return !jj_3_2(); } + catch(LookaheadSuccess ls) { return true; } + finally { jj_save(1, xla); } + } + + private boolean jj_2_3(int xla) { + jj_la = xla; jj_lastpos = jj_scanpos = token; + try { return !jj_3_3(); } + catch(LookaheadSuccess ls) { return true; } + finally { jj_save(2, xla); } + } + + private boolean jj_3R_4() { if (jj_scan_token(TERM)) return true; if (jj_scan_token(COLON)) return true; return false; } + private boolean jj_3_2() { + if (jj_3R_3()) return true; + return false; + } + private boolean jj_3_1() { + if (jj_3R_3()) return true; + return false; + } + + private boolean jj_3R_7() { + if (jj_scan_token(TERM)) return true; + return false; + } + + private boolean jj_3_3() { Token xsp; xsp = jj_scanpos; - if (jj_3R_2()) { + if (jj_3R_4()) { jj_scanpos = xsp; - if (jj_3R_3()) return true; + if (jj_3R_5()) return true; } return false; } 
private boolean jj_3R_3() { + if (jj_scan_token(TERM)) return true; + jj_lookingAhead = true; + jj_semLA = getToken(1).kind == TERM && allowedPostMultiTerm(getToken(2).kind); + jj_lookingAhead = false; + if (!jj_semLA || jj_3R_6()) return true; + Token xsp; + if (jj_3R_7()) return true; + while (true) { + xsp = jj_scanpos; + if (jj_3R_7()) { jj_scanpos = xsp; break; } + } + return false; + } + + private boolean jj_3R_6() { + return false; + } + + private boolean jj_3R_5() { if (jj_scan_token(STAR)) return true; if (jj_scan_token(COLON)) return true; return false; @@ -497,8 +700,11 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants private int jj_ntk; private Token jj_scanpos, jj_lastpos; private int jj_la; + /** Whether we are looking ahead. */ + private boolean jj_lookingAhead = false; + private boolean jj_semLA; private int jj_gen; - final private int[] jj_la1 = new int[21]; + final private int[] jj_la1 = new int[26]; static private int[] jj_la1_0; static private int[] jj_la1_1; static { @@ -506,12 +712,12 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants jj_la1_init_1(); } private static void jj_la1_init_0() { - jj_la1_0 = new int[] {0x300,0x300,0x1c00,0x1c00,0xfda7f00,0x120000,0x40000,0xfda6000,0x9d22000,0x200000,0x200000,0x40000,0x6000000,0x80000000,0x10000000,0x80000000,0x60000000,0x40000,0x200000,0x40000,0xfda2000,}; + jj_la1_0 = new int[] {0x300,0x300,0x1c00,0x1c00,0xfda7c00,0xfda7f00,0xfda7f00,0x120000,0x40000,0xfda6000,0x9d22000,0x200000,0x40000,0x240000,0x240000,0x6000000,0x80000000,0x10000000,0x80000000,0x60000000,0x40000,0x200000,0x40000,0x240000,0x240000,0xfda2000,}; } private static void jj_la1_init_1() { - jj_la1_1 = new int[] {0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x1,0x0,0x0,0x0,0x0,0x0,}; + jj_la1_1 = new int[] {0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,}; } - final private JJCalls[] jj_2_rtns = 
new JJCalls[1]; + final private JJCalls[] jj_2_rtns = new JJCalls[3]; private boolean jj_rescan = false; private int jj_gc = 0; @@ -521,7 +727,7 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 21; i++) jj_la1[i] = -1; + for (int i = 0; i < 26; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -530,8 +736,9 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants token_source.ReInit(stream); token = new Token(); jj_ntk = -1; + jj_lookingAhead = false; jj_gen = 0; - for (int i = 0; i < 21; i++) jj_la1[i] = -1; + for (int i = 0; i < 26; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -541,7 +748,7 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 21; i++) jj_la1[i] = -1; + for (int i = 0; i < 26; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -551,7 +758,7 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 21; i++) jj_la1[i] = -1; + for (int i = 0; i < 26; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -614,7 +821,7 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants /** Get the specific Token. */ final public Token getToken(int index) { - Token t = token; + Token t = jj_lookingAhead ? 
jj_scanpos : token; for (int i = 0; i < index; i++) { if (t.next != null) t = t.next; else t = t.next = token_source.getNextToken(); @@ -668,7 +875,7 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants la1tokens[jj_kind] = true; jj_kind = -1; } - for (int i = 0; i < 21; i++) { + for (int i = 0; i < 26; i++) { if (jj_la1[i] == jj_gen) { for (int j = 0; j < 32; j++) { if ((jj_la1_0[i] & (1<{@value #DEFAULT_SPLIT_ON_WHITESPACE}. + */ + public void setSplitOnWhitespace(boolean splitOnWhitespace) { + this.splitOnWhitespace = splitOnWhitespace; + } + + private boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE; + private static Set disallowedPostMultiTerm + = new HashSet(Arrays.asList(COLON, STAR, FUZZY_SLOP, CARAT, AND, OR)); + private static boolean allowedPostMultiTerm(int tokenKind) { + return disallowedPostMultiTerm.contains(tokenKind) == false; + } } PARSER_END(QueryParser) @@ -123,15 +150,14 @@ PARSER_END(QueryParser) /* ***************** */ <*> TOKEN : { - <#_NUM_CHAR: ["0"-"9"] > -// every character that follows a backslash is considered as an escaped character -| <#_ESCAPED_CHAR: "\\" ~[] > + <#_NUM_CHAR: ["0"-"9"] > +| <#_ESCAPED_CHAR: "\\" ~[] > // every character that follows a backslash is considered as an escaped character | <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^", "[", "]", "\"", "{", "}", "~", "*", "?", "\\", "/" ] - | <_ESCAPED_CHAR> ) > -| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) > -| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > -| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > + | <_ESCAPED_CHAR> ) > +| <#_TERM_CHAR: ( <_TERM_START_CHAR> | "-" | "+" ) > +| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > +| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > } SKIP : { @@ -139,37 +165,37 @@ PARSER_END(QueryParser) } TOKEN : { - -| -| -| -| -| > -| -| -| -| -| : Boost -| )* "\""> -| (<_TERM_CHAR>)* > -| )+ (( 
"." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > -| (<_TERM_CHAR>)* "*" ) > -| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > -| + +| +| +| +| +| > +| +| +| +| +| : Boost +| )* "\""> +| (<_TERM_CHAR>)* > +| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > +| (<_TERM_CHAR>)* "*" ) > +| | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > +| | : Range | : Range } TOKEN : { -)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT + )+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT } TOKEN : { - -| : DEFAULT -| : DEFAULT + +| : DEFAULT +| : DEFAULT | -| +| } // * Query ::= ( Clause )* @@ -191,23 +217,20 @@ int Modifiers() : { } { [ - { ret = MOD_REQ; } - | { ret = MOD_NOT; } - | { ret = MOD_NOT; } + { ret = MOD_REQ; } + | { ret = MOD_NOT; } + | { ret = MOD_NOT; } ] { return ret; } } // This makes sure that there is no garbage after the query string -Query TopLevelQuery(String field) : -{ +Query TopLevelQuery(String field) : { Query q; } { q=Query(field) - { - return q; - } + { return q; } } Query Query(String field) : @@ -217,23 +240,30 @@ Query Query(String field) : int conj, mods; } { - mods=Modifiers() q=Clause(field) - { - addClause(clauses, CONJ_NONE, mods, q); - if (mods == MOD_NONE) - firstQuery=q; - } ( - conj=Conjunction() mods=Modifiers() q=Clause(field) - { addClause(clauses, conj, mods, q); } - )* - { - if (clauses.size() == 1 && firstQuery != null) - return firstQuery; - else { - return getBooleanQuery(clauses); + LOOKAHEAD(2) + firstQuery=MultiTerm(field, clauses) + | mods=Modifiers() q=Clause(field) + { + addClause(clauses, CONJ_NONE, mods, q); + if (mods == MOD_NONE) { + firstQuery = q; + } } + ) + ( + LOOKAHEAD(2) + MultiTerm(field, clauses) + | conj=Conjunction() mods=Modifiers() q=Clause(field) + { addClause(clauses, conj, mods, q); } + )* + { + if (clauses.size() == 1 && firstQuery != null) { + return firstQuery; + } else { + return getBooleanQuery(clauses); } + } } Query Clause(String field) : { @@ -244,20 +274,17 @@ Query Clause(String 
field) : { [ LOOKAHEAD(2) ( - fieldToken= {field=discardEscapeChar(fieldToken.image);} - | {field="*";} + fieldToken= {field=discardEscapeChar(fieldToken.image);} + | {field="*";} ) ] - ( - q=Term(field) - | q=Query(field) ( boost=)? - + q=Term(field) + | q=Query(field) [ boost= ] ) - { return handleBoost(q, boost); } + { return handleBoost(q, boost); } } - Query Term(String field) : { Token term, boost=null, fuzzySlop=null, goop1, goop2; boolean prefix = false; @@ -270,45 +297,85 @@ Query Term(String field) : { } { ( - ( - term= - | term= { wildcard=true; } - | term= { prefix=true; } - | term= { wildcard=true; } - | term= { regexp=true; } - | term= - | term= { term.image = term.image.substring(0,1); } - ) - [ fuzzySlop= { fuzzy=true; } ] - [ boost= [ fuzzySlop= { fuzzy=true; } ] ] - { - q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); - } - | ( ( {startInc=true;} | ) - ( goop1=|goop1= ) - [ ] - ( goop2=|goop2= ) - ( {endInc=true;} | )) - [ boost= ] - { - boolean startOpen=false; - boolean endOpen=false; - if (goop1.kind == RANGE_QUOTED) { - goop1.image = goop1.image.substring(1, goop1.image.length()-1); - } else if ("*".equals(goop1.image)) { - startOpen=true; - } - if (goop2.kind == RANGE_QUOTED) { - goop2.image = goop2.image.substring(1, goop2.image.length()-1); - } else if ("*".equals(goop2.image)) { - endOpen=true; - } - q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? 
null : discardEscapeChar(goop2.image), startInc, endInc); - } - | term= - [ fuzzySlop= ] - [ boost= ] - { q = handleQuotedTerm(field, term, fuzzySlop); } + ( + term= + | term= { wildcard=true; } + | term= { prefix=true; } + | term= { wildcard=true; } + | term= { regexp=true; } + | term= + | term= { term.image = term.image.substring(0,1); } + ) + [ + boost= [ fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } [ boost= ] + ] + { q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); } + + | ( { startInc = true; } | ) + ( goop1= | goop1= ) + [ ] + ( goop2= | goop2= ) + ( { endInc = true; } | ) + [ boost= ] + { + boolean startOpen=false; + boolean endOpen=false; + if (goop1.kind == RANGE_QUOTED) { + goop1.image = goop1.image.substring(1, goop1.image.length()-1); + } else if ("*".equals(goop1.image)) { + startOpen=true; + } + if (goop2.kind == RANGE_QUOTED) { + goop2.image = goop2.image.substring(1, goop2.image.length()-1); + } else if ("*".equals(goop2.image)) { + endOpen=true; + } + q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? 
null : discardEscapeChar(goop2.image), startInc, endInc); + } + + | term= + [ + boost= [ fuzzySlop= { fuzzy=true; } ] + | fuzzySlop= { fuzzy=true; } [ boost= ] + ] + { q = handleQuotedTerm(field, term, fuzzySlop); } ) { return handleBoost(q, boost); } } + +/** Returns the first query if splitOnWhitespace=true or otherwise the entire produced query */ +Query MultiTerm(String field, List clauses) : { + Token text, whitespace, followingText; + Query firstQuery = null; +} +{ + text= + { + if (splitOnWhitespace) { + firstQuery = getFieldQuery(field, discardEscapeChar(text.image), false); + addClause(clauses, CONJ_NONE, MOD_NONE, firstQuery); + } + } + // Both lookaheads are required; the first lookahead vets the first following term and the second lookahead vets the rest + LOOKAHEAD({ getToken(1).kind == TERM && allowedPostMultiTerm(getToken(2).kind) }) + ( + LOOKAHEAD({ getToken(1).kind == TERM && allowedPostMultiTerm(getToken(2).kind) }) + followingText= + { + if (splitOnWhitespace) { + Query q = getFieldQuery(field, discardEscapeChar(followingText.image), false); + addClause(clauses, CONJ_NONE, MOD_NONE, q); + } else { // build up the text to send to analysis + text.image += " " + followingText.image; + } + } + )+ + { + if (splitOnWhitespace == false) { + firstQuery = getFieldQuery(field, discardEscapeChar(text.image), false); + addMultiTermClauses(clauses, firstQuery); + } + return firstQuery; + } +} \ No newline at end of file diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java index c00d88eecff..cdfa4776175 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java @@ -464,6 +464,45 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer throw new 
RuntimeException("Clause cannot be both required and prohibited"); } + /** + * Adds clauses generated from analysis over text containing whitespace. + * There are no operators, so the query's clauses can either be MUST (if the + * default operator is AND) or SHOULD (default OR). + * + * If all of the clauses in the given Query are TermQuery-s, this method flattens the result + * by adding the TermQuery-s individually to the output clause list; otherwise, the given Query + * is added as a single clause including its nested clauses. + */ + protected void addMultiTermClauses(List clauses, Query q) { + // We might have been passed a null query; the term might have been + // filtered away by the analyzer. + if (q == null) { + return; + } + boolean allNestedTermQueries = false; + if (q instanceof BooleanQuery) { + allNestedTermQueries = true; + for (BooleanClause clause : ((BooleanQuery)q).clauses()) { + if ( ! (clause.getQuery() instanceof TermQuery)) { + allNestedTermQueries = false; + break; + } + } + } + if (allNestedTermQueries) { + clauses.addAll(((BooleanQuery)q).clauses()); + } else { + BooleanClause.Occur occur = operator == OR_OPERATOR ? 
BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST; + if (q instanceof BooleanQuery) { + for (BooleanClause clause : ((BooleanQuery)q).clauses()) { + clauses.add(newBooleanClause(clause.getQuery(), occur)); + } + } else { + clauses.add(newBooleanClause(q, occur)); + } + } + } + /** * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserTokenManager.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserTokenManager.java index 8c8951e1b83..065ff8b4411 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserTokenManager.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserTokenManager.java @@ -285,7 +285,7 @@ private int jjMoveNfa_2(int startState, int curPos) jjCheckNAddTwoStates(33, 34); } else if (curChar == 92) - jjCheckNAddTwoStates(35, 35); + jjCheckNAdd(35); break; case 0: if ((0x97ffffff87ffffffL & l) != 0L) @@ -384,7 +384,7 @@ private int jjMoveNfa_2(int startState, int curPos) break; case 26: if (curChar == 92) - jjAddStates(27, 28); + jjstateSet[jjnewStateCnt++] = 27; break; case 27: if (kind > 21) @@ -400,7 +400,7 @@ private int jjMoveNfa_2(int startState, int curPos) break; case 29: if (curChar == 92) - jjAddStates(29, 30); + jjstateSet[jjnewStateCnt++] = 30; break; case 30: if (kind > 21) @@ -423,7 +423,7 @@ private int jjMoveNfa_2(int startState, int curPos) break; case 34: if (curChar == 92) - jjCheckNAddTwoStates(35, 35); + jjCheckNAdd(35); break; case 35: if (kind > 23) @@ -453,7 +453,7 @@ private int jjMoveNfa_2(int startState, int curPos) break; case 43: if (curChar == 92) - jjCheckNAddTwoStates(44, 44); + jjCheckNAdd(44); break; case 44: if (kind > 20) @@ -466,7 +466,7 @@ private int jjMoveNfa_2(int startState, int curPos) break; case 46: if (curChar == 92) - jjCheckNAddTwoStates(47, 47); + jjCheckNAdd(47); 
break; case 47: jjCheckNAddStates(18, 20); @@ -645,7 +645,7 @@ private int jjMoveNfa_0(int startState, int curPos) break; if (kind > 27) kind = 27; - jjAddStates(31, 32); + jjAddStates(27, 28); break; case 1: if (curChar == 46) @@ -799,11 +799,11 @@ private int jjMoveNfa_1(int startState, int curPos) break; case 2: if ((0xfffffffbffffffffL & l) != 0L) - jjCheckNAddStates(33, 35); + jjCheckNAddStates(29, 31); break; case 3: if (curChar == 34) - jjCheckNAddStates(33, 35); + jjCheckNAddStates(29, 31); break; case 5: if (curChar == 34 && kind > 31) @@ -836,7 +836,7 @@ private int jjMoveNfa_1(int startState, int curPos) jjCheckNAdd(6); break; case 2: - jjAddStates(33, 35); + jjAddStates(29, 31); break; case 4: if (curChar == 92) @@ -872,7 +872,7 @@ private int jjMoveNfa_1(int startState, int curPos) break; case 2: if (jjCanMove_1(hiByte, i1, i2, l1, l2)) - jjAddStates(33, 35); + jjAddStates(29, 31); break; case 6: if (!jjCanMove_1(hiByte, i1, i2, l1, l2)) @@ -899,9 +899,8 @@ private int jjMoveNfa_1(int startState, int curPos) } } static final int[] jjnextStates = { - 37, 39, 40, 17, 18, 20, 42, 45, 31, 46, 43, 22, 23, 25, 26, 24, - 25, 26, 45, 31, 46, 44, 47, 35, 22, 28, 29, 27, 27, 30, 30, 0, - 1, 2, 4, 5, + 37, 39, 40, 17, 18, 20, 42, 43, 45, 46, 31, 22, 23, 25, 26, 24, + 25, 26, 45, 46, 31, 44, 47, 35, 22, 28, 29, 0, 1, 2, 4, 5, }; private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2) { diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/Token.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/Token.java index aa57487f4c2..0e52ec21969 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/Token.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/Token.java @@ -128,4 +128,4 @@ public class Token implements java.io.Serializable { } } -/* JavaCC - OriginalChecksum=c1e1418b35aa9e47ef8dc98b87423d70 (do not edit this line) */ +/* JavaCC - 
OriginalChecksum=405bb5d2fcd84e94ac1c8f0b12c1f914 (do not edit this line) */ diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/TokenMgrError.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/TokenMgrError.java index 7101f098f6e..ad111d0cd26 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/TokenMgrError.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/TokenMgrError.java @@ -144,4 +144,4 @@ public class TokenMgrError extends Error this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); } } -/* JavaCC - OriginalChecksum=0c275864a1972d9a01601ab81426872d (do not edit this line) */ +/* JavaCC - OriginalChecksum=f433e1a52b8eadbf12f3fbbbf87fd140 (do not edit this line) */ diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java index 5b4eba87994..c3d7b37f5c1 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java @@ -18,6 +18,7 @@ package org.apache.lucene.queryparser.classic; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockSynonymAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -33,6 +34,7 @@ import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; @@ -44,7 +46,9 @@ import 
java.io.IOException; * Tests QueryParser. */ public class TestQueryParser extends QueryParserTestBase { - + + protected boolean splitOnWhitespace = QueryParser.DEFAULT_SPLIT_ON_WHITESPACE; + public static class QPTestParser extends QueryParser { public QPTestParser(String f, Analyzer a) { super(f, a); @@ -67,6 +71,7 @@ public class TestQueryParser extends QueryParserTestBase { if (a == null) a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); QueryParser qp = new QueryParser(getDefaultField(), a); qp.setDefaultOperator(QueryParserBase.OR_OPERATOR); + qp.setSplitOnWhitespace(splitOnWhitespace); return qp; } @@ -310,18 +315,7 @@ public class TestQueryParser extends QueryParserTestBase { Query unexpanded = new TermQuery(new Term("field", "dogs")); assertEquals(unexpanded, smart.parse("\"dogs\"")); } - - // TODO: fold these into QueryParserTestBase - - /** adds synonym of "dog" for "dogs". */ - static class MockSynonymAnalyzer extends Analyzer { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - MockTokenizer tokenizer = new MockTokenizer(); - return new TokenStreamComponents(tokenizer, new MockSynonymFilter(tokenizer)); - } - } - + /** simple synonyms test */ public void testSynonyms() throws Exception { Query expected = new SynonymQuery(new Term("field", "dogs"), new Term("field", "dog")); @@ -483,4 +477,229 @@ public class TestQueryParser extends QueryParserTestBase { qp.parse("a*aaaaaaa"); }); } -} + + // TODO: Remove this specialization once the flexible standard parser gets multi-word synonym support + @Override + public void testQPA() throws Exception { + boolean oldSplitOnWhitespace = splitOnWhitespace; + splitOnWhitespace = false; + + assertQueryEquals("term phrase term", qpAnalyzer, "term phrase1 phrase2 term"); + + CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer); + setDefaultOperatorAND(cqpc); + assertQueryEquals(cqpc, "field", "term phrase term", "+term +phrase1 +phrase2 +term"); + + 
splitOnWhitespace = oldSplitOnWhitespace; + } + + // TODO: Move to QueryParserTestBase once standard flexible parser gets this capability + public void testMultiWordSynonyms() throws Exception { + QueryParser dumb = new QueryParser("field", new Analyzer1()); + dumb.setSplitOnWhitespace(false); + + // A multi-word synonym source will form a synonym query for the same-starting-position tokens + BooleanQuery.Builder multiWordExpandedBqBuilder = new BooleanQuery.Builder(); + Query multiWordSynonymQuery = new SynonymQuery(new Term("field", "guinea"), new Term("field", "cavy")); + multiWordExpandedBqBuilder.add(multiWordSynonymQuery, BooleanClause.Occur.SHOULD); + multiWordExpandedBqBuilder.add(new TermQuery(new Term("field", "pig")), BooleanClause.Occur.SHOULD); + Query multiWordExpandedBq = multiWordExpandedBqBuilder.build(); + assertEquals(multiWordExpandedBq, dumb.parse("guinea pig")); + + // With the phrase operator, a multi-word synonym source will form a multiphrase query. + // When the number of expanded term(s) is different from that of the original term(s), this is not good. 
+ MultiPhraseQuery.Builder multiWordExpandedMpqBuilder = new MultiPhraseQuery.Builder(); + multiWordExpandedMpqBuilder.add(new Term[]{new Term("field", "guinea"), new Term("field", "cavy")}); + multiWordExpandedMpqBuilder.add(new Term("field", "pig")); + Query multiWordExpandedMPQ = multiWordExpandedMpqBuilder.build(); + assertEquals(multiWordExpandedMPQ, dumb.parse("\"guinea pig\"")); + + // custom behavior, the synonyms are expanded, unless you use quote operator + QueryParser smart = new SmartQueryParser(); + smart.setSplitOnWhitespace(false); + assertEquals(multiWordExpandedBq, smart.parse("guinea pig")); + + PhraseQuery.Builder multiWordUnexpandedPqBuilder = new PhraseQuery.Builder(); + multiWordUnexpandedPqBuilder.add(new Term("field", "guinea")); + multiWordUnexpandedPqBuilder.add(new Term("field", "pig")); + Query multiWordUnexpandedPq = multiWordUnexpandedPqBuilder.build(); + assertEquals(multiWordUnexpandedPq, smart.parse("\"guinea pig\"")); + } + + // TODO: Move to QueryParserTestBase once standard flexible parser gets this capability + public void testOperatorsAndMultiWordSynonyms() throws Exception { + Analyzer a = new MockSynonymAnalyzer(); + + boolean oldSplitOnWhitespace = splitOnWhitespace; + splitOnWhitespace = false; + + // Operators should interrupt multiword analysis of adjacent words if they associate + assertQueryEquals("+guinea pig", a, "+guinea pig"); + assertQueryEquals("-guinea pig", a, "-guinea pig"); + assertQueryEquals("!guinea pig", a, "-guinea pig"); + assertQueryEquals("guinea* pig", a, "guinea* pig"); + assertQueryEquals("guinea? pig", a, "guinea? 
pig"); + assertQueryEquals("guinea~2 pig", a, "guinea~2 pig"); + assertQueryEquals("guinea^2 pig", a, "(guinea)^2.0 pig"); + + assertQueryEquals("guinea +pig", a, "guinea +pig"); + assertQueryEquals("guinea -pig", a, "guinea -pig"); + assertQueryEquals("guinea !pig", a, "guinea -pig"); + assertQueryEquals("guinea pig*", a, "guinea pig*"); + assertQueryEquals("guinea pig?", a, "guinea pig?"); + assertQueryEquals("guinea pig~2", a, "guinea pig~2"); + assertQueryEquals("guinea pig^2", a, "guinea (pig)^2.0"); + + assertQueryEquals("field:guinea pig", a, "guinea pig"); + assertQueryEquals("guinea field:pig", a, "guinea pig"); + + assertQueryEquals("NOT guinea pig", a, "-guinea pig"); + assertQueryEquals("guinea NOT pig", a, "guinea -pig"); + + assertQueryEquals("guinea pig AND dogs", a, "guinea +pig +Synonym(dog dogs)"); + assertQueryEquals("dogs AND guinea pig", a, "+Synonym(dog dogs) +guinea pig"); + assertQueryEquals("guinea pig && dogs", a, "guinea +pig +Synonym(dog dogs)"); + assertQueryEquals("dogs && guinea pig", a, "+Synonym(dog dogs) +guinea pig"); + + assertQueryEquals("guinea pig OR dogs", a, "guinea pig Synonym(dog dogs)"); + assertQueryEquals("dogs OR guinea pig", a, "Synonym(dog dogs) guinea pig"); + assertQueryEquals("guinea pig || dogs", a, "guinea pig Synonym(dog dogs)"); + assertQueryEquals("dogs || guinea pig", a, "Synonym(dog dogs) guinea pig"); + + assertQueryEquals("\"guinea\" pig", a, "guinea pig"); + assertQueryEquals("guinea \"pig\"", a, "guinea pig"); + + assertQueryEquals("(guinea) pig", a, "guinea pig"); + assertQueryEquals("guinea (pig)", a, "guinea pig"); + + assertQueryEquals("/guinea/ pig", a, "/guinea/ pig"); + assertQueryEquals("guinea /pig/", a, "guinea /pig/"); + + // Operators should not interrupt multiword analysis if not don't associate + assertQueryEquals("(guinea pig)", a, "Synonym(cavy guinea) pig"); + assertQueryEquals("+(guinea pig)", a, "+(Synonym(cavy guinea) pig)"); + assertQueryEquals("-(guinea pig)", a, "-(Synonym(cavy 
guinea) pig)"); + assertQueryEquals("!(guinea pig)", a, "-(Synonym(cavy guinea) pig)"); + assertQueryEquals("NOT (guinea pig)", a, "-(Synonym(cavy guinea) pig)"); + assertQueryEquals("(guinea pig)^2", a, "(Synonym(cavy guinea) pig)^2.0"); + + assertQueryEquals("field:(guinea pig)", a, "Synonym(cavy guinea) pig"); + + assertQueryEquals("+small guinea pig", a, "+small Synonym(cavy guinea) pig"); + assertQueryEquals("-small guinea pig", a, "-small Synonym(cavy guinea) pig"); + assertQueryEquals("!small guinea pig", a, "-small Synonym(cavy guinea) pig"); + assertQueryEquals("NOT small guinea pig", a, "-small Synonym(cavy guinea) pig"); + assertQueryEquals("small* guinea pig", a, "small* Synonym(cavy guinea) pig"); + assertQueryEquals("small? guinea pig", a, "small? Synonym(cavy guinea) pig"); + assertQueryEquals("\"small\" guinea pig", a, "small Synonym(cavy guinea) pig"); + + assertQueryEquals("guinea pig +running", a, "Synonym(cavy guinea) pig +running"); + assertQueryEquals("guinea pig -running", a, "Synonym(cavy guinea) pig -running"); + assertQueryEquals("guinea pig !running", a, "Synonym(cavy guinea) pig -running"); + assertQueryEquals("guinea pig NOT running", a, "Synonym(cavy guinea) pig -running"); + assertQueryEquals("guinea pig running*", a, "Synonym(cavy guinea) pig running*"); + assertQueryEquals("guinea pig running?", a, "Synonym(cavy guinea) pig running?"); + assertQueryEquals("guinea pig \"running\"", a, "Synonym(cavy guinea) pig running"); + + assertQueryEquals("\"guinea pig\"~2", a, "\"(guinea cavy) pig\"~2"); + + assertQueryEquals("field:\"guinea pig\"", a, "\"(guinea cavy) pig\""); + + splitOnWhitespace = oldSplitOnWhitespace; + } + + public void testOperatorsAndMultiWordSynonymsSplitOnWhitespace() throws Exception { + Analyzer a = new MockSynonymAnalyzer(); + + boolean oldSplitOnWhitespace = splitOnWhitespace; + splitOnWhitespace = true; + + assertQueryEquals("+guinea pig", a, "+guinea pig"); + assertQueryEquals("-guinea pig", a, "-guinea pig"); + 
assertQueryEquals("!guinea pig", a, "-guinea pig"); + assertQueryEquals("guinea* pig", a, "guinea* pig"); + assertQueryEquals("guinea? pig", a, "guinea? pig"); + assertQueryEquals("guinea~2 pig", a, "guinea~2 pig"); + assertQueryEquals("guinea^2 pig", a, "(guinea)^2.0 pig"); + + assertQueryEquals("guinea +pig", a, "guinea +pig"); + assertQueryEquals("guinea -pig", a, "guinea -pig"); + assertQueryEquals("guinea !pig", a, "guinea -pig"); + assertQueryEquals("guinea pig*", a, "guinea pig*"); + assertQueryEquals("guinea pig?", a, "guinea pig?"); + assertQueryEquals("guinea pig~2", a, "guinea pig~2"); + assertQueryEquals("guinea pig^2", a, "guinea (pig)^2.0"); + + assertQueryEquals("field:guinea pig", a, "guinea pig"); + assertQueryEquals("guinea field:pig", a, "guinea pig"); + + assertQueryEquals("NOT guinea pig", a, "-guinea pig"); + assertQueryEquals("guinea NOT pig", a, "guinea -pig"); + + assertQueryEquals("guinea pig AND dogs", a, "guinea +pig +Synonym(dog dogs)"); + assertQueryEquals("dogs AND guinea pig", a, "+Synonym(dog dogs) +guinea pig"); + assertQueryEquals("guinea pig && dogs", a, "guinea +pig +Synonym(dog dogs)"); + assertQueryEquals("dogs && guinea pig", a, "+Synonym(dog dogs) +guinea pig"); + + assertQueryEquals("guinea pig OR dogs", a, "guinea pig Synonym(dog dogs)"); + assertQueryEquals("dogs OR guinea pig", a, "Synonym(dog dogs) guinea pig"); + assertQueryEquals("guinea pig || dogs", a, "guinea pig Synonym(dog dogs)"); + assertQueryEquals("dogs || guinea pig", a, "Synonym(dog dogs) guinea pig"); + + assertQueryEquals("\"guinea\" pig", a, "guinea pig"); + assertQueryEquals("guinea \"pig\"", a, "guinea pig"); + + assertQueryEquals("(guinea) pig", a, "guinea pig"); + assertQueryEquals("guinea (pig)", a, "guinea pig"); + + assertQueryEquals("/guinea/ pig", a, "/guinea/ pig"); + assertQueryEquals("guinea /pig/", a, "guinea /pig/"); + + assertQueryEquals("(guinea pig)", a, "guinea pig"); + assertQueryEquals("+(guinea pig)", a, "+(guinea pig)"); + 
assertQueryEquals("-(guinea pig)", a, "-(guinea pig)"); + assertQueryEquals("!(guinea pig)", a, "-(guinea pig)"); + assertQueryEquals("NOT (guinea pig)", a, "-(guinea pig)"); + assertQueryEquals("(guinea pig)^2", a, "(guinea pig)^2.0"); + + assertQueryEquals("field:(guinea pig)", a, "guinea pig"); + + assertQueryEquals("+small guinea pig", a, "+small guinea pig"); + assertQueryEquals("-small guinea pig", a, "-small guinea pig"); + assertQueryEquals("!small guinea pig", a, "-small guinea pig"); + assertQueryEquals("NOT small guinea pig", a, "-small guinea pig"); + assertQueryEquals("small* guinea pig", a, "small* guinea pig"); + assertQueryEquals("small? guinea pig", a, "small? guinea pig"); + assertQueryEquals("\"small\" guinea pig", a, "small guinea pig"); + + assertQueryEquals("guinea pig +running", a, "guinea pig +running"); + assertQueryEquals("guinea pig -running", a, "guinea pig -running"); + assertQueryEquals("guinea pig !running", a, "guinea pig -running"); + assertQueryEquals("guinea pig NOT running", a, "guinea pig -running"); + assertQueryEquals("guinea pig running*", a, "guinea pig running*"); + assertQueryEquals("guinea pig running?", a, "guinea pig running?"); + assertQueryEquals("guinea pig \"running\"", a, "guinea pig running"); + + assertQueryEquals("\"guinea pig\"~2", a, "\"(guinea cavy) pig\"~2"); + + assertQueryEquals("field:\"guinea pig\"", a, "\"(guinea cavy) pig\""); + + splitOnWhitespace = oldSplitOnWhitespace; + } + + public void testDefaultSplitOnWhitespace() throws Exception { + QueryParser parser = new QueryParser("field", new Analyzer1()); + + assertTrue(parser.getSplitOnWhitespace()); // default is true + + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + bqBuilder.add(new TermQuery(new Term("field", "guinea")), BooleanClause.Occur.SHOULD); + bqBuilder.add(new TermQuery(new Term("field", "pig")), BooleanClause.Occur.SHOULD); + assertEquals(bqBuilder.build(), parser.parse("guinea pig")); + + boolean oldSplitOnWhitespace = 
splitOnWhitespace; + splitOnWhitespace = QueryParser.DEFAULT_SPLIT_ON_WHITESPACE; + assertQueryEquals("guinea pig", new MockSynonymAnalyzer(), "guinea pig"); + splitOnWhitespace = oldSplitOnWhitespace; + } +} \ No newline at end of file diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/TestExtendableQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/TestExtendableQueryParser.java index 785dd1c23dc..934a4dac254 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/TestExtendableQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/ext/TestExtendableQueryParser.java @@ -50,6 +50,7 @@ public class TestExtendableQueryParser extends TestQueryParser { getDefaultField(), a) : new ExtendableQueryParser( getDefaultField(), a, extensions); qp.setDefaultOperator(QueryParserBase.OR_OPERATOR); + qp.setSplitOnWhitespace(splitOnWhitespace); return qp; } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java index 25c737f214c..78d2bfda628 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestStandardQP.java @@ -203,4 +203,15 @@ public class TestStandardQP extends QueryParserTestBase { //TODO test something like "SmartQueryParser()" } + // TODO: Remove this specialization once the flexible standard parser gets multi-word synonym support + @Override + public void testQPA() throws Exception { + super.testQPA(); + + assertQueryEquals("term phrase term", qpAnalyzer, "term (phrase1 phrase2) term"); + + CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer); + setDefaultOperatorAND(cqpc); + assertQueryEquals(cqpc, "field", "term phrase term", "+term +(+phrase1 +phrase2) +term"); + } } 
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java index 70dc15a7cfe..f1eccf467ce 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java @@ -27,7 +27,6 @@ import java.util.TimeZone; import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -535,8 +534,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase { assertQueryEquals("term -(stop) term", qpAnalyzer, "term term"); assertQueryEquals("drop AND stop AND roll", qpAnalyzer, "+drop +roll"); - assertQueryEquals("term phrase term", qpAnalyzer, - "term (phrase1 phrase2) term"); + +// TODO: Re-enable once flexible standard parser gets multi-word synonym support +// assertQueryEquals("term phrase term", qpAnalyzer, +// "term phrase1 phrase2 term"); assertQueryEquals("term AND NOT phrase term", qpAnalyzer, "+term -(phrase1 phrase2) term"); assertQueryEquals("stop^3", qpAnalyzer, ""); @@ -552,8 +553,9 @@ public abstract class QueryParserTestBase extends LuceneTestCase { CommonQueryParserConfiguration cqpc = getParserConfig(qpAnalyzer); setDefaultOperatorAND(cqpc); - assertQueryEquals(cqpc, "field", "term phrase term", - "+term +(+phrase1 +phrase2) +term"); +// TODO: Re-enable once flexible standard parser gets multi-word synonym support +// assertQueryEquals(cqpc, "field", "term phrase term", +// "+term +phrase1 +phrase2 +term"); assertQueryEquals(cqpc, "field", "phrase", "+phrase1 +phrase2"); } @@ -1101,37 
+1103,6 @@ public abstract class QueryParserTestBase extends LuceneTestCase { dir.close(); } - /** - * adds synonym of "dog" for "dogs". - */ - protected static class MockSynonymFilter extends TokenFilter { - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - boolean addSynonym = false; - - public MockSynonymFilter(TokenStream input) { - super(input); - } - - @Override - public final boolean incrementToken() throws IOException { - if (addSynonym) { // inject our synonym - clearAttributes(); - termAtt.setEmpty().append("dog"); - posIncAtt.setPositionIncrement(0); - addSynonym = false; - return true; - } - - if (input.incrementToken()) { - addSynonym = termAtt.toString().equals("dogs"); - return true; - } else { - return false; - } - } - } - /** whitespace+lowercase analyzer with synonyms */ protected class Analyzer1 extends Analyzer { public Analyzer1(){ @@ -1251,10 +1222,8 @@ public abstract class QueryParserTestBase extends LuceneTestCase { CharacterRunAutomaton stopStopList = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton()); - CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList)); - - qp = getParserConfig( - new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList)); + CommonQueryParserConfiguration qp + = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList)); qp.setEnablePositionIncrements(true); PhraseQuery.Builder phraseQuery = new PhraseQuery.Builder(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymAnalyzer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymAnalyzer.java new file mode 100644 index 00000000000..a2ce33e74e2 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymAnalyzer.java @@ -0,0 +1,28 @@ +package 
org.apache.lucene.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** adds synonym of "dog" for "dogs", and synonym of "cavy" for "guinea pig". */ +public class MockSynonymAnalyzer extends Analyzer { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + MockTokenizer tokenizer = new MockTokenizer(); + return new TokenStreamComponents(tokenizer, new MockSynonymFilter(tokenizer)); + } +} + diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java new file mode 100644 index 00000000000..b50be0735dd --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java @@ -0,0 +1,97 @@ +package org.apache.lucene.analysis; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.AttributeSource; + +/** adds synonym of "dog" for "dogs", and synonym of "cavy" for "guinea pig". */ +public class MockSynonymFilter extends TokenFilter { + CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); + List<AttributeSource> tokenQueue = new ArrayList<>(); + boolean endOfInput = false; + + public MockSynonymFilter(TokenStream input) { + super(input); + } + + @Override + public void reset() throws IOException { + super.reset(); + tokenQueue.clear(); + endOfInput = false; + } + + @Override + public final boolean incrementToken() throws IOException { + if (tokenQueue.size() > 0) { + tokenQueue.remove(0).copyTo(this); + return true; + } + if (endOfInput == false && input.incrementToken()) { + if (termAtt.toString().equals("dogs")) { + addSynonymAndRestoreOrigToken("dog", 1, offsetAtt.endOffset()); + } else if (termAtt.toString().equals("guinea")) { + AttributeSource firstSavedToken = cloneAttributes(); + if
(input.incrementToken()) { + if (termAtt.toString().equals("pig")) { + AttributeSource secondSavedToken = cloneAttributes(); + int secondEndOffset = offsetAtt.endOffset(); + firstSavedToken.copyTo(this); + addSynonym("cavy", 2, secondEndOffset); + tokenQueue.add(secondSavedToken); + } else if (termAtt.toString().equals("dogs")) { + tokenQueue.add(cloneAttributes()); + addSynonym("dog", 1, offsetAtt.endOffset()); + } + } else { + endOfInput = true; + } + firstSavedToken.copyTo(this); + } + return true; + } else { + endOfInput = true; + return false; + } + } + private void addSynonym(String synonymText, int posLen, int endOffset) { + termAtt.setEmpty().append(synonymText); + posIncAtt.setPositionIncrement(0); + posLenAtt.setPositionLength(posLen); + offsetAtt.setOffset(offsetAtt.startOffset(), endOffset); + tokenQueue.add(cloneAttributes()); + } + private void addSynonymAndRestoreOrigToken(String synonymText, int posLen, int endOffset) { + AttributeSource origToken = cloneAttributes(); + addSynonym(synonymText, posLen, endOffset); + origToken.copyTo(this); + } +} + + diff --git a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java new file mode 100644 index 00000000000..fb0d0657744 --- /dev/null +++ b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis; + +import java.io.IOException; + +/** test the mock synonym filter */ +public class TestMockSynonymFilter extends BaseTokenStreamTestCase { + + /** test the mock synonym filter */ + public void test() throws IOException { + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + MockTokenizer tokenizer = new MockTokenizer(); + return new TokenStreamComponents(tokenizer, new MockSynonymFilter(tokenizer)); + } + }; + + assertAnalyzesTo(analyzer, "dogs", + new String[]{"dogs", "dog"}, + new int[]{0, 0}, // start offset + new int[]{4, 4}, // end offset + null, + new int[]{1, 0}, // position increment + new int[]{1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "small dogs", + new String[]{"small", "dogs", "dog"}, + new int[]{0, 6, 6}, // start offset + new int[]{5, 10, 10}, // end offset + null, + new int[]{1, 1, 0}, // position increment + new int[]{1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "dogs running", + new String[]{"dogs", "dog", "running"}, + new int[]{0, 0, 5}, // start offset + new int[]{4, 4, 12}, // end offset + null, + new int[]{1, 0, 1}, // position increment + new int[]{1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "small dogs running", + new String[]{"small", "dogs", "dog", "running"}, + new int[]{0, 6, 6, 11}, // start offset + new int[]{5, 10, 10, 18}, // end offset 
+ null, + new int[]{1, 1, 0, 1}, // position increment + new int[]{1, 1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "guinea", + new String[]{"guinea"}, + new int[]{0}, // start offset + new int[]{6}, // end offset + null, + new int[]{1}, // position increment + new int[]{1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "pig", + new String[]{"pig"}, + new int[]{0}, // start offset + new int[]{3}, // end offset + null, + new int[]{1}, // position increment + new int[]{1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "guinea pig", + new String[]{"guinea", "cavy", "pig"}, + new int[]{0, 0, 7}, // start offset + new int[]{6, 10, 10}, // end offset + null, + new int[]{1, 0, 1}, // position increment + new int[]{1, 2, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "guinea dogs", + new String[]{"guinea", "dogs", "dog"}, + new int[]{0, 7, 7}, // start offset + new int[]{6, 11, 11}, // end offset + null, + new int[]{1, 1, 0}, // position increment + new int[]{1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "dogs guinea", + new String[]{"dogs", "dog", "guinea"}, + new int[]{0, 0, 5}, // start offset + new int[]{4, 4, 11}, // end offset + null, + new int[]{1, 0, 1}, // position increment + new int[]{1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "dogs guinea pig", + new String[]{"dogs", "dog", "guinea", "cavy", "pig"}, + new int[]{0, 0, 5, 5, 12}, // start offset + new int[]{4, 4, 11, 15, 15}, // end offset + null, + new int[]{1, 0, 1, 0, 1}, // position increment + new int[]{1, 1, 1, 2, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "guinea pig dogs", + new String[]{"guinea", "cavy", "pig", "dogs", "dog"}, + new 
int[]{0, 0, 7, 11, 11}, // start offset + new int[]{6, 10, 10, 15, 15}, // end offset + null, + new int[]{1, 0, 1, 1, 0}, // position increment + new int[]{1, 2, 1, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "small dogs and guinea pig running", + new String[]{"small", "dogs", "dog", "and", "guinea", "cavy", "pig", "running"}, + new int[]{0, 6, 6, 11, 15, 15, 22, 26}, // start offset + new int[]{5, 10, 10, 14, 21, 25, 25, 33}, // end offset + null, + new int[]{1, 1, 0, 1, 1, 0, 1, 1}, // position increment + new int[]{1, 1, 1, 1, 1, 2, 1, 1}, // position length + true); // check that offsets are correct + + assertAnalyzesTo(analyzer, "small guinea pig and dogs running", + new String[]{"small", "guinea", "cavy", "pig", "and", "dogs", "dog", "running"}, + new int[]{0, 6, 6, 13, 17, 21, 21, 26}, // start offset + new int[]{5, 12, 16, 16, 20, 25, 25, 33}, // end offset + null, + new int[]{1, 1, 0, 1, 1, 1, 0, 1}, // position increment + new int[]{1, 1, 2, 1, 1, 1, 1, 1}, // position length + true); // check that offsets are correct + } +}