diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7d76af0b813..a4c3d2556f4 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -421,6 +421,8 @@ New features * LUCENE-2862: Added TermsEnum.totalTermFreq() and Terms.getSumTotalTermFreq(). (Mike McCandless, Robert Muir) +* LUCENE-3290: Added Terms.getSumDocFreq() (Mike McCandless, Robert Muir) + * LUCENE-3003: Added new expert class oal.index.DocTermsOrd, refactored from Solr's UnInvertedField, for accessing term ords for multi-valued fields, per document. This is similar to FieldCache in @@ -512,6 +514,11 @@ Bug fixes causing the file to sometimes be larger than it needed to be. (Mike McCandless) +New Features + +* LUCENE-3290: Added FieldInvertState.numUniqueTerms + (Mike McCandless, Robert Muir) + Optimizations * LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java index b66a7b1ed70..396b29ed58c 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java @@ -53,24 +53,13 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.queryParser.ParseException; -import org.apache.lucene.queryParser.QueryParser; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.FilteredQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MultiPhraseQuery; -import org.apache.lucene.search.MultiTermQuery; -import org.apache.lucene.search.NumericRangeQuery; -import org.apache.lucene.search.PhraseQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import 
org.apache.lucene.search.TermRangeFilter; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner; import org.apache.lucene.search.regex.RegexQuery; import org.apache.lucene.search.spans.*; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.BasicAutomata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; @@ -102,8 +91,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte public void testQueryScorerHits() throws Exception { Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true); - QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer); - query = qp.parse("\"very long\""); + + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(FIELD_NAME, "very")); + phraseQuery.add(new Term(FIELD_NAME, "long")); + + query = phraseQuery; searcher = new IndexSearcher(ramDir, true); TopDocs hits = searcher.search(query, 10); @@ -133,12 +126,15 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte String s1 = "I call our world Flatland, not because we call it so,"; - QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, new MockAnalyzer(random, MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)); - // Verify that a query against the default field results in text being // highlighted // regardless of the field name. 
- Query q = parser.parse("\"world Flatland\"~3"); + + PhraseQuery q = new PhraseQuery(); + q.setSlop(3); + q.add(new Term(FIELD_NAME, "world")); + q.add(new Term(FIELD_NAME, "flatland")); + String expected = "I call our world Flatland, not because we call it so,"; String observed = highlightField(q, "SOME_FIELD_NAME", s1); if (VERBOSE) System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed); @@ -150,7 +146,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte // when the query field name differs from the name of the field being // highlighted, // which in this example happens to be the default field name. - q = parser.parse("text:\"world Flatland\"~3"); + q = new PhraseQuery(); + q.setSlop(3); + q.add(new Term("text", "world")); + q.add(new Term("text", "flatland")); + expected = s1; observed = highlightField(q, FIELD_NAME, s1); if (VERBOSE) System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed); @@ -177,7 +177,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } public void testSimpleSpanHighlighter() throws Exception { - doSearching("Kennedy"); + doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy"))); int maxNumFragmentsRequired = 2; @@ -202,23 +202,49 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte // LUCENE-1752 public void testRepeatingTermsInMultBooleans() throws Exception { String content = "x y z a b c d e f g b c g"; - String ph1 = "\"a b c d\""; - String ph2 = "\"b c g\""; String f1 = "f1"; String f2 = "f2"; - String f1c = f1 + ":"; - String f2c = f2 + ":"; - String q = "(" + f1c + ph1 + " OR " + f2c + ph1 + ") AND (" + f1c + ph2 - + " OR " + f2c + ph2 + ")"; - Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false); - QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, f1, analyzer); - Query query = qp.parse(q); - QueryScorer scorer = new QueryScorer(query, f1); + 
PhraseQuery f1ph1 = new PhraseQuery(); + f1ph1.add(new Term(f1, "a")); + f1ph1.add(new Term(f1, "b")); + f1ph1.add(new Term(f1, "c")); + f1ph1.add(new Term(f1, "d")); + + PhraseQuery f2ph1 = new PhraseQuery(); + f2ph1.add(new Term(f2, "a")); + f2ph1.add(new Term(f2, "b")); + f2ph1.add(new Term(f2, "c")); + f2ph1.add(new Term(f2, "d")); + + PhraseQuery f1ph2 = new PhraseQuery(); + f1ph2.add(new Term(f1, "b")); + f1ph2.add(new Term(f1, "c")); + f1ph2.add(new Term(f1, "g")); + + PhraseQuery f2ph2 = new PhraseQuery(); + f2ph2.add(new Term(f2, "b")); + f2ph2.add(new Term(f2, "c")); + f2ph2.add(new Term(f2, "g")); + + BooleanQuery booleanQuery = new BooleanQuery(); + BooleanQuery leftChild = new BooleanQuery(); + leftChild.add(f1ph1, Occur.SHOULD); + leftChild.add(f2ph1, Occur.SHOULD); + booleanQuery.add(leftChild, Occur.MUST); + + BooleanQuery rightChild = new BooleanQuery(); + rightChild.add(f1ph2, Occur.SHOULD); + rightChild.add(f2ph2, Occur.SHOULD); + booleanQuery.add(rightChild, Occur.MUST); + + QueryScorer scorer = new QueryScorer(booleanQuery, f1); scorer.setExpandMultiTermQuery(false); Highlighter h = new Highlighter(this, scorer); + Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false); + h.getBestFragment(analyzer, f1, content); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", @@ -226,7 +252,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } public void testSimpleQueryScorerPhraseHighlighting() throws Exception { - doSearching("\"very long and contains\""); + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(FIELD_NAME, "very")); + phraseQuery.add(new Term(FIELD_NAME, "long")); + phraseQuery.add(new Term(FIELD_NAME, "contains"), 3); + doSearching(phraseQuery); int maxNumFragmentsRequired = 2; @@ -248,7 +278,14 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte numHighlights == 3); numHighlights = 0; - 
doSearching("\"This piece of text refers to Kennedy\""); + + phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(FIELD_NAME, "piece"), 1); + phraseQuery.add(new Term(FIELD_NAME, "text"), 3); + phraseQuery.add(new Term(FIELD_NAME, "refers"), 4); + phraseQuery.add(new Term(FIELD_NAME, "kennedy"), 6); + + doSearching(phraseQuery); maxNumFragmentsRequired = 2; @@ -270,7 +307,14 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte numHighlights == 4); numHighlights = 0; - doSearching("\"lets is a the lets is a the lets is a the lets\""); + + phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(FIELD_NAME, "lets")); + phraseQuery.add(new Term(FIELD_NAME, "lets"), 4); + phraseQuery.add(new Term(FIELD_NAME, "lets"), 8); + phraseQuery.add(new Term(FIELD_NAME, "lets"), 12); + + doSearching(phraseQuery); maxNumFragmentsRequired = 2; @@ -366,7 +410,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } public void testSimpleQueryScorerPhraseHighlighting2() throws Exception { - doSearching("\"text piece long\"~5"); + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.setSlop(5); + phraseQuery.add(new Term(FIELD_NAME, "text")); + phraseQuery.add(new Term(FIELD_NAME, "piece")); + phraseQuery.add(new Term(FIELD_NAME, "long")); + doSearching(phraseQuery); int maxNumFragmentsRequired = 2; @@ -388,7 +437,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } public void testSimpleQueryScorerPhraseHighlighting3() throws Exception { - doSearching("\"x y z\""); + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(FIELD_NAME, "x")); + phraseQuery.add(new Term(FIELD_NAME, "y")); + phraseQuery.add(new Term(FIELD_NAME, "z")); + doSearching(phraseQuery); int maxNumFragmentsRequired = 2; @@ -410,7 +463,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } public void testSimpleSpanFragmenter() throws Exception { - 
doSearching("\"piece of text that is very long\""); + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(FIELD_NAME, "piece")); + phraseQuery.add(new Term(FIELD_NAME, "text"), 2); + phraseQuery.add(new Term(FIELD_NAME, "very"), 5); + phraseQuery.add(new Term(FIELD_NAME, "long"), 6); + doSearching(phraseQuery); int maxNumFragmentsRequired = 2; @@ -428,8 +486,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte if (VERBOSE) System.out.println("\t" + result); } - - doSearching("\"been shot\""); + + phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(FIELD_NAME, "been")); + phraseQuery.add(new Term(FIELD_NAME, "shot")); + + doSearching(phraseQuery); maxNumFragmentsRequired = 2; @@ -451,7 +513,16 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte // position sensitive query added after position insensitive query public void testPosTermStdTerm() throws Exception { - doSearching("y \"x y z\""); + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(new TermQuery(new Term(FIELD_NAME, "y")), Occur.SHOULD); + + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(FIELD_NAME, "x")); + phraseQuery.add(new Term(FIELD_NAME, "y")); + phraseQuery.add(new Term(FIELD_NAME, "z")); + booleanQuery.add(phraseQuery, Occur.SHOULD); + + doSearching(booleanQuery); int maxNumFragmentsRequired = 2; @@ -525,7 +596,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } public void testSimpleQueryTermScorerHighlighter() throws Exception { - doSearching("Kennedy"); + doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy"))); Highlighter highlighter = new Highlighter(new QueryTermScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); int maxNumFragmentsRequired = 2; @@ -591,7 +662,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights
= 0; - doSearching("Kennedy"); + doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy"))); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); @@ -607,7 +678,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights = 0; - doSearching("Kinnedy~0.5"); + FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(FIELD_NAME, "kinnedy"), 0.5f); + fuzzyQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + doSearching(fuzzyQuery); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); @@ -623,7 +696,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights = 0; - doSearching("K?nnedy"); + WildcardQuery wildcardQuery = new WildcardQuery(new Term(FIELD_NAME, "k?nnedy")); + wildcardQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + doSearching(wildcardQuery); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); @@ -639,7 +714,9 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights = 0; - doSearching("K*dy"); + WildcardQuery wildcardQuery = new WildcardQuery(new Term(FIELD_NAME, "k*dy")); + wildcardQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + doSearching(wildcardQuery); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); @@ -660,9 
+737,15 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte // Need to explicitly set the QueryParser property to use TermRangeQuery // rather // than RangeFilters - QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer); - parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); - query = parser.parse(queryString); + + TermRangeQuery rangeQuery = new TermRangeQuery( + FIELD_NAME, + new BytesRef("kannedy"), + new BytesRef("kznnedy"), + true, true); + rangeQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + + query = rangeQuery; doSearching(query); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); @@ -772,7 +855,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights = 0; - doSearching("\"John Kennedy\""); + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(FIELD_NAME, "john")); + phraseQuery.add(new Term(FIELD_NAME, "kennedy")); + doSearching(phraseQuery); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); // Currently highlights "John" and "Kennedy" separately assertTrue("Failed to find correct number of highlights " + numHighlights + " found", @@ -874,7 +960,13 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights = 0; - doSearching("John Kenn*"); + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(new TermQuery(new Term(FIELD_NAME, "john")), Occur.SHOULD); + PrefixQuery prefixQuery = new PrefixQuery(new Term(FIELD_NAME, "kenn")); + prefixQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + booleanQuery.add(prefixQuery, Occur.SHOULD); + + doSearching(booleanQuery); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct 
number of highlights " + numHighlights + " found", numHighlights == 5); @@ -890,7 +982,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights = 0; - doSearching("JFK OR Kennedy"); + + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(FIELD_NAME, "jfk")), Occur.SHOULD); + query.add(new TermQuery(new Term(FIELD_NAME, "kennedy")), Occur.SHOULD); + + doSearching(query); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); @@ -905,7 +1002,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { - doSearching("Kennedy"); + doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy"))); numHighlights = 0; for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); @@ -1006,11 +1103,13 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte HashMap synonyms = new HashMap(); synonyms.put("football", "soccer,footie"); Analyzer analyzer = new SynonymAnalyzer(synonyms); - String srchkey = "football"; String s = "football-soccer in the euro 2004 footie competition"; - QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "bookid", analyzer); - Query query = parser.parse(srchkey); + + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term("bookid", "football")), Occur.SHOULD); + query.add(new TermQuery(new Term("bookid", "soccer")), Occur.SHOULD); + query.add(new TermQuery(new Term("bookid", "footie")), Occur.SHOULD); TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s)); @@ -1037,7 +1136,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights = 0; - 
doSearching("Kennedy"); + doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy"))); // new Highlighter(HighlighterTest.this, new QueryTermScorer(query)); for (int i = 0; i < hits.totalHits; i++) { @@ -1061,7 +1160,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { - doSearching("Kennedy"); + doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy"))); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); @@ -1102,7 +1201,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { numHighlights = 0; - doSearching("meat"); + doSearching(new TermQuery(new Term(FIELD_NAME, "meat"))); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0])); Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this);// new Highlighter(this, new @@ -1199,9 +1298,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte if (searcher != null) searcher.close(); searcher = new IndexSearcher(ramDir, true); Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); + + BooleanQuery query = new BooleanQuery(); + query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD); + query.add(new WildcardQuery(new Term(FIELD_NAME, "kenned*")), Occur.SHOULD); - QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer); - Query query = parser.parse("JF? or Kenned*"); if (VERBOSE) System.out.println("Searching with primitive query"); // forget to set this and... 
// query=query.rewrite(reader); @@ -1243,7 +1344,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { - doSearching("AnInvalidQueryWhichShouldYieldNoResults"); + doSearching(new TermQuery(new Term(FIELD_NAME, "aninvalidquerywhichshouldyieldnoresults"))); for (String text : texts) { TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); @@ -1313,8 +1414,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override public void run() throws Exception { String docMainText = "fred is one of the people"; - QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer); - Query query = parser.parse("fred category:people"); + + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(FIELD_NAME, "fred")), Occur.SHOULD); + query.add(new TermQuery(new Term("category", "people")), Occur.SHOULD); // highlighting respects fieldnames used in query @@ -1453,64 +1556,68 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte Highlighter highlighter; String result; - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("foo"); + query = new TermQuery(new Term("text", "foo")); highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("10"); + query = new TermQuery(new Term("text", "10")); highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, 
false)).parse("hi"); + query = new TermQuery(new Term("text", "hi")); highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("speed"); + query = new TermQuery(new Term("text", "speed")); highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hispeed"); + query = new TermQuery(new Term("text", "hispeed")); highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi speed"); + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(new TermQuery(new Term("text", "hi")), Occur.SHOULD); + booleanQuery.add(new TermQuery(new Term("text", "speed")), Occur.SHOULD); + + query = booleanQuery; highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); // ///////////////// same tests, just put the bigger overlapping token // first - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("foo"); + query = new TermQuery(new Term("text", "foo")); highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new 
QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("10"); + query = new TermQuery(new Term("text", "10")); highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi"); + query = new TermQuery(new Term("text", "hi")); highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("speed"); + query = new TermQuery(new Term("text", "speed")); highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hispeed"); + query = new TermQuery(new Term("text", "hispeed")); highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); - query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi speed"); + query = booleanQuery; highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo", result); @@ -1554,9 +1661,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } private void searchIndex() throws IOException, ParseException, InvalidTokenOffsetsException { - String q = 
"t_text1:random"; - QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "t_text1", a ); - Query query = parser.parse( q ); + Query query = new TermQuery(new Term("t_text1", "random")); IndexSearcher searcher = new IndexSearcher( dir, true ); // This scorer can return negative idf -> null fragment Scorer scorer = new QueryTermScorer( query, searcher.getIndexReader(), "t_text1" ); @@ -1608,14 +1713,6 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte return "" + originalText + ""; } - public void doSearching(String queryString) throws Exception { - QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer); - parser.setEnablePositionIncrements(true); - parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); - query = parser.parse(queryString); - doSearching(query); - } - public void doSearching(Query unReWrittenQuery) throws Exception { if (searcher != null) searcher.close(); searcher = new IndexSearcher(ramDir, true); diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java index 0f19ebfd459..d7fbe0b0909 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java @@ -19,7 +19,10 @@ package org.apache.lucene.search.vectorhighlight; import java.io.IOException; import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; import java.util.Collection; +import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; @@ -28,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import 
org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; @@ -44,6 +48,7 @@ import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; public abstract class AbstractTestCase extends LuceneTestCase { @@ -56,9 +61,7 @@ public abstract class AbstractTestCase extends LuceneTestCase { protected Analyzer analyzerB; protected Analyzer analyzerK; protected IndexReader reader; - protected QueryParser paW; - protected QueryParser paB; - + protected static final String[] shortMVValues = { "", "", @@ -90,8 +93,6 @@ public abstract class AbstractTestCase extends LuceneTestCase { analyzerW = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false); analyzerB = new BigramAnalyzer(); analyzerK = new MockAnalyzer(random, MockTokenizer.KEYWORD, false); - paW = new QueryParser(TEST_VERSION_CURRENT, F, analyzerW ); - paB = new QueryParser(TEST_VERSION_CURRENT, F, analyzerB ); dir = newDirectory(); } @@ -172,6 +173,40 @@ public abstract class AbstractTestCase extends LuceneTestCase { } } + /** + * Runs {@code text} through {@code analyzer} for {@code field} and returns one newly constructed BytesRef per emitted token, in token order; the stream is reset before consumption and ended and closed afterwards, and each term is copied because the attribute refills the same BytesRef instance for every token. + */ + protected List<BytesRef> analyze(String text, String field, Analyzer analyzer) throws IOException { + List<BytesRef> bytesRefs = new ArrayList<BytesRef>(); + + TokenStream tokenStream = analyzer.reusableTokenStream(field, new StringReader(text)); + TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class); + + BytesRef bytesRef = termAttribute.getBytesRef(); + + tokenStream.reset(); + while (tokenStream.incrementToken()) { + termAttribute.fillBytesRef(); + bytesRefs.add(new BytesRef(bytesRef)); + } + + tokenStream.end(); + tokenStream.close(); + + return bytesRefs; + } + + /** + * Builds a PhraseQuery on {@code field} by adding one Term per entry of {@code bytesRefs}, in list order. + */ + protected PhraseQuery toPhraseQuery(List<BytesRef> 
bytesRefs, String field) { + PhraseQuery phraseQuery = new PhraseQuery(); + for (BytesRef bytesRef : bytesRefs) { + phraseQuery.add(new Term(field, bytesRef)); + } + return phraseQuery; + } + static final class BigramAnalyzer extends Analyzer { @Override public TokenStream tokenStream(String fieldName, Reader reader) { diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java index 42924fdc8c9..ac0e69ddd81 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java @@ -22,19 +22,33 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; +import org.apache.lucene.util.BytesRef; public class FieldQueryTest extends AbstractTestCase { public void testFlattenBoolean() throws Exception { - Query query = paW.parse( "A AND B OR C NOT (D AND E)" ); - FieldQuery fq = new FieldQuery( query, true, true ); + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(new TermQuery(new Term(F, "A")), Occur.MUST); + booleanQuery.add(new TermQuery(new Term(F, "B")), Occur.MUST); + booleanQuery.add(new TermQuery(new Term(F, "C")), Occur.SHOULD); + + BooleanQuery innerQuery = new BooleanQuery(); + innerQuery.add(new TermQuery(new Term(F, "D")), Occur.MUST); + innerQuery.add(new TermQuery(new Term(F, "E")), 
Occur.MUST); + booleanQuery.add(innerQuery, Occur.MUST_NOT); + + FieldQuery fq = new FieldQuery(booleanQuery, true, true ); Set flatQueries = new HashSet(); - fq.flatten( query, flatQueries ); + fq.flatten(booleanQuery, flatQueries); assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), tq( "C" ) ); } @@ -47,15 +61,25 @@ public class FieldQueryTest extends AbstractTestCase { } public void testFlattenTermAndPhrase() throws Exception { - Query query = paW.parse( "A AND \"B C\"" ); - FieldQuery fq = new FieldQuery( query, true, true ); + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(new TermQuery(new Term(F, "A")), Occur.MUST); + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(F, "B")); + phraseQuery.add(new Term(F, "C")); + booleanQuery.add(phraseQuery, Occur.MUST); + + FieldQuery fq = new FieldQuery(booleanQuery, true, true ); Set flatQueries = new HashSet(); - fq.flatten( query, flatQueries ); + fq.flatten(booleanQuery, flatQueries); assertCollectionQueries( flatQueries, tq( "A" ), pqF( "B", "C" ) ); } public void testFlattenTermAndPhrase2gram() throws Exception { - Query query = paB.parse( "AA AND \"BCD\" OR \"EFGH\"" ); + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(F, "AA")), Occur.MUST); + query.add(toPhraseQuery(analyze("BCD", F, analyzerB), F), Occur.MUST); + query.add(toPhraseQuery(analyze("EFGH", F, analyzerB), F), Occur.SHOULD); + FieldQuery fq = new FieldQuery( query, true, true ); Set flatQueries = new HashSet(); fq.flatten( query, flatQueries ); @@ -232,7 +256,16 @@ public class FieldQueryTest extends AbstractTestCase { } public void testGetTermSet() throws Exception { - Query query = paW.parse( "A AND B OR x:C NOT (D AND E)" ); + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(F, "A")), Occur.MUST); + query.add(new TermQuery(new Term(F, "B")), Occur.MUST); + query.add(new TermQuery(new Term("x", "C")), Occur.SHOULD); + + BooleanQuery innerQuery 
= new BooleanQuery(); + innerQuery.add(new TermQuery(new Term(F, "D")), Occur.MUST); + innerQuery.add(new TermQuery(new Term(F, "E")), Occur.MUST); + query.add(innerQuery, Occur.MUST_NOT); + FieldQuery fq = new FieldQuery( query, true, true ); assertEquals( 2, fq.termSetMap.size() ); Set termSet = fq.getTermSet( F ); @@ -679,8 +712,10 @@ public class FieldQueryTest extends AbstractTestCase { } public void testQueryPhraseMapOverlap2gram() throws Exception { - Query query = paB.parse( "\"abc\" AND \"bcd\"" ); - + BooleanQuery query = new BooleanQuery(); + query.add(toPhraseQuery(analyze("abc", F, analyzerB), F), Occur.MUST); + query.add(toPhraseQuery(analyze("bcd", F, analyzerB), F), Occur.MUST); + // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java index 31bc37d7e5a..961bd4cbe8d 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java @@ -17,12 +17,20 @@ package org.apache.lucene.search.vectorhighlight; * limitations under the License. 
*/ +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; public class ScoreOrderFragmentsBuilderTest extends AbstractTestCase { public void test3Frags() throws Exception { - FieldFragList ffl = ffl( "a c", "a b b b b b b b b b b b a b a b b b b b c a a b b" ); + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(F, "a")), BooleanClause.Occur.SHOULD); + query.add(new TermQuery(new Term(F, "c")), BooleanClause.Occur.SHOULD); + + FieldFragList ffl = ffl(query, "a b b b b b b b b b b b a b a b b b b b c a a b b" ); ScoreOrderFragmentsBuilder sofb = new ScoreOrderFragmentsBuilder(); String[] f = sofb.createFragments( reader, 0, F, ffl, 3 ); assertEquals( 3, f.length ); @@ -32,9 +40,8 @@ public class ScoreOrderFragmentsBuilderTest extends AbstractTestCase { assertEquals( "a b b b b b b b b b ", f[2] ); } - private FieldFragList ffl( String queryValue, String indexValue ) throws Exception { + private FieldFragList ffl(Query query, String indexValue ) throws Exception { make1d1fIndex( indexValue ); - Query query = paW.parse( queryValue ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java index b8d2a2b1ce5..20d0949a0e9 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java @@ -17,20 +17,21 @@ package org.apache.lucene.search.vectorhighlight; * limitations under the 
License. */ -import org.apache.lucene.search.Query; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; public class SimpleFragListBuilderTest extends AbstractTestCase { public void testNullFieldFragList() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "b c d" ), 100 ); + FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "b c d" ), 100 ); assertEquals( 0, ffl.getFragInfos().size() ); } public void testTooSmallFragSize() throws Exception { try{ SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - sflb.createFieldFragList( fpl( "a", "b c d" ), SimpleFragListBuilder.MIN_FRAG_CHAR_SIZE - 1 ); + sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "b c d" ), SimpleFragListBuilder.MIN_FRAG_CHAR_SIZE - 1 ); fail( "IllegalArgumentException must be thrown" ); } catch ( IllegalArgumentException expected ) { @@ -39,14 +40,19 @@ public class SimpleFragListBuilderTest extends AbstractTestCase { public void testSmallerFragSizeThanTermQuery() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "abcdefghijklmnopqrs", "abcdefghijklmnopqrs" ), SimpleFragListBuilder.MIN_FRAG_CHAR_SIZE ); + FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "abcdefghijklmnopqrs")), "abcdefghijklmnopqrs" ), SimpleFragListBuilder.MIN_FRAG_CHAR_SIZE ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(abcdefghijklmnopqrs((0,19)))/1.0(0,19)", ffl.getFragInfos().get( 0 ).toString() ); } public void testSmallerFragSizeThanPhraseQuery() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "\"abcdefgh jklmnopqrs\"", "abcdefgh jklmnopqrs" ), SimpleFragListBuilder.MIN_FRAG_CHAR_SIZE ); + + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.add(new 
Term(F, "abcdefgh")); + phraseQuery.add(new Term(F, "jklmnopqrs")); + + FieldFragList ffl = sflb.createFieldFragList( fpl(phraseQuery, "abcdefgh jklmnopqrs" ), SimpleFragListBuilder.MIN_FRAG_CHAR_SIZE ); assertEquals( 1, ffl.getFragInfos().size() ); if (VERBOSE) System.out.println( ffl.getFragInfos().get( 0 ).toString() ); assertEquals( "subInfos=(abcdefghjklmnopqrs((0,21)))/1.0(0,21)", ffl.getFragInfos().get( 0 ).toString() ); @@ -54,39 +60,39 @@ public class SimpleFragListBuilderTest extends AbstractTestCase { public void test1TermIndex() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "a" ), 100 ); + FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "a" ), 100 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((0,1)))/1.0(0,100)", ffl.getFragInfos().get( 0 ).toString() ); } public void test2TermsIndex1Frag() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "a a" ), 100 ); + FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "a a" ), 100 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((0,1))a((2,3)))/2.0(0,100)", ffl.getFragInfos().get( 0 ).toString() ); - ffl = sflb.createFieldFragList( fpl( "a", "a b b b b b b b b a" ), 20 ); + ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "a b b b b b b b b a" ), 20 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((0,1))a((18,19)))/2.0(0,20)", ffl.getFragInfos().get( 0 ).toString() ); - ffl = sflb.createFieldFragList( fpl( "a", "b b b b a b b b b a" ), 20 ); + ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "b b b b a b b b b a" ), 20 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((8,9))a((18,19)))/2.0(2,22)", ffl.getFragInfos().get( 0 
).toString() ); } public void test2TermsIndex2Frags() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "a b b b b b b b b b b b b b a" ), 20 ); + FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "a b b b b b b b b b b b b b a" ), 20 ); assertEquals( 2, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((0,1)))/1.0(0,20)", ffl.getFragInfos().get( 0 ).toString() ); assertEquals( "subInfos=(a((28,29)))/1.0(22,42)", ffl.getFragInfos().get( 1 ).toString() ); - ffl = sflb.createFieldFragList( fpl( "a", "a b b b b b b b b b b b b a" ), 20 ); + ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "a b b b b b b b b b b b b a" ), 20 ); assertEquals( 2, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((0,1)))/1.0(0,20)", ffl.getFragInfos().get( 0 ).toString() ); assertEquals( "subInfos=(a((26,27)))/1.0(20,40)", ffl.getFragInfos().get( 1 ).toString() ); - ffl = sflb.createFieldFragList( fpl( "a", "a b b b b b b b b b a" ), 20 ); + ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "a b b b b b b b b b a" ), 20 ); assertEquals( 2, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((0,1)))/1.0(0,20)", ffl.getFragInfos().get( 0 ).toString() ); assertEquals( "subInfos=(a((20,21)))/1.0(20,40)", ffl.getFragInfos().get( 1 ).toString() ); @@ -94,41 +100,56 @@ public class SimpleFragListBuilderTest extends AbstractTestCase { public void test2TermsQuery() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "a b", "c d e" ), 20 ); + + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(new TermQuery(new Term(F, "a")), BooleanClause.Occur.SHOULD); + booleanQuery.add(new TermQuery(new Term(F, "b")), BooleanClause.Occur.SHOULD); + + FieldFragList ffl = sflb.createFieldFragList( fpl(booleanQuery, "c d e" ), 20 ); assertEquals( 0, 
ffl.getFragInfos().size() ); - ffl = sflb.createFieldFragList( fpl( "a b", "d b c" ), 20 ); + ffl = sflb.createFieldFragList( fpl(booleanQuery, "d b c" ), 20 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(b((2,3)))/1.0(0,20)", ffl.getFragInfos().get( 0 ).toString() ); - ffl = sflb.createFieldFragList( fpl( "a b", "a b c" ), 20 ); + ffl = sflb.createFieldFragList( fpl(booleanQuery, "a b c" ), 20 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((0,1))b((2,3)))/2.0(0,20)", ffl.getFragInfos().get( 0 ).toString() ); } public void testPhraseQuery() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "\"a b\"", "c d e" ), 20 ); + + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.add(new Term(F, "a")); + phraseQuery.add(new Term(F, "b")); + + FieldFragList ffl = sflb.createFieldFragList( fpl(phraseQuery, "c d e" ), 20 ); assertEquals( 0, ffl.getFragInfos().size() ); - ffl = sflb.createFieldFragList( fpl( "\"a b\"", "a c b" ), 20 ); + ffl = sflb.createFieldFragList( fpl(phraseQuery, "a c b" ), 20 ); assertEquals( 0, ffl.getFragInfos().size() ); - ffl = sflb.createFieldFragList( fpl( "\"a b\"", "a b c" ), 20 ); + ffl = sflb.createFieldFragList( fpl(phraseQuery, "a b c" ), 20 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(ab((0,3)))/1.0(0,20)", ffl.getFragInfos().get( 0 ).toString() ); } public void testPhraseQuerySlop() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "\"a b\"~1", "a c b" ), 20 ); + + PhraseQuery phraseQuery = new PhraseQuery(); + phraseQuery.setSlop(1); + phraseQuery.add(new Term(F, "a")); + phraseQuery.add(new Term(F, "b")); + + FieldFragList ffl = sflb.createFieldFragList( fpl(phraseQuery, "a c b" ), 20 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(ab((0,1)(4,5)))/1.0(0,20)", 
ffl.getFragInfos().get( 0 ).toString() ); } - private FieldPhraseList fpl( String queryValue, String indexValue ) throws Exception { + private FieldPhraseList fpl(Query query, String indexValue ) throws Exception { make1d1fIndex( indexValue ); - Query query = paW.parse( queryValue ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); return new FieldPhraseList( stack, fq ); diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java index 72c17d8ac24..795a7884224 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java @@ -26,13 +26,17 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.highlight.SimpleHTMLEncoder; public class SimpleFragmentsBuilderTest extends AbstractTestCase { public void test1TermIndex() throws Exception { - FieldFragList ffl = ffl( "a", "a" ); + FieldFragList ffl = ffl(new TermQuery(new Term(F, "a")), "a" ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); assertEquals( "a ", sfb.createFragment( reader, 0, F, ffl ) ); @@ -42,7 +46,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { } public void test2Frags() throws Exception { - FieldFragList ffl = ffl( "a", "a b b b b b b b b b b b a b a b" ); + FieldFragList ffl 
= ffl(new TermQuery(new Term(F, "a")), "a b b b b b b b b b b b a b a b" ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); String[] f = sfb.createFragments( reader, 0, F, ffl, 3 ); // 3 snippets requested, but should be 2 @@ -52,7 +56,11 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { } public void test3Frags() throws Exception { - FieldFragList ffl = ffl( "a c", "a b b b b b b b b b b b a b a b b b b b c a a b b" ); + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(new TermQuery(new Term(F, "a")), BooleanClause.Occur.SHOULD); + booleanQuery.add(new TermQuery(new Term(F, "c")), BooleanClause.Occur.SHOULD); + + FieldFragList ffl = ffl(booleanQuery, "a b b b b b b b b b b b a b a b b b b b c a a b b" ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); String[] f = sfb.createFragments( reader, 0, F, ffl, 3 ); assertEquals( 3, f.length ); @@ -62,7 +70,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { } public void testTagsAndEncoder() throws Exception { - FieldFragList ffl = ffl( "a", "

a

" ); + FieldFragList ffl = ffl(new TermQuery(new Term(F, "a")), "

a

" ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); String[] preTags = { "[" }; String[] postTags = { "]" }; @@ -70,9 +78,8 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { sfb.createFragment( reader, 0, F, ffl, preTags, postTags, new SimpleHTMLEncoder() ) ); } - private FieldFragList ffl( String queryValue, String indexValue ) throws Exception { + private FieldFragList ffl(Query query, String indexValue ) throws Exception { make1d1fIndex( indexValue ); - Query query = paW.parse( queryValue ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SingleFragListBuilderTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SingleFragListBuilderTest.java index 1857ef90e24..0f42bae8b96 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SingleFragListBuilderTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SingleFragListBuilderTest.java @@ -1,6 +1,8 @@ package org.apache.lucene.search.vectorhighlight; +import org.apache.lucene.index.Term; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -23,27 +25,26 @@ public class SingleFragListBuilderTest extends AbstractTestCase { public void testNullFieldFragList() throws Exception { SingleFragListBuilder sflb = new SingleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "b c d" ), 100 ); + FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "b c d" ), 100 ); assertEquals( 0, ffl.getFragInfos().size() ); } public void testShortFieldFragList() throws Exception { SingleFragListBuilder sflb = new 
SingleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "a b c d" ), 100 ); + FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "a b c d" ), 100 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((0,1)))/1.0(0,2147483647)", ffl.getFragInfos().get( 0 ).toString() ); } public void testLongFieldFragList() throws Exception { SingleFragListBuilder sflb = new SingleFragListBuilder(); - FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "a b c d", "a b c d e f g h i", "j k l m n o p q r s t u v w x y z a b c", "d e f g" ), 100 ); + FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "a")), "a b c d", "a b c d e f g h i", "j k l m n o p q r s t u v w x y z a b c", "d e f g" ), 100 ); assertEquals( 1, ffl.getFragInfos().size() ); assertEquals( "subInfos=(a((0,1))a((8,9))a((60,61)))/3.0(0,2147483647)", ffl.getFragInfos().get( 0 ).toString() ); } - private FieldPhraseList fpl( String queryValue, String... indexValues ) throws Exception { + private FieldPhraseList fpl(Query query, String... 
indexValues ) throws Exception { make1dmfIndex( indexValues ); - Query query = paW.parse( queryValue ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); return new FieldPhraseList( stack, fq ); diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java index 57aca7a109a..b8b10477a34 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java @@ -426,6 +426,12 @@ public class InstantiatedIndexReader extends IndexReader { public long getSumTotalTermFreq() { return sumTotalTermFreq; } + + // TODO: support this? + @Override + public long getSumDocFreq() { + return -1; + } @Override public Comparator getComparator() { diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 0c93f3feedb..d6e7e09e50d 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -842,6 +842,12 @@ public class MemoryIndex { public long getSumTotalTermFreq() { return info.getSumTotalTermFreq(); } + + @Override + public long getSumDocFreq() throws IOException { + // each term has df=1 + return info.sortedTerms.length; + } }; } } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java index 8539f855f9f..8a999d628db 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java +++ 
b/lucene/contrib/queries/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java @@ -128,6 +128,11 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod public long getSumTotalTermFreq() { return -1; } + + @Override + public long getSumDocFreq() throws IOException { + return -1; + } }); assert termsEnum != null; diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index a3fec4d2f58..4b0164f549d 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -691,7 +691,7 @@ public class CheckIndex { Comparator termComp = terms.getComparator(); long sumTotalTermFreq = 0; - + long sumDocFreq = 0; while(true) { final BytesRef term = terms.next(); @@ -712,6 +712,7 @@ public class CheckIndex { final int docFreq = terms.docFreq(); status.totFreq += docFreq; + sumDocFreq += docFreq; docs = terms.docs(liveDocs, docs); postings = terms.docsAndPositions(liveDocs, postings); @@ -879,6 +880,13 @@ public class CheckIndex { throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq); } } + + if (sumDocFreq != 0) { + final long v = fields.terms(field).getSumDocFreq(); + if (v != -1 && sumDocFreq != v) { + throw new RuntimeException("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq); + } + } // Test seek to last term: if (lastTerm != null) { diff --git a/lucene/src/java/org/apache/lucene/index/FieldInvertState.java b/lucene/src/java/org/apache/lucene/index/FieldInvertState.java index 8c4e92ad4ea..0b54500c71d 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInvertState.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInvertState.java @@ -31,6 +31,7 @@ public final class FieldInvertState { int numOverlap; int offset; int maxTermFrequency; + int uniqueTermCount; float boost; 
AttributeSource attributeSource; @@ -55,6 +56,7 @@ public final class FieldInvertState { numOverlap = 0; offset = 0; maxTermFrequency = 0; + uniqueTermCount = 0; boost = docBoost; attributeSource = null; } @@ -122,6 +124,13 @@ public final class FieldInvertState { return maxTermFrequency; } + /** + * Return the number of unique terms encountered in this field. + */ + public int getUniqueTermCount() { + return uniqueTermCount; + } + public AttributeSource getAttributeSource() { return attributeSource; } diff --git a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java index e7d2190774a..d57591a6b15 100644 --- a/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/FilterIndexReader.java @@ -105,6 +105,11 @@ public class FilterIndexReader extends IndexReader { public long getSumTotalTermFreq() throws IOException { return in.getSumTotalTermFreq(); } + + @Override + public long getSumDocFreq() throws IOException { + return in.getSumDocFreq(); + } } /** Base class for filtering {@link TermsEnum} implementations. 
*/ diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index a770c9011bf..ddc4e441e71 100644 --- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -134,6 +134,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem writeProx(termID, fieldState.position); } fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency); + fieldState.uniqueTermCount++; } @Override @@ -151,6 +152,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]); postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID]; postings.lastDocIDs[termID] = docState.docID; + fieldState.uniqueTermCount++; } } else { if (docState.docID != postings.lastDocIDs[termID]) { @@ -171,6 +173,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1; postings.lastDocIDs[termID] = docState.docID; writeProx(termID, fieldState.position); + fieldState.uniqueTermCount++; } else { fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]); writeProx(termID, fieldState.position-postings.lastPositions[termID]); @@ -251,6 +254,8 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem final ByteSliceReader prox = new ByteSliceReader(); long sumTotalTermFreq = 0; + long sumDocFreq = 0; + for (int i = 0; i < numTerms; i++) { final int termID = termIDs[i]; // Get BytesRef @@ -389,9 +394,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem } termsConsumer.finishTerm(text, new TermStats(numDocs, totTF)); sumTotalTermFreq += totTF; + sumDocFreq += numDocs; } - 
termsConsumer.finish(sumTotalTermFreq); + termsConsumer.finish(sumTotalTermFreq, sumDocFreq); } } diff --git a/lucene/src/java/org/apache/lucene/index/MultiTerms.java b/lucene/src/java/org/apache/lucene/index/MultiTerms.java index 2da5db54df8..2785170dfb3 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiTerms.java +++ b/lucene/src/java/org/apache/lucene/index/MultiTerms.java @@ -88,6 +88,19 @@ public final class MultiTerms extends Terms { } return sum; } + + @Override + public long getSumDocFreq() throws IOException { + long sum = 0; + for(Terms terms : subs) { + final long v = terms.getSumDocFreq(); + if (v == -1) { + return -1; + } + sum += v; + } + return sum; + } @Override public Comparator getComparator() { diff --git a/lucene/src/java/org/apache/lucene/index/Terms.java b/lucene/src/java/org/apache/lucene/index/Terms.java index b7600c8a784..b7a27e0d0c9 100644 --- a/lucene/src/java/org/apache/lucene/index/Terms.java +++ b/lucene/src/java/org/apache/lucene/index/Terms.java @@ -132,6 +132,13 @@ public abstract class Terms { * into account. */ public abstract long getSumTotalTermFreq() throws IOException; + /** Returns the sum of {@link #docFreq(BytesRef)} for + * all terms in this field, or -1 if this measure isn't + * stored by the codec. Note that, just like other term + * measures, this measure does not take deleted documents + * into account. */ + public abstract long getSumDocFreq() throws IOException; + /** * Returns a thread-private {@link TermsEnum} instance. 
Obtaining * {@link TermsEnum} from this method might be more efficient than using diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java index 19c280b0c7a..a970d57966e 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java @@ -137,8 +137,9 @@ public class BlockTermsReader extends FieldsProducer { final long termsStartPointer = in.readVLong(); final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong(); + final long sumDocFreq = in.readVLong(); assert !fields.containsKey(fieldInfo.name); - fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq)); + fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq)); } success = true; } finally { @@ -245,13 +246,15 @@ public class BlockTermsReader extends FieldsProducer { final FieldInfo fieldInfo; final long termsStartPointer; final long sumTotalTermFreq; + final long sumDocFreq; - FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq) { + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.numTerms = numTerms; this.termsStartPointer = termsStartPointer; this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; } @Override @@ -279,6 +282,11 @@ public class BlockTermsReader extends FieldsProducer { return sumTotalTermFreq; } + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } + // Iterates through terms in this field private final class SegmentTermsEnum extends TermsEnum { private final IndexInput in; diff --git 
a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java index 926a6af6301..89ab114919e 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java @@ -132,6 +132,7 @@ public class BlockTermsWriter extends FieldsConsumer { if (!field.fieldInfo.omitTermFreqAndPositions) { out.writeVLong(field.sumTotalTermFreq); } + out.writeVLong(field.sumDocFreq); } } writeTrailer(dirStart); @@ -157,6 +158,7 @@ public class BlockTermsWriter extends FieldsConsumer { private long numTerms; private final TermsIndexWriterBase.FieldWriter fieldIndexWriter; long sumTotalTermFreq; + long sumDocFreq; private TermEntry[] pendingTerms; @@ -231,7 +233,7 @@ public class BlockTermsWriter extends FieldsConsumer { // Finishes all terms in this field @Override - public void finish(long sumTotalTermFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { if (pendingCount > 0) { flushBlock(); } @@ -239,6 +241,7 @@ public class BlockTermsWriter extends FieldsConsumer { out.writeVInt(0); this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; fieldIndexWriter.finish(out.getFilePointer()); } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java index 875da3bbb63..01280154cac 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java @@ -41,7 +41,7 @@ public abstract class TermsConsumer { public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException; /** Called when we are done adding terms to this field */ - public abstract void finish(long sumTotalTermFreq) throws IOException; + public abstract void finish(long sumTotalTermFreq, long sumDocFreq) throws 
IOException; /** Return the BytesRef Comparator used to sort terms * before feeding to this API. */ @@ -56,7 +56,8 @@ public abstract class TermsConsumer { BytesRef term; assert termsEnum != null; long sumTotalTermFreq = 0; - long sumDF = 0; + long sumDocFreq = 0; + long sumDFsinceLastAbortCheck = 0; if (mergeState.fieldInfo.omitTermFreqAndPositions) { if (docsEnum == null) { @@ -74,10 +75,11 @@ public abstract class TermsConsumer { final TermStats stats = postingsConsumer.merge(mergeState, docsEnum); if (stats.docFreq > 0) { finishTerm(term, stats); - sumDF += stats.docFreq; - if (sumDF > 60000) { - mergeState.checkAbort.work(sumDF/5.0); - sumDF = 0; + sumDFsinceLastAbortCheck += stats.docFreq; + sumDocFreq += stats.docFreq; + if (sumDFsinceLastAbortCheck > 60000) { + mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); + sumDFsinceLastAbortCheck = 0; } } } @@ -105,16 +107,17 @@ public abstract class TermsConsumer { if (stats.docFreq > 0) { finishTerm(term, stats); sumTotalTermFreq += stats.totalTermFreq; - sumDF += stats.docFreq; - if (sumDF > 60000) { - mergeState.checkAbort.work(sumDF/5.0); - sumDF = 0; + sumDFsinceLastAbortCheck += stats.docFreq; + sumDocFreq += stats.docFreq; + if (sumDFsinceLastAbortCheck > 60000) { + mergeState.checkAbort.work(sumDFsinceLastAbortCheck/5.0); + sumDFsinceLastAbortCheck = 0; } } } } } - finish(sumTotalTermFreq); + finish(sumTotalTermFreq, sumDocFreq); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java index dbe6265900a..e36994fe990 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java @@ -219,13 +219,14 @@ public class MemoryCodec extends Codec { } @Override - public void finish(long sumTotalTermFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { if 
(termCount > 0) { out.writeVInt(termCount); out.writeVInt(field.number); if (!field.omitTermFreqAndPositions) { out.writeVLong(sumTotalTermFreq); } + out.writeVLong(sumDocFreq); builder.finish().save(out); if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer()); } @@ -683,6 +684,7 @@ public class MemoryCodec extends Codec { private final static class TermsReader extends Terms { private final long sumTotalTermFreq; + private final long sumDocFreq; private FST fst; private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); private final FieldInfo field; @@ -695,6 +697,7 @@ public class MemoryCodec extends Codec { } else { sumTotalTermFreq = 0; } + sumDocFreq = in.readVLong(); fst = new FST(in, outputs); } @@ -704,6 +707,11 @@ public class MemoryCodec extends Codec { return sumTotalTermFreq; } + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } + @Override public TermsEnum iterator() { return new FSTTermsEnum(field, fst); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index ac3962d50e4..a994b468527 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -266,6 +266,11 @@ public class PreFlexFields extends FieldsProducer { public long getSumTotalTermFreq() { return -1; } + + @Override + public long getSumDocFreq() throws IOException { + return -1; + } } private class PreTermsEnum extends TermsEnum { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java index 34451939966..e2a37f6b199 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java +++ 
b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java @@ -463,6 +463,7 @@ class SimpleTextFieldsReader extends FieldsProducer { private final long termsStart; private final boolean omitTF; private long sumTotalTermFreq; + private long sumDocFreq; private FST>> fst; private int termCount; private final BytesRef scratch = new BytesRef(10); @@ -500,6 +501,7 @@ class SimpleTextFieldsReader extends FieldsProducer { break; } else if (scratch.startsWith(DOC)) { docFreq++; + sumDocFreq++; } else if (scratch.startsWith(POS)) { totalTermFreq++; } else if (scratch.startsWith(TERM)) { @@ -554,6 +556,11 @@ class SimpleTextFieldsReader extends FieldsProducer { public long getSumTotalTermFreq() { return sumTotalTermFreq; } + + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java index f822ec6a72c..49ed1f5c786 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java @@ -89,7 +89,7 @@ class SimpleTextFieldsWriter extends FieldsConsumer { } @Override - public void finish(long sumTotalTermFreq) throws IOException { + public void finish(long sumTotalTermFreq, long sumDocFreq) throws IOException { } @Override diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java index 4950cf97ea9..e10323146dd 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java @@ -195,7 +195,7 @@ class PreFlexFieldsWriter extends 
FieldsConsumer { } @Override - public void finish(long sumTotalTermCount) throws IOException { + public void finish(long sumTotalTermCount, long sumDocFreq) throws IOException { } @Override diff --git a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java index 50485853819..b3cebe05567 100644 --- a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java +++ b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java @@ -102,6 +102,7 @@ public class TestExternalCodecs extends LuceneTestCase { final String field; final SortedMap termToDocs = new TreeMap(); long sumTotalTermFreq; + long sumDocFreq; RAMField(String field) { this.field = field; @@ -116,6 +117,11 @@ public class TestExternalCodecs extends LuceneTestCase { public long getSumTotalTermFreq() { return sumTotalTermFreq; } + + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } @Override public TermsEnum iterator() { @@ -204,8 +210,9 @@ public class TestExternalCodecs extends LuceneTestCase { } @Override - public void finish(long sumTotalTermFreq) { + public void finish(long sumTotalTermFreq, long sumDocFreq) { field.sumTotalTermFreq = sumTotalTermFreq; + field.sumDocFreq = sumDocFreq; } } diff --git a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index 5a629dacd85..b3d91e2d80d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -455,9 +455,12 @@ public class TestBackwardsCompatibility extends LuceneTestCase { File indexDir = new File(LuceneTestCase.TEMP_DIR, dirName); _TestUtil.rmDir(indexDir); Directory dir = newFSDirectory(indexDir); - - IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(10); - ((LogMergePolicy) 
conf.getMergePolicy()).setUseCompoundFile(doCFS); + LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy(); + mp.setUseCompoundFile(doCFS); + mp.setNoCFSRatio(1.0); + // TODO: remove randomness + IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)) + .setMaxBufferedDocs(10).setMergePolicy(mp); IndexWriter writer = new IndexWriter(dir, conf); for(int i=0;i<35;i++) { @@ -471,8 +474,12 @@ public class TestBackwardsCompatibility extends LuceneTestCase { if (!optimized) { // open fresh writer so we get no prx file in the added segment - conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(10); - ((LogMergePolicy) conf.getMergePolicy()).setUseCompoundFile(doCFS); + mp = new LogByteSizeMergePolicy(); + mp.setUseCompoundFile(doCFS); + mp.setNoCFSRatio(1.0); + // TODO: remove randomness + conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)) + .setMaxBufferedDocs(10).setMergePolicy(mp); writer = new IndexWriter(dir, conf); addNoProxDoc(writer); writer.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java index fb702291507..c655d8a6b03 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -101,10 +101,12 @@ public class TestCodecs extends LuceneTestCase { Arrays.sort(terms); final TermsConsumer termsConsumer = consumer.addField(fieldInfo); long sumTotalTermCount = 0; + long sumDF = 0; for (final TermData term : terms) { + sumDF += term.docs.length; sumTotalTermCount += term.write(termsConsumer); } - termsConsumer.finish(sumTotalTermCount); + termsConsumer.finish(sumTotalTermCount, sumDF); } } diff --git a/lucene/src/test/org/apache/lucene/index/TestSumDocFreq.java b/lucene/src/test/org/apache/lucene/index/TestSumDocFreq.java new file mode 100644 index 00000000000..92d579f70df --- /dev/null +++ 
b/lucene/src/test/org/apache/lucene/index/TestSumDocFreq.java @@ -0,0 +1,101 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +/** + * Tests {@link Terms#getSumDocFreq()} + * @lucene.experimental + */ +public class TestSumDocFreq extends LuceneTestCase { + + public void testSumDocFreq() throws Exception { + final int numDocs = atLeast(500); + + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random, dir); + + Document doc = new Document(); + Field field1 = newField("foo", "", Field.Index.ANALYZED); + Field field2 = newField("bar", "", Field.Index.ANALYZED); + doc.add(field1); + doc.add(field2); + for (int i = 0; i < numDocs; i++) { + char ch1 = (char) _TestUtil.nextInt(random, 'a', 'z'); + char ch2 = (char) _TestUtil.nextInt(random, 'a', 'z'); + field1.setValue("" + ch1 + " " + ch2); + ch1 = (char) _TestUtil.nextInt(random, 'a', 'z'); + ch2 = (char) _TestUtil.nextInt(random, 
'a', 'z'); + field2.setValue("" + ch1 + " " + ch2); + writer.addDocument(doc); + } + + IndexReader ir = writer.getReader(); + writer.close(); + + assertSumDocFreq(ir); + ir.close(); + + ir = IndexReader.open(dir, false); + int numDeletions = atLeast(20); + for (int i = 0; i < numDeletions; i++) { + ir.deleteDocument(random.nextInt(ir.maxDoc())); + } + ir.close(); + + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))); + w.optimize(); + w.close(); + + ir = IndexReader.open(dir, true); + assertSumDocFreq(ir); + ir.close(); + + dir.close(); + } + + private void assertSumDocFreq(IndexReader ir) throws Exception { + // compute sumDocFreq across all fields + Fields fields = MultiFields.getFields(ir); + FieldsEnum fieldEnum = fields.iterator(); + String f = null; + while ((f = fieldEnum.next()) != null) { + Terms terms = fields.terms(f); + long sumDocFreq = terms.getSumDocFreq(); + if (sumDocFreq == -1) { + if (VERBOSE) { + System.out.println("skipping field: " + f + ", codec does not support sumDocFreq"); + } + continue; + } + + long computedSumDocFreq = 0; + TermsEnum termsEnum = terms.iterator(); + while (termsEnum.next() != null) { + computedSumDocFreq += termsEnum.docFreq(); + } + assertEquals(computedSumDocFreq, sumDocFreq); + } + } +} diff --git a/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java b/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java new file mode 100644 index 00000000000..3fd64fa4926 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/index/TestUniqueTermCount.java @@ -0,0 +1,108 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.HashSet; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.DefaultSimilarityProvider; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +/** + * Tests the uniqueTermCount statistic in FieldInvertState + */ +public class TestUniqueTermCount extends LuceneTestCase { + Directory dir; + IndexReader reader; + /* expected uniqueTermCount values for our documents */ + ArrayList expected = new ArrayList(); + + @Override + public void setUp() throws Exception { + super.setUp(); + dir = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random, MockTokenizer.SIMPLE, true)).setMergePolicy(newLogMergePolicy()); + config.setSimilarityProvider(new DefaultSimilarityProvider() { + @Override + public Similarity get(String field) { + return new TestSimilarity(); + } + }); + RandomIndexWriter writer = new RandomIndexWriter(random, dir, config); + Document doc = new Document(); + Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED); + 
doc.add(foo); + for (int i = 0; i < 100; i++) { + foo.setValue(addValue()); + writer.addDocument(doc); + } + reader = writer.getReader(); + writer.close(); + } + + @Override + public void tearDown() throws Exception { + reader.close(); + dir.close(); + super.tearDown(); + } + + public void test() throws Exception { + byte fooNorms[] = MultiNorms.norms(reader, "foo"); + for (int i = 0; i < reader.maxDoc(); i++) + assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff); + } + + /** + * Makes a bunch of single-char tokens (the max # unique terms will at most be 26). + * puts the # unique terms into expected, to be checked against the norm. + */ + private String addValue() { + StringBuilder sb = new StringBuilder(); + HashSet terms = new HashSet(); + int num = _TestUtil.nextInt(random, 0, 255); + for (int i = 0; i < num; i++) { + sb.append(' '); + char term = (char) _TestUtil.nextInt(random, 'a', 'z'); + sb.append(term); + terms.add("" + term); + } + expected.add(terms.size()); + return sb.toString(); + } + + /** + * Simple similarity that encodes maxTermFrequency directly as a byte + */ + class TestSimilarity extends DefaultSimilarity { + + @Override + public byte computeNorm(FieldInvertState state) { + return (byte) state.getUniqueTermCount(); + } + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java index c97d4ab6bfa..422938b6bd5 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java @@ -16,7 +16,6 @@ package org.apache.lucene.analysis.query; * limitations under the License. 
*/ -import java.io.IOException; import java.io.Reader; import java.io.StringReader; @@ -25,19 +24,12 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.LetterTokenizer; -import org.apache.lucene.analysis.core.WhitespaceAnalyzer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; -import org.apache.lucene.queryParser.ParseException; -import org.apache.lucene.queryParser.QueryParser; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; import org.apache.lucene.store.RAMDirectory; public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase { @@ -74,22 +66,14 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase { super.tearDown(); } - //Helper method to query - private int search(Analyzer a, String queryString) throws IOException, ParseException { - QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "repetitiveField", a); - Query q = qp.parse(queryString); - IndexSearcher searcher = newSearcher(reader); - int hits = searcher.search(q, null, 1000).totalHits; - searcher.close(); - return hits; - } - public void testUninitializedAnalyzer() throws Exception { - //Note: no calls to "addStopWord" - String query = "variedField:quick repetitiveField:boring"; - int numHits1 = search(protectedAnalyzer, query); - int numHits2 = search(appAnalyzer, query); - assertEquals("No filtering test", numHits1, numHits2); + // Note: no calls to "addStopWord" + // query = "variedField:quick repetitiveField:boring"; + TokenStream protectedTokenStream = 
protectedAnalyzer.reusableTokenStream("variedField", new StringReader("quick")); + assertTokenStreamContents(protectedTokenStream, new String[]{"quick"}); + + protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring")); + assertTokenStreamContents(protectedTokenStream, new String[]{"boring"}); } /* @@ -97,36 +81,41 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase { */ public void testDefaultAddStopWordsIndexReader() throws Exception { protectedAnalyzer.addStopWords(reader); - int numHits = search(protectedAnalyzer, "repetitiveField:boring"); - assertEquals("Default filter should remove all docs", 0, numHits); - } + TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring")); + assertTokenStreamContents(protectedTokenStream, new String[0]); // Default stop word filtering will remove boring + } /* * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader, int)' */ public void testAddStopWordsIndexReaderInt() throws Exception { protectedAnalyzer.addStopWords(reader, 1f / 2f); - int numHits = search(protectedAnalyzer, "repetitiveField:boring"); - assertEquals("A filter on terms in > one half of docs remove boring docs", 0, numHits); - numHits = search(protectedAnalyzer, "repetitiveField:vaguelyboring"); - assertTrue("A filter on terms in > half of docs should not remove vaguelyBoring docs", numHits > 1); + TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring")); + // A filter on terms in > one half of docs remove boring + assertTokenStreamContents(protectedTokenStream, new String[0]); + + protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("vaguelyboring")); + // A filter on terms in > half of docs should not remove vaguelyBoring + assertTokenStreamContents(protectedTokenStream, new 
String[]{"vaguelyboring"}); protectedAnalyzer.addStopWords(reader, 1f / 4f); - numHits = search(protectedAnalyzer, "repetitiveField:vaguelyboring"); - assertEquals("A filter on terms in > quarter of docs should remove vaguelyBoring docs", 0, numHits); + protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("vaguelyboring")); + // A filter on terms in > quarter of docs should remove vaguelyBoring + assertTokenStreamContents(protectedTokenStream, new String[0]); } - public void testAddStopWordsIndexReaderStringFloat() throws Exception { protectedAnalyzer.addStopWords(reader, "variedField", 1f / 2f); - int numHits = search(protectedAnalyzer, "repetitiveField:boring"); - assertTrue("A filter on one Field should not affect queris on another", numHits > 0); + TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring")); + // A filter on one Field should not affect queries on another + assertTokenStreamContents(protectedTokenStream, new String[]{"boring"}); protectedAnalyzer.addStopWords(reader, "repetitiveField", 1f / 2f); - numHits = search(protectedAnalyzer, "repetitiveField:boring"); - assertEquals("A filter on the right Field should affect queries on it", numHits, 0); + protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring")); + // A filter on the right Field should affect queries on it + assertTokenStreamContents(protectedTokenStream, new String[0]); } public void testAddStopWordsIndexReaderStringInt() throws Exception { @@ -144,12 +133,14 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase { public void testNoFieldNamePollution() throws Exception { protectedAnalyzer.addStopWords(reader, "repetitiveField", 10); - int numHits = search(protectedAnalyzer, "repetitiveField:boring"); - assertEquals("Check filter set up OK", 0, numHits); - numHits = search(protectedAnalyzer, 
"variedField:boring"); - assertTrue("Filter should not prevent stopwords in one field being used in another ", numHits > 0); + TokenStream protectedTokenStream = protectedAnalyzer.reusableTokenStream("repetitiveField", new StringReader("boring")); + // Check filter set up OK + assertTokenStreamContents(protectedTokenStream, new String[0]); + protectedTokenStream = protectedAnalyzer.reusableTokenStream("variedField", new StringReader("boring")); + // Filter should not prevent stopwords in one field being used in another + assertTokenStreamContents(protectedTokenStream, new String[]{"boring"}); } /* @@ -170,10 +161,12 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase { public void testWrappingNonReusableAnalyzer() throws Exception { QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new NonreusableAnalyzer()); a.addStopWords(reader, 10); - int numHits = search(a, "repetitiveField:boring"); - assertTrue(numHits == 0); - numHits = search(a, "repetitiveField:vaguelyboring"); - assertTrue(numHits == 0); + + TokenStream tokenStream = a.reusableTokenStream("repetitiveField", new StringReader("boring")); + assertTokenStreamContents(tokenStream, new String[0]); + + tokenStream = a.reusableTokenStream("repetitiveField", new StringReader("vaguelyboring")); + assertTokenStreamContents(tokenStream, new String[0]); } public void testTokenStream() throws Exception { diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java index 3cf4ac22aae..9d2d6870c82 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java @@ -32,14 +32,7 @@ import org.apache.lucene.document.Field; import 
org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; -import org.apache.lucene.queryParser.QueryParser; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.PhraseQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; @@ -82,16 +75,6 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { return new IndexSearcher(dir, true); } - protected ScoreDoc[] queryParsingTest(Analyzer analyzer, String qs) throws Exception { - searcher = setUpSearcher(analyzer); - - QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "content", analyzer); - - Query q = qp.parse(qs); - - return searcher.search(q, null, 1000).scoreDocs; - } - protected void compareRanks(ScoreDoc[] hits, int[] ranks) throws Exception { assertEquals(ranks.length, hits.length); for (int i = 0; i < ranks.length; i++) { @@ -99,51 +82,6 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { } } - /* - * Will not work on an index without unigrams, since QueryParser automatically - * tokenizes on whitespace. - */ - public void testShingleAnalyzerWrapperQueryParsing() throws Exception { - ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper - (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2), - "test sentence"); - int[] ranks = new int[] { 1, 2, 0 }; - compareRanks(hits, ranks); - } - - /* - * This one fails with an exception. 
- */ - public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception { - ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper - (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2), - "\"this sentence\""); - int[] ranks = new int[] { 0 }; - compareRanks(hits, ranks); - } - - /* - * This one works, actually. - */ - public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception { - ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper - (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2), - "\"test sentence\""); - int[] ranks = new int[] { 1 }; - compareRanks(hits, ranks); - } - - /* - * Same as above, is tokenized without using the analyzer. - */ - public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception { - ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper - (new MockAnalyzer(random, MockTokenizer.WHITESPACE, false), 2), - "+test +sentence"); - int[] ranks = new int[] { 1, 2 }; - compareRanks(hits, ranks); - } - /* * This shows how to construct a phrase query containing shingles. 
*/ @@ -153,8 +91,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { PhraseQuery q = new PhraseQuery(); - TokenStream ts = analyzer.tokenStream("content", - new StringReader("this sentence")); + TokenStream ts = analyzer.tokenStream("content", new StringReader("this sentence")); int j = -1; PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class); @@ -183,8 +120,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase { BooleanQuery q = new BooleanQuery(); - TokenStream ts = analyzer.tokenStream("content", - new StringReader("test sentence")); + TokenStream ts = analyzer.tokenStream("content", new StringReader("test sentence")); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); diff --git a/modules/queries/src/test/org/apache/lucene/queries/TestCustomScoreQuery.java b/modules/queries/src/test/org/apache/lucene/queries/TestCustomScoreQuery.java index fc69bb812fe..779d296b532 100755 --- a/modules/queries/src/test/org/apache/lucene/queries/TestCustomScoreQuery.java +++ b/modules/queries/src/test/org/apache/lucene/queries/TestCustomScoreQuery.java @@ -20,11 +20,7 @@ package org.apache.lucene.queries; import org.apache.lucene.queries.function.FunctionQuery; import org.apache.lucene.queries.function.FunctionTestSetup; import org.apache.lucene.queries.function.ValueSource; -import org.apache.lucene.queries.function.valuesource.ByteFieldSource; import org.apache.lucene.queries.function.valuesource.FloatFieldSource; -import org.apache.lucene.queries.function.valuesource.IntFieldSource; -import org.apache.lucene.queries.function.valuesource.ShortFieldSource; -import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.*; import org.apache.lucene.search.cache.*; import org.junit.BeforeClass; @@ -198,9 +194,10 @@ public class TestCustomScoreQuery extends FunctionTestSetup { @Test public void testCustomExternalQuery() throws Exception { - QueryParser qp = 
new QueryParser(TEST_VERSION_CURRENT, TEXT_FIELD,anlzr); - String qtxt = "first aid text"; // from the doc texts in FunctionQuerySetup. - Query q1 = qp.parse(qtxt); + BooleanQuery q1 = new BooleanQuery(); + q1.add(new TermQuery(new Term(TEXT_FIELD, "first")), BooleanClause.Occur.SHOULD); + q1.add(new TermQuery(new Term(TEXT_FIELD, "aid")), BooleanClause.Occur.SHOULD); + q1.add(new TermQuery(new Term(TEXT_FIELD, "text")), BooleanClause.Occur.SHOULD); final Query q = new CustomExternalQuery(q1); log(q); @@ -243,11 +240,12 @@ public class TestCustomScoreQuery extends FunctionTestSetup { FunctionQuery functionQuery = new FunctionQuery(valueSource); float boost = (float) dboost; IndexSearcher s = new IndexSearcher(dir, true); - QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, TEXT_FIELD, anlzr); - String qtxt = "first aid text"; // from the doc texts in FunctionQuerySetup. // regular (boolean) query. - Query q1 = qp.parse(qtxt); + BooleanQuery q1 = new BooleanQuery(); + q1.add(new TermQuery(new Term(TEXT_FIELD, "first")), BooleanClause.Occur.SHOULD); + q1.add(new TermQuery(new Term(TEXT_FIELD, "aid")), BooleanClause.Occur.SHOULD); + q1.add(new TermQuery(new Term(TEXT_FIELD, "text")), BooleanClause.Occur.SHOULD); log(q1); // custom query, that should score the same as q1. diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 6761975e542..3b6ce8aa5cf 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -256,7 +256,13 @@ Other Changes * LUCENE-2883: FunctionQuery, DocValues (and its impls), ValueSource (and its impls) and BoostedQuery have been consolidated into the queries module. They - can now be found at o.a.l.queries.function. + can now be found at o.a.l.queries.function. 
+ +* SOLR-2027: FacetField.getValues() now returns an empty list if there are no + values, instead of null (Chris Male) + +* SOLR-1825: SolrQuery.addFacetQuery now enables facets automatically, like + addFacetField (Chris Male) Documentation ---------------------- diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/SolrQuery.java b/solr/solrj/src/java/org/apache/solr/client/solrj/SolrQuery.java index a7228f0da9d..5771028741f 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/SolrQuery.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/SolrQuery.java @@ -292,6 +292,7 @@ public class SolrQuery extends ModifiableSolrParams */ public SolrQuery addFacetQuery(String f) { this.add(FacetParams.FACET_QUERY, f); + this.set(FacetParams.FACET, true); return this; } diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/response/FacetField.java b/solr/solrj/src/java/org/apache/solr/client/solrj/response/FacetField.java index 6b653487344..f46fd97a8cf 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/response/FacetField.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/response/FacetField.java @@ -19,6 +19,7 @@ package org.apache.solr.client.solrj.response; import java.io.Serializable; import java.util.ArrayList; +import java.util.Collections; import java.util.Date; import java.util.List; @@ -145,7 +146,7 @@ import org.apache.solr.client.solrj.util.ClientUtils; } public List getValues() { - return _values; + return _values == null ? 
Collections.emptyList() : _values; } public int getValueCount() diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/SolrQueryTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/SolrQueryTest.java index 9c24c4d9c0c..4eedeb9a57c 100644 --- a/solr/solrj/src/test/org/apache/solr/client/solrj/SolrQueryTest.java +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/SolrQueryTest.java @@ -195,4 +195,10 @@ public class SolrQueryTest extends LuceneTestCase { q.setTermsRegexFlag("multiline"); assertEquals(2, q.getTermsRegexFlags().length); } + + public void testAddFacetQuery() { + SolrQuery solrQuery = new SolrQuery(); + solrQuery.addFacetQuery("field:value"); + assertTrue("Adding a Facet Query should enable facets", solrQuery.getBool(FacetParams.FACET)); + } } diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/response/FacetFieldTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/response/FacetFieldTest.java new file mode 100644 index 00000000000..beaab3b2d3d --- /dev/null +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/response/FacetFieldTest.java @@ -0,0 +1,33 @@ +package org.apache.solr.client.solrj.response; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.LuceneTestCase; + +public class FacetFieldTest extends LuceneTestCase { + + public void testGetValues() { + FacetField facetField = new FacetField("field"); + + assertNotNull(facetField.getValues()); + assertEquals(0, facetField.getValues().size()); + + facetField.add("value", 1); + assertEquals(1, facetField.getValues().size()); + } +}