From dc02cdd38450f6a269964bc5d24584994578fa54 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 13 Sep 2012 11:21:03 +0000 Subject: [PATCH] LUCENE-4355: improve AtomicReader sugar apis git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1384274 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 6 ++ .../SimpleNaiveBayesClassifier.java | 2 +- .../org/apache/lucene/index/AtomicReader.java | 72 +++++++------------ .../lucene/index/BaseCompositeReader.java | 20 ++++-- .../org/apache/lucene/index/IndexReader.java | 23 +++--- .../org/apache/lucene/search/PhraseQuery.java | 6 +- .../org/apache/lucene/search/TermQuery.java | 6 +- .../apache/lucene/search/TermStatistics.java | 6 +- .../codecs/lucene40/TestReuseDocsEnum.java | 10 ++- .../lucene/index/TestDocValuesIndexing.java | 2 +- .../lucene/index/TestDocsAndPositions.java | 11 ++- .../lucene/index/TestDocumentWriter.java | 6 +- .../lucene/index/TestMultiLevelSkipList.java | 5 +- .../org/apache/lucene/index/TestPayloads.java | 4 +- .../lucene/search/TestPositionIncrement.java | 4 +- .../lucene/store/TestNRTCachingDirectory.java | 2 +- .../directory/DirectoryTaxonomyWriter.java | 24 ++++--- .../lucene/search/grouping/TestGrouping.java | 2 +- .../vectorhighlight/FieldTermStack.java | 3 +- .../lucene/index/memory/MemoryIndexTest.java | 2 +- .../org/apache/lucene/misc/GetTermInfo.java | 9 +-- .../org/apache/lucene/misc/HighFreqTerms.java | 12 ++-- .../apache/lucene/misc/TestHighFreqTerms.java | 5 +- .../valuesource/JoinDocFreqValueSource.java | 12 +++- .../valuesource/TotalTermFreqValueSource.java | 3 +- .../handler/admin/LukeRequestHandler.java | 18 ++--- .../component/TermVectorComponent.java | 25 +++---- .../apache/solr/search/SolrIndexSearcher.java | 21 +++--- 28 files changed, 169 insertions(+), 152 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 54f55f69a75..3befe420ede 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -103,6 +103,12 @@ API Changes Use 
DataOutput.copyBytes(DataInput, long) instead. (Mike McCandless, Robert Muir) +* LUCENE-4355: Simplify AtomicReader's sugar methods such as termDocsEnum, + termPositionsEnum, docFreq, and totalTermFreq to only take Term as a + parameter. If you want to do expert things such as pass a different + Bits as liveDocs, then use the flex apis (fields(), terms(), etc) directly. + (Mike McCandless, Robert Muir) + Bug Fixes * LUCENE-4297: BooleanScorer2 would multiply the coord() factor diff --git a/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java index b86e0da41c2..be90b3d1679 100644 --- a/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java +++ b/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java @@ -121,7 +121,7 @@ public class SimpleNaiveBayesClassifier implements Classifier { Terms terms = MultiFields.getTerms(atomicReader, textFieldName); long numPostings = terms.getSumDocFreq(); // number of term/doc pairs double avgNumberOfUniqueTerms = numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc - int docsWithC = atomicReader.docFreq(classFieldName, new BytesRef(c)); + int docsWithC = atomicReader.docFreq(new Term(classFieldName, c)); return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text field per doc * # docs with c } diff --git a/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java b/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java index 2192447ba48..0e04c495aaa 100644 --- a/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java @@ -21,7 +21,6 @@ import java.io.IOException; import org.apache.lucene.search.SearcherManager; // javadocs import org.apache.lucene.util.Bits; -import 
org.apache.lucene.util.BytesRef; /** {@code AtomicReader} is an abstract class, providing an interface for accessing an index. Search of an index is done entirely through this abstract interface, @@ -67,17 +66,17 @@ public abstract class AtomicReader extends IndexReader { public abstract Fields fields() throws IOException; @Override - public final int docFreq(String field, BytesRef term) throws IOException { + public final int docFreq(Term term) throws IOException { final Fields fields = fields(); if (fields == null) { return 0; } - final Terms terms = fields.terms(field); + final Terms terms = fields.terms(term.field()); if (terms == null) { return 0; } final TermsEnum termsEnum = terms.iterator(null); - if (termsEnum.seekExact(term, true)) { + if (termsEnum.seekExact(term.bytes(), true)) { return termsEnum.docFreq(); } else { return 0; @@ -89,17 +88,17 @@ public abstract class AtomicReader extends IndexReader { * field does not exists. This method does not take into * account deleted documents that have not yet been merged * away. */ - public final long totalTermFreq(String field, BytesRef term) throws IOException { + public final long totalTermFreq(Term term) throws IOException { final Fields fields = fields(); if (fields == null) { return 0; } - final Terms terms = fields.terms(field); + final Terms terms = fields.terms(term.field()); if (terms == null) { return 0; } final TermsEnum termsEnum = terms.iterator(null); - if (termsEnum.seekExact(term, true)) { + if (termsEnum.seekExact(term.bytes(), true)) { return termsEnum.totalTermFreq(); } else { return 0; @@ -115,61 +114,40 @@ public abstract class AtomicReader extends IndexReader { return fields.terms(field); } - /** Returns {@link DocsEnum} for the specified field & - * term. This will return null if either the field or - * term does not exist. 
*/ - public final DocsEnum termDocsEnum(Bits liveDocs, String field, BytesRef term) throws IOException { - return termDocsEnum(liveDocs, field, term, DocsEnum.FLAG_FREQS); - } - - /** Returns {@link DocsEnum} for the specified field & - * term, with control over whether freqs are required. - * Some codecs may be able to optimize their - * implementation when freqs are not required. This will - * return null if the field or term does not - * exist. See {@link TermsEnum#docs(Bits,DocsEnum,int)}. */ - public final DocsEnum termDocsEnum(Bits liveDocs, String field, BytesRef term, int flags) throws IOException { - assert field != null; - assert term != null; + /** Returns {@link DocsEnum} for the specified term. + * This will return null if either the field or + * term does not exist. + * @see TermsEnum#docs(Bits, DocsEnum) */ + public final DocsEnum termDocsEnum(Term term) throws IOException { + assert term.field() != null; + assert term.bytes() != null; final Fields fields = fields(); if (fields != null) { - final Terms terms = fields.terms(field); + final Terms terms = fields.terms(term.field()); if (terms != null) { final TermsEnum termsEnum = terms.iterator(null); - if (termsEnum.seekExact(term, true)) { - return termsEnum.docs(liveDocs, null, flags); + if (termsEnum.seekExact(term.bytes(), true)) { + return termsEnum.docs(getLiveDocs(), null); } } } return null; } - + /** Returns {@link DocsAndPositionsEnum} for the specified - * field & term. This will return null if the + * term. This will return null if the * field or term does not exist or positions weren't indexed. 
- * @see #termPositionsEnum(Bits, String, BytesRef, int) */ - public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term) throws IOException { - return termPositionsEnum(liveDocs, field, term, DocsAndPositionsEnum.FLAG_OFFSETS | DocsAndPositionsEnum.FLAG_PAYLOADS); - } - - - /** Returns {@link DocsAndPositionsEnum} for the specified - * field & term, with control over whether offsets and payloads are - * required. Some codecs may be able to optimize their - * implementation when offsets and/or payloads are not required. - * This will return null if the field or term - * does not exist or positions weren't indexed. See - * {@link TermsEnum#docsAndPositions(Bits,DocsAndPositionsEnum,int)}. */ - public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, int flags) throws IOException { - assert field != null; - assert term != null; + * @see TermsEnum#docsAndPositions(Bits, DocsAndPositionsEnum) */ + public final DocsAndPositionsEnum termPositionsEnum(Term term) throws IOException { + assert term.field() != null; + assert term.bytes() != null; final Fields fields = fields(); if (fields != null) { - final Terms terms = fields.terms(field); + final Terms terms = fields.terms(term.field()); if (terms != null) { final TermsEnum termsEnum = terms.iterator(null); - if (termsEnum.seekExact(term, true)) { - return termsEnum.docsAndPositions(liveDocs, null, flags); + if (termsEnum.seekExact(term.bytes(), true)) { + return termsEnum.docsAndPositions(getLiveDocs(), null); } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java b/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java index 02d9765439d..bbe1c66f700 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/BaseCompositeReader.java @@ -22,8 +22,6 @@ import java.util.Arrays; import java.util.Collections; import 
java.util.List; -import org.apache.lucene.util.BytesRef; - /** Base class for implementing {@link CompositeReader}s based on an array * of sub-readers. The implementing class has to add code for * correctly refcounting and closing the sub-readers. @@ -125,11 +123,25 @@ public abstract class BaseCompositeReader extends Composi } @Override - public final int docFreq(String field, BytesRef t) throws IOException { + public final int docFreq(Term term) throws IOException { ensureOpen(); int total = 0; // sum freqs in subreaders for (int i = 0; i < subReaders.length; i++) { - total += subReaders[i].docFreq(field, t); + total += subReaders[i].docFreq(term); + } + return total; + } + + @Override + public final long totalTermFreq(Term term) throws IOException { + ensureOpen(); + long total = 0; // sum freqs in subreaders + for (int i = 0; i < subReaders.length; i++) { + long sub = subReaders[i].totalTermFreq(term); + if (sub == -1) { + return -1; + } + total += sub; } return total; } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexReader.java b/lucene/core/src/java/org/apache/lucene/index/IndexReader.java index e982e629bfc..18775f64ec7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexReader.java @@ -31,7 +31,6 @@ import org.apache.lucene.document.DocumentStoredFieldVisitor; import org.apache.lucene.search.SearcherManager; // javadocs import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; /** IndexReader is an abstract class, providing an interface for accessing an index. Search of an index is done entirely through this abstract interface, @@ -432,15 +431,17 @@ public abstract class IndexReader implements Closeable { * term. This method returns 0 if the term or * field does not exists. This method does not take into * account deleted documents that have not yet been merged - * away. 
*/ - public final int docFreq(Term term) throws IOException { - return docFreq(term.field(), term.bytes()); - } - - /** Returns the number of documents containing the + * away. + * @see TermsEnum#docFreq() + */ + public abstract int docFreq(Term term) throws IOException; + + /** Returns the total number of occurrences of the * term. This method returns 0 if the term or - * field does not exists. This method does not take into - * account deleted documents that have not yet been merged - * away. */ - public abstract int docFreq(String field, BytesRef term) throws IOException; + * field does not exist, or -1 if the Codec does not support + * the measure. This method does not take into account deleted + * documents that have not yet been merged away. + * @see TermsEnum#totalTermFreq() + */ + public abstract long totalTermFreq(Term term) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index b51c54daad5..d8e28356e7c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -260,7 +260,7 @@ public class PhraseQuery extends Query { final Term t = terms.get(i); final TermState state = states[i].get(context.ord); if (state == null) { /* term doesnt exist in this segment */ - assert termNotInReader(reader, field, t.bytes()): "no termstate found but term exists in reader"; + assert termNotInReader(reader, t): "no termstate found but term exists in reader"; return null; } te.seekExact(t.bytes(), state); @@ -295,8 +295,8 @@ public class PhraseQuery extends Query { } // only called from assert - private boolean termNotInReader(AtomicReader reader, String field, BytesRef bytes) throws IOException { - return reader.docFreq(field, bytes) == 0; + private boolean termNotInReader(AtomicReader reader, Term term) throws IOException { + return reader.docFreq(term) == 0; } 
@Override diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index cf9ef6c7a38..7f854f8cbba 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -95,7 +95,7 @@ public class TermQuery extends Query { private TermsEnum getTermsEnum(AtomicReaderContext context) throws IOException { final TermState state = termStates.get(context.ord); if (state == null) { // term is not present in that reader - assert termNotInReader(context.reader(), term.field(), term.bytes()) : "no termstate found but term exists in reader term=" + term; + assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term; return null; } //System.out.println("LD=" + reader.getLiveDocs() + " set?=" + (reader.getLiveDocs() != null ? reader.getLiveDocs().get(0) : "null")); @@ -104,10 +104,10 @@ public class TermQuery extends Query { return termsEnum; } - private boolean termNotInReader(AtomicReader reader, String field, BytesRef bytes) throws IOException { + private boolean termNotInReader(AtomicReader reader, Term term) throws IOException { // only called from assert //System.out.println("TQ.termNotInReader reader=" + reader + " term=" + field + ":" + bytes.utf8ToString()); - return reader.docFreq(field, bytes) == 0; + return reader.docFreq(term) == 0; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java b/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java index a17d50fb02c..13480ab6462 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermStatistics.java @@ -17,7 +17,7 @@ package org.apache.lucene.search; * limitations under the License. 
*/ -import org.apache.lucene.index.AtomicReader; // javadocs +import org.apache.lucene.index.TermsEnum; // javadocs import org.apache.lucene.util.BytesRef; /** * Contains statistics for a specific term @@ -42,13 +42,13 @@ public class TermStatistics { } /** returns the number of documents this term occurs in - * @see AtomicReader#docFreq(String, BytesRef) */ + * @see TermsEnum#docFreq() */ public final long docFreq() { return docFreq; } /** returns the total number of occurrences of this term - * @see AtomicReader#totalTermFreq(String, BytesRef) */ + * @see TermsEnum#totalTermFreq() */ public final long totalTermFreq() { return totalTermFreq; } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestReuseDocsEnum.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestReuseDocsEnum.java index c799828c0ee..e5a0ae51ff0 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestReuseDocsEnum.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestReuseDocsEnum.java @@ -156,7 +156,15 @@ public class TestReuseDocsEnum extends LuceneTestCase { return null; } AtomicReader indexReader = readers.get(random().nextInt(readers.size())).reader(); - return indexReader.termDocsEnum(bits, field, term, random().nextBoolean() ? DocsEnum.FLAG_FREQS : 0); + Terms terms = indexReader.terms(field); + if (terms == null) { + return null; + } + TermsEnum iterator = terms.iterator(null); + if (iterator.seekExact(term, true)) { + return iterator.docs(bits, null, random().nextBoolean() ? 
DocsEnum.FLAG_FREQS : 0); + } + return null; } /** diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java index fda85001fd4..68d5d696c83 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java @@ -881,7 +881,7 @@ public class TestDocValuesIndexing extends LuceneTestCase { public int docId(AtomicReader reader, Term term) throws IOException { int docFreq = reader.docFreq(term); assertEquals(1, docFreq); - DocsEnum termDocsEnum = reader.termDocsEnum(null, term.field, term.bytes, 0); + DocsEnum termDocsEnum = reader.termDocsEnum(term); int nextDoc = termDocsEnum.nextDoc(); assertEquals(DocIdSetIterator.NO_MORE_DOCS, termDocsEnum.nextDoc()); return nextDoc; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocsAndPositions.java b/lucene/core/src/test/org/apache/lucene/index/TestDocsAndPositions.java index 0f92f0bf020..238d7591ca3 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocsAndPositions.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocsAndPositions.java @@ -92,7 +92,14 @@ public class TestDocsAndPositions extends LuceneTestCase { public DocsAndPositionsEnum getDocsAndPositions(AtomicReader reader, BytesRef bytes, Bits liveDocs) throws IOException { - return reader.termPositionsEnum(null, fieldName, bytes); + Terms terms = reader.terms(fieldName); + if (terms != null) { + TermsEnum te = terms.iterator(null); + if (te.seekExact(bytes, true)) { + return te.docsAndPositions(liveDocs, null); + } + } + return null; } /** @@ -352,7 +359,7 @@ public class TestDocsAndPositions extends LuceneTestCase { writer.addDocument(doc); DirectoryReader reader = writer.getReader(); AtomicReader r = getOnlySegmentReader(reader); - DocsAndPositionsEnum disi = r.termPositionsEnum(null, "foo", new BytesRef("bar")); + DocsAndPositionsEnum 
disi = r.termPositionsEnum(new Term("foo", "bar")); int docid = disi.docID(); assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java index eb9e1bb1657..203ea5c3938 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocumentWriter.java @@ -243,18 +243,18 @@ public class TestDocumentWriter extends LuceneTestCase { writer.close(); SegmentReader reader = new SegmentReader(info, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random())); - DocsAndPositionsEnum termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term1")); + DocsAndPositionsEnum termPositions = reader.termPositionsEnum(new Term("preanalyzed", "term1")); assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(1, termPositions.freq()); assertEquals(0, termPositions.nextPosition()); - termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term2")); + termPositions = reader.termPositionsEnum(new Term("preanalyzed", "term2")); assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(2, termPositions.freq()); assertEquals(1, termPositions.nextPosition()); assertEquals(3, termPositions.nextPosition()); - termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term3")); + termPositions = reader.termPositionsEnum(new Term("preanalyzed", "term3")); assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals(1, termPositions.freq()); assertEquals(2, termPositions.nextPosition()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java 
b/lucene/core/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java index 70338293dd2..42de0b6e414 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java @@ -84,10 +84,7 @@ public class TestMultiLevelSkipList extends LuceneTestCase { for (int i = 0; i < 2; i++) { counter = 0; - DocsAndPositionsEnum tp = reader.termPositionsEnum(reader.getLiveDocs(), - term.field(), - new BytesRef(term.text())); - + DocsAndPositionsEnum tp = reader.termPositionsEnum(term); checkSkipTo(tp, 14, 185); // no skips checkSkipTo(tp, 17, 190); // one skip on level 0 checkSkipTo(tp, 287, 200); // one skip on level 1, two on level 0 diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java b/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java index 2fbff1b5fdc..a53ad86b5b0 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java @@ -605,7 +605,7 @@ public class TestPayloads extends LuceneTestCase { writer.addDocument(doc); DirectoryReader reader = writer.getReader(); AtomicReader sr = SlowCompositeReaderWrapper.wrap(reader); - DocsAndPositionsEnum de = sr.termPositionsEnum(null, "field", new BytesRef("withPayload")); + DocsAndPositionsEnum de = sr.termPositionsEnum(new Term("field", "withPayload")); de.nextDoc(); de.nextPosition(); assertEquals(new BytesRef("test"), de.getPayload()); @@ -639,7 +639,7 @@ public class TestPayloads extends LuceneTestCase { writer.addDocument(doc); DirectoryReader reader = writer.getReader(); SegmentReader sr = getOnlySegmentReader(reader); - DocsAndPositionsEnum de = sr.termPositionsEnum(null, "field", new BytesRef("withPayload")); + DocsAndPositionsEnum de = sr.termPositionsEnum(new Term("field", "withPayload")); de.nextDoc(); de.nextPosition(); assertEquals(new BytesRef("test"), de.getPayload()); diff --git 
a/lucene/core/src/test/org/apache/lucene/search/TestPositionIncrement.java b/lucene/core/src/test/org/apache/lucene/search/TestPositionIncrement.java index 8a76d64b156..95bbcfb6a69 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPositionIncrement.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPositionIncrement.java @@ -212,9 +212,7 @@ public class TestPositionIncrement extends LuceneTestCase { final IndexReader readerFromWriter = writer.getReader(); AtomicReader r = SlowCompositeReaderWrapper.wrap(readerFromWriter); - DocsAndPositionsEnum tp = r.termPositionsEnum(r.getLiveDocs(), - "content", - new BytesRef("a")); + DocsAndPositionsEnum tp = r.termPositionsEnum(new Term("content", "a")); int count = 0; assertTrue(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); diff --git a/lucene/core/src/test/org/apache/lucene/store/TestNRTCachingDirectory.java b/lucene/core/src/test/org/apache/lucene/store/TestNRTCachingDirectory.java index 767212a46f5..6f6aa237e48 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestNRTCachingDirectory.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestNRTCachingDirectory.java @@ -95,7 +95,7 @@ public class TestNRTCachingDirectory extends LuceneTestCase { r = DirectoryReader.open(dir); for(BytesRef id : ids) { - assertEquals(1, r.docFreq("docid", id)); + assertEquals(1, r.docFreq(new Term("docid", id))); } r.close(); cachedDir.close(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java index 465c5f9fd95..a7763511e82 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java @@ -411,10 +411,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter { try { final BytesRef catTerm = new 
BytesRef(categoryPath.toString(delimiter)); for (AtomicReaderContext ctx : reader.leaves()) { - DocsEnum docs = ctx.reader().termDocsEnum(null, Consts.FULL, catTerm, 0); - if (docs != null) { - doc = docs.nextDoc() + ctx.docBase; - break; + Terms terms = ctx.reader().terms(Consts.FULL); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(null); + if (termsEnum.seekExact(catTerm, true)) { + // TODO: is it really ok that null is passed here as liveDocs? + DocsEnum docs = termsEnum.docs(null, null, 0); + doc = docs.nextDoc() + ctx.docBase; + } } } } finally { @@ -452,10 +456,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter { try { final BytesRef catTerm = new BytesRef(categoryPath.toString(delimiter, prefixLen)); for (AtomicReaderContext ctx : reader.leaves()) { - DocsEnum docs = ctx.reader().termDocsEnum(null, Consts.FULL, catTerm, 0); - if (docs != null) { - doc = docs.nextDoc() + ctx.docBase; - break; + Terms terms = ctx.reader().terms(Consts.FULL); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(null); + if (termsEnum.seekExact(catTerm, true)) { + // TODO: is it really ok that null is passed here as liveDocs? 
+ DocsEnum docs = termsEnum.docs(null, null, 0); + doc = docs.nextDoc() + ctx.docBase; + } } } } finally { diff --git a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java index 495f30ced00..52df5fc6f3d 100644 --- a/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java +++ b/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java @@ -864,7 +864,7 @@ public class TestGrouping extends LuceneTestCase { final boolean doCache = random().nextBoolean(); final boolean doAllGroups = random().nextBoolean(); if (VERBOSE) { - System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " dF=" + r.docFreq("content", new BytesRef(searchTerm)) +" dFBlock=" + rBlocks.docFreq("content", new BytesRef(searchTerm)) + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups + " getScores=" + getScores + " getMaxScores=" + getMaxScores); + System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " dF=" + r.docFreq(new Term("content", searchTerm)) +" dFBlock=" + rBlocks.docFreq(new Term("content", searchTerm)) + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups + " getScores=" + getScores + " getMaxScores=" + getMaxScores); } final AbstractFirstPassGroupingCollector c1 = createRandomFirstPassCollector("group", groupSort, groupOffset+topNGroups, canUseIDV); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java index 6de182e2a34..456b2e3861d 100644 --- 
a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java @@ -24,6 +24,7 @@ import java.util.Set; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; @@ -112,7 +113,7 @@ public class FieldTermStack { dpEnum.nextDoc(); // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html - final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 ); + final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( new Term(fieldName, text) ) + 1 ) ) + 1.0 ); final int freq = dpEnum.freq(); diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java b/lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java index 02aacfb0a16..68ac8106fd9 100644 --- a/lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java +++ b/lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java @@ -225,7 +225,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase { MemoryIndex memory = new MemoryIndex(true); memory.addField("foo", "bar", analyzer); AtomicReader reader = (AtomicReader) memory.createSearcher().getIndexReader(); - DocsAndPositionsEnum disi = reader.termPositionsEnum(null, "foo", new BytesRef("bar")); + DocsAndPositionsEnum disi = reader.termPositionsEnum(new Term("foo", "bar")); int docid = disi.docID(); assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS); assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); diff --git a/lucene/misc/src/java/org/apache/lucene/misc/GetTermInfo.java 
b/lucene/misc/src/java/org/apache/lucene/misc/GetTermInfo.java index 8aadc8da7d9..30ce3f4297e 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/GetTermInfo.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/GetTermInfo.java @@ -23,6 +23,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; /** * Utility to get document frequency and total number of occurrences (sum of the tf for each doc) of a term. @@ -44,14 +45,14 @@ public class GetTermInfo { System.exit(1); } - getTermInfo(dir,field, new BytesRef(inputStr)); + getTermInfo(dir,new Term(field, inputStr)); } - public static void getTermInfo(Directory dir, String field, BytesRef termtext) throws Exception { + public static void getTermInfo(Directory dir, Term term) throws Exception { IndexReader reader = DirectoryReader.open(dir); - long totalTF = HighFreqTerms.getTotalTermFreq(reader, field, termtext); + long totalTF = HighFreqTerms.getTotalTermFreq(reader, term); System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n", - field, termtext.utf8ToString(), totalTF, reader.docFreq(field, termtext)); + term.field(), term.text(), totalTF, reader.docFreq(term)); } private static void usage() { diff --git a/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java b/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java index 07c4fa8c256..9f4d68b9bdf 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Fields; import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.Terms; import 
org.apache.lucene.index.DocsEnum; @@ -167,7 +168,7 @@ public class HighFreqTerms { TermStats[] ts = new TermStats[terms.length]; // array for sorting long totalTF; for (int i = 0; i < terms.length; i++) { - totalTF = getTotalTermFreq(reader, terms[i].field, terms[i].termtext); + totalTF = getTotalTermFreq(reader, new Term(terms[i].field, terms[i].termtext)); ts[i] = new TermStats(terms[i].field, terms[i].termtext, terms[i].docFreq, totalTF); } @@ -177,24 +178,23 @@ public class HighFreqTerms { return ts; } - public static long getTotalTermFreq(IndexReader reader, final String field, final BytesRef termText) throws Exception { + public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception { long totalTF = 0L; for (final AtomicReaderContext ctx : reader.leaves()) { AtomicReader r = ctx.reader(); - Bits liveDocs = r.getLiveDocs(); - if (liveDocs == null) { + if (!r.hasDeletions()) { // TODO: we could do this up front, during the scan // (next()), instead of after-the-fact here w/ seek, // if the codec supports it and there are no del // docs... - final long totTF = r.totalTermFreq(field, termText); + final long totTF = r.totalTermFreq(term); if (totTF != -1) { totalTF += totTF; continue; } // otherwise we fall-through } // note: what should we do if field omits freqs? currently it counts as 1... 
- DocsEnum de = r.termDocsEnum(liveDocs, field, termText); + DocsEnum de = r.termDocsEnum(term); if (de != null) { while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) totalTF += de.freq(); diff --git a/lucene/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java b/lucene/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java index 2177e20eeac..8e36cc93a0e 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java @@ -26,6 +26,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -182,7 +183,7 @@ public class TestHighFreqTerms extends LuceneTestCase { String term ="highTF"; BytesRef termtext = new BytesRef (term); String field = "FIELD_1"; - long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, field, termtext); + long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext)); assertEquals("highTf tf should be 200",200,totalTermFreq); } @@ -191,7 +192,7 @@ public class TestHighFreqTerms extends LuceneTestCase { String term ="foobar"; BytesRef termtext = new BytesRef (term); String field = "FIELD_1"; - long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, field, termtext); + long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext)); assertEquals("totalTermFreq should be 0 for term not in index",0,totalTermFreq); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/JoinDocFreqValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/JoinDocFreqValueSource.java index 1d8a8b181c8..24b6c10854e 100644 --- 
a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/JoinDocFreqValueSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/JoinDocFreqValueSource.java @@ -22,7 +22,10 @@ import java.util.Map; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queries.function.FunctionValues; import org.apache.lucene.queries.function.docvalues.IntDocValues; import org.apache.lucene.search.FieldCache.DocTerms; @@ -55,6 +58,8 @@ public class JoinDocFreqValueSource extends FieldCacheSource { { final DocTerms terms = cache.getTerms(readerContext.reader(), field, PackedInts.FAST); final IndexReader top = ReaderUtil.getTopLevelContext(readerContext).reader(); + Terms t = MultiFields.getTerms(top, qfield); + final TermsEnum termsEnum = t == null ? 
TermsEnum.EMPTY : t.iterator(null); return new IntDocValues(this) { final BytesRef ref = new BytesRef(); @@ -64,8 +69,11 @@ public class JoinDocFreqValueSource extends FieldCacheSource { { try { terms.getTerm(doc, ref); - //System.out.println( NAME+"["+field+"="+ref.utf8ToString()+"=("+qfield+":"+v+")]" ); - return top.docFreq( qfield, ref ); + if (termsEnum.seekExact(ref, true)) { + return termsEnum.docFreq(); + } else { + return 0; + } } catch (IOException e) { throw new RuntimeException("caught exception in function "+description()+" : doc="+doc, e); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TotalTermFreqValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TotalTermFreqValueSource.java index 65b2abd1af5..90ae4e18625 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TotalTermFreqValueSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/TotalTermFreqValueSource.java @@ -18,6 +18,7 @@ package org.apache.lucene.queries.function.valuesource; import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.Term; import org.apache.lucene.queries.function.FunctionValues; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.docvalues.LongDocValues; @@ -65,7 +66,7 @@ public class TotalTermFreqValueSource extends ValueSource { public void createWeight(Map context, IndexSearcher searcher) throws IOException { long totalTermFreq = 0; for (AtomicReaderContext readerContext : searcher.getTopReaderContext().leaves()) { - long val = readerContext.reader().totalTermFreq(indexedField, indexedBytes); + long val = readerContext.reader().totalTermFreq(new Term(indexedField, indexedBytes)); if (val == -1) { totalTermFreq = -1; break; diff --git a/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java 
b/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java index bb737fb39d6..cf47ca62e7c 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java @@ -33,6 +33,7 @@ import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.PriorityQueue; @@ -344,7 +345,7 @@ public class LukeRequestHandler extends RequestHandlerBase if(sfield != null && sfield.indexed() ) { // In the pre-4.0 days, this did a veeeery expensive range query. But we can be much faster now, // so just do this all the time. - StoredDocument doc = getFirstLiveDoc(reader, fieldName, terms); + StoredDocument doc = getFirstLiveDoc(terms, reader); if( doc != null ) { @@ -378,7 +379,7 @@ public class LukeRequestHandler extends RequestHandlerBase // Just get a document with the term in it, the first one will do! // Is there a better way to do this? Shouldn't actually be very costly // to do it this way. - private static StoredDocument getFirstLiveDoc(AtomicReader reader, String fieldName, Terms terms) throws IOException { + private static StoredDocument getFirstLiveDoc(Terms terms, AtomicReader reader) throws IOException { DocsEnum docsEnum = null; TermsEnum termsEnum = terms.iterator(null); BytesRef text; @@ -388,16 +389,9 @@ public class LukeRequestHandler extends RequestHandlerBase if (text == null) { // Ran off the end of the terms enum without finding any live docs with that field in them. 
return null; } - Term term = new Term(fieldName, text); - docsEnum = reader.termDocsEnum(reader.getLiveDocs(), - term.field(), - new BytesRef(term.text()), - 0); - if (docsEnum != null) { - int docId; - if ((docId = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - return reader.document(docId); - } + docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0); + if (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + return reader.document(docsEnum.docID()); } } return null; diff --git a/solr/core/src/java/org/apache/solr/handler/component/TermVectorComponent.java b/solr/core/src/java/org/apache/solr/handler/component/TermVectorComponent.java index e9550aebb9a..f1e92d73dd8 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/TermVectorComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/TermVectorComponent.java @@ -18,6 +18,7 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; @@ -381,13 +382,19 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar } } } - - if (fieldOptions.docFreq) { - termInfo.add("df", getDocFreq(reader, field, text)); + + int df = 0; + if (fieldOptions.docFreq || fieldOptions.tfIdf) { + df = reader.docFreq(new Term(field, text)); } + if (fieldOptions.docFreq) { + termInfo.add("df", df); + } + + // TODO: this is not TF/IDF by anyone's definition! 
if (fieldOptions.tfIdf) { - double tfIdfVal = ((double) freq) / getDocFreq(reader, field, text); + double tfIdfVal = ((double) freq) / df; termInfo.add("tf-idf", tfIdfVal); } } @@ -408,16 +415,6 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar return result; } - private static int getDocFreq(IndexReader reader, String field, BytesRef term) { - int result = 1; - try { - result = reader.docFreq(field, term); - } catch (IOException e) { - throw new RuntimeException(e); - } - return result; - } - @Override public void prepare(ResponseBuilder rb) throws IOException { diff --git a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java index e51d43bf4a3..5f8294c9dad 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java +++ b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java @@ -615,19 +615,18 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrIn final AtomicReaderContext leaf = leafContexts.get(i); final AtomicReader reader = leaf.reader(); - final Fields fields = reader.fields(); - if (fields == null) continue; - - final Bits liveDocs = reader.getLiveDocs(); + final Terms terms = reader.terms(field); + if (terms == null) continue; - final DocsEnum docs = reader.termDocsEnum(liveDocs, field, idBytes, 0); + TermsEnum te = terms.iterator(null); + if (te.seekExact(idBytes, true)) { + DocsEnum docs = te.docs(reader.getLiveDocs(), null, 0); + int id = docs.nextDoc(); + if (id == DocIdSetIterator.NO_MORE_DOCS) continue; + assert docs.nextDoc() == DocIdSetIterator.NO_MORE_DOCS; - if (docs == null) continue; - int id = docs.nextDoc(); - if (id == DocIdSetIterator.NO_MORE_DOCS) continue; - assert docs.nextDoc() == DocIdSetIterator.NO_MORE_DOCS; - - return (((long)i) << 32) | id; + return (((long)i) << 32) | id; + } } return -1;