mirror of https://github.com/apache/lucene.git
LUCENE-4355: improve AtomicReader sugar apis
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1384274 13f79535-47bb-0310-9956-ffa450edef68
parent bc425d77bc
commit dc02cdd384
@@ -103,6 +103,12 @@ API Changes
   Use DataOutput.copyBytes(DataInput, long) instead.
   (Mike McCandless, Robert Muir)
 
+* LUCENE-4355: Simplify AtomicReader's sugar methods such as termDocsEnum,
+  termPositionsEnum, docFreq, and totalTermFreq to only take Term as a
+  parameter. If you want to do expert things such as pass a different
+  Bits as liveDocs, then use the flex apis (fields(), terms(), etc) directly.
+  (Mike McCandless, Robert Muir)
+
 Bug Fixes
 
 * LUCENE-4297: BooleanScorer2 would multiply the coord() factor
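By way of illustration, an editor's sketch (not part of the commit) of what the simplified sugar calls look like, next to the flex-API path the entry above points to for expert use. The reader, field, and term names are invented.

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;

class SugarApiSketch {
  // Common case after LUCENE-4355: a single Term argument, the reader's own
  // liveDocs, freqs included by default.
  static long sumFreqs(AtomicReader reader, Term term) throws IOException {
    long total = 0;
    DocsEnum de = reader.termDocsEnum(term); // null if field or term is absent
    if (de != null) {
      while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        total += de.freq();
      }
    }
    return total;
  }

  // Expert case: pass different liveDocs (or skip freqs) via the flex APIs.
  static DocsEnum docsWithCustomBits(AtomicReader reader, Term term, Bits liveDocs) throws IOException {
    Terms terms = reader.terms(term.field());
    if (terms == null) {
      return null;
    }
    TermsEnum termsEnum = terms.iterator(null);
    if (termsEnum.seekExact(term.bytes(), true)) {
      return termsEnum.docs(liveDocs, null, 0); // flags=0: freqs not required
    }
    return null;
  }
}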
@@ -121,7 +121,7 @@ public class SimpleNaiveBayesClassifier implements Classifier {
     Terms terms = MultiFields.getTerms(atomicReader, textFieldName);
     long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
     double avgNumberOfUniqueTerms = numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
-    int docsWithC = atomicReader.docFreq(classFieldName, new BytesRef(c));
+    int docsWithC = atomicReader.docFreq(new Term(classFieldName, c));
     return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text field per doc * # docs with c
   }
 
@@ -21,7 +21,6 @@ import java.io.IOException;
 
 import org.apache.lucene.search.SearcherManager; // javadocs
 import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
 
 /** {@code AtomicReader} is an abstract class, providing an interface for accessing an
  index. Search of an index is done entirely through this abstract interface,
@@ -67,17 +66,17 @@ public abstract class AtomicReader extends IndexReader {
   public abstract Fields fields() throws IOException;
 
   @Override
-  public final int docFreq(String field, BytesRef term) throws IOException {
+  public final int docFreq(Term term) throws IOException {
     final Fields fields = fields();
     if (fields == null) {
       return 0;
     }
-    final Terms terms = fields.terms(field);
+    final Terms terms = fields.terms(term.field());
     if (terms == null) {
       return 0;
     }
     final TermsEnum termsEnum = terms.iterator(null);
-    if (termsEnum.seekExact(term, true)) {
+    if (termsEnum.seekExact(term.bytes(), true)) {
       return termsEnum.docFreq();
     } else {
       return 0;
@@ -89,17 +88,17 @@ public abstract class AtomicReader extends IndexReader {
    * field does not exists. This method does not take into
    * account deleted documents that have not yet been merged
    * away. */
-  public final long totalTermFreq(String field, BytesRef term) throws IOException {
+  public final long totalTermFreq(Term term) throws IOException {
     final Fields fields = fields();
     if (fields == null) {
       return 0;
     }
-    final Terms terms = fields.terms(field);
+    final Terms terms = fields.terms(term.field());
     if (terms == null) {
       return 0;
     }
     final TermsEnum termsEnum = terms.iterator(null);
-    if (termsEnum.seekExact(term, true)) {
+    if (termsEnum.seekExact(term.bytes(), true)) {
       return termsEnum.totalTermFreq();
     } else {
       return 0;
@@ -115,29 +114,20 @@ public abstract class AtomicReader extends IndexReader {
     return fields.terms(field);
   }
 
-  /** Returns {@link DocsEnum} for the specified field &
-   * term. This will return null if either the field or
-   * term does not exist. */
-  public final DocsEnum termDocsEnum(Bits liveDocs, String field, BytesRef term) throws IOException {
-    return termDocsEnum(liveDocs, field, term, DocsEnum.FLAG_FREQS);
-  }
-
-  /** Returns {@link DocsEnum} for the specified field &
-   * term, with control over whether freqs are required.
-   * Some codecs may be able to optimize their
-   * implementation when freqs are not required. This will
-   * return null if the field or term does not
-   * exist. See {@link TermsEnum#docs(Bits,DocsEnum,int)}. */
-  public final DocsEnum termDocsEnum(Bits liveDocs, String field, BytesRef term, int flags) throws IOException {
-    assert field != null;
-    assert term != null;
+  /** Returns {@link DocsEnum} for the specified term.
+   * This will return null if either the field or
+   * term does not exist.
+   * @see TermsEnum#docs(Bits, DocsEnum) */
+  public final DocsEnum termDocsEnum(Term term) throws IOException {
+    assert term.field() != null;
+    assert term.bytes() != null;
     final Fields fields = fields();
     if (fields != null) {
-      final Terms terms = fields.terms(field);
+      final Terms terms = fields.terms(term.field());
       if (terms != null) {
         final TermsEnum termsEnum = terms.iterator(null);
-        if (termsEnum.seekExact(term, true)) {
-          return termsEnum.docs(liveDocs, null, flags);
+        if (termsEnum.seekExact(term.bytes(), true)) {
+          return termsEnum.docs(getLiveDocs(), null);
         }
       }
     }
@@ -145,31 +135,19 @@ public abstract class AtomicReader extends IndexReader {
   }
 
-  /** Returns {@link DocsAndPositionsEnum} for the specified
-   * field & term. This will return null if the
-   * field or term does not exist or positions weren't indexed.
-   * @see #termPositionsEnum(Bits, String, BytesRef, int) */
-  public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term) throws IOException {
-    return termPositionsEnum(liveDocs, field, term, DocsAndPositionsEnum.FLAG_OFFSETS | DocsAndPositionsEnum.FLAG_PAYLOADS);
-  }
-
-
-  /** Returns {@link DocsAndPositionsEnum} for the specified
-   * field & term, with control over whether offsets and payloads are
-   * required. Some codecs may be able to optimize their
-   * implementation when offsets and/or payloads are not required.
-   * This will return null if the field or term
-   * does not exist or positions weren't indexed. See
-   * {@link TermsEnum#docsAndPositions(Bits,DocsAndPositionsEnum,int)}. */
-  public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, int flags) throws IOException {
-    assert field != null;
-    assert term != null;
+  /** Returns {@link DocsAndPositionsEnum} for the specified
+   * term. This will return null if the
+   * field or term does not exist or positions weren't indexed.
+   * @see TermsEnum#docsAndPositions(Bits, DocsAndPositionsEnum) */
+  public final DocsAndPositionsEnum termPositionsEnum(Term term) throws IOException {
+    assert term.field() != null;
+    assert term.bytes() != null;
     final Fields fields = fields();
     if (fields != null) {
-      final Terms terms = fields.terms(field);
+      final Terms terms = fields.terms(term.field());
       if (terms != null) {
         final TermsEnum termsEnum = terms.iterator(null);
-        if (termsEnum.seekExact(term, true)) {
-          return termsEnum.docsAndPositions(liveDocs, null, flags);
+        if (termsEnum.seekExact(term.bytes(), true)) {
+          return termsEnum.docsAndPositions(getLiveDocs(), null);
         }
       }
     }
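For reference, a minimal sketch (not part of the commit) of how the new single-Term positions method is typically consumed; the field and term values are invented.

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;

class PositionsSketch {
  static void walkPositions(AtomicReader reader) throws IOException {
    // May return null: missing field, missing term, or positions not indexed.
    DocsAndPositionsEnum dpe = reader.termPositionsEnum(new Term("body", "lucene"));
    if (dpe == null) {
      return;
    }
    while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      int freq = dpe.freq();
      for (int i = 0; i < freq; i++) {
        int position = dpe.nextPosition(); // position of this occurrence within the doc
        System.out.println(dpe.docID() + ":" + position);
      }
    }
  }
}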
@@ -22,8 +22,6 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 
-import org.apache.lucene.util.BytesRef;
-
 /** Base class for implementing {@link CompositeReader}s based on an array
  * of sub-readers. The implementing class has to add code for
  * correctly refcounting and closing the sub-readers.
@@ -125,11 +123,25 @@ public abstract class BaseCompositeReader<R extends IndexReader> extends CompositeReader {
   }
 
   @Override
-  public final int docFreq(String field, BytesRef t) throws IOException {
+  public final int docFreq(Term term) throws IOException {
     ensureOpen();
     int total = 0; // sum freqs in subreaders
     for (int i = 0; i < subReaders.length; i++) {
-      total += subReaders[i].docFreq(field, t);
+      total += subReaders[i].docFreq(term);
     }
     return total;
   }
+
+  @Override
+  public final long totalTermFreq(Term term) throws IOException {
+    ensureOpen();
+    long total = 0; // sum freqs in subreaders
+    for (int i = 0; i < subReaders.length; i++) {
+      long sub = subReaders[i].totalTermFreq(term);
+      if (sub == -1) {
+        return -1;
+      }
+      total += sub;
+    }
+    return total;
+  }
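Callers of the composite implementation above should be prepared for the -1 sentinel, which propagates up as soon as any sub-reader's codec omits the statistic. A small sketch under assumed names:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

class TotalTermFreqSketch {
  static String describe(IndexReader reader, Term term) throws IOException {
    long ttf = reader.totalTermFreq(term); // -1 if any sub-reader's codec omits the stat
    int df = reader.docFreq(term);
    if (ttf == -1) {
      return term + ": df=" + df + ", totalTermFreq unsupported by codec";
    }
    return term + ": df=" + df + ", ttf=" + ttf;
  }
}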
@@ -31,7 +31,6 @@ import org.apache.lucene.document.DocumentStoredFieldVisitor;
 import org.apache.lucene.search.SearcherManager; // javadocs
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
 
 /** IndexReader is an abstract class, providing an interface for accessing an
  index. Search of an index is done entirely through this abstract interface,
@@ -432,15 +431,17 @@ public abstract class IndexReader implements Closeable {
    * <code>term</code>. This method returns 0 if the term or
    * field does not exists. This method does not take into
    * account deleted documents that have not yet been merged
-   * away. */
-  public final int docFreq(Term term) throws IOException {
-    return docFreq(term.field(), term.bytes());
-  }
+   * away.
+   * @see TermsEnum#docFreq()
+   */
+  public abstract int docFreq(Term term) throws IOException;
 
-  /** Returns the number of documents containing the
+  /** Returns the number of documents containing the term
    * <code>term</code>. This method returns 0 if the term or
-   * field does not exists. This method does not take into
-   * account deleted documents that have not yet been merged
-   * away. */
-  public abstract int docFreq(String field, BytesRef term) throws IOException;
+   * field does not exists, or -1 if the Codec does not support
+   * the measure. This method does not take into account deleted
+   * documents that have not yet been merged away.
+   * @see TermsEnum#totalTermFreq()
+   */
+  public abstract long totalTermFreq(Term term) throws IOException;
 }
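With both statistics now abstract on IndexReader, the same call works identically on atomic and composite readers. A usage sketch (the index path and field are assumptions):

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

class DocFreqSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index")); // assumed path
    IndexReader reader = DirectoryReader.open(dir);
    try {
      // Works whether reader is a single segment or a composite of many.
      int df = reader.docFreq(new Term("title", "lucene"));
      System.out.println("docFreq=" + df);
    } finally {
      reader.close();
      dir.close();
    }
  }
}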
@@ -260,7 +260,7 @@ public class PhraseQuery extends Query {
       final Term t = terms.get(i);
       final TermState state = states[i].get(context.ord);
       if (state == null) { /* term doesnt exist in this segment */
-        assert termNotInReader(reader, field, t.bytes()): "no termstate found but term exists in reader";
+        assert termNotInReader(reader, t): "no termstate found but term exists in reader";
         return null;
       }
       te.seekExact(t.bytes(), state);
@@ -295,8 +295,8 @@ public class PhraseQuery extends Query {
   }
 
   // only called from assert
-  private boolean termNotInReader(AtomicReader reader, String field, BytesRef bytes) throws IOException {
-    return reader.docFreq(field, bytes) == 0;
+  private boolean termNotInReader(AtomicReader reader, Term term) throws IOException {
+    return reader.docFreq(term) == 0;
   }
 
   @Override
@@ -95,7 +95,7 @@ public class TermQuery extends Query {
   private TermsEnum getTermsEnum(AtomicReaderContext context) throws IOException {
     final TermState state = termStates.get(context.ord);
     if (state == null) { // term is not present in that reader
-      assert termNotInReader(context.reader(), term.field(), term.bytes()) : "no termstate found but term exists in reader term=" + term;
+      assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
       return null;
     }
     //System.out.println("LD=" + reader.getLiveDocs() + " set?=" + (reader.getLiveDocs() != null ? reader.getLiveDocs().get(0) : "null"));
@@ -104,10 +104,10 @@ public class TermQuery extends Query {
     return termsEnum;
   }
 
-  private boolean termNotInReader(AtomicReader reader, String field, BytesRef bytes) throws IOException {
+  private boolean termNotInReader(AtomicReader reader, Term term) throws IOException {
     // only called from assert
     //System.out.println("TQ.termNotInReader reader=" + reader + " term=" + field + ":" + bytes.utf8ToString());
-    return reader.docFreq(field, bytes) == 0;
+    return reader.docFreq(term) == 0;
   }
 
   @Override
@@ -17,7 +17,7 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
-import org.apache.lucene.index.AtomicReader; // javadocs
+import org.apache.lucene.index.TermsEnum; // javadocs
 import org.apache.lucene.util.BytesRef;
 /**
  * Contains statistics for a specific term
@@ -42,13 +42,13 @@ public class TermStatistics {
   }
 
   /** returns the number of documents this term occurs in
-   * @see AtomicReader#docFreq(String, BytesRef) */
+   * @see TermsEnum#docFreq() */
   public final long docFreq() {
     return docFreq;
   }
 
   /** returns the total number of occurrences of this term
-   * @see AtomicReader#totalTermFreq(String, BytesRef) */
+   * @see TermsEnum#totalTermFreq() */
   public final long totalTermFreq() {
     return totalTermFreq;
   }
@@ -156,7 +156,15 @@ public class TestReuseDocsEnum extends LuceneTestCase {
       return null;
     }
     AtomicReader indexReader = readers.get(random().nextInt(readers.size())).reader();
-    return indexReader.termDocsEnum(bits, field, term, random().nextBoolean() ? DocsEnum.FLAG_FREQS : 0);
+    Terms terms = indexReader.terms(field);
+    if (terms == null) {
+      return null;
+    }
+    TermsEnum iterator = terms.iterator(null);
+    if (iterator.seekExact(term, true)) {
+      return iterator.docs(bits, null, random().nextBoolean() ? DocsEnum.FLAG_FREQS : 0);
+    }
+    return null;
   }
 
   /**
@@ -881,7 +881,7 @@ public class TestDocValuesIndexing extends LuceneTestCase {
   public int docId(AtomicReader reader, Term term) throws IOException {
     int docFreq = reader.docFreq(term);
     assertEquals(1, docFreq);
-    DocsEnum termDocsEnum = reader.termDocsEnum(null, term.field, term.bytes, 0);
+    DocsEnum termDocsEnum = reader.termDocsEnum(term);
     int nextDoc = termDocsEnum.nextDoc();
     assertEquals(DocIdSetIterator.NO_MORE_DOCS, termDocsEnum.nextDoc());
     return nextDoc;
@@ -92,7 +92,14 @@ public class TestDocsAndPositions extends LuceneTestCase {
 
   public DocsAndPositionsEnum getDocsAndPositions(AtomicReader reader,
       BytesRef bytes, Bits liveDocs) throws IOException {
-    return reader.termPositionsEnum(null, fieldName, bytes);
+    Terms terms = reader.terms(fieldName);
+    if (terms != null) {
+      TermsEnum te = terms.iterator(null);
+      if (te.seekExact(bytes, true)) {
+        return te.docsAndPositions(liveDocs, null);
+      }
+    }
+    return null;
   }
 
   /**
@@ -352,7 +359,7 @@ public class TestDocsAndPositions extends LuceneTestCase {
     writer.addDocument(doc);
     DirectoryReader reader = writer.getReader();
     AtomicReader r = getOnlySegmentReader(reader);
-    DocsAndPositionsEnum disi = r.termPositionsEnum(null, "foo", new BytesRef("bar"));
+    DocsAndPositionsEnum disi = r.termPositionsEnum(new Term("foo", "bar"));
     int docid = disi.docID();
     assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
@@ -243,18 +243,18 @@ public class TestDocumentWriter extends LuceneTestCase {
     writer.close();
     SegmentReader reader = new SegmentReader(info, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random()));
 
-    DocsAndPositionsEnum termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term1"));
+    DocsAndPositionsEnum termPositions = reader.termPositionsEnum(new Term("preanalyzed", "term1"));
     assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
     assertEquals(1, termPositions.freq());
     assertEquals(0, termPositions.nextPosition());
 
-    termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term2"));
+    termPositions = reader.termPositionsEnum(new Term("preanalyzed", "term2"));
     assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
     assertEquals(2, termPositions.freq());
     assertEquals(1, termPositions.nextPosition());
     assertEquals(3, termPositions.nextPosition());
 
-    termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term3"));
+    termPositions = reader.termPositionsEnum(new Term("preanalyzed", "term3"));
     assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
     assertEquals(1, termPositions.freq());
     assertEquals(2, termPositions.nextPosition());
@@ -84,10 +84,7 @@ public class TestMultiLevelSkipList extends LuceneTestCase {
 
     for (int i = 0; i < 2; i++) {
       counter = 0;
-      DocsAndPositionsEnum tp = reader.termPositionsEnum(reader.getLiveDocs(),
-                                                         term.field(),
-                                                         new BytesRef(term.text()));
-
+      DocsAndPositionsEnum tp = reader.termPositionsEnum(term);
       checkSkipTo(tp, 14, 185); // no skips
       checkSkipTo(tp, 17, 190); // one skip on level 0
       checkSkipTo(tp, 287, 200); // one skip on level 1, two on level 0
@@ -605,7 +605,7 @@ public class TestPayloads extends LuceneTestCase {
     writer.addDocument(doc);
     DirectoryReader reader = writer.getReader();
     AtomicReader sr = SlowCompositeReaderWrapper.wrap(reader);
-    DocsAndPositionsEnum de = sr.termPositionsEnum(null, "field", new BytesRef("withPayload"));
+    DocsAndPositionsEnum de = sr.termPositionsEnum(new Term("field", "withPayload"));
     de.nextDoc();
     de.nextPosition();
     assertEquals(new BytesRef("test"), de.getPayload());
@@ -639,7 +639,7 @@ public class TestPayloads extends LuceneTestCase {
     writer.addDocument(doc);
     DirectoryReader reader = writer.getReader();
     SegmentReader sr = getOnlySegmentReader(reader);
-    DocsAndPositionsEnum de = sr.termPositionsEnum(null, "field", new BytesRef("withPayload"));
+    DocsAndPositionsEnum de = sr.termPositionsEnum(new Term("field", "withPayload"));
     de.nextDoc();
     de.nextPosition();
     assertEquals(new BytesRef("test"), de.getPayload());
@@ -212,9 +212,7 @@ public class TestPositionIncrement extends LuceneTestCase {
     final IndexReader readerFromWriter = writer.getReader();
     AtomicReader r = SlowCompositeReaderWrapper.wrap(readerFromWriter);
 
-    DocsAndPositionsEnum tp = r.termPositionsEnum(r.getLiveDocs(),
-                                                  "content",
-                                                  new BytesRef("a"));
+    DocsAndPositionsEnum tp = r.termPositionsEnum(new Term("content", "a"));
 
     int count = 0;
     assertTrue(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
@@ -95,7 +95,7 @@ public class TestNRTCachingDirectory extends LuceneTestCase {
 
     r = DirectoryReader.open(dir);
     for(BytesRef id : ids) {
-      assertEquals(1, r.docFreq("docid", id));
+      assertEquals(1, r.docFreq(new Term("docid", id)));
     }
     r.close();
     cachedDir.close();
@@ -411,10 +411,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     try {
       final BytesRef catTerm = new BytesRef(categoryPath.toString(delimiter));
       for (AtomicReaderContext ctx : reader.leaves()) {
-        DocsEnum docs = ctx.reader().termDocsEnum(null, Consts.FULL, catTerm, 0);
-        if (docs != null) {
+        Terms terms = ctx.reader().terms(Consts.FULL);
+        if (terms != null) {
+          TermsEnum termsEnum = terms.iterator(null);
+          if (termsEnum.seekExact(catTerm, true)) {
+            // TODO: is it really ok that null is passed here as liveDocs?
+            DocsEnum docs = termsEnum.docs(null, null, 0);
             doc = docs.nextDoc() + ctx.docBase;
             break;
           }
+        }
       }
     } finally {
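The pattern above (terms() -> iterator() -> seekExact() -> docs()) is the general replacement for the removed null-liveDocs termDocsEnum overload. As a standalone sketch, with names invented for illustration:

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

class FirstDocSketch {
  // Returns the first docid containing (field, text), or -1 if none.
  // Deliberately ignores liveDocs, like the taxonomy lookup above.
  static int firstDoc(AtomicReader reader, String field, BytesRef text) throws IOException {
    Terms terms = reader.terms(field);
    if (terms == null) {
      return -1;
    }
    TermsEnum termsEnum = terms.iterator(null);
    if (!termsEnum.seekExact(text, true)) {
      return -1;
    }
    DocsEnum docs = termsEnum.docs(null, null, 0); // null liveDocs: deleted docs included
    int doc = docs.nextDoc();
    return doc == DocIdSetIterator.NO_MORE_DOCS ? -1 : doc;
  }
}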
@@ -452,10 +456,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     try {
       final BytesRef catTerm = new BytesRef(categoryPath.toString(delimiter, prefixLen));
       for (AtomicReaderContext ctx : reader.leaves()) {
-        DocsEnum docs = ctx.reader().termDocsEnum(null, Consts.FULL, catTerm, 0);
-        if (docs != null) {
+        Terms terms = ctx.reader().terms(Consts.FULL);
+        if (terms != null) {
+          TermsEnum termsEnum = terms.iterator(null);
+          if (termsEnum.seekExact(catTerm, true)) {
+            // TODO: is it really ok that null is passed here as liveDocs?
+            DocsEnum docs = termsEnum.docs(null, null, 0);
             doc = docs.nextDoc() + ctx.docBase;
             break;
           }
+        }
       }
     } finally {
@@ -864,7 +864,7 @@ public class TestGrouping extends LuceneTestCase {
     final boolean doCache = random().nextBoolean();
     final boolean doAllGroups = random().nextBoolean();
     if (VERBOSE) {
-      System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " dF=" + r.docFreq("content", new BytesRef(searchTerm)) +" dFBlock=" + rBlocks.docFreq("content", new BytesRef(searchTerm)) + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups + " getScores=" + getScores + " getMaxScores=" + getMaxScores);
+      System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " dF=" + r.docFreq(new Term("content", searchTerm)) +" dFBlock=" + rBlocks.docFreq(new Term("content", searchTerm)) + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups + " getScores=" + getScores + " getMaxScores=" + getMaxScores);
     }
 
     final AbstractFirstPassGroupingCollector<?> c1 = createRandomFirstPassCollector("group", groupSort, groupOffset+topNGroups, canUseIDV);
@@ -24,6 +24,7 @@ import java.util.Set;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
@@ -112,7 +113,7 @@ public class FieldTermStack {
       dpEnum.nextDoc();
 
       // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
-      final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
+      final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( new Term(fieldName, text) ) + 1 ) ) + 1.0 );
 
       final int freq = dpEnum.freq();
@@ -225,7 +225,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
     MemoryIndex memory = new MemoryIndex(true);
     memory.addField("foo", "bar", analyzer);
     AtomicReader reader = (AtomicReader) memory.createSearcher().getIndexReader();
-    DocsAndPositionsEnum disi = reader.termPositionsEnum(null, "foo", new BytesRef("bar"));
+    DocsAndPositionsEnum disi = reader.termPositionsEnum(new Term("foo", "bar"));
     int docid = disi.docID();
     assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
@@ -23,6 +23,7 @@ import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
 
 /**
  * Utility to get document frequency and total number of occurrences (sum of the tf for each doc) of a term.
@@ -44,14 +45,14 @@ public class GetTermInfo {
       System.exit(1);
     }
 
-    getTermInfo(dir,field, new BytesRef(inputStr));
+    getTermInfo(dir,new Term(field, inputStr));
   }
 
-  public static void getTermInfo(Directory dir, String field, BytesRef termtext) throws Exception {
+  public static void getTermInfo(Directory dir, Term term) throws Exception {
     IndexReader reader = DirectoryReader.open(dir);
-    long totalTF = HighFreqTerms.getTotalTermFreq(reader, field, termtext);
+    long totalTF = HighFreqTerms.getTotalTermFreq(reader, term);
     System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
-      field, termtext.utf8ToString(), totalTF, reader.docFreq(field, termtext));
+      term.field(), term.text(), totalTF, reader.docFreq(term));
   }
 
   private static void usage() {
@@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.DocsEnum;
@@ -167,7 +168,7 @@ public class HighFreqTerms {
     TermStats[] ts = new TermStats[terms.length]; // array for sorting
     long totalTF;
     for (int i = 0; i < terms.length; i++) {
-      totalTF = getTotalTermFreq(reader, terms[i].field, terms[i].termtext);
+      totalTF = getTotalTermFreq(reader, new Term(terms[i].field, terms[i].termtext));
       ts[i] = new TermStats(terms[i].field, terms[i].termtext, terms[i].docFreq, totalTF);
     }
 
@@ -177,24 +178,23 @@ public class HighFreqTerms {
     return ts;
   }
 
-  public static long getTotalTermFreq(IndexReader reader, final String field, final BytesRef termText) throws Exception {
+  public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception {
     long totalTF = 0L;
     for (final AtomicReaderContext ctx : reader.leaves()) {
       AtomicReader r = ctx.reader();
-      Bits liveDocs = r.getLiveDocs();
-      if (liveDocs == null) {
+      if (!r.hasDeletions()) {
         // TODO: we could do this up front, during the scan
         // (next()), instead of after-the-fact here w/ seek,
        // if the codec supports it and there are no del
        // docs...
-        final long totTF = r.totalTermFreq(field, termText);
+        final long totTF = r.totalTermFreq(term);
         if (totTF != -1) {
           totalTF += totTF;
           continue;
         } // otherwise we fall-through
       }
       // note: what should we do if field omits freqs? currently it counts as 1...
-      DocsEnum de = r.termDocsEnum(liveDocs, field, termText);
+      DocsEnum de = r.termDocsEnum(term);
       if (de != null) {
         while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)
           totalTF += de.freq();
@@ -26,6 +26,7 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
@@ -182,7 +183,7 @@ public class TestHighFreqTerms extends LuceneTestCase {
     String term ="highTF";
     BytesRef termtext = new BytesRef (term);
     String field = "FIELD_1";
-    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, field, termtext);
+    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext));
     assertEquals("highTf tf should be 200",200,totalTermFreq);
 
   }
@@ -191,7 +192,7 @@ public class TestHighFreqTerms extends LuceneTestCase {
     String term ="foobar";
     BytesRef termtext = new BytesRef (term);
     String field = "FIELD_1";
-    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, field, termtext);
+    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext));
     assertEquals("totalTermFreq should be 0 for term not in index",0,totalTermFreq);
 
   }
@@ -22,7 +22,10 @@ import java.util.Map;
 
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.queries.function.FunctionValues;
 import org.apache.lucene.queries.function.docvalues.IntDocValues;
 import org.apache.lucene.search.FieldCache.DocTerms;
@@ -55,6 +58,8 @@ public class JoinDocFreqValueSource extends FieldCacheSource {
   {
     final DocTerms terms = cache.getTerms(readerContext.reader(), field, PackedInts.FAST);
     final IndexReader top = ReaderUtil.getTopLevelContext(readerContext).reader();
+    Terms t = MultiFields.getTerms(top, qfield);
+    final TermsEnum termsEnum = t == null ? TermsEnum.EMPTY : t.iterator(null);
 
     return new IntDocValues(this) {
       final BytesRef ref = new BytesRef();
@@ -64,8 +69,11 @@ public class JoinDocFreqValueSource extends FieldCacheSource {
       {
         try {
           terms.getTerm(doc, ref);
-          //System.out.println( NAME+"["+field+"="+ref.utf8ToString()+"=("+qfield+":"+v+")]" );
-          return top.docFreq( qfield, ref );
+          if (termsEnum.seekExact(ref, true)) {
+            return termsEnum.docFreq();
+          } else {
+            return 0;
+          }
         }
         catch (IOException e) {
           throw new RuntimeException("caught exception in function "+description()+" : doc="+doc, e);
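The point of the rewrite above is that one TermsEnum is created up front and reused for every per-document lookup, instead of a top-level docFreq walk per document. A rough sketch of the same reuse pattern (names assumed):

import java.io.IOException;
import java.util.List;

import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class BatchDocFreqSketch {
  // Resolve docFreq for many terms against a single reused TermsEnum.
  static int[] docFreqs(TermsEnum termsEnum, List<BytesRef> terms) throws IOException {
    int[] result = new int[terms.size()];
    for (int i = 0; i < terms.size(); i++) {
      // seekExact repositions the shared enum; no per-lookup allocation.
      result[i] = termsEnum.seekExact(terms.get(i), true) ? termsEnum.docFreq() : 0;
    }
    return result;
  }
}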
@@ -18,6 +18,7 @@
 package org.apache.lucene.queries.function.valuesource;
 
 import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.function.FunctionValues;
 import org.apache.lucene.queries.function.ValueSource;
 import org.apache.lucene.queries.function.docvalues.LongDocValues;
@@ -65,7 +66,7 @@ public class TotalTermFreqValueSource extends ValueSource {
   public void createWeight(Map context, IndexSearcher searcher) throws IOException {
     long totalTermFreq = 0;
     for (AtomicReaderContext readerContext : searcher.getTopReaderContext().leaves()) {
-      long val = readerContext.reader().totalTermFreq(indexedField, indexedBytes);
+      long val = readerContext.reader().totalTermFreq(new Term(indexedField, indexedBytes));
       if (val == -1) {
         totalTermFreq = -1;
         break;
@@ -33,6 +33,7 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.PriorityQueue;
@@ -344,7 +345,7 @@ public class LukeRequestHandler extends RequestHandlerBase
     if(sfield != null && sfield.indexed() ) {
       // In the pre-4.0 days, this did a veeeery expensive range query. But we can be much faster now,
       // so just do this all the time.
-      StoredDocument doc = getFirstLiveDoc(reader, fieldName, terms);
+      StoredDocument doc = getFirstLiveDoc(terms, reader);
 
 
       if( doc != null ) {
@@ -378,7 +379,7 @@ public class LukeRequestHandler extends RequestHandlerBase
   // Just get a document with the term in it, the first one will do!
   // Is there a better way to do this? Shouldn't actually be very costly
   // to do it this way.
-  private static StoredDocument getFirstLiveDoc(AtomicReader reader, String fieldName, Terms terms) throws IOException {
+  private static StoredDocument getFirstLiveDoc(Terms terms, AtomicReader reader) throws IOException {
     DocsEnum docsEnum = null;
     TermsEnum termsEnum = terms.iterator(null);
     BytesRef text;
@@ -388,16 +389,9 @@ public class LukeRequestHandler extends RequestHandlerBase
     if (text == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
       return null;
     }
-    Term term = new Term(fieldName, text);
-    docsEnum = reader.termDocsEnum(reader.getLiveDocs(),
-        term.field(),
-        new BytesRef(term.text()),
-        0);
-    if (docsEnum != null) {
-      int docId;
-      if ((docId = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-        return reader.document(docId);
-      }
+    docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0);
+    if (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+      return reader.document(docsEnum.docID());
     }
     return null;
@@ -18,6 +18,7 @@ import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
@@ -382,12 +383,18 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAware {
       }
     }
 
-    if (fieldOptions.docFreq) {
-      termInfo.add("df", getDocFreq(reader, field, text));
+    int df = 0;
+    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
+      df = reader.docFreq(new Term(field, text));
+    }
+
+    if (fieldOptions.docFreq) {
+      termInfo.add("df", df);
     }
 
     // TODO: this is not TF/IDF by anyone's definition!
     if (fieldOptions.tfIdf) {
-      double tfIdfVal = ((double) freq) / getDocFreq(reader, field, text);
+      double tfIdfVal = ((double) freq) / df;
       termInfo.add("tf-idf", tfIdfVal);
     }
   }
@@ -408,16 +415,6 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAware {
     return result;
   }
 
-  private static int getDocFreq(IndexReader reader, String field, BytesRef term) {
-    int result = 1;
-    try {
-      result = reader.docFreq(field, term);
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-    return result;
-  }
-
   @Override
   public void prepare(ResponseBuilder rb) throws IOException {
@@ -615,20 +615,19 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrInfoMBean {
       final AtomicReaderContext leaf = leafContexts.get(i);
       final AtomicReader reader = leaf.reader();
 
-      final Fields fields = reader.fields();
-      if (fields == null) continue;
+      final Terms terms = reader.terms(field);
+      if (terms == null) continue;
 
-      final Bits liveDocs = reader.getLiveDocs();
-
-      final DocsEnum docs = reader.termDocsEnum(liveDocs, field, idBytes, 0);
-
-      if (docs == null) continue;
+      TermsEnum te = terms.iterator(null);
+      if (te.seekExact(idBytes, true)) {
+        DocsEnum docs = te.docs(reader.getLiveDocs(), null, 0);
         int id = docs.nextDoc();
         if (id == DocIdSetIterator.NO_MORE_DOCS) continue;
         assert docs.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
 
         return (((long)i) << 32) | id;
+      }
     }
 
     return -1;
   }
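The method above packs a leaf ordinal and a segment-local docid into one long; a caller decodes it as below (a sketch that only assumes the (((long)i) << 32) | id encoding shown in the diff):

class PackedIdSketch {
  // Decodes the value produced by (((long) i) << 32) | id above.
  static int segmentOrd(long packed) {
    return (int) (packed >>> 32); // high 32 bits: index into leafContexts
  }

  static int segmentDoc(long packed) {
    return (int) packed; // low 32 bits: docid within that leaf
  }
}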