LUCENE-4355: improve AtomicReader sugar apis

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1384274 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir, 2012-09-13 11:21:03 +00:00
parent bc425d77bc
commit dc02cdd384
28 changed files with 169 additions and 152 deletions

lucene/CHANGES.txt

@@ -103,6 +103,12 @@ API Changes
   Use DataOutput.copyBytes(DataInput, long) instead.
   (Mike McCandless, Robert Muir)
 
+* LUCENE-4355: Simplify AtomicReader's sugar methods such as termDocsEnum,
+  termPositionsEnum, docFreq, and totalTermFreq to only take Term as a
+  parameter. If you want to do expert things such as pass a different
+  Bits as liveDocs, then use the flex APIs (fields(), terms(), etc.) directly.
+  (Mike McCandless, Robert Muir)
+
 Bug Fixes
 
 * LUCENE-4297: BooleanScorer2 would multiply the coord() factor
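
A migration sketch for callers (not part of the patch; reader, field, and text are placeholder names): the sugar methods now take a Term, and the enum flavors implicitly use the reader's own live docs.

import java.io.IOException;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term;

class SugarApiMigrationSketch {
  // Before: reader.docFreq(field, new BytesRef(text))
  static int docFreq(AtomicReader reader, String field, String text) throws IOException {
    return reader.docFreq(new Term(field, text));
  }

  // Before: reader.termDocsEnum(liveDocs, field, bytes, flags)
  // After: no liveDocs or flags arguments; the reader's own getLiveDocs() is applied,
  // and null is returned if the field or term does not exist.
  static DocsEnum postings(AtomicReader reader, String field, String text) throws IOException {
    return reader.termDocsEnum(new Term(field, text));
  }
}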

SimpleNaiveBayesClassifier.java

@@ -121,7 +121,7 @@ public class SimpleNaiveBayesClassifier implements Classifier {
     Terms terms = MultiFields.getTerms(atomicReader, textFieldName);
     long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
     double avgNumberOfUniqueTerms = numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
-    int docsWithC = atomicReader.docFreq(classFieldName, new BytesRef(c));
+    int docsWithC = atomicReader.docFreq(new Term(classFieldName, c));
     return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text field per doc * # docs with c
   }

AtomicReader.java

@@ -21,7 +21,6 @@ import java.io.IOException;
 import org.apache.lucene.search.SearcherManager; // javadocs
 import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
 
 /** {@code AtomicReader} is an abstract class, providing an interface for accessing an
  index. Search of an index is done entirely through this abstract interface,
@@ -67,17 +66,17 @@ public abstract class AtomicReader extends IndexReader {
   public abstract Fields fields() throws IOException;
 
   @Override
-  public final int docFreq(String field, BytesRef term) throws IOException {
+  public final int docFreq(Term term) throws IOException {
     final Fields fields = fields();
     if (fields == null) {
       return 0;
     }
-    final Terms terms = fields.terms(field);
+    final Terms terms = fields.terms(term.field());
     if (terms == null) {
       return 0;
     }
     final TermsEnum termsEnum = terms.iterator(null);
-    if (termsEnum.seekExact(term, true)) {
+    if (termsEnum.seekExact(term.bytes(), true)) {
       return termsEnum.docFreq();
     } else {
       return 0;
@@ -89,17 +88,17 @@ public abstract class AtomicReader extends IndexReader {
    *  field does not exist. This method does not take into
    *  account deleted documents that have not yet been merged
    *  away. */
-  public final long totalTermFreq(String field, BytesRef term) throws IOException {
+  public final long totalTermFreq(Term term) throws IOException {
     final Fields fields = fields();
     if (fields == null) {
       return 0;
     }
-    final Terms terms = fields.terms(field);
+    final Terms terms = fields.terms(term.field());
     if (terms == null) {
       return 0;
     }
     final TermsEnum termsEnum = terms.iterator(null);
-    if (termsEnum.seekExact(term, true)) {
+    if (termsEnum.seekExact(term.bytes(), true)) {
       return termsEnum.totalTermFreq();
     } else {
       return 0;
@@ -115,61 +114,40 @@ public abstract class AtomicReader extends IndexReader {
     return fields.terms(field);
   }
 
-  /** Returns {@link DocsEnum} for the specified field &
-   *  term. This will return null if either the field or
-   *  term does not exist. */
-  public final DocsEnum termDocsEnum(Bits liveDocs, String field, BytesRef term) throws IOException {
-    return termDocsEnum(liveDocs, field, term, DocsEnum.FLAG_FREQS);
-  }
-
-  /** Returns {@link DocsEnum} for the specified field &
-   *  term, with control over whether freqs are required.
-   *  Some codecs may be able to optimize their
-   *  implementation when freqs are not required. This will
-   *  return null if the field or term does not
-   *  exist. See {@link TermsEnum#docs(Bits,DocsEnum,int)}. */
-  public final DocsEnum termDocsEnum(Bits liveDocs, String field, BytesRef term, int flags) throws IOException {
-    assert field != null;
-    assert term != null;
+  /** Returns {@link DocsEnum} for the specified term.
+   *  This will return null if either the field or
+   *  term does not exist.
+   *  @see TermsEnum#docs(Bits, DocsEnum) */
+  public final DocsEnum termDocsEnum(Term term) throws IOException {
+    assert term.field() != null;
+    assert term.bytes() != null;
     final Fields fields = fields();
     if (fields != null) {
-      final Terms terms = fields.terms(field);
+      final Terms terms = fields.terms(term.field());
       if (terms != null) {
         final TermsEnum termsEnum = terms.iterator(null);
-        if (termsEnum.seekExact(term, true)) {
-          return termsEnum.docs(liveDocs, null, flags);
+        if (termsEnum.seekExact(term.bytes(), true)) {
+          return termsEnum.docs(getLiveDocs(), null);
         }
       }
     }
     return null;
   }
 
   /** Returns {@link DocsAndPositionsEnum} for the specified
-   *  field & term. This will return null if the
+   *  term. This will return null if the
    *  field or term does not exist or positions weren't indexed.
-   *  @see #termPositionsEnum(Bits, String, BytesRef, int) */
-  public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term) throws IOException {
-    return termPositionsEnum(liveDocs, field, term, DocsAndPositionsEnum.FLAG_OFFSETS | DocsAndPositionsEnum.FLAG_PAYLOADS);
-  }
-
-  /** Returns {@link DocsAndPositionsEnum} for the specified
-   *  field & term, with control over whether offsets and payloads are
-   *  required. Some codecs may be able to optimize their
-   *  implementation when offsets and/or payloads are not required.
-   *  This will return null if the field or term
-   *  does not exist or positions weren't indexed. See
-   *  {@link TermsEnum#docsAndPositions(Bits,DocsAndPositionsEnum,int)}. */
-  public final DocsAndPositionsEnum termPositionsEnum(Bits liveDocs, String field, BytesRef term, int flags) throws IOException {
-    assert field != null;
-    assert term != null;
+   *  @see TermsEnum#docsAndPositions(Bits, DocsAndPositionsEnum) */
+  public final DocsAndPositionsEnum termPositionsEnum(Term term) throws IOException {
+    assert term.field() != null;
+    assert term.bytes() != null;
     final Fields fields = fields();
     if (fields != null) {
-      final Terms terms = fields.terms(field);
+      final Terms terms = fields.terms(term.field());
       if (terms != null) {
         final TermsEnum termsEnum = terms.iterator(null);
-        if (termsEnum.seekExact(term, true)) {
-          return termsEnum.docsAndPositions(liveDocs, null, flags);
+        if (termsEnum.seekExact(term.bytes(), true)) {
+          return termsEnum.docsAndPositions(getLiveDocs(), null);
         }
       }
     }
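
Callers that need the removed expert knobs (a custom Bits for liveDocs, or DocsEnum flags) now go through the flex APIs directly, as the test changes below in this commit do. A minimal sketch of that path, assuming a caller-supplied liveDocs and flags:

import java.io.IOException;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

class FlexApiSketch {
  // Roughly what the removed termDocsEnum(Bits, String, BytesRef, int) overload did,
  // minus the null-field/null-term asserts.
  static DocsEnum termDocsEnum(AtomicReader reader, Bits liveDocs, String field,
                               BytesRef term, int flags) throws IOException {
    Terms terms = reader.terms(field);   // sugar for fields().terms(field)
    if (terms == null) {
      return null;                       // field does not exist
    }
    TermsEnum termsEnum = terms.iterator(null);
    if (termsEnum.seekExact(term, true)) {
      return termsEnum.docs(liveDocs, null, flags);
    }
    return null;                         // term does not exist
  }
}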

BaseCompositeReader.java

@@ -22,8 +22,6 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 
-import org.apache.lucene.util.BytesRef;
-
 /** Base class for implementing {@link CompositeReader}s based on an array
  * of sub-readers. The implementing class has to add code for
  * correctly refcounting and closing the sub-readers.

@@ -125,11 +123,25 @@ public abstract class BaseCompositeReader<R extends IndexReader> extends CompositeReader {
   }
 
   @Override
-  public final int docFreq(String field, BytesRef t) throws IOException {
+  public final int docFreq(Term term) throws IOException {
     ensureOpen();
     int total = 0; // sum freqs in subreaders
     for (int i = 0; i < subReaders.length; i++) {
-      total += subReaders[i].docFreq(field, t);
+      total += subReaders[i].docFreq(term);
     }
     return total;
   }
+
+  @Override
+  public final long totalTermFreq(Term term) throws IOException {
+    ensureOpen();
+    long total = 0; // sum freqs in subreaders
+    for (int i = 0; i < subReaders.length; i++) {
+      long sub = subReaders[i].totalTermFreq(term);
+      if (sub == -1) {
+        return -1;
+      }
+      total += sub;
+    }
+    return total;
+  }
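
Unlike docFreq, a composite total is only meaningful when every sub-reader's codec supports the statistic, so a single -1 from any sub-reader poisons the whole sum. A usage sketch (reader and term are placeholders, not code from this commit):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

class TotalTermFreqSketch {
  static long totalOrZero(IndexReader reader, Term term) throws IOException {
    long ttf = reader.totalTermFreq(term);
    // -1 means at least one segment's codec does not support the measure.
    return ttf == -1 ? 0 : ttf;
  }
}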

IndexReader.java

@@ -31,7 +31,6 @@ import org.apache.lucene.document.DocumentStoredFieldVisitor;
 import org.apache.lucene.search.SearcherManager; // javadocs
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
 
 /** IndexReader is an abstract class, providing an interface for accessing an
  index. Search of an index is done entirely through this abstract interface,

@@ -432,15 +431,17 @@ public abstract class IndexReader implements Closeable {
    * <code>term</code>. This method returns 0 if the term or
    * field does not exist. This method does not take into
    * account deleted documents that have not yet been merged
-   * away. */
-  public final int docFreq(Term term) throws IOException {
-    return docFreq(term.field(), term.bytes());
-  }
-
-  /** Returns the number of documents containing the
+   * away.
+   * @see TermsEnum#docFreq()
+   */
+  public abstract int docFreq(Term term) throws IOException;
+
+  /** Returns the total number of occurrences of
    * <code>term</code>. This method returns 0 if the term or
-   * field does not exist. This method does not take into
-   * account deleted documents that have not yet been merged
-   * away. */
-  public abstract int docFreq(String field, BytesRef term) throws IOException;
+   * field does not exist, or -1 if the Codec does not support
+   * the measure. This method does not take into account deleted
+   * documents that have not yet been merged away.
+   * @see TermsEnum#totalTermFreq()
+   */
+  public abstract long totalTermFreq(Term term) throws IOException;
 }

PhraseQuery.java

@@ -260,7 +260,7 @@ public class PhraseQuery extends Query {
         final Term t = terms.get(i);
         final TermState state = states[i].get(context.ord);
         if (state == null) { /* term doesnt exist in this segment */
-          assert termNotInReader(reader, field, t.bytes()): "no termstate found but term exists in reader";
+          assert termNotInReader(reader, t): "no termstate found but term exists in reader";
           return null;
         }
         te.seekExact(t.bytes(), state);

@@ -295,8 +295,8 @@ public class PhraseQuery extends Query {
   }
 
   // only called from assert
-  private boolean termNotInReader(AtomicReader reader, String field, BytesRef bytes) throws IOException {
-    return reader.docFreq(field, bytes) == 0;
+  private boolean termNotInReader(AtomicReader reader, Term term) throws IOException {
+    return reader.docFreq(term) == 0;
   }
 
   @Override

TermQuery.java

@@ -95,7 +95,7 @@ public class TermQuery extends Query {
     private TermsEnum getTermsEnum(AtomicReaderContext context) throws IOException {
       final TermState state = termStates.get(context.ord);
       if (state == null) { // term is not present in that reader
-        assert termNotInReader(context.reader(), term.field(), term.bytes()) : "no termstate found but term exists in reader term=" + term;
+        assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
         return null;
       }
       //System.out.println("LD=" + reader.getLiveDocs() + " set?=" + (reader.getLiveDocs() != null ? reader.getLiveDocs().get(0) : "null"));

@@ -104,10 +104,10 @@ public class TermQuery extends Query {
       return termsEnum;
     }
 
-    private boolean termNotInReader(AtomicReader reader, String field, BytesRef bytes) throws IOException {
+    private boolean termNotInReader(AtomicReader reader, Term term) throws IOException {
       // only called from assert
       //System.out.println("TQ.termNotInReader reader=" + reader + " term=" + field + ":" + bytes.utf8ToString());
-      return reader.docFreq(field, bytes) == 0;
+      return reader.docFreq(term) == 0;
     }
 
     @Override

TermStatistics.java

@@ -17,7 +17,7 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
-import org.apache.lucene.index.AtomicReader; // javadocs
+import org.apache.lucene.index.TermsEnum; // javadocs
 import org.apache.lucene.util.BytesRef;
 
 /**
  * Contains statistics for a specific term

@@ -42,13 +42,13 @@ public class TermStatistics {
   }
 
   /** returns the number of documents this term occurs in
-   * @see AtomicReader#docFreq(String, BytesRef) */
+   * @see TermsEnum#docFreq() */
   public final long docFreq() {
     return docFreq;
   }
 
   /** returns the total number of occurrences of this term
-   * @see AtomicReader#totalTermFreq(String, BytesRef) */
+   * @see TermsEnum#totalTermFreq() */
   public final long totalTermFreq() {
     return totalTermFreq;
   }

TestReuseDocsEnum.java

@@ -156,7 +156,15 @@ public class TestReuseDocsEnum extends LuceneTestCase {
       return null;
     }
     AtomicReader indexReader = readers.get(random().nextInt(readers.size())).reader();
-    return indexReader.termDocsEnum(bits, field, term, random().nextBoolean() ? DocsEnum.FLAG_FREQS : 0);
+    Terms terms = indexReader.terms(field);
+    if (terms == null) {
+      return null;
+    }
+    TermsEnum iterator = terms.iterator(null);
+    if (iterator.seekExact(term, true)) {
+      return iterator.docs(bits, null, random().nextBoolean() ? DocsEnum.FLAG_FREQS : 0);
+    }
+    return null;
   }
 
   /**

TestDocValuesIndexing.java

@@ -881,7 +881,7 @@ public class TestDocValuesIndexing extends LuceneTestCase {
   public int docId(AtomicReader reader, Term term) throws IOException {
     int docFreq = reader.docFreq(term);
     assertEquals(1, docFreq);
-    DocsEnum termDocsEnum = reader.termDocsEnum(null, term.field, term.bytes, 0);
+    DocsEnum termDocsEnum = reader.termDocsEnum(term);
     int nextDoc = termDocsEnum.nextDoc();
     assertEquals(DocIdSetIterator.NO_MORE_DOCS, termDocsEnum.nextDoc());
     return nextDoc;

TestDocsAndPositions.java

@@ -92,7 +92,14 @@ public class TestDocsAndPositions extends LuceneTestCase {
   public DocsAndPositionsEnum getDocsAndPositions(AtomicReader reader,
       BytesRef bytes, Bits liveDocs) throws IOException {
-    return reader.termPositionsEnum(null, fieldName, bytes);
+    Terms terms = reader.terms(fieldName);
+    if (terms != null) {
+      TermsEnum te = terms.iterator(null);
+      if (te.seekExact(bytes, true)) {
+        return te.docsAndPositions(liveDocs, null);
+      }
+    }
+    return null;
   }
 
   /**

@@ -352,7 +359,7 @@ public class TestDocsAndPositions extends LuceneTestCase {
     writer.addDocument(doc);
     DirectoryReader reader = writer.getReader();
     AtomicReader r = getOnlySegmentReader(reader);
-    DocsAndPositionsEnum disi = r.termPositionsEnum(null, "foo", new BytesRef("bar"));
+    DocsAndPositionsEnum disi = r.termPositionsEnum(new Term("foo", "bar"));
     int docid = disi.docID();
     assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

TestDocumentWriter.java

@@ -243,18 +243,18 @@ public class TestDocumentWriter extends LuceneTestCase {
     writer.close();
     SegmentReader reader = new SegmentReader(info, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random()));
 
-    DocsAndPositionsEnum termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term1"));
+    DocsAndPositionsEnum termPositions = reader.termPositionsEnum(new Term("preanalyzed", "term1"));
     assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
     assertEquals(1, termPositions.freq());
     assertEquals(0, termPositions.nextPosition());
 
-    termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term2"));
+    termPositions = reader.termPositionsEnum(new Term("preanalyzed", "term2"));
     assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
     assertEquals(2, termPositions.freq());
     assertEquals(1, termPositions.nextPosition());
     assertEquals(3, termPositions.nextPosition());
 
-    termPositions = reader.termPositionsEnum(reader.getLiveDocs(), "preanalyzed", new BytesRef("term3"));
+    termPositions = reader.termPositionsEnum(new Term("preanalyzed", "term3"));
     assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
     assertEquals(1, termPositions.freq());
     assertEquals(2, termPositions.nextPosition());

TestMultiLevelSkipList.java

@@ -84,10 +84,7 @@ public class TestMultiLevelSkipList extends LuceneTestCase {
     for (int i = 0; i < 2; i++) {
       counter = 0;
-      DocsAndPositionsEnum tp = reader.termPositionsEnum(reader.getLiveDocs(),
-                                                         term.field(),
-                                                         new BytesRef(term.text()));
+      DocsAndPositionsEnum tp = reader.termPositionsEnum(term);
       checkSkipTo(tp, 14, 185); // no skips
       checkSkipTo(tp, 17, 190); // one skip on level 0
       checkSkipTo(tp, 287, 200); // one skip on level 1, two on level 0

TestPayloads.java

@@ -605,7 +605,7 @@ public class TestPayloads extends LuceneTestCase {
     writer.addDocument(doc);
     DirectoryReader reader = writer.getReader();
     AtomicReader sr = SlowCompositeReaderWrapper.wrap(reader);
-    DocsAndPositionsEnum de = sr.termPositionsEnum(null, "field", new BytesRef("withPayload"));
+    DocsAndPositionsEnum de = sr.termPositionsEnum(new Term("field", "withPayload"));
     de.nextDoc();
     de.nextPosition();
     assertEquals(new BytesRef("test"), de.getPayload());

@@ -639,7 +639,7 @@ public class TestPayloads extends LuceneTestCase {
     writer.addDocument(doc);
     DirectoryReader reader = writer.getReader();
     SegmentReader sr = getOnlySegmentReader(reader);
-    DocsAndPositionsEnum de = sr.termPositionsEnum(null, "field", new BytesRef("withPayload"));
+    DocsAndPositionsEnum de = sr.termPositionsEnum(new Term("field", "withPayload"));
     de.nextDoc();
     de.nextPosition();
     assertEquals(new BytesRef("test"), de.getPayload());

TestPositionIncrement.java

@@ -212,9 +212,7 @@ public class TestPositionIncrement extends LuceneTestCase {
     final IndexReader readerFromWriter = writer.getReader();
     AtomicReader r = SlowCompositeReaderWrapper.wrap(readerFromWriter);
-    DocsAndPositionsEnum tp = r.termPositionsEnum(r.getLiveDocs(),
-                                                  "content",
-                                                  new BytesRef("a"));
+    DocsAndPositionsEnum tp = r.termPositionsEnum(new Term("content", "a"));
     int count = 0;
     assertTrue(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

TestNRTCachingDirectory.java

@@ -95,7 +95,7 @@ public class TestNRTCachingDirectory extends LuceneTestCase {
     r = DirectoryReader.open(dir);
     for(BytesRef id : ids) {
-      assertEquals(1, r.docFreq("docid", id));
+      assertEquals(1, r.docFreq(new Term("docid", id)));
     }
     r.close();
     cachedDir.close();

DirectoryTaxonomyWriter.java

@@ -411,10 +411,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     try {
       final BytesRef catTerm = new BytesRef(categoryPath.toString(delimiter));
       for (AtomicReaderContext ctx : reader.leaves()) {
-        DocsEnum docs = ctx.reader().termDocsEnum(null, Consts.FULL, catTerm, 0);
-        if (docs != null) {
-          doc = docs.nextDoc() + ctx.docBase;
-          break;
+        Terms terms = ctx.reader().terms(Consts.FULL);
+        if (terms != null) {
+          TermsEnum termsEnum = terms.iterator(null);
+          if (termsEnum.seekExact(catTerm, true)) {
+            // TODO: is it really ok that null is passed here as liveDocs?
+            DocsEnum docs = termsEnum.docs(null, null, 0);
+            doc = docs.nextDoc() + ctx.docBase;
+          }
         }
       }
     } finally {

@@ -452,10 +456,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     try {
       final BytesRef catTerm = new BytesRef(categoryPath.toString(delimiter, prefixLen));
       for (AtomicReaderContext ctx : reader.leaves()) {
-        DocsEnum docs = ctx.reader().termDocsEnum(null, Consts.FULL, catTerm, 0);
-        if (docs != null) {
-          doc = docs.nextDoc() + ctx.docBase;
-          break;
+        Terms terms = ctx.reader().terms(Consts.FULL);
+        if (terms != null) {
+          TermsEnum termsEnum = terms.iterator(null);
+          if (termsEnum.seekExact(catTerm, true)) {
+            // TODO: is it really ok that null is passed here as liveDocs?
+            DocsEnum docs = termsEnum.docs(null, null, 0);
+            doc = docs.nextDoc() + ctx.docBase;
+          }
         }
       }
     } finally {

TestGrouping.java

@@ -864,7 +864,7 @@ public class TestGrouping extends LuceneTestCase {
         final boolean doCache = random().nextBoolean();
         final boolean doAllGroups = random().nextBoolean();
         if (VERBOSE) {
-          System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " dF=" + r.docFreq("content", new BytesRef(searchTerm)) +" dFBlock=" + rBlocks.docFreq("content", new BytesRef(searchTerm)) + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups + " getScores=" + getScores + " getMaxScores=" + getMaxScores);
+          System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " dF=" + r.docFreq(new Term("content", searchTerm)) +" dFBlock=" + rBlocks.docFreq(new Term("content", searchTerm)) + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups + " getScores=" + getScores + " getMaxScores=" + getMaxScores);
         }
         final AbstractFirstPassGroupingCollector<?> c1 = createRandomFirstPassCollector("group", groupSort, groupOffset+topNGroups, canUseIDV);

FieldTermStack.java

@@ -24,6 +24,7 @@ import java.util.Set;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;

@@ -112,7 +113,7 @@ public class FieldTermStack {
       dpEnum.nextDoc();
 
       // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
-      final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
+      final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( new Term(fieldName, text) ) + 1 ) ) + 1.0 );
 
       final int freq = dpEnum.freq();

MemoryIndexTest.java

@@ -225,7 +225,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
     MemoryIndex memory = new MemoryIndex(true);
     memory.addField("foo", "bar", analyzer);
     AtomicReader reader = (AtomicReader) memory.createSearcher().getIndexReader();
-    DocsAndPositionsEnum disi = reader.termPositionsEnum(null, "foo", new BytesRef("bar"));
+    DocsAndPositionsEnum disi = reader.termPositionsEnum(new Term("foo", "bar"));
     int docid = disi.docID();
     assertTrue(docid == -1 || docid == DocIdSetIterator.NO_MORE_DOCS);
     assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

GetTermInfo.java

@@ -23,6 +23,7 @@ import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
 
 /**
  * Utility to get document frequency and total number of occurrences (sum of the tf for each doc) of a term.

@@ -44,14 +45,14 @@ public class GetTermInfo {
       System.exit(1);
     }
 
-    getTermInfo(dir, field, new BytesRef(inputStr));
+    getTermInfo(dir, new Term(field, inputStr));
   }
 
-  public static void getTermInfo(Directory dir, String field, BytesRef termtext) throws Exception {
+  public static void getTermInfo(Directory dir, Term term) throws Exception {
     IndexReader reader = DirectoryReader.open(dir);
-    long totalTF = HighFreqTerms.getTotalTermFreq(reader, field, termtext);
+    long totalTF = HighFreqTerms.getTotalTermFreq(reader, term);
     System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
-        field, termtext.utf8ToString(), totalTF, reader.docFreq(field, termtext));
+        term.field(), term.text(), totalTF, reader.docFreq(term));
   }
 
   private static void usage() {

HighFreqTerms.java

@@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.DocsEnum;

@@ -167,7 +168,7 @@ public class HighFreqTerms {
     TermStats[] ts = new TermStats[terms.length]; // array for sorting
     long totalTF;
     for (int i = 0; i < terms.length; i++) {
-      totalTF = getTotalTermFreq(reader, terms[i].field, terms[i].termtext);
+      totalTF = getTotalTermFreq(reader, new Term(terms[i].field, terms[i].termtext));
       ts[i] = new TermStats(terms[i].field, terms[i].termtext, terms[i].docFreq, totalTF);
     }

@@ -177,24 +178,23 @@ public class HighFreqTerms {
     return ts;
   }
 
-  public static long getTotalTermFreq(IndexReader reader, final String field, final BytesRef termText) throws Exception {
+  public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception {
     long totalTF = 0L;
     for (final AtomicReaderContext ctx : reader.leaves()) {
       AtomicReader r = ctx.reader();
-      Bits liveDocs = r.getLiveDocs();
-      if (liveDocs == null) {
+      if (!r.hasDeletions()) {
         // TODO: we could do this up front, during the scan
         // (next()), instead of after-the-fact here w/ seek,
         // if the codec supports it and there are no del
         // docs...
-        final long totTF = r.totalTermFreq(field, termText);
+        final long totTF = r.totalTermFreq(term);
         if (totTF != -1) {
           totalTF += totTF;
           continue;
         } // otherwise we fall-through
       }
       // note: what should we do if field omits freqs? currently it counts as 1...
-      DocsEnum de = r.termDocsEnum(liveDocs, field, termText);
+      DocsEnum de = r.termDocsEnum(term);
       if (de != null) {
         while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)
           totalTF += de.freq();
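
The rewritten getTotalTermFreq prefers the cheap codec statistic when a leaf has no deletions, and otherwise walks the postings and sums freq() so deleted documents are excluded. A self-contained sketch of the same fallback (placeholder names, not part of the patch):

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;

class ExactTotalTermFreqSketch {
  static long totalTermFreq(IndexReader reader, Term term) throws Exception {
    long total = 0;
    for (AtomicReaderContext ctx : reader.leaves()) {
      AtomicReader r = ctx.reader();
      if (!r.hasDeletions()) {
        long ttf = r.totalTermFreq(term); // cheap codec statistic
        if (ttf != -1) {
          total += ttf;
          continue; // fall through only if the codec cannot answer
        }
      }
      DocsEnum de = r.termDocsEnum(term); // respects the leaf's own liveDocs
      if (de != null) {
        while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          total += de.freq();
        }
      }
    }
    return total;
  }
}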

TestHighFreqTerms.java

@@ -26,6 +26,7 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;

@@ -182,7 +183,7 @@ public class TestHighFreqTerms extends LuceneTestCase {
     String term ="highTF";
     BytesRef termtext = new BytesRef (term);
     String field = "FIELD_1";
-    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, field, termtext);
+    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext));
     assertEquals("highTf tf should be 200",200,totalTermFreq);
   }

@@ -191,7 +192,7 @@ public class TestHighFreqTerms extends LuceneTestCase {
     String term ="foobar";
     BytesRef termtext = new BytesRef (term);
     String field = "FIELD_1";
-    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, field, termtext);
+    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext));
     assertEquals("totalTermFreq should be 0 for term not in index",0,totalTermFreq);
   }

JoinDocFreqValueSource.java

@@ -22,7 +22,10 @@ import java.util.Map;
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.queries.function.FunctionValues;
 import org.apache.lucene.queries.function.docvalues.IntDocValues;
 import org.apache.lucene.search.FieldCache.DocTerms;

@@ -55,6 +58,8 @@ public class JoinDocFreqValueSource extends FieldCacheSource {
   {
     final DocTerms terms = cache.getTerms(readerContext.reader(), field, PackedInts.FAST);
     final IndexReader top = ReaderUtil.getTopLevelContext(readerContext).reader();
+    Terms t = MultiFields.getTerms(top, qfield);
+    final TermsEnum termsEnum = t == null ? TermsEnum.EMPTY : t.iterator(null);
 
     return new IntDocValues(this) {
       final BytesRef ref = new BytesRef();

@@ -64,8 +69,11 @@ public class JoinDocFreqValueSource extends FieldCacheSource {
     {
       try {
         terms.getTerm(doc, ref);
-        //System.out.println( NAME+"["+field+"="+ref.utf8ToString()+"=("+qfield+":"+v+")]" );
-        return top.docFreq( qfield, ref );
+        if (termsEnum.seekExact(ref, true)) {
+          return termsEnum.docFreq();
+        } else {
+          return 0;
+        }
       }
       catch (IOException e) {
         throw new RuntimeException("caught exception in function "+description()+" : doc="+doc, e);

TotalTermFreqValueSource.java

@@ -18,6 +18,7 @@
 package org.apache.lucene.queries.function.valuesource;
 
 import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.function.FunctionValues;
 import org.apache.lucene.queries.function.ValueSource;
 import org.apache.lucene.queries.function.docvalues.LongDocValues;

@@ -65,7 +66,7 @@ public class TotalTermFreqValueSource extends ValueSource {
   public void createWeight(Map context, IndexSearcher searcher) throws IOException {
     long totalTermFreq = 0;
     for (AtomicReaderContext readerContext : searcher.getTopReaderContext().leaves()) {
-      long val = readerContext.reader().totalTermFreq(indexedField, indexedBytes);
+      long val = readerContext.reader().totalTermFreq(new Term(indexedField, indexedBytes));
       if (val == -1) {
         totalTermFreq = -1;
         break;

LukeRequestHandler.java

@@ -33,6 +33,7 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.PriorityQueue;

@@ -344,7 +345,7 @@ public class LukeRequestHandler extends RequestHandlerBase
       if(sfield != null && sfield.indexed() ) {
         // In the pre-4.0 days, this did a veeeery expensive range query. But we can be much faster now,
         // so just do this all the time.
-        StoredDocument doc = getFirstLiveDoc(reader, fieldName, terms);
+        StoredDocument doc = getFirstLiveDoc(terms, reader);
         if( doc != null ) {

@@ -378,7 +379,7 @@ public class LukeRequestHandler extends RequestHandlerBase
   // Just get a document with the term in it, the first one will do!
   // Is there a better way to do this? Shouldn't actually be very costly
   // to do it this way.
-  private static StoredDocument getFirstLiveDoc(AtomicReader reader, String fieldName, Terms terms) throws IOException {
+  private static StoredDocument getFirstLiveDoc(Terms terms, AtomicReader reader) throws IOException {
     DocsEnum docsEnum = null;
     TermsEnum termsEnum = terms.iterator(null);
     BytesRef text;

@@ -388,16 +389,9 @@ public class LukeRequestHandler extends RequestHandlerBase
       if (text == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
         return null;
       }
-      Term term = new Term(fieldName, text);
-      docsEnum = reader.termDocsEnum(reader.getLiveDocs(),
-                                     term.field(),
-                                     new BytesRef(term.text()),
-                                     0);
-      if (docsEnum != null) {
-        int docId;
-        if ((docId = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-          return reader.document(docId);
-        }
+      docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0);
+      if (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+        return reader.document(docsEnum.docID());
       }
     }
     return null;

TermVectorComponent.java

@@ -18,6 +18,7 @@ import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;

@@ -381,13 +382,19 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAware
           }
         }
       }
-      if (fieldOptions.docFreq) {
-        termInfo.add("df", getDocFreq(reader, field, text));
+      int df = 0;
+      if (fieldOptions.docFreq || fieldOptions.tfIdf) {
+        df = reader.docFreq(new Term(field, text));
+      }
+
+      if (fieldOptions.docFreq) {
+        termInfo.add("df", df);
       }
 
+      // TODO: this is not TF/IDF by anyone's definition!
       if (fieldOptions.tfIdf) {
-        double tfIdfVal = ((double) freq) / getDocFreq(reader, field, text);
+        double tfIdfVal = ((double) freq) / df;
         termInfo.add("tf-idf", tfIdfVal);
       }
     }

@@ -408,16 +415,6 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAware
     return result;
   }
 
-  private static int getDocFreq(IndexReader reader, String field, BytesRef term) {
-    int result = 1;
-    try {
-      result = reader.docFreq(field, term);
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-    return result;
-  }
-
   @Override
   public void prepare(ResponseBuilder rb) throws IOException {

SolrIndexSearcher.java

@@ -615,19 +615,18 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrInfoMBean
       final AtomicReaderContext leaf = leafContexts.get(i);
       final AtomicReader reader = leaf.reader();
 
-      final Fields fields = reader.fields();
-      if (fields == null) continue;
-      final Bits liveDocs = reader.getLiveDocs();
-      final DocsEnum docs = reader.termDocsEnum(liveDocs, field, idBytes, 0);
-      if (docs == null) continue;
-      int id = docs.nextDoc();
-      if (id == DocIdSetIterator.NO_MORE_DOCS) continue;
-      assert docs.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
-      return (((long)i) << 32) | id;
+      final Terms terms = reader.terms(field);
+      if (terms == null) continue;
+
+      TermsEnum te = terms.iterator(null);
+      if (te.seekExact(idBytes, true)) {
+        DocsEnum docs = te.docs(reader.getLiveDocs(), null, 0);
+        int id = docs.nextDoc();
+        if (id == DocIdSetIterator.NO_MORE_DOCS) continue;
+        assert docs.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
+        return (((long)i) << 32) | id;
+      }
     }
     return -1;
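
The return value packs the leaf ordinal and the leaf-local docid into a single long, with -1 meaning not found. A decoding sketch (names are placeholders, not code from this commit):

class LookupIdSketch {
  static int leafOrd(long lookupResult) {
    return (int) (lookupResult >> 32); // which leaf (segment) matched
  }
  static int leafDocId(long lookupResult) {
    return (int) lookupResult;         // docid within that leaf, low 32 bits
  }
}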