LUCENE-4859: Expose more stats on IndexReader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458907 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shai Erera 2013-03-20 16:05:45 +00:00
parent 9946aeffb0
commit dd3a8dff69
7 changed files with 170 additions and 26 deletions

View File

@ -113,6 +113,9 @@ New Features
* LUCENE-4856: If there are no matches for a given field, return the
first maxPassages sentences (Robert Muir, Mike McCandless)
* LUCENE-4859: IndexReader now exposes Terms statistics: getDocCount,
getSumDocFreq, getSumTotalTermFreq. (Shai Erera)
API Changes
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use

View File

@ -108,6 +108,33 @@ public abstract class AtomicReader extends IndexReader {
}
}
@Override
public final long getSumDocFreq(String field) throws IOException {
final Terms terms = terms(field);
if (terms == null) {
return 0;
}
return terms.getSumDocFreq();
}
@Override
public final int getDocCount(String field) throws IOException {
final Terms terms = terms(field);
if (terms == null) {
return 0;
}
return terms.getDocCount();
}
@Override
public final long getSumTotalTermFreq(String field) throws IOException {
final Terms terms = terms(field);
if (terms == null) {
return 0;
}
return terms.getSumTotalTermFreq();
}
/** This may return null if the field does not exist.*/
public final Terms terms(String field) throws IOException {
final Fields fields = fields();

View File

@ -146,6 +146,48 @@ public abstract class BaseCompositeReader<R extends IndexReader> extends Composi
return total;
}
@Override
public final long getSumDocFreq(String field) throws IOException {
ensureOpen();
long total = 0; // sum doc freqs in subreaders
for (R reader : subReaders) {
long sub = reader.getSumDocFreq(field);
if (sub == -1) {
return -1; // if any of the subs doesn't support it, return -1
}
total += sub;
}
return total;
}
@Override
public final int getDocCount(String field) throws IOException {
ensureOpen();
int total = 0; // sum doc counts in subreaders
for (R reader : subReaders) {
int sub = reader.getDocCount(field);
if (sub == -1) {
return -1; // if any of the subs doesn't support it, return -1
}
total += sub;
}
return total;
}
@Override
public final long getSumTotalTermFreq(String field) throws IOException {
ensureOpen();
long total = 0; // sum doc total term freqs in subreaders
for (R reader : subReaders) {
long sub = reader.getSumTotalTermFreq(field);
if (sub == -1) {
return -1; // if any of the subs doesn't support it, return -1
}
total += sub;
}
return total;
}
/** Helper method for subclasses to get the corresponding reader for a doc ID */
protected final int readerIndex(int docID) {
if (docID < 0 || docID >= maxDoc) {

View File

@ -446,4 +446,33 @@ public abstract class IndexReader implements Closeable {
* @see TermsEnum#totalTermFreq()
*/
public abstract long totalTermFreq(Term term) throws IOException;
/**
* Returns the sum of {@link TermsEnum#docFreq()} for all terms in this field,
* or -1 if this measure isn't stored by the codec. Note that, just like other
* term measures, this measure does not take deleted documents into account.
*
* @see Terms#getSumDocFreq()
*/
public abstract long getSumDocFreq(String field) throws IOException;
/**
* Returns the number of documents that have at least one term for this field,
* or -1 if this measure isn't stored by the codec. Note that, just like other
* term measures, this measure does not take deleted documents into account.
*
* @see Terms#getDocCount()
*/
public abstract int getDocCount(String field) throws IOException;
/**
* Returns the sum of {@link TermsEnum#totalTermFreq} for all terms in this
* field, or -1 if this measure isn't stored by the codec (or if this fields
* omits term freq and positions). Note that, just like other term measures,
* this measure does not take deleted documents into account.
*
* @see Terms#getSumTotalTermFreq()
*/
public abstract long getSumTotalTermFreq(String field) throws IOException;
}

View File

@ -243,25 +243,6 @@ public final class MultiFields extends Fields {
return -1;
}
/** Returns the total number of occurrences of this term
* across all documents (the sum of the freq() for each
* doc that has this term). This will be -1 if the
* codec doesn't support this measure. Note that, like
* other term measures, this measure does not take
* deleted documents into account.
* @see TermsEnum#totalTermFreq()
*/
public static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
final Terms terms = getTerms(r, field);
if (terms != null) {
final TermsEnum termsEnum = terms.iterator(null);
if (termsEnum.seekExact(text, true)) {
return termsEnum.totalTermFreq();
}
}
return 0;
}
/** Call this to get the (merged) FieldInfos for a
* composite reader.
* <p>

View File

@ -939,10 +939,73 @@ public void testFilesOpenClose() throws IOException {
writer.close();
try {
// Make sure codec impls totalTermFreq (eg PreFlex doesn't)
Assume.assumeTrue(MultiFields.totalTermFreq(r, "f", new BytesRef("b")) != -1);
assertEquals(1, MultiFields.totalTermFreq(r, "f", new BytesRef("b")));
assertEquals(2, MultiFields.totalTermFreq(r, "f", new BytesRef("a")));
assertEquals(1, MultiFields.totalTermFreq(r, "f", new BytesRef("b")));
Assume.assumeTrue(r.totalTermFreq(new Term("f", new BytesRef("b"))) != -1);
assertEquals(1, r.totalTermFreq(new Term("f", new BytesRef("b"))));
assertEquals(2, r.totalTermFreq(new Term("f", new BytesRef("a"))));
assertEquals(1, r.totalTermFreq(new Term("f", new BytesRef("b"))));
} finally {
r.close();
dir.close();
}
}
public void testGetSumDocFreq() throws Exception {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
Document d = new Document();
d.add(newTextField("f", "a", Field.Store.NO));
writer.addDocument(d);
d = new Document();
d.add(newTextField("f", "b", Field.Store.NO));
writer.addDocument(d);
DirectoryReader r = writer.getReader();
writer.close();
try {
// Make sure codec impls getSumDocFreq (eg PreFlex doesn't)
Assume.assumeTrue(r.getSumDocFreq("f") != -1);
assertEquals(2, r.getSumDocFreq("f"));
} finally {
r.close();
dir.close();
}
}
public void testGetDocCount() throws Exception {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
Document d = new Document();
d.add(newTextField("f", "a", Field.Store.NO));
writer.addDocument(d);
d = new Document();
d.add(newTextField("f", "a", Field.Store.NO));
writer.addDocument(d);
DirectoryReader r = writer.getReader();
writer.close();
try {
// Make sure codec impls getSumDocFreq (eg PreFlex doesn't)
Assume.assumeTrue(r.getDocCount("f") != -1);
assertEquals(2, r.getDocCount("f"));
} finally {
r.close();
dir.close();
}
}
public void testGetSumTotalTermFreq() throws Exception {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
Document d = new Document();
d.add(newTextField("f", "a b b", Field.Store.NO));
writer.addDocument(d);
d = new Document();
d.add(newTextField("f", "a a b", Field.Store.NO));
writer.addDocument(d);
DirectoryReader r = writer.getReader();
writer.close();
try {
// Make sure codec impls getSumDocFreq (eg PreFlex doesn't)
Assume.assumeTrue(r.getSumTotalTermFreq("f") != -1);
assertEquals(6, r.getSumTotalTermFreq("f"));
} finally {
r.close();
dir.close();

View File

@ -439,9 +439,8 @@ public class TestOmitTf extends LuceneTestCase {
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
Terms terms = MultiFields.getTerms(ir, "foo");
assertEquals(-1, MultiFields.totalTermFreq(ir, "foo", new BytesRef("bar")));
assertEquals(-1, terms.getSumTotalTermFreq());
assertEquals(-1, ir.totalTermFreq(new Term("foo", new BytesRef("bar"))));
assertEquals(-1, ir.getSumTotalTermFreq("foo"));
ir.close();
dir.close();
}