mirror of https://github.com/apache/lucene.git
LUCENE-4859: Expose more stats on IndexReader
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458907 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9946aeffb0
commit
dd3a8dff69
|
@ -113,6 +113,9 @@ New Features
|
|||
* LUCENE-4856: If there are no matches for a given field, return the
|
||||
first maxPassages sentences (Robert Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-4859: IndexReader now exposes Terms statistics: getDocCount,
|
||||
getSumDocFreq, getSumTotalTermFreq. (Shai Erera)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-4844: removed TaxonomyReader.getParent(), you should use
|
||||
|
|
|
@ -108,6 +108,33 @@ public abstract class AtomicReader extends IndexReader {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long getSumDocFreq(String field) throws IOException {
|
||||
final Terms terms = terms(field);
|
||||
if (terms == null) {
|
||||
return 0;
|
||||
}
|
||||
return terms.getSumDocFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int getDocCount(String field) throws IOException {
|
||||
final Terms terms = terms(field);
|
||||
if (terms == null) {
|
||||
return 0;
|
||||
}
|
||||
return terms.getDocCount();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long getSumTotalTermFreq(String field) throws IOException {
|
||||
final Terms terms = terms(field);
|
||||
if (terms == null) {
|
||||
return 0;
|
||||
}
|
||||
return terms.getSumTotalTermFreq();
|
||||
}
|
||||
|
||||
/** This may return null if the field does not exist.*/
|
||||
public final Terms terms(String field) throws IOException {
|
||||
final Fields fields = fields();
|
||||
|
|
|
@ -146,6 +146,48 @@ public abstract class BaseCompositeReader<R extends IndexReader> extends Composi
|
|||
return total;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long getSumDocFreq(String field) throws IOException {
|
||||
ensureOpen();
|
||||
long total = 0; // sum doc freqs in subreaders
|
||||
for (R reader : subReaders) {
|
||||
long sub = reader.getSumDocFreq(field);
|
||||
if (sub == -1) {
|
||||
return -1; // if any of the subs doesn't support it, return -1
|
||||
}
|
||||
total += sub;
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int getDocCount(String field) throws IOException {
|
||||
ensureOpen();
|
||||
int total = 0; // sum doc counts in subreaders
|
||||
for (R reader : subReaders) {
|
||||
int sub = reader.getDocCount(field);
|
||||
if (sub == -1) {
|
||||
return -1; // if any of the subs doesn't support it, return -1
|
||||
}
|
||||
total += sub;
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long getSumTotalTermFreq(String field) throws IOException {
|
||||
ensureOpen();
|
||||
long total = 0; // sum doc total term freqs in subreaders
|
||||
for (R reader : subReaders) {
|
||||
long sub = reader.getSumTotalTermFreq(field);
|
||||
if (sub == -1) {
|
||||
return -1; // if any of the subs doesn't support it, return -1
|
||||
}
|
||||
total += sub;
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
/** Helper method for subclasses to get the corresponding reader for a doc ID */
|
||||
protected final int readerIndex(int docID) {
|
||||
if (docID < 0 || docID >= maxDoc) {
|
||||
|
|
|
@ -446,4 +446,33 @@ public abstract class IndexReader implements Closeable {
|
|||
* @see TermsEnum#totalTermFreq()
|
||||
*/
|
||||
public abstract long totalTermFreq(Term term) throws IOException;
|
||||
|
||||
/**
|
||||
* Returns the sum of {@link TermsEnum#docFreq()} for all terms in this field,
|
||||
* or -1 if this measure isn't stored by the codec. Note that, just like other
|
||||
* term measures, this measure does not take deleted documents into account.
|
||||
*
|
||||
* @see Terms#getSumDocFreq()
|
||||
*/
|
||||
public abstract long getSumDocFreq(String field) throws IOException;
|
||||
|
||||
/**
|
||||
* Returns the number of documents that have at least one term for this field,
|
||||
* or -1 if this measure isn't stored by the codec. Note that, just like other
|
||||
* term measures, this measure does not take deleted documents into account.
|
||||
*
|
||||
* @see Terms#getDocCount()
|
||||
*/
|
||||
public abstract int getDocCount(String field) throws IOException;
|
||||
|
||||
/**
|
||||
* Returns the sum of {@link TermsEnum#totalTermFreq} for all terms in this
|
||||
* field, or -1 if this measure isn't stored by the codec (or if this fields
|
||||
* omits term freq and positions). Note that, just like other term measures,
|
||||
* this measure does not take deleted documents into account.
|
||||
*
|
||||
* @see Terms#getSumTotalTermFreq()
|
||||
*/
|
||||
public abstract long getSumTotalTermFreq(String field) throws IOException;
|
||||
|
||||
}
|
||||
|
|
|
@ -243,25 +243,6 @@ public final class MultiFields extends Fields {
|
|||
return -1;
|
||||
}
|
||||
|
||||
/** Returns the total number of occurrences of this term
|
||||
* across all documents (the sum of the freq() for each
|
||||
* doc that has this term). This will be -1 if the
|
||||
* codec doesn't support this measure. Note that, like
|
||||
* other term measures, this measure does not take
|
||||
* deleted documents into account.
|
||||
* @see TermsEnum#totalTermFreq()
|
||||
*/
|
||||
public static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
|
||||
final Terms terms = getTerms(r, field);
|
||||
if (terms != null) {
|
||||
final TermsEnum termsEnum = terms.iterator(null);
|
||||
if (termsEnum.seekExact(text, true)) {
|
||||
return termsEnum.totalTermFreq();
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/** Call this to get the (merged) FieldInfos for a
|
||||
* composite reader.
|
||||
* <p>
|
||||
|
|
|
@ -939,10 +939,73 @@ public void testFilesOpenClose() throws IOException {
|
|||
writer.close();
|
||||
try {
|
||||
// Make sure codec impls totalTermFreq (eg PreFlex doesn't)
|
||||
Assume.assumeTrue(MultiFields.totalTermFreq(r, "f", new BytesRef("b")) != -1);
|
||||
assertEquals(1, MultiFields.totalTermFreq(r, "f", new BytesRef("b")));
|
||||
assertEquals(2, MultiFields.totalTermFreq(r, "f", new BytesRef("a")));
|
||||
assertEquals(1, MultiFields.totalTermFreq(r, "f", new BytesRef("b")));
|
||||
Assume.assumeTrue(r.totalTermFreq(new Term("f", new BytesRef("b"))) != -1);
|
||||
assertEquals(1, r.totalTermFreq(new Term("f", new BytesRef("b"))));
|
||||
assertEquals(2, r.totalTermFreq(new Term("f", new BytesRef("a"))));
|
||||
assertEquals(1, r.totalTermFreq(new Term("f", new BytesRef("b"))));
|
||||
} finally {
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testGetSumDocFreq() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
|
||||
Document d = new Document();
|
||||
d.add(newTextField("f", "a", Field.Store.NO));
|
||||
writer.addDocument(d);
|
||||
d = new Document();
|
||||
d.add(newTextField("f", "b", Field.Store.NO));
|
||||
writer.addDocument(d);
|
||||
DirectoryReader r = writer.getReader();
|
||||
writer.close();
|
||||
try {
|
||||
// Make sure codec impls getSumDocFreq (eg PreFlex doesn't)
|
||||
Assume.assumeTrue(r.getSumDocFreq("f") != -1);
|
||||
assertEquals(2, r.getSumDocFreq("f"));
|
||||
} finally {
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testGetDocCount() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
|
||||
Document d = new Document();
|
||||
d.add(newTextField("f", "a", Field.Store.NO));
|
||||
writer.addDocument(d);
|
||||
d = new Document();
|
||||
d.add(newTextField("f", "a", Field.Store.NO));
|
||||
writer.addDocument(d);
|
||||
DirectoryReader r = writer.getReader();
|
||||
writer.close();
|
||||
try {
|
||||
// Make sure codec impls getSumDocFreq (eg PreFlex doesn't)
|
||||
Assume.assumeTrue(r.getDocCount("f") != -1);
|
||||
assertEquals(2, r.getDocCount("f"));
|
||||
} finally {
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testGetSumTotalTermFreq() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
|
||||
Document d = new Document();
|
||||
d.add(newTextField("f", "a b b", Field.Store.NO));
|
||||
writer.addDocument(d);
|
||||
d = new Document();
|
||||
d.add(newTextField("f", "a a b", Field.Store.NO));
|
||||
writer.addDocument(d);
|
||||
DirectoryReader r = writer.getReader();
|
||||
writer.close();
|
||||
try {
|
||||
// Make sure codec impls getSumTotalTermFreq (eg PreFlex doesn't)
|
||||
Assume.assumeTrue(r.getSumTotalTermFreq("f") != -1);
|
||||
assertEquals(6, r.getSumTotalTermFreq("f"));
|
||||
} finally {
|
||||
r.close();
|
||||
dir.close();
|
||||
|
|
|
@ -439,9 +439,8 @@ public class TestOmitTf extends LuceneTestCase {
|
|||
iw.addDocument(doc);
|
||||
IndexReader ir = iw.getReader();
|
||||
iw.close();
|
||||
Terms terms = MultiFields.getTerms(ir, "foo");
|
||||
assertEquals(-1, MultiFields.totalTermFreq(ir, "foo", new BytesRef("bar")));
|
||||
assertEquals(-1, terms.getSumTotalTermFreq());
|
||||
assertEquals(-1, ir.totalTermFreq(new Term("foo", new BytesRef("bar"))));
|
||||
assertEquals(-1, ir.getSumTotalTermFreq("foo"));
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue