LUCENE-5200: HighFreqTerms has confusing behavior with -t option

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1520615 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-09-06 15:56:38 +00:00
parent 9a930b8806
commit 91c01ddb5f
5 changed files with 93 additions and 171 deletions

View File

@ -223,6 +223,10 @@ Changes in Runtime Behavior
* LUCENE-5178: DocValues codec consumer APIs (iterables) return null values
when the document has no value for the field. (Robert Muir)
* LUCENE-5200: The HighFreqTerms command-line tool returns the true top-N
by totalTermFreq when using the -t option; it uses the term statistics (faster)
and now always shows totalTermFreq in the output. (Robert Muir)
Optimizations
* LUCENE-5088: Added TermFilter to filter docs by a specific term.

View File

@ -20,7 +20,6 @@ package org.apache.lucene.misc;
import java.io.File;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
@ -50,9 +49,8 @@ public class GetTermInfo {
public static void getTermInfo(Directory dir, Term term) throws Exception {
IndexReader reader = DirectoryReader.open(dir);
long totalTF = HighFreqTerms.getTotalTermFreq(reader, term);
System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
term.field(), term.text(), totalTF, reader.docFreq(term));
term.field(), term.text(), reader.totalTermFreq(term), reader.docFreq(term));
}
private static void usage() {

View File

@ -17,26 +17,19 @@ package org.apache.lucene.misc;
* limitations under the License.
*/
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Bits;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
/**
@ -51,27 +44,24 @@ import java.util.Comparator;
public class HighFreqTerms {
// The top numTerms will be displayed
public static final int DEFAULTnumTerms = 100;
public static int numTerms = DEFAULTnumTerms;
public static final int DEFAULT_NUMTERMS = 100;
public static void main(String[] args) throws Exception {
IndexReader reader = null;
FSDirectory dir = null;
String field = null;
boolean IncludeTermFreqs = false;
int numTerms = DEFAULT_NUMTERMS;
if (args.length == 0 || args.length > 4) {
usage();
System.exit(1);
}
if (args.length > 0) {
dir = FSDirectory.open(new File(args[0]));
}
Directory dir = FSDirectory.open(new File(args[0]));
Comparator<TermStats> comparator = new DocFreqComparator();
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-t")) {
IncludeTermFreqs = true;
comparator = new TotalTermFreqComparator();
}
else{
try {
@ -82,22 +72,12 @@ public class HighFreqTerms {
}
}
reader = DirectoryReader.open(dir);
TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
if (!IncludeTermFreqs) {
//default HighFreqTerms behavior
IndexReader reader = DirectoryReader.open(dir);
TermStats[] terms = getHighFreqTerms(reader, numTerms, field, comparator);
for (int i = 0; i < terms.length; i++) {
System.out.printf("%s:%s %,d \n",
terms[i].field, terms[i].termtext.utf8ToString(), terms[i].docFreq);
}
}
else{
TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
for (int i = 0; i < termsWithTF.length; i++) {
System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
termsWithTF[i].field, termsWithTF[i].termtext.utf8ToString(),
termsWithTF[i].totalTermFreq, termsWithTF[i].docFreq);
}
System.out.printf("%s:%s \t totalTF = %,d \t docFreq = %,d \n",
terms[i].field, terms[i].termtext.utf8ToString(), terms[i].totalTermFreq, terms[i].docFreq);
}
reader.close();
}
@ -105,12 +85,13 @@ public class HighFreqTerms {
private static void usage() {
System.out
.println("\n\n"
+ "java org.apache.lucene.misc.HighFreqTerms <index dir> [-t] [number_terms] [field]\n\t -t: include totalTermFreq\n\n");
+ "java org.apache.lucene.misc.HighFreqTerms <index dir> [-t] [number_terms] [field]\n\t -t: order by totalTermFreq\n\n");
}
/**
* Returns TermStats[] ordered by terms with highest docFreq first.
* Returns TermStats[] ordered by the specified comparator
*/
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field) throws Exception {
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field, Comparator<TermStats> comparator) throws Exception {
TermStatsQueue tiq = null;
if (field != null) {
@ -121,7 +102,7 @@ public class HighFreqTerms {
Terms terms = fields.terms(field);
if (terms != null) {
TermsEnum termsEnum = terms.iterator(null);
tiq = new TermStatsQueue(numTerms);
tiq = new TermStatsQueue(numTerms, comparator);
tiq.fill(field, termsEnum);
}
} else {
@ -129,7 +110,7 @@ public class HighFreqTerms {
if (fields == null) {
throw new RuntimeException("no fields found for this index");
}
tiq = new TermStatsQueue(numTerms);
tiq = new TermStatsQueue(numTerms, comparator);
for (String fieldName : fields) {
Terms terms = fields.terms(fieldName);
if (terms != null) {
@ -150,91 +131,61 @@ public class HighFreqTerms {
}
/**
* Takes array of TermStats. For each term looks up the tf for each doc
* containing the term and stores the total in the output array of TermStats.
* Output array is sorted by highest total tf.
*
* @param terms
* TermStats[]
* @return TermStats[]
* Compares terms by docFreq
*/
public static TermStats[] sortByTotalTermFreq(IndexReader reader, TermStats[] terms) throws Exception {
TermStats[] ts = new TermStats[terms.length]; // array for sorting
long totalTF;
for (int i = 0; i < terms.length; i++) {
totalTF = getTotalTermFreq(reader, new Term(terms[i].field, terms[i].termtext));
ts[i] = new TermStats(terms[i].field, terms[i].termtext, terms[i].docFreq, totalTF);
}
Comparator<TermStats> c = new TotalTermFreqComparatorSortDescending();
Arrays.sort(ts, c);
return ts;
}
public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception {
long totalTF = 0L;
for (final AtomicReaderContext ctx : reader.leaves()) {
AtomicReader r = ctx.reader();
if (!r.hasDeletions()) {
// TODO: we could do this up front, during the scan
// (next()), instead of after-the-fact here w/ seek,
// if the codec supports it and there are no del
// docs...
final long totTF = r.totalTermFreq(term);
if (totTF != -1) {
totalTF += totTF;
continue;
} // otherwise we fall-through
}
// note: what should we do if field omits freqs? currently it counts as 1...
DocsEnum de = r.termDocsEnum(term);
if (de != null) {
while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)
totalTF += de.freq();
}
}
return totalTF;
}
}
/**
* Comparator
*
* Reverse of normal Comparator. i.e. returns 1 if a.totalTermFreq is less than
* b.totalTermFreq So we can sort in descending order of totalTermFreq
*/
final class TotalTermFreqComparatorSortDescending implements Comparator<TermStats> {
public static final class DocFreqComparator implements Comparator<TermStats> {
@Override
public int compare(TermStats a, TermStats b) {
return Long.compare(b.totalTermFreq, a.totalTermFreq);
int res = Long.compare(a.docFreq, b.docFreq);
if (res == 0) {
res = a.field.compareTo(b.field);
if (res == 0) {
res = a.termtext.compareTo(b.termtext);
}
}
return res;
}
}
/**
* Priority queue for TermStats objects ordered by docFreq
* Compares terms by totalTermFreq
*/
public static final class TotalTermFreqComparator implements Comparator<TermStats> {
  /**
   * Orders two entries by ascending totalTermFreq. Entries with equal
   * totalTermFreq are ordered by field, and then by term text, so the
   * result is zero only when all three compare as equal.
   */
  @Override
  public int compare(TermStats a, TermStats b) {
    final int byTotalTermFreq = Long.compare(a.totalTermFreq, b.totalTermFreq);
    if (byTotalTermFreq != 0) {
      return byTotalTermFreq;
    }
    // Same frequency: fall back to the field name as the first tie-breaker.
    final int byField = a.field.compareTo(b.field);
    if (byField != 0) {
      return byField;
    }
    // Same frequency and field: the term text decides.
    return a.termtext.compareTo(b.termtext);
  }
}
/**
* Priority queue for TermStats objects
**/
final class TermStatsQueue extends PriorityQueue<TermStats> {
TermStatsQueue(int size) {
static final class TermStatsQueue extends PriorityQueue<TermStats> {
final Comparator<TermStats> comparator;
TermStatsQueue(int size, Comparator<TermStats> comparator) {
super(size);
this.comparator = comparator;
}
@Override
protected boolean lessThan(TermStats termInfoA, TermStats termInfoB) {
return termInfoA.docFreq < termInfoB.docFreq;
return comparator.compare(termInfoA, termInfoB) < 0;
}
protected void fill(String field, TermsEnum termsEnum) throws IOException {
while (true) {
BytesRef term = termsEnum.next();
if (term != null) {
insertWithOverflow(new TermStats(field, term, termsEnum.docFreq()));
} else {
break;
BytesRef term = null;
while ((term = termsEnum.next()) != null) {
insertWithOverflow(new TermStats(field, term, termsEnum.docFreq(), termsEnum.totalTermFreq()));
}
}
}

View File

@ -29,12 +29,6 @@ public final class TermStats {
public int docFreq;
public long totalTermFreq;
TermStats(String field, BytesRef termtext, int df) {
this.termtext = BytesRef.deepCopyOf(termtext);
this.field = field;
this.docFreq = df;
}
TermStats(String field, BytesRef termtext, int df, long tf) {
this.termtext = BytesRef.deepCopyOf(termtext);
this.field = field;

View File

@ -26,9 +26,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
@ -66,21 +64,21 @@ public class TestHighFreqTerms extends LuceneTestCase {
public void testFirstTermHighestDocFreqAllFields () throws Exception{
int numTerms = 12;
String field =null;
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
assertEquals("Term with highest docfreq is first", 20,terms[0].docFreq );
}
public void testFirstTermHighestDocFreq () throws Exception{
int numTerms = 12;
String field="FIELD_1";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
assertEquals("Term with highest docfreq is first", 10,terms[0].docFreq );
}
public void testOrderedByDocFreqDescending () throws Exception{
int numTerms = 12;
String field="FIELD_1";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
for (int i = 0; i < terms.length; i++) {
if (i > 0) {
assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
@ -91,14 +89,14 @@ public class TestHighFreqTerms extends LuceneTestCase {
public void testNumTerms () throws Exception{
int numTerms = 12;
String field = null;
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length);
}
public void testGetHighFreqTerms () throws Exception{
int numTerms=12;
String field="FIELD_1";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
for (int i = 0; i < terms.length; i++) {
String termtext = terms[i].termtext.utf8ToString();
@ -122,30 +120,27 @@ public class TestHighFreqTerms extends LuceneTestCase {
public void testFirstTermHighestTotalTermFreq () throws Exception{
int numTerms = 20;
String field = null;
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
assertEquals("Term with highest totalTermFreq is first",200, termsWithTotalTermFreq[0].totalTermFreq);
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
assertEquals("Term with highest totalTermFreq is first",200, terms[0].totalTermFreq);
}
public void testFirstTermHighestTotalTermFreqDifferentField () throws Exception{
int numTerms = 20;
String field = "different_field";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
assertEquals("Term with highest totalTermFreq is first"+ termsWithTotalTermFreq[0].getTermText(),150, termsWithTotalTermFreq[0].totalTermFreq);
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
assertEquals("Term with highest totalTermFreq is first"+ terms[0].getTermText(),150, terms[0].totalTermFreq);
}
public void testOrderedByTermFreqDescending () throws Exception{
int numTerms = 12;
String field = "FIELD_1";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
for (int i = 0; i < termsWithTF.length; i++) {
for (int i = 0; i < terms.length; i++) {
// check that they are sorted by descending termfreq
// order
if (i > 0) {
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq);
assertTrue ("out of order" +terms[i-1]+ " > " +terms[i],terms[i-1].totalTermFreq >= terms[i].totalTermFreq);
}
}
}
@ -153,49 +148,29 @@ public class TestHighFreqTerms extends LuceneTestCase {
public void testGetTermFreqOrdered () throws Exception{
int numTerms = 12;
String field = "FIELD_1";
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
for (int i = 0; i < termsWithTF.length; i++) {
String text = termsWithTF[i].termtext.utf8ToString();
for (int i = 0; i < terms.length; i++) {
String text = terms[i].termtext.utf8ToString();
if (text.contains("highTF")) {
if (text.contains("medDF")) {
assertEquals("total term freq is expected", 125,
termsWithTF[i].totalTermFreq);
terms[i].totalTermFreq);
} else {
assertEquals("total term freq is expected", 200,
termsWithTF[i].totalTermFreq);
terms[i].totalTermFreq);
}
} else {
int n = Integer.parseInt(text);
assertEquals("doc freq is expected", getExpecteddocFreq(n),
termsWithTF[i].docFreq);
terms[i].docFreq);
assertEquals("total term freq is expected", getExpectedtotalTermFreq(n),
termsWithTF[i].totalTermFreq);
terms[i].totalTermFreq);
}
}
}
/********************Tests for getTotalTermFreq**********************************/
public void testGetTotalTermFreq() throws Exception{
String term ="highTF";
BytesRef termtext = new BytesRef (term);
String field = "FIELD_1";
long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext));
assertEquals("highTf tf should be 200",200,totalTermFreq);
}
public void testGetTotalTermFreqBadTerm() throws Exception{
String term ="foobar";
BytesRef termtext = new BytesRef (term);
String field = "FIELD_1";
long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext));
assertEquals("totalTermFreq should be 0 for term not in index",0,totalTermFreq);
}
/********************Testing Utils**********************************/
private static void indexDocs(IndexWriter writer) throws Exception {