LUCENE-5200: HighFreqTerms has confusing behavior with -t option
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1520615 13f79535-47bb-0310-9956-ffa450edef68
commit 91c01ddb5f
parent 9a930b8806

@@ -223,6 +223,10 @@ Changes in Runtime Behavior
 * LUCENE-5178: DocValues codec consumer APIs (iterables) return null values
   when the document has no value for the field. (Robert Muir)
 
+* LUCENE-5200: The HighFreqTerms command-line tool returns the true top-N
+  by totalTermFreq when using the -t option; it uses the term statistics (faster)
+  and now always shows totalTermFreq in the output. (Robert Muir)
+
 Optimizations
 
 * LUCENE-5088: Added TermFilter to filter docs by a specific term.
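The command-line form is unchanged and is shown in the usage() string further down: java org.apache.lucene.misc.HighFreqTerms <index dir> [-t] [number_terms] [field]. As a hypothetical illustration (the index path is a placeholder and the classpath is omitted), a run printing the top 100 terms across all fields ordered by totalTermFreq would look like:

    java org.apache.lucene.misc.HighFreqTerms /path/to/index -t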
@@ -20,7 +20,6 @@ package org.apache.lucene.misc;
 import java.io.File;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
@@ -50,9 +49,8 @@ public class GetTermInfo {
 
   public static void getTermInfo(Directory dir, Term term) throws Exception {
     IndexReader reader = DirectoryReader.open(dir);
-    long totalTF = HighFreqTerms.getTotalTermFreq(reader, term);
     System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
-        term.field(), term.text(), totalTF, reader.docFreq(term));
+        term.field(), term.text(), reader.totalTermFreq(term), reader.docFreq(term));
   }
 
   private static void usage() {
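The old helper aggregated totalTermFreq per index leaf; after this change GetTermInfo reads both statistics straight off the IndexReader. A minimal standalone sketch of the same lookup, not part of the patch; the index path, field, and term are placeholders:

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TermInfoSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index")); // placeholder path
    IndexReader reader = DirectoryReader.open(dir);
    Term term = new Term("body", "lucene");                       // placeholder field:term
    // totalTermFreq = occurrences of the term across all documents,
    // docFreq = number of documents containing the term
    System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
        term.field(), term.text(), reader.totalTermFreq(term), reader.docFreq(term));
    reader.close();
    dir.close();
  }
}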
@@ -17,26 +17,19 @@ package org.apache.lucene.misc;
  * limitations under the License.
  */
 
-import org.apache.lucene.index.AtomicReader;
-import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.ReaderUtil;
-import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.Bits;
 
 import java.io.File;
 import java.io.IOException;
-import java.util.Arrays;
 import java.util.Comparator;
 
 /**
@@ -51,27 +44,24 @@ import java.util.Comparator;
 public class HighFreqTerms {
 
   // The top numTerms will be displayed
-  public static final int DEFAULTnumTerms = 100;
-  public static int numTerms = DEFAULTnumTerms;
+  public static final int DEFAULT_NUMTERMS = 100;
 
   public static void main(String[] args) throws Exception {
-    IndexReader reader = null;
-    FSDirectory dir = null;
     String field = null;
-    boolean IncludeTermFreqs = false;
+    int numTerms = DEFAULT_NUMTERMS;
 
     if (args.length == 0 || args.length > 4) {
       usage();
       System.exit(1);
     }
 
-    if (args.length > 0) {
-      dir = FSDirectory.open(new File(args[0]));
-    }
+    Directory dir = FSDirectory.open(new File(args[0]));
+
+    Comparator<TermStats> comparator = new DocFreqComparator();
 
     for (int i = 1; i < args.length; i++) {
       if (args[i].equals("-t")) {
-        IncludeTermFreqs = true;
+        comparator = new TotalTermFreqComparator();
       }
       else{
         try {
@@ -82,22 +72,12 @@ public class HighFreqTerms {
         }
       }
 
-    reader = DirectoryReader.open(dir);
-    TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
-    if (!IncludeTermFreqs) {
-      //default HighFreqTerms behavior
+    IndexReader reader = DirectoryReader.open(dir);
+    TermStats[] terms = getHighFreqTerms(reader, numTerms, field, comparator);
+
     for (int i = 0; i < terms.length; i++) {
-      System.out.printf("%s:%s %,d \n",
-        terms[i].field, terms[i].termtext.utf8ToString(), terms[i].docFreq);
-    }
-    }
-    else{
-      TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
-      for (int i = 0; i < termsWithTF.length; i++) {
-        System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
-          termsWithTF[i].field, termsWithTF[i].termtext.utf8ToString(),
-          termsWithTF[i].totalTermFreq, termsWithTF[i].docFreq);
-      }
+      System.out.printf("%s:%s \t totalTF = %,d \t docFreq = %,d \n",
+          terms[i].field, terms[i].termtext.utf8ToString(), terms[i].totalTermFreq, terms[i].docFreq);
     }
     reader.close();
   }
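A minimal sketch (not part of the patch, shown as a method body; the index path is a placeholder and imports mirror those in HighFreqTerms) of the equivalent programmatic call after the rewrite:

IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
// default ordering: docFreq; pass TotalTermFreqComparator for the -t behavior
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, 100, null,
    new HighFreqTerms.DocFreqComparator());
for (TermStats stat : terms) {
  System.out.printf("%s:%s \t totalTF = %,d \t docFreq = %,d \n",
      stat.field, stat.termtext.utf8ToString(), stat.totalTermFreq, stat.docFreq);
}
reader.close();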
@@ -105,12 +85,13 @@ public class HighFreqTerms {
   private static void usage() {
     System.out
         .println("\n\n"
-            + "java org.apache.lucene.misc.HighFreqTerms <index dir> [-t] [number_terms] [field]\n\t -t: include totalTermFreq\n\n");
+            + "java org.apache.lucene.misc.HighFreqTerms <index dir> [-t] [number_terms] [field]\n\t -t: order by totalTermFreq\n\n");
   }
 
   /**
-   * Returns TermStats[] ordered by terms with highest docFreq first.
+   * Returns TermStats[] ordered by the specified comparator
    */
-  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field) throws Exception {
+  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field, Comparator<TermStats> comparator) throws Exception {
     TermStatsQueue tiq = null;
 
     if (field != null) {
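Continuing the sketch above, the -t path now corresponds to swapping in the other comparator rather than running a second sort pass; the numTerms and field values here are placeholders:

TermStats[] topByTotalTF = HighFreqTerms.getHighFreqTerms(reader, 50, "body",
    new HighFreqTerms.TotalTermFreqComparator());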
@@ -121,7 +102,7 @@ public class HighFreqTerms {
       Terms terms = fields.terms(field);
       if (terms != null) {
         TermsEnum termsEnum = terms.iterator(null);
-        tiq = new TermStatsQueue(numTerms);
+        tiq = new TermStatsQueue(numTerms, comparator);
         tiq.fill(field, termsEnum);
       }
     } else {
@@ -129,7 +110,7 @@ public class HighFreqTerms {
       if (fields == null) {
         throw new RuntimeException("no fields found for this index");
       }
-      tiq = new TermStatsQueue(numTerms);
+      tiq = new TermStatsQueue(numTerms, comparator);
       for (String fieldName : fields) {
         Terms terms = fields.terms(fieldName);
         if (terms != null) {
@@ -150,91 +131,61 @@ public class HighFreqTerms {
   }
 
   /**
-   * Takes array of TermStats. For each term looks up the tf for each doc
-   * containing the term and stores the total in the output array of TermStats.
-   * Output array is sorted by highest total tf.
-   *
-   * @param terms
-   *          TermStats[]
-   * @return TermStats[]
+   * Compares terms by docTermFreq
    */
-
-  public static TermStats[] sortByTotalTermFreq(IndexReader reader, TermStats[] terms) throws Exception {
-    TermStats[] ts = new TermStats[terms.length]; // array for sorting
-    long totalTF;
-    for (int i = 0; i < terms.length; i++) {
-      totalTF = getTotalTermFreq(reader, new Term(terms[i].field, terms[i].termtext));
-      ts[i] = new TermStats(terms[i].field, terms[i].termtext, terms[i].docFreq, totalTF);
-    }
-
-    Comparator<TermStats> c = new TotalTermFreqComparatorSortDescending();
-    Arrays.sort(ts, c);
-
-    return ts;
-  }
-
-  public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception {
-    long totalTF = 0L;
-    for (final AtomicReaderContext ctx : reader.leaves()) {
-      AtomicReader r = ctx.reader();
-      if (!r.hasDeletions()) {
-        // TODO: we could do this up front, during the scan
-        // (next()), instead of after-the-fact here w/ seek,
-        // if the codec supports it and there are no del
-        // docs...
-        final long totTF = r.totalTermFreq(term);
-        if (totTF != -1) {
-          totalTF += totTF;
-          continue;
-        } // otherwise we fall-through
-      }
-      // note: what should we do if field omits freqs? currently it counts as 1...
-      DocsEnum de = r.termDocsEnum(term);
-      if (de != null) {
-        while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)
-          totalTF += de.freq();
-      }
-    }
-
-    return totalTF;
-  }
-}
-
-/**
- * Comparator
- *
- * Reverse of normal Comparator. i.e. returns 1 if a.totalTermFreq is less than
- * b.totalTermFreq So we can sort in descending order of totalTermFreq
- */
-
-final class TotalTermFreqComparatorSortDescending implements Comparator<TermStats> {
+  public static final class DocFreqComparator implements Comparator<TermStats> {
 
     @Override
     public int compare(TermStats a, TermStats b) {
-      return Long.compare(b.totalTermFreq, a.totalTermFreq);
+      int res = Long.compare(a.docFreq, b.docFreq);
+      if (res == 0) {
+        res = a.field.compareTo(b.field);
+        if (res == 0) {
+          res = a.termtext.compareTo(b.termtext);
+        }
+      }
+      return res;
     }
   }
 
   /**
-   * Priority queue for TermStats objects ordered by docFreq
+   * Compares terms by totalTermFreq
    */
+  public static final class TotalTermFreqComparator implements Comparator<TermStats> {
+
+    @Override
+    public int compare(TermStats a, TermStats b) {
+      int res = Long.compare(a.totalTermFreq, b.totalTermFreq);
+      if (res == 0) {
+        res = a.field.compareTo(b.field);
+        if (res == 0) {
+          res = a.termtext.compareTo(b.termtext);
+        }
+      }
+      return res;
+    }
+  }
+
+  /**
+   * Priority queue for TermStats objects
+   **/
-final class TermStatsQueue extends PriorityQueue<TermStats> {
-  TermStatsQueue(int size) {
+  static final class TermStatsQueue extends PriorityQueue<TermStats> {
+    final Comparator<TermStats> comparator;
+
+    TermStatsQueue(int size, Comparator<TermStats> comparator) {
       super(size);
+      this.comparator = comparator;
     }
 
     @Override
     protected boolean lessThan(TermStats termInfoA, TermStats termInfoB) {
-      return termInfoA.docFreq < termInfoB.docFreq;
+      return comparator.compare(termInfoA, termInfoB) < 0;
     }
 
     protected void fill(String field, TermsEnum termsEnum) throws IOException {
-      while (true) {
-        BytesRef term = termsEnum.next();
-        if (term != null) {
-          insertWithOverflow(new TermStats(field, term, termsEnum.docFreq()));
-        } else {
-          break;
+      BytesRef term = null;
+      while ((term = termsEnum.next()) != null) {
+        insertWithOverflow(new TermStats(field, term, termsEnum.docFreq(), termsEnum.totalTermFreq()));
       }
     }
   }
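The design note behind the new TermStatsQueue: lessThan() simply delegates to the injected Comparator, so one bounded queue serves both orderings, and both comparators break ties on field and term bytes so the output order is deterministic. A rough stand-alone sketch of that "bounded heap + pluggable Comparator" pattern using the JDK's PriorityQueue (a hypothetical helper, not Lucene's org.apache.lucene.util.PriorityQueue):

import java.util.Comparator;
import java.util.PriorityQueue;

final class BoundedTopN<T> {
  private final int maxSize;
  private final PriorityQueue<T> heap; // min-heap ordered by the supplied comparator

  BoundedTopN(int maxSize, Comparator<T> comparator) {
    this.maxSize = maxSize;
    this.heap = new PriorityQueue<T>(maxSize, comparator);
  }

  // Equivalent in spirit to insertWithOverflow(): keep only the maxSize largest elements.
  void insertWithOverflow(T element) {
    heap.offer(element);
    if (heap.size() > maxSize) {
      heap.poll(); // evict the current smallest
    }
  }
}

Swapping DocFreqComparator for TotalTermFreqComparator changes the ordering without touching the queue itself, which is why the patch removes the separate sortByTotalTermFreq() pass entirely.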
@@ -29,12 +29,6 @@ public final class TermStats {
   public int docFreq;
   public long totalTermFreq;
 
-  TermStats(String field, BytesRef termtext, int df) {
-    this.termtext = BytesRef.deepCopyOf(termtext);
-    this.field = field;
-    this.docFreq = df;
-  }
-
   TermStats(String field, BytesRef termtext, int df, long tf) {
     this.termtext = BytesRef.deepCopyOf(termtext);
     this.field = field;
@@ -26,9 +26,7 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
 import org.junit.AfterClass;
@@ -66,21 +64,21 @@ public class TestHighFreqTerms extends LuceneTestCase {
   public void testFirstTermHighestDocFreqAllFields () throws Exception{
     int numTerms = 12;
     String field =null;
-    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
     assertEquals("Term with highest docfreq is first", 20,terms[0].docFreq );
   }
 
   public void testFirstTermHighestDocFreq () throws Exception{
     int numTerms = 12;
     String field="FIELD_1";
-    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
     assertEquals("Term with highest docfreq is first", 10,terms[0].docFreq );
   }
 
   public void testOrderedByDocFreqDescending () throws Exception{
     int numTerms = 12;
     String field="FIELD_1";
-    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
     for (int i = 0; i < terms.length; i++) {
       if (i > 0) {
         assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
@@ -91,14 +89,14 @@ public class TestHighFreqTerms extends LuceneTestCase {
   public void testNumTerms () throws Exception{
     int numTerms = 12;
     String field = null;
-    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
     assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length);
   }
 
   public void testGetHighFreqTerms () throws Exception{
     int numTerms=12;
     String field="FIELD_1";
-    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
 
     for (int i = 0; i < terms.length; i++) {
       String termtext = terms[i].termtext.utf8ToString();
@@ -122,30 +120,27 @@ public class TestHighFreqTerms extends LuceneTestCase {
   public void testFirstTermHighestTotalTermFreq () throws Exception{
     int numTerms = 20;
     String field = null;
-    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
-    TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
-    assertEquals("Term with highest totalTermFreq is first",200, termsWithTotalTermFreq[0].totalTermFreq);
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
+    assertEquals("Term with highest totalTermFreq is first",200, terms[0].totalTermFreq);
   }
 
   public void testFirstTermHighestTotalTermFreqDifferentField () throws Exception{
     int numTerms = 20;
     String field = "different_field";
-    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
-    TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
-    assertEquals("Term with highest totalTermFreq is first"+ termsWithTotalTermFreq[0].getTermText(),150, termsWithTotalTermFreq[0].totalTermFreq);
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
+    assertEquals("Term with highest totalTermFreq is first"+ terms[0].getTermText(),150, terms[0].totalTermFreq);
   }
 
   public void testOrderedByTermFreqDescending () throws Exception{
     int numTerms = 12;
     String field = "FIELD_1";
-    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
-    TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
 
-    for (int i = 0; i < termsWithTF.length; i++) {
+    for (int i = 0; i < terms.length; i++) {
       // check that they are sorted by descending termfreq
       // order
       if (i > 0) {
-        assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq);
+        assertTrue ("out of order" +terms[i-1]+ " > " +terms[i],terms[i-1].totalTermFreq >= terms[i].totalTermFreq);
       }
     }
   }
@@ -153,49 +148,29 @@ public class TestHighFreqTerms extends LuceneTestCase {
   public void testGetTermFreqOrdered () throws Exception{
     int numTerms = 12;
     String field = "FIELD_1";
-    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
-    TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
 
-    for (int i = 0; i < termsWithTF.length; i++) {
-      String text = termsWithTF[i].termtext.utf8ToString();
+    for (int i = 0; i < terms.length; i++) {
+      String text = terms[i].termtext.utf8ToString();
       if (text.contains("highTF")) {
         if (text.contains("medDF")) {
           assertEquals("total term freq is expected", 125,
-              termsWithTF[i].totalTermFreq);
+              terms[i].totalTermFreq);
         } else {
           assertEquals("total term freq is expected", 200,
-              termsWithTF[i].totalTermFreq);
+              terms[i].totalTermFreq);
         }
 
       } else {
         int n = Integer.parseInt(text);
         assertEquals("doc freq is expected", getExpecteddocFreq(n),
-            termsWithTF[i].docFreq);
+            terms[i].docFreq);
         assertEquals("total term freq is expected", getExpectedtotalTermFreq(n),
-            termsWithTF[i].totalTermFreq);
+            terms[i].totalTermFreq);
       }
     }
   }
 
-  /********************Tests for getTotalTermFreq**********************************/
-
-  public void testGetTotalTermFreq() throws Exception{
-    String term ="highTF";
-    BytesRef termtext = new BytesRef (term);
-    String field = "FIELD_1";
-    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext));
-    assertEquals("highTf tf should be 200",200,totalTermFreq);
-
-  }
-
-  public void testGetTotalTermFreqBadTerm() throws Exception{
-    String term ="foobar";
-    BytesRef termtext = new BytesRef (term);
-    String field = "FIELD_1";
-    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext));
-    assertEquals("totalTermFreq should be 0 for term not in index",0,totalTermFreq);
-
-  }
   /********************Testing Utils**********************************/
 
   private static void indexDocs(IndexWriter writer) throws Exception {