diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c61cc7f7c5e..d987b7b4c6e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -223,6 +223,10 @@ Changes in Runtime Behavior * LUCENE-5178: DocValues codec consumer APIs (iterables) return null values when the document has no value for the field. (Robert Muir) +* LUCENE-5200: The HighFreqTerms command-line tool returns the true top-N + by totalTermFreq when using the -t option, it uses the term statistics (faster) + and now always shows totalTermFreq in the output. (Robert Muir) + Optimizations * LUCENE-5088: Added TermFilter to filter docs by a specific term. diff --git a/lucene/misc/src/java/org/apache/lucene/misc/GetTermInfo.java b/lucene/misc/src/java/org/apache/lucene/misc/GetTermInfo.java index 30ce3f4297e..fb86b99aaba 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/GetTermInfo.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/GetTermInfo.java @@ -20,7 +20,6 @@ package org.apache.lucene.misc; import java.io.File; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; @@ -50,9 +49,8 @@ public class GetTermInfo { public static void getTermInfo(Directory dir, Term term) throws Exception { IndexReader reader = DirectoryReader.open(dir); - long totalTF = HighFreqTerms.getTotalTermFreq(reader, term); System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n", - term.field(), term.text(), totalTF, reader.docFreq(term)); + term.field(), term.text(), reader.totalTermFreq(term), reader.docFreq(term)); } private static void usage() { diff --git a/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java b/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java index ab53aaf9f47..6b7860719a3 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java @@ -17,26 +17,19 @@ package org.apache.lucene.misc; * limitations under the License. */ -import org.apache.lucene.index.AtomicReader; -import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Fields; -import org.apache.lucene.index.ReaderUtil; -import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.Terms; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.Bits; import java.io.File; import java.io.IOException; -import java.util.Arrays; import java.util.Comparator; /** @@ -51,27 +44,24 @@ import java.util.Comparator; public class HighFreqTerms { // The top numTerms will be displayed - public static final int DEFAULTnumTerms = 100; - public static int numTerms = DEFAULTnumTerms; + public static final int DEFAULT_NUMTERMS = 100; public static void main(String[] args) throws Exception { - IndexReader reader = null; - FSDirectory dir = null; String field = null; - boolean IncludeTermFreqs = false; + int numTerms = DEFAULT_NUMTERMS; if (args.length == 0 || args.length > 4) { usage(); System.exit(1); } - if (args.length > 0) { - dir = FSDirectory.open(new File(args[0])); - } + Directory dir = FSDirectory.open(new File(args[0])); + + Comparator comparator = new DocFreqComparator(); for (int i = 1; i < args.length; i++) { if (args[i].equals("-t")) { - IncludeTermFreqs = true; + comparator = new TotalTermFreqComparator(); } else{ try { @@ -82,22 +72,12 @@ public class HighFreqTerms { } } - reader = DirectoryReader.open(dir); - TermStats[] terms = getHighFreqTerms(reader, numTerms, field); - if (!IncludeTermFreqs) { - //default HighFreqTerms behavior - for (int i = 0; i < terms.length; i++) { - System.out.printf("%s:%s %,d \n", - terms[i].field, terms[i].termtext.utf8ToString(), terms[i].docFreq); - } - } - else{ - TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms); - for (int i = 0; i < termsWithTF.length; i++) { - System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n", - termsWithTF[i].field, termsWithTF[i].termtext.utf8ToString(), - termsWithTF[i].totalTermFreq, termsWithTF[i].docFreq); - } + IndexReader reader = DirectoryReader.open(dir); + TermStats[] terms = getHighFreqTerms(reader, numTerms, field, comparator); + + for (int i = 0; i < terms.length; i++) { + System.out.printf("%s:%s \t totalTF = %,d \t docFreq = %,d \n", + terms[i].field, terms[i].termtext.utf8ToString(), terms[i].totalTermFreq, terms[i].docFreq); } reader.close(); } @@ -105,12 +85,13 @@ public class HighFreqTerms { private static void usage() { System.out .println("\n\n" - + "java org.apache.lucene.misc.HighFreqTerms [-t] [number_terms] [field]\n\t -t: include totalTermFreq\n\n"); + + "java org.apache.lucene.misc.HighFreqTerms [-t] [number_terms] [field]\n\t -t: order by totalTermFreq\n\n"); } + /** - * Returns TermStats[] ordered by terms with highest docFreq first. + * Returns TermStats[] ordered by the specified comparator */ - public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field) throws Exception { + public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field, Comparator comparator) throws Exception { TermStatsQueue tiq = null; if (field != null) { @@ -121,7 +102,7 @@ public class HighFreqTerms { Terms terms = fields.terms(field); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); - tiq = new TermStatsQueue(numTerms); + tiq = new TermStatsQueue(numTerms, comparator); tiq.fill(field, termsEnum); } } else { @@ -129,7 +110,7 @@ public class HighFreqTerms { if (fields == null) { throw new RuntimeException("no fields found for this index"); } - tiq = new TermStatsQueue(numTerms); + tiq = new TermStatsQueue(numTerms, comparator); for (String fieldName : fields) { Terms terms = fields.terms(fieldName); if (terms != null) { @@ -150,91 +131,61 @@ public class HighFreqTerms { } /** - * Takes array of TermStats. For each term looks up the tf for each doc - * containing the term and stores the total in the output array of TermStats. - * Output array is sorted by highest total tf. - * - * @param terms - * TermStats[] - * @return TermStats[] + * Compares terms by docTermFreq */ + public static final class DocFreqComparator implements Comparator { + + @Override + public int compare(TermStats a, TermStats b) { + int res = Long.compare(a.docFreq, b.docFreq); + if (res == 0) { + res = a.field.compareTo(b.field); + if (res == 0) { + res = a.termtext.compareTo(b.termtext); + } + } + return res; + } + } + + /** + * Compares terms by totalTermFreq + */ + public static final class TotalTermFreqComparator implements Comparator { + + @Override + public int compare(TermStats a, TermStats b) { + int res = Long.compare(a.totalTermFreq, b.totalTermFreq); + if (res == 0) { + res = a.field.compareTo(b.field); + if (res == 0) { + res = a.termtext.compareTo(b.termtext); + } + } + return res; + } + } - public static TermStats[] sortByTotalTermFreq(IndexReader reader, TermStats[] terms) throws Exception { - TermStats[] ts = new TermStats[terms.length]; // array for sorting - long totalTF; - for (int i = 0; i < terms.length; i++) { - totalTF = getTotalTermFreq(reader, new Term(terms[i].field, terms[i].termtext)); - ts[i] = new TermStats(terms[i].field, terms[i].termtext, terms[i].docFreq, totalTF); + /** + * Priority queue for TermStats objects + **/ + static final class TermStatsQueue extends PriorityQueue { + final Comparator comparator; + + TermStatsQueue(int size, Comparator comparator) { + super(size); + this.comparator = comparator; } - Comparator c = new TotalTermFreqComparatorSortDescending(); - Arrays.sort(ts, c); - - return ts; - } - - public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception { - long totalTF = 0L; - for (final AtomicReaderContext ctx : reader.leaves()) { - AtomicReader r = ctx.reader(); - if (!r.hasDeletions()) { - // TODO: we could do this up front, during the scan - // (next()), instead of after-the-fact here w/ seek, - // if the codec supports it and there are no del - // docs... - final long totTF = r.totalTermFreq(term); - if (totTF != -1) { - totalTF += totTF; - continue; - } // otherwise we fall-through - } - // note: what should we do if field omits freqs? currently it counts as 1... - DocsEnum de = r.termDocsEnum(term); - if (de != null) { - while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) - totalTF += de.freq(); - } + @Override + protected boolean lessThan(TermStats termInfoA, TermStats termInfoB) { + return comparator.compare(termInfoA, termInfoB) < 0; } - return totalTF; - } - } - -/** - * Comparator - * - * Reverse of normal Comparator. i.e. returns 1 if a.totalTermFreq is less than - * b.totalTermFreq So we can sort in descending order of totalTermFreq - */ - -final class TotalTermFreqComparatorSortDescending implements Comparator { - - @Override - public int compare(TermStats a, TermStats b) { - return Long.compare(b.totalTermFreq, a.totalTermFreq); - } -} - -/** - * Priority queue for TermStats objects ordered by docFreq - **/ -final class TermStatsQueue extends PriorityQueue { - TermStatsQueue(int size) { - super(size); - } - - @Override - protected boolean lessThan(TermStats termInfoA, TermStats termInfoB) { - return termInfoA.docFreq < termInfoB.docFreq; - } - - protected void fill(String field, TermsEnum termsEnum) throws IOException { - while (true) { - BytesRef term = termsEnum.next(); - if (term != null) { - insertWithOverflow(new TermStats(field, term, termsEnum.docFreq())); - } else { - break; + protected void fill(String field, TermsEnum termsEnum) throws IOException { + BytesRef term = null; + while ((term = termsEnum.next()) != null) { + insertWithOverflow(new TermStats(field, term, termsEnum.docFreq(), termsEnum.totalTermFreq())); } } } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/TermStats.java b/lucene/misc/src/java/org/apache/lucene/misc/TermStats.java index b7b43113ab6..1c38f1d4ffb 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/TermStats.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/TermStats.java @@ -29,12 +29,6 @@ public final class TermStats { public int docFreq; public long totalTermFreq; - TermStats(String field, BytesRef termtext, int df) { - this.termtext = BytesRef.deepCopyOf(termtext); - this.field = field; - this.docFreq = df; - } - TermStats(String field, BytesRef termtext, int df, long tf) { this.termtext = BytesRef.deepCopyOf(termtext); this.field = field; diff --git a/lucene/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java b/lucene/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java index 8e36cc93a0e..4faef8feb3f 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java @@ -26,9 +26,7 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; import org.junit.AfterClass; @@ -66,21 +64,21 @@ public class TestHighFreqTerms extends LuceneTestCase { public void testFirstTermHighestDocFreqAllFields () throws Exception{ int numTerms = 12; String field =null; - TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); + TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); assertEquals("Term with highest docfreq is first", 20,terms[0].docFreq ); } public void testFirstTermHighestDocFreq () throws Exception{ int numTerms = 12; String field="FIELD_1"; - TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); + TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); assertEquals("Term with highest docfreq is first", 10,terms[0].docFreq ); } public void testOrderedByDocFreqDescending () throws Exception{ int numTerms = 12; String field="FIELD_1"; - TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); + TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); for (int i = 0; i < terms.length; i++) { if (i > 0) { assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq); @@ -91,14 +89,14 @@ public class TestHighFreqTerms extends LuceneTestCase { public void testNumTerms () throws Exception{ int numTerms = 12; String field = null; - TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); + TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length); } public void testGetHighFreqTerms () throws Exception{ int numTerms=12; String field="FIELD_1"; - TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); + TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); for (int i = 0; i < terms.length; i++) { String termtext = terms[i].termtext.utf8ToString(); @@ -122,30 +120,27 @@ public class TestHighFreqTerms extends LuceneTestCase { public void testFirstTermHighestTotalTermFreq () throws Exception{ int numTerms = 20; String field = null; - TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); - TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms); - assertEquals("Term with highest totalTermFreq is first",200, termsWithTotalTermFreq[0].totalTermFreq); + TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator()); + assertEquals("Term with highest totalTermFreq is first",200, terms[0].totalTermFreq); } public void testFirstTermHighestTotalTermFreqDifferentField () throws Exception{ int numTerms = 20; String field = "different_field"; - TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); - TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms); - assertEquals("Term with highest totalTermFreq is first"+ termsWithTotalTermFreq[0].getTermText(),150, termsWithTotalTermFreq[0].totalTermFreq); + TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator()); + assertEquals("Term with highest totalTermFreq is first"+ terms[0].getTermText(),150, terms[0].totalTermFreq); } public void testOrderedByTermFreqDescending () throws Exception{ int numTerms = 12; String field = "FIELD_1"; - TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); - TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms); + TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator()); - for (int i = 0; i < termsWithTF.length; i++) { + for (int i = 0; i < terms.length; i++) { // check that they are sorted by descending termfreq // order if (i > 0) { - assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq >= termsWithTF[i].totalTermFreq); + assertTrue ("out of order" +terms[i-1]+ " > " +terms[i],terms[i-1].totalTermFreq >= terms[i].totalTermFreq); } } } @@ -153,49 +148,29 @@ public class TestHighFreqTerms extends LuceneTestCase { public void testGetTermFreqOrdered () throws Exception{ int numTerms = 12; String field = "FIELD_1"; - TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field); - TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms); + TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator()); - for (int i = 0; i < termsWithTF.length; i++) { - String text = termsWithTF[i].termtext.utf8ToString(); + for (int i = 0; i < terms.length; i++) { + String text = terms[i].termtext.utf8ToString(); if (text.contains("highTF")) { if (text.contains("medDF")) { assertEquals("total term freq is expected", 125, - termsWithTF[i].totalTermFreq); + terms[i].totalTermFreq); } else { assertEquals("total term freq is expected", 200, - termsWithTF[i].totalTermFreq); + terms[i].totalTermFreq); } } else { int n = Integer.parseInt(text); assertEquals("doc freq is expected", getExpecteddocFreq(n), - termsWithTF[i].docFreq); + terms[i].docFreq); assertEquals("total term freq is expected", getExpectedtotalTermFreq(n), - termsWithTF[i].totalTermFreq); + terms[i].totalTermFreq); } } } - - /********************Tests for getTotalTermFreq**********************************/ - - public void testGetTotalTermFreq() throws Exception{ - String term ="highTF"; - BytesRef termtext = new BytesRef (term); - String field = "FIELD_1"; - long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext)); - assertEquals("highTf tf should be 200",200,totalTermFreq); - - } - - public void testGetTotalTermFreqBadTerm() throws Exception{ - String term ="foobar"; - BytesRef termtext = new BytesRef (term); - String field = "FIELD_1"; - long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, new Term(field, termtext)); - assertEquals("totalTermFreq should be 0 for term not in index",0,totalTermFreq); - - } + /********************Testing Utils**********************************/ private static void indexDocs(IndexWriter writer) throws Exception {