mirror of https://github.com/apache/lucene.git
#26396 - HighFreqTerms fixup from Jean-François Halleux
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150941 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8fa85d5cdb
commit
507626829b
|
@ -3,7 +3,7 @@ package org.apache.lucene.misc;
|
||||||
/* ====================================================================
|
/* ====================================================================
|
||||||
* The Apache Software License, Version 1.1
|
* The Apache Software License, Version 1.1
|
||||||
*
|
*
|
||||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
* Copyright (c) 2001,2004 The Apache Software Foundation. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -54,10 +54,10 @@ package org.apache.lucene.misc;
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.util.PriorityQueue;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.TermEnum;
|
import org.apache.lucene.index.TermEnum;
|
||||||
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <code>HighFreqTerms</code> class extracts terms and their frequencies out
|
* <code>HighFreqTerms</code> class extracts terms and their frequencies out
|
||||||
|
@ -65,77 +65,59 @@ import org.apache.lucene.index.TermEnum;
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
public class HighFreqTerms
|
public class HighFreqTerms {
|
||||||
{
|
|
||||||
public static int numTerms = 100;
|
// The top numTerms will be displayed
|
||||||
|
public static final int numTerms = 100;
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception
|
public static void main(String[] args) throws Exception {
|
||||||
{
|
IndexReader reader = null;
|
||||||
IndexReader reader = null;
|
if (args.length == 1) {
|
||||||
if (args.length == 1)
|
reader = IndexReader.open(args[0]);
|
||||||
{
|
} else {
|
||||||
reader = IndexReader.open(args[0]);
|
usage();
|
||||||
}
|
System.exit(1);
|
||||||
else
|
}
|
||||||
{
|
|
||||||
usage();
|
|
||||||
System.exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
TermInfoQueue tiq = new TermInfoQueue(numTerms);
|
TermInfoQueue tiq = new TermInfoQueue(numTerms);
|
||||||
TermEnum terms = reader.terms();
|
TermEnum terms = reader.terms();
|
||||||
|
|
||||||
int minFreq = 0;
|
while (terms.next()) {
|
||||||
while (terms.next())
|
tiq.insert(new TermInfo(terms.term(), terms.docFreq()));
|
||||||
{
|
}
|
||||||
if (terms.docFreq() > minFreq)
|
|
||||||
{
|
|
||||||
tiq.put(new TermInfo(terms.term(), terms.docFreq()));
|
|
||||||
if (tiq.size() > numTerms) // if tiq overfull
|
|
||||||
{
|
|
||||||
tiq.pop(); // remove lowest in tiq
|
|
||||||
minFreq = ((TermInfo)tiq.top()).docFreq; // reset minFreq
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
while (tiq.size() != 0)
|
while (tiq.size() != 0) {
|
||||||
{
|
TermInfo termInfo = (TermInfo) tiq.pop();
|
||||||
TermInfo termInfo = (TermInfo)tiq.pop();
|
System.out.println(termInfo.term + " " + termInfo.docFreq);
|
||||||
System.out.println(termInfo.term + " " + termInfo.docFreq);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
reader.close();
|
reader.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void usage()
|
private static void usage() {
|
||||||
{
|
System.out.println(
|
||||||
System.out.println("\n\n" +
|
"\n\n"
|
||||||
"java org.apache.lucene.misc.HighFreqTerms <index dir>\n\n");
|
+ "java org.apache.lucene.misc.HighFreqTerms <index dir>\n\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final class TermInfo
|
final class TermInfo {
|
||||||
{
|
TermInfo(Term t, int df) {
|
||||||
TermInfo(Term t, int df)
|
term = t;
|
||||||
{
|
docFreq = df;
|
||||||
term = t;
|
}
|
||||||
docFreq = df;
|
int docFreq;
|
||||||
}
|
Term term;
|
||||||
int docFreq;
|
|
||||||
Term term;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
final class TermInfoQueue extends PriorityQueue
|
final class TermInfoQueue extends PriorityQueue {
|
||||||
{
|
TermInfoQueue(int size) {
|
||||||
TermInfoQueue(int size)
|
initialize(size);
|
||||||
{
|
}
|
||||||
initialize(size);
|
|
||||||
}
|
protected final boolean lessThan(Object a, Object b) {
|
||||||
protected final boolean lessThan(Object a, Object b)
|
TermInfo termInfoA = (TermInfo) a;
|
||||||
{
|
TermInfo termInfoB = (TermInfo) b;
|
||||||
TermInfo termInfoA = (TermInfo)a;
|
return termInfoA.docFreq < termInfoB.docFreq;
|
||||||
TermInfo termInfoB = (TermInfo)b;
|
}
|
||||||
return termInfoA.docFreq < termInfoB.docFreq;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue