mirror of https://github.com/apache/lucene.git
LUCENE-2393: add total TF tracking to HighFreqTerms tool
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@943936 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
48cca13075
commit
ee5ec3ed32
|
@ -154,6 +154,9 @@ New features
|
|||
|
||||
* LUCENE-2437: Add an Analyzer for Indonesian. (Robert Muir)
|
||||
|
||||
* LUCENE-2393: The HighFreqTerms tool (in misc) can now optionally
|
||||
also include the total termFreq. (Tom Burton-West via Mike McCandless)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
package org.apache.lucene.misc;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
|
||||
/*
|
||||
* Utility to get document frequency and total number of occurrences (sum of the tf for each doc) of a term.
|
||||
*/
|
||||
public class GetTermInfo {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
FSDirectory dir = null;
|
||||
String inputStr = null;
|
||||
String field = null;
|
||||
|
||||
if (args.length == 3) {
|
||||
dir = FSDirectory.open(new File(args[0]));
|
||||
field = args[1];
|
||||
inputStr = args[2];
|
||||
} else {
|
||||
usage();
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
getTermInfo(dir,field, new BytesRef(inputStr));
|
||||
}
|
||||
|
||||
public static void getTermInfo(Directory dir, String field, BytesRef termtext) throws Exception {
|
||||
IndexReader reader = IndexReader.open(dir);
|
||||
Terms terms =MultiFields.getTerms(reader, field);
|
||||
long totalTF = HighFreqTerms.getTotalTermFreq(reader, field, termtext);
|
||||
System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
|
||||
field, termtext.utf8ToString(), totalTF, terms.docFreq(termtext));
|
||||
}
|
||||
|
||||
private static void usage() {
|
||||
System.out
|
||||
.println("\n\nusage:\n\t"
|
||||
+ "java org.apache.lucene.index.GetTermInfo <index dir> field term \n\n");
|
||||
}
|
||||
}
|
|
@ -1,132 +1,270 @@
|
|||
package org.apache.lucene.misc;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.FieldReaderException;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* <code>HighFreqTerms</code> class extracts terms and their frequencies out
|
||||
* of an existing Lucene index.
|
||||
*
|
||||
* <code>HighFreqTerms</code> class extracts the top n most frequent terms
|
||||
* (by document frequency ) from an existing Lucene index and reports their document frequencey.
|
||||
* If the -t flag is and reports both their document frequency and their total tf (total number of occurences)
|
||||
* in order of highest total tf
|
||||
*/
|
||||
public class HighFreqTerms {
|
||||
|
||||
// The top numTerms will be displayed
|
||||
public static final int numTerms = 100;
|
||||
|
||||
public static final int DEFAULTnumTerms = 100;
|
||||
public static int numTerms = DEFAULTnumTerms;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
IndexReader reader = null;
|
||||
FSDirectory dir = null;
|
||||
String field = null;
|
||||
if (args.length == 1) {
|
||||
dir = FSDirectory.open(new File(args[0]));
|
||||
reader = IndexReader.open(dir, true);
|
||||
} else if (args.length == 2) {
|
||||
dir = FSDirectory.open(new File(args[0]));
|
||||
reader = IndexReader.open(dir, true);
|
||||
field = args[1];
|
||||
} else {
|
||||
boolean IncludeTermFreqs = false;
|
||||
|
||||
if (args.length == 0 || args.length > 4) {
|
||||
usage();
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
if (args.length > 0) {
|
||||
dir = FSDirectory.open(new File(args[0]));
|
||||
}
|
||||
|
||||
TermInfoQueue tiq = new TermInfoQueue(numTerms);
|
||||
|
||||
if (field != null) {
|
||||
|
||||
for (int i = 1; i < args.length; i++) {
|
||||
if (args[i].equals("-t")) {
|
||||
IncludeTermFreqs = true;
|
||||
}
|
||||
else{
|
||||
try {
|
||||
numTerms = Integer.parseInt(args[i]);
|
||||
} catch (NumberFormatException e) {
|
||||
field=args[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
reader = IndexReader.open(dir, true);
|
||||
TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
|
||||
if (!IncludeTermFreqs) {
|
||||
//default HighFreqTerms behavior
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
System.out.printf("%s:%s %,d \n",
|
||||
terms[i].field, terms[i].termtext.utf8ToString(), terms[i].docFreq);
|
||||
}
|
||||
}
|
||||
else{
|
||||
TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
|
||||
for (int i = 0; i < termsWithTF.length; i++) {
|
||||
System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n",
|
||||
termsWithTF[i].field, termsWithTF[i].termtext.utf8ToString(),
|
||||
termsWithTF[i].totalTermFreq, termsWithTF[i].docFreq);
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
|
||||
private static void usage() {
|
||||
System.out
|
||||
.println("\n\n"
|
||||
+ "java org.apache.lucene.misc.HighFreqTerms <index dir> [-t][number_terms] [field]\n\t -t: include totalTermFreq\n\n");
|
||||
}
|
||||
/**
|
||||
*
|
||||
* @param reader
|
||||
* @param numTerms
|
||||
* @param field
|
||||
* @return TermStats[] ordered by terms with highest docFreq first.
|
||||
* @throws Exception
|
||||
*/
|
||||
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field) throws Exception {
|
||||
TermStatsQueue tiq = null;
|
||||
|
||||
if (field != null) {
|
||||
Fields fields = MultiFields.getFields(reader);
|
||||
if (fields == null) {
|
||||
return;
|
||||
throw new FieldReaderException("field " + field + " not found");
|
||||
}
|
||||
Terms terms = fields.terms(field);
|
||||
if (terms != null) {
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
while(true) {
|
||||
BytesRef term = termsEnum.next();
|
||||
if (term != null) {
|
||||
tiq.insertWithOverflow(new TermInfo(new Term(field, term.utf8ToString()), termsEnum.docFreq()));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
tiq = new TermStatsQueue(numTerms);
|
||||
fillQueue(termsEnum, tiq, field);
|
||||
}
|
||||
} else {
|
||||
Fields fields = MultiFields.getFields(reader);
|
||||
if (fields == null) {
|
||||
return;
|
||||
throw new FieldReaderException("no fields found for this index");
|
||||
}
|
||||
tiq = new TermStatsQueue(numTerms);
|
||||
FieldsEnum fieldsEnum = fields.iterator();
|
||||
while(true) {
|
||||
while (true) {
|
||||
field = fieldsEnum.next();
|
||||
if (field != null) {
|
||||
TermsEnum terms = fieldsEnum.terms();
|
||||
while(true) {
|
||||
BytesRef term = terms.next();
|
||||
if (term != null) {
|
||||
tiq.insertWithOverflow(new TermInfo(new Term(field, term.toString()), terms.docFreq()));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
fillQueue(terms, tiq, field);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TermStats[] result = new TermStats[tiq.size()];
|
||||
// we want highest first so we read the queue and populate the array
|
||||
// starting at the end and work backwards
|
||||
int count = tiq.size() - 1;
|
||||
while (tiq.size() != 0) {
|
||||
TermInfo termInfo = tiq.pop();
|
||||
System.out.println(termInfo.term + " " + termInfo.docFreq);
|
||||
result[count] = tiq.pop();
|
||||
count--;
|
||||
}
|
||||
|
||||
reader.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes array of TermStats. For each term looks up the tf for each doc
|
||||
* containing the term and stores the total in the output array of TermStats.
|
||||
* Output array is sorted by highest total tf.
|
||||
*
|
||||
* @param reader
|
||||
* @param terms
|
||||
* TermStats[]
|
||||
* @return TermStats[]
|
||||
* @throws Exception
|
||||
*/
|
||||
|
||||
public static TermStats[] sortByTotalTermFreq(IndexReader reader, TermStats[] terms) throws Exception {
|
||||
TermStats[] ts = new TermStats[terms.length]; // array for sorting
|
||||
long totalTF;
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
totalTF = getTotalTermFreq(reader, terms[i].field, terms[i].termtext);
|
||||
ts[i] = new TermStats(terms[i].field, terms[i].termtext, terms[i].docFreq, totalTF);
|
||||
}
|
||||
|
||||
Comparator<TermStats> c = new TotalTermFreqComparatorSortDescending();
|
||||
Arrays.sort(ts, c);
|
||||
|
||||
return ts;
|
||||
}
|
||||
|
||||
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
|
||||
BytesRef br = termtext;
|
||||
long totalTF = 0;
|
||||
Bits skipDocs = MultiFields.getDeletedDocs(reader);
|
||||
DocsEnum de = MultiFields.getTermDocsEnum(reader, skipDocs, field, br);
|
||||
// if term is not in index return totalTF of 0
|
||||
if (de == null) {
|
||||
return 0;
|
||||
}
|
||||
// use DocsEnum.read() and BulkResult api
|
||||
final DocsEnum.BulkReadResult bulkresult = de.getBulkResult();
|
||||
int count;
|
||||
while ((count = de.read()) != 0) {
|
||||
final int[] freqs = bulkresult.freqs.ints;
|
||||
final int limit = bulkresult.freqs.offset + count;
|
||||
for(int i=bulkresult.freqs.offset;i<limit;i++) {
|
||||
totalTF += freqs[i];
|
||||
}
|
||||
}
|
||||
return totalTF;
|
||||
}
|
||||
|
||||
public static void fillQueue(TermsEnum termsEnum, TermStatsQueue tiq, String field) throws Exception {
|
||||
|
||||
while (true) {
|
||||
BytesRef term = termsEnum.next();
|
||||
if (term != null) {
|
||||
tiq.insertWithOverflow(new TermStats(field, term, termsEnum.docFreq()));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void usage() {
|
||||
System.out.println(
|
||||
"\n\n"
|
||||
+ "java org.apache.lucene.misc.HighFreqTerms <index dir> [field]\n\n");
|
||||
/**
|
||||
* Comparator
|
||||
*
|
||||
* Reverse of normal Comparator. i.e. returns 1 if a.totalTermFreq is less than
|
||||
* b.totalTermFreq So we can sort in descending order of totalTermFreq
|
||||
*/
|
||||
|
||||
final class TotalTermFreqComparatorSortDescending implements Comparator<TermStats> {
|
||||
|
||||
public int compare(TermStats a, TermStats b) {
|
||||
if (a.totalTermFreq < b.totalTermFreq) {
|
||||
return 1;
|
||||
} else if (a.totalTermFreq > b.totalTermFreq) {
|
||||
return -1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final class TermInfo {
|
||||
TermInfo(Term t, int df) {
|
||||
term = t;
|
||||
docFreq = df;
|
||||
}
|
||||
final class TermStats {
|
||||
BytesRef termtext;
|
||||
String field;
|
||||
int docFreq;
|
||||
Term term;
|
||||
long totalTermFreq;
|
||||
|
||||
TermStats(String field, BytesRef termtext, int df) {
|
||||
this.termtext = new BytesRef(termtext);
|
||||
this.field = field;
|
||||
this.docFreq = df;
|
||||
}
|
||||
|
||||
TermStats(String field, BytesRef termtext, int df, long tf) {
|
||||
this.termtext = new BytesRef(termtext);
|
||||
this.field = field;
|
||||
this.docFreq = df;
|
||||
this.totalTermFreq = tf;
|
||||
}
|
||||
|
||||
String getTermText() {
|
||||
return termtext.utf8ToString();
|
||||
}
|
||||
}
|
||||
|
||||
final class TermInfoQueue extends PriorityQueue<TermInfo> {
|
||||
TermInfoQueue(int size) {
|
||||
/**
|
||||
* Priority queue for TermStats objects ordered by docFreq
|
||||
**/
|
||||
final class TermStatsQueue extends PriorityQueue<TermStats> {
|
||||
TermStatsQueue(int size) {
|
||||
initialize(size);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final boolean lessThan(TermInfo termInfoA, TermInfo termInfoB) {
|
||||
protected boolean lessThan(TermStats termInfoA, TermStats termInfoB) {
|
||||
return termInfoA.docFreq < termInfoB.docFreq;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,272 @@
|
|||
package org.apache.lucene.misc;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.store.MockRAMDirectory;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
public class TestHighFreqTerms extends LuceneTestCase {
|
||||
|
||||
private static IndexWriter writer =null;
|
||||
private static MockRAMDirectory dir = null;
|
||||
private static IndexReader reader =null;
|
||||
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
dir= new MockRAMDirectory();
|
||||
writer = new IndexWriter(dir, new IndexWriterConfig(
|
||||
TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))
|
||||
.setMaxBufferedDocs(2));
|
||||
indexDocs(writer);
|
||||
reader = IndexReader.open(dir, true);
|
||||
}
|
||||
|
||||
public void tearDown()throws Exception{
|
||||
super.tearDown();
|
||||
reader.close();
|
||||
}
|
||||
/******************** Tests for getHighFreqTerms **********************************/
|
||||
|
||||
// test without specifying field (i.e. if we pass in field=null it should examine all fields)
|
||||
// the term "diff" in the field "different_field" occurs 20 times and is the highest df term
|
||||
public static void testFirstTermHighestDocFreqAllFields () throws Exception{
|
||||
int numTerms = 12;
|
||||
String field =null;
|
||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||
assertEquals("Term with highest docfreq is first", 20,terms[0].docFreq );
|
||||
}
|
||||
|
||||
public static void testFirstTermHighestDocFreq () throws Exception{
|
||||
int numTerms = 12;
|
||||
String field="FIELD_1";
|
||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||
assertEquals("Term with highest docfreq is first", 10,terms[0].docFreq );
|
||||
}
|
||||
|
||||
public static void testOrderedByDocFreqDescending () throws Exception{
|
||||
int numTerms = 12;
|
||||
String field="FIELD_1";
|
||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
if (i >0){
|
||||
assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void testNumTerms () throws Exception{
|
||||
int numTerms = 12;
|
||||
String field = null;
|
||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||
assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length);
|
||||
}
|
||||
|
||||
public static void testGetHighFreqTerms () throws Exception{
|
||||
int numTerms=12;
|
||||
String field="FIELD_1";
|
||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
String termtext = terms[i].termtext.utf8ToString();
|
||||
// hardcoded highTF or highTFmedDF
|
||||
if (termtext.contains("highTF")) {
|
||||
if (termtext.contains("medDF")) {
|
||||
assertEquals("doc freq is not as expected", 5, terms[i].docFreq);
|
||||
} else {
|
||||
assertEquals("doc freq is not as expected", 1, terms[i].docFreq);
|
||||
}
|
||||
} else {
|
||||
int n = Integer.parseInt(termtext);
|
||||
assertEquals("doc freq is not as expected", getExpecteddocFreq(n),
|
||||
terms[i].docFreq);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/********************Test sortByTotalTermFreq**********************************/
|
||||
|
||||
public static void testFirstTermHighestTotalTermFreq () throws Exception{
|
||||
int numTerms = 20;
|
||||
String field = null;
|
||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||
TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
|
||||
assertEquals("Term with highest totalTermFreq is first",200, termsWithTotalTermFreq[0].totalTermFreq);
|
||||
}
|
||||
|
||||
public static void testFirstTermHighestTotalTermFreqDifferentField () throws Exception{
|
||||
int numTerms = 20;
|
||||
String field = "different_field";
|
||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||
TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
|
||||
assertEquals("Term with highest totalTermFreq is first"+ termsWithTotalTermFreq[0].getTermText(),150, termsWithTotalTermFreq[0].totalTermFreq);
|
||||
}
|
||||
|
||||
public static void testOrderedByTermFreqDescending () throws Exception{
|
||||
int numTerms = 12;
|
||||
String field = "FIELD_1";
|
||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
|
||||
|
||||
for (int i = 0; i < termsWithTF.length; i++) {
|
||||
// check that they are sorted by descending termfreq order
|
||||
if (i >0){
|
||||
assertTrue ("out of order" +termsWithTF[i-1]+ " > " +termsWithTF[i],termsWithTF[i-1].totalTermFreq > termsWithTF[i].totalTermFreq);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void testGetTermFreqOrdered () throws Exception{
|
||||
int numTerms = 12;
|
||||
String field = "FIELD_1";
|
||||
TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
|
||||
TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
|
||||
|
||||
for (int i = 0; i < termsWithTF.length; i++) {
|
||||
String text = termsWithTF[i].termtext.utf8ToString();
|
||||
if (text.contains("highTF")) {
|
||||
if (text.contains("medDF")) {
|
||||
assertEquals("total term freq is expected", 125,
|
||||
termsWithTF[i].totalTermFreq);
|
||||
} else {
|
||||
assertEquals("total term freq is expected", 200,
|
||||
termsWithTF[i].totalTermFreq);
|
||||
}
|
||||
|
||||
} else {
|
||||
int n = Integer.parseInt(text);
|
||||
assertEquals("doc freq is expected", getExpecteddocFreq(n),
|
||||
termsWithTF[i].docFreq);
|
||||
assertEquals("total term freq is expected", getExpectedtotalTermFreq(n),
|
||||
termsWithTF[i].totalTermFreq);
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
|
||||
/********************Tests for getTotalTermFreq**********************************/
|
||||
|
||||
public static void testGetTotalTermFreq() throws Exception{
|
||||
String term ="highTF";
|
||||
BytesRef termtext = new BytesRef (term);
|
||||
String field = "FIELD_1";
|
||||
long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, field, termtext);
|
||||
reader.close();
|
||||
assertEquals("highTf tf should be 200",200,totalTermFreq);
|
||||
|
||||
}
|
||||
|
||||
public static void testGetTotalTermFreqBadTerm() throws Exception{
|
||||
String term ="foobar";
|
||||
BytesRef termtext = new BytesRef (term);
|
||||
String field = "FIELD_1";
|
||||
long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, field, termtext);
|
||||
reader.close();
|
||||
assertEquals("totalTermFreq should be 0 for term not in index",0,totalTermFreq);
|
||||
|
||||
}
|
||||
/********************Testing Utils**********************************/
|
||||
|
||||
private static void indexDocs(IndexWriter writer) throws Exception {
|
||||
|
||||
/**
|
||||
* Generate 10 documents where term n has a docFreq of n and a totalTermFreq of n*2 (squared).
|
||||
*/
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
Document doc = new Document();
|
||||
String content = getContent(i);
|
||||
|
||||
doc.add(new Field("FIELD_1", content, Field.Store.YES,Field.Index.ANALYZED, Field.TermVector.NO));
|
||||
//add a different field
|
||||
doc.add(new Field("different_field", "diff", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
//add 10 more docs with the term "diff" this will make it have the highest docFreq if we don't ask for the
|
||||
//highest freq terms for a specific field.
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("different_field", "diff", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
// add some docs where tf < df so we can see if sorting works
|
||||
// highTF low df
|
||||
int highTF = 200;
|
||||
Document doc = new Document();
|
||||
String content = "";
|
||||
for (int i = 0; i < highTF; i++) {
|
||||
content += "highTF ";
|
||||
}
|
||||
doc.add(new Field("FIELD_1", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
|
||||
writer.addDocument(doc);
|
||||
// highTF medium df =5
|
||||
int medium_df = 5;
|
||||
for (int i = 0; i < medium_df; i++) {
|
||||
int tf = 25;
|
||||
Document newdoc = new Document();
|
||||
String newcontent = "";
|
||||
for (int j = 0; j < tf; j++) {
|
||||
newcontent += "highTFmedDF ";
|
||||
}
|
||||
newdoc.add(new Field("FIELD_1", newcontent, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
|
||||
writer.addDocument(newdoc);
|
||||
}
|
||||
// add a doc with high tf in field different_field
|
||||
int targetTF =150;
|
||||
doc = new Document();
|
||||
content = "";
|
||||
for (int i = 0; i < targetTF; i++) {
|
||||
content += "TF150 ";
|
||||
}
|
||||
doc.add(new Field("different_field", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* getContent
|
||||
* return string containing numbers 1 to i with each number n occurring n times.
|
||||
* i.e. for input of 3 return string "3 3 3 2 2 1"
|
||||
*/
|
||||
|
||||
private static String getContent(int i) {
|
||||
String s = "";
|
||||
for (int j = 10; j >= i; j--) {
|
||||
for (int k = 0; k < j; k++) {
|
||||
// if j is 3 we return "3 3 3"
|
||||
s += String.valueOf(j) + " ";
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
private static int getExpectedtotalTermFreq(int i) {
|
||||
return getExpecteddocFreq(i) * i;
|
||||
}
|
||||
|
||||
private static int getExpecteddocFreq(int i) {
|
||||
return i;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue