LUCENE-3555: add support for distributed stats

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1197455 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-11-04 09:23:56 +00:00
parent 5f8d4fc8af
commit b19a207c86
21 changed files with 278 additions and 348 deletions

View File

@ -0,0 +1,72 @@
package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader; // javadocs
import org.apache.lucene.index.Terms; // javadocs
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Value holder for the statistics of one field across a whole collection.
 * All fields are final and assigned exactly once, in the constructor.
 * @lucene.experimental
 */
public class CollectionStatistics {
  private final String field;
  private final int maxDoc;
  private final int docCount;
  private final long sumTotalTermFreq;
  private final long sumDocFreq;

  /**
   * Stores the supplied values as-is; no validation is performed.
   *
   * @param field            name of the field the statistics belong to
   * @param maxDoc           total number of documents
   * @param docCount         number of documents with at least one term for the field
   * @param sumTotalTermFreq total number of tokens for the field
   * @param sumDocFreq       total number of postings for the field
   */
  public CollectionStatistics(String field, int maxDoc, int docCount, long sumTotalTermFreq, long sumDocFreq) {
    this.field = field;
    this.maxDoc = maxDoc;
    this.docCount = docCount;
    this.sumTotalTermFreq = sumTotalTermFreq;
    this.sumDocFreq = sumDocFreq;
  }

  /** @return the name of the field these statistics describe */
  public String field() { return field; }

  /**
   * @return the total number of documents, whether or not
   *         each of them has a value for this field
   * @see IndexReader#maxDoc()
   */
  public int maxDoc() { return maxDoc; }

  /**
   * @return the number of documents holding at least one term for this field
   * @see Terms#getDocCount()
   */
  public int docCount() { return docCount; }

  /**
   * @return the total number of tokens for this field
   * @see Terms#getSumTotalTermFreq()
   */
  public long sumTotalTermFreq() { return sumTotalTermFreq; }

  /**
   * @return the total number of postings for this field
   * @see Terms#getSumDocFreq()
   */
  public long sumDocFreq() { return sumDocFreq; }
}

View File

@ -35,14 +35,18 @@ import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory; // javadoc
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ThreadInterruptedException;
/** Implements search over a single IndexReader.
@ -860,4 +864,34 @@ public class IndexSearcher implements Closeable {
public String toString() {
return "IndexSearcher(" + reader + "; executor=" + executor + ")";
}
/**
 * Returns {@link TermStatistics} for a term.
 * <p>
 * The result is assembled from the term's bytes together with the
 * document frequency and total term frequency reported by the supplied
 * {@link TermContext}.
 *
 * @param term    the term whose statistics are requested
 * @param context the context supplying docFreq and totalTermFreq for the term
 * @return a new {@link TermStatistics} carrying those three values
 * @throws IOException declared by the signature; nothing in this body visibly raises it
 * @lucene.experimental
 */
public TermStatistics termStatistics(Term term, TermContext context) throws IOException {
  return new TermStatistics(term.bytes(), context.docFreq(), context.totalTermFreq());
}
/**
 * Returns {@link CollectionStatistics} for a field.
 * <p>
 * The per-field numbers come from the {@link Terms} that
 * {@link MultiFields#getTerms} yields for this searcher's reader. When that
 * lookup returns {@code null}, docCount, sumTotalTermFreq and sumDocFreq are
 * all reported as zero. maxDoc is always taken from the reader.
 *
 * @param field name of the field
 * @return the statistics for {@code field}
 * @throws IOException if reading the statistics fails
 * @lucene.experimental
 */
public CollectionStatistics collectionStatistics(String field) throws IOException {
  final Terms fieldTerms = MultiFields.getTerms(reader, field);
  if (fieldTerms == null) {
    // no terms for the field: every field-level counter is zero
    return new CollectionStatistics(field, reader.maxDoc(), 0, 0, 0);
  }
  final int docsWithField = fieldTerms.getDocCount();
  final long tokenTotal = fieldTerms.getSumTotalTermFreq();
  final long postingTotal = fieldTerms.getSumDocFreq();
  return new CollectionStatistics(field, reader.maxDoc(), docsWithField, tokenTotal, postingTotal);
}
}

View File

@ -141,13 +141,15 @@ public class MultiPhraseQuery extends Query {
final ReaderContext context = searcher.getTopReaderContext();
// compute idf
ArrayList<TermContext> allTerms = new ArrayList<TermContext>();
ArrayList<TermStatistics> allTermStats = new ArrayList<TermStatistics>();
for(final Term[] terms: termArrays) {
for (Term term: terms) {
allTerms.add(TermContext.build(context, term, true));
TermContext termContext = TermContext.build(context, term, true);
allTermStats.add(searcher.termStatistics(term, termContext));
}
}
stats = similarity.computeStats(searcher, field, getBoost(), allTerms.toArray(new TermContext[allTerms.size()]));
stats = similarity.computeStats(searcher.collectionStatistics(field),
getBoost(), allTermStats.toArray(new TermStatistics[allTermStats.size()]));
}
@Override

View File

@ -190,9 +190,13 @@ public class PhraseQuery extends Query {
this.similarity = searcher.getSimilarityProvider().get(field);
final ReaderContext context = searcher.getTopReaderContext();
states = new TermContext[terms.size()];
for (int i = 0; i < terms.size(); i++)
states[i] = TermContext.build(context, terms.get(i), true);
stats = similarity.computeStats(searcher, field, getBoost(), states);
TermStatistics termStats[] = new TermStatistics[terms.size()];
for (int i = 0; i < terms.size(); i++) {
final Term term = terms.get(i);
states[i] = TermContext.build(context, term, true);
termStats[i] = searcher.termStatistics(term, states[i]);
}
stats = similarity.computeStats(searcher.collectionStatistics(field), getBoost(), termStats);
}
@Override

View File

@ -54,7 +54,10 @@ public class TermQuery extends Query {
assert termStates != null : "TermContext must not be null";
this.termStates = termStates;
this.similarity = searcher.getSimilarityProvider().get(term.field());
this.stats = similarity.computeStats(searcher, term.field(), getBoost(), termStates);
this.stats = similarity.computeStats(
searcher.collectionStatistics(term.field()),
getBoost(),
searcher.termStatistics(term, termStates));
}
@Override

View File

@ -0,0 +1,53 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader; // javadocs
import org.apache.lucene.util.BytesRef;
/**
 * Value holder for the statistics of one specific term.
 * All fields are final and assigned exactly once, in the constructor.
 * @lucene.experimental
 */
public class TermStatistics {
  private final BytesRef term;
  private final int docFreq;
  private final long totalTermFreq;

  /**
   * Stores the supplied values as-is; the {@code term} reference is kept, not copied.
   *
   * @param term          the term text
   * @param docFreq       number of documents the term occurs in
   * @param totalTermFreq total number of occurrences of the term
   */
  public TermStatistics(BytesRef term, int docFreq, long totalTermFreq) {
    this.term = term;
    this.docFreq = docFreq;
    this.totalTermFreq = totalTermFreq;
  }

  /** @return the term text */
  public BytesRef term() { return term; }

  /**
   * @return the number of documents this term occurs in
   * @see IndexReader#docFreq(String, BytesRef)
   */
  public int docFreq() { return docFreq; }

  /**
   * @return the total number of occurrences of this term
   * @see IndexReader#totalTermFreq(String, BytesRef)
   */
  public long totalTermFreq() { return totalTermFreq; }
}

View File

@ -20,14 +20,12 @@ package org.apache.lucene.search.similarities;
import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.TermContext;
/**
* BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker,
@ -75,15 +73,13 @@ public class BM25Similarity extends Similarity {
/** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>,
* or returns <code>1</code> if the index does not store sumTotalTermFreq (Lucene 3.x indexes
* or any field that omits frequency information). */
protected float avgFieldLength(IndexSearcher searcher, String field) throws IOException {
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), field);
if (terms == null) {
// field does not exist;
return 1f;
protected float avgFieldLength(CollectionStatistics collectionStats) {
final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
if (sumTotalTermFreq <= 0) {
return 1f; // field does not exist, or stat is unsupported
} else {
return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc());
}
long sumTotalTermFreq = terms.getSumTotalTermFreq();
long maxdoc = searcher.maxDoc();
return sumTotalTermFreq == -1 ? 1f : (float) (sumTotalTermFreq / (double) maxdoc);
}
/** The default implementation encodes <code>boost / sqrt(length)</code>
@ -131,19 +127,19 @@ public class BM25Similarity extends Similarity {
return encodeNormValue(state.getBoost(), numTerms);
}
public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
final int df = stats.docFreq();
final int max = searcher.maxDoc();
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final int df = termStats.docFreq();
final int max = collectionStats.maxDoc();
final float idf = idf(df, max);
return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
final int max = searcher.maxDoc();
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
final int max = collectionStats.maxDoc();
float idf = 0.0f;
final Explanation exp = new Explanation();
exp.setDescription("idf(), sum of:");
for (final TermContext stat : stats ) {
for (final TermStatistics stat : termStats ) {
final int df = stat.docFreq();
final float termIdf = idf(df, max);
exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
@ -154,10 +150,10 @@ public class BM25Similarity extends Similarity {
}
@Override
public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termStats) throws IOException {
Explanation idf = termStats.length == 1 ? idfExplain(termStats[0], searcher) : idfExplain(termStats, searcher);
public final Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
float avgdl = avgFieldLength(searcher, fieldName);
float avgdl = avgFieldLength(collectionStats);
// compute freq-independent part of bm25 equation across all norm values
float cache[] = new float[256];

View File

@ -17,11 +17,9 @@ package org.apache.lucene.search.similarities;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.search.TermStatistics;
/**
* Abstract superclass for language modeling Similarities. The following inner
@ -62,8 +60,8 @@ public abstract class LMSimilarity extends SimilarityBase {
* usual statistics.
*/
@Override
protected void fillBasicStats(BasicStats stats, IndexSearcher searcher, String fieldName, TermContext termContext) throws IOException {
super.fillBasicStats(stats, searcher, fieldName, termContext);
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
super.fillBasicStats(stats, collectionStats, termStats);
LMStats lmStats = (LMStats) stats;
lmStats.setCollectionProbability(collectionModel.computeProbability(stats));
}

View File

@ -21,10 +21,10 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
/**
* Implements the CombSUM method for combining evidence from multiple
@ -45,10 +45,10 @@ public class MultiSimilarity extends Similarity {
}
@Override
public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
public Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
Stats subStats[] = new Stats[sims.length];
for (int i = 0; i < subStats.length; i++) {
subStats[i] = sims[i].computeStats(searcher, fieldName, queryBoost, termContexts);
subStats[i] = sims[i].computeStats(collectionStats, queryBoost, termStats);
}
return new MultiStats(subStats);
}

View File

@ -26,11 +26,13 @@ import org.apache.lucene.index.IndexReader; // javadoc
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Terms; // javadoc
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.spans.SpanQuery; // javadoc
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat; // javadoc
@ -81,10 +83,10 @@ import org.apache.lucene.util.TermContext;
* <a name="querytime"/>
* At query-time, Queries interact with the Similarity via these steps:
* <ol>
* <li>The {@link #computeStats(IndexSearcher, String, float, TermContext...)} method is called a single time,
* <li>The {@link #computeStats(CollectionStatistics, float, TermStatistics...)} method is called a single time,
* allowing the implementation to compute any statistics (such as IDF, average document length, etc)
* across <i>the entire collection</i>. The {@link TermContext}s passed in are already positioned
* to the terms involved with the raw statistics involved, so a Similarity can freely use any combination
* across <i>the entire collection</i>. The {@link TermStatistics} passed in already contain
* the raw statistics involved, so a Similarity can freely use any combination
* of term statistics without causing any additional I/O. Lucene makes no assumption about what is
* stored in the returned {@link Similarity.Stats} object.
* <li>The query normalization process occurs a single time: {@link Similarity.Stats#getValueForNormalization()}
@ -128,7 +130,7 @@ public abstract class Similarity {
/**
* Compute any collection-level stats (e.g. IDF, average document length, etc) needed for scoring a query.
*/
public abstract Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException;
public abstract Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats);
/**
* returns a new {@link Similarity.ExactDocScorer}.

View File

@ -20,15 +20,12 @@ package org.apache.lucene.search.similarities;
import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.TermContext;
/**
* A subclass of {@code Similarity} that provides a simplified API for its
@ -71,12 +68,11 @@ public abstract class SimilarityBase extends Similarity {
}
@Override
public final Stats computeStats(IndexSearcher searcher, String fieldName,
float queryBoost, TermContext... termContexts) throws IOException {
BasicStats stats[] = new BasicStats[termContexts.length];
for (int i = 0; i < termContexts.length; i++) {
public final Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
BasicStats stats[] = new BasicStats[termStats.length];
for (int i = 0; i < termStats.length; i++) {
stats[i] = newStats(queryBoost);
fillBasicStats(stats[i], searcher, fieldName, termContexts[i]);
fillBasicStats(stats[i], collectionStats, termStats[i]);
}
return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
}
@ -88,13 +84,11 @@ public abstract class SimilarityBase extends Similarity {
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
* Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, IndexSearcher searcher,
String fieldName, TermContext termContext) throws IOException {
IndexReader reader = searcher.getIndexReader();
int numberOfDocuments = reader.maxDoc();
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
int numberOfDocuments = collectionStats.maxDoc();
int docFreq = termContext.docFreq();
long totalTermFreq = termContext.totalTermFreq();
int docFreq = termStats.docFreq();
long totalTermFreq = termStats.totalTermFreq();
// codec does not supply totalTermFreq: substitute docFreq
if (totalTermFreq == -1) {
@ -103,25 +97,19 @@ public abstract class SimilarityBase extends Similarity {
final long numberOfFieldTokens;
final float avgFieldLength;
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), fieldName);
if (terms == null) {
// field does not exist;
numberOfFieldTokens = 0;
avgFieldLength = 1;
} else {
long sumTotalTermFreq = terms.getSumTotalTermFreq();
long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
if (sumTotalTermFreq <= 0) {
// field does not exist;
// We have to provide something if codec doesnt supply these measures,
// or if someone omitted frequencies for the field... negative values cause
// NaN/Inf for some scorers.
if (sumTotalTermFreq == -1) {
numberOfFieldTokens = docFreq;
avgFieldLength = 1;
} else {
numberOfFieldTokens = sumTotalTermFreq;
avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
}
numberOfFieldTokens = docFreq;
avgFieldLength = 1;
} else {
numberOfFieldTokens = sumTotalTermFreq;
avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
}
// TODO: add sumDocFreq for field (numberOfFieldPostings)

View File

@ -22,9 +22,11 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.SmallFloat;
@ -575,15 +577,15 @@ public abstract class TFIDFSimilarity extends Similarity {
* is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
* In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute
*
* @param stats statistics of the term in question
* @param searcher the document collection being searched
* @param collectionStats collection-level statistics
* @param termStats term-level statistics for the term
* @return an Explain object that includes both an idf score factor
and an explanation for the term.
* @throws IOException
*/
public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
final int df = stats.docFreq();
final int max = searcher.maxDoc();
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final int df = termStats.docFreq();
final int max = collectionStats.maxDoc();
final float idf = idf(df, max);
return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
@ -595,19 +597,19 @@ public abstract class TFIDFSimilarity extends Similarity {
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param stats statistics of the terms in the phrase
* @param searcher the document collection being searched
* @param collectionStats collection-level statistics
* @param termStats term-level statistics for the terms in the phrase
* @return an Explain object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
* @throws IOException
*/
public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
final int max = searcher.maxDoc();
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
final int max = collectionStats.maxDoc();
float idf = 0.0f;
final Explanation exp = new Explanation();
exp.setDescription("idf(), sum of:");
for (final TermContext stat : stats ) {
for (final TermStatistics stat : termStats ) {
final int df = stat.docFreq();
final float termIdf = idf(df, max);
exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
@ -693,11 +695,10 @@ public abstract class TFIDFSimilarity extends Similarity {
public abstract float scorePayload(int doc, int start, int end, BytesRef payload);
@Override
public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost,
TermContext... termContexts) throws IOException {
final Explanation idf = termContexts.length == 1
? idfExplain(termContexts[0], searcher)
: idfExplain(termContexts, searcher);
public final Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
final Explanation idf = termStats.length == 1
? idfExplain(collectionStats, termStats[0])
: idfExplain(collectionStats, termStats);
return new IDFStats(idf, queryBoost);
}

View File

@ -48,10 +48,17 @@ public class SpanWeight extends Weight {
query.extractTerms(terms);
final ReaderContext context = searcher.getTopReaderContext();
final TermContext states[] = new TermContext[terms.size()];
final TermStatistics termStats[] = new TermStatistics[terms.size()];
int i = 0;
for (Term term : terms)
states[i++] = TermContext.build(context, term, true);
stats = similarity.computeStats(searcher, query.getField(), query.getBoost(), states);
for (Term term : terms) {
states[i] = TermContext.build(context, term, true);
termStats[i] = searcher.termStatistics(term, states[i]);
i++;
}
stats = similarity.computeStats(
searcher.collectionStatistics(query.getField()),
query.getBoost(),
termStats);
}
@Override

View File

@ -21,7 +21,6 @@ import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@ -50,7 +49,7 @@ public class TestOmitTf extends LuceneTestCase {
@Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(int docFreq, int numDocs) { return 1.0f; }
@Override public Explanation idfExplain(TermContext[] terms, IndexSearcher searcher) throws IOException {
@Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
return new Explanation(1.0f, "Inexplicable");
}
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 1.0f; }

View File

@ -22,12 +22,8 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.search.similarities.Similarity.Stats;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.PriorityQueue;
@ -250,7 +246,7 @@ final class JustCompileSearch {
static final class JustCompileSimilarity extends Similarity {
@Override
public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
public Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}

View File

@ -161,8 +161,8 @@ public class TestDocValuesScoring extends LuceneTestCase {
}
@Override
public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
return sim.computeStats(searcher, fieldName, queryBoost, termContexts);
public Stats computeStats(CollectionStatistics collectionStats, float queryBoost, TermStatistics... termStats) {
return sim.computeStats(collectionStats, queryBoost, termStats);
}
@Override

View File

@ -316,8 +316,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
return new DefaultSimilarity() {
@Override
public Explanation idfExplain(TermContext stats[],
IndexSearcher searcher) throws IOException {
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
return new Explanation(10f, "just a test");
}
};

View File

@ -18,7 +18,6 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
import java.io.IOException;
@ -50,7 +49,7 @@ public class TestSimilarity extends LuceneTestCase {
@Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(int docFreq, int numDocs) { return 1.0f; }
@Override public Explanation idfExplain(TermContext[] stats, IndexSearcher searcher) throws IOException {
@Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) {
return new Explanation(1.0f, "Inexplicable");
}
};

View File

@ -27,10 +27,12 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
@ -42,7 +44,6 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@ -346,7 +347,7 @@ public class TestPayloadNearQuery extends LuceneTestCase {
// idf used for phrase queries
@Override
public Explanation idfExplain(TermContext states[], IndexSearcher searcher) throws IOException {
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
return new Explanation(1.0f, "Inexplicable");
}
};

View File

@ -1,222 +0,0 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Map;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/**
* Index searcher implementation that takes an {@link BasicStats} instance and
* returns statistics accordingly. Most of the methods are not implemented, so
* it can only be used for Similarity unit testing.
*/
public class SpoofIndexSearcher extends IndexSearcher {
/**
 * Creates a searcher over a freshly built {@link SpoofIndexReader}.
 *
 * @param stats the statistics handed to the spoof reader
 */
public SpoofIndexSearcher(BasicStats stats) {
super(new SpoofIndexReader(stats));
}
/**
 * Index reader stub whose document counts and fields are driven by a
 * {@link BasicStats} instance. Everything outside that is a no-op or
 * returns {@code null}.
 */
public static class SpoofIndexReader extends IndexReader {
  /** Source of the document count reported by this reader. */
  protected BasicStats stats;
  /** The {@link Fields} instance handed out by {@link #fields()}. */
  protected SpoofFields fields;

  /**
   * @param stats statistics to report; also used to build the spoof fields
   */
  public SpoofIndexReader(BasicStats stats) {
    this.stats = stats;
    this.fields = new SpoofFields(stats);
  }

  // ------------------------- Stats-backed methods --------------------------

  /** Same value as {@link #maxDoc()}: the stats' number of documents. */
  @Override
  public int numDocs() { return stats.getNumberOfDocuments(); }

  /** Reports the stats' number of documents. */
  @Override
  public int maxDoc() { return stats.getNumberOfDocuments(); }

  /** Hands out the spoof fields created in the constructor. */
  @Override
  public Fields fields() throws IOException { return fields; }

  /** Always a one-element collection holding {@code "spoof"}, whatever the option. */
  @Override
  public Collection<String> getFieldNames(FieldOption fldOption) {
    return Arrays.asList("spoof");
  }

  /** Builds a new atomic context around this reader on every call. */
  @Override
  public ReaderContext getTopReaderContext() {
    return new AtomicReaderContext(this);
  }

  /** This reader never reports deletions. */
  @Override
  public boolean hasDeletions() { return false; }

  // ------------------------ Not implemented methods ------------------------

  @Override
  public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
    return null;
  }

  @Override
  public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
    return null;
  }

  @Override
  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
    // intentionally empty
  }

  @Override
  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
    // intentionally empty
  }

  @Override
  public void document(int docID, StoredFieldVisitor visitor) throws CorruptIndexException, IOException {
    // intentionally empty
  }

  @Override
  public byte[] norms(String field) throws IOException {
    return null;
  }

  @Override
  protected void doSetNorm(int doc, String field, byte value) throws CorruptIndexException, IOException {
    // intentionally empty
  }

  @Override
  public PerDocValues perDocValues() throws IOException {
    return null;
  }

  @Override
  protected void doDelete(int docNum) throws CorruptIndexException, IOException {
    // intentionally empty
  }

  @Override
  protected void doUndeleteAll() throws CorruptIndexException, IOException {
    // intentionally empty
  }

  @Override
  protected void doCommit(Map<String,String> commitUserData) throws IOException {
    // intentionally empty
  }

  @Override
  protected void doClose() throws IOException {
    // intentionally empty
  }

  @Override
  public Bits getLiveDocs() {
    return null;
  }
}
/** Spoof Fields class for Similarity testing. */
public static class SpoofFields extends Fields {
/** The stats the object has to return. */
protected SpoofTerms terms;
public SpoofFields(BasicStats stats) {
this.terms = new SpoofTerms(stats);
}
@Override
public Terms terms(String field) throws IOException {
return terms;
}
// ------------------------ Not implemented methods ------------------------
@Override
public FieldsEnum iterator() throws IOException {
return null;
}
}
/** Spoof Terms class for Similarity testing. */
public static class SpoofTerms extends Terms {
/** The stats the object has to return. */
protected BasicStats stats;
public SpoofTerms(BasicStats stats) {
this.stats = stats;
}
@Override
public long getSumTotalTermFreq() throws IOException {
return stats.getNumberOfFieldTokens();
}
@Override
public long getSumDocFreq() throws IOException {
return stats.getDocFreq();
}
@Override
public int getDocCount() throws IOException {
return stats.getDocFreq();
}
// ------------------------ Not implemented methods ------------------------
@Override
public TermsEnum iterator() throws IOException {
return null;
}
@Override
public long getUniqueTermCount() throws IOException {
return -1;
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
return null;
}
}
}

View File

@ -30,12 +30,15 @@ import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
import org.junit.Ignore;
@ -172,7 +175,14 @@ public class TestSimilarityBase extends LuceneTestCase {
stats.setTotalTermFreq(TOTAL_TERM_FREQ);
return stats;
}
/**
 * Converts the given stats into {@link CollectionStatistics} for the
 * {@code "spoof"} field. Only the document count and the number of field
 * tokens are taken from {@code stats}; {@code docCount} and
 * {@code sumDocFreq} are passed as {@code -1}.
 */
private CollectionStatistics toCollectionStats(BasicStats stats) {
  final int maxDoc = stats.getNumberOfDocuments();
  final long sumTotalTermFreq = stats.getNumberOfFieldTokens();
  // NOTE(review): -1 presumably marks docCount / sumDocFreq as unavailable — confirm.
  return new CollectionStatistics("spoof", maxDoc, -1, sumTotalTermFreq, -1);
}
/**
 * Converts the given stats into {@link TermStatistics} for a fixed dummy
 * term, carrying over the document frequency and total term frequency
 * recorded in {@code stats}.
 */
private TermStatistics toTermStats(BasicStats stats) {
  final BytesRef term = new BytesRef("spoofyText");
  return new TermStatistics(term, stats.getDocFreq(), stats.getTotalTermFreq());
}
/**
* The generic test core called by all unit test methods. It calls the
* {@link SimilarityBase#score(BasicStats, float, int)} method of all
@ -180,17 +190,11 @@ public class TestSimilarityBase extends LuceneTestCase {
* is a finite positive real number.
*/
private void unitTestCore(BasicStats stats, float freq, int docLen)
throws IOException {
// We have to fake everything, because computeStats() can be overridden and
// there is no way to inject false data after fillBasicStats().
SpoofIndexSearcher searcher = new SpoofIndexSearcher(stats);
TermContext tc = new TermContext(
searcher.getIndexReader().getTopReaderContext(),
new OrdTermState(), 0, stats.getDocFreq(), stats.getTotalTermFreq());
throws IOException {
for (SimilarityBase sim : sims) {
BasicStats realStats = (BasicStats) sim.computeStats(new SpoofIndexSearcher(stats),
"spoof", stats.getTotalBoost(), tc);
BasicStats realStats = (BasicStats) sim.computeStats(toCollectionStats(stats),
stats.getTotalBoost(),
toTermStats(stats));
float score = sim.score(realStats, freq, docLen);
float explScore = sim.explain(
realStats, 1, new Explanation(freq, "freq"), docLen).getValue();
@ -520,16 +524,10 @@ public class TestSimilarityBase extends LuceneTestCase {
*/
private void correctnessTestCore(SimilarityBase sim, float gold)
throws IOException {
// We have to fake everything, because computeStats() can be overridden and
// there is no way to inject false data after fillBasicStats().
BasicStats stats = createStats();
SpoofIndexSearcher searcher = new SpoofIndexSearcher(stats);
TermContext tc = new TermContext(
searcher.getIndexReader().getTopReaderContext(),
new OrdTermState(), 0, stats.getDocFreq(), stats.getTotalTermFreq());
BasicStats realStats = (BasicStats) sim.computeStats(
searcher, "spoof", stats.getTotalBoost(), tc);
BasicStats realStats = (BasicStats) sim.computeStats(toCollectionStats(stats),
stats.getTotalBoost(),
toTermStats(stats));
float score = sim.score(realStats, FREQ, DOC_LEN);
assertEquals(
sim.toString() + " score not correct.", gold, score, FLOAT_EPSILON);