merged with trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/LUCENE2793@1144189 13f79535-47bb-0310-9956-ffa450edef68
Simon Willnauer 2011-07-08 08:09:32 +00:00
commit feb9739bf7
141 changed files with 2656 additions and 2009 deletions

View File

@ -156,6 +156,12 @@ Changes in backwards compatibility policy
  the queries module and can be found at o.a.l.queries.function. See MIGRATE.txt
  for more information (Chris Male)
* LUCENE-2392: Decoupled vector space scoring from Query/Weight/Scorer. If you
extended Similarity directly before, you should extend TFIDFSimilarity instead.
Similarity is now a lower-level API to implement other scoring algorithms.
See MIGRATE.txt for more details.
  (David Nemeskey, Simon Willnauer, Mike McCandless, Robert Muir)
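  For illustration only (the class name and length-norm formula below are placeholders, not part of this change), a custom similarity under the reorganized API extends TFIDFSimilarity, here via DefaultSimilarity, and encodes the norm itself; this is the same byte-returning computeNorm/encodeNormValue pattern this merge applies to SweetSpotSimilarity further down:

  import org.apache.lucene.index.FieldInvertState;
  import org.apache.lucene.search.DefaultSimilarity;

  public class MySimilarity extends DefaultSimilarity {
    @Override
    public byte computeNorm(FieldInvertState state) {
      // discount overlapping tokens, then encode the float norm into the single index byte
      final int numTokens = state.getLength() - state.getNumOverlap();
      return encodeNormValue(state.getBoost() * (float) (1.0 / Math.sqrt(numTokens)));
    }
  }
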
Changes in Runtime Behavior
* LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you

View File

@ -382,3 +382,13 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
 - o.a.l.search.function.ShortFieldSource -> o.a.l.queries.function.valuesource.ShortFieldSource
 - o.a.l.search.function.ValueSource -> o.a.l.queries.function.ValueSource
 - o.a.l.search.function.ValueSourceQuery -> o.a.l.queries.function.FunctionQuery
* LUCENE-2392: Enable flexible scoring:
  The existing "Similarity" API is now TFIDFSimilarity; if you were extending
  Similarity before, you should likely extend this instead.
  Weight.normalize no longer takes a norm value that incorporates the top-level
  boost from outer queries such as BooleanQuery; instead it takes two parameters,
  the outer boost (topLevelBoost) and the norm. Weight.sumOfSquaredWeights has
  been renamed to Weight.getValueForNormalization().
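  The corresponding change to a custom Weight is mechanical. A rough before/after sketch (the queryWeight field and the method bodies are illustrative; the rest of the class is omitted):

  // Before: the single norm argument already incorporated the outer boost.
  public float sumOfSquaredWeights() { return queryWeight * queryWeight; }
  public void normalize(float norm) { queryWeight *= norm; }

  // After: the method is renamed and the outer boost is passed separately.
  public float getValueForNormalization() { return queryWeight * queryWeight; }
  public void normalize(float norm, float topLevelBoost) { queryWeight *= norm * topLevelBoost; }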

View File

@ -331,7 +331,7 @@
<artifact:deploy file="${build.dir}/${maven.project.build.finalName}.jar"> <artifact:deploy file="${build.dir}/${maven.project.build.finalName}.jar">
<artifact-attachments/> <artifact-attachments/>
<remoteRepository url="${m2.repository.url}"> <remoteRepository url="${m2.repository.url}">
<authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}"/> <authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}" password="${m2.repository.password}"/>
</remoteRepository> </remoteRepository>
<pom refid="maven.project"/> <pom refid="maven.project"/>
</artifact:deploy> </artifact:deploy>
@ -351,7 +351,7 @@
<artifact:pom id="maven.project" file="${maven.build.dir}/@{pom.xml}" /> <artifact:pom id="maven.project" file="${maven.build.dir}/@{pom.xml}" />
<artifact:deploy file="@{jar.file}"> <artifact:deploy file="@{jar.file}">
<remoteRepository url="${m2.repository.url}"> <remoteRepository url="${m2.repository.url}">
<authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}"/> <authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}" password="${m2.repository.password}"/>
</remoteRepository> </remoteRepository>
<pom refid="maven.project"/> <pom refid="maven.project"/>
</artifact:deploy> </artifact:deploy>

View File

@ -240,8 +240,7 @@ public class InstantiatedIndexWriter implements Closeable {
final FieldInvertState invertState = new FieldInvertState(); final FieldInvertState invertState = new FieldInvertState();
invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost()); invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost());
invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength); invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
final float norm = similarityProvider.get(fieldName).computeNorm(invertState); normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).computeNorm(invertState);
normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).encodeNormValue(norm);
} else { } else {
System.currentTimeMillis(); System.currentTimeMillis();
} }

View File

@ -51,7 +51,6 @@ import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorMapper; import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.search.Collector; import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
@ -1202,15 +1201,14 @@ public class MemoryIndex {
int numOverlapTokens = info != null ? info.numOverlapTokens : 0; int numOverlapTokens = info != null ? info.numOverlapTokens : 0;
float boost = info != null ? info.getBoost() : 1.0f; float boost = info != null ? info.getBoost() : 1.0f;
FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost); FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost);
float n = fieldSim.computeNorm(invertState); byte norm = fieldSim.computeNorm(invertState);
byte norm = fieldSim.encodeNormValue(n);
norms = new byte[] {norm}; norms = new byte[] {norm};
// cache it for future reuse // cache it for future reuse
cachedNorms = norms; cachedNorms = norms;
cachedFieldName = fieldName; cachedFieldName = fieldName;
cachedSimilarity = sim; cachedSimilarity = sim;
if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" + norm + ":" + numTokens); if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + norm + ":" + numTokens);
} }
return norms; return norms;
} }

View File

@ -147,7 +147,7 @@ public class FieldNormModifier {
for (int d = 0; d < termCounts.length; d++) { for (int d = 0; d < termCounts.length; d++) {
if (liveDocs == null || liveDocs.get(d)) { if (liveDocs == null || liveDocs.get(d)) {
invertState.setLength(termCounts[d]); invertState.setLength(termCounts[d]);
subReader.setNorm(d, field, fieldSim.encodeNormValue(fieldSim.computeNorm(invertState))); subReader.setNorm(d, field, fieldSim.computeNorm(invertState));
} }
} }
} }

View File

@ -25,7 +25,6 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.DocValuesConsumer;
import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer; import org.apache.lucene.index.codecs.FieldsProducer;
@ -58,7 +57,7 @@ public class AppendingCodec extends Codec {
public static String CODEC_NAME = "Appending"; public static String CODEC_NAME = "Appending";
public AppendingCodec() { public AppendingCodec() {
name = CODEC_NAME; super(CODEC_NAME);
} }
@Override @Override
@ -138,22 +137,22 @@ public class AppendingCodec extends Codec {
StandardPostingsReader.files(dir, segmentInfo, codecId, files); StandardPostingsReader.files(dir, segmentInfo, codecId, files);
BlockTermsReader.files(dir, segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS());
} }
@Override @Override
public void getExtensions(Set<String> extensions) { public void getExtensions(Set<String> extensions) {
StandardCodec.getStandardExtensions(extensions); StandardCodec.getStandardExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
} }
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context);
} }
} }

View File

@ -106,7 +106,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
* discountOverlaps is true by default or true for this * discountOverlaps is true by default or true for this
* specific field. */ * specific field. */
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
final int numTokens; final int numTokens;
if (discountOverlaps) if (discountOverlaps)
@ -114,7 +114,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
else else
numTokens = state.getLength(); numTokens = state.getLength();
return state.getBoost() * computeLengthNorm(numTokens); return encodeNormValue(state.getBoost() * computeLengthNorm(numTokens));
} }
/** /**

View File

@ -70,7 +70,7 @@ public class WindowsDirectory extends FSDirectory {
@Override @Override
public IndexInput openInput(String name, IOContext context) throws IOException { public IndexInput openInput(String name, IOContext context) throws IOException {
ensureOpen(); ensureOpen();
return new WindowsIndexInput(new File(getDirectory(), name), DEFAULT_BUFFERSIZE); return new WindowsIndexInput(new File(getDirectory(), name), Math.max(BufferedIndexInput.bufferSize(context), DEFAULT_BUFFERSIZE));
} }
protected static class WindowsIndexInput extends BufferedIndexInput { protected static class WindowsIndexInput extends BufferedIndexInput {

View File

@ -49,8 +49,8 @@ public class TestFieldNormModifier extends LuceneTestCase {
public Similarity get(String field) { public Similarity get(String field) {
return new DefaultSimilarity() { return new DefaultSimilarity() {
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()); return encodeNormValue(state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()));
} }
}; };
} }

View File

@ -21,6 +21,7 @@ package org.apache.lucene.misc;
import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
@ -58,15 +59,15 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i); invertState.setLength(i);
assertEquals("3,10: spot i="+i, assertEquals("3,10: spot i="+i,
1.0f, 1.0f,
s.computeNorm(invertState), ss.decodeNormValue(s.computeNorm(invertState)),
0.0f); 0.0f);
} }
for (int i = 10; i < 1000; i++) { for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9); invertState.setLength(i-9);
final float normD = d.computeNorm(invertState); final byte normD = d.computeNorm(invertState);
invertState.setLength(i); invertState.setLength(i);
final float normS = s.computeNorm(invertState); final byte normS = s.computeNorm(invertState);
assertEquals("3,10: 10<x : i="+i, assertEquals("3,10: 10<x : i="+i,
normD, normD,
normS, normS,
@ -104,14 +105,14 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i); invertState.setLength(i);
assertEquals("f: 3,10: spot i="+i, assertEquals("f: 3,10: spot i="+i,
1.0f, 1.0f,
sp.get("foo").computeNorm(invertState), ss.decodeNormValue(sp.get("foo").computeNorm(invertState)),
0.0f); 0.0f);
} }
for (int i = 10; i < 1000; i++) { for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9); invertState.setLength(i-9);
final float normD = d.computeNorm(invertState); final byte normD = d.computeNorm(invertState);
invertState.setLength(i); invertState.setLength(i);
final float normS = sp.get("foo").computeNorm(invertState); final byte normS = sp.get("foo").computeNorm(invertState);
assertEquals("f: 3,10: 10<x : i="+i, assertEquals("f: 3,10: 10<x : i="+i,
normD, normD,
normS, normS,
@ -121,21 +122,21 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i); invertState.setLength(i);
assertEquals("f: 8,13: spot i="+i, assertEquals("f: 8,13: spot i="+i,
1.0f, 1.0f,
sp.get("bar").computeNorm(invertState), ss.decodeNormValue(sp.get("bar").computeNorm(invertState)),
0.0f); 0.0f);
} }
for (int i = 6; i <=9; i++) { for (int i = 6; i <=9; i++) {
invertState.setLength(i); invertState.setLength(i);
assertEquals("f: 6,9: spot i="+i, assertEquals("f: 6,9: spot i="+i,
1.0f, 1.0f,
sp.get("yak").computeNorm(invertState), ss.decodeNormValue(sp.get("yak").computeNorm(invertState)),
0.0f); 0.0f);
} }
for (int i = 13; i < 1000; i++) { for (int i = 13; i < 1000; i++) {
invertState.setLength(i-12); invertState.setLength(i-12);
final float normD = d.computeNorm(invertState); final byte normD = d.computeNorm(invertState);
invertState.setLength(i); invertState.setLength(i);
final float normS = sp.get("bar").computeNorm(invertState); final byte normS = sp.get("bar").computeNorm(invertState);
assertEquals("f: 8,13: 13<x : i="+i, assertEquals("f: 8,13: 13<x : i="+i,
normD, normD,
normS, normS,
@ -143,9 +144,9 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
} }
for (int i = 9; i < 1000; i++) { for (int i = 9; i < 1000; i++) {
invertState.setLength(i-8); invertState.setLength(i-8);
final float normD = d.computeNorm(invertState); final byte normD = d.computeNorm(invertState);
invertState.setLength(i); invertState.setLength(i);
final float normS = sp.get("yak").computeNorm(invertState); final byte normS = sp.get("yak").computeNorm(invertState);
assertEquals("f: 6,9: 9<x : i="+i, assertEquals("f: 6,9: 9<x : i="+i,
normD, normD,
normS, normS,
@ -157,8 +158,8 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
for (int i = 9; i < 1000; i++) { for (int i = 9; i < 1000; i++) {
invertState.setLength(i); invertState.setLength(i);
final float normSS = sp.get("a").computeNorm(invertState); final byte normSS = sp.get("a").computeNorm(invertState);
final float normS = sp.get("b").computeNorm(invertState); final byte normS = sp.get("b").computeNorm(invertState);
assertTrue("s: i="+i+" : a="+normSS+ assertTrue("s: i="+i+" : a="+normSS+
" < b="+normS, " < b="+normS,
normSS < normS); normSS < normS);
@ -170,8 +171,8 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
SweetSpotSimilarity ss = new SweetSpotSimilarity(); SweetSpotSimilarity ss = new SweetSpotSimilarity();
Similarity d = new DefaultSimilarity(); TFIDFSimilarity d = new DefaultSimilarity();
Similarity s = ss; TFIDFSimilarity s = ss;
// tf equal // tf equal
@ -222,7 +223,7 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
}; };
ss.setHyperbolicTfFactors(3.3f, 7.7f, Math.E, 5.0f); ss.setHyperbolicTfFactors(3.3f, 7.7f, Math.E, 5.0f);
Similarity s = ss; TFIDFSimilarity s = ss;
for (int i = 1; i <=1000; i++) { for (int i = 1; i <=1000; i++) {
assertTrue("MIN tf: i="+i+" : s="+s.tf(i), assertTrue("MIN tf: i="+i+" : s="+s.tf(i),

View File

@ -54,8 +54,8 @@ public class TestLengthNormModifier extends LuceneTestCase {
public Similarity get(String field) { public Similarity get(String field) {
return new DefaultSimilarity() { return new DefaultSimilarity() {
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()); return encodeNormValue(state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()));
} }
}; };
} }
@ -175,8 +175,8 @@ public class TestLengthNormModifier extends LuceneTestCase {
public Similarity get(String field) { public Similarity get(String field) {
return new DefaultSimilarity() { return new DefaultSimilarity() {
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()); return encodeNormValue(state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()));
} }
}; };
} }

View File

@ -51,7 +51,11 @@ import org.apache.lucene.util.PriorityQueue;
*/ */
public class FuzzyLikeThisQuery extends Query public class FuzzyLikeThisQuery extends Query
{ {
static Similarity sim=new DefaultSimilarity(); // TODO: generalize this query (at least it should not reuse this static sim!
// a better way might be to convert this into multitermquery rewrite methods.
// the rewrite method can 'average' the TermContext's term statistics (docfreq,totalTermFreq)
// provided to TermQuery, so that the general idea is agnostic to any scoring system...
static TFIDFSimilarity sim=new DefaultSimilarity();
Query rewrittenQuery=null; Query rewrittenQuery=null;
ArrayList<FieldVals> fieldVals=new ArrayList<FieldVals>(); ArrayList<FieldVals> fieldVals=new ArrayList<FieldVals>();
Analyzer analyzer; Analyzer analyzer;

View File

@ -44,6 +44,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
@ -285,7 +286,7 @@ public final class MoreLikeThis {
/** /**
* For idf() calculations. * For idf() calculations.
*/ */
private Similarity similarity;// = new DefaultSimilarity(); private TFIDFSimilarity similarity;// = new DefaultSimilarity();
/** /**
* IndexReader to use * IndexReader to use
@ -320,17 +321,17 @@ public final class MoreLikeThis {
this(ir, new DefaultSimilarity()); this(ir, new DefaultSimilarity());
} }
public MoreLikeThis(IndexReader ir, Similarity sim){ public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim){
this.ir = ir; this.ir = ir;
this.similarity = sim; this.similarity = sim;
} }
public Similarity getSimilarity() { public TFIDFSimilarity getSimilarity() {
return similarity; return similarity;
} }
public void setSimilarity(Similarity similarity) { public void setSimilarity(TFIDFSimilarity similarity) {
this.similarity = similarity; this.similarity = similarity;
} }

View File

@ -81,13 +81,13 @@ public abstract class AbstractField implements Fieldable {
* default, in the {@link * default, in the {@link
* org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method, the boost value is multiplied * org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method, the boost value is multiplied
* by the length normalization factor and then * by the length normalization factor and then
* rounded by {@link org.apache.lucene.search.Similarity#encodeNormValue(float)} before it is stored in the * rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow * index. One should attempt to ensure that this product does not overflow
* the range of that encoding. * the range of that encoding.
* *
* @see org.apache.lucene.document.Document#setBoost(float) * @see org.apache.lucene.document.Document#setBoost(float)
* @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState) * @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.Similarity#encodeNormValue(float) * @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)
*/ */
public void setBoost(float boost) { public void setBoost(float boost) {
this.boost = boost; this.boost = boost;

View File

@ -48,13 +48,13 @@ public interface Fieldable {
* default, in the {@link * default, in the {@link
* org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method, the boost value is multiplied * org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method, the boost value is multiplied
* by the length normalization factor * by the length normalization factor
* and then rounded by {@link org.apache.lucene.search.Similarity#encodeNormValue(float)} before it is stored in the * and then rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow * index. One should attempt to ensure that this product does not overflow
* the range of that encoding. * the range of that encoding.
* *
* @see org.apache.lucene.document.Document#setBoost(float) * @see org.apache.lucene.document.Document#setBoost(float)
* @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState) * @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.Similarity#encodeNormValue(float) * @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)
*/ */
void setBoost(float boost); void setBoost(float boost);

View File

@ -32,7 +32,6 @@ import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo; import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IOContext.Context;
import org.apache.lucene.util.BitVector; import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.ByteBlockPool.Allocator; import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
@ -546,7 +545,7 @@ public class DocumentsWriterPerThread {
PerDocWriteState newPerDocWriteState(int codecId) { PerDocWriteState newPerDocWriteState(int codecId) {
assert segment != null; assert segment != null;
return new PerDocWriteState(infoStream, directory, segment, fieldInfos, bytesUsed, codecId); return new PerDocWriteState(infoStream, directory, segment, fieldInfos, bytesUsed, codecId, IOContext.DEFAULT);
} }
void setInfoStream(PrintStream infoStream) { void setInfoStream(PrintStream infoStream) {

View File

@ -31,9 +31,12 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;
import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.ArrayList;
/** /**
* Class responsible for access to stored document fields. * Class responsible for access to stored document fields.
@ -42,7 +45,7 @@ import java.io.Reader;
* *
* @lucene.internal * @lucene.internal
*/ */
public final class FieldsReader implements Cloneable { public final class FieldsReader implements Cloneable, Closeable {
private final static int FORMAT_SIZE = 4; private final static int FORMAT_SIZE = 4;
private final FieldInfos fieldInfos; private final FieldInfos fieldInfos;
@ -180,21 +183,11 @@ public final class FieldsReader implements Cloneable {
*/ */
public final void close() throws IOException { public final void close() throws IOException {
if (!closed) { if (!closed) {
if (fieldsStream != null) {
fieldsStream.close();
}
if (isOriginal) { if (isOriginal) {
if (cloneableFieldsStream != null) { IOUtils.closeSafely(false, fieldsStream, indexStream, fieldsStreamTL, cloneableFieldsStream, cloneableIndexStream);
cloneableFieldsStream.close(); } else {
} IOUtils.closeSafely(false, fieldsStream, indexStream, fieldsStreamTL);
if (cloneableIndexStream != null) {
cloneableIndexStream.close();
}
} }
if (indexStream != null) {
indexStream.close();
}
fieldsStreamTL.close();
closed = true; closed = true;
} }
} }

View File

@ -1025,7 +1025,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
public abstract byte[] norms(String field) throws IOException; public abstract byte[] norms(String field) throws IOException;
/** Expert: Resets the normalization factor for the named field of the named /** Expert: Resets the normalization factor for the named field of the named
* document. The norm represents the product of the field's {@link * document. By default, the norm represents the product of the field's {@link
* org.apache.lucene.document.Fieldable#setBoost(float) boost} and its * org.apache.lucene.document.Fieldable#setBoost(float) boost} and its
* length normalization}. Thus, to preserve the length normalization * length normalization}. Thus, to preserve the length normalization
* values when resetting this, one should base the new value upon the old. * values when resetting this, one should base the new value upon the old.
@ -1034,7 +1034,8 @@ public abstract class IndexReader implements Cloneable,Closeable {
* this method throws {@link IllegalStateException}. * this method throws {@link IllegalStateException}.
* *
* @see #norms(String) * @see #norms(String)
* @see Similarity#decodeNormValue(byte) * @see Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.DefaultSimilarity#decodeNormValue(byte)
* @throws StaleReaderException if the index has changed * @throws StaleReaderException if the index has changed
* since this reader was opened * since this reader was opened
* @throws CorruptIndexException if the index is corrupt * @throws CorruptIndexException if the index is corrupt
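For example, to preserve the length normalization a caller can decode the stored byte with the default similarity, scale it, and re-encode it. This sketch assumes a reader, a docID, and a "body" field, none of which come from the diff:

DefaultSimilarity sim = new DefaultSimilarity();
float oldNorm = sim.decodeNormValue(reader.norms("body")[docID]);
reader.setNorm(docID, "body", sim.encodeNormValue(oldNorm * 2.0f)); // scale the boost, keep the length normalization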

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.values.IndexDocValues;
import org.apache.lucene.index.values.MultiIndexDocValues; import org.apache.lucene.index.values.MultiIndexDocValues;
import org.apache.lucene.index.values.ValueType; import org.apache.lucene.index.values.ValueType;
import org.apache.lucene.index.values.MultiIndexDocValues.DocValuesIndex; import org.apache.lucene.index.values.MultiIndexDocValues.DocValuesIndex;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.ReaderUtil.Gather; import org.apache.lucene.util.ReaderUtil.Gather;
@ -151,20 +152,7 @@ public class MultiPerDocValues extends PerDocValues {
} }
public void close() throws IOException { public void close() throws IOException {
final PerDocValues[] perDocValues = this.subs; IOUtils.closeSafely(false, this.subs);
IOException ex = null;
for (PerDocValues values : perDocValues) {
try {
values.close();
} catch (IOException e) {
if (ex == null) {
ex = e;
}
}
}
if (ex != null) {
throw ex;
}
} }
@Override @Override

View File

@ -72,8 +72,7 @@ final class NormsWriterPerField extends InvertedDocEndConsumerPerField implement
assert norms.length == upto; assert norms.length == upto;
norms = ArrayUtil.grow(norms, 1+upto); norms = ArrayUtil.grow(norms, 1+upto);
} }
final float norm = similarity.computeNorm(fieldState); norms[upto] = similarity.computeNorm(fieldState);
norms[upto] = similarity.encodeNormValue(norm);
docIDs[upto] = docState.docID; docIDs[upto] = docState.docID;
upto++; upto++;
} }

View File

@ -20,6 +20,7 @@ import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.index.codecs.PerDocConsumer; import org.apache.lucene.index.codecs.PerDocConsumer;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
/** /**
* Encapsulates all necessary state to initiate a {@link PerDocConsumer} and * Encapsulates all necessary state to initiate a {@link PerDocConsumer} and
@ -35,10 +36,11 @@ public class PerDocWriteState {
public final AtomicLong bytesUsed; public final AtomicLong bytesUsed;
public final SegmentCodecs segmentCodecs; public final SegmentCodecs segmentCodecs;
public final int codecId; public final int codecId;
public final IOContext context;
PerDocWriteState(PrintStream infoStream, Directory directory, PerDocWriteState(PrintStream infoStream, Directory directory,
String segmentName, FieldInfos fieldInfos, AtomicLong bytesUsed, String segmentName, FieldInfos fieldInfos, AtomicLong bytesUsed,
int codecId) { int codecId, IOContext context) {
this.infoStream = infoStream; this.infoStream = infoStream;
this.directory = directory; this.directory = directory;
this.segmentName = segmentName; this.segmentName = segmentName;
@ -46,6 +48,7 @@ public class PerDocWriteState {
this.segmentCodecs = fieldInfos.buildSegmentCodecs(false); this.segmentCodecs = fieldInfos.buildSegmentCodecs(false);
this.codecId = codecId; this.codecId = codecId;
this.bytesUsed = bytesUsed; this.bytesUsed = bytesUsed;
this.context = context;
} }
PerDocWriteState(SegmentWriteState state) { PerDocWriteState(SegmentWriteState state) {
@ -56,6 +59,7 @@ public class PerDocWriteState {
fieldInfos = state.fieldInfos; fieldInfos = state.fieldInfos;
codecId = state.codecId; codecId = state.codecId;
bytesUsed = new AtomicLong(0); bytesUsed = new AtomicLong(0);
context = state.context;
} }
PerDocWriteState(PerDocWriteState state, int codecId) { PerDocWriteState(PerDocWriteState state, int codecId) {
@ -66,5 +70,6 @@ public class PerDocWriteState {
this.segmentCodecs = state.segmentCodecs; this.segmentCodecs = state.segmentCodecs;
this.codecId = codecId; this.codecId = codecId;
this.bytesUsed = state.bytesUsed; this.bytesUsed = state.bytesUsed;
this.context = state.context;
} }
} }

View File

@ -52,7 +52,7 @@ final class PerFieldCodecWrapper extends Codec {
private final SegmentCodecs segmentCodecs; private final SegmentCodecs segmentCodecs;
PerFieldCodecWrapper(SegmentCodecs segmentCodecs) { PerFieldCodecWrapper(SegmentCodecs segmentCodecs) {
name = "PerField"; super("PerField");
this.segmentCodecs = segmentCodecs; this.segmentCodecs = segmentCodecs;
} }

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.IOUtils;
/** Holds core readers that are shared (unchanged) when /** Holds core readers that are shared (unchanged) when
* SegmentReader is cloned or reopened */ * SegmentReader is cloned or reopened */
@ -120,33 +121,9 @@ final class SegmentCoreReaders {
} }
synchronized void decRef() throws IOException { synchronized void decRef() throws IOException {
if (ref.decrementAndGet() == 0) { if (ref.decrementAndGet() == 0) {
IOUtils.closeSafely(false, fields, perDocProducer, termVectorsReaderOrig,
if (fields != null) { fieldsReaderOrig, cfsReader, storeCFSReader);
fields.close();
}
if (perDocProducer != null) {
perDocProducer.close();
}
if (termVectorsReaderOrig != null) {
termVectorsReaderOrig.close();
}
if (fieldsReaderOrig != null) {
fieldsReaderOrig.close();
}
if (cfsReader != null) {
cfsReader.close();
}
if (storeCFSReader != null) {
storeCFSReader.close();
}
// Now, notify any ReaderFinished listeners: // Now, notify any ReaderFinished listeners:
if (owner != null) { if (owner != null) {
owner.notifyReaderFinishedListeners(); owner.notifyReaderFinishedListeners();

View File

@ -30,7 +30,6 @@ import java.util.Set;
import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.DefaultSegmentInfosWriter; import org.apache.lucene.index.codecs.DefaultSegmentInfosWriter;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
@ -248,7 +247,7 @@ public final class SegmentInfo implements Cloneable {
} }
final Directory dirToTest; final Directory dirToTest;
if (isCompoundFile) { if (isCompoundFile) {
dirToTest = dir.openCompoundInput(IndexFileNames.segmentFileName(storesSegment, "", ext), IOContext.READONCE ); dirToTest = dir.openCompoundInput(IndexFileNames.segmentFileName(storesSegment, "", ext), IOContext.READONCE);
} else { } else {
dirToTest = dir; dirToTest = dir;
} }

View File

@ -119,6 +119,7 @@ final class SegmentMerger {
mergedDocs = mergeFields(); mergedDocs = mergeFields();
mergeTerms(); mergeTerms();
mergePerDoc();
mergeNorms(); mergeNorms();
if (fieldInfos.hasVectors()) if (fieldInfos.hasVectors())
@ -484,17 +485,10 @@ final class SegmentMerger {
int docBase = 0; int docBase = 0;
final List<Fields> fields = new ArrayList<Fields>(); final List<Fields> fields = new ArrayList<Fields>();
final List<ReaderUtil.Slice> slices = new ArrayList<ReaderUtil.Slice>(); final List<ReaderUtil.Slice> slices = new ArrayList<ReaderUtil.Slice>();
final List<Bits> bits = new ArrayList<Bits>(); final List<Bits> bits = new ArrayList<Bits>();
final List<Integer> bitsStarts = new ArrayList<Integer>(); final List<Integer> bitsStarts = new ArrayList<Integer>();
// TODO: move this into its own method - this merges currently only docvalues
final List<PerDocValues> perDocProducers = new ArrayList<PerDocValues>();
final List<ReaderUtil.Slice> perDocSlices = new ArrayList<ReaderUtil.Slice>();
final List<Bits> perDocBits = new ArrayList<Bits>();
final List<Integer> perDocBitsStarts = new ArrayList<Integer>();
for(IndexReader r : readers) { for(IndexReader r : readers) {
final Fields f = r.fields(); final Fields f = r.fields();
final int maxDoc = r.maxDoc(); final int maxDoc = r.maxDoc();
@ -504,18 +498,10 @@ final class SegmentMerger {
bits.add(r.getLiveDocs()); bits.add(r.getLiveDocs());
bitsStarts.add(docBase); bitsStarts.add(docBase);
} }
final PerDocValues producer = r.perDocValues();
if (producer != null) {
perDocSlices.add(new ReaderUtil.Slice(docBase, maxDoc, fields.size()));
perDocProducers.add(producer);
perDocBits.add(r.getLiveDocs());
perDocBitsStarts.add(docBase);
}
docBase += maxDoc; docBase += maxDoc;
} }
bitsStarts.add(docBase); bitsStarts.add(docBase);
perDocBitsStarts.add(docBase);
// we may gather more readers than mergeState.readerCount // we may gather more readers than mergeState.readerCount
mergeState = new MergeState(); mergeState = new MergeState();
@ -581,19 +567,45 @@ final class SegmentMerger {
} finally { } finally {
consumer.close(); consumer.close();
} }
}
private void mergePerDoc() throws IOException {
final List<PerDocValues> perDocProducers = new ArrayList<PerDocValues>();
final List<ReaderUtil.Slice> perDocSlices = new ArrayList<ReaderUtil.Slice>();
final List<Bits> perDocBits = new ArrayList<Bits>();
final List<Integer> perDocBitsStarts = new ArrayList<Integer>();
int docBase = 0;
for (IndexReader r : readers) {
final int maxDoc = r.maxDoc();
final PerDocValues producer = r.perDocValues();
if (producer != null) {
perDocSlices.add(new ReaderUtil.Slice(docBase, maxDoc, perDocProducers
.size()));
perDocProducers.add(producer);
perDocBits.add(r.getLiveDocs());
perDocBitsStarts.add(docBase);
}
docBase += maxDoc;
}
perDocBitsStarts.add(docBase);
if (!perDocSlices.isEmpty()) { if (!perDocSlices.isEmpty()) {
mergeState.multiLiveDocs = new MultiBits(perDocBits, perDocBitsStarts, true); mergeState.multiLiveDocs = new MultiBits(perDocBits, perDocBitsStarts,
true);
final PerDocConsumer docsConsumer = codec final PerDocConsumer docsConsumer = codec
.docsConsumer(new PerDocWriteState(segmentWriteState)); .docsConsumer(new PerDocWriteState(segmentWriteState));
boolean success = false;
try { try {
final MultiPerDocValues multiPerDocValues = new MultiPerDocValues(perDocProducers final MultiPerDocValues multiPerDocValues = new MultiPerDocValues(
.toArray(PerDocValues.EMPTY_ARRAY), perDocSlices perDocProducers.toArray(PerDocValues.EMPTY_ARRAY),
.toArray(ReaderUtil.Slice.EMPTY_ARRAY)); perDocSlices.toArray(ReaderUtil.Slice.EMPTY_ARRAY));
docsConsumer.merge(mergeState, multiPerDocValues); docsConsumer.merge(mergeState, multiPerDocValues);
success = true;
} finally { } finally {
docsConsumer.close(); IOUtils.closeSafely(!success, docsConsumer);
} }
} }
/* don't close the perDocProducers here since they are private segment producers
* and will be closed once the SegmentReader goes out of scope */
} }
private MergeState mergeState; private MergeState mergeState;

View File

@ -25,11 +25,13 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IOContext.Context; import org.apache.lucene.store.IOContext.Context;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
class TermVectorsReader implements Cloneable { class TermVectorsReader implements Cloneable, Closeable {
// NOTE: if you make a new format, it must be larger than // NOTE: if you make a new format, it must be larger than
// the current format // the current format
@ -190,14 +192,8 @@ class TermVectorsReader implements Cloneable {
return format; return format;
} }
void close() throws IOException { public void close() throws IOException {
// make all effort to close up. Keep the first exception IOUtils.closeSafely(false, tvx, tvd, tvf);
// and throw it as a new one.
IOException keep = null;
if (tvx != null) try { tvx.close(); } catch (IOException e) { keep = e; }
if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (keep != null) throw (IOException) keep.fillInStackTrace();
} }
/** /**

View File

@ -18,6 +18,7 @@ package org.apache.lucene.index.codecs;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Comparator;
import java.util.Set; import java.util.Set;
import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.PerDocWriteState;
@ -25,13 +26,21 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
/** @lucene.experimental */ /** @lucene.experimental */
public abstract class Codec { public abstract class Codec {
public static final Codec[] EMPTY = new Codec[0]; public static final Codec[] EMPTY = new Codec[0];
/** Unique name that's used to retrieve this codec when /** Unique name that's used to retrieve this codec when
* reading the index */ * reading the index */
public String name; public final String name;
private boolean dvUseCompoundFile = true;
private Comparator<BytesRef> docValuesSortComparator = BytesRef
.getUTF8SortedAsUnicodeComparator();
protected Codec(String name) {
this.name = name;
}
/** Writes a new segment */ /** Writes a new segment */
public abstract FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException; public abstract FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException;
@ -69,6 +78,47 @@ public abstract class Codec {
/** Records all file extensions this codec uses */ /** Records all file extensions this codec uses */
public abstract void getExtensions(Set<String> extensions); public abstract void getExtensions(Set<String> extensions);
/**
* If set to <code>true</code> this codec will use a compound file for
* IndexDocValues, otherwise each IndexDocValues field will create up to 2
* files per segment.
* <p>
* NOTE: The default value is <code>true</code>.
*/
public void setDocValuesUseCFS(boolean docValuesUseCFS) {
this.dvUseCompoundFile = docValuesUseCFS;
}
/**
* Returns <code>true</code> iff compound file should be used for
* IndexDocValues, otherwise <code>false</code>.
*
* @see #setDocValuesUseCFS(boolean)
* @return <code>true</code> iff compound file should be used for
* IndexDocValues, otherwise <code>false</code>.
*/
public boolean getDocValuesUseCFS() {
return dvUseCompoundFile;
}
/**
* Sets the {@link BytesRef} comparator for sorted IndexDocValue variants. The
* default is {@link BytesRef#getUTF8SortedAsUnicodeComparator()}.
*/
public void setDocValuesSortComparator(
Comparator<BytesRef> docValuesSortComparator) {
this.docValuesSortComparator = docValuesSortComparator;
}
/**
* Returns the {@link BytesRef} comparator for sorted IndexDocValue variants.
* The default is {@link BytesRef#getUTF8SortedAsUnicodeComparator()}.
*/
public Comparator<BytesRef> getDocValuesSortComparator() {
return docValuesSortComparator;
}
@Override @Override
public String toString() { public String toString() {
return name; return name;
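As a usage sketch of the two per-codec IndexDocValues options added above (the codec choice is illustrative; MemoryCodec's no-arg constructor appears later in this merge):

Codec codec = new MemoryCodec();
codec.setDocValuesUseCFS(false); // write separate data/index files per IndexDocValues field instead of a compound file
codec.setDocValuesSortComparator(BytesRef.getUTF8SortedAsUnicodeComparator()); // same as the default sort order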

View File

@ -44,7 +44,7 @@ public class CoreCodecProvider extends CodecProvider {
public CoreCodecProvider() { public CoreCodecProvider() {
register(new StandardCodec()); register(new StandardCodec());
register(new PreFlexCodec()); register(new PreFlexCodec());
register(new PulsingCodec(1)); register(new PulsingCodec());
register(new SimpleTextCodec()); register(new SimpleTextCodec());
register(new MemoryCodec()); register(new MemoryCodec());
} }

View File

@ -32,79 +32,106 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
/**
*
* @lucene.experimental
*/
public class DefaultDocValuesConsumer extends PerDocConsumer { public class DefaultDocValuesConsumer extends PerDocConsumer {
private final String segmentName; private final String segmentName;
private final int codecId; private final int codecId;
private final Directory directory; private final Directory directory;
private final AtomicLong bytesUsed; private final AtomicLong bytesUsed;
private final Comparator<BytesRef> comparator; private final Comparator<BytesRef> comparator;
private boolean useCompoundFile;
private final IOContext context;
public DefaultDocValuesConsumer(PerDocWriteState state, Comparator<BytesRef> comparator) { public DefaultDocValuesConsumer(PerDocWriteState state, Comparator<BytesRef> comparator, boolean useCompoundFile) throws IOException {
this.segmentName = state.segmentName; this.segmentName = state.segmentName;
this.codecId = state.codecId; this.codecId = state.codecId;
this.bytesUsed = state.bytesUsed; this.bytesUsed = state.bytesUsed;
this.directory = state.directory; this.context = state.context;
//TODO maybe we should enable a global CFS that all codecs can pull on demand to further reduce the number of files?
this.directory = useCompoundFile ? state.directory.createCompoundOutput(
IndexFileNames.segmentFileName(segmentName, codecId,
IndexFileNames.COMPOUND_FILE_EXTENSION), context) : state.directory;
this.comparator = comparator; this.comparator = comparator;
this.useCompoundFile = useCompoundFile;
} }
public void close() throws IOException { public void close() throws IOException {
if (useCompoundFile) {
this.directory.close();
}
} }
@Override @Override
public DocValuesConsumer addValuesField(FieldInfo field) throws IOException { public DocValuesConsumer addValuesField(FieldInfo field) throws IOException {
return Writer.create(field.getDocValues(), return Writer.create(field.getDocValues(),
docValuesId(segmentName, codecId, field.number), docValuesId(segmentName, codecId, field.number),
// TODO can we have a compound file per segment and codec for directory, comparator, bytesUsed, context);
// docvalues?
directory, comparator, bytesUsed, IOContext.DEFAULT);
} }
@SuppressWarnings("fallthrough") @SuppressWarnings("fallthrough")
public static void files(Directory dir, SegmentInfo segmentInfo, int codecId, public static void files(Directory dir, SegmentInfo segmentInfo, int codecId,
Set<String> files) throws IOException { Set<String> files, boolean useCompoundFile) throws IOException {
FieldInfos fieldInfos = segmentInfo.getFieldInfos(); FieldInfos fieldInfos = segmentInfo.getFieldInfos();
for (FieldInfo fieldInfo : fieldInfos) { for (FieldInfo fieldInfo : fieldInfos) {
if (fieldInfo.getCodecId() == codecId && fieldInfo.hasDocValues()) { if (fieldInfo.getCodecId() == codecId && fieldInfo.hasDocValues()) {
String filename = docValuesId(segmentInfo.name, codecId, String filename = docValuesId(segmentInfo.name, codecId,
fieldInfo.number); fieldInfo.number);
switch (fieldInfo.getDocValues()) { if (useCompoundFile) {
case BYTES_FIXED_DEREF: files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_EXTENSION));
case BYTES_VAR_DEREF: files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION));
case BYTES_VAR_SORTED: assert dir.fileExists(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION));
case BYTES_FIXED_SORTED: assert dir.fileExists(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_EXTENSION));
case BYTES_VAR_STRAIGHT: return;
files.add(IndexFileNames.segmentFileName(filename, "", } else {
Writer.INDEX_EXTENSION)); switch (fieldInfo.getDocValues()) {
assert dir.fileExists(IndexFileNames.segmentFileName(filename, "", case BYTES_FIXED_DEREF:
Writer.INDEX_EXTENSION)); case BYTES_VAR_DEREF:
// until here all types use an index case BYTES_VAR_SORTED:
case BYTES_FIXED_STRAIGHT: case BYTES_FIXED_SORTED:
case FLOAT_32: case BYTES_VAR_STRAIGHT:
case FLOAT_64: files.add(IndexFileNames.segmentFileName(filename, "",
case VAR_INTS: Writer.INDEX_EXTENSION));
case FIXED_INTS_16: assert dir.fileExists(IndexFileNames.segmentFileName(filename, "",
case FIXED_INTS_32: Writer.INDEX_EXTENSION));
case FIXED_INTS_64: // until here all types use an index
case FIXED_INTS_8: case BYTES_FIXED_STRAIGHT:
files.add(IndexFileNames.segmentFileName(filename, "", case FLOAT_32:
Writer.DATA_EXTENSION)); case FLOAT_64:
assert dir.fileExists(IndexFileNames.segmentFileName(filename, "", case VAR_INTS:
Writer.DATA_EXTENSION)); case FIXED_INTS_16:
break; case FIXED_INTS_32:
case FIXED_INTS_64:
case FIXED_INTS_8:
files.add(IndexFileNames.segmentFileName(filename, "",
Writer.DATA_EXTENSION));
assert dir.fileExists(IndexFileNames.segmentFileName(filename, "",
Writer.DATA_EXTENSION));
break;
default: default:
assert false; assert false;
}
} }
} }
} }
} }
static String docValuesId(String segmentsName, int codecID, int fieldId) { static String docValuesId(String segmentsName, int codecID, int fieldId) {
return segmentsName + "_" + codecID + "-" + fieldId; return segmentsName + "_" + codecID + "-" + fieldId;
} }
public static void getDocValuesExtensions(Set<String> extensions) { public static void getDocValuesExtensions(Set<String> extensions, boolean useCompoundFile) {
extensions.add(Writer.DATA_EXTENSION); if (useCompoundFile) {
extensions.add(Writer.INDEX_EXTENSION); extensions.add(IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION);
extensions.add(IndexFileNames.COMPOUND_FILE_EXTENSION);
} else {
extensions.add(Writer.DATA_EXTENSION);
extensions.add(Writer.INDEX_EXTENSION);
}
} }
} }

View File

@ -16,12 +16,16 @@ package org.apache.lucene.index.codecs;
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Comparator;
import java.util.TreeMap; import java.util.TreeMap;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.values.Bytes; import org.apache.lucene.index.values.Bytes;
import org.apache.lucene.index.values.IndexDocValues; import org.apache.lucene.index.values.IndexDocValues;
@ -30,6 +34,8 @@ import org.apache.lucene.index.values.Ints;
import org.apache.lucene.index.values.ValueType; import org.apache.lucene.index.values.ValueType;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/** /**
* Abstract base class for FieldsProducer implementations supporting * Abstract base class for FieldsProducer implementations supporting
@ -40,8 +46,12 @@ import org.apache.lucene.store.IOContext;
public class DefaultDocValuesProducer extends PerDocValues { public class DefaultDocValuesProducer extends PerDocValues {
protected final TreeMap<String, IndexDocValues> docValues; protected final TreeMap<String, IndexDocValues> docValues;
private final boolean useCompoundFile;
private final Closeable cfs;
private final Comparator<BytesRef> sortComparator;
/** /**
*
* Creates a new {@link DefaultDocValuesProducer} instance and loads all * Creates a new {@link DefaultDocValuesProducer} instance and loads all
* {@link IndexDocValues} instances for this segment and codec. * {@link IndexDocValues} instances for this segment and codec.
* *
@ -53,12 +63,27 @@ public class DefaultDocValuesProducer extends PerDocValues {
* the {@link FieldInfos} * the {@link FieldInfos}
* @param codecId * @param codecId
* the codec ID * the codec ID
* @param useCompoundFile
* if <code>true</code> this producer opens a compound file to read
* IndexDocValues fields, otherwise each field defines its own set of
* files.
* @param sortComparator
* defines the sort order for sorted IndexDocValues variants
* @throws IOException * @throws IOException
* if an {@link IOException} occurs * if an {@link IOException} occurs
*/ */
public DefaultDocValuesProducer(SegmentInfo si, Directory dir, public DefaultDocValuesProducer(SegmentInfo si, Directory dir,
FieldInfos fieldInfo, int codecId, IOContext context) throws IOException { FieldInfos fieldInfo, int codecId, boolean useCompoundFile, Comparator<BytesRef> sortComparator, IOContext context) throws IOException {
docValues = load(fieldInfo, si.name, si.docCount, dir, codecId, context); this.useCompoundFile = useCompoundFile;
this.sortComparator = sortComparator;
final Directory directory;
if (useCompoundFile) {
cfs = directory = dir.openCompoundInput(IndexFileNames.segmentFileName(si.name, codecId, IndexFileNames.COMPOUND_FILE_EXTENSION), context);
} else {
cfs = null;
directory = dir;
}
docValues = load(fieldInfo, si.name, si.docCount, directory, codecId, context);
} }
/** /**
@ -86,14 +111,14 @@ public class DefaultDocValuesProducer extends PerDocValues {
final String id = DefaultDocValuesConsumer.docValuesId(segment, final String id = DefaultDocValuesConsumer.docValuesId(segment,
codecId, fieldInfo.number); codecId, fieldInfo.number);
values.put(field, values.put(field,
loadDocValues(docCount, dir, id, fieldInfo.getDocValues(), context)); loadDocValues(docCount, dir, id, fieldInfo.getDocValues(), sortComparator, context));
} }
} }
success = true; success = true;
} finally { } finally {
if (!success) { if (!success) {
// if we fail we must close all opened resources if there are any // if we fail we must close all opened resources if there are any
closeDocValues(values.values()); closeInternal(values.values());
} }
} }
return values; return values;
@ -113,6 +138,7 @@ public class DefaultDocValuesProducer extends PerDocValues {
* the unique file ID within the segment * the unique file ID within the segment
* @param type * @param type
* the type to load * the type to load
* @param sortComparator byte comparator used by sorted variants
* @return a {@link IndexDocValues} instance for the given type * @return a {@link IndexDocValues} instance for the given type
* @throws IOException * @throws IOException
* if an {@link IOException} occurs * if an {@link IOException} occurs
@ -120,7 +146,7 @@ public class DefaultDocValuesProducer extends PerDocValues {
* if the given {@link ValueType} is not supported * if the given {@link ValueType} is not supported
*/ */
protected IndexDocValues loadDocValues(int docCount, Directory dir, String id, protected IndexDocValues loadDocValues(int docCount, Directory dir, String id,
ValueType type, IOContext context) throws IOException { ValueType type, Comparator<BytesRef> sortComparator, IOContext context) throws IOException {
switch (type) { switch (type) {
case FIXED_INTS_16: case FIXED_INTS_16:
case FIXED_INTS_32: case FIXED_INTS_32:
@ -133,39 +159,36 @@ public class DefaultDocValuesProducer extends PerDocValues {
case FLOAT_64: case FLOAT_64:
return Floats.getValues(dir, id, docCount, context); return Floats.getValues(dir, id, docCount, context);
case BYTES_FIXED_STRAIGHT: case BYTES_FIXED_STRAIGHT:
return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, true, docCount, context); return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, true, docCount, sortComparator, context);
case BYTES_FIXED_DEREF: case BYTES_FIXED_DEREF:
return Bytes.getValues(dir, id, Bytes.Mode.DEREF, true, docCount, context); return Bytes.getValues(dir, id, Bytes.Mode.DEREF, true, docCount, sortComparator, context);
case BYTES_FIXED_SORTED: case BYTES_FIXED_SORTED:
return Bytes.getValues(dir, id, Bytes.Mode.SORTED, true, docCount, context); return Bytes.getValues(dir, id, Bytes.Mode.SORTED, true, docCount, sortComparator, context);
case BYTES_VAR_STRAIGHT: case BYTES_VAR_STRAIGHT:
return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, false, docCount, context); return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, false, docCount, sortComparator, context);
case BYTES_VAR_DEREF: case BYTES_VAR_DEREF:
return Bytes.getValues(dir, id, Bytes.Mode.DEREF, false, docCount, context); return Bytes.getValues(dir, id, Bytes.Mode.DEREF, false, docCount, sortComparator, context);
case BYTES_VAR_SORTED: case BYTES_VAR_SORTED:
return Bytes.getValues(dir, id, Bytes.Mode.SORTED, false, docCount, context); return Bytes.getValues(dir, id, Bytes.Mode.SORTED, false, docCount, sortComparator, context);
default: default:
throw new IllegalStateException("unrecognized index values mode " + type); throw new IllegalStateException("unrecognized index values mode " + type);
} }
} }
public void close() throws IOException { public void close() throws IOException {
closeDocValues(docValues.values()); closeInternal(docValues.values());
} }
private void closeDocValues(final Collection<IndexDocValues> values) private void closeInternal(Collection<? extends Closeable> closeables) throws IOException {
throws IOException { final Collection<? extends Closeable> toClose;
IOException ex = null; if (useCompoundFile) {
for (IndexDocValues docValues : values) { final ArrayList<Closeable> list = new ArrayList<Closeable>(closeables);
try { list.add(cfs);
docValues.close(); toClose = list;
} catch (IOException e) { } else {
ex = e; toClose = closeables;
}
}
if (ex != null) {
throw ex;
} }
IOUtils.closeSafely(false, toClose);
} }
@Override @Override
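The closeInternal rewrite above drops the hand-rolled close-and-remember-exception loop in favor of IOUtils.closeSafely, optionally adding the compound file to the set of resources. A minimal sketch of the same idiom, using only the IOUtils overload visible in this hunk; the helper class and parameter names are illustrative:

    import java.io.Closeable;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Collection;

    import org.apache.lucene.util.IOUtils;

    final class CloseAllSketch {
      // Close every resource, optionally including one extra Closeable (e.g. a compound file reader),
      // letting IOUtils do the exception bookkeeping as closeInternal does above.
      static void closeAll(Collection<? extends Closeable> resources, Closeable extra) throws IOException {
        final Collection<? extends Closeable> toClose;
        if (extra != null) {
          ArrayList<Closeable> list = new ArrayList<Closeable>(resources);
          list.add(extra);
          toClose = list;
        } else {
          toClose = resources;
        }
        IOUtils.closeSafely(false, toClose); // same flag as in closeInternal above
      }
    }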

View File

@ -79,7 +79,7 @@ import org.apache.lucene.util.fst.FST;
public class MemoryCodec extends Codec { public class MemoryCodec extends Codec {
public MemoryCodec() { public MemoryCodec() {
name = "Memory"; super("Memory");
} }
private static final boolean VERBOSE = false; private static final boolean VERBOSE = false;
@ -779,22 +779,22 @@ public class MemoryCodec extends Codec {
@Override @Override
public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException { public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException {
files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, EXTENSION)); files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, EXTENSION));
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
} }
@Override @Override
public void getExtensions(Set<String> extensions) { public void getExtensions(Set<String> extensions) {
extensions.add(EXTENSION); extensions.add(EXTENSION);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
} }
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, IOContext.READONCE); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), IOContext.READONCE);
} }
} }
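Codec implementations now hand their name to the superclass constructor and route doc-values file bookkeeping through the codec-level getDocValuesUseCFS()/getDocValuesSortComparator() settings. A hedged skeleton of a custom codec following the pattern above; the codec name is made up and the postings-related abstract methods are left out:

    import java.io.IOException;
    import java.util.Set;

    import org.apache.lucene.index.SegmentInfo;
    import org.apache.lucene.index.codecs.Codec;
    import org.apache.lucene.index.codecs.DefaultDocValuesConsumer;
    import org.apache.lucene.store.Directory;

    // Hypothetical codec skeleton; only the doc-values wiring shown in the hunks above is sketched.
    public abstract class MyCodec extends Codec {
      protected MyCodec() {
        super("MyCodec"); // the name is now passed to the Codec constructor instead of assigning a field
      }

      @Override
      public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException {
        DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
      }

      @Override
      public void getExtensions(Set<String> extensions) {
        DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
      }
    }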

View File

@ -55,7 +55,7 @@ public class PreFlexCodec extends Codec {
public static final String PROX_EXTENSION = "prx"; public static final String PROX_EXTENSION = "prx";
public PreFlexCodec() { public PreFlexCodec() {
name = "PreFlex"; super("PreFlex");
} }
@Override @Override

View File

@ -43,7 +43,6 @@ import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
/** This codec "inlines" the postings for terms that have /** This codec "inlines" the postings for terms that have
@ -58,10 +57,19 @@ public class PulsingCodec extends Codec {
private final int freqCutoff; private final int freqCutoff;
/**
* Creates a {@link PulsingCodec} with <tt>freqCutoff = 1</tt>
*
* @see PulsingCodec#PulsingCodec(int)
*/
public PulsingCodec() {
this(1);
}
/** Terms with freq <= freqCutoff are inlined into terms /** Terms with freq <= freqCutoff are inlined into terms
* dict. */ * dict. */
public PulsingCodec(int freqCutoff) { public PulsingCodec(int freqCutoff) {
name = "Pulsing"; super("Pulsing");
this.freqCutoff = freqCutoff; this.freqCutoff = freqCutoff;
} }
@ -157,22 +165,22 @@ public class PulsingCodec extends Codec {
StandardPostingsReader.files(dir, segmentInfo, id, files); StandardPostingsReader.files(dir, segmentInfo, id, files);
BlockTermsReader.files(dir, segmentInfo, id, files); BlockTermsReader.files(dir, segmentInfo, id, files);
VariableGapTermsIndexReader.files(dir, segmentInfo, id, files); VariableGapTermsIndexReader.files(dir, segmentInfo, id, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
} }
@Override @Override
public void getExtensions(Set<String> extensions) { public void getExtensions(Set<String> extensions) {
StandardCodec.getStandardExtensions(extensions); StandardCodec.getStandardExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
} }
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context);
} }
} }
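The new no-argument constructor simply defaults freqCutoff to 1, so new PulsingCodec() behaves like new PulsingCodec(1). A trivial usage sketch (package path as in Lucene's codec tree; the helper class is illustrative):

    import org.apache.lucene.index.codecs.pulsing.PulsingCodec;

    final class PulsingSketch {
      static PulsingCodec defaults() {
        return new PulsingCodec(); // same as new PulsingCodec(1): terms with freq <= 1 are inlined
      }
    }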

View File

@ -33,7 +33,6 @@ import org.apache.lucene.index.codecs.PerDocConsumer;
import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; import org.apache.lucene.index.codecs.DefaultDocValuesConsumer;
import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
/** For debugging, curiosity, transparency only!! Do not /** For debugging, curiosity, transparency only!! Do not
* use this codec in production. * use this codec in production.
@ -46,9 +45,10 @@ import org.apache.lucene.util.BytesRef;
public class SimpleTextCodec extends Codec { public class SimpleTextCodec extends Codec {
public SimpleTextCodec() { public SimpleTextCodec() {
name = "SimpleText"; super("SimpleText");
} }
@Override @Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new SimpleTextFieldsWriter(state); return new SimpleTextFieldsWriter(state);
@ -69,23 +69,23 @@ public class SimpleTextCodec extends Codec {
@Override @Override
public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException { public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException {
files.add(getPostingsFileName(segmentInfo.name, id)); files.add(getPostingsFileName(segmentInfo.name, id));
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
} }
@Override @Override
public void getExtensions(Set<String> extensions) { public void getExtensions(Set<String> extensions) {
extensions.add(POSTINGS_EXTENSION); extensions.add(POSTINGS_EXTENSION);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
} }
// TODO: would be great if these used a plain text impl // TODO: would be great if these used a plain text impl
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context);
} }
} }

View File

@ -40,14 +40,13 @@ import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.BlockTermsReader; import org.apache.lucene.index.codecs.BlockTermsReader;
import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
/** Default codec. /** Default codec.
* @lucene.experimental */ * @lucene.experimental */
public class StandardCodec extends Codec { public class StandardCodec extends Codec {
public StandardCodec() { public StandardCodec() {
name = "Standard"; super("Standard");
} }
@Override @Override
@ -140,13 +139,13 @@ public class StandardCodec extends Codec {
StandardPostingsReader.files(dir, segmentInfo, id, files); StandardPostingsReader.files(dir, segmentInfo, id, files);
BlockTermsReader.files(dir, segmentInfo, id, files); BlockTermsReader.files(dir, segmentInfo, id, files);
VariableGapTermsIndexReader.files(dir, segmentInfo, id, files); VariableGapTermsIndexReader.files(dir, segmentInfo, id, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
} }
@Override @Override
public void getExtensions(Set<String> extensions) { public void getExtensions(Set<String> extensions) {
getStandardExtensions(extensions); getStandardExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
} }
public static void getStandardExtensions(Set<String> extensions) { public static void getStandardExtensions(Set<String> extensions) {
@ -158,11 +157,11 @@ public class StandardCodec extends Codec {
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context);
} }
} }

View File

@ -153,12 +153,13 @@ public final class Bytes {
* otherwise <code>false</code> * otherwise <code>false</code>
* @param maxDoc * @param maxDoc
* the number of document values stored for the given ID * the number of document values stored for the given ID
* @param sortComparator byte comparator used by sorted variants
* @return an initialized {@link IndexDocValues} instance. * @return an initialized {@link IndexDocValues} instance.
* @throws IOException * @throws IOException
* if an {@link IOException} occurs * if an {@link IOException} occurs
*/ */
public static IndexDocValues getValues(Directory dir, String id, Mode mode, public static IndexDocValues getValues(Directory dir, String id, Mode mode,
boolean fixedSize, int maxDoc, IOContext context) throws IOException { boolean fixedSize, int maxDoc, Comparator<BytesRef> sortComparator, IOContext context) throws IOException {
// TODO -- I can peek @ header to determine fixed/mode? // TODO -- I can peek @ header to determine fixed/mode?
if (fixedSize) { if (fixedSize) {
@ -175,7 +176,7 @@ public final class Bytes {
} else if (mode == Mode.DEREF) { } else if (mode == Mode.DEREF) {
return new VarDerefBytesImpl.Reader(dir, id, maxDoc, context); return new VarDerefBytesImpl.Reader(dir, id, maxDoc, context);
} else if (mode == Mode.SORTED) { } else if (mode == Mode.SORTED) {
return new VarSortedBytesImpl.Reader(dir, id, maxDoc, context); return new VarSortedBytesImpl.Reader(dir, id, maxDoc, sortComparator, context);
} }
} }

View File

@ -131,6 +131,18 @@ public abstract class IndexDocValues implements Closeable {
return cache.loadSorted(this, comparator); return cache.loadSorted(this, comparator);
} }
/**
* Returns a {@link SortedSource} instance using a default {@link BytesRef}
* comparator for this {@link IndexDocValues} field instance like
* {@link #getSource()}.
* <p>
* This method will return null iff this {@link IndexDocValues} represents a
* {@link Source} instead of a {@link SortedSource}.
*/
public SortedSource getSortedSorted() throws IOException {
return getSortedSorted(null);
}
/** /**
* Loads and returns a {@link SortedSource} instance for this * Loads and returns a {@link SortedSource} instance for this
* {@link IndexDocValues} field instance like {@link #load()}. * {@link IndexDocValues} field instance like {@link #load()}.
@ -143,6 +155,18 @@ public abstract class IndexDocValues implements Closeable {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
/**
* Loads and returns a {@link SortedSource} instance using a default
* {@link BytesRef} comparator for this {@link IndexDocValues} field instance
* like {@link #load()}.
* <p>
* This method will return null iff this {@link IndexDocValues} represents a
* {@link Source} instead of a {@link SortedSource}.
*/
public SortedSource loadSorted() throws IOException {
return loadSorted(null);
}
/** /**
* Returns the {@link ValueType} of this {@link IndexDocValues} instance * Returns the {@link ValueType} of this {@link IndexDocValues} instance
*/ */
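The two new overloads above (getSortedSorted() and loadSorted(), as named in this diff) fall back to a default BytesRef comparator instead of requiring one. A small usage sketch, assuming SortedSource is the nested type the javadoc refers to; the helper name and null handling are illustrative:

    import java.io.IOException;

    import org.apache.lucene.index.values.IndexDocValues;

    final class SortedSourceSketch {
      // Ask for a sorted view of a doc-values field using the default comparator.
      static IndexDocValues.SortedSource sortedOrNull(IndexDocValues values) throws IOException {
        // per the javadoc above, this is null when the field only exposes a plain Source
        return values.getSortedSorted();
      }
    }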

View File

@ -168,14 +168,17 @@ class VarSortedBytesImpl {
public static class Reader extends BytesReaderBase { public static class Reader extends BytesReaderBase {
Reader(Directory dir, String id, int maxDoc, IOContext context) throws IOException { private final Comparator<BytesRef> defaultComp;
Reader(Directory dir, String id, int maxDoc, Comparator<BytesRef> comparator, IOContext context) throws IOException {
super(dir, id, CODEC_NAME, VERSION_START, true, context); super(dir, id, CODEC_NAME, VERSION_START, true, context);
this.defaultComp = comparator;
} }
@Override @Override
public org.apache.lucene.index.values.IndexDocValues.Source load() public org.apache.lucene.index.values.IndexDocValues.Source load()
throws IOException { throws IOException {
return loadSorted(null); return loadSorted(defaultComp);
} }
@Override @Override

View File

@ -183,14 +183,11 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
public Query getQuery() { return BooleanQuery.this; } public Query getQuery() { return BooleanQuery.this; }
@Override @Override
public float getValue() { return getBoost(); } public float getValueForNormalization() throws IOException {
@Override
public float sumOfSquaredWeights() throws IOException {
float sum = 0.0f; float sum = 0.0f;
for (int i = 0 ; i < weights.size(); i++) { for (int i = 0 ; i < weights.size(); i++) {
// call sumOfSquaredWeights for all clauses in case of side effects // call sumOfSquaredWeights for all clauses in case of side effects
float s = weights.get(i).sumOfSquaredWeights(); // sum sub weights float s = weights.get(i).getValueForNormalization(); // sum sub weights
if (!clauses.get(i).isProhibited()) if (!clauses.get(i).isProhibited())
// only add to sum for non-prohibited clauses // only add to sum for non-prohibited clauses
sum += s; sum += s;
@ -206,11 +203,11 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
} }
@Override @Override
public void normalize(float norm) { public void normalize(float norm, float topLevelBoost) {
norm *= getBoost(); // incorporate boost topLevelBoost *= getBoost(); // incorporate boost
for (Weight w : weights) { for (Weight w : weights) {
// normalize all clauses, (even if prohibited in case of side effects) // normalize all clauses, (even if prohibited in case of side effects)
w.normalize(norm); w.normalize(norm, topLevelBoost);
} }
} }
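The BooleanWeight hunks above show the core LUCENE-2392 Weight contract change: sumOfSquaredWeights() becomes getValueForNormalization(), and normalize() now receives the query norm and the accumulated top-level boost separately, each Weight folding in its own boost before recursing. A minimal sketch of a delegating Weight under that contract; the class, field and boost handling are illustrative, not part of this commit:

    import java.io.IOException;

    import org.apache.lucene.search.Weight;

    // Hypothetical wrapper Weight, sketched only against the two methods changed above.
    abstract class DelegatingWeight extends Weight {
      private final Weight inner;
      private final float boost; // this Weight's own query boost (illustrative)

      DelegatingWeight(Weight inner, float boost) {
        this.inner = inner;
        this.boost = boost;
      }

      @Override
      public float getValueForNormalization() throws IOException {
        return inner.getValueForNormalization(); // replaces sumOfSquaredWeights()
      }

      @Override
      public void normalize(float norm, float topLevelBoost) {
        // the outer boost is no longer pre-multiplied into norm by the caller;
        // fold our own boost in and pass both values down, as BooleanWeight does above
        inner.normalize(norm, topLevelBoost * boost);
      }
    }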

View File

@ -27,7 +27,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
@ -77,7 +77,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
} }
@Override @Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, PerReaderTermState states) { protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, TermContext states) {
topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD); topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD);
} }
@ -140,9 +140,9 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
assert termState != null; assert termState != null;
if (pos < 0) { if (pos < 0) {
pos = (-pos)-1; pos = (-pos)-1;
array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq()); array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} else { } else {
array.termState[pos] = new PerReaderTermState(topReaderContext, termState, readerContext.ord, termsEnum.docFreq()); array.termState[pos] = new TermContext(topReaderContext, termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} }
return true; return true;
} }
@ -183,9 +183,9 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
return true; return true;
} }
/** Special implementation of BytesStartArray that keeps parallel arrays for {@link PerReaderTermState} */ /** Special implementation of BytesStartArray that keeps parallel arrays for {@link TermContext} */
static final class TermStateByteStart extends DirectBytesStartArray { static final class TermStateByteStart extends DirectBytesStartArray {
PerReaderTermState[] termState; TermContext[] termState;
public TermStateByteStart(int initSize) { public TermStateByteStart(int initSize) {
super(initSize); super(initSize);
@ -194,7 +194,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
@Override @Override
public int[] init() { public int[] init() {
final int[] ord = super.init(); final int[] ord = super.init();
termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length; assert termState.length >= ord.length;
return ord; return ord;
} }
@ -203,7 +203,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
public int[] grow() { public int[] grow() {
final int[] ord = super.grow(); final int[] ord = super.grow();
if (termState.length < ord.length) { if (termState.length < ord.length) {
PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length); System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState; termState = tmpTermState;
} }

View File

@ -110,24 +110,19 @@ public class ConstantScoreQuery extends Query {
} }
@Override @Override
public float getValue() { public float getValueForNormalization() throws IOException {
return queryWeight;
}
@Override
public float sumOfSquaredWeights() throws IOException {
// we calculate sumOfSquaredWeights of the inner weight, but ignore it (just to initialize everything) // we calculate sumOfSquaredWeights of the inner weight, but ignore it (just to initialize everything)
if (innerWeight != null) innerWeight.sumOfSquaredWeights(); if (innerWeight != null) innerWeight.getValueForNormalization();
queryWeight = getBoost(); queryWeight = getBoost();
return queryWeight * queryWeight; return queryWeight * queryWeight;
} }
@Override @Override
public void normalize(float norm) { public void normalize(float norm, float topLevelBoost) {
this.queryNorm = norm; this.queryNorm = norm * topLevelBoost;
queryWeight *= this.queryNorm; queryWeight *= this.queryNorm;
// we normalize the inner weight, but ignore it (just to initialize everything) // we normalize the inner weight, but ignore it (just to initialize everything)
if (innerWeight != null) innerWeight.normalize(norm); if (innerWeight != null) innerWeight.normalize(norm, topLevelBoost);
} }
@Override @Override
@ -148,7 +143,7 @@ public class ConstantScoreQuery extends Query {
if (disi == null) { if (disi == null) {
return null; return null;
} }
return new ConstantScorer(disi, this); return new ConstantScorer(disi, this, queryWeight);
} }
@Override @Override
@ -181,9 +176,9 @@ public class ConstantScoreQuery extends Query {
final DocIdSetIterator docIdSetIterator; final DocIdSetIterator docIdSetIterator;
final float theScore; final float theScore;
public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w) throws IOException { public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w, float theScore) throws IOException {
super(w); super(w);
theScore = w.getValue(); this.theScore = theScore;
this.docIdSetIterator = docIdSetIterator; this.docIdSetIterator = docIdSetIterator;
} }
@ -212,7 +207,7 @@ public class ConstantScoreQuery extends Query {
@Override @Override
public void setScorer(Scorer scorer) throws IOException { public void setScorer(Scorer scorer) throws IOException {
// we must wrap again here, but using the scorer passed in as parameter: // we must wrap again here, but using the scorer passed in as parameter:
collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight)); collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight, ConstantScorer.this.theScore));
} }
@Override @Override

View File

@ -20,7 +20,7 @@ import org.apache.lucene.index.FieldInvertState;
*/ */
/** Expert: Default scoring implementation. */ /** Expert: Default scoring implementation. */
public class DefaultSimilarity extends Similarity { public class DefaultSimilarity extends TFIDFSimilarity {
/** Implemented as /** Implemented as
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where * <code>state.getBoost()*lengthNorm(numTerms)</code>, where
@ -31,13 +31,13 @@ public class DefaultSimilarity extends Similarity {
* *
* @lucene.experimental */ * @lucene.experimental */
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
final int numTerms; final int numTerms;
if (discountOverlaps) if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap(); numTerms = state.getLength() - state.getNumOverlap();
else else
numTerms = state.getLength(); numTerms = state.getLength();
return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))); return encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))));
} }
/** Implemented as <code>sqrt(freq)</code>. */ /** Implemented as <code>sqrt(freq)</code>. */
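computeNorm now returns the already-encoded norm byte, so a Similarity that customizes length normalization calls encodeNormValue itself, as DefaultSimilarity does above. A hedged example of a subclass that ignores field length entirely; the class name and policy are made up:

    import org.apache.lucene.index.FieldInvertState;
    import org.apache.lucene.search.DefaultSimilarity;

    // Hypothetical subclass: flat norms regardless of field length.
    public class FlatNormSimilarity extends DefaultSimilarity {
      @Override
      public byte computeNorm(FieldInvertState state) {
        // computeNorm now hands back the encoded byte directly
        return encodeNormValue(state.getBoost());
      }
    }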

View File

@ -110,16 +110,12 @@ public class DisjunctionMaxQuery extends Query implements Iterable<Query> {
@Override @Override
public Query getQuery() { return DisjunctionMaxQuery.this; } public Query getQuery() { return DisjunctionMaxQuery.this; }
/** Return our boost */
@Override
public float getValue() { return getBoost(); }
/** Compute the sum of squared weights of us applied to our subqueries. Used for normalization. */ /** Compute the sum of squared weights of us applied to our subqueries. Used for normalization. */
@Override @Override
public float sumOfSquaredWeights() throws IOException { public float getValueForNormalization() throws IOException {
float max = 0.0f, sum = 0.0f; float max = 0.0f, sum = 0.0f;
for (Weight currentWeight : weights) { for (Weight currentWeight : weights) {
float sub = currentWeight.sumOfSquaredWeights(); float sub = currentWeight.getValueForNormalization();
sum += sub; sum += sub;
max = Math.max(max, sub); max = Math.max(max, sub);
@ -130,10 +126,10 @@ public class DisjunctionMaxQuery extends Query implements Iterable<Query> {
/** Apply the computed normalization factor to our subqueries */ /** Apply the computed normalization factor to our subqueries */
@Override @Override
public void normalize(float norm) { public void normalize(float norm, float topLevelBoost) {
norm *= getBoost(); // Incorporate our boost topLevelBoost *= getBoost(); // Incorporate our boost
for (Weight wt : weights) { for (Weight wt : weights) {
wt.normalize(norm); wt.normalize(norm, topLevelBoost);
} }
} }

View File

@ -23,12 +23,6 @@ import java.util.Arrays;
import org.apache.lucene.index.*; import org.apache.lucene.index.*;
final class ExactPhraseScorer extends Scorer { final class ExactPhraseScorer extends Scorer {
private final byte[] norms;
private final float value;
private static final int SCORE_CACHE_SIZE = 32;
private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
private final int endMinus1; private final int endMinus1;
private final static int CHUNK = 4096; private final static int CHUNK = 4096;
@ -60,14 +54,12 @@ final class ExactPhraseScorer extends Scorer {
private int docID = -1; private int docID = -1;
private int freq; private int freq;
private final Similarity similarity; private final Similarity.ExactDocScorer docScorer;
ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity similarity, byte[] norms) throws IOException { Similarity.ExactDocScorer docScorer) throws IOException {
super(weight); super(weight);
this.similarity = similarity; this.docScorer = docScorer;
this.norms = norms;
this.value = weight.getValue();
chunkStates = new ChunkState[postings.length]; chunkStates = new ChunkState[postings.length];
@ -88,10 +80,6 @@ final class ExactPhraseScorer extends Scorer {
return; return;
} }
} }
for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
scoreCache[i] = similarity.tf((float) i) * value;
}
} }
@Override @Override
@ -206,13 +194,7 @@ final class ExactPhraseScorer extends Scorer {
@Override @Override
public float score() throws IOException { public float score() throws IOException {
final float raw; // raw score return docScorer.score(docID, freq);
if (freq < SCORE_CACHE_SIZE) {
raw = scoreCache[freq];
} else {
raw = similarity.tf((float) freq) * value;
}
return norms == null ? raw : raw * similarity.decodeNormValue(norms[docID]); // normalize
} }
private int phraseFreq() throws IOException { private int phraseFreq() throws IOException {
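After this change a scorer no longer caches tf values or decodes norms itself; score() is a single call into the per-segment ExactDocScorer. A skeleton showing just that delegation, using only the calls visible in this hunk; everything else about the scorer (positioning, freq tracking) is elided:

    import java.io.IOException;

    import org.apache.lucene.search.Scorer;
    import org.apache.lucene.search.Similarity;
    import org.apache.lucene.search.Weight;

    // Hypothetical scorer skeleton: all per-document scoring math lives in the doc scorer.
    abstract class DelegatingScorer extends Scorer {
      private final Similarity.ExactDocScorer docScorer;
      protected int freq; // updated by the subclass as it positions itself

      DelegatingScorer(Weight weight, Similarity.ExactDocScorer docScorer) {
        super(weight);
        this.docScorer = docScorer;
      }

      @Override
      public float score() throws IOException {
        return docScorer.score(docID(), freq); // no tf()/decodeNormValue() in the scorer anymore
      }
    }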

View File

@ -125,25 +125,4 @@ public class Explanation {
return buffer.toString(); return buffer.toString();
} }
/**
* Small Util class used to pass both an idf factor as well as an
* explanation for that factor.
*
* This class will likely be held on a {@link Weight}, so be aware
* before storing any large or un-serializable fields.
*
*/
public static abstract class IDFExplanation {
/**
* @return the idf factor
*/
public abstract float getIdf();
/**
* This should be calculated lazily if possible.
*
* @return the explanation for the idf factor.
*/
public abstract String explain();
}
} }

View File

@ -63,21 +63,15 @@ extends Query {
public Weight createWeight(final IndexSearcher searcher) throws IOException { public Weight createWeight(final IndexSearcher searcher) throws IOException {
final Weight weight = query.createWeight (searcher); final Weight weight = query.createWeight (searcher);
return new Weight() { return new Weight() {
private float value;
// pass these methods through to enclosed query's weight
@Override
public float getValue() { return value; }
@Override @Override
public float sumOfSquaredWeights() throws IOException { public float getValueForNormalization() throws IOException {
return weight.sumOfSquaredWeights() * getBoost() * getBoost(); return weight.getValueForNormalization() * getBoost() * getBoost();
} }
@Override @Override
public void normalize (float v) { public void normalize (float norm, float topLevelBoost) {
weight.normalize(v); weight.normalize(norm, topLevelBoost);
value = weight.getValue() * getBoost();
} }
@Override @Override

View File

@ -674,11 +674,11 @@ public class IndexSearcher {
public Weight createNormalizedWeight(Query query) throws IOException { public Weight createNormalizedWeight(Query query) throws IOException {
query = rewrite(query); query = rewrite(query);
Weight weight = query.createWeight(this); Weight weight = query.createWeight(this);
float sum = weight.sumOfSquaredWeights(); float v = weight.getValueForNormalization();
float norm = getSimilarityProvider().queryNorm(sum); float norm = getSimilarityProvider().queryNorm(v);
if (Float.isInfinite(norm) || Float.isNaN(norm)) if (Float.isInfinite(norm) || Float.isNaN(norm))
norm = 1.0f; norm = 1.0f;
weight.normalize(norm); weight.normalize(norm, 1.0f);
return weight; return weight;
} }
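createNormalizedWeight now spells out the two-pass handshake: collect the raw value for normalization, turn it into a query norm via the SimilarityProvider, then push it back down with an initial top-level boost of 1.0f. The same flow restated as a stand-alone sketch; the wrapper class and method name are illustrative, and the rewrite step from the hunk above is omitted:

    import java.io.IOException;

    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.Weight;

    final class NormalizationSketch {
      // Mirrors the handshake in createNormalizedWeight above (query rewriting not shown).
      static Weight normalize(IndexSearcher searcher, Query query) throws IOException {
        Weight weight = query.createWeight(searcher);
        float v = weight.getValueForNormalization();               // was sumOfSquaredWeights()
        float norm = searcher.getSimilarityProvider().queryNorm(v);
        if (Float.isInfinite(norm) || Float.isNaN(norm)) {
          norm = 1.0f;
        }
        weight.normalize(norm, 1.0f);                              // top-level boost starts at 1.0f
        return weight;
      }
    }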

View File

@ -32,35 +32,17 @@ import java.io.IOException;
*/ */
public class MatchAllDocsQuery extends Query { public class MatchAllDocsQuery extends Query {
public MatchAllDocsQuery() {
this(null);
}
private final String normsField;
/**
* @param normsField Field used for normalization factor (document boost). Null if nothing.
*/
public MatchAllDocsQuery(String normsField) {
this.normsField = normsField;
}
private class MatchAllScorer extends Scorer { private class MatchAllScorer extends Scorer {
final float score; final float score;
final byte[] norms;
private int doc = -1; private int doc = -1;
private final int maxDoc; private final int maxDoc;
private final Bits liveDocs; private final Bits liveDocs;
private final Similarity similarity;
MatchAllScorer(IndexReader reader, Similarity similarity, Weight w, MatchAllScorer(IndexReader reader, Weight w, float score) throws IOException {
byte[] norms) throws IOException {
super(w); super(w);
this.similarity = similarity;
liveDocs = reader.getLiveDocs(); liveDocs = reader.getLiveDocs();
score = w.getValue(); this.score = score;
maxDoc = reader.maxDoc(); maxDoc = reader.maxDoc();
this.norms = norms;
} }
@Override @Override
@ -82,7 +64,7 @@ public class MatchAllDocsQuery extends Query {
@Override @Override
public float score() { public float score() {
return norms == null ? score : score * similarity.decodeNormValue(norms[docID()]); return score;
} }
@Override @Override
@ -93,12 +75,10 @@ public class MatchAllDocsQuery extends Query {
} }
private class MatchAllDocsWeight extends Weight { private class MatchAllDocsWeight extends Weight {
private Similarity similarity;
private float queryWeight; private float queryWeight;
private float queryNorm; private float queryNorm;
public MatchAllDocsWeight(IndexSearcher searcher) { public MatchAllDocsWeight(IndexSearcher searcher) {
this.similarity = normsField == null ? null : searcher.getSimilarityProvider().get(normsField);
} }
@Override @Override
@ -112,33 +92,27 @@ public class MatchAllDocsQuery extends Query {
} }
@Override @Override
public float getValue() { public float getValueForNormalization() {
return queryWeight;
}
@Override
public float sumOfSquaredWeights() {
queryWeight = getBoost(); queryWeight = getBoost();
return queryWeight * queryWeight; return queryWeight * queryWeight;
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; this.queryNorm = queryNorm * topLevelBoost;
queryWeight *= this.queryNorm; queryWeight *= this.queryNorm;
} }
@Override @Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new MatchAllScorer(context.reader, similarity, this, return new MatchAllScorer(context.reader, this, queryWeight);
normsField != null ? context.reader.norms(normsField) : null);
} }
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) { public Explanation explain(AtomicReaderContext context, int doc) {
// explain query weight // explain query weight
Explanation queryExpl = new ComplexExplanation Explanation queryExpl = new ComplexExplanation
(true, getValue(), "MatchAllDocsQuery, product of:"); (true, queryWeight, "MatchAllDocsQuery, product of:");
if (getBoost() != 1.0f) { if (getBoost() != 1.0f) {
queryExpl.addDetail(new Explanation(getBoost(),"boost")); queryExpl.addDetail(new Explanation(getBoost(),"boost"));
} }
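With the normsField constructor gone, MatchAllDocsQuery gives every document the same score, derived only from the query boost and query norm. Usage is correspondingly simpler (the helper is illustrative):

    import org.apache.lucene.search.MatchAllDocsQuery;
    import org.apache.lucene.search.Query;

    final class MatchAllSketch {
      static Query matchAllBoosted(float boost) {
        Query q = new MatchAllDocsQuery(); // the (String normsField) constructor no longer exists
        q.setBoost(boost);                 // per-document norms no longer influence the score
        return q;
      }
    }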

View File

@ -22,12 +22,14 @@ import java.util.*;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
@ -129,45 +131,35 @@ public class MultiPhraseQuery extends Query {
private class MultiPhraseWeight extends Weight { private class MultiPhraseWeight extends Weight {
private Similarity similarity; private final Similarity similarity;
private float value; private final Similarity.Stats stats;
private final IDFExplanation idfExp;
private float idf;
private float queryNorm;
private float queryWeight;
public MultiPhraseWeight(IndexSearcher searcher) public MultiPhraseWeight(IndexSearcher searcher)
throws IOException { throws IOException {
this.similarity = searcher.getSimilarityProvider().get(field); this.similarity = searcher.getSimilarityProvider().get(field);
final ReaderContext context = searcher.getTopReaderContext();
// compute idf // compute idf
ArrayList<Term> allTerms = new ArrayList<Term>(); ArrayList<TermContext> allTerms = new ArrayList<TermContext>();
for(final Term[] terms: termArrays) { for(final Term[] terms: termArrays) {
for (Term term: terms) { for (Term term: terms) {
allTerms.add(term); allTerms.add(TermContext.build(context, term, true));
} }
} }
idfExp = similarity.idfExplain(allTerms, searcher); stats = similarity.computeStats(searcher, field, getBoost(), allTerms.toArray(new TermContext[allTerms.size()]));
idf = idfExp.getIdf();
} }
@Override @Override
public Query getQuery() { return MultiPhraseQuery.this; } public Query getQuery() { return MultiPhraseQuery.this; }
@Override @Override
public float getValue() { return value; } public float getValueForNormalization() {
return stats.getValueForNormalization();
@Override
public float sumOfSquaredWeights() {
queryWeight = idf * getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; stats.normalize(queryNorm, topLevelBoost);
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
} }
@Override @Override
@ -222,8 +214,7 @@ public class MultiPhraseQuery extends Query {
} }
if (slop == 0) { if (slop == 0) {
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity, ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context));
reader.norms(field));
if (s.noDocs) { if (s.noDocs) {
return null; return null;
} else { } else {
@ -231,84 +222,29 @@ public class MultiPhraseQuery extends Query {
} }
} else { } else {
return new SloppyPhraseScorer(this, postingsFreqs, similarity, return new SloppyPhraseScorer(this, postingsFreqs, similarity,
slop, reader.norms(field)); slop, similarity.sloppyDocScorer(stats, field, context));
} }
} }
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
throws IOException {
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation idfExpl = new Explanation(idf, "idf(" + field + ":" + idfExp.explain() +")");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getBoost(), "boost");
if (getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(idfExpl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
idfExpl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
ComplexExplanation fieldExpl = new ComplexExplanation();
fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+
"), product of:");
Scorer scorer = scorer(context, ScorerContext.def()); Scorer scorer = scorer(context, ScorerContext.def());
if (scorer == null) { if (scorer != null) {
return new Explanation(0.0f, "no matching docs"); int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context);
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
result.addDetail(scoreExplanation);
result.setValue(scoreExplanation.getValue());
result.setMatch(true);
return result;
}
} }
Explanation tfExplanation = new Explanation(); return new ComplexExplanation(false, 0.0f, "no matching term");
int d = scorer.advance(doc);
float phraseFreq;
if (d == doc) {
phraseFreq = scorer.freq();
} else {
phraseFreq = 0.0f;
}
tfExplanation.setValue(similarity.tf(phraseFreq));
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(idfExpl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = context.reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch()));
fieldExpl.setValue(tfExplanation.getValue() *
idfExpl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
result.setMatch(fieldExpl.getMatch());
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result;
} }
} }
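The rewritten explain() above no longer reassembles tf, idf and fieldNorm details by hand; it advances a fresh scorer to the target document and delegates to the SloppyDocScorer. A compressed sketch of that pattern, reusable by any Weight; the helper shape and description strings are illustrative:

    import java.io.IOException;

    import org.apache.lucene.search.ComplexExplanation;
    import org.apache.lucene.search.Explanation;
    import org.apache.lucene.search.Scorer;
    import org.apache.lucene.search.Similarity;

    final class ExplainSketch {
      // Illustrative: the explain pattern used by MultiPhraseWeight/PhraseWeight above.
      static Explanation explain(Scorer scorer, Similarity.SloppyDocScorer docScorer, int doc) throws IOException {
        if (scorer != null && scorer.advance(doc) == doc) {
          float freq = scorer.freq();
          Explanation inner = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
          ComplexExplanation result = new ComplexExplanation(true, inner.getValue(), "result of:");
          result.addDetail(inner);
          return result;
        }
        return new ComplexExplanation(false, 0.0f, "no matching term");
      }
    }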

View File

@ -25,7 +25,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
/** /**
* An abstract {@link Query} that matches documents * An abstract {@link Query} that matches documents
@ -154,7 +154,7 @@ public abstract class MultiTermQuery extends Query {
} }
@Override @Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) { protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, TermContext states) {
final TermQuery tq = new TermQuery(term, states); final TermQuery tq = new TermQuery(term, states);
tq.setBoost(boost); tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD); topLevel.add(tq, BooleanClause.Occur.SHOULD);
@ -195,7 +195,7 @@ public abstract class MultiTermQuery extends Query {
} }
@Override @Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) { protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, TermContext states) {
final Query q = new ConstantScoreQuery(new TermQuery(term, states)); final Query q = new ConstantScoreQuery(new TermQuery(term, states));
q.setBoost(boost); q.setBoost(boost);
topLevel.add(q, BooleanClause.Occur.SHOULD); topLevel.add(q, BooleanClause.Occur.SHOULD);
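addClause now receives a TermContext rather than a PerReaderTermState, and TermQuery can be built with one so per-segment term lookups happen once during rewrite. A sketch of constructing such a query directly, following the TermContext.build usage in the MultiPhraseQuery hunk above; the boolean argument's meaning is not spelled out in this diff, so it is simply carried over:

    import java.io.IOException;

    import org.apache.lucene.index.IndexReader.ReaderContext;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.util.TermContext;

    final class TermContextSketch {
      // Illustrative helper: build the per-reader term state once and reuse it in the query.
      static TermQuery termQueryWithState(IndexSearcher searcher, Term term) throws IOException {
        ReaderContext context = searcher.getTopReaderContext();
        TermContext states = TermContext.build(context, term, true); // third argument as used above
        return new TermQuery(term, states);
      }
    }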

View File

@ -22,10 +22,16 @@ import java.util.Set;
import java.util.ArrayList; import java.util.ArrayList;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
@ -171,18 +177,17 @@ public class PhraseQuery extends Query {
private class PhraseWeight extends Weight { private class PhraseWeight extends Weight {
private final Similarity similarity; private final Similarity similarity;
private float value; private final Similarity.Stats stats;
private float idf; private transient TermContext states[];
private float queryNorm;
private float queryWeight;
private IDFExplanation idfExp;
public PhraseWeight(IndexSearcher searcher) public PhraseWeight(IndexSearcher searcher)
throws IOException { throws IOException {
this.similarity = searcher.getSimilarityProvider().get(field); this.similarity = searcher.getSimilarityProvider().get(field);
final ReaderContext context = searcher.getTopReaderContext();
idfExp = similarity.idfExplain(terms, searcher); states = new TermContext[terms.size()];
idf = idfExp.getIdf(); for (int i = 0; i < terms.size(); i++)
states[i] = TermContext.build(context, terms.get(i), true);
stats = similarity.computeStats(searcher, field, getBoost(), states);
} }
@Override @Override
@ -192,19 +197,13 @@ public class PhraseQuery extends Query {
public Query getQuery() { return PhraseQuery.this; } public Query getQuery() { return PhraseQuery.this; }
@Override @Override
public float getValue() { return value; } public float getValueForNormalization() {
return stats.getValueForNormalization();
@Override
public float sumOfSquaredWeights() {
queryWeight = idf * getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; stats.normalize(queryNorm, topLevelBoost);
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
} }
@Override @Override
@ -216,21 +215,26 @@ public class PhraseQuery extends Query {
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()]; PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()];
for (int i = 0; i < terms.size(); i++) { for (int i = 0; i < terms.size(); i++) {
final Term t = terms.get(i); final Term t = terms.get(i);
final TermState state = states[i].get(context.ord);
if (state == null) { /* term doesnt exist in this segment */
assert termNotInReader(reader, field, t.bytes()) : "no termstate found but term exists in reader";
return null;
}
DocsAndPositionsEnum postingsEnum = reader.termPositionsEnum(liveDocs, DocsAndPositionsEnum postingsEnum = reader.termPositionsEnum(liveDocs,
t.field(), t.field(),
t.bytes()); t.bytes(),
state);
// PhraseQuery on a field that did not index // PhraseQuery on a field that did not index
// positions. // positions.
if (postingsEnum == null) { if (postingsEnum == null) {
if (reader.termDocsEnum(liveDocs, t.field(), t.bytes()) != null) { assert (reader.termDocsEnum(liveDocs, t.field(), t.bytes(), state) != null) : "termstate found but no term exists in reader";
// term does exist, but has no positions // term does exist, but has no positions
throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")");
} else {
// term does not exist
return null;
}
} }
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue(), t); // get the docFreq without seeking
TermsEnum te = reader.fields().terms(field).getThreadTermsEnum();
te.seekExact(t.bytes(), state);
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i).intValue(), t);
} }
// sort by increasing docFreq order // sort by increasing docFreq order
@ -239,8 +243,7 @@ public class PhraseQuery extends Query {
} }
if (slop == 0) { // optimize exact case if (slop == 0) { // optimize exact case
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity, ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context));
reader.norms(field));
if (s.noDocs) { if (s.noDocs) {
return null; return null;
} else { } else {
@ -248,96 +251,35 @@ public class PhraseQuery extends Query {
} }
} else { } else {
return return
new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, similarity.sloppyDocScorer(stats, field, context));
reader.norms(field));
} }
} }
private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException {
// only called from assert
final Terms terms = reader.terms(field);
return terms == null || terms.docFreq(bytes) == 0;
}
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
throws IOException {
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
StringBuilder docFreqs = new StringBuilder();
StringBuilder query = new StringBuilder();
query.append('\"');
docFreqs.append(idfExp.explain());
for (int i = 0; i < terms.size(); i++) {
if (i != 0) {
query.append(" ");
}
Term term = terms.get(i);
query.append(term.text());
}
query.append('\"');
Explanation idfExpl =
new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getBoost(), "boost");
if (getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(idfExpl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
idfExpl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
Explanation fieldExpl = new Explanation();
fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+
"), product of:");
Scorer scorer = scorer(context, ScorerContext.def()); Scorer scorer = scorer(context, ScorerContext.def());
if (scorer == null) { if (scorer != null) {
return new Explanation(0.0f, "no matching docs"); int newDoc = scorer.advance(doc);
} if (newDoc == doc) {
Explanation tfExplanation = new Explanation(); float freq = scorer.freq();
int d = scorer.advance(doc); SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context);
float phraseFreq; ComplexExplanation result = new ComplexExplanation();
if (d == doc) { result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
phraseFreq = scorer.freq(); Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
} else { result.addDetail(scoreExplanation);
phraseFreq = 0.0f; result.setValue(scoreExplanation.getValue());
result.setMatch(true);
return result;
}
} }
tfExplanation.setValue(similarity.tf(phraseFreq)); return new ComplexExplanation(false, 0.0f, "no matching term");
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(idfExpl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = context.reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setValue(tfExplanation.getValue() *
idfExpl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
result.setMatch(tfExplanation.isMatch());
return result;
} }
} }
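Taken together, the PhraseWeight hunks above show the full Similarity.Stats lifecycle: computeStats at Weight construction, getValueForNormalization/normalize during query normalization, then a per-segment ExactDocScorer (or SloppyDocScorer) doing the actual math. A single-method sketch of that lifecycle, using only calls visible in this diff; the method shape and parameter names are illustrative:

    import java.io.IOException;

    import org.apache.lucene.index.IndexReader.AtomicReaderContext;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Similarity;
    import org.apache.lucene.util.TermContext;

    final class StatsLifecycleSketch {
      // Illustrative: mirrors the PhraseWeight lifecycle shown above for a single segment.
      static float scoreOnce(IndexSearcher searcher, String field, float boost,
                             TermContext[] states, float queryNorm, float topLevelBoost,
                             AtomicReaderContext segment, int doc, int freq) throws IOException {
        Similarity sim = searcher.getSimilarityProvider().get(field);
        Similarity.Stats stats = sim.computeStats(searcher, field, boost, states);
        stats.getValueForNormalization();           // collected by the top-level Weight
        stats.normalize(queryNorm, topLevelBoost);  // pushed back down once the query norm is known
        Similarity.ExactDocScorer scorer = sim.exactDocScorer(stats, field, segment);
        return scorer.score(doc, freq);             // tf/idf/norm math now lives in the Similarity
      }
    }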

View File

@ -30,9 +30,6 @@ import java.io.IOException;
* means a match. * means a match.
*/ */
abstract class PhraseScorer extends Scorer { abstract class PhraseScorer extends Scorer {
protected byte[] norms;
protected float value;
private boolean firstTime = true; private boolean firstTime = true;
private boolean more = true; private boolean more = true;
protected PhraseQueue pq; protected PhraseQueue pq;
@ -40,14 +37,12 @@ abstract class PhraseScorer extends Scorer {
private float freq; //phrase frequency in current doc as computed by phraseFreq(). private float freq; //phrase frequency in current doc as computed by phraseFreq().
protected final Similarity similarity; protected final Similarity.SloppyDocScorer docScorer;
PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity similarity, byte[] norms) { Similarity.SloppyDocScorer docScorer) throws IOException {
super(weight); super(weight);
this.similarity = similarity; this.docScorer = docScorer;
this.norms = norms;
this.value = weight.getValue();
// convert tps to a list of phrase positions. // convert tps to a list of phrase positions.
// note: phrase-position differs from term-position in that its position // note: phrase-position differs from term-position in that its position
@ -107,9 +102,7 @@ abstract class PhraseScorer extends Scorer {
@Override @Override
public float score() throws IOException { public float score() throws IOException {
//System.out.println("scoring " + first.doc); return docScorer.score(first.doc, freq);
float raw = similarity.tf(freq) * value; // raw score
return norms == null ? raw : raw * similarity.decodeNormValue(norms[first.doc]); // normalize
} }
@Override @Override

View File

@ -28,7 +28,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
@ -56,7 +56,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
@Override @Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount, protected void addClause(BooleanQuery topLevel, Term term, int docCount,
float boost, PerReaderTermState states) { float boost, TermContext states) {
final TermQuery tq = new TermQuery(term, states); final TermQuery tq = new TermQuery(term, states);
tq.setBoost(boost); tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD); topLevel.add(tq, BooleanClause.Occur.SHOULD);
@ -117,7 +117,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
if (size > 0) { if (size > 0) {
final int sort[] = col.terms.sort(col.termsEnum.getComparator()); final int sort[] = col.terms.sort(col.termsEnum.getComparator());
final float[] boost = col.array.boost; final float[] boost = col.array.boost;
final PerReaderTermState[] termStates = col.array.termState; final TermContext[] termStates = col.array.termState;
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
final int pos = sort[i]; final int pos = sort[i];
final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef())); final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
@ -150,12 +150,12 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
if (e < 0 ) { if (e < 0 ) {
// duplicate term: update docFreq // duplicate term: update docFreq
final int pos = (-e)-1; final int pos = (-e)-1;
array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq()); array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums"; assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
} else { } else {
// new entry: we populate the entry initially // new entry: we populate the entry initially
array.boost[e] = boostAtt.getBoost(); array.boost[e] = boostAtt.getBoost();
array.termState[e] = new PerReaderTermState(topReaderContext, state, readerContext.ord, termsEnum.docFreq()); array.termState[e] = new TermContext(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
ScoringRewrite.this.checkMaxClauseCount(terms.size()); ScoringRewrite.this.checkMaxClauseCount(terms.size());
} }
return true; return true;
@ -165,7 +165,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */ /** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
static final class TermFreqBoostByteStart extends DirectBytesStartArray { static final class TermFreqBoostByteStart extends DirectBytesStartArray {
float[] boost; float[] boost;
PerReaderTermState[] termState; TermContext[] termState;
public TermFreqBoostByteStart(int initSize) { public TermFreqBoostByteStart(int initSize) {
super(initSize); super(initSize);
@ -175,7 +175,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
public int[] init() { public int[] init() {
final int[] ord = super.init(); final int[] ord = super.init();
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)]; boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length && boost.length >= ord.length; assert termState.length >= ord.length && boost.length >= ord.length;
return ord; return ord;
} }
@ -185,7 +185,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
final int[] ord = super.grow(); final int[] ord = super.grow();
boost = ArrayUtil.grow(boost, ord.length); boost = ArrayUtil.grow(boost, ord.length);
if (termState.length < ord.length) { if (termState.length < ord.length) {
PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length); System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState; termState = tmpTermState;
} }

View File

@ -19,594 +19,111 @@ package org.apache.lucene.search;
import java.io.IOException; import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.document.IndexDocValuesField; // javadoc
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; // javadoc
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.SmallFloat; import org.apache.lucene.index.Terms; // javadoc
import org.apache.lucene.search.spans.SpanQuery; // javadoc
import org.apache.lucene.util.SmallFloat; // javadoc
import org.apache.lucene.util.TermContext;
/** /**
* Similarity defines the components of Lucene scoring.
* <p>
* Expert: Scoring API. * Expert: Scoring API.
* * <p>
* <p>Similarity defines the components of Lucene scoring. * This is a low-level API, you should only extend this API if you want to implement
* Overriding computation of these components is a convenient * an information retrieval <i>model</i>. If you are instead looking for a convenient way
* way to alter Lucene scoring. * to alter Lucene's scoring, consider extending a higher-level implementation
* * such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or
* <p>Suggested reading: * just tweaking the default implementation: {@link DefaultSimilarity}.
* <a href="http://nlp.stanford.edu/IR-book/html/htmledition/queries-as-vectors-1.html"> * <p>
* Introduction To Information Retrieval, Chapter 6</a>. * Similarity determines how Lucene weights terms, and Lucene interacts with
* * this class at both <a href="#indextime">index-time</a> and
* <p>The following describes how Lucene scoring evolves from * <a href="#querytime">query-time</a>.
* underlying information retrieval models to (efficient) implementation. * <p>
* We first brief on <i>VSM Score</i>, * <a name="indextime"/>
* then derive from it <i>Lucene's Conceptual Scoring Formula</i>, * At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing
* from which, finally, evolves <i>Lucene's Practical Scoring Function</i> * the Similarity implementation to return a per-document byte for the field that will
* (the latter is connected directly with Lucene classes and methods). * be later accessible via {@link IndexReader#norms(String)}. Lucene makes no assumption
* * about what is in this byte, but it is most useful for encoding length normalization
* <p>Lucene combines * information.
* <a href="http://en.wikipedia.org/wiki/Standard_Boolean_model"> * <p>
* Boolean model (BM) of Information Retrieval</a> * Implementations should carefully consider how the normalization byte is encoded: while
* with * Lucene's classical {@link TFIDFSimilarity} encodes a combination of index-time boost
* <a href="http://en.wikipedia.org/wiki/Vector_Space_Model"> * and length normalization information with {@link SmallFloat}, this might not be suitable
* Vector Space Model (VSM) of Information Retrieval</a> - * for all purposes.
* documents "approved" by BM are scored by VSM. * <p>
* * Many formulas require the use of average document length, which can be computed via a
* <p>In VSM, documents and queries are represented as * combination of {@link Terms#getSumTotalTermFreq()} and {@link IndexReader#maxDoc()},
 * weighted vectors in a multi-dimensional space, * combination of {@link Terms#getSumTotalTermFreq()} and {@link IndexReader#maxDoc()}.
* where each distinct index term is a dimension, * Because index-time boost is handled entirely at the application level anyway,
* and weights are * an application can alternatively store the index-time boost separately using an
* <a href="http://en.wikipedia.org/wiki/Tfidf">Tf-idf</a> values. * {@link IndexDocValuesField}, and access this at query-time with
* * {@link IndexReader#docValues(String)}.
* <p>VSM does not require weights to be <i>Tf-idf</i> values, * <p>
* but <i>Tf-idf</i> values are believed to produce search results of high quality, * Finally, using index-time boosts (either via folding into the normalization byte or
* and so Lucene is using <i>Tf-idf</i>. * via IndexDocValues), is an inefficient way to boost the scores of different fields if the
* <i>Tf</i> and <i>Idf</i> are described in more detail below, * boost will be the same for every document, instead the Similarity can simply take a constant
* but for now, for completion, let's just say that * boost parameter <i>C</i>, and the SimilarityProvider can return different instances with
* for given term <i>t</i> and document (or query) <i>x</i>, * different boosts depending upon field name.
* <i>Tf(t,x)</i> varies with the number of occurrences of term <i>t</i> in <i>x</i> * <p>
* (when one increases so does the other) and * <a name="querytime"/>
* <i>idf(t)</i> similarly varies with the inverse of the * At query-time, Queries interact with the Similarity via these steps:
* number of index documents containing term <i>t</i>.
*
* <p><i>VSM score</i> of document <i>d</i> for query <i>q</i> is the
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a>
* of the weighted query vectors <i>V(q)</i> and <i>V(d)</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* cosine-similarity(q,d) &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>V(q)&nbsp;&middot;&nbsp;V(d)</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>|V(q)|&nbsp;|V(d)|</small></td></tr>
* </table>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font=-1><u>VSM Score</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
*
* Where <i>V(q)</i> &middot; <i>V(d)</i> is the
* <a href="http://en.wikipedia.org/wiki/Dot_product">dot product</a>
* of the weighted vectors,
* and <i>|V(q)|</i> and <i>|V(d)|</i> are their
* <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norms</a>.
*
* <p>Note: the above equation can be viewed as the dot product of
* the normalized weighted vectors, in the sense that dividing
* <i>V(q)</i> by its euclidean norm is normalizing it to a unit vector.
*
* <p>Lucene refines <i>VSM score</i> for both search quality and usability:
* <ul>
* <li>Normalizing <i>V(d)</i> to the unit vector is known to be problematic in that
* it removes all document length information.
* For some documents removing this info is probably ok,
* e.g. a document made by duplicating a certain paragraph <i>10</i> times,
* especially if that paragraph is made of distinct terms.
* But for a document which contains no duplicated paragraphs,
* this might be wrong.
* To avoid this problem, a different document length normalization
* factor is used, which normalizes to a vector equal to or larger
* than the unit vector: <i>doc-len-norm(d)</i>.
* </li>
*
* <li>At indexing, users can specify that certain documents are more
* important than others, by assigning a document boost.
* For this, the score of each document is also multiplied by its boost value
* <i>doc-boost(d)</i>.
* </li>
*
* <li>Lucene is field based, hence each query term applies to a single
* field, document length normalization is by the length of the certain field,
* and in addition to document boost there are also document fields boosts.
* </li>
*
* <li>The same field can be added to a document during indexing several times,
* and so the boost of that field is the multiplication of the boosts of
* the separate additions (or parts) of that field within the document.
* </li>
*
* <li>At search time users can specify boosts to each query, sub-query, and
* each query term, hence the contribution of a query term to the score of
* a document is multiplied by the boost of that query term <i>query-boost(q)</i>.
* </li>
*
* <li>A document may match a multi term query without containing all
* the terms of that query (this is correct for some of the queries),
* and users can further reward documents matching more query terms
* through a coordination factor, which is usually larger when
* more terms are matched: <i>coord-factor(q,d)</i>.
* </li>
* </ul>
*
* <p>Under the simplifying assumption of a single field in the index,
* we get <i>Lucene's Conceptual scoring formula</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <font color="#FF9933">coord-factor(q,d)</font> &middot; &nbsp;
* <font color="#CCCC00">query-boost(q)</font> &middot; &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small><font color="#993399">V(q)&nbsp;&middot;&nbsp;V(d)</font></small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small><font color="#FF33CC">|V(q)|</font></small></td></tr>
* </table>
* </td>
* <td valign="middle" align="right" rowspan="1">
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-len-norm(d)</font>
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-boost(d)</font>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font=-1><u>Lucene Conceptual Scoring Formula</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
* <p>The conceptual formula is a simplification in the sense that (1) terms and documents
* are fielded and (2) boosts are usually per query term rather than per query.
*
* <p>We now describe how Lucene implements this conceptual scoring formula, and
* derive from it <i>Lucene's Practical Scoring Function</i>.
*
* <p>For efficient score computation some scoring components
* are computed and aggregated in advance:
*
* <ul>
* <li><i>Query-boost</i> for the query (actually for each query term)
* is known when search starts.
* </li>
*
* <li>Query Euclidean norm <i>|V(q)|</i> can be computed when search starts,
* as it is independent of the document being scored.
* From search optimization perspective, it is a valid question
* why bother to normalize the query at all, because all
* scored documents will be multiplied by the same <i>|V(q)|</i>,
* and hence documents ranks (their order by score) will not
* be affected by this normalization.
* There are two good reasons to keep this normalization:
* <ul>
* <li>Recall that
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a> can be used find how similar
* two documents are. One can use Lucene for e.g.
* clustering, and use a document as a query to compute
* its similarity to other documents.
* In this use case it is important that the score of document <i>d3</i>
* for query <i>d1</i> is comparable to the score of document <i>d3</i>
* for query <i>d2</i>. In other words, scores of a document for two
* distinct queries should be comparable.
* There are other applications that may require this.
* And this is exactly what normalizing the query vector <i>V(q)</i>
* provides: comparability (to a certain extent) of two or more queries.
* </li>
*
* <li>Applying query normalization on the scores helps to keep the
* scores around the unit vector, hence preventing loss of score data
* because of floating point precision limitations.
* </li>
* </ul>
* </li>
*
* <li>Document length norm <i>doc-len-norm(d)</i> and document
* boost <i>doc-boost(d)</i> are known at indexing time.
* They are computed in advance and their multiplication
* is saved as a single value in the index: <i>norm(d)</i>.
* (In the equations below, <i>norm(t in d)</i> means <i>norm(field(t) in doc d)</i>
* where <i>field(t)</i> is the field associated with term <i>t</i>.)
* </li>
* </ul>
*
* <p><i>Lucene's Practical Scoring Function</i> is derived from the above.
* The color codes demonstrate how it relates
* to those of the <i>conceptual</i> formula:
*
* <P>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="" cellspacing="2" border="2" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <A HREF="#formula_coord"><font color="#FF9933">coord(q,d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_queryNorm"><font color="#FF33CC">queryNorm(q)</font></A> &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_tf"><font color="#993399">tf(t in d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_idf"><font color="#993399">idf(t)</font></A><sup>2</sup> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost"><font color="#CCCC00">t.getBoost()</font></A>&nbsp;&middot;&nbsp;
* <A HREF="#formula_norm"><font color="#3399FF">norm(t,d)</font></A>
* <big><big>)</big></big>
* </td>
* </tr>
* <tr valigh="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font=-1><u>Lucene Practical Scoring Function</u></font></center>
* </td></tr>
* </table>
*
* <p> where
* <ol> * <ol>
* <li> * <li>The {@link #computeStats(IndexSearcher, String, float, TermContext...)} method is called a single time,
* <A NAME="formula_tf"></A> * allowing the implementation to compute any statistics (such as IDF, average document length, etc)
* <b><i>tf(t in d)</i></b> * across <i>the entire collection</i>. The {@link TermContext}s passed in are already positioned
 * correlates to the term's <i>frequency</i>, * to the terms involved, with their raw statistics available, so a Similarity can freely use any combination
* defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>. * of term statistics without causing any additional I/O. Lucene makes no assumption about what is
* Documents that have more occurrences of a given term receive a higher score. * stored in the returned {@link Similarity.Stats} object.
* Note that <i>tf(t in q)</i> is assumed to be <i>1</i> and therefore it does not appear in this equation, * <li>The query normalization process occurs a single time: {@link Similarity.Stats#getValueForNormalization()}
* However if a query contains twice the same term, there will be * is called for each query leaf node, {@link SimilarityProvider#queryNorm(float)} is called for the top-level
* two term-queries with that same term and hence the computation would still be correct (although * query, and finally {@link Similarity.Stats#normalize(float, float)} passes down the normalization value
* not very efficient). * and any top-level boosts (e.g. from enclosing {@link BooleanQuery}s).
* The default computation for <i>tf(t in d)</i> in * <li>For each segment in the index, the Query creates a {@link #exactDocScorer(Stats, String, IndexReader.AtomicReaderContext)}
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is: * (for queries with exact frequencies such as TermQuerys and exact PhraseQueries) or a
* * {@link #sloppyDocScorer(Stats, String, IndexReader.AtomicReaderContext)} (for queries with sloppy frequencies such as
* <br>&nbsp;<br> * SpanQuerys and sloppy PhraseQueries). The score() method is called for each matching document.
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} &nbsp; = &nbsp;
* </td>
* <td valign="top" align="center" rowspan="1">
* frequency<sup><big>&frac12;</big></sup>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_idf"></A>
* <b><i>idf(t)</i></b> stands for Inverse Document Frequency. This value
* correlates to the inverse of <i>docFreq</i>
* (the number of documents in which the term <i>t</i> appears).
* This means rarer terms give higher contribution to the total score.
* <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
* hence it is squared in the equation.
* The default computation for <i>idf(t)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right">
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}&nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* 1 + log <big>(</big>
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>numDocs</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>docFreq+1</small></td></tr>
* </table>
* </td>
* <td valign="middle" align="center">
* <big>)</big>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_coord"></A>
* <b><i>coord(q,d)</i></b>
* is a score factor based on how many of the query terms are found in the specified document.
* Typically, a document that contains more of the query's terms will receive a higher score
* than another document with fewer query terms.
* This is a search time factor computed in
* {@link SimilarityProvider#coord(int, int) coord(q,d)}
* by the SimilarityProvider in effect at search time.
* <br>&nbsp;<br>
* </li>
*
* <li><b>
* <A NAME="formula_queryNorm"></A>
* <i>queryNorm(q)</i>
* </b>
* is a normalizing factor used to make scores between queries comparable.
* This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
* but rather just attempts to make scores from different queries (or even different indexes) comparable.
* This is a search time factor computed by the SimilarityProvider in effect at search time.
*
* The default computation in
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
* produces a <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norm</a>:
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* queryNorm(q) &nbsp; = &nbsp;
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
* &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center" rowspan="1">
* <table>
* <tr><td align="center"><big>1</big></td></tr>
* <tr><td align="center"><big>
* &ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;
* </big></td></tr>
* <tr><td align="center">sumOfSquaredWeights<sup><big>&frac12;</big></sup></td></tr>
* </table>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* The sum of squared weights (of the query terms) is
* computed by the query {@link org.apache.lucene.search.Weight} object.
* For example, a {@link org.apache.lucene.search.BooleanQuery}
* computes this value as:
*
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights} &nbsp; = &nbsp;
* {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_idf">idf(t)</A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost">t.getBoost()</A>
* <big><big>) <sup>2</sup> </big></big>
* </td>
* </tr>
* <tr valigh="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* </li>
*
* <li>
* <A NAME="formula_termBoost"></A>
* <b><i>t.getBoost()</i></b>
* is a search time boost of term <i>t</i> in the query <i>q</i> as
* specified in the query text
* (see <A HREF="../../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
* or as set by application calls to
* {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
* Notice that there is really no direct API for accessing a boost of one term in a multi term query,
* but rather multi terms are represented in a query as multi
* {@link org.apache.lucene.search.TermQuery TermQuery} objects,
* and so the boost of a term in the query is accessible by calling the sub-query
* {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_norm"></A>
* <b><i>norm(t,d)</i></b> encapsulates a few (indexing time) boost and length factors:
*
* <ul>
* <li><b>Document boost</b> - set by calling
* {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
* before adding the document to the index.
* </li>
* <li><b>Field boost</b> - set by calling
* {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
* before adding the field to a document.
* </li>
* <li><b>lengthNorm</b> - computed
* when the document is added to the index in accordance with the number of tokens
* of this field in the document, so that shorter fields contribute more to the score.
* LengthNorm is computed by the Similarity class in effect at indexing.
* </li>
* </ul>
* The {@link #computeNorm} method is responsible for
* combining all of these factors into a single float.
*
* <p>
* When a document is added to the index, all the above factors are multiplied.
* If the document has multiple fields with the same name, all their boosts are multiplied together:
*
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* norm(t,d) &nbsp; = &nbsp;
* {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
* &nbsp;&middot;&nbsp;
* lengthNorm
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&prod;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
* </td>
* </tr>
* <tr valigh="top">
* <td></td>
* <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
* However the resulted <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
* before being stored.
* At search time, the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
* For instance, <i>decode(encode(0.89)) = 0.75</i>.
* <br>&nbsp;<br>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for
* all documents - are maintained in memory.
* <br>&nbsp;<br>
* The rationale supporting such lossy compression of norm values is that
* given the difficulty (and inaccuracy) of users to express their true information
* need by a query, only big differences matter.
* <br>&nbsp;<br>
* Last, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
* using a different {@link Similarity} for search.
* <br>&nbsp;<br>
* </li>
* </ol> * </ol>
* <p>
* <a name="explaintime"/>
* When {@link IndexSearcher#explain(Query, int)} is called, queries consult the Similarity's DocScorer for an
 * explanation of how it computed its score. The query passes in the document id and an explanation of how the frequency
* was computed.
* *
* @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider) * @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)
* @see IndexSearcher#setSimilarityProvider(SimilarityProvider) * @see IndexSearcher#setSimilarityProvider(SimilarityProvider)
* @lucene.experimental
*/ */
public abstract class Similarity { public abstract class Similarity {
public static final int NO_DOC_ID_PROVIDED = -1; public static final int NO_DOC_ID_PROVIDED = -1;
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++)
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
/** Decodes a normalization factor stored in an index.
* @see #encodeNormValue(float)
*/
public float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
/** /**
* Computes the normalization value for a field, given the accumulated * Computes the normalization value for a field, given the accumulated
* state of term processing for this field (see {@link FieldInvertState}). * state of term processing for this field (see {@link FieldInvertState}).
* *
* <p>Implementations should calculate a float value based on the field * <p>Implementations should calculate a byte value based on the field
* state and then return that value. * state and then return that value.
* *
* <p>Matches in longer fields are less precise, so implementations of this * <p>Matches in longer fields are less precise, so implementations of this
* method usually return smaller values when <code>state.getLength()</code> is large, * method usually return smaller values when <code>state.getLength()</code> is large,
* and larger values when <code>state.getLength()</code> is small. * and larger values when <code>state.getLength()</code> is small.
* *
* <p>Note that the return values are computed under
* {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
* and then stored using
* {@link #encodeNormValue(float)}.
* Thus they have limited precision, and documents
* must be re-indexed if this method is altered.
*
* @lucene.experimental * @lucene.experimental
* *
* @param state current processing state for this field * @param state current processing state for this field
* @return the calculated float norm * @return the calculated byte norm
*/ */
public abstract float computeNorm(FieldInvertState state); public abstract byte computeNorm(FieldInvertState state);
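A minimal sketch of an implementation, written as it might appear in a Similarity subclass: it folds the field boost and a classic 1/sqrt(length) length norm into the stored byte, and assumes org.apache.lucene.util.SmallFloat is imported. The formula and the overlap discount are illustrative choices, not mandated by this API.

    // Illustrative only: encode boost * 1/sqrt(#terms) into the per-document norm byte.
    @Override
    public byte computeNorm(FieldInvertState state) {
      final int numTerms = state.getLength() - state.getNumOverlap(); // optionally ignore overlapping tokens
      final float lengthNorm = (float) (1.0 / Math.sqrt(numTerms));
      return SmallFloat.floatToByte315(state.getBoost() * lengthNorm);
    }

An application that needs the index-time boost separately (as the class javadoc above suggests) could instead leave it out of this byte and store it in an IndexDocValuesField.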
/** Encodes a normalization factor for storage in an index.
*
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
* the zero-exponent point at 15, thus
* representing values from around 7x10^9 to 2x10^-9 with about one
* significant decimal digit of accuracy. Zero is also represented.
* Negative numbers are rounded up to zero. Values too large to represent
* are rounded down to the largest representable value. Positive values too
* small to represent are rounded up to the smallest positive representable
* value.
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
public byte encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* <p>The default implementation calls {@link #tf(float)}.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public float tf(int freq) {
return tf((float)freq);
}
/** Computes the amount of a sloppy phrase match, based on an edit distance. /** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form * This value is summed for each sloppy phrase match in a document to form
* the frequency that is passed to {@link #tf(float)}. * the frequency to be used in scoring instead of the exact term count.
* *
* <p>A phrase match with a small edit distance to a document passage more * <p>A phrase match with a small edit distance to a document passage more
* closely matches the document, so implementations of this method usually * closely matches the document, so implementations of this method usually
@ -619,124 +136,6 @@ public abstract class Similarity {
*/ */
public abstract float sloppyFreq(int distance); public abstract float sloppyFreq(int distance);
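For reference, the long-standing default here is a simple inverse of the edit distance; a sketch of such an implementation inside a Similarity subclass:

    // Closer sloppy matches (smaller edit distance) contribute a larger partial frequency.
    @Override
    public float sloppyFreq(int distance) {
      return 1.0f / (distance + 1);
    }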
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public abstract float tf(float freq);
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre>
* idf(docFreq, searcher.maxDoc());
* </pre>
*
* Note that {@link IndexSearcher#maxDoc()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
* {@link IndexSearcher#docFreq(Term)} is used, and when the latter
* is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
* In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute
*
* @param term the term in question
* @param searcher the document collection being searched
* @param docFreq externally computed docFreq for this term
* @return an IDFExplain object that includes both an idf score factor
and an explanation for the term.
* @throws IOException
*/
public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher, int docFreq) throws IOException {
final int df = docFreq;
final int max = searcher.maxDoc();
final float idf = idf(df, max);
return new IDFExplanation() {
@Override
public String explain() {
return "idf(docFreq=" + df +
", maxDocs=" + max + ")";
}
@Override
public float getIdf() {
return idf;
}};
}
/**
* This method forwards to {@link
* #idfExplain(Term,IndexSearcher,int)} by passing
* <code>searcher.docFreq(term)</code> as the docFreq.
*/
public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher) throws IOException {
return idfExplain(term, searcher, searcher.docFreq(term));
}
/**
* Computes a score factor for a phrase.
*
* <p>
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param terms the terms in the phrase
* @param searcher the document collection being searched
* @return an IDFExplain object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
* @throws IOException
*/
public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException {
final int max = searcher.maxDoc();
float idf = 0.0f;
final StringBuilder exp = new StringBuilder();
for (final Term term : terms ) {
final int df = searcher.docFreq(term);
idf += idf(df, max);
exp.append(" ");
exp.append(term.text());
exp.append("=");
exp.append(df);
}
final float fIdf = idf;
return new IDFExplanation() {
@Override
public float getIdf() {
return fIdf;
}
@Override
public String explain() {
return exp.toString();
}
};
}
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* {@link #tf(int)} factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* <p>Terms that occur in fewer documents are better indicators of topic, so
* implementations of this method usually return larger values for rare terms,
* and smaller values for common terms.
*
* @param docFreq the number of documents which contain the term
* @param numDocs the total number of documents in the collection
* @return a score factor based on the term's document frequency
*/
public abstract float idf(int docFreq, int numDocs);
/** /**
* Calculate a scoring factor based on the data in the payload. Overriding implementations * Calculate a scoring factor based on the data in the payload. Overriding implementations
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
@ -759,4 +158,100 @@ public abstract class Similarity {
return 1; return 1;
} }
/**
* Compute any collection-level stats (e.g. IDF, average document length, etc) needed for scoring a query.
*/
public abstract Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException;
/**
 * Returns a new {@link Similarity.ExactDocScorer}.
*/
public abstract ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException;
/**
* returns a new {@link Similarity.SloppyDocScorer}.
*/
public abstract SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException;
/**
* API for scoring exact queries such as {@link TermQuery} and
* exact {@link PhraseQuery}.
* <p>
* Term frequencies are integers (the term or phrase's tf)
*/
public abstract class ExactDocScorer {
/**
* Score a single document
* @param doc document id
* @param freq term frequency
* @return document's score
*/
public abstract float score(int doc, int freq);
/**
* Explain the score for a single document
* @param doc document id
* @param freq Explanation of how the term frequency was computed
* @return document's score
*/
public Explanation explain(int doc, Explanation freq) {
Explanation result = new Explanation(score(doc, (int)freq.getValue()),
"score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:");
result.addDetail(freq);
return result;
}
}
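A hedged sketch of the corresponding factory method, as it might look inside the hypothetical SketchSimilarity subclass sketched after this class: SketchStats and its normalizedWeight field are assumptions (see below), SmallFloat is assumed to be imported, and the weight * sqrt(freq) * norm formula is just one possible choice.

    @Override
    public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
      final float weight = ((SketchStats) stats).normalizedWeight; // hypothetical Stats subclass, see below
      final byte[] norms = context.reader.norms(fieldName);        // may be null if norms were omitted
      return new ExactDocScorer() {
        @Override
        public float score(int doc, int freq) {
          final float raw = weight * (float) Math.sqrt(freq);                       // weight * tf
          return norms == null ? raw : raw * SmallFloat.byte315ToFloat(norms[doc]); // * length norm
        }
      };
    }

A sloppyDocScorer factory would look the same except that its score(int, float) receives the accumulated sloppy frequency rather than an exact term count.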
/**
* API for scoring "sloppy" queries such as {@link SpanQuery} and
* sloppy {@link PhraseQuery}.
* <p>
* Term frequencies are floating point values.
*/
public abstract class SloppyDocScorer {
/**
* Score a single document
* @param doc document id
* @param freq sloppy term frequency
* @return document's score
*/
public abstract float score(int doc, float freq);
/**
* Explain the score for a single document
* @param doc document id
* @param freq Explanation of how the sloppy term frequency was computed
* @return document's score
*/
public Explanation explain(int doc, Explanation freq) {
Explanation result = new Explanation(score(doc, freq.getValue()),
"score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:");
result.addDetail(freq);
return result;
}
}
/** Stores the statistics for the indexed collection. This abstract
* implementation is empty; descendants of {@code Similarity} should
* subclass {@code Stats} and define the statistics they require in the
* subclass. Examples include idf, average field length, etc.
*/
public static abstract class Stats {
/** The value for normalization of contained query clauses (e.g. sum of squared weights).
* <p>
 * NOTE: a Similarity implementation might not use any query normalization at all;
 * it's not required. However, if it wants to participate in query normalization,
* it can return a value here.
*/
public abstract float getValueForNormalization();
/** Assigns the query normalization factor and boost from parent queries to this.
* <p>
 * NOTE: a Similarity implementation might not use this normalized value at all;
 * it's not required. However, it's usually a good idea to at least incorporate
* the topLevelBoost (e.g. from an outer BooleanQuery) into its score.
*/
public abstract void normalize(float queryNorm, float topLevelBoost);
}
} }
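To tie the query-time steps above together, here is a minimal, hypothetical sketch of a Stats subclass plus computeStats. The class names, the normalizedWeight field, and the classical idf formula are assumptions for illustration, not part of this commit; the class is declared abstract so that computeNorm, sloppyFreq and the doc scorers (sketched earlier) can be supplied separately.

    package org.apache.lucene.search;

    import java.io.IOException;

    import org.apache.lucene.util.TermContext;

    abstract class SketchSimilarity extends Similarity {

      /** Hypothetical statistics object: a combined idf, the query boost, and the final weight. */
      static class SketchStats extends Stats {
        final float idf;
        final float queryBoost;
        float normalizedWeight;     // filled in by normalize()

        SketchStats(float idf, float queryBoost) {
          this.idf = idf;
          this.queryBoost = queryBoost;
        }

        @Override
        public float getValueForNormalization() {
          final float w = idf * queryBoost;
          return w * w;             // this clause's contribution to the sum of squared weights
        }

        @Override
        public void normalize(float queryNorm, float topLevelBoost) {
          // fold the outer boost and the query norm into the weight used by the doc scorers
          normalizedWeight = idf * queryBoost * topLevelBoost * queryNorm;
        }
      }

      @Override
      public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
        float idf = 0.0f;
        for (TermContext ctx : termContexts) {
          // classical idf, computed only from statistics already gathered in the TermContext
          idf += (float) (1.0 + Math.log(searcher.maxDoc() / (double) (ctx.docFreq() + 1)));
        }
        return new SketchStats(idf, queryBoost);
      }
    }

The per-segment exactDocScorer/sloppyDocScorer factories (sketched above) then read normalizedWeight; TFIDFSimilarity, added in this change, implements the same contract for the classical vector space model.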

View File

@ -25,11 +25,13 @@ final class SloppyPhraseScorer extends PhraseScorer {
private PhrasePositions repeats[]; private PhrasePositions repeats[];
private PhrasePositions tmpPos[]; // for flipping repeating pps. private PhrasePositions tmpPos[]; // for flipping repeating pps.
private boolean checkedRepeats; private boolean checkedRepeats;
private final Similarity similarity;
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity, SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
int slop, byte[] norms) { int slop, Similarity.SloppyDocScorer docScorer) throws IOException {
super(weight, postings, similarity, norms); super(weight, postings, docScorer);
this.slop = slop; this.slop = slop;
this.similarity = similarity;
} }
/** /**

View File

@ -0,0 +1,831 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.SmallFloat;
/**
* Implementation of {@link Similarity} with the Vector Space Model.
* <p>
* Expert: Scoring API.
* <p>TFIDFSimilarity defines the components of Lucene scoring.
* Overriding computation of these components is a convenient
* way to alter Lucene scoring.
*
* <p>Suggested reading:
* <a href="http://nlp.stanford.edu/IR-book/html/htmledition/queries-as-vectors-1.html">
* Introduction To Information Retrieval, Chapter 6</a>.
*
* <p>The following describes how Lucene scoring evolves from
* underlying information retrieval models to (efficient) implementation.
* We first brief on <i>VSM Score</i>,
* then derive from it <i>Lucene's Conceptual Scoring Formula</i>,
* from which, finally, evolves <i>Lucene's Practical Scoring Function</i>
* (the latter is connected directly with Lucene classes and methods).
*
* <p>Lucene combines
* <a href="http://en.wikipedia.org/wiki/Standard_Boolean_model">
* Boolean model (BM) of Information Retrieval</a>
* with
* <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
* Vector Space Model (VSM) of Information Retrieval</a> -
* documents "approved" by BM are scored by VSM.
*
* <p>In VSM, documents and queries are represented as
* weighted vectors in a multi-dimensional space,
* where each distinct index term is a dimension,
* and weights are
* <a href="http://en.wikipedia.org/wiki/Tfidf">Tf-idf</a> values.
*
* <p>VSM does not require weights to be <i>Tf-idf</i> values,
* but <i>Tf-idf</i> values are believed to produce search results of high quality,
* and so Lucene is using <i>Tf-idf</i>.
* <i>Tf</i> and <i>Idf</i> are described in more detail below,
* but for now, for completion, let's just say that
* for given term <i>t</i> and document (or query) <i>x</i>,
* <i>Tf(t,x)</i> varies with the number of occurrences of term <i>t</i> in <i>x</i>
* (when one increases so does the other) and
* <i>idf(t)</i> similarly varies with the inverse of the
* number of index documents containing term <i>t</i>.
*
* <p><i>VSM score</i> of document <i>d</i> for query <i>q</i> is the
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a>
* of the weighted query vectors <i>V(q)</i> and <i>V(d)</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* cosine-similarity(q,d) &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>V(q)&nbsp;&middot;&nbsp;V(d)</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>|V(q)|&nbsp;|V(d)|</small></td></tr>
* </table>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
 * <center><font size="-1"><u>VSM Score</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
*
* Where <i>V(q)</i> &middot; <i>V(d)</i> is the
* <a href="http://en.wikipedia.org/wiki/Dot_product">dot product</a>
* of the weighted vectors,
* and <i>|V(q)|</i> and <i>|V(d)|</i> are their
* <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norms</a>.
*
* <p>Note: the above equation can be viewed as the dot product of
* the normalized weighted vectors, in the sense that dividing
* <i>V(q)</i> by its euclidean norm is normalizing it to a unit vector.
*
* <p>Lucene refines <i>VSM score</i> for both search quality and usability:
* <ul>
* <li>Normalizing <i>V(d)</i> to the unit vector is known to be problematic in that
* it removes all document length information.
* For some documents removing this info is probably ok,
* e.g. a document made by duplicating a certain paragraph <i>10</i> times,
* especially if that paragraph is made of distinct terms.
* But for a document which contains no duplicated paragraphs,
* this might be wrong.
* To avoid this problem, a different document length normalization
* factor is used, which normalizes to a vector equal to or larger
* than the unit vector: <i>doc-len-norm(d)</i>.
* </li>
*
* <li>At indexing, users can specify that certain documents are more
* important than others, by assigning a document boost.
* For this, the score of each document is also multiplied by its boost value
* <i>doc-boost(d)</i>.
* </li>
*
* <li>Lucene is field based, hence each query term applies to a single
* field, document length normalization is by the length of the certain field,
* and in addition to document boost there are also document fields boosts.
* </li>
*
* <li>The same field can be added to a document during indexing several times,
* and so the boost of that field is the multiplication of the boosts of
* the separate additions (or parts) of that field within the document.
* </li>
*
* <li>At search time users can specify boosts to each query, sub-query, and
* each query term, hence the contribution of a query term to the score of
* a document is multiplied by the boost of that query term <i>query-boost(q)</i>.
* </li>
*
* <li>A document may match a multi term query without containing all
* the terms of that query (this is correct for some of the queries),
* and users can further reward documents matching more query terms
* through a coordination factor, which is usually larger when
* more terms are matched: <i>coord-factor(q,d)</i>.
* </li>
* </ul>
*
* <p>Under the simplifying assumption of a single field in the index,
* we get <i>Lucene's Conceptual scoring formula</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <font color="#FF9933">coord-factor(q,d)</font> &middot; &nbsp;
* <font color="#CCCC00">query-boost(q)</font> &middot; &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small><font color="#993399">V(q)&nbsp;&middot;&nbsp;V(d)</font></small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small><font color="#FF33CC">|V(q)|</font></small></td></tr>
* </table>
* </td>
* <td valign="middle" align="right" rowspan="1">
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-len-norm(d)</font>
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-boost(d)</font>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
 * <center><font size="-1"><u>Lucene Conceptual Scoring Formula</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
* <p>The conceptual formula is a simplification in the sense that (1) terms and documents
* are fielded and (2) boosts are usually per query term rather than per query.
*
* <p>We now describe how Lucene implements this conceptual scoring formula, and
* derive from it <i>Lucene's Practical Scoring Function</i>.
*
* <p>For efficient score computation some scoring components
* are computed and aggregated in advance:
*
* <ul>
* <li><i>Query-boost</i> for the query (actually for each query term)
* is known when search starts.
* </li>
*
* <li>Query Euclidean norm <i>|V(q)|</i> can be computed when search starts,
* as it is independent of the document being scored.
* From search optimization perspective, it is a valid question
* why bother to normalize the query at all, because all
* scored documents will be multiplied by the same <i>|V(q)|</i>,
* and hence documents ranks (their order by score) will not
* be affected by this normalization.
* There are two good reasons to keep this normalization:
* <ul>
* <li>Recall that
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
 * Cosine Similarity</a> can be used to find how similar
* two documents are. One can use Lucene for e.g.
* clustering, and use a document as a query to compute
* its similarity to other documents.
* In this use case it is important that the score of document <i>d3</i>
* for query <i>d1</i> is comparable to the score of document <i>d3</i>
* for query <i>d2</i>. In other words, scores of a document for two
* distinct queries should be comparable.
* There are other applications that may require this.
* And this is exactly what normalizing the query vector <i>V(q)</i>
* provides: comparability (to a certain extent) of two or more queries.
* </li>
*
* <li>Applying query normalization on the scores helps to keep the
* scores around the unit vector, hence preventing loss of score data
* because of floating point precision limitations.
* </li>
* </ul>
* </li>
*
* <li>Document length norm <i>doc-len-norm(d)</i> and document
* boost <i>doc-boost(d)</i> are known at indexing time.
* They are computed in advance and their multiplication
* is saved as a single value in the index: <i>norm(d)</i>.
* (In the equations below, <i>norm(t in d)</i> means <i>norm(field(t) in doc d)</i>
* where <i>field(t)</i> is the field associated with term <i>t</i>.)
* </li>
* </ul>
*
* <p><i>Lucene's Practical Scoring Function</i> is derived from the above.
* The color codes demonstrate how it relates
* to those of the <i>conceptual</i> formula:
*
* <P>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="" cellspacing="2" border="2" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <A HREF="#formula_coord"><font color="#FF9933">coord(q,d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_queryNorm"><font color="#FF33CC">queryNorm(q)</font></A> &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_tf"><font color="#993399">tf(t in d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_idf"><font color="#993399">idf(t)</font></A><sup>2</sup> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost"><font color="#CCCC00">t.getBoost()</font></A>&nbsp;&middot;&nbsp;
* <A HREF="#formula_norm"><font color="#3399FF">norm(t,d)</font></A>
* <big><big>)</big></big>
* </td>
* </tr>
 * <tr valign="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
 * <center><font size="-1"><u>Lucene Practical Scoring Function</u></font></center>
* </td></tr>
* </table>
*
* <p> where
* <ol>
* <li>
* <A NAME="formula_tf"></A>
* <b><i>tf(t in d)</i></b>
* correlates to the term's <i>frequency</i>,
* defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
* Documents that have more occurrences of a given term receive a higher score.
 * Note that <i>tf(t in q)</i> is assumed to be <i>1</i> and therefore it does not appear in this equation.
 * However, if a query contains the same term twice, there will be
 * two term-queries with that term, and hence the computation would still be correct (although
 * not very efficient).
* The default computation for <i>tf(t in d)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} &nbsp; = &nbsp;
* </td>
* <td valign="top" align="center" rowspan="1">
* frequency<sup><big>&frac12;</big></sup>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_idf"></A>
* <b><i>idf(t)</i></b> stands for Inverse Document Frequency. This value
* correlates to the inverse of <i>docFreq</i>
* (the number of documents in which the term <i>t</i> appears).
* This means rarer terms give higher contribution to the total score.
* <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
* hence it is squared in the equation.
* The default computation for <i>idf(t)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right">
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}&nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* 1 + log <big>(</big>
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>numDocs</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>docFreq+1</small></td></tr>
* </table>
* </td>
* <td valign="middle" align="center">
* <big>)</big>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_coord"></A>
* <b><i>coord(q,d)</i></b>
* is a score factor based on how many of the query terms are found in the specified document.
* Typically, a document that contains more of the query's terms will receive a higher score
* than another document with fewer query terms.
* This is a search time factor computed in
* {@link SimilarityProvider#coord(int, int) coord(q,d)}
* by the SimilarityProvider in effect at search time.
* <br>&nbsp;<br>
* </li>
*
* <li><b>
* <A NAME="formula_queryNorm"></A>
* <i>queryNorm(q)</i>
* </b>
* is a normalizing factor used to make scores between queries comparable.
* This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
* but rather just attempts to make scores from different queries (or even different indexes) comparable.
* This is a search time factor computed by the Similarity in effect at search time.
*
* The default computation in
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
* produces a <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norm</a>:
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* queryNorm(q) &nbsp; = &nbsp;
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
* &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center" rowspan="1">
* <table>
* <tr><td align="center"><big>1</big></td></tr>
* <tr><td align="center"><big>
* &ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;
* </big></td></tr>
* <tr><td align="center">sumOfSquaredWeights<sup><big>&frac12;</big></sup></td></tr>
* </table>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* The sum of squared weights (of the query terms) is
* computed by the query {@link org.apache.lucene.search.Weight} object.
* For example, a {@link org.apache.lucene.search.BooleanQuery}
* computes this value as:
*
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights} &nbsp; = &nbsp;
* {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_idf">idf(t)</A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost">t.getBoost()</A>
* <big><big>) <sup>2</sup> </big></big>
* </td>
* </tr>
 * <tr valign="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* </li>
*
* <li>
* <A NAME="formula_termBoost"></A>
* <b><i>t.getBoost()</i></b>
* is a search time boost of term <i>t</i> in the query <i>q</i> as
* specified in the query text
* (see <A HREF="../../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
* or as set by application calls to
* {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
 * Notice that there is really no direct API for accessing the boost of one term in a multi-term query;
 * rather, multiple terms are represented in a query as multiple
 * {@link org.apache.lucene.search.TermQuery TermQuery} objects,
 * and so the boost of a term in the query is accessible by calling the sub-query's
 * {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_norm"></A>
* <b><i>norm(t,d)</i></b> encapsulates a few (indexing time) boost and length factors:
*
* <ul>
* <li><b>Document boost</b> - set by calling
* {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
* before adding the document to the index.
* </li>
* <li><b>Field boost</b> - set by calling
* {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
* before adding the field to a document.
* </li>
 * <li><b>lengthNorm</b> - computed
 * when the document is added to the index, based on the number of tokens
 * of this field in the document, so that shorter fields contribute more to the score.
 * LengthNorm is computed by the Similarity class in effect at indexing time.
* </li>
* </ul>
* The {@link #computeNorm} method is responsible for
* combining all of these factors into a single float.
*
* <p>
* When a document is added to the index, all the above factors are multiplied.
* If the document has multiple fields with the same name, all their boosts are multiplied together:
*
* <br>&nbsp;<br>
 * <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* norm(t,d) &nbsp; = &nbsp;
* {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
* &nbsp;&middot;&nbsp;
* lengthNorm
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&prod;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
* </td>
* </tr>
 *      <tr valign="top">
* <td></td>
* <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
* <td></td>
* </tr>
* </table>
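 *
 *          As a rough sketch (the <code>fields</code> variable stands for all instances of field
 *          <i>t</i> in <i>d</i>; it is illustrative, not an actual API):
 *          <pre>
 *            float norm = doc.getBoost() * lengthNorm;
 *            for (Fieldable f : fields)
 *              norm *= f.getBoost();
 *          </pre>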
* <br>&nbsp;<br>
 * However, the resulting <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
* before being stored.
* At search time, the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
* For instance, <i>decode(encode(0.89)) = 0.75</i>.
* <br>&nbsp;<br>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for
* all documents - are maintained in memory.
* <br>&nbsp;<br>
 * The rationale for such lossy compression of norm values is that,
 * given the difficulty (and inaccuracy) with which users express their true information
 * need in a query, only large differences matter.
* <br>&nbsp;<br>
 * Lastly, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
* using a different {@link Similarity} for search.
* <br>&nbsp;<br>
* </li>
* </ol>
*
* @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)
* @see IndexSearcher#setSimilarityProvider(SimilarityProvider)
*/
public abstract class TFIDFSimilarity extends Similarity {
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* <p>The default implementation calls {@link #tf(float)}.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public float tf(int freq) {
return tf((float)freq);
}
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
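   *
   * <p>For example, a sub-linear implementation (a sketch in the spirit of the default) might be:
   * <pre>
   *   return (float)Math.sqrt(freq);
   * </pre>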
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public abstract float tf(float freq);
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre>
* idf(docFreq, searcher.maxDoc());
* </pre>
*
   * Note that {@link IndexSearcher#maxDoc()} is used instead of
   * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because
   * {@link IndexSearcher#docFreq(Term)} is also used, and when the latter
   * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
   * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute.
*
* @param stats statistics of the term in question
* @param searcher the document collection being searched
   * @return an Explanation object that includes both an idf score factor
   *         and an explanation for the term.
* @throws IOException
*/
public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
final int df = stats.docFreq();
final int max = searcher.maxDoc();
final float idf = idf(df, max);
return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
/**
* Computes a score factor for a phrase.
*
* <p>
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param stats statistics of the terms in the phrase
* @param searcher the document collection being searched
   * @return an Explanation object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
* @throws IOException
*/
public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
final int max = searcher.maxDoc();
float idf = 0.0f;
final Explanation exp = new Explanation();
exp.setDescription("idf(), sum of:");
for (final TermContext stat : stats ) {
final int df = stat.docFreq();
final float termIdf = idf(df, max);
exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
idf += termIdf;
}
exp.setValue(idf);
return exp;
}
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* {@link #tf(int)} factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* <p>Terms that occur in fewer documents are better indicators of topic, so
* implementations of this method usually return larger values for rare terms,
* and smaller values for common terms.
*
* @param docFreq the number of documents which contain the term
* @param numDocs the total number of documents in the collection
* @return a score factor based on the term's document frequency
*/
public abstract float idf(int docFreq, int numDocs);
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++)
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
/** Decodes a normalization factor stored in an index.
* @see #encodeNormValue(float)
*/
public float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
/** Encodes a normalization factor for storage in an index.
*
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
* the zero-exponent point at 15, thus
* representing values from around 7x10^9 to 2x10^-9 with about one
* significant decimal digit of accuracy. Zero is also represented.
* Negative numbers are rounded up to zero. Values too large to represent
* are rounded down to the largest representable value. Positive values too
* small to represent are rounded up to the smallest positive representable
* value.
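   * <p>A minimal round trip illustrating the precision loss described above:
   * <pre>
   *   byte b = encodeNormValue(0.89f);    // stored in the index as a single byte
   *   float back = decodeNormValue(b);    // yields roughly 0.75f, not 0.89f
   * </pre>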
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
public byte encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
@Override
public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost,
TermContext... termContexts) throws IOException {
final Explanation idf = termContexts.length == 1
? idfExplain(termContexts[0], searcher)
: idfExplain(termContexts, searcher);
return new IDFStats(idf, queryBoost);
}
@Override
public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
return new ExactTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName));
}
@Override
public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
return new SloppyTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName));
}
// TODO: we can specialize these for omitNorms up front, but we should test that it doesn't confuse stupid hotspot.
private final class ExactTFIDFDocScorer extends ExactDocScorer {
private final IDFStats stats;
private final float weightValue;
private final byte[] norms;
private static final int SCORE_CACHE_SIZE = 32;
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
ExactTFIDFDocScorer(IDFStats stats, byte norms[]) {
this.stats = stats;
this.weightValue = stats.value;
this.norms = norms;
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
scoreCache[i] = tf(i) * weightValue;
}
@Override
public float score(int doc, int freq) {
final float raw = // compute tf(f)*weight
freq < SCORE_CACHE_SIZE // check cache
? scoreCache[freq] // cache hit
: tf(freq)*weightValue; // cache miss
return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field
}
@Override
public Explanation explain(int doc, Explanation freq) {
return explainScore(doc, freq, stats, norms);
}
}
private final class SloppyTFIDFDocScorer extends SloppyDocScorer {
private final IDFStats stats;
private final float weightValue;
private final byte[] norms;
SloppyTFIDFDocScorer(IDFStats stats, byte norms[]) {
this.stats = stats;
this.weightValue = stats.value;
this.norms = norms;
}
@Override
public float score(int doc, float freq) {
final float raw = tf(freq) * weightValue; // compute tf(f)*weight
return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field
}
@Override
public Explanation explain(int doc, Explanation freq) {
return explainScore(doc, freq, stats, norms);
}
}
/** Collection statistics for the TF-IDF model. The only statistic of interest
* to this model is idf. */
private static class IDFStats extends Stats {
/** The idf and its explanation */
private final Explanation idf;
private float queryNorm;
private float queryWeight;
private final float queryBoost;
private float value;
public IDFStats(Explanation idf, float queryBoost) {
// TODO: Validate?
this.idf = idf;
this.queryBoost = queryBoost;
this.queryWeight = idf.getValue() * queryBoost; // compute query weight
}
@Override
public float getValueForNormalization() {
// TODO: (sorta LUCENE-1907) make non-static class and expose this squaring via a nice method to subclasses?
return queryWeight * queryWeight; // sum of squared weights
}
@Override
public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm * topLevelBoost;
queryWeight *= this.queryNorm; // normalize query weight
value = queryWeight * idf.getValue(); // idf for document
}
}
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, byte[] norms) {
Explanation result = new Explanation();
result.setDescription("score(doc="+doc+",freq="+freq+"), product of:");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight, product of:");
Explanation boostExpl = new Explanation(stats.queryBoost, "boost");
if (stats.queryBoost != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(stats.idf);
Explanation queryNormExpl = new Explanation(stats.queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
stats.idf.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
Explanation fieldExpl = new Explanation();
fieldExpl.setDescription("fieldWeight in "+doc+
", product of:");
Explanation tfExplanation = new Explanation();
tfExplanation.setValue(tf(freq.getValue()));
tfExplanation.setDescription("tf(freq="+freq.getValue()+"), with freq of:");
tfExplanation.addDetail(freq);
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(stats.idf);
Explanation fieldNormExpl = new Explanation();
float fieldNorm =
norms!=null ? decodeNormValue(norms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setValue(tfExplanation.getValue() *
stats.idf.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result;
}
}

View File

@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.ReaderUtil;
abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.RewriteMethod { abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.RewriteMethod {
@ -43,7 +43,7 @@ abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.Rew
addClause(topLevel, term, docCount, boost, null); addClause(topLevel, term, docCount, boost, null);
} }
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, PerReaderTermState states) throws IOException; protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, TermContext states) throws IOException;
protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {

View File

@ -27,9 +27,9 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
@ -39,28 +39,19 @@ import org.apache.lucene.util.ToStringUtils;
public class TermQuery extends Query { public class TermQuery extends Query {
private final Term term; private final Term term;
private int docFreq; private int docFreq;
private transient PerReaderTermState perReaderTermState; private transient TermContext perReaderTermState;
private class TermWeight extends Weight { private class TermWeight extends Weight {
private final Similarity similarity; private final Similarity similarity;
private float value; private final Similarity.Stats stats;
private final float idf; private transient TermContext termStates;
private float queryNorm;
private float queryWeight;
private final IDFExplanation idfExp;
private transient PerReaderTermState termStates;
public TermWeight(IndexSearcher searcher, PerReaderTermState termStates, int docFreq) public TermWeight(IndexSearcher searcher, TermContext termStates)
throws IOException { throws IOException {
assert termStates != null : "PerReaderTermState must not be null"; assert termStates != null : "TermContext must not be null";
this.termStates = termStates; this.termStates = termStates;
this.similarity = searcher.getSimilarityProvider().get(term.field()); this.similarity = searcher.getSimilarityProvider().get(term.field());
if (docFreq != -1) { this.stats = similarity.computeStats(searcher, term.field(), getBoost(), termStates);
idfExp = similarity.idfExplain(term, searcher, docFreq);
} else {
idfExp = similarity.idfExplain(term, searcher);
}
idf = idfExp.getIdf();
} }
@Override @Override
@ -70,19 +61,13 @@ public class TermQuery extends Query {
public Query getQuery() { return TermQuery.this; } public Query getQuery() { return TermQuery.this; }
@Override @Override
public float getValue() { return value; } public float getValueForNormalization() {
return stats.getValueForNormalization();
@Override
public float sumOfSquaredWeights() {
queryWeight = idf * getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; stats.normalize(queryNorm, topLevelBoost);
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
} }
@Override @Override
@ -97,7 +82,7 @@ public class TermQuery extends Query {
} }
final DocsEnum docs = reader.termDocsEnum(reader.getLiveDocs(), field, term.bytes(), state); final DocsEnum docs = reader.termDocsEnum(reader.getLiveDocs(), field, term.bytes(), state);
assert docs != null; assert docs != null;
return new TermScorer(this, docs, similarity, context.reader.norms(field)); return new TermScorer(this, docs, similarity.exactDocScorer(stats, field, context));
} }
private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException { private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException {
@ -107,79 +92,25 @@ public class TermQuery extends Query {
} }
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
throws IOException { IndexReader reader = context.reader;
final IndexReader reader = context.reader;
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation expl = new Explanation(idf, idfExp.explain());
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getBoost(), "boost");
if (getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(expl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
expl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
String field = term.field();
ComplexExplanation fieldExpl = new ComplexExplanation();
fieldExpl.setDescription("fieldWeight("+term+" in "+doc+
"), product of:");
Explanation tfExplanation = new Explanation();
int tf = 0;
DocsEnum docs = reader.termDocsEnum(context.reader.getLiveDocs(), term.field(), term.bytes()); DocsEnum docs = reader.termDocsEnum(context.reader.getLiveDocs(), term.field(), term.bytes());
if (docs != null) { if (docs != null) {
int newDoc = docs.advance(doc); int newDoc = docs.advance(doc);
if (newDoc == doc) { if (newDoc == doc) {
tf = docs.freq(); int freq = docs.freq();
} ExactDocScorer docScorer = similarity.exactDocScorer(stats, term.field(), context);
tfExplanation.setValue(similarity.tf(tf)); ComplexExplanation result = new ComplexExplanation();
tfExplanation.setDescription("tf(termFreq("+term+")="+tf+")"); result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
} else { Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "termFreq=" + freq));
tfExplanation.setValue(0.0f); result.addDetail(scoreExplanation);
tfExplanation.setDescription("no matching term"); result.setValue(scoreExplanation.getValue());
result.setMatch(true);
return result;
}
} }
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(expl);
Explanation fieldNormExpl = new Explanation(); return new ComplexExplanation(false, 0.0f, "no matching term");
final byte[] fieldNorms = reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch()));
fieldExpl.setValue(tfExplanation.getValue() *
expl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
result.setMatch(fieldExpl.getMatch());
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result;
} }
} }
@ -200,7 +131,7 @@ public class TermQuery extends Query {
/** Expert: constructs a TermQuery that will use the /** Expert: constructs a TermQuery that will use the
* provided docFreq instead of looking up the docFreq * provided docFreq instead of looking up the docFreq
* against the searcher. */ * against the searcher. */
public TermQuery(Term t, PerReaderTermState states) { public TermQuery(Term t, TermContext states) {
assert states != null; assert states != null;
term = t; term = t;
docFreq = states.docFreq(); docFreq = states.docFreq();
@ -213,20 +144,20 @@ public class TermQuery extends Query {
@Override @Override
public Weight createWeight(IndexSearcher searcher) throws IOException { public Weight createWeight(IndexSearcher searcher) throws IOException {
final ReaderContext context = searcher.getTopReaderContext(); final ReaderContext context = searcher.getTopReaderContext();
final int weightDocFreq; final TermContext termState;
final PerReaderTermState termState;
if (perReaderTermState == null || perReaderTermState.topReaderContext != context) { if (perReaderTermState == null || perReaderTermState.topReaderContext != context) {
// make TermQuery single-pass if we don't have a PRTS or if the context differs! // make TermQuery single-pass if we don't have a PRTS or if the context differs!
termState = PerReaderTermState.build(context, term, true); // cache term lookups! termState = TermContext.build(context, term, true); // cache term lookups!
// we must not ignore the given docFreq - if set use the given value
weightDocFreq = docFreq == -1 ? termState.docFreq() : docFreq;
} else { } else {
// PRTS was pre-build for this IS // PRTS was pre-build for this IS
termState = this.perReaderTermState; termState = this.perReaderTermState;
weightDocFreq = docFreq;
} }
return new TermWeight(searcher, termState, weightDocFreq); // we must not ignore the given docFreq - if set use the given value (lie)
if (docFreq != -1)
termState.setDocFreq(docFreq);
return new TermWeight(searcher, termState);
} }
@Override @Override

View File

@ -25,20 +25,16 @@ import org.apache.lucene.index.DocsEnum;
*/ */
final class TermScorer extends Scorer { final class TermScorer extends Scorer {
private DocsEnum docsEnum; private DocsEnum docsEnum;
private byte[] norms;
private float weightValue;
private int doc = -1; private int doc = -1;
private int freq; private int freq;
private int pointer; private int pointer;
private int pointerMax; private int pointerMax;
private static final int SCORE_CACHE_SIZE = 32;
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
private int[] docs; private int[] docs;
private int[] freqs; private int[] freqs;
private final DocsEnum.BulkReadResult bulkResult; private final DocsEnum.BulkReadResult bulkResult;
private final Similarity similarity; private final Similarity.ExactDocScorer docScorer;
/** /**
* Construct a <code>TermScorer</code>. * Construct a <code>TermScorer</code>.
@ -47,22 +43,15 @@ final class TermScorer extends Scorer {
* The weight of the <code>Term</code> in the query. * The weight of the <code>Term</code> in the query.
* @param td * @param td
* An iterator over the documents matching the <code>Term</code>. * An iterator over the documents matching the <code>Term</code>.
* @param similarity * @param docScorer
* The </code>Similarity</code> implementation to be used for score * The </code>Similarity.ExactDocScorer</code> implementation
* computations. * to be used for score computations.
* @param norms
* The field norms of the document fields for the <code>Term</code>.
*/ */
TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) { TermScorer(Weight weight, DocsEnum td, Similarity.ExactDocScorer docScorer) throws IOException {
super(weight); super(weight);
this.similarity = similarity; this.docScorer = docScorer;
this.docsEnum = td; this.docsEnum = td;
this.norms = norms;
this.weightValue = weight.getValue();
bulkResult = td.getBulkResult(); bulkResult = td.getBulkResult();
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
scoreCache[i] = similarity.tf(i) * weightValue;
} }
@Override @Override
@ -134,12 +123,7 @@ final class TermScorer extends Scorer {
@Override @Override
public float score() { public float score() {
assert doc != NO_MORE_DOCS; assert doc != NO_MORE_DOCS;
float raw = // compute tf(f)*weight return docScorer.score(doc, freq);
freq < SCORE_CACHE_SIZE // check cache
? scoreCache[freq] // cache hit
: similarity.tf(freq)*weightValue; // cache miss
return norms == null ? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize for field
} }
/** /**

View File

@ -29,7 +29,7 @@ import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
/** /**
* Base rewrite method for collecting only the top terms * Base rewrite method for collecting only the top terms
@ -80,7 +80,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
this.termComp = termsEnum.getComparator(); this.termComp = termsEnum.getComparator();
// lazy init the initial ScoreTerm because comparator is not known on ctor: // lazy init the initial ScoreTerm because comparator is not known on ctor:
if (st == null) if (st == null)
st = new ScoreTerm(this.termComp, new PerReaderTermState(topReaderContext)); st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
} }
@ -101,14 +101,14 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
if (t != null) { if (t != null) {
// if the term is already in the PQ, only update docFreq of term in PQ // if the term is already in the PQ, only update docFreq of term in PQ
assert t.boost == boost : "boost should be equal in all segment TermsEnums"; assert t.boost == boost : "boost should be equal in all segment TermsEnums";
t.termState.register(state, readerContext.ord, termsEnum.docFreq()); t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} else { } else {
// add new entry in PQ, we must clone the term, else it may get overwritten! // add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copy(bytes); st.bytes.copy(bytes);
st.boost = boost; st.boost = boost;
visitedTerms.put(st.bytes, st); visitedTerms.put(st.bytes, st);
assert st.termState.docFreq() == 0; assert st.termState.docFreq() == 0;
st.termState.register(state, readerContext.ord, termsEnum.docFreq()); st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
stQueue.offer(st); stQueue.offer(st);
// possibly drop entries from queue // possibly drop entries from queue
if (stQueue.size() > maxSize) { if (stQueue.size() > maxSize) {
@ -116,7 +116,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
visitedTerms.remove(st.bytes); visitedTerms.remove(st.bytes);
st.termState.clear(); // reset the termstate! st.termState.clear(); // reset the termstate!
} else { } else {
st = new ScoreTerm(termComp, new PerReaderTermState(topReaderContext)); st = new ScoreTerm(termComp, new TermContext(topReaderContext));
} }
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize"; assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
@ -171,8 +171,8 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
public final Comparator<BytesRef> termComp; public final Comparator<BytesRef> termComp;
public final BytesRef bytes = new BytesRef(); public final BytesRef bytes = new BytesRef();
public float boost; public float boost;
public final PerReaderTermState termState; public final TermContext termState;
public ScoreTerm(Comparator<BytesRef> termComp, PerReaderTermState termState) { public ScoreTerm(Comparator<BytesRef> termComp, TermContext termState) {
this.termComp = termComp; this.termComp = termComp;
this.termState = termState; this.termState = termState;
} }

View File

@ -41,11 +41,11 @@ import org.apache.lucene.index.IndexReader.ReaderContext;
* <ol> * <ol>
* <li>A <code>Weight</code> is constructed by a top-level query, given a * <li>A <code>Weight</code> is constructed by a top-level query, given a
* <code>IndexSearcher</code> ({@link Query#createWeight(IndexSearcher)}). * <code>IndexSearcher</code> ({@link Query#createWeight(IndexSearcher)}).
* <li>The {@link #sumOfSquaredWeights()} method is called on the * <li>The {@link #getValueForNormalization()} method is called on the
* <code>Weight</code> to compute the query normalization factor * <code>Weight</code> to compute the query normalization factor
* {@link SimilarityProvider#queryNorm(float)} of the query clauses contained in the * {@link SimilarityProvider#queryNorm(float)} of the query clauses contained in the
* query. * query.
* <li>The query normalization factor is passed to {@link #normalize(float)}. At * <li>The query normalization factor is passed to {@link #normalize(float, float)}. At
* this point the weighting is complete. * this point the weighting is complete.
* <li>A <code>Scorer</code> is constructed by * <li>A <code>Scorer</code> is constructed by
* {@link #scorer(IndexReader.AtomicReaderContext, ScorerContext)}. * {@link #scorer(IndexReader.AtomicReaderContext, ScorerContext)}.
@ -68,11 +68,11 @@ public abstract class Weight {
/** The query that this concerns. */ /** The query that this concerns. */
public abstract Query getQuery(); public abstract Query getQuery();
/** The weight for this query. */ /** The value for normalization of contained query clauses (e.g. sum of squared weights). */
public abstract float getValue(); public abstract float getValueForNormalization() throws IOException;
/** Assigns the query normalization factor to this. */ /** Assigns the query normalization factor and boost from parent queries to this. */
public abstract void normalize(float norm); public abstract void normalize(float norm, float topLevelBoost);
/** /**
* Returns a {@link Scorer} which scores documents in/out-of order according * Returns a {@link Scorer} which scores documents in/out-of order according
@ -94,9 +94,6 @@ public abstract class Weight {
*/ */
public abstract Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException; public abstract Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException;
/** The sum of squared weights of contained query clauses. */
public abstract float sumOfSquaredWeights() throws IOException;
/** /**
* Returns true iff this implementation scores docs only out of order. This * Returns true iff this implementation scores docs only out of order. This
* method is used in conjunction with {@link Collector}'s * method is used in conjunction with {@link Collector}'s

View File

@ -18,11 +18,13 @@ package org.apache.lucene.search.payloads;
*/ */
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight; import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.spans.NearSpansOrdered; import org.apache.lucene.search.spans.NearSpansOrdered;
import org.apache.lucene.search.spans.NearSpansUnordered; import org.apache.lucene.search.spans.NearSpansUnordered;
import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNearQuery;
@ -145,7 +147,35 @@ public class PayloadNearQuery extends SpanNearQuery {
@Override @Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new PayloadNearSpanScorer(query.getSpans(context), this, return new PayloadNearSpanScorer(query.getSpans(context), this,
similarity, context.reader.norms(query.getField())); similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
}
@Override
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
PayloadNearSpanScorer scorer = (PayloadNearSpanScorer) scorer(context, ScorerContext.def());
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context);
Explanation expl = new Explanation();
expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
expl.addDetail(scoreExplanation);
expl.setValue(scoreExplanation.getValue());
// now the payloads part
Explanation payloadExpl = function.explain(doc, scorer.payloadsSeen, scorer.payloadScore);
// combined
ComplexExplanation result = new ComplexExplanation();
result.addDetail(expl);
result.addDetail(payloadExpl);
result.setValue(expl.getValue() * payloadExpl.getValue());
result.setDescription("PayloadNearQuery, product of:");
return result;
}
}
return new ComplexExplanation(false, 0.0f, "no matching term");
} }
} }
@ -155,8 +185,8 @@ public class PayloadNearQuery extends SpanNearQuery {
private int payloadsSeen; private int payloadsSeen;
protected PayloadNearSpanScorer(Spans spans, Weight weight, protected PayloadNearSpanScorer(Spans spans, Weight weight,
Similarity similarity, byte[] norms) throws IOException { Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, norms); super(spans, weight, similarity, docScorer);
this.spans = spans; this.spans = spans;
} }
@ -225,20 +255,6 @@ public class PayloadNearQuery extends SpanNearQuery {
return super.score() return super.score()
* function.docScore(doc, fieldName, payloadsSeen, payloadScore); * function.docScore(doc, fieldName, payloadsSeen, payloadScore);
} }
@Override
protected Explanation explain(int doc) throws IOException {
Explanation result = new Explanation();
// Add detail about tf/idf...
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
// Add detail about payload
Explanation payloadExpl = function.explain(doc, payloadsSeen, payloadScore);
result.addDetail(payloadExpl);
result.setValue(nonPayloadExpl.getValue() * payloadExpl.getValue());
result.setDescription("PayloadNearQuery, product of:");
return result;
}
} }
} }

View File

@ -26,6 +26,9 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ComplexExplanation; import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.search.payloads.PayloadNearQuery.PayloadNearSpanScorer;
import org.apache.lucene.search.spans.TermSpans; import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.search.spans.SpanWeight;
@ -76,7 +79,7 @@ public class PayloadTermQuery extends SpanTermQuery {
@Override @Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new PayloadTermSpanScorer((TermSpans) query.getSpans(context), return new PayloadTermSpanScorer((TermSpans) query.getSpans(context),
this, similarity, context.reader.norms(query.getField())); this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
} }
protected class PayloadTermSpanScorer extends SpanScorer { protected class PayloadTermSpanScorer extends SpanScorer {
@ -86,8 +89,8 @@ public class PayloadTermQuery extends SpanTermQuery {
private final TermSpans termSpans; private final TermSpans termSpans;
public PayloadTermSpanScorer(TermSpans spans, Weight weight, public PayloadTermSpanScorer(TermSpans spans, Weight weight,
Similarity similarity, byte[] norms) throws IOException { Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, norms); super(spans, weight, similarity, docScorer);
termSpans = spans; termSpans = spans;
} }
@ -173,29 +176,40 @@ public class PayloadTermQuery extends SpanTermQuery {
protected float getPayloadScore() { protected float getPayloadScore() {
return function.docScore(doc, term.field(), payloadsSeen, payloadScore); return function.docScore(doc, term.field(), payloadsSeen, payloadScore);
} }
}
@Override @Override
protected Explanation explain(final int doc) throws IOException { public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
ComplexExplanation result = new ComplexExplanation(); PayloadTermSpanScorer scorer = (PayloadTermSpanScorer) scorer(context, ScorerContext.def());
Explanation nonPayloadExpl = super.explain(doc); if (scorer != null) {
result.addDetail(nonPayloadExpl); int newDoc = scorer.advance(doc);
// QUESTION: Is there a way to avoid this skipTo call? We need to know if (newDoc == doc) {
// whether to load the payload or not float freq = scorer.freq();
Explanation payloadBoost = new Explanation(); SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context);
result.addDetail(payloadBoost); Explanation expl = new Explanation();
expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
float payloadScore = getPayloadScore(); Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
payloadBoost.setValue(payloadScore); expl.addDetail(scoreExplanation);
// GSI: I suppose we could toString the payload, but I don't think that expl.setValue(scoreExplanation.getValue());
// would be a good idea // now the payloads part
payloadBoost.setDescription("scorePayload(...)"); // QUESTION: Is there a way to avoid this skipTo call? We need to know
result.setValue(nonPayloadExpl.getValue() * payloadScore); // whether to load the payload or not
result.setDescription("btq, product of:"); // GSI: I suppose we could toString the payload, but I don't think that
result.setMatch(nonPayloadExpl.getValue() == 0 ? Boolean.FALSE // would be a good idea
: Boolean.TRUE); // LUCENE-1303 Explanation payloadExpl = new Explanation(scorer.getPayloadScore(), "scorePayload(...)");
return result; payloadExpl.setValue(scorer.getPayloadScore());
// combined
ComplexExplanation result = new ComplexExplanation();
result.addDetail(expl);
result.addDetail(payloadExpl);
result.setValue(expl.getValue() * payloadExpl.getValue());
result.setDescription("btq, product of:");
result.setMatch(expl.getValue() == 0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303
return result;
}
} }
return new ComplexExplanation(false, 0.0f, "no matching term");
} }
} }

View File

@ -27,7 +27,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopTermsRewrite; import org.apache.lucene.search.TopTermsRewrite;
import org.apache.lucene.search.ScoringRewrite; import org.apache.lucene.search.ScoringRewrite;
import org.apache.lucene.search.BooleanClause.Occur; // javadocs only import org.apache.lucene.search.BooleanClause.Occur; // javadocs only
import org.apache.lucene.util.PerReaderTermState; import org.apache.lucene.util.TermContext;
/** /**
* Wraps any {@link MultiTermQuery} as a {@link SpanQuery}, * Wraps any {@link MultiTermQuery} as a {@link SpanQuery},
@ -155,7 +155,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
} }
@Override @Override
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) { protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, TermContext states) {
final SpanTermQuery q = new SpanTermQuery(term); final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost); q.setBoost(boost);
topLevel.addClause(q); topLevel.addClause(q);
@ -204,7 +204,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
} }
@Override @Override
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) { protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, TermContext states) {
final SpanTermQuery q = new SpanTermQuery(term); final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost); q.setBoost(boost);
topLevel.addClause(q); topLevel.addClause(q);

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search.spans;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.Weight; import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Similarity;
@ -29,22 +30,21 @@ import org.apache.lucene.search.Similarity;
*/ */
public class SpanScorer extends Scorer { public class SpanScorer extends Scorer {
protected Spans spans; protected Spans spans;
protected byte[] norms;
protected float value;
protected boolean more = true; protected boolean more = true;
protected int doc; protected int doc;
protected float freq; protected float freq;
protected final Similarity similarity; protected final Similarity similarity;
protected final Similarity.SloppyDocScorer docScorer;
protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) protected SpanScorer(Spans spans, Weight weight, Similarity similarity, Similarity.SloppyDocScorer docScorer)
throws IOException { throws IOException {
super(weight); super(weight);
this.similarity = similarity; this.similarity = similarity;
this.docScorer = docScorer;
this.spans = spans; this.spans = spans;
this.norms = norms;
this.value = weight.getValue();
if (this.spans.next()) { if (this.spans.next()) {
doc = -1; doc = -1;
} else { } else {
@ -94,27 +94,11 @@ public class SpanScorer extends Scorer {
@Override @Override
public float score() throws IOException { public float score() throws IOException {
float raw = similarity.tf(freq) * value; // raw score return docScorer.score(doc, freq);
return norms == null? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize
} }
@Override @Override
public float freq() throws IOException { public float freq() throws IOException {
return freq; return freq;
} }
/** This method is no longer an official member of {@link Scorer},
* but it is needed by SpanWeight to build an explanation. */
protected Explanation explain(final int doc) throws IOException {
Explanation tfExplanation = new Explanation();
int expDoc = advance(doc);
float phraseFreq = (expDoc == doc) ? freq : 0.0f;
tfExplanation.setValue(similarity.tf(phraseFreq));
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
return tfExplanation;
}
} }

View File

@ -18,125 +18,76 @@ package org.apache.lucene.search.spans;
*/ */
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.util.TermContext;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.TreeSet;
/** /**
* Expert-only. Public for use by other weight implementations * Expert-only. Public for use by other weight implementations
*/ */
public class SpanWeight extends Weight { public class SpanWeight extends Weight {
protected Similarity similarity; protected Similarity similarity;
protected float value;
protected float idf;
protected float queryNorm;
protected float queryWeight;
protected Set<Term> terms; protected Set<Term> terms;
protected SpanQuery query; protected SpanQuery query;
private IDFExplanation idfExp; protected Similarity.Stats stats;
public SpanWeight(SpanQuery query, IndexSearcher searcher) public SpanWeight(SpanQuery query, IndexSearcher searcher)
throws IOException { throws IOException {
this.similarity = searcher.getSimilarityProvider().get(query.getField()); this.similarity = searcher.getSimilarityProvider().get(query.getField());
this.query = query; this.query = query;
terms=new HashSet<Term>(); terms=new TreeSet<Term>();
query.extractTerms(terms); query.extractTerms(terms);
final ReaderContext context = searcher.getTopReaderContext();
idfExp = similarity.idfExplain(terms, searcher); final TermContext states[] = new TermContext[terms.size()];
idf = idfExp.getIdf(); int i = 0;
for (Term term : terms)
states[i++] = TermContext.build(context, term, true);
stats = similarity.computeStats(searcher, query.getField(), query.getBoost(), states);
} }
@Override @Override
public Query getQuery() { return query; } public Query getQuery() { return query; }
@Override @Override
public float getValue() { return value; } public float getValueForNormalization() throws IOException {
return stats.getValueForNormalization();
@Override
public float sumOfSquaredWeights() throws IOException {
queryWeight = idf * query.getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
} }
@Override @Override
public void normalize(float queryNorm) { public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm; stats.normalize(queryNorm, topLevelBoost);
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
} }
@Override @Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new SpanScorer(query.getSpans(context), this, similarity, context.reader return new SpanScorer(query.getSpans(context), this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
.norms(query.getField()));
} }
@Override @Override
public Explanation explain(AtomicReaderContext context, int doc) public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
throws IOException { Scorer scorer = scorer(context, ScorerContext.def());
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context);
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
result.addDetail(scoreExplanation);
result.setValue(scoreExplanation.getValue());
result.setMatch(true);
return result;
}
}
ComplexExplanation result = new ComplexExplanation(); return new ComplexExplanation(false, 0.0f, "no matching term");
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
String field = ((SpanQuery)getQuery()).getField();
Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getQuery().getBoost(), "boost");
if (getQuery().getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(idfExpl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
idfExpl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
ComplexExplanation fieldExpl = new ComplexExplanation();
fieldExpl.setDescription("fieldWeight("+field+":"+query.toString(field)+
" in "+doc+"), product of:");
Explanation tfExpl = ((SpanScorer)scorer(context, ScorerContext.def())).explain(doc);
fieldExpl.addDetail(tfExpl);
fieldExpl.addDetail(idfExpl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = context.reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch()));
fieldExpl.setValue(tfExpl.getValue() *
idfExpl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
result.setMatch(fieldExpl.getMatch());
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result;
} }
} }

View File

@ -60,7 +60,7 @@ public abstract class CompoundFileDirectory extends Directory {
* NOTE: subclasses must call {@link #initForRead(Map)} before the directory can be used. * NOTE: subclasses must call {@link #initForRead(Map)} before the directory can be used.
*/ */
public CompoundFileDirectory(Directory directory, String fileName, IOContext context) throws IOException { public CompoundFileDirectory(Directory directory, String fileName, IOContext context) throws IOException {
assert !(directory instanceof CompoundFileDirectory) : "compound file inside of compound file: " + fileName;
this.directory = directory; this.directory = directory;
this.fileName = fileName; this.fileName = fileName;
this.readBufferSize = BufferedIndexInput.bufferSize(context); this.readBufferSize = BufferedIndexInput.bufferSize(context);
@ -75,9 +75,11 @@ public abstract class CompoundFileDirectory extends Directory {
} }
protected final void initForWrite() { protected final void initForWrite() {
assert !(directory instanceof CompoundFileDirectory) : "compound file inside of compound file: " + fileName;
this.entries = SENTINEL; this.entries = SENTINEL;
this.openForWrite = true; this.openForWrite = true;
this.isOpen = true; this.isOpen = true;
writer = new CompoundFileWriter(directory, fileName);
} }
/** Helper method that reads CFS entries from an input stream */ /** Helper method that reads CFS entries from an input stream */
@ -173,7 +175,11 @@ public abstract class CompoundFileDirectory extends Directory {
@Override @Override
public synchronized void close() throws IOException { public synchronized void close() throws IOException {
ensureOpen(); if (!isOpen) {
// allow double close - usually to be consistent with other closeables
assert entries == null;
return; // already closed
}
entries = null; entries = null;
isOpen = false; isOpen = false;
if (writer != null) { if (writer != null) {
@ -263,7 +269,6 @@ public abstract class CompoundFileDirectory extends Directory {
@Override @Override
public IndexOutput createOutput(String name, IOContext context) throws IOException { public IndexOutput createOutput(String name, IOContext context) throws IOException {
ensureOpen(); ensureOpen();
initWriter();
return writer.createOutput(name, context); return writer.createOutput(name, context);
} }
@ -279,12 +284,13 @@ public abstract class CompoundFileDirectory extends Directory {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
/** Not implemented
* @throws UnsupportedOperationException */
@Override @Override
public final CompoundFileDirectory openCompoundInput(String name, IOContext context) throws IOException { public CompoundFileDirectory openCompoundInput(String name, IOContext context) throws IOException {
// NOTE: final to make nested compounding impossible. FileEntry fileEntry = this.entries.get(IndexFileNames.stripSegmentName(name));
throw new UnsupportedOperationException(); if (fileEntry == null) {
throw new FileNotFoundException("file " + name + " does not exists in this CFS");
}
return new NestedCompoundFileDirectory(name, context, fileEntry.offset, fileEntry.length);
} }
/** Not implemented /** Not implemented
@ -292,16 +298,36 @@ public abstract class CompoundFileDirectory extends Directory {
@Override @Override
public CompoundFileDirectory createCompoundOutput(String name, IOContext context) public CompoundFileDirectory createCompoundOutput(String name, IOContext context)
throws IOException { throws IOException {
// NOTE: final to make nested compounding impossible. throw new UnsupportedOperationException("can not create nested CFS, create seperately and use Directory.copy instead");
throw new UnsupportedOperationException();
} }
private final void initWriter() { private class NestedCompoundFileDirectory extends CompoundFileDirectory {
assert openForWrite;
assert entries == SENTINEL; private final long cfsOffset;
if (writer == null) { private final long cfsLength;
writer = new CompoundFileWriter(directory, fileName);
public NestedCompoundFileDirectory(String fileName, IOContext context, long offset, long length)
throws IOException {
super(directory, fileName, context);
this.cfsOffset = offset;
this.cfsLength = length;
IndexInput input = null;
try {
input = CompoundFileDirectory.this.openInput(fileName, IOContext.READONCE);
initForRead(CompoundFileDirectory.readEntries(input,
CompoundFileDirectory.this, fileName));
} finally {
IOUtils.closeSafely(false, input);
}
} }
@Override
public IndexInput openInputSlice(String id, long offset, long length,
int readBufferSize) throws IOException {
assert offset + length <= cfsLength;
return CompoundFileDirectory.this.openInputSlice(id, cfsOffset + offset, length, readBufferSize);
}
} }
} }
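The net effect of the CompoundFileDirectory changes above is that close() is now idempotent and nested compound files can be read (but not created) through openCompoundInput. A minimal sketch of the new read path, assuming a Directory dir and an IOContext ctx, using only methods touched by this patch:

    CompoundFileDirectory cfs = dir.openCompoundInput("d.cfs", ctx);
    CompoundFileDirectory nested = cfs.openCompoundInput("b.cfs", ctx); // read-only view into the outer CFS
    IndexInput in = nested.openInput("b.xyz", ctx);
    int value = in.readInt();
    in.close();
    nested.close();
    cfs.close();
    cfs.close(); // second close is now a no-op instead of tripping ensureOpen()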

View File

@ -17,6 +17,7 @@ package org.apache.lucene.store;
* limitations under the License. * limitations under the License.
*/ */
import java.io.Closeable;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.util.Collection; import java.util.Collection;
@ -55,7 +56,7 @@ import org.apache.lucene.util.IOUtils;
* *
* @lucene.internal * @lucene.internal
*/ */
final class CompoundFileWriter { final class CompoundFileWriter implements Closeable{
private static final class FileEntry { private static final class FileEntry {
/** source file */ /** source file */
@ -89,8 +90,8 @@ final class CompoundFileWriter {
private boolean closed = false; private boolean closed = false;
private volatile IndexOutput dataOut; private volatile IndexOutput dataOut;
private final AtomicBoolean outputTaken = new AtomicBoolean(false); private final AtomicBoolean outputTaken = new AtomicBoolean(false);
private final String entryTableName; final String entryTableName;
private final String dataFileName; final String dataFileName;
/** /**
* Create the compound stream in the specified file. The file name is the * Create the compound stream in the specified file. The file name is the
@ -128,17 +129,14 @@ final class CompoundFileWriter {
* if close() had been called before or if no file has been added to * if close() had been called before or if no file has been added to
* this object * this object
*/ */
void close() throws IOException { public void close() throws IOException {
if (closed) { if (closed) {
throw new IllegalStateException("already closed"); throw new IllegalStateException("already closed");
} }
IOException priorException = null; IOException priorException = null;
IndexOutput entryTableOut = null; IndexOutput entryTableOut = null;
try { try {
if (entries.isEmpty()) { initDataOut(IOContext.DEFAULT);
throw new IllegalStateException("CFS has no entries");
}
if (!pendingEntries.isEmpty() || outputTaken.get()) { if (!pendingEntries.isEmpty() || outputTaken.get()) {
throw new IllegalStateException("CFS has pending open files"); throw new IllegalStateException("CFS has pending open files");
} }
@ -147,12 +145,18 @@ final class CompoundFileWriter {
assert dataOut != null; assert dataOut != null;
long finalLength = dataOut.getFilePointer(); long finalLength = dataOut.getFilePointer();
assert assertFileLength(finalLength, dataOut); assert assertFileLength(finalLength, dataOut);
} catch (IOException e) {
priorException = e;
} finally {
IOUtils.closeSafely(priorException, dataOut);
}
try {
entryTableOut = directory.createOutput(entryTableName, IOContext.DEFAULT); entryTableOut = directory.createOutput(entryTableName, IOContext.DEFAULT);
writeEntryTable(entries.values(), entryTableOut); writeEntryTable(entries.values(), entryTableOut);
} catch (IOException e) { } catch (IOException e) {
priorException = e; priorException = e;
} finally { } finally {
IOUtils.closeSafely(priorException, dataOut, entryTableOut); IOUtils.closeSafely(priorException, entryTableOut);
} }
} }
@ -321,6 +325,7 @@ final class CompoundFileWriter {
closed = true; closed = true;
entry.length = writtenBytes; entry.length = writtenBytes;
if (isSeparate) { if (isSeparate) {
delegate.close();
// we are a separate file - push into the pending entries // we are a separate file - push into the pending entries
pendingEntries.add(entry); pendingEntries.add(entry);
} else { } else {
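With the writer change above, close() unconditionally initializes the data output and writes the entry table, so a compound file with no entries is now legal (exercised by testEmptyCFS further down). A minimal sketch, assuming a Directory dir and an IOContext ctx:

    CompoundFileDirectory empty = dir.createCompoundOutput("e.cfs", ctx);
    empty.close();                          // writes an empty entry table instead of throwing
    CompoundFileDirectory check = dir.openCompoundInput("e.cfs", ctx);
    assert check.listAll().length == 0;     // no sub-files, but a valid CFS
    check.close();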

View File

@ -28,25 +28,27 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.TermsEnum.SeekStatus;
/** /**
* Maintains a {@link IndexReader} {@link TermState} view over * Maintains a {@link IndexReader} {@link TermState} view over
* {@link IndexReader} instances containing a single term. The * {@link IndexReader} instances containing a single term. The
* {@link PerReaderTermState} doesn't track if the given {@link TermState} * {@link TermContext} doesn't track if the given {@link TermState}
* objects are valid, neither if the {@link TermState} instances refer to the * objects are valid, neither if the {@link TermState} instances refer to the
* same terms in the associated readers. * same terms in the associated readers.
* *
* @lucene.experimental * @lucene.experimental
*/ */
public final class PerReaderTermState { public final class TermContext {
public final ReaderContext topReaderContext; // for asserting! public final ReaderContext topReaderContext; // for asserting!
private final TermState[] states; private final TermState[] states;
private int docFreq; private int docFreq;
private long totalTermFreq;
/** /**
* Creates an empty {@link PerReaderTermState} from a {@link ReaderContext} * Creates an empty {@link TermContext} from a {@link ReaderContext}
*/ */
public PerReaderTermState(ReaderContext context) { public TermContext(ReaderContext context) {
assert context != null && context.isTopLevel; assert context != null && context.isTopLevel;
topReaderContext = context; topReaderContext = context;
docFreq = 0; docFreq = 0;
@ -60,28 +62,28 @@ public final class PerReaderTermState {
} }
/** /**
* Creates a {@link PerReaderTermState} with an initial {@link TermState}, * Creates a {@link TermContext} with an initial {@link TermState},
* {@link IndexReader} pair. * {@link IndexReader} pair.
*/ */
public PerReaderTermState(ReaderContext context, TermState state, int ord, int docFreq) { public TermContext(ReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) {
this(context); this(context);
register(state, ord, docFreq); register(state, ord, docFreq, totalTermFreq);
} }
/** /**
* Creates a {@link PerReaderTermState} from a top-level {@link ReaderContext} and the * Creates a {@link TermContext} from a top-level {@link ReaderContext} and the
* given {@link Term}. This method will lookup the given term in all context's leaf readers * given {@link Term}. This method will lookup the given term in all context's leaf readers
* and register each of the readers containing the term in the returned {@link PerReaderTermState} * and register each of the readers containing the term in the returned {@link TermContext}
* using the leaf reader's ordinal. * using the leaf reader's ordinal.
* <p> * <p>
* Note: the given context must be a top-level context. * Note: the given context must be a top-level context.
*/ */
public static PerReaderTermState build(ReaderContext context, Term term, boolean cache) public static TermContext build(ReaderContext context, Term term, boolean cache)
throws IOException { throws IOException {
assert context != null && context.isTopLevel; assert context != null && context.isTopLevel;
final String field = term.field(); final String field = term.field();
final BytesRef bytes = term.bytes(); final BytesRef bytes = term.bytes();
final PerReaderTermState perReaderTermState = new PerReaderTermState(context); final TermContext perReaderTermState = new TermContext(context);
final AtomicReaderContext[] leaves = ReaderUtil.leaves(context); final AtomicReaderContext[] leaves = ReaderUtil.leaves(context);
for (int i = 0; i < leaves.length; i++) { for (int i = 0; i < leaves.length; i++) {
final Fields fields = leaves[i].reader.fields(); final Fields fields = leaves[i].reader.fields();
@ -91,7 +93,7 @@ public final class PerReaderTermState {
final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share! final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share!
if (termsEnum.seekExact(bytes, cache)) { if (termsEnum.seekExact(bytes, cache)) {
final TermState termState = termsEnum.termState(); final TermState termState = termsEnum.termState();
perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq()); perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} }
} }
} }
@ -100,7 +102,7 @@ public final class PerReaderTermState {
} }
/** /**
* Clears the {@link PerReaderTermState} internal state and removes all * Clears the {@link TermContext} internal state and removes all
* registered {@link TermState}s * registered {@link TermState}s
*/ */
public void clear() { public void clear() {
@ -112,12 +114,16 @@ public final class PerReaderTermState {
* Registers and associates a {@link TermState} with a leaf ordinal. The leaf ordinal * Registers and associates a {@link TermState} with a leaf ordinal. The leaf ordinal
* should be derived from a {@link ReaderContext}'s leaf ord. * should be derived from a {@link ReaderContext}'s leaf ord.
*/ */
public void register(TermState state, final int ord, final int docFreq) { public void register(TermState state, final int ord, final int docFreq, final long totalTermFreq) {
assert state != null : "state must not be null"; assert state != null : "state must not be null";
assert ord >= 0 && ord < states.length; assert ord >= 0 && ord < states.length;
assert states[ord] == null : "state for ord: " + ord assert states[ord] == null : "state for ord: " + ord
+ " already registered"; + " already registered";
this.docFreq += docFreq; this.docFreq += docFreq;
if (this.totalTermFreq >= 0 && totalTermFreq >= 0)
this.totalTermFreq += totalTermFreq;
else
this.totalTermFreq = -1;
states[ord] = state; states[ord] = state;
} }
@ -137,11 +143,27 @@ public final class PerReaderTermState {
/** /**
* Returns the accumulated document frequency of all {@link TermState} * Returns the accumulated document frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int)}. * instances passed to {@link #register(TermState, int, int, long)}.
* @return the accumulated document frequency of all {@link TermState} * @return the accumulated document frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int)}. * instances passed to {@link #register(TermState, int, int, long)}.
*/ */
public int docFreq() { public int docFreq() {
return docFreq; return docFreq;
} }
/**
* Returns the accumulated term frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int, long)}.
* @return the accumulated term frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int, long)}.
*/
public long totalTermFreq() {
return totalTermFreq;
}
/** expert: only available for queries that want to lie about docfreq
* @lucene.internal */
public void setDocFreq(int docFreq) {
this.docFreq = docFreq;
}
} }
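For orientation, a short sketch of how the renamed TermContext is typically used; the reader/term setup and the getTopReaderContext() accessor are assumptions, while build, docFreq and totalTermFreq are the methods defined above:

    ReaderContext top = reader.getTopReaderContext();         // assumed: top-level context of an open IndexReader
    TermContext states = TermContext.build(top, new Term("body", "lucene"), true); // true = cache TermStates
    int df = states.docFreq();          // document frequency accumulated over all leaves
    long ttf = states.totalTermFreq();  // total term frequency, or -1 if any leaf could not report it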

View File

@ -32,6 +32,7 @@ import org.apache.lucene.index.codecs.sep.IntIndexInput;
import org.apache.lucene.index.codecs.sep.IntIndexOutput; import org.apache.lucene.index.codecs.sep.IntIndexOutput;
import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl;
import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl;
import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput;
import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput;
import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
@ -46,7 +47,6 @@ import org.apache.lucene.index.codecs.BlockTermsReader;
import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexReaderBase; import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.store.*; import org.apache.lucene.store.*;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
@ -62,8 +62,8 @@ public class MockFixedIntBlockCodec extends Codec {
private final int blockSize; private final int blockSize;
public MockFixedIntBlockCodec(int blockSize) { public MockFixedIntBlockCodec(int blockSize) {
super("MockFixedIntBlock");
this.blockSize = blockSize; this.blockSize = blockSize;
name = "MockFixedIntBlock";
} }
@Override @Override
@ -207,7 +207,7 @@ public class MockFixedIntBlockCodec extends Codec {
SepPostingsReaderImpl.files(segmentInfo, codecId, files); SepPostingsReaderImpl.files(segmentInfo, codecId, files);
BlockTermsReader.files(dir, segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS());
} }
@Override @Override
@ -215,16 +215,16 @@ public class MockFixedIntBlockCodec extends Codec {
SepPostingsWriterImpl.getExtensions(extensions); SepPostingsWriterImpl.getExtensions(extensions);
BlockTermsReader.getExtensions(extensions); BlockTermsReader.getExtensions(extensions);
FixedGapTermsIndexReader.getIndexExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
} }
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context);
} }
} }
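The same mechanical change recurs in every test codec below: the codec name is passed to the Codec super constructor instead of being assigned to the name field, and doc values output now honors the per-codec CFS flag and sort comparator. A fragment illustrating the new constructor pattern for a hypothetical codec:

    public class MyCodec extends Codec {
      public MyCodec() {
        super("MyCodec");   // name is fixed at construction time; the name field is no longer assigned directly
      }
      // fieldsConsumer/fieldsProducer, files, getExtensions, docsConsumer, docsProducer omitted
    }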

View File

@ -32,6 +32,7 @@ import org.apache.lucene.index.codecs.sep.IntIndexInput;
import org.apache.lucene.index.codecs.sep.IntIndexOutput; import org.apache.lucene.index.codecs.sep.IntIndexOutput;
import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl;
import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl;
import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexInput;
import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexOutput; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexOutput;
import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
@ -46,7 +47,6 @@ import org.apache.lucene.index.codecs.BlockTermsReader;
import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexReaderBase; import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
@ -65,7 +65,7 @@ public class MockVariableIntBlockCodec extends Codec {
private final int baseBlockSize; private final int baseBlockSize;
public MockVariableIntBlockCodec(int baseBlockSize) { public MockVariableIntBlockCodec(int baseBlockSize) {
name = "MockVariableIntBlock"; super("MockVariableIntBlock");
this.baseBlockSize = baseBlockSize; this.baseBlockSize = baseBlockSize;
} }
@ -230,7 +230,7 @@ public class MockVariableIntBlockCodec extends Codec {
SepPostingsReaderImpl.files(segmentInfo, codecId, files); SepPostingsReaderImpl.files(segmentInfo, codecId, files);
BlockTermsReader.files(dir, segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS());
} }
@Override @Override
@ -238,16 +238,16 @@ public class MockVariableIntBlockCodec extends Codec {
SepPostingsWriterImpl.getExtensions(extensions); SepPostingsWriterImpl.getExtensions(extensions);
BlockTermsReader.getExtensions(extensions); BlockTermsReader.getExtensions(extensions);
FixedGapTermsIndexReader.getIndexExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
} }
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context);
} }
} }

View File

@ -78,7 +78,7 @@ public class MockRandomCodec extends Codec {
private final String SEED_EXT = "sd"; private final String SEED_EXT = "sd";
public MockRandomCodec(Random random) { public MockRandomCodec(Random random) {
name = "MockRandom"; super("MockRandom");
this.seedRandom = new Random(random.nextLong()); this.seedRandom = new Random(random.nextLong());
} }
@ -355,7 +355,7 @@ public class MockRandomCodec extends Codec {
BlockTermsReader.files(dir, segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
VariableGapTermsIndexReader.files(dir, segmentInfo, codecId, files); VariableGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS());
// hackish! // hackish!
Iterator<String> it = files.iterator(); Iterator<String> it = files.iterator();
while(it.hasNext()) { while(it.hasNext()) {
@ -373,7 +373,7 @@ public class MockRandomCodec extends Codec {
BlockTermsReader.getExtensions(extensions); BlockTermsReader.getExtensions(extensions);
FixedGapTermsIndexReader.getIndexExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions);
VariableGapTermsIndexReader.getIndexExtensions(extensions); VariableGapTermsIndexReader.getIndexExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
extensions.add(SEED_EXT); extensions.add(SEED_EXT);
//System.out.println("MockRandom.getExtensions return " + extensions); //System.out.println("MockRandom.getExtensions return " + extensions);
} }
@ -381,11 +381,11 @@ public class MockRandomCodec extends Codec {
// can we make this more evil? // can we make this more evil?
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context);
} }
} }

View File

@ -54,7 +54,7 @@ import org.apache.lucene.util.BytesRef;
public class MockSepCodec extends Codec { public class MockSepCodec extends Codec {
public MockSepCodec() { public MockSepCodec() {
name = "MockSep"; super("MockSep");
} }
@Override @Override
@ -139,13 +139,13 @@ public class MockSepCodec extends Codec {
SepPostingsReaderImpl.files(segmentInfo, codecId, files); SepPostingsReaderImpl.files(segmentInfo, codecId, files);
BlockTermsReader.files(dir, segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS());
} }
@Override @Override
public void getExtensions(Set<String> extensions) { public void getExtensions(Set<String> extensions) {
getSepExtensions(extensions); getSepExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
} }
public static void getSepExtensions(Set<String> extensions) { public static void getSepExtensions(Set<String> extensions) {
@ -156,11 +156,11 @@ public class MockSepCodec extends Codec {
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context);
} }
} }

View File

@ -37,7 +37,6 @@ public class PreFlexRWCodec extends PreFlexCodec {
public PreFlexRWCodec() { public PreFlexRWCodec() {
// NOTE: we impersonate the PreFlex codec so that it can // NOTE: we impersonate the PreFlex codec so that it can
// read the segments we write! // read the segments we write!
super();
} }
@Override @Override

View File

@ -62,12 +62,7 @@ public class AssertingIndexSearcher extends IndexSearcher {
} }
@Override @Override
public float getValue() { public void normalize(float norm, float topLevelBoost) {
return w.getValue();
}
@Override
public void normalize(float norm) {
throw new IllegalStateException("Weight already normalized."); throw new IllegalStateException("Weight already normalized.");
} }
@ -77,7 +72,7 @@ public class AssertingIndexSearcher extends IndexSearcher {
} }
@Override @Override
public float sumOfSquaredWeights() throws IOException { public float getValueForNormalization() throws IOException {
throw new IllegalStateException("Weight already normalized."); throw new IllegalStateException("Weight already normalized.");
} }
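The Weight changes here follow the new normalization API: getValue() is gone, sumOfSquaredWeights() becomes getValueForNormalization(), and normalize() now receives the outer boost separately. A rough sketch of the two-phase normalization as a caller would drive it; the square-root query norm is only a stand-in for whatever the SimilarityProvider computes, not part of this patch:

    float v = weight.getValueForNormalization();      // formerly sumOfSquaredWeights()
    float queryNorm = 1.0f / (float) Math.sqrt(v);    // assumed stand-in for the provider's query norm
    weight.normalize(queryNorm, 1.0f);                // topLevelBoost is 1.0f for a top-level query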

View File

@ -329,9 +329,10 @@ public class CheckHits {
Explanation detail[] = expl.getDetails(); Explanation detail[] = expl.getDetails();
if (detail!=null) { if (detail!=null) {
if (detail.length==1) { if (detail.length==1) {
// simple containment, no matter what the description says, // simple containment, unless it's a freq of: (which lets a query explain how the freq is calculated),
// just verify contained expl has same score // just verify contained expl has same score
verifyExplanation(q,doc,score,deep,detail[0]); if (!expl.getDescription().endsWith("with freq of:"))
verifyExplanation(q,doc,score,deep,detail[0]);
} else { } else {
// explanation must either: // explanation must either:
// - end with one of: "product of:", "sum of:", "max of:", or // - end with one of: "product of:", "sum of:", "max of:", or
@ -357,6 +358,7 @@ public class CheckHits {
} }
} }
} }
// TODO: this is a TERRIBLE assertion!!!!
Assert.assertTrue( Assert.assertTrue(
q+": multi valued explanation description=\""+descr q+": multi valued explanation description=\""+descr
+"\" must be 'max of plus x times others' or end with 'product of'" +"\" must be 'max of plus x times others' or end with 'product of'"

View File

@ -19,7 +19,6 @@ package org.apache.lucene.store;
import java.io.IOException; import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.Collections;
public class MockCompoundFileDirectoryWrapper extends CompoundFileDirectory { public class MockCompoundFileDirectoryWrapper extends CompoundFileDirectory {
private final MockDirectoryWrapper parent; private final MockDirectoryWrapper parent;
@ -31,11 +30,7 @@ public class MockCompoundFileDirectoryWrapper extends CompoundFileDirectory {
this.name = name; this.name = name;
this.parent = parent; this.parent = parent;
this.delegate = delegate; this.delegate = delegate;
if (forWrite) { // don't initialize here since we delegate everything - if not initialized a direct call will cause an assert to fail!
super.initForWrite();
} else {
super.initForRead(Collections.<String,FileEntry>emptyMap());
}
parent.addFileHandle(this, name, !forWrite); parent.addFileHandle(this, name, !forWrite);
} }
@ -51,12 +46,8 @@ public class MockCompoundFileDirectoryWrapper extends CompoundFileDirectory {
@Override @Override
public synchronized void close() throws IOException { public synchronized void close() throws IOException {
try { delegate.close();
delegate.close(); parent.removeOpenFile(this, name);
parent.removeOpenFile(this, name);
} finally {
super.close();
}
} }
@Override @Override
@ -148,4 +139,11 @@ public class MockCompoundFileDirectoryWrapper extends CompoundFileDirectory {
public CompoundFileDirectory createCompoundOutput(String name, IOContext context) throws IOException { public CompoundFileDirectory createCompoundOutput(String name, IOContext context) throws IOException {
return delegate.createCompoundOutput(name, context); return delegate.createCompoundOutput(name, context);
} }
@Override
public CompoundFileDirectory openCompoundInput(String name, IOContext context)
throws IOException {
return delegate.openCompoundInput(name, context);
}
} }

View File

@ -242,7 +242,7 @@ public abstract class LuceneTestCase extends Assert {
if (prior != null) { if (prior != null) {
cp.unregister(prior); cp.unregister(prior);
} }
cp.register(c); cp.register(randomizCodec(random, c));
} }
// returns current default codec // returns current default codec
@ -280,7 +280,7 @@ public abstract class LuceneTestCase extends Assert {
} }
swapCodec(new MockSepCodec(), cp); swapCodec(new MockSepCodec(), cp);
swapCodec(new PulsingCodec(codecHasParam && "Pulsing".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 20)), cp); swapCodec(new PulsingCodec(codecHasParam && "Pulsing".equals(codec) ? codecParam : 1 + random.nextInt(20)), cp);
swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000)), cp); swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000)), cp);
// baseBlockSize cannot be over 127: // baseBlockSize cannot be over 127:
swapCodec(new MockVariableIntBlockCodec(codecHasParam && "MockVariableIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 127)), cp); swapCodec(new MockVariableIntBlockCodec(codecHasParam && "MockVariableIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 127)), cp);
@ -289,6 +289,11 @@ public abstract class LuceneTestCase extends Assert {
return cp.lookup(codec); return cp.lookup(codec);
} }
public static Codec randomizCodec(Random random, Codec codec) {
codec.setDocValuesUseCFS(random.nextBoolean());
return codec;
}
// returns current PreFlex codec // returns current PreFlex codec
static void removeTestCodecs(Codec codec, CodecProvider cp) { static void removeTestCodecs(Codec codec, CodecProvider cp) {
if (codec.name.equals("PreFlex")) { if (codec.name.equals("PreFlex")) {
@ -1493,11 +1498,11 @@ public abstract class LuceneTestCase extends Assert {
RandomCodecProvider(Random random) { RandomCodecProvider(Random random) {
this.perFieldSeed = random.nextInt(); this.perFieldSeed = random.nextInt();
register(new StandardCodec()); register(randomizCodec(random, new StandardCodec()));
register(new PreFlexCodec()); register(randomizCodec(random, new PreFlexCodec()));
register(new PulsingCodec(1)); register(randomizCodec(random, new PulsingCodec( 1 + random.nextInt(20))));
register(new SimpleTextCodec()); register(randomizCodec(random, new SimpleTextCodec()));
register(new MemoryCodec()); register(randomizCodec(random, new MemoryCodec()));
Collections.shuffle(knownCodecs, random); Collections.shuffle(knownCodecs, random);
} }

View File

@ -24,8 +24,6 @@ import org.apache.lucene.document.*;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.*;
import org.apache.lucene.index.codecs.*; import org.apache.lucene.index.codecs.*;
import org.apache.lucene.index.codecs.standard.*;
import org.apache.lucene.index.codecs.pulsing.*;
import org.apache.lucene.store.*; import org.apache.lucene.store.*;
import java.util.*; import java.util.*;
import java.io.*; import java.io.*;
@ -75,7 +73,7 @@ public class TestExternalCodecs extends LuceneTestCase {
public static class RAMOnlyCodec extends Codec { public static class RAMOnlyCodec extends Codec {
public RAMOnlyCodec() { public RAMOnlyCodec() {
name = "RamOnly"; super("RamOnly");
} }
// Postings state: // Postings state:
static class RAMPostings extends FieldsProducer { static class RAMPostings extends FieldsProducer {

View File

@ -1161,7 +1161,7 @@ public class TestAddIndexes extends LuceneTestCase {
IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)); new MockAnalyzer(random));
CodecProvider provider = new CodecProvider(); CodecProvider provider = new CodecProvider();
provider.register(new PulsingCodec(1 + random.nextInt(10))); provider.register(new PulsingCodec(1 + random.nextInt(20)));
conf.setCodecProvider(provider); conf.setCodecProvider(provider);
IndexWriter w = new IndexWriter(dir, conf); IndexWriter w = new IndexWriter(dir, conf);
try { try {
@ -1182,7 +1182,7 @@ public class TestAddIndexes extends LuceneTestCase {
IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)); new MockAnalyzer(random));
CodecProvider provider = new CodecProvider(); CodecProvider provider = new CodecProvider();
provider.register(new PulsingCodec(1 + random.nextInt(10))); provider.register(new PulsingCodec(1 + random.nextInt(20)));
conf.setCodecProvider(provider); conf.setCodecProvider(provider);
IndexWriter w = new IndexWriter(dir, conf); IndexWriter w = new IndexWriter(dir, conf);
IndexReader indexReader = IndexReader.open(toAdd); IndexReader indexReader = IndexReader.open(toAdd);

View File

@ -38,7 +38,6 @@ import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
@ -375,7 +374,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
Term searchTerm = new Term("id", "6"); Term searchTerm = new Term("id", "6");
int delCount = reader.deleteDocuments(searchTerm); int delCount = reader.deleteDocuments(searchTerm);
assertEquals("wrong delete count", 1, delCount); assertEquals("wrong delete count", 1, delCount);
reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", searcher.getSimilarityProvider().get("content").encodeNormValue(2.0f)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", sim.encodeNormValue(2.0f));
reader.close(); reader.close();
searcher.close(); searcher.close();
@ -421,7 +421,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
Term searchTerm = new Term("id", "6"); Term searchTerm = new Term("id", "6");
int delCount = reader.deleteDocuments(searchTerm); int delCount = reader.deleteDocuments(searchTerm);
assertEquals("wrong delete count", 1, delCount); assertEquals("wrong delete count", 1, delCount);
reader.setNorm(22, "content", searcher.getSimilarityProvider().get("content").encodeNormValue(2.0f)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(22, "content", sim.encodeNormValue(2.0f));
reader.close(); reader.close();
// make sure they "took": // make sure they "took":
@ -483,7 +484,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
assertEquals("didn't delete the right number of documents", 1, delCount); assertEquals("didn't delete the right number of documents", 1, delCount);
// Set one norm so we get a .s0 file: // Set one norm so we get a .s0 file:
reader.setNorm(21, "content", conf.getSimilarityProvider().get("content").encodeNormValue(1.5f)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(21, "content", sim.encodeNormValue(1.5f));
reader.close(); reader.close();
} }
@ -526,7 +528,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
assertEquals("didn't delete the right number of documents", 1, delCount); assertEquals("didn't delete the right number of documents", 1, delCount);
// Set one norm so we get a .s0 file: // Set one norm so we get a .s0 file:
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.setNorm(21, "content", sim.encodeNormValue(1.5f));
reader.close(); reader.close();

View File

@ -21,10 +21,9 @@ import java.io.IOException;
import java.io.File; import java.io.File;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
@ -35,27 +34,8 @@ import org.apache.lucene.util._TestUtil;
public class TestCompoundFile extends LuceneTestCase public class TestCompoundFile extends LuceneTestCase
{ {
/** Main for running test case by itself. */
public static void main(String args[]) {
TestRunner.run (new TestSuite(TestCompoundFile.class));
// TestRunner.run (new TestCompoundFile("testSingleFile"));
// TestRunner.run (new TestCompoundFile("testTwoFiles"));
// TestRunner.run (new TestCompoundFile("testRandomFiles"));
// TestRunner.run (new TestCompoundFile("testClonedStreamsClosing"));
// TestRunner.run (new TestCompoundFile("testReadAfterClose"));
// TestRunner.run (new TestCompoundFile("testRandomAccess"));
// TestRunner.run (new TestCompoundFile("testRandomAccessClones"));
// TestRunner.run (new TestCompoundFile("testFileNotFound"));
// TestRunner.run (new TestCompoundFile("testReadPastEOF"));
// TestRunner.run (new TestCompoundFile("testIWCreate"));
}
private Directory dir; private Directory dir;
@Override @Override
public void setUp() throws Exception { public void setUp() throws Exception {
super.setUp(); super.setUp();
@ -323,13 +303,13 @@ public class TestCompoundFile extends LuceneTestCase
throws IOException throws IOException
{ {
// Setup the test file - we need more than 1024 bytes // Setup the test file - we need more than 1024 bytes
IndexOutput os = fsdir.createOutput(file, newIOContext(random)); IndexOutput os = fsdir.createOutput(file, IOContext.DEFAULT);
for(int i=0; i<2000; i++) { for(int i=0; i<2000; i++) {
os.writeByte((byte) i); os.writeByte((byte) i);
} }
os.close(); os.close();
IndexInput in = fsdir.openInput(file, newIOContext(random)); IndexInput in = fsdir.openInput(file, IOContext.DEFAULT);
// This read primes the buffer in IndexInput // This read primes the buffer in IndexInput
in.readByte(); in.readByte();
@ -718,4 +698,73 @@ public class TestCompoundFile extends LuceneTestCase
newDir.close(); newDir.close();
} }
public void testEmptyCFS() throws IOException {
Directory newDir = newDirectory();
CompoundFileDirectory csw = newDir.createCompoundOutput("d.cfs", newIOContext(random));
csw.close();
CompoundFileDirectory csr = newDir.openCompoundInput("d.cfs", newIOContext(random));
assertEquals(0, csr.listAll().length);
csr.close();
newDir.close();
}
public void testReadNestedCFP() throws IOException {
Directory newDir = newDirectory();
CompoundFileDirectory csw = newDir.createCompoundOutput("d.cfs", newIOContext(random));
CompoundFileDirectory nested = newDir.createCompoundOutput("b.cfs", newIOContext(random));
IndexOutput out = nested.createOutput("b.xyz", newIOContext(random));
IndexOutput out1 = nested.createOutput("b_1.xyz", newIOContext(random));
out.writeInt(0);
out1.writeInt(1);
out.close();
out1.close();
nested.close();
newDir.copy(csw, "b.cfs", "b.cfs", newIOContext(random));
newDir.copy(csw, "b.cfe", "b.cfe", newIOContext(random));
newDir.deleteFile("b.cfs");
newDir.deleteFile("b.cfe");
csw.close();
assertEquals(2, newDir.listAll().length);
csw = newDir.openCompoundInput("d.cfs", newIOContext(random));
assertEquals(2, csw.listAll().length);
nested = csw.openCompoundInput("b.cfs", newIOContext(random));
assertEquals(2, nested.listAll().length);
IndexInput openInput = nested.openInput("b.xyz", newIOContext(random));
assertEquals(0, openInput.readInt());
openInput.close();
openInput = nested.openInput("b_1.xyz", newIOContext(random));
assertEquals(1, openInput.readInt());
openInput.close();
nested.close();
csw.close();
newDir.close();
}
public void testDoubleClose() throws IOException {
Directory newDir = newDirectory();
CompoundFileDirectory csw = newDir.createCompoundOutput("d.cfs", newIOContext(random));
IndexOutput out = csw.createOutput("d.xyz", newIOContext(random));
out.writeInt(0);
out.close();
csw.close();
// close a second time - must have no effect according to Closeable
csw.close();
csw = newDir.openCompoundInput("d.cfs", newIOContext(random));
IndexInput openInput = csw.openInput("d.xyz", newIOContext(random));
assertEquals(0, openInput.readInt());
openInput.close();
csw.close();
// close a second time - must have no effect according to Closeable
csw.close();
newDir.close();
}
} }

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
@ -655,7 +656,8 @@ public class TestDeletionPolicy extends LuceneTestCase {
writer.close(); writer.close();
IndexReader reader = IndexReader.open(dir, policy, false); IndexReader reader = IndexReader.open(dir, policy, false);
reader.deleteDocument(3*i+1); reader.deleteDocument(3*i+1);
reader.setNorm(4*i+1, "content", conf.getSimilarityProvider().get("content").encodeNormValue(2.0F)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(4*i+1, "content", sim.encodeNormValue(2.0F));
IndexSearcher searcher = newSearcher(reader); IndexSearcher searcher = newSearcher(reader);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(16*(1+i), hits.length); assertEquals(16*(1+i), hits.length);
@ -781,7 +783,8 @@ public class TestDeletionPolicy extends LuceneTestCase {
writer.close(); writer.close();
IndexReader reader = IndexReader.open(dir, policy, false); IndexReader reader = IndexReader.open(dir, policy, false);
reader.deleteDocument(3); reader.deleteDocument(3);
reader.setNorm(5, "content", conf.getSimilarityProvider().get("content").encodeNormValue(2.0F)); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(5, "content", sim.encodeNormValue(2.0F));
IndexSearcher searcher = newSearcher(reader); IndexSearcher searcher = newSearcher(reader);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(16, hits.length); assertEquals(16, hits.length);

View File

@ -105,8 +105,9 @@ public class TestDocTermOrds extends LuceneTestCase {
} }
private static class StandardCodecWithOrds extends Codec { private static class StandardCodecWithOrds extends Codec {
public StandardCodecWithOrds() { public StandardCodecWithOrds() {
name = "StandardOrds"; super("StandardOrds");
} }
@Override @Override
@ -200,13 +201,13 @@ public class TestDocTermOrds extends LuceneTestCase {
StandardPostingsReader.files(dir, segmentInfo, id, files); StandardPostingsReader.files(dir, segmentInfo, id, files);
BlockTermsReader.files(dir, segmentInfo, id, files); BlockTermsReader.files(dir, segmentInfo, id, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, id, files); FixedGapTermsIndexReader.files(dir, segmentInfo, id, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
} }
@Override @Override
public void getExtensions(Set<String> extensions) { public void getExtensions(Set<String> extensions) {
getStandardExtensions(extensions); getStandardExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions); DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
} }
public static void getStandardExtensions(Set<String> extensions) { public static void getStandardExtensions(Set<String> extensions) {
@ -218,12 +219,12 @@ public class TestDocTermOrds extends LuceneTestCase {
@Override @Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
} }
@Override @Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException { public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context);
} }
} }

View File

@ -71,7 +71,7 @@ public class TestIndexFileDeleter extends LuceneTestCase {
Term searchTerm = new Term("id", "7"); Term searchTerm = new Term("id", "7");
int delCount = reader.deleteDocuments(searchTerm); int delCount = reader.deleteDocuments(searchTerm);
assertEquals("didn't delete the right number of documents", 1, delCount); assertEquals("didn't delete the right number of documents", 1, delCount);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
// Set one norm so we get a .s0 file: // Set one norm so we get a .s0 file:
reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.setNorm(21, "content", sim.encodeNormValue(1.5f));
reader.close(); reader.close();

View File

@ -421,7 +421,7 @@ public class TestIndexReader extends LuceneTestCase
// expected // expected
} }
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
try { try {
reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f));
fail("setNorm after close failed to throw IOException"); fail("setNorm after close failed to throw IOException");
@ -462,7 +462,7 @@ public class TestIndexReader extends LuceneTestCase
// expected // expected
} }
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
try { try {
reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f));
fail("setNorm should have hit LockObtainFailedException"); fail("setNorm should have hit LockObtainFailedException");
@ -494,7 +494,7 @@ public class TestIndexReader extends LuceneTestCase
// now open reader & set norm for doc 0 // now open reader & set norm for doc 0
IndexReader reader = IndexReader.open(dir, false); IndexReader reader = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(0, "content", sim.encodeNormValue(2.0f)); reader.setNorm(0, "content", sim.encodeNormValue(2.0f));
// we should be holding the write lock now: // we should be holding the write lock now:
@ -539,7 +539,7 @@ public class TestIndexReader extends LuceneTestCase
addDoc(writer, searchTerm.text()); addDoc(writer, searchTerm.text());
writer.close(); writer.close();
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
// now open reader & set norm for doc 0 (writes to // now open reader & set norm for doc 0 (writes to
// _0_1.s0) // _0_1.s0)
reader = IndexReader.open(dir, false); reader = IndexReader.open(dir, false);
@ -738,7 +738,7 @@ public class TestIndexReader extends LuceneTestCase
} }
reader = IndexReader.open(dir, false); reader = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
try { try {
reader.setNorm(1, "content", sim.encodeNormValue(2.0f)); reader.setNorm(1, "content", sim.encodeNormValue(2.0f));
fail("did not hit exception when calling setNorm on an invalid doc number"); fail("did not hit exception when calling setNorm on an invalid doc number");

View File

@ -273,7 +273,7 @@ public class TestIndexReaderClone extends LuceneTestCase {
* @throws Exception * @throws Exception
*/ */
private void performDefaultTests(IndexReader r1) throws Exception { private void performDefaultTests(IndexReader r1) throws Exception {
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
float norm1 = sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]); float norm1 = sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]);
IndexReader pr1Clone = (IndexReader) r1.clone(); IndexReader pr1Clone = (IndexReader) r1.clone();
@ -329,7 +329,7 @@ public class TestIndexReaderClone extends LuceneTestCase {
TestIndexReaderReopen.createIndex(random, dir1, false); TestIndexReaderReopen.createIndex(random, dir1, false);
SegmentReader origSegmentReader = getOnlySegmentReader(IndexReader.open(dir1, false)); SegmentReader origSegmentReader = getOnlySegmentReader(IndexReader.open(dir1, false));
origSegmentReader.deleteDocument(1); origSegmentReader.deleteDocument(1);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
origSegmentReader.setNorm(4, "field1", sim.encodeNormValue(0.5f)); origSegmentReader.setNorm(4, "field1", sim.encodeNormValue(0.5f));
SegmentReader clonedSegmentReader = (SegmentReader) origSegmentReader SegmentReader clonedSegmentReader = (SegmentReader) origSegmentReader
@ -429,7 +429,7 @@ public class TestIndexReaderClone extends LuceneTestCase {
final Directory dir1 = newDirectory(); final Directory dir1 = newDirectory();
TestIndexReaderReopen.createIndex(random, dir1, false); TestIndexReaderReopen.createIndex(random, dir1, false);
IndexReader orig = IndexReader.open(dir1, false); IndexReader orig = IndexReader.open(dir1, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
orig.setNorm(1, "field1", sim.encodeNormValue(17.0f)); orig.setNorm(1, "field1", sim.encodeNormValue(17.0f));
final byte encoded = sim.encodeNormValue(17.0f); final byte encoded = sim.encodeNormValue(17.0f);
assertEquals(encoded, MultiNorms.norms(orig, "field1")[1]); assertEquals(encoded, MultiNorms.norms(orig, "field1")[1]);

View File

@ -47,9 +47,9 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
public Similarity get(String field) { public Similarity get(String field) {
return new DefaultSimilarity() { return new DefaultSimilarity() {
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
// disable length norm // disable length norm
return state.getBoost(); return encodeNormValue(state.getBoost());
} }
}; };
} }
@ -217,7 +217,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
IndexReader reader4C = (IndexReader) reader3C.clone(); IndexReader reader4C = (IndexReader) reader3C.clone();
SegmentReader segmentReader4C = getOnlySegmentReader(reader4C); SegmentReader segmentReader4C = getOnlySegmentReader(reader4C);
assertEquals(4, reader3CCNorm.bytesRef().get()); assertEquals(4, reader3CCNorm.bytesRef().get());
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader4C.setNorm(5, "field1", sim.encodeNormValue(0.33f)); reader4C.setNorm(5, "field1", sim.encodeNormValue(0.33f));
// generate a cannot update exception in reader1 // generate a cannot update exception in reader1
@ -278,7 +278,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
// System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); // System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm);
modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(i, Float.valueOf(newNorm));
modifiedNorms.set(k, Float.valueOf(origNorm)); modifiedNorms.set(k, Float.valueOf(origNorm));
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
ir.setNorm(i, "f" + 1, sim.encodeNormValue(newNorm)); ir.setNorm(i, "f" + 1, sim.encodeNormValue(newNorm));
ir.setNorm(k, "f" + 1, sim.encodeNormValue(origNorm)); ir.setNorm(k, "f" + 1, sim.encodeNormValue(origNorm));
// System.out.println("setNorm i: "+i); // System.out.println("setNorm i: "+i);
@ -300,7 +300,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
assertEquals("number of norms mismatches", numDocNorms, b.length); assertEquals("number of norms mismatches", numDocNorms, b.length);
ArrayList<Float> storedNorms = (i == 1 ? modifiedNorms : norms); ArrayList<Float> storedNorms = (i == 1 ? modifiedNorms : norms);
for (int j = 0; j < b.length; j++) { for (int j = 0; j < b.length; j++) {
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
float norm = sim.decodeNormValue(b[j]); float norm = sim.decodeNormValue(b[j]);
float norm1 = storedNorms.get(j).floatValue(); float norm1 = storedNorms.get(j).floatValue();
assertEquals("stored norm value of " + field + " for doc " + j + " is " assertEquals("stored norm value of " + field + " for doc " + j + " is "
@ -340,7 +340,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase {
// return unique norm values that are unchanged by encoding/decoding // return unique norm values that are unchanged by encoding/decoding
private float nextNorm(String fname) { private float nextNorm(String fname) {
float norm = lastNorm + normDelta; float norm = lastNorm + normDelta;
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
do { do {
float norm1 = sim.decodeNormValue( float norm1 = sim.decodeNormValue(
sim.encodeNormValue(norm)); sim.encodeNormValue(norm));

View File

@ -131,7 +131,7 @@ public class TestIndexReaderOnDiskFull extends LuceneTestCase {
dir.setMaxSizeInBytes(thisDiskFree); dir.setMaxSizeInBytes(thisDiskFree);
dir.setRandomIOExceptionRate(rate); dir.setRandomIOExceptionRate(rate);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
try { try {
if (0 == x) { if (0 == x) {
int docId = 12; int docId = 12;

View File

@ -606,7 +606,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
IndexReader reader2 = reader1.reopen(); IndexReader reader2 = reader1.reopen();
modifier = IndexReader.open(dir1, false); modifier = IndexReader.open(dir1, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
modifier.setNorm(1, "field1", sim.encodeNormValue(50f)); modifier.setNorm(1, "field1", sim.encodeNormValue(50f));
modifier.setNorm(1, "field2", sim.encodeNormValue(50f)); modifier.setNorm(1, "field2", sim.encodeNormValue(50f));
modifier.close(); modifier.close();
@ -702,7 +702,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
protected void modifyIndex(int i) throws IOException { protected void modifyIndex(int i) throws IOException {
if (i % 3 == 0) { if (i % 3 == 0) {
IndexReader modifier = IndexReader.open(dir, false); IndexReader modifier = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
modifier.setNorm(i, "field1", sim.encodeNormValue(50f)); modifier.setNorm(i, "field1", sim.encodeNormValue(50f));
modifier.close(); modifier.close();
} else if (i % 3 == 1) { } else if (i % 3 == 1) {
@ -983,7 +983,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
} }
case 1: { case 1: {
IndexReader reader = IndexReader.open(dir, false); IndexReader reader = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(4, "field1", sim.encodeNormValue(123f)); reader.setNorm(4, "field1", sim.encodeNormValue(123f));
reader.setNorm(44, "field2", sim.encodeNormValue(222f)); reader.setNorm(44, "field2", sim.encodeNormValue(222f));
reader.setNorm(44, "field4", sim.encodeNormValue(22f)); reader.setNorm(44, "field4", sim.encodeNormValue(22f));
@@ -1007,7 +1007,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
} }
case 4: { case 4: {
IndexReader reader = IndexReader.open(dir, false); IndexReader reader = IndexReader.open(dir, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
reader.setNorm(5, "field1", sim.encodeNormValue(123f)); reader.setNorm(5, "field1", sim.encodeNormValue(123f));
reader.setNorm(55, "field2", sim.encodeNormValue(222f)); reader.setNorm(55, "field2", sim.encodeNormValue(222f));
reader.close(); reader.close();

View File

@@ -116,8 +116,8 @@ public class TestMaxTermFrequency extends LuceneTestCase {
} }
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return (float) state.getMaxTermFrequency(); return encodeNormValue((float) state.getMaxTermFrequency());
} }
} }
} }
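This hunk shows the migration pattern that recurs in the test changes below: computeNorm now returns the already-encoded byte instead of a raw float, so a custom Similarity wraps its value in encodeNormValue. A sketch of the pattern in isolation (the subclass name is made up; the signatures mirror the hunks in this commit):

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.DefaultSimilarity;

public class BoostOnlyNormSimilarity extends DefaultSimilarity {
  @Override
  public byte computeNorm(FieldInvertState state) {
    // before: public float computeNorm(...) { return state.getBoost(); }
    // after: encode the float yourself and return the byte
    return encodeNormValue(state.getBoost());
  }
}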

View File

@@ -46,9 +46,9 @@ public class TestNorms extends LuceneTestCase {
public Similarity get(String field) { public Similarity get(String field) {
return new DefaultSimilarity() { return new DefaultSimilarity() {
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
// disable length norm // disable length norm
return state.getBoost(); return encodeNormValue(state.getBoost());
} }
}; };
} }
@@ -177,7 +177,7 @@ public class TestNorms extends LuceneTestCase {
//System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); //System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm);
modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(i, Float.valueOf(newNorm));
modifiedNorms.set(k, Float.valueOf(origNorm)); modifiedNorms.set(k, Float.valueOf(origNorm));
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
ir.setNorm(i, "f"+1, sim.encodeNormValue(newNorm)); ir.setNorm(i, "f"+1, sim.encodeNormValue(newNorm));
ir.setNorm(k, "f"+1, sim.encodeNormValue(origNorm)); ir.setNorm(k, "f"+1, sim.encodeNormValue(origNorm));
} }
@@ -192,8 +192,9 @@ public class TestNorms extends LuceneTestCase {
byte b[] = MultiNorms.norms(ir, field); byte b[] = MultiNorms.norms(ir, field);
assertEquals("number of norms mismatches",numDocNorms,b.length); assertEquals("number of norms mismatches",numDocNorms,b.length);
ArrayList<Float> storedNorms = (i==1 ? modifiedNorms : norms); ArrayList<Float> storedNorms = (i==1 ? modifiedNorms : norms);
DefaultSimilarity sim = (DefaultSimilarity) similarityProviderOne.get(field);
for (int j = 0; j < b.length; j++) { for (int j = 0; j < b.length; j++) {
float norm = similarityProviderOne.get(field).decodeNormValue(b[j]); float norm = sim.decodeNormValue(b[j]);
float norm1 = storedNorms.get(j).floatValue(); float norm1 = storedNorms.get(j).floatValue();
assertEquals("stored norm value of "+field+" for doc "+j+" is "+norm+" - a mismatch!", norm, norm1, 0.000001); assertEquals("stored norm value of "+field+" for doc "+j+" is "+norm+" - a mismatch!", norm, norm1, 0.000001);
} }
@@ -229,7 +230,7 @@ public class TestNorms extends LuceneTestCase {
// return unique norm values that are unchanged by encoding/decoding // return unique norm values that are unchanged by encoding/decoding
private float nextNorm(String fname) { private float nextNorm(String fname) {
float norm = lastNorm + normDelta; float norm = lastNorm + normDelta;
Similarity similarity = similarityProviderOne.get(fname); DefaultSimilarity similarity = (DefaultSimilarity) similarityProviderOne.get(fname);
do { do {
float norm1 = similarity.decodeNormValue(similarity.encodeNormValue(norm)); float norm1 = similarity.decodeNormValue(similarity.encodeNormValue(norm));
if (norm1 > lastNorm) { if (norm1 > lastNorm) {
@@ -259,8 +260,8 @@ public class TestNorms extends LuceneTestCase {
} }
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
return (float) state.getLength(); return encodeNormValue((float) state.getLength());
} }
} }

View File

@@ -18,9 +18,9 @@ package org.apache.lucene.index;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
@@ -30,7 +30,6 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.search.Explanation.IDFExplanation;
public class TestOmitTf extends LuceneTestCase { public class TestOmitTf extends LuceneTestCase {
@@ -39,23 +38,14 @@ public class TestOmitTf extends LuceneTestCase {
public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
public float coord(int overlap, int maxOverlap) { return 1.0f; } public float coord(int overlap, int maxOverlap) { return 1.0f; }
public Similarity get(String field) { public Similarity get(String field) {
return new Similarity() { return new TFIDFSimilarity() {
@Override public float computeNorm(FieldInvertState state) { return state.getBoost(); } @Override public byte computeNorm(FieldInvertState state) { return encodeNormValue(state.getBoost()); }
@Override public float tf(float freq) { return freq; } @Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(int docFreq, int numDocs) { return 1.0f; } @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
@Override public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException { @Override public Explanation idfExplain(TermContext[] terms, IndexSearcher searcher) throws IOException {
return new IDFExplanation() { return new Explanation(1.0f, "Inexplicable");
@Override
public float getIdf() {
return 1.0f;
}
@Override
public String explain() {
return "Inexplicable";
}
};
} }
}; };
} }

View File

@@ -149,7 +149,7 @@ public class TestParallelReader extends LuceneTestCase {
assertTrue(pr.isCurrent()); assertTrue(pr.isCurrent());
IndexReader modifier = IndexReader.open(dir1, false); IndexReader modifier = IndexReader.open(dir1, false);
Similarity sim = new DefaultSimilarity(); DefaultSimilarity sim = new DefaultSimilarity();
modifier.setNorm(0, "f1", sim.encodeNormValue(100f)); modifier.setNorm(0, "f1", sim.encodeNormValue(100f));
modifier.close(); modifier.close();

View File

@@ -279,7 +279,7 @@ public class TestPerFieldCodecSupport extends LuceneTestCase {
CodecProvider provider = new CodecProvider(); CodecProvider provider = new CodecProvider();
Codec[] codecs = new Codec[] { new StandardCodec(), Codec[] codecs = new Codec[] { new StandardCodec(),
new SimpleTextCodec(), new MockSepCodec(), new SimpleTextCodec(), new MockSepCodec(),
new PulsingCodec(1 + random.nextInt(10)), new PulsingCodec(1 + random.nextInt(20)),
new MockVariableIntBlockCodec(1 + random.nextInt(10)), new MockVariableIntBlockCodec(1 + random.nextInt(10)),
new MockFixedIntBlockCodec(1 + random.nextInt(10)) }; new MockFixedIntBlockCodec(1 + random.nextInt(10)) };
for (Codec codec : codecs) { for (Codec codec : codecs) {

View File

@@ -81,7 +81,7 @@ public class TestDocValues extends LuceneTestCase {
w.finish(maxDoc); w.finish(maxDoc);
assertEquals(0, trackBytes.get()); assertEquals(0, trackBytes.get());
IndexDocValues r = Bytes.getValues(dir, "test", mode, fixedSize, maxDoc, newIOContext(random)); IndexDocValues r = Bytes.getValues(dir, "test", mode, fixedSize, maxDoc, comp, newIOContext(random));
for (int iter = 0; iter < 2; iter++) { for (int iter = 0; iter < 2; iter++) {
ValuesEnum bytesEnum = getEnum(r); ValuesEnum bytesEnum = getEnum(r);
assertNotNull("enum is null", bytesEnum); assertNotNull("enum is null", bytesEnum);
@@ -105,7 +105,8 @@
Source s; Source s;
IndexDocValues.SortedSource ss; IndexDocValues.SortedSource ss;
if (mode == Bytes.Mode.SORTED) { if (mode == Bytes.Mode.SORTED) {
s = ss = getSortedSource(r, comp); // default is unicode so we can simply pass null here
s = ss = getSortedSource(r, random.nextBoolean() ? comp : null);
} else { } else {
s = getSource(r); s = getSource(r);
ss = null; ss = null;

View File

@@ -20,7 +20,11 @@ package org.apache.lucene.search;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.Similarity.Stats;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
@@ -187,8 +191,8 @@
static final class JustCompilePhraseScorer extends PhraseScorer { static final class JustCompilePhraseScorer extends PhraseScorer {
JustCompilePhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, JustCompilePhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity similarity, byte[] norms) { Similarity.SloppyDocScorer docScorer) throws IOException {
super(weight, postings, similarity, norms); super(weight, postings, docScorer);
} }
@Override @Override
@@ -243,12 +247,22 @@ final class JustCompileSearch {
static final class JustCompileSimilarity extends Similarity { static final class JustCompileSimilarity extends Similarity {
@Override @Override
public float idf(int docFreq, int numDocs) { public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }
@Override @Override
public float computeNorm(FieldInvertState state) { public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
@Override
public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
@Override
public byte computeNorm(FieldInvertState state) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }
@@ -256,11 +270,6 @@ final class JustCompileSearch {
public float sloppyFreq(int distance) { public float sloppyFreq(int distance) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }
@Override
public float tf(float freq) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
} }
static final class JustCompileSimilarityProvider implements SimilarityProvider { static final class JustCompileSimilarityProvider implements SimilarityProvider {
@@ -348,17 +357,12 @@ final class JustCompileSearch {
} }
@Override @Override
public float getValue() { public void normalize(float norm, float topLevelBoost) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }
@Override @Override
public void normalize(float norm) { public float getValueForNormalization() throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
@Override
public float sumOfSquaredWeights() throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG); throw new UnsupportedOperationException(UNSUPPORTED_MSG);
} }
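In the Weight hunk just above, getValue() is gone, sumOfSquaredWeights() has become getValueForNormalization(), and normalize() now takes the top-level boost as a second argument. A rough sketch of the order in which a searcher would drive the new methods (the helper class and variable names are invented; SimilarityProvider.queryNorm is the method seen in the TestOmitTf hunk above):

import java.io.IOException;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.Weight;

final class WeightNormalizationSketch {
  static void normalize(Weight weight, SimilarityProvider provider) throws IOException {
    float sum = weight.getValueForNormalization();  // formerly sumOfSquaredWeights()
    float norm = provider.queryNorm(sum);
    weight.normalize(norm, 1.0f);                   // second argument: boost of the outermost query
  }
}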

View File

@@ -62,9 +62,9 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
} }
@Override @Override
public float computeNorm(FieldInvertState state) { public byte computeNorm(FieldInvertState state) {
// Disable length norm // Disable length norm
return state.getBoost(); return encodeNormValue(state.getBoost());
} }
@Override @Override

View File

@@ -0,0 +1,203 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IndexDocValuesField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.values.IndexDocValues.Source;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TermContext;
/**
* Tests the use of IndexDocValues in scoring.
*
* In the example, a docvalues field is used as a per-document boost (separate from the norm).
* @lucene.experimental
*/
public class TestDocValuesScoring extends LuceneTestCase {
private static final float SCORE_EPSILON = 0.001f; /* for comparing floats */
public void testSimple() throws Exception {
assumeFalse("PreFlex codec cannot work with IndexDocValues!",
"PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random, dir);
Document doc = new Document();
Field field = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
doc.add(field);
IndexDocValuesField dvField = new IndexDocValuesField("foo_boost");
doc.add(dvField);
Field field2 = newField("bar", "", Field.Store.NO, Field.Index.ANALYZED);
doc.add(field2);
field.setValue("quick brown fox");
field2.setValue("quick brown fox");
dvField.setFloat(2f); // boost x2
iw.addDocument(doc);
field.setValue("jumps over lazy brown dog");
field2.setValue("jumps over lazy brown dog");
dvField.setFloat(4f); // boost x4
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
// no boosting
IndexSearcher searcher1 = newSearcher(ir);
// boosting
IndexSearcher searcher2 = newSearcher(ir);
searcher2.setSimilarityProvider(new DefaultSimilarityProvider() {
final Similarity fooSim = new BoostingSimilarity(super.get("foo"), "foo_boost");
public Similarity get(String field) {
return "foo".equals(field) ? fooSim : super.get(field);
}
});
// in this case, we searched on field "foo". first document should have 2x the score.
TermQuery tq = new TermQuery(new Term("foo", "quick"));
QueryUtils.check(random, tq, searcher1);
QueryUtils.check(random, tq, searcher2);
TopDocs noboost = searcher1.search(tq, 10);
TopDocs boost = searcher2.search(tq, 10);
assertEquals(1, noboost.totalHits);
assertEquals(1, boost.totalHits);
//System.out.println(searcher2.explain(tq, boost.scoreDocs[0].doc));
assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*2f, SCORE_EPSILON);
// this query matches only the second document, which should have 4x the score.
tq = new TermQuery(new Term("foo", "jumps"));
QueryUtils.check(random, tq, searcher1);
QueryUtils.check(random, tq, searcher2);
noboost = searcher1.search(tq, 10);
boost = searcher2.search(tq, 10);
assertEquals(1, noboost.totalHits);
assertEquals(1, boost.totalHits);
assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*4f, SCORE_EPSILON);
// search on field bar just for kicks; nothing should happen, since we set up
// our sim provider to only use foo_boost for field foo.
tq = new TermQuery(new Term("bar", "quick"));
QueryUtils.check(random, tq, searcher1);
QueryUtils.check(random, tq, searcher2);
noboost = searcher1.search(tq, 10);
boost = searcher2.search(tq, 10);
assertEquals(1, noboost.totalHits);
assertEquals(1, boost.totalHits);
assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score, SCORE_EPSILON);
searcher1.close();
searcher2.close();
ir.close();
dir.close();
}
/**
* Similarity that wraps another similarity and boosts the final score
* according to what's in a docvalues field.
*
* @lucene.experimental
*/
static class BoostingSimilarity extends Similarity {
private final Similarity sim;
private final String boostField;
public BoostingSimilarity(Similarity sim, String boostField) {
this.sim = sim;
this.boostField = boostField;
}
@Override
public byte computeNorm(FieldInvertState state) {
return sim.computeNorm(state);
}
@Override
public float sloppyFreq(int distance) {
return sim.sloppyFreq(distance);
}
@Override
public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
return sim.computeStats(searcher, fieldName, queryBoost, termContexts);
}
@Override
public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
final ExactDocScorer sub = sim.exactDocScorer(stats, fieldName, context);
final Source values = context.reader.docValues(boostField).getSource();
return new ExactDocScorer() {
@Override
public float score(int doc, int freq) {
return (float) values.getFloat(doc) * sub.score(doc, freq);
}
@Override
public Explanation explain(int doc, Explanation freq) {
Explanation boostExplanation = new Explanation((float) values.getFloat(doc), "indexDocValue(" + boostField + ")");
Explanation simExplanation = sub.explain(doc, freq);
Explanation expl = new Explanation(boostExplanation.getValue() * simExplanation.getValue(), "product of:");
expl.addDetail(boostExplanation);
expl.addDetail(simExplanation);
return expl;
}
};
}
@Override
public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
final SloppyDocScorer sub = sim.sloppyDocScorer(stats, fieldName, context);
final Source values = context.reader.docValues(boostField).getSource();
return new SloppyDocScorer() {
@Override
public float score(int doc, float freq) {
return (float) values.getFloat(doc) * sub.score(doc, freq);
}
@Override
public Explanation explain(int doc, Explanation freq) {
Explanation boostExplanation = new Explanation((float) values.getFloat(doc), "indexDocValue(" + boostField + ")");
Explanation simExplanation = sub.explain(doc, freq);
Explanation expl = new Explanation(boostExplanation.getValue() * simExplanation.getValue(), "product of:");
expl.addDetail(boostExplanation);
expl.addDetail(simExplanation);
return expl;
}
};
}
}
}
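The assertion the test makes can be reduced to one multiplication: the wrapping scorer multiplies whatever the delegate produces by the per-document value read from the boost field. A trivial worked example with made-up numbers:

final class BoostArithmeticSketch {
  public static void main(String[] args) {
    float baseScore = 0.25f;  // hypothetical score the wrapped Similarity gives doc 0 for "quick"
    float docBoost = 2f;      // value indexed into "foo_boost" for doc 0
    System.out.println(docBoost * baseScore);  // 0.5 - exactly 2x, which is what testSimple asserts
  }
}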

Some files were not shown because too many files have changed in this diff