diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 58f2a38bc11..7d76af0b813 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -156,6 +156,12 @@ Changes in backwards compatibility policy the queries module and can be found at o.a.l.queries.function. See MIGRATE.txt for more information (Chris Male) +* LUCENE-2392: Decoupled vector space scoring from Query/Weight/Scorer. If you + extended Similarity directly before, you should extend TFIDFSimilarity instead. + Similarity is now a lower-level API to implement other scoring algorithms. + See MIGRATE.txt for more details. + (David Nemeskey, Simon Willnauer, Mike McCandless, Robert Muir) + Changes in Runtime Behavior * LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index ffbdef459c5..268ca527fb3 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -382,3 +382,13 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing - o.a.l.search.function.ShortFieldSource -> o.a.l.queries.function.valuesource.ShortFieldSource - o.a.l.search.function.ValueSource -> o.a.l.queries.function.ValueSource - o.a.l.search.function.ValueSourceQuery -> o.a.l.queries.function.FunctionQuery + +* LUCENE-2392: Enable flexible scoring: + + The existing "Similarity" API is now TFIDFSimilarity; if you were extending + Similarity before, you should likely extend this instead. + + Weight.normalize no longer takes a norm value that incorporates the top-level + boost from outer queries such as BooleanQuery; instead it takes two parameters, + the outer boost (topLevelBoost) and the norm. Weight.sumOfSquaredWeights has + been renamed to Weight.getValueForNormalization().
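To illustrate the renamed methods, here is an editor's sketch (not part of the patch; the helper name normalizedWeight is made up) that mirrors the IndexSearcher.createNormalizedWeight() hunk further down in this diff:

  static Weight normalizedWeight(IndexSearcher searcher, Query query) throws IOException {
    query = searcher.rewrite(query);
    Weight weight = query.createWeight(searcher);
    // Bottom-up pass: gather the raw value (was Weight.sumOfSquaredWeights()).
    float v = weight.getValueForNormalization();
    float norm = searcher.getSimilarityProvider().queryNorm(v);
    if (Float.isInfinite(norm) || Float.isNaN(norm)) {
      norm = 1.0f;
    }
    // Top-down pass: the norm and the accumulated outer boost travel together.
    // The top level starts with topLevelBoost = 1.0f; compound weights such as
    // BooleanQuery's weight multiply in their own getBoost() before recursing.
    weight.normalize(norm, 1.0f);
    return weight;
  }

diff --git a/lucene/common-build.xml b/lucene/common-build.xml index 0de68e181f4..2af53feb602 100644 --- a/lucene/common-build.xml +++ b/lucene/common-build.xml @@ -331,7 +331,7 @@ - + @@ -351,7 +351,7 @@ - + diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java index 26c166e9ce1..8b2635085c7 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java @@ -240,8 +240,7 @@ public class InstantiatedIndexWriter implements Closeable { final FieldInvertState invertState = new FieldInvertState(); invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost()); invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength); - final float norm = similarityProvider.get(fieldName).computeNorm(invertState); - normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).encodeNormValue(norm); + normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).computeNorm(invertState); } else { System.currentTimeMillis(); } diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 9e383eb2900..727d47cafa3 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -51,7 +51,6 @@ import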
org.apache.lucene.index.TermFreqVector; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermVectorMapper; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.search.Collector; import org.apache.lucene.search.IndexSearcher; @@ -1202,19 +1201,18 @@ public class MemoryIndex { int numOverlapTokens = info != null ? info.numOverlapTokens : 0; float boost = info != null ? info.getBoost() : 1.0f; FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost); - float n = fieldSim.computeNorm(invertState); - byte norm = fieldSim.encodeNormValue(n); + byte norm = fieldSim.computeNorm(invertState); norms = new byte[] {norm}; // cache it for future reuse cachedNorms = norms; cachedFieldName = fieldName; cachedSimilarity = sim; - if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" + norm + ":" + numTokens); + if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + norm + ":" + numTokens); } return norms; } - + @Override protected void doSetNorm(int doc, String fieldName, byte value) { throw new UnsupportedOperationException(); diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java index d3673243e0d..c17ac02aed7 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/FieldNormModifier.java @@ -147,7 +147,7 @@ public class FieldNormModifier { for (int d = 0; d < termCounts.length; d++) { if (liveDocs == null || liveDocs.get(d)) { invertState.setLength(termCounts[d]); - subReader.setNorm(d, field, fieldSim.encodeNormValue(fieldSim.computeNorm(invertState))); + subReader.setNorm(d, field, fieldSim.computeNorm(invertState)); } } } diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java index 57915da0675..f39ce3db518 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java @@ -25,7 +25,6 @@ import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.codecs.Codec; -import org.apache.lucene.index.codecs.DocValuesConsumer; import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; @@ -58,7 +57,7 @@ public class AppendingCodec extends Codec { public static String CODEC_NAME = "Appending"; public AppendingCodec() { - name = CODEC_NAME; + super(CODEC_NAME); } @Override @@ -138,22 +137,22 @@ public class AppendingCodec extends Codec { StandardPostingsReader.files(dir, segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS()); } @Override public void getExtensions(Set extensions) { 
StandardCodec.getStandardExtensions(extensions); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context); } } diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java index c5c454a14bb..f1ac1459532 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java @@ -106,7 +106,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity { * discountOverlaps is true by default or true for this * specific field. */ @Override - public float computeNorm(FieldInvertState state) { + public byte computeNorm(FieldInvertState state) { final int numTokens; if (discountOverlaps) @@ -114,7 +114,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity { else numTokens = state.getLength(); - return state.getBoost() * computeLengthNorm(numTokens); + return encodeNormValue(state.getBoost() * computeLengthNorm(numTokens)); } /** diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/store/WindowsDirectory.java b/lucene/contrib/misc/src/java/org/apache/lucene/store/WindowsDirectory.java index 0e435c5eb29..29d8998cfc8 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/store/WindowsDirectory.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/store/WindowsDirectory.java @@ -70,7 +70,7 @@ public class WindowsDirectory extends FSDirectory { @Override public IndexInput openInput(String name, IOContext context) throws IOException { ensureOpen(); - return new WindowsIndexInput(new File(getDirectory(), name), DEFAULT_BUFFERSIZE); + return new WindowsIndexInput(new File(getDirectory(), name), Math.max(BufferedIndexInput.bufferSize(context), DEFAULT_BUFFERSIZE)); } protected static class WindowsIndexInput extends BufferedIndexInput { diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java index 9af69bba012..d5896a45a27 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestFieldNormModifier.java @@ -49,8 +49,8 @@ public class TestFieldNormModifier extends LuceneTestCase { public Similarity get(String field) { return new DefaultSimilarity() { @Override - public float computeNorm(FieldInvertState state) { - return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()); + public byte computeNorm(FieldInvertState state) { + return encodeNormValue(state.getBoost() * (discountOverlaps ? 
state.getLength() - state.getNumOverlap() : state.getLength())); } }; } diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java b/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java index f7f33f3748f..0e9732c4a91 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java @@ -21,6 +21,7 @@ package org.apache.lucene.misc; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.DefaultSimilarityProvider; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.TFIDFSimilarity; import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.index.FieldInvertState; @@ -58,15 +59,15 @@ public class SweetSpotSimilarityTest extends LuceneTestCase { invertState.setLength(i); assertEquals("3,10: spot i="+i, 1.0f, - s.computeNorm(invertState), + ss.decodeNormValue(s.computeNorm(invertState)), 0.0f); } for (int i = 10; i < 1000; i++) { invertState.setLength(i-9); - final float normD = d.computeNorm(invertState); + final byte normD = d.computeNorm(invertState); invertState.setLength(i); - final float normS = s.computeNorm(invertState); + final byte normS = s.computeNorm(invertState); assertEquals("3,10: 10<x<1000: normD == normS", normD, normS, 0.0f); } } diff --git a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java --- a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java final List<Fields> fields = new ArrayList<Fields>(); - final List<ReaderUtil.Slice> slices = new ArrayList<ReaderUtil.Slice>(); final List<Bits> bits = new ArrayList<Bits>(); final List<Integer> bitsStarts = new ArrayList<Integer>(); - - // TODO: move this into its own method - this merges currently only docvalues - final List<PerDocValues> perDocProducers = new ArrayList<PerDocValues>(); - final List<ReaderUtil.Slice> perDocSlices = new ArrayList<ReaderUtil.Slice>(); - final List<Bits> perDocBits = new ArrayList<Bits>(); - final List<Integer> perDocBitsStarts = new ArrayList<Integer>(); for(IndexReader r : readers) { final Fields f = r.fields(); @@ -504,18 +498,10 @@ final class SegmentMerger { bits.add(r.getLiveDocs()); bitsStarts.add(docBase); } - final PerDocValues producer = r.perDocValues(); - if (producer != null) { - perDocSlices.add(new ReaderUtil.Slice(docBase, maxDoc, fields.size())); - perDocProducers.add(producer); - perDocBits.add(r.getLiveDocs()); - perDocBitsStarts.add(docBase); - } docBase += maxDoc; } bitsStarts.add(docBase); - perDocBitsStarts.add(docBase); // we may gather more readers than mergeState.readerCount mergeState = new MergeState(); @@ -581,19 +567,45 @@ } finally { consumer.close(); } + } + + private void mergePerDoc() throws IOException { + final List<PerDocValues> perDocProducers = new ArrayList<PerDocValues>(); + final List<ReaderUtil.Slice> perDocSlices = new ArrayList<ReaderUtil.Slice>(); + final List<Bits> perDocBits = new ArrayList<Bits>(); + final List<Integer> perDocBitsStarts = new ArrayList<Integer>(); + int docBase = 0; + for (IndexReader r : readers) { + final int maxDoc = r.maxDoc(); + final PerDocValues producer = r.perDocValues(); + if (producer != null) { + perDocSlices.add(new ReaderUtil.Slice(docBase, maxDoc, perDocProducers + .size())); + perDocProducers.add(producer); + perDocBits.add(r.getLiveDocs()); + perDocBitsStarts.add(docBase); + } + docBase += maxDoc; + } + perDocBitsStarts.add(docBase); if (!perDocSlices.isEmpty()) { - mergeState.multiLiveDocs = new MultiBits(perDocBits, perDocBitsStarts, true); + mergeState.multiLiveDocs = new MultiBits(perDocBits, perDocBitsStarts, + true); final PerDocConsumer docsConsumer = codec .docsConsumer(new PerDocWriteState(segmentWriteState)); + boolean success = false; try { - final MultiPerDocValues multiPerDocValues = new MultiPerDocValues(perDocProducers -
.toArray(PerDocValues.EMPTY_ARRAY), perDocSlices - .toArray(ReaderUtil.Slice.EMPTY_ARRAY)); + final MultiPerDocValues multiPerDocValues = new MultiPerDocValues( + perDocProducers.toArray(PerDocValues.EMPTY_ARRAY), + perDocSlices.toArray(ReaderUtil.Slice.EMPTY_ARRAY)); docsConsumer.merge(mergeState, multiPerDocValues); + success = true; } finally { - docsConsumer.close(); + IOUtils.closeSafely(!success, docsConsumer); } } + /* don't close the perDocProducers here since they are private segment producers + * and will be closed once the SegmentReader goes out of scope */ } private MergeState mergeState; diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java b/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java index ca9b628cb0e..b03463782ff 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorsReader.java @@ -25,11 +25,13 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IOContext.Context; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import java.io.Closeable; import java.io.IOException; import java.util.Arrays; -class TermVectorsReader implements Cloneable { +class TermVectorsReader implements Cloneable, Closeable { // NOTE: if you make a new format, it must be larger than // the current format @@ -190,14 +192,8 @@ class TermVectorsReader implements Cloneable { return format; } - void close() throws IOException { - // make all effort to close up. Keep the first exception - // and throw it as a new one. - IOException keep = null; - if (tvx != null) try { tvx.close(); } catch (IOException e) { keep = e; } - if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; } - if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; } - if (keep != null) throw (IOException) keep.fillInStackTrace(); + public void close() throws IOException { + IOUtils.closeSafely(false, tvx, tvd, tvf); } /** diff --git a/lucene/src/java/org/apache/lucene/index/codecs/Codec.java b/lucene/src/java/org/apache/lucene/index/codecs/Codec.java index 736ceed67dd..bd0af40ad94 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/Codec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/Codec.java @@ -18,6 +18,7 @@ package org.apache.lucene.index.codecs; */ import java.io.IOException; +import java.util.Comparator; import java.util.Set; import org.apache.lucene.index.PerDocWriteState; @@ -25,13 +26,21 @@ import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; /** @lucene.experimental */ public abstract class Codec { public static final Codec[] EMPTY = new Codec[0]; /** Unique name that's used to retrieve this codec when * reading the index */ - public String name; + public final String name; + private boolean dvUseCompoundFile = true; + private Comparator docValuesSortComparator = BytesRef + .getUTF8SortedAsUnicodeComparator(); + + protected Codec(String name) { + this.name = name; + } /** Writes a new segment */ public abstract FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException; @@ -68,7 +77,48 @@ public abstract class Codec { /** Records all file extensions this codec uses */ public abstract void getExtensions(Set extensions); + + /** + * If set to true 
this codec will use a compound file for + * IndexDocValues, otherwise each IndexDocValues field will create up to 2 + * files per segment. + *

+ * NOTE: The default value is true. + */ + public void setDocValuesUseCFS(boolean docValuesUseCFS) { + this.dvUseCompoundFile = docValuesUseCFS; + } + + /** + * Returns true iff compound file should be used for + * IndexDocValues, otherwise false. + * + * @see #setDocValuesUseCFS(boolean) + * @return true iff compound file should be used for + * IndexDocValues, otherwise false. + */ + public boolean getDocValuesUseCFS() { + return dvUseCompoundFile; + } + + /** + * Sets the {@link BytesRef} comparator for sorted IndexDocValue variants. The + * default is {@link BytesRef#getUTF8SortedAsUnicodeComparator()}. + */ + public void setDocValuesSortComparator( + Comparator<BytesRef> docValuesSortComparator) { + this.docValuesSortComparator = docValuesSortComparator; + } + + /** + * Returns the {@link BytesRef} comparator for sorted IndexDocValue variants. + * The default is {@link BytesRef#getUTF8SortedAsUnicodeComparator()}. + */ + public Comparator<BytesRef> getDocValuesSortComparator() { + return docValuesSortComparator; + } + @Override public String toString() { return name; }
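As a usage sketch (an editor's illustration, not part of the patch), the new setters could be applied to a codec instance before it is used for writing:

  // Hypothetical setup using the setters introduced above.
  Codec codec = new StandardCodec();
  // Write one .dat/.idx pair per docvalues field instead of one compound file:
  codec.setDocValuesUseCFS(false);
  // Keep the default sort order for sorted byte variants (shown for clarity):
  codec.setDocValuesSortComparator(BytesRef.getUTF8SortedAsUnicodeComparator());

Packing docvalues into a per-codec compound file keeps the number of open files per segment down at the cost of one level of read indirection, which is presumably why it is the default.

diff --git a/lucene/src/java/org/apache/lucene/index/codecs/CoreCodecProvider.java b/lucene/src/java/org/apache/lucene/index/codecs/CoreCodecProvider.java index 6f3934b401e..74d547b56e6 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/CoreCodecProvider.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/CoreCodecProvider.java @@ -44,7 +44,7 @@ public class CoreCodecProvider extends CodecProvider { public CoreCodecProvider() { register(new StandardCodec()); register(new PreFlexCodec()); - register(new PulsingCodec(1)); + register(new PulsingCodec()); register(new SimpleTextCodec()); register(new MemoryCodec()); } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java index b02c9c91d49..d1749fb320a 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesConsumer.java @@ -32,79 +32,106 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.util.BytesRef; +/** + * + * @lucene.experimental + */ public class DefaultDocValuesConsumer extends PerDocConsumer { private final String segmentName; private final int codecId; private final Directory directory; private final AtomicLong bytesUsed; private final Comparator<BytesRef> comparator; - - public DefaultDocValuesConsumer(PerDocWriteState state, Comparator<BytesRef> comparator) { + private boolean useCompoundFile; + private final IOContext context; + + public DefaultDocValuesConsumer(PerDocWriteState state, Comparator<BytesRef> comparator, boolean useCompoundFile) throws IOException { this.segmentName = state.segmentName; this.codecId = state.codecId; this.bytesUsed = state.bytesUsed; - this.directory = state.directory; + this.context = state.context; + //TODO maybe we should enable a global CFS that all codecs can pull on demand to further reduce the number of files? + this.directory = useCompoundFile ?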
state.directory.createCompoundOutput( + IndexFileNames.segmentFileName(segmentName, codecId, + IndexFileNames.COMPOUND_FILE_EXTENSION), context) : state.directory; this.comparator = comparator; + this.useCompoundFile = useCompoundFile; } - + public void close() throws IOException { + if (useCompoundFile) { + this.directory.close(); + } } @Override public DocValuesConsumer addValuesField(FieldInfo field) throws IOException { return Writer.create(field.getDocValues(), docValuesId(segmentName, codecId, field.number), - // TODO can we have a compound file per segment and codec for - // docvalues? - directory, comparator, bytesUsed, IOContext.DEFAULT); + directory, comparator, bytesUsed, context); } @SuppressWarnings("fallthrough") public static void files(Directory dir, SegmentInfo segmentInfo, int codecId, - Set files) throws IOException { + Set files, boolean useCompoundFile) throws IOException { FieldInfos fieldInfos = segmentInfo.getFieldInfos(); for (FieldInfo fieldInfo : fieldInfos) { if (fieldInfo.getCodecId() == codecId && fieldInfo.hasDocValues()) { String filename = docValuesId(segmentInfo.name, codecId, fieldInfo.number); - switch (fieldInfo.getDocValues()) { - case BYTES_FIXED_DEREF: - case BYTES_VAR_DEREF: - case BYTES_VAR_SORTED: - case BYTES_FIXED_SORTED: - case BYTES_VAR_STRAIGHT: - files.add(IndexFileNames.segmentFileName(filename, "", - Writer.INDEX_EXTENSION)); - assert dir.fileExists(IndexFileNames.segmentFileName(filename, "", - Writer.INDEX_EXTENSION)); - // until here all types use an index - case BYTES_FIXED_STRAIGHT: - case FLOAT_32: - case FLOAT_64: - case VAR_INTS: - case FIXED_INTS_16: - case FIXED_INTS_32: - case FIXED_INTS_64: - case FIXED_INTS_8: - files.add(IndexFileNames.segmentFileName(filename, "", - Writer.DATA_EXTENSION)); - assert dir.fileExists(IndexFileNames.segmentFileName(filename, "", - Writer.DATA_EXTENSION)); - break; - - default: - assert false; + if (useCompoundFile) { + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_EXTENSION)); + files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION)); + assert dir.fileExists(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION)); + assert dir.fileExists(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_EXTENSION)); + return; + } else { + switch (fieldInfo.getDocValues()) { + case BYTES_FIXED_DEREF: + case BYTES_VAR_DEREF: + case BYTES_VAR_SORTED: + case BYTES_FIXED_SORTED: + case BYTES_VAR_STRAIGHT: + files.add(IndexFileNames.segmentFileName(filename, "", + Writer.INDEX_EXTENSION)); + assert dir.fileExists(IndexFileNames.segmentFileName(filename, "", + Writer.INDEX_EXTENSION)); + // until here all types use an index + case BYTES_FIXED_STRAIGHT: + case FLOAT_32: + case FLOAT_64: + case VAR_INTS: + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + case FIXED_INTS_8: + files.add(IndexFileNames.segmentFileName(filename, "", + Writer.DATA_EXTENSION)); + assert dir.fileExists(IndexFileNames.segmentFileName(filename, "", + Writer.DATA_EXTENSION)); + break; + + default: + assert false; + } } } } } + static String docValuesId(String segmentsName, int codecID, int fieldId) { return segmentsName + "_" + codecID + "-" + fieldId; } - - public static void getDocValuesExtensions(Set extensions) { - extensions.add(Writer.DATA_EXTENSION); - extensions.add(Writer.INDEX_EXTENSION); + + public static void 
getDocValuesExtensions(Set extensions, boolean useCompoundFile) { + if (useCompoundFile) { + extensions.add(IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION); + extensions.add(IndexFileNames.COMPOUND_FILE_EXTENSION); + } else { + extensions.add(Writer.DATA_EXTENSION); + extensions.add(Writer.INDEX_EXTENSION); + } } + } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java b/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java index 663df35c036..6a3207d2b96 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/DefaultDocValuesProducer.java @@ -16,12 +16,16 @@ package org.apache.lucene.index.codecs; * See the License for the specific language governing permissions and * limitations under the License. */ +import java.io.Closeable; import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; +import java.util.Comparator; import java.util.TreeMap; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.values.Bytes; import org.apache.lucene.index.values.IndexDocValues; @@ -30,6 +34,8 @@ import org.apache.lucene.index.values.Ints; import org.apache.lucene.index.values.ValueType; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; /** * Abstract base class for FieldsProducer implementations supporting @@ -40,8 +46,12 @@ import org.apache.lucene.store.IOContext; public class DefaultDocValuesProducer extends PerDocValues { protected final TreeMap docValues; + private final boolean useCompoundFile; + private final Closeable cfs; + private final Comparator sortComparator; /** + * * Creates a new {@link DefaultDocValuesProducer} instance and loads all * {@link IndexDocValues} instances for this segment and codec. * @@ -53,12 +63,27 @@ public class DefaultDocValuesProducer extends PerDocValues { * the {@link FieldInfos} * @param codecId * the codec ID + * @param useCompoundFile + * if true this producer opens a compound file to read + * IndexDocValues fields, otherwise each field defines its own set of + * files. 
+ * @param sortComparator + * defines the sort order for sorted IndexDocValues variants * @throws IOException * if an {@link IOException} occurs */ - public DefaultDocValuesProducer(SegmentInfo si, Directory dir, - FieldInfos fieldInfo, int codecId, IOContext context) throws IOException { - docValues = load(fieldInfo, si.name, si.docCount, dir, codecId, context); + public DefaultDocValuesProducer(SegmentInfo si, Directory dir, + FieldInfos fieldInfo, int codecId, boolean useCompoundFile, Comparator sortComparator, IOContext context) throws IOException { + this.useCompoundFile = useCompoundFile; + this.sortComparator = sortComparator; + final Directory directory; + if (useCompoundFile) { + cfs = directory = dir.openCompoundInput(IndexFileNames.segmentFileName(si.name, codecId, IndexFileNames.COMPOUND_FILE_EXTENSION), context); + } else { + cfs = null; + directory = dir; + } + docValues = load(fieldInfo, si.name, si.docCount, directory, codecId, context); } /** @@ -86,14 +111,14 @@ public class DefaultDocValuesProducer extends PerDocValues { final String id = DefaultDocValuesConsumer.docValuesId(segment, codecId, fieldInfo.number); values.put(field, - loadDocValues(docCount, dir, id, fieldInfo.getDocValues(), context)); + loadDocValues(docCount, dir, id, fieldInfo.getDocValues(), sortComparator, context)); } } success = true; } finally { if (!success) { // if we fail we must close all opened resources if there are any - closeDocValues(values.values()); + closeInternal(values.values()); } } return values; @@ -113,6 +138,7 @@ public class DefaultDocValuesProducer extends PerDocValues { * the unique file ID within the segment * @param type * the type to load + * @param sortComparator byte comparator used by sorted variants * @return a {@link IndexDocValues} instance for the given type * @throws IOException * if an {@link IOException} occurs @@ -120,7 +146,7 @@ public class DefaultDocValuesProducer extends PerDocValues { * if the given {@link ValueType} is not supported */ protected IndexDocValues loadDocValues(int docCount, Directory dir, String id, - ValueType type, IOContext context) throws IOException { + ValueType type, Comparator sortComparator, IOContext context) throws IOException { switch (type) { case FIXED_INTS_16: case FIXED_INTS_32: @@ -133,39 +159,36 @@ public class DefaultDocValuesProducer extends PerDocValues { case FLOAT_64: return Floats.getValues(dir, id, docCount, context); case BYTES_FIXED_STRAIGHT: - return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, true, docCount, context); + return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, true, docCount, sortComparator, context); case BYTES_FIXED_DEREF: - return Bytes.getValues(dir, id, Bytes.Mode.DEREF, true, docCount, context); + return Bytes.getValues(dir, id, Bytes.Mode.DEREF, true, docCount, sortComparator, context); case BYTES_FIXED_SORTED: - return Bytes.getValues(dir, id, Bytes.Mode.SORTED, true, docCount, context); + return Bytes.getValues(dir, id, Bytes.Mode.SORTED, true, docCount, sortComparator, context); case BYTES_VAR_STRAIGHT: - return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, false, docCount, context); + return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, false, docCount, sortComparator, context); case BYTES_VAR_DEREF: - return Bytes.getValues(dir, id, Bytes.Mode.DEREF, false, docCount, context); + return Bytes.getValues(dir, id, Bytes.Mode.DEREF, false, docCount, sortComparator, context); case BYTES_VAR_SORTED: - return Bytes.getValues(dir, id, Bytes.Mode.SORTED, false, docCount, context); + return 
Bytes.getValues(dir, id, Bytes.Mode.SORTED, false, docCount, sortComparator, context); default: throw new IllegalStateException("unrecognized index values mode " + type); } } public void close() throws IOException { - closeDocValues(docValues.values()); + closeInternal(docValues.values()); } - private void closeDocValues(final Collection values) - throws IOException { - IOException ex = null; - for (IndexDocValues docValues : values) { - try { - docValues.close(); - } catch (IOException e) { - ex = e; - } - } - if (ex != null) { - throw ex; - } + private void closeInternal(Collection closeables) throws IOException { + final Collection toClose; + if (useCompoundFile) { + final ArrayList list = new ArrayList(closeables); + list.add(cfs); + toClose = list; + } else { + toClose = closeables; + } + IOUtils.closeSafely(false, toClose); } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java index 79c1cab7e73..b4fed2dbabe 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java @@ -77,9 +77,9 @@ import org.apache.lucene.util.fst.FST; * @lucene.experimental */ public class MemoryCodec extends Codec { - + public MemoryCodec() { - name = "Memory"; + super("Memory"); } private static final boolean VERBOSE = false; @@ -779,22 +779,22 @@ public class MemoryCodec extends Codec { @Override public void files(Directory dir, SegmentInfo segmentInfo, int id, Set files) throws IOException { files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, EXTENSION)); - DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS()); } @Override public void getExtensions(Set extensions) { extensions.add(EXTENSION); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, IOContext.READONCE); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), IOContext.READONCE); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java index 087be5e527d..e5ce0b65f9e 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java @@ -55,7 +55,7 @@ public class PreFlexCodec extends Codec { public static final String PROX_EXTENSION = "prx"; public PreFlexCodec() { - name = "PreFlex"; + super("PreFlex"); } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java index e5c5283992b..2bb6d97dfe3 100644 --- 
a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java @@ -43,7 +43,6 @@ import org.apache.lucene.index.codecs.TermsIndexReaderBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase; import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; /** This codec "inlines" the postings for terms that have @@ -58,10 +57,19 @@ public class PulsingCodec extends Codec { private final int freqCutoff; + /** + * Creates a {@link PulsingCodec} with freqCutoff = 1. + * + * @see PulsingCodec#PulsingCodec(int) + */ + public PulsingCodec() { + this(1); + } + /** Terms with freq <= freqCutoff are inlined into terms * dict. */ public PulsingCodec(int freqCutoff) { - name = "Pulsing"; + super("Pulsing"); this.freqCutoff = freqCutoff; } @@ -157,22 +165,22 @@ StandardPostingsReader.files(dir, segmentInfo, id, files); BlockTermsReader.files(dir, segmentInfo, id, files); VariableGapTermsIndexReader.files(dir, segmentInfo, id, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS()); } @Override public void getExtensions(Set<String> extensions) { StandardCodec.getStandardExtensions(extensions); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context); } }
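A short usage sketch (not from the patch) of the two constructors; the cutoff governs which terms get their postings inlined into the terms dictionary:

  // Terms appearing in at most 3 documents have their postings inlined:
  PulsingCodec rareTermsInlined = new PulsingCodec(3);
  // The new no-arg constructor is equivalent to new PulsingCodec(1),
  // which is what CoreCodecProvider now registers (see above):
  PulsingCodec defaultCutoff = new PulsingCodec();

diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java index 832b7e3bb20..1bdb88fee48 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextCodec.java @@ -33,7 +33,6 @@ import org.apache.lucene.index.codecs.PerDocConsumer; import org.apache.lucene.index.codecs.DefaultDocValuesConsumer; import org.apache.lucene.index.codecs.PerDocValues; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRef; /** For debugging, curiosity, transparency only!! Do not * use this codec in production.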
@@ -44,11 +43,12 @@ import org.apache.lucene.util.BytesRef; * * @lucene.experimental */ public class SimpleTextCodec extends Codec { - + public SimpleTextCodec() { - name = "SimpleText"; + super("SimpleText"); } + @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { return new SimpleTextFieldsWriter(state); @@ -69,23 +69,23 @@ public class SimpleTextCodec extends Codec { @Override public void files(Directory dir, SegmentInfo segmentInfo, int id, Set files) throws IOException { files.add(getPostingsFileName(segmentInfo.name, id)); - DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS()); } @Override public void getExtensions(Set extensions) { extensions.add(POSTINGS_EXTENSION); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } // TODO: would be great if these used a plain text impl @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java index 47def928931..eed2648e045 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java @@ -40,14 +40,13 @@ import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.BlockTermsReader; import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRef; /** Default codec. 
* @lucene.experimental */ public class StandardCodec extends Codec { public StandardCodec() { - name = "Standard"; + super("Standard"); } @Override @@ -140,13 +139,13 @@ public class StandardCodec extends Codec { StandardPostingsReader.files(dir, segmentInfo, id, files); BlockTermsReader.files(dir, segmentInfo, id, files); VariableGapTermsIndexReader.files(dir, segmentInfo, id, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS()); } @Override public void getExtensions(Set extensions) { getStandardExtensions(extensions); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } public static void getStandardExtensions(Set extensions) { @@ -158,11 +157,11 @@ public class StandardCodec extends Codec { @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context); } } diff --git a/lucene/src/java/org/apache/lucene/index/values/Bytes.java b/lucene/src/java/org/apache/lucene/index/values/Bytes.java index 9735c69a0dc..f92e6578bab 100644 --- a/lucene/src/java/org/apache/lucene/index/values/Bytes.java +++ b/lucene/src/java/org/apache/lucene/index/values/Bytes.java @@ -153,12 +153,13 @@ public final class Bytes { * otherwise false * @param maxDoc * the number of document values stored for the given ID + * @param sortComparator byte comparator used by sorted variants * @return an initialized {@link IndexDocValues} instance. * @throws IOException * if an {@link IOException} occurs */ public static IndexDocValues getValues(Directory dir, String id, Mode mode, - boolean fixedSize, int maxDoc, IOContext context) throws IOException { + boolean fixedSize, int maxDoc, Comparator sortComparator, IOContext context) throws IOException { // TODO -- I can peek @ header to determing fixed/mode? if (fixedSize) { @@ -175,7 +176,7 @@ public final class Bytes { } else if (mode == Mode.DEREF) { return new VarDerefBytesImpl.Reader(dir, id, maxDoc, context); } else if (mode == Mode.SORTED) { - return new VarSortedBytesImpl.Reader(dir, id, maxDoc, context); + return new VarSortedBytesImpl.Reader(dir, id, maxDoc, sortComparator, context); } } diff --git a/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java b/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java index e9bde3fb3c1..305a076fe06 100644 --- a/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java +++ b/lucene/src/java/org/apache/lucene/index/values/IndexDocValues.java @@ -130,6 +130,18 @@ public abstract class IndexDocValues implements Closeable { throws IOException { return cache.loadSorted(this, comparator); } + + /** + * Returns a {@link SortedSource} instance using a default {@link BytesRef} + * comparator for this {@link IndexDocValues} field instance like + * {@link #getSource()}. + *

+ * This method will return null iff this {@link IndexDocValues} represents a + * {@link Source} instead of a {@link SortedSource}. + */ + public SortedSource getSortedSorted() throws IOException { + return getSortedSorted(null); + } /** * Loads and returns a {@link SortedSource} instance for this @@ -142,7 +154,19 @@ throws IOException { throw new UnsupportedOperationException(); } - + + /** + * Loads and returns a {@link SortedSource} instance using a default + * {@link BytesRef} comparator for this {@link IndexDocValues} field instance + * like {@link #load()}. + *

+ * This method will return null iff this {@link IndexDocValues} represents a + * {@link Source} instead of a {@link SortedSource}. + */ + public SortedSource loadSorted() throws IOException { + return loadSorted(null); + } + /** * Returns the {@link ValueType} of this {@link IndexDocValues} instance */
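A sketch of the new convenience methods (not part of the patch; "values" stands for some IndexDocValues instance over a sorted bytes field):

  SortedSource cached = values.getSortedSorted();   // cached source, default order
  SortedSource fresh  = values.loadSorted();        // freshly loaded, like load()
  // An explicit order can still be passed, matching the pre-existing variants:
  SortedSource custom = values.getSortedSorted(
      BytesRef.getUTF8SortedAsUnicodeComparator());

diff --git a/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java b/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java index b452f45b006..3e884b391d2 100644 --- a/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java +++ b/lucene/src/java/org/apache/lucene/index/values/VarSortedBytesImpl.java @@ -168,14 +168,17 @@ public static class Reader extends BytesReaderBase { - Reader(Directory dir, String id, int maxDoc, IOContext context) throws IOException { + private final Comparator<BytesRef> defaultComp; + + Reader(Directory dir, String id, int maxDoc, Comparator<BytesRef> comparator, IOContext context) throws IOException { super(dir, id, CODEC_NAME, VERSION_START, true, context); + this.defaultComp = comparator; } @Override public org.apache.lucene.index.values.IndexDocValues.Source load() throws IOException { - return loadSorted(null); + return loadSorted(defaultComp); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java index ecddb66bb96..983fb4a805c 100644 --- a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java +++ b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java @@ -183,14 +183,11 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> { public Query getQuery() { return BooleanQuery.this; } @Override - public float getValue() { return getBoost(); } - - @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { float sum = 0.0f; for (int i = 0 ; i < weights.size(); i++) { // call sumOfSquaredWeights for all clauses in case of side effects - float s = weights.get(i).sumOfSquaredWeights(); // sum sub weights + float s = weights.get(i).getValueForNormalization(); // sum sub weights if (!clauses.get(i).isProhibited()) // only add to sum for non-prohibited clauses sum += s; @@ -206,11 +203,11 @@ } @Override - public void normalize(float norm) { - norm *= getBoost(); // incorporate boost + public void normalize(float norm, float topLevelBoost) { + topLevelBoost *= getBoost(); // incorporate boost for (Weight w : weights) { // normalize all clauses, (even if prohibited in case of side effects) - w.normalize(norm); + w.normalize(norm, topLevelBoost); } } diff --git a/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java b/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java index abe1ff6ac08..69be445e256 100644 --- a/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java +++ b/lucene/src/java/org/apache/lucene/search/ConstantScoreAutoRewrite.java @@ -27,7 +27,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; @@ -77,7 +77,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> { } @Override - protected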
void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, PerReaderTermState states) { + protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, TermContext states) { topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD); } @@ -140,9 +140,9 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite { assert termState != null; if (pos < 0) { pos = (-pos)-1; - array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq()); + array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } else { - array.termState[pos] = new PerReaderTermState(topReaderContext, termState, readerContext.ord, termsEnum.docFreq()); + array.termState[pos] = new TermContext(topReaderContext, termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } return true; } @@ -183,9 +183,9 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite { return true; } - /** Special implementation of BytesStartArray that keeps parallel arrays for {@link PerReaderTermState} */ + /** Special implementation of BytesStartArray that keeps parallel arrays for {@link TermContext} */ static final class TermStateByteStart extends DirectBytesStartArray { - PerReaderTermState[] termState; + TermContext[] termState; public TermStateByteStart(int initSize) { super(initSize); @@ -194,7 +194,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite { @Override public int[] init() { final int[] ord = super.init(); - termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; assert termState.length >= ord.length; return ord; } @@ -203,7 +203,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite { public int[] grow() { final int[] ord = super.grow(); if (termState.length < ord.length) { - PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(termState, 0, tmpTermState, 0, termState.length); termState = tmpTermState; } diff --git a/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java b/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java index 85007dfe74b..4c1e6c7ff00 100644 --- a/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java +++ b/lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java @@ -110,24 +110,19 @@ public class ConstantScoreQuery extends Query { } @Override - public float getValue() { - return queryWeight; - } - - @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { // we calculate sumOfSquaredWeights of the inner weight, but ignore it (just to initialize everything) - if (innerWeight != null) innerWeight.sumOfSquaredWeights(); + if (innerWeight != null) innerWeight.getValueForNormalization(); queryWeight = getBoost(); return queryWeight * queryWeight; } @Override - public void normalize(float norm) { - this.queryNorm = norm; + public void normalize(float norm, float topLevelBoost) { + this.queryNorm = norm * topLevelBoost; queryWeight *= this.queryNorm; // we normalize the inner weight, but ignore it (just to initialize everything) - if (innerWeight 
!= null) innerWeight.normalize(norm, topLevelBoost); } @Override @@ -148,7 +143,7 @@ if (disi == null) { return null; } - return new ConstantScorer(disi, this); + return new ConstantScorer(disi, this, queryWeight); } @Override @@ -181,9 +176,9 @@ final DocIdSetIterator docIdSetIterator; final float theScore; - public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w) throws IOException { + public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w, float theScore) throws IOException { super(w); - theScore = w.getValue(); + this.theScore = theScore; this.docIdSetIterator = docIdSetIterator; } @@ -212,7 +207,7 @@ @Override public void setScorer(Scorer scorer) throws IOException { // we must wrap again here, but using the scorer passed in as parameter: - collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight)); + collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight, ConstantScorer.this.theScore)); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java b/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java index 4b89f8e502f..5e0ab442da4 100644 --- a/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java +++ b/lucene/src/java/org/apache/lucene/search/DefaultSimilarity.java @@ -20,7 +20,7 @@ import org.apache.lucene.index.FieldInvertState; */ /** Expert: Default scoring implementation. */ -public class DefaultSimilarity extends Similarity { +public class DefaultSimilarity extends TFIDFSimilarity { /** Implemented as * state.getBoost()*lengthNorm(numTerms), where @@ -31,15 +31,15 @@ * * @lucene.experimental */ @Override - public float computeNorm(FieldInvertState state) { + public byte computeNorm(FieldInvertState state) { final int numTerms; if (discountOverlaps) numTerms = state.getLength() - state.getNumOverlap(); else numTerms = state.getLength(); - return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))); + return encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)))); } - + /** Implemented as sqrt(freq). */ @Override public float tf(float freq) {
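Under the new contract, computeNorm() returns the already-encoded norm byte, so subclasses call encodeNormValue() themselves. A minimal sketch (not part of the patch; the class name is made up, and it uses only members shown in this diff):

  // Hypothetical subclass that drops length normalization entirely,
  // keeping only the index-time field boost.
  public class NoLengthNormSimilarity extends DefaultSimilarity {
    @Override
    public byte computeNorm(FieldInvertState state) {
      return encodeNormValue(state.getBoost());
    }
  }

diff --git a/lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java b/lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java index 0434232035e..c50edfd4d41 100644 --- a/lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java +++ b/lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java @@ -110,16 +110,12 @@ public class DisjunctionMaxQuery extends Query implements Iterable<Query> { @Override public Query getQuery() { return DisjunctionMaxQuery.this; } - /** Return our boost */ - @Override - public float getValue() { return getBoost(); } - /** Compute the sum of squared weights of us applied to our subqueries. Used for normalization.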
*/ @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { float max = 0.0f, sum = 0.0f; for (Weight currentWeight : weights) { - float sub = currentWeight.sumOfSquaredWeights(); + float sub = currentWeight.getValueForNormalization(); sum += sub; max = Math.max(max, sub); @@ -130,10 +126,10 @@ public class DisjunctionMaxQuery extends Query implements Iterable { /** Apply the computed normalization factor to our subqueries */ @Override - public void normalize(float norm) { - norm *= getBoost(); // Incorporate our boost + public void normalize(float norm, float topLevelBoost) { + topLevelBoost *= getBoost(); // Incorporate our boost for (Weight wt : weights) { - wt.normalize(norm); + wt.normalize(norm, topLevelBoost); } } diff --git a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java index 153821d92d0..08cf2c330d4 100644 --- a/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java @@ -23,12 +23,6 @@ import java.util.Arrays; import org.apache.lucene.index.*; final class ExactPhraseScorer extends Scorer { - private final byte[] norms; - private final float value; - - private static final int SCORE_CACHE_SIZE = 32; - private final float[] scoreCache = new float[SCORE_CACHE_SIZE]; - private final int endMinus1; private final static int CHUNK = 4096; @@ -60,14 +54,12 @@ final class ExactPhraseScorer extends Scorer { private int docID = -1; private int freq; - private final Similarity similarity; + private final Similarity.ExactDocScorer docScorer; ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity similarity, byte[] norms) throws IOException { + Similarity.ExactDocScorer docScorer) throws IOException { super(weight); - this.similarity = similarity; - this.norms = norms; - this.value = weight.getValue(); + this.docScorer = docScorer; chunkStates = new ChunkState[postings.length]; @@ -88,10 +80,6 @@ final class ExactPhraseScorer extends Scorer { return; } } - - for (int i = 0; i < SCORE_CACHE_SIZE; i++) { - scoreCache[i] = similarity.tf((float) i) * value; - } } @Override @@ -206,13 +194,7 @@ final class ExactPhraseScorer extends Scorer { @Override public float score() throws IOException { - final float raw; // raw score - if (freq < SCORE_CACHE_SIZE) { - raw = scoreCache[freq]; - } else { - raw = similarity.tf((float) freq) * value; - } - return norms == null ? raw : raw * similarity.decodeNormValue(norms[docID]); // normalize + return docScorer.score(docID, freq); } private int phraseFreq() throws IOException { diff --git a/lucene/src/java/org/apache/lucene/search/Explanation.java b/lucene/src/java/org/apache/lucene/search/Explanation.java index 299752841b5..0425f246210 100644 --- a/lucene/src/java/org/apache/lucene/search/Explanation.java +++ b/lucene/src/java/org/apache/lucene/search/Explanation.java @@ -125,25 +125,4 @@ public class Explanation { return buffer.toString(); } - - /** - * Small Util class used to pass both an idf factor as well as an - * explanation for that factor. - * - * This class will likely be held on a {@link Weight}, so be aware - * before storing any large or un-serializable fields. - * - */ - public static abstract class IDFExplanation { - /** - * @return the idf factor - */ - public abstract float getIdf(); - /** - * This should be calculated lazily if possible. 
- * - * @return the explanation for the idf factor. - */ - public abstract String explain(); - } } diff --git a/lucene/src/java/org/apache/lucene/search/FilteredQuery.java b/lucene/src/java/org/apache/lucene/search/FilteredQuery.java index 1bcd8459d84..0630846e56c 100644 --- a/lucene/src/java/org/apache/lucene/search/FilteredQuery.java +++ b/lucene/src/java/org/apache/lucene/search/FilteredQuery.java @@ -63,21 +63,15 @@ extends Query { public Weight createWeight(final IndexSearcher searcher) throws IOException { final Weight weight = query.createWeight (searcher); return new Weight() { - private float value; - - // pass these methods through to enclosed query's weight - @Override - public float getValue() { return value; } @Override - public float sumOfSquaredWeights() throws IOException { - return weight.sumOfSquaredWeights() * getBoost() * getBoost(); + public float getValueForNormalization() throws IOException { + return weight.getValueForNormalization() * getBoost() * getBoost(); } @Override - public void normalize (float v) { - weight.normalize(v); - value = weight.getValue() * getBoost(); + public void normalize (float norm, float topLevelBoost) { + weight.normalize(norm, topLevelBoost); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java index ce404f9ff4a..81b90f28679 100644 --- a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java @@ -674,11 +674,11 @@ public class IndexSearcher { public Weight createNormalizedWeight(Query query) throws IOException { query = rewrite(query); Weight weight = query.createWeight(this); - float sum = weight.sumOfSquaredWeights(); - float norm = getSimilarityProvider().queryNorm(sum); + float v = weight.getValueForNormalization(); + float norm = getSimilarityProvider().queryNorm(v); if (Float.isInfinite(norm) || Float.isNaN(norm)) norm = 1.0f; - weight.normalize(norm); + weight.normalize(norm, 1.0f); return weight; } diff --git a/lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java b/lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java index bc66237677b..39b98410e85 100644 --- a/lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java @@ -32,35 +32,17 @@ import java.io.IOException; */ public class MatchAllDocsQuery extends Query { - public MatchAllDocsQuery() { - this(null); - } - - private final String normsField; - - /** - * @param normsField Field used for normalization factor (document boost). Null if nothing. - */ - public MatchAllDocsQuery(String normsField) { - this.normsField = normsField; - } - private class MatchAllScorer extends Scorer { final float score; - final byte[] norms; private int doc = -1; private final int maxDoc; private final Bits liveDocs; - private final Similarity similarity; - - MatchAllScorer(IndexReader reader, Similarity similarity, Weight w, - byte[] norms) throws IOException { + + MatchAllScorer(IndexReader reader, Weight w, float score) throws IOException { super(w); - this.similarity = similarity; liveDocs = reader.getLiveDocs(); - score = w.getValue(); + this.score = score; maxDoc = reader.maxDoc(); - this.norms = norms; } @Override @@ -82,7 +64,7 @@ public class MatchAllDocsQuery extends Query { @Override public float score() { - return norms == null ? 
score : score * similarity.decodeNormValue(norms[docID()]); + return score; } @Override @@ -93,12 +75,10 @@ public class MatchAllDocsQuery extends Query { } private class MatchAllDocsWeight extends Weight { - private Similarity similarity; private float queryWeight; private float queryNorm; public MatchAllDocsWeight(IndexSearcher searcher) { - this.similarity = normsField == null ? null : searcher.getSimilarityProvider().get(normsField); } @Override @@ -112,33 +92,27 @@ public class MatchAllDocsQuery extends Query { } @Override - public float getValue() { - return queryWeight; - } - - @Override - public float sumOfSquaredWeights() { + public float getValueForNormalization() { queryWeight = getBoost(); return queryWeight * queryWeight; } @Override - public void normalize(float queryNorm) { - this.queryNorm = queryNorm; + public void normalize(float queryNorm, float topLevelBoost) { + this.queryNorm = queryNorm * topLevelBoost; queryWeight *= this.queryNorm; } @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new MatchAllScorer(context.reader, similarity, this, - normsField != null ? context.reader.norms(normsField) : null); + return new MatchAllScorer(context.reader, this, queryWeight); } @Override public Explanation explain(AtomicReaderContext context, int doc) { // explain query weight Explanation queryExpl = new ComplexExplanation - (true, getValue(), "MatchAllDocsQuery, product of:"); + (true, queryWeight, "MatchAllDocsQuery, product of:"); if (getBoost() != 1.0f) { queryExpl.addDetail(new Explanation(getBoost(),"boost")); } diff --git a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java index 6ae3c69f29c..bed29c781d7 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -22,12 +22,14 @@ import java.util.*; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.search.Similarity.SloppyDocScorer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.Bits; @@ -129,45 +131,35 @@ public class MultiPhraseQuery extends Query { private class MultiPhraseWeight extends Weight { - private Similarity similarity; - private float value; - private final IDFExplanation idfExp; - private float idf; - private float queryNorm; - private float queryWeight; + private final Similarity similarity; + private final Similarity.Stats stats; public MultiPhraseWeight(IndexSearcher searcher) throws IOException { this.similarity = searcher.getSimilarityProvider().get(field); - + final ReaderContext context = searcher.getTopReaderContext(); + // compute idf - ArrayList allTerms = new ArrayList(); + ArrayList allTerms = new ArrayList(); for(final Term[] terms: termArrays) { for (Term term: terms) { - allTerms.add(term); + allTerms.add(TermContext.build(context, term, true)); } } - idfExp = similarity.idfExplain(allTerms, searcher); - idf = idfExp.getIdf(); + stats = 
similarity.computeStats(searcher, field, getBoost(), allTerms.toArray(new TermContext[allTerms.size()])); } @Override public Query getQuery() { return MultiPhraseQuery.this; } @Override - public float getValue() { return value; } - - @Override - public float sumOfSquaredWeights() { - queryWeight = idf * getBoost(); // compute query weight - return queryWeight * queryWeight; // square it + public float getValueForNormalization() { + return stats.getValueForNormalization(); } @Override - public void normalize(float queryNorm) { - this.queryNorm = queryNorm; - queryWeight *= queryNorm; // normalize query weight - value = queryWeight * idf; // idf for document + public void normalize(float queryNorm, float topLevelBoost) { + stats.normalize(queryNorm, topLevelBoost); } @Override @@ -222,8 +214,7 @@ public class MultiPhraseQuery extends Query { } if (slop == 0) { - ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity, - reader.norms(field)); + ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context)); if (s.noDocs) { return null; } else { @@ -231,84 +222,29 @@ public class MultiPhraseQuery extends Query { } } else { return new SloppyPhraseScorer(this, postingsFreqs, similarity, - slop, reader.norms(field)); + slop, similarity.sloppyDocScorer(stats, field, context)); } } @Override - public Explanation explain(AtomicReaderContext context, int doc) - throws IOException { - ComplexExplanation result = new ComplexExplanation(); - result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); - - Explanation idfExpl = new Explanation(idf, "idf(" + field + ":" + idfExp.explain() +")"); - - // explain query weight - Explanation queryExpl = new Explanation(); - queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); - - Explanation boostExpl = new Explanation(getBoost(), "boost"); - if (getBoost() != 1.0f) - queryExpl.addDetail(boostExpl); - - queryExpl.addDetail(idfExpl); - - Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); - queryExpl.addDetail(queryNormExpl); - - queryExpl.setValue(boostExpl.getValue() * - idfExpl.getValue() * - queryNormExpl.getValue()); - - result.addDetail(queryExpl); - - // explain field weight - ComplexExplanation fieldExpl = new ComplexExplanation(); - fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+ - "), product of:"); - + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { Scorer scorer = scorer(context, ScorerContext.def()); - if (scorer == null) { - return new Explanation(0.0f, "no matching docs"); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context); + ComplexExplanation result = new ComplexExplanation(); + result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + result.addDetail(scoreExplanation); + result.setValue(scoreExplanation.getValue()); + result.setMatch(true); + return result; + } } - - Explanation tfExplanation = new Explanation(); - int d = scorer.advance(doc); - float phraseFreq; - if (d == doc) { - phraseFreq = scorer.freq(); - } else { - phraseFreq = 0.0f; - } - - tfExplanation.setValue(similarity.tf(phraseFreq)); - tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); - 
fieldExpl.addDetail(tfExplanation); - fieldExpl.addDetail(idfExpl); - - Explanation fieldNormExpl = new Explanation(); - byte[] fieldNorms = context.reader.norms(field); - float fieldNorm = - fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f; - fieldNormExpl.setValue(fieldNorm); - fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); - fieldExpl.addDetail(fieldNormExpl); - - fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch())); - fieldExpl.setValue(tfExplanation.getValue() * - idfExpl.getValue() * - fieldNormExpl.getValue()); - - result.addDetail(fieldExpl); - result.setMatch(fieldExpl.getMatch()); - - // combine them - result.setValue(queryExpl.getValue() * fieldExpl.getValue()); - - if (queryExpl.getValue() == 1.0f) - return fieldExpl; - - return result; + + return new ComplexExplanation(false, 0.0f, "no matching term"); } } diff --git a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java index 3c8c267691c..e8e7020975b 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -25,7 +25,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; /** * An abstract {@link Query} that matches documents @@ -154,7 +154,7 @@ public abstract class MultiTermQuery extends Query { } @Override - protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) { + protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, TermContext states) { final TermQuery tq = new TermQuery(term, states); tq.setBoost(boost); topLevel.add(tq, BooleanClause.Occur.SHOULD); @@ -195,7 +195,7 @@ public abstract class MultiTermQuery extends Query { } @Override - protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) { + protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, TermContext states) { final Query q = new ConstantScoreQuery(new TermQuery(term, states)); q.setBoost(boost); topLevel.add(q, BooleanClause.Occur.SHOULD); diff --git a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java index 300e63f30e1..470cc6656c9 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java @@ -22,10 +22,16 @@ import java.util.Set; import java.util.ArrayList; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; @@ -171,18 +177,17 @@ public class PhraseQuery extends Query { 
private class PhraseWeight extends Weight { private final Similarity similarity; - private float value; - private float idf; - private float queryNorm; - private float queryWeight; - private IDFExplanation idfExp; + private final Similarity.Stats stats; + private transient TermContext states[]; public PhraseWeight(IndexSearcher searcher) throws IOException { this.similarity = searcher.getSimilarityProvider().get(field); - - idfExp = similarity.idfExplain(terms, searcher); - idf = idfExp.getIdf(); + final ReaderContext context = searcher.getTopReaderContext(); + states = new TermContext[terms.size()]; + for (int i = 0; i < terms.size(); i++) + states[i] = TermContext.build(context, terms.get(i), true); + stats = similarity.computeStats(searcher, field, getBoost(), states); } @Override @@ -192,19 +197,13 @@ public class PhraseQuery extends Query { public Query getQuery() { return PhraseQuery.this; } @Override - public float getValue() { return value; } - - @Override - public float sumOfSquaredWeights() { - queryWeight = idf * getBoost(); // compute query weight - return queryWeight * queryWeight; // square it + public float getValueForNormalization() { + return stats.getValueForNormalization(); } @Override - public void normalize(float queryNorm) { - this.queryNorm = queryNorm; - queryWeight *= queryNorm; // normalize query weight - value = queryWeight * idf; // idf for document + public void normalize(float queryNorm, float topLevelBoost) { + stats.normalize(queryNorm, topLevelBoost); } @Override @@ -216,21 +215,26 @@ public class PhraseQuery extends Query { PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()]; for (int i = 0; i < terms.size(); i++) { final Term t = terms.get(i); + final TermState state = states[i].get(context.ord); + if (state == null) { /* term doesnt exist in this segment */ + assert termNotInReader(reader, field, t.bytes()) : "no termstate found but term exists in reader"; + return null; + } DocsAndPositionsEnum postingsEnum = reader.termPositionsEnum(liveDocs, t.field(), - t.bytes()); + t.bytes(), + state); // PhraseQuery on a field that did not index // positions. 
if (postingsEnum == null) { - if (reader.termDocsEnum(liveDocs, t.field(), t.bytes()) != null) { - // term does exist, but has no positions - throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); - } else { - // term does not exist - return null; - } + assert (reader.termDocsEnum(liveDocs, t.field(), t.bytes(), state) != null) : "termstate found but no term exists in reader"; + // term does exist, but has no positions + throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); } - postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue(), t); + // get the docFreq without seeking + TermsEnum te = reader.fields().terms(field).getThreadTermsEnum(); + te.seekExact(t.bytes(), state); + postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i).intValue(), t); } // sort by increasing docFreq order @@ -239,8 +243,7 @@ public class PhraseQuery extends Query { } if (slop == 0) { // optimize exact case - ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity, - reader.norms(field)); + ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context)); if (s.noDocs) { return null; } else { @@ -248,96 +251,35 @@ public class PhraseQuery extends Query { } } else { return - new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, - reader.norms(field)); + new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, similarity.sloppyDocScorer(stats, field, context)); } } + + private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException { + // only called from assert + final Terms terms = reader.terms(field); + return terms == null || terms.docFreq(bytes) == 0; + } @Override - public Explanation explain(AtomicReaderContext context, int doc) - throws IOException { - - ComplexExplanation result = new ComplexExplanation(); - result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); - - StringBuilder docFreqs = new StringBuilder(); - StringBuilder query = new StringBuilder(); - query.append('\"'); - docFreqs.append(idfExp.explain()); - for (int i = 0; i < terms.size(); i++) { - if (i != 0) { - query.append(" "); - } - - Term term = terms.get(i); - - query.append(term.text()); - } - query.append('\"'); - - Explanation idfExpl = - new Explanation(idf, "idf(" + field + ":" + docFreqs + ")"); - - // explain query weight - Explanation queryExpl = new Explanation(); - queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); - - Explanation boostExpl = new Explanation(getBoost(), "boost"); - if (getBoost() != 1.0f) - queryExpl.addDetail(boostExpl); - queryExpl.addDetail(idfExpl); - - Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); - queryExpl.addDetail(queryNormExpl); - - queryExpl.setValue(boostExpl.getValue() * - idfExpl.getValue() * - queryNormExpl.getValue()); - - result.addDetail(queryExpl); - - // explain field weight - Explanation fieldExpl = new Explanation(); - fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+ - "), product of:"); - + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { Scorer scorer = scorer(context, ScorerContext.def()); - if (scorer == null) { - return new Explanation(0.0f, 
"no matching docs"); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context); + ComplexExplanation result = new ComplexExplanation(); + result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + result.addDetail(scoreExplanation); + result.setValue(scoreExplanation.getValue()); + result.setMatch(true); + return result; + } } - Explanation tfExplanation = new Explanation(); - int d = scorer.advance(doc); - float phraseFreq; - if (d == doc) { - phraseFreq = scorer.freq(); - } else { - phraseFreq = 0.0f; - } - - tfExplanation.setValue(similarity.tf(phraseFreq)); - tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); - fieldExpl.addDetail(tfExplanation); - fieldExpl.addDetail(idfExpl); - - Explanation fieldNormExpl = new Explanation(); - byte[] fieldNorms = context.reader.norms(field); - float fieldNorm = - fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f; - fieldNormExpl.setValue(fieldNorm); - fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); - fieldExpl.addDetail(fieldNormExpl); - - fieldExpl.setValue(tfExplanation.getValue() * - idfExpl.getValue() * - fieldNormExpl.getValue()); - - result.addDetail(fieldExpl); - - // combine them - result.setValue(queryExpl.getValue() * fieldExpl.getValue()); - result.setMatch(tfExplanation.isMatch()); - return result; + return new ComplexExplanation(false, 0.0f, "no matching term"); } } diff --git a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java index da84dbcca42..f50ae07032c 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java @@ -30,9 +30,6 @@ import java.io.IOException; * means a match. */ abstract class PhraseScorer extends Scorer { - protected byte[] norms; - protected float value; - private boolean firstTime = true; private boolean more = true; protected PhraseQueue pq; @@ -40,14 +37,12 @@ abstract class PhraseScorer extends Scorer { private float freq; //phrase frequency in current doc as computed by phraseFreq(). - protected final Similarity similarity; + protected final Similarity.SloppyDocScorer docScorer; PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity similarity, byte[] norms) { + Similarity.SloppyDocScorer docScorer) throws IOException { super(weight); - this.similarity = similarity; - this.norms = norms; - this.value = weight.getValue(); + this.docScorer = docScorer; // convert tps to a list of phrase positions. // note: phrase-position differs from term-position in that its position @@ -107,9 +102,7 @@ abstract class PhraseScorer extends Scorer { @Override public float score() throws IOException { - //System.out.println("scoring " + first.doc); - float raw = similarity.tf(freq) * value; // raw score - return norms == null ? 
raw : raw * similarity.decodeNormValue(norms[first.doc]); // normalize + return docScorer.score(first.doc, freq); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java index f9451161a3e..098d8b4a8b7 100644 --- a/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java +++ b/lucene/src/java/org/apache/lucene/search/ScoringRewrite.java @@ -28,7 +28,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; @@ -56,7 +56,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr @Override protected void addClause(BooleanQuery topLevel, Term term, int docCount, - float boost, PerReaderTermState states) { + float boost, TermContext states) { final TermQuery tq = new TermQuery(term, states); tq.setBoost(boost); topLevel.add(tq, BooleanClause.Occur.SHOULD); @@ -117,7 +117,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr if (size > 0) { final int sort[] = col.terms.sort(col.termsEnum.getComparator()); final float[] boost = col.array.boost; - final PerReaderTermState[] termStates = col.array.termState; + final TermContext[] termStates = col.array.termState; for (int i = 0; i < size; i++) { final int pos = sort[i]; final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef())); @@ -150,12 +150,12 @@ public abstract class ScoringRewrite extends TermCollectingRewr if (e < 0 ) { // duplicate term: update docFreq final int pos = (-e)-1; - array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq()); + array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums"; } else { // new entry: we populate the entry initially array.boost[e] = boostAtt.getBoost(); - array.termState[e] = new PerReaderTermState(topReaderContext, state, readerContext.ord, termsEnum.docFreq()); + array.termState[e] = new TermContext(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); ScoringRewrite.this.checkMaxClauseCount(terms.size()); } return true; @@ -165,7 +165,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr /** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */ static final class TermFreqBoostByteStart extends DirectBytesStartArray { float[] boost; - PerReaderTermState[] termState; + TermContext[] termState; public TermFreqBoostByteStart(int initSize) { super(initSize); @@ -175,7 +175,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr public int[] init() { final int[] ord = super.init(); boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)]; - termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; assert termState.length >= ord.length && boost.length >= ord.length; return ord; } @@ -185,7 +185,7 @@ public abstract class ScoringRewrite extends TermCollectingRewr final int[] ord = super.grow(); boost = 
ArrayUtil.grow(boost, ord.length); if (termState.length < ord.length) { - PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(termState, 0, tmpTermState, 0, termState.length); termState = tmpTermState; } diff --git a/lucene/src/java/org/apache/lucene/search/Similarity.java b/lucene/src/java/org/apache/lucene/search/Similarity.java index e8ae33f6ea1..5a907fcb1be 100644 --- a/lucene/src/java/org/apache/lucene/search/Similarity.java +++ b/lucene/src/java/org/apache/lucene/search/Similarity.java @@ -19,594 +19,111 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.Collection; +import org.apache.lucene.document.IndexDocValuesField; // javadoc import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Explanation.IDFExplanation; -import org.apache.lucene.util.SmallFloat; +import org.apache.lucene.index.IndexReader; // javadoc +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.Terms; // javadoc +import org.apache.lucene.search.spans.SpanQuery; // javadoc +import org.apache.lucene.util.SmallFloat; // javadoc +import org.apache.lucene.util.TermContext; /** + * Similarity defines the components of Lucene scoring. + *

* Expert: Scoring API. - * - *

Similarity defines the components of Lucene scoring. - * Overriding computation of these components is a convenient - * way to alter Lucene scoring. - * - *

Suggested reading: - * - * Introduction To Information Retrieval, Chapter 6. - * - *

The following describes how Lucene scoring evolves from - * underlying information retrieval models to (efficient) implementation. - * We first brief on VSM Score, - * then derive from it Lucene's Conceptual Scoring Formula, - * from which, finally, evolves Lucene's Practical Scoring Function - * (the latter is connected directly with Lucene classes and methods). - * - *

Lucene combines - * - * Boolean model (BM) of Information Retrieval - * with - * - * Vector Space Model (VSM) of Information Retrieval - - * documents "approved" by BM are scored by VSM. - * - *

In VSM, documents and queries are represented as - * weighted vectors in a multi-dimensional space, - * where each distinct index term is a dimension, - * and weights are - * Tf-idf values. - * - *

VSM does not require weights to be Tf-idf values, - * but Tf-idf values are believed to produce search results of high quality, - * and so Lucene is using Tf-idf. - * Tf and Idf are described in more detail below, - * but for now, for completion, let's just say that - * for given term t and document (or query) x, - * Tf(t,x) varies with the number of occurrences of term t in x - * (when one increases so does the other) and - * idf(t) similarly varies with the inverse of the - * number of index documents containing term t. - * - *

- * VSM score of document d for query q is the
- * Cosine Similarity
- * of the weighted query vectors V(q) and V(d):
- *
- *                                  V(q) · V(d)
- *     cosine-similarity(q,d)  =  –––––––––––––––
- *                                |V(q)| |V(d)|
- *
- *                 (VSM Score)
- *
- * Where V(q) · V(d) is the
- * dot product
- * of the weighted vectors,
- * and |V(q)| and |V(d)| are their
- * Euclidean norms.
- *

Note: the above equation can be viewed as the dot product of - * the normalized weighted vectors, in the sense that dividing - * V(q) by its euclidean norm is normalizing it to a unit vector. - * - *

Lucene refines VSM score for both search quality and usability: - *

    - *
  • Normalizing V(d) to the unit vector is known to be problematic in that - * it removes all document length information. - * For some documents removing this info is probably ok, - * e.g. a document made by duplicating a certain paragraph 10 times, - * especially if that paragraph is made of distinct terms. - * But for a document which contains no duplicated paragraphs, - * this might be wrong. - * To avoid this problem, a different document length normalization - * factor is used, which normalizes to a vector equal to or larger - * than the unit vector: doc-len-norm(d). - *
  • - * - *
  • At indexing, users can specify that certain documents are more - * important than others, by assigning a document boost. - * For this, the score of each document is also multiplied by its boost value - * doc-boost(d). - *
  • - * - *
  • Lucene is field based, hence each query term applies to a single - * field, document length normalization is by the length of the certain field, - * and in addition to document boost there are also document fields boosts. - *
  • - * - *
  • The same field can be added to a document during indexing several times, - * and so the boost of that field is the multiplication of the boosts of - * the separate additions (or parts) of that field within the document. - *
  • - * - *
  • At search time users can specify boosts to each query, sub-query, and - * each query term, hence the contribution of a query term to the score of - * a document is multiplied by the boost of that query term query-boost(q). - *
  • - * - *
  • A document may match a multi term query without containing all - * the terms of that query (this is correct for some of the queries), - * and users can further reward documents matching more query terms - * through a coordination factor, which is usually larger when - * more terms are matched: coord-factor(q,d). - *
  • - *
- * - *

- * Under the simplifying assumption of a single field in the index,
- * we get Lucene's Conceptual scoring formula:
- *
- *                                                         V(q) · V(d)
- *     score(q,d)  =  coord-factor(q,d) · query-boost(q) · ––––––––––– · doc-len-norm(d) · doc-boost(d)
- *                                                           |V(q)|
- *
- *                 (Lucene Conceptual Scoring Formula)
- *

The conceptual formula is a simplification in the sense that (1) terms and documents - * are fielded and (2) boosts are usually per query term rather than per query. - * - *

We now describe how Lucene implements this conceptual scoring formula, and - * derive from it Lucene's Practical Scoring Function. - * - *

For efficient score computation some scoring components - * are computed and aggregated in advance: - * - *

    - *
  • Query-boost for the query (actually for each query term) - * is known when search starts. - *
  • - * - *
  • Query Euclidean norm |V(q)| can be computed when search starts, - * as it is independent of the document being scored. - * From search optimization perspective, it is a valid question - * why bother to normalize the query at all, because all - * scored documents will be multiplied by the same |V(q)|, - * and hence documents ranks (their order by score) will not - * be affected by this normalization. - * There are two good reasons to keep this normalization: - *
      - *
    • Recall that - * - * Cosine Similarity can be used find how similar - * two documents are. One can use Lucene for e.g. - * clustering, and use a document as a query to compute - * its similarity to other documents. - * In this use case it is important that the score of document d3 - * for query d1 is comparable to the score of document d3 - * for query d2. In other words, scores of a document for two - * distinct queries should be comparable. - * There are other applications that may require this. - * And this is exactly what normalizing the query vector V(q) - * provides: comparability (to a certain extent) of two or more queries. - *
    • - * - *
    • Applying query normalization on the scores helps to keep the - * scores around the unit vector, hence preventing loss of score data - * because of floating point precision limitations. - *
    • - *
    - *
  • - * - *
  • Document length norm doc-len-norm(d) and document - * boost doc-boost(d) are known at indexing time. - * They are computed in advance and their multiplication - * is saved as a single value in the index: norm(d). - * (In the equations below, norm(t in d) means norm(field(t) in doc d) - * where field(t) is the field associated with term t.) - *
  • - *
- * - *

- * Lucene's Practical Scoring Function is derived from the above;
- * its factors line up with those of the conceptual formula:
- *
- *     score(q,d)  =  coord(q,d) · queryNorm(q) · Σ over t in q of ( tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) )
- *
- *                 (Lucene Practical Scoring Function)
- *

where + *

+ * This is a low-level API; you should only extend it if you want to implement + * an information retrieval model. If you are instead looking for a convenient way + * to alter Lucene's scoring, consider extending a higher-level implementation + * such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or + * just tweaking the default implementation: {@link DefaultSimilarity}. + *
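For illustration, a minimal sketch of the "just tweak the default" route mentioned above; the class name MySimilarity and the sublinear tf formula are assumptions for the example, not part of this patch:

public class MySimilarity extends DefaultSimilarity {
  @Override
  public float tf(float freq) {
    // replace the default sqrt(freq) term frequency curve with a sublinear one
    return 1.0f + (float) Math.log(1.0f + freq);
  }
}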

+ * Similarity determines how Lucene weights terms, and Lucene interacts with + * this class at both index-time and + * query-time. + *

+ * + * At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing + * the Similarity implementation to return a per-document byte for the field that will + * be later accessible via {@link IndexReader#norms(String)}. Lucene makes no assumption + * about what is in this byte, but it is most useful for encoding length normalization + * information. + *

+ * Implementations should carefully consider how the normalization byte is encoded: while + * Lucene's classical {@link TFIDFSimilarity} encodes a combination of index-time boost + * and length normalization information with {@link SmallFloat}, this might not be suitable + * for all purposes. + *
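As a sketch of this index-time contract (assuming the trunk API introduced by this patch), a computeNorm that packs boost and length normalization into the single norm byte with SmallFloat, much like DefaultSimilarity does elsewhere in this patch:

@Override
public byte computeNorm(FieldInvertState state) {
  // shorter fields get larger norms; the field boost is folded in before encoding
  float norm = state.getBoost() * (float) (1.0 / Math.sqrt(state.getLength()));
  return SmallFloat.floatToByte315(norm); // lossy single-byte encoding
}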

+ * Many formulas require the use of average document length, which can be computed via a + * combination of {@link Terms#getSumTotalTermFreq()} and {@link IndexReader#maxDoc()}. + *
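A sketch of that computation; the field name "body" is an assumption, and MultiFields is used here to aggregate the statistic across segments:

Terms terms = MultiFields.getTerms(reader, "body");
long sumTotalTermFreq = terms.getSumTotalTermFreq(); // total tokens in the field, or -1 if the codec cannot provide it
float avgFieldLength = (float) sumTotalTermFreq / reader.maxDoc();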

+ * Because index-time boost is handled entirely at the application level anyway, + * an application can alternatively store the index-time boost separately using an + * {@link IndexDocValuesField}, and access this at query-time with + * {@link IndexReader#docValues(String)}. + *

+ * Finally, using index-time boosts (either via folding into the normalization byte or + * via IndexDocValues) is an inefficient way to boost the scores of different fields if the + * boost will be the same for every document; instead, the Similarity can simply take a constant + * boost parameter C, and the SimilarityProvider can return different instances with + * different boosts depending upon field name. + *
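A hypothetical sketch of that last suggestion: only SimilarityProvider#get(String) comes from this patch, while PerFieldBoostProvider and BoostedSimilarity (a Similarity taking a constant boost C) are assumed names:

public class PerFieldBoostProvider extends DefaultSimilarityProvider {
  private final Similarity title = new BoostedSimilarity(2.0f); // hypothetical Similarity with constant boost C
  private final Similarity other = new BoostedSimilarity(1.0f);

  @Override
  public Similarity get(String field) {
    // hand out a differently boosted instance per field name
    return "title".equals(field) ? title : other;
  }
}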

+ * + * At query-time, Queries interact with the Similarity via these steps: *

    - *
- *   1. tf(t in d) correlates to the term's frequency,
- *      defined as the number of times term t appears in the currently scored document d.
- *      Documents that have more occurrences of a given term receive a higher score.
- *      Note that tf(t in q) is assumed to be 1 and therefore it does not appear in this equation.
- *      However if a query contains twice the same term, there will be
- *      two term-queries with that same term and hence the computation would still be correct
- *      (although not very efficient).
- *      The default computation for tf(t in d) in
- *      {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
- *
- *          tf(t in d)  =  frequency½
- *
- *   2. idf(t) stands for Inverse Document Frequency. This value
- *      correlates to the inverse of docFreq
- *      (the number of documents in which the term t appears).
- *      This means rarer terms give higher contribution to the total score.
- *      idf(t) appears for t in both the query and the document,
- *      hence it is squared in the equation.
- *      The default computation for idf(t) in
- *      {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
- *
- *                                numDocs
- *          idf(t)  =  1 + log ( ––––––––– )
- *                               docFreq+1
- *
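A worked instance of the default idf formula, with assumed numbers (numDocs=1000, docFreq=9):

float idf = 1.0f + (float) Math.log(1000.0 / (9 + 1)); // 1 + ln(100) ≈ 5.61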
- *   3. coord(q,d) is a score factor based on how many of the query terms
- *      are found in the specified document.
- *      Typically, a document that contains more of the query's terms will receive a higher score
- *      than another document with fewer query terms.
- *      This is a search time factor computed in
- *      {@link SimilarityProvider#coord(int, int) coord(q,d)}
- *      by the SimilarityProvider in effect at search time.
- *
- *   4. queryNorm(q) is a normalizing factor used to make scores between queries comparable.
- *      This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
- *      but rather just attempts to make scores from different queries (or even different indexes) comparable.
- *      This is a search time factor computed by the SimilarityProvider in effect at search time.
- *      The default computation in
- *      {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
- *      produces a Euclidean norm:
- *
- *                                                                       1
- *          queryNorm(q)  =  queryNorm(sumOfSquaredWeights)  =  ––––––––––––––––––––––
- *                                                              sumOfSquaredWeights½
- *
- *      The sum of squared weights (of the query terms) is
- *      computed by the query {@link org.apache.lucene.search.Weight} object.
- *      For example, a {@link org.apache.lucene.search.BooleanQuery}
- *      computes this value as:
- *
- *          sumOfSquaredWeights  =  q.getBoost()² · Σ over t in q of ( idf(t) · t.getBoost() )²
- *
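A worked instance of the two formulas above, again with assumed numbers: a BooleanQuery over two terms with idf 2.0 and 3.0, and all boosts 1.0:

float sumOfSquaredWeights = (2.0f * 1.0f) * (2.0f * 1.0f) + (3.0f * 1.0f) * (3.0f * 1.0f); // = 13.0
float queryNorm = (float) (1.0 / Math.sqrt(sumOfSquaredWeights));                          // ≈ 0.277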
- *   5. t.getBoost() is a search time boost of term t in the query q as
- *      specified in the query text (see query syntax),
- *      or as set by application calls to
- *      {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
- *      Notice that there is really no direct API for accessing a boost of one term in a multi term query,
- *      but rather multi terms are represented in a query as multi
- *      {@link org.apache.lucene.search.TermQuery TermQuery} objects,
- *      and so the boost of a term in the query is accessible by calling the sub-query
- *      {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
- *
- *   6. norm(t,d) encapsulates a few (indexing time) boost and length factors:
- *
- *      • Document boost - set by calling
- *        {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
- *        before adding the document to the index.
- *      • Field boost - set by calling
- *        {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
- *        before adding the field to a document.
- *      • lengthNorm - computed when the document is added to the index
- *        in accordance with the number of tokens of this field in the document,
- *        so that shorter fields contribute more to the score.
- *        LengthNorm is computed by the Similarity class in effect at indexing.
- *
- *      The {@link #computeNorm} method is responsible for
- *      combining all of these factors into a single float.
- *

- *      When a document is added to the index, all the above factors are multiplied.
- *      If the document has multiple fields with the same name, all their boosts are multiplied together:
- *
- *          norm(t,d)  =  doc.getBoost() · lengthNorm · Π f.getBoost()
- *
- *          (the product Π is over each field f in d named as t)
- *
- *      However the resulting norm value is {@link #encodeNormValue(float) encoded} as a single byte
- *      before being stored.
- *      At search time, the norm byte value is read from the index
- *      {@link org.apache.lucene.store.Directory directory} and
- *      {@link #decodeNormValue(byte) decoded} back to a float norm value.
- *      This encoding/decoding, while reducing index size, comes with the price of
- *      precision loss - it is not guaranteed that decode(encode(x)) = x.
- *      For instance, decode(encode(0.89)) = 0.75.
- *
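The quoted precision loss can be reproduced directly with SmallFloat, the encode/decode pair behind encodeNormValue/decodeNormValue:

byte b = SmallFloat.floatToByte315(0.89f); // encode, as encodeNormValue does
float back = SmallFloat.byte315ToFloat(b); // decode, as decodeNormValue does
System.out.println(back);                  // prints 0.75: decode(encode(0.89)) != 0.89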
     
    - * Compression of norm values to a single byte saves memory at search time, - * because once a field is referenced at search time, its norms - for - * all documents - are maintained in memory. - *
     
    - * The rationale supporting such lossy compression of norm values is that - * given the difficulty (and inaccuracy) of users to express their true information - * need by a query, only big differences matter. - *
     
    - * Last, note that search time is too late to modify this norm part of scoring, e.g. by - * using a different {@link Similarity} for search. - *
     
    - *

+ *   1. The {@link #computeStats(IndexSearcher, String, float, TermContext...)} method is called a single time,
+ *      allowing the implementation to compute any statistics (such as IDF, average document length, etc.)
+ *      across the entire collection. The {@link TermContext}s passed in are already positioned
+ *      to the terms involved, with the raw statistics available, so a Similarity can freely use any combination
+ *      of term statistics without causing any additional I/O. Lucene makes no assumption about what is
+ *      stored in the returned {@link Similarity.Stats} object.
+ *   2. The query normalization process occurs a single time: {@link Similarity.Stats#getValueForNormalization()}
+ *      is called for each query leaf node, {@link SimilarityProvider#queryNorm(float)} is called for the top-level
+ *      query, and finally {@link Similarity.Stats#normalize(float, float)} passes down the normalization value
+ *      and any top-level boosts (e.g. from enclosing {@link BooleanQuery}s).
+ *   3. For each segment in the index, the Query creates a {@link #exactDocScorer(Stats, String, IndexReader.AtomicReaderContext)}
+ *      (for queries with exact frequencies such as TermQuerys and exact PhraseQueries) or a
+ *      {@link #sloppyDocScorer(Stats, String, IndexReader.AtomicReaderContext)} (for queries with sloppy frequencies such as
+ *      SpanQuerys and sloppy PhraseQueries). The score() method is then called for each matching document;
+ *      the full flow is sketched in the example below.
+ *
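To make the three query-time steps concrete, here is a hedged sketch of how a single-term query could drive this API; the variable names (searcher, query, term, readerContext, docID, freq) and the field name "body" are assumptions, while every method call appears in this patch:

Similarity sim = searcher.getSimilarityProvider().get("body");
TermContext ctx = TermContext.build(searcher.getTopReaderContext(), term, true);
Similarity.Stats stats = sim.computeStats(searcher, "body", query.getBoost(), ctx);        // step 1: collection stats
float norm = searcher.getSimilarityProvider().queryNorm(stats.getValueForNormalization()); // step 2: query normalization
stats.normalize(norm, 1.0f);                                                               // step 2: topLevelBoost = 1
Similarity.ExactDocScorer docScorer = sim.exactDocScorer(stats, "body", readerContext);    // step 3: per-segment scorer
float score = docScorer.score(docID, freq);                                                // score a matching document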

+ * When {@link IndexSearcher#explain(Query, int)} is called, queries consult the Similarity's DocScorer for an
+ * explanation of how it computed its score. The query passes in the document id and an explanation of how the frequency
+ * was computed.
  *
  * @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)
  * @see IndexSearcher#setSimilarityProvider(SimilarityProvider)
+ * @lucene.experimental
  */
 public abstract class Similarity {
   public static final int NO_DOC_ID_PROVIDED = -1;

-  /** Cache of decoded bytes. */
-  private static final float[] NORM_TABLE = new float[256];
-
-  static {
-    for (int i = 0; i < 256; i++)
-      NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
-  }
-
-  /** Decodes a normalization factor stored in an index.
-   * @see #encodeNormValue(float)
-   */
-  public float decodeNormValue(byte b) {
-    return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
-  }
-
   /**
    * Computes the normalization value for a field, given the accumulated
    * state of term processing for this field (see {@link FieldInvertState}).
    *
-   * Implementations should calculate a float value based on the field
+   * Implementations should calculate a byte value based on the field
    * state and then return that value.
    *
    * Matches in longer fields are less precise, so implementations of this
    * method usually return smaller values when state.getLength() is large,
    * and larger values when state.getLength() is small.
    *
-   * Note that the return values are computed under
-   * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
-   * and then stored using
-   * {@link #encodeNormValue(float)}.
-   * Thus they have limited precision, and documents
-   * must be re-indexed if this method is altered.
-   *
    * @lucene.experimental
    *
    * @param state current processing state for this field
-   * @return the calculated float norm
+   * @return the calculated byte norm
    */
-  public abstract float computeNorm(FieldInvertState state);
-
-  /** Encodes a normalization factor for storage in an index.
-   *

The encoding uses a three-bit mantissa, a five-bit exponent, and - * the zero-exponent point at 15, thus - * representing values from around 7x10^9 to 2x10^-9 with about one - * significant decimal digit of accuracy. Zero is also represented. - * Negative numbers are rounded up to zero. Values too large to represent - * are rounded down to the largest representable value. Positive values too - * small to represent are rounded up to the smallest positive representable - * value. - * @see org.apache.lucene.document.Field#setBoost(float) - * @see org.apache.lucene.util.SmallFloat - */ - public byte encodeNormValue(float f) { - return SmallFloat.floatToByte315(f); - } - - /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} - * factor for each term in the query and these products are then summed to - * form the initial score for a document. - * - *

Terms and phrases repeated in a document indicate the topic of the - * document, so implementations of this method usually return larger values - * when freq is large, and smaller values when freq - * is small. - * - *

The default implementation calls {@link #tf(float)}. - * - * @param freq the frequency of a term within a document - * @return a score factor based on a term's within-document frequency - */ - public float tf(int freq) { - return tf((float)freq); - } + public abstract byte computeNorm(FieldInvertState state); /** Computes the amount of a sloppy phrase match, based on an edit distance. * This value is summed for each sloppy phrase match in a document to form - * the frequency that is passed to {@link #tf(float)}. + * the frequency to be used in scoring instead of the exact term count. * *

A phrase match with a small edit distance to a document passage more * closely matches the document, so implementations of this method usually @@ -619,124 +136,6 @@ public abstract class Similarity { */ public abstract float sloppyFreq(int distance); - /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} - * factor for each term in the query and these products are then summed to - * form the initial score for a document. - * - *

Terms and phrases repeated in a document indicate the topic of the - * document, so implementations of this method usually return larger values - * when freq is large, and smaller values when freq - * is small. - * - * @param freq the frequency of a term within a document - * @return a score factor based on a term's within-document frequency - */ - public abstract float tf(float freq); - - /** - * Computes a score factor for a simple term and returns an explanation - * for that score factor. - * - *

- * The default implementation uses: - * - *

-   * idf(docFreq, searcher.maxDoc());
-   * 
- * - * Note that {@link IndexSearcher#maxDoc()} is used instead of - * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also - * {@link IndexSearcher#docFreq(Term)} is used, and when the latter - * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction. - * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute - * - * @param term the term in question - * @param searcher the document collection being searched - * @param docFreq externally computed docFreq for this term - * @return an IDFExplain object that includes both an idf score factor - and an explanation for the term. - * @throws IOException - */ - public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher, int docFreq) throws IOException { - final int df = docFreq; - final int max = searcher.maxDoc(); - final float idf = idf(df, max); - return new IDFExplanation() { - @Override - public String explain() { - return "idf(docFreq=" + df + - ", maxDocs=" + max + ")"; - } - @Override - public float getIdf() { - return idf; - }}; - } - - /** - * This method forwards to {@link - * #idfExplain(Term,IndexSearcher,int)} by passing - * searcher.docFreq(term) as the docFreq. - */ - public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher) throws IOException { - return idfExplain(term, searcher, searcher.docFreq(term)); - } - - /** - * Computes a score factor for a phrase. - * - *

- * The default implementation sums the idf factor for - * each term in the phrase. - * - * @param terms the terms in the phrase - * @param searcher the document collection being searched - * @return an IDFExplain object that includes both an idf - * score factor for the phrase and an explanation - * for each term. - * @throws IOException - */ - public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { - final int max = searcher.maxDoc(); - float idf = 0.0f; - final StringBuilder exp = new StringBuilder(); - for (final Term term : terms ) { - final int df = searcher.docFreq(term); - idf += idf(df, max); - exp.append(" "); - exp.append(term.text()); - exp.append("="); - exp.append(df); - } - final float fIdf = idf; - return new IDFExplanation() { - @Override - public float getIdf() { - return fIdf; - } - @Override - public String explain() { - return exp.toString(); - } - }; - } - - /** Computes a score factor based on a term's document frequency (the number - * of documents which contain the term). This value is multiplied by the - * {@link #tf(int)} factor for each term in the query and these products are - * then summed to form the initial score for a document. - * - *

Terms that occur in fewer documents are better indicators of topic, so - * implementations of this method usually return larger values for rare terms, - * and smaller values for common terms. - * - * @param docFreq the number of documents which contain the term - * @param numDocs the total number of documents in the collection - * @return a score factor based on the term's document frequency - */ - public abstract float idf(int docFreq, int numDocs); - /** * Calculate a scoring factor based on the data in the payload. Overriding implementations * are responsible for interpreting what is in the payload. Lucene makes no assumptions about @@ -758,5 +157,101 @@ public abstract class Similarity { { return 1; } - + + /** + * Compute any collection-level stats (e.g. IDF, average document length, etc) needed for scoring a query. + */ + public abstract Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException; + + /** + * returns a new {@link Similarity.ExactDocScorer}. + */ + public abstract ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException; + + /** + * returns a new {@link Similarity.SloppyDocScorer}. + */ + public abstract SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException; + + /** + * API for scoring exact queries such as {@link TermQuery} and + * exact {@link PhraseQuery}. + *

+ * Term frequencies are integers (the term or phrase's tf) + */ + public abstract class ExactDocScorer { + /** + * Score a single document + * @param doc document id + * @param freq term frequency + * @return document's score + */ + public abstract float score(int doc, int freq); + + /** + * Explain the score for a single document + * @param doc document id + * @param freq Explanation of how the term frequency was computed + * @return document's score + */ + public Explanation explain(int doc, Explanation freq) { + Explanation result = new Explanation(score(doc, (int)freq.getValue()), + "score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:"); + result.addDetail(freq); + return result; + } + } + + /** + * API for scoring "sloppy" queries such as {@link SpanQuery} and + * sloppy {@link PhraseQuery}. + *

+ * Term frequencies are floating point values. + */ + public abstract class SloppyDocScorer { + /** + * Score a single document + * @param doc document id + * @param freq sloppy term frequency + * @return document's score + */ + public abstract float score(int doc, float freq); + + /** + * Explain the score for a single document + * @param doc document id + * @param freq Explanation of how the sloppy term frequency was computed + * @return document's score + */ + public Explanation explain(int doc, Explanation freq) { + Explanation result = new Explanation(score(doc, freq.getValue()), + "score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:"); + result.addDetail(freq); + return result; + } + } + + /** Stores the statistics for the indexed collection. This abstract + * implementation is empty; descendants of {@code Similarity} should + * subclass {@code Stats} and define the statistics they require in the + * subclass. Examples include idf, average field length, etc. + */ + public static abstract class Stats { + + /** The value for normalization of contained query clauses (e.g. sum of squared weights). + *

+ * NOTE: a Similarity implementation might not use any query normalization at all, + * it's not required. However, if it wants to participate in query normalization, + * it can return a value here. + */ + public abstract float getValueForNormalization(); + + /** Assigns the query normalization factor and boost from parent queries to this. + *

+ * NOTE: a Similarity implementation might not use this normalized value at all, + * it's not required. However, it's usually a good idea to at least incorporate + * the topLevelBoost (e.g. from an outer BooleanQuery) into its score. + */ + public abstract void normalize(float queryNorm, float topLevelBoost); + } } diff --git a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index 381518bbe10..5252c5550b4 100644 --- a/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -25,11 +25,13 @@ final class SloppyPhraseScorer extends PhraseScorer { private PhrasePositions repeats[]; private PhrasePositions tmpPos[]; // for flipping repeating pps. private boolean checkedRepeats; - + private final Similarity similarity; + SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity, - int slop, byte[] norms) { - super(weight, postings, similarity, norms); + int slop, Similarity.SloppyDocScorer docScorer) throws IOException { + super(weight, postings, docScorer); this.slop = slop; + this.similarity = similarity; } /** diff --git a/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java b/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java new file mode 100644 index 00000000000..abc8e512064 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/search/TFIDFSimilarity.java @@ -0,0 +1,831 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.util.TermContext; +import org.apache.lucene.util.SmallFloat; + + +/** + * Implementation of {@link Similarity} with the Vector Space Model. + *

+ * Expert: Scoring API. + *

TFIDFSimilarity defines the components of Lucene scoring. + * Overriding computation of these components is a convenient + * way to alter Lucene scoring. + * + *

Suggested reading: + * + * Introduction To Information Retrieval, Chapter 6. + * + *

The following describes how Lucene scoring evolves from + * underlying information retrieval models to an (efficient) implementation. + * We first give a brief overview of the VSM Score, + * then derive from it Lucene's Conceptual Scoring Formula, + * from which, finally, Lucene's Practical Scoring Function evolves + * (the latter is connected directly with Lucene classes and methods). + *

Lucene combines the + * Boolean model (BM) of Information Retrieval + * with the + * Vector Space Model (VSM) of Information Retrieval: + * documents "approved" by BM are scored by VSM. + *

In VSM, documents and queries are represented as + * weighted vectors in a multi-dimensional space, + * where each distinct index term is a dimension, + * and weights are + * Tf-idf values. + * + *

VSM does not require weights to be Tf-idf values, + * but Tf-idf values are believed to produce search results of high quality, + * and so Lucene uses Tf-idf. + * Tf and Idf are described in more detail below, + * but for now, for completeness, let's just say that + * for a given term t and document (or query) x, + * Tf(t,x) varies with the number of occurrences of term t in x + * (when one increases so does the other) and + * idf(t) similarly varies with the inverse of the + * number of index documents containing term t. + *

The VSM score of document d for query q is the + * Cosine Similarity + * of the weighted vectors V(q) and V(d): + *
 
+ *
+ *   cosine-similarity(q,d)  =  ( V(q) · V(d) )  /  ( |V(q)| |V(d)| )
+ *
+ *                  VSM Score
+ *
 
+ * + * + * Where V(q) · V(d) is the + * dot product + * of the weighted vectors, + * and |V(q)| and |V(d)| are their + * Euclidean norms. + * + *

Note: the above equation can be viewed as the dot product of + * the normalized weighted vectors, in the sense that dividing + * V(q) by its Euclidean norm is normalizing it to a unit vector. + * + *
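For intuition only, here is a toy sketch (not part of this patch) of the cosine similarity above, over sparse term-weight vectors held in java.util.Map:

  static float cosine(Map<String,Float> q, Map<String,Float> d) {
    float dot = 0f, qNorm = 0f, dNorm = 0f;
    for (Map.Entry<String,Float> e : q.entrySet()) {
      Float w = d.get(e.getKey());
      if (w != null) dot += e.getValue() * w;  // accumulates V(q) · V(d)
      qNorm += e.getValue() * e.getValue();    // accumulates |V(q)|²
    }
    for (float w : d.values()) dNorm += w * w; // accumulates |V(d)|²
    return dot / (float) Math.sqrt(qNorm * dNorm);
  }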

Lucene refines VSM score for both search quality and usability: + *

    + *
  • Normalizing V(d) to the unit vector is known to be problematic in that + * it removes all document length information. + * For some documents removing this info is probably ok, + * e.g. a document made by duplicating a certain paragraph 10 times, + * especially if that paragraph is made of distinct terms. + * But for a document which contains no duplicated paragraphs, + * this might be wrong. + * To avoid this problem, a different document length normalization + * factor is used, which normalizes to a vector equal to or larger + * than the unit vector: doc-len-norm(d). + *
+ *
  • At indexing, users can specify that certain documents are more + * important than others, by assigning a document boost. + * For this, the score of each document is also multiplied by its boost value + * doc-boost(d). + *
+ *
  • Lucene is field based, hence each query term applies to a single + * field, document length normalization is by the length of that field, + * and in addition to document boost there are also document field boosts. + *
+ *
  • The same field can be added to a document during indexing several times, + * and so the boost of that field is the product of the boosts of + * the separate additions (or parts) of that field within the document. + *
+ *
  • At search time users can specify boosts to each query, sub-query, and + * each query term, hence the contribution of a query term to the score of + * a document is multiplied by the boost of that query term query-boost(q). + *
+ *
  • A document may match a multi-term query without containing all + * the terms of that query (this is allowed for some query types), + * and users can further reward documents matching more query terms + * through a coordination factor, which is usually larger when + * more terms are matched: coord-factor(q,d). + *
+ *
+ * + *
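For example, with the default SimilarityProvider a three-term query that matches only two of its terms in some document d gets coord-factor(q,d) = 2/3, so d scores lower than a document matching all three terms, other factors being equal.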

Under the simplifying assumption of a single field in the index, + * we get Lucene's Conceptual scoring formula: + * + *
 
+ *
+ *   score(q,d)  =  coord-factor(q,d) · query-boost(q) · ( V(q) · V(d) / |V(q)| ) · doc-len-norm(d) · doc-boost(d)
+ *
+ *                  Lucene Conceptual Scoring Formula
+ *
 
+ * + *

The conceptual formula is a simplification in the sense that (1) terms and documents + * are fielded and (2) boosts are usually per query term rather than per query. + * + *

We now describe how Lucene implements this conceptual scoring formula, and + * derive from it Lucene's Practical Scoring Function. + * + *

For efficient score computation some scoring components + * are computed and aggregated in advance: + * + *

    + *
  • Query-boost for the query (actually for each query term) + * is known when search starts. + *
+ *
  • Query Euclidean norm |V(q)| can be computed when search starts, + * as it is independent of the document being scored. + * From a search optimization perspective, it is a valid question + * why bother to normalize the query at all, because all + * scored documents will be multiplied by the same |V(q)|, + * and hence document ranks (their order by score) will not + * be affected by this normalization. + * There are two good reasons to keep this normalization: + *
      + *
    • Recall that + * Cosine Similarity can be used to find how similar + * two documents are. One can use Lucene for, e.g., + * clustering, and use a document as a query to compute + * its similarity to other documents. + * In this use case it is important that the score of document d3 + * for query d1 is comparable to the score of document d3 + * for query d2. In other words, scores of a document for two + * distinct queries should be comparable. + * There are other applications that may require this, + * and this is exactly what normalizing the query vector V(q) + * provides: comparability (to a certain extent) of two or more queries. + *
+ *
    • Applying query normalization on the scores helps to keep the + * scores around the unit vector, hence preventing loss of score data + * because of floating point precision limitations. + *
+ *
    + *
+ *
  • Document length norm doc-len-norm(d) and document + * boost doc-boost(d) are known at indexing time. + * They are computed in advance and their product + * is saved as a single value in the index: norm(d). + * (In the equations below, norm(t in d) means norm(field(t) in doc d) + * where field(t) is the field associated with term t.) + *
+ *
+ * + *
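As a small worked example (illustrative numbers only): for a two-term query with idf values 2.0 and 3.0 and all boosts at 1.0, the sum of squared weights is 2.0² + 3.0² = 13.0, so the query normalization factor is 1/√13 ≈ 0.277; every matching document's score is scaled by that same factor, leaving ranks unchanged.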

Lucene's Practical Scoring Function is derived from the above. + * The color codes demonstrate how it relates + * to those of the conceptual formula: + * + *

+ *
+ *   score(q,d)  =  coord(q,d) · queryNorm(q) · ∑ (t in q) ( tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) )
+ *
+ *                  Lucene Practical Scoring Function
+ *

where + *

    + *
  1. tf(t in d) correlates to the term's frequency, + * defined as the number of times term t appears in the currently scored document d. + * Documents that have more occurrences of a given term receive a higher score. + * Note that tf(t in q) is assumed to be 1 and therefore does not appear in this equation. + * However, if a query contains the same term twice, there will be + * two term-queries with that same term, and the computation is still correct (although + * not very efficient). A sketch of overriding tf and idf appears after this list. + * The default computation for tf(t in d) in + * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is: + *
     
+ *
+ *   {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)}  =  frequency½
+ *
     
    + *
  2. idf(t) stands for Inverse Document Frequency. This value + * correlates to the inverse of docFreq + * (the number of documents in which the term t appears). + * This means rarer terms give a higher contribution to the total score. + * idf(t) appears for t in both the query and the document, + * hence it is squared in the equation. + * The default computation for idf(t) in + * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is: + *
     
+ *
+ *   {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}  =  1 + log ( numDocs / (docFreq+1) )
+ *
     
    + *
  3. coord(q,d) + * is a score factor based on how many of the query terms are found in the specified document. + * Typically, a document that contains more of the query's terms will receive a higher score + * than another document with fewer query terms. + * This is a search time factor computed in + * {@link SimilarityProvider#coord(int, int) coord(q,d)} + * by the SimilarityProvider in effect at search time. + *
     
    + *
  4. queryNorm(q) + * is a normalizing factor used to make scores between queries comparable. + * This factor does not affect document ranking (since all ranked documents are multiplied by the same factor), + * but rather just attempts to make scores from different queries (or even different indexes) comparable. + * It is a search time factor computed by the SimilarityProvider in effect at search time. + * The default computation in + * {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider} + * produces a Euclidean norm: + *
     
+ *
+ *   queryNorm(q)  =  {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}  =  1 / sumOfSquaredWeights½
+ *
     
    + * + * The sum of squared weights (of the query terms) is + * computed by the query {@link org.apache.lucene.search.Weight} object. + * For example, a {@link org.apache.lucene.search.BooleanQuery} + * computes this value as: + * + *
     
+ *
+ *   {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights}  =  {@link org.apache.lucene.search.Query#getBoost() q.getBoost()}² · ∑ (t in q) ( idf(t) · t.getBoost() )²
+ *
     
    + * + *
  5. t.getBoost() + * is a search time boost of term t in the query q as + * specified in the query text + * (see query syntax), + * or as set by application calls to + * {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}. + * Notice that there is really no direct API for accessing the boost of one term in a multi-term query; + * rather, multi-term queries are represented as multiple + * {@link org.apache.lucene.search.TermQuery TermQuery} objects, + * so the boost of a term in the query is accessible by calling the sub-query's + * {@link org.apache.lucene.search.Query#getBoost() getBoost()}. + *
     
    + *
  6. norm(t,d) encapsulates a few (indexing time) boost and length factors: + *
      + *
    • Document boost - set by calling + * {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()} + * before adding the document to the index. + *
+ *
    • Field boost - set by calling + * {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()} + * before adding the field to a document. + *
+ *
    • lengthNorm - computed + * when the document is added to the index in accordance with the number of tokens + * of this field in the document, so that shorter fields contribute more to the score. + * LengthNorm is computed by the Similarity class in effect at indexing. + *
+ *
    + * The {@link #computeNorm} method is responsible for + * combining all of these factors into a single float. + * + *

    + * When a document is added to the index, all the above factors are multiplied. + * If the document has multiple fields with the same name, all their boosts are multiplied together: + * + *
     
+ *
+ *   norm(t,d)  =  {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()} · lengthNorm · ∏ (field f in d named as t) {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
+ *
     
    + * However, the resulting norm value is {@link #encodeNormValue(float) encoded} as a single byte + * before being stored. + * At search time, the norm byte value is read from the index + * {@link org.apache.lucene.store.Directory directory} and + * {@link #decodeNormValue(byte) decoded} back to a float norm value. + * This encoding/decoding, while reducing index size, comes with the price of + * precision loss - it is not guaranteed that decode(encode(x)) = x. + * For instance, decode(encode(0.89)) = 0.75 (a short demonstration appears at the end of this list). + *
     
    + * Compression of norm values to a single byte saves memory at search time, + * because once a field is referenced at search time, its norms - for + * all documents - are maintained in memory. + *
     
    + * The rationale supporting such lossy compression of norm values is that, + * given how difficult (and inaccurate) it is for users to express their true information + * need in a query, only big differences matter. + *
     
    + * Last, note that search time is too late to modify this norm part of scoring, e.g. by + * using a different {@link Similarity} for search. + *
     
    + *
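     A quick way to see this precision loss (sketch only, using the {@link org.apache.lucene.util.SmallFloat} helpers that encodeNormValue/decodeNormValue delegate to):

       byte b = SmallFloat.floatToByte315(0.89f); // as in encodeNormValue(float)
       float back = SmallFloat.byte315ToFloat(b); // as in decodeNormValue(byte)
       // back is 0.75f, not 0.89f: about one significant decimal digit survives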

+ *
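Putting the pieces together, a minimal sketch (hypothetical, not part of this patch) of customizing the tf and idf components by extending {@link org.apache.lucene.search.DefaultSimilarity}, here with a sublinear tf and a BM25-style idf:

  public class MyTFIDFSimilarity extends DefaultSimilarity {
    @Override
    public float tf(float freq) {
      return 1 + (float) Math.log(1 + freq); // dampen tf harder than sqrt(freq)
    }
    @Override
    public float idf(int docFreq, int numDocs) {
      // BM25-style idf; still grows as the term gets rarer
      return (float) Math.log(1 + (numDocs - docFreq + 0.5) / (docFreq + 0.5));
    }
  }

Such a subclass is plugged in through a SimilarityProvider at both indexing and search time, per the @see entries below.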
+ * + * @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider) + * @see IndexSearcher#setSimilarityProvider(SimilarityProvider) + */ +public abstract class TFIDFSimilarity extends Similarity { + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(int, int)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + *

The default implementation calls {@link #tf(float)}. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public float tf(int freq) { + return tf((float)freq); + } + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(int, int)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public abstract float tf(float freq); + + /** + * Computes a score factor for a simple term and returns an explanation + * for that score factor. + * + *

+ * The default implementation uses: + * + *

+   * idf(docFreq, searcher.maxDoc());
+   * 
+ * + * Note that {@link IndexSearcher#maxDoc()} is used instead of + * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because + * {@link IndexSearcher#docFreq(Term)} is also used, and when the latter + * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction. + * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute. + * + * @param stats statistics of the term in question + * @param searcher the document collection being searched + * @return an Explanation object that includes both an idf score factor + * and an explanation for the term. + * @throws IOException + */ + public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException { + final int df = stats.docFreq(); + final int max = searcher.maxDoc(); + final float idf = idf(df, max); + return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); + } + + /** + * Computes a score factor for a phrase. + *

+ * The default implementation sums the idf factor for + * each term in the phrase. + * + * @param stats statistics of the terms in the phrase + * @param searcher the document collection being searched + * @return an Explanation object that includes both an idf + * score factor for the phrase and an explanation + * for each term. + * @throws IOException + */ + public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException { + final int max = searcher.maxDoc(); + float idf = 0.0f; + final Explanation exp = new Explanation(); + exp.setDescription("idf(), sum of:"); + for (final TermContext stat : stats) { + final int df = stat.docFreq(); + final float termIdf = idf(df, max); + exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); + idf += termIdf; + } + exp.setValue(idf); + return exp; + } + + /** Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). This value is multiplied by the + * {@link #tf(int)} factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + *

Terms that occur in fewer documents are better indicators of topic, so + * implementations of this method usually return larger values for rare terms, + * and smaller values for common terms. + * + * @param docFreq the number of documents which contain the term + * @param numDocs the total number of documents in the collection + * @return a score factor based on the term's document frequency + */ + public abstract float idf(int docFreq, int numDocs); + + /** Cache of decoded bytes. */ + private static final float[] NORM_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) + NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); + } + + /** Decodes a normalization factor stored in an index. + * @see #encodeNormValue(float) + */ + public float decodeNormValue(byte b) { + return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 + } + + /** Encodes a normalization factor for storage in an index. + * + *

The encoding uses a three-bit mantissa, a five-bit exponent, and + * the zero-exponent point at 15, thus + * representing values from around 7x10^9 to 2x10^-9 with about one + * significant decimal digit of accuracy. Zero is also represented. + * Negative numbers are rounded up to zero. Values too large to represent + * are rounded down to the largest representable value. Positive values too + * small to represent are rounded up to the smallest positive representable + * value. + * @see org.apache.lucene.document.Field#setBoost(float) + * @see org.apache.lucene.util.SmallFloat + */ + public byte encodeNormValue(float f) { + return SmallFloat.floatToByte315(f); + } + + @Override + public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, + TermContext... termContexts) throws IOException { + final Explanation idf = termContexts.length == 1 + ? idfExplain(termContexts[0], searcher) + : idfExplain(termContexts, searcher); + return new IDFStats(idf, queryBoost); + } + + @Override + public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + return new ExactTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName)); + } + + @Override + public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + return new SloppyTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName)); + } + + // TODO: we can specialize these for omitNorms up front, but we should test that it doesn't confuse stupid hotspot. + + private final class ExactTFIDFDocScorer extends ExactDocScorer { + private final IDFStats stats; + private final float weightValue; + private final byte[] norms; + private static final int SCORE_CACHE_SIZE = 32; + private float[] scoreCache = new float[SCORE_CACHE_SIZE]; + + ExactTFIDFDocScorer(IDFStats stats, byte norms[]) { + this.stats = stats; + this.weightValue = stats.value; + this.norms = norms; + for (int i = 0; i < SCORE_CACHE_SIZE; i++) + scoreCache[i] = tf(i) * weightValue; + } + + @Override + public float score(int doc, int freq) { + final float raw = // compute tf(f)*weight + freq < SCORE_CACHE_SIZE // check cache + ? scoreCache[freq] // cache hit + : tf(freq)*weightValue; // cache miss + + return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field + } + + @Override + public Explanation explain(int doc, Explanation freq) { + return explainScore(doc, freq, stats, norms); + } + } + + private final class SloppyTFIDFDocScorer extends SloppyDocScorer { + private final IDFStats stats; + private final float weightValue; + private final byte[] norms; + + SloppyTFIDFDocScorer(IDFStats stats, byte norms[]) { + this.stats = stats; + this.weightValue = stats.value; + this.norms = norms; + } + + @Override + public float score(int doc, float freq) { + final float raw = tf(freq) * weightValue; // compute tf(f)*weight + + return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field + } + + @Override + public Explanation explain(int doc, Explanation freq) { + return explainScore(doc, freq, stats, norms); + } + } + + /** Collection statistics for the TF-IDF model. The only statistic of interest + * to this model is idf. 
*/ + private static class IDFStats extends Stats { + /** The idf and its explanation */ + private final Explanation idf; + private float queryNorm; + private float queryWeight; + private final float queryBoost; + private float value; + + public IDFStats(Explanation idf, float queryBoost) { + // TODO: Validate? + this.idf = idf; + this.queryBoost = queryBoost; + this.queryWeight = idf.getValue() * queryBoost; // compute query weight + } + + @Override + public float getValueForNormalization() { + // TODO: (sorta LUCENE-1907) make non-static class and expose this squaring via a nice method to subclasses? + return queryWeight * queryWeight; // sum of squared weights + } + + @Override + public void normalize(float queryNorm, float topLevelBoost) { + this.queryNorm = queryNorm * topLevelBoost; + queryWeight *= this.queryNorm; // normalize query weight + value = queryWeight * idf.getValue(); // idf for document + } + } + + private Explanation explainScore(int doc, Explanation freq, IDFStats stats, byte[] norms) { + Explanation result = new Explanation(); + result.setDescription("score(doc="+doc+",freq="+freq+"), product of:"); + + // explain query weight + Explanation queryExpl = new Explanation(); + queryExpl.setDescription("queryWeight, product of:"); + + Explanation boostExpl = new Explanation(stats.queryBoost, "boost"); + if (stats.queryBoost != 1.0f) + queryExpl.addDetail(boostExpl); + queryExpl.addDetail(stats.idf); + + Explanation queryNormExpl = new Explanation(stats.queryNorm,"queryNorm"); + queryExpl.addDetail(queryNormExpl); + + queryExpl.setValue(boostExpl.getValue() * + stats.idf.getValue() * + queryNormExpl.getValue()); + + result.addDetail(queryExpl); + + // explain field weight + Explanation fieldExpl = new Explanation(); + fieldExpl.setDescription("fieldWeight in "+doc+ + ", product of:"); + + Explanation tfExplanation = new Explanation(); + tfExplanation.setValue(tf(freq.getValue())); + tfExplanation.setDescription("tf(freq="+freq.getValue()+"), with freq of:"); + tfExplanation.addDetail(freq); + fieldExpl.addDetail(tfExplanation); + fieldExpl.addDetail(stats.idf); + + Explanation fieldNormExpl = new Explanation(); + float fieldNorm = + norms!=null ? 
decodeNormValue(norms[doc]) : 1.0f; + fieldNormExpl.setValue(fieldNorm); + fieldNormExpl.setDescription("fieldNorm(doc="+doc+")"); + fieldExpl.addDetail(fieldNormExpl); + + fieldExpl.setValue(tfExplanation.getValue() * + stats.idf.getValue() * + fieldNormExpl.getValue()); + + result.addDetail(fieldExpl); + + // combine them + result.setValue(queryExpl.getValue() * fieldExpl.getValue()); + + if (queryExpl.getValue() == 1.0f) + return fieldExpl; + + return result; + } +} diff --git a/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java b/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java index 501831728d3..192dd434be7 100644 --- a/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java +++ b/lucene/src/java/org/apache/lucene/search/TermCollectingRewrite.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.ReaderUtil; abstract class TermCollectingRewrite extends MultiTermQuery.RewriteMethod { @@ -43,7 +43,7 @@ abstract class TermCollectingRewrite extends MultiTermQuery.Rew addClause(topLevel, term, docCount, boost, null); } - protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, PerReaderTermState states) throws IOException; + protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, TermContext states) throws IOException; protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { diff --git a/lucene/src/java/org/apache/lucene/search/TermQuery.java b/lucene/src/java/org/apache/lucene/search/TermQuery.java index 078d02f7089..936b0bf4581 100644 --- a/lucene/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/TermQuery.java @@ -27,9 +27,9 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; -import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.search.Similarity.ExactDocScorer; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.ToStringUtils; @@ -39,28 +39,19 @@ import org.apache.lucene.util.ToStringUtils; public class TermQuery extends Query { private final Term term; private int docFreq; - private transient PerReaderTermState perReaderTermState; + private transient TermContext perReaderTermState; private class TermWeight extends Weight { private final Similarity similarity; - private float value; - private final float idf; - private float queryNorm; - private float queryWeight; - private final IDFExplanation idfExp; - private transient PerReaderTermState termStates; + private final Similarity.Stats stats; + private transient TermContext termStates; - public TermWeight(IndexSearcher searcher, PerReaderTermState termStates, int docFreq) + public TermWeight(IndexSearcher searcher, TermContext termStates) throws IOException { - assert termStates != null : "PerReaderTermState must not be null"; + assert termStates != null : "TermContext must not be null"; 
this.termStates = termStates; this.similarity = searcher.getSimilarityProvider().get(term.field()); - if (docFreq != -1) { - idfExp = similarity.idfExplain(term, searcher, docFreq); - } else { - idfExp = similarity.idfExplain(term, searcher); - } - idf = idfExp.getIdf(); + this.stats = similarity.computeStats(searcher, term.field(), getBoost(), termStates); } @Override @@ -70,19 +61,13 @@ public class TermQuery extends Query { public Query getQuery() { return TermQuery.this; } @Override - public float getValue() { return value; } - - @Override - public float sumOfSquaredWeights() { - queryWeight = idf * getBoost(); // compute query weight - return queryWeight * queryWeight; // square it + public float getValueForNormalization() { + return stats.getValueForNormalization(); } @Override - public void normalize(float queryNorm) { - this.queryNorm = queryNorm; - queryWeight *= queryNorm; // normalize query weight - value = queryWeight * idf; // idf for document + public void normalize(float queryNorm, float topLevelBoost) { + stats.normalize(queryNorm, topLevelBoost); } @Override @@ -97,7 +82,7 @@ public class TermQuery extends Query { } final DocsEnum docs = reader.termDocsEnum(reader.getLiveDocs(), field, term.bytes(), state); assert docs != null; - return new TermScorer(this, docs, similarity, context.reader.norms(field)); + return new TermScorer(this, docs, similarity.exactDocScorer(stats, field, context)); } private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException { @@ -107,79 +92,25 @@ public class TermQuery extends Query { } @Override - public Explanation explain(AtomicReaderContext context, int doc) - throws IOException { - final IndexReader reader = context.reader; - - ComplexExplanation result = new ComplexExplanation(); - result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); - - Explanation expl = new Explanation(idf, idfExp.explain()); - - // explain query weight - Explanation queryExpl = new Explanation(); - queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); - - Explanation boostExpl = new Explanation(getBoost(), "boost"); - if (getBoost() != 1.0f) - queryExpl.addDetail(boostExpl); - queryExpl.addDetail(expl); - - Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); - queryExpl.addDetail(queryNormExpl); - - queryExpl.setValue(boostExpl.getValue() * - expl.getValue() * - queryNormExpl.getValue()); - - result.addDetail(queryExpl); - - // explain field weight - String field = term.field(); - ComplexExplanation fieldExpl = new ComplexExplanation(); - fieldExpl.setDescription("fieldWeight("+term+" in "+doc+ - "), product of:"); - - Explanation tfExplanation = new Explanation(); - int tf = 0; + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { + IndexReader reader = context.reader; DocsEnum docs = reader.termDocsEnum(context.reader.getLiveDocs(), term.field(), term.bytes()); if (docs != null) { - int newDoc = docs.advance(doc); - if (newDoc == doc) { - tf = docs.freq(); - } - tfExplanation.setValue(similarity.tf(tf)); - tfExplanation.setDescription("tf(termFreq("+term+")="+tf+")"); - } else { - tfExplanation.setValue(0.0f); - tfExplanation.setDescription("no matching term"); + int newDoc = docs.advance(doc); + if (newDoc == doc) { + int freq = docs.freq(); + ExactDocScorer docScorer = similarity.exactDocScorer(stats, term.field(), context); + ComplexExplanation result = new ComplexExplanation(); + result.setDescription("weight("+getQuery()+" in "+doc+") 
[" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "termFreq=" + freq)); + result.addDetail(scoreExplanation); + result.setValue(scoreExplanation.getValue()); + result.setMatch(true); + return result; + } } - fieldExpl.addDetail(tfExplanation); - fieldExpl.addDetail(expl); - - Explanation fieldNormExpl = new Explanation(); - final byte[] fieldNorms = reader.norms(field); - float fieldNorm = - fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f; - fieldNormExpl.setValue(fieldNorm); - fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); - fieldExpl.addDetail(fieldNormExpl); - fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch())); - fieldExpl.setValue(tfExplanation.getValue() * - expl.getValue() * - fieldNormExpl.getValue()); - - result.addDetail(fieldExpl); - result.setMatch(fieldExpl.getMatch()); - - // combine them - result.setValue(queryExpl.getValue() * fieldExpl.getValue()); - - if (queryExpl.getValue() == 1.0f) - return fieldExpl; - - return result; + return new ComplexExplanation(false, 0.0f, "no matching term"); } } @@ -200,7 +131,7 @@ public class TermQuery extends Query { /** Expert: constructs a TermQuery that will use the * provided docFreq instead of looking up the docFreq * against the searcher. */ - public TermQuery(Term t, PerReaderTermState states) { + public TermQuery(Term t, TermContext states) { assert states != null; term = t; docFreq = states.docFreq(); @@ -213,20 +144,20 @@ public class TermQuery extends Query { @Override public Weight createWeight(IndexSearcher searcher) throws IOException { final ReaderContext context = searcher.getTopReaderContext(); - final int weightDocFreq; - final PerReaderTermState termState; + final TermContext termState; if (perReaderTermState == null || perReaderTermState.topReaderContext != context) { // make TermQuery single-pass if we don't have a PRTS or if the context differs! - termState = PerReaderTermState.build(context, term, true); // cache term lookups! - // we must not ignore the given docFreq - if set use the given value - weightDocFreq = docFreq == -1 ? termState.docFreq() : docFreq; + termState = TermContext.build(context, term, true); // cache term lookups! } else { // PRTS was pre-build for this IS termState = this.perReaderTermState; - weightDocFreq = docFreq; } + + // we must not ignore the given docFreq - if set use the given value (lie) + if (docFreq != -1) + termState.setDocFreq(docFreq); - return new TermWeight(searcher, termState, weightDocFreq); + return new TermWeight(searcher, termState); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/TermScorer.java b/lucene/src/java/org/apache/lucene/search/TermScorer.java index 9a9ef5eeb3c..3534079fb34 100644 --- a/lucene/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/src/java/org/apache/lucene/search/TermScorer.java @@ -25,20 +25,16 @@ import org.apache.lucene.index.DocsEnum; */ final class TermScorer extends Scorer { private DocsEnum docsEnum; - private byte[] norms; - private float weightValue; private int doc = -1; private int freq; private int pointer; private int pointerMax; - private static final int SCORE_CACHE_SIZE = 32; - private float[] scoreCache = new float[SCORE_CACHE_SIZE]; private int[] docs; private int[] freqs; private final DocsEnum.BulkReadResult bulkResult; - private final Similarity similarity; + private final Similarity.ExactDocScorer docScorer; /** * Construct a TermScorer. 
@@ -47,22 +43,15 @@ final class TermScorer extends Scorer { * The weight of the Term in the query. * @param td * An iterator over the documents matching the Term. - * @param similarity - * The Similarity implementation to be used for score - * computations. - * @param norms - * The field norms of the document fields for the Term. + * @param docScorer + * The Similarity.ExactDocScorer implementation + * to be used for score computations. */ - TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) { + TermScorer(Weight weight, DocsEnum td, Similarity.ExactDocScorer docScorer) throws IOException { super(weight); - this.similarity = similarity; + this.docScorer = docScorer; this.docsEnum = td; - this.norms = norms; - this.weightValue = weight.getValue(); bulkResult = td.getBulkResult(); - - for (int i = 0; i < SCORE_CACHE_SIZE; i++) - scoreCache[i] = similarity.tf(i) * weightValue; } @Override @@ -134,12 +123,7 @@ final class TermScorer extends Scorer { @Override public float score() { assert doc != NO_MORE_DOCS; - float raw = // compute tf(f)*weight - freq < SCORE_CACHE_SIZE // check cache - ? scoreCache[freq] // cache hit - : similarity.tf(freq)*weightValue; // cache miss - - return norms == null ? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize for field + return docScorer.score(doc, freq); } /** diff --git a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java index 5b322a87910..4ad6222b801 100644 --- a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java +++ b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java @@ -29,7 +29,7 @@ import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; /** * Base rewrite method for collecting only the top terms @@ -80,7 +80,7 @@ public abstract class TopTermsRewrite extends TermCollectingRew this.termComp = termsEnum.getComparator(); // lazy init the initial ScoreTerm because comparator is not known on ctor: if (st == null) - st = new ScoreTerm(this.termComp, new PerReaderTermState(topReaderContext)); + st = new ScoreTerm(this.termComp, new TermContext(topReaderContext)); boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); } @@ -101,14 +101,14 @@ public abstract class TopTermsRewrite extends TermCollectingRew if (t != null) { // if the term is already in the PQ, only update docFreq of term in PQ assert t.boost == boost : "boost should be equal in all segment TermsEnums"; - t.termState.register(state, readerContext.ord, termsEnum.docFreq()); + t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } else { // add new entry in PQ, we must clone the term, else it may get overwritten! st.bytes.copy(bytes); st.boost = boost; visitedTerms.put(st.bytes, st); assert st.termState.docFreq() == 0; - st.termState.register(state, readerContext.ord, termsEnum.docFreq()); + st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); stQueue.offer(st); // possibly drop entries from queue if (stQueue.size() > maxSize) { @@ -116,7 +116,7 @@ public abstract class TopTermsRewrite extends TermCollectingRew visitedTerms.remove(st.bytes); st.termState.clear(); // reset the termstate! 
} else { - st = new ScoreTerm(termComp, new PerReaderTermState(topReaderContext)); + st = new ScoreTerm(termComp, new TermContext(topReaderContext)); } assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize"; // set maxBoostAtt with values to help FuzzyTermsEnum to optimize @@ -171,8 +171,8 @@ public abstract class TopTermsRewrite extends TermCollectingRew public final Comparator termComp; public final BytesRef bytes = new BytesRef(); public float boost; - public final PerReaderTermState termState; - public ScoreTerm(Comparator termComp, PerReaderTermState termState) { + public final TermContext termState; + public ScoreTerm(Comparator termComp, TermContext termState) { this.termComp = termComp; this.termState = termState; } diff --git a/lucene/src/java/org/apache/lucene/search/Weight.java b/lucene/src/java/org/apache/lucene/search/Weight.java index 3fb892714c6..e99c5a6b5cb 100644 --- a/lucene/src/java/org/apache/lucene/search/Weight.java +++ b/lucene/src/java/org/apache/lucene/search/Weight.java @@ -41,11 +41,11 @@ import org.apache.lucene.index.IndexReader.ReaderContext; *

    *
  1. A Weight is constructed by a top-level query, given a * IndexSearcher ({@link Query#createWeight(IndexSearcher)}).
- *   2. The {@link #sumOfSquaredWeights()} method is called on the
+ *   3. The {@link #getValueForNormalization()} method is called on the * Weight to compute the query normalization factor * {@link SimilarityProvider#queryNorm(float)} of the query clauses contained in the * query.
- *   4. The query normalization factor is passed to {@link #normalize(float)}. At
+ *   5. The query normalization factor is passed to {@link #normalize(float, float)}. At * this point the weighting is complete.
  6. A Scorer is constructed by * {@link #scorer(IndexReader.AtomicReaderContext, ScorerContext)}. @@ -67,12 +67,12 @@ public abstract class Weight { /** The query that this concerns. */ public abstract Query getQuery(); + + /** The value for normalization of contained query clauses (e.g. sum of squared weights). */ + public abstract float getValueForNormalization() throws IOException; - /** The weight for this query. */ - public abstract float getValue(); - - /** Assigns the query normalization factor to this. */ - public abstract void normalize(float norm); + /** Assigns the query normalization factor and boost from parent queries to this. */ + public abstract void normalize(float norm, float topLevelBoost); /** * Returns a {@link Scorer} which scores documents in/out-of order according @@ -93,9 +93,6 @@ public abstract class Weight { * @throws IOException */ public abstract Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException; - - /** The sum of squared weights of contained query clauses. */ - public abstract float sumOfSquaredWeights() throws IOException; /** * Returns true iff this implementation scores docs only out of order. This diff --git a/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java b/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java index da91ef59f9d..ac2f5008cd3 100644 --- a/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java +++ b/lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java @@ -18,11 +18,13 @@ package org.apache.lucene.search.payloads; */ import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.ComplexExplanation; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Weight; +import org.apache.lucene.search.Similarity.SloppyDocScorer; import org.apache.lucene.search.spans.NearSpansOrdered; import org.apache.lucene.search.spans.NearSpansUnordered; import org.apache.lucene.search.spans.SpanNearQuery; @@ -145,7 +147,35 @@ public class PayloadNearQuery extends SpanNearQuery { @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { return new PayloadNearSpanScorer(query.getSpans(context), this, - similarity, context.reader.norms(query.getField())); + similarity, similarity.sloppyDocScorer(stats, query.getField(), context)); + } + + @Override + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { + PayloadNearSpanScorer scorer = (PayloadNearSpanScorer) scorer(context, ScorerContext.def()); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context); + Explanation expl = new Explanation(); + expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + expl.addDetail(scoreExplanation); + expl.setValue(scoreExplanation.getValue()); + // now the payloads part + Explanation payloadExpl = function.explain(doc, scorer.payloadsSeen, scorer.payloadScore); + // combined + ComplexExplanation result = new ComplexExplanation(); + result.addDetail(expl); + result.addDetail(payloadExpl); + 
result.setValue(expl.getValue() * payloadExpl.getValue()); + result.setDescription("PayloadNearQuery, product of:"); + return result; + } + } + + return new ComplexExplanation(false, 0.0f, "no matching term"); } } @@ -155,8 +185,8 @@ public class PayloadNearQuery extends SpanNearQuery { private int payloadsSeen; protected PayloadNearSpanScorer(Spans spans, Weight weight, - Similarity similarity, byte[] norms) throws IOException { - super(spans, weight, similarity, norms); + Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException { + super(spans, weight, similarity, docScorer); this.spans = spans; } @@ -225,20 +255,6 @@ public class PayloadNearQuery extends SpanNearQuery { return super.score() * function.docScore(doc, fieldName, payloadsSeen, payloadScore); } - - @Override - protected Explanation explain(int doc) throws IOException { - Explanation result = new Explanation(); - // Add detail about tf/idf... - Explanation nonPayloadExpl = super.explain(doc); - result.addDetail(nonPayloadExpl); - // Add detail about payload - Explanation payloadExpl = function.explain(doc, payloadsSeen, payloadScore); - result.addDetail(payloadExpl); - result.setValue(nonPayloadExpl.getValue() * payloadExpl.getValue()); - result.setDescription("PayloadNearQuery, product of:"); - return result; - } } } diff --git a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java index b3415a7b42c..9c697de1474 100644 --- a/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java +++ b/lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java @@ -26,6 +26,9 @@ import org.apache.lucene.search.Weight; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.ComplexExplanation; +import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.search.Weight.ScorerContext; +import org.apache.lucene.search.payloads.PayloadNearQuery.PayloadNearSpanScorer; import org.apache.lucene.search.spans.TermSpans; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanWeight; @@ -76,7 +79,7 @@ public class PayloadTermQuery extends SpanTermQuery { @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { return new PayloadTermSpanScorer((TermSpans) query.getSpans(context), - this, similarity, context.reader.norms(query.getField())); + this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context)); } protected class PayloadTermSpanScorer extends SpanScorer { @@ -86,8 +89,8 @@ public class PayloadTermQuery extends SpanTermQuery { private final TermSpans termSpans; public PayloadTermSpanScorer(TermSpans spans, Weight weight, - Similarity similarity, byte[] norms) throws IOException { - super(spans, weight, similarity, norms); + Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException { + super(spans, weight, similarity, docScorer); termSpans = spans; } @@ -173,29 +176,40 @@ public class PayloadTermQuery extends SpanTermQuery { protected float getPayloadScore() { return function.docScore(doc, term.field(), payloadsSeen, payloadScore); } - - @Override - protected Explanation explain(final int doc) throws IOException { - ComplexExplanation result = new ComplexExplanation(); - Explanation nonPayloadExpl = super.explain(doc); - result.addDetail(nonPayloadExpl); - // QUESTION: Is there a way to 
avoid this skipTo call? We need to know - // whether to load the payload or not - Explanation payloadBoost = new Explanation(); - result.addDetail(payloadBoost); - - float payloadScore = getPayloadScore(); - payloadBoost.setValue(payloadScore); - // GSI: I suppose we could toString the payload, but I don't think that - // would be a good idea - payloadBoost.setDescription("scorePayload(...)"); - result.setValue(nonPayloadExpl.getValue() * payloadScore); - result.setDescription("btq, product of:"); - result.setMatch(nonPayloadExpl.getValue() == 0 ? Boolean.FALSE - : Boolean.TRUE); // LUCENE-1303 - return result; + } + + @Override + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { + PayloadTermSpanScorer scorer = (PayloadTermSpanScorer) scorer(context, ScorerContext.def()); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context); + Explanation expl = new Explanation(); + expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + expl.addDetail(scoreExplanation); + expl.setValue(scoreExplanation.getValue()); + // now the payloads part + // QUESTION: Is there a way to avoid this skipTo call? We need to know + // whether to load the payload or not + // GSI: I suppose we could toString the payload, but I don't think that + // would be a good idea + Explanation payloadExpl = new Explanation(scorer.getPayloadScore(), "scorePayload(...)"); + payloadExpl.setValue(scorer.getPayloadScore()); + // combined + ComplexExplanation result = new ComplexExplanation(); + result.addDetail(expl); + result.addDetail(payloadExpl); + result.setValue(expl.getValue() * payloadExpl.getValue()); + result.setDescription("btq, product of:"); + result.setMatch(expl.getValue() == 0 ? 
Boolean.FALSE : Boolean.TRUE); // LUCENE-1303 + return result; + } } - + + return new ComplexExplanation(false, 0.0f, "no matching term"); } } diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java b/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java index 865e2b1eb46..a393b38b5df 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java @@ -27,7 +27,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TopTermsRewrite; import org.apache.lucene.search.ScoringRewrite; import org.apache.lucene.search.BooleanClause.Occur; // javadocs only -import org.apache.lucene.util.PerReaderTermState; +import org.apache.lucene.util.TermContext; /** * Wraps any {@link MultiTermQuery} as a {@link SpanQuery}, @@ -155,7 +155,7 @@ public class SpanMultiTermQueryWrapper extends SpanQue } @Override - protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) { + protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, TermContext states) { final SpanTermQuery q = new SpanTermQuery(term); q.setBoost(boost); topLevel.addClause(q); @@ -204,7 +204,7 @@ public class SpanMultiTermQueryWrapper extends SpanQue } @Override - protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) { + protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, TermContext states) { final SpanTermQuery q = new SpanTermQuery(term); q.setBoost(boost); topLevel.addClause(q); diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java b/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java index 8b309a3df68..9cce1f45e4b 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java @@ -20,6 +20,7 @@ package org.apache.lucene.search.spans; import java.io.IOException; import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.TFIDFSimilarity; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Similarity; @@ -29,22 +30,21 @@ import org.apache.lucene.search.Similarity; */ public class SpanScorer extends Scorer { protected Spans spans; - protected byte[] norms; - protected float value; protected boolean more = true; protected int doc; protected float freq; protected final Similarity similarity; + protected final Similarity.SloppyDocScorer docScorer; - protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) + protected SpanScorer(Spans spans, Weight weight, Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException { super(weight); this.similarity = similarity; + this.docScorer = docScorer; this.spans = spans; - this.norms = norms; - this.value = weight.getValue(); + if (this.spans.next()) { doc = -1; } else { @@ -94,27 +94,11 @@ public class SpanScorer extends Scorer { @Override public float score() throws IOException { - float raw = similarity.tf(freq) * value; // raw score - return norms == null? 
raw : raw * similarity.decodeNormValue(norms[doc]); // normalize + return docScorer.score(doc, freq); } @Override public float freq() throws IOException { return freq; } - - /** This method is no longer an official member of {@link Scorer}, - * but it is needed by SpanWeight to build an explanation. */ - protected Explanation explain(final int doc) throws IOException { - Explanation tfExplanation = new Explanation(); - - int expDoc = advance(doc); - - float phraseFreq = (expDoc == doc) ? freq : 0.0f; - tfExplanation.setValue(similarity.tf(phraseFreq)); - tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); - - return tfExplanation; - } - } diff --git a/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java b/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java index 104bacf0a37..cf8bf4e22a2 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java +++ b/lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java @@ -18,125 +18,76 @@ package org.apache.lucene.search.spans; */ import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; -import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.util.TermContext; import java.io.IOException; -import java.util.HashSet; import java.util.Set; +import java.util.TreeSet; /** * Expert-only. Public for use by other weight implementations */ public class SpanWeight extends Weight { protected Similarity similarity; - protected float value; - protected float idf; - protected float queryNorm; - protected float queryWeight; - protected Set terms; protected SpanQuery query; - private IDFExplanation idfExp; + protected Similarity.Stats stats; public SpanWeight(SpanQuery query, IndexSearcher searcher) throws IOException { this.similarity = searcher.getSimilarityProvider().get(query.getField()); this.query = query; - terms=new HashSet(); + terms=new TreeSet(); query.extractTerms(terms); - - idfExp = similarity.idfExplain(terms, searcher); - idf = idfExp.getIdf(); + final ReaderContext context = searcher.getTopReaderContext(); + final TermContext states[] = new TermContext[terms.size()]; + int i = 0; + for (Term term : terms) + states[i++] = TermContext.build(context, term, true); + stats = similarity.computeStats(searcher, query.getField(), query.getBoost(), states); } @Override public Query getQuery() { return query; } @Override - public float getValue() { return value; } - - @Override - public float sumOfSquaredWeights() throws IOException { - queryWeight = idf * query.getBoost(); // compute query weight - return queryWeight * queryWeight; // square it + public float getValueForNormalization() throws IOException { + return stats.getValueForNormalization(); } @Override - public void normalize(float queryNorm) { - this.queryNorm = queryNorm; - queryWeight *= queryNorm; // normalize query weight - value = queryWeight * idf; // idf for document + public void normalize(float queryNorm, float topLevelBoost) { + stats.normalize(queryNorm, topLevelBoost); } @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new SpanScorer(query.getSpans(context), this, similarity, context.reader - .norms(query.getField())); + return new SpanScorer(query.getSpans(context), this, similarity, similarity.sloppyDocScorer(stats, query.getField(), 
context)); } @Override - public Explanation explain(AtomicReaderContext context, int doc) - throws IOException { - - ComplexExplanation result = new ComplexExplanation(); - result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); - String field = ((SpanQuery)getQuery()).getField(); - - Explanation idfExpl = - new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")"); - - // explain query weight - Explanation queryExpl = new Explanation(); - queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); - - Explanation boostExpl = new Explanation(getQuery().getBoost(), "boost"); - if (getQuery().getBoost() != 1.0f) - queryExpl.addDetail(boostExpl); - queryExpl.addDetail(idfExpl); - - Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); - queryExpl.addDetail(queryNormExpl); - - queryExpl.setValue(boostExpl.getValue() * - idfExpl.getValue() * - queryNormExpl.getValue()); - - result.addDetail(queryExpl); - - // explain field weight - ComplexExplanation fieldExpl = new ComplexExplanation(); - fieldExpl.setDescription("fieldWeight("+field+":"+query.toString(field)+ - " in "+doc+"), product of:"); - - Explanation tfExpl = ((SpanScorer)scorer(context, ScorerContext.def())).explain(doc); - fieldExpl.addDetail(tfExpl); - fieldExpl.addDetail(idfExpl); - - Explanation fieldNormExpl = new Explanation(); - byte[] fieldNorms = context.reader.norms(field); - float fieldNorm = - fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f; - fieldNormExpl.setValue(fieldNorm); - fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); - fieldExpl.addDetail(fieldNormExpl); - - fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch())); - fieldExpl.setValue(tfExpl.getValue() * - idfExpl.getValue() * - fieldNormExpl.getValue()); - - result.addDetail(fieldExpl); - result.setMatch(fieldExpl.getMatch()); - - // combine them - result.setValue(queryExpl.getValue() * fieldExpl.getValue()); - - if (queryExpl.getValue() == 1.0f) - return fieldExpl; - - return result; + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { + Scorer scorer = scorer(context, ScorerContext.def()); + if (scorer != null) { + int newDoc = scorer.advance(doc); + if (newDoc == doc) { + float freq = scorer.freq(); + SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context); + ComplexExplanation result = new ComplexExplanation(); + result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); + result.addDetail(scoreExplanation); + result.setValue(scoreExplanation.getValue()); + result.setMatch(true); + return result; + } + } + + return new ComplexExplanation(false, 0.0f, "no matching term"); } } diff --git a/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java b/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java index 371425f2c1b..92c525604d6 100644 --- a/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java +++ b/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java @@ -60,7 +60,7 @@ public abstract class CompoundFileDirectory extends Directory { * NOTE: subclasses must call {@link #initForRead(Map)} before the directory can be used. 
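 * <p>
 * A minimal read-side sketch (file names borrowed from the tests added in this
 * patch; error handling omitted):
 * <pre>
 *   CompoundFileDirectory cfs = dir.openCompoundInput("d.cfs", IOContext.DEFAULT);
 *   IndexInput in = cfs.openInput("d.xyz", IOContext.DEFAULT);
 *   int first = in.readInt();
 *   in.close();
 *   cfs.close(); // closing a second time is a no-op after this change
 * </pre>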
*/ public CompoundFileDirectory(Directory directory, String fileName, IOContext context) throws IOException { - assert !(directory instanceof CompoundFileDirectory) : "compound file inside of compound file: " + fileName; + this.directory = directory; this.fileName = fileName; this.readBufferSize = BufferedIndexInput.bufferSize(context); @@ -75,9 +75,11 @@ public abstract class CompoundFileDirectory extends Directory { } protected final void initForWrite() { + assert !(directory instanceof CompoundFileDirectory) : "compound file inside of compound file: " + fileName; this.entries = SENTINEL; this.openForWrite = true; this.isOpen = true; + writer = new CompoundFileWriter(directory, fileName); } /** Helper method that reads CFS entries from an input stream */ @@ -173,7 +175,11 @@ public abstract class CompoundFileDirectory extends Directory { @Override public synchronized void close() throws IOException { - ensureOpen(); + if (!isOpen) { + // allow double close - usually to be consistent with other closeables + assert entries == null; + return; // already closed + } entries = null; isOpen = false; if (writer != null) { @@ -263,7 +269,6 @@ public abstract class CompoundFileDirectory extends Directory { @Override public IndexOutput createOutput(String name, IOContext context) throws IOException { ensureOpen(); - initWriter(); return writer.createOutput(name, context); } @@ -279,12 +284,13 @@ public abstract class CompoundFileDirectory extends Directory { throw new UnsupportedOperationException(); } - /** Not implemented * @throws UnsupportedOperationException */ @Override - public final CompoundFileDirectory openCompoundInput(String name, IOContext context) throws IOException { - // NOTE: final to make nested compounding impossible. - throw new UnsupportedOperationException(); + public CompoundFileDirectory openCompoundInput(String name, IOContext context) throws IOException { + FileEntry fileEntry = this.entries.get(IndexFileNames.stripSegmentName(name)); + if (fileEntry == null) { + throw new FileNotFoundException("file " + name + " does not exist in this CFS"); + } + return new NestedCompoundFileDirectory(name, context, fileEntry.offset, fileEntry.length); } /** Not implemented * @throws UnsupportedOperationException */ @Override public CompoundFileDirectory createCompoundOutput(String name, IOContext context) throws IOException { - // NOTE: final to make nested compounding impossible.
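// Reading a nested CFS is now supported while writing one directly is not;
// a condensed sketch of the read path (mirrors testReadNestedCFP further
// below, reusing that test's file names):
//
//   CompoundFileDirectory outer = dir.openCompoundInput("d.cfs", IOContext.DEFAULT);
//   CompoundFileDirectory inner = outer.openCompoundInput("b.cfs", IOContext.DEFAULT);
//   IndexInput in = inner.openInput("b.xyz", IOContext.DEFAULT);
//   int value = in.readInt();
//   in.close(); inner.close(); outer.close();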
- throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("cannot create nested CFS, create separately and use Directory.copy instead"); + } + + private class NestedCompoundFileDirectory extends CompoundFileDirectory { + + private final long cfsOffset; + private final long cfsLength; + + public NestedCompoundFileDirectory(String fileName, IOContext context, long offset, long length) + throws IOException { + super(directory, fileName, context); + this.cfsOffset = offset; + this.cfsLength = length; + IndexInput input = null; + try { + input = CompoundFileDirectory.this.openInput(fileName, IOContext.READONCE); + initForRead(CompoundFileDirectory.readEntries(input, + CompoundFileDirectory.this, fileName)); + } finally { + IOUtils.closeSafely(false, input); + } + } + + @Override + public IndexInput openInputSlice(String id, long offset, long length, + int readBufferSize) throws IOException { + assert offset + length <= cfsLength; + return CompoundFileDirectory.this.openInputSlice(id, cfsOffset + offset, length, readBufferSize); + } + } - private final void initWriter() { - assert openForWrite; - assert entries == SENTINEL; - if (writer == null) { - writer = new CompoundFileWriter(directory, fileName); - } - } - } diff --git a/lucene/src/java/org/apache/lucene/store/CompoundFileWriter.java b/lucene/src/java/org/apache/lucene/store/CompoundFileWriter.java index 7587a981310..cd56d8fa7f6 100644 --- a/lucene/src/java/org/apache/lucene/store/CompoundFileWriter.java +++ b/lucene/src/java/org/apache/lucene/store/CompoundFileWriter.java @@ -17,6 +17,7 @@ package org.apache.lucene.store; * limitations under the License. */ +import java.io.Closeable; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Collection; @@ -55,7 +56,7 @@ import org.apache.lucene.util.IOUtils; * * @lucene.internal */ -final class CompoundFileWriter { +final class CompoundFileWriter implements Closeable { private static final class FileEntry { /** source file */ @@ -89,8 +90,8 @@ final class CompoundFileWriter { private boolean closed = false; private volatile IndexOutput dataOut; private final AtomicBoolean outputTaken = new AtomicBoolean(false); - private final String entryTableName; - private final String dataFileName; + final String entryTableName; + final String dataFileName; /** * Create the compound stream in the specified file.
The file name is the @@ -128,17 +129,14 @@ final class CompoundFileWriter { * if close() had been called before or if no file has been added to * this object */ - void close() throws IOException { + public void close() throws IOException { if (closed) { throw new IllegalStateException("already closed"); } IOException priorException = null; IndexOutput entryTableOut = null; try { - if (entries.isEmpty()) { - throw new IllegalStateException("CFS has no entries"); - } - + initDataOut(IOContext.DEFAULT); if (!pendingEntries.isEmpty() || outputTaken.get()) { throw new IllegalStateException("CFS has pending open files"); } @@ -147,12 +145,18 @@ final class CompoundFileWriter { assert dataOut != null; long finalLength = dataOut.getFilePointer(); assert assertFileLength(finalLength, dataOut); + } catch (IOException e) { + priorException = e; + } finally { + IOUtils.closeSafely(priorException, dataOut); + } + try { entryTableOut = directory.createOutput(entryTableName, IOContext.DEFAULT); writeEntryTable(entries.values(), entryTableOut); } catch (IOException e) { priorException = e; } finally { - IOUtils.closeSafely(priorException, dataOut, entryTableOut); + IOUtils.closeSafely(priorException, entryTableOut); } } @@ -321,6 +325,7 @@ final class CompoundFileWriter { closed = true; entry.length = writtenBytes; if (isSeparate) { + delegate.close(); // we are a separate file - push into the pending entries pendingEntries.add(entry); } else { diff --git a/lucene/src/java/org/apache/lucene/util/PerReaderTermState.java b/lucene/src/java/org/apache/lucene/util/TermContext.java similarity index 73% rename from lucene/src/java/org/apache/lucene/util/PerReaderTermState.java rename to lucene/src/java/org/apache/lucene/util/TermContext.java index a5139b6335e..746405c353d 100644 --- a/lucene/src/java/org/apache/lucene/util/PerReaderTermState.java +++ b/lucene/src/java/org/apache/lucene/util/TermContext.java @@ -28,25 +28,27 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.IndexReader.ReaderContext; +import org.apache.lucene.index.TermsEnum.SeekStatus; /** * Maintains a {@link IndexReader} {@link TermState} view over * {@link IndexReader} instances containing a single term. The - * {@link PerReaderTermState} doesn't track if the given {@link TermState} + * {@link TermContext} doesn't track if the given {@link TermState} * objects are valid, neither if the {@link TermState} instances refer to the * same terms in the associated readers. * * @lucene.experimental */ -public final class PerReaderTermState { +public final class TermContext { public final ReaderContext topReaderContext; // for asserting! private final TermState[] states; private int docFreq; + private long totalTermFreq; /** - * Creates an empty {@link PerReaderTermState} from a {@link ReaderContext} + * Creates an empty {@link TermContext} from a {@link ReaderContext} */ - public PerReaderTermState(ReaderContext context) { + public TermContext(ReaderContext context) { assert context != null && context.isTopLevel; topReaderContext = context; docFreq = 0; @@ -60,28 +62,28 @@ public final class PerReaderTermState { } /** - * Creates a {@link PerReaderTermState} with an initial {@link TermState}, + * Creates a {@link TermContext} with an initial {@link TermState}, * {@link IndexReader} pair. 
*/ - public PerReaderTermState(ReaderContext context, TermState state, int ord, int docFreq) { + public TermContext(ReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) { this(context); - register(state, ord, docFreq); + register(state, ord, docFreq, totalTermFreq); } /** - * Creates a {@link PerReaderTermState} from a top-level {@link ReaderContext} and the + * Creates a {@link TermContext} from a top-level {@link ReaderContext} and the * given {@link Term}. This method will look up the given term in all context's leaf readers - * and register each of the readers containing the term in the returned {@link PerReaderTermState} + * and register each of the readers containing the term in the returned {@link TermContext} * using the leaf reader's ordinal. * <p>

* Note: the given context must be a top-level context. */ - public static PerReaderTermState build(ReaderContext context, Term term, boolean cache) + public static TermContext build(ReaderContext context, Term term, boolean cache) throws IOException { assert context != null && context.isTopLevel; final String field = term.field(); final BytesRef bytes = term.bytes(); - final PerReaderTermState perReaderTermState = new PerReaderTermState(context); + final TermContext perReaderTermState = new TermContext(context); final AtomicReaderContext[] leaves = ReaderUtil.leaves(context); for (int i = 0; i < leaves.length; i++) { final Fields fields = leaves[i].reader.fields(); @@ -91,7 +93,7 @@ public final class PerReaderTermState { final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share! if (termsEnum.seekExact(bytes, cache)) { final TermState termState = termsEnum.termState(); - perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq()); + perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } } } @@ -100,7 +102,7 @@ public final class PerReaderTermState { } /** - * Clears the {@link PerReaderTermState} internal state and removes all + * Clears the {@link TermContext} internal state and removes all * registered {@link TermState}s */ public void clear() { @@ -112,12 +114,16 @@ public final class PerReaderTermState { * Registers and associates a {@link TermState} with a leaf ordinal. The leaf ordinal * should be derived from a {@link ReaderContext}'s leaf ord. */ - public void register(TermState state, final int ord, final int docFreq) { + public void register(TermState state, final int ord, final int docFreq, final long totalTermFreq) { assert state != null : "state must not be null"; assert ord >= 0 && ord < states.length; assert states[ord] == null : "state for ord: " + ord + " already registered"; this.docFreq += docFreq; + if (this.totalTermFreq >= 0 && totalTermFreq >= 0) + this.totalTermFreq += totalTermFreq; + else + this.totalTermFreq = -1; states[ord] = state; } @@ -137,11 +143,27 @@ public final class PerReaderTermState { /** * Returns the accumulated document frequency of all {@link TermState} - * instances passed to {@link #register(TermState, int, int)}. + * instances passed to {@link #register(TermState, int, int, long)}. * @return the accumulated document frequency of all {@link TermState} - * instances passed to {@link #register(TermState, int, int)}. + * instances passed to {@link #register(TermState, int, int, long)}. */ public int docFreq() { return docFreq; } + + /** + * Returns the accumulated term frequency of all {@link TermState} + * instances passed to {@link #register(TermState, int, int, long)}. + * @return the accumulated term frequency of all {@link TermState} + * instances passed to {@link #register(TermState, int, int, long)}.
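+   * <p>
+   * A small usage sketch (field and term invented) of how callers obtain the
+   * aggregated statistics:
+   * <pre>
+   *   TermContext ctx = TermContext.build(searcher.getTopReaderContext(),
+   *                                       new Term("body", "lucene"), true);
+   *   int df = ctx.docFreq();         // summed over all registered leaves
+   *   long ttf = ctx.totalTermFreq(); // -1 if any leaf cannot provide it
+   * </pre>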
+ */ + public long totalTermFreq() { + return totalTermFreq; + } + + /** expert: only available for queries that want to lie about docfreq + * @lucene.internal */ + public void setDocFreq(int docFreq) { + this.docFreq = docFreq; + } } \ No newline at end of file diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java index aa3c799c705..e665e82f02c 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockFixedIntBlockCodec.java @@ -32,6 +32,7 @@ import org.apache.lucene.index.codecs.sep.IntIndexInput; import org.apache.lucene.index.codecs.sep.IntIndexOutput; import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; +import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput; import org.apache.lucene.index.codecs.DefaultDocValuesProducer; @@ -46,7 +47,6 @@ import org.apache.lucene.index.codecs.BlockTermsReader; import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.TermsIndexReaderBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase; -import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.store.*; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; @@ -62,8 +62,8 @@ public class MockFixedIntBlockCodec extends Codec { private final int blockSize; public MockFixedIntBlockCodec(int blockSize) { + super("MockFixedIntBlock"); this.blockSize = blockSize; - name = "MockFixedIntBlock"; } @Override @@ -207,7 +207,7 @@ public class MockFixedIntBlockCodec extends Codec { SepPostingsReaderImpl.files(segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS()); } @Override @@ -215,16 +215,16 @@ public class MockFixedIntBlockCodec extends Codec { SepPostingsWriterImpl.getExtensions(extensions); BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context); } } diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java 
b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java index 3ba849ce75a..6d15b92cd43 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockintblock/MockVariableIntBlockCodec.java @@ -32,6 +32,7 @@ import org.apache.lucene.index.codecs.sep.IntIndexInput; import org.apache.lucene.index.codecs.sep.IntIndexOutput; import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl; import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl; +import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexInput; import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexOutput; import org.apache.lucene.index.codecs.DefaultDocValuesProducer; @@ -46,7 +47,6 @@ import org.apache.lucene.index.codecs.BlockTermsReader; import org.apache.lucene.index.codecs.BlockTermsWriter; import org.apache.lucene.index.codecs.TermsIndexReaderBase; import org.apache.lucene.index.codecs.TermsIndexWriterBase; -import org.apache.lucene.index.codecs.standard.StandardCodec; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -63,9 +63,9 @@ import org.apache.lucene.util.IOUtils; public class MockVariableIntBlockCodec extends Codec { private final int baseBlockSize; - + public MockVariableIntBlockCodec(int baseBlockSize) { - name = "MockVariableIntBlock"; + super("MockVariableIntBlock"); this.baseBlockSize = baseBlockSize; } @@ -230,7 +230,7 @@ public class MockVariableIntBlockCodec extends Codec { SepPostingsReaderImpl.files(segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS()); } @Override @@ -238,16 +238,16 @@ public class MockVariableIntBlockCodec extends Codec { SepPostingsWriterImpl.getExtensions(extensions); BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context); } } diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java index d147694d60a..c15865c7aa8 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java @@ -76,9 +76,9 @@ public 
class MockRandomCodec extends Codec { private final Random seedRandom; private final String SEED_EXT = "sd"; - + public MockRandomCodec(Random random) { - name = "MockRandom"; + super("MockRandom"); this.seedRandom = new Random(random.nextLong()); } @@ -355,7 +355,7 @@ public class MockRandomCodec extends Codec { BlockTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); VariableGapTermsIndexReader.files(dir, segmentInfo, codecId, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS()); // hackish! Iterator it = files.iterator(); while(it.hasNext()) { @@ -373,7 +373,7 @@ public class MockRandomCodec extends Codec { BlockTermsReader.getExtensions(extensions); FixedGapTermsIndexReader.getIndexExtensions(extensions); VariableGapTermsIndexReader.getIndexExtensions(extensions); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); extensions.add(SEED_EXT); //System.out.println("MockRandom.getExtensions return " + extensions); } @@ -381,11 +381,11 @@ public class MockRandomCodec extends Codec { // can we make this more evil? @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context); } } diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java index 7454cab7b5c..30cd3643657 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java @@ -54,7 +54,7 @@ import org.apache.lucene.util.BytesRef; public class MockSepCodec extends Codec { public MockSepCodec() { - name = "MockSep"; + super("MockSep"); } @Override @@ -139,13 +139,13 @@ public class MockSepCodec extends Codec { SepPostingsReaderImpl.files(segmentInfo, codecId, files); BlockTermsReader.files(dir, segmentInfo, codecId, files); FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS()); } @Override public void getExtensions(Set extensions) { getSepExtensions(extensions); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } public static void getSepExtensions(Set extensions) { @@ -156,11 +156,11 @@ public class MockSepCodec extends Codec { @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), 
getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context); } } diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java index d2edcca41b7..f911ef2b72b 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java @@ -37,7 +37,6 @@ public class PreFlexRWCodec extends PreFlexCodec { public PreFlexRWCodec() { // NOTE: we impersonate the PreFlex codec so that it can // read the segments we write! - super(); } @Override diff --git a/lucene/src/test-framework/org/apache/lucene/search/AssertingIndexSearcher.java b/lucene/src/test-framework/org/apache/lucene/search/AssertingIndexSearcher.java index 41541264955..3573ffc6fb5 100644 --- a/lucene/src/test-framework/org/apache/lucene/search/AssertingIndexSearcher.java +++ b/lucene/src/test-framework/org/apache/lucene/search/AssertingIndexSearcher.java @@ -62,12 +62,7 @@ public class AssertingIndexSearcher extends IndexSearcher { } @Override - public float getValue() { - return w.getValue(); - } - - @Override - public void normalize(float norm) { + public void normalize(float norm, float topLevelBoost) { throw new IllegalStateException("Weight already normalized."); } @@ -77,7 +72,7 @@ public class AssertingIndexSearcher extends IndexSearcher { } @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { throw new IllegalStateException("Weight already normalized."); } diff --git a/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java b/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java index 6f1d333cc3d..36362555e33 100644 --- a/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java +++ b/lucene/src/test-framework/org/apache/lucene/search/CheckHits.java @@ -329,9 +329,10 @@ public class CheckHits { Explanation detail[] = expl.getDetails(); if (detail!=null) { if (detail.length==1) { - // simple containment, no matter what the description says, + // simple containment, unless it's a freq of: (which lets a query explain how the freq is calculated), // just verify contained expl has same score - verifyExplanation(q,doc,score,deep,detail[0]); + if (!expl.getDescription().endsWith("with freq of:")) + verifyExplanation(q,doc,score,deep,detail[0]); } else { // explanation must either: // - end with one of: "product of:", "sum of:", "max of:", or @@ -357,6 +358,7 @@ } } } + // TODO: this is a TERRIBLE assertion!!!!
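+      // For reference, a schematic example of the explanation shape this
+      // carve-out admits (term and values invented; only the "with freq of:"
+      // suffix is what the check above keys on):
+      //
+      //   weight(field:term in 42) [DefaultSimilarity], result of:
+      //     ... with freq of:
+      //       phraseFreq=2.0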
Assert.assertTrue( q+": multi valued explanation description=\""+descr +"\" must be 'max of plus x times others' or end with 'product of'" diff --git a/lucene/src/test-framework/org/apache/lucene/store/MockCompoundFileDirectoryWrapper.java b/lucene/src/test-framework/org/apache/lucene/store/MockCompoundFileDirectoryWrapper.java index c7a592e89c7..cc465358750 100644 --- a/lucene/src/test-framework/org/apache/lucene/store/MockCompoundFileDirectoryWrapper.java +++ b/lucene/src/test-framework/org/apache/lucene/store/MockCompoundFileDirectoryWrapper.java @@ -19,7 +19,6 @@ package org.apache.lucene.store; import java.io.IOException; import java.util.Collection; -import java.util.Collections; public class MockCompoundFileDirectoryWrapper extends CompoundFileDirectory { private final MockDirectoryWrapper parent; @@ -31,11 +30,7 @@ public class MockCompoundFileDirectoryWrapper extends CompoundFileDirectory { this.name = name; this.parent = parent; this.delegate = delegate; - if (forWrite) { - super.initForWrite(); - } else { - super.initForRead(Collections.emptyMap()); - } + // don't initialize here since we delegate everything - if not initialized a direct call will cause an assert to fail! parent.addFileHandle(this, name, !forWrite); } @@ -51,12 +46,8 @@ public class MockCompoundFileDirectoryWrapper extends CompoundFileDirectory { @Override public synchronized void close() throws IOException { - try { - delegate.close(); - parent.removeOpenFile(this, name); - } finally { - super.close(); - } + delegate.close(); + parent.removeOpenFile(this, name); } @Override @@ -148,4 +139,11 @@ public class MockCompoundFileDirectoryWrapper extends CompoundFileDirectory { public CompoundFileDirectory createCompoundOutput(String name, IOContext context) throws IOException { return delegate.createCompoundOutput(name, context); } + + @Override + public CompoundFileDirectory openCompoundInput(String name, IOContext context) + throws IOException { + return delegate.openCompoundInput(name, context); + } + } diff --git a/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java b/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java index cee7d546c67..c8a016ff481 100644 --- a/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java @@ -242,7 +242,7 @@ public abstract class LuceneTestCase extends Assert { if (prior != null) { cp.unregister(prior); } - cp.register(c); + cp.register(randomizCodec(random, c)); } // returns current default codec @@ -280,7 +280,7 @@ public abstract class LuceneTestCase extends Assert { } swapCodec(new MockSepCodec(), cp); - swapCodec(new PulsingCodec(codecHasParam && "Pulsing".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 20)), cp); + swapCodec(new PulsingCodec(codecHasParam && "Pulsing".equals(codec) ? codecParam : 1 + random.nextInt(20)), cp); swapCodec(new MockFixedIntBlockCodec(codecHasParam && "MockFixedIntBlock".equals(codec) ? codecParam : _TestUtil.nextInt(random, 1, 2000)), cp); // baseBlockSize cannot be over 127: swapCodec(new MockVariableIntBlockCodec(codecHasParam && "MockVariableIntBlock".equals(codec) ? 
codecParam : _TestUtil.nextInt(random, 1, 127)), cp); @@ -288,6 +288,11 @@ public abstract class LuceneTestCase extends Assert { return cp.lookup(codec); } + + public static Codec randomizCodec(Random random, Codec codec) { + codec.setDocValuesUseCFS(random.nextBoolean()); + return codec; + } // returns current PreFlex codec static void removeTestCodecs(Codec codec, CodecProvider cp) { @@ -1493,11 +1498,11 @@ public abstract class LuceneTestCase extends Assert { RandomCodecProvider(Random random) { this.perFieldSeed = random.nextInt(); - register(new StandardCodec()); - register(new PreFlexCodec()); - register(new PulsingCodec(1)); - register(new SimpleTextCodec()); - register(new MemoryCodec()); + register(randomizCodec(random, new StandardCodec())); + register(randomizCodec(random, new PreFlexCodec())); + register(randomizCodec(random, new PulsingCodec( 1 + random.nextInt(20)))); + register(randomizCodec(random, new SimpleTextCodec())); + register(randomizCodec(random, new MemoryCodec())); Collections.shuffle(knownCodecs, random); } diff --git a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java index c3625abc4cf..50485853819 100644 --- a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java +++ b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java @@ -24,8 +24,6 @@ import org.apache.lucene.document.*; import org.apache.lucene.search.*; import org.apache.lucene.analysis.*; import org.apache.lucene.index.codecs.*; -import org.apache.lucene.index.codecs.standard.*; -import org.apache.lucene.index.codecs.pulsing.*; import org.apache.lucene.store.*; import java.util.*; import java.io.*; @@ -75,7 +73,7 @@ public class TestExternalCodecs extends LuceneTestCase { public static class RAMOnlyCodec extends Codec { public RAMOnlyCodec() { - name = "RamOnly"; + super("RamOnly"); } // Postings state: static class RAMPostings extends FieldsProducer { diff --git a/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java index 7dc161e509e..a2c4ec45711 100755 --- a/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -1161,7 +1161,7 @@ public class TestAddIndexes extends LuceneTestCase { IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)); CodecProvider provider = new CodecProvider(); - provider.register(new PulsingCodec(1 + random.nextInt(10))); + provider.register(new PulsingCodec(1 + random.nextInt(20))); conf.setCodecProvider(provider); IndexWriter w = new IndexWriter(dir, conf); try { @@ -1182,7 +1182,7 @@ public class TestAddIndexes extends LuceneTestCase { IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)); CodecProvider provider = new CodecProvider(); - provider.register(new PulsingCodec(1 + random.nextInt(10))); + provider.register(new PulsingCodec(1 + random.nextInt(20))); conf.setCodecProvider(provider); IndexWriter w = new IndexWriter(dir, conf); IndexReader indexReader = IndexReader.open(toAdd); diff --git a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index 44bff09084f..5a629dacd85 100644 --- a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -38,7 +38,6 @@ import 
org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Similarity; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.Directory; @@ -375,7 +374,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase { Term searchTerm = new Term("id", "6"); int delCount = reader.deleteDocuments(searchTerm); assertEquals("wrong delete count", 1, delCount); - reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", searcher.getSimilarityProvider().get("content").encodeNormValue(2.0f)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", sim.encodeNormValue(2.0f)); reader.close(); searcher.close(); @@ -421,7 +421,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase { Term searchTerm = new Term("id", "6"); int delCount = reader.deleteDocuments(searchTerm); assertEquals("wrong delete count", 1, delCount); - reader.setNorm(22, "content", searcher.getSimilarityProvider().get("content").encodeNormValue(2.0f)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(22, "content", sim.encodeNormValue(2.0f)); reader.close(); // make sure they "took": @@ -483,7 +484,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase { assertEquals("didn't delete the right number of documents", 1, delCount); // Set one norm so we get a .s0 file: - reader.setNorm(21, "content", conf.getSimilarityProvider().get("content").encodeNormValue(1.5f)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.close(); } @@ -526,7 +528,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { assertEquals("didn't delete the right number of documents", 1, delCount); // Set one norm so we get a .s0 file: - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestCompoundFile.java b/lucene/src/test/org/apache/lucene/index/TestCompoundFile.java index 262a1a3d7e2..f6acb3b060f 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCompoundFile.java +++ b/lucene/src/test/org/apache/lucene/index/TestCompoundFile.java @@ -21,10 +21,9 @@ import java.io.IOException; import java.io.File; import org.apache.lucene.util.LuceneTestCase; -import junit.framework.TestSuite; -import junit.textui.TestRunner; import org.apache.lucene.store.CompoundFileDirectory; +import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; @@ -35,27 +34,8 @@ import org.apache.lucene.util._TestUtil; public class TestCompoundFile extends LuceneTestCase { - /** Main for running test case by itself. 
*/ - public static void main(String args[]) { - TestRunner.run (new TestSuite(TestCompoundFile.class)); -// TestRunner.run (new TestCompoundFile("testSingleFile")); -// TestRunner.run (new TestCompoundFile("testTwoFiles")); -// TestRunner.run (new TestCompoundFile("testRandomFiles")); -// TestRunner.run (new TestCompoundFile("testClonedStreamsClosing")); -// TestRunner.run (new TestCompoundFile("testReadAfterClose")); -// TestRunner.run (new TestCompoundFile("testRandomAccess")); -// TestRunner.run (new TestCompoundFile("testRandomAccessClones")); -// TestRunner.run (new TestCompoundFile("testFileNotFound")); -// TestRunner.run (new TestCompoundFile("testReadPastEOF")); - -// TestRunner.run (new TestCompoundFile("testIWCreate")); - - } - - private Directory dir; - @Override public void setUp() throws Exception { super.setUp(); @@ -323,13 +303,13 @@ public class TestCompoundFile extends LuceneTestCase throws IOException { // Setup the test file - we need more than 1024 bytes - IndexOutput os = fsdir.createOutput(file, newIOContext(random)); + IndexOutput os = fsdir.createOutput(file, IOContext.DEFAULT); for(int i=0; i<2000; i++) { os.writeByte((byte) i); } os.close(); - IndexInput in = fsdir.openInput(file, newIOContext(random)); + IndexInput in = fsdir.openInput(file, IOContext.DEFAULT); // This read primes the buffer in IndexInput in.readByte(); @@ -717,5 +697,74 @@ public class TestCompoundFile extends LuceneTestCase cfr.close(); newDir.close(); } + + public void testEmptyCFS() throws IOException { + Directory newDir = newDirectory(); + CompoundFileDirectory csw = newDir.createCompoundOutput("d.cfs", newIOContext(random)); + csw.close(); + CompoundFileDirectory csr = newDir.openCompoundInput("d.cfs", newIOContext(random)); + assertEquals(0, csr.listAll().length); + csr.close(); + + newDir.close(); + } + + public void testReadNestedCFP() throws IOException { + Directory newDir = newDirectory(); + CompoundFileDirectory csw = newDir.createCompoundOutput("d.cfs", newIOContext(random)); + CompoundFileDirectory nested = newDir.createCompoundOutput("b.cfs", newIOContext(random)); + IndexOutput out = nested.createOutput("b.xyz", newIOContext(random)); + IndexOutput out1 = nested.createOutput("b_1.xyz", newIOContext(random)); + out.writeInt(0); + out1.writeInt(1); + out.close(); + out1.close(); + nested.close(); + newDir.copy(csw, "b.cfs", "b.cfs", newIOContext(random)); + newDir.copy(csw, "b.cfe", "b.cfe", newIOContext(random)); + newDir.deleteFile("b.cfs"); + newDir.deleteFile("b.cfe"); + csw.close(); + + assertEquals(2, newDir.listAll().length); + csw = newDir.openCompoundInput("d.cfs", newIOContext(random)); + + assertEquals(2, csw.listAll().length); + nested = csw.openCompoundInput("b.cfs", newIOContext(random)); + + assertEquals(2, nested.listAll().length); + IndexInput openInput = nested.openInput("b.xyz", newIOContext(random)); + assertEquals(0, openInput.readInt()); + openInput.close(); + openInput = nested.openInput("b_1.xyz", newIOContext(random)); + assertEquals(1, openInput.readInt()); + openInput.close(); + nested.close(); + csw.close(); + newDir.close(); + } + + public void testDoubleClose() throws IOException { + Directory newDir = newDirectory(); + CompoundFileDirectory csw = newDir.createCompoundOutput("d.cfs", newIOContext(random)); + IndexOutput out = csw.createOutput("d.xyz", newIOContext(random)); + out.writeInt(0); + out.close(); + + csw.close(); + // close a second time - must have no effect according to Closeable + csw.close(); + + csw = newDir.openCompoundInput("d.cfs", 
newIOContext(random)); + IndexInput openInput = csw.openInput("d.xyz", newIOContext(random)); + assertEquals(0, openInput.readInt()); + openInput.close(); + csw.close(); + // close a second time - must have no effect according to Closeable + csw.close(); + + newDir.close(); + + } } diff --git a/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java b/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java index f46fd29fbe8..9c4994ebb43 100644 --- a/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java +++ b/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -655,7 +656,8 @@ public class TestDeletionPolicy extends LuceneTestCase { writer.close(); IndexReader reader = IndexReader.open(dir, policy, false); reader.deleteDocument(3*i+1); - reader.setNorm(4*i+1, "content", conf.getSimilarityProvider().get("content").encodeNormValue(2.0F)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(4*i+1, "content", sim.encodeNormValue(2.0F)); IndexSearcher searcher = newSearcher(reader); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(16*(1+i), hits.length); @@ -781,7 +783,8 @@ public class TestDeletionPolicy extends LuceneTestCase { writer.close(); IndexReader reader = IndexReader.open(dir, policy, false); reader.deleteDocument(3); - reader.setNorm(5, "content", conf.getSimilarityProvider().get("content").encodeNormValue(2.0F)); + DefaultSimilarity sim = new DefaultSimilarity(); + reader.setNorm(5, "content", sim.encodeNormValue(2.0F)); IndexSearcher searcher = newSearcher(reader); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(16, hits.length); diff --git a/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java b/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java index d1affdcedff..b568279d572 100644 --- a/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java +++ b/lucene/src/test/org/apache/lucene/index/TestDocTermOrds.java @@ -105,8 +105,9 @@ public class TestDocTermOrds extends LuceneTestCase { } private static class StandardCodecWithOrds extends Codec { + public StandardCodecWithOrds() { - name = "StandardOrds"; + super("StandardOrds"); } @Override @@ -200,13 +201,13 @@ public class TestDocTermOrds extends LuceneTestCase { StandardPostingsReader.files(dir, segmentInfo, id, files); BlockTermsReader.files(dir, segmentInfo, id, files); FixedGapTermsIndexReader.files(dir, segmentInfo, id, files); - DefaultDocValuesConsumer.files(dir, segmentInfo, id, files); + DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS()); } @Override public void getExtensions(Set extensions) { getStandardExtensions(extensions); - DefaultDocValuesConsumer.getDocValuesExtensions(extensions); + DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS()); } public static void getStandardExtensions(Set extensions) { @@ -218,12 +219,12 @@ public class TestDocTermOrds extends LuceneTestCase { @Override public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException { - return new DefaultDocValuesConsumer(state, 
BytesRef.getUTF8SortedAsUnicodeComparator()); + return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS()); } @Override public PerDocValues docsProducer(SegmentReadState state) throws IOException { - return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, state.context); + return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator(), state.context); } } diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java b/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java index 9561b5194ea..a87bcb9e776 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java @@ -71,7 +71,7 @@ public class TestIndexFileDeleter extends LuceneTestCase { Term searchTerm = new Term("id", "7"); int delCount = reader.deleteDocuments(searchTerm); assertEquals("didn't delete the right number of documents", 1, delCount); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); // Set one norm so we get a .s0 file: reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java index 05b2f3c951b..7965406ff98 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java @@ -421,7 +421,7 @@ public class TestIndexReader extends LuceneTestCase // expected } - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); try { reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); fail("setNorm after close failed to throw IOException"); @@ -462,7 +462,7 @@ public class TestIndexReader extends LuceneTestCase // expected } - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); try { reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); fail("setNorm should have hit LockObtainFailedException"); @@ -494,7 +494,7 @@ public class TestIndexReader extends LuceneTestCase // now open reader & set norm for doc 0 IndexReader reader = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader.setNorm(0, "content", sim.encodeNormValue(2.0f)); // we should be holding the write lock now: @@ -539,7 +539,7 @@ public class TestIndexReader extends LuceneTestCase addDoc(writer, searchTerm.text()); writer.close(); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); // now open reader & set norm for doc 0 (writes to // _0_1.s0) reader = IndexReader.open(dir, false); @@ -738,7 +738,7 @@ public class TestIndexReader extends LuceneTestCase } reader = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); try { reader.setNorm(1, "content", sim.encodeNormValue(2.0f)); fail("did not hit exception when calling setNorm on an invalid doc number"); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java index bcbf857a195..9b6c4d24fd2 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java +++ 
b/lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java @@ -273,7 +273,7 @@ public class TestIndexReaderClone extends LuceneTestCase { * @throws Exception */ private void performDefaultTests(IndexReader r1) throws Exception { - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); float norm1 = sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]); IndexReader pr1Clone = (IndexReader) r1.clone(); @@ -329,7 +329,7 @@ public class TestIndexReaderClone extends LuceneTestCase { TestIndexReaderReopen.createIndex(random, dir1, false); SegmentReader origSegmentReader = getOnlySegmentReader(IndexReader.open(dir1, false)); origSegmentReader.deleteDocument(1); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); origSegmentReader.setNorm(4, "field1", sim.encodeNormValue(0.5f)); SegmentReader clonedSegmentReader = (SegmentReader) origSegmentReader @@ -429,7 +429,7 @@ public class TestIndexReaderClone extends LuceneTestCase { final Directory dir1 = newDirectory(); TestIndexReaderReopen.createIndex(random, dir1, false); IndexReader orig = IndexReader.open(dir1, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); orig.setNorm(1, "field1", sim.encodeNormValue(17.0f)); final byte encoded = sim.encodeNormValue(17.0f); assertEquals(encoded, MultiNorms.norms(orig, "field1")[1]); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java index 32cef3ea5db..228d03331da 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java @@ -47,9 +47,9 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase { public Similarity get(String field) { return new DefaultSimilarity() { @Override - public float computeNorm(FieldInvertState state) { + public byte computeNorm(FieldInvertState state) { // disable length norm - return state.getBoost(); + return encodeNormValue(state.getBoost()); } }; } @@ -217,7 +217,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase { IndexReader reader4C = (IndexReader) reader3C.clone(); SegmentReader segmentReader4C = getOnlySegmentReader(reader4C); assertEquals(4, reader3CCNorm.bytesRef().get()); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader4C.setNorm(5, "field1", sim.encodeNormValue(0.33f)); // generate a cannot update exception in reader1 @@ -278,7 +278,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase { // System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(k, Float.valueOf(origNorm)); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); ir.setNorm(i, "f" + 1, sim.encodeNormValue(newNorm)); ir.setNorm(k, "f" + 1, sim.encodeNormValue(origNorm)); // System.out.println("setNorm i: "+i); @@ -300,7 +300,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase { assertEquals("number of norms mismatches", numDocNorms, b.length); ArrayList storedNorms = (i == 1 ?
modifiedNorms : norms); for (int j = 0; j < b.length; j++) { - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); float norm = sim.decodeNormValue(b[j]); float norm1 = storedNorms.get(j).floatValue(); assertEquals("stored norm value of " + field + " for doc " + j + " is " @@ -340,7 +340,7 @@ public class TestIndexReaderCloneNorms extends LuceneTestCase { // return unique norm values that are unchanged by encoding/decoding private float nextNorm(String fname) { float norm = lastNorm + normDelta; - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); do { float norm1 = sim.decodeNormValue( sim.encodeNormValue(norm)); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java index 86ea0f7514c..058939eee01 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderOnDiskFull.java @@ -131,7 +131,7 @@ public class TestIndexReaderOnDiskFull extends LuceneTestCase { dir.setMaxSizeInBytes(thisDiskFree); dir.setRandomIOExceptionRate(rate); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); try { if (0 == x) { int docId = 12; diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java b/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java index f9277eab9a9..3200204df5f 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java @@ -606,7 +606,7 @@ public class TestIndexReaderReopen extends LuceneTestCase { IndexReader reader2 = reader1.reopen(); modifier = IndexReader.open(dir1, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); modifier.setNorm(1, "field1", sim.encodeNormValue(50f)); modifier.setNorm(1, "field2", sim.encodeNormValue(50f)); modifier.close(); @@ -702,7 +702,7 @@ public class TestIndexReaderReopen extends LuceneTestCase { protected void modifyIndex(int i) throws IOException { if (i % 3 == 0) { IndexReader modifier = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); modifier.setNorm(i, "field1", sim.encodeNormValue(50f)); modifier.close(); } else if (i % 3 == 1) { @@ -983,7 +983,7 @@ public class TestIndexReaderReopen extends LuceneTestCase { } case 1: { IndexReader reader = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader.setNorm(4, "field1", sim.encodeNormValue(123f)); reader.setNorm(44, "field2", sim.encodeNormValue(222f)); reader.setNorm(44, "field4", sim.encodeNormValue(22f)); @@ -1007,7 +1007,7 @@ public class TestIndexReaderReopen extends LuceneTestCase { } case 4: { IndexReader reader = IndexReader.open(dir, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); reader.setNorm(5, "field1", sim.encodeNormValue(123f)); reader.setNorm(55, "field2", sim.encodeNormValue(222f)); reader.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java b/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java index d81d3a404be..9744008ece6 100644 --- a/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java +++ 
b/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java @@ -116,8 +116,8 @@ public class TestMaxTermFrequency extends LuceneTestCase { } @Override - public float computeNorm(FieldInvertState state) { - return (float) state.getMaxTermFrequency(); + public byte computeNorm(FieldInvertState state) { + return encodeNormValue((float) state.getMaxTermFrequency()); } } } diff --git a/lucene/src/test/org/apache/lucene/index/TestNorms.java b/lucene/src/test/org/apache/lucene/index/TestNorms.java index 3a8b295f287..372ae2ef964 100755 --- a/lucene/src/test/org/apache/lucene/index/TestNorms.java +++ b/lucene/src/test/org/apache/lucene/index/TestNorms.java @@ -46,9 +46,9 @@ public class TestNorms extends LuceneTestCase { public Similarity get(String field) { return new DefaultSimilarity() { @Override - public float computeNorm(FieldInvertState state) { + public byte computeNorm(FieldInvertState state) { // disable length norm - return state.getBoost(); + return encodeNormValue(state.getBoost()); } }; } @@ -177,7 +177,7 @@ public class TestNorms extends LuceneTestCase { //System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(k, Float.valueOf(origNorm)); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); ir.setNorm(i, "f"+1, sim.encodeNormValue(newNorm)); ir.setNorm(k, "f"+1, sim.encodeNormValue(origNorm)); } @@ -192,8 +192,9 @@ public class TestNorms extends LuceneTestCase { byte b[] = MultiNorms.norms(ir, field); assertEquals("number of norms mismatches",numDocNorms,b.length); ArrayList storedNorms = (i==1 ? modifiedNorms : norms); + DefaultSimilarity sim = (DefaultSimilarity) similarityProviderOne.get(field); for (int j = 0; j < b.length; j++) { - float norm = similarityProviderOne.get(field).decodeNormValue(b[j]); + float norm = sim.decodeNormValue(b[j]); float norm1 = storedNorms.get(j).floatValue(); assertEquals("stored norm value of "+field+" for doc "+j+" is "+norm+" - a mismatch!", norm, norm1, 0.000001); } @@ -229,7 +230,7 @@ public class TestNorms extends LuceneTestCase { // return unique norm values that are unchanged by encoding/decoding private float nextNorm(String fname) { float norm = lastNorm + normDelta; - Similarity similarity = similarityProviderOne.get(fname); + DefaultSimilarity similarity = (DefaultSimilarity) similarityProviderOne.get(fname); do { float norm1 = similarity.decodeNormValue(similarity.encodeNormValue(norm)); if (norm1 > lastNorm) { @@ -259,8 +260,8 @@ public class TestNorms extends LuceneTestCase { } @Override - public float computeNorm(FieldInvertState state) { - return (float) state.getLength(); + public byte computeNorm(FieldInvertState state) { + return encodeNormValue((float) state.getLength()); } } diff --git a/lucene/src/test/org/apache/lucene/index/TestOmitTf.java b/lucene/src/test/org/apache/lucene/index/TestOmitTf.java index cf7ecbd16ef..efef48a9729 100644 --- a/lucene/src/test/org/apache/lucene/index/TestOmitTf.java +++ b/lucene/src/test/org/apache/lucene/index/TestOmitTf.java @@ -18,9 +18,9 @@ package org.apache.lucene.index; */ import java.io.IOException; -import java.util.Collection; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TermContext; import org.apache.lucene.util._TestUtil; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; @@ -30,7 +30,6 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.*;
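// The computeNorm hunks above and below all apply the same mechanical
// migration: computeNorm now returns the already-encoded byte, so a custom
// similarity wraps its float in encodeNormValue. A minimal sketch of the
// pattern (class name invented):
//
//   public class BoostOnlySimilarity extends DefaultSimilarity {
//     @Override
//     public byte computeNorm(FieldInvertState state) {
//       return encodeNormValue(state.getBoost()); // boost only, no length norm
//     }
//   }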
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.Directory; -import org.apache.lucene.search.Explanation.IDFExplanation; public class TestOmitTf extends LuceneTestCase { @@ -39,23 +38,14 @@ public class TestOmitTf extends LuceneTestCase { public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } public float coord(int overlap, int maxOverlap) { return 1.0f; } public Similarity get(String field) { - return new Similarity() { + return new TFIDFSimilarity() { - @Override public float computeNorm(FieldInvertState state) { return state.getBoost(); } + @Override public byte computeNorm(FieldInvertState state) { return encodeNormValue(state.getBoost()); } @Override public float tf(float freq) { return freq; } @Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float idf(int docFreq, int numDocs) { return 1.0f; } - @Override public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { - return new IDFExplanation() { - @Override - public float getIdf() { - return 1.0f; - } - @Override - public String explain() { - return "Inexplicable"; - } - }; + @Override public Explanation idfExplain(TermContext[] terms, IndexSearcher searcher) throws IOException { + return new Explanation(1.0f, "Inexplicable"); } }; } diff --git a/lucene/src/test/org/apache/lucene/index/TestParallelReader.java b/lucene/src/test/org/apache/lucene/index/TestParallelReader.java index 6b5dc4eea04..b2d0b3cd26a 100644 --- a/lucene/src/test/org/apache/lucene/index/TestParallelReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestParallelReader.java @@ -149,7 +149,7 @@ public class TestParallelReader extends LuceneTestCase { assertTrue(pr.isCurrent()); IndexReader modifier = IndexReader.open(dir1, false); - Similarity sim = new DefaultSimilarity(); + DefaultSimilarity sim = new DefaultSimilarity(); modifier.setNorm(0, "f1", sim.encodeNormValue(100f)); modifier.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestPerFieldCodecSupport.java b/lucene/src/test/org/apache/lucene/index/TestPerFieldCodecSupport.java index 2108b687384..a8e7d79838f 100644 --- a/lucene/src/test/org/apache/lucene/index/TestPerFieldCodecSupport.java +++ b/lucene/src/test/org/apache/lucene/index/TestPerFieldCodecSupport.java @@ -279,7 +279,7 @@ public class TestPerFieldCodecSupport extends LuceneTestCase { CodecProvider provider = new CodecProvider(); Codec[] codecs = new Codec[] { new StandardCodec(), new SimpleTextCodec(), new MockSepCodec(), - new PulsingCodec(1 + random.nextInt(10)), + new PulsingCodec(1 + random.nextInt(20)), new MockVariableIntBlockCodec(1 + random.nextInt(10)), new MockFixedIntBlockCodec(1 + random.nextInt(10)) }; for (Codec codec : codecs) { diff --git a/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java b/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java index d4981aa3093..2fc02a224b4 100644 --- a/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java +++ b/lucene/src/test/org/apache/lucene/index/values/TestDocValues.java @@ -81,7 +81,7 @@ public class TestDocValues extends LuceneTestCase { w.finish(maxDoc); assertEquals(0, trackBytes.get()); - IndexDocValues r = Bytes.getValues(dir, "test", mode, fixedSize, maxDoc, newIOContext(random)); + IndexDocValues r = Bytes.getValues(dir, "test", mode, fixedSize, maxDoc, comp, newIOContext(random)); for (int iter = 0; iter < 2; iter++) { ValuesEnum bytesEnum = getEnum(r); assertNotNull("enum is null", bytesEnum); @@ -105,7 +105,8 
@@ public class TestDocValues extends LuceneTestCase { Source s; IndexDocValues.SortedSource ss; if (mode == Bytes.Mode.SORTED) { - s = ss = getSortedSource(r, comp); + // default is unicode so we can simply pass null here + s = ss = getSortedSource(r, random.nextBoolean() ? comp : null); } else { s = getSource(r); ss = null; diff --git a/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java b/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java index 13e7f8145c3..167d10e696c 100644 --- a/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java +++ b/lucene/src/test/org/apache/lucene/search/JustCompileSearch.java @@ -20,7 +20,11 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.Similarity.ExactDocScorer; +import org.apache.lucene.search.Similarity.SloppyDocScorer; +import org.apache.lucene.search.Similarity.Stats; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TermContext; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.util.PriorityQueue; @@ -187,8 +191,8 @@ final class JustCompileSearch { static final class JustCompilePhraseScorer extends PhraseScorer { JustCompilePhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity similarity, byte[] norms) { - super(weight, postings, similarity, norms); + Similarity.SloppyDocScorer docScorer) throws IOException { + super(weight, postings, docScorer); } @Override @@ -243,12 +247,22 @@ final class JustCompileSearch { static final class JustCompileSimilarity extends Similarity { @Override - public float idf(int docFreq, int numDocs) { + public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... 
termContexts) throws IOException { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } @Override - public float computeNorm(FieldInvertState state) { + public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + throw new UnsupportedOperationException(UNSUPPORTED_MSG); + } + + @Override + public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + throw new UnsupportedOperationException(UNSUPPORTED_MSG); + } + + @Override + public byte computeNorm(FieldInvertState state) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } @@ -256,11 +270,6 @@ final class JustCompileSearch { public float sloppyFreq(int distance) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } - - @Override - public float tf(float freq) { - throw new UnsupportedOperationException(UNSUPPORTED_MSG); - } } static final class JustCompileSimilarityProvider implements SimilarityProvider { @@ -348,17 +357,12 @@ final class JustCompileSearch { } @Override - public float getValue() { + public void normalize(float norm, float topLevelBoost) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } @Override - public void normalize(float norm) { - throw new UnsupportedOperationException(UNSUPPORTED_MSG); - } - - @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } diff --git a/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java b/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java index e8a6b69a948..71b96a45a7a 100644 --- a/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java @@ -62,9 +62,9 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase { } @Override - public float computeNorm(FieldInvertState state) { + public byte computeNorm(FieldInvertState state) { // Disable length norm - return state.getBoost(); + return encodeNormValue(state.getBoost()); } @Override diff --git a/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java b/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java new file mode 100644 index 00000000000..2281000eff3 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/search/TestDocValuesScoring.java @@ -0,0 +1,203 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IndexDocValuesField; +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.values.IndexDocValues.Source; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TermContext; + +/** + * Tests the use of IndexDocValues in scoring. + * + * In the example, a docvalues field is used as a per-document boost (separate from the norm). + * @lucene.experimental + */ +public class TestDocValuesScoring extends LuceneTestCase { + private static final float SCORE_EPSILON = 0.001f; /* for comparing floats */ + + public void testSimple() throws Exception { + assumeFalse("PreFlex codec cannot work with IndexDocValues!", + "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec())); + + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + Field field = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field); + IndexDocValuesField dvField = new IndexDocValuesField("foo_boost"); + doc.add(dvField); + Field field2 = newField("bar", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field2); + + field.setValue("quick brown fox"); + field2.setValue("quick brown fox"); + dvField.setFloat(2f); // boost x2 + iw.addDocument(doc); + field.setValue("jumps over lazy brown dog"); + field2.setValue("jumps over lazy brown dog"); + dvField.setFloat(4f); // boost x4 + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + + // no boosting + IndexSearcher searcher1 = newSearcher(ir); + // boosting + IndexSearcher searcher2 = newSearcher(ir); + searcher2.setSimilarityProvider(new DefaultSimilarityProvider() { + final Similarity fooSim = new BoostingSimilarity(super.get("foo"), "foo_boost"); + + public Similarity get(String field) { + return "foo".equals(field) ? fooSim : super.get(field); + } + }); + + // in this case, we searched on field "foo": the first document should have 2x the score. + TermQuery tq = new TermQuery(new Term("foo", "quick")); + QueryUtils.check(random, tq, searcher1); + QueryUtils.check(random, tq, searcher2); + + TopDocs noboost = searcher1.search(tq, 10); + TopDocs boost = searcher2.search(tq, 10); + assertEquals(1, noboost.totalHits); + assertEquals(1, boost.totalHits); + + //System.out.println(searcher2.explain(tq, boost.scoreDocs[0].doc)); + assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*2f, SCORE_EPSILON); + + // this query matches only the second document, which should have 4x the score. + tq = new TermQuery(new Term("foo", "jumps")); + QueryUtils.check(random, tq, searcher1); + QueryUtils.check(random, tq, searcher2); + + noboost = searcher1.search(tq, 10); + boost = searcher2.search(tq, 10); + assertEquals(1, noboost.totalHits); + assertEquals(1, boost.totalHits); + + assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*4f, SCORE_EPSILON); + + // search on field bar just for kicks; nothing should happen, since we set up + // our sim provider to only use foo_boost for field foo.
+ tq = new TermQuery(new Term("bar", "quick")); + QueryUtils.check(random, tq, searcher1); + QueryUtils.check(random, tq, searcher2); + + noboost = searcher1.search(tq, 10); + boost = searcher2.search(tq, 10); + assertEquals(1, noboost.totalHits); + assertEquals(1, boost.totalHits); + + assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score, SCORE_EPSILON); + + + searcher1.close(); + searcher2.close(); + ir.close(); + dir.close(); + } + + /** + * Similarity that wraps another similarity and boosts the final score + * according to what's in a docvalues field. + * + * @lucene.experimental + */ + static class BoostingSimilarity extends Similarity { + private final Similarity sim; + private final String boostField; + + public BoostingSimilarity(Similarity sim, String boostField) { + this.sim = sim; + this.boostField = boostField; + } + + @Override + public byte computeNorm(FieldInvertState state) { + return sim.computeNorm(state); + } + + @Override + public float sloppyFreq(int distance) { + return sim.sloppyFreq(distance); + } + + @Override + public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException { + return sim.computeStats(searcher, fieldName, queryBoost, termContexts); + } + + @Override + public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + final ExactDocScorer sub = sim.exactDocScorer(stats, fieldName, context); + final Source values = context.reader.docValues(boostField).getSource(); + + return new ExactDocScorer() { + @Override + public float score(int doc, int freq) { + return (float) values.getFloat(doc) * sub.score(doc, freq); + } + + @Override + public Explanation explain(int doc, Explanation freq) { + Explanation boostExplanation = new Explanation((float) values.getFloat(doc), "indexDocValue(" + boostField + ")"); + Explanation simExplanation = sub.explain(doc, freq); + Explanation expl = new Explanation(boostExplanation.getValue() * simExplanation.getValue(), "product of:"); + expl.addDetail(boostExplanation); + expl.addDetail(simExplanation); + return expl; + } + }; + } + + @Override + public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException { + final SloppyDocScorer sub = sim.sloppyDocScorer(stats, fieldName, context); + final Source values = context.reader.docValues(boostField).getSource(); + + return new SloppyDocScorer() { + @Override + public float score(int doc, float freq) { + return (float) values.getFloat(doc) * sub.score(doc, freq); + } + + @Override + public Explanation explain(int doc, Explanation freq) { + Explanation boostExplanation = new Explanation((float) values.getFloat(doc), "indexDocValue(" + boostField + ")"); + Explanation simExplanation = sub.explain(doc, freq); + Explanation expl = new Explanation(boostExplanation.getValue() * simExplanation.getValue(), "product of:"); + expl.addDetail(boostExplanation); + expl.addDetail(simExplanation); + return expl; + } + }; + } + } +} diff --git a/lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java b/lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java index 634435844d1..c60a8becae8 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java @@ -49,34 +49,12 @@ public class TestMatchAllDocsQuery extends LuceneTestCase { IndexSearcher is = newSearcher(ir); ScoreDoc[] hits; - //
assert with norms scoring turned off - hits = is.search(new MatchAllDocsQuery(), null, 1000).scoreDocs; assertEquals(3, hits.length); assertEquals("one", is.doc(hits[0].doc).get("key")); assertEquals("two", is.doc(hits[1].doc).get("key")); assertEquals("three four", is.doc(hits[2].doc).get("key")); - // assert with norms scoring turned on - - MatchAllDocsQuery normsQuery = new MatchAllDocsQuery("key"); - hits = is.search(normsQuery, null, 1000).scoreDocs; - assertEquals(3, hits.length); - - assertEquals("three four", is.doc(hits[0].doc).get("key")); - assertEquals("two", is.doc(hits[1].doc).get("key")); - assertEquals("one", is.doc(hits[2].doc).get("key")); - - // change norm & retest - is.getIndexReader().setNorm(0, "key", is.getSimilarityProvider().get("key").encodeNormValue(400f)); - normsQuery = new MatchAllDocsQuery("key"); - hits = is.search(normsQuery, null, 1000).scoreDocs; - assertEquals(3, hits.length); - - assertEquals("one", is.doc(hits[0].doc).get("key")); - assertEquals("three four", is.doc(hits[1].doc).get("key")); - assertEquals("two", is.doc(hits[2].doc).get("key")); - // some artificial queries to trigger the use of skipTo(): BooleanQuery bq = new BooleanQuery(); diff --git a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java index 02b876e0204..c434b1d9c87 100644 --- a/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestMultiPhraseQuery.java @@ -24,9 +24,9 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.queryParser.ParseException; -import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TermContext; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -312,21 +312,9 @@ public class TestMultiPhraseQuery extends LuceneTestCase { return new DefaultSimilarity() { @Override - public IDFExplanation idfExplain(Collection terms, + public Explanation idfExplain(TermContext stats[], IndexSearcher searcher) throws IOException { - return new IDFExplanation() { - - @Override - public float getIdf() { - return 10f; - } - - @Override - public String explain() { - return "just a test"; - } - - }; + return new Explanation(10f, "just a test"); } }; } @@ -336,7 +324,7 @@ public class TestMultiPhraseQuery extends LuceneTestCase { query.add(new Term[] { new Term("body", "this"), new Term("body", "that") }); query.add(new Term("body", "is")); Weight weight = query.createWeight(searcher); - assertEquals(10f * 10f, weight.sumOfSquaredWeights(), 0.001f); + assertEquals(10f * 10f, weight.getValueForNormalization(), 0.001f); writer.close(); searcher.close(); diff --git a/lucene/src/test/org/apache/lucene/search/TestSetNorm.java b/lucene/src/test/org/apache/lucene/search/TestSetNorm.java index 906aeb039b9..72245e1207f 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSetNorm.java +++ b/lucene/src/test/org/apache/lucene/search/TestSetNorm.java @@ -50,7 +50,7 @@ public class TestSetNorm extends LuceneTestCase { // reset the boost of each instance of this document IndexReader reader = IndexReader.open(store, false); - Similarity similarity = new DefaultSimilarity(); + DefaultSimilarity similarity = new DefaultSimilarity(); 
reader.setNorm(0, "field", similarity.encodeNormValue(1.0f)); reader.setNorm(1, "field", similarity.encodeNormValue(2.0f)); reader.setNorm(2, "field", similarity.encodeNormValue(4.0f)); diff --git a/lucene/src/test/org/apache/lucene/search/TestSimilarity.java b/lucene/src/test/org/apache/lucene/search/TestSimilarity.java index 3afeb25566f..55c62248a53 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSimilarity.java +++ b/lucene/src/test/org/apache/lucene/search/TestSimilarity.java @@ -18,8 +18,9 @@ package org.apache.lucene.search; */ import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TermContext; + import java.io.IOException; -import java.util.Collection; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; @@ -30,7 +31,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.search.Explanation.IDFExplanation; /** Similarity unit test. * @@ -42,22 +42,13 @@ public class TestSimilarity extends LuceneTestCase { public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } public float coord(int overlap, int maxOverlap) { return 1.0f; } public Similarity get(String field) { - return new Similarity() { - @Override public float computeNorm(FieldInvertState state) { return state.getBoost(); } + return new DefaultSimilarity() { + @Override public byte computeNorm(FieldInvertState state) { return encodeNormValue(state.getBoost()); } @Override public float tf(float freq) { return freq; } @Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float idf(int docFreq, int numDocs) { return 1.0f; } - @Override public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { - return new IDFExplanation() { - @Override - public float getIdf() { - return 1.0f; - } - @Override - public String explain() { - return "Inexplicable"; - } - }; + @Override public Explanation idfExplain(TermContext[] stats, IndexSearcher searcher) throws IOException { + return new Explanation(1.0f, "Inexplicable"); } }; } diff --git a/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java b/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java index 7a9d6410863..1bf30e3b773 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java +++ b/lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java @@ -105,10 +105,10 @@ public class TestSimilarityProvider extends LuceneTestCase { } } - private class Sim1 extends Similarity { + private class Sim1 extends TFIDFSimilarity { @Override - public float computeNorm(FieldInvertState state) { - return 1f; + public byte computeNorm(FieldInvertState state) { + return encodeNormValue(1f); } @Override @@ -127,10 +127,10 @@ public class TestSimilarityProvider extends LuceneTestCase { } } - private class Sim2 extends Similarity { + private class Sim2 extends TFIDFSimilarity { @Override - public float computeNorm(FieldInvertState state) { - return 10f; + public byte computeNorm(FieldInvertState state) { + return encodeNormValue(10f); } @Override diff --git a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java index 5c115d5cf9f..962eab069c2 100644 --- a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java +++ 
b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java @@ -17,7 +17,6 @@ package org.apache.lucene.search.payloads; */ import java.io.IOException; import java.io.Reader; -import java.util.Collection; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockTokenizer; @@ -45,7 +44,7 @@ import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.util.TermContext; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -325,8 +324,8 @@ public class TestPayloadNearQuery extends LuceneTestCase { //Make everything else 1 so we see the effect of the payload //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @Override - public float computeNorm(FieldInvertState state) { - return state.getBoost(); + public byte computeNorm(FieldInvertState state) { + return encodeNormValue(state.getBoost()); } @Override @@ -341,18 +340,8 @@ public class TestPayloadNearQuery extends LuceneTestCase { // idf used for phrase queries @Override - public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { - return new IDFExplanation() { - @Override - public float getIdf() { - return 1.0f; - } - - @Override - public String explain() { - return "Inexplicable"; - } - }; + public Explanation idfExplain(TermContext states[], IndexSearcher searcher) throws IOException { + return new Explanation(1.0f, "Inexplicable"); } }; } diff --git a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java index 9ed0db35a44..ea35f60cb56 100644 --- a/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java +++ b/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java @@ -318,8 +318,8 @@ public class TestPayloadTermQuery extends LuceneTestCase { //Make everything else 1 so we see the effect of the payload //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
@Override - public float computeNorm(FieldInvertState state) { - return state.getBoost(); + public byte computeNorm(FieldInvertState state) { + return encodeNormValue(state.getBoost()); } @Override diff --git a/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java b/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java index ac0f45d767b..4adceca1bb7 100644 --- a/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java +++ b/lucene/src/test/org/apache/lucene/search/spans/JustCompileSearchSpans.java @@ -135,8 +135,8 @@ final class JustCompileSearchSpans { static final class JustCompileSpanScorer extends SpanScorer { protected JustCompileSpanScorer(Spans spans, Weight weight, - Similarity similarity, byte[] norms) throws IOException { - super(spans, weight, similarity, norms); + Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException { + super(spans, weight, similarity, docScorer); } @Override diff --git a/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java b/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java index a066d1eba26..edc1516be98 100644 --- a/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java +++ b/modules/join/src/java/org/apache/lucene/search/join/BlockJoinQuery.java @@ -133,18 +133,13 @@ public class BlockJoinQuery extends Query { } @Override - public float getValue() { - return childWeight.getValue(); + public float getValueForNormalization() throws IOException { + return childWeight.getValueForNormalization(); } @Override - public float sumOfSquaredWeights() throws IOException { - return childWeight.sumOfSquaredWeights(); - } - - @Override - public void normalize(float norm) { - childWeight.normalize(norm); + public void normalize(float norm, float topLevelBoost) { + childWeight.normalize(norm, topLevelBoost); } @Override diff --git a/modules/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java b/modules/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java index 7842a90e115..0d24612412d 100755 --- a/modules/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java +++ b/modules/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java @@ -195,21 +195,14 @@ public class CustomScoreQuery extends Query { return CustomScoreQuery.this; } - /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */ @Override - public float getValue() { - return getBoost(); - } - - /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */ - @Override - public float sumOfSquaredWeights() throws IOException { - float sum = subQueryWeight.sumOfSquaredWeights(); + public float getValueForNormalization() throws IOException { + float sum = subQueryWeight.getValueForNormalization(); for(int i = 0; i < valSrcWeights.length; i++) { if (qStrict) { - valSrcWeights[i].sumOfSquaredWeights(); // do not include ValueSource part in the query normalization + valSrcWeights[i].getValueForNormalization(); // do not include ValueSource part in the query normalization } else { - sum += valSrcWeights[i].sumOfSquaredWeights(); + sum += valSrcWeights[i].getValueForNormalization(); } } sum *= getBoost() * getBoost(); // boost each sub-weight @@ -218,14 +211,14 @@ public class CustomScoreQuery extends Query { /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */ @Override - public void normalize(float norm) { - norm *= getBoost(); // incorporate boost - subQueryWeight.normalize(norm); + public void 
normalize(float norm, float topLevelBoost) { + topLevelBoost *= getBoost(); // incorporate boost + subQueryWeight.normalize(norm, topLevelBoost); for(int i = 0; i < valSrcWeights.length; i++) { if (qStrict) { - valSrcWeights[i].normalize(1); // do not normalize the ValueSource part + valSrcWeights[i].normalize(1, 1); // do not normalize the ValueSource part } else { - valSrcWeights[i].normalize(norm); + valSrcWeights[i].normalize(norm, topLevelBoost); } } } @@ -245,7 +238,7 @@ public class CustomScoreQuery extends Query { for(int i = 0; i < valSrcScorers.length; i++) { valSrcScorers[i] = valSrcWeights[i].scorer(context, scorerContext.scoreDocsInOrder(true)); } - return new CustomScorer(CustomScoreQuery.this.getCustomScoreProvider(context), this, subQueryScorer, valSrcScorers); + return new CustomScorer(CustomScoreQuery.this.getCustomScoreProvider(context), this, getBoost(), subQueryScorer, valSrcScorers); } @Override @@ -265,11 +258,11 @@ public class CustomScoreQuery extends Query { valSrcExpls[i] = valSrcWeights[i].explain(info, doc); } Explanation customExp = CustomScoreQuery.this.getCustomScoreProvider(info).customExplain(doc,subQueryExpl,valSrcExpls); - float sc = getValue() * customExp.getValue(); + float sc = getBoost() * customExp.getValue(); Explanation res = new ComplexExplanation( true, sc, CustomScoreQuery.this.toString() + ", product of:"); res.addDetail(customExp); - res.addDetail(new Explanation(getValue(), "queryBoost")); // actually using the q boost as q weight (== weight value) + res.addDetail(new Explanation(getBoost(), "queryBoost")); // actually using the q boost as q weight (== weight value) return res; } @@ -294,10 +287,10 @@ public class CustomScoreQuery extends Query { private float vScores[]; // reused in score() to avoid allocating this array for each doc // constructor - private CustomScorer(CustomScoreProvider provider, CustomWeight w, + private CustomScorer(CustomScoreProvider provider, CustomWeight w, float qWeight, Scorer subQueryScorer, Scorer[] valSrcScorers) throws IOException { super(w); - this.qWeight = w.getValue(); + this.qWeight = qWeight; this.subQueryScorer = subQueryScorer; this.valSrcScorers = valSrcScorers; this.vScores = new float[valSrcScorers.length]; diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java b/modules/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java index 1fafb077252..3e04f55ae72 100755 --- a/modules/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java +++ b/modules/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java @@ -78,21 +78,16 @@ public class BoostedQuery extends Query { } @Override - public float getValue() { - return getBoost(); - } - - @Override - public float sumOfSquaredWeights() throws IOException { - float sum = qWeight.sumOfSquaredWeights(); + public float getValueForNormalization() throws IOException { + float sum = qWeight.getValueForNormalization(); sum *= getBoost() * getBoost(); return sum ; } @Override - public void normalize(float norm) { - norm *= getBoost(); - qWeight.normalize(norm); + public void normalize(float norm, float topLevelBoost) { + topLevelBoost *= getBoost(); + qWeight.normalize(norm, topLevelBoost); } @Override @@ -101,7 +96,7 @@ public class BoostedQuery extends Query { if(subQueryScorer == null) { return null; } - return new BoostedQuery.CustomScorer(context, this, subQueryScorer, boostVal); + return new BoostedQuery.CustomScorer(context, this, getBoost(), subQueryScorer, boostVal); } @Override @@ 
-128,11 +123,11 @@ public class BoostedQuery extends Query { private final DocValues vals; private final AtomicReaderContext readerContext; - private CustomScorer(AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w, + private CustomScorer(AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w, float qWeight, Scorer scorer, ValueSource vs) throws IOException { super(w); this.weight = w; - this.qWeight = w.getValue(); + this.qWeight = qWeight; this.scorer = scorer; this.readerContext = readerContext; this.vals = vs.getValues(weight.fcontext, readerContext); diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java b/modules/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java index ffeba130ae6..65383752569 100644 --- a/modules/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java +++ b/modules/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java @@ -77,25 +77,20 @@ public class FunctionQuery extends Query { } @Override - public float getValue() { - return queryWeight; - } - - @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { queryWeight = getBoost(); return queryWeight * queryWeight; } @Override - public void normalize(float norm) { - this.queryNorm = norm; + public void normalize(float norm, float topLevelBoost) { + this.queryNorm = norm * topLevelBoost; queryWeight *= this.queryNorm; } @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new AllScorer(context, this); + return new AllScorer(context, this, queryWeight); } @Override @@ -114,10 +109,10 @@ public class FunctionQuery extends Query { final boolean hasDeletions; final Bits liveDocs; - public AllScorer(AtomicReaderContext context, FunctionWeight w) throws IOException { + public AllScorer(AtomicReaderContext context, FunctionWeight w, float qWeight) throws IOException { super(w); this.weight = w; - this.qWeight = w.getValue(); + this.qWeight = qWeight; this.reader = context.reader; this.maxDoc = reader.maxDoc(); this.hasDeletions = reader.hasDeletions(); diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java index 23ccd22cd89..b6a53416f37 100755 --- a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java +++ b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/IDFValueSource.java @@ -22,6 +22,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.queries.function.DocValues; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.TFIDFSimilarity; import org.apache.lucene.util.BytesRef; import java.io.IOException; @@ -42,9 +43,11 @@ public class IDFValueSource extends DocFreqValueSource { public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException { IndexSearcher searcher = (IndexSearcher)context.get("searcher"); Similarity sim = searcher.getSimilarityProvider().get(field); - // todo: we need docFreq that takes a BytesRef - int docfreq = searcher.docFreq(new Term(indexedField, indexedBytes.utf8ToString())); - float idf = sim.idf(docfreq, searcher.maxDoc()); + if (!(sim instanceof TFIDFSimilarity)) { + throw new 
UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)"); + } + int docfreq = searcher.docFreq(new Term(indexedField, indexedBytes)); + float idf = ((TFIDFSimilarity)sim).idf(docfreq, searcher.maxDoc()); return new ConstDoubleDocValues(idf, this); } } diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java index 5a515ad48c9..f2b5436bb6f 100755 --- a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java +++ b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java @@ -23,6 +23,8 @@ import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.docvalues.FloatDocValues; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.TFIDFSimilarity; + import java.io.IOException; import java.util.Map; @@ -49,7 +51,11 @@ public class NormValueSource extends ValueSource { @Override public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException { IndexSearcher searcher = (IndexSearcher)context.get("searcher"); - final Similarity similarity = searcher.getSimilarityProvider().get(field); + Similarity sim = searcher.getSimilarityProvider().get(field); + if (!(sim instanceof TFIDFSimilarity)) { + throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)"); + } + final TFIDFSimilarity similarity = (TFIDFSimilarity) sim; final byte[] norms = readerContext.reader.norms(field); if (norms == null) { return new ConstDoubleDocValues(0.0, this); diff --git a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java index d868456f8c5..90b605bc25c 100755 --- a/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java +++ b/modules/queries/src/java/org/apache/lucene/queries/function/valuesource/TFValueSource.java @@ -24,6 +24,7 @@ import org.apache.lucene.queries.function.docvalues.FloatDocValues; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.TFIDFSimilarity; import org.apache.lucene.util.BytesRef; import java.io.IOException; @@ -43,7 +44,11 @@ public class TFValueSource extends TermFreqValueSource { public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException { Fields fields = readerContext.reader.fields(); final Terms terms = fields.terms(field); - final Similarity similarity = ((IndexSearcher)context.get("searcher")).getSimilarityProvider().get(field); + final Similarity sim = ((IndexSearcher)context.get("searcher")).getSimilarityProvider().get(field); + if (!(sim instanceof TFIDFSimilarity)) { + throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as DefaultSimilarity)"); + } + final TFIDFSimilarity similarity = (TFIDFSimilarity) sim; return new FloatDocValues(this) { DocsEnum docs ; diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 5893720c1b9..e0da92c0478 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -302,6 +302,14 @@ Bug Fixes * SOLR-2636: Fix explain functionality for negative queries. 
(Tom Hill via yonik) +* SOLR-2538: Range Faceting on long/double fields could overflow if values + bigger than the max int/float were used. + (Erbi Hanka, hossman) + +* SOLR-2230: CommonsHttpSolrServer.addFile could not be used to send + multiple files in a single request. + (Stephan Günther, hossman) + Other Changes ---------------------- diff --git a/solr/common-build.xml b/solr/common-build.xml index 168a70047f2..ae3ce3bf43f 100644 --- a/solr/common-build.xml +++ b/solr/common-build.xml @@ -162,8 +162,9 @@ into your local repository). If you wish to deploy to a remote repository, set this property to the URL of that repository. In addition, if the repository requires authentication, you can set - properties "m2.repository.username" and "m2.repository.private.key" - to define your credentials. + properties "m2.repository.username" and either + "m2.repository.private.key" or "m2.repository.password" to define + your credentials. --> @@ -423,7 +424,7 @@ - + @@ -443,7 +444,7 @@ - + diff --git a/solr/site/features.pdf b/solr/site/features.pdf index 009dacfd256..ccc35bbd482 100755 Binary files a/solr/site/features.pdf and b/solr/site/features.pdf differ diff --git a/solr/site/index.html b/solr/site/index.html index 9cd2d7d6912..5330ae536f9 100755 --- a/solr/site/index.html +++ b/solr/site/index.html @@ -232,6 +232,9 @@ document.write("Last Published: " + document.lastModified); News
[Rendered diff for solr/site/index.html; the generated HTML markup was lost in extraction.] The first hunk adds a "July 2011 - Solr 3.3 Released" item to the news list beside the existing "May 2011 - Solr 3.2 Released" entry. The second hunk (@@ -352,7 +355,37 @@) inserts the full announcement:

   July 2011 - Solr 3.3 Released

   The Lucene PMC is pleased to announce the release of Apache Solr 3.3!

   Solr's version number was synced with Lucene following the Lucene/Solr merge, so Solr 3.3 contains Lucene 3.3.

   Solr 3.3 release highlights include

     • Grouping / Field Collapsing
     • A new, automaton-based suggest/autocomplete implementation offering an order of magnitude smaller RAM consumption.
     • KStemFilterFactory, an optimized implementation of a less aggressive stemmer for English.
     • Solr defaults to a new, more efficient merge policy (TieredMergePolicy). See http://s.apache.org/merging for more information.
     • Important bugfixes, including extremely high RAM usage in spellchecking.
     • Bugfixes and improvements from Apache Lucene 3.3

   See the release notes for a more complete list of all the new features, improvements, and bugfixes.

The remaining hunks in this file only regenerate markup around the older news entries (May 2011 Solr 3.2 back through the January 2006 Incubator announcement); their visible text is unchanged.
diff --git a/solr/site/index.pdf b/solr/site/index.pdf index 4cd19fc8e1d..84ed289d262 100755 Binary files a/solr/site/index.pdf and b/solr/site/index.pdf differ diff --git a/solr/site/issue_tracking.pdf b/solr/site/issue_tracking.pdf index 6e494ad81a9..110fefe7efd 100755 Binary files a/solr/site/issue_tracking.pdf and b/solr/site/issue_tracking.pdf differ diff --git a/solr/site/linkmap.pdf b/solr/site/linkmap.pdf index 883ba557e5c..2f6a45a83de 100755 Binary files a/solr/site/linkmap.pdf and b/solr/site/linkmap.pdf differ diff --git a/solr/site/mailing_lists.pdf b/solr/site/mailing_lists.pdf index 1a7cd3badce..d1f85b81e3b 100755 Binary files a/solr/site/mailing_lists.pdf and b/solr/site/mailing_lists.pdf differ diff --git a/solr/site/tutorial.pdf b/solr/site/tutorial.pdf index 275b55e1650..b5dbf5d0442 100755 Binary files a/solr/site/tutorial.pdf and b/solr/site/tutorial.pdf differ diff --git a/solr/site/version_control.pdf b/solr/site/version_control.pdf index 4ea4657a839..da680aba17e 100755 Binary files a/solr/site/version_control.pdf and b/solr/site/version_control.pdf differ diff --git a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java index 5def900e97f..04b26e1da7e 100644 --- a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java +++ b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java @@ -89,7 +89,6 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase { TokenStream tokenStream = null; try { tokenStream = analyzer.reusableTokenStream(context.getFieldName(), new StringReader(value)); - tokenStream.reset(); } catch (IOException e) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); } diff --git a/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java b/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java index 7d2b37d9581..d2c5b107456 100644 --- a/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java +++ b/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java @@ -454,8 +454,6 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf Highlighter highlighter; if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) { - // TODO: this is not always necessary - eventually we would like to avoid this wrap - // when it is not needed.
if (maxCharsToAnalyze < 0) { tstream = new CachingTokenFilter(tstream); } else { diff --git a/solr/src/java/org/apache/solr/request/SimpleFacets.java b/solr/src/java/org/apache/solr/request/SimpleFacets.java index 7eb70374342..1593d923fe0 100644 --- a/solr/src/java/org/apache/solr/request/SimpleFacets.java +++ b/solr/src/java/org/apache/solr/request/SimpleFacets.java @@ -1317,7 +1317,7 @@ public class SimpleFacets { } @Override public Double parseAndAddGap(Double value, String gap) { - return new Double(value.floatValue() + Double.valueOf(gap).floatValue()); + return new Double(value.doubleValue() + Double.valueOf(gap).doubleValue()); } } private static class IntegerRangeEndpointCalculator @@ -1343,7 +1343,7 @@ public class SimpleFacets { } @Override public Long parseAndAddGap(Long value, String gap) { - return new Long(value.intValue() + Long.valueOf(gap).intValue()); + return new Long(value.longValue() + Long.valueOf(gap).longValue()); } } private static class DateRangeEndpointCalculator diff --git a/solr/src/java/org/apache/solr/schema/LatLonType.java b/solr/src/java/org/apache/solr/schema/LatLonType.java index dbcd9588c8a..b3956178601 100644 --- a/solr/src/java/org/apache/solr/schema/LatLonType.java +++ b/solr/src/java/org/apache/solr/schema/LatLonType.java @@ -354,25 +354,20 @@ class SpatialDistanceQuery extends Query { } @Override - public float getValue() { - return queryWeight; - } - - @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { queryWeight = getBoost(); return queryWeight * queryWeight; } @Override - public void normalize(float norm) { - this.queryNorm = norm; + public void normalize(float norm, float topLevelBoost) { + this.queryNorm = norm * topLevelBoost; queryWeight *= this.queryNorm; } @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new SpatialScorer(context, this); + return new SpatialScorer(context, this, queryWeight); } @Override @@ -405,10 +400,10 @@ class SpatialDistanceQuery extends Query { int lastDistDoc; double lastDist; - public SpatialScorer(AtomicReaderContext readerContext, SpatialWeight w) throws IOException { + public SpatialScorer(AtomicReaderContext readerContext, SpatialWeight w, float qWeight) throws IOException { super(w); this.weight = w; - this.qWeight = w.getValue(); + this.qWeight = qWeight; this.reader = readerContext.reader; this.maxDoc = reader.maxDoc(); this.liveDocs = reader.getLiveDocs(); diff --git a/solr/src/java/org/apache/solr/search/Grouping.java b/solr/src/java/org/apache/solr/search/Grouping.java index af4fa2e62bb..4ee4c809fb0 100755 --- a/solr/src/java/org/apache/solr/search/Grouping.java +++ b/solr/src/java/org/apache/solr/search/Grouping.java @@ -261,7 +261,7 @@ public class Grouping { public void execute() throws IOException { if (commands.isEmpty()) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Specify at least on field, function or query to group by."); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Specify at least one field, function or query to group by."); } DocListAndSet out = new DocListAndSet(); diff --git a/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java b/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java index 4f188ea15a1..61da1b1b9d4 100644 --- a/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java +++ b/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java @@ -168,19 +168,15 @@ class JoinQuery 
extends Query { return JoinQuery.this; } - public float getValue() { - return getBoost(); - } - @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { queryWeight = getBoost(); return queryWeight * queryWeight; } @Override - public void normalize(float norm) { - this.queryNorm = norm; + public void normalize(float norm, float topLevelBoost) { + this.queryNorm = norm * topLevelBoost; queryWeight *= this.queryNorm; } @@ -223,7 +219,7 @@ class JoinQuery extends Query { DocIdSet readerSet = filter.getDocIdSet(context); if (readerSet == null) readerSet=DocIdSet.EMPTY_DOCIDSET; - return new JoinScorer(this, readerSet.iterator()); + return new JoinScorer(this, readerSet.iterator(), getBoost()); } @@ -514,9 +510,9 @@ class JoinQuery extends Query { final float score; int doc = -1; - public JoinScorer(Weight w, DocIdSetIterator iter) throws IOException { + public JoinScorer(Weight w, DocIdSetIterator iter, float score) throws IOException { super(w); - score = w.getValue(); + this.score = score; this.iter = iter==null ? DocIdSet.EMPTY_DOCIDSET.iterator() : iter; } diff --git a/solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java b/solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java index fd41c32b8f2..2880302e03f 100755 --- a/solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java +++ b/solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java @@ -106,31 +106,26 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery implements Extend } @Override - public float getValue() { - return queryWeight; - } - - @Override - public float sumOfSquaredWeights() throws IOException { + public float getValueForNormalization() throws IOException { queryWeight = getBoost(); return queryWeight * queryWeight; } @Override - public void normalize(float norm) { - this.queryNorm = norm; + public void normalize(float norm, float topLevelBoost) { + this.queryNorm = norm * topLevelBoost; queryWeight *= this.queryNorm; } @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new ConstantScorer(context, this); + return new ConstantScorer(context, this, queryWeight); } @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - ConstantScorer cs = new ConstantScorer(context, this); + ConstantScorer cs = new ConstantScorer(context, this, queryWeight); boolean exists = cs.docIdSetIterator.advance(doc) == doc; ComplexExplanation result = new ComplexExplanation(); @@ -157,9 +152,9 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery implements Extend final float theScore; int doc = -1; - public ConstantScorer(AtomicReaderContext context, ConstantWeight w) throws IOException { + public ConstantScorer(AtomicReaderContext context, ConstantWeight w, float theScore) throws IOException { super(w); - theScore = w.getValue(); + this.theScore = theScore; DocIdSet docIdSet = filter instanceof SolrFilter ? ((SolrFilter)filter).getDocIdSet(w.context, context) : filter.getDocIdSet(context); if (docIdSet == null) { docIdSetIterator = DocIdSet.EMPTY_DOCIDSET.iterator(); diff --git a/solr/src/site/src/documentation/content/xdocs/index.xml b/solr/src/site/src/documentation/content/xdocs/index.xml index b9808bd293e..00e232dd9c6 100755 --- a/solr/src/site/src/documentation/content/xdocs/index.xml +++ b/solr/src/site/src/documentation/content/xdocs/index.xml @@ -66,6 +66,32 @@ customization is required.
[Rendered diff for the xdocs source of the same page; XML markup was lost in extraction.] The hunk inserts the same "July 2011 - Solr 3.3 Released" announcement and highlights list rendered above for index.html into the news section, immediately before the unchanged entry:

   May 2011 - Solr 3.2 Released
         <p>The Lucene PMC is pleased to announce the release of Apache Solr 3.2!</p>
diff --git a/solr/src/solrj/org/apache/solr/client/solrj/impl/CommonsHttpSolrServer.java b/solr/src/solrj/org/apache/solr/client/solrj/impl/CommonsHttpSolrServer.java
index 2f1c19a6c5b..5a928857f5f 100644
--- a/solr/src/solrj/org/apache/solr/client/solrj/impl/CommonsHttpSolrServer.java
+++ b/solr/src/solrj/org/apache/solr/client/solrj/impl/CommonsHttpSolrServer.java
@@ -32,11 +32,11 @@ import org.apache.commons.httpclient.methods.InputStreamRequestEntity;
 import org.apache.commons.httpclient.methods.PostMethod;
 import org.apache.commons.httpclient.methods.RequestEntity;
 import org.apache.commons.httpclient.methods.multipart.MultipartRequestEntity;
+import org.apache.commons.httpclient.methods.multipart.FilePart;
 import org.apache.commons.httpclient.methods.multipart.Part;
-import org.apache.commons.httpclient.methods.multipart.PartBase;
+import org.apache.commons.httpclient.methods.multipart.PartSource;
 import org.apache.commons.httpclient.methods.multipart.StringPart;
 import org.apache.commons.httpclient.params.HttpMethodParams;
-import org.apache.commons.io.IOUtils;
 import org.apache.solr.client.solrj.ResponseParser;
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.SolrServer;
@@ -324,25 +324,24 @@ public class CommonsHttpSolrServer extends SolrServer
                 final ContentStream c = content;
 
                 String charSet = null;
-                String transferEncoding = null;
-                parts.add(new PartBase(c.getName(), c.getContentType(),
-                    charSet, transferEncoding) {
+                PartSource source = new PartSource() {
 
                   @Override
-                  protected long lengthOfData() throws IOException {
+                  public long getLength() {
                     return c.getSize();
                   }
+
+                  public String getFileName() {
+                    return c.getName();
+                  }
 
                   @Override
-                  protected void sendData(OutputStream out)
-                      throws IOException {
-                    InputStream in = c.getStream();
-                    try {
-                      IOUtils.copy(in, out);
-                    } finally {
-                      in.close();
-                    }
+                  public InputStream createInputStream() throws IOException {
+                    return c.getStream();
                   }
-                });
+                };
+
+                parts.add(new FilePart(c.getName(), source,
+                    c.getContentType(), charSet));
               }
             }
             if (parts.size() > 0) {
diff --git a/solr/src/test-files/docs1.xml b/solr/src/test-files/docs1.xml
new file mode 100644
index 00000000000..3c5448d9dcf
--- /dev/null
+++ b/solr/src/test-files/docs1.xml
@@ -0,0 +1,56 @@
+
+
+
+
+  SP2514N
+  Samsung SpinPoint P120 SP2514N - hard drive - 250 GB - ATA-133
+  Samsung Electronics Co. Ltd.
+
+  samsung
+  electronics
+  hard drive
+  7200RPM, 8MB cache, IDE Ultra ATA-133
+  NoiseGuard, SilentSeek technology, Fluid Dynamic Bearing (FDB) motor
+  92
+  6
+  true
+  2006-02-13T15:26:37Z
+
+  35.0752,-97.032
+
+
+
+  6H500F0
+  Maxtor DiamondMax 11 - hard drive - 500 GB - SATA-300
+  Maxtor Corp.
+
+  maxtor
+  electronics
+  hard drive
+  SATA 3.0Gb/s, NCQ
+  8.5ms seek
+  16MB cache
+  350
+  6
+  true
+
+  45.17614,-93.87341
+  2006-02-13T15:26:37Z
+
+
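The CommonsHttpSolrServer rewrite above swaps an eagerly-copying anonymous PartBase for a PartSource, so commons-httpclient can open the content stream itself when it actually writes the part (and re-open it on a retry). A self-contained sketch of the same pattern; the byte payload and file name here are made up for illustration:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    import org.apache.commons.httpclient.methods.multipart.FilePart;
    import org.apache.commons.httpclient.methods.multipart.PartSource;

    // Minimal illustration of the PartSource approach: the part hands the
    // client a factory that can produce a fresh stream on demand, instead of
    // copying bytes into the request by hand. Demo code, not Solr's.
    public class StreamingPartDemo {
      public static void main(String[] args) {
        final byte[] payload = "<add><doc/></add>".getBytes();

        PartSource source = new PartSource() {
          public long getLength() {
            return payload.length;                    // size known up front, no buffering
          }
          public String getFileName() {
            return "docs.xml";                        // hypothetical file name
          }
          public InputStream createInputStream() throws IOException {
            return new ByteArrayInputStream(payload); // fresh stream per send/retry
          }
        };

        FilePart part = new FilePart("docs.xml", source, "text/xml", "UTF-8");
        System.out.println("built multipart part: " + part.getName());
      }
    }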
diff --git a/solr/src/test-files/docs2.xml b/solr/src/test-files/docs2.xml
new file mode 100644
index 00000000000..0b89d6785c2
--- /dev/null
+++ b/solr/src/test-files/docs2.xml
@@ -0,0 +1,77 @@
+
+
+
+
+  TWINX2048-3200PRO
+  CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail
+  Corsair Microsystems Inc.
+
+  corsair
+  electronics
+  memory
+  CAS latency 2, 2-3-3-6 timing, 2.75v, unbuffered, heat-spreader
+  185
+  5
+  true
+
+  37.7752,-122.4232
+  2006-02-13T15:26:37Z
+
+  electronics|6.0 memory|3.0
+
+
+  VS1GB400C3
+  CORSAIR ValueSelect 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - Retail
+  Corsair Microsystems Inc.
+
+  corsair
+  electronics
+  memory
+  74.99
+  7
+  true
+
+  37.7752,-100.0232
+  2006-02-13T15:26:37Z
+
+  electronics|4.0 memory|2.0
+
+
+  VDBDB1A16
+  A-DATA V-Series 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - OEM
+  A-DATA Technology Inc.
+
+  corsair
+  electronics
+  memory
+  CAS latency 3, 2.7v
+
+  0
+  true
+
+  45.18414,-93.88141
+  2006-02-13T15:26:37Z
+
+  electronics|0.9 memory|0.1
+
+
diff --git a/solr/src/test/org/apache/solr/client/solrj/SolrExampleTests.java b/solr/src/test/org/apache/solr/client/solrj/SolrExampleTests.java
index e19992a4a21..1f28fa120d5 100644
--- a/solr/src/test/org/apache/solr/client/solrj/SolrExampleTests.java
+++ b/solr/src/test/org/apache/solr/client/solrj/SolrExampleTests.java
@@ -446,6 +446,24 @@ abstract public class SolrExampleTests extends SolrJettyTestBase
     assertNotNull("Couldn't upload books.csv", result);
     rsp = server.query( new SolrQuery( "*:*") );
     Assert.assertEquals( 10, rsp.getResults().getNumFound() );
+  }
+
+  @Test
+  public void testMultiContentStreamRequest() throws Exception {
+    SolrServer server = getSolrServer();
+    server.deleteByQuery( "*:*" );// delete everything!
+    server.commit();
+    QueryResponse rsp = server.query( new SolrQuery( "*:*") );
+    Assert.assertEquals( 0, rsp.getResults().getNumFound() );
+
+    ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update");
+    up.addFile(getFile("docs1.xml")); // 2
+    up.addFile(getFile("docs2.xml")); // 3
+    up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
+    NamedList result = server.request(up);
+    assertNotNull("Couldn't upload xml files", result);
+    rsp = server.query( new SolrQuery( "*:*") );
+    Assert.assertEquals( 5 , rsp.getResults().getNumFound() );
   }
diff --git a/solr/src/test/org/apache/solr/client/solrj/embedded/TestSolrProperties.java b/solr/src/test/org/apache/solr/client/solrj/embedded/TestSolrProperties.java
index 975aa200f84..bf3c1687162 100644
--- a/solr/src/test/org/apache/solr/client/solrj/embedded/TestSolrProperties.java
+++ b/solr/src/test/org/apache/solr/client/solrj/embedded/TestSolrProperties.java
@@ -202,8 +202,15 @@ public class TestSolrProperties extends LuceneTestCase {
     assertTrue("should have more recent time: " + after + "," + before, after > before);
 
     mcr = CoreAdminRequest.persist("solr-persist.xml", coreadmin);
-
-    System.out.println(IOUtils.toString(new FileInputStream(new File(solrXml.getParent(), "solr-persist.xml"))));
+
+    if (VERBOSE) {
+      FileInputStream fis = new FileInputStream(new File(solrXml.getParent(), "solr-persist.xml"));
+      try {
+        System.out.println(IOUtils.toString(fis));
+      } finally {
+        fis.close();
+      }
+    }
 
     DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
     FileInputStream fis = new FileInputStream(new File(solrXml.getParent(), "solr-persist.xml"));
     try {
diff --git a/solr/src/test/org/apache/solr/core/MockCodecProviderFactory.java b/solr/src/test/org/apache/solr/core/MockCodecProviderFactory.java
index 08f29dcf1a6..ace2e092377 100644
--- a/solr/src/test/org/apache/solr/core/MockCodecProviderFactory.java
+++ b/solr/src/test/org/apache/solr/core/MockCodecProviderFactory.java
@@ -42,7 +42,7 @@ public class MockCodecProviderFactory extends CodecProviderFactory {
   public CodecProvider create() {
     CodecProvider cp = new CodecProvider();
     cp.register(new StandardCodec());
-    cp.register(new PulsingCodec(1));
+    cp.register(new PulsingCodec());
     if (codecs != null) {
       for (Object codec : codecs.getAll("name")) {
         if (!cp.isCodecRegistered((String)codec)) {
diff --git a/solr/src/test/org/apache/solr/request/SimpleFacetsTest.java b/solr/src/test/org/apache/solr/request/SimpleFacetsTest.java
index 9302daaed9f..47f31f6f591 100644
--- a/solr/src/test/org/apache/solr/request/SimpleFacetsTest.java
+++ b/solr/src/test/org/apache/solr/request/SimpleFacetsTest.java
@@ -892,7 +892,48 @@ public class SimpleFacetsTest extends SolrTestCaseJ4 {
   public void testNumericRangeFacetsSortableDouble() {
     helpTestFractionalNumberRangeFacets("range_facet_sd");
   }
-  private void helpTestFractionalNumberRangeFacets(final String fieldName) {
+
+  @Test
+  public void testNumericRangeFacetsOverflowTrieDouble() {
+    helpTestNumericRangeFacetsDoubleOverflow("range_facet_d");
+  }
+  @Test
+  public void testNumericRangeFacetsOverflowSortableDouble() {
+    helpTestNumericRangeFacetsDoubleOverflow("range_facet_sd");
+  }
+
+  private void helpTestNumericRangeFacetsDoubleOverflow(final String fieldName) {
+    final String f = fieldName;
+    final String pre = "//lst[@name='facet_ranges']/lst[@name='"+f+"']/lst[@name='counts']";
+    final String meta = pre + "/../";
+
+    String start = "0.0";
+    String gap = (new Double( (double)Float.MAX_VALUE )).toString();
+    String end = (new Double( ((double)Float.MAX_VALUE) * 3D )).toString();
+    String mid = (new Double( ((double)Float.MAX_VALUE) * 2D )).toString();
+
+    assertQ(f+": checking counts for lower",
+            req( "q", "id:[30 TO 60]"
+                ,"rows", "0"
+                ,"facet", "true"
+                ,"facet.range", f
+                ,"facet.range.start", start
+                ,"facet.range.end", end
+                ,"facet.range.gap", gap
+                ,"facet.range.other", "all"
+                ,"facet.range.include", "lower"
+                )
+            ,"*[count("+pre+"/int)=3]"
+            ,pre+"/int[@name='"+start+"'][.='6' ]"
+            ,pre+"/int[@name='"+mid+"'][.='0' ]"
+            //
+            ,meta+"/double[@name='end' ][.='"+end+"']"
+            ,meta+"/int[@name='before' ][.='0']"
+            ,meta+"/int[@name='after' ][.='0']"
+            ,meta+"/int[@name='between'][.='6']"
+            );
+  }
+  private void helpTestFractionalNumberRangeFacets(final String fieldName) {
     final String f = fieldName;
     final String pre = "//lst[@name='facet_ranges']/lst[@name='"+f+"']/lst[@name='counts']";
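The double-overflow helper above (and the matching long variant in the next hunk) pins down endpoints deliberately chosen past the narrower type's range: a gap of Float.MAX_VALUE walked out to three times that value only works if the arithmetic stays in double (or long, for the integer case). The failure mode is easy to reproduce in isolation; this is demo code, not Solr's facet implementation:

    // Standalone illustration of the overflow these facet tests guard against.
    public class RangeGapOverflowDemo {
      public static void main(String[] args) {
        // Long case, mirroring the test: gap = Integer.MAX_VALUE, end = 3 * gap.
        long gap = Integer.MAX_VALUE;
        long end = gap * 3L;
        for (long lower = 0; lower + gap <= end; lower += gap) {
          System.out.println("long bucket at " + lower);  // 3 buckets, as asserted
        }

        // Accumulating the same walk in int wraps negative on the second step.
        int intLower = 0;
        intLower += Integer.MAX_VALUE;  // 2147483647
        intLower += Integer.MAX_VALUE;  // silently wraps to -2
        System.out.println("int accumulator after two gaps: " + intLower);

        // Float case: 2 * Float.MAX_VALUE is not representable as a float.
        float fMid = Float.MAX_VALUE * 2f;              // Infinity
        double dMid = ((double) Float.MAX_VALUE) * 2D;  // finite
        System.out.println("float mid = " + fMid + ", double mid = " + dMid);
      }
    }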
,"*[count("+pre+"/int)=3]" + ,pre+"/int[@name='"+start+"'][.='6' ]" + ,pre+"/int[@name='"+mid+"'][.='0' ]" + // + ,meta+"/long[@name='end' ][.='"+end+"']" + ,meta+"/int[@name='before' ][.='0']" + ,meta+"/int[@name='after' ][.='0']" + ,meta+"/int[@name='between'][.='6']" + ); + } private void helpTestWholeNumberRangeFacets(final String fieldName) { // the float test covers a lot of the weird edge cases diff --git a/solr/src/test/org/apache/solr/search/function/TestFunctionQuery.java b/solr/src/test/org/apache/solr/search/function/TestFunctionQuery.java index ba86864ec8a..38ab99ec539 100755 --- a/solr/src/test/org/apache/solr/search/function/TestFunctionQuery.java +++ b/solr/src/test/org/apache/solr/search/function/TestFunctionQuery.java @@ -21,7 +21,7 @@ import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.FieldCache; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.TFIDFSimilarity; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; @@ -305,7 +305,7 @@ public class TestFunctionQuery extends SolrTestCaseJ4 { assertQ(req("fl","*,score","q", "{!func}docfreq($field,$value)", "fq","id:6", "field","a_t", "value","cow"), "//float[@name='score']='3.0'"); assertQ(req("fl","*,score","q", "{!func}termfreq(a_t,cow)", "fq","id:6"), "//float[@name='score']='5.0'"); - Similarity similarity = new DefaultSimilarity(); + TFIDFSimilarity similarity = new DefaultSimilarity(); // make sure it doesn't get a NPE if no terms are present in a field. assertQ(req("fl","*,score","q", "{!func}termfreq(nofield_t,cow)", "fq","id:6"), "//float[@name='score']='0.0'"); @@ -323,7 +323,7 @@ public class TestFunctionQuery extends SolrTestCaseJ4 { state.setBoost(1.0f); state.setLength(4); assertQ(req("fl","*,score","q", "{!func}norm(a_t)", "fq","id:2"), - "//float[@name='score']='" + similarity.computeNorm(state) + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte + "//float[@name='score']='" + similarity.decodeNormValue(similarity.computeNorm(state)) + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte // test that ord and rord are working on a global index basis, not just // at the segment level (since Lucene 2.9 has switched to per-segment searching)