mirror of https://github.com/apache/lucene.git
LUCENE-8031: DOCS_ONLY fields set incorrect length norm
This commit is contained in:
parent
4e198a2737
commit
29e5b8abce
|
@ -58,6 +58,9 @@ Changes in Runtime Behavior
|
|||
* LUCENE-8134: Index options are no longer automatically downgraded.
|
||||
(Adrien Grand)
|
||||
|
||||
* LUCENE-8031: Length normalization correctly reflects omission of term frequencies.
|
||||
(Robert Muir, Adrien Grand)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
||||
|
|
|
@ -684,7 +684,7 @@ final class DefaultIndexingChain extends DocConsumer {
|
|||
}
|
||||
|
||||
void setInvertState() {
|
||||
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name);
|
||||
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name, fieldInfo.getIndexOptions());
|
||||
termsHashPerField = termsHash.addField(invertState, fieldInfo);
|
||||
if (fieldInfo.omitsNorms() == false) {
|
||||
assert norms == null;
|
||||
|
|
|
@ -34,6 +34,7 @@ import org.apache.lucene.util.AttributeSource;
|
|||
public final class FieldInvertState {
|
||||
final int indexCreatedVersionMajor;
|
||||
final String name;
|
||||
final IndexOptions indexOptions;
|
||||
int position;
|
||||
int length;
|
||||
int numOverlap;
|
||||
|
@ -53,19 +54,22 @@ public final class FieldInvertState {
|
|||
|
||||
/** Creates {@code FieldInvertState} for the specified
|
||||
* field name. */
|
||||
public FieldInvertState(int indexCreatedVersionMajor, String name) {
|
||||
public FieldInvertState(int indexCreatedVersionMajor, String name, IndexOptions indexOptions) {
|
||||
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
|
||||
this.name = name;
|
||||
this.indexOptions = indexOptions;
|
||||
}
|
||||
|
||||
/** Creates {@code FieldInvertState} for the specified
|
||||
* field name and values for all fields. */
|
||||
public FieldInvertState(int indexCreatedVersionMajor, String name, int position, int length, int numOverlap, int offset) {
|
||||
this(indexCreatedVersionMajor, name);
|
||||
public FieldInvertState(int indexCreatedVersionMajor, String name, IndexOptions indexOptions, int position, int length, int numOverlap, int offset, int maxTermFrequency, int uniqueTermCount) {
|
||||
this(indexCreatedVersionMajor, name, indexOptions);
|
||||
this.position = position;
|
||||
this.length = length;
|
||||
this.numOverlap = numOverlap;
|
||||
this.offset = offset;
|
||||
this.maxTermFrequency = maxTermFrequency;
|
||||
this.uniqueTermCount = uniqueTermCount;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -176,4 +180,11 @@ public final class FieldInvertState {
|
|||
public int getIndexCreatedVersionMajor() {
|
||||
return indexCreatedVersionMajor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the index options for this field.
|
||||
*/
|
||||
public IndexOptions getIndexOptions() {
|
||||
return indexOptions;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -113,7 +114,14 @@ public class BM25Similarity extends Similarity {
|
|||
|
||||
@Override
|
||||
public final long computeNorm(FieldInvertState state) {
|
||||
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
|
||||
final int numTerms;
|
||||
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
|
||||
numTerms = state.getUniqueTermCount();
|
||||
} else if (discountOverlaps) {
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
} else {
|
||||
numTerms = state.getLength();
|
||||
}
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -185,10 +186,13 @@ public abstract class SimilarityBase extends Similarity {
|
|||
@Override
|
||||
public final long computeNorm(FieldInvertState state) {
|
||||
final int numTerms;
|
||||
if (discountOverlaps)
|
||||
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
|
||||
numTerms = state.getUniqueTermCount();
|
||||
} else if (discountOverlaps) {
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
else
|
||||
} else {
|
||||
numTerms = state.getLength();
|
||||
}
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
@ -500,10 +501,13 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
@Override
|
||||
public final long computeNorm(FieldInvertState state) {
|
||||
final int numTerms;
|
||||
if (discountOverlaps)
|
||||
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
|
||||
numTerms = state.getUniqueTermCount();
|
||||
} else if (discountOverlaps) {
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
else
|
||||
} else {
|
||||
numTerms = state.getLength();
|
||||
}
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
}
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.document.StringField;
|
|||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BoostQuery;
|
||||
|
@ -107,7 +108,9 @@ public class TestBooleanSimilarity extends BaseSimilarityTestCase {
|
|||
final int length = TestUtil.nextInt(random(), 1, 100);
|
||||
final int position = random().nextInt(length);
|
||||
final int numOverlaps = random().nextInt(length);
|
||||
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
|
||||
final int maxTermFrequency = 1;
|
||||
final int uniqueTermCount = 1;
|
||||
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS, position, length, numOverlaps, 100, maxTermFrequency, uniqueTermCount);
|
||||
assertEquals(
|
||||
sim2.computeNorm(state),
|
||||
sim1.computeNorm(state),
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.document.Field.Store;
|
|||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
@ -176,7 +177,9 @@ public class TestClassicSimilarity extends BaseSimilarityTestCase {
|
|||
final int length = TestUtil.nextInt(random(), 1, 1000);
|
||||
final int position = random().nextInt(length);
|
||||
final int numOverlaps = random().nextInt(length);
|
||||
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
|
||||
final int maxTermFrequency = 1;
|
||||
final int uniqueTermCount = 1;
|
||||
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS, position, length, numOverlaps, 100, maxTermFrequency, uniqueTermCount);
|
||||
assertEquals(
|
||||
sim2.computeNorm(state),
|
||||
sim1.computeNorm(state),
|
||||
|
|
|
@ -551,7 +551,7 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
SimilarityBase actual = new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
|
||||
expected.setDiscountOverlaps(false);
|
||||
actual.setDiscountOverlaps(false);
|
||||
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo");
|
||||
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS);
|
||||
state.setLength(5);
|
||||
state.setNumOverlap(2);
|
||||
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
|
||||
|
|
|
@ -598,6 +598,7 @@ public class MemoryIndex {
|
|||
info.sliceArray.start[ord] = postingsWriter.startNewSlice();
|
||||
}
|
||||
info.sliceArray.freq[ord]++;
|
||||
info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
|
||||
info.sumTotalTermFreq++;
|
||||
postingsWriter.writeInt(pos);
|
||||
if (storeOffsets) {
|
||||
|
@ -808,6 +809,8 @@ public class MemoryIndex {
|
|||
|
||||
private long sumTotalTermFreq;
|
||||
|
||||
private int maxTermFrequency;
|
||||
|
||||
/** The last position encountered in this field, for multi-field support. */
|
||||
private int lastPosition;
|
||||
|
||||
|
@ -901,8 +904,8 @@ public class MemoryIndex {
|
|||
|
||||
NumericDocValues getNormDocValues() {
|
||||
if (norm == null) {
|
||||
FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.number,
|
||||
numTokens, numOverlapTokens, 0);
|
||||
FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.getIndexOptions(), lastPosition,
|
||||
numTokens, numOverlapTokens, 0, maxTermFrequency, terms.size());
|
||||
final long value = normSimilarity.computeNorm(invertState);
|
||||
if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldInfo.name + ":" + value + ":" + numTokens);
|
||||
|
||||
|
|
Loading…
Reference in New Issue