mirror of https://github.com/apache/lucene.git
LUCENE-8031: DOCS_ONLY fields set incorrect length norm
This commit is contained in:
parent
4e198a2737
commit
29e5b8abce
|
@ -58,6 +58,9 @@ Changes in Runtime Behavior
|
||||||
* LUCENE-8134: Index options are no longer automatically downgraded.
|
* LUCENE-8134: Index options are no longer automatically downgraded.
|
||||||
(Adrien Grand)
|
(Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-8031: Length normalization correctly reflects omission of term frequencies.
|
||||||
|
(Robert Muir, Adrien Grand)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
||||||
|
|
|
@ -684,7 +684,7 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||||
}
|
}
|
||||||
|
|
||||||
void setInvertState() {
|
void setInvertState() {
|
||||||
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name);
|
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name, fieldInfo.getIndexOptions());
|
||||||
termsHashPerField = termsHash.addField(invertState, fieldInfo);
|
termsHashPerField = termsHash.addField(invertState, fieldInfo);
|
||||||
if (fieldInfo.omitsNorms() == false) {
|
if (fieldInfo.omitsNorms() == false) {
|
||||||
assert norms == null;
|
assert norms == null;
|
||||||
|
|
|
@ -34,6 +34,7 @@ import org.apache.lucene.util.AttributeSource;
|
||||||
public final class FieldInvertState {
|
public final class FieldInvertState {
|
||||||
final int indexCreatedVersionMajor;
|
final int indexCreatedVersionMajor;
|
||||||
final String name;
|
final String name;
|
||||||
|
final IndexOptions indexOptions;
|
||||||
int position;
|
int position;
|
||||||
int length;
|
int length;
|
||||||
int numOverlap;
|
int numOverlap;
|
||||||
|
@ -53,19 +54,22 @@ public final class FieldInvertState {
|
||||||
|
|
||||||
/** Creates {@code FieldInvertState} for the specified
|
/** Creates {@code FieldInvertState} for the specified
|
||||||
* field name. */
|
* field name. */
|
||||||
public FieldInvertState(int indexCreatedVersionMajor, String name) {
|
public FieldInvertState(int indexCreatedVersionMajor, String name, IndexOptions indexOptions) {
|
||||||
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
|
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
|
||||||
this.name = name;
|
this.name = name;
|
||||||
|
this.indexOptions = indexOptions;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Creates {@code FieldInvertState} for the specified
|
/** Creates {@code FieldInvertState} for the specified
|
||||||
* field name and values for all fields. */
|
* field name and values for all fields. */
|
||||||
public FieldInvertState(int indexCreatedVersionMajor, String name, int position, int length, int numOverlap, int offset) {
|
public FieldInvertState(int indexCreatedVersionMajor, String name, IndexOptions indexOptions, int position, int length, int numOverlap, int offset, int maxTermFrequency, int uniqueTermCount) {
|
||||||
this(indexCreatedVersionMajor, name);
|
this(indexCreatedVersionMajor, name, indexOptions);
|
||||||
this.position = position;
|
this.position = position;
|
||||||
this.length = length;
|
this.length = length;
|
||||||
this.numOverlap = numOverlap;
|
this.numOverlap = numOverlap;
|
||||||
this.offset = offset;
|
this.offset = offset;
|
||||||
|
this.maxTermFrequency = maxTermFrequency;
|
||||||
|
this.uniqueTermCount = uniqueTermCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -176,4 +180,11 @@ public final class FieldInvertState {
|
||||||
public int getIndexCreatedVersionMajor() {
|
public int getIndexCreatedVersionMajor() {
|
||||||
return indexCreatedVersionMajor;
|
return indexCreatedVersionMajor;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the index options for this field
|
||||||
|
*/
|
||||||
|
public IndexOptions getIndexOptions() {
|
||||||
|
return indexOptions;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.index.FieldInvertState;
|
import org.apache.lucene.index.FieldInvertState;
|
||||||
|
import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.search.CollectionStatistics;
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
import org.apache.lucene.search.Explanation;
|
import org.apache.lucene.search.Explanation;
|
||||||
import org.apache.lucene.search.TermStatistics;
|
import org.apache.lucene.search.TermStatistics;
|
||||||
|
@ -113,7 +114,14 @@ public class BM25Similarity extends Similarity {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final long computeNorm(FieldInvertState state) {
|
public final long computeNorm(FieldInvertState state) {
|
||||||
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
|
final int numTerms;
|
||||||
|
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
|
||||||
|
numTerms = state.getUniqueTermCount();
|
||||||
|
} else if (discountOverlaps) {
|
||||||
|
numTerms = state.getLength() - state.getNumOverlap();
|
||||||
|
} else {
|
||||||
|
numTerms = state.getLength();
|
||||||
|
}
|
||||||
return SmallFloat.intToByte4(numTerms);
|
return SmallFloat.intToByte4(numTerms);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.index.FieldInvertState;
|
import org.apache.lucene.index.FieldInvertState;
|
||||||
|
import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.search.CollectionStatistics;
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
import org.apache.lucene.search.Explanation;
|
import org.apache.lucene.search.Explanation;
|
||||||
import org.apache.lucene.search.TermStatistics;
|
import org.apache.lucene.search.TermStatistics;
|
||||||
|
@ -185,10 +186,13 @@ public abstract class SimilarityBase extends Similarity {
|
||||||
@Override
|
@Override
|
||||||
public final long computeNorm(FieldInvertState state) {
|
public final long computeNorm(FieldInvertState state) {
|
||||||
final int numTerms;
|
final int numTerms;
|
||||||
if (discountOverlaps)
|
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
|
||||||
|
numTerms = state.getUniqueTermCount();
|
||||||
|
} else if (discountOverlaps) {
|
||||||
numTerms = state.getLength() - state.getNumOverlap();
|
numTerms = state.getLength() - state.getNumOverlap();
|
||||||
else
|
} else {
|
||||||
numTerms = state.getLength();
|
numTerms = state.getLength();
|
||||||
|
}
|
||||||
return SmallFloat.intToByte4(numTerms);
|
return SmallFloat.intToByte4(numTerms);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.index.FieldInvertState;
|
import org.apache.lucene.index.FieldInvertState;
|
||||||
|
import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.search.CollectionStatistics;
|
import org.apache.lucene.search.CollectionStatistics;
|
||||||
import org.apache.lucene.search.Explanation;
|
import org.apache.lucene.search.Explanation;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
@ -500,10 +501,13 @@ public abstract class TFIDFSimilarity extends Similarity {
|
||||||
@Override
|
@Override
|
||||||
public final long computeNorm(FieldInvertState state) {
|
public final long computeNorm(FieldInvertState state) {
|
||||||
final int numTerms;
|
final int numTerms;
|
||||||
if (discountOverlaps)
|
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
|
||||||
|
numTerms = state.getUniqueTermCount();
|
||||||
|
} else if (discountOverlaps) {
|
||||||
numTerms = state.getLength() - state.getNumOverlap();
|
numTerms = state.getLength() - state.getNumOverlap();
|
||||||
else
|
} else {
|
||||||
numTerms = state.getLength();
|
numTerms = state.getLength();
|
||||||
|
}
|
||||||
return SmallFloat.intToByte4(numTerms);
|
return SmallFloat.intToByte4(numTerms);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.document.TextField;
|
import org.apache.lucene.document.TextField;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.FieldInvertState;
|
import org.apache.lucene.index.FieldInvertState;
|
||||||
|
import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.index.RandomIndexWriter;
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.search.BoostQuery;
|
import org.apache.lucene.search.BoostQuery;
|
||||||
|
@ -107,7 +108,9 @@ public class TestBooleanSimilarity extends BaseSimilarityTestCase {
|
||||||
final int length = TestUtil.nextInt(random(), 1, 100);
|
final int length = TestUtil.nextInt(random(), 1, 100);
|
||||||
final int position = random().nextInt(length);
|
final int position = random().nextInt(length);
|
||||||
final int numOverlaps = random().nextInt(length);
|
final int numOverlaps = random().nextInt(length);
|
||||||
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
|
final int maxTermFrequency = 1;
|
||||||
|
final int uniqueTermCount = 1;
|
||||||
|
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS, position, length, numOverlaps, 100, maxTermFrequency, uniqueTermCount);
|
||||||
assertEquals(
|
assertEquals(
|
||||||
sim2.computeNorm(state),
|
sim2.computeNorm(state),
|
||||||
sim1.computeNorm(state),
|
sim1.computeNorm(state),
|
||||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.document.Field.Store;
|
||||||
import org.apache.lucene.document.StringField;
|
import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.FieldInvertState;
|
import org.apache.lucene.index.FieldInvertState;
|
||||||
|
import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
@ -176,7 +177,9 @@ public class TestClassicSimilarity extends BaseSimilarityTestCase {
|
||||||
final int length = TestUtil.nextInt(random(), 1, 1000);
|
final int length = TestUtil.nextInt(random(), 1, 1000);
|
||||||
final int position = random().nextInt(length);
|
final int position = random().nextInt(length);
|
||||||
final int numOverlaps = random().nextInt(length);
|
final int numOverlaps = random().nextInt(length);
|
||||||
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
|
final int maxTermFrequency = 1;
|
||||||
|
final int uniqueTermCount = 1;
|
||||||
|
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS, position, length, numOverlaps, 100, maxTermFrequency, uniqueTermCount);
|
||||||
assertEquals(
|
assertEquals(
|
||||||
sim2.computeNorm(state),
|
sim2.computeNorm(state),
|
||||||
sim1.computeNorm(state),
|
sim1.computeNorm(state),
|
||||||
|
|
|
@ -551,7 +551,7 @@ public class TestSimilarityBase extends LuceneTestCase {
|
||||||
SimilarityBase actual = new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
|
SimilarityBase actual = new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
|
||||||
expected.setDiscountOverlaps(false);
|
expected.setDiscountOverlaps(false);
|
||||||
actual.setDiscountOverlaps(false);
|
actual.setDiscountOverlaps(false);
|
||||||
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo");
|
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS);
|
||||||
state.setLength(5);
|
state.setLength(5);
|
||||||
state.setNumOverlap(2);
|
state.setNumOverlap(2);
|
||||||
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
|
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
|
||||||
|
|
|
@ -598,6 +598,7 @@ public class MemoryIndex {
|
||||||
info.sliceArray.start[ord] = postingsWriter.startNewSlice();
|
info.sliceArray.start[ord] = postingsWriter.startNewSlice();
|
||||||
}
|
}
|
||||||
info.sliceArray.freq[ord]++;
|
info.sliceArray.freq[ord]++;
|
||||||
|
info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
|
||||||
info.sumTotalTermFreq++;
|
info.sumTotalTermFreq++;
|
||||||
postingsWriter.writeInt(pos);
|
postingsWriter.writeInt(pos);
|
||||||
if (storeOffsets) {
|
if (storeOffsets) {
|
||||||
|
@ -808,6 +809,8 @@ public class MemoryIndex {
|
||||||
|
|
||||||
private long sumTotalTermFreq;
|
private long sumTotalTermFreq;
|
||||||
|
|
||||||
|
private int maxTermFrequency;
|
||||||
|
|
||||||
/** the last position encountered in this field for multi field support*/
|
/** the last position encountered in this field for multi field support*/
|
||||||
private int lastPosition;
|
private int lastPosition;
|
||||||
|
|
||||||
|
@ -901,8 +904,8 @@ public class MemoryIndex {
|
||||||
|
|
||||||
NumericDocValues getNormDocValues() {
|
NumericDocValues getNormDocValues() {
|
||||||
if (norm == null) {
|
if (norm == null) {
|
||||||
FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.number,
|
FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.getIndexOptions(), lastPosition,
|
||||||
numTokens, numOverlapTokens, 0);
|
numTokens, numOverlapTokens, 0, maxTermFrequency, terms.size());
|
||||||
final long value = normSimilarity.computeNorm(invertState);
|
final long value = normSimilarity.computeNorm(invertState);
|
||||||
if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldInfo.name + ":" + value + ":" + numTokens);
|
if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldInfo.name + ":" + value + ":" + numTokens);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue