LUCENE-8031: DOCS_ONLY fields set incorrect length norm

This commit is contained in:
Robert Muir 2018-02-24 11:21:53 -05:00
parent 4e198a2737
commit 29e5b8abce
10 changed files with 53 additions and 14 deletions

View File

@ -58,6 +58,9 @@ Changes in Runtime Behavior
* LUCENE-8134: Index options are no longer automatically downgraded. * LUCENE-8134: Index options are no longer automatically downgraded.
(Adrien Grand) (Adrien Grand)
* LUCENE-8031: Length normalization correctly reflects omission of term frequencies.
(Robert Muir, Adrien Grand)
Improvements Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities. * LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.

View File

@ -684,7 +684,7 @@ final class DefaultIndexingChain extends DocConsumer {
} }
void setInvertState() { void setInvertState() {
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name); invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name, fieldInfo.getIndexOptions());
termsHashPerField = termsHash.addField(invertState, fieldInfo); termsHashPerField = termsHash.addField(invertState, fieldInfo);
if (fieldInfo.omitsNorms() == false) { if (fieldInfo.omitsNorms() == false) {
assert norms == null; assert norms == null;

View File

@ -34,6 +34,7 @@ import org.apache.lucene.util.AttributeSource;
public final class FieldInvertState { public final class FieldInvertState {
final int indexCreatedVersionMajor; final int indexCreatedVersionMajor;
final String name; final String name;
final IndexOptions indexOptions;
int position; int position;
int length; int length;
int numOverlap; int numOverlap;
@ -53,19 +54,22 @@ public final class FieldInvertState {
/** Creates {@code FieldInvertState} for the specified /** Creates {@code FieldInvertState} for the specified
* field name. */ * field name. */
public FieldInvertState(int indexCreatedVersionMajor, String name) { public FieldInvertState(int indexCreatedVersionMajor, String name, IndexOptions indexOptions) {
this.indexCreatedVersionMajor = indexCreatedVersionMajor; this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.name = name; this.name = name;
this.indexOptions = indexOptions;
} }
/** Creates {@code FieldInvertState} for the specified /** Creates {@code FieldInvertState} for the specified
* field name and values for all fields. */ * field name and values for all fields. */
public FieldInvertState(int indexCreatedVersionMajor, String name, int position, int length, int numOverlap, int offset) { public FieldInvertState(int indexCreatedVersionMajor, String name, IndexOptions indexOptions, int position, int length, int numOverlap, int offset, int maxTermFrequency, int uniqueTermCount) {
this(indexCreatedVersionMajor, name); this(indexCreatedVersionMajor, name, indexOptions);
this.position = position; this.position = position;
this.length = length; this.length = length;
this.numOverlap = numOverlap; this.numOverlap = numOverlap;
this.offset = offset; this.offset = offset;
this.maxTermFrequency = maxTermFrequency;
this.uniqueTermCount = uniqueTermCount;
} }
/** /**
@ -176,4 +180,11 @@ public final class FieldInvertState {
public int getIndexCreatedVersionMajor() { public int getIndexCreatedVersionMajor() {
return indexCreatedVersionMajor; return indexCreatedVersionMajor;
} }
/**
* Returns the index options for this field.
*/
public IndexOptions getIndexOptions() {
return indexOptions;
}
} }

View File

@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.TermStatistics;
@ -113,7 +114,14 @@ public class BM25Similarity extends Similarity {
@Override @Override
public final long computeNorm(FieldInvertState state) { public final long computeNorm(FieldInvertState state) {
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength(); final int numTerms;
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap();
} else {
numTerms = state.getLength();
}
return SmallFloat.intToByte4(numTerms); return SmallFloat.intToByte4(numTerms);
} }

View File

@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.TermStatistics;
@ -185,10 +186,13 @@ public abstract class SimilarityBase extends Similarity {
@Override @Override
public final long computeNorm(FieldInvertState state) { public final long computeNorm(FieldInvertState state) {
final int numTerms; final int numTerms;
if (discountOverlaps) if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap(); numTerms = state.getLength() - state.getNumOverlap();
else } else {
numTerms = state.getLength(); numTerms = state.getLength();
}
return SmallFloat.intToByte4(numTerms); return SmallFloat.intToByte4(numTerms);
} }

View File

@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
@ -500,10 +501,13 @@ public abstract class TFIDFSimilarity extends Similarity {
@Override @Override
public final long computeNorm(FieldInvertState state) { public final long computeNorm(FieldInvertState state) {
final int numTerms; final int numTerms;
if (discountOverlaps) if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap(); numTerms = state.getLength() - state.getNumOverlap();
else } else {
numTerms = state.getLength(); numTerms = state.getLength();
}
return SmallFloat.intToByte4(numTerms); return SmallFloat.intToByte4(numTerms);
} }

View File

@ -25,6 +25,7 @@ import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField; import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.BoostQuery;
@ -107,7 +108,9 @@ public class TestBooleanSimilarity extends BaseSimilarityTestCase {
final int length = TestUtil.nextInt(random(), 1, 100); final int length = TestUtil.nextInt(random(), 1, 100);
final int position = random().nextInt(length); final int position = random().nextInt(length);
final int numOverlaps = random().nextInt(length); final int numOverlaps = random().nextInt(length);
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100); final int maxTermFrequency = 1;
final int uniqueTermCount = 1;
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS, position, length, numOverlaps, 100, maxTermFrequency, uniqueTermCount);
assertEquals( assertEquals(
sim2.computeNorm(state), sim2.computeNorm(state),
sim1.computeNorm(state), sim1.computeNorm(state),

View File

@ -26,6 +26,7 @@ import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField; import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
@ -176,7 +177,9 @@ public class TestClassicSimilarity extends BaseSimilarityTestCase {
final int length = TestUtil.nextInt(random(), 1, 1000); final int length = TestUtil.nextInt(random(), 1, 1000);
final int position = random().nextInt(length); final int position = random().nextInt(length);
final int numOverlaps = random().nextInt(length); final int numOverlaps = random().nextInt(length);
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100); final int maxTermFrequency = 1;
final int uniqueTermCount = 1;
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS, position, length, numOverlaps, 100, maxTermFrequency, uniqueTermCount);
assertEquals( assertEquals(
sim2.computeNorm(state), sim2.computeNorm(state),
sim1.computeNorm(state), sim1.computeNorm(state),

View File

@ -551,7 +551,7 @@ public class TestSimilarityBase extends LuceneTestCase {
SimilarityBase actual = new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2()); SimilarityBase actual = new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
expected.setDiscountOverlaps(false); expected.setDiscountOverlaps(false);
actual.setDiscountOverlaps(false); actual.setDiscountOverlaps(false);
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo"); FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS);
state.setLength(5); state.setLength(5);
state.setNumOverlap(2); state.setNumOverlap(2);
assertEquals(expected.computeNorm(state), actual.computeNorm(state)); assertEquals(expected.computeNorm(state), actual.computeNorm(state));

View File

@ -598,6 +598,7 @@ public class MemoryIndex {
info.sliceArray.start[ord] = postingsWriter.startNewSlice(); info.sliceArray.start[ord] = postingsWriter.startNewSlice();
} }
info.sliceArray.freq[ord]++; info.sliceArray.freq[ord]++;
info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
info.sumTotalTermFreq++; info.sumTotalTermFreq++;
postingsWriter.writeInt(pos); postingsWriter.writeInt(pos);
if (storeOffsets) { if (storeOffsets) {
@ -808,6 +809,8 @@ public class MemoryIndex {
private long sumTotalTermFreq; private long sumTotalTermFreq;
private int maxTermFrequency;
/** The last position encountered in this field, for multi-field support. */ /** The last position encountered in this field, for multi-field support. */
private int lastPosition; private int lastPosition;
@ -901,8 +904,8 @@ public class MemoryIndex {
NumericDocValues getNormDocValues() { NumericDocValues getNormDocValues() {
if (norm == null) { if (norm == null) {
FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.number, FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.getIndexOptions(), lastPosition,
numTokens, numOverlapTokens, 0); numTokens, numOverlapTokens, 0, maxTermFrequency, terms.size());
final long value = normSimilarity.computeNorm(invertState); final long value = normSimilarity.computeNorm(invertState);
if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldInfo.name + ":" + value + ":" + numTokens); if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldInfo.name + ":" + value + ":" + numTokens);