LUCENE-8031: DOCS_ONLY fields set incorrect length norm

This commit is contained in:
Robert Muir 2018-02-24 11:21:53 -05:00
parent 4e198a2737
commit 29e5b8abce
10 changed files with 53 additions and 14 deletions

View File

@ -58,6 +58,9 @@ Changes in Runtime Behavior
* LUCENE-8134: Index options are no longer automatically downgraded.
(Adrien Grand)
* LUCENE-8031: Length normalization correctly reflects omission of term frequencies.
(Robert Muir, Adrien Grand)
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.

View File

@ -684,7 +684,7 @@ final class DefaultIndexingChain extends DocConsumer {
}
void setInvertState() {
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name);
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name, fieldInfo.getIndexOptions());
termsHashPerField = termsHash.addField(invertState, fieldInfo);
if (fieldInfo.omitsNorms() == false) {
assert norms == null;

View File

@ -34,6 +34,7 @@ import org.apache.lucene.util.AttributeSource;
public final class FieldInvertState {
final int indexCreatedVersionMajor;
final String name;
final IndexOptions indexOptions;
int position;
int length;
int numOverlap;
@ -53,19 +54,22 @@ public final class FieldInvertState {
/** Creates {@code FieldInvertState} for the specified
* field name. */
public FieldInvertState(int indexCreatedVersionMajor, String name) {
public FieldInvertState(int indexCreatedVersionMajor, String name, IndexOptions indexOptions) {
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.name = name;
this.indexOptions = indexOptions;
}
/** Creates {@code FieldInvertState} for the specified
* field name and values for all fields. */
public FieldInvertState(int indexCreatedVersionMajor, String name, int position, int length, int numOverlap, int offset) {
this(indexCreatedVersionMajor, name);
public FieldInvertState(int indexCreatedVersionMajor, String name, IndexOptions indexOptions, int position, int length, int numOverlap, int offset, int maxTermFrequency, int uniqueTermCount) {
this(indexCreatedVersionMajor, name, indexOptions);
this.position = position;
this.length = length;
this.numOverlap = numOverlap;
this.offset = offset;
this.maxTermFrequency = maxTermFrequency;
this.uniqueTermCount = uniqueTermCount;
}
/**
@ -176,4 +180,11 @@ public final class FieldInvertState {
public int getIndexCreatedVersionMajor() {
return indexCreatedVersionMajor;
}
/**
* Returns the {@link IndexOptions} that were supplied when this
* {@code FieldInvertState} was constructed.
*
* @return the index options for this field
*/
public IndexOptions getIndexOptions() {
return indexOptions;
}
}

View File

@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -113,7 +114,14 @@ public class BM25Similarity extends Similarity {
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
final int numTerms;
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap();
} else {
numTerms = state.getLength();
}
return SmallFloat.intToByte4(numTerms);
}

View File

@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -185,10 +186,13 @@ public abstract class SimilarityBase extends Similarity {
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms;
if (discountOverlaps)
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap();
else
} else {
numTerms = state.getLength();
}
return SmallFloat.intToByte4(numTerms);
}

View File

@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
@ -500,10 +501,13 @@ public abstract class TFIDFSimilarity extends Similarity {
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms;
if (discountOverlaps)
if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
numTerms = state.getUniqueTermCount();
} else if (discountOverlaps) {
numTerms = state.getLength() - state.getNumOverlap();
else
} else {
numTerms = state.getLength();
}
return SmallFloat.intToByte4(numTerms);
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostQuery;
@ -107,7 +108,9 @@ public class TestBooleanSimilarity extends BaseSimilarityTestCase {
final int length = TestUtil.nextInt(random(), 1, 100);
final int position = random().nextInt(length);
final int numOverlaps = random().nextInt(length);
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
final int maxTermFrequency = 1;
final int uniqueTermCount = 1;
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS, position, length, numOverlaps, 100, maxTermFrequency, uniqueTermCount);
assertEquals(
sim2.computeNorm(state),
sim1.computeNorm(state),

View File

@ -26,6 +26,7 @@ import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
@ -176,7 +177,9 @@ public class TestClassicSimilarity extends BaseSimilarityTestCase {
final int length = TestUtil.nextInt(random(), 1, 1000);
final int position = random().nextInt(length);
final int numOverlaps = random().nextInt(length);
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", position, length, numOverlaps, 100);
final int maxTermFrequency = 1;
final int uniqueTermCount = 1;
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS, position, length, numOverlaps, 100, maxTermFrequency, uniqueTermCount);
assertEquals(
sim2.computeNorm(state),
sim1.computeNorm(state),

View File

@ -551,7 +551,7 @@ public class TestSimilarityBase extends LuceneTestCase {
SimilarityBase actual = new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
expected.setDiscountOverlaps(false);
actual.setDiscountOverlaps(false);
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo");
FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS);
state.setLength(5);
state.setNumOverlap(2);
assertEquals(expected.computeNorm(state), actual.computeNorm(state));

View File

@ -598,6 +598,7 @@ public class MemoryIndex {
info.sliceArray.start[ord] = postingsWriter.startNewSlice();
}
info.sliceArray.freq[ord]++;
info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
info.sumTotalTermFreq++;
postingsWriter.writeInt(pos);
if (storeOffsets) {
@ -808,6 +809,8 @@ public class MemoryIndex {
private long sumTotalTermFreq;
private int maxTermFrequency;
/** the last position encountered in this field for multi field support*/
private int lastPosition;
@ -901,8 +904,8 @@ public class MemoryIndex {
NumericDocValues getNormDocValues() {
if (norm == null) {
FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.number,
numTokens, numOverlapTokens, 0);
FieldInvertState invertState = new FieldInvertState(Version.LATEST.major, fieldInfo.name, fieldInfo.getIndexOptions(), lastPosition,
numTokens, numOverlapTokens, 0, maxTermFrequency, terms.size());
final long value = normSimilarity.computeNorm(invertState);
if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldInfo.name + ":" + value + ":" + numTokens);