LUCENE-3687: Allow similarity to encode norms other than a single byte

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232014 13f79535-47bb-0310-9956-ffa450edef68
Simon Willnauer 2012-01-16 14:57:15 +00:00
parent 354a3be78f
commit 98e59fceee
59 changed files with 1474 additions and 340 deletions

View File

@ -622,6 +622,11 @@ New features
* LUCENE-3628: Norms are represented as DocValues. IndexReader exposes
a #normValues(String) method to obtain norms per field. (Simon Willnauer)
* LUCENE-3687: Similarity#computeNorm(FieldInvertState, Norm) allows computing
norm values of arbitrary precision. Instead of returning a fixed single-byte
value, custom similarities can now set an integer, float or byte value on the
given Norm object, as sketched below. (Simon Willnauer)
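
For illustration, a custom similarity can now record the length norm at full float precision instead of quantizing it into a byte. A minimal sketch, assuming a DefaultSimilarity subclass; the class name is hypothetical:

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Norm;
import org.apache.lucene.search.similarities.DefaultSimilarity;

// Hypothetical similarity that stores a FLOAT_32 norm, skipping the
// classic single-byte encodeNormValue() quantization step.
public class FloatNormSimilarity extends DefaultSimilarity {
  @Override
  public void computeNorm(FieldInvertState state, Norm norm) {
    final int numTerms = state.getLength() - state.getNumOverlap();
    norm.setFloat(state.getBoost() * (float) (1.0 / Math.sqrt(numTerms)));
  }
}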
Optimizations
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms

View File

@ -550,3 +550,7 @@ you can now do this:
IndexReader.openIfChanged (a static method), and now returns null
(instead of the old reader) if there are no changes to the index, to
prevent the common pitfall of accidentally closing the old reader.
* LUCENE-3687: Similarity#computeNorm() now takes a Norm object on which the computed
norm value is set, instead of returning a fixed single-byte value. Custom similarities
can now set integer, float and byte values if a single byte is not sufficient; see the
sketch below.
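
A minimal before/after sketch of the migration, assuming a TFIDFSimilarity subclass where encodeNormValue(float) is available (method bodies only, not a complete class):

// Before: the norm was returned as a single encoded byte.
@Override
public byte computeNorm(FieldInvertState state) {
  return encodeNormValue(state.getBoost() / (float) Math.sqrt(state.getLength()));
}

// After: the value is set on the passed-in Norm object instead.
@Override
public void computeNorm(FieldInvertState state, Norm norm) {
  norm.setByte(encodeNormValue(state.getBoost() / (float) Math.sqrt(state.getLength())));
}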

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
@ -48,7 +49,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.memory.MemoryIndexNormDocValues.SingleByteSource;
import org.apache.lucene.index.memory.MemoryIndexNormDocValues.SingleValueSource;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@ -1157,8 +1158,9 @@ public class MemoryIndex {
int numOverlapTokens = info != null ? info.numOverlapTokens : 0;
float boost = info != null ? info.getBoost() : 1.0f;
FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost);
byte norm = fieldSim.computeNorm(invertState);
SingleByteSource singleByteSource = new SingleByteSource(new byte[] {norm});
Norm norm = new Norm();
fieldSim.computeNorm(invertState, norm);
SingleValueSource singleByteSource = new SingleValueSource(norm);
norms = new MemoryIndexNormDocValues(singleByteSource);
// cache it for future reuse
cachedNormValues = norms;

View File

@ -17,6 +17,7 @@ package org.apache.lucene.index.memory;
*/
import java.io.IOException;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.util.BytesRef;
@ -51,21 +52,56 @@ class MemoryIndexNormDocValues extends DocValues {
return 1;
}
public static class SingleByteSource extends Source {
public static class SingleValueSource extends Source {
private final byte[] bytes;
private final Number numericValue;
private final BytesRef binaryValue;
protected SingleByteSource(byte[] bytes) {
super(Type.BYTES_FIXED_STRAIGHT);
this.bytes = bytes;
protected SingleValueSource(Norm norm) {
super(norm.type());
this.numericValue = norm.field().numericValue();
this.binaryValue = norm.field().binaryValue();
}
@Override
public long getInt(int docID) {
switch (type) {
case FIXED_INTS_16:
case FIXED_INTS_32:
case FIXED_INTS_64:
case FIXED_INTS_8:
case VAR_INTS:
assert numericValue != null;
return numericValue.longValue();
}
return super.getInt(docID);
}
@Override
public double getFloat(int docID) {
switch (type) {
case FLOAT_32:
case FLOAT_64:
assert numericValue != null;
return numericValue.floatValue();
}
return super.getFloat(docID);
}
@Override
public BytesRef getBytes(int docID, BytesRef ref) {
ref.bytes = bytes;
ref.offset = docID;
ref.length = 1;
return ref;
switch (type) {
case BYTES_FIXED_DEREF:
case BYTES_FIXED_SORTED:
case BYTES_FIXED_STRAIGHT:
case BYTES_VAR_DEREF:
case BYTES_VAR_SORTED:
case BYTES_VAR_STRAIGHT:
assert binaryValue != null;
ref.copyBytes(binaryValue);
return ref;
}
return super.getBytes(docID, ref);
}
@Override
@ -75,9 +111,33 @@ class MemoryIndexNormDocValues extends DocValues {
@Override
public Object getArray() {
return bytes;
switch (type) {
case BYTES_FIXED_DEREF:
case BYTES_FIXED_SORTED:
case BYTES_FIXED_STRAIGHT:
case BYTES_VAR_DEREF:
case BYTES_VAR_SORTED:
case BYTES_VAR_STRAIGHT:
return binaryValue.bytes;
case FIXED_INTS_16:
return new short[] { numericValue.shortValue() };
case FIXED_INTS_32:
return new int[] { numericValue.intValue() };
case FIXED_INTS_64:
return new long[] { numericValue.longValue() };
case FIXED_INTS_8:
return new byte[] { numericValue.byteValue() };
case VAR_INTS:
return new long[] { numericValue.longValue() };
case FLOAT_32:
return new float[] { numericValue.floatValue() };
case FLOAT_64:
return new double[] { numericValue.doubleValue() };
default:
throw new IllegalArgumentException("unknown type " + type);
}
}
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.misc;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Norm;
/**
* A similarity with a lengthNorm that provides for a "plateau" of
@ -106,7 +107,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
* discountOverlaps is true by default or true for this
* specific field. */
@Override
public byte computeNorm(FieldInvertState state) {
public void computeNorm(FieldInvertState state, Norm norm) {
final int numTokens;
if (discountOverlaps)
@ -114,7 +115,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
else
numTokens = state.getLength();
return encodeNormValue(state.getBoost() * computeLengthNorm(numTokens));
norm.setByte(encodeNormValue(state.getBoost() * computeLengthNorm(numTokens)));
}
/**

View File

@ -24,12 +24,24 @@ import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.FieldInvertState;
/**
* Test of the SweetSpotSimilarity
*/
public class SweetSpotSimilarityTest extends LuceneTestCase {
public static float computeAndDecodeNorm(SweetSpotSimilarity decode, Similarity encode, FieldInvertState state) {
return decode.decodeNormValue(computeAndGetNorm(encode, state));
}
public static byte computeAndGetNorm(Similarity s, FieldInvertState state) {
Norm norm = new Norm();
s.computeNorm(state, norm);
return norm.field().numericValue().byteValue();
}
public void testSweetSpotComputeNorm() {
@ -45,9 +57,13 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setBoost(1.0f);
for (int i = 1; i < 1000; i++) {
invertState.setLength(i);
Norm lNorm = new Norm();
Norm rNorm = new Norm();
d.computeNorm(invertState, lNorm);
s.computeNorm(invertState, rNorm);
assertEquals("base case: i="+i,
d.computeNorm(invertState),
s.computeNorm(invertState),
computeAndGetNorm(d, invertState),
computeAndGetNorm(s, invertState),
0.0f);
}
@ -59,15 +75,15 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i);
assertEquals("3,10: spot i="+i,
1.0f,
ss.decodeNormValue(s.computeNorm(invertState)),
computeAndDecodeNorm(ss, ss, invertState),
0.0f);
}
for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9);
final byte normD = d.computeNorm(invertState);
final byte normD = computeAndGetNorm(d, invertState);
invertState.setLength(i);
final byte normS = s.computeNorm(invertState);
final byte normS = computeAndGetNorm(s, invertState);
assertEquals("3,10: 10<x : i="+i,
normD,
normS,
@ -105,14 +121,14 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i);
assertEquals("f: 3,10: spot i="+i,
1.0f,
ss.decodeNormValue(sp.get("foo").computeNorm(invertState)),
computeAndDecodeNorm(ss, sp.get("foo"), invertState),
0.0f);
}
for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9);
final byte normD = d.computeNorm(invertState);
final byte normD = computeAndGetNorm(d, invertState);
invertState.setLength(i);
final byte normS = sp.get("foo").computeNorm(invertState);
final byte normS = computeAndGetNorm(sp.get("foo"), invertState);
assertEquals("f: 3,10: 10<x : i="+i,
normD,
normS,
@ -122,21 +138,21 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i);
assertEquals("f: 8,13: spot i="+i,
1.0f,
ss.decodeNormValue(sp.get("bar").computeNorm(invertState)),
computeAndDecodeNorm(ss, sp.get("bar"), invertState),
0.0f);
}
for (int i = 6; i <=9; i++) {
invertState.setLength(i);
assertEquals("f: 6,9: spot i="+i,
1.0f,
ss.decodeNormValue(sp.get("yak").computeNorm(invertState)),
computeAndDecodeNorm(ss, sp.get("yak"), invertState),
0.0f);
}
for (int i = 13; i < 1000; i++) {
invertState.setLength(i-12);
final byte normD = d.computeNorm(invertState);
final byte normD = computeAndGetNorm(d, invertState);
invertState.setLength(i);
final byte normS = sp.get("bar").computeNorm(invertState);
final byte normS = computeAndGetNorm(sp.get("bar"), invertState);
assertEquals("f: 8,13: 13<x : i="+i,
normD,
normS,
@ -144,9 +160,9 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
}
for (int i = 9; i < 1000; i++) {
invertState.setLength(i-8);
final byte normD = d.computeNorm(invertState);
final byte normD = computeAndGetNorm(d, invertState);
invertState.setLength(i);
final byte normS = sp.get("yak").computeNorm(invertState);
final byte normS = computeAndGetNorm(sp.get("yak"), invertState);
assertEquals("f: 6,9: 9<x : i="+i,
normD,
normS,
@ -158,8 +174,8 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
for (int i = 9; i < 1000; i++) {
invertState.setLength(i);
final byte normSS = sp.get("a").computeNorm(invertState);
final byte normS = sp.get("b").computeNorm(invertState);
final byte normSS = computeAndGetNorm(sp.get("a"), invertState);
final byte normS = computeAndGetNorm(sp.get("b"), invertState);
assertTrue("s: i="+i+" : a="+normSS+
" < b="+normS,
normSS < normS);

View File

@ -115,23 +115,13 @@ public abstract class DocValuesConsumer {
final Field scratchField;
switch(type) {
case VAR_INTS:
scratchField = new DocValuesField("", (long) 0, type);
break;
case FIXED_INTS_16:
scratchField = new DocValuesField("", (short) 0, type);
break;
case FIXED_INTS_32:
scratchField = new DocValuesField("", 0, type);
break;
case FIXED_INTS_64:
scratchField = new DocValuesField("", (long) 0, type);
break;
case FIXED_INTS_8:
scratchField = new DocValuesField("", (byte) 0, type);
scratchField = new DocValuesField("", (long) 0, type);
break;
case FLOAT_32:
scratchField = new DocValuesField("", (float) 0, type);
break;
case FLOAT_64:
scratchField = new DocValuesField("", (double) 0, type);
break;

View File

@ -52,7 +52,10 @@ public abstract class PerDocConsumer implements Closeable {
for (int i = 0; i < docValues.length; i++) {
docValues[i] = getDocValuesForMerge(mergeState.readers.get(i).reader, fieldInfo);
}
final DocValuesConsumer docValuesConsumer = addValuesField(getDocValuesType(fieldInfo), fieldInfo);
Type docValuesType = getDocValuesType(fieldInfo);
assert docValuesType != null;
final DocValuesConsumer docValuesConsumer = addValuesField(docValuesType, fieldInfo);
assert docValuesConsumer != null;
docValuesConsumer.merge(mergeState, docValues);
}

View File

@ -30,7 +30,6 @@ import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfosFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosFormat;
import org.apache.lucene.codecs.lucene40.Lucene40SegmentInfosFormat;
import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat;
@ -56,7 +55,7 @@ public class Lucene3xCodec extends Codec {
private final TermVectorsFormat vectorsFormat = new Lucene40TermVectorsFormat();
// TODO: this should really be a different impl
private final FieldInfosFormat fieldInfosFormat = new Lucene40FieldInfosFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene3xFieldInfosFormat();
// TODO: this should really be a different impl
// also if we want preflex to *really* be read-only it should throw exception for the writer?

View File

@ -0,0 +1,49 @@
package org.apache.lucene.codecs.lucene3x;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldInfosReader;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
/**
* @lucene.experimental
*/
public class Lucene3xFieldInfosFormat extends FieldInfosFormat {
private final FieldInfosReader reader = new Lucene3xFieldInfosReader();
@Override
public FieldInfosReader getFieldInfosReader() throws IOException {
return reader;
}
@Override
public FieldInfosWriter getFieldInfosWriter() throws IOException {
throw new IllegalArgumentException("this codec can only be used for reading");
}
@Override
public void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
Lucene3xFieldInfosReader.files(dir, info, files);
}
}

View File

@ -0,0 +1,123 @@
package org.apache.lucene.codecs.lucene3x;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.codecs.FieldInfosReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
/**
* @lucene.experimental
*/
public class Lucene3xFieldInfosReader extends FieldInfosReader {
/** Extension of field infos */
static final String FIELD_INFOS_EXTENSION = "fnm";
// First used in 2.9; prior to 2.9 there was no format header
static final int FORMAT_START = -2;
// First used in 3.4: omit only positional information
static final int FORMAT_OMIT_POSITIONS = -3;
static final int FORMAT_MINIMUM = FORMAT_START;
static final int FORMAT_CURRENT = FORMAT_OMIT_POSITIONS;
static final byte IS_INDEXED = 0x1;
static final byte STORE_TERMVECTOR = 0x2;
static final byte OMIT_NORMS = 0x10;
static final byte STORE_PAYLOADS = 0x20;
static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40;
static final byte OMIT_POSITIONS = -128;
@Override
public FieldInfos read(Directory directory, String segmentName, IOContext iocontext) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, "", FIELD_INFOS_EXTENSION);
IndexInput input = directory.openInput(fileName, iocontext);
boolean hasVectors = false;
boolean hasFreq = false;
boolean hasProx = false;
try {
final int format = input.readVInt();
if (format > FORMAT_MINIMUM) {
throw new IndexFormatTooOldException(input, format, FORMAT_MINIMUM, FORMAT_CURRENT);
}
if (format < FORMAT_CURRENT) {
throw new IndexFormatTooNewException(input, format, FORMAT_MINIMUM, FORMAT_CURRENT);
}
final int size = input.readVInt(); //read in the size
FieldInfo infos[] = new FieldInfo[size];
for (int i = 0; i < size; i++) {
String name = input.readString();
final int fieldNumber = i;
byte bits = input.readByte();
boolean isIndexed = (bits & IS_INDEXED) != 0;
boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
boolean omitNorms = (bits & OMIT_NORMS) != 0;
boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
final IndexOptions indexOptions;
if ((bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0) {
indexOptions = IndexOptions.DOCS_ONLY;
} else if ((bits & OMIT_POSITIONS) != 0) {
if (format <= FORMAT_OMIT_POSITIONS) {
indexOptions = IndexOptions.DOCS_AND_FREQS;
} else {
throw new CorruptIndexException("Corrupt fieldinfos, OMIT_POSITIONS set but format=" + format + " (resource: " + input + ")");
}
} else {
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
}
// LUCENE-3027: past indices were able to write
// storePayloads=true when omitTFAP is also true,
// which is invalid. We correct that here:
if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
storePayloads = false;
}
hasVectors |= storeTermVector;
hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
infos[i] = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector,
omitNorms, storePayloads, indexOptions, null, isIndexed && !omitNorms? Type.BYTES_VAR_STRAIGHT : null);
}
if (input.getFilePointer() != input.length()) {
throw new CorruptIndexException("did not read all bytes from file \"" + fileName + "\": read " + input.getFilePointer() + " vs size " + input.length() + " (resource: " + input + ")");
}
return new FieldInfos(infos, hasFreq, hasProx, hasVectors);
} finally {
input.close();
}
}
public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
files.add(IndexFileNames.segmentFileName(info.name, "", FIELD_INFOS_EXTENSION));
}
}

View File

@ -75,7 +75,7 @@ class Lucene3xNormsProducer extends PerDocProducer {
try {
long nextNormSeek = NORMS_HEADER.length; //skip header (header unused for now)
for (FieldInfo fi : fields) {
if (fi.isIndexed && !fi.omitNorms) {
if (fi.normsPresent()) {
String fileName = getNormFilename(segmentName, normGen, fi.number);
Directory d = hasSeparateNorms(normGen, fi.number) ? separateNormsDir : dir;
@ -161,7 +161,7 @@ class Lucene3xNormsProducer extends PerDocProducer {
static final class NormSource extends Source {
protected NormSource(byte[] bytes) {
super(Type.BYTES_FIXED_STRAIGHT);
super(Type.FIXED_INTS_8);
this.bytes = bytes;
}
@ -175,6 +175,11 @@ class Lucene3xNormsProducer extends PerDocProducer {
return ref;
}
@Override
public long getInt(int docID) {
return bytes[docID];
}
@Override
public boolean hasArray() {
return true;
@ -192,6 +197,7 @@ class Lucene3xNormsProducer extends PerDocProducer {
// like first FI that has norms but doesn't have separate norms?
final String normsFileName = IndexFileNames.segmentFileName(info.name, "", NORMS_EXTENSION);
if (dir.fileExists(normsFileName)) {
// only needed to do this in 3x - 4x can decide if the norms are present
files.add(normsFileName);
}
}
@ -231,7 +237,7 @@ class Lucene3xNormsProducer extends PerDocProducer {
@Override
public Type type() {
return Type.BYTES_FIXED_STRAIGHT;
return Type.FIXED_INTS_8;
}
byte[] bytes() throws IOException {

View File

@ -65,7 +65,7 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
for (int i = 0; i < size; i++) {
String name = input.readString();
final int fieldNumber = format <= Lucene40FieldInfosWriter.FORMAT_FLEX? input.readInt():i;
final int fieldNumber = input.readInt();
byte bits = input.readByte();
boolean isIndexed = (bits & Lucene40FieldInfosWriter.IS_INDEXED) != 0;
boolean storeTermVector = (bits & Lucene40FieldInfosWriter.STORE_TERMVECTOR) != 0;
@ -75,12 +75,8 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
if ((bits & Lucene40FieldInfosWriter.OMIT_TERM_FREQ_AND_POSITIONS) != 0) {
indexOptions = IndexOptions.DOCS_ONLY;
} else if ((bits & Lucene40FieldInfosWriter.OMIT_POSITIONS) != 0) {
if (format <= Lucene40FieldInfosWriter.FORMAT_OMIT_POSITIONS) {
indexOptions = IndexOptions.DOCS_AND_FREQS;
} else {
throw new CorruptIndexException("Corrupt fieldinfos, OMIT_POSITIONS set but format=" + format + " (resource: " + input + ")");
}
} else if (format <= Lucene40FieldInfosWriter.FORMAT_FLEX && (bits & Lucene40FieldInfosWriter.STORE_OFFSETS_IN_POSTINGS) != 0) {
indexOptions = IndexOptions.DOCS_AND_FREQS;
} else if ((bits & Lucene40FieldInfosWriter.STORE_OFFSETS_IN_POSTINGS) != 0) {
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
} else {
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
@ -95,59 +91,12 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
hasVectors |= storeTermVector;
hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
DocValues.Type docValuesType = null;
if (format <= Lucene40FieldInfosWriter.FORMAT_FLEX) {
final byte b = input.readByte();
switch(b) {
case 0:
docValuesType = null;
break;
case 1:
docValuesType = DocValues.Type.VAR_INTS;
break;
case 2:
docValuesType = DocValues.Type.FLOAT_32;
break;
case 3:
docValuesType = DocValues.Type.FLOAT_64;
break;
case 4:
docValuesType = DocValues.Type.BYTES_FIXED_STRAIGHT;
break;
case 5:
docValuesType = DocValues.Type.BYTES_FIXED_DEREF;
break;
case 6:
docValuesType = DocValues.Type.BYTES_VAR_STRAIGHT;
break;
case 7:
docValuesType = DocValues.Type.BYTES_VAR_DEREF;
break;
case 8:
docValuesType = DocValues.Type.FIXED_INTS_16;
break;
case 9:
docValuesType = DocValues.Type.FIXED_INTS_32;
break;
case 10:
docValuesType = DocValues.Type.FIXED_INTS_64;
break;
case 11:
docValuesType = DocValues.Type.FIXED_INTS_8;
break;
case 12:
docValuesType = DocValues.Type.BYTES_FIXED_SORTED;
break;
case 13:
docValuesType = DocValues.Type.BYTES_VAR_SORTED;
break;
default:
throw new IllegalStateException("unhandled indexValues type " + b);
}
}
// DV Types are packed in one byte
byte val = input.readByte();
final DocValues.Type docValuesType = getDocValuesType((byte) (val & 0x0F));
final DocValues.Type normsType = getDocValuesType((byte) ((val >>> 4) & 0x0F));
infos[i] = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector,
omitNorms, storePayloads, indexOptions, docValuesType);
omitNorms, storePayloads, indexOptions, docValuesType, normsType);
}
if (input.getFilePointer() != input.length()) {
@ -159,6 +108,42 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
input.close();
}
}
public DocValues.Type getDocValuesType(final byte b) {
switch(b) {
case 0:
return null;
case 1:
return DocValues.Type.VAR_INTS;
case 2:
return DocValues.Type.FLOAT_32;
case 3:
return DocValues.Type.FLOAT_64;
case 4:
return DocValues.Type.BYTES_FIXED_STRAIGHT;
case 5:
return DocValues.Type.BYTES_FIXED_DEREF;
case 6:
return DocValues.Type.BYTES_VAR_STRAIGHT;
case 7:
return DocValues.Type.BYTES_VAR_DEREF;
case 8:
return DocValues.Type.FIXED_INTS_16;
case 9:
return DocValues.Type.FIXED_INTS_32;
case 10:
return DocValues.Type.FIXED_INTS_64;
case 11:
return DocValues.Type.FIXED_INTS_8;
case 12:
return DocValues.Type.BYTES_FIXED_SORTED;
case 13:
return DocValues.Type.BYTES_VAR_SORTED;
default:
throw new IllegalStateException("unhandled indexValues type " + b);
}
}
public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40FieldInfosWriter.FIELD_INFOS_EXTENSION));

View File

@ -19,6 +19,7 @@ package org.apache.lucene.codecs.lucene40;
import java.io.IOException;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
@ -35,15 +36,11 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
/** Extension of field infos */
static final String FIELD_INFOS_EXTENSION = "fnm";
// First used in 2.9; prior to 2.9 there was no format header
static final int FORMAT_START = -2;
// First used in 3.4: omit only positional information
static final int FORMAT_OMIT_POSITIONS = -3;
// per-field codec support, records index values for fields
static final int FORMAT_FLEX = -4;
static final int FORMAT_START = -4;
// whenever you add a new format, make it 1 smaller (negative version logic)!
static final int FORMAT_CURRENT = FORMAT_FLEX;
static final int FORMAT_CURRENT = FORMAT_START;
static final byte IS_INDEXED = 0x1;
static final byte STORE_TERMVECTOR = 0x2;
@ -78,60 +75,53 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
output.writeInt(fi.number);
output.writeByte(bits);
final byte b;
if (!fi.hasDocValues()) {
b = 0;
} else {
switch(fi.getDocValuesType()) {
case VAR_INTS:
b = 1;
break;
case FLOAT_32:
b = 2;
break;
case FLOAT_64:
b = 3;
break;
case BYTES_FIXED_STRAIGHT:
b = 4;
break;
case BYTES_FIXED_DEREF:
b = 5;
break;
case BYTES_VAR_STRAIGHT:
b = 6;
break;
case BYTES_VAR_DEREF:
b = 7;
break;
case FIXED_INTS_16:
b = 8;
break;
case FIXED_INTS_32:
b = 9;
break;
case FIXED_INTS_64:
b = 10;
break;
case FIXED_INTS_8:
b = 11;
break;
case BYTES_FIXED_SORTED:
b = 12;
break;
case BYTES_VAR_SORTED:
b = 13;
break;
default:
throw new IllegalStateException("unhandled indexValues type " + fi.getDocValuesType());
}
}
output.writeByte(b);
// pack the DV types in one byte
final byte dv = docValuesByte(fi.getDocValuesType());
final byte nrm = docValuesByte(fi.getNormType());
assert (dv & (~0xF)) == 0 && (nrm & (~0x0F)) == 0;
byte val = (byte) (0xff & ((nrm << 4) | dv));
output.writeByte(val);
}
} finally {
output.close();
}
}
public byte docValuesByte(Type type) {
if (type == null) {
return 0;
} else {
switch(type) {
case VAR_INTS:
return 1;
case FLOAT_32:
return 2;
case FLOAT_64:
return 3;
case BYTES_FIXED_STRAIGHT:
return 4;
case BYTES_FIXED_DEREF:
return 5;
case BYTES_VAR_STRAIGHT:
return 6;
case BYTES_VAR_DEREF:
return 7;
case FIXED_INTS_16:
return 8;
case FIXED_INTS_32:
return 9;
case FIXED_INTS_64:
return 10;
case FIXED_INTS_8:
return 11;
case BYTES_FIXED_SORTED:
return 12;
case BYTES_VAR_SORTED:
return 13;
default:
throw new IllegalStateException("unhandled indexValues type " + type);
}
}
}
}
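
The writer above packs the doc values type code into the low nibble and the norms type code into the high nibble of a single byte; the reader's getDocValuesType unpacks them with val & 0x0F and (val >>> 4) & 0x0F. A self-contained sketch of that round trip (class and helper names are illustrative, not part of the codec):

public class TypeNibbles {
  // Pack two 4-bit type codes (0..13, per the tables above) into one byte:
  // low nibble = doc values type, high nibble = norms type.
  static byte pack(byte dv, byte nrm) {
    assert (dv & ~0x0F) == 0 && (nrm & ~0x0F) == 0;
    return (byte) (0xFF & ((nrm << 4) | dv));
  }

  static byte docValuesCode(byte val) {
    return (byte) (val & 0x0F);         // low nibble
  }

  static byte normsCode(byte val) {
    return (byte) ((val >>> 4) & 0x0F); // high nibble
  }

  public static void main(String[] args) {
    // 6 = BYTES_VAR_STRAIGHT doc values, 11 = FIXED_INTS_8 norms
    byte packed = pack((byte) 6, (byte) 11);
    System.out.println(docValuesCode(packed)); // 6
    System.out.println(normsCode(packed));     // 11
  }
}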

View File

@ -72,12 +72,12 @@ public class Lucene40NormsFormat extends NormsFormat {
@Override
protected boolean canLoad(FieldInfo info) {
return !info.omitNorms && info.isIndexed;
return info.normsPresent();
}
@Override
protected Type getDocValuesType(FieldInfo info) {
return Type.BYTES_FIXED_STRAIGHT;
return info.getNormType();
}
@Override
@ -102,23 +102,24 @@ public class Lucene40NormsFormat extends NormsFormat {
@Override
protected boolean canMerge(FieldInfo info) {
return !info.omitNorms && info.isIndexed;
return info.normsPresent();
}
@Override
protected Type getDocValuesType(FieldInfo info) {
return Type.BYTES_FIXED_STRAIGHT;
return info.getNormType();
}
public static void files(Directory dir, SegmentInfo segmentInfo, Set<String> files) throws IOException {
// see the comment in all the other codecs... it's bogus that we do fileExists here, but it's
// a harder problem since fieldinfos are never 'cleaned'
final String normsFileName = IndexFileNames.segmentFileName(segmentInfo.name, NORMS_SEGMENT_SUFFIX, IndexFileNames.COMPOUND_FILE_EXTENSION);
if (dir.fileExists(normsFileName)) {
final String normsEntriesFileName = IndexFileNames.segmentFileName(segmentInfo.name, NORMS_SEGMENT_SUFFIX, IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION);
assert dir.fileExists(normsEntriesFileName);
files.add(normsFileName);
files.add(normsEntriesFileName);
FieldInfos fieldInfos = segmentInfo.getFieldInfos();
for (FieldInfo fieldInfo : fieldInfos) {
if (fieldInfo.normsPresent()) {
final String normsEntriesFileName = IndexFileNames.segmentFileName(segmentInfo.name, NORMS_SEGMENT_SUFFIX, IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION);
files.add(normsFileName);
files.add(normsEntriesFileName);
return;
}
}
}
}

View File

@ -86,17 +86,18 @@ public class SimpleTextFieldInfosReader extends FieldInfosReader {
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch, NORMS);
boolean omitNorms = !Boolean.parseBoolean(readString(NORMS.length, scratch));
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch, NORMS_TYPE);
String nrmType = readString(NORMS_TYPE.length, scratch);
final DocValues.Type normsType = docValuesType(nrmType);
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch, DOCVALUES);
String dvType = readString(DOCVALUES.length, scratch);
final DocValues.Type docValuesType;
final DocValues.Type docValuesType = docValuesType(dvType);
if ("false".equals(dvType)) {
docValuesType = null;
} else {
docValuesType = DocValues.Type.valueOf(dvType);
}
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch, INDEXOPTIONS);
@ -107,7 +108,7 @@ public class SimpleTextFieldInfosReader extends FieldInfosReader {
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
infos[i] = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector,
omitNorms, storePayloads, indexOptions, docValuesType);
omitNorms, storePayloads, indexOptions, docValuesType, normsType);
}
if (input.getFilePointer() != input.length()) {
@ -119,6 +120,14 @@ public class SimpleTextFieldInfosReader extends FieldInfosReader {
input.close();
}
}
public DocValues.Type docValuesType(String dvType) {
if ("false".equals(dvType)) {
return null;
} else {
return DocValues.Type.valueOf(dvType);
}
}
private String readString(int offset, BytesRef scratch) {
return new String(scratch.bytes, scratch.offset+offset, scratch.length-offset, IOUtils.CHARSET_UTF_8);

View File

@ -19,6 +19,7 @@ package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
@ -48,6 +49,7 @@ public class SimpleTextFieldInfosWriter extends FieldInfosWriter {
static final BytesRef STORETVOFF = new BytesRef(" term vector offsets ");
static final BytesRef PAYLOADS = new BytesRef(" payloads ");
static final BytesRef NORMS = new BytesRef(" norms ");
static final BytesRef NORMS_TYPE = new BytesRef(" norms type ");
static final BytesRef DOCVALUES = new BytesRef(" doc values ");
static final BytesRef INDEXOPTIONS = new BytesRef(" index options ");
@ -88,12 +90,12 @@ public class SimpleTextFieldInfosWriter extends FieldInfosWriter {
SimpleTextUtil.write(out, Boolean.toString(!fi.omitNorms), scratch);
SimpleTextUtil.writeNewline(out);
SimpleTextUtil.write(out, NORMS_TYPE);
SimpleTextUtil.write(out, getDocValuesType(fi.getNormType()), scratch);
SimpleTextUtil.writeNewline(out);
SimpleTextUtil.write(out, DOCVALUES);
if (!fi.hasDocValues()) {
SimpleTextUtil.write(out, "false", scratch);
} else {
SimpleTextUtil.write(out, fi.getDocValuesType().toString(), scratch);
}
SimpleTextUtil.write(out, getDocValuesType(fi.getDocValuesType()), scratch);
SimpleTextUtil.writeNewline(out);
SimpleTextUtil.write(out, INDEXOPTIONS);
@ -104,4 +106,8 @@ public class SimpleTextFieldInfosWriter extends FieldInfosWriter {
out.close();
}
}
private static String getDocValuesType(DocValues.Type type) {
return type == null ? "false" : type.toString();
}
}

View File

@ -94,17 +94,20 @@ public class SimpleTextNormsConsumer extends PerDocConsumer {
@Override
protected boolean canMerge(FieldInfo info) {
return !info.omitNorms && info.isIndexed;
return info.normsPresent();
}
@Override
protected Type getDocValuesType(FieldInfo info) {
return Type.BYTES_FIXED_STRAIGHT;
return info.getNormType();
}
@Override
public DocValuesConsumer addValuesField(Type type, FieldInfo fieldInfo)
throws IOException {
if (type != Type.FIXED_INTS_8) {
throw new UnsupportedOperationException("Codec only supports single byte norm values. Type give: " + type);
}
return new SimpleTextNormsDocValuesConsumer(fieldInfo);
}
@ -131,10 +134,10 @@ public class SimpleTextNormsConsumer extends PerDocConsumer {
@Override
public void add(int docID, IndexableField docValue) throws IOException {
add(docID, docValue.binaryValue());
add(docID, docValue.numericValue().longValue());
}
protected void add(int docID, BytesRef value) throws IOException {
public void add(int docID, long value) {
if (docIDs.length <= upto) {
assert docIDs.length == upto;
docIDs = ArrayUtil.grow(docIDs, 1 + upto);
@ -143,8 +146,8 @@ public class SimpleTextNormsConsumer extends PerDocConsumer {
assert norms.length == upto;
norms = ArrayUtil.grow(norms, 1 + upto);
}
assert value.length == 1;
norms[upto] = value.bytes[value.offset];
norms[upto] = (byte) value;
docIDs[upto] = docID;
upto++;
}
@ -281,7 +284,7 @@ public class SimpleTextNormsConsumer extends PerDocConsumer {
FieldInfos fieldInfos = info.getFieldInfos();
for (FieldInfo fieldInfo : fieldInfos) {
if (!fieldInfo.omitNorms && fieldInfo.isIndexed) {
if (fieldInfo.normsPresent()) {
files.add(IndexFileNames.segmentFileName(info.name, "",
NORMS_EXTENSION));
break;

View File

@ -32,6 +32,7 @@ import org.apache.lucene.codecs.PerDocProducer;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValues.Source;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
@ -95,11 +96,12 @@ public class SimpleTextNormsProducer extends PerDocProducer {
}
static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
// TODO: This is what SI always did... but we can do this cleaner?
// like first FI that has norms but doesn't have separate norms?
final String normsFileName = IndexFileNames.segmentFileName(info.name, "", SimpleTextNormsConsumer.NORMS_EXTENSION);
if (dir.fileExists(normsFileName)) {
files.add(normsFileName);
FieldInfos fieldInfos = info.getFieldInfos();
for (FieldInfo fieldInfo : fieldInfos) {
if (fieldInfo.normsPresent()) {
files.add(IndexFileNames.segmentFileName(info.name, "", SimpleTextNormsConsumer.NORMS_EXTENSION));
break;
}
}
}
@ -130,7 +132,7 @@ public class SimpleTextNormsProducer extends PerDocProducer {
@Override
public Type type() {
return Type.BYTES_FIXED_STRAIGHT;
return Type.FIXED_INTS_8;
}
@Override
@ -141,7 +143,7 @@ public class SimpleTextNormsProducer extends PerDocProducer {
static final class Norm extends Source {
protected Norm(byte[] bytes) {
super(Type.BYTES_FIXED_STRAIGHT);
super(Type.FIXED_INTS_8);
this.bytes = bytes;
}
final byte bytes[];
@ -153,6 +155,11 @@ public class SimpleTextNormsProducer extends PerDocProducer {
ref.length = 1;
return ref;
}
@Override
public long getInt(int docID) {
return bytes[docID];
}
@Override
public boolean hasArray() {

View File

@ -29,6 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.Norm;
import org.apache.lucene.util.BytesRef;
/**
@ -383,13 +384,13 @@ public class Field implements IndexableField {
* document.
*
* <p>The boost is used to compute the norm factor for the field. By
* default, in the {@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState)} method,
* default, in the {@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState, Norm)} method,
* the boost value is multiplied by the length normalization factor and then
* rounded by {@link org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow
* the range of that encoding.
*
* @see org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState, Norm)
* @see org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)
*/
public void setBoost(float boost) {

View File

@ -671,7 +671,7 @@ public class CheckIndex {
if (reader.normValues(info.name) != null) {
throw new RuntimeException("field: " + info.name + " should omit norms but has them!");
}
if (info.isIndexed && !info.omitNorms) {
if (info.normsPresent()) {
throw new RuntimeException("field: " + info.name + " should have norms but omits them!");
}
}

View File

@ -339,7 +339,7 @@ final class DocFieldProcessor extends DocConsumer {
perDocConsumer = docState.docWriter.codec.docValuesFormat().docsConsumer(perDocWriteState);
}
DocValuesConsumer docValuesConsumer = perDocConsumer.addValuesField(valueType, fieldInfo);
fieldInfo.setDocValuesType(valueType);
fieldInfo.setDocValuesType(valueType, false);
docValuesConsumerAndDocID = new DocValuesConsumerAndDocID(docValuesConsumer);
docValuesConsumerAndDocID.docID = docState.docID;

View File

@ -1,5 +1,7 @@
package org.apache.lucene.index;
import org.apache.lucene.index.DocValues.Type;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -24,11 +26,12 @@ public final class FieldInfo {
public final int number;
public boolean isIndexed;
private DocValues.Type docValues;
private DocValues.Type docValueType;
// True if any document indexed term vectors
public boolean storeTermVector;
private DocValues.Type normType;
public boolean omitNorms; // omit norms associated with indexed fields
public IndexOptions indexOptions;
public boolean storePayloads; // whether this field stores payloads together with term positions
@ -56,21 +59,23 @@ public final class FieldInfo {
* @lucene.experimental
*/
public FieldInfo(String name, boolean isIndexed, int number, boolean storeTermVector,
boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValues.Type docValues) {
boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValues.Type docValues, DocValues.Type normsType) {
this.name = name;
this.isIndexed = isIndexed;
this.number = number;
this.docValues = docValues;
this.docValueType = docValues;
if (isIndexed) {
this.storeTermVector = storeTermVector;
this.storePayloads = storePayloads;
this.omitNorms = omitNorms;
this.indexOptions = indexOptions;
this.normType = !omitNorms ? normsType : null;
} else { // for non-indexed fields, leave defaults
this.storeTermVector = false;
this.storePayloads = false;
this.omitNorms = false;
this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
this.normType = null;
}
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !storePayloads;
}
@ -78,7 +83,7 @@ public final class FieldInfo {
@Override
public Object clone() {
return new FieldInfo(name, isIndexed, number, storeTermVector,
omitNorms, storePayloads, indexOptions, docValues);
omitNorms, storePayloads, indexOptions, docValueType, normType);
}
// should only be called by FieldInfos#addOrUpdate
@ -109,27 +114,44 @@ public final class FieldInfo {
assert this.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !this.storePayloads;
}
void setDocValuesType(DocValues.Type v) {
if (docValues == null) {
docValues = v;
}
}
public void resetDocValuesType(DocValues.Type v) {
if (docValues != null) {
docValues = v;
void setDocValuesType(DocValues.Type type, boolean force) {
if (docValueType == null || force) {
docValueType = type;
} else if (type != docValueType) {
throw new IllegalArgumentException("DocValues type already set to " + docValueType + " but was: " + type);
}
}
public boolean hasDocValues() {
return docValues != null;
return docValueType != null;
}
public DocValues.Type getDocValuesType() {
return docValues;
return docValueType;
}
public DocValues.Type getNormType() {
return normType;
}
public void setStoreTermVectors() {
storeTermVector = true;
}
public void setNormValueType(Type type, boolean force) {
if (normType == null || force) {
normType = type;
} else if (type != normType) {
throw new IllegalArgumentException("Norm type already set to " + normType);
}
}
public boolean omitNorms() {
return omitNorms;
}
public boolean normsPresent() {
return isIndexed && !omitNorms && normType != null;
}
}

View File

@ -268,7 +268,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
*/
synchronized public void addOrUpdate(String name, boolean isIndexed, boolean storeTermVector,
boolean omitNorms) {
addOrUpdate(name, isIndexed, storeTermVector, omitNorms, false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, null);
addOrUpdate(name, isIndexed, storeTermVector, omitNorms, false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, null, null);
}
/** If the field is not yet known, adds it. If it is known, checks to make
@ -284,8 +284,8 @@ public final class FieldInfos implements Iterable<FieldInfo> {
* @param indexOptions what postings information (term freqs, positions) is indexed for this field
*/
synchronized public FieldInfo addOrUpdate(String name, boolean isIndexed, boolean storeTermVector,
boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValues.Type docValues) {
return addOrUpdateInternal(name, -1, isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions, docValues);
boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValues.Type docValues, DocValues.Type normType) {
return addOrUpdateInternal(name, -1, isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, normType);
}
// NOTE: this method does not carry over termVector
@ -301,32 +301,37 @@ public final class FieldInfos implements Iterable<FieldInfo> {
// be updated by maybe FreqProxTermsWriterPerField:
return addOrUpdateInternal(name, -1, fieldType.indexed(), false,
fieldType.omitNorms(), false,
fieldType.indexOptions(), null);
fieldType.indexOptions(), null, null);
}
synchronized private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber, boolean isIndexed,
boolean storeTermVector,
boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValues.Type docValues) {
boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValues.Type docValues, DocValues.Type normType) {
if (globalFieldNumbers == null) {
throw new IllegalStateException("FieldInfos are read-only, create a new instance with a global field map to make modifications to FieldInfos");
}
FieldInfo fi = fieldInfo(name);
if (fi == null) {
final int fieldNumber = nextFieldNumber(name, preferredFieldNumber);
fi = addInternal(name, fieldNumber, isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions, docValues);
fi = addInternal(name, fieldNumber, isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, normType);
} else {
fi.update(isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions);
fi.setDocValuesType(docValues);
if (docValues != null) {
fi.setDocValuesType(docValues, true);
}
if (normType != null) {
fi.setNormValueType(normType, true);
}
}
version++;
return fi;
}
synchronized public FieldInfo add(FieldInfo fi) {
// IMPORTANT - reuse the field number if possible for consistent field numbers across segments
return addOrUpdateInternal(fi.name, fi.number, fi.isIndexed, fi.storeTermVector,
fi.omitNorms, fi.storePayloads,
fi.indexOptions, fi.getDocValuesType());
fi.indexOptions, fi.getDocValuesType(), fi.getNormType());
}
/*
@ -334,12 +339,12 @@ public final class FieldInfos implements Iterable<FieldInfo> {
*/
private FieldInfo addInternal(String name, int fieldNumber, boolean isIndexed,
boolean storeTermVector, boolean omitNorms, boolean storePayloads,
IndexOptions indexOptions, DocValues.Type docValuesType) {
IndexOptions indexOptions, DocValues.Type docValuesType, DocValues.Type normType) {
// don't check modifiable here since we use that to initially build up FIs
if (globalFieldNumbers != null) {
globalFieldNumbers.setIfNotSet(fieldNumber, name);
}
final FieldInfo fi = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType);
final FieldInfo fi = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, normType);
putInternal(fi);
return fi;
}

View File

@ -49,8 +49,9 @@ public class MultiDocValues extends DocValues {
public boolean stopLoadingOnNull(IndexReader reader, String field) throws IOException {
// for norms we drop all norms if one leaf reader has no norms and the field is present
Fields fields = reader.fields();
return (fields != null && fields.terms(field) != null);
FieldInfos fieldInfos = reader.getFieldInfos();
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
return fieldInfo != null && fieldInfo.omitNorms;
}
};

View File

@ -0,0 +1,154 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.DocValuesField;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
/**
* Stores the normalization value computed in
* {@link Similarity#computeNorm(FieldInvertState, Norm)} per field.
* Normalization values must be consistent within a single field; different
* value types are not permitted within a single field. All values set must be
* fixed-size values, i.e. all values passed to {@link Norm#setBytes(BytesRef)}
* must have the same length per field.
*
* @lucene.experimental
* @lucene.internal
*/
public final class Norm {
private DocValuesField field;
private BytesRef spare;
/**
* Returns the {@link IndexableField} representation for this norm
*/
public IndexableField field() {
return field;
}
/**
* Returns the {@link Type} for this norm.
*/
public Type type() {
return field == null? null : field.fieldType().docValueType();
}
/**
* Returns a spare {@link BytesRef}
*/
public BytesRef getSpare() {
if (spare == null) {
spare = new BytesRef();
}
return spare;
}
/**
* Sets a float norm value
*/
public void setFloat(float norm) {
setType(Type.FLOAT_32);
this.field.setValue(norm);
}
/**
* Sets a double norm value
*/
public void setDouble(double norm) {
setType(Type.FLOAT_64);
this.field.setValue(norm);
}
/**
* Sets a short norm value
*/
public void setShort(short norm) {
setType(Type.FIXED_INTS_16);
this.field.setValue(norm);
}
/**
* Sets an int norm value
*/
public void setInt(int norm) {
setType(Type.FIXED_INTS_32);
this.field.setValue(norm);
}
/**
* Sets a long norm value
*/
public void setLong(long norm) {
setType(Type.FIXED_INTS_64);
this.field.setValue(norm);
}
/**
* Sets a byte norm value
*/
public void setByte(byte norm) {
setType(Type.FIXED_INTS_8);
this.field.setValue(norm);
}
/**
* Sets a fixed byte array norm value
*/
public void setBytes(BytesRef norm) {
setType(Type.BYTES_FIXED_STRAIGHT);
this.field.setValue(norm);
}
private void setType(Type type) {
if (field != null) {
if (type != field.fieldType().docValueType()) {
throw new IllegalArgumentException("FieldType missmatch - expected "+type+" but was " + field.fieldType().docValueType());
}
} else {
switch (type) {
case BYTES_FIXED_DEREF:
case BYTES_FIXED_SORTED:
case BYTES_FIXED_STRAIGHT:
case BYTES_VAR_DEREF:
case BYTES_VAR_SORTED:
case BYTES_VAR_STRAIGHT:
this.field = new DocValuesField("", new BytesRef(), type);
break;
case FIXED_INTS_16:
case FIXED_INTS_32:
case FIXED_INTS_64:
case FIXED_INTS_8:
case VAR_INTS:
this.field = new DocValuesField("", 0, type);
break;
case FLOAT_32:
case FLOAT_64:
this.field = new DocValuesField("", 0f, type);
break;
default:
throw new IllegalArgumentException("unknown Type: " + type);
}
}
}
}
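
As the setType check above enforces, a Norm instance is typed by the first setter invoked on it; later setters must use the same type. A small hypothetical usage sketch:

import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.Norm;

public class NormTypeDemo {
  public static void main(String[] args) {
    Norm norm = new Norm();
    norm.setFloat(0.5f);        // first setter fixes the type to FLOAT_32
    assert norm.type() == Type.FLOAT_32;
    norm.setFloat(0.25f);       // fine: same type, the value is replaced
    try {
      norm.setByte((byte) 1);   // different type on the same instance
    } catch (IllegalArgumentException expected) {
      // "FieldType mismatch - expected FIXED_INTS_8 but was FLOAT_32"
    }
  }
}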

View File

@ -24,9 +24,7 @@ import java.util.Map;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PerDocConsumer;
import org.apache.lucene.document.DocValuesField;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
// TODO FI: norms could actually be stored as doc store
@ -69,13 +67,12 @@ final class NormsConsumer extends InvertedDocEndConsumer {
if (!fi.omitNorms) {
if (toWrite != null && toWrite.initialized()) {
anythingFlushed = true;
toWrite.flush(state.numDocs);
final Type type = toWrite.flush(state.numDocs);
assert fi.getNormType() == type;
} else if (fi.isIndexed) {
anythingFlushed = true;
final DocValuesConsumer valuesConsumer = newConsumer(new PerDocWriteState(state), fi);
final DocValuesField value = new DocValuesField("", new BytesRef(new byte[] {0x0}), Type.BYTES_FIXED_STRAIGHT);
valuesConsumer.add(state.numDocs-1, value);
valuesConsumer.finish(state.numDocs);
assert fi.getNormType() == null;
fi.setNormValueType(null, false);
}
}
}
@ -107,12 +104,12 @@ final class NormsConsumer extends InvertedDocEndConsumer {
}
DocValuesConsumer newConsumer(PerDocWriteState perDocWriteState,
FieldInfo fieldInfo) throws IOException {
FieldInfo fieldInfo, Type type) throws IOException {
if (consumer == null) {
consumer = normsFormat.docsConsumer(perDocWriteState);
}
DocValuesConsumer addValuesField = consumer.addValuesField(
Type.BYTES_FIXED_STRAIGHT, fieldInfo);
DocValuesConsumer addValuesField = consumer.addValuesField(type, fieldInfo);
return addValuesField;
}
}

View File

@ -19,6 +19,7 @@ import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.document.DocValuesField;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
@ -29,9 +30,9 @@ public class NormsConsumerPerField extends InvertedDocEndConsumerPerField implem
private final Similarity similarity;
private final FieldInvertState fieldState;
private DocValuesConsumer consumer;
private final BytesRef spare = new BytesRef(1);
private final DocValuesField value = new DocValuesField("", spare, Type.BYTES_FIXED_STRAIGHT);
private final Norm norm;
private final NormsConsumer parent;
private Type initType;
public NormsConsumerPerField(final DocInverterPerField docInverterPerField, final FieldInfo fieldInfo, NormsConsumer parent) {
this.fieldInfo = fieldInfo;
@ -39,10 +40,9 @@ public class NormsConsumerPerField extends InvertedDocEndConsumerPerField implem
docState = docInverterPerField.docState;
fieldState = docInverterPerField.fieldState;
similarity = docState.similarityProvider.get(fieldInfo.name);
spare.length = 1;
spare.offset = 0;
norm = new Norm();
}
@Override
public int compareTo(NormsConsumerPerField other) {
return fieldInfo.name.compareTo(other.fieldInfo.name);
@ -51,20 +51,33 @@ public class NormsConsumerPerField extends InvertedDocEndConsumerPerField implem
@Override
void finish() throws IOException {
if (fieldInfo.isIndexed && !fieldInfo.omitNorms) {
DocValuesConsumer consumer = getConsumer();
spare.bytes[0] = similarity.computeNorm(fieldState);
consumer.add(docState.docID, value);
similarity.computeNorm(fieldState, norm);
if (norm.type() != null) {
IndexableField field = norm.field();
// some similarity might not compute any norms
DocValuesConsumer consumer = getConsumer(norm.type());
consumer.add(docState.docID, field);
}
}
}
void flush(int docCount) throws IOException {
assert initialized();
Type flush(int docCount) throws IOException {
if (!initialized()) {
return null; // null type - not omitted but not written
}
consumer.finish(docCount);
return initType;
}
private DocValuesConsumer getConsumer() throws IOException {
private DocValuesConsumer getConsumer(Type type) throws IOException {
if (consumer == null) {
consumer = parent.newConsumer(docState.docWriter.newPerDocWriteState(""), fieldInfo);
fieldInfo.setNormValueType(type, false);
consumer = parent.newConsumer(docState.docWriter.newPerDocWriteState(""), fieldInfo, type);
this.initType = type;
}
if (initType != type) {
throw new IllegalArgumentException("NormTypes for field: " + fieldInfo.name + " doesn't match " + initType + " != " + type);
}
return consumer;
}

View File

@ -191,9 +191,18 @@ final class SegmentMerger {
}
private void mergeFieldInfos() throws IOException {
mergeDocValuesAndNormsFieldInfos();
// write the merged infos
FieldInfosWriter fieldInfosWriter = codec.fieldInfosFormat()
.getFieldInfosWriter();
fieldInfosWriter.write(directory, segment, mergeState.fieldInfos, context);
}
public void mergeDocValuesAndNormsFieldInfos() throws IOException {
// mapping from all docvalues fields found to their promoted types
// this is because FieldInfos does not store the valueSize
Map<FieldInfo,TypePromoter> docValuesTypes = new HashMap<FieldInfo,TypePromoter>();
Map<FieldInfo,TypePromoter> normValuesTypes = new HashMap<FieldInfo,TypePromoter>();
for (MergeState.IndexReaderAndLiveDocs readerAndLiveDocs : mergeState.readers) {
final IndexReader reader = readerAndLiveDocs.reader;
@ -205,28 +214,44 @@ final class SegmentMerger {
TypePromoter previous = docValuesTypes.get(merged);
docValuesTypes.put(merged, mergeDocValuesType(previous, reader.docValues(fi.name)));
}
}
}
// update any promoted doc values types:
for (Map.Entry<FieldInfo,TypePromoter> e : docValuesTypes.entrySet()) {
FieldInfo fi = e.getKey();
TypePromoter promoter = e.getValue();
if (promoter == null) {
fi.resetDocValuesType(null);
} else {
assert promoter != TypePromoter.getIdentityPromoter();
if (fi.getDocValuesType() != promoter.type()) {
// reset the type if we got promoted
fi.resetDocValuesType(promoter.type());
if (fi.normsPresent()) {
TypePromoter previous = normValuesTypes.get(merged);
normValuesTypes.put(merged, mergeDocValuesType(previous, reader.normValues(fi.name)));
}
}
}
// write the merged infos
FieldInfosWriter fieldInfosWriter = codec.fieldInfosFormat().getFieldInfosWriter();
fieldInfosWriter.write(directory, segment, mergeState.fieldInfos, context);
updatePromoted(normValuesTypes, true);
updatePromoted(docValuesTypes, false);
}
protected void updatePromoted(Map<FieldInfo,TypePromoter> infoAndPromoter, boolean norms) {
// update any promoted doc values types:
for (Map.Entry<FieldInfo,TypePromoter> e : infoAndPromoter.entrySet()) {
FieldInfo fi = e.getKey();
TypePromoter promoter = e.getValue();
if (promoter == null) {
if (norms) {
fi.setNormValueType(null, true);
} else {
fi.setDocValuesType(null, true);
}
} else {
assert promoter != TypePromoter.getIdentityPromoter();
if (norms) {
if (fi.getNormType() != promoter.type()) {
// reset the type if we got promoted
fi.setNormValueType(promoter.type(), true);
}
} else {
if (fi.getDocValuesType() != promoter.type()) {
// reset the type if we got promoted
fi.setDocValuesType(promoter.type(), true);
}
}
}
}
}
/**
*


@ -198,7 +198,7 @@ public final class SegmentReader extends IndexReader {
public boolean hasNorms(String field) {
ensureOpen();
FieldInfo fi = core.fieldInfos.fieldInfo(field);
return fi != null && fi.isIndexed && !fi.omitNorms;
return fi.normsPresent();
}
/** @lucene.internal */


@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Norm;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -122,10 +123,11 @@ public class BM25Similarity extends Similarity {
}
}
@Override
public final byte computeNorm(FieldInvertState state) {
public final void computeNorm(FieldInvertState state, Norm norm) {
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
return encodeNormValue(state.getBoost(), numTerms);
norm.setByte(encodeNormValue(state.getBoost(), numTerms));
}
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {


@ -1,6 +1,7 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Norm;
import org.apache.lucene.util.BytesRef;
/**
@ -22,7 +23,7 @@ import org.apache.lucene.util.BytesRef;
/** Expert: Default scoring implementation. */
public class DefaultSimilarity extends TFIDFSimilarity {
/** Implemented as
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
* <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
@ -32,13 +33,13 @@ public class DefaultSimilarity extends TFIDFSimilarity {
*
* @lucene.experimental */
@Override
public byte computeNorm(FieldInvertState state) {
public void computeNorm(FieldInvertState state, Norm norm) {
final int numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
return encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))));
norm.setByte(encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)))));
}
/** Implemented as <code>sqrt(freq)</code>. */


@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Norm;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -40,8 +41,8 @@ public class MultiSimilarity extends Similarity {
}
@Override
public byte computeNorm(FieldInvertState state) {
return sims[0].computeNorm(state);
public void computeNorm(FieldInvertState state, Norm norm) {
sims[0].computeNorm(state, norm);
}
@Override


@ -24,6 +24,7 @@ import org.apache.lucene.document.DocValuesField; // javadoc
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader; // javadoc
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.Terms; // javadoc
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CollectionStatistics;
@ -36,7 +37,6 @@ import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.spans.SpanQuery; // javadoc
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat; // javadoc
import org.apache.lucene.util.TermContext;
/**
@ -55,8 +55,8 @@ import org.apache.lucene.util.TermContext;
* <a href="#querytime">query-time</a>.
* <p>
* <a name="indextime"/>
* At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing
* the Similarity implementation to return a per-document byte for the field that will
* At indexing time, the indexer calls {@link #computeNorm(FieldInvertState, Norm)}, allowing
* the Similarity implementation to set a per-document value for the field that will
* be later accessible via {@link IndexReader#normValues(String)}. Lucene makes no assumption
 * about what is in this value, but it is most useful for encoding length normalization
* information.
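To make this access path concrete, here is a minimal sketch (the helper name is hypothetical; the access pattern mirrors TestCustomNorms further down) of reading the per-document values set by computeNorm back through IndexReader#normValues:

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValues.Source;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.IndexReader;

// Hypothetical helper: prints FLOAT_32 norms for one field, if present.
// The reader is expected to be atomic (e.g. wrapped in SlowMultiReaderWrapper).
static void dumpFloatNorms(IndexReader reader, String field) throws IOException {
  DocValues normValues = reader.normValues(field);
  if (normValues == null) {
    return; // the field has no norms (omitted, or never written)
  }
  Source source = normValues.getSource();
  if (normValues.type() == Type.FLOAT_32 && source.hasArray()) {
    float[] norms = (float[]) source.getArray();
    for (int docID = 0; docID < reader.maxDoc(); docID++) {
      System.out.println("doc=" + docID + " norm=" + norms[docID]);
    }
  }
}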
@ -109,23 +109,24 @@ import org.apache.lucene.util.TermContext;
* @lucene.experimental
*/
public abstract class Similarity {
/**
* Computes the normalization value for a field, given the accumulated
* state of term processing for this field (see {@link FieldInvertState}).
*
* <p>Implementations should calculate a byte value based on the field
* state and then return that value.
* <p>Implementations should calculate a norm value based on the field
* state and set that value to the given {@link Norm}.
*
* <p>Matches in longer fields are less precise, so implementations of this
* method usually return smaller values when <code>state.getLength()</code> is large,
* method usually set smaller values when <code>state.getLength()</code> is large,
* and larger values when <code>state.getLength()</code> is small.
*
* @lucene.experimental
*
* @param state current processing state for this field
* @return the calculated byte norm
* @param norm holds the computed norm value when this method returns
*/
public abstract byte computeNorm(FieldInvertState state);
public abstract void computeNorm(FieldInvertState state, Norm norm);
/**
* Compute any collection-level stats (e.g. IDF, average document length, etc) needed for scoring a query.

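As an illustration of this contract, a minimal sketch of a custom similarity that stores the raw field boost as a 32-bit float norm instead of the classic single byte; the class name is hypothetical, and the pattern mirrors the FloatEncodingBoostSimilarity used in TestCustomNorms further down:

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Norm;
import org.apache.lucene.search.similarities.DefaultSimilarity;

// Hypothetical example: encodes the field boost as a FLOAT_32 norm.
public class FloatBoostSimilarity extends DefaultSimilarity {
  @Override
  public void computeNorm(FieldInvertState state, Norm norm) {
    // The setter chosen here fixes the norm type for the field; writing a
    // different type for the same field later fails with an
    // IllegalArgumentException (see the getConsumer type check above).
    norm.setFloat(state.getBoost());
  }
}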

@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Norm;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -234,13 +235,13 @@ public abstract class SimilarityBase extends Similarity {
/** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
@Override
public byte computeNorm(FieldInvertState state) {
public void computeNorm(FieldInvertState state, Norm norm) {
final float numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength() / state.getBoost();
return encodeNormValue(state.getBoost(), numTerms);
norm.setByte(encodeNormValue(state.getBoost(), numTerms));
}
/** Decodes a normalization factor (document length) stored in an index.


@ -22,14 +22,12 @@ import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.SmallFloat;


@ -155,7 +155,7 @@ subclassing the Similarity, one can simply introduce a new basic model and tell
matching term occurs. In these
cases people have overridden Similarity to return 1 from the tf() method.</p></li>
<li><p>Changing Length Normalization &mdash; By overriding
{@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState state)},
{@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState state, Norm)},
it is possible to discount how the length of a field contributes
to a score. In {@link org.apache.lucene.search.similarities.DefaultSimilarity},
lengthNorm = 1 / (numTerms in field)^0.5, but if one changes this to be

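A hedged sketch of such an override (the class name is hypothetical; encodeNormValue and the overlap handling come from the DefaultSimilarity code above), discounting length linearly instead of by the square root:

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Norm;
import org.apache.lucene.search.similarities.DefaultSimilarity;

// Hypothetical example: lengthNorm = 1 / numTerms rather than 1 / sqrt(numTerms).
public class LinearLengthNormSimilarity extends DefaultSimilarity {
  @Override
  public void computeNorm(FieldInvertState state, Norm norm) {
    // guard against zero-length fields
    final int numTerms = Math.max(1, state.getLength() - state.getNumOverlap());
    norm.setByte(encodeNormValue(state.getBoost() * (1.0f / numTerms)));
  }
}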

@ -17,6 +17,7 @@ package org.apache.lucene.codecs.preflexrw;
* limitations under the License.
*/
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene3x.Lucene3xCodec;
@ -29,6 +30,7 @@ import org.apache.lucene.util.LuceneTestCase;
public class PreFlexRWCodec extends Lucene3xCodec {
private final PostingsFormat postings = new PreFlexRWPostingsFormat();
private final NormsFormat norms = new PreFlexRWNormsFormat();
private final FieldInfosFormat fieldInfos = new PreFlexRWFieldInfosFormat();
@Override
public PostingsFormat postingsFormat() {
@ -47,4 +49,13 @@ public class PreFlexRWCodec extends Lucene3xCodec {
return super.normsFormat();
}
}
@Override
public FieldInfosFormat fieldInfosFormat() {
if (LuceneTestCase.PREFLEX_IMPERSONATION_IS_ACTIVE) {
return fieldInfos;
} else {
return super.fieldInfosFormat();
}
}
}


@ -0,0 +1,41 @@
package org.apache.lucene.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.FieldInfosReader;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.codecs.lucene3x.Lucene3xFieldInfosFormat;
/**
*
* @lucene.internal
* @lucene.experimental
*/
public class PreFlexRWFieldInfosFormat extends Lucene3xFieldInfosFormat {
@Override
public FieldInfosReader getFieldInfosReader() throws IOException {
return new PreFlexRWFieldInfosReader();
}
@Override
public FieldInfosWriter getFieldInfosWriter() throws IOException {
return new PreFlexRWFieldInfosWriter();
}
}


@ -0,0 +1,117 @@
package org.apache.lucene.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.codecs.FieldInfosReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
/**
* @lucene.internal
* @lucene.experimental
*/
public class PreFlexRWFieldInfosReader extends FieldInfosReader {
static final int FORMAT_MINIMUM = PreFlexRWFieldInfosWriter.FORMAT_START;
@Override
public FieldInfos read(Directory directory, String segmentName, IOContext iocontext) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, "", PreFlexRWFieldInfosWriter.FIELD_INFOS_EXTENSION);
IndexInput input = directory.openInput(fileName, iocontext);
boolean hasVectors = false;
boolean hasFreq = false;
boolean hasProx = false;
try {
final int format = input.readVInt();
if (format > FORMAT_MINIMUM) {
throw new IndexFormatTooOldException(input, format, FORMAT_MINIMUM, PreFlexRWFieldInfosWriter.FORMAT_CURRENT);
}
if (format < PreFlexRWFieldInfosWriter.FORMAT_CURRENT && format != PreFlexRWFieldInfosWriter.FORMAT_PREFLEX_RW) {
throw new IndexFormatTooNewException(input, format, FORMAT_MINIMUM, PreFlexRWFieldInfosWriter.FORMAT_CURRENT);
}
final int size = input.readVInt(); //read in the size
FieldInfo infos[] = new FieldInfo[size];
for (int i = 0; i < size; i++) {
String name = input.readString();
final int fieldNumber = format == PreFlexRWFieldInfosWriter.FORMAT_PREFLEX_RW ? input.readInt() : i;
byte bits = input.readByte();
boolean isIndexed = (bits & PreFlexRWFieldInfosWriter.IS_INDEXED) != 0;
boolean storeTermVector = (bits & PreFlexRWFieldInfosWriter.STORE_TERMVECTOR) != 0;
boolean omitNorms = (bits & PreFlexRWFieldInfosWriter.OMIT_NORMS) != 0;
boolean storePayloads = (bits & PreFlexRWFieldInfosWriter.STORE_PAYLOADS) != 0;
final IndexOptions indexOptions;
if ((bits & PreFlexRWFieldInfosWriter.OMIT_TERM_FREQ_AND_POSITIONS) != 0) {
indexOptions = IndexOptions.DOCS_ONLY;
} else if ((bits & PreFlexRWFieldInfosWriter.OMIT_POSITIONS) != 0) {
if (format <= PreFlexRWFieldInfosWriter.FORMAT_OMIT_POSITIONS) {
indexOptions = IndexOptions.DOCS_AND_FREQS;
} else {
throw new CorruptIndexException("Corrupt fieldinfos, OMIT_POSITIONS set but format=" + format + " (resource: " + input + ")");
}
} else {
indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
}
// LUCENE-3027: past indices were able to write
// storePayloads=true when omitTFAP is also true,
// which is invalid. We correct that, here:
if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
storePayloads = false;
}
hasVectors |= storeTermVector;
hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
Type normType = isIndexed && !omitNorms ? Type.FIXED_INTS_8 : null;
if (format == PreFlexRWFieldInfosWriter.FORMAT_PREFLEX_RW && normType != null) {
// RW can have norms but doesn't write them
normType = input.readByte() != 0 ? Type.FIXED_INTS_8 : null;
}
infos[i] = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector,
omitNorms, storePayloads, indexOptions, null, normType);
}
if (input.getFilePointer() != input.length()) {
throw new CorruptIndexException("did not read all bytes from file \"" + fileName + "\": read " + input.getFilePointer() + " vs size " + input.length() + " (resource: " + input + ")");
}
return new FieldInfos(infos, hasFreq, hasProx, hasVectors);
} finally {
input.close();
}
}
public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexRWFieldInfosWriter.FIELD_INFOS_EXTENSION));
}
}


@ -0,0 +1,95 @@
package org.apache.lucene.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
/**
* @lucene.internal
* @lucene.experimental
*/
public class PreFlexRWFieldInfosWriter extends FieldInfosWriter {
// TODO move to test-framework preflex RW?
/** Extension of field infos */
static final String FIELD_INFOS_EXTENSION = "fnm";
// First used in 2.9; prior to 2.9 there was no format header
static final int FORMAT_START = -2;
// First used in 3.4: omit only positional information
static final int FORMAT_OMIT_POSITIONS = -3;
static final int FORMAT_PREFLEX_RW = Integer.MIN_VALUE;
// whenever you add a new format, make it 1 smaller (negative version logic)!
static final int FORMAT_CURRENT = FORMAT_OMIT_POSITIONS;
static final byte IS_INDEXED = 0x1;
static final byte STORE_TERMVECTOR = 0x2;
static final byte OMIT_NORMS = 0x10;
static final byte STORE_PAYLOADS = 0x20;
static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40;
static final byte OMIT_POSITIONS = -128;
@Override
public void write(Directory directory, String segmentName, FieldInfos infos, IOContext context) throws IOException {
final String fileName = IndexFileNames.segmentFileName(segmentName, "", FIELD_INFOS_EXTENSION);
IndexOutput output = directory.createOutput(fileName, context);
try {
output.writeVInt(FORMAT_PREFLEX_RW);
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
byte bits = 0x0;
if (fi.isIndexed) bits |= IS_INDEXED;
if (fi.storeTermVector) bits |= STORE_TERMVECTOR;
if (fi.omitNorms) bits |= OMIT_NORMS;
if (fi.storePayloads) bits |= STORE_PAYLOADS;
if (fi.indexOptions == IndexOptions.DOCS_ONLY) {
bits |= OMIT_TERM_FREQ_AND_POSITIONS;
} else if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS) {
bits |= OMIT_POSITIONS;
}
output.writeString(fi.name);
/*
* we need to write the field number since IW tries
* to stabilize the field numbers across segments so the
* FI ordinal is not necessarily equivalent to the field number
*/
output.writeInt(fi.number);
output.writeByte(bits);
if (fi.isIndexed && !fi.omitNorms) {
// to allow null norm types we need to indicate if norms are written
// only in RW case
output.writeByte((byte) (fi.getNormType() == null ? 0 : 1));
}
}
} finally {
output.close();
}
}
}


@ -36,7 +36,7 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
class PreFlexFieldsWriter extends FieldsConsumer {
class PreFlexRWFieldsWriter extends FieldsConsumer {
private final TermInfosWriter termsOut;
private final IndexOutput freqOut;
@ -44,7 +44,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
private final Lucene40SkipListWriter skipListWriter;
private final int totalNumDocs;
public PreFlexFieldsWriter(SegmentWriteState state) throws IOException {
public PreFlexRWFieldsWriter(SegmentWriteState state) throws IOException {
termsOut = new TermInfosWriter(state.directory,
state.segmentName,
state.fieldInfos,
@ -89,7 +89,7 @@ class PreFlexFieldsWriter extends FieldsConsumer {
public TermsConsumer addField(FieldInfo field) throws IOException {
assert field.number != -1;
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets");
throw new IllegalArgumentException("this codec cannot index offsets");
}
//System.out.println("w field=" + field.name + " storePayload=" + field.storePayloads + " number=" + field.number);
return new PreFlexTermsWriter(field);
@ -164,7 +164,6 @@ class PreFlexFieldsWriter extends FieldsConsumer {
assert proxOut != null;
assert startOffset == -1;
assert endOffset == -1;
//System.out.println(" w pos=" + position + " payl=" + payload);
final int delta = position - lastPosition;
lastPosition = position;


@ -22,9 +22,9 @@ import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.PerDocConsumer;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValues.Source;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexableField;
@ -34,14 +34,13 @@ import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/**
* Writes and Merges Lucene 3.x norms format
* @lucene.experimental
*/
class PreFlexNormsConsumer extends PerDocConsumer {
class PreFlexRWNormsConsumer extends PerDocConsumer {
/** norms header placeholder */
private static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1};
@ -62,7 +61,7 @@ class PreFlexNormsConsumer extends PerDocConsumer {
private NormsWriter writer;
public PreFlexNormsConsumer(Directory directory, String segment, IOContext context){
public PreFlexRWNormsConsumer(Directory directory, String segment, IOContext context){
this.directory = directory;
this.segment = segment;
this.context = context;
@ -79,10 +78,23 @@ class PreFlexNormsConsumer extends PerDocConsumer {
writer.finish();
}
}
@Override
protected boolean canMerge(FieldInfo info) {
return info.normsPresent();
}
@Override
protected Type getDocValuesType(FieldInfo info) {
return info.getNormType();
}
@Override
public DocValuesConsumer addValuesField(Type type, FieldInfo fieldInfo)
throws IOException {
if (type != Type.FIXED_INTS_8) {
throw new UnsupportedOperationException("Codec only supports single byte norm values. Type given: " + type);
}
return new Lucene3xNormsDocValuesConsumer(fieldInfo);
}
@ -134,10 +146,10 @@ class PreFlexNormsConsumer extends PerDocConsumer {
@Override
public void add(int docID, IndexableField docValue) throws IOException {
add(docID, docValue.binaryValue());
add(docID, docValue.numericValue().longValue());
}
protected void add(int docID, BytesRef value) throws IOException {
protected void add(int docID, long value) {
if (docIDs.length <= upto) {
assert docIDs.length == upto;
docIDs = ArrayUtil.grow(docIDs, 1 + upto);
@ -146,8 +158,7 @@ class PreFlexNormsConsumer extends PerDocConsumer {
assert norms.length == upto;
norms = ArrayUtil.grow(norms, 1 + upto);
}
assert value.length == 1;
norms[upto] = value.bytes[value.offset];
norms[upto] = (byte) value;
docIDs[upto] = docID;
upto++;
@ -217,7 +228,7 @@ class PreFlexNormsConsumer extends PerDocConsumer {
public void merge(MergeState mergeState) throws IOException {
int numMergedDocs = 0;
for (FieldInfo fi : mergeState.fieldInfos) {
if (fi.isIndexed && !fi.omitNorms) {
if (fi.normsPresent()) {
startField(fi);
int numMergedDocsForField = 0;
for (MergeState.IndexReaderAndLiveDocs reader : mergeState.readers) {


@ -21,11 +21,15 @@ import org.apache.lucene.codecs.PerDocConsumer;
import org.apache.lucene.codecs.lucene3x.Lucene3xNormsFormat;
import org.apache.lucene.index.PerDocWriteState;
/**
* @lucene.internal
* @lucene.experimental
*/
public class PreFlexRWNormsFormat extends Lucene3xNormsFormat {
@Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new PreFlexNormsConsumer(state.directory, state.segmentName, state.context);
return new PreFlexRWNormsConsumer(state.directory, state.segmentName, state.context);
}
}


@ -41,7 +41,7 @@ public class PreFlexRWPostingsFormat extends Lucene3xPostingsFormat {
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new PreFlexFieldsWriter(state);
return new PreFlexRWFieldsWriter(state);
}
@Override


@ -0,0 +1,206 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DocValues.Source;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Before;
/**
*
*/
public class TestCustomNorms extends LuceneTestCase {
final String floatTestField = "normsTestFloat";
final String exceptionTestField = "normsTestExcp";
@Before
public void setUp() throws Exception {
super.setUp();
assumeFalse("cannot work with preflex codec", Codec.getDefault().getName()
.equals("Lucene3x"));
assumeFalse("cannot work with simple text codec", Codec.getDefault()
.getName().equals("SimpleText"));
}
public void testFloatNorms() throws IOException {
MockDirectoryWrapper dir = newDirectory();
dir.setCheckIndexOnClose(false); // can't set sim to checkindex yet
IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random));
SimilarityProvider provider = new MySimProvider();
config.setSimilarityProvider(provider);
RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
final LineFileDocs docs = new LineFileDocs(random);
int num = atLeast(100);
for (int i = 0; i < num; i++) {
Document doc = docs.nextDoc();
float nextFloat = random.nextFloat();
Field f = new Field(floatTestField, "" + nextFloat, TextField.TYPE_STORED);
f.setBoost(nextFloat);
doc.add(f);
writer.addDocument(doc);
doc.removeField(floatTestField);
if (rarely()) {
writer.commit();
}
}
writer.commit();
writer.close();
IndexReader open = new SlowMultiReaderWrapper(IndexReader.open(dir));
DocValues normValues = open.normValues(floatTestField);
assertNotNull(normValues);
Source source = normValues.getSource();
assertTrue(source.hasArray());
assertEquals(Type.FLOAT_32, normValues.type());
float[] norms = (float[]) source.getArray();
for (int i = 0; i < open.maxDoc(); i++) {
Document document = open.document(i);
float expected = Float.parseFloat(document.get(floatTestField));
assertEquals(expected, norms[i], 0.0f);
}
open.close();
dir.close();
}
public void testExceptionOnRandomType() throws IOException {
MockDirectoryWrapper dir = newDirectory();
dir.setCheckIndexOnClose(false); // can't set sim to checkindex yet
IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random));
SimilarityProvider provider = new MySimProvider();
config.setSimilarityProvider(provider);
RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
final LineFileDocs docs = new LineFileDocs(random);
int num = atLeast(100);
try {
for (int i = 0; i < num; i++) {
Document doc = docs.nextDoc();
float nextFloat = random.nextFloat();
Field f = new Field(exceptionTestField, "" + nextFloat,
TextField.TYPE_STORED);
f.setBoost(nextFloat);
doc.add(f);
writer.addDocument(doc);
doc.removeField(exceptionTestField);
if (rarely()) {
writer.commit();
}
}
fail("expected exception - incompatible types");
} catch (IllegalArgumentException e) {
// expected
}
writer.commit();
writer.close();
dir.close();
}
public class MySimProvider implements SimilarityProvider {
SimilarityProvider delegate = new DefaultSimilarityProvider();
@Override
public float queryNorm(float sumOfSquaredWeights) {
return delegate.queryNorm(sumOfSquaredWeights);
}
@Override
public Similarity get(String field) {
if (floatTestField.equals(field)) {
return new FloatEncodingBoostSimilarity();
} else if (exceptionTestField.equals(field)) {
return new RandomTypeSimilarity(random);
} else {
return delegate.get(field);
}
}
@Override
public float coord(int overlap, int maxOverlap) {
return delegate.coord(overlap, maxOverlap);
}
}
public static class FloatEncodingBoostSimilarity extends DefaultSimilarity {
@Override
public void computeNorm(FieldInvertState state, Norm norm) {
float boost = state.getBoost();
norm.setFloat(boost);
}
}
public static class RandomTypeSimilarity extends DefaultSimilarity {
private final Random random;
public RandomTypeSimilarity(Random random) {
this.random = random;
}
@Override
public void computeNorm(FieldInvertState state, Norm norm) {
float boost = state.getBoost();
int nextInt = random.nextInt(10);
switch (nextInt) {
case 0:
norm.setDouble((double) boost);
break;
case 1:
norm.setFloat(boost);
break;
case 2:
norm.setLong((long) boost);
break;
case 3:
norm.setBytes(new BytesRef(new byte[6]));
break;
case 4:
norm.setInt((int) boost);
break;
case 5:
norm.setShort((short) boost);
break;
default:
norm.setByte((byte) boost);
}
}
}
}


@ -141,7 +141,7 @@ public class TestFieldInfos extends LuceneTestCase {
try {
readOnly.addOrUpdate("bogus", random.nextBoolean(), random.nextBoolean(),
random.nextBoolean(),
random.nextBoolean(), random.nextBoolean() ? IndexOptions.DOCS_ONLY : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, null);
random.nextBoolean(), random.nextBoolean() ? IndexOptions.DOCS_ONLY : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, null, null);
fail("instance should be read only");
} catch (IllegalStateException e) {
// expected


@ -117,8 +117,8 @@ public class TestMaxTermFrequency extends LuceneTestCase {
}
@Override
public byte computeNorm(FieldInvertState state) {
return encodeNormValue((float) state.getMaxTermFrequency());
public void computeNorm(FieldInvertState state, Norm norm) {
norm.setByte(encodeNormValue((float) state.getMaxTermFrequency()));
}
}
}


@ -18,20 +18,20 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.DocValues.Source;
import org.apache.lucene.index.DocValues.Type;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
/**
@ -39,7 +39,8 @@ import org.apache.lucene.util.LuceneTestCase;
* separate norms, addDocument, addIndexes, forceMerge.
*/
public class TestNorms extends LuceneTestCase {
final String byteTestField = "normsTestByte";
class CustomNormEncodingSimilarity extends DefaultSimilarity {
@Override
public byte encodeNormValue(float f) {
@ -52,8 +53,8 @@ public class TestNorms extends LuceneTestCase {
}
@Override
public byte computeNorm(FieldInvertState state) {
return encodeNormValue((float) state.getLength());
public void computeNorm(FieldInvertState state, Norm norm) {
norm.setByte(encodeNormValue((float) state.getLength()));
}
}
@ -93,4 +94,160 @@ public class TestNorms extends LuceneTestCase {
reader.close();
dir.close();
}
public void testMaxByteNorms() throws IOException {
Directory dir = newDirectory();
buildIndex(dir, true);
IndexReader open = new SlowMultiReaderWrapper(IndexReader.open(dir));
DocValues normValues = open.normValues(byteTestField);
assertNotNull(normValues);
Source source = normValues.getSource();
assertTrue(source.hasArray());
assertEquals(Type.FIXED_INTS_8, normValues.type());
byte[] norms = (byte[]) source.getArray();
for (int i = 0; i < open.maxDoc(); i++) {
Document document = open.document(i);
int expected = Integer.parseInt(document.get(byteTestField));
assertEquals((byte)expected, norms[i]);
}
open.close();
dir.close();
}
/**
* This test randomly creates segments with or without norms, but never omits
* norms. The similarity used doesn't write a norm value if writeNorms = false is
* passed. This differs from omitNorms in that norms are simply not written for this
* segment, while merging fills in default values based on the norm {@link Type}.
*/
public void testNormsNotPresent() throws IOException {
Directory dir = newDirectory();
boolean firstWriteNorm = random.nextBoolean();
buildIndex(dir, firstWriteNorm);
Directory otherDir = newDirectory();
boolean secondWriteNorm = random.nextBoolean();
buildIndex(otherDir, secondWriteNorm);
IndexReader reader = new SlowMultiReaderWrapper(IndexReader.open(otherDir));
FieldInfos fieldInfos = reader.getFieldInfos();
FieldInfo fieldInfo = fieldInfos.fieldInfo(byteTestField);
assertFalse(fieldInfo.omitNorms);
assertTrue(fieldInfo.isIndexed);
if (secondWriteNorm) {
assertTrue(fieldInfo.normsPresent());
} else {
assertFalse(fieldInfo.normsPresent());
}
IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random));
RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
writer.addIndexes(reader);
IndexReader mergedReader = new SlowMultiReaderWrapper(writer.getReader());
if (!firstWriteNorm && !secondWriteNorm) {
DocValues normValues = mergedReader.normValues(byteTestField);
assertNull(normValues);
FieldInfo fi = mergedReader.getFieldInfos().fieldInfo(byteTestField);
assertFalse(fi.omitNorms);
assertTrue(fi.isIndexed);
assertFalse(fi.normsPresent());
} else {
FieldInfo fi = mergedReader.getFieldInfos().fieldInfo(byteTestField);
assertFalse(fi.omitNorms);
assertTrue(fi.isIndexed);
assertTrue(fi.normsPresent());
DocValues normValues = mergedReader.normValues(byteTestField);
assertNotNull(normValues);
Source source = normValues.getSource();
assertTrue(source.hasArray());
assertEquals(Type.FIXED_INTS_8, normValues.type());
byte[] norms = (byte[]) source.getArray();
for (int i = 0; i < mergedReader.maxDoc(); i++) {
Document document = mergedReader.document(i);
int expected = Integer.parseInt(document.get(byteTestField));
assertEquals((byte) expected, norms[i]);
}
}
mergedReader.close();
reader.close();
writer.close();
dir.close();
otherDir.close();
}
public void buildIndex(Directory dir, boolean writeNorms) throws IOException,
CorruptIndexException {
IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random));
SimilarityProvider provider = new MySimProvider(writeNorms);
config.setSimilarityProvider(provider);
RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
final LineFileDocs docs = new LineFileDocs(random);
int num = atLeast(100);
for (int i = 0; i < num; i++) {
Document doc = docs.nextDoc();
int boost = writeNorms ? 1 + random.nextInt(255) : 0;
Field f = new Field(byteTestField, "" + boost,
TextField.TYPE_STORED);
f.setBoost(boost);
doc.add(f);
writer.addDocument(doc);
doc.removeField(byteTestField);
if (rarely()) {
writer.commit();
}
}
writer.commit();
writer.close();
}
public class MySimProvider implements SimilarityProvider {
SimilarityProvider delegate = new DefaultSimilarityProvider();
private boolean writeNorms;
public MySimProvider(boolean writeNorms) {
this.writeNorms = writeNorms;
}
@Override
public float queryNorm(float sumOfSquaredWeights) {
return delegate.queryNorm(sumOfSquaredWeights);
}
@Override
public Similarity get(String field) {
if (byteTestField.equals(field)) {
return new ByteEncodingBoostSimilarity(writeNorms);
} else {
return delegate.get(field);
}
}
@Override
public float coord(int overlap, int maxOverlap) {
return delegate.coord(overlap, maxOverlap);
}
}
public static class ByteEncodingBoostSimilarity extends DefaultSimilarity {
private boolean writeNorms;
public ByteEncodingBoostSimilarity(boolean writeNorms) {
this.writeNorms = writeNorms;
}
@Override
public void computeNorm(FieldInvertState state, Norm norm) {
if (writeNorms) {
int boost = (int) state.getBoost();
norm.setByte((byte) (0xFF & boost));
}
}
}
}


@ -44,8 +44,7 @@ public class TestOmitTf extends LuceneTestCase {
public float coord(int overlap, int maxOverlap) { return 1.0f; }
public Similarity get(String field) {
return new TFIDFSimilarity() {
@Override public byte computeNorm(FieldInvertState state) { return encodeNormValue(state.getBoost()); }
@Override public void computeNorm(FieldInvertState state, Norm norm) { norm.setByte(encodeNormValue(state.getBoost())); }
@Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(int docFreq, int numDocs) { return 1.0f; }


@ -102,8 +102,8 @@ public class TestUniqueTermCount extends LuceneTestCase {
class TestSimilarity extends DefaultSimilarity {
@Override
public byte computeNorm(FieldInvertState state) {
return (byte) state.getUniqueTermCount();
public void computeNorm(FieldInvertState state, Norm norm) {
norm.setByte((byte) state.getUniqueTermCount());
}
}
}


@ -20,6 +20,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Norm;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.util.Bits;
@ -261,7 +262,7 @@ final class JustCompileSearch {
}
@Override
public byte computeNorm(FieldInvertState state) {
public void computeNorm(FieldInvertState state, Norm norm) {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}
}


@ -24,6 +24,7 @@ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.SlowMultiReaderWrapper;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.RandomIndexWriter;
@ -66,9 +67,9 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
}
@Override
public byte computeNorm(FieldInvertState state) {
public void computeNorm(FieldInvertState state, Norm norm) {
// Disable length norm
return encodeNormValue(state.getBoost());
norm.setByte(encodeNormValue(state.getBoost()));
}
@Override


@ -29,6 +29,7 @@ import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.Similarity;
@ -152,8 +153,8 @@ public class TestDocValuesScoring extends LuceneTestCase {
}
@Override
public byte computeNorm(FieldInvertState state) {
return sim.computeNorm(state);
public void computeNorm(FieldInvertState state, Norm norm) {
sim.computeNorm(state, norm);
}
@Override


@ -24,6 +24,7 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.DefaultSimilarity;
@ -45,7 +46,7 @@ public class TestSimilarity extends LuceneTestCase {
public float coord(int overlap, int maxOverlap) { return 1.0f; }
public Similarity get(String field) {
return new DefaultSimilarity() {
@Override public byte computeNorm(FieldInvertState state) { return encodeNormValue(state.getBoost()); }
@Override public void computeNorm(FieldInvertState state, Norm norm) { norm.setByte(encodeNormValue(state.getBoost())); }
@Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(int docFreq, int numDocs) { return 1.0f; }


@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.Similarity;
@ -111,9 +112,10 @@ public class TestSimilarityProvider extends LuceneTestCase {
}
private class Sim1 extends TFIDFSimilarity {
@Override
public byte computeNorm(FieldInvertState state) {
return encodeNormValue(1f);
public void computeNorm(FieldInvertState state, Norm norm) {
norm.setByte(encodeNormValue(1f));
}
@Override
@ -138,9 +140,10 @@ public class TestSimilarityProvider extends LuceneTestCase {
}
private class Sim2 extends TFIDFSimilarity {
@Override
public byte computeNorm(FieldInvertState state) {
return encodeNormValue(10f);
public void computeNorm(FieldInvertState state, Norm norm) {
norm.setByte(encodeNormValue(10f));
}
@Override


@ -24,6 +24,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
@ -330,8 +331,8 @@ public class TestPayloadNearQuery extends LuceneTestCase {
//Make everything else 1 so we see the effect of the payload
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@Override
public byte computeNorm(FieldInvertState state) {
return encodeNormValue(state.getBoost());
public void computeNorm(FieldInvertState state, Norm norm) {
norm.setByte(encodeNormValue(state.getBoost()));
}
@Override


@ -37,6 +37,7 @@ import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
@ -324,9 +325,9 @@ public class TestPayloadTermQuery extends LuceneTestCase {
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
//Make everything else 1 so we see the effect of the payload
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@Override
public byte computeNorm(FieldInvertState state) {
return encodeNormValue(state.getBoost());
@Override
public void computeNorm(FieldInvertState state, Norm norm) {
norm.setByte(encodeNormValue(state.getBoost()));
}
@Override


@ -18,6 +18,7 @@
package org.apache.solr.search.function;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.Norm;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.similarities.DefaultSimilarity;
@ -346,8 +347,11 @@ public class TestFunctionQuery extends SolrTestCaseJ4 {
FieldInvertState state = new FieldInvertState();
state.setBoost(1.0f);
state.setLength(4);
Norm norm = new Norm();
similarity.computeNorm(state, norm);
float nrm = similarity.decodeNormValue(norm.field().numericValue().byteValue());
assertQ(req("fl","*,score","q", "{!func}norm(a_t)", "fq","id:2"),
"//float[@name='score']='" + similarity.decodeNormValue(similarity.computeNorm(state)) + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte
"//float[@name='score']='" + nrm + "'"); // sqrt(4)==2 and is exactly representable when quantized to a byte
// test that ord and rord are working on a global index basis, not just
// at the segment level (since Lucene 2.9 has switched to per-segment searching)