make field norms optional : LUCENE-448

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@329366 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2005-10-29 03:52:48 +00:00
parent 52c12ea335
commit ae11eb845b
13 changed files with 200 additions and 55 deletions

View File

@ -145,6 +145,10 @@ New features
20. Added a new class MatchAllDocsQuery that matches all documents.
(John Wang via Daniel Naber, bug #34946)
21. Added ability to omit norms on a per field basis to decrease
index size and memory consumption when there are many indexed fields.
See Field.setOmitNorms()
(Yonik Seeley, LUCENE-448)
API Changes

View File

@ -1245,10 +1245,20 @@ limitations under the License.
FieldBits --> Byte
</p>
<p>
<ul>
<li>
The low-order bit is one for
indexed fields, and zero for non-indexed fields. The second lowest-order
indexed fields, and zero for non-indexed fields.
</li>
<li>
The second lowest-order
bit is one for fields that have term vectors stored, and zero for fields
without term vectors.
</li>
<li> If the third lowest-order bit is set (0x04), term positions are stored with the term vectors. </li>
<li> If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors. </li>
<li> If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field. </li>
</ul>
</p>
<p>
Fields are numbered by their order in this file. Thus field zero is

View File

@ -42,6 +42,7 @@ public final class Field implements Serializable {
private boolean storeTermVector = false;
private boolean storeOffsetWithTermVector = false;
private boolean storePositionWithTermVector = false;
private boolean omitNorms = false;
private boolean isStored = false;
private boolean isIndexed = true;
private boolean isTokenized = true;
@ -94,6 +95,15 @@ public final class Field implements Serializable {
* useful for unique Ids like product numbers.
*/
public static final Index UN_TOKENIZED = new Index("UN_TOKENIZED");
/** Index the field's value without an Analyzer, and disable
* the storing of norms. No norms means that index-time boosting
* and field length normalization will be disabled. The benefit is
* less memory usage as norms take up one byte per indexed field
* for every document in the index.
*/
public static final Index NO_NORMS = new Index("NO_NORMS");
}
public static final class TermVector extends Parameter implements Serializable {
@ -338,6 +348,10 @@ public final class Field implements Serializable {
} else if (index == Index.UN_TOKENIZED) {
this.isIndexed = true;
this.isTokenized = false;
} else if (index == Index.NO_NORMS) {
this.isIndexed = true;
this.isTokenized = false;
this.omitNorms = true;
} else {
throw new IllegalArgumentException("unknown index parameter " + index);
}
@ -540,6 +554,16 @@ public final class Field implements Serializable {
/** True iff the value of the field is stored as binary */
public final boolean isBinary() { return isBinary; }
/** True if norms are omitted for this indexed field */
public boolean getOmitNorms() { return omitNorms; }
/** Expert:
*
* If set, omit normalization factors associated with this indexed field.
* This effectively disables indexing boosts and length normalization for this field.
*/
public void setOmitNorms(boolean omitNorms) { this.omitNorms=omitNorms; }
/** Prints a Field for human consumption. */
public final String toString() {
StringBuffer result = new StringBuffer();
@ -580,7 +604,9 @@ public final class Field implements Serializable {
result.append(",");
result.append("binary");
}
if (omitNorms) {
result.append(",omitNorms");
}
result.append('<');
result.append(name);
result.append(':');

View File

@ -371,7 +371,7 @@ final class DocumentWriter {
private final void writeNorms(String segment) throws IOException {
for(int n = 0; n < fieldInfos.size(); n++){
FieldInfo fi = fieldInfos.fieldInfo(n);
if(fi.isIndexed){
if(fi.isIndexed && !fi.omitNorms){
float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
IndexOutput norms = directory.createOutput(segment + ".f" + n);
try {

View File

@ -26,13 +26,16 @@ final class FieldInfo {
boolean storeOffsetWithTermVector;
boolean storePositionWithTermVector;
boolean omitNorms; // omit norms associated with indexed fields
FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
name = na;
isIndexed = tk;
number = nu;
this.storeTermVector = storeTermVector;
this.storeOffsetWithTermVector = storeOffsetWithTermVector;
this.storePositionWithTermVector = storePositionWithTermVector;
this.omitNorms = omitNorms;
}
}

View File

@ -38,6 +38,7 @@ final class FieldInfos {
static final byte STORE_TERMVECTOR = 0x2;
static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4;
static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8;
static final byte OMIT_NORMS = 0x10;
private ArrayList byNumber = new ArrayList();
private HashMap byName = new HashMap();
@ -66,7 +67,7 @@ final class FieldInfos {
while (fields.hasMoreElements()) {
Field field = (Field) fields.nextElement();
add(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(),
field.isStoreOffsetWithTermVector());
field.isStoreOffsetWithTermVector(), field.getOmitNorms());
}
}
@ -109,7 +110,7 @@ final class FieldInfos {
* @see #add(String, boolean, boolean, boolean, boolean)
*/
public void add(String name, boolean isIndexed) {
add(name, isIndexed, false, false, false);
add(name, isIndexed, false, false, false, false);
}
/**
@ -120,7 +121,7 @@ final class FieldInfos {
* @param storeTermVector true if the term vector should be stored
*/
public void add(String name, boolean isIndexed, boolean storeTermVector){
add(name, isIndexed, storeTermVector, false, false);
add(name, isIndexed, storeTermVector, false, false, false);
}
/** If the field is not yet known, adds it. If it is known, checks to make
@ -136,9 +137,27 @@ final class FieldInfos {
*/
/** Backward-compatible overload: registers the field with the given
 * index/term-vector flags and delegates to the six-argument form with
 * {@code omitNorms=false}, i.e. norms are stored for this field.
 */
public void add(String name, boolean isIndexed, boolean storeTermVector,
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
add(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, false);
}
/** If the field is not yet known, adds it. If it is known, checks to make
* sure that the isIndexed flag is the same as was given previously for this
* field. If not - marks it as being indexed. Same goes for the TermVector
* parameters.
*
* @param name The name of the field
* @param isIndexed true if the field is indexed
* @param storeTermVector true if the term vector should be stored
* @param storePositionWithTermVector true if the term vector with positions should be stored
* @param storeOffsetWithTermVector true if the term vector with offsets should be stored
* @param omitNorms true if the norms for the indexed field should be omitted
*/
public void add(String name, boolean isIndexed, boolean storeTermVector,
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
FieldInfo fi = fieldInfo(name);
if (fi == null) {
addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector);
addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms);
} else {
if (fi.isIndexed != isIndexed) {
fi.isIndexed = true; // once indexed, always index
@ -152,15 +171,20 @@ final class FieldInfos {
if (fi.storeOffsetWithTermVector != storeOffsetWithTermVector) {
fi.storeOffsetWithTermVector = true; // once vector, always vector
}
if (fi.omitNorms != omitNorms) {
fi.omitNorms = false; // once norms are stored, always store
}
}
}
private void addInternal(String name, boolean isIndexed,
boolean storeTermVector, boolean storePositionWithTermVector,
boolean storeOffsetWithTermVector) {
boolean storeOffsetWithTermVector, boolean omitNorms) {
FieldInfo fi =
new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
storeOffsetWithTermVector);
storeOffsetWithTermVector, omitNorms);
byNumber.add(fi);
byName.put(name, fi);
}
@ -245,6 +269,7 @@ final class FieldInfos {
if (fi.storeTermVector) bits |= STORE_TERMVECTOR;
if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
if (fi.omitNorms) bits |= OMIT_NORMS;
output.writeString(fi.name);
output.writeByte(bits);
}
@ -259,7 +284,9 @@ final class FieldInfos {
boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector);
boolean omitNorms = (bits & OMIT_NORMS) != 0;
addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms);
}
}

View File

@ -107,6 +107,10 @@ public class FilterIndexReader extends IndexReader {
public boolean hasDeletions() { return in.hasDeletions(); }
protected void doUndeleteAll() throws IOException { in.undeleteAll(); }
/** Returns true if norms are stored for {@code field}; delegates to the wrapped reader. */
public boolean hasNorms(String field) throws IOException {
return in.hasNorms(field);
}
public byte[] norms(String f) throws IOException { return in.norms(f); }
public void norms(String f, byte[] bytes, int offset) throws IOException {
in.norms(f, bytes, offset);

View File

@ -338,6 +338,13 @@ public abstract class IndexReader {
/** Returns true if any documents have been deleted */
public abstract boolean hasDeletions();
/** Returns true if norms exist for the named field.
 * This default implementation simply probes {@link #norms(String)} for a
 * non-null result, which preserves backward compatibility; subclasses
 * such as SegmentReader override it with a cheaper check.
 */
public boolean hasNorms(String field) throws IOException {
byte[] fieldNorms = norms(field);
return fieldNorms != null;
}
/** Returns the byte-encoded normalization factor for the named field of
* every document. This is used by the search code to score documents.
*

View File

@ -145,10 +145,25 @@ public class MultiReader extends IndexReader {
return hi;
}
/** Returns true if at least one sub-reader stores norms for {@code field}. */
public boolean hasNorms(String field) throws IOException {
boolean found = false;
for (int i = 0; !found && i < subReaders.length; i++) {
found = subReaders[i].hasNorms(field);
}
return found;
}
// Lazily-built array of fake norms, shared across all fields that lack
// stored norms (e.g. fields indexed with omitNorms).
private byte[] ones;

/** Returns the shared fake-norms array, building it on first use. */
private synchronized byte[] fakeNorms() {
if (ones == null) {
ones = SegmentReader.createFakeNorms(maxDoc());
}
return ones;
}
public synchronized byte[] norms(String field) throws IOException {
byte[] bytes = (byte[])normsCache.get(field);
if (bytes != null)
return bytes; // cache hit
if (!hasNorms(field))
return fakeNorms();
bytes = new byte[maxDoc()];
for (int i = 0; i < subReaders.length; i++)
@ -160,6 +175,7 @@ public class MultiReader extends IndexReader {
public synchronized void norms(String field, byte[] result, int offset)
throws IOException {
byte[] bytes = (byte[])normsCache.get(field);
if (bytes==null && !hasNorms(field)) bytes=fakeNorms();
if (bytes != null) // cache hit
System.arraycopy(bytes, 0, result, offset, maxDoc());

View File

@ -165,6 +165,10 @@ public class ParallelReader extends IndexReader {
return ((IndexReader)fieldToReader.get(field)).getTermFreqVector(n, field);
}
/** Delegates the norms check to the sub-reader that holds {@code field}.
 * NOTE(review): throws NullPointerException if the field is not mapped in
 * fieldToReader — confirm callers only pass fields known to this reader. */
public boolean hasNorms(String field) throws IOException {
return ((IndexReader)fieldToReader.get(field)).hasNorms(field);
}
public byte[] norms(String field) throws IOException {
return ((IndexReader)fieldToReader.get(field)).norms(field);
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.index;
import java.util.Vector;
import java.util.Iterator;
import java.util.Collection;
import java.io.IOException;
import org.apache.lucene.store.Directory;
@ -122,7 +123,7 @@ final class SegmentMerger {
// Field norm files
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed) {
if (fi.isIndexed && !fi.omitNorms) {
files.add(segment + ".f" + i);
}
}
@ -146,6 +147,15 @@ final class SegmentMerger {
return files;
}
/** Registers each named field as indexed with the given term-vector flags.
 * The omitNorms flag is derived per field: norms are omitted exactly when
 * the source reader has no norms stored for that field. */
private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
boolean storeOffsetWithTermVector) throws IOException {
for (Iterator it = names.iterator(); it.hasNext();) {
String field = (String) it.next();
boolean omitNorms = !reader.hasNorms(field);
fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms);
}
}
/**
*
* @return The number of documents in all of the readers
@ -156,11 +166,11 @@ final class SegmentMerger {
int docCount = 0;
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
fieldInfos.addIndexed(reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
fieldInfos.addIndexed(reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
fieldInfos.addIndexed(reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
fieldInfos.addIndexed(reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
fieldInfos.addIndexed(reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
}
fieldInfos.write(directory, segment + ".fnm");
@ -386,7 +396,7 @@ final class SegmentMerger {
private void mergeNorms() throws IOException {
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed) {
if (fi.isIndexed && !fi.omitNorms) {
IndexOutput output = directory.createOutput(segment + ".f" + i);
try {
for (int j = 0; j < readers.size(); j++) {

View File

@ -17,12 +17,7 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import java.util.Vector;
import java.util.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -30,6 +25,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.search.DefaultSimilarity;
/**
* @version $Id$
@ -260,7 +256,7 @@ class SegmentReader extends IndexReader {
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed){
if (fi.isIndexed && !fi.omitNorms){
String name;
if(cfsReader == null)
name = segment + ".f" + i;
@ -442,10 +438,28 @@ class SegmentReader extends IndexReader {
return fieldSet;
}
public synchronized byte[] norms(String field) throws IOException {
/** True iff norms are stored for {@code field}: a constant-time lookup in
 * the norms map (openNorms skips fields whose FieldInfo has omitNorms set,
 * so such fields have no entry). */
public synchronized boolean hasNorms(String field) {
return norms.containsKey(field);
}
/** Builds an array of {@code size} norm bytes, each encoding a boost of 1.0f. */
static byte[] createFakeNorms(int size) {
byte oneNorm = DefaultSimilarity.encodeNorm(1.0f);
byte[] fake = new byte[size];
for (int i = 0; i < fake.length; i++) {
fake[i] = oneNorm;
}
return fake;
}

// Cached fake norms for fields without stored norms; built on first use.
private byte[] ones;

/** Returns the shared fake-norms array, creating it lazily. */
private synchronized byte[] fakeNorms() {
if (ones == null) {
ones = createFakeNorms(maxDoc());
}
return ones;
}
// can return null if norms aren't stored
protected synchronized byte[] getNorms(String field) throws IOException {
Norm norm = (Norm) norms.get(field);
if (norm == null) // not an indexed field
return null;
if (norm == null) return null; // not indexed, or norms not stored
if (norm.bytes == null) { // value not yet read
byte[] bytes = new byte[maxDoc()];
norms(field, bytes, 0);
@ -454,6 +468,13 @@ class SegmentReader extends IndexReader {
return norm.bytes;
}
/** Returns the norms for {@code field}, substituting fake (all-1.0) norms
 * when none are stored for it. Never returns null. */
public synchronized byte[] norms(String field) throws IOException {
byte[] result = getNorms(field);
return (result != null) ? result : fakeNorms();
}
protected void doSetNorm(int doc, String field, byte value)
throws IOException {
Norm norm = (Norm) norms.get(field);
@ -470,8 +491,10 @@ class SegmentReader extends IndexReader {
throws IOException {
Norm norm = (Norm) norms.get(field);
if (norm == null)
return; // use zeros in array
if (norm == null) {
System.arraycopy(fakeNorms(), 0, bytes, offset, maxDoc());
return;
}
if (norm.bytes != null) { // can copy from cache
System.arraycopy(norm.bytes, 0, bytes, offset, maxDoc());
@ -487,10 +510,11 @@ class SegmentReader extends IndexReader {
}
}
private void openNorms(Directory cfsDir) throws IOException {
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed) {
if (fi.isIndexed && !fi.omitNorms) {
// look first if there are separate norms in compound format
String fileName = segment + ".s" + fi.number;
Directory d = directory();

View File

@ -848,10 +848,20 @@
</p>
<p>
<ul>
<li>
The low-order bit is one for
indexed fields, and zero for non-indexed fields. The second lowest-order
indexed fields, and zero for non-indexed fields.
</li>
<li>
The second lowest-order
bit is one for fields that have term vectors stored, and zero for fields
without term vectors.
</li>
<li> If the third lowest-order bit is set (0x04), term positions are stored with the term vectors. </li>
<li> If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors. </li>
<li> If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field. </li>
</ul>
</p>
<p>