mirror of https://github.com/apache/lucene.git
Distinguish between positions and length when indexing a field.

The length is now defined as the total number of tokens, not the final position. Length is used for score normalization (Similarity.lengthNorm()) and for controlling memory usage (IndexWriter.maxFieldLength). In both cases the total number of tokens is more reasonable than the final position. Position is used in phrase searching (see PhraseQuery and Token.setPositionIncrement()).

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150157 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bd2acf0bf8
commit
c03491e6a2
|
@ -103,7 +103,8 @@ final class DocumentWriter {
|
|||
|
||||
// invert doc into postingTable
|
||||
postingTable.clear(); // clear postingTable
|
||||
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
|
||||
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
|
||||
fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
|
||||
|
||||
fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
|
||||
Arrays.fill(fieldBoosts, doc.getBoost());
|
||||
|
@ -138,6 +139,7 @@ final class DocumentWriter {
|
|||
// Used to buffer a document before it is written to the index.
|
||||
private final Hashtable postingTable = new Hashtable();
|
||||
private int[] fieldLengths;
|
||||
private int[] fieldPositions;
|
||||
private float[] fieldBoosts;
|
||||
|
||||
// Tokenizes the fields of a document into Postings.
|
||||
|
@ -149,11 +151,13 @@ final class DocumentWriter {
|
|||
String fieldName = field.name();
|
||||
int fieldNumber = fieldInfos.fieldNumber(fieldName);
|
||||
|
||||
int position = fieldLengths[fieldNumber]; // position in field
|
||||
int length = fieldLengths[fieldNumber]; // length of field
|
||||
int position = fieldPositions[fieldNumber]; // position in field
|
||||
|
||||
if (field.isIndexed()) {
|
||||
if (!field.isTokenized()) { // un-tokenized field
|
||||
addPosition(fieldName, field.stringValue(), position++);
|
||||
length++;
|
||||
} else {
|
||||
Reader reader; // find or make Reader
|
||||
if (field.readerValue() != null)
|
||||
|
@ -170,14 +174,15 @@ final class DocumentWriter {
|
|||
for (Token t = stream.next(); t != null; t = stream.next()) {
|
||||
position += (t.getPositionIncrement() - 1);
|
||||
addPosition(fieldName, t.termText(), position++);
|
||||
if (position > maxFieldLength) break;
|
||||
if (++length > maxFieldLength) break;
|
||||
}
|
||||
} finally {
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
|
||||
fieldLengths[fieldNumber] = position; // save field length
|
||||
fieldLengths[fieldNumber] = length; // save field length
|
||||
fieldPositions[fieldNumber] = position; // save field position
|
||||
fieldBoosts[fieldNumber] *= field.getBoost();
|
||||
}
|
||||
}
|
||||
|
@ -321,7 +326,7 @@ final class DocumentWriter {
|
|||
if (field.isIndexed()) {
|
||||
int n = fieldInfos.fieldNumber(field.name());
|
||||
float norm =
|
||||
fieldBoosts[n] * similarity.lengthNorm(field.name(), fieldLengths[n]);
|
||||
fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]);
|
||||
OutputStream norms = directory.createFile(segment + ".f" + n);
|
||||
try {
|
||||
norms.writeByte(similarity.encodeNorm(norm));
|
||||
|
|
Loading…
Reference in New Issue