mirror of https://github.com/apache/lucene.git

LUCENE-755: Added the ability to store arbitrary binary metadata (payloads) in the posting list.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@518486 13f79535-47bb-0310-9956-ffa450edef68

parent 9da8211775
commit eb20c06a62
@ -82,6 +82,13 @@ New features

 2. LUCENE-822: Added FieldSelector capabilities to Searchable for use with RemoteSearcher,
    and other Searchable implementations. (Mark Miller, Grant Ingersoll)

 3. LUCENE-755: Added the ability to store arbitrary binary metadata in the posting list.
    These metadata are called payloads. For every position of a Token, one payload in the
    form of a variable-length byte array can be stored in the prox file.
    Remark: the APIs introduced with this feature are in an experimental state and thus
    contain appropriate warnings in the javadocs.
    (Michael Busch)

Optimizations

 1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions
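The following is a minimal, hypothetical sketch of how an application could attach payloads at indexing time with the APIs added in this commit. The PayloadFilter class name and the one-byte tag are illustrative assumptions; only Token.setPayload and the new Payload class come from the patch.

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Payload;

// Hypothetical filter that attaches a one-byte payload to every token it passes through.
public class PayloadFilter extends TokenFilter {
  private final byte tag;

  public PayloadFilter(TokenStream input, byte tag) {
    super(input);
    this.tag = tag;
  }

  public Token next() throws IOException {
    Token token = input.next();
    if (token != null) {
      // one payload per token position; the bytes end up inline in the prox file
      token.setPayload(new Payload(new byte[] { tag }));
    }
    return token;
  }
}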
@ -1,5 +1,8 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.index.TermPositions;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -20,23 +23,40 @@ package org.apache.lucene.analysis;
|
|||
/** A Token is an occurrence of a term from the text of a field. It consists of
|
||||
a term's text, the start and end offset of the term in the text of the field,
|
||||
and a type string.
|
||||
|
||||
<p>
|
||||
The start and end offsets permit applications to re-associate a token with
|
||||
its source text, e.g., to display highlighted query terms in a document
|
||||
browser, or to show matching text fragments in a KWIC (KeyWord In Context)
|
||||
display, etc.
|
||||
|
||||
<p>
|
||||
The type is an interned string, assigned by a lexical analyzer
|
||||
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
|
||||
belongs to. For example an end of sentence marker token might be implemented
|
||||
with type "eos". The default token type is "word". */
|
||||
with type "eos". The default token type is "word".
|
||||
<p>
|
||||
A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
|
||||
length byte array. Use {@link TermPositions#getPayloadLength()} and
|
||||
{@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
|
||||
|
||||
<br><br>
|
||||
<b>
|
||||
Warning: The status of the Payloads feature is experimental. The APIs
|
||||
introduced here might change in the future and will not be supported anymore
|
||||
in such a case. If you want to use this feature in a production environment
|
||||
you should wait for an official release.
|
||||
</b>
|
||||
|
||||
@see org.apache.lucene.index.Payload
|
||||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public class Token implements Cloneable {
|
||||
String termText; // the text of the term
|
||||
int startOffset; // start in source text
|
||||
int endOffset; // end in source text
|
||||
String type = "word"; // lexical type
|
||||
|
||||
|
||||
Payload payload;
|
||||
|
||||
private int positionIncrement = 1;
|
||||
|
||||
/** Constructs a Token with the given term text, and start & end offsets.
|
||||
|
@ -115,6 +135,36 @@ public class Token implements Cloneable {
|
|||
/** Returns this Token's lexical type. Defaults to "word". */
|
||||
public final String type() { return type; }
|
||||
|
||||
/**
|
||||
* Sets this Token's payload.<br>
|
||||
* <br>
|
||||
* <b>
|
||||
* Warning: The status of the Payloads feature is experimental. The APIs
|
||||
* introduced here might change in the future and will not be supported anymore
|
||||
* in such a case. If you want to use this feature in a production environment
|
||||
* you should wait for an official release.
|
||||
* </b>
|
||||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public void setPayload(Payload payload) {
|
||||
this.payload = payload;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns this Token's payload.<br>
|
||||
* <br>
|
||||
* <b>
|
||||
* Warning: The status of the Payloads feature is experimental. The APIs
|
||||
* introduced here might change in the future and will not be supported anymore
|
||||
* in such a case. If you want to use this feature in a production environment
|
||||
* you should wait for an official release.
|
||||
* </b>
|
||||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public Payload getPayload() {
|
||||
return this.payload;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append("(" + termText + "," + startOffset + "," + endOffset);
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.io.PrintStream;
|
|||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.Enumeration;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Iterator;
|
||||
|
@ -69,9 +70,30 @@ final class DocumentWriter {
|
|||
|
||||
final void addDocument(String segment, Document doc)
|
||||
throws CorruptIndexException, IOException {
|
||||
// write field names
|
||||
// create field infos
|
||||
fieldInfos = new FieldInfos();
|
||||
fieldInfos.add(doc);
|
||||
|
||||
// invert doc into postingTable
|
||||
postingTable.clear(); // clear postingTable
|
||||
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
|
||||
fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
|
||||
fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets
|
||||
fieldStoresPayloads = new BitSet(fieldInfos.size());
|
||||
|
||||
fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
|
||||
Arrays.fill(fieldBoosts, doc.getBoost());
|
||||
|
||||
// Before we write the FieldInfos we invert the Document. The reason is that
|
||||
// during inversion the TokenStreams of tokenized fields are being processed
|
||||
// and we might encounter tokens that have payloads associated with them. In
|
||||
// this case we have to update the FieldInfo of the particular field.
|
||||
invertDocument(doc);
|
||||
|
||||
// sort postingTable into an array
|
||||
Posting[] postings = sortPostingTable();
|
||||
|
||||
// write field infos
|
||||
fieldInfos.write(directory, segment + ".fnm");
|
||||
|
||||
// write field values
|
||||
|
@ -82,21 +104,7 @@ final class DocumentWriter {
|
|||
} finally {
|
||||
fieldsWriter.close();
|
||||
}
|
||||
|
||||
// invert doc into postingTable
|
||||
postingTable.clear(); // clear postingTable
|
||||
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
|
||||
fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
|
||||
fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets
|
||||
|
||||
fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
|
||||
Arrays.fill(fieldBoosts, doc.getBoost());
|
||||
|
||||
invertDocument(doc);
|
||||
|
||||
// sort postingTable into an array
|
||||
Posting[] postings = sortPostingTable();
|
||||
|
||||
|
||||
/*
|
||||
for (int i = 0; i < postings.length; i++) {
|
||||
Posting posting = postings[i];
|
||||
|
@ -125,6 +133,10 @@ final class DocumentWriter {
|
|||
private int[] fieldPositions;
|
||||
private int[] fieldOffsets;
|
||||
private float[] fieldBoosts;
|
||||
|
||||
// If any of the tokens of a particular field carry a payload
|
||||
// then we enable payloads for that field.
|
||||
private BitSet fieldStoresPayloads;
|
||||
|
||||
// Tokenizes the fields of a document into Postings.
|
||||
private final void invertDocument(Document doc)
|
||||
|
@ -144,9 +156,9 @@ final class DocumentWriter {
|
|||
if (!field.isTokenized()) { // un-tokenized field
|
||||
String stringValue = field.stringValue();
|
||||
if(field.isStoreOffsetWithTermVector())
|
||||
addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
|
||||
addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
|
||||
else
|
||||
addPosition(fieldName, stringValue, position++, null);
|
||||
addPosition(fieldName, stringValue, position++, null, null);
|
||||
offset += stringValue.length();
|
||||
length++;
|
||||
} else
|
||||
|
@ -167,10 +179,19 @@ final class DocumentWriter {
|
|||
for (Token t = stream.next(); t != null; t = stream.next()) {
|
||||
position += (t.getPositionIncrement() - 1);
|
||||
|
||||
if(field.isStoreOffsetWithTermVector())
|
||||
addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
|
||||
else
|
||||
addPosition(fieldName, t.termText(), position++, null);
|
||||
Payload payload = t.getPayload();
|
||||
if (payload != null) {
|
||||
// enable payloads for this field
|
||||
fieldStoresPayloads.set(fieldNumber);
|
||||
}
|
||||
|
||||
TermVectorOffsetInfo termVectorOffsetInfo;
|
||||
if (field.isStoreOffsetWithTermVector()) {
|
||||
termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
|
||||
} else {
|
||||
termVectorOffsetInfo = null;
|
||||
}
|
||||
addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);
|
||||
|
||||
lastToken = t;
|
||||
if (++length >= maxFieldLength) {
|
||||
|
@ -194,11 +215,16 @@ final class DocumentWriter {
|
|||
fieldOffsets[fieldNumber] = offset;
|
||||
}
|
||||
}
|
||||
|
||||
// update fieldInfos for all fields that have one or more tokens with payloads
|
||||
for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) {
|
||||
fieldInfos.fieldInfo(i).storePayloads = true;
|
||||
}
|
||||
}
|
||||
|
||||
private final Term termBuffer = new Term("", ""); // avoid consing
|
||||
|
||||
private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
|
||||
private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) {
|
||||
termBuffer.set(field, text);
|
||||
//System.out.println("Offset: " + offset);
|
||||
Posting ti = (Posting) postingTable.get(termBuffer);
|
||||
|
@ -209,9 +235,25 @@ final class DocumentWriter {
|
|||
int[] positions = ti.positions;
|
||||
System.arraycopy(positions, 0, newPositions, 0, freq);
|
||||
ti.positions = newPositions;
|
||||
|
||||
if (ti.payloads != null) {
|
||||
// the current field stores payloads
|
||||
Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array
|
||||
Payload[] payloads = ti.payloads;
|
||||
System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
|
||||
ti.payloads = newPayloads;
|
||||
}
|
||||
}
|
||||
ti.positions[freq] = position; // add new position
|
||||
|
||||
if (payload != null) {
|
||||
if (ti.payloads == null) {
|
||||
// lazily allocate payload array
|
||||
ti.payloads = new Payload[ti.positions.length];
|
||||
}
|
||||
ti.payloads[freq] = payload;
|
||||
}
|
||||
|
||||
if (offset != null) {
|
||||
if (ti.offsets.length == freq){
|
||||
TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
|
||||
|
@ -224,7 +266,7 @@ final class DocumentWriter {
|
|||
ti.freq = freq + 1; // update frequency
|
||||
} else { // word not seen before
|
||||
Term term = new Term(field, text, false);
|
||||
postingTable.put(term, new Posting(term, position, offset));
|
||||
postingTable.put(term, new Posting(term, position, payload, offset));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -307,10 +349,31 @@ final class DocumentWriter {
|
|||
termIndexInterval);
|
||||
TermInfo ti = new TermInfo();
|
||||
String currentField = null;
|
||||
|
||||
boolean currentFieldHasPayloads = false;
|
||||
|
||||
for (int i = 0; i < postings.length; i++) {
|
||||
Posting posting = postings[i];
|
||||
|
||||
// check to see if we switched to a new field
|
||||
String termField = posting.term.field();
|
||||
if (currentField != termField) {
|
||||
// changing field - see if there is something to save
|
||||
currentField = termField;
|
||||
FieldInfo fi = fieldInfos.fieldInfo(currentField);
|
||||
currentFieldHasPayloads = fi.storePayloads;
|
||||
if (fi.storeTermVector) {
|
||||
if (termVectorWriter == null) {
|
||||
termVectorWriter =
|
||||
new TermVectorsWriter(directory, segment, fieldInfos);
|
||||
termVectorWriter.openDocument();
|
||||
}
|
||||
termVectorWriter.openField(currentField);
|
||||
|
||||
} else if (termVectorWriter != null) {
|
||||
termVectorWriter.closeField();
|
||||
}
|
||||
}
|
||||
|
||||
// add an entry to the dictionary with pointers to prox and freq files
|
||||
ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
|
||||
tis.add(posting.term, ti);
|
||||
|
@ -326,28 +389,62 @@ final class DocumentWriter {
|
|||
|
||||
int lastPosition = 0; // write positions
|
||||
int[] positions = posting.positions;
|
||||
Payload[] payloads = posting.payloads;
|
||||
int lastPayloadLength = -1;
|
||||
|
||||
|
||||
// The following encoding is being used for positions and payloads:
|
||||
// Case 1: current field does not store payloads
|
||||
// Positions -> <PositionDelta>^freq
|
||||
// PositionDelta -> VInt
|
||||
// The PositionDelta is the difference between the current
|
||||
// and the previous position
|
||||
// Case 2: current field stores payloads
|
||||
// Positions -> <PositionDelta, Payload>^freq
|
||||
// Payload -> <PayloadLength?, PayloadData>
|
||||
// PositionDelta -> VInt
|
||||
// PayloadLength -> VInt
|
||||
// PayloadData -> byte^PayloadLength
|
||||
// In this case PositionDelta/2 is the difference between
|
||||
// the current and the previous position. If PositionDelta
|
||||
// is odd, then a PayloadLength encoded as VInt follows,
|
||||
// if PositionDelta is even, then it is assumed that the
|
||||
// length of the current Payload equals the length of the
|
||||
// previous Payload.
|
||||
for (int j = 0; j < postingFreq; j++) { // use delta-encoding
|
||||
int position = positions[j];
|
||||
prox.writeVInt(position - lastPosition);
|
||||
lastPosition = position;
|
||||
}
|
||||
// check to see if we switched to a new field
|
||||
String termField = posting.term.field();
|
||||
if (currentField != termField) {
|
||||
// changing field - see if there is something to save
|
||||
currentField = termField;
|
||||
FieldInfo fi = fieldInfos.fieldInfo(currentField);
|
||||
if (fi.storeTermVector) {
|
||||
if (termVectorWriter == null) {
|
||||
termVectorWriter =
|
||||
new TermVectorsWriter(directory, segment, fieldInfos);
|
||||
termVectorWriter.openDocument();
|
||||
int delta = position - lastPosition;
|
||||
if (currentFieldHasPayloads) {
|
||||
int payloadLength = 0;
|
||||
Payload payload = null;
|
||||
if (payloads != null) {
|
||||
payload = payloads[j];
|
||||
if (payload != null) {
|
||||
payloadLength = payload.length;
|
||||
}
|
||||
}
|
||||
termVectorWriter.openField(currentField);
|
||||
|
||||
} else if (termVectorWriter != null) {
|
||||
termVectorWriter.closeField();
|
||||
if (payloadLength == lastPayloadLength) {
|
||||
// the length of the current payload equals the length
|
||||
// of the previous one. So we do not have to store the length
|
||||
// again and we only shift the position delta by one bit
|
||||
prox.writeVInt(delta * 2);
|
||||
} else {
|
||||
// the length of the current payload is different from the
|
||||
// previous one. We shift the position delta, set the lowest
|
||||
// bit and store the current payload length as VInt.
|
||||
prox.writeVInt(delta * 2 + 1);
|
||||
prox.writeVInt(payloadLength);
|
||||
lastPayloadLength = payloadLength;
|
||||
}
|
||||
if (payloadLength > 0) {
|
||||
// write current payload
|
||||
prox.writeBytes(payload.data, payload.offset, payload.length);
|
||||
}
|
||||
} else {
|
||||
// field does not store payloads, just write position delta as VInt
|
||||
prox.writeVInt(delta);
|
||||
}
|
||||
lastPosition = position;
|
||||
}
|
||||
if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
|
||||
termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
|
||||
|
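The position/payload encoding described in the comment block of writePostings above can be illustrated with a small, self-contained sketch. This is not the Lucene code itself; the positions {4, 10, 17} and payload lengths {3, 3, 5} are made-up example values.

import java.io.IOException;

import org.apache.lucene.store.IndexOutput;

// Illustrative only: writes positions {4, 10, 17} with payload lengths {3, 3, 5}.
// Resulting VInts: 9, 3, <3 bytes>, 12, <3 bytes>, 15, 5, <5 bytes>.
class PositionEncodingSketch {
  static void encode(IndexOutput prox) throws IOException {
    int[] positions = {4, 10, 17};
    int[] payloadLengths = {3, 3, 5};
    byte[] payloadData = new byte[5];           // dummy payload bytes
    int lastPosition = 0;
    int lastPayloadLength = -1;
    for (int j = 0; j < positions.length; j++) {
      int delta = positions[j] - lastPosition;  // 4, 6, 7
      if (payloadLengths[j] == lastPayloadLength) {
        prox.writeVInt(delta * 2);              // even: payload length unchanged
      } else {
        prox.writeVInt(delta * 2 + 1);          // odd: a PayloadLength follows
        prox.writeVInt(payloadLengths[j]);
        lastPayloadLength = payloadLengths[j];
      }
      prox.writeBytes(payloadData, 0, payloadLengths[j]);
      lastPosition = positions[j];
    }
  }
}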
@ -397,18 +494,27 @@ final class Posting { // info about a Term in a doc
|
|||
Term term; // the Term
|
||||
int freq; // its frequency in doc
|
||||
int[] positions; // positions it occurs at
|
||||
Payload[] payloads; // the payloads of the terms
|
||||
TermVectorOffsetInfo [] offsets;
|
||||
|
||||
|
||||
Posting(Term t, int position, TermVectorOffsetInfo offset) {
|
||||
Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) {
|
||||
term = t;
|
||||
freq = 1;
|
||||
positions = new int[1];
|
||||
positions[0] = position;
|
||||
|
||||
if (payload != null) {
|
||||
payloads = new Payload[1];
|
||||
payloads[0] = payload;
|
||||
} else
|
||||
payloads = null;
|
||||
|
||||
|
||||
if(offset != null){
|
||||
offsets = new TermVectorOffsetInfo[1];
|
||||
offsets[0] = offset;
|
||||
}
|
||||
else
|
||||
offsets = new TermVectorOffsetInfo[1];
|
||||
offsets[0] = offset;
|
||||
} else
|
||||
offsets = null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,9 +28,12 @@ final class FieldInfo {
|
|||
boolean storePositionWithTermVector;
|
||||
|
||||
boolean omitNorms; // omit norms associated with indexed fields
|
||||
|
||||
boolean storePayloads; // whether this field stores payloads together with term positions
|
||||
|
||||
FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
|
||||
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
|
||||
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
|
||||
boolean omitNorms, boolean storePayloads) {
|
||||
name = na;
|
||||
isIndexed = tk;
|
||||
number = nu;
|
||||
|
@ -38,5 +41,6 @@ final class FieldInfo {
|
|||
this.storeOffsetWithTermVector = storeOffsetWithTermVector;
|
||||
this.storePositionWithTermVector = storePositionWithTermVector;
|
||||
this.omitNorms = omitNorms;
|
||||
this.storePayloads = storePayloads;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,6 +39,7 @@ final class FieldInfos {
|
|||
static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4;
|
||||
static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8;
|
||||
static final byte OMIT_NORMS = 0x10;
|
||||
static final byte STORE_PAYLOADS = 0x20;
|
||||
|
||||
private ArrayList byNumber = new ArrayList();
|
||||
private HashMap byName = new HashMap();
|
||||
|
@ -156,9 +157,29 @@ final class FieldInfos {
|
|||
*/
|
||||
public void add(String name, boolean isIndexed, boolean storeTermVector,
|
||||
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
|
||||
add(name, isIndexed, storeTermVector, storePositionWithTermVector,
|
||||
storeOffsetWithTermVector, omitNorms, false);
|
||||
}
|
||||
|
||||
/** If the field is not yet known, adds it. If it is known, checks to make
|
||||
* sure that the isIndexed flag is the same as was given previously for this
|
||||
* field. If not - marks it as being indexed. Same goes for the TermVector
|
||||
* parameters.
|
||||
*
|
||||
* @param name The name of the field
|
||||
* @param isIndexed true if the field is indexed
|
||||
* @param storeTermVector true if the term vector should be stored
|
||||
* @param storePositionWithTermVector true if the term vector with positions should be stored
|
||||
* @param storeOffsetWithTermVector true if the term vector with offsets should be stored
|
||||
* @param omitNorms true if the norms for the indexed field should be omitted
|
||||
* @param storePayloads true if payloads should be stored for this field
|
||||
*/
|
||||
public void add(String name, boolean isIndexed, boolean storeTermVector,
|
||||
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
|
||||
boolean omitNorms, boolean storePayloads) {
|
||||
FieldInfo fi = fieldInfo(name);
|
||||
if (fi == null) {
|
||||
addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms);
|
||||
addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
|
||||
} else {
|
||||
if (fi.isIndexed != isIndexed) {
|
||||
fi.isIndexed = true; // once indexed, always index
|
||||
|
@ -175,6 +196,9 @@ final class FieldInfos {
|
|||
if (fi.omitNorms != omitNorms) {
|
||||
fi.omitNorms = false; // once norms are stored, always store
|
||||
}
|
||||
if (fi.storePayloads != storePayloads) {
|
||||
fi.storePayloads = true;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -182,10 +206,10 @@ final class FieldInfos {
|
|||
|
||||
private void addInternal(String name, boolean isIndexed,
|
||||
boolean storeTermVector, boolean storePositionWithTermVector,
|
||||
boolean storeOffsetWithTermVector, boolean omitNorms) {
|
||||
boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads) {
|
||||
FieldInfo fi =
|
||||
new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
|
||||
storeOffsetWithTermVector, omitNorms);
|
||||
storeOffsetWithTermVector, omitNorms, storePayloads);
|
||||
byNumber.add(fi);
|
||||
byName.put(name, fi);
|
||||
}
|
||||
|
@ -271,6 +295,7 @@ final class FieldInfos {
|
|||
if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
|
||||
if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
|
||||
if (fi.omitNorms) bits |= OMIT_NORMS;
|
||||
if (fi.storePayloads) bits |= STORE_PAYLOADS;
|
||||
output.writeString(fi.name);
|
||||
output.writeByte(bits);
|
||||
}
|
||||
|
@ -286,8 +311,9 @@ final class FieldInfos {
|
|||
boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
|
||||
boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
|
||||
boolean omitNorms = (bits & OMIT_NORMS) != 0;
|
||||
|
||||
addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms);
|
||||
boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
|
||||
|
||||
addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -62,6 +62,14 @@ public class FilterIndexReader extends IndexReader {
|
|||
public int nextPosition() throws IOException {
|
||||
return ((TermPositions) this.in).nextPosition();
|
||||
}
|
||||
|
||||
public int getPayloadLength() {
|
||||
return ((TermPositions) this.in).getPayloadLength();
|
||||
}
|
||||
|
||||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||
return ((TermPositions) this.in).getPayload(data, offset);
|
||||
}
|
||||
}
|
||||
|
||||
/** Base class for filtering {@link TermEnum} implementations. */
|
||||
|
|
|
@ -67,6 +67,8 @@ public abstract class IndexReader {
|
|||
public static final FieldOption ALL = new FieldOption ("ALL");
|
||||
// all indexed fields
|
||||
public static final FieldOption INDEXED = new FieldOption ("INDEXED");
|
||||
// all fields that store payloads
|
||||
public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS");
|
||||
// all fields which are not indexed
|
||||
public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED");
|
||||
// all fields which are indexed with termvectors enables
|
||||
|
|
|
@ -455,5 +455,12 @@ class MultiTermPositions extends MultiTermDocs implements TermPositions {
|
|||
public int nextPosition() throws IOException {
|
||||
return ((TermPositions)current).nextPosition();
|
||||
}
|
||||
|
||||
|
||||
public int getPayloadLength() {
|
||||
return ((TermPositions)current).getPayloadLength();
|
||||
}
|
||||
|
||||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||
return ((TermPositions)current).getPayload(data, offset);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -191,5 +191,23 @@ public class MultipleTermPositions implements TermPositions {
|
|||
public int read(int[] arg0, int[] arg1) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Not implemented.
|
||||
* @throws UnsupportedOperationException
|
||||
*/
|
||||
public int getPayloadLength() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Not implemented.
|
||||
* @throws UnsupportedOperationException
|
||||
*/
|
||||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -419,7 +419,15 @@ public class ParallelReader extends IndexReader {
|
|||
return ((TermPositions)termDocs).nextPosition();
|
||||
}
|
||||
|
||||
public int getPayloadLength() {
|
||||
return ((TermPositions)termDocs).getPayloadLength();
|
||||
}
|
||||
|
||||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||
return ((TermPositions)termDocs).getPayload(data, offset);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* A Payload is metadata that can be stored together with each occurrence
|
||||
* of a term. This metadata is stored inline in the posting list of the
|
||||
* specific term.
|
||||
* <p>
|
||||
* To store payloads in the index a {@link TokenStream} has to be used that
|
||||
* produces {@link Token}s containing payload data.
|
||||
* <p>
|
||||
* Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
|
||||
* to retrieve the payloads from the index.<br>
|
||||
* <br>
|
||||
*
|
||||
* <b>
|
||||
* Warning: The status of the Payloads feature is experimental. The APIs
|
||||
* introduced here might change in the future and will not be supported anymore
|
||||
* in such a case. If you want to use this feature in a production environment
|
||||
* you should wait for an official release.
|
||||
* </b>
|
||||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public class Payload implements Serializable {
|
||||
protected byte[] data;
|
||||
protected int offset;
|
||||
protected int length;
|
||||
|
||||
protected Payload() {
|
||||
// no-arg constructor since this class implements Serializable
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new payload with the given array as data.
|
||||
*
|
||||
* @param data the data of this payload
|
||||
*/
|
||||
public Payload(byte[] data) {
|
||||
this(data, 0, data.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new payload with the given array as data.
|
||||
*
|
||||
* @param data the data of this payload
|
||||
* @param offset the offset in the data byte array
|
||||
* @param length the length of the data
|
||||
*/
|
||||
public Payload(byte[] data, int offset, int length) {
|
||||
if (offset < 0 || offset + length > data.length) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.data = data;
|
||||
this.offset = offset;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
public int length() {
|
||||
return this.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the byte at the given index.
|
||||
*/
|
||||
public byte byteAt(int index) {
|
||||
if (0 <= index && index < this.length) {
|
||||
return this.data[this.offset + index];
|
||||
}
|
||||
throw new ArrayIndexOutOfBoundsException(index);
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocates a new byte array, copies the payload data into it and returns it.
|
||||
*/
|
||||
public byte[] toByteArray() {
|
||||
byte[] retArray = new byte[this.length];
|
||||
System.arraycopy(this.data, this.offset, retArray, 0, this.length);
|
||||
return retArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies the payload data to a byte array.
|
||||
*
|
||||
* @param target the target byte array
|
||||
* @param targetOffset the offset in the target byte array
|
||||
*/
|
||||
public void copyTo(byte[] target, int targetOffset) {
|
||||
if (this.length > target.length + targetOffset) {
|
||||
throw new ArrayIndexOutOfBoundsException();
|
||||
}
|
||||
System.arraycopy(this.data, this.offset, target, targetOffset, this.length);
|
||||
}
|
||||
}
|
|
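A brief usage sketch for the new Payload class follows; the byte values are arbitrary, and only the constructors and accessors defined above are assumed.

import org.apache.lucene.index.Payload;

public class PayloadUsageSketch {
  public static void main(String[] args) {
    byte[] meta = new byte[] { 10, 20, 30, 40 };
    Payload payload = new Payload(meta, 1, 2);   // wraps bytes {20, 30} without copying

    System.out.println(payload.length());        // 2
    System.out.println(payload.byteAt(0));       // 20
    byte[] copy = payload.toByteArray();         // fresh array containing {20, 30}
    System.out.println(copy.length);             // 2
  }
}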
@ -157,11 +157,11 @@ final class SegmentMerger {
|
|||
}
|
||||
|
||||
private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
|
||||
boolean storeOffsetWithTermVector) throws IOException {
|
||||
boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException {
|
||||
Iterator i = names.iterator();
|
||||
while (i.hasNext()) {
|
||||
String field = (String)i.next();
|
||||
fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field));
|
||||
fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -176,11 +176,12 @@ final class SegmentMerger {
|
|||
int docCount = 0;
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
IndexReader reader = (IndexReader) readers.elementAt(i);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
|
||||
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
|
||||
fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
|
||||
}
|
||||
fieldInfos.write(directory, segment + ".fnm");
|
||||
|
@ -326,6 +327,8 @@ final class SegmentMerger {
|
|||
termInfosWriter.add(smis[0].term, termInfo);
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] payloadBuffer = null;
|
||||
|
||||
/** Process postings from multiple segments all positioned on the
|
||||
* same term. Writes out merged entries into freqOutput and
|
||||
|
@ -342,6 +345,8 @@ final class SegmentMerger {
|
|||
int lastDoc = 0;
|
||||
int df = 0; // number of docs w/ term
|
||||
resetSkip();
|
||||
boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
|
||||
int lastPayloadLength = -1; // ensures that we write the first length
|
||||
for (int i = 0; i < n; i++) {
|
||||
SegmentMergeInfo smi = smis[i];
|
||||
TermPositions postings = smi.getPositions();
|
||||
|
@ -361,7 +366,7 @@ final class SegmentMerger {
|
|||
df++;
|
||||
|
||||
if ((df % skipInterval) == 0) {
|
||||
bufferSkip(lastDoc);
|
||||
bufferSkip(lastDoc, storePayloads, lastPayloadLength);
|
||||
}
|
||||
|
||||
int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
|
||||
|
@ -374,11 +379,33 @@ final class SegmentMerger {
|
|||
freqOutput.writeVInt(docCode); // write doc
|
||||
freqOutput.writeVInt(freq); // write frequency in doc
|
||||
}
|
||||
|
||||
|
||||
/** See {@link DocumentWriter#writePostings(Posting[], String)} for
|
||||
* documentation about the encoding of positions and payloads
|
||||
*/
|
||||
int lastPosition = 0; // write position deltas
|
||||
for (int j = 0; j < freq; j++) {
|
||||
int position = postings.nextPosition();
|
||||
proxOutput.writeVInt(position - lastPosition);
|
||||
int delta = position - lastPosition;
|
||||
if (storePayloads) {
|
||||
int payloadLength = postings.getPayloadLength();
|
||||
if (payloadLength == lastPayloadLength) {
|
||||
proxOutput.writeVInt(delta * 2);
|
||||
} else {
|
||||
proxOutput.writeVInt(delta * 2 + 1);
|
||||
proxOutput.writeVInt(payloadLength);
|
||||
lastPayloadLength = payloadLength;
|
||||
}
|
||||
if (payloadLength > 0) {
|
||||
if (payloadBuffer == null || payloadBuffer.length < payloadLength) {
|
||||
payloadBuffer = new byte[payloadLength];
|
||||
}
|
||||
postings.getPayload(payloadBuffer, 0);
|
||||
proxOutput.writeBytes(payloadBuffer, 0, payloadLength);
|
||||
}
|
||||
} else {
|
||||
proxOutput.writeVInt(delta);
|
||||
}
|
||||
lastPosition = position;
|
||||
}
|
||||
}
|
||||
|
@ -388,21 +415,59 @@ final class SegmentMerger {
|
|||
|
||||
private RAMOutputStream skipBuffer = new RAMOutputStream();
|
||||
private int lastSkipDoc;
|
||||
private int lastSkipPayloadLength;
|
||||
private long lastSkipFreqPointer;
|
||||
private long lastSkipProxPointer;
|
||||
|
||||
private void resetSkip() {
|
||||
skipBuffer.reset();
|
||||
lastSkipDoc = 0;
|
||||
lastSkipPayloadLength = -1; // we don't have to write the first length in the skip list
|
||||
lastSkipFreqPointer = freqOutput.getFilePointer();
|
||||
lastSkipProxPointer = proxOutput.getFilePointer();
|
||||
}
|
||||
|
||||
private void bufferSkip(int doc) throws IOException {
|
||||
private void bufferSkip(int doc, boolean storePayloads, int payloadLength) throws IOException {
|
||||
long freqPointer = freqOutput.getFilePointer();
|
||||
long proxPointer = proxOutput.getFilePointer();
|
||||
|
||||
skipBuffer.writeVInt(doc - lastSkipDoc);
|
||||
// To efficiently store payloads in the posting lists we do not store the length of
|
||||
// every payload. Instead we omit the length for a payload if the previous payload had
|
||||
// the same length.
|
||||
// However, in order to support skipping, the payload length at every skip point must be known.
|
||||
// So we use the same length encoding that we use for the posting lists for the skip data as well:
|
||||
// Case 1: current field does not store payloads
|
||||
// SkipDatum --> DocSkip, FreqSkip, ProxSkip
|
||||
// DocSkip,FreqSkip,ProxSkip --> VInt
|
||||
// DocSkip records the document number before every SkipInterval th document in TermFreqs.
|
||||
// Document numbers are represented as differences from the previous value in the sequence.
|
||||
// Case 2: current field stores payloads
|
||||
// SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip
|
||||
// DocSkip,FreqSkip,ProxSkip --> VInt
|
||||
// PayloadLength --> VInt
|
||||
// In this case DocSkip/2 is the difference between
|
||||
// the current and the previous value. If DocSkip
|
||||
// is odd, then a PayloadLength encoded as VInt follows,
|
||||
// if DocSkip is even, then it is assumed that the
|
||||
// current payload length equals the length at the previous
|
||||
// skip point
|
||||
if (storePayloads) {
|
||||
int delta = doc - lastSkipDoc;
|
||||
if (payloadLength == lastSkipPayloadLength) {
|
||||
// the current payload length equals the length at the previous skip point,
|
||||
// so we don't store the length again
|
||||
skipBuffer.writeVInt(delta * 2);
|
||||
} else {
|
||||
// the payload length is different from the previous one. We shift the DocSkip,
|
||||
// set the lowest bit and store the current payload length as VInt.
|
||||
skipBuffer.writeVInt(delta * 2 + 1);
|
||||
skipBuffer.writeVInt(payloadLength);
|
||||
lastSkipPayloadLength = payloadLength;
|
||||
}
|
||||
} else {
|
||||
// current field does not store payloads
|
||||
skipBuffer.writeVInt(doc - lastSkipDoc);
|
||||
}
|
||||
skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
|
||||
skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
|
||||
|
||||
|
|
|
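A compact sketch of one skip datum written by bufferSkip above for a field that stores payloads; the document numbers, payload length, and file-pointer deltas are made-up values.

import java.io.IOException;

import org.apache.lucene.store.RAMOutputStream;

class SkipEntrySketch {
  // Previous skip doc 100, current skip doc 228, payload length changed to 4,
  // freq/prox pointer deltas of 500 and 800 bytes.
  static void writeOneEntry(RAMOutputStream skipBuffer) throws IOException {
    int docDelta = 228 - 100;                // 128
    skipBuffer.writeVInt(docDelta * 2 + 1);  // odd DocSkip: a PayloadLength follows
    skipBuffer.writeVInt(4);                 // PayloadLength
    skipBuffer.writeVInt(500);               // FreqSkip
    skipBuffer.writeVInt(800);               // ProxSkip
  }
}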
@ -374,6 +374,9 @@ class SegmentReader extends IndexReader {
|
|||
else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
|
||||
fieldSet.add(fi.name);
|
||||
}
|
||||
else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) {
|
||||
fieldSet.add(fi.name);
|
||||
}
|
||||
else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) {
|
||||
fieldSet.add(fi.name);
|
||||
}
|
||||
|
@ -582,7 +585,12 @@ class SegmentReader extends IndexReader {
|
|||
|
||||
return termVectorsReader.get(docNumber);
|
||||
}
|
||||
|
||||
|
||||
/** Returns the field infos of this segment */
|
||||
FieldInfos fieldInfos() {
|
||||
return fieldInfos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the name of the segment this reader is reading.
|
||||
*/
|
||||
|
|
|
@ -39,6 +39,9 @@ class SegmentTermDocs implements TermDocs {
|
|||
private long proxPointer;
|
||||
private long skipPointer;
|
||||
private boolean haveSkipped;
|
||||
|
||||
private int payloadLengthAtLastSkip;
|
||||
protected boolean currentFieldStoresPayloads;
|
||||
|
||||
protected SegmentTermDocs(SegmentReader parent) {
|
||||
this.parent = parent;
|
||||
|
@ -49,23 +52,31 @@ class SegmentTermDocs implements TermDocs {
|
|||
|
||||
public void seek(Term term) throws IOException {
|
||||
TermInfo ti = parent.tis.get(term);
|
||||
seek(ti);
|
||||
seek(ti, term);
|
||||
}
|
||||
|
||||
public void seek(TermEnum termEnum) throws IOException {
|
||||
TermInfo ti;
|
||||
Term term;
|
||||
|
||||
// use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
|
||||
if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) // optimized case
|
||||
ti = ((SegmentTermEnum) termEnum).termInfo();
|
||||
else // punt case
|
||||
ti = parent.tis.get(termEnum.term());
|
||||
|
||||
seek(ti);
|
||||
if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) { // optimized case
|
||||
SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum);
|
||||
term = segmentTermEnum.term();
|
||||
ti = segmentTermEnum.termInfo();
|
||||
} else { // punt case
|
||||
term = termEnum.term();
|
||||
ti = parent.tis.get(term);
|
||||
}
|
||||
|
||||
seek(ti, term);
|
||||
}
|
||||
|
||||
void seek(TermInfo ti) throws IOException {
|
||||
void seek(TermInfo ti, Term term) throws IOException {
|
||||
count = 0;
|
||||
payloadLengthAtLastSkip = 0;
|
||||
FieldInfo fi = parent.fieldInfos.fieldInfo(term.field);
|
||||
currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false;
|
||||
if (ti == null) {
|
||||
df = 0;
|
||||
} else {
|
||||
|
@ -141,7 +152,7 @@ class SegmentTermDocs implements TermDocs {
|
|||
}
|
||||
|
||||
/** Overridden by SegmentTermPositions to skip in prox stream. */
|
||||
protected void skipProx(long proxPointer) throws IOException {}
|
||||
protected void skipProx(long proxPointer, int payloadLength) throws IOException {}
|
||||
|
||||
/** Optimized implementation. */
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
|
@ -157,6 +168,7 @@ class SegmentTermDocs implements TermDocs {
|
|||
|
||||
// scan skip data
|
||||
int lastSkipDoc = skipDoc;
|
||||
int lastPayloadLength = 0;
|
||||
long lastFreqPointer = freqStream.getFilePointer();
|
||||
long lastProxPointer = -1;
|
||||
int numSkipped = -1 - (count % skipInterval);
|
||||
|
@ -165,6 +177,7 @@ class SegmentTermDocs implements TermDocs {
|
|||
lastSkipDoc = skipDoc;
|
||||
lastFreqPointer = freqPointer;
|
||||
lastProxPointer = proxPointer;
|
||||
lastPayloadLength = payloadLengthAtLastSkip;
|
||||
|
||||
if (skipDoc != 0 && skipDoc >= doc)
|
||||
numSkipped += skipInterval;
|
||||
|
@ -172,7 +185,21 @@ class SegmentTermDocs implements TermDocs {
|
|||
if(skipCount >= numSkips)
|
||||
break;
|
||||
|
||||
skipDoc += skipStream.readVInt();
|
||||
if (currentFieldStoresPayloads) {
|
||||
// the current field stores payloads.
|
||||
// if the doc delta is odd then we have
|
||||
// to read the current payload length
|
||||
// because it differs from the length of the
|
||||
// previous payload
|
||||
int delta = skipStream.readVInt();
|
||||
if ((delta & 1) != 0) {
|
||||
payloadLengthAtLastSkip = skipStream.readVInt();
|
||||
}
|
||||
delta >>>= 1;
|
||||
skipDoc += delta;
|
||||
} else {
|
||||
skipDoc += skipStream.readVInt();
|
||||
}
|
||||
freqPointer += skipStream.readVInt();
|
||||
proxPointer += skipStream.readVInt();
|
||||
|
||||
|
@ -182,7 +209,7 @@ class SegmentTermDocs implements TermDocs {
|
|||
// if we found something to skip, then skip it
|
||||
if (lastFreqPointer > freqStream.getFilePointer()) {
|
||||
freqStream.seek(lastFreqPointer);
|
||||
skipProx(lastProxPointer);
|
||||
skipProx(lastProxPointer, lastPayloadLength);
|
||||
|
||||
doc = lastSkipDoc;
|
||||
count += numSkipped;
|
||||
|
|
|
@ -27,6 +27,12 @@ extends SegmentTermDocs implements TermPositions {
|
|||
private int proxCount;
|
||||
private int position;
|
||||
|
||||
// the current payload length
|
||||
private int payloadLength;
|
||||
// indicates whether the payload of the current position has
|
||||
// been read from the proxStream yet
|
||||
private boolean needToLoadPayload;
|
||||
|
||||
// these variables are being used to remember information
|
||||
// for a lazy skip
|
||||
private long lazySkipPointer = 0;
|
||||
|
@ -37,13 +43,15 @@ extends SegmentTermDocs implements TermPositions {
|
|||
this.proxStream = null; // the proxStream will be cloned lazily when nextPosition() is called for the first time
|
||||
}
|
||||
|
||||
final void seek(TermInfo ti) throws IOException {
|
||||
super.seek(ti);
|
||||
final void seek(TermInfo ti, Term term) throws IOException {
|
||||
super.seek(ti, term);
|
||||
if (ti != null)
|
||||
lazySkipPointer = ti.proxPointer;
|
||||
|
||||
lazySkipProxCount = 0;
|
||||
proxCount = 0;
|
||||
payloadLength = 0;
|
||||
needToLoadPayload = false;
|
||||
}
|
||||
|
||||
public final void close() throws IOException {
|
||||
|
@ -55,9 +63,28 @@ extends SegmentTermDocs implements TermPositions {
|
|||
// perform lazy skips if necessary
|
||||
lazySkip();
|
||||
proxCount--;
|
||||
return position += proxStream.readVInt();
|
||||
return position += readDeltaPosition();
|
||||
}
|
||||
|
||||
private final int readDeltaPosition() throws IOException {
|
||||
int delta = proxStream.readVInt();
|
||||
if (currentFieldStoresPayloads) {
|
||||
// if the current field stores payloads then
|
||||
// the position delta is shifted one bit to the left.
|
||||
// if the LSB is set, then we have to read the current
|
||||
// payload length
|
||||
if ((delta & 1) != 0) {
|
||||
payloadLength = proxStream.readVInt();
|
||||
}
|
||||
delta >>>= 1;
|
||||
needToLoadPayload = true;
|
||||
} else {
|
||||
payloadLength = 0;
|
||||
needToLoadPayload = false;
|
||||
}
|
||||
return delta;
|
||||
}
|
||||
|
||||
protected final void skippingDoc() throws IOException {
|
||||
// we remember to skip a document lazily
|
||||
lazySkipProxCount += freq;
|
||||
|
@ -82,16 +109,27 @@ extends SegmentTermDocs implements TermPositions {
|
|||
|
||||
|
||||
/** Called by super.skipTo(). */
|
||||
protected void skipProx(long proxPointer) throws IOException {
|
||||
protected void skipProx(long proxPointer, int payloadLength) throws IOException {
|
||||
// we save the pointer, we might have to skip there lazily
|
||||
lazySkipPointer = proxPointer;
|
||||
lazySkipProxCount = 0;
|
||||
proxCount = 0;
|
||||
this.payloadLength = payloadLength;
|
||||
needToLoadPayload = false;
|
||||
}
|
||||
|
||||
private void skipPositions(int n) throws IOException {
|
||||
for (int f = n; f > 0; f--) // skip unread positions
|
||||
proxStream.readVInt();
|
||||
for (int f = n; f > 0; f--) { // skip unread positions
|
||||
readDeltaPosition();
|
||||
skipPayload();
|
||||
}
|
||||
}
|
||||
|
||||
private void skipPayload() throws IOException {
|
||||
if (needToLoadPayload && payloadLength > 0) {
|
||||
proxStream.seek(proxStream.getFilePointer() + payloadLength);
|
||||
}
|
||||
needToLoadPayload = false;
|
||||
}
|
||||
|
||||
// It is not always necessary to move the prox pointer
|
||||
|
@ -109,6 +147,10 @@ extends SegmentTermDocs implements TermPositions {
|
|||
// clone lazily
|
||||
proxStream = (IndexInput)parent.proxStream.clone();
|
||||
}
|
||||
|
||||
// we might have to skip the current payload
|
||||
// if it was not read yet
|
||||
skipPayload();
|
||||
|
||||
if (lazySkipPointer != 0) {
|
||||
proxStream.seek(lazySkipPointer);
|
||||
|
@ -120,5 +162,31 @@ extends SegmentTermDocs implements TermPositions {
|
|||
lazySkipProxCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public int getPayloadLength() {
|
||||
return payloadLength;
|
||||
}
|
||||
|
||||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||
if (!needToLoadPayload) {
|
||||
throw new IOException("Payload cannot be loaded more than once for the same term position.");
|
||||
}
|
||||
|
||||
// read payloads lazily
|
||||
byte[] retArray;
|
||||
int retOffset;
|
||||
if (data == null || data.length - offset < payloadLength) {
|
||||
// the array is too small to store the payload data,
|
||||
// so we allocate a new one
|
||||
retArray = new byte[payloadLength];
|
||||
retOffset = 0;
|
||||
} else {
|
||||
retArray = data;
|
||||
retOffset = offset;
|
||||
}
|
||||
proxStream.readBytes(retArray, retOffset, payloadLength);
|
||||
needToLoadPayload = false;
|
||||
return retArray;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -32,10 +32,53 @@ public interface TermPositions
|
|||
extends TermDocs
|
||||
{
|
||||
/** Returns next position in the current document. It is an error to call
|
||||
this more than {@link #freq()} times
|
||||
without calling {@link #next()}<p> This is
|
||||
invalid until {@link #next()} is called for
|
||||
the first time.
|
||||
this more than {@link #freq()} times
|
||||
without calling {@link #next()}<p> This is
|
||||
invalid until {@link #next()} is called for
|
||||
the first time.
|
||||
*/
|
||||
int nextPosition() throws IOException;
|
||||
|
||||
/**
|
||||
* Returns the length of the payload at the current term position.
|
||||
* This is invalid until {@link #nextPosition()} is called for
|
||||
* the first time.<br>
|
||||
* <br>
|
||||
* <b>
|
||||
* Warning: The status of the Payloads feature is experimental. The APIs
|
||||
* introduced here might change in the future and will not be supported anymore
|
||||
* in such a case. If you want to use this feature in a production environment
|
||||
* you should wait for an official release.
|
||||
* </b>
|
||||
* @return length of the current payload in number of bytes
|
||||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
int getPayloadLength();
|
||||
|
||||
/**
|
||||
* Returns the payload data at the current term position.
|
||||
* This is invalid until {@link #nextPosition()} is called for
|
||||
* the first time.
|
||||
* This method must not be called more than once after each call
|
||||
* of {@link #nextPosition()}. However, payloads are loaded lazily,
|
||||
* so if the payload data for the current position is not needed,
|
||||
* this method may not be called at all for performance reasons.<br>
|
||||
* <br>
|
||||
* <b>
|
||||
* Warning: The status of the Payloads feature is experimental. The APIs
|
||||
* introduced here might change in the future and will not be supported anymore
|
||||
* in such a case. If you want to use this feature in a production environment
|
||||
* you should wait for an official release.
|
||||
* </b>
|
||||
*
|
||||
* @param data the array into which the data of this payload is to be
|
||||
* stored, if it is big enough; otherwise, a new byte[] array
|
||||
* is allocated for this purpose.
|
||||
* @param offset the offset in the array into which the data of this payload
|
||||
* is to be stored.
|
||||
* @return a byte[] array containing the data of this payload
|
||||
* @throws IOException
|
||||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
byte[] getPayload(byte[] data, int offset) throws IOException;
|
||||
}
|
||||
|
|
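A minimal retrieval sketch that uses the two methods added to this interface. The directory, field name, and term are placeholders, and the field is assumed to have been indexed with payloads.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;

class PayloadReadSketch {
  // Iterates over all positions of one term and reads the payload stored at each position.
  static void dumpPayloads(Directory dir) throws IOException {
    IndexReader reader = IndexReader.open(dir);
    TermPositions tp = reader.termPositions(new Term("field", "term"));
    try {
      while (tp.next()) {
        for (int i = 0; i < tp.freq(); i++) {
          tp.nextPosition();
          int length = tp.getPayloadLength();
          // payloads are loaded lazily; call getPayload() only if the data is needed
          byte[] data = tp.getPayload(new byte[length], 0);
        }
      }
    } finally {
      tp.close();
      reader.close();
    }
  }
}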
|
@ -24,8 +24,8 @@ public abstract class BufferedIndexOutput extends IndexOutput {
|
|||
static final int BUFFER_SIZE = 1024;
|
||||
|
||||
private final byte[] buffer = new byte[BUFFER_SIZE];
|
||||
private long bufferStart = 0; // position in file of buffer
|
||||
private int bufferPosition = 0; // position in buffer
|
||||
private long bufferStart = 0; // position in file of buffer
|
||||
private int bufferPosition = 0; // position in buffer
|
||||
|
||||
/** Writes a single byte.
|
||||
* @see IndexInput#readByte()
|
||||
|
@ -41,12 +41,12 @@ public abstract class BufferedIndexOutput extends IndexOutput {
|
|||
* @param length the number of bytes to write
|
||||
* @see IndexInput#readBytes(byte[],int,int)
|
||||
*/
|
||||
public void writeBytes(byte[] b, int length) throws IOException {
|
||||
public void writeBytes(byte[] b, int offset, int length) throws IOException {
|
||||
int bytesLeft = BUFFER_SIZE - bufferPosition;
|
||||
// is there enough space in the buffer?
|
||||
if (bytesLeft >= length) {
|
||||
// we add the data to the end of the buffer
|
||||
System.arraycopy(b, 0, buffer, bufferPosition, length);
|
||||
System.arraycopy(b, offset, buffer, bufferPosition, length);
|
||||
bufferPosition += length;
|
||||
// if the buffer is full, flush it
|
||||
if (BUFFER_SIZE - bufferPosition == 0)
|
||||
|
@ -58,7 +58,7 @@ public abstract class BufferedIndexOutput extends IndexOutput {
|
|||
if (bufferPosition > 0)
|
||||
flush();
|
||||
// and write data at once
|
||||
flushBuffer(b, length);
|
||||
flushBuffer(b, offset, length);
|
||||
bufferStart += length;
|
||||
} else {
|
||||
// we fill/flush the buffer (until the input is written)
|
||||
|
@ -66,7 +66,7 @@ public abstract class BufferedIndexOutput extends IndexOutput {
|
|||
int pieceLength;
|
||||
while (pos < length) {
|
||||
pieceLength = (length - pos < bytesLeft) ? length - pos : bytesLeft;
|
||||
System.arraycopy(b, pos, buffer, bufferPosition, pieceLength);
|
||||
System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength);
|
||||
pos += pieceLength;
|
||||
bufferPosition += pieceLength;
|
||||
// if the buffer is full, flush it
|
||||
|
@ -92,8 +92,18 @@ public abstract class BufferedIndexOutput extends IndexOutput {
|
|||
* @param b the bytes to write
|
||||
* @param len the number of bytes to write
|
||||
*/
|
||||
protected abstract void flushBuffer(byte[] b, int len) throws IOException;
|
||||
private void flushBuffer(byte[] b, int len) throws IOException {
|
||||
flushBuffer(b, 0, len);
|
||||
}
|
||||
|
||||
/** Expert: implements buffer write. Writes bytes at the current position in
|
||||
* the output.
|
||||
* @param b the bytes to write
|
||||
* @param offset the offset in the byte array
|
||||
* @param len the number of bytes to write
|
||||
*/
|
||||
protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException;
|
||||
|
||||
/** Closes this stream to further operations. */
|
||||
public void close() throws IOException {
|
||||
flush();
|
||||
|
|
|
@ -588,8 +588,8 @@ class FSIndexOutput extends BufferedIndexOutput {
|
|||
}
|
||||
|
||||
/** output methods: */
|
||||
public void flushBuffer(byte[] b, int size) throws IOException {
|
||||
file.write(b, 0, size);
|
||||
public void flushBuffer(byte[] b, int offset, int size) throws IOException {
|
||||
file.write(b, offset, size);
|
||||
}
|
||||
public void close() throws IOException {
|
||||
// only close the file if it has not been closed yet
|
||||
|
|
|
@ -36,7 +36,17 @@ public abstract class IndexOutput {
|
|||
* @param length the number of bytes to write
|
||||
* @see IndexInput#readBytes(byte[],int,int)
|
||||
*/
|
||||
public abstract void writeBytes(byte[] b, int length) throws IOException;
|
||||
public void writeBytes(byte[] b, int length) throws IOException {
|
||||
writeBytes(b, 0, length);
|
||||
}
|
||||
|
||||
/** Writes an array of bytes.
|
||||
* @param b the bytes to write
|
||||
* @param offset the offset in the byte array
|
||||
* @param length the number of bytes to write
|
||||
* @see IndexInput#readBytes(byte[],int,int)
|
||||
*/
|
||||
public abstract void writeBytes(byte[] b, int offset, int length) throws IOException;
|
||||
|
||||
/** Writes an int as four bytes.
|
||||
* @see IndexInput#readInt()
|
||||
|
|
|
@ -66,7 +66,7 @@ public class RAMOutputStream extends BufferedIndexOutput {
|
|||
file.setLength(0);
|
||||
}
|
||||
|
||||
public void flushBuffer(byte[] src, int len) throws IOException {
|
||||
public void flushBuffer(byte[] src, int offset, int len) throws IOException {
|
||||
byte[] buffer;
|
||||
int bufferPos = 0;
|
||||
while (bufferPos != len) {
|
||||
|
@ -81,7 +81,7 @@ public class RAMOutputStream extends BufferedIndexOutput {
|
|||
else
|
||||
buffer = (byte[]) file.buffers.get(bufferNumber);
|
||||
|
||||
System.arraycopy(src, bufferPos, buffer, bufferOffset, bytesToCopy);
|
||||
System.arraycopy(src, offset + bufferPos, buffer, bufferOffset, bytesToCopy);
|
||||
bufferPos += bytesToCopy;
|
||||
pointer += bytesToCopy;
|
||||
}
|
||||
|
|
|
@ -1013,6 +1013,7 @@
|
|||
<li>If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.</li>
|
||||
<li>If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.</li>
|
||||
<li>If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.</li>
|
||||
<li>If the sixth lowest-order bit is set (0x20), payloads are stored for the indexed field.</li>
|
||||
</ul>
|
||||
</p>
|
||||
|
||||
|
@ -1298,9 +1299,9 @@
|
|||
<sup>DocFreq/SkipInterval</sup>
|
||||
</p>
|
||||
<p>SkipDatum -->
|
||||
DocSkip,FreqSkip,ProxSkip
|
||||
DocSkip,PayloadLength?,FreqSkip,ProxSkip
|
||||
</p>
|
||||
<p>DocDelta,Freq,DocSkip,FreqSkip,ProxSkip -->
|
||||
<p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip -->
|
||||
VInt
|
||||
</p>
|
||||
<p>TermFreqs
|
||||
|
@ -1328,9 +1329,17 @@
|
|||
SkipInterval
|
||||
<sup>th</sup>
|
||||
document in TermFreqs.
|
||||
Document numbers are represented as differences
|
||||
from the previous value in the sequence. FreqSkip
|
||||
and ProxSkip record the position of every
|
||||
If payloads are disabled for the term's field,
|
||||
then DocSkip represents the difference from the
|
||||
previous value in the sequence.
|
||||
If payloads are enabled for the term's field,
|
||||
then DocSkip/2 represents the difference from the
|
||||
previous value in the sequence. If payloads are enabled
|
||||
and DocSkip is odd,
|
||||
then PayloadLength is stored indicating the length
|
||||
of the last payload before the SkipInterval<sup>th</sup>
|
||||
document in TermPositions.
|
||||
FreqSkip and ProxSkip record the position of every
|
||||
SkipInterval
|
||||
<sup>th</sup>
|
||||
entry in FreqFile and
|
||||
|
@@ -1379,12 +1388,21 @@
             <sup>DocFreq</sup>
         </p>
         <p>Positions -->
-            <PositionDelta>
+            <PositionDelta,Payload?>
             <sup>Freq</sup>
         </p>
+        <p>Payload -->
+            <PayloadLength?,PayloadData>
+        </p>
         <p>PositionDelta -->
             VInt
         </p>
+        <p>PayloadLength -->
+            VInt
+        </p>
+        <p>PayloadData -->
+            byte<sup>PayloadLength</sup>
+        </p>
         <p>TermPositions
             are ordered by term (the term is implicit, from the .tis file).
         </p>
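Editor's illustrative sketch (not the committed DocumentWriter code): writing one entry of the Positions list above for a payload-enabled field. The helper name and the `storeLength` parameter are invented for the example; the even/odd PositionDelta convention comes from the documentation.

import java.io.IOException;
import org.apache.lucene.store.IndexOutput;

// Hypothetical writer for one position entry when payloads are enabled for the field.
// `storeLength` decides whether PayloadLength is written explicitly (odd delta) or
// inherited from the previous position (even delta).
class ProxWriteSketch {
    static void writePosition(IndexOutput prox, int positionDelta, byte[] payload,
                              boolean storeLength) throws IOException {
        int payloadLength = payload == null ? 0 : payload.length;
        if (storeLength) {
            prox.writeVInt((positionDelta << 1) | 1);   // odd => PayloadLength follows
            prox.writeVInt(payloadLength);
        } else {
            prox.writeVInt(positionDelta << 1);         // even => same length as previous payload
        }
        if (payloadLength > 0) {
            prox.writeBytes(payload, 0, payloadLength); // PayloadData
        }
    }
}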
@@ -1393,19 +1411,30 @@
             number is implicit from the .frq file).
         </p>
         <p>PositionDelta
-            is the difference between the position of the current occurrence in
+            is, if payloads are disabled for the term's field, the difference
+            between the position of the current occurrence in
             the document and the previous occurrence (or zero, if this is the
             first occurrence in this document).
+            If payloads are enabled for the term's field, then PositionDelta/2
+            is the difference between the current and the previous position. If
+            payloads are enabled and PositionDelta is odd, then PayloadLength is
+            stored, indicating the length of the payload at the current term position.
         </p>
         <p>
             For example, the TermPositions for a
             term which occurs as the fourth term in one document, and as the
             fifth and ninth term in a subsequent document, would be the following
-            sequence of VInts:
+            sequence of VInts (payloads disabled):
         </p>
         <p>4,
             5, 4
         </p>
+        <p>PayloadData
+            is metadata associated with the current term position. If PayloadLength
+            is stored at the current position, then it indicates the length of this
+            Payload. If PayloadLength is not stored, then this Payload has the same
+            length as the Payload at the previous position.
+        </p>
     </section>
     <section id="Normalization Factors"><title>Normalization Factors</title>
         <p>
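Editor's illustrative sketch (not the committed SegmentTermPositions code): the matching read side of the rules just described. The helper name and the one-element array used to carry the current payload length are assumptions for the example.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

// Hypothetical reader for one position entry of a payload-enabled field: returns the
// decoded PositionDelta and updates the current payload length in place. The
// PayloadData (currentPayloadLength[0] bytes) follows in the stream and can be read lazily.
class ProxReadSketch {
    static int readPositionDelta(IndexInput prox, int[] currentPayloadLength) throws IOException {
        int code = prox.readVInt();
        if ((code & 1) != 0) {                          // odd => an explicit PayloadLength follows
            currentPayloadLength[0] = prox.readVInt();
        }                                               // even => previous PayloadLength still applies
        return code >>> 1;                              // the actual delta is code/2
    }
}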
@@ -0,0 +1,443 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

public class TestPayloads extends TestCase {

    // Simple tests to test the Payload class
    public void testPayload() throws Exception {
        byte[] testData = "This is a test!".getBytes();
        Payload payload = new Payload(testData);
        assertEquals("Wrong payload length.", testData.length, payload.length());

        // test copyTo()
        byte[] target = new byte[testData.length - 1];
        try {
            payload.copyTo(target, 0);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        target = new byte[testData.length + 3];
        payload.copyTo(target, 3);

        for (int i = 0; i < testData.length; i++) {
            assertEquals(testData[i], target[i + 3]);
        }

        // test toByteArray()
        target = payload.toByteArray();
        assertByteArrayEquals(testData, target);

        // test byteAt()
        for (int i = 0; i < testData.length; i++) {
            assertEquals(payload.byteAt(i), testData[i]);
        }

        try {
            payload.byteAt(testData.length + 1);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }
    }

    // Tests whether the DocumentWriter and SegmentMerger correctly enable the
    // payload bit in the FieldInfo
    public void testPayloadFieldBit() throws Exception {
        Directory ram = new RAMDirectory();
        PayloadAnalyzer analyzer = new PayloadAnalyzer();
        IndexWriter writer = new IndexWriter(ram, analyzer, true);
        Document d = new Document();
        // this field won't have any payloads
        d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
        // this field will have payloads in all docs, however not for all term positions,
        // so this field is used to check if the DocumentWriter correctly enables the payloads bit
        // even if only some term positions have payloads
        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
        // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
        // enabled in only some documents
        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
        // only add payload data for field f2
        analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
        writer.addDocument(d);
        // flush
        writer.close();

        // only one segment in the index, so we can cast to SegmentReader
        SegmentReader reader = (SegmentReader) IndexReader.open(ram);
        FieldInfos fi = reader.fieldInfos();
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
        reader.close();

        // now we add another document which has payloads for field f3 and verify if the SegmentMerger
        // enabled payloads for that field
        writer = new IndexWriter(ram, analyzer, true);
        d = new Document();
        d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
        // add payload data for field f2 and f3
        analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
        analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
        writer.addDocument(d);
        // force merge
        writer.optimize();
        // flush
        writer.close();

        // only one segment in the index, so we can cast to SegmentReader
        reader = (SegmentReader) IndexReader.open(ram);
        fi = reader.fieldInfos();
        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
        assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
        reader.close();
    }

    // Tests if payloads are correctly stored and loaded using both RAMDirectory and FSDirectory
    public void testPayloadsEncoding() throws Exception {
        // first perform the test using a RAMDirectory
        Directory dir = new RAMDirectory();
        performTest(dir);

        // now use a FSDirectory and repeat the same test
        String dirName = "test_payloads";
        dir = FSDirectory.getDirectory(dirName);
        performTest(dir);
        rmDir(dirName);
    }

    // builds an index with payloads in the given Directory and performs
    // different tests to verify the payload encoding
    private void performTest(Directory dir) throws Exception {
        PayloadAnalyzer analyzer = new PayloadAnalyzer();
        IndexWriter writer = new IndexWriter(dir, analyzer, true);

        // should be in sync with value in TermInfosWriter
        final int skipInterval = 16;

        final int numTerms = 5;
        final String fieldName = "f1";

        int numDocs = skipInterval + 1;
        // create content for the test documents with just a few terms
        Term[] terms = generateTerms(fieldName, numTerms);
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < terms.length; i++) {
            sb.append(terms[i].text);
            sb.append(" ");
        }
        String content = sb.toString();

        int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
        byte[] payloadData = generateRandomData(payloadDataLength);

        Document d = new Document();
        d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED));
        // add the same document multiple times to have the same payload lengths for all
        // occurrences within two consecutive skip intervals
        int offset = 0;
        for (int i = 0; i < 2 * numDocs; i++) {
            analyzer.setPayloadData(fieldName, payloadData, offset, 1);
            offset += numTerms;
            writer.addDocument(d);
        }

        // now we make sure to have different payload lengths at the next skip point
        for (int i = 0; i < numDocs; i++) {
            analyzer.setPayloadData(fieldName, payloadData, offset, i);
            offset += i * numTerms;
            writer.addDocument(d);
        }

        writer.optimize();
        // flush
        writer.close();

        /*
         * Verify the index
         * first we test if all payloads are stored correctly
         */
        IndexReader reader = IndexReader.open(dir);

        byte[] verifyPayloadData = new byte[payloadDataLength];
        offset = 0;
        TermPositions[] tps = new TermPositions[numTerms];
        for (int i = 0; i < numTerms; i++) {
            tps[i] = reader.termPositions(terms[i]);
        }

        while (tps[0].next()) {
            for (int i = 1; i < numTerms; i++) {
                tps[i].next();
            }
            int freq = tps[0].freq();

            for (int i = 0; i < freq; i++) {
                for (int j = 0; j < numTerms; j++) {
                    tps[j].nextPosition();
                    tps[j].getPayload(verifyPayloadData, offset);
                    offset += tps[j].getPayloadLength();
                }
            }
        }

        for (int i = 0; i < numTerms; i++) {
            tps[i].close();
        }

        assertByteArrayEquals(payloadData, verifyPayloadData);

        /*
         * test lazy skipping
         */
        TermPositions tp = reader.termPositions(terms[0]);
        tp.next();
        tp.nextPosition();
        // now we don't read this payload
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        byte[] payload = tp.getPayload(null, 0);
        assertEquals(payload[0], payloadData[numTerms]);
        tp.nextPosition();

        // we don't read this payload and skip to a different document
        tp.skipTo(5);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        payload = tp.getPayload(null, 0);
        assertEquals(payload[0], payloadData[5 * numTerms]);

        /*
         * Test different lengths at skip points
         */
        tp.seek(terms[1]);
        tp.next();
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        tp.skipTo(skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        tp.skipTo(2 * skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
        tp.skipTo(3 * skipInterval - 1);
        tp.nextPosition();
        assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());

        /*
         * Test multiple calls of getPayload()
         */
        tp.getPayload(null, 0);
        try {
            // it is forbidden to call getPayload() more than once
            // without calling nextPosition()
            tp.getPayload(null, 0);
            fail("Expected exception not thrown");
        } catch (Exception expected) {
            // expected exception
        }

        reader.close();

        // test long payload
        analyzer = new PayloadAnalyzer();
        writer = new IndexWriter(dir, analyzer, true);
        String singleTerm = "lucene";

        d = new Document();
        d.add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.TOKENIZED));
        // add a payload whose length is greater than the buffer size of BufferedIndexOutput
        payloadData = generateRandomData(2000);
        analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
        writer.addDocument(d);

        writer.optimize();
        // flush
        writer.close();

        reader = IndexReader.open(dir);
        tp = reader.termPositions(new Term(fieldName, singleTerm));
        tp.next();
        tp.nextPosition();

        verifyPayloadData = new byte[tp.getPayloadLength()];
        tp.getPayload(verifyPayloadData, 0);
        byte[] portion = new byte[1500];
        System.arraycopy(payloadData, 100, portion, 0, 1500);

        assertByteArrayEquals(portion, verifyPayloadData);
        reader.close();
    }

    private byte[] generateRandomData(int n) {
        Random rnd = new Random();
        byte[] data = new byte[n];
        rnd.nextBytes(data);
        return data;
    }

    private Term[] generateTerms(String fieldName, int n) {
        int maxDigits = (int) (Math.log(n) / Math.log(10));
        Term[] terms = new Term[n];
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < n; i++) {
            sb.setLength(0);
            sb.append("t");
            int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
            for (int j = 0; j < zeros; j++) {
                sb.append("0");
            }
            sb.append(i);
            terms[i] = new Term(fieldName, sb.toString());
        }
        return terms;
    }

    private void rmDir(String dir) {
        File fileDir = new File(dir);
        if (fileDir.exists()) {
            File[] files = fileDir.listFiles();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    files[i].delete();
                }
            }
            fileDir.delete();
        }
    }

    void assertByteArrayEquals(byte[] b1, byte[] b2) {
        if (b1.length != b2.length) {
            fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
        }

        for (int i = 0; i < b1.length; i++) {
            if (b1[i] != b2[i]) {
                fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
            }
        }
    }

    /**
     * This Analyzer uses a WhitespaceTokenizer and PayloadFilter.
     */
    private static class PayloadAnalyzer extends Analyzer {
        Map fieldToData = new HashMap();

        void setPayloadData(String field, byte[] data, int offset, int length) {
            fieldToData.put(field, new PayloadData(0, data, offset, length));
        }

        void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
            fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
        }

        public TokenStream tokenStream(String fieldName, Reader reader) {
            PayloadData payload = (PayloadData) fieldToData.get(fieldName);
            TokenStream ts = new WhitespaceTokenizer(reader);
            if (payload != null) {
                if (payload.numFieldInstancesToSkip == 0) {
                    ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
                } else {
                    payload.numFieldInstancesToSkip--;
                }
            }
            return ts;
        }

        private static class PayloadData {
            byte[] data;
            int offset;
            int length;
            int numFieldInstancesToSkip;

            PayloadData(int skip, byte[] data, int offset, int length) {
                numFieldInstancesToSkip = skip;
                this.data = data;
                this.offset = offset;
                this.length = length;
            }
        }
    }

    /**
     * This Filter adds payloads to the tokens.
     */
    private static class PayloadFilter extends TokenFilter {
        private byte[] data;
        private int length;
        private int offset;

        public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
            super(in);
            this.data = data;
            this.length = length;
            this.offset = offset;
        }

        public Token next() throws IOException {
            Token nextToken = input.next();
            if (nextToken != null && offset + length <= data.length) {
                nextToken.setPayload(new Payload(data, offset, length));
                offset += length;
            }

            return nextToken;
        }
    }
}

@@ -48,7 +48,7 @@ public class MockRAMOutputStream extends RAMOutputStream {
     }
   }
 
-  public void flushBuffer(byte[] src, int len) throws IOException {
+  public void flushBuffer(byte[] src, int offset, int len) throws IOException {
     long freeSpace = dir.maxSize - dir.sizeInBytes();
     long realUsage = 0;
 
@@ -63,14 +63,14 @@ public class MockRAMOutputStream extends RAMOutputStream {
     if (dir.maxSize != 0 && freeSpace <= len) {
       if (freeSpace > 0 && freeSpace < len) {
         realUsage += freeSpace;
-        super.flushBuffer(src, (int) freeSpace);
+        super.flushBuffer(src, offset, (int) freeSpace);
       }
       if (realUsage > dir.maxUsedSize) {
         dir.maxUsedSize = realUsage;
       }
       throw new IOException("fake disk full at " + dir.getRecomputedActualSizeInBytes() + " bytes");
     } else {
-      super.flushBuffer(src, len);
+      super.flushBuffer(src, offset, len);
     }
 
     if (first) {