mirror of https://github.com/apache/lucene.git
LUCENE-755: Added the ability to store arbitrary binary metadata (payloads) in the posting list.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@518486 13f79535-47bb-0310-9956-ffa450edef68
parent 9da8211775
commit eb20c06a62
@@ -82,6 +82,13 @@ New features
 
  2. LUCENE-822: Added FieldSelector capabilities to Searchable for use with
     RemoteSearcher, and other Searchable implementations.
     (Mark Miller, Grant Ingersoll)
 
+ 3. LUCENE-755: Added the ability to store arbitrary binary metadata in the
+    posting list. These metadata are called Payloads. For every position of
+    a Token one Payload in the form of a variable length byte array can be
+    stored in the prox file.
+    Remark: The APIs introduced with this feature are in experimental state
+    and thus contain appropriate warnings in the javadocs.
+    (Michael Busch)
 
 Optimizations
 
  1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions
@@ -1,5 +1,8 @@
 package org.apache.lucene.analysis;
 
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.TermPositions;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -20,23 +23,40 @@ package org.apache.lucene.analysis;
 /** A Token is an occurrence of a term from the text of a field.  It consists of
   a term's text, the start and end offset of the term in the text of the field,
   and a type string.
+  <p>
   The start and end offsets permit applications to re-associate a token with
   its source text, e.g., to display highlighted query terms in a document
   browser, or to show matching text fragments in a KWIC (KeyWord In Context)
   display, etc.
+  <p>
   The type is an interned string, assigned by a lexical analyzer
   (a.k.a. tokenizer), naming the lexical or syntactic class that the token
   belongs to.  For example an end of sentence marker token might be implemented
-  with type "eos".  The default token type is "word".  */
+  with type "eos".  The default token type is "word".
+  <p>
+  A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
+  length byte array. Use {@link TermPositions#getPayloadLength()} and
+  {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
+
+  <br><br>
+  <b>
+  Warning: The status of the Payloads feature is experimental. The APIs
+  introduced here might change in the future and will not be supported anymore
+  in such a case. If you want to use this feature in a production environment
+  you should wait for an official release.
+  </b>
+
+  @see org.apache.lucene.index.Payload
+  */
+// TODO: Remove warning after API has been finalized
 public class Token implements Cloneable {
   String termText;                          // the text of the term
   int startOffset;                          // start in source text
   int endOffset;                            // end in source text
   String type = "word";                     // lexical type
+
+  Payload payload;
 
   private int positionIncrement = 1;
 
   /** Constructs a Token with the given term text, and start & end offsets.
@@ -115,6 +135,36 @@ public class Token implements Cloneable {
   /** Returns this Token's lexical type.  Defaults to "word". */
   public final String type() { return type; }
 
+  /**
+   * Sets this Token's payload.<br>
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>
+   */
+  // TODO: Remove warning after API has been finalized
+  public void setPayload(Payload payload) {
+    this.payload = payload;
+  }
+
+  /**
+   * Returns this Token's payload.<br>
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>
+   */
+  // TODO: Remove warning after API has been finalized
+  public Payload getPayload() {
+    return this.payload;
+  }
+
   public String toString() {
     StringBuffer sb = new StringBuffer();
     sb.append("(" + termText + "," + startOffset + "," + endOffset);
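The two methods above are the writer-side entry point for payloads. A minimal sketch of attaching one during analysis, assuming the Token-returning TokenStream API of this era; the filter class and the constant one-byte tag are invented for illustration, not part of the commit:

  import java.io.IOException;
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.TokenFilter;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.index.Payload;

  // Attaches a constant one-byte payload to every token of the wrapped stream.
  class TaggingTokenFilter extends TokenFilter {
    private final byte tag;

    TaggingTokenFilter(TokenStream input, byte tag) {
      super(input);
      this.tag = tag;
    }

    public Token next() throws IOException {
      Token t = input.next();
      if (t != null) {
        t.setPayload(new Payload(new byte[] { tag }));  // one payload per position
      }
      return t;
    }
  }

Any analyzer that wraps its stream in such a filter will cause DocumentWriter (below) to flag the field as payload-storing.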
@@ -31,6 +31,7 @@ import java.io.PrintStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
+import java.util.BitSet;
 import java.util.Enumeration;
 import java.util.Hashtable;
 import java.util.Iterator;
@@ -69,9 +70,30 @@ final class DocumentWriter {
 
   final void addDocument(String segment, Document doc)
           throws CorruptIndexException, IOException {
-    // write field names
+    // create field infos
     fieldInfos = new FieldInfos();
     fieldInfos.add(doc);
+
+    // invert doc into postingTable
+    postingTable.clear();                         // clear postingTable
+    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
+    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
+    fieldOffsets = new int[fieldInfos.size()];    // init fieldOffsets
+    fieldStoresPayloads = new BitSet(fieldInfos.size());
+
+    fieldBoosts = new float[fieldInfos.size()];   // init fieldBoosts
+    Arrays.fill(fieldBoosts, doc.getBoost());
+
+    // Before we write the FieldInfos we invert the Document. The reason is that
+    // during inversion the TokenStreams of tokenized fields are being processed
+    // and we might encounter tokens that have payloads associated with them. In
+    // this case we have to update the FieldInfo of the particular field.
+    invertDocument(doc);
+
+    // sort postingTable into an array
+    Posting[] postings = sortPostingTable();
+
+    // write field infos
     fieldInfos.write(directory, segment + ".fnm");
 
     // write field values
@@ -82,21 +104,7 @@ final class DocumentWriter {
     } finally {
       fieldsWriter.close();
     }
-
-    // invert doc into postingTable
-    postingTable.clear();                         // clear postingTable
-    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
-    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
-    fieldOffsets = new int[fieldInfos.size()];    // init fieldOffsets
-
-    fieldBoosts = new float[fieldInfos.size()];   // init fieldBoosts
-    Arrays.fill(fieldBoosts, doc.getBoost());
-
-    invertDocument(doc);
-
-    // sort postingTable into an array
-    Posting[] postings = sortPostingTable();
 
     /*
     for (int i = 0; i < postings.length; i++) {
       Posting posting = postings[i];
@@ -125,6 +133,10 @@ final class DocumentWriter {
   private int[] fieldPositions;
   private int[] fieldOffsets;
   private float[] fieldBoosts;
+
+  // If any of the tokens of a particular field carry a payload
+  // then we enable payloads for that field.
+  private BitSet fieldStoresPayloads;
 
   // Tokenizes the fields of a document into Postings.
   private final void invertDocument(Document doc)
@@ -144,9 +156,9 @@ final class DocumentWriter {
       if (!field.isTokenized()) {               // un-tokenized field
         String stringValue = field.stringValue();
         if(field.isStoreOffsetWithTermVector())
-          addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
+          addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
         else
-          addPosition(fieldName, stringValue, position++, null);
+          addPosition(fieldName, stringValue, position++, null, null);
         offset += stringValue.length();
         length++;
       } else
@@ -167,10 +179,19 @@ final class DocumentWriter {
           for (Token t = stream.next(); t != null; t = stream.next()) {
             position += (t.getPositionIncrement() - 1);
 
-            if(field.isStoreOffsetWithTermVector())
-              addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
-            else
-              addPosition(fieldName, t.termText(), position++, null);
+            Payload payload = t.getPayload();
+            if (payload != null) {
+              // enable payloads for this field
+              fieldStoresPayloads.set(fieldNumber);
+            }
+
+            TermVectorOffsetInfo termVectorOffsetInfo;
+            if (field.isStoreOffsetWithTermVector()) {
+              termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
+            } else {
+              termVectorOffsetInfo = null;
+            }
+            addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);
 
             lastToken = t;
             if (++length >= maxFieldLength) {
@@ -194,11 +215,16 @@ final class DocumentWriter {
         fieldOffsets[fieldNumber] = offset;
       }
     }
+
+    // update fieldInfos for all fields that have one or more tokens with payloads
+    for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) {
+      fieldInfos.fieldInfo(i).storePayloads = true;
+    }
   }
 
   private final Term termBuffer = new Term("", ""); // avoid consing
 
-  private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
+  private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) {
     termBuffer.set(field, text);
     //System.out.println("Offset: " + offset);
     Posting ti = (Posting) postingTable.get(termBuffer);
@@ -209,9 +235,25 @@ final class DocumentWriter {
         int[] positions = ti.positions;
         System.arraycopy(positions, 0, newPositions, 0, freq);
         ti.positions = newPositions;
+
+        if (ti.payloads != null) {
+          // the current field stores payloads
+          Payload[] newPayloads = new Payload[freq * 2];  // grow payloads array
+          Payload[] payloads = ti.payloads;
+          System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
+          ti.payloads = newPayloads;
+        }
       }
       ti.positions[freq] = position;              // add new position
+
+      if (payload != null) {
+        if (ti.payloads == null) {
+          // lazily allocate payload array
+          ti.payloads = new Payload[ti.positions.length];
+        }
+        ti.payloads[freq] = payload;
+      }
 
       if (offset != null) {
         if (ti.offsets.length == freq){
           TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
@@ -224,7 +266,7 @@ final class DocumentWriter {
       ti.freq = freq + 1;                         // update frequency
     } else {                                      // word not seen before
       Term term = new Term(field, text, false);
-      postingTable.put(term, new Posting(term, position, offset));
+      postingTable.put(term, new Posting(term, position, payload, offset));
     }
   }
 
@@ -307,10 +349,31 @@ final class DocumentWriter {
                               termIndexInterval);
       TermInfo ti = new TermInfo();
       String currentField = null;
+      boolean currentFieldHasPayloads = false;
 
       for (int i = 0; i < postings.length; i++) {
         Posting posting = postings[i];
 
+        // check to see if we switched to a new field
+        String termField = posting.term.field();
+        if (currentField != termField) {
+          // changing field - see if there is something to save
+          currentField = termField;
+          FieldInfo fi = fieldInfos.fieldInfo(currentField);
+          currentFieldHasPayloads = fi.storePayloads;
+          if (fi.storeTermVector) {
+            if (termVectorWriter == null) {
+              termVectorWriter =
+                new TermVectorsWriter(directory, segment, fieldInfos);
+              termVectorWriter.openDocument();
+            }
+            termVectorWriter.openField(currentField);
+          } else if (termVectorWriter != null) {
+            termVectorWriter.closeField();
+          }
+        }
+
         // add an entry to the dictionary with pointers to prox and freq files
         ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
         tis.add(posting.term, ti);
@@ -326,28 +389,62 @@ final class DocumentWriter {
 
         int lastPosition = 0;                     // write positions
         int[] positions = posting.positions;
+        Payload[] payloads = posting.payloads;
+        int lastPayloadLength = -1;
+
+        // The following encoding is being used for positions and payloads:
+        // Case 1: current field does not store payloads
+        //   Positions     -> <PositionDelta>^freq
+        //   PositionDelta -> VInt
+        //   The PositionDelta is the difference between the current
+        //   and the previous position
+        // Case 2: current field stores payloads
+        //   Positions     -> <PositionDelta, Payload>^freq
+        //   Payload       -> <PayloadLength?, PayloadData>
+        //   PositionDelta -> VInt
+        //   PayloadLength -> VInt
+        //   PayloadData   -> byte^PayloadLength
+        //   In this case PositionDelta/2 is the difference between
+        //   the current and the previous position. If PositionDelta
+        //   is odd, then a PayloadLength encoded as VInt follows,
+        //   if PositionDelta is even, then it is assumed that the
+        //   length of the current Payload equals the length of the
+        //   previous Payload.
         for (int j = 0; j < postingFreq; j++) {   // use delta-encoding
           int position = positions[j];
-          prox.writeVInt(position - lastPosition);
-          lastPosition = position;
-        }
-        // check to see if we switched to a new field
-        String termField = posting.term.field();
-        if (currentField != termField) {
-          // changing field - see if there is something to save
-          currentField = termField;
-          FieldInfo fi = fieldInfos.fieldInfo(currentField);
-          if (fi.storeTermVector) {
-            if (termVectorWriter == null) {
-              termVectorWriter =
-                new TermVectorsWriter(directory, segment, fieldInfos);
-              termVectorWriter.openDocument();
-            }
-            termVectorWriter.openField(currentField);
-          } else if (termVectorWriter != null) {
-            termVectorWriter.closeField();
+          int delta = position - lastPosition;
+          if (currentFieldHasPayloads) {
+            int payloadLength = 0;
+            Payload payload = null;
+            if (payloads != null) {
+              payload = payloads[j];
+              if (payload != null) {
+                payloadLength = payload.length;
+              }
+            }
+            if (payloadLength == lastPayloadLength) {
+              // the length of the current payload equals the length
+              // of the previous one. So we do not have to store the length
+              // again and we only shift the position delta by one bit
+              prox.writeVInt(delta * 2);
+            } else {
+              // the length of the current payload is different from the
+              // previous one. We shift the position delta, set the lowest
+              // bit and store the current payload length as VInt.
+              prox.writeVInt(delta * 2 + 1);
+              prox.writeVInt(payloadLength);
+              lastPayloadLength = payloadLength;
+            }
+            if (payloadLength > 0) {
+              // write current payload
+              prox.writeBytes(payload.data, payload.offset, payload.length);
+            }
+          } else {
+            // field does not store payloads, just write position delta as VInt
+            prox.writeVInt(delta);
           }
+          lastPosition = position;
         }
         if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
             termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
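To make the encoding comment above concrete, here is a small self-contained sketch of the same VInt scheme. The writeVInt helper mirrors what Lucene's IndexOutput.writeVInt does; the class and method names are invented for illustration and are not part of the commit:

  import java.io.ByteArrayOutputStream;

  public class ProxEncodingSketch {
    // 7 bits per byte, high bit set means "more bytes follow"
    static void writeVInt(ByteArrayOutputStream out, int i) {
      while ((i & ~0x7F) != 0) {
        out.write((i & 0x7F) | 0x80);
        i >>>= 7;
      }
      out.write(i);
    }

    // positions[j] ascending; payloads[j] may be null when a position has none
    static byte[] encode(int[] positions, byte[][] payloads) {
      ByteArrayOutputStream prox = new ByteArrayOutputStream();
      int lastPosition = 0, lastPayloadLength = -1;
      for (int j = 0; j < positions.length; j++) {
        int delta = positions[j] - lastPosition;
        int payloadLength = payloads[j] == null ? 0 : payloads[j].length;
        if (payloadLength == lastPayloadLength) {
          writeVInt(prox, delta * 2);          // even: length unchanged
        } else {
          writeVInt(prox, delta * 2 + 1);      // odd: new length follows
          writeVInt(prox, payloadLength);
          lastPayloadLength = payloadLength;
        }
        if (payloadLength > 0) {
          prox.write(payloads[j], 0, payloadLength);
        }
        lastPosition = positions[j];
      }
      return prox.toByteArray();
    }
  }

For positions {4, 9} with a 3-byte payload at each, this writes VInt(4*2+1) = 9, VInt(3) and the payload bytes, then VInt(5*2) = 10 and the payload bytes: the second entry omits the length because it is unchanged.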
@@ -397,18 +494,27 @@ final class Posting {                       // info about a Term in a doc
   Term term;                                  // the Term
   int freq;                                   // its frequency in doc
   int[] positions;                            // positions it occurs at
+  Payload[] payloads;                         // the payloads of the terms
   TermVectorOffsetInfo [] offsets;
 
-  Posting(Term t, int position, TermVectorOffsetInfo offset) {
+  Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) {
     term = t;
     freq = 1;
     positions = new int[1];
     positions[0] = position;
+
+    if (payload != null) {
+      payloads = new Payload[1];
+      payloads[0] = payload;
+    } else
+      payloads = null;
+
     if(offset != null){
       offsets = new TermVectorOffsetInfo[1];
       offsets[0] = offset;
-    }
-    else
+    } else
       offsets = null;
   }
 }
@@ -28,9 +28,12 @@ final class FieldInfo {
   boolean storePositionWithTermVector;
 
   boolean omitNorms; // omit norms associated with indexed fields
+
+  boolean storePayloads; // whether this field stores payloads together with term positions
 
   FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
-            boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
+            boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
+            boolean omitNorms, boolean storePayloads) {
     name = na;
     isIndexed = tk;
     number = nu;
@@ -38,5 +41,6 @@ final class FieldInfo {
     this.storeOffsetWithTermVector = storeOffsetWithTermVector;
     this.storePositionWithTermVector = storePositionWithTermVector;
     this.omitNorms = omitNorms;
+    this.storePayloads = storePayloads;
   }
 }
@@ -39,6 +39,7 @@ final class FieldInfos {
   static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4;
   static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8;
   static final byte OMIT_NORMS = 0x10;
+  static final byte STORE_PAYLOADS = 0x20;
 
   private ArrayList byNumber = new ArrayList();
   private HashMap byName = new HashMap();
@@ -156,9 +157,29 @@ final class FieldInfos {
    */
   public void add(String name, boolean isIndexed, boolean storeTermVector,
                   boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
+    add(name, isIndexed, storeTermVector, storePositionWithTermVector,
+        storeOffsetWithTermVector, omitNorms, false);
+  }
+
+  /** If the field is not yet known, adds it. If it is known, checks to make
+   *  sure that the isIndexed flag is the same as was given previously for this
+   *  field. If not - marks it as being indexed. Same goes for the TermVector
+   *  parameters.
+   *
+   * @param name The name of the field
+   * @param isIndexed true if the field is indexed
+   * @param storeTermVector true if the term vector should be stored
+   * @param storePositionWithTermVector true if the term vector with positions should be stored
+   * @param storeOffsetWithTermVector true if the term vector with offsets should be stored
+   * @param omitNorms true if the norms for the indexed field should be omitted
+   * @param storePayloads true if payloads should be stored for this field
+   */
+  public void add(String name, boolean isIndexed, boolean storeTermVector,
+                  boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
+                  boolean omitNorms, boolean storePayloads) {
     FieldInfo fi = fieldInfo(name);
     if (fi == null) {
-      addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms);
+      addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
     } else {
       if (fi.isIndexed != isIndexed) {
         fi.isIndexed = true;                      // once indexed, always index
@@ -175,6 +196,9 @@ final class FieldInfos {
       if (fi.omitNorms != omitNorms) {
         fi.omitNorms = false;                     // once norms are stored, always store
       }
+      if (fi.storePayloads != storePayloads) {
+        fi.storePayloads = true;
+      }
 
     }
   }
@@ -182,10 +206,10 @@ final class FieldInfos {
 
   private void addInternal(String name, boolean isIndexed,
                            boolean storeTermVector, boolean storePositionWithTermVector,
-                           boolean storeOffsetWithTermVector, boolean omitNorms) {
+                           boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads) {
     FieldInfo fi =
       new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
-                    storeOffsetWithTermVector, omitNorms);
+                    storeOffsetWithTermVector, omitNorms, storePayloads);
     byNumber.add(fi);
     byName.put(name, fi);
   }
@@ -271,6 +295,7 @@ final class FieldInfos {
       if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
       if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
       if (fi.omitNorms) bits |= OMIT_NORMS;
+      if (fi.storePayloads) bits |= STORE_PAYLOADS;
       output.writeString(fi.name);
       output.writeByte(bits);
     }
@@ -286,8 +311,9 @@ final class FieldInfos {
       boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
      boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
       boolean omitNorms = (bits & OMIT_NORMS) != 0;
+      boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
 
-      addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms);
+      addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
     }
   }
@@ -62,6 +62,14 @@ public class FilterIndexReader extends IndexReader {
     public int nextPosition() throws IOException {
       return ((TermPositions) this.in).nextPosition();
     }
+
+    public int getPayloadLength() {
+      return ((TermPositions) this.in).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions) this.in).getPayload(data, offset);
+    }
   }
 
   /** Base class for filtering {@link TermEnum} implementations. */
@@ -67,6 +67,8 @@ public abstract class IndexReader {
     public static final FieldOption ALL = new FieldOption ("ALL");
     // all indexed fields
     public static final FieldOption INDEXED = new FieldOption ("INDEXED");
+    // all fields that store payloads
+    public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS");
     // all fields which are not indexed
     public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED");
     // all fields which are indexed with termvectors enabled
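With the new FieldOption, callers can discover which fields carry payloads. A one-line usage sketch, assuming an open IndexReader named reader:

  // returns a java.util.Collection of field names that store payloads
  Collection payloadFields = reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS);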
@@ -455,5 +455,12 @@ class MultiTermPositions extends MultiTermDocs implements TermPositions {
   public int nextPosition() throws IOException {
     return ((TermPositions)current).nextPosition();
   }
+
+  public int getPayloadLength() {
+    return ((TermPositions)current).getPayloadLength();
+  }
+
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    return ((TermPositions)current).getPayload(data, offset);
+  }
 }
@@ -191,5 +191,23 @@ public class MultipleTermPositions implements TermPositions {
   public int read(int[] arg0, int[] arg1) throws IOException {
     throw new UnsupportedOperationException();
   }
+
+  /**
+   * Not implemented.
+   * @throws UnsupportedOperationException
+   */
+  public int getPayloadLength() {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Not implemented.
+   * @throws UnsupportedOperationException
+   */
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    throw new UnsupportedOperationException();
+  }
+
 }
@@ -419,7 +419,15 @@ public class ParallelReader extends IndexReader {
       return ((TermPositions)termDocs).nextPosition();
     }
 
+    public int getPayloadLength() {
+      return ((TermPositions)termDocs).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions)termDocs).getPayload(data, offset);
+    }
+
   }
 
 }
@@ -0,0 +1,114 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A Payload is metadata that can be stored together with each occurrence
+ * of a term. This metadata is stored inline in the posting list of the
+ * specific term.
+ * <p>
+ * To store payloads in the index a {@link TokenStream} has to be used that
+ * produces {@link Token}s containing payload data.
+ * <p>
+ * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
+ * to retrieve the payloads from the index.<br>
+ * <br>
+ *
+ * <b>
+ * Warning: The status of the Payloads feature is experimental. The APIs
+ * introduced here might change in the future and will not be supported anymore
+ * in such a case. If you want to use this feature in a production environment
+ * you should wait for an official release.
+ * </b>
+ */
+// TODO: Remove warning after API has been finalized
+public class Payload implements Serializable {
+  protected byte[] data;
+  protected int offset;
+  protected int length;
+
+  protected Payload() {
+    // no-arg constructor since this class implements Serializable
+  }
+
+  /**
+   * Creates a new payload with the given array as data.
+   *
+   * @param data the data of this payload
+   */
+  public Payload(byte[] data) {
+    this(data, 0, data.length);
+  }
+
+  /**
+   * Creates a new payload with the given array as data.
+   *
+   * @param data the data of this payload
+   * @param offset the offset in the data byte array
+   * @param length the length of the data
+   */
+  public Payload(byte[] data, int offset, int length) {
+    if (offset < 0 || offset + length > data.length) {
+      throw new IllegalArgumentException();
+    }
+    this.data = data;
+    this.offset = offset;
+    this.length = length;
+  }
+
+  public int length() {
+    return this.length;
+  }
+
+  /**
+   * Returns the byte at the given index.
+   */
+  public byte byteAt(int index) {
+    if (0 <= index && index < this.length) {
+      return this.data[this.offset + index];
+    }
+    throw new ArrayIndexOutOfBoundsException(index);
+  }
+
+  /**
+   * Allocates a new byte array, copies the payload data into it and returns it.
+   */
+  public byte[] toByteArray() {
+    byte[] retArray = new byte[this.length];
+    System.arraycopy(this.data, this.offset, retArray, 0, this.length);
+    return retArray;
+  }
+
+  /**
+   * Copies the payload data to a byte array.
+   *
+   * @param target the target byte array
+   * @param targetOffset the offset in the target byte array
+   */
+  public void copyTo(byte[] target, int targetOffset) {
+    if (this.length > target.length - targetOffset) {
+      throw new ArrayIndexOutOfBoundsException();
+    }
+    System.arraycopy(this.data, this.offset, target, targetOffset, this.length);
+  }
+}
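A short usage sketch for the new class, with invented values:

  byte[] raw = new byte[] { 10, 20, 30, 40 };
  Payload p = new Payload(raw, 1, 2);   // wraps the two bytes {20, 30}

  int len = p.length();                 // 2
  byte b = p.byteAt(0);                 // 20
  byte[] copy = p.toByteArray();        // new array {20, 30}

  byte[] target = new byte[8];
  p.copyTo(target, 4);                  // copies {20, 30} into target[4..5]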
@@ -157,11 +157,11 @@ final class SegmentMerger {
   }
 
   private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
-                          boolean storeOffsetWithTermVector) throws IOException {
+                          boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException {
     Iterator i = names.iterator();
     while (i.hasNext()) {
       String field = (String)i.next();
-      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field));
+      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads);
     }
   }
 
@@ -176,11 +176,12 @@ final class SegmentMerger {
     int docCount = 0;
     for (int i = 0; i < readers.size(); i++) {
       IndexReader reader = (IndexReader) readers.elementAt(i);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
-      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
+      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
       fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
     }
     fieldInfos.write(directory, segment + ".fnm");
@@ -326,6 +327,8 @@ final class SegmentMerger {
       termInfosWriter.add(smis[0].term, termInfo);
     }
   }
+
+  private byte[] payloadBuffer = null;
 
   /** Process postings from multiple segments all positioned on the
    *  same term. Writes out merged entries into freqOutput and
@@ -342,6 +345,8 @@ final class SegmentMerger {
     int lastDoc = 0;
     int df = 0;                                   // number of docs w/ term
     resetSkip();
+    boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
+    int lastPayloadLength = -1;                   // ensures that we write the first length
     for (int i = 0; i < n; i++) {
       SegmentMergeInfo smi = smis[i];
       TermPositions postings = smi.getPositions();
@@ -361,7 +366,7 @@ final class SegmentMerger {
         df++;
 
         if ((df % skipInterval) == 0) {
-          bufferSkip(lastDoc);
+          bufferSkip(lastDoc, storePayloads, lastPayloadLength);
         }
 
        int docCode = (doc - lastDoc) << 1;       // use low bit to flag freq=1
@@ -374,11 +379,33 @@ final class SegmentMerger {
           freqOutput.writeVInt(docCode);          // write doc
           freqOutput.writeVInt(freq);             // write frequency in doc
         }
 
+        /** See {@link DocumentWriter#writePostings(Posting[], String)} for
+         *  documentation about the encoding of positions and payloads
+         */
         int lastPosition = 0;                     // write position deltas
         for (int j = 0; j < freq; j++) {
           int position = postings.nextPosition();
-          proxOutput.writeVInt(position - lastPosition);
+          int delta = position - lastPosition;
+          if (storePayloads) {
+            int payloadLength = postings.getPayloadLength();
+            if (payloadLength == lastPayloadLength) {
+              proxOutput.writeVInt(delta * 2);
+            } else {
+              proxOutput.writeVInt(delta * 2 + 1);
+              proxOutput.writeVInt(payloadLength);
+              lastPayloadLength = payloadLength;
+            }
+            if (payloadLength > 0) {
+              if (payloadBuffer == null || payloadBuffer.length < payloadLength) {
+                payloadBuffer = new byte[payloadLength];
+              }
+              postings.getPayload(payloadBuffer, 0);
+              proxOutput.writeBytes(payloadBuffer, 0, payloadLength);
+            }
+          } else {
+            proxOutput.writeVInt(delta);
+          }
           lastPosition = position;
         }
       }
@@ -388,21 +415,59 @@ final class SegmentMerger {
 
   private RAMOutputStream skipBuffer = new RAMOutputStream();
   private int lastSkipDoc;
+  private int lastSkipPayloadLength;
   private long lastSkipFreqPointer;
   private long lastSkipProxPointer;
 
   private void resetSkip() {
     skipBuffer.reset();
     lastSkipDoc = 0;
+    lastSkipPayloadLength = -1;  // we don't have to write the first length in the skip list
     lastSkipFreqPointer = freqOutput.getFilePointer();
     lastSkipProxPointer = proxOutput.getFilePointer();
   }
 
-  private void bufferSkip(int doc) throws IOException {
+  private void bufferSkip(int doc, boolean storePayloads, int payloadLength) throws IOException {
    long freqPointer = freqOutput.getFilePointer();
     long proxPointer = proxOutput.getFilePointer();
 
-    skipBuffer.writeVInt(doc - lastSkipDoc);
+    // To efficiently store payloads in the posting lists we do not store the length of
+    // every payload. Instead we omit the length for a payload if the previous payload had
+    // the same length.
+    // However, in order to support skipping, the payload length at every skip point must be known.
+    // So we use the same length encoding that we use for the posting lists for the skip data as well:
+    // Case 1: current field does not store payloads
+    //   SkipDatum                 --> DocSkip, FreqSkip, ProxSkip
+    //   DocSkip,FreqSkip,ProxSkip --> VInt
+    //   DocSkip records the document number before every SkipInterval-th document in TermFreqs.
+    //   Document numbers are represented as differences from the previous value in the sequence.
+    // Case 2: current field stores payloads
+    //   SkipDatum                 --> DocSkip, PayloadLength?, FreqSkip, ProxSkip
+    //   DocSkip,FreqSkip,ProxSkip --> VInt
+    //   PayloadLength             --> VInt
+    //   In this case DocSkip/2 is the difference between
+    //   the current and the previous value. If DocSkip
+    //   is odd, then a PayloadLength encoded as VInt follows,
+    //   if DocSkip is even, then it is assumed that the
+    //   current payload length equals the length at the previous
+    //   skip point
+    if (storePayloads) {
+      int delta = doc - lastSkipDoc;
+      if (payloadLength == lastSkipPayloadLength) {
+        // the current payload length equals the length at the previous skip point,
+        // so we don't store the length again
+        skipBuffer.writeVInt(delta * 2);
+      } else {
+        // the payload length is different from the previous one. We shift the DocSkip,
+        // set the lowest bit and store the current payload length as VInt.
+        skipBuffer.writeVInt(delta * 2 + 1);
+        skipBuffer.writeVInt(payloadLength);
+        lastSkipPayloadLength = payloadLength;
+      }
+    } else {
+      // current field does not store payloads
+      skipBuffer.writeVInt(doc - lastSkipDoc);
+    }
     skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
     skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
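A concrete walk-through of the skip encoding above, with invented numbers: for a payload-storing field with skip points at docs 16 and 32 and a payload length of 4 at both points, the first SkipDatum writes VInt(16*2+1) = 33 followed by VInt(4); the second writes VInt((32-16)*2) = 32 and omits the length, since it matches the previous skip point. The FreqSkip and ProxSkip VInts follow unchanged in both cases.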
@@ -374,6 +374,9 @@ class SegmentReader extends IndexReader {
       else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
         fieldSet.add(fi.name);
       }
+      else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) {
+        fieldSet.add(fi.name);
+      }
       else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) {
         fieldSet.add(fi.name);
       }
@@ -582,7 +585,12 @@ class SegmentReader extends IndexReader {
 
     return termVectorsReader.get(docNumber);
   }
+
+  /** Returns the field infos of this segment */
+  FieldInfos fieldInfos() {
+    return fieldInfos;
+  }
 
   /**
    * Return the name of the segment this reader is reading.
   */
@@ -39,6 +39,9 @@ class SegmentTermDocs implements TermDocs {
   private long proxPointer;
   private long skipPointer;
   private boolean haveSkipped;
+
+  private int payloadLengthAtLastSkip;
+  protected boolean currentFieldStoresPayloads;
 
   protected SegmentTermDocs(SegmentReader parent) {
     this.parent = parent;
@@ -49,23 +52,31 @@ class SegmentTermDocs implements TermDocs {
 
   public void seek(Term term) throws IOException {
     TermInfo ti = parent.tis.get(term);
-    seek(ti);
+    seek(ti, term);
   }
 
   public void seek(TermEnum termEnum) throws IOException {
     TermInfo ti;
+    Term term;
+
     // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
-    if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos)          // optimized case
-      ti = ((SegmentTermEnum) termEnum).termInfo();
-    else                                          // punt case
-      ti = parent.tis.get(termEnum.term());
-
-    seek(ti);
+    if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) {        // optimized case
+      SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum);
+      term = segmentTermEnum.term();
+      ti = segmentTermEnum.termInfo();
+    } else {                                      // punt case
+      term = termEnum.term();
+      ti = parent.tis.get(term);
+    }
+
+    seek(ti, term);
   }
 
-  void seek(TermInfo ti) throws IOException {
+  void seek(TermInfo ti, Term term) throws IOException {
     count = 0;
+    payloadLengthAtLastSkip = 0;
+    FieldInfo fi = parent.fieldInfos.fieldInfo(term.field);
+    currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false;
     if (ti == null) {
       df = 0;
     } else {
@@ -141,7 +152,7 @@ class SegmentTermDocs implements TermDocs {
   }
 
   /** Overridden by SegmentTermPositions to skip in prox stream. */
-  protected void skipProx(long proxPointer) throws IOException {}
+  protected void skipProx(long proxPointer, int payloadLength) throws IOException {}
 
   /** Optimized implementation. */
   public boolean skipTo(int target) throws IOException {
@@ -157,6 +168,7 @@ class SegmentTermDocs implements TermDocs {
 
       // scan skip data
       int lastSkipDoc = skipDoc;
+      int lastPayloadLength = 0;
      long lastFreqPointer = freqStream.getFilePointer();
       long lastProxPointer = -1;
       int numSkipped = -1 - (count % skipInterval);
@@ -165,6 +177,7 @@ class SegmentTermDocs implements TermDocs {
         lastSkipDoc = skipDoc;
         lastFreqPointer = freqPointer;
         lastProxPointer = proxPointer;
+        lastPayloadLength = payloadLengthAtLastSkip;
 
         if (skipDoc != 0 && skipDoc >= doc)
           numSkipped += skipInterval;
@@ -172,7 +185,21 @@ class SegmentTermDocs implements TermDocs {
         if(skipCount >= numSkips)
           break;
 
-        skipDoc += skipStream.readVInt();
+        if (currentFieldStoresPayloads) {
+          // the current field stores payloads.
+          // if the doc delta is odd then we have
+          // to read the current payload length
+          // because it differs from the length of the
+          // previous payload
+          int delta = skipStream.readVInt();
+          if ((delta & 1) != 0) {
+            payloadLengthAtLastSkip = skipStream.readVInt();
+          }
+          delta >>>= 1;
+          skipDoc += delta;
+        } else {
+          skipDoc += skipStream.readVInt();
+        }
         freqPointer += skipStream.readVInt();
         proxPointer += skipStream.readVInt();
 
@@ -182,7 +209,7 @@ class SegmentTermDocs implements TermDocs {
       // if we found something to skip, then skip it
       if (lastFreqPointer > freqStream.getFilePointer()) {
        freqStream.seek(lastFreqPointer);
-        skipProx(lastProxPointer);
+        skipProx(lastProxPointer, lastPayloadLength);
 
         doc = lastSkipDoc;
         count += numSkipped;
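On the consuming side, the methods added throughout this commit combine as follows. A minimal retrieval sketch, assuming an open IndexReader named reader, code running inside a method that throws IOException, and an illustrative field/term:

  TermPositions tp = reader.termPositions(new Term("body", "lucene"));
  try {
    while (tp.next()) {
      for (int i = 0; i < tp.freq(); i++) {
        int position = tp.nextPosition();
        int length = tp.getPayloadLength();
        if (length > 0) {
          // copies the payload of the current position into a fresh array
          byte[] payload = tp.getPayload(new byte[length], 0);
          // ... interpret the payload bytes ...
        }
      }
    }
  } finally {
    tp.close();
  }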
@@ -27,6 +27,12 @@ extends SegmentTermDocs implements TermPositions {
   private int proxCount;
   private int position;

+  // the current payload length
+  private int payloadLength;
+  // indicates whether the payload of the currend position has
+  // been read from the proxStream yet
+  private boolean needToLoadPayload;
+
   // these variables are being used to remember information
   // for a lazy skip
   private long lazySkipPointer = 0;

@@ -37,13 +43,15 @@ extends SegmentTermDocs implements TermPositions {
     this.proxStream = null;  // the proxStream will be cloned lazily when nextPosition() is called for the first time
   }

-  final void seek(TermInfo ti) throws IOException {
-    super.seek(ti);
+  final void seek(TermInfo ti, Term term) throws IOException {
+    super.seek(ti, term);
     if (ti != null)
       lazySkipPointer = ti.proxPointer;

     lazySkipProxCount = 0;
     proxCount = 0;
+    payloadLength = 0;
+    needToLoadPayload = false;
   }

   public final void close() throws IOException {

@@ -55,9 +63,28 @@ extends SegmentTermDocs implements TermPositions {
     // perform lazy skips if neccessary
     lazySkip();
     proxCount--;
-    return position += proxStream.readVInt();
+    return position += readDeltaPosition();
   }

+  private final int readDeltaPosition() throws IOException {
+    int delta = proxStream.readVInt();
+    if (currentFieldStoresPayloads) {
+      // if the current field stores payloads then
+      // the position delta is shifted one bit to the left.
+      // if the LSB is set, then we have to read the current
+      // payload length
+      if ((delta & 1) != 0) {
+        payloadLength = proxStream.readVInt();
+      }
+      delta >>>= 1;
+      needToLoadPayload = true;
+    } else {
+      payloadLength = 0;
+      needToLoadPayload = false;
+    }
+    return delta;
+  }
+
   protected final void skippingDoc() throws IOException {
     // we remember to skip a document lazily
     lazySkipProxCount += freq;

@@ -82,16 +109,27 @@ extends SegmentTermDocs implements TermPositions {


   /** Called by super.skipTo(). */
-  protected void skipProx(long proxPointer) throws IOException {
+  protected void skipProx(long proxPointer, int payloadLength) throws IOException {
     // we save the pointer, we might have to skip there lazily
     lazySkipPointer = proxPointer;
     lazySkipProxCount = 0;
     proxCount = 0;
+    this.payloadLength = payloadLength;
+    needToLoadPayload = false;
   }

   private void skipPositions(int n) throws IOException {
-    for (int f = n; f > 0; f--)  // skip unread positions
-      proxStream.readVInt();
+    for (int f = n; f > 0; f--) {  // skip unread positions
+      readDeltaPosition();
+      skipPayload();
+    }
+  }
+
+  private void skipPayload() throws IOException {
+    if (needToLoadPayload && payloadLength > 0) {
+      proxStream.seek(proxStream.getFilePointer() + payloadLength);
+    }
+    needToLoadPayload = false;
   }

   // It is not always neccessary to move the prox pointer

@@ -109,6 +147,10 @@ extends SegmentTermDocs implements TermPositions {
       // clone lazily
       proxStream = (IndexInput)parent.proxStream.clone();
     }

+    // we might have to skip the current payload
+    // if it was not read yet
+    skipPayload();
+
     if (lazySkipPointer != 0) {
       proxStream.seek(lazySkipPointer);

@@ -120,5 +162,31 @@ extends SegmentTermDocs implements TermPositions {
       lazySkipProxCount = 0;
     }
   }

+  public int getPayloadLength() {
+    return payloadLength;
+  }
+
+  public byte[] getPayload(byte[] data, int offset) throws IOException {
+    if (!needToLoadPayload) {
+      throw new IOException("Payload cannot be loaded more than once for the same term position.");
+    }
+
+    // read payloads lazily
+    byte[] retArray;
+    int retOffset;
+    if (data == null || data.length - offset < payloadLength) {
+      // the array is too small to store the payload data,
+      // so we allocate a new one
+      retArray = new byte[payloadLength];
+      retOffset = 0;
+    } else {
+      retArray = data;
+      retOffset = offset;
+    }
+    proxStream.readBytes(retArray, retOffset, payloadLength);
+    needToLoadPayload = false;
+    return retArray;
+  }
+
 }
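To make the readDeltaPosition() convention concrete, here is a small self-contained walk-through, not part of the commit, that decodes a few VInt values for a payload-enabled field exactly as the method above does. The values in the array are assumed sample data.

    class DeltaPositionWalkthrough {
      public static void main(String[] args) {
        // VInts as they would appear in the .prx file for one document:
        // (delta << 1) | 1 means a payload length follows; delta << 1 reuses the last length.
        int[] vints = { (3 << 1) | 1, 4, (2 << 1), (5 << 1) | 1, 2 };
        int position = 0, payloadLength = 0;
        for (int i = 0; i < vints.length; ) {
          int delta = vints[i++];
          if ((delta & 1) != 0) {
            payloadLength = vints[i++]; // low bit set: new payload length stored
          }
          position += delta >>> 1;
          // prints position=3 payloadLength=4, position=5 payloadLength=4, position=10 payloadLength=2
          System.out.println("position=" + position + " payloadLength=" + payloadLength);
          // PayloadData (payloadLength bytes) would follow each position in the file.
        }
      }
    }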
@@ -32,10 +32,53 @@ public interface TermPositions
 extends TermDocs
 {
   /** Returns next position in the current document.  It is an error to call
     this more than {@link #freq()} times
     without calling {@link #next()}<p> This is
     invalid until {@link #next()} is called for
     the first time.
   */
   int nextPosition() throws IOException;

+  /**
+   * Returns the length of the payload at the current term position.
+   * This is invalid until {@link #nextPosition()} is called for
+   * the first time.<br>
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>
+   * @return length of the current payload in number of bytes
+   */
+  // TODO: Remove warning after API has been finalized
+  int getPayloadLength();
+
+  /**
+   * Returns the payload data at the current term position.
+   * This is invalid until {@link #nextPosition()} is called for
+   * the first time.
+   * This method must not be called more than once after each call
+   * of {@link #nextPosition()}. However, payloads are loaded lazily,
+   * so if the payload data for the current position is not needed,
+   * this method may not be called at all for performance reasons.<br>
+   * <br>
+   * <b>
+   * Warning: The status of the Payloads feature is experimental. The APIs
+   * introduced here might change in the future and will not be supported anymore
+   * in such a case. If you want to use this feature in a production environment
+   * you should wait for an official release.
+   * </b>
+   *
+   * @param data the array into which the data of this payload is to be
+   *             stored, if it is big enough; otherwise, a new byte[] array
+   *             is allocated for this purpose.
+   * @param offset the offset in the array into which the data of this payload
+   *               is to be stored.
+   * @return a byte[] array containing the data of this payload
+   * @throws IOException
+   */
+  // TODO: Remove warning after API has been finalized
+  byte[] getPayload(byte[] data, int offset) throws IOException;
 }
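A sketch of how a caller would consume the new methods, written against the interface as shown in this diff rather than taken from the commit. It assumes the field was indexed with payloads, and it respects the contract that getPayload() is called at most once per nextPosition().

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermPositions;

    class PayloadReadSketch {
      static void dumpPayloads(IndexReader reader, Term term) throws IOException {
        TermPositions tp = reader.termPositions(term);
        try {
          while (tp.next()) {                        // each matching document
            for (int i = 0; i < tp.freq(); i++) {    // each position in that document
              int pos = tp.nextPosition();
              byte[] payload = tp.getPayload(null, 0); // at most once per position
              System.out.println("doc=" + tp.doc() + " pos=" + pos
                  + " payloadBytes=" + tp.getPayloadLength());
            }
          }
        } finally {
          tp.close();
        }
      }
    }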
@@ -24,8 +24,8 @@ public abstract class BufferedIndexOutput extends IndexOutput {
   static final int BUFFER_SIZE = 1024;

   private final byte[] buffer = new byte[BUFFER_SIZE];
   private long bufferStart = 0;           // position in file of buffer
   private int bufferPosition = 0;         // position in buffer

   /** Writes a single byte.
    * @see IndexInput#readByte()

@@ -41,12 +41,12 @@ public abstract class BufferedIndexOutput extends IndexOutput {
    * @param length the number of bytes to write
    * @see IndexInput#readBytes(byte[],int,int)
    */
-  public void writeBytes(byte[] b, int length) throws IOException {
+  public void writeBytes(byte[] b, int offset, int length) throws IOException {
     int bytesLeft = BUFFER_SIZE - bufferPosition;
     // is there enough space in the buffer?
     if (bytesLeft >= length) {
       // we add the data to the end of the buffer
-      System.arraycopy(b, 0, buffer, bufferPosition, length);
+      System.arraycopy(b, offset, buffer, bufferPosition, length);
       bufferPosition += length;
       // if the buffer is full, flush it
       if (BUFFER_SIZE - bufferPosition == 0)

@@ -58,7 +58,7 @@ public abstract class BufferedIndexOutput extends IndexOutput {
       if (bufferPosition > 0)
         flush();
       // and write data at once
-      flushBuffer(b, length);
+      flushBuffer(b, offset, length);
       bufferStart += length;
     } else {
       // we fill/flush the buffer (until the input is written)

@@ -66,7 +66,7 @@ public abstract class BufferedIndexOutput extends IndexOutput {
       int pieceLength;
       while (pos < length) {
         pieceLength = (length - pos < bytesLeft) ? length - pos : bytesLeft;
-        System.arraycopy(b, pos, buffer, bufferPosition, pieceLength);
+        System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength);
         pos += pieceLength;
         bufferPosition += pieceLength;
         // if the buffer is full, flush it

@@ -92,8 +92,18 @@ public abstract class BufferedIndexOutput extends IndexOutput {
    * @param b the bytes to write
    * @param len the number of bytes to write
    */
-  protected abstract void flushBuffer(byte[] b, int len) throws IOException;
+  private void flushBuffer(byte[] b, int len) throws IOException {
+    flushBuffer(b, 0, len);
+  }
+
+  /** Expert: implements buffer write. Writes bytes at the current position in
+   * the output.
+   * @param b the bytes to write
+   * @param offset the offset in the byte array
+   * @param len the number of bytes to write
+   */
+  protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException;

   /** Closes this stream to further operations. */
   public void close() throws IOException {
     flush();
@@ -588,8 +588,8 @@ class FSIndexOutput extends BufferedIndexOutput {
     }

     /** output methods: */
-    public void flushBuffer(byte[] b, int size) throws IOException {
-      file.write(b, 0, size);
+    public void flushBuffer(byte[] b, int offset, int size) throws IOException {
+      file.write(b, offset, size);
     }
     public void close() throws IOException {
       // only close the file if it has not been closed yet
@@ -36,7 +36,17 @@ public abstract class IndexOutput {
    * @param length the number of bytes to write
    * @see IndexInput#readBytes(byte[],int,int)
    */
-  public abstract void writeBytes(byte[] b, int length) throws IOException;
+  public void writeBytes(byte[] b, int length) throws IOException {
+    writeBytes(b, 0, length);
+  }
+
+  /** Writes an array of bytes.
+   * @param b the bytes to write
+   * @param offset the offset in the byte array
+   * @param length the number of bytes to write
+   * @see IndexInput#readBytes(byte[],int,int)
+   */
+  public abstract void writeBytes(byte[] b, int offset, int length) throws IOException;

   /** Writes an int as four bytes.
    * @see IndexInput#readInt()
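The store changes above and below all follow one pattern: the old (array, length) entry points become concrete methods that delegate to a new (array, offset, length) variant, so FSIndexOutput, RAMOutputStream and MockRAMOutputStream only implement the offset-aware method. A hypothetical minimal subclass, written under the assumption that BufferedIndexOutput now leaves only flushBuffer(byte[], int, int) and length() abstract, might look like this; it is not part of the commit.

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import org.apache.lucene.store.BufferedIndexOutput;

    // Hypothetical sink, used only to illustrate the new abstract signature.
    class ByteArrayIndexOutput extends BufferedIndexOutput {
      private final ByteArrayOutputStream bytes = new ByteArrayOutputStream();

      // Only the offset-aware variant must be implemented;
      // the inherited flushBuffer(b, len) delegates here as flushBuffer(b, 0, len).
      protected void flushBuffer(byte[] b, int offset, int len) throws IOException {
        bytes.write(b, offset, len);
      }

      public long length() throws IOException {
        return bytes.size();
      }
    }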
@@ -66,7 +66,7 @@ public class RAMOutputStream extends BufferedIndexOutput {
     file.setLength(0);
   }

-  public void flushBuffer(byte[] src, int len) throws IOException {
+  public void flushBuffer(byte[] src, int offset, int len) throws IOException {
     byte[] buffer;
     int bufferPos = 0;
     while (bufferPos != len) {

@@ -81,7 +81,7 @@ public class RAMOutputStream extends BufferedIndexOutput {
       else
         buffer = (byte[]) file.buffers.get(bufferNumber);

-      System.arraycopy(src, bufferPos, buffer, bufferOffset, bytesToCopy);
+      System.arraycopy(src, offset + bufferPos, buffer, bufferOffset, bytesToCopy);
       bufferPos += bytesToCopy;
       pointer += bytesToCopy;
     }
@@ -1013,6 +1013,7 @@
       <li>If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.</li>
       <li>If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.</li>
       <li>If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.</li>
+      <li>If the sixth lowest-order bit is set (0x20), payloads are stored for the indexed field.</li>
     </ul>
     </p>

@@ -1298,9 +1299,9 @@
     <sup>DocFreq/SkipInterval</sup>
     </p>
     <p>SkipDatum -->
-    DocSkip,FreqSkip,ProxSkip
+    DocSkip,PayloadLength?,FreqSkip,ProxSkip
     </p>
-    <p>DocDelta,Freq,DocSkip,FreqSkip,ProxSkip -->
+    <p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip -->
     VInt
     </p>
     <p>TermFreqs

@@ -1328,9 +1329,17 @@
     SkipInterval
     <sup>th</sup>
     document in TermFreqs.
-    Document numbers are represented as differences
-    from the previous value in the sequence. FreqSkip
-    and ProxSkip record the position of every
+    If payloads are disabled for the term's field,
+    then DocSkip represents the difference from the
+    previous value in the sequence.
+    If payloads are enabled for the term's field,
+    then DocSkip/2 represents the difference from the
+    previous value in the sequence. If payloads are enabled
+    and DocSkip is odd,
+    then PayloadLength is stored indicating the length
+    of the last payload before the SkipInterval<sup>th</sup>
+    document in TermPositions.
+    FreqSkip and ProxSkip record the position of every
     SkipInterval
     <sup>th</sup>
     entry in FreqFile and

@@ -1379,12 +1388,21 @@
     <sup>DocFreq</sup>
     </p>
     <p>Positions -->
-    <PositionDelta>
+    <PositionDelta,Payload?>
     <sup>Freq</sup>
     </p>
+    <p>Payload -->
+    <PayloadLength?,PayloadData>
+    </p>
     <p>PositionDelta -->
     VInt
     </p>
+    <p>PayloadLength -->
+    VInt
+    </p>
+    <p>PayloadData -->
+    byte<sup>PayloadLength</sup>
+    </p>
     <p>TermPositions
     are ordered by term (the term is implicit, from the .tis file).
     </p>

@@ -1393,19 +1411,30 @@
     number is implicit from the .frq file).
     </p>
     <p>PositionDelta
-    is the difference between the position of the current occurrence in
+    is, if payloads are disabled for the term's field, the difference
+    between the position of the current occurrence in
     the document and the previous occurrence (or zero, if this is the
     first occurrence in this document).
+    If payloads are enabled for the term's field, then PositionDelta/2
+    is the difference between the current and the previous position. If
+    payloads are enabled and PositionDelta is odd, then PayloadLength is
+    stored, indicating the length of the payload at the current term position.
     </p>
     <p>
     For example, the TermPositions for a
     term which occurs as the fourth term in one document, and as the
     fifth and ninth term in a subsequent document, would be the following
-    sequence of VInts:
+    sequence of VInts (payloads disabled):
     </p>
     <p>4,
     5, 4
     </p>
+    <p>PayloadData
+    is metadata associated with the current term position. If PayloadLength
+    is stored at the current position, then it indicates the length of this
+    Payload. If PayloadLength is not stored, then this Payload has the same
+    length as the Payload at the previous position.
+    </p>
   </section>
   <section id="Normalization Factors"><title>Normalization Factors</title>
     <p>
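To tie the two cases in the documentation together, the following illustrative program, not part of the commit, prints the VInt stream for the example term above (fourth position in one document, fifth and ninth in the next), first with payloads disabled and then with assumed payload lengths of 2, 2 and 3.

    class PositionsEncodingExample {
      // Prints the VInt stream for one document's position list;
      // payloadLens == null means payloads are disabled for the field.
      static int print(int[] positions, int[] payloadLens, int lastLen) {
        int prev = 0;
        StringBuffer out = new StringBuffer();
        for (int i = 0; i < positions.length; i++) {
          int delta = positions[i] - prev;
          prev = positions[i];
          if (payloadLens == null) {
            out.append(delta).append(' ');             // plain PositionDelta
          } else if (payloadLens[i] != lastLen) {
            lastLen = payloadLens[i];
            out.append((delta << 1) | 1).append(' ')   // odd: PayloadLength follows
               .append(lastLen).append(' ');
          } else {
            out.append(delta << 1).append(' ');        // even: reuse previous length
          }
        }
        System.out.println(out);
        return lastLen;
      }

      public static void main(String[] args) {
        // payloads disabled: prints "4" then "5 4", the sequence from the text above
        print(new int[] {4}, null, 0);
        print(new int[] {5, 9}, null, 0);
        // payloads enabled with lengths 2, then 2 and 3: prints "9 2" then "10 9 3";
        // PayloadData bytes (2, 2 and 3 of them) follow each position in the .prx file
        int last = print(new int[] {4}, new int[] {2}, 0);
        print(new int[] {5, 9}, new int[] {2, 3}, last);
      }
    }

Note how the even VInt 10 reuses the previous payload length of 2, while the odd VInts 9 announce a stored PayloadLength.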
@@ -0,0 +1,443 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.RAMDirectory;
+
+
+public class TestPayloads extends TestCase {
+
+    // Simple tests to test the Payload class
+    public void testPayload() throws Exception {
+        byte[] testData = "This is a test!".getBytes();
+        Payload payload = new Payload(testData);
+        assertEquals("Wrong payload length.", testData.length, payload.length());
+
+        // test copyTo()
+        byte[] target = new byte[testData.length - 1];
+        try {
+            payload.copyTo(target, 0);
+            fail("Expected exception not thrown");
+        } catch (Exception expected) {
+            // expected exception
+        }
+
+        target = new byte[testData.length + 3];
+        payload.copyTo(target, 3);
+
+        for (int i = 0; i < testData.length; i++) {
+            assertEquals(testData[i], target[i + 3]);
+        }
+
+
+        // test toByteArray()
+        target = payload.toByteArray();
+        assertByteArrayEquals(testData, target);
+
+        // test byteAt()
+        for (int i = 0; i < testData.length; i++) {
+            assertEquals(payload.byteAt(i), testData[i]);
+        }
+
+        try {
+            payload.byteAt(testData.length + 1);
+            fail("Expected exception not thrown");
+        } catch (Exception expected) {
+            // expected exception
+        }
+    }
+
+    // Tests whether the DocumentWriter and SegmentMerger correctly enable the
+    // payload bit in the FieldInfo
+    public void testPayloadFieldBit() throws Exception {
+        Directory ram = new RAMDirectory();
+        PayloadAnalyzer analyzer = new PayloadAnalyzer();
+        IndexWriter writer = new IndexWriter(ram, analyzer, true);
+        Document d = new Document();
+        // this field won't have any payloads
+        d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
+        // this field will have payloads in all docs, however not for all term positions,
+        // so this field is used to check if the DocumentWriter correctly enables the payloads bit
+        // even if only some term positions have payloads
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
+        // enabled in only some documents
+        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
+        // only add payload data for field f2
+        analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
+        writer.addDocument(d);
+        // flush
+        writer.close();
+
+        // only one segment in the index, so we can cast to SegmentReader
+        SegmentReader reader = (SegmentReader) IndexReader.open(ram);
+        FieldInfos fi = reader.fieldInfos();
+        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
+        reader.close();
+
+        // now we add another document which has payloads for field f3 and verify if the SegmentMerger
+        // enabled payloads for that field
+        writer = new IndexWriter(ram, analyzer, true);
+        d = new Document();
+        d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+        d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
+        // add payload data for field f2 and f3
+        analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
+        analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
+        writer.addDocument(d);
+        // force merge
+        writer.optimize();
+        // flush
+        writer.close();
+
+        // only one segment in the index, so we can cast to SegmentReader
+        reader = (SegmentReader) IndexReader.open(ram);
+        fi = reader.fieldInfos();
+        assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+        assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+        assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
+        reader.close();
+    }
+
+    // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory
+    public void testPayloadsEncoding() throws Exception {
+        // first perform the test using a RAMDirectory
+        Directory dir = new RAMDirectory();
+        performTest(dir);
+
+        // now use a FSDirectory and repeat same test
+        String dirName = "test_payloads";
+        dir = FSDirectory.getDirectory(dirName);
+        performTest(dir);
+        rmDir(dirName);
+    }
+
+    // builds an index with payloads in the given Directory and performs
+    // different tests to verify the payload encoding
+    private void performTest(Directory dir) throws Exception {
+        PayloadAnalyzer analyzer = new PayloadAnalyzer();
+        IndexWriter writer = new IndexWriter(dir, analyzer, true);
+
+        // should be in sync with value in TermInfosWriter
+        final int skipInterval = 16;
+
+        final int numTerms = 5;
+        final String fieldName = "f1";
+
+        int numDocs = skipInterval + 1;
+        // create content for the test documents with just a few terms
+        Term[] terms = generateTerms(fieldName, numTerms);
+        StringBuffer sb = new StringBuffer();
+        for (int i = 0; i < terms.length; i++) {
+            sb.append(terms[i].text);
+            sb.append(" ");
+        }
+        String content = sb.toString();
+
+
+        int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
+        byte[] payloadData = generateRandomData(payloadDataLength);
+
+        Document d = new Document();
+        d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED));
+        // add the same document multiple times to have the same payload lengths for all
+        // occurrences within two consecutive skip intervals
+        int offset = 0;
+        for (int i = 0; i < 2 * numDocs; i++) {
+            analyzer.setPayloadData(fieldName, payloadData, offset, 1);
+            offset += numTerms;
+            writer.addDocument(d);
+        }
+
+        // now we make sure to have different payload lengths next at the next skip point
+        for (int i = 0; i < numDocs; i++) {
+            analyzer.setPayloadData(fieldName, payloadData, offset, i);
+            offset += i * numTerms;
+            writer.addDocument(d);
+        }
+
+        writer.optimize();
+        // flush
+        writer.close();
+
+
+        /*
+         * Verify the index
+         * first we test if all payloads are stored correctly
+         */
+        IndexReader reader = IndexReader.open(dir);
+
+        byte[] verifyPayloadData = new byte[payloadDataLength];
+        offset = 0;
+        TermPositions[] tps = new TermPositions[numTerms];
+        for (int i = 0; i < numTerms; i++) {
+            tps[i] = reader.termPositions(terms[i]);
+        }
+
+        while (tps[0].next()) {
+            for (int i = 1; i < numTerms; i++) {
+                tps[i].next();
+            }
+            int freq = tps[0].freq();
+
+            for (int i = 0; i < freq; i++) {
+                for (int j = 0; j < numTerms; j++) {
+                    tps[j].nextPosition();
+                    tps[j].getPayload(verifyPayloadData, offset);
+                    offset += tps[j].getPayloadLength();
+                }
+            }
+        }
+
+        for (int i = 0; i < numTerms; i++) {
+            tps[i].close();
+        }
+
+        assertByteArrayEquals(payloadData, verifyPayloadData);
+
+        /*
+         * test lazy skipping
+         */
+        TermPositions tp = reader.termPositions(terms[0]);
+        tp.next();
+        tp.nextPosition();
+        // now we don't read this payload
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        byte[] payload = tp.getPayload(null, 0);
+        assertEquals(payload[0], payloadData[numTerms]);
+        tp.nextPosition();
+
+        // we don't read this payload and skip to a different document
+        tp.skipTo(5);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        payload = tp.getPayload(null, 0);
+        assertEquals(payload[0], payloadData[5 * numTerms]);
+
+
+        /*
+         * Test different lengths at skip points
+         */
+        tp.seek(terms[1]);
+        tp.next();
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        tp.skipTo(skipInterval - 1);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        tp.skipTo(2 * skipInterval - 1);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+        tp.skipTo(3 * skipInterval - 1);
+        tp.nextPosition();
+        assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
+
+        /*
+         * Test multiple call of getPayload()
+         */
+        tp.getPayload(null, 0);
+        try {
+            // it is forbidden to call getPayload() more than once
+            // without calling nextPosition()
+            tp.getPayload(null, 0);
+            fail("Expected exception not thrown");
+        } catch (Exception expected) {
+            // expected exception
+        }
+
+        reader.close();
+
+        // test long payload
+        analyzer = new PayloadAnalyzer();
+        writer = new IndexWriter(dir, analyzer, true);
+        String singleTerm = "lucene";
+
+        d = new Document();
+        d.add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.TOKENIZED));
+        // add a payload whose length is greater than the buffer size of BufferedIndexOutput
+        payloadData = generateRandomData(2000);
+        analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
+        writer.addDocument(d);
+
+
+        writer.optimize();
+        // flush
+        writer.close();
+
+        reader = IndexReader.open(dir);
+        tp = reader.termPositions(new Term(fieldName, singleTerm));
+        tp.next();
+        tp.nextPosition();
+
+        verifyPayloadData = new byte[tp.getPayloadLength()];
+        tp.getPayload(verifyPayloadData, 0);
+        byte[] portion = new byte[1500];
+        System.arraycopy(payloadData, 100, portion, 0, 1500);
+
+        assertByteArrayEquals(portion, verifyPayloadData);
+        reader.close();
+
+    }
+
+    private byte[] generateRandomData(int n) {
+        Random rnd = new Random();
+        byte[] data = new byte[n];
+        rnd.nextBytes(data);
+        return data;
+    }
+
+    private Term[] generateTerms(String fieldName, int n) {
+        int maxDigits = (int) (Math.log(n) / Math.log(10));
+        Term[] terms = new Term[n];
+        StringBuffer sb = new StringBuffer();
+        for (int i = 0; i < n; i++) {
+            sb.setLength(0);
+            sb.append("t");
+            int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
+            for (int j = 0; j < zeros; j++) {
+                sb.append("0");
+            }
+            sb.append(i);
+            terms[i] = new Term(fieldName, sb.toString());
+        }
+        return terms;
+    }
+
+
+    private void rmDir(String dir) {
+        File fileDir = new File(dir);
+        if (fileDir.exists()) {
+            File[] files = fileDir.listFiles();
+            if (files != null) {
+                for (int i = 0; i < files.length; i++) {
+                    files[i].delete();
+                }
+            }
+            fileDir.delete();
+        }
+    }
+
+
+
+    void assertByteArrayEquals(byte[] b1, byte[] b2) {
+        if (b1.length != b2.length) {
+            fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
+        }
+
+        for (int i = 0; i < b1.length; i++) {
+            if (b1[i] != b2[i]) {
+                fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
+            }
+        }
+    }
+
+
+    /**
+     * This Analyzer uses an WhitespaceTokenizer and PayloadFilter.
+     */
+    private static class PayloadAnalyzer extends Analyzer {
+        Map fieldToData = new HashMap();
+
+        void setPayloadData(String field, byte[] data, int offset, int length) {
+            fieldToData.put(field, new PayloadData(0, data, offset, length));
+        }
+
+        void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
+            fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
+        }
+
+        public TokenStream tokenStream(String fieldName, Reader reader) {
+            PayloadData payload = (PayloadData) fieldToData.get(fieldName);
+            TokenStream ts = new WhitespaceTokenizer(reader);
+            if (payload != null) {
+                if (payload.numFieldInstancesToSkip == 0) {
+                    ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
+                } else {
+                    payload.numFieldInstancesToSkip--;
+                }
+            }
+            return ts;
+        }
+
+        private static class PayloadData {
+            byte[] data;
+            int offset;
+            int length;
+            int numFieldInstancesToSkip;
+
+            PayloadData(int skip, byte[] data, int offset, int length) {
+                numFieldInstancesToSkip = skip;
+                this.data = data;
+                this.offset = offset;
+                this.length = length;
+            }
+        }
+    }
+
+
+    /**
+     * This Filter adds payloads to the tokens.
+     */
+    private static class PayloadFilter extends TokenFilter {
+        private byte[] data;
+        private int length;
+        private int offset;
+
+        public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
+            super(in);
+            this.data = data;
+            this.length = length;
+            this.offset = offset;
+        }
+
+        public Token next() throws IOException {
+            Token nextToken = input.next();
+            if (nextToken != null && offset + length <= data.length) {
+                nextToken.setPayload(new Payload(data, offset, length));
+                offset += length;
+            }
+
+            return nextToken;
+        }
+    }
+}
@@ -48,7 +48,7 @@ public class MockRAMOutputStream extends RAMOutputStream {
     }
   }

-  public void flushBuffer(byte[] src, int len) throws IOException {
+  public void flushBuffer(byte[] src, int offset, int len) throws IOException {
     long freeSpace = dir.maxSize - dir.sizeInBytes();
     long realUsage = 0;

@@ -63,14 +63,14 @@ public class MockRAMOutputStream extends RAMOutputStream {
     if (dir.maxSize != 0 && freeSpace <= len) {
       if (freeSpace > 0 && freeSpace < len) {
         realUsage += freeSpace;
-        super.flushBuffer(src, (int) freeSpace);
+        super.flushBuffer(src, offset, (int) freeSpace);
       }
       if (realUsage > dir.maxUsedSize) {
         dir.maxUsedSize = realUsage;
       }
       throw new IOException("fake disk full at " + dir.getRecomputedActualSizeInBytes() + " bytes");
     } else {
-      super.flushBuffer(src, len);
+      super.flushBuffer(src, offset, len);
     }

     if (first) {