LUCENE-755: Added the ability to store arbitrary binary metadata (payloads) in the posting list.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@518486 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Busch 2007-03-15 05:15:43 +00:00
parent 9da8211775
commit eb20c06a62
23 changed files with 1169 additions and 116 deletions

View File

@ -82,6 +82,13 @@ New features
2. LUCENE-822: Added FieldSelector capabilities to Searchable for use with RemoteSearcher, and other Searchable implementations. (Mark Miller, Grant Ingersoll)
3. LUCENE-755: Added the ability to store arbitrary binary metadata in the posting list.
This metadata is called a Payload. For every position of a Token, one Payload in the form
of a variable-length byte array can be stored in the prox file.
Remark: The APIs introduced with this feature are experimental and therefore
carry appropriate warnings in the javadocs.
(Michael Busch)
Optimizations
1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions
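As an illustration of the new analysis-side API, here is a minimal sketch of a TokenFilter that attaches a one-byte payload to every token via the new Token.setPayload() method (PayloadFilter and the constant byte are made-up names for this example, not part of the commit):

import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Payload;

public class PayloadFilter extends TokenFilter {
  public PayloadFilter(TokenStream input) {
    super(input);
  }
  public Token next() throws IOException {
    Token token = input.next();
    if (token != null) {
      // attach one byte of metadata to this token's position
      token.setPayload(new Payload(new byte[] { 42 }));
    }
    return token;
  }
}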

View File

@ -1,5 +1,8 @@
package org.apache.lucene.analysis;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -20,23 +23,40 @@ package org.apache.lucene.analysis;
/** A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
<p>
The start and end offsets permit applications to re-associate a token with
its source text, e.g., to display highlighted query terms in a document
browser, or to show matching text fragments in a KWIC (KeyWord In Context)
display, etc.
<p>
The type is an interned string, assigned by a lexical analyzer
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
belongs to. For example an end of sentence marker token might be implemented
with type "eos". The default token type is "word".
<p>
A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
length byte array. Use {@link TermPositions#getPayloadLength()} and
{@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
<br><br>
<b>
Warning: The status of the Payloads feature is experimental. The APIs
introduced here might change in the future and will not be supported anymore
in such a case. If you want to use this feature in a production environment
you should wait for an official release.
</b>
@see org.apache.lucene.index.Payload
*/
// TODO: Remove warning after API has been finalized
public class Token implements Cloneable {
String termText; // the text of the term
int startOffset; // start in source text
int endOffset; // end in source text
String type = "word"; // lexical type
Payload payload;
private int positionIncrement = 1;
/** Constructs a Token with the given term text, and start & end offsets.
@ -115,6 +135,36 @@ public class Token implements Cloneable {
/** Returns this Token's lexical type. Defaults to "word". */
public final String type() { return type; }
/**
* Sets this Token's payload.<br>
* <br>
* <b>
* Warning: The status of the Payloads feature is experimental. The APIs
* introduced here might change in the future and will not be supported anymore
* in such a case. If you want to use this feature in a production environment
* you should wait for an official release.
* </b>
*/
// TODO: Remove warning after API has been finalized
public void setPayload(Payload payload) {
this.payload = payload;
}
/**
* Returns this Token's payload.<br>
* <br>
* <b>
* Warning: The status of the Payloads feature is experimental. The APIs
* introduced here might change in the future and will not be supported anymore
* in such a case. If you want to use this feature in a production environment
* you should wait for an official release.
* </b>
*/
// TODO: Remove warning after API has been finalized
public Payload getPayload() {
return this.payload;
}
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append("(" + termText + "," + startOffset + "," + endOffset);

View File

@ -31,6 +31,7 @@ import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
@ -69,9 +70,30 @@ final class DocumentWriter {
final void addDocument(String segment, Document doc)
throws CorruptIndexException, IOException {
// create field infos
fieldInfos = new FieldInfos();
fieldInfos.add(doc);
// invert doc into postingTable
postingTable.clear(); // clear postingTable
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets
fieldStoresPayloads = new BitSet(fieldInfos.size());
fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
Arrays.fill(fieldBoosts, doc.getBoost());
// Before we write the FieldInfos we invert the Document. The reason is that
// during inversion the TokenStreams of tokenized fields are being processed
// and we might encounter tokens that have payloads associated with them. In
// this case we have to update the FieldInfo of the particular field.
invertDocument(doc);
// sort postingTable into an array
Posting[] postings = sortPostingTable();
// write field infos
fieldInfos.write(directory, segment + ".fnm");
// write field values
@ -82,21 +104,7 @@ final class DocumentWriter {
} finally {
fieldsWriter.close();
}
/*
for (int i = 0; i < postings.length; i++) {
Posting posting = postings[i];
@ -125,6 +133,10 @@ final class DocumentWriter {
private int[] fieldPositions;
private int[] fieldOffsets;
private float[] fieldBoosts;
// If any of the tokens of a particular field carry a payload
// then we enable payloads for that field.
private BitSet fieldStoresPayloads;
// Tokenizes the fields of a document into Postings.
private final void invertDocument(Document doc)
@ -144,9 +156,9 @@ final class DocumentWriter {
if (!field.isTokenized()) { // un-tokenized field
String stringValue = field.stringValue();
if(field.isStoreOffsetWithTermVector())
addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
else
addPosition(fieldName, stringValue, position++, null, null);
offset += stringValue.length();
length++;
} else
@ -167,10 +179,19 @@ final class DocumentWriter {
for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1);
Payload payload = t.getPayload();
if (payload != null) {
// enable payloads for this field
fieldStoresPayloads.set(fieldNumber);
}
TermVectorOffsetInfo termVectorOffsetInfo;
if (field.isStoreOffsetWithTermVector()) {
termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
} else {
termVectorOffsetInfo = null;
}
addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);
lastToken = t;
if (++length >= maxFieldLength) {
@ -194,11 +215,16 @@ final class DocumentWriter {
fieldOffsets[fieldNumber] = offset;
}
}
// update fieldInfos for all fields that have one or more tokens with payloads
for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) {
fieldInfos.fieldInfo(i).storePayloads = true;
}
}
private final Term termBuffer = new Term("", ""); // avoid consing
private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) {
termBuffer.set(field, text);
//System.out.println("Offset: " + offset);
Posting ti = (Posting) postingTable.get(termBuffer);
@ -209,9 +235,25 @@ final class DocumentWriter {
int[] positions = ti.positions;
System.arraycopy(positions, 0, newPositions, 0, freq);
ti.positions = newPositions;
if (ti.payloads != null) {
// the current field stores payloads
Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array
Payload[] payloads = ti.payloads;
System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
ti.payloads = newPayloads;
}
}
ti.positions[freq] = position; // add new position
if (payload != null) {
if (ti.payloads == null) {
// lazily allocate payload array
ti.payloads = new Payload[ti.positions.length];
}
ti.payloads[freq] = payload;
}
if (offset != null) {
if (ti.offsets.length == freq){
TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
@ -224,7 +266,7 @@ final class DocumentWriter {
ti.freq = freq + 1; // update frequency
} else { // word not seen before
Term term = new Term(field, text, false);
postingTable.put(term, new Posting(term, position, payload, offset));
}
}
@ -307,10 +349,31 @@ final class DocumentWriter {
termIndexInterval);
TermInfo ti = new TermInfo();
String currentField = null;
boolean currentFieldHasPayloads = false;
for (int i = 0; i < postings.length; i++) {
Posting posting = postings[i];
// check to see if we switched to a new field
String termField = posting.term.field();
if (currentField != termField) {
// changing field - see if there is something to save
currentField = termField;
FieldInfo fi = fieldInfos.fieldInfo(currentField);
currentFieldHasPayloads = fi.storePayloads;
if (fi.storeTermVector) {
if (termVectorWriter == null) {
termVectorWriter =
new TermVectorsWriter(directory, segment, fieldInfos);
termVectorWriter.openDocument();
}
termVectorWriter.openField(currentField);
} else if (termVectorWriter != null) {
termVectorWriter.closeField();
}
}
// add an entry to the dictionary with pointers to prox and freq files
ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
tis.add(posting.term, ti);
@ -326,28 +389,62 @@ final class DocumentWriter {
int lastPosition = 0; // write positions
int[] positions = posting.positions;
Payload[] payloads = posting.payloads;
int lastPayloadLength = -1;
// The following encoding is being used for positions and payloads:
// Case 1: current field does not store payloads
// Positions -> <PositionDelta>^freq
// PositionDelta -> VInt
// The PositionDelta is the difference between the current
// and the previous position
// Case 2: current field stores payloads
// Positions -> <PositionDelta, Payload>^freq
// Payload -> <PayloadLength?, PayloadData>
// PositionDelta -> VInt
// PayloadLength -> VInt
// PayloadData -> byte^PayloadLength
// In this case PositionDelta/2 is the difference between
// the current and the previous position. If PositionDelta
// is odd, then a PayloadLength encoded as VInt follows,
// if PositionDelta is even, then it is assumed that the
// length of the current Payload equals the length of the
// previous Payload.
for (int j = 0; j < postingFreq; j++) { // use delta-encoding
int position = positions[j];
int delta = position - lastPosition;
if (currentFieldHasPayloads) {
int payloadLength = 0;
Payload payload = null;
if (payloads != null) {
payload = payloads[j];
if (payload != null) {
payloadLength = payload.length;
}
}
if (payloadLength == lastPayloadLength) {
// the length of the current payload equals the length
// of the previous one. So we do not have to store the length
// again and we only shift the position delta by one bit
prox.writeVInt(delta * 2);
} else {
// the length of the current payload is different from the
// previous one. We shift the position delta, set the lowest
// bit and store the current payload length as VInt.
prox.writeVInt(delta * 2 + 1);
prox.writeVInt(payloadLength);
lastPayloadLength = payloadLength;
}
if (payloadLength > 0) {
// write current payload
prox.writeBytes(payload.data, payload.offset, payload.length);
}
} else {
// field does not store payloads, just write position delta as VInt
prox.writeVInt(delta);
}
lastPosition = position;
}
if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
@ -397,18 +494,27 @@ final class Posting { // info about a Term in a doc
Term term; // the Term
int freq; // its frequency in doc
int[] positions; // positions it occurs at
Payload[] payloads; // the payloads of the terms
TermVectorOffsetInfo [] offsets;
Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) {
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
if (payload != null) {
payloads = new Payload[1];
payloads[0] = payload;
} else
payloads = null;
if(offset != null){
offsets = new TermVectorOffsetInfo[1];
offsets[0] = offset;
} else
offsets = null;
}
}
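To make the position/payload encoding above concrete, here is a standalone sketch (not code from this commit) that mirrors the writer loop; out stands in for the sequence of VInts written to the prox file:

import java.util.List;

static void encodePositions(int[] positions, int[] payloadLengths, List out) {
  int lastPosition = 0;
  int lastPayloadLength = -1;
  for (int j = 0; j < positions.length; j++) {
    int delta = positions[j] - lastPosition;
    if (payloadLengths[j] == lastPayloadLength) {
      out.add(new Integer(delta * 2));      // even: payload length unchanged
    } else {
      out.add(new Integer(delta * 2 + 1));  // odd: a payload length follows
      out.add(new Integer(payloadLengths[j]));
      lastPayloadLength = payloadLengths[j];
    }
    lastPosition = positions[j];            // the payload bytes would follow here
  }
}

For positions {5, 9, 17} with payload lengths {3, 3, 1} this yields the VInts 11, 3, 8, 17, 1.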

View File

@ -28,9 +28,12 @@ final class FieldInfo {
boolean storePositionWithTermVector;
boolean omitNorms; // omit norms associated with indexed fields
boolean storePayloads; // whether this field stores payloads together with term positions
FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
boolean omitNorms, boolean storePayloads) {
name = na;
isIndexed = tk;
number = nu;
@ -38,5 +41,6 @@ final class FieldInfo {
this.storeOffsetWithTermVector = storeOffsetWithTermVector;
this.storePositionWithTermVector = storePositionWithTermVector;
this.omitNorms = omitNorms;
this.storePayloads = storePayloads;
}
}

View File

@ -39,6 +39,7 @@ final class FieldInfos {
static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4;
static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8;
static final byte OMIT_NORMS = 0x10;
static final byte STORE_PAYLOADS = 0x20;
private ArrayList byNumber = new ArrayList();
private HashMap byName = new HashMap();
@ -156,9 +157,29 @@ final class FieldInfos {
*/
public void add(String name, boolean isIndexed, boolean storeTermVector,
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
add(name, isIndexed, storeTermVector, storePositionWithTermVector,
storeOffsetWithTermVector, omitNorms, false);
}
/** If the field is not yet known, adds it. If it is known, checks to make
* sure that the isIndexed flag is the same as was given previously for this
* field. If not - marks it as being indexed. Same goes for the TermVector
* parameters.
*
* @param name The name of the field
* @param isIndexed true if the field is indexed
* @param storeTermVector true if the term vector should be stored
* @param storePositionWithTermVector true if the term vector with positions should be stored
* @param storeOffsetWithTermVector true if the term vector with offsets should be stored
* @param omitNorms true if the norms for the indexed field should be omitted
* @param storePayloads true if payloads should be stored for this field
*/
public void add(String name, boolean isIndexed, boolean storeTermVector,
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
boolean omitNorms, boolean storePayloads) {
FieldInfo fi = fieldInfo(name);
if (fi == null) {
addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
} else {
if (fi.isIndexed != isIndexed) {
fi.isIndexed = true; // once indexed, always index
@ -175,6 +196,9 @@ final class FieldInfos {
if (fi.omitNorms != omitNorms) {
fi.omitNorms = false; // once norms are stored, always store
}
if (fi.storePayloads != storePayloads) {
fi.storePayloads = true;
}
}
}
@ -182,10 +206,10 @@ final class FieldInfos {
private void addInternal(String name, boolean isIndexed,
boolean storeTermVector, boolean storePositionWithTermVector,
boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads) {
FieldInfo fi =
new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
storeOffsetWithTermVector, omitNorms, storePayloads);
byNumber.add(fi);
byName.put(name, fi);
}
@ -271,6 +295,7 @@ final class FieldInfos {
if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
if (fi.omitNorms) bits |= OMIT_NORMS;
if (fi.storePayloads) bits |= STORE_PAYLOADS;
output.writeString(fi.name);
output.writeByte(bits);
}
@ -286,8 +311,9 @@ final class FieldInfos {
boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
boolean omitNorms = (bits & OMIT_NORMS) != 0;
boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms);
addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
}
}
} }
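As a quick illustration, the flag byte written above for a field that is indexed and stores payloads combines the class's existing IS_INDEXED bit (0x1) with the new STORE_PAYLOADS bit:

byte bits = 0x0;
bits |= IS_INDEXED;     // 0x1
bits |= STORE_PAYLOADS; // 0x20
// bits == 0x21, written to the .fnm file after the field name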

View File

@ -62,6 +62,14 @@ public class FilterIndexReader extends IndexReader {
public int nextPosition() throws IOException {
return ((TermPositions) this.in).nextPosition();
}
public int getPayloadLength() {
return ((TermPositions) this.in).getPayloadLength();
}
public byte[] getPayload(byte[] data, int offset) throws IOException {
return ((TermPositions) this.in).getPayload(data, offset);
}
}
/** Base class for filtering {@link TermEnum} implementations. */ /** Base class for filtering {@link TermEnum} implementations. */

View File

@ -67,6 +67,8 @@ public abstract class IndexReader {
public static final FieldOption ALL = new FieldOption ("ALL");
// all indexed fields
public static final FieldOption INDEXED = new FieldOption ("INDEXED");
// all fields that store payloads
public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS");
// all fields which are not indexed
public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED");
// all fields which are indexed with termvectors enabled

View File

@ -455,5 +455,12 @@ class MultiTermPositions extends MultiTermDocs implements TermPositions {
public int nextPosition() throws IOException {
return ((TermPositions)current).nextPosition();
}
public int getPayloadLength() {
return ((TermPositions)current).getPayloadLength();
}
public byte[] getPayload(byte[] data, int offset) throws IOException {
return ((TermPositions)current).getPayload(data, offset);
}
}

View File

@ -191,5 +191,23 @@ public class MultipleTermPositions implements TermPositions {
public int read(int[] arg0, int[] arg1) throws IOException {
throw new UnsupportedOperationException();
}
/**
* Not implemented.
* @throws UnsupportedOperationException
*/
public int getPayloadLength() {
throw new UnsupportedOperationException();
}
/**
* Not implemented.
* @throws UnsupportedOperationException
*/
public byte[] getPayload(byte[] data, int offset) throws IOException {
throw new UnsupportedOperationException();
}
}

View File

@ -419,7 +419,15 @@ public class ParallelReader extends IndexReader {
return ((TermPositions)termDocs).nextPosition();
}
public int getPayloadLength() {
return ((TermPositions)termDocs).getPayloadLength();
}
public byte[] getPayload(byte[] data, int offset) throws IOException {
return ((TermPositions)termDocs).getPayload(data, offset);
}
}
}

View File

@ -0,0 +1,114 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
* A Payload is metadata that can be stored together with each occurrence
* of a term. This metadata is stored inline in the posting list of the
* specific term.
* <p>
* To store payloads in the index a {@link TokenStream} has to be used that
* produces {@link Token}s containing payload data.
* <p>
* Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
* to retrieve the payloads from the index.<br>
* <br>
*
* <b>
* Warning: The status of the Payloads feature is experimental. The APIs
* introduced here might change in the future and will not be supported anymore
* in such a case. If you want to use this feature in a production environment
* you should wait for an official release.
* </b>
*/
// TODO: Remove warning after API has been finalized
public class Payload implements Serializable {
protected byte[] data;
protected int offset;
protected int length;
protected Payload() {
// no-arg constructor since this class implements Serializable
}
/**
* Creates a new payload with the given array as data.
*
* @param data the data of this payload
*/
public Payload(byte[] data) {
this(data, 0, data.length);
}
/**
* Creates a new payload with the given array as data.
*
* @param data the data of this payload
* @param offset the offset in the data byte array
* @param length the length of the data
*/
public Payload(byte[] data, int offset, int length) {
if (offset < 0 || offset + length > data.length) {
throw new IllegalArgumentException();
}
this.data = data;
this.offset = offset;
this.length = length;
}
public int length() {
return this.length;
}
/**
* Returns the byte at the given index.
*/
public byte byteAt(int index) {
if (0 <= index && index < this.length) {
return this.data[this.offset + index];
}
throw new ArrayIndexOutOfBoundsException(index);
}
/**
* Allocates a new byte array, copies the payload data into it and returns it.
*/
public byte[] toByteArray() {
byte[] retArray = new byte[this.length];
System.arraycopy(this.data, this.offset, retArray, 0, this.length);
return retArray;
}
/**
* Copies the payload data to a byte array.
*
* @param target the target byte array
* @param targetOffset the offset in the target byte array
*/
public void copyTo(byte[] target, int targetOffset) {
if (this.length > target.length - targetOffset) {
throw new ArrayIndexOutOfBoundsException();
}
System.arraycopy(this.data, this.offset, target, targetOffset, this.length);
}
}
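A brief usage sketch of the class above (example values, not part of the patch); a Payload can wrap a slice of an existing buffer without copying it:

byte[] buffer = new byte[] { 10, 20, 30, 40, 50, 60 };
Payload payload = new Payload(buffer, 2, 3); // refers to bytes 30, 40, 50
byte b = payload.byteAt(1);                  // 40
byte[] copy = payload.toByteArray();         // a new array { 30, 40, 50 }
byte[] target = new byte[5];
payload.copyTo(target, 2);                   // copies into target[2..4]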

View File

@ -157,11 +157,11 @@ final class SegmentMerger {
}
private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException {
Iterator i = names.iterator();
while (i.hasNext()) {
String field = (String)i.next();
fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads);
}
}
@ -176,11 +176,12 @@ final class SegmentMerger {
int docCount = 0;
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
}
fieldInfos.write(directory, segment + ".fnm");
@ -326,6 +327,8 @@ final class SegmentMerger {
termInfosWriter.add(smis[0].term, termInfo);
}
}
private byte[] payloadBuffer = null;
/** Process postings from multiple segments all positioned on the
* same term. Writes out merged entries into freqOutput and
@ -342,6 +345,8 @@ final class SegmentMerger {
int lastDoc = 0;
int df = 0; // number of docs w/ term
resetSkip();
boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
int lastPayloadLength = -1; // ensures that we write the first length
for (int i = 0; i < n; i++) {
SegmentMergeInfo smi = smis[i];
TermPositions postings = smi.getPositions();
@ -361,7 +366,7 @@ final class SegmentMerger {
df++;
if ((df % skipInterval) == 0) {
bufferSkip(lastDoc, storePayloads, lastPayloadLength);
}
int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
@ -374,11 +379,33 @@ final class SegmentMerger {
freqOutput.writeVInt(docCode); // write doc
freqOutput.writeVInt(freq); // write frequency in doc
}
/** See {@link DocumentWriter#writePostings(Posting[], String)} for
* documentation about the encoding of positions and payloads
*/
int lastPosition = 0; // write position deltas
for (int j = 0; j < freq; j++) {
int position = postings.nextPosition();
int delta = position - lastPosition;
if (storePayloads) {
int payloadLength = postings.getPayloadLength();
if (payloadLength == lastPayloadLength) {
proxOutput.writeVInt(delta * 2);
} else {
proxOutput.writeVInt(delta * 2 + 1);
proxOutput.writeVInt(payloadLength);
lastPayloadLength = payloadLength;
}
if (payloadLength > 0) {
if (payloadBuffer == null || payloadBuffer.length < payloadLength) {
payloadBuffer = new byte[payloadLength];
}
postings.getPayload(payloadBuffer, 0);
proxOutput.writeBytes(payloadBuffer, 0, payloadLength);
}
} else {
proxOutput.writeVInt(delta);
}
lastPosition = position;
}
}
@ -388,21 +415,59 @@ final class SegmentMerger {
private RAMOutputStream skipBuffer = new RAMOutputStream();
private int lastSkipDoc;
private int lastSkipPayloadLength;
private long lastSkipFreqPointer;
private long lastSkipProxPointer;
private void resetSkip() {
skipBuffer.reset();
lastSkipDoc = 0;
lastSkipPayloadLength = -1; // we don't have to write the first length in the skip list
lastSkipFreqPointer = freqOutput.getFilePointer();
lastSkipProxPointer = proxOutput.getFilePointer();
}
private void bufferSkip(int doc, boolean storePayloads, int payloadLength) throws IOException {
long freqPointer = freqOutput.getFilePointer();
long proxPointer = proxOutput.getFilePointer();
// To efficiently store payloads in the posting lists we do not store the length of
// every payload. Instead we omit the length for a payload if the previous payload had
// the same length.
// However, in order to support skipping the payload length at every skip point must be known.
// So we use the same length encoding that we use for the posting lists for the skip data as well:
// Case 1: current field does not store payloads
// SkipDatum --> DocSkip, FreqSkip, ProxSkip
// DocSkip,FreqSkip,ProxSkip --> VInt
// DocSkip records the document number before every SkipInterval-th document in TermFreqs.
// Document numbers are represented as differences from the previous value in the sequence.
// Case 2: current field stores payloads
// SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip
// DocSkip,FreqSkip,ProxSkip --> VInt
// PayloadLength --> VInt
// In this case DocSkip/2 is the difference between
// the current and the previous value. If DocSkip
// is odd, then a PayloadLength encoded as VInt follows,
// if DocSkip is even, then it is assumed that the
// current payload length equals the length at the previous
// skip point
if (storePayloads) {
int delta = doc - lastSkipDoc;
if (payloadLength == lastSkipPayloadLength) {
// the current payload length equals the length at the previous skip point,
// so we don't store the length again
skipBuffer.writeVInt(delta * 2);
} else {
// the payload length is different from the previous one. We shift the DocSkip,
// set the lowest bit and store the current payload length as VInt.
skipBuffer.writeVInt(delta * 2 + 1);
skipBuffer.writeVInt(payloadLength);
lastSkipPayloadLength = payloadLength;
}
} else {
// current field does not store payloads
skipBuffer.writeVInt(doc - lastSkipDoc);
}
skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
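As a worked example of this skip encoding: if the previous skip point was at doc 40, the current one is at doc 45, and the payload length changed to 7 at this point, the entry begins with DocSkip = 5*2+1 = 11 followed by PayloadLength = 7; had the length been unchanged, it would begin with DocSkip = 5*2 = 10 and no PayloadLength.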

View File

@ -374,6 +374,9 @@ class SegmentReader extends IndexReader {
else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
fieldSet.add(fi.name);
}
else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) {
fieldSet.add(fi.name);
}
else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) {
fieldSet.add(fi.name);
}
@ -582,7 +585,12 @@ class SegmentReader extends IndexReader {
return termVectorsReader.get(docNumber);
}
/** Returns the field infos of this segment */
FieldInfos fieldInfos() {
return fieldInfos;
}
/**
* Return the name of the segment this reader is reading.
*/

View File

@ -39,6 +39,9 @@ class SegmentTermDocs implements TermDocs {
private long proxPointer;
private long skipPointer;
private boolean haveSkipped;
private int payloadLengthAtLastSkip;
protected boolean currentFieldStoresPayloads;
protected SegmentTermDocs(SegmentReader parent) {
this.parent = parent;
@ -49,23 +52,31 @@ class SegmentTermDocs implements TermDocs {
public void seek(Term term) throws IOException {
TermInfo ti = parent.tis.get(term);
seek(ti, term);
}
public void seek(TermEnum termEnum) throws IOException {
TermInfo ti;
Term term;
// use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) { // optimized case
SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum);
term = segmentTermEnum.term();
ti = segmentTermEnum.termInfo();
} else { // punt case
term = termEnum.term();
ti = parent.tis.get(term);
}
seek(ti, term);
}
void seek(TermInfo ti, Term term) throws IOException {
count = 0;
payloadLengthAtLastSkip = 0;
FieldInfo fi = parent.fieldInfos.fieldInfo(term.field);
currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false;
if (ti == null) {
df = 0;
} else {
@ -141,7 +152,7 @@ class SegmentTermDocs implements TermDocs {
}
/** Overridden by SegmentTermPositions to skip in prox stream. */
protected void skipProx(long proxPointer, int payloadLength) throws IOException {}
/** Optimized implementation. */
public boolean skipTo(int target) throws IOException {
@ -157,6 +168,7 @@ class SegmentTermDocs implements TermDocs {
// scan skip data
int lastSkipDoc = skipDoc;
int lastPayloadLength = 0;
long lastFreqPointer = freqStream.getFilePointer();
long lastProxPointer = -1;
int numSkipped = -1 - (count % skipInterval);
@ -165,6 +177,7 @@ class SegmentTermDocs implements TermDocs {
lastSkipDoc = skipDoc;
lastFreqPointer = freqPointer;
lastProxPointer = proxPointer;
lastPayloadLength = payloadLengthAtLastSkip;
if (skipDoc != 0 && skipDoc >= doc)
numSkipped += skipInterval;
@ -172,7 +185,21 @@ class SegmentTermDocs implements TermDocs {
if(skipCount >= numSkips)
break;
if (currentFieldStoresPayloads) {
// the current field stores payloads.
// if the doc delta is odd then we have
// to read the current payload length
// because it differs from the length of the
// previous payload
int delta = skipStream.readVInt();
if ((delta & 1) != 0) {
payloadLengthAtLastSkip = skipStream.readVInt();
}
delta >>>= 1;
skipDoc += delta;
} else {
skipDoc += skipStream.readVInt();
}
freqPointer += skipStream.readVInt();
proxPointer += skipStream.readVInt();
@ -182,7 +209,7 @@ class SegmentTermDocs implements TermDocs {
// if we found something to skip, then skip it
if (lastFreqPointer > freqStream.getFilePointer()) {
freqStream.seek(lastFreqPointer);
skipProx(lastProxPointer, lastPayloadLength);
doc = lastSkipDoc;
count += numSkipped;

View File

@ -27,6 +27,12 @@ extends SegmentTermDocs implements TermPositions {
private int proxCount;
private int position;
// the current payload length
private int payloadLength;
// indicates whether the payload of the current position has
// been read from the proxStream yet
private boolean needToLoadPayload;
// these variables are being used to remember information
// for a lazy skip
private long lazySkipPointer = 0;
@ -37,13 +43,15 @@ extends SegmentTermDocs implements TermPositions {
this.proxStream = null; // the proxStream will be cloned lazily when nextPosition() is called for the first time
}
final void seek(TermInfo ti, Term term) throws IOException {
super.seek(ti, term);
if (ti != null)
lazySkipPointer = ti.proxPointer;
lazySkipProxCount = 0;
proxCount = 0;
payloadLength = 0;
needToLoadPayload = false;
}
public final void close() throws IOException {
@ -55,9 +63,28 @@ extends SegmentTermDocs implements TermPositions {
// perform lazy skips if necessary
lazySkip();
proxCount--;
return position += readDeltaPosition();
}
private final int readDeltaPosition() throws IOException {
int delta = proxStream.readVInt();
if (currentFieldStoresPayloads) {
// if the current field stores payloads then
// the position delta is shifted one bit to the left.
// if the LSB is set, then we have to read the current
// payload length
if ((delta & 1) != 0) {
payloadLength = proxStream.readVInt();
}
delta >>>= 1;
needToLoadPayload = true;
} else {
payloadLength = 0;
needToLoadPayload = false;
}
return delta;
}
protected final void skippingDoc() throws IOException {
// we remember to skip a document lazily
lazySkipProxCount += freq;
@ -82,16 +109,27 @@ extends SegmentTermDocs implements TermPositions {
/** Called by super.skipTo(). */
protected void skipProx(long proxPointer, int payloadLength) throws IOException {
// we save the pointer, we might have to skip there lazily
lazySkipPointer = proxPointer;
lazySkipProxCount = 0;
proxCount = 0;
this.payloadLength = payloadLength;
needToLoadPayload = false;
}
private void skipPositions(int n) throws IOException {
for (int f = n; f > 0; f--) { // skip unread positions
readDeltaPosition();
skipPayload();
}
}
private void skipPayload() throws IOException {
if (needToLoadPayload && payloadLength > 0) {
proxStream.seek(proxStream.getFilePointer() + payloadLength);
}
needToLoadPayload = false;
}
// It is not always necessary to move the prox pointer
@ -109,6 +147,10 @@ extends SegmentTermDocs implements TermPositions {
// clone lazily
proxStream = (IndexInput)parent.proxStream.clone();
}
// we might have to skip the current payload
// if it was not read yet
skipPayload();
if (lazySkipPointer != 0) {
proxStream.seek(lazySkipPointer);
@ -120,5 +162,31 @@ extends SegmentTermDocs implements TermPositions {
lazySkipProxCount = 0;
}
}
public int getPayloadLength() {
return payloadLength;
}
public byte[] getPayload(byte[] data, int offset) throws IOException {
if (!needToLoadPayload) {
throw new IOException("Payload cannot be loaded more than once for the same term position.");
}
// read payloads lazily
byte[] retArray;
int retOffset;
if (data == null || data.length - offset < payloadLength) {
// the array is too small to store the payload data,
// so we allocate a new one
retArray = new byte[payloadLength];
retOffset = 0;
} else {
retArray = data;
retOffset = offset;
}
proxStream.readBytes(retArray, retOffset, payloadLength);
needToLoadPayload = false;
return retArray;
}
}
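The reading side mirrors the writer: a standalone sketch of the decoding step in readDeltaPosition() above, with in, payloadLength and position standing in for the surrounding state:

int code = in.readVInt();        // position delta, shifted left by one bit
if ((code & 1) != 0) {
  payloadLength = in.readVInt(); // odd: the payload length changed here
}
int delta = code >>> 1;
position += delta;               // the payload bytes follow, loaded lazily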

View File

@ -32,10 +32,53 @@ public interface TermPositions
extends TermDocs
{
/** Returns next position in the current document. It is an error to call
this more than {@link #freq()} times
without calling {@link #next()}<p> This is
invalid until {@link #next()} is called for
the first time.
*/
int nextPosition() throws IOException;
/**
* Returns the length of the payload at the current term position.
* This is invalid until {@link #nextPosition()} is called for
* the first time.<br>
* <br>
* <b>
* Warning: The status of the Payloads feature is experimental. The APIs
* introduced here might change in the future and will not be supported anymore
* in such a case. If you want to use this feature in a production environment
* you should wait for an official release.
* </b>
* @return length of the current payload in number of bytes
*/
// TODO: Remove warning after API has been finalized
int getPayloadLength();
/**
* Returns the payload data at the current term position.
* This is invalid until {@link #nextPosition()} is called for
* the first time.
* This method must not be called more than once after each call
* of {@link #nextPosition()}. However, payloads are loaded lazily,
* so if the payload data for the current position is not needed,
* this method may not be called at all for performance reasons.<br>
* <br>
* <b>
* Warning: The status of the Payloads feature is experimental. The APIs
* introduced here might change in the future and will not be supported anymore
* in such a case. If you want to use this feature in a production environment
* you should wait for an official release.
* </b>
*
* @param data the array into which the data of this payload is to be
* stored, if it is big enough; otherwise, a new byte[] array
* is allocated for this purpose.
* @param offset the offset in the array into which the data of this payload
* is to be stored.
* @return a byte[] array containing the data of this payload
* @throws IOException
*/
// TODO: Remove warning after API has been finalized
byte[] getPayload(byte[] data, int offset) throws IOException;
}
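Putting both methods together, a sketch of retrieving payloads at search time (the reader, field and term are placeholders):

TermPositions tp = reader.termPositions(new Term("field", "term"));
while (tp.next()) {
  for (int i = 0; i < tp.freq(); i++) {
    tp.nextPosition();
    int length = tp.getPayloadLength();
    // a null or too-small array makes getPayload() allocate a new one
    byte[] data = tp.getPayload(new byte[length], 0);
  }
}
tp.close();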

View File

@ -24,8 +24,8 @@ public abstract class BufferedIndexOutput extends IndexOutput {
static final int BUFFER_SIZE = 1024;
private final byte[] buffer = new byte[BUFFER_SIZE];
private long bufferStart = 0; // position in file of buffer
private int bufferPosition = 0; // position in buffer
/** Writes a single byte.
* @see IndexInput#readByte()
@ -41,12 +41,12 @@ public abstract class BufferedIndexOutput extends IndexOutput {
* @param offset the offset in the byte array
* @param length the number of bytes to write
* @see IndexInput#readBytes(byte[],int,int)
*/
public void writeBytes(byte[] b, int offset, int length) throws IOException {
int bytesLeft = BUFFER_SIZE - bufferPosition;
// is there enough space in the buffer?
if (bytesLeft >= length) {
// we add the data to the end of the buffer
System.arraycopy(b, offset, buffer, bufferPosition, length);
bufferPosition += length;
// if the buffer is full, flush it
if (BUFFER_SIZE - bufferPosition == 0)
@ -58,7 +58,7 @@ public abstract class BufferedIndexOutput extends IndexOutput {
if (bufferPosition > 0)
flush();
// and write data at once
flushBuffer(b, offset, length);
bufferStart += length;
} else {
// we fill/flush the buffer (until the input is written)
@ -66,7 +66,7 @@ public abstract class BufferedIndexOutput extends IndexOutput {
int pieceLength;
while (pos < length) {
pieceLength = (length - pos < bytesLeft) ? length - pos : bytesLeft;
System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength);
pos += pieceLength;
bufferPosition += pieceLength;
// if the buffer is full, flush it
@ -92,8 +92,18 @@ public abstract class BufferedIndexOutput extends IndexOutput {
* @param b the bytes to write
* @param len the number of bytes to write
*/
private void flushBuffer(byte[] b, int len) throws IOException {
flushBuffer(b, 0, len);
}
/** Expert: implements buffer write. Writes bytes at the current position in
* the output.
* @param b the bytes to write
* @param offset the offset in the byte array
* @param len the number of bytes to write
*/
protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException;
/** Closes this stream to further operations. */
public void close() throws IOException {
flush();
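A short sketch of the new offset-taking overload (the directory setup is illustrative only):

Directory dir = new RAMDirectory();
IndexOutput out = dir.createOutput("example");
byte[] buffer = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 };
out.writeBytes(buffer, 3, 4); // writes buffer[3..6] without copying a slice
out.close();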

View File

@ -588,8 +588,8 @@ class FSIndexOutput extends BufferedIndexOutput {
}
/** output methods: */
public void flushBuffer(byte[] b, int offset, int size) throws IOException {
file.write(b, offset, size);
}
public void close() throws IOException {
// only close the file if it has not been closed yet

View File

@ -36,7 +36,17 @@ public abstract class IndexOutput {
* @param length the number of bytes to write
* @see IndexInput#readBytes(byte[],int,int)
*/
public void writeBytes(byte[] b, int length) throws IOException {
writeBytes(b, 0, length);
}
/** Writes an array of bytes.
* @param b the bytes to write
* @param offset the offset in the byte array
* @param length the number of bytes to write
* @see IndexInput#readBytes(byte[],int,int)
*/
public abstract void writeBytes(byte[] b, int offset, int length) throws IOException;
/** Writes an int as four bytes. /** Writes an int as four bytes.
* @see IndexInput#readInt() * @see IndexInput#readInt()


@@ -66,7 +66,7 @@ public class RAMOutputStream extends BufferedIndexOutput {
     file.setLength(0);
   }
 
-  public void flushBuffer(byte[] src, int len) throws IOException {
+  public void flushBuffer(byte[] src, int offset, int len) throws IOException {
     byte[] buffer;
     int bufferPos = 0;
     while (bufferPos != len) {
@@ -81,7 +81,7 @@ public class RAMOutputStream extends BufferedIndexOutput {
       else
         buffer = (byte[]) file.buffers.get(bufferNumber);
-      System.arraycopy(src, bufferPos, buffer, bufferOffset, bytesToCopy);
+      System.arraycopy(src, offset + bufferPos, buffer, bufferOffset, bytesToCopy);
       bufferPos += bytesToCopy;
       pointer += bytesToCopy;
     }


@@ -1013,6 +1013,7 @@
 <li>If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.</li>
 <li>If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.</li>
 <li>If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.</li>
+<li>If the sixth lowest-order bit is set (0x20), payloads are stored for the indexed field.</li>
 </ul>
 </p>
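A hedged sketch of how a reader might decode the per-field flag byte described above (variable names illustrative; the lower two bits, which this excerpt does not list, are ignored here):

    byte bits = in.readByte();                              // per-field flags
    boolean positionsWithTermVector = (bits & 0x04) != 0;
    boolean offsetsWithTermVector   = (bits & 0x08) != 0;
    boolean omitNorms               = (bits & 0x10) != 0;
    boolean storePayloads           = (bits & 0x20) != 0;   // bit added by this commit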
@@ -1298,9 +1299,9 @@
 <sup>DocFreq/SkipInterval</sup>
 </p>
 <p>SkipDatum --&gt;
-    DocSkip,FreqSkip,ProxSkip
+    DocSkip,PayloadLength?,FreqSkip,ProxSkip
 </p>
-<p>DocDelta,Freq,DocSkip,FreqSkip,ProxSkip --&gt;
+<p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip --&gt;
     VInt
 </p>
 <p>TermFreqs
@@ -1328,9 +1329,17 @@
     SkipInterval
 <sup>th</sup>
     document in TermFreqs.
-    Document numbers are represented as differences
-    from the previous value in the sequence. FreqSkip
-    and ProxSkip record the position of every
+    If payloads are disabled for the term's field,
+    then DocSkip represents the difference from the
+    previous value in the sequence.
+    If payloads are enabled for the term's field,
+    then DocSkip/2 represents the difference from the
+    previous value in the sequence. If payloads are enabled
+    and DocSkip is odd,
+    then PayloadLength is stored, indicating the length
+    of the last payload before the SkipInterval<sup>th</sup>
+    document in TermPositions.
+    FreqSkip and ProxSkip record the position of every
     SkipInterval
 <sup>th</sup>
     entry in FreqFile and
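To make the DocSkip rule concrete: a skip of 16 documents with an unchanged payload length would be written as the single VInt 32, while the same skip with the payload length changed to 7 would be written as 33 followed by 7. A minimal encoding sketch, assuming the writer records a new PayloadLength exactly when the length changed (an illustrative helper, not the actual Lucene writer):

    // illustrative only: the VInts making up one skip entry's DocSkip part
    int[] encodeDocSkip(int docDelta, boolean payloadsEnabled,
                        boolean lengthChanged, int payloadLength) {
      if (!payloadsEnabled)
        return new int[] { docDelta };                        // plain delta
      if (lengthChanged)
        return new int[] { docDelta * 2 + 1, payloadLength }; // odd: length follows
      return new int[] { docDelta * 2 };                      // even: length unchanged
    }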
@@ -1379,12 +1388,21 @@
 <sup>DocFreq</sup>
 </p>
 <p>Positions --&gt;
-    &lt;PositionDelta&gt;
+    &lt;PositionDelta,Payload?&gt;
 <sup>Freq</sup>
 </p>
+<p>Payload --&gt;
+    &lt;PayloadLength?,PayloadData&gt;
+</p>
 <p>PositionDelta --&gt;
     VInt
 </p>
+<p>PayloadLength --&gt;
+    VInt
+</p>
+<p>PayloadData --&gt;
+    byte<sup>PayloadLength</sup>
+</p>
 <p>TermPositions
     are ordered by term (the term is implicit, from the .tis file).
 </p>
@ -1393,19 +1411,30 @@
number is implicit from the .frq file). number is implicit from the .frq file).
</p> </p>
<p>PositionDelta <p>PositionDelta
is the difference between the position of the current occurrence in is, if payloads are disabled for the term's field, the difference
between the position of the current occurrence in
the document and the previous occurrence (or zero, if this is the the document and the previous occurrence (or zero, if this is the
first occurrence in this document). first occurrence in this document).
If payloads are enabled for the term's field, then PositionDelta/2
is the difference between the current and the previous position. If
payloads are enabled and PositionDelta is odd, then PayloadLength is
stored, indicating the length of the payload at the current term position.
</p> </p>
<p> <p>
For example, the TermPositions for a For example, the TermPositions for a
term which occurs as the fourth term in one document, and as the term which occurs as the fourth term in one document, and as the
fifth and ninth term in a subsequent document, would be the following fifth and ninth term in a subsequent document, would be the following
sequence of VInts: sequence of VInts (payloads disabled):
</p> </p>
<p>4, <p>4,
5, 4 5, 4
</p> </p>
<p>PayloadData
is metadata associated with the current term position. If PayloadLength
is stored at the current position, then it indicates the length of this
Payload. If PayloadLength is not stored, then this Payload has the same
length as the Payload at the previous position.
</p>
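Extending the example above to the payloads-enabled case: assume every payload is 3 bytes long and that the writer stores PayloadLength whenever it differs from the previous position's length (an assumption of this sketch). The first position is then written as 4*2+1 = 9, followed by PayloadLength 3 and the 3 payload bytes; the next two positions (deltas 5 and 4, length unchanged) become 10 and 8, each followed only by their 3 payload bytes. A writer-side sketch of the same rule, not the actual DocumentWriter code:

    // illustrative only: write one position entry under the rules above
    int writePosition(IndexOutput out, int delta, byte[] payload,
                      int lastPayloadLength) throws IOException {
      if (payload.length != lastPayloadLength) {
        out.writeVInt(delta * 2 + 1);               // odd: PayloadLength follows
        out.writeVInt(payload.length);
      } else {
        out.writeVInt(delta * 2);                   // even: same length as before
      }
      out.writeBytes(payload, 0, payload.length);   // PayloadData
      return payload.length;                        // carry as lastPayloadLength
    }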
 </section>
 <section id="Normalization Factors"><title>Normalization Factors</title>
 <p>


@@ -0,0 +1,443 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
public class TestPayloads extends TestCase {
// Simple tests to test the Payload class
public void testPayload() throws Exception {
byte[] testData = "This is a test!".getBytes();
Payload payload = new Payload(testData);
assertEquals("Wrong payload length.", testData.length, payload.length());
// test copyTo()
byte[] target = new byte[testData.length - 1];
try {
payload.copyTo(target, 0);
fail("Expected exception not thrown");
} catch (Exception expected) {
// expected exception
}
target = new byte[testData.length + 3];
payload.copyTo(target, 3);
for (int i = 0; i < testData.length; i++) {
assertEquals(testData[i], target[i + 3]);
}
// test toByteArray()
target = payload.toByteArray();
assertByteArrayEquals(testData, target);
// test byteAt()
for (int i = 0; i < testData.length; i++) {
assertEquals(payload.byteAt(i), testData[i]);
}
try {
payload.byteAt(testData.length + 1);
fail("Expected exception not thrown");
} catch (Exception expected) {
// expected exception
}
}
// Tests whether the DocumentWriter and SegmentMerger correctly enable the
// payload bit in the FieldInfo
public void testPayloadFieldBit() throws Exception {
Directory ram = new RAMDirectory();
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(ram, analyzer, true);
Document d = new Document();
// this field won't have any payloads
d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
// this field will have payloads in all docs, however not for all term positions,
// so this field is used to check if the DocumentWriter correctly enables the payloads bit
// even if only some term positions have payloads
d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
// this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
// enabled in only some documents
d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
// only add payload data for field f2
analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
writer.addDocument(d);
// flush
writer.close();
// only one segment in the index, so we can cast to SegmentReader
SegmentReader reader = (SegmentReader) IndexReader.open(ram);
FieldInfos fi = reader.fieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
reader.close();
// now we add another document which has payloads for field f3 and verify if the SegmentMerger
// enabled payloads for that field
writer = new IndexWriter(ram, analyzer, true);
d = new Document();
d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
// add payload data for field f2 and f3
analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
writer.addDocument(d);
// force merge
writer.optimize();
// flush
writer.close();
// only one segment in the index, so we can cast to SegmentReader
reader = (SegmentReader) IndexReader.open(ram);
fi = reader.fieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
reader.close();
}
// Tests whether payloads are correctly stored and loaded using both RAMDirectory and FSDirectory
public void testPayloadsEncoding() throws Exception {
// first perform the test using a RAMDirectory
Directory dir = new RAMDirectory();
performTest(dir);
// now use a FSDirectory and repeat same test
String dirName = "test_payloads";
dir = FSDirectory.getDirectory(dirName);
performTest(dir);
rmDir(dirName);
}
// builds an index with payloads in the given Directory and performs
// different tests to verify the payload encoding
private void performTest(Directory dir) throws Exception {
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(dir, analyzer, true);
// should be in sync with value in TermInfosWriter
final int skipInterval = 16;
final int numTerms = 5;
final String fieldName = "f1";
int numDocs = skipInterval + 1;
// create content for the test documents with just a few terms
Term[] terms = generateTerms(fieldName, numTerms);
StringBuffer sb = new StringBuffer();
for (int i = 0; i < terms.length; i++) {
sb.append(terms[i].text);
sb.append(" ");
}
String content = sb.toString();
int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
byte[] payloadData = generateRandomData(payloadDataLength);
Document d = new Document();
d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED));
// add the same document multiple times to have the same payload lengths for all
// occurrences within two consecutive skip intervals
int offset = 0;
for (int i = 0; i < 2 * numDocs; i++) {
analyzer.setPayloadData(fieldName, payloadData, offset, 1);
offset += numTerms;
writer.addDocument(d);
}
// now we make sure to have different payload lengths at the next skip point
for (int i = 0; i < numDocs; i++) {
analyzer.setPayloadData(fieldName, payloadData, offset, i);
offset += i * numTerms;
writer.addDocument(d);
}
writer.optimize();
// flush
writer.close();
/*
* Verify the index
* first we test if all payloads are stored correctly
*/
IndexReader reader = IndexReader.open(dir);
byte[] verifyPayloadData = new byte[payloadDataLength];
offset = 0;
TermPositions[] tps = new TermPositions[numTerms];
for (int i = 0; i < numTerms; i++) {
tps[i] = reader.termPositions(terms[i]);
}
while (tps[0].next()) {
for (int i = 1; i < numTerms; i++) {
tps[i].next();
}
int freq = tps[0].freq();
for (int i = 0; i < freq; i++) {
for (int j = 0; j < numTerms; j++) {
tps[j].nextPosition();
tps[j].getPayload(verifyPayloadData, offset);
offset += tps[j].getPayloadLength();
}
}
}
for (int i = 0; i < numTerms; i++) {
tps[i].close();
}
assertByteArrayEquals(payloadData, verifyPayloadData);
/*
* test lazy skipping
*/
TermPositions tp = reader.termPositions(terms[0]);
tp.next();
tp.nextPosition();
// now we don't read this payload
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
byte[] payload = tp.getPayload(null, 0);
assertEquals(payload[0], payloadData[numTerms]);
tp.nextPosition();
// we don't read this payload and skip to a different document
tp.skipTo(5);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
payload = tp.getPayload(null, 0);
assertEquals(payload[0], payloadData[5 * numTerms]);
/*
* Test different lengths at skip points
*/
tp.seek(terms[1]);
tp.next();
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
tp.skipTo(skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
tp.skipTo(2 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
tp.skipTo(3 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
/*
* Test multiple call of getPayload()
*/
tp.getPayload(null, 0);
try {
// it is forbidden to call getPayload() more than once
// without calling nextPosition()
tp.getPayload(null, 0);
fail("Expected exception not thrown");
} catch (Exception expected) {
// expected exception
}
reader.close();
// test long payload
analyzer = new PayloadAnalyzer();
writer = new IndexWriter(dir, analyzer, true);
String singleTerm = "lucene";
d = new Document();
d.add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.TOKENIZED));
// add a payload whose length is greater than the buffer size of BufferedIndexOutput
payloadData = generateRandomData(2000);
analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
writer.addDocument(d);
writer.optimize();
// flush
writer.close();
reader = IndexReader.open(dir);
tp = reader.termPositions(new Term(fieldName, singleTerm));
tp.next();
tp.nextPosition();
verifyPayloadData = new byte[tp.getPayloadLength()];
tp.getPayload(verifyPayloadData, 0);
byte[] portion = new byte[1500];
System.arraycopy(payloadData, 100, portion, 0, 1500);
assertByteArrayEquals(portion, verifyPayloadData);
reader.close();
}
private byte[] generateRandomData(int n) {
Random rnd = new Random();
byte[] data = new byte[n];
rnd.nextBytes(data);
return data;
}
private Term[] generateTerms(String fieldName, int n) {
int maxDigits = (int) (Math.log(n) / Math.log(10));
Term[] terms = new Term[n];
StringBuffer sb = new StringBuffer();
for (int i = 0; i < n; i++) {
sb.setLength(0);
sb.append("t");
int zeros = maxDigits - (i == 0 ? 0 : (int) (Math.log(i) / Math.log(10))); // guard i == 0: Math.log(0) is -Infinity
for (int j = 0; j < zeros; j++) {
sb.append("0");
}
sb.append(i);
terms[i] = new Term(fieldName, sb.toString());
}
return terms;
}
private void rmDir(String dir) {
File fileDir = new File(dir);
if (fileDir.exists()) {
File[] files = fileDir.listFiles();
if (files != null) {
for (int i = 0; i < files.length; i++) {
files[i].delete();
}
}
fileDir.delete();
}
}
void assertByteArrayEquals(byte[] b1, byte[] b2) {
if (b1.length != b2.length) {
fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
}
for (int i = 0; i < b1.length; i++) {
if (b1[i] != b2[i]) {
fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
}
}
}
/**
* This Analyzer uses a WhitespaceTokenizer and PayloadFilter.
*/
private static class PayloadAnalyzer extends Analyzer {
Map fieldToData = new HashMap();
void setPayloadData(String field, byte[] data, int offset, int length) {
fieldToData.put(field, new PayloadData(0, data, offset, length));
}
void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
}
public TokenStream tokenStream(String fieldName, Reader reader) {
PayloadData payload = (PayloadData) fieldToData.get(fieldName);
TokenStream ts = new WhitespaceTokenizer(reader);
if (payload != null) {
if (payload.numFieldInstancesToSkip == 0) {
ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
} else {
payload.numFieldInstancesToSkip--;
}
}
return ts;
}
private static class PayloadData {
byte[] data;
int offset;
int length;
int numFieldInstancesToSkip;
PayloadData(int skip, byte[] data, int offset, int length) {
numFieldInstancesToSkip = skip;
this.data = data;
this.offset = offset;
this.length = length;
}
}
}
/**
* This Filter adds payloads to the tokens.
*/
private static class PayloadFilter extends TokenFilter {
private byte[] data;
private int length;
private int offset;
public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
super(in);
this.data = data;
this.length = length;
this.offset = offset;
}
public Token next() throws IOException {
Token nextToken = input.next();
if (nextToken != null && offset + length <= data.length) {
nextToken.setPayload(new Payload(data, offset, length));
offset += length;
}
return nextToken;
}
}
}
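For reference, a compact consumer-side sketch of the TermPositions payload API that the test above exercises (reader construction omitted; the term and field names are illustrative):

    TermPositions tp = reader.termPositions(new Term("f2", "payloads"));
    while (tp.next()) {
      int freq = tp.freq();
      for (int i = 0; i < freq; i++) {
        tp.nextPosition();
        // allocate exactly getPayloadLength() bytes and copy the payload in
        byte[] payload = tp.getPayload(new byte[tp.getPayloadLength()], 0);
        // ... application-specific interpretation of the payload bytes ...
      }
    }
    tp.close();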


@@ -48,7 +48,7 @@ public class MockRAMOutputStream extends RAMOutputStream {
     }
   }
 
-  public void flushBuffer(byte[] src, int len) throws IOException {
+  public void flushBuffer(byte[] src, int offset, int len) throws IOException {
     long freeSpace = dir.maxSize - dir.sizeInBytes();
     long realUsage = 0;
@@ -63,14 +63,14 @@ public class MockRAMOutputStream extends RAMOutputStream {
     if (dir.maxSize != 0 && freeSpace <= len) {
       if (freeSpace > 0 && freeSpace < len) {
         realUsage += freeSpace;
-        super.flushBuffer(src, (int) freeSpace);
+        super.flushBuffer(src, offset, (int) freeSpace);
       }
       if (realUsage > dir.maxUsedSize) {
         dir.maxUsedSize = realUsage;
       }
       throw new IOException("fake disk full at " + dir.getRecomputedActualSizeInBytes() + " bytes");
     } else {
-      super.flushBuffer(src, len);
+      super.flushBuffer(src, offset, len);
     }
     if (first) {