mirror of https://github.com/apache/lucene.git
LUCENE-964: remove DocumentWriter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@560378 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 98fa2d898d
commit 1bc5a68549
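In short, the single-segment indexing path that DocumentWriter provided is now reached through IndexWriter. The sketch below shows the pattern the updated tests in this diff follow; the helper name is mine, and it relies on the package-private IndexWriter.flush() call and segmentInfos field visible in the hunks, so it would have to live in org.apache.lucene.index.

  // Sketch only: how the updated tests obtain a freshly written segment without DocumentWriter.
  static SegmentInfo writeOneDocSegment(Directory dir, Analyzer analyzer, Document doc)
      throws IOException {
    IndexWriter writer = new IndexWriter(dir, analyzer, true);  // create a new index in dir
    writer.addDocument(doc);                                    // buffer the document
    writer.flush();                                             // force it out as a new segment
    SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size() - 1); // last segment written
    writer.close();
    return info;                                                // open with SegmentReader.get(info)
  }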
@@ -1,556 +0,0 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;

import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

final class DocumentWriter {
  private Analyzer analyzer;
  private Directory directory;
  private Similarity similarity;
  private FieldInfos fieldInfos;
  private int maxFieldLength;
  private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
  private PrintStream infoStream;

  /** This ctor used by test code only.
   *
   * @param directory The directory to write the document information to
   * @param analyzer The analyzer to use for the document
   * @param similarity The Similarity function
   * @param maxFieldLength The maximum number of tokens a field may have
   */
  DocumentWriter(Directory directory, Analyzer analyzer,
                 Similarity similarity, int maxFieldLength) {
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = similarity;
    this.maxFieldLength = maxFieldLength;
  }

  DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer) {
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = writer.getSimilarity();
    this.maxFieldLength = writer.getMaxFieldLength();
    this.termIndexInterval = writer.getTermIndexInterval();
  }

  final void addDocument(String segment, Document doc)
          throws CorruptIndexException, IOException {
    // create field infos
    fieldInfos = new FieldInfos();
    fieldInfos.add(doc);

    // invert doc into postingTable
    postingTable.clear();                         // clear postingTable
    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
    fieldOffsets = new int[fieldInfos.size()];    // init fieldOffsets
    fieldStoresPayloads = new BitSet(fieldInfos.size());

    fieldBoosts = new float[fieldInfos.size()];   // init fieldBoosts
    Arrays.fill(fieldBoosts, doc.getBoost());

    try {

      // Before we write the FieldInfos we invert the Document. The reason is that
      // during inversion the TokenStreams of tokenized fields are being processed
      // and we might encounter tokens that have payloads associated with them. In
      // this case we have to update the FieldInfo of the particular field.
      invertDocument(doc);

      // sort postingTable into an array
      Posting[] postings = sortPostingTable();

      // write field infos
      fieldInfos.write(directory, segment + ".fnm");

      // write field values
      FieldsWriter fieldsWriter =
              new FieldsWriter(directory, segment, fieldInfos);
      try {
        fieldsWriter.addDocument(doc);
      } finally {
        fieldsWriter.close();
      }

      /*
      for (int i = 0; i < postings.length; i++) {
        Posting posting = postings[i];
        System.out.print(posting.term);
        System.out.print(" freq=" + posting.freq);
        System.out.print(" pos=");
        System.out.print(posting.positions[0]);
        for (int j = 1; j < posting.freq; j++)
          System.out.print("," + posting.positions[j]);
        System.out.println("");
      }
      */

      // write postings
      writePostings(postings, segment);

      // write norms of indexed fields
      writeNorms(segment);
    } finally {
      // close TokenStreams
      IOException ex = null;

      Iterator it = openTokenStreams.iterator();
      while (it.hasNext()) {
        try {
          ((TokenStream) it.next()).close();
        } catch (IOException e) {
          if (ex == null) {       // remember only the first exception
            ex = e;
          }
        }
      }
      openTokenStreams.clear();

      if (ex != null) {
        throw ex;
      }
    }
  }

  // Keys are Terms, values are Postings.
  // Used to buffer a document before it is written to the index.
  private final Hashtable postingTable = new Hashtable();
  private int[] fieldLengths;
  private int[] fieldPositions;
  private int[] fieldOffsets;
  private float[] fieldBoosts;

  // If any of the tokens of a particular field carry a payload
  // then we enable payloads for that field.
  private BitSet fieldStoresPayloads;

  // Keep references of the token streams. We must close them after
  // the postings are written to the segment.
  private List openTokenStreams = new LinkedList();

  // Tokenizes the fields of a document into Postings.
  private final void invertDocument(Document doc)
          throws IOException {
    Iterator fieldIterator = doc.getFields().iterator();
    while (fieldIterator.hasNext()) {
      Fieldable field = (Fieldable) fieldIterator.next();
      String fieldName = field.name();
      int fieldNumber = fieldInfos.fieldNumber(fieldName);

      int length = fieldLengths[fieldNumber];     // length of field
      int position = fieldPositions[fieldNumber]; // position in field
      if (length > 0) position += analyzer.getPositionIncrementGap(fieldName);
      int offset = fieldOffsets[fieldNumber];     // offset field

      if (field.isIndexed()) {
        if (!field.isTokenized()) {               // un-tokenized field
          String stringValue = field.stringValue();
          if (field.isStoreOffsetWithTermVector())
            addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
          else
            addPosition(fieldName, stringValue, position++, null, null);
          offset += stringValue.length();
          length++;
        } else
        { // tokenized field
          TokenStream stream = field.tokenStreamValue();

          // the field does not have a TokenStream,
          // so we have to obtain one from the analyzer
          if (stream == null) {
            Reader reader;                        // find or make Reader
            if (field.readerValue() != null)
              reader = field.readerValue();
            else if (field.stringValue() != null)
              reader = new StringReader(field.stringValue());
            else
              throw new IllegalArgumentException
                      ("field must have either String or Reader value");

            // Tokenize field and add to postingTable
            stream = analyzer.tokenStream(fieldName, reader);
          }

          // remember this TokenStream, we must close it later
          openTokenStreams.add(stream);

          // reset the TokenStream to the first token
          stream.reset();

          Token lastToken = null;
          for (Token t = stream.next(); t != null; t = stream.next()) {
            position += (t.getPositionIncrement() - 1);

            Payload payload = t.getPayload();
            if (payload != null) {
              // enable payloads for this field
              fieldStoresPayloads.set(fieldNumber);
            }

            TermVectorOffsetInfo termVectorOffsetInfo;
            if (field.isStoreOffsetWithTermVector()) {
              termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
            } else {
              termVectorOffsetInfo = null;
            }
            addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);

            lastToken = t;
            if (++length >= maxFieldLength) {
              if (infoStream != null)
                infoStream.println("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
              break;
            }
          }

          if (lastToken != null)
            offset += lastToken.endOffset() + 1;
        }

        fieldLengths[fieldNumber] = length;       // save field length
        fieldPositions[fieldNumber] = position;   // save field position
        fieldBoosts[fieldNumber] *= field.getBoost();
        fieldOffsets[fieldNumber] = offset;
      }
    }

    // update fieldInfos for all fields that have one or more tokens with payloads
    for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i + 1)) {
      fieldInfos.fieldInfo(i).storePayloads = true;
    }
  }

  private final Term termBuffer = new Term("", ""); // avoid consing

  private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) {
    termBuffer.set(field, text);
    //System.out.println("Offset: " + offset);
    Posting ti = (Posting) postingTable.get(termBuffer);
    if (ti != null) {                             // word seen before
      int freq = ti.freq;
      if (ti.positions.length == freq) {          // positions array is full
        int[] newPositions = new int[freq * 2];   // double size
        int[] positions = ti.positions;
        System.arraycopy(positions, 0, newPositions, 0, freq);
        ti.positions = newPositions;

        if (ti.payloads != null) {
          // the current field stores payloads
          Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array
          Payload[] payloads = ti.payloads;
          System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
          ti.payloads = newPayloads;
        }
      }
      ti.positions[freq] = position;              // add new position

      if (payload != null) {
        if (ti.payloads == null) {
          // lazily allocate payload array
          ti.payloads = new Payload[ti.positions.length];
        }
        ti.payloads[freq] = payload;
      }

      if (offset != null) {
        if (ti.offsets.length == freq) {
          TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
          TermVectorOffsetInfo[] offsets = ti.offsets;
          System.arraycopy(offsets, 0, newOffsets, 0, freq);
          ti.offsets = newOffsets;
        }
        ti.offsets[freq] = offset;
      }
      ti.freq = freq + 1;                         // update frequency
    } else {                                      // word not seen before
      Term term = new Term(field, text, false);
      postingTable.put(term, new Posting(term, position, payload, offset));
    }
  }

  private final Posting[] sortPostingTable() {
    // copy postingTable into an array
    Posting[] array = new Posting[postingTable.size()];
    Enumeration postings = postingTable.elements();
    for (int i = 0; postings.hasMoreElements(); i++)
      array[i] = (Posting) postings.nextElement();

    // sort the array
    quickSort(array, 0, array.length - 1);

    return array;
  }

  private static final void quickSort(Posting[] postings, int lo, int hi) {
    if (lo >= hi)
      return;

    int mid = (lo + hi) >>> 1;

    if (postings[lo].term.compareTo(postings[mid].term) > 0) {
      Posting tmp = postings[lo];
      postings[lo] = postings[mid];
      postings[mid] = tmp;
    }

    if (postings[mid].term.compareTo(postings[hi].term) > 0) {
      Posting tmp = postings[mid];
      postings[mid] = postings[hi];
      postings[hi] = tmp;

      if (postings[lo].term.compareTo(postings[mid].term) > 0) {
        Posting tmp2 = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp2;
      }
    }

    int left = lo + 1;
    int right = hi - 1;

    if (left >= right)
      return;

    Term partition = postings[mid].term;

    for (; ;) {
      while (postings[right].term.compareTo(partition) > 0)
        --right;

      while (left < right && postings[left].term.compareTo(partition) <= 0)
        ++left;

      if (left < right) {
        Posting tmp = postings[left];
        postings[left] = postings[right];
        postings[right] = tmp;
        --right;
      } else {
        break;
      }
    }

    quickSort(postings, lo, left);
    quickSort(postings, left + 1, hi);
  }

  private final void writePostings(Posting[] postings, String segment)
          throws CorruptIndexException, IOException {
    IndexOutput freq = null, prox = null;
    TermInfosWriter tis = null;
    TermVectorsWriter termVectorWriter = null;
    try {
      //open files for inverse index storage
      freq = directory.createOutput(segment + ".frq");
      prox = directory.createOutput(segment + ".prx");
      tis = new TermInfosWriter(directory, segment, fieldInfos,
              termIndexInterval);
      TermInfo ti = new TermInfo();
      String currentField = null;
      boolean currentFieldHasPayloads = false;

      for (int i = 0; i < postings.length; i++) {
        Posting posting = postings[i];

        // check to see if we switched to a new field
        String termField = posting.term.field();
        if (currentField != termField) {
          // changing field - see if there is something to save
          currentField = termField;
          FieldInfo fi = fieldInfos.fieldInfo(currentField);
          currentFieldHasPayloads = fi.storePayloads;
          if (fi.storeTermVector) {
            if (termVectorWriter == null) {
              termVectorWriter =
                      new TermVectorsWriter(directory, segment, fieldInfos);
              termVectorWriter.openDocument();
            }
            termVectorWriter.openField(currentField);

          } else if (termVectorWriter != null) {
            termVectorWriter.closeField();
          }
        }

        // add an entry to the dictionary with pointers to prox and freq files
        ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
        tis.add(posting.term, ti);

        // add an entry to the freq file
        int postingFreq = posting.freq;
        if (postingFreq == 1)                     // optimize freq=1
          freq.writeVInt(1);                      // set low bit of doc num.
        else {
          freq.writeVInt(0);                      // the document number
          freq.writeVInt(postingFreq);            // frequency in doc
        }

        int lastPosition = 0;                     // write positions
        int[] positions = posting.positions;
        Payload[] payloads = posting.payloads;
        int lastPayloadLength = -1;

        // The following encoding is being used for positions and payloads:
        // Case 1: current field does not store payloads
        //   Positions     -> <PositionDelta>^freq
        //   PositionDelta -> VInt
        //   The PositionDelta is the difference between the current
        //   and the previous position
        // Case 2: current field stores payloads
        //   Positions     -> <PositionDelta, Payload>^freq
        //   Payload       -> <PayloadLength?, PayloadData>
        //   PositionDelta -> VInt
        //   PayloadLength -> VInt
        //   PayloadData   -> byte^PayloadLength
        //   In this case PositionDelta/2 is the difference between
        //   the current and the previous position. If PositionDelta
        //   is odd, then a PayloadLength encoded as VInt follows,
        //   if PositionDelta is even, then it is assumed that the
        //   length of the current Payload equals the length of the
        //   previous Payload.
        for (int j = 0; j < postingFreq; j++) {   // use delta-encoding
          int position = positions[j];
          int delta = position - lastPosition;
          if (currentFieldHasPayloads) {
            int payloadLength = 0;
            Payload payload = null;
            if (payloads != null) {
              payload = payloads[j];
              if (payload != null) {
                payloadLength = payload.length;
              }
            }
            if (payloadLength == lastPayloadLength) {
              // the length of the current payload equals the length
              // of the previous one. So we do not have to store the length
              // again and we only shift the position delta by one bit
              prox.writeVInt(delta * 2);
            } else {
              // the length of the current payload is different from the
              // previous one. We shift the position delta, set the lowest
              // bit and store the current payload length as VInt.
              prox.writeVInt(delta * 2 + 1);
              prox.writeVInt(payloadLength);
              lastPayloadLength = payloadLength;
            }
            if (payloadLength > 0) {
              // write current payload
              prox.writeBytes(payload.data, payload.offset, payload.length);
            }
          } else {
            // field does not store payloads, just write position delta as VInt
            prox.writeVInt(delta);
          }
          lastPosition = position;
        }
        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
          termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
        }
      }
      if (termVectorWriter != null)
        termVectorWriter.closeDocument();
    } finally {
      // make an effort to close all streams we can but remember and re-throw
      // the first exception encountered in this process
      IOException keep = null;
      if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (tis != null) try { tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (keep != null) throw (IOException) keep.fillInStackTrace();
    }
  }

  private final void writeNorms(String segment) throws IOException {
    for (int n = 0; n < fieldInfos.size(); n++) {
      FieldInfo fi = fieldInfos.fieldInfo(n);
      if (fi.isIndexed && !fi.omitNorms) {
        float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
        IndexOutput norms = directory.createOutput(segment + ".f" + n);
        try {
          norms.writeByte(Similarity.encodeNorm(norm));
        } finally {
          norms.close();
        }
      }
    }
  }

  /** If non-null, a message will be printed to this if maxFieldLength is reached.
   */
  void setInfoStream(PrintStream infoStream) {
    this.infoStream = infoStream;
  }

  int getNumFields() {
    return fieldInfos.size();
  }
}

final class Posting {                             // info about a Term in a doc
  Term term;                                      // the Term
  int freq;                                       // its frequency in doc
  int[] positions;                                // positions it occurs at
  Payload[] payloads;                             // the payloads of the terms
  TermVectorOffsetInfo[] offsets;

  Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) {
    term = t;
    freq = 1;
    positions = new int[1];
    positions[0] = position;

    if (payload != null) {
      payloads = new Payload[1];
      payloads[0] = payload;
    } else
      payloads = null;

    if (offset != null) {
      offsets = new TermVectorOffsetInfo[1];
      offsets[0] = offset;
    } else
      offsets = null;
  }
}
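The comment block inside writePostings above spells out how position deltas and payload lengths share the prox stream. As an illustration only (this helper is not part of the commit), the same rule can be written out in isolation against the IndexOutput.writeVInt call used above; the payload bytes themselves are omitted for brevity.

  // Illustration of the prox encoding described in writePostings: for a field that stores
  // payloads, each position is written as a shifted delta whose low bit says whether a new
  // payload length follows.
  static void writePositions(IndexOutput prox, int[] positions, int[] payloadLengths)
      throws IOException {
    int lastPosition = 0;
    int lastPayloadLength = -1;
    for (int j = 0; j < positions.length; j++) {
      int delta = positions[j] - lastPosition;
      int payloadLength = payloadLengths[j];
      if (payloadLength == lastPayloadLength) {
        prox.writeVInt(delta * 2);          // even: same payload length as before
      } else {
        prox.writeVInt(delta * 2 + 1);      // odd: a new payload length follows
        prox.writeVInt(payloadLength);
        lastPayloadLength = payloadLength;
      }
      lastPosition = positions[j];
    }
  }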
@@ -207,55 +207,38 @@ class DocHelper {
}

/**
 * Writes the document to the directory using a segment named "test"
 * Writes the document to the directory using a segment
 * named "test"; returns the SegmentInfo describing the new
 * segment
 * @param dir
 * @param doc
 * @throws IOException
 */
public static void writeDoc(Directory dir, Document doc) throws IOException
public static SegmentInfo writeDoc(Directory dir, Document doc) throws IOException
{
  writeDoc(dir, "test", doc);
  return writeDoc(dir, new WhitespaceAnalyzer(), Similarity.getDefault(), doc);
}

/**
 * Writes the document to the directory in the given segment
 * @param dir
 * @param segment
 * @param doc
 * @throws IOException
 */
public static void writeDoc(Directory dir, String segment, Document doc) throws IOException
{
  Similarity similarity = Similarity.getDefault();
  writeDoc(dir, new WhitespaceAnalyzer(), similarity, segment, doc);
}

/**
 * Writes the document to the directory segment named "test" using the specified analyzer and similarity
 * Writes the document to the directory using the analyzer
 * and the similarity score; returns the SegmentInfo
 * describing the new segment
 * @param dir
 * @param analyzer
 * @param similarity
 * @param doc
 * @throws IOException
 */
public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc) throws IOException
public static SegmentInfo writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc) throws IOException
{
  writeDoc(dir, analyzer, similarity, "test", doc);
}

/**
 * Writes the document to the directory segment using the analyzer and the similarity score
 * @param dir
 * @param analyzer
 * @param similarity
 * @param segment
 * @param doc
 * @throws IOException
 */
public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc) throws IOException
{
  DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
  writer.addDocument(segment, doc);
  IndexWriter writer = new IndexWriter(dir, analyzer);
  writer.setSimilarity(similarity);
  //writer.setUseCompoundFile(false);
  writer.addDocument(doc);
  writer.flush();
  SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1);
  writer.close();
  return info;
}

public static int numFields(Document doc) {
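After this change a test no longer invents a segment name; it keeps the SegmentInfo that writeDoc returns and opens that directly. A sketch of the resulting setUp pattern, mirroring the test hunks below (helper and variable names are illustrative):

  // Illustrative only: write one document via DocHelper and open the segment it produced.
  static SegmentReader openFreshSegment(Directory dir) throws IOException {
    Document doc = new Document();
    DocHelper.setupDoc(doc);                          // populate with the standard test fields
    SegmentInfo info = DocHelper.writeDoc(dir, doc);  // segment name now chosen by IndexWriter
    return SegmentReader.get(info);                   // no hard-coded "test"/"seg-1" names needed
  }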
@@ -105,14 +105,16 @@ public class TestDoc extends TestCase {
StringWriter sw = new StringWriter();
PrintWriter out = new PrintWriter(sw, true);

Directory directory = FSDirectory.getDirectory(indexDir, true);
directory.close();
Directory directory = FSDirectory.getDirectory(indexDir);
IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);

SegmentInfo si1 = indexDoc("one", "test.txt");
SegmentInfo si1 = indexDoc(writer, "test.txt");
printSegment(out, si1);

SegmentInfo si2 = indexDoc("two", "test2.txt");
SegmentInfo si2 = indexDoc(writer, "test2.txt");
printSegment(out, si2);
writer.close();
directory.close();

SegmentInfo siMerge = merge(si1, si2, "merge", false);
printSegment(out, siMerge);

@@ -131,14 +133,16 @@ public class TestDoc extends TestCase {
sw = new StringWriter();
out = new PrintWriter(sw, true);

directory = FSDirectory.getDirectory(indexDir, true);
directory.close();
directory = FSDirectory.getDirectory(indexDir);
writer = new IndexWriter(directory, new SimpleAnalyzer(), true);

si1 = indexDoc("one", "test.txt");
si1 = indexDoc(writer, "test.txt");
printSegment(out, si1);

si2 = indexDoc("two", "test2.txt");
si2 = indexDoc(writer, "test2.txt");
printSegment(out, si2);
writer.close();
directory.close();

siMerge = merge(si1, si2, "merge", true);
printSegment(out, siMerge);

@@ -157,21 +161,14 @@ public class TestDoc extends TestCase {
}

private SegmentInfo indexDoc(String segment, String fileName)
private SegmentInfo indexDoc(IndexWriter writer, String fileName)
throws Exception
{
Directory directory = FSDirectory.getDirectory(indexDir, false);
Analyzer analyzer = new SimpleAnalyzer();
DocumentWriter writer =
new DocumentWriter(directory, analyzer, Similarity.getDefault(), 1000);

File file = new File(workDir, fileName);
Document doc = FileDocument.Document(file);

writer.addDocument(segment, doc);

directory.close();
return new SegmentInfo(segment, 1, directory, false, false);
writer.addDocument(doc);
writer.flush();
return writer.segmentInfos.info(writer.segmentInfos.size()-1);
}

@@ -32,6 +32,8 @@ import org.apache.lucene.store.RAMDirectory;
import java.io.Reader;
import java.io.IOException;

import java.util.Arrays;

public class TestDocumentWriter extends TestCase {
private RAMDirectory dir;

@@ -57,11 +59,13 @@ public class TestDocumentWriter extends TestCase {
DocHelper.setupDoc(testDoc);
Analyzer analyzer = new WhitespaceAnalyzer();
Similarity similarity = Similarity.getDefault();
DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
String segName = "test";
writer.addDocument(segName, testDoc);
IndexWriter writer = new IndexWriter(dir, analyzer, true);
writer.addDocument(testDoc);
writer.flush();
SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1);
writer.close();
//After adding the document, we should be able to read it back in
SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
SegmentReader reader = SegmentReader.get(info);
assertTrue(reader != null);
Document doc = reader.document(0);
assertTrue(doc != null);

@@ -89,14 +93,14 @@ public class TestDocumentWriter extends TestCase {
assertTrue(fields != null && fields.length == 1);
assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));

// test that the norm file is not present if omitNorms is true
// test that the norms are not present in the segment if
// omitNorms is true
for (int i = 0; i < reader.fieldInfos.size(); i++) {
FieldInfo fi = reader.fieldInfos.fieldInfo(i);
if (fi.isIndexed) {
assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
assertTrue(fi.omitNorms == !reader.hasNorms(fi.name));
}
}

}

public void testPositionIncrementGap() throws IOException {

@@ -111,14 +115,17 @@ public class TestDocumentWriter extends TestCase {
};

Similarity similarity = Similarity.getDefault();
DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
IndexWriter writer = new IndexWriter(dir, analyzer, true);

Document doc = new Document();
doc.add(new Field("repeated", "repeated one", Field.Store.YES, Field.Index.TOKENIZED));
doc.add(new Field("repeated", "repeated two", Field.Store.YES, Field.Index.TOKENIZED));

String segName = "test";
writer.addDocument(segName, doc);
SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
writer.addDocument(doc);
writer.flush();
SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1);
writer.close();
SegmentReader reader = SegmentReader.get(info);

TermPositions termPositions = reader.termPositions(new Term("repeated", "repeated"));
assertTrue(termPositions.next());

@@ -130,7 +137,7 @@ public class TestDocumentWriter extends TestCase {

public void testPreAnalyzedField() throws IOException {
Similarity similarity = Similarity.getDefault();
DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
Document doc = new Document();

doc.add(new Field("preanalyzed", new TokenStream() {

@@ -147,9 +154,11 @@ public class TestDocumentWriter extends TestCase {

}, TermVector.NO));

String segName = "test";
writer.addDocument(segName, doc);
SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
writer.addDocument(doc);
writer.flush();
SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1);
writer.close();
SegmentReader reader = SegmentReader.get(info);

TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
assertTrue(termPositions.next());

@@ -35,6 +35,8 @@ public class TestFieldsReader extends TestCase {
private Document testDoc = new Document();
private FieldInfos fieldInfos = null;

private final static String TEST_SEGMENT_NAME = "_0";

public TestFieldsReader(String s) {
super(s);
}

@@ -43,16 +45,16 @@ public class TestFieldsReader extends TestCase {
fieldInfos = new FieldInfos();
DocHelper.setupDoc(testDoc);
fieldInfos.add(testDoc);
DocumentWriter writer = new DocumentWriter(dir, new WhitespaceAnalyzer(),
Similarity.getDefault(), 50);
assertTrue(writer != null);
writer.addDocument("test", testDoc);
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setUseCompoundFile(false);
writer.addDocument(testDoc);
writer.close();
}

public void test() throws IOException {
assertTrue(dir != null);
assertTrue(fieldInfos != null);
FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
assertTrue(reader != null);
assertTrue(reader.size() == 1);
Document doc = reader.doc(0, null);

@@ -82,7 +84,7 @@ public class TestFieldsReader extends TestCase {
public void testLazyFields() throws Exception {
assertTrue(dir != null);
assertTrue(fieldInfos != null);
FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
assertTrue(reader != null);
assertTrue(reader.size() == 1);
Set loadFieldNames = new HashSet();

@@ -137,7 +139,7 @@ public class TestFieldsReader extends TestCase {
public void testLazyFieldsAfterClose() throws Exception {
assertTrue(dir != null);
assertTrue(fieldInfos != null);
FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
assertTrue(reader != null);
assertTrue(reader.size() == 1);
Set loadFieldNames = new HashSet();

@@ -167,7 +169,7 @@ public class TestFieldsReader extends TestCase {
public void testLoadFirst() throws Exception {
assertTrue(dir != null);
assertTrue(fieldInfos != null);
FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
assertTrue(reader != null);
assertTrue(reader.size() == 1);
LoadFirstFieldSelector fieldSelector = new LoadFirstFieldSelector();

@@ -200,10 +202,12 @@ public class TestFieldsReader extends TestCase {
_TestUtil.rmDir(file);
FSDirectory tmpDir = FSDirectory.getDirectory(file);
assertTrue(tmpDir != null);
DocumentWriter writer = new DocumentWriter(tmpDir, new WhitespaceAnalyzer(),
Similarity.getDefault(), 50);
assertTrue(writer != null);
writer.addDocument("test", testDoc);

IndexWriter writer = new IndexWriter(tmpDir, new WhitespaceAnalyzer(), true);
writer.setUseCompoundFile(false);
writer.addDocument(testDoc);
writer.close();

assertTrue(fieldInfos != null);
FieldsReader reader;
long lazyTime = 0;

@@ -214,7 +218,7 @@ public class TestFieldsReader extends TestCase {
SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.EMPTY_SET, lazyFieldNames);

for (int i = 0; i < length; i++) {
reader = new FieldsReader(tmpDir, "test", fieldInfos);
reader = new FieldsReader(tmpDir, TEST_SEGMENT_NAME, fieldInfos);
assertTrue(reader != null);
assertTrue(reader.size() == 1);

@@ -238,7 +242,7 @@ public class TestFieldsReader extends TestCase {
doc = null;
//Hmmm, are we still in cache???
System.gc();
reader = new FieldsReader(tmpDir, "test", fieldInfos);
reader = new FieldsReader(tmpDir, TEST_SEGMENT_NAME, fieldInfos);
doc = reader.doc(0, fieldSelector);
field = doc.getFieldable(DocHelper.LARGE_LAZY_FIELD_KEY);
assertTrue("field is not lazy", field.isLazy() == true);

@@ -256,7 +260,7 @@ public class TestFieldsReader extends TestCase {
}

public void testLoadSize() throws IOException {
FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
Document doc;

doc = reader.doc(0, new FieldSelector(){

@@ -43,15 +43,20 @@ public class TestMultiReader extends TestCase {
protected void setUp() throws IOException {
DocHelper.setupDoc(doc1);
DocHelper.setupDoc(doc2);
DocHelper.writeDoc(dir, "seg-1", doc1);
DocHelper.writeDoc(dir, "seg-2", doc2);
SegmentInfo info1 = DocHelper.writeDoc(dir, doc1);
SegmentInfo info2 = DocHelper.writeDoc(dir, doc2);
sis.write(dir);
reader1 = SegmentReader.get(new SegmentInfo("seg-1", 1, dir));
reader2 = SegmentReader.get(new SegmentInfo("seg-2", 1, dir));
openReaders();
}

private void openReaders() throws IOException {
sis.read(dir);
reader1 = SegmentReader.get(sis.info(0));
reader2 = SegmentReader.get(sis.info(1));
readers[0] = reader1;
readers[1] = reader2;
}

public void test() {
assertTrue(dir != null);
assertTrue(reader1 != null);

@@ -88,6 +93,7 @@ public class TestMultiReader extends TestCase {
reader.commit();
reader.close();
sis.read(dir);
openReaders();
reader = new MultiSegmentReader(dir, sis, false, readers);
assertEquals( 2, reader.numDocs() );

@@ -32,12 +32,10 @@ public class TestSegmentMerger extends TestCase {
//First segment to be merged
private Directory merge1Dir = new RAMDirectory();
private Document doc1 = new Document();
private String merge1Segment = "test-1";
private SegmentReader reader1 = null;
//Second Segment to be merged
private Directory merge2Dir = new RAMDirectory();
private Document doc2 = new Document();
private String merge2Segment = "test-2";
private SegmentReader reader2 = null;

@@ -47,11 +45,11 @@ public class TestSegmentMerger extends TestCase {

protected void setUp() throws IOException {
DocHelper.setupDoc(doc1);
DocHelper.writeDoc(merge1Dir, merge1Segment, doc1);
SegmentInfo info1 = DocHelper.writeDoc(merge1Dir, doc1);
DocHelper.setupDoc(doc2);
DocHelper.writeDoc(merge2Dir, merge2Segment, doc2);
reader1 = SegmentReader.get(new SegmentInfo(merge1Segment, 1, merge1Dir));
reader2 = SegmentReader.get(new SegmentInfo(merge2Segment, 1, merge2Dir));
SegmentInfo info2 = DocHelper.writeDoc(merge2Dir, doc2);
reader1 = SegmentReader.get(info1);
reader2 = SegmentReader.get(info2);
}

public void test() {

@@ -41,8 +41,8 @@ public class TestSegmentReader extends TestCase {
//TODO: Setup the reader w/ multiple documents
protected void setUp() throws IOException {
DocHelper.setupDoc(testDoc);
DocHelper.writeDoc(dir, testDoc);
reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
SegmentInfo info = DocHelper.writeDoc(dir, testDoc);
reader = SegmentReader.get(info);
}

protected void tearDown() {

@@ -75,8 +75,8 @@ public class TestSegmentReader extends TestCase {
public void testDelete() throws IOException {
Document docToDelete = new Document();
DocHelper.setupDoc(docToDelete);
DocHelper.writeDoc(dir, "seg-to-delete", docToDelete);
SegmentReader deleteReader = SegmentReader.get(new SegmentInfo("seg-to-delete", 1, dir));
SegmentInfo info = DocHelper.writeDoc(dir, docToDelete);
SegmentReader deleteReader = SegmentReader.get(info);
assertTrue(deleteReader != null);
assertTrue(deleteReader.numDocs() == 1);
deleteReader.deleteDocument(0);

@@ -29,6 +29,7 @@ import java.io.IOException;
public class TestSegmentTermDocs extends TestCase {
private Document testDoc = new Document();
private Directory dir = new RAMDirectory();
private SegmentInfo info;

public TestSegmentTermDocs(String s) {
super(s);

@@ -36,7 +37,7 @@ public class TestSegmentTermDocs extends TestCase {

protected void setUp() throws IOException {
DocHelper.setupDoc(testDoc);
DocHelper.writeDoc(dir, testDoc);
info = DocHelper.writeDoc(dir, testDoc);
}

@@ -50,7 +51,7 @@ public class TestSegmentTermDocs extends TestCase {

public void testTermDocs() throws IOException {
//After adding the document, we should be able to read it back in
SegmentReader reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
SegmentReader reader = SegmentReader.get(info);
assertTrue(reader != null);
SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
assertTrue(segTermDocs != null);

@@ -68,7 +69,7 @@ public class TestSegmentTermDocs extends TestCase {
public void testBadSeek() throws IOException {
{
//After adding the document, we should be able to read it back in
SegmentReader reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
SegmentReader reader = SegmentReader.get(info);
assertTrue(reader != null);
SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
assertTrue(segTermDocs != null);

@@ -78,7 +79,7 @@ public class TestSegmentTermDocs extends TestCase {
}
{
//After adding the document, we should be able to read it back in
SegmentReader reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
SegmentReader reader = SegmentReader.get(info);
assertTrue(reader != null);
SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
assertTrue(segTermDocs != null);