LUCENE-1120: speed up merging of term vectors by bulk-copying the raw bytes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@615183 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2008-01-25 11:32:32 +00:00
parent ee835ccf21
commit 76dfb92a44
19 changed files with 331 additions and 148 deletions

View File

@ -1,4 +1,4 @@
Lucene Change Log
Lucene Change Log
$Id$
======================= Trunk (not yet released) =======================
@ -32,6 +32,10 @@ Optimizations
disk full situation before actually filling up the disk. (Mike
McCandless)
2. LUCENE-1120: Speed up merging of term vectors by bulk-copying the
raw bytes for each contiguous range of non-deleted documents.
(Mike McCandless)
Documentation
Build

View File

@ -662,12 +662,12 @@ final class DocumentsWriter {
// Append term vectors to the real outputs:
if (tvx != null) {
tvx.writeLong(tvd.getFilePointer());
tvx.writeLong(tvf.getFilePointer());
tvd.writeVInt(numVectorFields);
if (numVectorFields > 0) {
for(int i=0;i<numVectorFields;i++)
tvd.writeVInt(vectorFieldNumbers[i]);
assert 0 == vectorFieldPointers[0];
tvd.writeVLong(tvf.getFilePointer());
long lastPos = vectorFieldPointers[0];
for(int i=1;i<numVectorFields;i++) {
long pos = vectorFieldPointers[i];
@ -870,22 +870,23 @@ final class DocumentsWriter {
// state:
try {
tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvd = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvf = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
// We must "catch up" for all docIDs that had no
// vectors before this one
for(int i=0;i<docID;i++)
for(int i=0;i<docID;i++) {
tvx.writeLong(0);
tvx.writeLong(0);
}
} catch (Throwable t) {
throw new AbortException(t, DocumentsWriter.this);
}
files = null;
}
numVectorFields = 0;
}
}

View File

@ -205,6 +205,38 @@ final class SegmentMerger {
}
}
private SegmentReader[] matchingSegmentReaders;
private int[] rawDocLengths;
private int[] rawDocLengths2;
/**
 * Populates matchingSegmentReaders and allocates the two
 * raw-length scratch arrays.  Slot i of matchingSegmentReaders
 * is left null unless the i'th reader is a SegmentReader whose
 * field names agree, number for number, with the merged
 * FieldInfos; only such readers qualify for bulk copying.
 */
private void setMatchingSegmentReaders() {
  final int readerCount = readers.size();
  matchingSegmentReaders = new SegmentReader[readerCount];

  for (int slot = 0; slot < readerCount; slot++) {
    final IndexReader candidate = (IndexReader) readers.elementAt(slot);
    if (!(candidate instanceof SegmentReader))
      continue;

    final SegmentReader segReader = (SegmentReader) candidate;
    final FieldInfos segInfos = segReader.getFieldInfos();

    // Walk the segment's field numbers and stop at the first
    // name that differs from the merged mapping
    boolean congruent = true;
    for (int fieldNum = 0; fieldNum < segInfos.size(); fieldNum++) {
      if (!fieldInfos.fieldName(fieldNum).equals(segInfos.fieldName(fieldNum))) {
        congruent = false;
        break;
      }
    }

    if (congruent)
      matchingSegmentReaders[slot] = segReader;
  }

  // Scratch space for the per-document byte lengths gathered
  // while bulk-reading raw documents
  rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
  rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
}
/**
*
* @return The number of documents in all of the readers
@ -248,34 +280,10 @@ final class SegmentMerger {
int docCount = 0;
setMatchingSegmentReaders();
if (mergeDocStores) {
// If the i'th reader is a SegmentReader and has
// identical fieldName -> number mapping, then this
// array will be non-null at position i:
SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.size()];
// If this reader is a SegmentReader, and all of its
// field name -> number mappings match the "merged"
// FieldInfos, then we can do a bulk copy of the
// stored fields:
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
if (reader instanceof SegmentReader) {
SegmentReader segmentReader = (SegmentReader) reader;
boolean same = true;
FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
for (int j = 0; same && j < segmentFieldInfos.size(); j++)
same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
if (same) {
matchingSegmentReaders[i] = segmentReader;
}
}
}
// Used for bulk-reading raw bytes for stored fields
final int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
// for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
// in merge mode, we use this FieldSelector
FieldSelector fieldSelectorMerge = new FieldSelector() {
@ -350,15 +358,45 @@ final class SegmentMerger {
try {
for (int r = 0; r < readers.size(); r++) {
final SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
TermVectorsReader matchingVectorsReader;
if (matchingSegmentReader != null) {
matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;
// If the TV* files are an older format then they
// cannot read raw docs:
if (matchingVectorsReader != null && !matchingVectorsReader.canReadRawDocs())
matchingVectorsReader = null;
} else
matchingVectorsReader = null;
IndexReader reader = (IndexReader) readers.elementAt(r);
int maxDoc = reader.maxDoc();
for (int docNum = 0; docNum < maxDoc; docNum++) {
for (int docNum = 0; docNum < maxDoc;) {
// skip deleted docs
if (reader.isDeleted(docNum))
continue;
termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
if (checkAbort != null)
checkAbort.work(300);
if (!reader.isDeleted(docNum)) {
if (matchingVectorsReader != null) {
// We can optimize this case (doing a bulk
// byte copy) since the field numbers are
// identical
int start = docNum;
int numDocs = 0;
do {
docNum++;
numDocs++;
} while(docNum < maxDoc && !matchingSegmentReader.isDeleted(docNum) && numDocs < MAX_RAW_MERGE_DOCS);
matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
if (checkAbort != null)
checkAbort.work(300*numDocs);
} else {
termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
docNum++;
if (checkAbort != null)
checkAbort.work(300);
}
} else
docNum++;
}
}
} finally {

View File

@ -22,13 +22,18 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import java.io.IOException;
import java.util.Arrays;
/**
* @version $Id$
*/
class TermVectorsReader implements Cloneable {
// NOTE: if you make a new format, it must be larger than
// the current format
static final int FORMAT_VERSION = 2;
static final int FORMAT_VERSION2 = 3;
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
static final int FORMAT_SIZE = 4;
@ -41,13 +46,13 @@ class TermVectorsReader implements Cloneable {
private IndexInput tvd;
private IndexInput tvf;
private int size;
private int numTotalDocs;
// The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file.
private int docStoreOffset;
private int tvdFormat;
private int tvfFormat;
private final int format;
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
throws CorruptIndexException, IOException {
@ -56,7 +61,7 @@ class TermVectorsReader implements Cloneable {
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
throws CorruptIndexException, IOException {
this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE, -1, 0);
this(d, segment, fieldInfos, readBufferSize, -1, 0);
}
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
@ -66,22 +71,35 @@ class TermVectorsReader implements Cloneable {
try {
if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
checkValidFormat(tvx);
format = checkValidFormat(tvx);
tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
tvdFormat = checkValidFormat(tvd);
final int tvdFormat = checkValidFormat(tvd);
tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
tvfFormat = checkValidFormat(tvf);
final int tvfFormat = checkValidFormat(tvf);
assert format == tvdFormat;
assert format == tvfFormat;
if (format >= FORMAT_VERSION2) {
assert (tvx.length()-FORMAT_SIZE) % 16 == 0;
numTotalDocs = (int) (tvx.length() >> 4);
} else {
assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
numTotalDocs = (int) (tvx.length() >> 3);
}
if (-1 == docStoreOffset) {
this.docStoreOffset = 0;
this.size = (int) (tvx.length() >> 3);
this.size = numTotalDocs;
} else {
this.docStoreOffset = docStoreOffset;
this.size = size;
// Verify the file is long enough to hold all of our
// docs
assert ((int) (tvx.length() / 8)) >= size + docStoreOffset;
assert numTotalDocs >= size + docStoreOffset;
}
}
} else
format = 0;
this.fieldInfos = fieldInfos;
success = true;
@ -97,25 +115,93 @@ class TermVectorsReader implements Cloneable {
}
}
/**
 * Used for bulk copy when merging.  Returns this reader's own
 * tvd input (not a clone), so its file position is whatever the
 * last seek/read on this reader left it at.
 */
IndexInput getTvdStream() {
return tvd;
}
/**
 * Used for bulk copy when merging.  Returns this reader's own
 * tvf input (not a clone), so its file position is whatever the
 * last seek/read on this reader left it at.
 */
IndexInput getTvfStream() {
return tvf;
}
// Positions tvx at the index entry for docNum (relative to
// this reader's docStoreOffset).  Entries are 16 bytes wide
// from FORMAT_VERSION2 on and 8 bytes wide before that; the
// file starts with a FORMAT_SIZE byte header.
final private void seekTvx(final int docNum) throws IOException {
  final long bytesPerEntry = (format >= FORMAT_VERSION2) ? 16L : 8L;
  tvx.seek((docNum + docStoreOffset) * bytesPerEntry + FORMAT_SIZE);
}
/**
 * Returns true if the term vector files were written with
 * FORMAT_VERSION2 or newer.  rawDocs throws
 * IllegalStateException for older formats, so callers check
 * this first.
 */
boolean canReadRawDocs() {
return format >= FORMAT_VERSION2;
}
/** Retrieve the length (in bytes) of the tvd and tvf
 *  entries for the next numDocs starting with
 *  startDocID.  This is used for bulk copying when
 *  merging segments, if the field numbers are
 *  congruent.  Once this returns, the tvf & tvd streams
 *  are seeked to the startDocID.
 *
 *  If this reader has no term vector files (tvx is null),
 *  both arrays are filled with zeros and no stream is
 *  touched.
 *
 *  @param tvdLengths receives, at index i, the tvd byte length
 *         of document startDocID+i
 *  @param tvfLengths receives, at index i, the tvf byte length
 *         of document startDocID+i
 *  @param startDocID first document, relative to this reader
 *         (docStoreOffset is applied internally)
 *  @param numDocs number of consecutive documents to measure
 *  @throws IllegalStateException if the files predate
 *          FORMAT_VERSION2; check canReadRawDocs() first
 *  @throws IOException if reading or seeking fails */
final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {

  if (tvx == null) {
    Arrays.fill(tvdLengths, 0);
    Arrays.fill(tvfLengths, 0);
    return;
  }

  // SegmentMerger calls canReadRawDocs() first and should
  // not call us if that returns false.
  if (format < FORMAT_VERSION2)
    throw new IllegalStateException("cannot read raw docs with older term vector formats");

  seekTvx(startDocID);

  long tvdPosition = tvx.readLong();
  tvd.seek(tvdPosition);

  long tvfPosition = tvx.readLong();
  tvf.seek(tvfPosition);

  long lastTvdPosition = tvdPosition;
  long lastTvfPosition = tvfPosition;

  int count = 0;
  while (count < numDocs) {
    // Absolute index of the NEXT entry in tvx.  seekTvx
    // addresses entries by docNum + docStoreOffset and
    // numTotalDocs counts every entry in the file, so the
    // offset must be included here as well; otherwise a
    // reader at a non-zero offset would read past the end of
    // tvx when its range ends at the last entry.
    final int docID = docStoreOffset + startDocID + count + 1;
    if (docID < numTotalDocs) {
      tvdPosition = tvx.readLong();
      tvfPosition = tvx.readLong();
    } else {
      // Last entry in the files: its data runs to the end
      tvdPosition = tvd.length();
      tvfPosition = tvf.length();
    }
    tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
    tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
    count++;
    lastTvdPosition = tvdPosition;
    lastTvfPosition = tvfPosition;
  }
}
private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
{
int format = in.readInt();
if (format > FORMAT_VERSION)
{
if (format > FORMAT_VERSION2) {
throw new CorruptIndexException("Incompatible format version: " + format + " expected "
+ FORMAT_VERSION + " or less");
+ FORMAT_VERSION2 + " or less");
}
return format;
}
void close() throws IOException {
// make all effort to close up. Keep the first exception
// and throw it as a new one.
IOException keep = null;
if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (keep != null) throw (IOException) keep.fillInStackTrace();
// make all effort to close up. Keep the first exception
// and throw it as a new one.
IOException keep = null;
if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (keep != null) throw (IOException) keep.fillInStackTrace();
}
/**
@ -133,11 +219,11 @@ class TermVectorsReader implements Cloneable {
//We don't need to do this in other seeks because we already have the
// file pointer
//that was written in another file
tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
seekTvx(docNum);
//System.out.println("TVX Pointer: " + tvx.getFilePointer());
long position = tvx.readLong();
long tvdPosition = tvx.readLong();
tvd.seek(position);
tvd.seek(tvdPosition);
int fieldCount = tvd.readVInt();
//System.out.println("Num Fields: " + fieldCount);
// There are only a few fields per document. We opt for a full scan
@ -146,7 +232,7 @@ class TermVectorsReader implements Cloneable {
int number = 0;
int found = -1;
for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == FORMAT_VERSION)
if (format >= FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt();
@ -159,8 +245,12 @@ class TermVectorsReader implements Cloneable {
// document
if (found != -1) {
// Compute position in the tvf file
position = 0;
for (int i = 0; i <= found; i++)
long position;
if (format >= FORMAT_VERSION2)
position = tvx.readLong();
else
position = tvd.readVLong();
for (int i = 1; i <= found; i++)
position += tvd.readVLong();
mapper.setDocumentNumber(docNum);
@ -190,6 +280,45 @@ class TermVectorsReader implements Cloneable {
return mapper.materializeVector();
}
// Decodes fieldCount field numbers from tvd and resolves each
// to its name; tvd must already be positioned just past the
// document's field count.  From FORMAT_VERSION on each number
// is stored as-is; older files store it as a delta from the
// previous one.
final private String[] readFields(int fieldCount) throws IOException {
  final boolean absoluteNumbers = format >= FORMAT_VERSION;
  final String[] names = new String[fieldCount];
  int fieldNumber = 0;
  for (int slot = 0; slot < fieldCount; slot++) {
    final int encoded = tvd.readVInt();
    fieldNumber = absoluteNumbers ? encoded : fieldNumber + encoded;
    names[slot] = fieldInfos.fieldName(fieldNumber);
  }
  return names;
}
// Builds the absolute tvf offsets for a document's fields;
// tvx and tvd must already be positioned for that document.
// The first offset comes from tvx (FORMAT_VERSION2 and later)
// or from tvd (older formats); every following offset is a
// delta read from tvd.  Expects fieldCount >= 1.
final private long[] readTvfPointers(int fieldCount) throws IOException {
  long filePos = (format >= FORMAT_VERSION2) ? tvx.readLong() : tvd.readVLong();

  final long[] pointers = new long[fieldCount];
  pointers[0] = filePos;

  int slot = 1;
  while (slot < fieldCount) {
    filePos += tvd.readVLong();
    pointers[slot] = filePos;
    slot++;
  }

  return pointers;
}
/**
* Return all term vectors stored for this document or null if they could not be read in.
*
@ -201,34 +330,16 @@ class TermVectorsReader implements Cloneable {
TermFreqVector[] result = null;
if (tvx != null) {
//We need to offset by
tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
long position = tvx.readLong();
seekTvx(docNum);
long tvdPosition = tvx.readLong();
tvd.seek(position);
tvd.seek(tvdPosition);
int fieldCount = tvd.readVInt();
// No fields are vectorized for this document
if (fieldCount != 0) {
int number = 0;
String[] fields = new String[fieldCount];
for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt();
fields[i] = fieldInfos.fieldName(number);
}
// Compute position in the tvf file
position = 0;
long[] tvfPointers = new long[fieldCount];
for (int i = 0; i < fieldCount; i++) {
position += tvd.readVLong();
tvfPointers[i] = position;
}
final String[] fields = readFields(fieldCount);
final long[] tvfPointers = readTvfPointers(fieldCount);
result = readTermVectors(docNum, fields, tvfPointers);
}
} else {
@ -241,34 +352,17 @@ class TermVectorsReader implements Cloneable {
// Check if no term vectors are available for this segment at all
if (tvx != null) {
//We need to offset by
tvx.seek((docNumber * 8L) + FORMAT_SIZE);
long position = tvx.readLong();
tvd.seek(position);
seekTvx(docNumber);
long tvdPosition = tvx.readLong();
tvd.seek(tvdPosition);
int fieldCount = tvd.readVInt();
// No fields are vectorized for this document
if (fieldCount != 0) {
int number = 0;
String[] fields = new String[fieldCount];
for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt();
fields[i] = fieldInfos.fieldName(number);
}
// Compute position in the tvf file
position = 0;
long[] tvfPointers = new long[fieldCount];
for (int i = 0; i < fieldCount; i++) {
position += tvd.readVLong();
tvfPointers[i] = position;
}
final String[] fields = readFields(fieldCount);
final long[] tvfPointers = readTvfPointers(fieldCount);
mapper.setDocumentNumber(docNumber);
readTermVectors(fields, tvfPointers, mapper);
}
@ -293,9 +387,8 @@ class TermVectorsReader implements Cloneable {
private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
throws IOException {
for (int i = 0; i < fields.length; i++) {
readTermVector(fields[i], tvfPointers[i], mapper);
readTermVector(fields[i], tvfPointers[i], mapper);
}
}
@ -324,7 +417,7 @@ class TermVectorsReader implements Cloneable {
boolean storePositions;
boolean storeOffsets;
if(tvfFormat == FORMAT_VERSION){
if (format >= FORMAT_VERSION){
byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
@ -400,8 +493,6 @@ class TermVectorsReader implements Cloneable {
}
}
protected Object clone() {
if (tvx == null || tvd == null || tvf == null)
@ -418,11 +509,9 @@ class TermVectorsReader implements Cloneable {
return clone;
}
}
/**
* Models the existing parallel array structure
*/

View File

@ -33,11 +33,11 @@ final class TermVectorsWriter {
throws IOException {
// Open files for TermVector storage
tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
this.fieldInfos = fieldInfos;
}
@ -53,6 +53,7 @@ final class TermVectorsWriter {
throws IOException {
tvx.writeLong(tvd.getFilePointer());
tvx.writeLong(tvf.getFilePointer());
if (vectors != null) {
final int numFields = vectors.length;
@ -145,8 +146,8 @@ final class TermVectorsWriter {
}
// 2nd pass: write field pointers to tvd
long lastFieldPointer = 0;
for (int i=0; i<numFields; i++) {
long lastFieldPointer = fieldPointers[0];
for (int i=1; i<numFields; i++) {
final long fieldPointer = fieldPointers[i];
tvd.writeVLong(fieldPointer-lastFieldPointer);
lastFieldPointer = fieldPointer;
@ -155,6 +156,28 @@ final class TermVectorsWriter {
tvd.writeVInt(0);
}
/**
 * Bulk-copies numDocs documents from reader into our streams
 * without decoding them; used to speed up merging when the
 * field numbers are congruent.  One tvx entry (tvd pointer,
 * tvf pointer) is written per document, then the summed byte
 * counts are copied from the reader's tvd and tvf streams,
 * which must already be positioned at the first document.
 */
final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
  final long tvdBegin = tvd.getFilePointer();
  final long tvfBegin = tvf.getFilePointer();
  long tvdEnd = tvdBegin;
  long tvfEnd = tvfBegin;

  // Index entries first: each document starts where the
  // previous one ended
  for (int doc = 0; doc < numDocs; doc++) {
    tvx.writeLong(tvdEnd);
    tvdEnd += tvdLengths[doc];
    tvx.writeLong(tvfEnd);
    tvfEnd += tvfLengths[doc];
  }

  final long tvdBytes = tvdEnd - tvdBegin;
  final long tvfBytes = tvfEnd - tvfBegin;
  tvd.copyBytes(reader.getTvdStream(), tvdBytes);
  tvf.copyBytes(reader.getTvfStream(), tvfBytes);

  assert tvd.getFilePointer() == tvdEnd;
  assert tvf.getFilePointer() == tvfEnd;
}
/** Close all streams. */
final void close() throws IOException {
// make an effort to close all streams we can but remember and re-throw

View File

@ -18,10 +18,16 @@ package org.apache.lucene.index;
*/
import org.apache.lucene.util.LuceneTestCase;
import java.util.Vector;
import java.util.Arrays;
import java.io.ByteArrayOutputStream;
import java.io.ObjectOutputStream;
import java.util.Enumeration;
import java.util.zip.ZipFile;
import java.util.zip.ZipEntry;
import java.io.OutputStream;
import java.io.InputStream;
import java.io.FileOutputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.File;
@ -33,9 +39,6 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.*;
import java.util.*;
import java.util.zip.*;
/*
Verify we can read the pre-2.1 file format, do searches
@ -50,11 +53,11 @@ public class TestBackwardsCompatibility extends LuceneTestCase
/*
public void testCreatePreLocklessCFS() throws IOException {
createIndex("src/test/org/apache/lucene/index/index.prelockless.cfs", true);
createIndex("src/test/org/apache/lucene/index/index.cfs", true);
}
public void testCreatePreLocklessNoCFS() throws IOException {
createIndex("src/test/org/apache/lucene/index/index.prelockless.nocfs", false);
createIndex("src/test/org/apache/lucene/index/index.nocfs", false);
}
*/
@ -106,10 +109,14 @@ public class TestBackwardsCompatibility extends LuceneTestCase
rmDir(dirName);
}
final String[] oldNames = {"prelockless.cfs",
"prelockless.nocfs",
"presharedstores.cfs",
"presharedstores.nocfs"};
final String[] oldNames = {"19.cfs",
"19.nocfs",
"20.cfs",
"20.nocfs",
"21.cfs",
"21.nocfs",
"22.cfs",
"22.nocfs"};
public void testSearchOldIndex() throws IOException {
for(int i=0;i<oldNames.length;i++) {
@ -146,6 +153,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase
}
}
// Asserts the hit count, then loads the stored document and
// the term vectors of every hit so that any read failure
// surfaces as an exception.
private void testHits(Hits hits, int expectedCount, IndexReader reader) throws IOException {
  final int total = hits.length();
  assertEquals("wrong number of hits", expectedCount, total);
  int rank = 0;
  while (rank < total) {
    hits.doc(rank);
    reader.getTermFreqVectors(hits.id(rank));
    rank++;
  }
}
public void searchIndex(String dirName) throws IOException {
//QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
//Query query = parser.parse("handle:1");
@ -156,12 +172,14 @@ public class TestBackwardsCompatibility extends LuceneTestCase
IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
assertEquals(34, hits.length());
Document d = hits.doc(0);
// First document should be #21 since its norm was increased:
// First document should be #21 since its norm was
// increased:
Document d = hits.doc(0);
assertEquals("didn't get the right document first", "21", d.get("id"));
testHits(hits, 34, searcher.getIndexReader());
searcher.close();
dir.close();
}
@ -189,9 +207,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase
// make sure searching sees right # hits
IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
assertEquals("wrong number of hits", 44, hits.length());
Document d = hits.doc(0);
assertEquals("wrong first document", "21", d.get("id"));
testHits(hits, 44, searcher.getIndexReader());
searcher.close();
// make sure we can do delete & setNorm against this
@ -209,6 +227,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
assertEquals("wrong number of hits", 43, hits.length());
d = hits.doc(0);
assertEquals("wrong first document", "22", d.get("id"));
testHits(hits, 43, searcher.getIndexReader());
searcher.close();
// optimize
@ -220,6 +239,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
hits = searcher.search(new TermQuery(new Term("content", "aaa")));
assertEquals("wrong number of hits", 43, hits.length());
d = hits.doc(0);
testHits(hits, 43, searcher.getIndexReader());
assertEquals("wrong first document", "22", d.get("id"));
searcher.close();
@ -257,6 +277,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
assertEquals("wrong number of hits", 33, hits.length());
d = hits.doc(0);
assertEquals("wrong first document", "22", d.get("id"));
testHits(hits, 33, searcher.getIndexReader());
searcher.close();
// optimize
@ -269,6 +290,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
assertEquals("wrong number of hits", 33, hits.length());
d = hits.doc(0);
assertEquals("wrong first document", "22", d.get("id"));
testHits(hits, 33, searcher.getIndexReader());
searcher.close();
dir.close();
@ -283,6 +305,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
Directory dir = FSDirectory.getDirectory(dirName);
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setUseCompoundFile(doCFS);
writer.setMaxBufferedDocs(10);
for(int i=0;i<35;i++) {
addDoc(writer, i);
@ -393,6 +416,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
Document doc = new Document();
doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.addDocument(doc);
}

View File

@ -1937,9 +1937,10 @@ public class TestIndexWriter extends LuceneTestCase
for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j))
numDel++;
else
else {
reader.document(j);
reader.getTermFreqVectors(j);
reader.getTermFreqVectors(j);
}
}
reader.close();
@ -1963,9 +1964,10 @@ public class TestIndexWriter extends LuceneTestCase
for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j))
numDel++;
else
else {
reader.document(j);
reader.getTermFreqVectors(j);
reader.getTermFreqVectors(j);
}
}
reader.close();
assertEquals(0, numDel);
@ -2053,9 +2055,10 @@ public class TestIndexWriter extends LuceneTestCase
for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j))
numDel++;
else
else {
reader.document(j);
reader.getTermFreqVectors(j);
reader.getTermFreqVectors(j);
}
}
reader.close();
@ -2079,9 +2082,10 @@ public class TestIndexWriter extends LuceneTestCase
for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j))
numDel++;
else
else {
reader.document(j);
reader.getTermFreqVectors(j);
reader.getTermFreqVectors(j);
}
}
reader.close();
assertEquals(0, numDel);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.