mirror of https://github.com/apache/lucene.git
LUCENE-2205: port to trunk in preflex codec; port TestPagedBytes and PagedBytes.DataInput/Output to trunk
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1190017 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 60079a441f
commit 00da65393c
CHANGES.txt
@@ -723,6 +723,9 @@ Optimizations
   FilteredQuery/IndexSearcher added by LUCENE-1536 to Lucene 4.0.
   (Uwe Schindler)
 
+* LUCENE-2205: Very substantial (3-5X) RAM reduction required to hold
+  the terms index on opening an IndexReader (Aaron McCurry via Mike McCandless)
+
 Test Cases
 
 * LUCENE-3420: Disable the finalness checks in TokenStream and Analyzer
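For intuition about where the 3-5X comes from: the old TermInfosReader held three parallel object arrays (Term[], TermInfo[], long[], removed below), while the new TermInfosReaderIndex (added below) keeps one shared byte blob plus packed offsets. A back-of-envelope sketch only; the per-object sizes are assumed 64-bit JVM figures, not measurements from the issue:

// Illustrative-only comparison of per-entry RAM for the two layouts.
// All object sizes are assumptions, not measurements.
public class TermsIndexRamSketch {
  public static void main(String[] args) {
    final long numIndexTerms = 1000000;           // hypothetical segment
    final int avgTermChars = 10;

    // Old: Term + String + char[] + TermInfo objects + a long[] slot per entry.
    long oldPerTerm = 24                          // Term (header + 2 refs)
                    + 32 + 16 + 2 * avgTermChars  // String + char[] (UTF-16)
                    + 40                          // TermInfo (header + fields)
                    + 8;                          // indexPointers[i]
    // New: UTF-8 text plus a few vInt/vLong bytes in one shared blob,
    // plus a packed offset (often well under 8 bytes per entry).
    long newPerTerm = avgTermChars + 8 + 4;

    System.out.println("old ~" + ((numIndexTerms * oldPerTerm) >> 20)
        + " MB, new ~" + ((numIndexTerms * newPerTerm) >> 20) + " MB");
  }
}

The printed ratio lands in the same ballpark as the 3-5X reported in the entry; the exact factor depends on term length and JVM overheads.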
src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
@@ -47,9 +47,8 @@ public final class TermInfosReader {
   private final SegmentTermEnum origEnum;
   private final long size;
 
-  private final Term[] indexTerms;
-  private final TermInfo[] indexInfos;
-  private final long[] indexPointers;
+  private final TermInfosReaderIndex index;
+  private final int indexLength;
 
   private final int totalIndexInterval;
 
@@ -118,37 +117,23 @@ public final class TermInfosReader {
       if (indexDivisor != -1) {
         // Load terms index
         totalIndexInterval = origEnum.indexInterval * indexDivisor;
-        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION),
+        final String indexFileName = IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION);
+        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(indexFileName,
                                                                                   context), fieldInfos, true);
 
         try {
-          int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;  // otherwise read index
-          indexTerms = new Term[indexSize];
-          indexInfos = new TermInfo[indexSize];
-          indexPointers = new long[indexSize];
-
-          for (int i=0;indexEnum.next(); i++) {
-            indexTerms[i] = indexEnum.term();
-            assert indexTerms[i] != null;
-            assert indexTerms[i].text() != null;
-            assert indexTerms[i].field() != null;
-            indexInfos[i] = indexEnum.termInfo();
-            indexPointers[i] = indexEnum.indexPointer;
-
-            for (int j = 1; j < indexDivisor; j++)
-              if (!indexEnum.next())
-                break;
-          }
+          // nocommit don't cast to int..
+          index = new TermInfosReaderIndex(indexEnum, indexDivisor, (int) dir.fileLength(indexFileName), totalIndexInterval);
+          indexLength = index.length();
         } finally {
           indexEnum.close();
         }
       } else {
         // Do not load terms index:
         totalIndexInterval = -1;
-        indexTerms = null;
-        indexInfos = null;
-        indexPointers = null;
+        index = null;
+        indexLength = -1;
       }
       success = true;
     } finally {
@@ -203,31 +188,6 @@ public final class TermInfosReader {
     }
   }
 
-  /** Returns the offset of the greatest index entry which is less than or equal to term.*/
-  private int getIndexOffset(Term term) {
-    int lo = 0;                                      // binary search indexTerms[]
-    int hi = indexTerms.length - 1;
-
-    while (hi >= lo) {
-      int mid = (lo + hi) >>> 1;
-      assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid;
-      int delta = compareAsUTF16(term, indexTerms[mid]);
-      if (delta < 0)
-        hi = mid - 1;
-      else if (delta > 0)
-        lo = mid + 1;
-      else
-        return mid;
-    }
-    return hi;
-  }
-
-  private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
-    enumerator.seek(indexPointers[indexOffset],
-                    ((long) indexOffset * totalIndexInterval) - 1,
-                    indexTerms[indexOffset], indexInfos[indexOffset]);
-  }
-
   /** Returns the TermInfo for a Term in the set, or null. */
   TermInfo get(Term term) throws IOException {
     return get(term, false);
@@ -272,8 +232,8 @@ public final class TermInfosReader {
         && ((enumerator.prev() != null && compareAsUTF16(term, enumerator.prev())> 0)
             || compareAsUTF16(term, enumerator.term()) >= 0)) {
       int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
-      if (indexTerms.length == enumOffset    // but before end of block
-          || compareAsUTF16(term, indexTerms[enumOffset]) < 0) {
+      if (indexLength == enumOffset    // but before end of block
+          || index.compareTo(term, enumOffset) < 0) {
        // no need to seek
 
         final TermInfo ti;
@@ -309,10 +269,10 @@ public final class TermInfosReader {
       indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
     } else {
       // Must do binary search:
-      indexPos = getIndexOffset(term);
+      indexPos = index.getIndexOffset(term);
     }
 
-    seekEnum(enumerator, indexPos);
+    index.seekEnum(enumerator, indexPos);
     enumerator.scanTo(term);
     final TermInfo ti;
 
@@ -352,7 +312,7 @@ public final class TermInfosReader {
   }
 
   private void ensureIndexIsRead() {
-    if (indexTerms == null) {
+    if (index == null) {
       throw new IllegalStateException("terms index was not loaded when this reader was created");
     }
   }
@@ -362,10 +322,10 @@ public final class TermInfosReader {
     if (size == 0) return -1;
 
     ensureIndexIsRead();
-    int indexOffset = getIndexOffset(term);
+    int indexOffset = index.getIndexOffset(term);
 
     SegmentTermEnum enumerator = getThreadResources().termEnum;
-    seekEnum(enumerator, indexOffset);
+    index.seekEnum(enumerator, indexOffset);
 
     while(compareAsUTF16(term, enumerator.term()) > 0 && enumerator.next()) {}
 
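The two removed private methods reappear on TermInfosReaderIndex (below), but they now walk serialized entries instead of object arrays. A minimal standalone sketch of that pattern, not the commit's code (the real version compares BytesRef values with a UTF8-sorted-as-UTF16 comparator and decodes through PagedBytesDataInput):

import java.nio.charset.StandardCharsets;

class SerializedBinarySearch {
  // Returns the slot of the greatest entry <= target, or -1 if target
  // precedes everything: the same contract as the removed getIndexOffset.
  static int search(byte[] blob, int[] offsets, int[] lengths, String target) {
    int lo = 0;
    int hi = offsets.length - 1;
    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;
      // Decode only the probed entry; nothing is materialized up front.
      String midTerm = new String(blob, offsets[mid], lengths[mid], StandardCharsets.UTF_8);
      int delta = target.compareTo(midTerm);
      if (delta < 0) hi = mid - 1;
      else if (delta > 0) lo = mid + 1;
      else return mid;
    }
    return hi;
  }
}

The search cost stays O(log n) per lookup; only the probed entries are ever decoded, which is what lets the whole index live as flat bytes.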
src/java/org/apache/lucene/index/codecs/preflex/TermInfosReaderIndex.java (new file)
@@ -0,0 +1,252 @@
package org.apache.lucene.index.codecs.preflex;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.index.Term;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;

/**
 * This stores a monotonically increasing set of <Term, TermInfo> pairs in an
 * index segment. Pairs are accessed either by Term or by ordinal position in
 * the set. The Terms and TermInfos are actually serialized and stored in a
 * byte array, and pointers to the position of each are stored in an int array.
 */
class TermInfosReaderIndex {

  private static final int MAX_PAGE_BITS = 18; // 256 KB block
  private Term[] fields;
  private int totalIndexInterval;
  private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
  private final PagedBytesDataInput dataInput;
  private final PackedInts.Reader indexToDataOffset;
  private final int indexSize;
  private final int skipInterval;

  /**
   * Loads the segment information at segment load time.
   *
   * @param indexEnum
   *          the term enum.
   * @param indexDivisor
   *          the index divisor.
   * @param tiiFileLength
   *          the size of the tii file, used to approximate the size of the
   *          buffer.
   * @param totalIndexInterval
   *          the total index interval.
   */
  TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
    this.totalIndexInterval = totalIndexInterval;
    indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
    skipInterval = indexEnum.skipInterval;
    // this is only an initial size, it will be GCed once the build is complete
    long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
    PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
    PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();

    GrowableWriter indexToTerms = new GrowableWriter(4, indexSize, false);
    String currentField = null;
    List<String> fieldStrs = new ArrayList<String>();
    int fieldCounter = -1;
    for (int i = 0; indexEnum.next(); i++) {
      Term term = indexEnum.term();
      if (currentField == null || !currentField.equals(term.field())) {
        currentField = term.field();
        fieldStrs.add(currentField);
        fieldCounter++;
      }
      TermInfo termInfo = indexEnum.termInfo();
      indexToTerms.set(i, dataOutput.getPosition());
      dataOutput.writeVInt(fieldCounter);
      dataOutput.writeString(term.text());
      dataOutput.writeVInt(termInfo.docFreq);
      if (termInfo.docFreq >= skipInterval) {
        dataOutput.writeVInt(termInfo.skipOffset);
      }
      dataOutput.writeVLong(termInfo.freqPointer);
      dataOutput.writeVLong(termInfo.proxPointer);
      dataOutput.writeVLong(indexEnum.indexPointer);
      for (int j = 1; j < indexDivisor; j++) {
        if (!indexEnum.next()) {
          break;
        }
      }
    }

    fields = new Term[fieldStrs.size()];
    for (int i = 0; i < fields.length; i++) {
      fields[i] = new Term(fieldStrs.get(i));
    }

    dataPagedBytes.freeze(true);
    dataInput = dataPagedBytes.getDataInput();
    indexToDataOffset = indexToTerms.getMutable();
  }

  private static int estimatePageBits(long estSize) {
    return Math.max(Math.min(64 - BitUtil.nlz(estSize), MAX_PAGE_BITS), 4);
  }

  void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();

    input.setPosition(indexToDataOffset.get(indexOffset));

    // read the term
    int fieldId = input.readVInt();
    Term field = fields[fieldId];
    Term term = new Term(field.field(), input.readString());

    // read the terminfo
    TermInfo termInfo = new TermInfo();
    termInfo.docFreq = input.readVInt();
    if (termInfo.docFreq >= skipInterval) {
      termInfo.skipOffset = input.readVInt();
    } else {
      termInfo.skipOffset = 0;
    }
    termInfo.freqPointer = input.readVLong();
    termInfo.proxPointer = input.readVLong();

    long pointer = input.readVLong();

    // perform the seek
    enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
  }

  /**
   * Binary search for the given term.
   *
   * @param term
   *          the term to locate.
   * @throws IOException
   */
  int getIndexOffset(Term term) throws IOException {
    int lo = 0;
    int hi = indexSize - 1;
    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
    BytesRef scratch = new BytesRef();
    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;
      int delta = compareTo(term, mid, input, scratch);
      if (delta < 0)
        hi = mid - 1;
      else if (delta > 0)
        lo = mid + 1;
      else
        return mid;
    }
    return hi;
  }

  /**
   * Gets the term at the given position. For testing.
   *
   * @param termIndex
   *          the position to read the term from the index.
   * @return the term.
   * @throws IOException
   */
  Term getTerm(int termIndex) throws IOException {
    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
    input.setPosition(indexToDataOffset.get(termIndex));

    // read the term
    int fieldId = input.readVInt();
    Term field = fields[fieldId];
    return new Term(field.field(), input.readString());
  }

  /**
   * Returns the number of terms.
   *
   * @return int.
   */
  int length() {
    return indexSize;
  }

  /**
   * Compares the given term against the term in the index specified by the
   * term index, i.e. it returns negative N when term is less than the index term.
   *
   * @param term
   *          the given term.
   * @param termIndex
   *          the index of the term to compare.
   * @return int.
   * @throws IOException
   */
  int compareTo(Term term, int termIndex) throws IOException {
    return compareTo(term, termIndex, (PagedBytesDataInput) dataInput.clone(), new BytesRef());
  }

  /**
   * Compare the fields of the terms first, and if not equal return from
   * compare. If equal, compare the terms.
   *
   * @param term
   *          the term to compare.
   * @param termIndex
   *          the position of the term in the input to compare
   * @param input
   *          the input buffer.
   * @return int.
   * @throws IOException
   */
  private int compareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRef reuse) throws IOException {
    // if term field does not equal mid's field index, then compare fields
    // else if they are equal, compare term's string values...
    int c = compareField(term, termIndex, input);
    if (c == 0) {
      reuse.length = input.readVInt();
      reuse.grow(reuse.length);
      input.readBytes(reuse.bytes, 0, reuse.length);
      return comparator.compare(term.bytes(), reuse);
    }
    return c;
  }

  /**
   * Compares the fields before checking the text of the terms.
   *
   * @param term
   *          the given term.
   * @param termIndex
   *          the term that exists in the data block.
   * @param input
   *          the data block.
   * @return int.
   * @throws IOException
   */
  private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
    input.setPosition(indexToDataOffset.get(termIndex));
    return term.field().compareTo(fields[input.readVInt()].field());
  }
}
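Each index entry in the blob above is written as: vInt field ordinal, the term's text, vInt docFreq, vInt skipOffset (only when docFreq >= skipInterval), then vLong freqPointer, proxPointer, and indexPointer. The variable-length encoding is what keeps the entries small, and estimatePageBits simply clamps the bit length of the estimated size (64 - BitUtil.nlz(estSize)) between 4 and MAX_PAGE_BITS, i.e. pages between 16 B and 256 KB. A self-contained sketch of Lucene's 7-bits-per-byte vInt format (my code, not the commit's; Lucene routes this through DataOutput.writeVInt):

import java.io.ByteArrayOutputStream;

class VIntSketch {
  // Low 7 bits per byte, high bit set while more bytes follow.
  static void writeVInt(ByteArrayOutputStream out, int i) {
    while ((i & ~0x7F) != 0) {      // more than 7 significant bits left
      out.write((i & 0x7F) | 0x80); // emit low 7 bits with continuation flag
      i >>>= 7;
    }
    out.write(i);                   // final byte, high bit clear
  }

  public static void main(String[] args) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    writeVInt(out, 5);        // 1 byte
    writeVInt(out, 300);      // 2 bytes
    writeVInt(out, 1 << 20);  // 3 bytes
    System.out.println(out.size() + " bytes for three ints"); // 6, not 12
  }
}

Small docFreqs and delta-friendly pointers therefore cost one or two bytes each instead of fixed 4- or 8-byte slots.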
src/java/org/apache/lucene/util/PagedBytes.java
@@ -17,12 +17,14 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
-import org.apache.lucene.store.IndexInput;
-
-import java.util.List;
-import java.util.ArrayList;
 import java.io.Closeable;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
 
 /** Represents a logical byte[] as a series of pages.  You
  *  can write-once into the logical byte[] (append only),
@@ -37,6 +39,8 @@ public final class PagedBytes {
   private final int blockSize;
   private final int blockBits;
   private final int blockMask;
+  private boolean didSkipBytes;
+  private boolean frozen;
   private int upto;
   private byte[] currentBlock;
 
@@ -320,6 +324,7 @@ public final class PagedBytes {
     if (currentBlock != null) {
       blocks.add(currentBlock);
       blockEnd.add(upto);
+      didSkipBytes = true;
     }
     currentBlock = new byte[blockSize];
     upto = 0;
@@ -338,6 +343,12 @@ public final class PagedBytes {
 
   /** Commits final byte[], trimming it if necessary and if trim=true */
   public Reader freeze(boolean trim) {
+    if (frozen) {
+      throw new IllegalStateException("already frozen");
+    }
+    if (didSkipBytes) {
+      throw new IllegalStateException("cannot freeze when copy(BytesRef, BytesRef) was used");
+    }
     if (trim && upto < blockSize) {
       final byte[] newBlock = new byte[upto];
       System.arraycopy(currentBlock, 0, newBlock, 0, upto);
@@ -348,6 +359,7 @@ public final class PagedBytes {
     }
     blocks.add(currentBlock);
     blockEnd.add(upto);
+    frozen = true;
     currentBlock = null;
     return new Reader(this);
   }
@@ -389,4 +401,150 @@ public final class PagedBytes {
 
     return pointer;
   }
+
+  public final class PagedBytesDataInput extends DataInput {
+    private int currentBlockIndex;
+    private int currentBlockUpto;
+    private byte[] currentBlock;
+
+    PagedBytesDataInput() {
+      currentBlock = blocks.get(0);
+    }
+
+    @Override
+    public Object clone() {
+      PagedBytesDataInput clone = getDataInput();
+      clone.setPosition(getPosition());
+      return clone;
+    }
+
+    /** Returns the current byte position. */
+    public long getPosition() {
+      return currentBlockIndex * blockSize + currentBlockUpto;
+    }
+
+    /** Seek to a position previously obtained from
+     *  {@link #getPosition}. */
+    public void setPosition(long pos) {
+      currentBlockIndex = (int) (pos >> blockBits);
+      currentBlock = blocks.get(currentBlockIndex);
+      currentBlockUpto = (int) (pos & blockMask);
+    }
+
+    @Override
+    public byte readByte() {
+      if (currentBlockUpto == blockSize) {
+        nextBlock();
+      }
+      return currentBlock[currentBlockUpto++];
+    }
+
+    @Override
+    public void readBytes(byte[] b, int offset, int len) {
+      final int offsetEnd = offset + len;
+      while (true) {
+        final int blockLeft = blockSize - currentBlockUpto;
+        final int left = offsetEnd - offset;
+        if (blockLeft < left) {
+          System.arraycopy(currentBlock, currentBlockUpto,
+                           b, offset,
+                           blockLeft);
+          nextBlock();
+          offset += blockLeft;
+        } else {
+          // Last block
+          System.arraycopy(currentBlock, currentBlockUpto,
+                           b, offset,
+                           left);
+          currentBlockUpto += left;
+          break;
+        }
+      }
+    }
+
+    private void nextBlock() {
+      currentBlockIndex++;
+      currentBlockUpto = 0;
+      currentBlock = blocks.get(currentBlockIndex);
+    }
+  }
+
+  public final class PagedBytesDataOutput extends DataOutput {
+    @Override
+    public void writeByte(byte b) {
+      if (upto == blockSize) {
+        if (currentBlock != null) {
+          blocks.add(currentBlock);
+          blockEnd.add(upto);
+        }
+        currentBlock = new byte[blockSize];
+        upto = 0;
+      }
+      currentBlock[upto++] = b;
+    }
+
+    @Override
+    public void writeBytes(byte[] b, int offset, int length) throws IOException {
+      if (length == 0) {
+        return;
+      }
+
+      if (upto == blockSize) {
+        if (currentBlock != null) {
+          blocks.add(currentBlock);
+          blockEnd.add(upto);
+        }
+        currentBlock = new byte[blockSize];
+        upto = 0;
+      }
+
+      final int offsetEnd = offset + length;
+      while(true) {
+        final int left = offsetEnd - offset;
+        final int blockLeft = blockSize - upto;
+        if (blockLeft < left) {
+          System.arraycopy(b, offset, currentBlock, upto, blockLeft);
+          blocks.add(currentBlock);
+          blockEnd.add(blockSize);
+          currentBlock = new byte[blockSize];
+          upto = 0;
+          offset += blockLeft;
+        } else {
+          // Last block
+          System.arraycopy(b, offset, currentBlock, upto, left);
+          upto += left;
+          break;
+        }
+      }
+    }
+
+    /** Return the current byte position. */
+    public long getPosition() {
+      if (currentBlock == null) {
+        return 0;
+      } else {
+        return blocks.size() * blockSize + upto;
+      }
+    }
+  }
+
+  /** Returns a DataInput to read values from this
+   *  PagedBytes instance. */
+  public PagedBytesDataInput getDataInput() {
+    if (!frozen) {
+      throw new IllegalStateException("must call freeze() before getDataInput");
+    }
+    return new PagedBytesDataInput();
+  }
+
+  /** Returns a DataOutput that you may use to write into
+   *  this PagedBytes instance.  If you do this, you should
+   *  not call the other writing methods (eg, copy);
+   *  results are undefined. */
+  public PagedBytesDataOutput getDataOutput() {
+    if (frozen) {
+      throw new IllegalStateException("cannot get DataOutput after freeze()");
+    }
+    return new PagedBytesDataOutput();
+  }
 }
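The new inner classes give PagedBytes a strict write-once lifecycle: write through the DataOutput, freeze, then read through the DataInput. A hypothetical round trip pieced together from the calls this diff adds (the block size of 2^10 is an arbitrary choice for illustration):

import java.io.IOException;

import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;

class PagedBytesRoundTrip {
  static void demo() throws IOException {
    PagedBytes bytes = new PagedBytes(10);            // 2^10 = 1 KB pages
    PagedBytesDataOutput out = bytes.getDataOutput(); // must come before freeze()
    long pos = out.getPosition();                     // offset of the next write
    out.writeVInt(42);                                // any DataOutput method works
    bytes.freeze(true);                               // trim last page, forbid writes
    PagedBytesDataInput in = bytes.getDataInput();    // must come after freeze()
    in.setPosition(pos);                              // random access by saved offset
    int value = in.readVInt();                        // reads back 42
    assert value == 42;
  }
}

Calling getDataOutput() after freeze(), or getDataInput() before it, throws IllegalStateException per the checks added above; that ordering is exactly what TermInfosReaderIndex relies on.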
src/test/org/apache/lucene/index/codecs/preflex/TestTermInfosReaderIndex.java (new file)
@@ -0,0 +1,193 @@
package org.apache.lucene.index.codecs.preflex;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.CoreCodecProvider;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

public class TestTermInfosReaderIndex extends LuceneTestCase {

  private static final int NUMBER_OF_DOCUMENTS = 1000;
  private static final int NUMBER_OF_FIELDS = 100;
  private TermInfosReaderIndex index;
  private Directory directory;
  private SegmentTermEnum termEnum;
  private int indexDivisor;
  private int termIndexInterval;
  private IndexReader reader;
  private List<Term> sampleTerms;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    indexDivisor = _TestUtil.nextInt(random, 1, 10);
    directory = newDirectory();
    termIndexInterval = populate(directory);

    IndexReader r0 = IndexReader.open(directory);
    SegmentReader r = (SegmentReader) r0.getSequentialSubReaders()[0];
    String segment = r.getSegmentName();
    r.close();

    FieldInfos fieldInfos = new FieldInfos(directory, IndexFileNames.segmentFileName(segment, "", IndexFileNames.FIELD_INFOS_EXTENSION));
    String segmentFileName = IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION);
    long tiiFileLength = directory.fileLength(segmentFileName);
    IndexInput input = directory.openInput(segmentFileName, newIOContext(random));
    termEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_EXTENSION), newIOContext(random)), fieldInfos, false);
    int totalIndexInterval = termEnum.indexInterval * indexDivisor;

    SegmentTermEnum indexEnum = new SegmentTermEnum(input, fieldInfos, true);
    index = new TermInfosReaderIndex(indexEnum, indexDivisor, tiiFileLength, totalIndexInterval);
    indexEnum.close();
    input.close();

    reader = IndexReader.open(directory);
    sampleTerms = sample(reader, 1000);
  }

  @Override
  public void tearDown() throws Exception {
    termEnum.close();
    reader.close();
    directory.close();
    super.tearDown();
  }

  public void testSeekEnum() throws CorruptIndexException, IOException {
    int indexPosition = 3;
    SegmentTermEnum clone = (SegmentTermEnum) termEnum.clone();
    Term term = findTermThatWouldBeAtIndex(clone, indexPosition);
    SegmentTermEnum enumerator = clone;
    index.seekEnum(enumerator, indexPosition);
    assertEquals(term, enumerator.term());
    clone.close();
  }

  public void testCompareTo() throws IOException {
    Term term = new Term("field" + random.nextInt(NUMBER_OF_FIELDS), getText());
    for (int i = 0; i < index.length(); i++) {
      Term t = index.getTerm(i);
      int compareTo = term.compareTo(t);
      assertEquals(compareTo, index.compareTo(term, i));
    }
  }

  public void testRandomSearchPerformance() throws CorruptIndexException, IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    for (Term t : sampleTerms) {
      TermQuery query = new TermQuery(t);
      TopDocs topDocs = searcher.search(query, 10);
      assertTrue(topDocs.totalHits > 0);
    }
    searcher.close();
  }

  private List<Term> sample(IndexReader reader, int size) throws IOException {
    List<Term> sample = new ArrayList<Term>();
    Random random = new Random();
    FieldsEnum fieldsEnum = MultiFields.getFields(reader).iterator();
    String field;
    while((field = fieldsEnum.next()) != null) {
      TermsEnum terms = fieldsEnum.terms();
      while (terms.next() != null) {
        if (sample.size() >= size) {
          int pos = random.nextInt(size);
          sample.set(pos, new Term(field, terms.term()));
        } else {
          sample.add(new Term(field, terms.term()));
        }
      }
    }
    Collections.shuffle(sample);
    return sample;
  }

  private Term findTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index) throws IOException {
    int termPosition = index * termIndexInterval * indexDivisor;
    for (int i = 0; i < termPosition; i++) {
      if (!termEnum.next()) {
        fail("Should not have run out of terms.");
      }
    }
    return termEnum.term();
  }

  private int populate(Directory directory) throws CorruptIndexException, LockObtainFailedException, IOException {
    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
        new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
    CoreCodecProvider cp = new CoreCodecProvider();
    cp.unregister(cp.lookup("PreFlex"));
    cp.register(new PreFlexRWCodec());
    cp.setDefaultFieldCodec("PreFlex");
    config.setCodecProvider(cp);
    // turn off compound file, this test will open some index files directly.
    LogMergePolicy mp = newLogMergePolicy();
    mp.setUseCompoundFile(false);
    config.setMergePolicy(mp);

    RandomIndexWriter writer = new RandomIndexWriter(random, directory, config);
    for (int i = 0; i < NUMBER_OF_DOCUMENTS; i++) {
      Document document = new Document();
      for (int f = 0; f < NUMBER_OF_FIELDS; f++) {
        document.add(newField("field" + f, getText(), StringField.TYPE_UNSTORED));
      }
      writer.addDocument(document);
    }
    writer.optimize();
    writer.close();
    return config.getTermIndexInterval();
  }

  private String getText() {
    return Long.toString(random.nextLong(), Character.MAX_RADIX);
  }
}
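findTermThatWouldBeAtIndex encodes the invariant the index is built on: entry i points at term ordinal i * termIndexInterval * indexDivisor, since the writer records every termIndexInterval-th term and the reader keeps only every indexDivisor-th of those. A worked example with illustrative values (128 is the IndexWriterConfig default interval; the divisor is hypothetical, the test randomizes it):

public class IndexSpacingSketch {
  public static void main(String[] args) {
    int termIndexInterval = 128;   // IndexWriterConfig default
    int indexDivisor = 2;          // hypothetical; the test picks 1..10
    for (int i = 0; i < 4; i++) {
      System.out.println("index entry " + i + " -> term ordinal "
          + (i * termIndexInterval * indexDivisor));
    }
    // prints ordinals 0, 256, 512, 768
  }
}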
src/test/org/apache/lucene/util/TestPagedBytes.java (new file)
@@ -0,0 +1,64 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.util;

import java.util.Arrays;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

public class TestPagedBytes extends LuceneTestCase {

  public void testDataInputOutput() throws Exception {
    for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) {
      final PagedBytes p = new PagedBytes(_TestUtil.nextInt(random, 1, 20));
      final DataOutput out = p.getDataOutput();
      final int numBytes = random.nextInt(10000000);

      final byte[] answer = new byte[numBytes];
      random.nextBytes(answer);
      int written = 0;
      while(written < numBytes) {
        if (random.nextInt(10) == 7) {
          out.writeByte(answer[written++]);
        } else {
          // cap the chunk at the bytes remaining so we never run past the array
          int chunk = Math.min(random.nextInt(1000), numBytes - written);
          out.writeBytes(answer, written, chunk);
          written += chunk;
        }
      }

      p.freeze(random.nextBoolean());

      final DataInput in = p.getDataInput();

      final byte[] verify = new byte[numBytes];
      int read = 0;
      while(read < numBytes) {
        if (random.nextInt(10) == 7) {
          verify[read++] = in.readByte();
        } else {
          // same bound on the read side
          int chunk = Math.min(random.nextInt(1000), numBytes - read);
          in.readBytes(verify, read, chunk);
          read += chunk;
        }
      }
      assertTrue(Arrays.equals(answer, verify));
    }
  }
}