LUCENE-2205: port to trunk in preflex codec; port TestPagedBytes and PagedBytes.DataInput/Output to trunk
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1190017 13f79535-47bb-0310-9956-ffa450edef68
parent 60079a441f
commit 00da65393c

CHANGES.txt
@@ -723,6 +723,9 @@ Optimizations
   FilteredQuery/IndexSearcher added by LUCENE-1536 to Lucene 4.0.
   (Uwe Schindler)
 
+* LUCENE-2205: Very substantial (3-5X) RAM reduction required to hold
+  the terms index on opening an IndexReader (Aaron McCurry via Mike McCandless)
+
 Test Cases
 
 * LUCENE-3420: Disable the finalness checks in TokenStream and Analyzer

TermInfosReader.java (org.apache.lucene.index.codecs.preflex)
@@ -47,9 +47,8 @@ public final class TermInfosReader {
   private final SegmentTermEnum origEnum;
   private final long size;
 
-  private final Term[] indexTerms;
-  private final TermInfo[] indexInfos;
-  private final long[] indexPointers;
+  private final TermInfosReaderIndex index;
+  private final int indexLength;
 
   private final int totalIndexInterval;
 

@@ -118,37 +117,23 @@ public final class TermInfosReader {
     if (indexDivisor != -1) {
       // Load terms index
       totalIndexInterval = origEnum.indexInterval * indexDivisor;
-      final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION),
+      final String indexFileName = IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION);
+      final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(indexFileName,
                                                                                 context), fieldInfos, true);
 
       try {
-        int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;  // otherwise read index
-
-        indexTerms = new Term[indexSize];
-        indexInfos = new TermInfo[indexSize];
-        indexPointers = new long[indexSize];
-
-        for (int i=0;indexEnum.next(); i++) {
-          indexTerms[i] = indexEnum.term();
-          assert indexTerms[i] != null;
-          assert indexTerms[i].text() != null;
-          assert indexTerms[i].field() != null;
-          indexInfos[i] = indexEnum.termInfo();
-          indexPointers[i] = indexEnum.indexPointer;
-
-          for (int j = 1; j < indexDivisor; j++)
-            if (!indexEnum.next())
-              break;
-        }
+        // nocommit don't cast to int..
+        index = new TermInfosReaderIndex(indexEnum, indexDivisor, (int) dir.fileLength(indexFileName), totalIndexInterval);
+        indexLength = index.length();
       } finally {
         indexEnum.close();
       }
     } else {
       // Do not load terms index:
       totalIndexInterval = -1;
-      indexTerms = null;
-      indexInfos = null;
-      indexPointers = null;
+      index = null;
+      indexLength = -1;
     }
     success = true;
   } finally {
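
(Net effect of this hunk: instead of materializing the indexTerms/indexInfos/indexPointers arrays, the constructor now hands the same SegmentTermEnum walk to the new TermInfosReaderIndex, which serializes each sampled entry into a paged byte buffer; see the new file further down.)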

@@ -203,31 +188,6 @@ public final class TermInfosReader {
     }
   }
 
-  /** Returns the offset of the greatest index entry which is less than or equal to term.*/
-  private int getIndexOffset(Term term) {
-    int lo = 0;                               // binary search indexTerms[]
-    int hi = indexTerms.length - 1;
-
-    while (hi >= lo) {
-      int mid = (lo + hi) >>> 1;
-      assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid;
-      int delta = compareAsUTF16(term, indexTerms[mid]);
-      if (delta < 0)
-        hi = mid - 1;
-      else if (delta > 0)
-        lo = mid + 1;
-      else
-        return mid;
-    }
-    return hi;
-  }
-
-  private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
-    enumerator.seek(indexPointers[indexOffset],
-                    ((long) indexOffset * totalIndexInterval) - 1,
-                    indexTerms[indexOffset], indexInfos[indexOffset]);
-  }
-
   /** Returns the TermInfo for a Term in the set, or null. */
   TermInfo get(Term term) throws IOException {
     return get(term, false);

@@ -272,8 +232,8 @@ public final class TermInfosReader {
         && ((enumerator.prev() != null && compareAsUTF16(term, enumerator.prev()) > 0)
             || compareAsUTF16(term, enumerator.term()) >= 0)) {
       int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
-      if (indexTerms.length == enumOffset    // but before end of block
-          || compareAsUTF16(term, indexTerms[enumOffset]) < 0) {
+      if (indexLength == enumOffset    // but before end of block
+          || index.compareTo(term, enumOffset) < 0) {
        // no need to seek
 
       final TermInfo ti;

@@ -309,10 +269,10 @@ public final class TermInfosReader {
       indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
     } else {
       // Must do binary search:
-      indexPos = getIndexOffset(term);
+      indexPos = index.getIndexOffset(term);
     }
 
-    seekEnum(enumerator, indexPos);
+    index.seekEnum(enumerator, indexPos);
     enumerator.scanTo(term);
     final TermInfo ti;
 

@@ -352,7 +312,7 @@ public final class TermInfosReader {
   }
 
   private void ensureIndexIsRead() {
-    if (indexTerms == null) {
+    if (index == null) {
      throw new IllegalStateException("terms index was not loaded when this reader was created");
     }
   }

@@ -362,10 +322,10 @@ public final class TermInfosReader {
     if (size == 0) return -1;
 
     ensureIndexIsRead();
-    int indexOffset = getIndexOffset(term);
+    int indexOffset = index.getIndexOffset(term);
 
     SegmentTermEnum enumerator = getThreadResources().termEnum;
-    seekEnum(enumerator, indexOffset);
+    index.seekEnum(enumerator, indexOffset);
 
     while(compareAsUTF16(term, enumerator.term()) > 0 && enumerator.next()) {}
 
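
Why the new layout saves RAM: the removed code kept one Term (with its String and char[]) plus one TermInfo object and one long slot per indexed term, while the replacement keeps only serialized bytes plus a packed offset. A rough, illustrative estimate of the difference; every constant below is an assumed, typical 64-bit JVM overhead, not a measurement:

// Back-of-envelope comparison of the two terms-index layouts.  All per-object
// sizes here are assumptions chosen for illustration; real numbers vary by JVM.
public class TermsIndexRamEstimate {
  public static void main(String[] args) {
    long numIndexTerms = 1_000_000; // hypothetical number of indexed terms
    int avgTermChars = 12;          // hypothetical average term length

    // Old layout: Term + String + char[] + TermInfo objects, plus a long slot.
    long perEntryOld =
        16 + 8                   // Term header + field reference (assumed)
      + 16 + 8 + 4               // String header, char[] reference, bookkeeping
      + 16 + 2L * avgTermChars   // char[] header + UTF-16 characters
      + 32                       // TermInfo (docFreq + three pointers), approx
      + 8;                       // indexPointers[] slot
    // New layout: roughly the serialized entry plus a packed offset.
    long perEntryNew = avgTermChars + 12 // UTF-8 text + vInt/vLong-coded TermInfo
      + 4;                               // packed offset (upper bound)

    System.out.printf("old ~%d MB, new ~%d MB (~%.1fx)%n",
        numIndexTerms * perEntryOld >> 20,
        numIndexTerms * perEntryNew >> 20,
        (double) perEntryOld / perEntryNew);
  }
}

Under these assumed numbers the ratio comes out around 4.7x, consistent with the 3-5X claim in the CHANGES entry.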

TermInfosReaderIndex.java (new file)
@@ -0,0 +1,252 @@
package org.apache.lucene.index.codecs.preflex;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.index.Term;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;

/**
 * This stores a monotonically increasing set of <Term, TermInfo> pairs in an
 * index segment.  Pairs are accessed either by Term or by ordinal position in
 * the set.  The Terms and TermInfos are serialized into a byte array, and
 * pointers to the position of each are stored in an int array.
 */
class TermInfosReaderIndex {

  private static final int MAX_PAGE_BITS = 18; // 256 KB block
  private Term[] fields;
  private int totalIndexInterval;
  private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
  private final PagedBytesDataInput dataInput;
  private final PackedInts.Reader indexToDataOffset;
  private final int indexSize;
  private final int skipInterval;

  /**
   * Loads the segment information at segment load time.
   *
   * @param indexEnum
   *          the term enum.
   * @param indexDivisor
   *          the index divisor.
   * @param tiiFileLength
   *          the size of the tii file, used to approximate the size of the
   *          buffer.
   * @param totalIndexInterval
   *          the total index interval.
   */
  TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
    this.totalIndexInterval = totalIndexInterval;
    indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
    skipInterval = indexEnum.skipInterval;
    // this is only an initial size, it will be GCed once the build is complete
    long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
    PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
    PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();

    GrowableWriter indexToTerms = new GrowableWriter(4, indexSize, false);
    String currentField = null;
    List<String> fieldStrs = new ArrayList<String>();
    int fieldCounter = -1;
    for (int i = 0; indexEnum.next(); i++) {
      Term term = indexEnum.term();
      if (currentField == null || !currentField.equals(term.field())) {
        currentField = term.field();
        fieldStrs.add(currentField);
        fieldCounter++;
      }
      TermInfo termInfo = indexEnum.termInfo();
      indexToTerms.set(i, dataOutput.getPosition());
      dataOutput.writeVInt(fieldCounter);
      dataOutput.writeString(term.text());
      dataOutput.writeVInt(termInfo.docFreq);
      if (termInfo.docFreq >= skipInterval) {
        dataOutput.writeVInt(termInfo.skipOffset);
      }
      dataOutput.writeVLong(termInfo.freqPointer);
      dataOutput.writeVLong(termInfo.proxPointer);
      dataOutput.writeVLong(indexEnum.indexPointer);
      for (int j = 1; j < indexDivisor; j++) {
        if (!indexEnum.next()) {
          break;
        }
      }
    }

    fields = new Term[fieldStrs.size()];
    for (int i = 0; i < fields.length; i++) {
      fields[i] = new Term(fieldStrs.get(i));
    }

    dataPagedBytes.freeze(true);
    dataInput = dataPagedBytes.getDataInput();
    indexToDataOffset = indexToTerms.getMutable();
  }
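
  // Serialized layout of each index entry in the PagedBytes image, as written
  // by the constructor above:
  //
  //   vInt  fieldId      - index into fields[]
  //   str   term text
  //   vInt  docFreq
  //   vInt  skipOffset   - only present when docFreq >= skipInterval
  //   vLong freqPointer
  //   vLong proxPointer
  //   vLong indexPointer - seek position handed to SegmentTermEnum.seek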
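
  // estimatePageBits: 64 - BitUtil.nlz(estSize) is floor(log2(estSize)) + 1 for
  // positive sizes, clamped to [4, MAX_PAGE_BITS], i.e. pages between 16 bytes
  // and 256 KB.  For example, estSize = 100000 gives 17 bits, so 128 KB pages.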
  private static int estimatePageBits(long estSize) {
    return Math.max(Math.min(64 - BitUtil.nlz(estSize), MAX_PAGE_BITS), 4);
  }

  void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();

    input.setPosition(indexToDataOffset.get(indexOffset));

    // read the term
    int fieldId = input.readVInt();
    Term field = fields[fieldId];
    Term term = new Term(field.field(), input.readString());

    // read the terminfo
    TermInfo termInfo = new TermInfo();
    termInfo.docFreq = input.readVInt();
    if (termInfo.docFreq >= skipInterval) {
      termInfo.skipOffset = input.readVInt();
    } else {
      termInfo.skipOffset = 0;
    }
    termInfo.freqPointer = input.readVLong();
    termInfo.proxPointer = input.readVLong();

    long pointer = input.readVLong();

    // perform the seek
    enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
  }

  /**
   * Binary searches for the given term.
   *
   * @param term
   *          the term to locate.
   * @throws IOException
   */
  int getIndexOffset(Term term) throws IOException {
    int lo = 0;
    int hi = indexSize - 1;
    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
    BytesRef scratch = new BytesRef();
    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;
      int delta = compareTo(term, mid, input, scratch);
      if (delta < 0)
        hi = mid - 1;
      else if (delta > 0)
        lo = mid + 1;
      else
        return mid;
    }
    return hi;
  }

  /**
   * Gets the term at the given position.  For testing.
   *
   * @param termIndex
   *          the position to read the term from the index.
   * @return the term.
   * @throws IOException
   */
  Term getTerm(int termIndex) throws IOException {
    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
    input.setPosition(indexToDataOffset.get(termIndex));

    // read the term
    int fieldId = input.readVInt();
    Term field = fields[fieldId];
    return new Term(field.field(), input.readString());
  }

  /**
   * Returns the number of terms.
   *
   * @return the number of terms.
   */
  int length() {
    return indexSize;
  }

  /**
   * Compares the given term against the term in the index specified by
   * termIndex, i.e. it returns a negative value when term is less than the
   * index term.
   *
   * @param term
   *          the given term.
   * @param termIndex
   *          the index of the term to compare.
   * @return the comparison result.
   * @throws IOException
   */
  int compareTo(Term term, int termIndex) throws IOException {
    return compareTo(term, termIndex, (PagedBytesDataInput) dataInput.clone(), new BytesRef());
  }

  /**
   * Compares the fields of the terms first, and if they are not equal returns
   * that result.  If the fields are equal, compares the term text.
   *
   * @param term
   *          the term to compare.
   * @param termIndex
   *          the position of the term in the input to compare.
   * @param input
   *          the input buffer.
   * @return the comparison result.
   * @throws IOException
   */
  private int compareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRef reuse) throws IOException {
    // if term field does not equal mid's field index, then compare fields
    // else if they are equal, compare term's string values...
    int c = compareField(term, termIndex, input);
    if (c == 0) {
      reuse.length = input.readVInt();
      reuse.grow(reuse.length);
      input.readBytes(reuse.bytes, 0, reuse.length);
      return comparator.compare(term.bytes(), reuse);
    }
    return c;
  }

  /**
   * Compares the fields before checking the text of the terms.
   *
   * @param term
   *          the given term.
   * @param termIndex
   *          the position of the term that exists in the data block.
   * @param input
   *          the data block.
   * @return the comparison result.
   * @throws IOException
   */
  private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
    input.setPosition(indexToDataOffset.get(termIndex));
    return term.field().compareTo(fields[input.readVInt()].field());
  }
}
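
The class above is the heart of the LUCENE-2205 saving: instead of one Term/TermInfo object per index entry, everything lives in a single paged byte image that is binary-searched by deserializing entries on the fly. A minimal self-contained sketch of the same idea, with hypothetical names and plain byte[]/int[] standing in for PagedBytes/PackedInts (note the real class compares UTF-8 bytes in UTF-16 order via BytesRef.getUTF8SortedAsUTF16Comparator(); this sketch uses plain unsigned-byte order):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

// Packs sorted strings into one byte[] with an offset table, then binary
// searches without materializing a String object per entry.
public class PackedStringIndex {
  private final byte[] data;
  private final int[] offsets; // offsets[i] = start of entry i; last slot = data.length

  public PackedStringIndex(String[] sortedTerms) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    offsets = new int[sortedTerms.length + 1];
    for (int i = 0; i < sortedTerms.length; i++) {
      offsets[i] = out.size();
      out.write(sortedTerms[i].getBytes(StandardCharsets.UTF_8));
    }
    offsets[sortedTerms.length] = out.size();
    data = out.toByteArray();
  }

  /** Index of the greatest entry <= key, or -1 (mirrors getIndexOffset above). */
  public int indexOffset(String key) {
    byte[] keyBytes = key.getBytes(StandardCharsets.UTF_8);
    int lo = 0, hi = offsets.length - 2;
    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;
      int cmp = compare(keyBytes, offsets[mid], offsets[mid + 1]);
      if (cmp < 0) hi = mid - 1;
      else if (cmp > 0) lo = mid + 1;
      else return mid;
    }
    return hi;
  }

  // Lexicographic compare of keyBytes against data[start..end), unsigned bytes.
  private int compare(byte[] keyBytes, int start, int end) {
    for (int i = 0; i < keyBytes.length && start + i < end; i++) {
      int diff = (keyBytes[i] & 0xff) - (data[start + i] & 0xff);
      if (diff != 0) return diff;
    }
    return keyBytes.length - (end - start);
  }

  public static void main(String[] args) throws IOException {
    PackedStringIndex idx = new PackedStringIndex(new String[] {"apple", "lucene", "zoo"});
    System.out.println(idx.indexOffset("lucene"));   // 1 (exact hit)
    System.out.println(idx.indexOffset("mango"));    // 1 (greatest entry <= key)
    System.out.println(idx.indexOffset("aardvark")); // -1 (before first entry)
  }
}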

PagedBytes.java (org.apache.lucene.util)
@@ -17,12 +17,14 @@ package org.apache.lucene.util;
  *  limitations under the License.
  */
 
-import org.apache.lucene.store.IndexInput;
-
-import java.util.List;
-import java.util.ArrayList;
 import java.io.Closeable;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
 
 /** Represents a logical byte[] as a series of pages.  You
  *  can write-once into the logical byte[] (append only),

@@ -37,6 +39,8 @@ public final class PagedBytes {
  private final int blockSize;
  private final int blockBits;
  private final int blockMask;
  private boolean didSkipBytes;
  private boolean frozen;
  private int upto;
  private byte[] currentBlock;
 

@@ -320,6 +324,7 @@ public final class PagedBytes {
    if (currentBlock != null) {
      blocks.add(currentBlock);
      blockEnd.add(upto);
      didSkipBytes = true;
    }
    currentBlock = new byte[blockSize];
    upto = 0;

@@ -338,6 +343,12 @@ public final class PagedBytes {
 
   /** Commits final byte[], trimming it if necessary and if trim=true */
   public Reader freeze(boolean trim) {
+    if (frozen) {
+      throw new IllegalStateException("already frozen");
+    }
+    if (didSkipBytes) {
+      throw new IllegalStateException("cannot freeze when copy(BytesRef, BytesRef) was used");
+    }
     if (trim && upto < blockSize) {
       final byte[] newBlock = new byte[upto];
       System.arraycopy(currentBlock, 0, newBlock, 0, upto);

@@ -348,6 +359,7 @@ public final class PagedBytes {
     }
     blocks.add(currentBlock);
     blockEnd.add(upto);
+    frozen = true;
     currentBlock = null;
     return new Reader(this);
   }

@@ -389,4 +401,150 @@ public final class PagedBytes {

    return pointer;
  }

  public final class PagedBytesDataInput extends DataInput {
    private int currentBlockIndex;
    private int currentBlockUpto;
    private byte[] currentBlock;

    PagedBytesDataInput() {
      currentBlock = blocks.get(0);
    }

    @Override
    public Object clone() {
      PagedBytesDataInput clone = getDataInput();
      clone.setPosition(getPosition());
      return clone;
    }

    /** Returns the current byte position. */
    public long getPosition() {
      return (long) currentBlockIndex * blockSize + currentBlockUpto;
    }

    /** Seek to a position previously obtained from
     *  {@link #getPosition}. */
    public void setPosition(long pos) {
      currentBlockIndex = (int) (pos >> blockBits);
      currentBlock = blocks.get(currentBlockIndex);
      currentBlockUpto = (int) (pos & blockMask);
    }

    @Override
    public byte readByte() {
      if (currentBlockUpto == blockSize) {
        nextBlock();
      }
      return currentBlock[currentBlockUpto++];
    }

    @Override
    public void readBytes(byte[] b, int offset, int len) {
      final int offsetEnd = offset + len;
      while (true) {
        final int blockLeft = blockSize - currentBlockUpto;
        final int left = offsetEnd - offset;
        if (blockLeft < left) {
          System.arraycopy(currentBlock, currentBlockUpto,
                           b, offset,
                           blockLeft);
          nextBlock();
          offset += blockLeft;
        } else {
          // Last block
          System.arraycopy(currentBlock, currentBlockUpto,
                           b, offset,
                           left);
          currentBlockUpto += left;
          break;
        }
      }
    }

    private void nextBlock() {
      currentBlockIndex++;
      currentBlockUpto = 0;
      currentBlock = blocks.get(currentBlockIndex);
    }
  }

  public final class PagedBytesDataOutput extends DataOutput {
    @Override
    public void writeByte(byte b) {
      if (upto == blockSize) {
        if (currentBlock != null) {
          blocks.add(currentBlock);
          blockEnd.add(upto);
        }
        currentBlock = new byte[blockSize];
        upto = 0;
      }
      currentBlock[upto++] = b;
    }

    @Override
    public void writeBytes(byte[] b, int offset, int length) throws IOException {
      if (length == 0) {
        return;
      }

      if (upto == blockSize) {
        if (currentBlock != null) {
          blocks.add(currentBlock);
          blockEnd.add(upto);
        }
        currentBlock = new byte[blockSize];
        upto = 0;
      }

      final int offsetEnd = offset + length;
      while (true) {
        final int left = offsetEnd - offset;
        final int blockLeft = blockSize - upto;
        if (blockLeft < left) {
          System.arraycopy(b, offset, currentBlock, upto, blockLeft);
          blocks.add(currentBlock);
          blockEnd.add(blockSize);
          currentBlock = new byte[blockSize];
          upto = 0;
          offset += blockLeft;
        } else {
          // Last block
          System.arraycopy(b, offset, currentBlock, upto, left);
          upto += left;
          break;
        }
      }
    }

    /** Return the current byte position. */
    public long getPosition() {
      if (currentBlock == null) {
        return 0;
      } else {
        return (long) blocks.size() * blockSize + upto;
      }
    }
  }

  /** Returns a DataInput to read values from this
   *  PagedBytes instance. */
  public PagedBytesDataInput getDataInput() {
    if (!frozen) {
      throw new IllegalStateException("must call freeze() before getDataInput");
    }
    return new PagedBytesDataInput();
  }

  /** Returns a DataOutput that you may use to write into
   *  this PagedBytes instance.  If you do this, you should
   *  not call the other writing methods (eg, copy);
   *  results are undefined. */
  public PagedBytesDataOutput getDataOutput() {
    if (frozen) {
      throw new IllegalStateException("cannot get DataOutput after freeze()");
    }
    return new PagedBytesDataOutput();
  }
}
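
setPosition/getPosition above depend on blockSize being a power of two, so a long position splits into a block index (high bits) and an intra-block offset (low bits). A minimal self-contained illustration; the blockBits value is arbitrary:

public class BlockAddressing {
  public static void main(String[] args) {
    final int blockBits = 10;             // hypothetical: 1 KB pages
    final int blockSize = 1 << blockBits; // 1024
    final int blockMask = blockSize - 1;  // 0x3FF

    long pos = 5000;                           // logical position in the byte[]
    int blockIndex = (int) (pos >> blockBits); // 4   (5000 / 1024)
    int blockUpto = (int) (pos & blockMask);   // 904 (5000 % 1024)

    // Recombine exactly as PagedBytesDataInput.getPosition does:
    long roundTrip = (long) blockIndex * blockSize + blockUpto;
    System.out.println(blockIndex + " " + blockUpto + " " + (roundTrip == pos));
  }
}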

TestTermInfosReaderIndex.java (new file)
@@ -0,0 +1,193 @@
package org.apache.lucene.index.codecs.preflex;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.CoreCodecProvider;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

public class TestTermInfosReaderIndex extends LuceneTestCase {

  private static final int NUMBER_OF_DOCUMENTS = 1000;
  private static final int NUMBER_OF_FIELDS = 100;
  private TermInfosReaderIndex index;
  private Directory directory;
  private SegmentTermEnum termEnum;
  private int indexDivisor;
  private int termIndexInterval;
  private IndexReader reader;
  private List<Term> sampleTerms;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    indexDivisor = _TestUtil.nextInt(random, 1, 10);
    directory = newDirectory();
    termIndexInterval = populate(directory);

    IndexReader r0 = IndexReader.open(directory);
    SegmentReader r = (SegmentReader) r0.getSequentialSubReaders()[0];
    String segment = r.getSegmentName();
    r0.close();

    FieldInfos fieldInfos = new FieldInfos(directory, IndexFileNames.segmentFileName(segment, "", IndexFileNames.FIELD_INFOS_EXTENSION));
    String segmentFileName = IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION);
    long tiiFileLength = directory.fileLength(segmentFileName);
    IndexInput input = directory.openInput(segmentFileName, newIOContext(random));
    termEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_EXTENSION), newIOContext(random)), fieldInfos, false);
    int totalIndexInterval = termEnum.indexInterval * indexDivisor;

    SegmentTermEnum indexEnum = new SegmentTermEnum(input, fieldInfos, true);
    index = new TermInfosReaderIndex(indexEnum, indexDivisor, tiiFileLength, totalIndexInterval);
    indexEnum.close();
    input.close();

    reader = IndexReader.open(directory);
    sampleTerms = sample(reader, 1000);
  }

  @Override
  public void tearDown() throws Exception {
    termEnum.close();
    reader.close();
    directory.close();
    super.tearDown();
  }

  public void testSeekEnum() throws CorruptIndexException, IOException {
    int indexPosition = 3;
    SegmentTermEnum clone = (SegmentTermEnum) termEnum.clone();
    Term term = findTermThatWouldBeAtIndex(clone, indexPosition);
    SegmentTermEnum enumerator = clone;
    index.seekEnum(enumerator, indexPosition);
    assertEquals(term, enumerator.term());
    clone.close();
  }

  public void testCompareTo() throws IOException {
    Term term = new Term("field" + random.nextInt(NUMBER_OF_FIELDS), getText());
    for (int i = 0; i < index.length(); i++) {
      Term t = index.getTerm(i);
      int compareTo = term.compareTo(t);
      assertEquals(compareTo, index.compareTo(term, i));
    }
  }

  public void testRandomSearchPerformance() throws CorruptIndexException, IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    for (Term t : sampleTerms) {
      TermQuery query = new TermQuery(t);
      TopDocs topDocs = searcher.search(query, 10);
      assertTrue(topDocs.totalHits > 0);
    }
    searcher.close();
  }

  private List<Term> sample(IndexReader reader, int size) throws IOException {
    List<Term> sample = new ArrayList<Term>();
    Random random = new Random();
    FieldsEnum fieldsEnum = MultiFields.getFields(reader).iterator();
    String field;
    while ((field = fieldsEnum.next()) != null) {
      TermsEnum terms = fieldsEnum.terms();
      while (terms.next() != null) {
        if (sample.size() >= size) {
          int pos = random.nextInt(size);
          sample.set(pos, new Term(field, terms.term()));
        } else {
          sample.add(new Term(field, terms.term()));
        }
      }
    }
    Collections.shuffle(sample);
    return sample;
  }

  private Term findTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index) throws IOException {
    int termPosition = index * termIndexInterval * indexDivisor;
    for (int i = 0; i < termPosition; i++) {
      if (!termEnum.next()) {
        fail("Should not have run out of terms.");
      }
    }
    return termEnum.term();
  }

  private int populate(Directory directory) throws CorruptIndexException, LockObtainFailedException, IOException {
    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
        new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
    CoreCodecProvider cp = new CoreCodecProvider();
    cp.unregister(cp.lookup("PreFlex"));
    cp.register(new PreFlexRWCodec());
    cp.setDefaultFieldCodec("PreFlex");
    config.setCodecProvider(cp);
    // turn off compound file, this test will open some index files directly.
    LogMergePolicy mp = newLogMergePolicy();
    mp.setUseCompoundFile(false);
    config.setMergePolicy(mp);

    RandomIndexWriter writer = new RandomIndexWriter(random, directory, config);
    for (int i = 0; i < NUMBER_OF_DOCUMENTS; i++) {
      Document document = new Document();
      for (int f = 0; f < NUMBER_OF_FIELDS; f++) {
        document.add(newField("field" + f, getText(), StringField.TYPE_UNSTORED));
      }
      writer.addDocument(document);
    }
    writer.optimize();
    writer.close();
    return config.getTermIndexInterval();
  }

  private String getText() {
    return Long.toString(random.nextLong(), Character.MAX_RADIX);
  }
}

TestPagedBytes.java (new file)
@@ -0,0 +1,64 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.util;

import java.util.Arrays;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

public class TestPagedBytes extends LuceneTestCase {

  public void testDataInputOutput() throws Exception {
    for (int iter = 0; iter < 5 * RANDOM_MULTIPLIER; iter++) {
      final PagedBytes p = new PagedBytes(_TestUtil.nextInt(random, 1, 20));
      final DataOutput out = p.getDataOutput();
      final int numBytes = random.nextInt(10000000);

      final byte[] answer = new byte[numBytes];
      random.nextBytes(answer);
      int written = 0;
      while (written < numBytes) {
        if (random.nextInt(10) == 7) {
          out.writeByte(answer[written++]);
        } else {
          int chunk = Math.min(random.nextInt(1000), numBytes - written);
          out.writeBytes(answer, written, chunk);
          written += chunk;
        }
      }

      p.freeze(random.nextBoolean());

      final DataInput in = p.getDataInput();

      final byte[] verify = new byte[numBytes];
      int read = 0;
      while (read < numBytes) {
        if (random.nextInt(10) == 7) {
          verify[read++] = in.readByte();
        } else {
          int chunk = Math.min(random.nextInt(1000), numBytes - read);
          in.readBytes(verify, read, chunk);
          read += chunk;
        }
      }
      assertTrue(Arrays.equals(answer, verify));
    }
  }
}