From 00da65393cc56a933440796631686e5d113ca338 Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Thu, 27 Oct 2011 20:49:13 +0000
Subject: [PATCH] LUCENE-2205: port to trunk in preflex codec; port
 TestPagedBytes and PagedBytes.DataInput/Output to trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1190017 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                             |   3 +
 .../index/codecs/preflex/TermInfosReader.java  |  76 ++----
 .../codecs/preflex/TermInfosReaderIndex.java   | 252 ++++++++++++++++++
 .../lucene/store/ByteArrayDataInput.java       |   2 +-
 .../org/apache/lucene/util/PagedBytes.java     | 166 +++++++++++-
 .../preflex/TestTermInfosReaderIndex.java      | 193 ++++++++++++++
 .../apache/lucene/util/TestPagedBytes.java     |  64 +++++
 7 files changed, 693 insertions(+), 63 deletions(-)
 create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReaderIndex.java
 create mode 100644 lucene/src/test/org/apache/lucene/index/codecs/preflex/TestTermInfosReaderIndex.java
 create mode 100644 lucene/src/test/org/apache/lucene/util/TestPagedBytes.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f3421e54a28..a3ecc854e7b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -723,6 +723,9 @@ Optimizations
   FilteredQuery/IndexSearcher added by LUCENE-1536 to Lucene 4.0.
   (Uwe Schindler)
 
+* LUCENE-2205: Very substantial (3-5X) reduction in RAM required to hold
+  the terms index on opening an IndexReader (Aaron McCurry via Mike McCandless)
+
 Test Cases
 
 * LUCENE-3420: Disable the finalness checks in TokenStream and Analyzer
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
index 48352000236..cccfa88ab05 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
@@ -47,9 +47,8 @@ public final class TermInfosReader {
   private final SegmentTermEnum origEnum;
   private final long size;
 
-  private final Term[] indexTerms;
-  private final TermInfo[] indexInfos;
-  private final long[] indexPointers;
+  private final TermInfosReaderIndex index;
+  private final int indexLength;
 
   private final int totalIndexInterval;
 
@@ -118,37 +117,23 @@ public final class TermInfosReader {
       if (indexDivisor != -1) {
         // Load terms index
         totalIndexInterval = origEnum.indexInterval * indexDivisor;
-        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION),
-                                                                                  context), fieldInfos, true);
+
+        final String indexFileName = IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION);
+        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(indexFileName,
+                                                                                  context), fieldInfos, true);
 
         try {
-          int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;  // otherwise read index
-
-          indexTerms = new Term[indexSize];
-          indexInfos = new TermInfo[indexSize];
-          indexPointers = new long[indexSize];
-
-          for (int i=0;indexEnum.next(); i++) {
-            indexTerms[i] = indexEnum.term();
-            assert indexTerms[i] != null;
-            assert indexTerms[i].text() != null;
-            assert indexTerms[i].field() != null;
-            indexInfos[i] = indexEnum.termInfo();
-            indexPointers[i] = indexEnum.indexPointer;
-
-            for (int j = 1; j < indexDivisor; j++)
-              if (!indexEnum.next())
-                break;
-          }
+          // TermInfosReaderIndex takes the .tii file length as a long, so no cast is needed
+          index = new TermInfosReaderIndex(indexEnum, indexDivisor, dir.fileLength(indexFileName), totalIndexInterval);
+          indexLength = index.length();
         } finally {
           indexEnum.close();
         }
       } else {
         // Do not load terms index:
         totalIndexInterval = -1;
-        indexTerms = null;
-        indexInfos = null;
-        indexPointers = null;
+        index = null;
+        indexLength = -1;
       }
       success = true;
     } finally {
@@ -203,31 +188,6 @@ public final class TermInfosReader {
     }
   }
 
-  /** Returns the offset of the greatest index entry which is less than or equal to term.*/
-  private int getIndexOffset(Term term) {
-    int lo = 0;                                 // binary search indexTerms[]
-    int hi = indexTerms.length - 1;
-
-    while (hi >= lo) {
-      int mid = (lo + hi) >>> 1;
-      assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid;
-      int delta = compareAsUTF16(term, indexTerms[mid]);
-      if (delta < 0)
-        hi = mid - 1;
-      else if (delta > 0)
-        lo = mid + 1;
-      else
-        return mid;
-    }
-    return hi;
-  }
-
-  private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
-    enumerator.seek(indexPointers[indexOffset],
-                    ((long) indexOffset * totalIndexInterval) - 1,
-                    indexTerms[indexOffset], indexInfos[indexOffset]);
-  }
-
   /** Returns the TermInfo for a Term in the set, or null. */
   TermInfo get(Term term) throws IOException {
     return get(term, false);
@@ -272,8 +232,8 @@ public final class TermInfosReader {
             && ((enumerator.prev() != null && compareAsUTF16(term, enumerator.prev())> 0)
                 || compareAsUTF16(term, enumerator.term()) >= 0)) {
           int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
-          if (indexTerms.length == enumOffset    // but before end of block
-              || compareAsUTF16(term, indexTerms[enumOffset]) < 0) {
+          if (indexLength == enumOffset    // but before end of block
+              || index.compareTo(term, enumOffset) < 0) {
            // no need to seek
 
             final TermInfo ti;
@@ -309,10 +269,10 @@ public final class TermInfosReader {
       indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
     } else {
       // Must do binary search:
-      indexPos = getIndexOffset(term);
+      indexPos = index.getIndexOffset(term);
     }
 
-    seekEnum(enumerator, indexPos);
+    index.seekEnum(enumerator, indexPos);
     enumerator.scanTo(term);
 
     final TermInfo ti;
@@ -352,7 +312,7 @@ public final class TermInfosReader {
   }
 
   private void ensureIndexIsRead() {
-    if (indexTerms == null) {
+    if (index == null) {
       throw new IllegalStateException("terms index was not loaded when this reader was created");
     }
   }
@@ -362,10 +322,10 @@ public final class TermInfosReader {
     if (size == 0) return -1;
 
     ensureIndexIsRead();
-    int indexOffset = getIndexOffset(term);
+    int indexOffset = index.getIndexOffset(term);
 
     SegmentTermEnum enumerator = getThreadResources().termEnum;
-    seekEnum(enumerator, indexOffset);
+    index.seekEnum(enumerator, indexOffset);
 
     while(compareAsUTF16(term, enumerator.term()) > 0 && enumerator.next()) {}
 
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReaderIndex.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReaderIndex.java
new file mode 100644
index 00000000000..d384ff9e0f1
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReaderIndex.java
@@ -0,0 +1,252 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.util.BitUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
+import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.packed.GrowableWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
+/**
+ * This stores a monotonically increasing set of <Term, TermInfo> pairs in an
+ * index segment. Pairs are accessed either by Term or by ordinal position in
+ * the set. The Terms and TermInfos are actually serialized and stored into a
+ * byte array, and pointers to the position of each are stored in an int array.
+ */
+class TermInfosReaderIndex {
+
+  private static final int MAX_PAGE_BITS = 18; // 256 KB block
+  private Term[] fields;
+  private int totalIndexInterval;
+  private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
+  private final PagedBytesDataInput dataInput;
+  private final PackedInts.Reader indexToDataOffset;
+  private final int indexSize;
+  private final int skipInterval;
+
+  /**
+   * Loads the segment information at segment load time.
+   *
+   * @param indexEnum
+   *          the term enum.
+   * @param indexDivisor
+   *          the index divisor.
+   * @param tiiFileLength
+   *          the size of the tii file, used to approximate the size of the
+   *          buffer.
+   * @param totalIndexInterval
+   *          the total index interval.
+   */
+  TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
+    this.totalIndexInterval = totalIndexInterval;
+    indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
+    skipInterval = indexEnum.skipInterval;
+    // this is only an initial size, it will be GCed once the build is complete
+    long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
+    PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
+    PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();
+
+    GrowableWriter indexToTerms = new GrowableWriter(4, indexSize, false);
+    String currentField = null;
+    List<String> fieldStrs = new ArrayList<String>();
+    int fieldCounter = -1;
+    for (int i = 0; indexEnum.next(); i++) {
+      Term term = indexEnum.term();
+      if (currentField == null || !currentField.equals(term.field())) {
+        currentField = term.field();
+        fieldStrs.add(currentField);
+        fieldCounter++;
+      }
+      TermInfo termInfo = indexEnum.termInfo();
+      indexToTerms.set(i, dataOutput.getPosition());
+      dataOutput.writeVInt(fieldCounter);
+      dataOutput.writeString(term.text());
+      dataOutput.writeVInt(termInfo.docFreq);
+      if (termInfo.docFreq >= skipInterval) {
+        dataOutput.writeVInt(termInfo.skipOffset);
+      }
+      dataOutput.writeVLong(termInfo.freqPointer);
+      dataOutput.writeVLong(termInfo.proxPointer);
+      dataOutput.writeVLong(indexEnum.indexPointer);
+      for (int j = 1; j < indexDivisor; j++) {
+        if (!indexEnum.next()) {
+          break;
+        }
+      }
+    }
+
+    fields = new Term[fieldStrs.size()];
+    for (int i = 0; i < fields.length; i++) {
+      fields[i] = new Term(fieldStrs.get(i));
+    }
+
+    dataPagedBytes.freeze(true);
+    dataInput = dataPagedBytes.getDataInput();
+    indexToDataOffset = indexToTerms.getMutable();
+  }
+
+  private static int estimatePageBits(long estSize) {
+    return Math.max(Math.min(64 - BitUtil.nlz(estSize), MAX_PAGE_BITS), 4);
+  }
+
+  void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
+    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
+
+    input.setPosition(indexToDataOffset.get(indexOffset));
+
+    // read the term
+    int fieldId = input.readVInt();
+    Term field = fields[fieldId];
+    Term term = new Term(field.field(), input.readString());
+
+    // read the terminfo
+    TermInfo termInfo = new TermInfo();
+    termInfo.docFreq = input.readVInt();
+    if (termInfo.docFreq >= skipInterval) {
+      termInfo.skipOffset = input.readVInt();
+    } else {
+      termInfo.skipOffset = 0;
+    }
+    termInfo.freqPointer = input.readVLong();
+    termInfo.proxPointer = input.readVLong();
+
+    long pointer = input.readVLong();
+
+    // perform the seek
+    enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
+  }
+
+  /**
+   * Binary search for the given term.
+   *
+   * @param term
+   *          the term to locate.
+   * @throws IOException
+   */
+  int getIndexOffset(Term term) throws IOException {
+    int lo = 0;
+    int hi = indexSize - 1;
+    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
+    BytesRef scratch = new BytesRef();
+    while (hi >= lo) {
+      int mid = (lo + hi) >>> 1;
+      int delta = compareTo(term, mid, input, scratch);
+      if (delta < 0)
+        hi = mid - 1;
+      else if (delta > 0)
+        lo = mid + 1;
+      else
+        return mid;
+    }
+    return hi;
+  }
+
+  /**
+   * Gets the term at the given position. For testing.
+   *
+   * @param termIndex
+   *          the position to read the term from the index.
+   * @return the term.
+   * @throws IOException
+   */
+  Term getTerm(int termIndex) throws IOException {
+    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
+    input.setPosition(indexToDataOffset.get(termIndex));
+
+    // read the term
+    int fieldId = input.readVInt();
+    Term field = fields[fieldId];
+    return new Term(field.field(), input.readString());
+  }
+
+  /**
+   * Returns the number of terms.
+   *
+   * @return the number of terms in the index.
+   */
+  int length() {
+    return indexSize;
+  }
+
+  /**
+   * Compares the given term against the term in the index specified by the
+   * term index, i.e. it returns a negative value when the given term is less
+   * than the index term, zero when they are equal, and a positive value
+   * otherwise.
+   *
+   * @param term
+   *          the given term.
+   * @param termIndex
+   *          the index of the term to compare.
+   * @return the comparison result.
+   * @throws IOException
+   */
+  int compareTo(Term term, int termIndex) throws IOException {
+    return compareTo(term, termIndex, (PagedBytesDataInput) dataInput.clone(), new BytesRef());
+  }
+
+  /**
+   * Compares the fields of the terms first, and if they are not equal returns
+   * that result. If the fields are equal, compares the terms' string values.
+   *
+   * @param term
+   *          the term to compare.
+   * @param termIndex
+   *          the position of the term in the input to compare.
+   * @param input
+   *          the input buffer.
+   * @return the comparison result.
+   * @throws IOException
+   */
+  private int compareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRef reuse) throws IOException {
+    // if term field does not equal mid's field index, then compare fields
+    // else if they are equal, compare term's string values...
+    int c = compareField(term, termIndex, input);
+    if (c == 0) {
+      reuse.length = input.readVInt();
+      reuse.grow(reuse.length);
+      input.readBytes(reuse.bytes, 0, reuse.length);
+      return comparator.compare(term.bytes(), reuse);
+    }
+    return c;
+  }
+
+  /**
+   * Compares the fields before checking the text of the terms.
+   *
+   * @param term
+   *          the given term.
+   * @param termIndex
+   *          the index of the term in the data block.
+   * @param input
+   *          the data block.
+   * @return the comparison result.
+   * @throws IOException
+   */
+  private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
+    input.setPosition(indexToDataOffset.get(termIndex));
+    return term.field().compareTo(fields[input.readVInt()].field());
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java b/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java
index 0779a168f33..b968f4eb298 100644
--- a/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java
+++ b/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java
@@ -52,7 +52,7 @@ public final class ByteArrayDataInput extends DataInput {
   public int getPosition() {
     return pos;
   }
-
+  
   public void setPosition(int pos) {
     this.pos = pos;
   }
diff --git a/lucene/src/java/org/apache/lucene/util/PagedBytes.java b/lucene/src/java/org/apache/lucene/util/PagedBytes.java
index d53e9b3140d..2e29464b90d 100644
--- a/lucene/src/java/org/apache/lucene/util/PagedBytes.java
+++ b/lucene/src/java/org/apache/lucene/util/PagedBytes.java
@@ -17,12 +17,14 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
-import org.apache.lucene.store.IndexInput;
-
-import java.util.List;
-import java.util.ArrayList;
 import java.io.Closeable;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
 
 /** Represents a logical byte[] as a series of pages.  You
 *  can write-once into the logical byte[] (append only),
@@ -37,6 +39,8 @@ public final class PagedBytes {
   private final int blockSize;
   private final int blockBits;
   private final int blockMask;
+  private boolean didSkipBytes;
+  private boolean frozen;
   private int upto;
   private byte[] currentBlock;
 
@@ -320,6 +324,7 @@ public final class PagedBytes {
       if (currentBlock != null) {
         blocks.add(currentBlock);
         blockEnd.add(upto);
+        didSkipBytes = true;
       }
       currentBlock = new byte[blockSize];
       upto = 0;
@@ -338,6 +343,12 @@ public final class PagedBytes {
 
   /** Commits final byte[], trimming it if necessary and if trim=true */
   public Reader freeze(boolean trim) {
+    if (frozen) {
+      throw new IllegalStateException("already frozen");
+    }
+    if (didSkipBytes) {
+      throw new IllegalStateException("cannot freeze when copy(BytesRef, BytesRef) was used");
+    }
     if (trim && upto < blockSize) {
       final byte[] newBlock = new byte[upto];
       System.arraycopy(currentBlock, 0, newBlock, 0, upto);
@@ -348,6 +359,7 @@ public final class PagedBytes {
     }
     blocks.add(currentBlock);
     blockEnd.add(upto);
+    frozen = true;
     currentBlock = null;
     return new Reader(this);
   }
@@ -389,4 +401,150 @@ public final class PagedBytes {
 
     return pointer;
   }
+
+  public final class PagedBytesDataInput extends DataInput {
+    private int currentBlockIndex;
+    private int currentBlockUpto;
+    private byte[] currentBlock;
+
+    PagedBytesDataInput() {
+      currentBlock = blocks.get(0);
+    }
+
+    @Override
+    public Object clone() {
+      PagedBytesDataInput clone = getDataInput();
+      clone.setPosition(getPosition());
+      return clone;
+    }
+
+    /** Returns the current byte position. */
+    public long getPosition() {
+      return (long) currentBlockIndex * blockSize + currentBlockUpto;
+    }
+
+    /** Seek to a position previously obtained from
+     *  {@link #getPosition}.
+     */
+    public void setPosition(long pos) {
+      currentBlockIndex = (int) (pos >> blockBits);
+      currentBlock = blocks.get(currentBlockIndex);
+      currentBlockUpto = (int) (pos & blockMask);
+    }
+
+    @Override
+    public byte readByte() {
+      if (currentBlockUpto == blockSize) {
+        nextBlock();
+      }
+      return currentBlock[currentBlockUpto++];
+    }
+
+    @Override
+    public void readBytes(byte[] b, int offset, int len) {
+      final int offsetEnd = offset + len;
+      while (true) {
+        final int blockLeft = blockSize - currentBlockUpto;
+        final int left = offsetEnd - offset;
+        if (blockLeft < left) {
+          System.arraycopy(currentBlock, currentBlockUpto,
+                           b, offset,
+                           blockLeft);
+          nextBlock();
+          offset += blockLeft;
+        } else {
+          // Last block
+          System.arraycopy(currentBlock, currentBlockUpto,
+                           b, offset,
+                           left);
+          currentBlockUpto += left;
+          break;
+        }
+      }
+    }
+
+    private void nextBlock() {
+      currentBlockIndex++;
+      currentBlockUpto = 0;
+      currentBlock = blocks.get(currentBlockIndex);
+    }
+  }
+
+  public final class PagedBytesDataOutput extends DataOutput {
+    @Override
+    public void writeByte(byte b) {
+      if (upto == blockSize) {
+        if (currentBlock != null) {
+          blocks.add(currentBlock);
+          blockEnd.add(upto);
+        }
+        currentBlock = new byte[blockSize];
+        upto = 0;
+      }
+      currentBlock[upto++] = b;
+    }
+
+    @Override
+    public void writeBytes(byte[] b, int offset, int length) throws IOException {
+      if (length == 0) {
+        return;
+      }
+
+      if (upto == blockSize) {
+        if (currentBlock != null) {
+          blocks.add(currentBlock);
+          blockEnd.add(upto);
+        }
+        currentBlock = new byte[blockSize];
+        upto = 0;
+      }
+
+      final int offsetEnd = offset + length;
+      while(true) {
+        final int left = offsetEnd - offset;
+        final int blockLeft = blockSize - upto;
+        if (blockLeft < left) {
+          System.arraycopy(b, offset, currentBlock, upto, blockLeft);
+          blocks.add(currentBlock);
+          blockEnd.add(blockSize);
+          currentBlock = new byte[blockSize];
+          upto = 0;
+          offset += blockLeft;
+        } else {
+          // Last block
+          System.arraycopy(b, offset, currentBlock, upto, left);
+          upto += left;
+          break;
+        }
+      }
+    }
+
+    /** Return the current byte position. */
+    public long getPosition() {
+      if (currentBlock == null) {
+        return 0;
+      } else {
+        return (long) blocks.size() * blockSize + upto;
+      }
+    }
+  }
+
+  /** Returns a DataInput to read values from this
+   *  PagedBytes instance. */
+  public PagedBytesDataInput getDataInput() {
+    if (!frozen) {
+      throw new IllegalStateException("must call freeze() before getDataInput");
+    }
+    return new PagedBytesDataInput();
+  }
+
+  /** Returns a DataOutput that you may use to write into
+   *  this PagedBytes instance.  If you do this, you should
+   *  not call the other writing methods (eg, copy);
+   *  results are undefined. */
+  public PagedBytesDataOutput getDataOutput() {
+    if (frozen) {
+      throw new IllegalStateException("cannot get DataOutput after freeze()");
+    }
+    return new PagedBytesDataOutput();
+  }
 }
diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestTermInfosReaderIndex.java b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestTermInfosReaderIndex.java
new file mode 100644
index 00000000000..d42efa35281
--- /dev/null
+++ b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestTermInfosReaderIndex.java
@@ -0,0 +1,193 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LogMergePolicy;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SegmentReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.CoreCodecProvider;
+import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+public class TestTermInfosReaderIndex extends LuceneTestCase {
+
+  private static final int NUMBER_OF_DOCUMENTS = 1000;
+  private static final int NUMBER_OF_FIELDS = 100;
+  private TermInfosReaderIndex index;
+  private Directory directory;
+  private SegmentTermEnum termEnum;
+  private int indexDivisor;
+  private int termIndexInterval;
+  private IndexReader reader;
+  private List<Term> sampleTerms;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    indexDivisor = _TestUtil.nextInt(random, 1, 10);
+    directory = newDirectory();
+    termIndexInterval = populate(directory);
+
+    IndexReader r0 = IndexReader.open(directory);
+    SegmentReader r = (SegmentReader) r0.getSequentialSubReaders()[0];
+    String segment = r.getSegmentName();
+    r.close();
+
+    FieldInfos fieldInfos = new FieldInfos(directory, IndexFileNames.segmentFileName(segment, "", IndexFileNames.FIELD_INFOS_EXTENSION));
+    String segmentFileName = IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION);
+    long tiiFileLength = directory.fileLength(segmentFileName);
+    IndexInput input = directory.openInput(segmentFileName, newIOContext(random));
+    termEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_EXTENSION), newIOContext(random)), fieldInfos, false);
+    int totalIndexInterval = termEnum.indexInterval * indexDivisor;
+
+    SegmentTermEnum indexEnum = new SegmentTermEnum(input, fieldInfos, true);
+    index = new TermInfosReaderIndex(indexEnum, indexDivisor, tiiFileLength, totalIndexInterval);
+    indexEnum.close();
+    input.close();
+
+    reader = IndexReader.open(directory);
+    sampleTerms = sample(reader, 1000);
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    termEnum.close();
+    reader.close();
+    directory.close();
+    super.tearDown();
+  }
+
+  public void testSeekEnum() throws CorruptIndexException, IOException {
+    int indexPosition = 3;
+    SegmentTermEnum clone = (SegmentTermEnum) termEnum.clone();
+    Term term = findTermThatWouldBeAtIndex(clone, indexPosition);
+    SegmentTermEnum enumerator = clone;
+    index.seekEnum(enumerator, indexPosition);
+    assertEquals(term, enumerator.term());
+    clone.close();
+  }
+
+  public void testCompareTo() throws IOException {
+    Term term = new Term("field" + random.nextInt(NUMBER_OF_FIELDS), getText());
+    for (int i = 0; i < index.length(); i++) {
+      Term t = index.getTerm(i);
+      int compareTo = term.compareTo(t);
+      assertEquals(compareTo, index.compareTo(term, i));
+    }
+  }
+
+  public void testRandomSearchPerformance() throws CorruptIndexException, IOException {
+    IndexSearcher searcher = new IndexSearcher(reader);
+    for (Term t : sampleTerms) {
+      TermQuery query = new TermQuery(t);
+      TopDocs topDocs = searcher.search(query, 10);
+      assertTrue(topDocs.totalHits > 0);
+    }
+    searcher.close();
+  }
+
+  private List<Term> sample(IndexReader reader, int size) throws IOException {
+    List<Term> sample = new ArrayList<Term>();
+    // use the inherited LuceneTestCase random so the sample is reproducible
+    FieldsEnum fieldsEnum = MultiFields.getFields(reader).iterator();
+    String field;
+    while((field = fieldsEnum.next()) != null) {
+      TermsEnum terms = fieldsEnum.terms();
+      while (terms.next() != null) {
+        if (sample.size() >= size) {
+          int pos = random.nextInt(size);
+          sample.set(pos, new Term(field, terms.term()));
+        } else {
+          sample.add(new Term(field, terms.term()));
+        }
+      }
+    }
+    Collections.shuffle(sample, random);
+    return sample;
+  }
+
+  private Term findTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index) throws IOException {
+    int termPosition = index * termIndexInterval * indexDivisor;
+    for (int i = 0; i < termPosition; i++) {
+      if (!termEnum.next()) {
+        fail("Should not have run out of terms.");
+      }
+    }
+    return termEnum.term();
+  }
+
+  private int populate(Directory directory) throws CorruptIndexException, LockObtainFailedException, IOException {
+    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
+        new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
+    CoreCodecProvider cp = new CoreCodecProvider();
+    cp.unregister(cp.lookup("PreFlex"));
+    cp.register(new PreFlexRWCodec());
+    cp.setDefaultFieldCodec("PreFlex");
+    config.setCodecProvider(cp);
+    // turn off compound file, this test will open some index files directly.
+    LogMergePolicy mp = newLogMergePolicy();
+    mp.setUseCompoundFile(false);
+    config.setMergePolicy(mp);
+
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory, config);
+    for (int i = 0; i < NUMBER_OF_DOCUMENTS; i++) {
+      Document document = new Document();
+      for (int f = 0; f < NUMBER_OF_FIELDS; f++) {
+        document.add(newField("field" + f, getText(), StringField.TYPE_UNSTORED));
+      }
+      writer.addDocument(document);
+    }
+    writer.optimize();
+    writer.close();
+    return config.getTermIndexInterval();
+  }
+
+  private String getText() {
+    return Long.toString(random.nextLong(), Character.MAX_RADIX);
+  }
+}
diff --git a/lucene/src/test/org/apache/lucene/util/TestPagedBytes.java b/lucene/src/test/org/apache/lucene/util/TestPagedBytes.java
new file mode 100644
index 00000000000..5205300b027
--- /dev/null
+++ b/lucene/src/test/org/apache/lucene/util/TestPagedBytes.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util;
+
+import java.util.Arrays;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+
+public class TestPagedBytes extends LuceneTestCase {
+
+  public void testDataInputOutput() throws Exception {
+    for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) {
+      final PagedBytes p = new PagedBytes(_TestUtil.nextInt(random, 1, 20));
+      final DataOutput out = p.getDataOutput();
+      final int numBytes = random.nextInt(10000000);
+
+      final byte[] answer = new byte[numBytes];
+      random.nextBytes(answer);
+      int written = 0;
+      while(written < numBytes) {
+        if (random.nextInt(10) == 7) {
+          out.writeByte(answer[written++]);
+        } else {
+          int chunk = Math.min(random.nextInt(1000), numBytes - written);
+          out.writeBytes(answer, written, chunk);
+          written += chunk;
+        }
+      }
+
+      p.freeze(random.nextBoolean());
+
+      final DataInput in = p.getDataInput();
+
+      final byte[] verify = new byte[numBytes];
+      int read = 0;
+      while(read < numBytes) {
+        if (random.nextInt(10) == 7) {
+          verify[read++] = in.readByte();
+        } else {
+          int chunk = Math.min(random.nextInt(1000), numBytes - read);
+          in.readBytes(verify, read, chunk);
+          read += chunk;
+        }
+      }
+      assertTrue(Arrays.equals(answer, verify));
+    }
+  }
+}
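
A few reviewer sketches against the classes added above follow; they are illustrative only and not part of the patch.

Each entry that the TermInfosReaderIndex constructor appends to the PagedBytes buffer is a variable-length record, with its start offset recorded in indexToDataOffset; seekEnum(), getTerm(), and compareTo() decode lazily from that offset. A minimal sketch of one record's wire layout, using the same PagedBytes API (the class name, field ordinal, and numeric values here are made up for illustration):

  import org.apache.lucene.util.PagedBytes;
  import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
  import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;

  // Hypothetical demo class, not part of the patch.
  public class TermsIndexEntryLayoutDemo {
    public static void main(String[] args) throws Exception {
      PagedBytes pages = new PagedBytes(15);            // 32 KB pages
      PagedBytesDataOutput out = pages.getDataOutput();

      long start = out.getPosition(); // what indexToDataOffset records for this entry
      out.writeVInt(3);               // ordinal of the term's field in the fields[] table
      out.writeString("lucene");      // term text
      out.writeVInt(128);             // docFreq
      out.writeVInt(17);              // skipOffset: written only when docFreq >= skipInterval
      out.writeVLong(1024L);          // freqPointer
      out.writeVLong(2048L);          // proxPointer
      out.writeVLong(4096L);          // indexPointer into the terms dictionary

      pages.freeze(true);

      PagedBytesDataInput in = pages.getDataInput();
      in.setPosition(start);          // random access, as in seekEnum()/getTerm()
      System.out.println(in.readVInt() + " " + in.readString()); // prints: 3 lucene
    }
  }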
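
On the comparator choice: the preflex terms dictionary is sorted in UTF-16 code unit order, but the index now stores term text as UTF-8 bytes, and raw byte order disagrees with UTF-16 order for supplementary characters. That is why TermInfosReaderIndex compares through BytesRef.getUTF8SortedAsUTF16Comparator() rather than comparing bytes directly. A small sketch of the discrepancy (hypothetical demo class, not part of the patch):

  import java.util.Comparator;
  import org.apache.lucene.util.BytesRef;

  // Hypothetical demo class, not part of the patch.
  public class Utf16OrderDemo {
    public static void main(String[] args) {
      String supp = "\uD800\uDC00";  // U+10000, a supplementary character
      String pua  = "\uE000";        // a BMP private-use character

      // UTF-16 code unit order, the order of the preflex terms dictionary:
      System.out.println(supp.compareTo(pua));      // negative: supp sorts first

      BytesRef a = new BytesRef(supp);  // UTF-8 bytes F0 90 80 80
      BytesRef b = new BytesRef(pua);   // UTF-8 bytes EE 80 80

      // raw UTF-8 byte order disagrees:
      Comparator<BytesRef> unicode = BytesRef.getUTF8SortedAsUnicodeComparator();
      System.out.println(unicode.compare(a, b));    // positive: supp sorts last

      // the comparator used by TermInfosReaderIndex restores UTF-16 order:
      Comparator<BytesRef> utf16 = BytesRef.getUTF8SortedAsUTF16Comparator();
      System.out.println(utf16.compare(a, b));      // negative again
    }
  }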
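
estimatePageBits() derives the page size from the estimated total data size: 64 - nlz(estSize) is the bit width of estSize (roughly its ceil(log2)), clamped between 4 and MAX_PAGE_BITS = 18, so pages range from 16 bytes to 256 KB. A worked example, with the JDK's Long.numberOfLeadingZeros standing in for BitUtil.nlz (hypothetical demo class, not part of the patch):

  // Hypothetical demo class, not part of the patch.
  public class PageBitsDemo {
    static int estimatePageBits(long estSize) {
      // same formula as TermInfosReaderIndex.estimatePageBits
      return Math.max(Math.min(64 - Long.numberOfLeadingZeros(estSize), 18), 4);
    }

    public static void main(String[] args) {
      // a 1 MB .tii file with indexDivisor=1 estimates 1.5 MB of data:
      // 64 - nlz(1572864) = 21, clamped to 18 -> 2^18 = 256 KB pages
      System.out.println(estimatePageBits(1572864L));  // prints: 18
      // small estimates get small pages, floored at 2^4 = 16 bytes:
      System.out.println(estimatePageBits(100L));      // prints: 7 (128-byte pages)
    }
  }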
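
Finally, the freeze() lifecycle added to PagedBytes is strictly write-once: getDataOutput() is only legal before freeze(), freeze() may be called exactly once, and getDataInput() is only legal afterwards, with IllegalStateException thrown otherwise. A sketch of those state checks (hypothetical demo class, not part of the patch; assumes the patched PagedBytes above):

  import org.apache.lucene.store.DataInput;
  import org.apache.lucene.store.DataOutput;
  import org.apache.lucene.util.PagedBytes;

  // Hypothetical demo class, not part of the patch.
  public class PagedBytesLifecycleDemo {
    public static void main(String[] args) throws Exception {
      PagedBytes p = new PagedBytes(10);   // 1 KB pages
      DataOutput out = p.getDataOutput();  // legal: not yet frozen
      out.writeVInt(42);
      out.writeString("hello");

      p.freeze(true);                      // write-once: exactly one freeze

      try {
        p.getDataOutput();                 // too late: already frozen
      } catch (IllegalStateException expected) {
        System.out.println(expected.getMessage());
      }
      try {
        p.freeze(true);                    // a second freeze is also rejected
      } catch (IllegalStateException expected) {
        System.out.println(expected.getMessage());
      }

      DataInput in = p.getDataInput();     // legal only after freeze()
      System.out.println(in.readVInt() + " " + in.readString()); // prints: 42 hello
    }
  }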