LUCENE-2329: Use parallel arrays instead of PostingList objects in TermsHash*

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@926791 13f79535-47bb-0310-9956-ffa450edef68
Michael Busch 2010-03-23 21:25:15 +00:00
parent 32a370e127
commit f6126f8808
13 changed files with 368 additions and 378 deletions

View File

@ -276,6 +276,15 @@ Optimizations
TermAttributeImpl, move DEFAULT_TYPE constant to TypeInterface, improve
null-handling for TypeAttribute. (Uwe Schindler)
* LUCENE-2329: Switch TermsHash* from using a PostingList object per unique
term to parallel arrays, indexed by termID. This reduces garbage collection
overhead significantly, which translates into sizable indexing performance gains
when the available JVM heap space is low. This will become even more
important when the DocumentsWriter RAM buffer is searchable in the future,
because then it will make sense to make the RAM buffers as large as
possible. (Mike McCandless, Michael Busch)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
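The LUCENE-2329 entry above summarizes what the file diffs below do: the per-term PostingList objects are replaced by parallel int arrays keyed by an integer termID. A minimal sketch of the two layouts, using simplified names rather than the actual Lucene classes:

// Minimal sketch (not the actual Lucene classes) of the layout change this
// commit makes: a heap object per unique term before, parallel int columns after.

// Before: one small object allocated for every unique term.
class PostingListSketch {
  int textStart, intStart, byteStart;                 // where the term's data lives
  int docFreq, lastDocID, lastDocCode, lastPosition;  // freq/prox bookkeeping
}

// After: one slot per term in a handful of large parallel arrays, indexed by termID.
class PostingsColumnsSketch {
  final int[] textStarts, intStarts, byteStarts;
  final int[] docFreqs, lastDocIDs, lastDocCodes, lastPositions;

  PostingsColumnsSketch(int size) {
    textStarts = new int[size]; intStarts = new int[size]; byteStarts = new int[size];
    docFreqs = new int[size];   lastDocIDs = new int[size];
    lastDocCodes = new int[size]; lastPositions = new int[size];
  }
}

Allocating a few large arrays instead of one object per unique term is what cuts the garbage-collection overhead the entry describes: the collector tracks a handful of arrays rather than a separate short-lived instance for every term.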

View File

@ -19,6 +19,8 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray;
// TODO FI: some of this is "generic" to TermsHash* so we
// should factor it out so other consumers don't have to
// duplicate this code
@ -30,9 +32,10 @@ final class FreqProxFieldMergeState {
final FreqProxTermsWriterPerField field;
final int numPostings;
final CharBlockPool charPool;
final RawPostingList[] postings;
private FreqProxTermsWriter.PostingList p;
final int[] termIDs;
final FreqProxPostingsArray postings;
int currentTermID;
char[] text;
int textOffset;
@ -48,7 +51,8 @@ final class FreqProxFieldMergeState {
this.field = field;
this.charPool = field.perThread.termsHashPerThread.charPool;
this.numPostings = field.termsHashPerField.numPostings;
this.postings = field.termsHashPerField.sortPostings();
this.termIDs = field.termsHashPerField.sortPostings();
this.postings = (FreqProxPostingsArray) field.termsHashPerField.postingsArray;
}
boolean nextTerm() throws IOException {
@ -56,15 +60,16 @@ final class FreqProxFieldMergeState {
if (postingUpto == numPostings)
return false;
p = (FreqProxTermsWriter.PostingList) postings[postingUpto];
currentTermID = termIDs[postingUpto];
docID = 0;
text = charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
textOffset = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
final int textStart = postings.textStarts[currentTermID];
text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
textOffset = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
field.termsHashPerField.initReader(freq, p, 0);
field.termsHashPerField.initReader(freq, currentTermID, 0);
if (!field.fieldInfo.omitTermFreqAndPositions)
field.termsHashPerField.initReader(prox, p, 1);
field.termsHashPerField.initReader(prox, currentTermID, 1);
// Should always be true
boolean result = nextDoc();
@ -75,12 +80,12 @@ final class FreqProxFieldMergeState {
public boolean nextDoc() throws IOException {
if (freq.eof()) {
if (p.lastDocCode != -1) {
if (postings.lastDocCodes[currentTermID] != -1) {
// Return last doc
docID = p.lastDocID;
docID = postings.lastDocIDs[currentTermID];
if (!field.omitTermFreqAndPositions)
termFreq = p.docFreq;
p.lastDocCode = -1;
termFreq = postings.docFreqs[currentTermID];
postings.lastDocCodes[currentTermID] = -1;
return true;
} else
// EOF
@ -98,7 +103,7 @@ final class FreqProxFieldMergeState {
termFreq = freq.readVInt();
}
assert docID != p.lastDocID;
assert docID != postings.lastDocIDs[currentTermID];
return true;
}
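In the merge state above, nextTerm() no longer dereferences a PostingList; it reads postings.textStarts[currentTermID] and splits that packed address into a pooled-buffer index and an in-buffer offset. A standalone sketch of that addressing; the real shift/mask constants live in DocumentsWriter, and the values used here (a 16K-char block) are an assumption for the example:

// Sketch of the block addressing behind textStarts. The constants are assumed,
// not read from DocumentsWriter.
class CharBlockAddressing {
  static final int CHAR_BLOCK_SHIFT = 14;
  static final int CHAR_BLOCK_SIZE  = 1 << CHAR_BLOCK_SHIFT;  // 16384 chars per block
  static final int CHAR_BLOCK_MASK  = CHAR_BLOCK_SIZE - 1;

  /** Which pooled buffer the term text lives in. */
  static char[] bufferFor(char[][] buffers, int textStart) {
    return buffers[textStart >> CHAR_BLOCK_SHIFT];
  }

  /** Offset of the term text inside that buffer. */
  static int offsetFor(int textStart) {
    return textStart & CHAR_BLOCK_MASK;
  }
}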

View File

@ -33,13 +33,6 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
return new FreqProxTermsWriterPerThread(perThread);
}
@Override
void createPostings(RawPostingList[] postings, int start, int count) {
final int end = start + count;
for(int i=start;i<end;i++)
postings[i] = new PostingList();
}
private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
while(true) {
final char c1 = text1[pos1++];
@ -272,16 +265,4 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
}
final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result();
static final class PostingList extends RawPostingList {
int docFreq; // # times this term occurs in the current doc
int lastDocID; // Last docID where this term occurred
int lastDocCode; // Code for prior doc
int lastPosition; // Last position where this term occurred
}
@Override
int bytesPerPosting() {
return RawPostingList.BYTES_SIZE + 4 * DocumentsWriter.INT_NUM_BYTE;
}
}

View File

@ -18,8 +18,9 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Fieldable;
// TODO: break into separate freq and prox writers as
// codecs; make separate container (tii/tis/skip/*) that can
@ -87,7 +88,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
}
final void writeProx(FreqProxTermsWriter.PostingList p, int proxCode) {
final void writeProx(final int termID, int proxCode) {
final Payload payload;
if (payloadAttribute == null) {
payload = null;
@ -102,66 +103,111 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
hasPayloads = true;
} else
termsHashPerField.writeVInt(1, proxCode<<1);
p.lastPosition = fieldState.position;
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
postings.lastPositions[termID] = fieldState.position;
}
@Override
final void newTerm(RawPostingList p0) {
final void newTerm(final int termID) {
// First time we're seeing this term since the last
// flush
assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start");
FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0;
p.lastDocID = docState.docID;
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
postings.lastDocIDs[termID] = docState.docID;
if (omitTermFreqAndPositions) {
p.lastDocCode = docState.docID;
postings.lastDocCodes[termID] = docState.docID;
} else {
p.lastDocCode = docState.docID << 1;
p.docFreq = 1;
writeProx(p, fieldState.position);
postings.lastDocCodes[termID] = docState.docID << 1;
postings.docFreqs[termID] = 1;
writeProx(termID, fieldState.position);
}
}
@Override
final void addTerm(RawPostingList p0) {
final void addTerm(final int termID) {
assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start");
FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0;
assert omitTermFreqAndPositions || p.docFreq > 0;
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
assert omitTermFreqAndPositions || postings.docFreqs[termID] > 0;
if (omitTermFreqAndPositions) {
if (docState.docID != p.lastDocID) {
assert docState.docID > p.lastDocID;
termsHashPerField.writeVInt(0, p.lastDocCode);
p.lastDocCode = docState.docID - p.lastDocID;
p.lastDocID = docState.docID;
if (docState.docID != postings.lastDocIDs[termID]) {
assert docState.docID > postings.lastDocIDs[termID];
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
postings.lastDocIDs[termID] = docState.docID;
}
} else {
if (docState.docID != p.lastDocID) {
assert docState.docID > p.lastDocID;
if (docState.docID != postings.lastDocIDs[termID]) {
assert docState.docID > postings.lastDocIDs[termID];
// Term not yet seen in the current doc but previously
// seen in other doc(s) since the last flush
// Now that we know doc freq for previous doc,
// write it & lastDocCode
if (1 == p.docFreq)
termsHashPerField.writeVInt(0, p.lastDocCode|1);
if (1 == postings.docFreqs[termID])
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
else {
termsHashPerField.writeVInt(0, p.lastDocCode);
termsHashPerField.writeVInt(0, p.docFreq);
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
}
p.docFreq = 1;
p.lastDocCode = (docState.docID - p.lastDocID) << 1;
p.lastDocID = docState.docID;
writeProx(p, fieldState.position);
postings.docFreqs[termID] = 1;
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;
writeProx(termID, fieldState.position);
} else {
p.docFreq++;
writeProx(p, fieldState.position-p.lastPosition);
postings.docFreqs[termID]++;
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
}
}
}
@Override
ParallelPostingsArray createPostingsArray(int size) {
return new FreqProxPostingsArray(size);
}
static final class FreqProxPostingsArray extends ParallelPostingsArray {
public FreqProxPostingsArray(int size) {
super(size);
docFreqs = new int[size];
lastDocIDs = new int[size];
lastDocCodes = new int[size];
lastPositions = new int[size];
}
int docFreqs[]; // # times this term occurs in the current doc
int lastDocIDs[]; // Last docID where this term occurred
int lastDocCodes[]; // Code for prior doc
int lastPositions[]; // Last position where this term occurred
@Override
ParallelPostingsArray resize(int newSize) {
FreqProxPostingsArray newArray = new FreqProxPostingsArray(newSize);
copy(this, newArray);
return newArray;
}
void copy(FreqProxPostingsArray fromArray, FreqProxPostingsArray toArray) {
super.copy(fromArray, toArray);
System.arraycopy(fromArray.docFreqs, 0, toArray.docFreqs, 0, fromArray.docFreqs.length);
System.arraycopy(fromArray.lastDocIDs, 0, toArray.lastDocIDs, 0, fromArray.lastDocIDs.length);
System.arraycopy(fromArray.lastDocCodes, 0, toArray.lastDocCodes, 0, fromArray.lastDocCodes.length);
System.arraycopy(fromArray.lastPositions, 0, toArray.lastPositions, 0, fromArray.lastPositions.length);
}
}
@Override
int bytesPerPosting() {
return ParallelPostingsArray.BYTES_PER_POSTING + 4 * DocumentsWriter.INT_NUM_BYTE;
}
public void abort() {}
}
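The lastDocCodes/docFreqs bookkeeping above carries over the existing freq encoding for the non-omitTermFreqAndPositions path: when a term reappears in a new document, the delta to the previous docID is shifted left one bit, and the low bit is set when the previous document's frequency was exactly 1 so the freq VInt can be skipped. A self-contained illustration with made-up documents; the list stands in for the real byte-slice writer:

import java.util.ArrayList;
import java.util.List;

// Illustration of the doc-delta encoding kept in lastDocCodes/docFreqs.
class DocCodeDemo {
  public static void main(String[] args) {
    List<Integer> stream = new ArrayList<>();
    int lastDocID = 0, lastDocCode = 0, docFreq = 0;

    int[][] occurrences = { {5, 1}, {9, 3}, {12, 1} };  // {docID, freq in that doc}
    for (int[] occ : occurrences) {
      int docID = occ[0], freq = occ[1];
      if (docFreq > 0) {                    // flush the previous doc, as addTerm does
        if (docFreq == 1) stream.add(lastDocCode | 1);
        else { stream.add(lastDocCode); stream.add(docFreq); }
      }
      lastDocCode = (docID - lastDocID) << 1;
      lastDocID = docID;
      docFreq = freq;
    }
    // doc 5 (freq 1) -> code (5-0)<<1 | 1 = 11; doc 9 (freq 3) -> 8 then 3;
    // the last doc (12) stays buffered in lastDocCode/docFreq until flush time.
    System.out.println(stream);             // [11, 8, 3]
  }
}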

View File

@ -0,0 +1,45 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
class ParallelPostingsArray {
final static int BYTES_PER_POSTING = 3 * DocumentsWriter.INT_NUM_BYTE;
final int[] textStarts;
final int[] intStarts;
final int[] byteStarts;
public ParallelPostingsArray(final int size) {
textStarts = new int[size];
intStarts = new int[size];
byteStarts = new int[size];
}
ParallelPostingsArray resize(int newSize) {
ParallelPostingsArray newArray = new ParallelPostingsArray(newSize);
copy(this, newArray);
return newArray;
}
void copy(ParallelPostingsArray fromArray, ParallelPostingsArray toArray) {
System.arraycopy(fromArray.textStarts, 0, toArray.textStarts, 0, fromArray.textStarts.length);
System.arraycopy(fromArray.intStarts, 0, toArray.intStarts, 0, fromArray.intStarts.length);
System.arraycopy(fromArray.byteStarts, 0, toArray.byteStarts, 0, fromArray.byteStarts.length);
}
}
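The new ParallelPostingsArray above is deliberately small: three int columns (textStarts, intStarts, byteStarts) indexed by termID, which is where BYTES_PER_POSTING = 3 ints comes from. Consumers grow it by allocating a bigger instance and copying, as growParallelPostingsArray() later does at roughly 1.5x. A simplified stand-in showing that pattern, not the Lucene class itself:

// Resize-by-copy pattern mirroring ParallelPostingsArray.resize()/copy().
class IntColumns {
  int[] textStarts, intStarts, byteStarts;

  IntColumns(int size) {
    textStarts = new int[size];
    intStarts  = new int[size];
    byteStarts = new int[size];
  }

  IntColumns resize(int newSize) {
    IntColumns bigger = new IntColumns(newSize);
    System.arraycopy(textStarts, 0, bigger.textStarts, 0, textStarts.length);
    System.arraycopy(intStarts,  0, bigger.intStarts,  0, intStarts.length);
    System.arraycopy(byteStarts, 0, bigger.byteStarts, 0, byteStarts.length);
    return bigger;
  }

  public static void main(String[] args) {
    IntColumns columns = new IntColumns(4);
    // Grow by ~1.5x once termIDs outrun the current capacity.
    columns = columns.resize((int) (columns.textStarts.length * 1.5));
    System.out.println(columns.textStarts.length);  // 6
  }
}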

View File

@ -1,36 +0,0 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** This is the base class for an in-memory posting list,
* keyed by a Token. {@link TermsHash} maintains a hash
* table holding one instance of this per unique Token.
* Consumers of TermsHash ({@link TermsHashConsumer}) must
* subclass this class with its own concrete class.
* FreqProxTermsWriter.PostingList is a private inner class used
* for the freq/prox postings, and
* TermVectorsTermsWriter.PostingList is a private inner class
* used to hold TermVectors postings. */
abstract class RawPostingList {
final static int BYTES_SIZE = DocumentsWriter.OBJECT_HEADER_BYTES + 3*DocumentsWriter.INT_NUM_BYTE;
int textStart;
int intStart;
int byteStart;
}

View File

@ -47,13 +47,6 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
return new TermVectorsTermsWriterPerThread(termsHashPerThread, this);
}
@Override
void createPostings(RawPostingList[] postings, int start, int count) {
final int end = start + count;
for(int i=start;i<end;i++)
postings[i] = new PostingList();
}
@Override
synchronized void flush(Map<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> threadsAndFields, final SegmentWriteState state) throws IOException {
@ -290,15 +283,4 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
finishDocument(this);
}
}
static final class PostingList extends RawPostingList {
int freq; // How many times this term occurred in the current doc
int lastOffset; // Last offset we saw
int lastPosition; // Last position where this term occurred
}
@Override
int bytesPerPosting() {
return RawPostingList.BYTES_SIZE + 3 * DocumentsWriter.INT_NUM_BYTE;
}
}

View File

@ -124,8 +124,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
assert perThread.vectorFieldsInOrder(fieldInfo);
perThread.doc.addField(termsHashPerField.fieldInfo.number);
TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
final RawPostingList[] postings = termsHashPerField.sortPostings();
final int[] termIDs = termsHashPerField.sortPostings();
tvf.writeVInt(numPostings);
byte bits = 0x0;
@ -141,11 +142,11 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
final ByteSliceReader reader = perThread.vectorSliceReader;
final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
for(int j=0;j<numPostings;j++) {
final TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
final int freq = posting.freq;
final int termID = termIDs[j];
final int freq = postings.freqs[termID];
final char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
final char[] text2 = charBuffers[postings.textStarts[termID] >> DocumentsWriter.CHAR_BLOCK_SHIFT];
final int start2 = postings.textStarts[termID] & DocumentsWriter.CHAR_BLOCK_MASK;
// We swap between two encoders to save copying
// last Term's byte array
@ -178,12 +179,12 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
tvf.writeVInt(freq);
if (doVectorPositions) {
termsHashPerField.initReader(reader, posting, 0);
termsHashPerField.initReader(reader, termID, 0);
reader.writeTo(tvf);
}
if (doVectorOffsets) {
termsHashPerField.initReader(reader, posting, 1);
termsHashPerField.initReader(reader, termID, 1);
reader.writeTo(tvf);
}
}
@ -207,13 +208,13 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
}
@Override
void newTerm(RawPostingList p0) {
void newTerm(final int termID) {
assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
p.freq = 1;
postings.freqs[termID] = 1;
if (doVectorOffsets) {
int startOffset = fieldState.offset + offsetAttribute.startOffset();
@ -221,38 +222,76 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
termsHashPerField.writeVInt(1, startOffset);
termsHashPerField.writeVInt(1, endOffset - startOffset);
p.lastOffset = endOffset;
postings.lastOffsets[termID] = endOffset;
}
if (doVectorPositions) {
termsHashPerField.writeVInt(0, fieldState.position);
p.lastPosition = fieldState.position;
postings.lastPositions[termID] = fieldState.position;
}
}
@Override
void addTerm(RawPostingList p0) {
void addTerm(final int termID) {
assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");
TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
p.freq++;
TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
postings.freqs[termID]++;
if (doVectorOffsets) {
int startOffset = fieldState.offset + offsetAttribute.startOffset();
int endOffset = fieldState.offset + offsetAttribute.endOffset();
termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
termsHashPerField.writeVInt(1, endOffset - startOffset);
p.lastOffset = endOffset;
postings.lastOffsets[termID] = endOffset;
}
if (doVectorPositions) {
termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition);
p.lastPosition = fieldState.position;
termsHashPerField.writeVInt(0, fieldState.position - postings.lastPositions[termID]);
postings.lastPositions[termID] = fieldState.position;
}
}
@Override
void skippingLongTerm() {}
@Override
ParallelPostingsArray createPostingsArray(int size) {
return new TermVectorsPostingsArray(size);
}
static final class TermVectorsPostingsArray extends ParallelPostingsArray {
public TermVectorsPostingsArray(int size) {
super(size);
freqs = new int[size];
lastOffsets = new int[size];
lastPositions = new int[size];
}
int[] freqs; // How many times this term occurred in the current doc
int[] lastOffsets; // Last offset we saw
int[] lastPositions; // Last position where this term occurred
@Override
ParallelPostingsArray resize(int newSize) {
TermVectorsPostingsArray newArray = new TermVectorsPostingsArray(newSize);
copy(this, newArray);
return newArray;
}
void copy(TermVectorsPostingsArray fromArray, TermVectorsPostingsArray toArray) {
super.copy(fromArray, toArray);
System.arraycopy(fromArray.freqs, 0, toArray.freqs, 0, fromArray.freqs.length);
System.arraycopy(fromArray.lastOffsets, 0, toArray.lastOffsets, 0, fromArray.lastOffsets.length);
System.arraycopy(fromArray.lastPositions, 0, toArray.lastPositions, 0, fromArray.lastPositions.length);
}
}
@Override
int bytesPerPosting() {
return ParallelPostingsArray.BYTES_PER_POSTING + 3 * DocumentsWriter.INT_NUM_BYTE;
}
}
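The offset deltas written by newTerm()/addTerm() above are unchanged by this commit, only re-homed into lastOffsets[termID]. A tiny worked example with hypothetical offsets, the same term at [0,5) and then [20,25) in one document:

// Worked example of the term-vector offset deltas; offsets are made up.
class OffsetDeltaDemo {
  public static void main(String[] args) {
    int[][] occurrences = { {0, 5}, {20, 25} };   // {startOffset, endOffset}
    int lastOffset = 0;
    StringBuilder stream = new StringBuilder();
    for (int i = 0; i < occurrences.length; i++) {
      int start = occurrences[i][0], end = occurrences[i][1];
      int startDelta = (i == 0) ? start : start - lastOffset;  // newTerm vs addTerm
      stream.append(startDelta).append(' ').append(end - start).append(' ');
      lastOffset = end;                                        // lastOffsets[termID]
    }
    System.out.println(stream.toString().trim());              // "0 5 15 5"
  }
}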

View File

@ -17,16 +17,12 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.util.Collection;
import java.util.Map;
import java.util.HashMap;
import java.util.Iterator;
import java.util.HashSet;
import java.util.Arrays;
import java.io.IOException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
/** This class implements {@link InvertedDocConsumer}, which
* is passed each token produced by the analyzer on each
@ -40,13 +36,8 @@ final class TermsHash extends InvertedDocConsumer {
final TermsHashConsumer consumer;
final TermsHash nextTermsHash;
final int bytesPerPosting;
final int postingsFreeChunk;
final DocumentsWriter docWriter;
private RawPostingList[] postingsFreeList = new RawPostingList[1];
private int postingsFreeCount;
private int postingsAllocCount;
boolean trackAllocations;
public TermsHash(final DocumentsWriter docWriter, boolean trackAllocations, final TermsHashConsumer consumer, final TermsHash nextTermsHash) {
@ -54,14 +45,6 @@ final class TermsHash extends InvertedDocConsumer {
this.consumer = consumer;
this.nextTermsHash = nextTermsHash;
this.trackAllocations = trackAllocations;
// Why + 4*POINTER_NUM_BYTE below?
// +1: Posting is referenced by postingsFreeList array
// +3: Posting is referenced by hash, which
// targets 25-50% fill factor; approximate this
// as 3X # pointers
bytesPerPosting = consumer.bytesPerPosting() + 4*DocumentsWriter.POINTER_NUM_BYTE;
postingsFreeChunk = (DocumentsWriter.BYTE_BLOCK_SIZE / bytesPerPosting);
}
@Override
@ -86,18 +69,6 @@ final class TermsHash extends InvertedDocConsumer {
nextTermsHash.abort();
}
void shrinkFreePostings(Map<InvertedDocConsumerPerThread,Collection<InvertedDocConsumerPerField>> threadsAndFields, SegmentWriteState state) {
assert postingsFreeCount == postingsAllocCount: Thread.currentThread().getName() + ": postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount + " consumer=" + consumer;
final int newSize = ArrayUtil.getShrinkSize(postingsFreeList.length, postingsAllocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
if (newSize != postingsFreeList.length) {
RawPostingList[] newArray = new RawPostingList[newSize];
System.arraycopy(postingsFreeList, 0, newArray, 0, postingsFreeCount);
postingsFreeList = newArray;
}
}
@Override
synchronized void closeDocStore(SegmentWriteState state) throws IOException {
consumer.closeDocStore(state);
@ -144,91 +115,12 @@ final class TermsHash extends InvertedDocConsumer {
consumer.flush(childThreadsAndFields, state);
shrinkFreePostings(threadsAndFields, state);
if (nextTermsHash != null)
nextTermsHash.flush(nextThreadsAndFields, state);
}
@Override
synchronized public boolean freeRAM() {
if (!trackAllocations)
return false;
boolean any;
final int numToFree;
if (postingsFreeCount >= postingsFreeChunk)
numToFree = postingsFreeChunk;
else
numToFree = postingsFreeCount;
any = numToFree > 0;
if (any) {
Arrays.fill(postingsFreeList, postingsFreeCount-numToFree, postingsFreeCount, null);
postingsFreeCount -= numToFree;
postingsAllocCount -= numToFree;
docWriter.bytesAllocated(-numToFree * bytesPerPosting);
any = true;
}
if (nextTermsHash != null)
any |= nextTermsHash.freeRAM();
return any;
}
synchronized public void recyclePostings(final RawPostingList[] postings, final int numPostings) {
assert postings.length >= numPostings;
// Move all Postings from this ThreadState back to our
// free list. We pre-allocated this array while we were
// creating Postings to make sure it's large enough
assert postingsFreeCount + numPostings <= postingsFreeList.length;
System.arraycopy(postings, 0, postingsFreeList, postingsFreeCount, numPostings);
postingsFreeCount += numPostings;
}
synchronized public void getPostings(final RawPostingList[] postings) {
assert docWriter.writer.testPoint("TermsHash.getPostings start");
assert postingsFreeCount <= postingsFreeList.length;
assert postingsFreeCount <= postingsAllocCount: "postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount;
final int numToCopy;
if (postingsFreeCount < postings.length)
numToCopy = postingsFreeCount;
else
numToCopy = postings.length;
final int start = postingsFreeCount-numToCopy;
assert start >= 0;
assert start + numToCopy <= postingsFreeList.length;
assert numToCopy <= postings.length;
System.arraycopy(postingsFreeList, start,
postings, 0, numToCopy);
// Directly allocate the remainder if any
if (numToCopy != postings.length) {
final int extra = postings.length - numToCopy;
final int newPostingsAllocCount = postingsAllocCount + extra;
consumer.createPostings(postings, numToCopy, extra);
assert docWriter.writer.testPoint("TermsHash.getPostings after create");
postingsAllocCount += extra;
if (trackAllocations)
docWriter.bytesAllocated(extra * bytesPerPosting);
if (newPostingsAllocCount > postingsFreeList.length)
// Pre-allocate the postingsFreeList so it's large
// enough to hold all postings we've given out
postingsFreeList = new RawPostingList[ArrayUtil.oversize(newPostingsAllocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
postingsFreeCount -= numToCopy;
if (trackAllocations)
docWriter.bytesUsed(postings.length * bytesPerPosting);
return false;
}
}
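With the free list, recycling, and chunked getPostings() removed above, RAM accounting moves down into TermsHashPerField and is charged per termID slot instead of per pooled object. A back-of-the-envelope comparison of the two estimates for the freq/prox consumer, following the bytesPerPosting() formulas in this diff; the object-header and pointer sizes below are assumptions, since DocumentsWriter's constants are not shown here:

// Rough per-term RAM estimate, old vs. new, for the freq/prox consumer.
// INT/POINTER/OBJECT_HEADER values are assumed (32-bit-style sizes).
class PostingBytesEstimate {
  static final int INT = 4, POINTER = 4, OBJECT_HEADER = 8;

  public static void main(String[] args) {
    // Old: a RawPostingList object (header + textStart/intStart/byteStart),
    // four consumer ints, and ~4 pointers for the hash and free-list slots.
    int oldBytes = (OBJECT_HEADER + 3 * INT) + 4 * INT + 4 * POINTER;
    // New: three base ints + four consumer ints per termID slot,
    // plus ~3 ints for the termID's slot in the 25-50% full hash table.
    int newBytes = 3 * INT + 4 * INT + 3 * INT;
    System.out.println("old ~" + oldBytes + " bytes, new ~" + newBytes + " bytes per term");
  }
}

Under those assumptions the estimate drops from about 52 to 40 bytes per unique term, and, more importantly, none of it is object allocation the garbage collector has to trace.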

View File

@ -22,8 +22,6 @@ import java.util.Collection;
import java.util.Map;
abstract class TermsHashConsumer {
abstract int bytesPerPosting();
abstract void createPostings(RawPostingList[] postings, int start, int count);
abstract TermsHashConsumerPerThread addThread(TermsHashPerThread perThread);
abstract void flush(Map<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> threadsAndFields, final SegmentWriteState state) throws IOException;
abstract void abort();

View File

@ -31,7 +31,11 @@ abstract class TermsHashConsumerPerField {
abstract void finish() throws IOException;
abstract void skippingLongTerm() throws IOException;
abstract void start(Fieldable field);
abstract void newTerm(RawPostingList p) throws IOException;
abstract void addTerm(RawPostingList p) throws IOException;
abstract void newTerm(int termID) throws IOException;
abstract void addTerm(int termID) throws IOException;
abstract int getStreamCount();
abstract ParallelPostingsArray createPostingsArray(int size);
abstract int bytesPerPosting();
}

View File

@ -27,6 +27,7 @@ import org.apache.lucene.util.UnicodeUtil;
final class TermsHashPerField extends InvertedDocConsumerPerField {
final TermsHashConsumerPerField consumer;
final TermsHashPerField nextPerField;
final TermsHashPerThread perThread;
final DocumentsWriter.DocState docState;
@ -48,8 +49,11 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
private int postingsHashSize = 4;
private int postingsHashHalfSize = postingsHashSize/2;
private int postingsHashMask = postingsHashSize-1;
private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize];
private RawPostingList p;
private int[] postingsHash;
ParallelPostingsArray postingsArray;
private final int bytesPerPosting;
public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
this.perThread = perThread;
@ -57,6 +61,8 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
charPool = perThread.charPool;
bytePool = perThread.bytePool;
docState = perThread.docState;
postingsHash = new int[postingsHashSize];
Arrays.fill(postingsHash, -1);
fieldState = docInverterPerField.fieldState;
this.consumer = perThread.consumer.addField(this, fieldInfo);
streamCount = consumer.getStreamCount();
@ -66,6 +72,21 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
else
nextPerField = null;
// +3: Posting is referenced by hash, which
// targets 25-50% fill factor; approximate this
// as 3X # pointers
bytesPerPosting = consumer.bytesPerPosting() + 3*DocumentsWriter.INT_NUM_BYTE;
}
void initPostingsArray() {
assert postingsArray == null;
postingsArray = consumer.createPostingsArray(postingsHashSize);
if (perThread.termsHash.trackAllocations) {
perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * postingsHashSize);
}
}
void shrinkHash(int targetSize) {
@ -79,7 +100,9 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
}
if (newSize != postingsHash.length) {
postingsHash = new RawPostingList[newSize];
postingsHash = new int[newSize];
Arrays.fill(postingsHash, -1);
postingsArray = null;
postingsHashSize = newSize;
postingsHashHalfSize = newSize/2;
postingsHashMask = newSize-1;
@ -91,8 +114,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
compactPostings();
assert numPostings <= postingsHash.length;
if (numPostings > 0) {
perThread.termsHash.recyclePostings(postingsHash, numPostings);
Arrays.fill(postingsHash, 0, numPostings, null);
Arrays.fill(postingsHash, 0, numPostings, -1);
numPostings = 0;
}
postingsCompacted = false;
@ -106,23 +128,34 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
if (nextPerField != null)
nextPerField.abort();
}
private void growParallelPostingsArray() {
int oldSize = postingsArray.byteStarts.length;
int newSize = (int) (oldSize * 1.5);
this.postingsArray = this.postingsArray.resize(newSize);
if (perThread.termsHash.trackAllocations) {
perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * (newSize - oldSize));
}
}
public void initReader(ByteSliceReader reader, RawPostingList p, int stream) {
public void initReader(ByteSliceReader reader, int termID, int stream) {
assert stream < streamCount;
final int[] ints = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
final int upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
int intStart = postingsArray.intStarts[termID];
final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
reader.init(bytePool,
p.byteStart+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
ints[upto+stream]);
}
private synchronized void compactPostings() {
int upto = 0;
for(int i=0;i<postingsHashSize;i++) {
if (postingsHash[i] != null) {
if (postingsHash[i] != -1) {
if (upto < i) {
postingsHash[upto] = postingsHash[i];
postingsHash[i] = null;
postingsHash[i] = -1;
}
upto++;
}
@ -133,41 +166,41 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
}
/** Collapse the hash table & sort in-place. */
public RawPostingList[] sortPostings() {
public int[] sortPostings() {
compactPostings();
quickSort(postingsHash, 0, numPostings-1);
return postingsHash;
}
void quickSort(RawPostingList[] postings, int lo, int hi) {
void quickSort(int[] termIDs, int lo, int hi) {
if (lo >= hi)
return;
else if (hi == 1+lo) {
if (comparePostings(postings[lo], postings[hi]) > 0) {
final RawPostingList tmp = postings[lo];
postings[lo] = postings[hi];
postings[hi] = tmp;
if (comparePostings(termIDs[lo], termIDs[hi]) > 0) {
final int tmp = termIDs[lo];
termIDs[lo] = termIDs[hi];
termIDs[hi] = tmp;
}
return;
}
int mid = (lo + hi) >>> 1;
if (comparePostings(postings[lo], postings[mid]) > 0) {
RawPostingList tmp = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp;
if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
int tmp = termIDs[lo];
termIDs[lo] = termIDs[mid];
termIDs[mid] = tmp;
}
if (comparePostings(postings[mid], postings[hi]) > 0) {
RawPostingList tmp = postings[mid];
postings[mid] = postings[hi];
postings[hi] = tmp;
if (comparePostings(termIDs[mid], termIDs[hi]) > 0) {
int tmp = termIDs[mid];
termIDs[mid] = termIDs[hi];
termIDs[hi] = tmp;
if (comparePostings(postings[lo], postings[mid]) > 0) {
RawPostingList tmp2 = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp2;
if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
int tmp2 = termIDs[lo];
termIDs[lo] = termIDs[mid];
termIDs[mid] = tmp2;
}
}
@ -177,40 +210,43 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
if (left >= right)
return;
RawPostingList partition = postings[mid];
int partition = termIDs[mid];
for (; ;) {
while (comparePostings(postings[right], partition) > 0)
while (comparePostings(termIDs[right], partition) > 0)
--right;
while (left < right && comparePostings(postings[left], partition) <= 0)
while (left < right && comparePostings(termIDs[left], partition) <= 0)
++left;
if (left < right) {
RawPostingList tmp = postings[left];
postings[left] = postings[right];
postings[right] = tmp;
int tmp = termIDs[left];
termIDs[left] = termIDs[right];
termIDs[right] = tmp;
--right;
} else {
break;
}
}
quickSort(postings, lo, left);
quickSort(postings, left + 1, hi);
quickSort(termIDs, lo, left);
quickSort(termIDs, left + 1, hi);
}
/** Compares term text for two Posting instance and
* returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
int comparePostings(RawPostingList p1, RawPostingList p2) {
int comparePostings(int term1, int term2) {
if (p1 == p2)
if (term1 == term2)
return 0;
final char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
final int textStart1 = postingsArray.textStarts[term1];
final int textStart2 = postingsArray.textStarts[term2];
final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
assert text1 != text2 || pos1 != pos2;
@ -233,11 +269,12 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
/** Test whether the text for current RawPostingList p equals
* current tokenText. */
private boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
final char[] text = perThread.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
final int textStart = postingsArray.textStarts[termID];
final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
assert text != null;
int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
int tokenPos = 0;
for(;tokenPos<tokenTextLen;pos++,tokenPos++)
@ -251,6 +288,9 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
@Override
void start(Fieldable f) {
if (postingsArray == null) {
initPostingsArray();
}
termAtt = fieldState.attributeSource.addAttribute(TermAttribute.class);
consumer.start(f);
if (nextPerField != null) {
@ -270,7 +310,6 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
// because token text has already been "interned" into
// textStart, so we hash by textStart
public void add(int textStart) throws IOException {
int code = textStart;
int hashPos = code & postingsHashMask;
@ -278,37 +317,39 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
assert !postingsCompacted;
// Locate RawPostingList in hash
p = postingsHash[hashPos];
int termID = postingsHash[hashPos];
if (p != null && p.textStart != textStart) {
if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = ((code>>8)+code)|1;
do {
code += inc;
hashPos = code & postingsHashMask;
p = postingsHash[hashPos];
} while (p != null && p.textStart != textStart);
termID = postingsHash[hashPos];
} while (termID != -1 && postingsArray.textStarts[termID] != textStart);
}
if (p == null) {
if (termID == -1) {
// First time we are seeing this token since we last
// flushed the hash.
// Refill?
if (0 == perThread.freePostingsCount)
perThread.morePostings();
// New posting
termID = numPostings++;
if (termID >= postingsArray.textStarts.length) {
growParallelPostingsArray();
}
if (perThread.termsHash.trackAllocations) {
perThread.termsHash.docWriter.bytesUsed(bytesPerPosting);
}
// Pull next free RawPostingList from free list
p = perThread.freePostings[--perThread.freePostingsCount];
assert p != null;
assert termID >= 0;
p.textStart = textStart;
postingsArray.textStarts[termID] = textStart;
assert postingsHash[hashPos] == null;
postingsHash[hashPos] = p;
numPostings++;
assert postingsHash[hashPos] == -1;
postingsHash[hashPos] = termID;
if (numPostings == postingsHashHalfSize)
rehashPostings(2*postingsHashSize);
@ -324,20 +365,21 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
intUptoStart = intPool.intUpto;
intPool.intUpto += streamCount;
p.intStart = intUptoStart + intPool.intOffset;
postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
for(int i=0;i<streamCount;i++) {
final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
}
p.byteStart = intUptos[intUptoStart];
postingsArray.byteStarts[termID] = intUptos[intUptoStart];
consumer.newTerm(p);
consumer.newTerm(termID);
} else {
intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
consumer.addTerm(p);
int intStart = postingsArray.intStarts[termID];
intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
consumer.addTerm(termID);
}
}
@ -389,20 +431,20 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
int hashPos = code & postingsHashMask;
// Locate RawPostingList in hash
p = postingsHash[hashPos];
int termID = postingsHash[hashPos];
if (p != null && !postingEquals(tokenText, tokenTextLen)) {
if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = ((code>>8)+code)|1;
do {
code += inc;
hashPos = code & postingsHashMask;
p = postingsHash[hashPos];
} while (p != null && !postingEquals(tokenText, tokenTextLen));
termID = postingsHash[hashPos];
} while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
}
if (p == null) {
if (termID == -1) {
// First time we are seeing this token since we last
// flushed the hash.
@ -424,24 +466,26 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
charPool.nextBuffer();
}
// Refill?
if (0 == perThread.freePostingsCount)
perThread.morePostings();
// New posting
termID = numPostings++;
if (termID >= postingsArray.textStarts.length) {
growParallelPostingsArray();
}
if (perThread.termsHash.trackAllocations) {
perThread.termsHash.docWriter.bytesUsed(bytesPerPosting);
}
// Pull next free RawPostingList from free list
p = perThread.freePostings[--perThread.freePostingsCount];
assert p != null;
assert termID != -1;
final char[] text = charPool.buffer;
final int textUpto = charPool.charUpto;
p.textStart = textUpto + charPool.charOffset;
postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
charPool.charUpto += textLen1;
System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
text[textUpto+tokenTextLen] = 0xffff;
assert postingsHash[hashPos] == null;
postingsHash[hashPos] = p;
numPostings++;
assert postingsHash[hashPos] == -1;
postingsHash[hashPos] = termID;
if (numPostings == postingsHashHalfSize)
rehashPostings(2*postingsHashSize);
@ -457,24 +501,25 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
intUptoStart = intPool.intUpto;
intPool.intUpto += streamCount;
p.intStart = intUptoStart + intPool.intOffset;
postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
for(int i=0;i<streamCount;i++) {
final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
}
p.byteStart = intUptos[intUptoStart];
postingsArray.byteStarts[termID] = intUptos[intUptoStart];
consumer.newTerm(p);
consumer.newTerm(termID);
} else {
intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
consumer.addTerm(p);
final int intStart = postingsArray.intStarts[termID];
intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
consumer.addTerm(termID);
}
if (doNextCall)
nextPerField.add(p.textStart);
nextPerField.add(postingsArray.textStarts[termID]);
}
int[] intUptos;
@ -524,14 +569,16 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
final int newMask = newSize-1;
RawPostingList[] newHash = new RawPostingList[newSize];
int[] newHash = new int[newSize];
Arrays.fill(newHash, -1);
for(int i=0;i<postingsHashSize;i++) {
RawPostingList p0 = postingsHash[i];
if (p0 != null) {
int termID = postingsHash[i];
if (termID != -1) {
int code;
if (perThread.primary) {
final int start = p0.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
final char[] text = charPool.buffers[p0.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
final int textStart = postingsArray.textStarts[termID];
final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
int pos = start;
while(text[pos] != 0xffff)
pos++;
@ -539,18 +586,18 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
while (pos > start)
code = (code*31) + text[--pos];
} else
code = p0.textStart;
code = postingsArray.textStarts[termID];
int hashPos = code & newMask;
assert hashPos >= 0;
if (newHash[hashPos] != null) {
if (newHash[hashPos] != -1) {
final int inc = ((code>>8)+code)|1;
do {
code += inc;
hashPos = code & newMask;
} while (newHash[hashPos] != null);
} while (newHash[hashPos] != -1);
}
newHash[hashPos] = p0;
newHash[hashPos] = termID;
}
}
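The hash changes above turn postingsHash into an int[] of termIDs with -1 as the empty sentinel, probed with the same ((code>>8)+code)|1 increment as before. A compact sketch of that open-addressing scheme; the real code decides whether a slot matches by comparing term text via textStarts, which this sketch abbreviates to comparing termIDs:

// Open-addressing sketch: int slots, -1 means empty, odd re-probe increment.
class TermIdHashSketch {
  final int[] slots;
  final int mask;

  TermIdHashSketch(int sizePowerOfTwo) {       // size must be a power of two
    slots = new int[sizePowerOfTwo];
    java.util.Arrays.fill(slots, -1);
    mask = sizePowerOfTwo - 1;
  }

  /** Returns the slot holding termID, probing past occupied, non-matching slots. */
  int findSlot(int code, int termID) {
    int hashPos = code & mask;
    if (slots[hashPos] != -1 && slots[hashPos] != termID) {
      final int inc = ((code >> 8) + code) | 1;  // odd, so it visits every slot
      do {
        code += inc;
        hashPos = code & mask;
      } while (slots[hashPos] != -1 && slots[hashPos] != termID);
    }
    return hashPos;
  }

  void insert(int code, int termID) {
    slots[findSlot(code, termID)] = termID;
  }
}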

View File

@ -31,9 +31,6 @@ final class TermsHashPerThread extends InvertedDocConsumerPerThread {
final boolean primary;
final DocumentsWriter.DocState docState;
final RawPostingList freePostings[] = new RawPostingList[256];
int freePostingsCount;
public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) {
docState = docInverterPerThread.docState;
@ -71,20 +68,6 @@ final class TermsHashPerThread extends InvertedDocConsumerPerThread {
nextPerThread.abort();
}
// perField calls this when it needs more postings:
void morePostings() throws IOException {
assert freePostingsCount == 0;
termsHash.getPostings(freePostings);
freePostingsCount = freePostings.length;
assert noNullPostings(freePostings, freePostingsCount, "consumer=" + consumer);
}
private static boolean noNullPostings(RawPostingList[] postings, int count, String details) {
for(int i=0;i<count;i++)
assert postings[i] != null: "postings[" + i + "] of " + count + " is null: " + details;
return true;
}
@Override
public void startDocument() throws IOException {
consumer.startDocument();
@ -116,10 +99,5 @@ final class TermsHashPerThread extends InvertedDocConsumerPerThread {
if (primary)
charPool.reset();
if (recyclePostings) {
termsHash.recyclePostings(freePostings, freePostingsCount);
freePostingsCount = 0;
}
}
}
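With no PostingList objects left to pool, TermsHashPerThread loses morePostings() and the recycling path entirely: allocating a posting is now just numPostings++ plus occasional array growth, and a flush simply clears the int hash. A rough illustrative contrast in Java syntax, not the removed or added methods verbatim:

// Illustrative only: recycling pooled objects vs. resetting counters and the int hash.
class ResetSketch {
  int numPostings = 0;
  int[] postingsHash = new int[8];

  // Old path (removed above): hand PostingList objects back to a shared pool:
  //   termsHash.recyclePostings(freePostings, freePostingsCount);
  // New path: nothing to recycle; clear the termID slots and reuse the arrays.
  void reset() {
    java.util.Arrays.fill(postingsHash, 0, numPostings, -1);
    numPostings = 0;
  }
}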