LUCENE-2329: Use parallel arrays instead of PostingList objects in TermsHash*

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@926791 13f79535-47bb-0310-9956-ffa450edef68
Michael Busch 2010-03-23 21:25:15 +00:00
parent 32a370e127
commit f6126f8808
13 changed files with 368 additions and 378 deletions
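The gist of the change (see also the CHANGES.txt entry in the first file below): instead of allocating one small PostingList object per unique term, the in-memory terms hash now stores an int termID per term, and the per-term fields live in parallel int arrays indexed by that termID. The following is a minimal, standalone Java sketch of the two layouts; the class and field names mirror the patch, but it is a simplified illustration rather than the committed code.

// Simplified illustration of the before/after memory layout; not the actual patch.

// Before: one heap object per unique term, each tracked individually by the GC.
class PostingListSketch {
  int textStart;   // where the term's text starts in the shared char pool
  int intStart;    // where its per-stream int pointers start
  int byteStart;   // where its byte slices start
}

// After: one int termID per unique term, indexing into shared parallel arrays.
class ParallelPostingsArraySketch {
  final int[] textStarts;
  final int[] intStarts;
  final int[] byteStarts;

  ParallelPostingsArraySketch(int size) {
    textStarts = new int[size];
    intStarts = new int[size];
    byteStarts = new int[size];
  }

  // Growing copies plain ints instead of re-linking objects, so the garbage
  // collector only ever sees a handful of arrays, not millions of tiny objects.
  ParallelPostingsArraySketch resize(int newSize) {
    ParallelPostingsArraySketch grown = new ParallelPostingsArraySketch(newSize);
    System.arraycopy(textStarts, 0, grown.textStarts, 0, textStarts.length);
    System.arraycopy(intStarts, 0, grown.intStarts, 0, intStarts.length);
    System.arraycopy(byteStarts, 0, grown.byteStarts, 0, byteStarts.length);
    return grown;
  }
}

With the parallel-array layout, growing the per-term state is a primitive array copy, which is where the garbage-collection and indexing-speed win described in the CHANGES.txt entry comes from.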

CHANGES.txt

@@ -276,6 +276,15 @@ Optimizations
   TermAttributeImpl, move DEFAULT_TYPE constant to TypeInterface, improve
   null-handling for TypeAttribute. (Uwe Schindler)
 
+* LUCENE-2329: Switch TermsHash* from using a PostingList object per unique
+  term to parallel arrays, indexed by termID. This reduces garbage collection
+  overhead significantly, which results in great indexing performance wins
+  when the available JVM heap space is low. This will become even more
+  important when the DocumentsWriter RAM buffer is searchable in the future,
+  because then it will make sense to make the RAM buffers as large as
+  possible. (Mike McCandless, Michael Busch)
+
 Build
 
 * LUCENE-2124: Moved the JDK-based collation support from contrib/collation

FreqProxFieldMergeState.java

@@ -19,6 +19,8 @@ package org.apache.lucene.index;
 
 import java.io.IOException;
 
+import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray;
+
 // TODO FI: some of this is "generic" to TermsHash* so we
 // should factor it out so other consumers don't have to
 // duplicate this code
@@ -30,9 +32,10 @@ final class FreqProxFieldMergeState {
   final FreqProxTermsWriterPerField field;
   final int numPostings;
   final CharBlockPool charPool;
-  final RawPostingList[] postings;
+  final int[] termIDs;
+  final FreqProxPostingsArray postings;
 
-  private FreqProxTermsWriter.PostingList p;
+  int currentTermID;
   char[] text;
   int textOffset;
@@ -48,7 +51,8 @@ final class FreqProxFieldMergeState {
     this.field = field;
     this.charPool = field.perThread.termsHashPerThread.charPool;
     this.numPostings = field.termsHashPerField.numPostings;
-    this.postings = field.termsHashPerField.sortPostings();
+    this.termIDs = field.termsHashPerField.sortPostings();
+    this.postings = (FreqProxPostingsArray) field.termsHashPerField.postingsArray;
   }
 
   boolean nextTerm() throws IOException {
@@ -56,15 +60,16 @@ final class FreqProxFieldMergeState {
     if (postingUpto == numPostings)
       return false;
 
-    p = (FreqProxTermsWriter.PostingList) postings[postingUpto];
+    currentTermID = termIDs[postingUpto];
     docID = 0;
 
-    text = charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-    textOffset = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+    final int textStart = postings.textStarts[currentTermID];
+    text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    textOffset = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
 
-    field.termsHashPerField.initReader(freq, p, 0);
+    field.termsHashPerField.initReader(freq, currentTermID, 0);
     if (!field.fieldInfo.omitTermFreqAndPositions)
-      field.termsHashPerField.initReader(prox, p, 1);
+      field.termsHashPerField.initReader(prox, currentTermID, 1);
 
     // Should always be true
     boolean result = nextDoc();
@@ -75,12 +80,12 @@ final class FreqProxFieldMergeState {
 
   public boolean nextDoc() throws IOException {
     if (freq.eof()) {
-      if (p.lastDocCode != -1) {
+      if (postings.lastDocCodes[currentTermID] != -1) {
         // Return last doc
-        docID = p.lastDocID;
+        docID = postings.lastDocIDs[currentTermID];
         if (!field.omitTermFreqAndPositions)
-          termFreq = p.docFreq;
-        p.lastDocCode = -1;
+          termFreq = postings.docFreqs[currentTermID];
+        postings.lastDocCodes[currentTermID] = -1;
         return true;
       } else
         // EOF
@@ -98,7 +103,7 @@ final class FreqProxFieldMergeState {
       termFreq = freq.readVInt();
     }
 
-    assert docID != p.lastDocID;
+    assert docID != postings.lastDocIDs[currentTermID];
     return true;
   }

FreqProxTermsWriter.java

@@ -33,13 +33,6 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
     return new FreqProxTermsWriterPerThread(perThread);
   }
 
-  @Override
-  void createPostings(RawPostingList[] postings, int start, int count) {
-    final int end = start + count;
-    for(int i=start;i<end;i++)
-      postings[i] = new PostingList();
-  }
-
   private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
     while(true) {
       final char c1 = text1[pos1++];
@@ -272,16 +265,4 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
   }
 
   final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result();
-
-  static final class PostingList extends RawPostingList {
-    int docFreq;       // # times this term occurs in the current doc
-    int lastDocID;     // Last docID where this term occurred
-    int lastDocCode;   // Code for prior doc
-    int lastPosition;  // Last position where this term occurred
-  }
-
-  @Override
-  int bytesPerPosting() {
-    return RawPostingList.BYTES_SIZE + 4 * DocumentsWriter.INT_NUM_BYTE;
-  }
 }

FreqProxTermsWriterPerField.java

@@ -18,8 +18,9 @@ package org.apache.lucene.index;
  */
 
 import java.io.IOException;
-import org.apache.lucene.document.Fieldable;
+
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.document.Fieldable;
 
 // TODO: break into separate freq and prox writers as
 // codecs; make separate container (tii/tis/skip/*) that can
@@ -87,7 +88,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
     }
   }
 
-  final void writeProx(FreqProxTermsWriter.PostingList p, int proxCode) {
+  final void writeProx(final int termID, int proxCode) {
     final Payload payload;
     if (payloadAttribute == null) {
       payload = null;
@@ -102,66 +103,111 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
       hasPayloads = true;
     } else
       termsHashPerField.writeVInt(1, proxCode<<1);
-    p.lastPosition = fieldState.position;
+
+    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
+    postings.lastPositions[termID] = fieldState.position;
   }
 
   @Override
-  final void newTerm(RawPostingList p0) {
+  final void newTerm(final int termID) {
     // First time we're seeing this term since the last
     // flush
     assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start");
-    FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0;
-    p.lastDocID = docState.docID;
+
+    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
+    postings.lastDocIDs[termID] = docState.docID;
     if (omitTermFreqAndPositions) {
-      p.lastDocCode = docState.docID;
+      postings.lastDocCodes[termID] = docState.docID;
     } else {
-      p.lastDocCode = docState.docID << 1;
-      p.docFreq = 1;
-      writeProx(p, fieldState.position);
+      postings.lastDocCodes[termID] = docState.docID << 1;
+      postings.docFreqs[termID] = 1;
+      writeProx(termID, fieldState.position);
     }
   }
 
   @Override
-  final void addTerm(RawPostingList p0) {
+  final void addTerm(final int termID) {
 
     assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start");
 
-    FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0;
+    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
 
-    assert omitTermFreqAndPositions || p.docFreq > 0;
+    assert omitTermFreqAndPositions || postings.docFreqs[termID] > 0;
 
     if (omitTermFreqAndPositions) {
-      if (docState.docID != p.lastDocID) {
-        assert docState.docID > p.lastDocID;
-        termsHashPerField.writeVInt(0, p.lastDocCode);
-        p.lastDocCode = docState.docID - p.lastDocID;
-        p.lastDocID = docState.docID;
+      if (docState.docID != postings.lastDocIDs[termID]) {
+        assert docState.docID > postings.lastDocIDs[termID];
+        termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
+        postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
+        postings.lastDocIDs[termID] = docState.docID;
       }
     } else {
-      if (docState.docID != p.lastDocID) {
-        assert docState.docID > p.lastDocID;
+      if (docState.docID != postings.lastDocIDs[termID]) {
+        assert docState.docID > postings.lastDocIDs[termID];
         // Term not yet seen in the current doc but previously
         // seen in other doc(s) since the last flush
 
         // Now that we know doc freq for previous doc,
         // write it & lastDocCode
-        if (1 == p.docFreq)
-          termsHashPerField.writeVInt(0, p.lastDocCode|1);
+        if (1 == postings.docFreqs[termID])
+          termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
         else {
-          termsHashPerField.writeVInt(0, p.lastDocCode);
-          termsHashPerField.writeVInt(0, p.docFreq);
+          termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
+          termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
        }
-        p.docFreq = 1;
-        p.lastDocCode = (docState.docID - p.lastDocID) << 1;
-        p.lastDocID = docState.docID;
-        writeProx(p, fieldState.position);
+        postings.docFreqs[termID] = 1;
+        postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
+        postings.lastDocIDs[termID] = docState.docID;
+        writeProx(termID, fieldState.position);
       } else {
-        p.docFreq++;
-        writeProx(p, fieldState.position-p.lastPosition);
+        postings.docFreqs[termID]++;
+        writeProx(termID, fieldState.position-postings.lastPositions[termID]);
       }
     }
   }
 
+  @Override
+  ParallelPostingsArray createPostingsArray(int size) {
+    return new FreqProxPostingsArray(size);
+  }
+
+  static final class FreqProxPostingsArray extends ParallelPostingsArray {
+    public FreqProxPostingsArray(int size) {
+      super(size);
+      docFreqs = new int[size];
+      lastDocIDs = new int[size];
+      lastDocCodes = new int[size];
+      lastPositions = new int[size];
+    }
+
+    int docFreqs[];       // # times this term occurs in the current doc
+    int lastDocIDs[];     // Last docID where this term occurred
+    int lastDocCodes[];   // Code for prior doc
+    int lastPositions[];  // Last position where this term occurred
+
+    @Override
+    ParallelPostingsArray resize(int newSize) {
+      FreqProxPostingsArray newArray = new FreqProxPostingsArray(newSize);
+      copy(this, newArray);
+      return newArray;
+    }
+
+    void copy(FreqProxPostingsArray fromArray, FreqProxPostingsArray toArray) {
+      super.copy(fromArray, toArray);
+      System.arraycopy(fromArray.docFreqs, 0, toArray.docFreqs, 0, fromArray.docFreqs.length);
+      System.arraycopy(fromArray.lastDocIDs, 0, toArray.lastDocIDs, 0, fromArray.lastDocIDs.length);
+      System.arraycopy(fromArray.lastDocCodes, 0, toArray.lastDocCodes, 0, fromArray.lastDocCodes.length);
+      System.arraycopy(fromArray.lastPositions, 0, toArray.lastPositions, 0, fromArray.lastPositions.length);
+    }
+  }
+
+  @Override
+  int bytesPerPosting() {
+    return ParallelPostingsArray.BYTES_PER_POSTING + 4 * DocumentsWriter.INT_NUM_BYTE;
+  }
+
   public void abort() {}
 }

ParallelPostingsArray.java (new file)

@@ -0,0 +1,45 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+class ParallelPostingsArray {
+  final static int BYTES_PER_POSTING = 3 * DocumentsWriter.INT_NUM_BYTE;
+
+  final int[] textStarts;
+  final int[] intStarts;
+  final int[] byteStarts;
+
+  public ParallelPostingsArray(final int size) {
+    textStarts = new int[size];
+    intStarts = new int[size];
+    byteStarts = new int[size];
+  }
+
+  ParallelPostingsArray resize(int newSize) {
+    ParallelPostingsArray newArray = new ParallelPostingsArray(newSize);
+    copy(this, newArray);
+    return newArray;
+  }
+
+  void copy(ParallelPostingsArray fromArray, ParallelPostingsArray toArray) {
+    System.arraycopy(fromArray.textStarts, 0, toArray.textStarts, 0, fromArray.textStarts.length);
+    System.arraycopy(fromArray.intStarts, 0, toArray.intStarts, 0, fromArray.intStarts.length);
+    System.arraycopy(fromArray.byteStarts, 0, toArray.byteStarts, 0, fromArray.byteStarts.length);
+  }
+}

RawPostingList.java (deleted)

@@ -1,36 +0,0 @@
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/** This is the base class for an in-memory posting list,
- *  keyed by a Token.  {@link TermsHash} maintains a hash
- *  table holding one instance of this per unique Token.
- *  Consumers of TermsHash ({@link TermsHashConsumer}) must
- *  subclass this class with its own concrete class.
- *  FreqProxTermsWriter.PostingList is a private inner class used
- *  for the freq/prox postings, and
- *  TermVectorsTermsWriter.PostingList is a private inner class
- *  used to hold TermVectors postings. */
-
-abstract class RawPostingList {
-  final static int BYTES_SIZE = DocumentsWriter.OBJECT_HEADER_BYTES + 3*DocumentsWriter.INT_NUM_BYTE;
-  int textStart;
-  int intStart;
-  int byteStart;
-}

TermVectorsTermsWriter.java

@@ -47,13 +47,6 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
     return new TermVectorsTermsWriterPerThread(termsHashPerThread, this);
   }
 
-  @Override
-  void createPostings(RawPostingList[] postings, int start, int count) {
-    final int end = start + count;
-    for(int i=start;i<end;i++)
-      postings[i] = new PostingList();
-  }
-
   @Override
   synchronized void flush(Map<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> threadsAndFields, final SegmentWriteState state) throws IOException {
@@ -290,15 +283,4 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
       finishDocument(this);
     }
   }
-
-  static final class PostingList extends RawPostingList {
-    int freq;          // How many times this term occurred in the current doc
-    int lastOffset;    // Last offset we saw
-    int lastPosition;  // Last position where this term occurred
-  }
-
-  @Override
-  int bytesPerPosting() {
-    return RawPostingList.BYTES_SIZE + 3 * DocumentsWriter.INT_NUM_BYTE;
-  }
 }

TermVectorsTermsWriterPerField.java

@@ -124,8 +124,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
     assert perThread.vectorFieldsInOrder(fieldInfo);
 
     perThread.doc.addField(termsHashPerField.fieldInfo.number);
+    TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
 
-    final RawPostingList[] postings = termsHashPerField.sortPostings();
+    final int[] termIDs = termsHashPerField.sortPostings();
 
     tvf.writeVInt(numPostings);
     byte bits = 0x0;
@@ -141,11 +142,11 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
     final ByteSliceReader reader = perThread.vectorSliceReader;
     final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
     for(int j=0;j<numPostings;j++) {
-      final TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
-      final int freq = posting.freq;
+      final int termID = termIDs[j];
+      final int freq = postings.freqs[termID];
 
-      final char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-      final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+      final char[] text2 = charBuffers[postings.textStarts[termID] >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+      final int start2 = postings.textStarts[termID] & DocumentsWriter.CHAR_BLOCK_MASK;
 
       // We swap between two encoders to save copying
       // last Term's byte array
@@ -178,12 +179,12 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
       tvf.writeVInt(freq);
 
       if (doVectorPositions) {
-        termsHashPerField.initReader(reader, posting, 0);
+        termsHashPerField.initReader(reader, termID, 0);
         reader.writeTo(tvf);
       }
 
       if (doVectorOffsets) {
-        termsHashPerField.initReader(reader, posting, 1);
+        termsHashPerField.initReader(reader, termID, 1);
         reader.writeTo(tvf);
       }
     }
@@ -207,13 +208,13 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
   }
 
   @Override
-  void newTerm(RawPostingList p0) {
+  void newTerm(final int termID) {
 
     assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
 
-    TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
+    TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
 
-    p.freq = 1;
+    postings.freqs[termID] = 1;
 
     if (doVectorOffsets) {
       int startOffset = fieldState.offset + offsetAttribute.startOffset();
@@ -221,38 +222,76 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
 
       termsHashPerField.writeVInt(1, startOffset);
       termsHashPerField.writeVInt(1, endOffset - startOffset);
-      p.lastOffset = endOffset;
+      postings.lastOffsets[termID] = endOffset;
     }
 
     if (doVectorPositions) {
       termsHashPerField.writeVInt(0, fieldState.position);
-      p.lastPosition = fieldState.position;
+      postings.lastPositions[termID] = fieldState.position;
     }
   }
 
   @Override
-  void addTerm(RawPostingList p0) {
+  void addTerm(final int termID) {
 
     assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");
 
-    TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
-    p.freq++;
+    TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
+
+    postings.freqs[termID]++;
 
     if (doVectorOffsets) {
       int startOffset = fieldState.offset + offsetAttribute.startOffset();
       int endOffset = fieldState.offset + offsetAttribute.endOffset();
 
-      termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
+      termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
       termsHashPerField.writeVInt(1, endOffset - startOffset);
-      p.lastOffset = endOffset;
+      postings.lastOffsets[termID] = endOffset;
     }
 
     if (doVectorPositions) {
-      termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition);
-      p.lastPosition = fieldState.position;
+      termsHashPerField.writeVInt(0, fieldState.position - postings.lastPositions[termID]);
+      postings.lastPositions[termID] = fieldState.position;
     }
   }
 
   @Override
   void skippingLongTerm() {}
+
+  @Override
+  ParallelPostingsArray createPostingsArray(int size) {
+    return new TermVectorsPostingsArray(size);
+  }
+
+  static final class TermVectorsPostingsArray extends ParallelPostingsArray {
+    public TermVectorsPostingsArray(int size) {
+      super(size);
+      freqs = new int[size];
+      lastOffsets = new int[size];
+      lastPositions = new int[size];
+    }
+
+    int[] freqs;          // How many times this term occurred in the current doc
+    int[] lastOffsets;    // Last offset we saw
+    int[] lastPositions;  // Last position where this term occurred
+
+    @Override
+    ParallelPostingsArray resize(int newSize) {
+      TermVectorsPostingsArray newArray = new TermVectorsPostingsArray(newSize);
+      copy(this, newArray);
+      return newArray;
+    }
+
+    void copy(TermVectorsPostingsArray fromArray, TermVectorsPostingsArray toArray) {
+      super.copy(fromArray, toArray);
+      System.arraycopy(fromArray.freqs, 0, toArray.freqs, 0, fromArray.freqs.length);
+      System.arraycopy(fromArray.lastOffsets, 0, toArray.lastOffsets, 0, fromArray.lastOffsets.length);
+      System.arraycopy(fromArray.lastPositions, 0, toArray.lastPositions, 0, fromArray.lastPositions.length);
+    }
+  }
+
+  @Override
+  int bytesPerPosting() {
+    return ParallelPostingsArray.BYTES_PER_POSTING + 3 * DocumentsWriter.INT_NUM_BYTE;
+  }
 }

TermsHash.java

@@ -17,16 +17,12 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
-import java.util.Collection;
-import java.util.Map;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.HashSet;
-import java.util.Arrays;
 import java.io.IOException;
-
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.RamUsageEstimator;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
 
 /** This class implements {@link InvertedDocConsumer}, which
  * is passed each token produced by the analyzer on each
@@ -40,13 +36,8 @@ final class TermsHash extends InvertedDocConsumer {
   final TermsHashConsumer consumer;
   final TermsHash nextTermsHash;
-  final int bytesPerPosting;
-  final int postingsFreeChunk;
   final DocumentsWriter docWriter;
-
-  private RawPostingList[] postingsFreeList = new RawPostingList[1];
-  private int postingsFreeCount;
-  private int postingsAllocCount;
   boolean trackAllocations;
 
   public TermsHash(final DocumentsWriter docWriter, boolean trackAllocations, final TermsHashConsumer consumer, final TermsHash nextTermsHash) {
@@ -54,14 +45,6 @@ final class TermsHash extends InvertedDocConsumer {
     this.consumer = consumer;
     this.nextTermsHash = nextTermsHash;
     this.trackAllocations = trackAllocations;
-
-    // Why + 4*POINTER_NUM_BYTE below?
-    //   +1: Posting is referenced by postingsFreeList array
-    //   +3: Posting is referenced by hash, which
-    //       targets 25-50% fill factor; approximate this
-    //       as 3X # pointers
-    bytesPerPosting = consumer.bytesPerPosting() + 4*DocumentsWriter.POINTER_NUM_BYTE;
-    postingsFreeChunk = (DocumentsWriter.BYTE_BLOCK_SIZE / bytesPerPosting);
   }
 
   @Override
@@ -86,18 +69,6 @@ final class TermsHash extends InvertedDocConsumer {
       nextTermsHash.abort();
   }
 
-  void shrinkFreePostings(Map<InvertedDocConsumerPerThread,Collection<InvertedDocConsumerPerField>> threadsAndFields, SegmentWriteState state) {
-
-    assert postingsFreeCount == postingsAllocCount: Thread.currentThread().getName() + ": postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount + " consumer=" + consumer;
-
-    final int newSize = ArrayUtil.getShrinkSize(postingsFreeList.length, postingsAllocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
-    if (newSize != postingsFreeList.length) {
-      RawPostingList[] newArray = new RawPostingList[newSize];
-      System.arraycopy(postingsFreeList, 0, newArray, 0, postingsFreeCount);
-      postingsFreeList = newArray;
-    }
-  }
-
   @Override
   synchronized void closeDocStore(SegmentWriteState state) throws IOException {
     consumer.closeDocStore(state);
@@ -144,91 +115,12 @@ final class TermsHash extends InvertedDocConsumer {
     consumer.flush(childThreadsAndFields, state);
 
-    shrinkFreePostings(threadsAndFields, state);
-
     if (nextTermsHash != null)
       nextTermsHash.flush(nextThreadsAndFields, state);
   }
 
   @Override
   synchronized public boolean freeRAM() {
-
-    if (!trackAllocations)
-      return false;
-
-    boolean any;
-
-    final int numToFree;
-    if (postingsFreeCount >= postingsFreeChunk)
-      numToFree = postingsFreeChunk;
-    else
-      numToFree = postingsFreeCount;
-    any = numToFree > 0;
-    if (any) {
-      Arrays.fill(postingsFreeList, postingsFreeCount-numToFree, postingsFreeCount, null);
-      postingsFreeCount -= numToFree;
-      postingsAllocCount -= numToFree;
-      docWriter.bytesAllocated(-numToFree * bytesPerPosting);
-      any = true;
-    }
-
-    if (nextTermsHash != null)
-      any |= nextTermsHash.freeRAM();
-
-    return any;
-  }
-
-  synchronized public void recyclePostings(final RawPostingList[] postings, final int numPostings) {
-
-    assert postings.length >= numPostings;
-
-    // Move all Postings from this ThreadState back to our
-    // free list.  We pre-allocated this array while we were
-    // creating Postings to make sure it's large enough
-    assert postingsFreeCount + numPostings <= postingsFreeList.length;
-    System.arraycopy(postings, 0, postingsFreeList, postingsFreeCount, numPostings);
-    postingsFreeCount += numPostings;
-  }
-
-  synchronized public void getPostings(final RawPostingList[] postings) {
-
-    assert docWriter.writer.testPoint("TermsHash.getPostings start");
-
-    assert postingsFreeCount <= postingsFreeList.length;
-    assert postingsFreeCount <= postingsAllocCount: "postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount;
-
-    final int numToCopy;
-    if (postingsFreeCount < postings.length)
-      numToCopy = postingsFreeCount;
-    else
-      numToCopy = postings.length;
-    final int start = postingsFreeCount-numToCopy;
-    assert start >= 0;
-    assert start + numToCopy <= postingsFreeList.length;
-    assert numToCopy <= postings.length;
-    System.arraycopy(postingsFreeList, start, postings, 0, numToCopy);
-
-    // Directly allocate the remainder if any
-    if (numToCopy != postings.length) {
-      final int extra = postings.length - numToCopy;
-      final int newPostingsAllocCount = postingsAllocCount + extra;
-
-      consumer.createPostings(postings, numToCopy, extra);
-      assert docWriter.writer.testPoint("TermsHash.getPostings after create");
-      postingsAllocCount += extra;
-
-      if (trackAllocations)
-        docWriter.bytesAllocated(extra * bytesPerPosting);
-
-      if (newPostingsAllocCount > postingsFreeList.length)
-        // Pre-allocate the postingsFreeList so it's large
-        // enough to hold all postings we've given out
-        postingsFreeList = new RawPostingList[ArrayUtil.oversize(newPostingsAllocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
-    }
-
-    postingsFreeCount -= numToCopy;
-
-    if (trackAllocations)
-      docWriter.bytesUsed(postings.length * bytesPerPosting);
+    return false;
   }
 }

TermsHashConsumer.java

@@ -22,8 +22,6 @@ import java.util.Collection;
 import java.util.Map;
 
 abstract class TermsHashConsumer {
-  abstract int bytesPerPosting();
-  abstract void createPostings(RawPostingList[] postings, int start, int count);
   abstract TermsHashConsumerPerThread addThread(TermsHashPerThread perThread);
   abstract void flush(Map<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> threadsAndFields, final SegmentWriteState state) throws IOException;
   abstract void abort();

TermsHashConsumerPerField.java

@@ -31,7 +31,11 @@ abstract class TermsHashConsumerPerField {
   abstract void finish() throws IOException;
   abstract void skippingLongTerm() throws IOException;
   abstract void start(Fieldable field);
-  abstract void newTerm(RawPostingList p) throws IOException;
-  abstract void addTerm(RawPostingList p) throws IOException;
+  abstract void newTerm(int termID) throws IOException;
+  abstract void addTerm(int termID) throws IOException;
   abstract int getStreamCount();
+
+  abstract ParallelPostingsArray createPostingsArray(int size);
+  abstract int bytesPerPosting();
 }

TermsHashPerField.java

@@ -27,6 +27,7 @@ import org.apache.lucene.util.UnicodeUtil;
 
 final class TermsHashPerField extends InvertedDocConsumerPerField {
 
   final TermsHashConsumerPerField consumer;
   final TermsHashPerField nextPerField;
   final TermsHashPerThread perThread;
   final DocumentsWriter.DocState docState;
@@ -48,8 +49,11 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
   private int postingsHashSize = 4;
   private int postingsHashHalfSize = postingsHashSize/2;
   private int postingsHashMask = postingsHashSize-1;
-  private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize];
-  private RawPostingList p;
+  private int[] postingsHash;
+
+  ParallelPostingsArray postingsArray;
+  private final int bytesPerPosting;
 
   public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
     this.perThread = perThread;
@@ -57,6 +61,8 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
     charPool = perThread.charPool;
     bytePool = perThread.bytePool;
     docState = perThread.docState;
+    postingsHash = new int[postingsHashSize];
+    Arrays.fill(postingsHash, -1);
     fieldState = docInverterPerField.fieldState;
     this.consumer = perThread.consumer.addField(this, fieldInfo);
     streamCount = consumer.getStreamCount();
@@ -66,6 +72,21 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
       nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
     else
       nextPerField = null;
+
+    //   +3: Posting is referenced by hash, which
+    //       targets 25-50% fill factor; approximate this
+    //       as 3X # pointers
+    bytesPerPosting = consumer.bytesPerPosting() + 3*DocumentsWriter.INT_NUM_BYTE;
+  }
+
+  void initPostingsArray() {
+    assert postingsArray == null;
+
+    postingsArray = consumer.createPostingsArray(postingsHashSize);
+
+    if (perThread.termsHash.trackAllocations) {
+      perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * postingsHashSize);
+    }
   }
 
   void shrinkHash(int targetSize) {
@@ -79,7 +100,9 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
     }
 
     if (newSize != postingsHash.length) {
-      postingsHash = new RawPostingList[newSize];
+      postingsHash = new int[newSize];
+      Arrays.fill(postingsHash, -1);
+      postingsArray = null;
       postingsHashSize = newSize;
       postingsHashHalfSize = newSize/2;
       postingsHashMask = newSize-1;
@@ -91,8 +114,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
       compactPostings();
     assert numPostings <= postingsHash.length;
     if (numPostings > 0) {
-      perThread.termsHash.recyclePostings(postingsHash, numPostings);
-      Arrays.fill(postingsHash, 0, numPostings, null);
+      Arrays.fill(postingsHash, 0, numPostings, -1);
       numPostings = 0;
     }
     postingsCompacted = false;
@@ -106,23 +128,34 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
     if (nextPerField != null)
       nextPerField.abort();
   }
 
+  private void growParallelPostingsArray() {
+    int oldSize = postingsArray.byteStarts.length;
+    int newSize = (int) (oldSize * 1.5);
+    this.postingsArray = this.postingsArray.resize(newSize);
+
+    if (perThread.termsHash.trackAllocations) {
+      perThread.termsHash.docWriter.bytesAllocated(bytesPerPosting * (newSize - oldSize));
+    }
+  }
+
-  public void initReader(ByteSliceReader reader, RawPostingList p, int stream) {
+  public void initReader(ByteSliceReader reader, int termID, int stream) {
     assert stream < streamCount;
-    final int[] ints = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
-    final int upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
+    int intStart = postingsArray.intStarts[termID];
+    final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+    final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
     reader.init(bytePool,
-                p.byteStart+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
+                postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
                 ints[upto+stream]);
   }
 
   private synchronized void compactPostings() {
     int upto = 0;
     for(int i=0;i<postingsHashSize;i++) {
-      if (postingsHash[i] != null) {
+      if (postingsHash[i] != -1) {
         if (upto < i) {
           postingsHash[upto] = postingsHash[i];
-          postingsHash[i] = null;
+          postingsHash[i] = -1;
         }
         upto++;
       }
@@ -133,41 +166,41 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
   }
 
   /** Collapse the hash table & sort in-place. */
-  public RawPostingList[] sortPostings() {
+  public int[] sortPostings() {
     compactPostings();
     quickSort(postingsHash, 0, numPostings-1);
     return postingsHash;
   }
 
-  void quickSort(RawPostingList[] postings, int lo, int hi) {
+  void quickSort(int[] termIDs, int lo, int hi) {
     if (lo >= hi)
       return;
     else if (hi == 1+lo) {
-      if (comparePostings(postings[lo], postings[hi]) > 0) {
-        final RawPostingList tmp = postings[lo];
-        postings[lo] = postings[hi];
-        postings[hi] = tmp;
+      if (comparePostings(termIDs[lo], termIDs[hi]) > 0) {
+        final int tmp = termIDs[lo];
+        termIDs[lo] = termIDs[hi];
+        termIDs[hi] = tmp;
       }
       return;
     }
 
     int mid = (lo + hi) >>> 1;
 
-    if (comparePostings(postings[lo], postings[mid]) > 0) {
-      RawPostingList tmp = postings[lo];
-      postings[lo] = postings[mid];
-      postings[mid] = tmp;
+    if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
+      int tmp = termIDs[lo];
+      termIDs[lo] = termIDs[mid];
+      termIDs[mid] = tmp;
     }
 
-    if (comparePostings(postings[mid], postings[hi]) > 0) {
-      RawPostingList tmp = postings[mid];
-      postings[mid] = postings[hi];
-      postings[hi] = tmp;
+    if (comparePostings(termIDs[mid], termIDs[hi]) > 0) {
+      int tmp = termIDs[mid];
+      termIDs[mid] = termIDs[hi];
+      termIDs[hi] = tmp;
 
-      if (comparePostings(postings[lo], postings[mid]) > 0) {
-        RawPostingList tmp2 = postings[lo];
-        postings[lo] = postings[mid];
-        postings[mid] = tmp2;
+      if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
+        int tmp2 = termIDs[lo];
+        termIDs[lo] = termIDs[mid];
+        termIDs[mid] = tmp2;
       }
     }
@@ -177,40 +210,43 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
     if (left >= right)
       return;
 
-    RawPostingList partition = postings[mid];
+    int partition = termIDs[mid];
 
     for (; ;) {
-      while (comparePostings(postings[right], partition) > 0)
+      while (comparePostings(termIDs[right], partition) > 0)
         --right;
 
-      while (left < right && comparePostings(postings[left], partition) <= 0)
+      while (left < right && comparePostings(termIDs[left], partition) <= 0)
         ++left;
 
       if (left < right) {
-        RawPostingList tmp = postings[left];
-        postings[left] = postings[right];
-        postings[right] = tmp;
+        int tmp = termIDs[left];
+        termIDs[left] = termIDs[right];
+        termIDs[right] = tmp;
         --right;
       } else {
        break;
      }
    }
 
-    quickSort(postings, lo, left);
-    quickSort(postings, left + 1, hi);
+    quickSort(termIDs, lo, left);
+    quickSort(termIDs, left + 1, hi);
   }
 
   /** Compares term text for two Posting instance and
    *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
-  int comparePostings(RawPostingList p1, RawPostingList p2) {
+  int comparePostings(int term1, int term2) {
 
-    if (p1 == p2)
+    if (term1 == term2)
       return 0;
 
-    final char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-    int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
-    final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-    int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+    final int textStart1 = postingsArray.textStarts[term1];
+    final int textStart2 = postingsArray.textStarts[term2];
+
+    final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
+    final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
 
     assert text1 != text2 || pos1 != pos2;
@@ -233,11 +269,12 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
 
   /** Test whether the text for current RawPostingList p equals
    *  current tokenText. */
-  private boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
+  private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
+    final int textStart = postingsArray.textStarts[termID];
 
-    final char[] text = perThread.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+    final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
     assert text != null;
-    int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+    int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
 
     int tokenPos = 0;
     for(;tokenPos<tokenTextLen;pos++,tokenPos++)
@@ -251,6 +288,9 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
 
   @Override
   void start(Fieldable f) {
+    if (postingsArray == null) {
+      initPostingsArray();
+    }
     termAtt = fieldState.attributeSource.addAttribute(TermAttribute.class);
     consumer.start(f);
     if (nextPerField != null) {
@@ -270,7 +310,6 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
   // because token text has already been "interned" into
   // textStart, so we hash by textStart
   public void add(int textStart) throws IOException {
-
     int code = textStart;
 
     int hashPos = code & postingsHashMask;
@@ -278,37 +317,39 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
     assert !postingsCompacted;
 
     // Locate RawPostingList in hash
-    p = postingsHash[hashPos];
+    int termID = postingsHash[hashPos];
 
-    if (p != null && p.textStart != textStart) {
+    if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
       // Conflict: keep searching different locations in
       // the hash table.
       final int inc = ((code>>8)+code)|1;
       do {
         code += inc;
         hashPos = code & postingsHashMask;
-        p = postingsHash[hashPos];
-      } while (p != null && p.textStart != textStart);
+        termID = postingsHash[hashPos];
+      } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
     }
 
-    if (p == null) {
+    if (termID == -1) {
 
       // First time we are seeing this token since we last
       // flushed the hash.
 
-      // Refill?
-      if (0 == perThread.freePostingsCount)
-        perThread.morePostings();
-
-      // Pull next free RawPostingList from free list
-      p = perThread.freePostings[--perThread.freePostingsCount];
-      assert p != null;
-
-      p.textStart = textStart;
-
-      assert postingsHash[hashPos] == null;
-      postingsHash[hashPos] = p;
-      numPostings++;
+      // New posting
+      termID = numPostings++;
+      if (termID >= postingsArray.textStarts.length) {
+        growParallelPostingsArray();
+      }
+      if (perThread.termsHash.trackAllocations) {
+        perThread.termsHash.docWriter.bytesUsed(bytesPerPosting);
+      }
+
+      assert termID >= 0;
+
+      postingsArray.textStarts[termID] = textStart;
+
+      assert postingsHash[hashPos] == -1;
+      postingsHash[hashPos] = termID;
 
       if (numPostings == postingsHashHalfSize)
         rehashPostings(2*postingsHashSize);
@@ -324,20 +365,21 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
       intUptoStart = intPool.intUpto;
       intPool.intUpto += streamCount;
 
-      p.intStart = intUptoStart + intPool.intOffset;
+      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
 
       for(int i=0;i<streamCount;i++) {
         final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
         intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
       }
-      p.byteStart = intUptos[intUptoStart];
+      postingsArray.byteStarts[termID] = intUptos[intUptoStart];
 
-      consumer.newTerm(p);
+      consumer.newTerm(termID);
 
     } else {
-      intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
-      intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
-      consumer.addTerm(p);
+      int intStart = postingsArray.intStarts[termID];
+      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
+      consumer.addTerm(termID);
     }
   }
@@ -389,20 +431,20 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
     int hashPos = code & postingsHashMask;
 
     // Locate RawPostingList in hash
-    p = postingsHash[hashPos];
+    int termID = postingsHash[hashPos];
 
-    if (p != null && !postingEquals(tokenText, tokenTextLen)) {
+    if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
       // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
-        p = postingsHash[hashPos];
-      } while (p != null && !postingEquals(tokenText, tokenTextLen));
+        termID = postingsHash[hashPos];
+      } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
     }
 
-    if (p == null) {
+    if (termID == -1) {
 
       // First time we are seeing this token since we last
       // flushed the hash.
@@ -424,24 +466,26 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
         charPool.nextBuffer();
       }
 
-      // Refill?
-      if (0 == perThread.freePostingsCount)
-        perThread.morePostings();
-
-      // Pull next free RawPostingList from free list
-      p = perThread.freePostings[--perThread.freePostingsCount];
-      assert p != null;
+      // New posting
+      termID = numPostings++;
+      if (termID >= postingsArray.textStarts.length) {
+        growParallelPostingsArray();
+      }
+      if (perThread.termsHash.trackAllocations) {
+        perThread.termsHash.docWriter.bytesUsed(bytesPerPosting);
+      }
+
+      assert termID != -1;
 
       final char[] text = charPool.buffer;
       final int textUpto = charPool.charUpto;
-      p.textStart = textUpto + charPool.charOffset;
+      postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
       charPool.charUpto += textLen1;
       System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
       text[textUpto+tokenTextLen] = 0xffff;
 
-      assert postingsHash[hashPos] == null;
-      postingsHash[hashPos] = p;
-      numPostings++;
+      assert postingsHash[hashPos] == -1;
+      postingsHash[hashPos] = termID;
 
       if (numPostings == postingsHashHalfSize)
         rehashPostings(2*postingsHashSize);
@@ -457,24 +501,25 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
       intUptoStart = intPool.intUpto;
       intPool.intUpto += streamCount;
 
-      p.intStart = intUptoStart + intPool.intOffset;
+      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
 
       for(int i=0;i<streamCount;i++) {
         final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
         intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
       }
-      p.byteStart = intUptos[intUptoStart];
+      postingsArray.byteStarts[termID] = intUptos[intUptoStart];
 
-      consumer.newTerm(p);
+      consumer.newTerm(termID);
 
     } else {
-      intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
-      intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
-      consumer.addTerm(p);
+      final int intStart = postingsArray.intStarts[termID];
+      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
+      consumer.addTerm(termID);
     }
 
     if (doNextCall)
-      nextPerField.add(p.textStart);
+      nextPerField.add(postingsArray.textStarts[termID]);
   }
 
   int[] intUptos;
@@ -524,14 +569,16 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
     final int newMask = newSize-1;
 
-    RawPostingList[] newHash = new RawPostingList[newSize];
+    int[] newHash = new int[newSize];
+    Arrays.fill(newHash, -1);
     for(int i=0;i<postingsHashSize;i++) {
-      RawPostingList p0 = postingsHash[i];
-      if (p0 != null) {
+      int termID = postingsHash[i];
+      if (termID != -1) {
         int code;
         if (perThread.primary) {
-          final int start = p0.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
-          final char[] text = charPool.buffers[p0.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+          final int textStart = postingsArray.textStarts[termID];
+          final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+          final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
           int pos = start;
           while(text[pos] != 0xffff)
             pos++;
@@ -539,18 +586,18 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
           while (pos > start)
             code = (code*31) + text[--pos];
         } else
-          code = p0.textStart;
+          code = postingsArray.textStarts[termID];
 
         int hashPos = code & newMask;
         assert hashPos >= 0;
-        if (newHash[hashPos] != null) {
+        if (newHash[hashPos] != -1) {
           final int inc = ((code>>8)+code)|1;
           do {
             code += inc;
             hashPos = code & newMask;
-          } while (newHash[hashPos] != null);
+          } while (newHash[hashPos] != -1);
         }
-        newHash[hashPos] = p0;
+        newHash[hashPos] = termID;
       }
     }

TermsHashPerThread.java

@@ -31,9 +31,6 @@ final class TermsHashPerThread extends InvertedDocConsumerPerThread {
   final boolean primary;
   final DocumentsWriter.DocState docState;
 
-  final RawPostingList freePostings[] = new RawPostingList[256];
-  int freePostingsCount;
-
   public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) {
     docState = docInverterPerThread.docState;
@@ -71,20 +68,6 @@ final class TermsHashPerThread extends InvertedDocConsumerPerThread {
       nextPerThread.abort();
   }
 
-  // perField calls this when it needs more postings:
-  void morePostings() throws IOException {
-    assert freePostingsCount == 0;
-    termsHash.getPostings(freePostings);
-    freePostingsCount = freePostings.length;
-    assert noNullPostings(freePostings, freePostingsCount, "consumer=" + consumer);
-  }
-
-  private static boolean noNullPostings(RawPostingList[] postings, int count, String details) {
-    for(int i=0;i<count;i++)
-      assert postings[i] != null: "postings[" + i + "] of " + count + " is null: " + details;
-    return true;
-  }
-
   @Override
   public void startDocument() throws IOException {
     consumer.startDocument();
@@ -116,10 +99,5 @@ final class TermsHashPerThread extends InvertedDocConsumerPerThread {
 
     if (primary)
       charPool.reset();
-
-    if (recyclePostings) {
-      termsHash.recyclePostings(freePostings, freePostingsCount);
-      freePostingsCount = 0;
-    }
   }
 }