mirror of https://github.com/apache/lucene.git

LUCENE-1426: next steps towards flexible indexing: use the same API when writing a new segment

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@707836 13f79535-47bb-0310-9956-ffa450edef68

commit fa79b04042 (parent f2c988ae2b)
@@ -54,6 +54,14 @@ class DefaultSkipListWriter extends MultiLevelSkipListWriter {
     lastSkipProxPointer = new long[numberOfSkipLevels];
   }

   void setFreqOutput(IndexOutput freqOutput) {
     this.freqOutput = freqOutput;
   }

   void setProxOutput(IndexOutput proxOutput) {
     this.proxOutput = proxOutput;
   }

   /**
    * Sets the values for the current skip data.
    */
@@ -22,8 +22,8 @@ import java.util.Collection;

 abstract class DocConsumer {
   abstract DocConsumerPerThread addThread(DocumentsWriterThreadState perThread) throws IOException;
   abstract void flush(final Collection threads, final DocumentsWriter.FlushState state) throws IOException;
   abstract void closeDocStore(final DocumentsWriter.FlushState state) throws IOException;
   abstract void flush(final Collection threads, final SegmentWriteState state) throws IOException;
   abstract void closeDocStore(final SegmentWriteState state) throws IOException;
   abstract void abort();
   abstract boolean freeRAM();
 }
@@ -26,11 +26,11 @@ abstract class DocFieldConsumer {

   /** Called when DocumentsWriter decides to create a new
    *  segment */
   abstract void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException;
   abstract void flush(Map threadsAndFields, SegmentWriteState state) throws IOException;

   /** Called when DocumentsWriter decides to close the doc
    *  stores */
   abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
   abstract void closeDocStore(SegmentWriteState state) throws IOException;

   /** Called when an aborting exception is hit */
   abstract void abort();
@@ -44,7 +44,7 @@ final class DocFieldConsumers extends DocFieldConsumer {
     two.setFieldInfos(fieldInfos);
   }

   public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
   public void flush(Map threadsAndFields, SegmentWriteState state) throws IOException {

     Map oneThreadsAndFields = new HashMap();
     Map twoThreadsAndFields = new HashMap();

@@ -76,7 +76,7 @@ final class DocFieldConsumers extends DocFieldConsumer {
     two.flush(twoThreadsAndFields, state);
   }

   public void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
   public void closeDocStore(SegmentWriteState state) throws IOException {
     try {
       one.closeDocStore(state);
     } finally {
@@ -43,11 +43,11 @@ final class DocFieldProcessor extends DocConsumer {
     consumer.setFieldInfos(fieldInfos);
   }

   public void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
   public void closeDocStore(SegmentWriteState state) throws IOException {
     consumer.closeDocStore(state);
   }

   public void flush(Collection threads, DocumentsWriter.FlushState state) throws IOException {
   public void flush(Collection threads, SegmentWriteState state) throws IOException {

     Map childThreadsAndFields = new HashMap();
     Iterator it = threads.iterator();

@@ -63,7 +63,9 @@ final class DocFieldProcessor extends DocConsumer {
     // consumer can alter the FieldInfo* if necessary.  EG,
     // FreqProxTermsWriter does this with
     // FieldInfo.storePayload.
     fieldInfos.write(state.directory, state.segmentName + ".fnm");
     final String fileName = state.segmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION);
     fieldInfos.write(state.directory, fileName);
     state.flushedFiles.add(fileName);
   }

   public void abort() {
@@ -87,7 +87,7 @@ final class DocFieldProcessorPerThread extends DocConsumerPerThread {
   /** If there are fields we've seen but did not see again
    *  in the last run, then free them up. */

   void trimFields(DocumentsWriter.FlushState state) {
   void trimFields(SegmentWriteState state) {

     for(int i=0;i<fieldHash.length;i++) {
       DocFieldProcessorPerField perField = fieldHash[i];
@@ -44,7 +44,7 @@ final class DocInverter extends DocFieldConsumer {
     endConsumer.setFieldInfos(fieldInfos);
   }

   void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
   void flush(Map threadsAndFields, SegmentWriteState state) throws IOException {

     Map childThreadsAndFields = new HashMap();
     Map endChildThreadsAndFields = new HashMap();

@@ -75,7 +75,7 @@ final class DocInverter extends DocFieldConsumer {
     endConsumer.flush(endChildThreadsAndFields, state);
   }

   public void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
   public void closeDocStore(SegmentWriteState state) throws IOException {
     consumer.closeDocStore(state);
     endConsumer.closeDocStore(state);
   }
@@ -156,20 +156,6 @@ final class DocumentsWriter {
     }
   }

   static class FlushState {
     DocumentsWriter docWriter;
     Directory directory;
     String segmentName;
     String docStoreSegmentName;
     int numDocsInRAM;
     int numDocsInStore;
     Collection flushedFiles;

     public String segmentFileName(String ext) {
       return segmentName + "." + ext;
     }
   }

   /** Consumer returns this on each doc.  This holds any
    *  state that must be flushed synchronized "in docID
    *  order".  We gather these and flush them in order. */

@@ -402,7 +388,7 @@ final class DocumentsWriter {

   private Collection abortedFiles;               // List of files that were written before last abort()

   private FlushState flushState;
   private SegmentWriteState flushState;

   Collection abortedFiles() {
     return abortedFiles;

@@ -545,18 +531,7 @@ final class DocumentsWriter {

   synchronized private void initFlushState(boolean onlyDocStore) {
     initSegmentName(onlyDocStore);

     if (flushState == null) {
       flushState = new FlushState();
       flushState.directory = directory;
       flushState.docWriter = this;
     }

     flushState.docStoreSegmentName = docStoreSegment;
     flushState.segmentName = segment;
     flushState.numDocsInRAM = numDocsInRAM;
     flushState.numDocsInStore = numDocsInStore;
     flushState.flushedFiles = new HashSet();
     flushState = new SegmentWriteState(this, directory, segment, docStoreSegment, numDocsInRAM, numDocsInStore, writer.getTermIndexInterval());
   }

   /** Flush all pending docs to a new segment */

@@ -602,7 +577,7 @@ final class DocumentsWriter {
       message(message);
     }

     flushedDocCount += flushState.numDocsInRAM;
     flushedDocCount += flushState.numDocs;

     doAfterFlush();

@@ -616,7 +591,7 @@ final class DocumentsWriter {

     assert waitQueue.waitingBytes == 0;

     return flushState.numDocsInRAM;
     return flushState.numDocs;
   }

   /** Build compound file for the segment we just flushed */
@@ -0,0 +1,34 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

/**
 * NOTE: this API is experimental and will likely change
 */

abstract class FormatPostingsDocsConsumer {

  /** Adds a new doc in this term.  If this returns null
   *  then we just skip consuming positions/payloads. */
  abstract FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException;

  /** Called when we are done adding docs to this term */
  abstract void finish() throws IOException;
}
@@ -0,0 +1,127 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** Consumes doc & freq, writing them using the current
 *  index file format */

import java.io.IOException;

import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.store.IndexOutput;

final class FormatPostingsDocsWriter extends FormatPostingsDocsConsumer {

  final IndexOutput out;
  final FormatPostingsTermsWriter parent;
  final FormatPostingsPositionsWriter posWriter;
  final DefaultSkipListWriter skipListWriter;
  final int skipInterval;
  final int totalNumDocs;

  boolean omitTF;
  boolean storePayloads;
  long freqStart;
  FieldInfo fieldInfo;

  FormatPostingsDocsWriter(SegmentWriteState state, FormatPostingsTermsWriter parent) throws IOException {
    super();
    this.parent = parent;
    final String fileName = IndexFileNames.segmentFileName(parent.parent.segment, IndexFileNames.FREQ_EXTENSION);
    state.flushedFiles.add(fileName);
    out = parent.parent.dir.createOutput(fileName);
    totalNumDocs = parent.parent.totalNumDocs;

    // TODO: abstraction violation
    skipInterval = parent.parent.termsOut.skipInterval;
    skipListWriter = parent.parent.skipListWriter;
    skipListWriter.setFreqOutput(out);

    posWriter = new FormatPostingsPositionsWriter(state, this);
  }

  void setField(FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    omitTF = fieldInfo.omitTf;
    storePayloads = fieldInfo.storePayloads;
    posWriter.setField(fieldInfo);
  }

  int lastDocID;
  int df;

  /** Adds a new doc in this term.  If this returns null
   *  then we just skip consuming positions/payloads. */
  FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException {

    final int delta = docID - lastDocID;

    if (docID < 0 || (df > 0 && delta <= 0))
      throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");

    if ((++df % skipInterval) == 0) {
      // TODO: abstraction violation
      skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength);
      skipListWriter.bufferSkip(df);
    }

    assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs;

    lastDocID = docID;
    if (omitTF)
      out.writeVInt(delta);
    else if (1 == termDocFreq)
      out.writeVInt((delta<<1) | 1);
    else {
      out.writeVInt(delta<<1);
      out.writeVInt(termDocFreq);
    }

    return posWriter;
  }

  private final TermInfo termInfo = new TermInfo(); // minimize consing
  final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();

  /** Called when we are done adding docs to this term */
  void finish() throws IOException {
    long skipPointer = skipListWriter.writeSkip(out);

    // TODO: this is abstraction violation -- we should not
    // peek up into parents terms encoding format
    termInfo.set(df, parent.freqStart, parent.proxStart, (int) (skipPointer - parent.freqStart));

    // TODO: we could do this incrementally
    UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8);

    if (df > 0) {
      parent.termsOut.add(fieldInfo.number,
                          utf8.result,
                          utf8.length,
                          termInfo);
    }

    lastDocID = 0;
    df = 0;
  }

  void close() throws IOException {
    out.close();
    posWriter.close();
  }
}
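The branchy arithmetic in addDoc above is the on-disk .frq layout: a VInt doc delta whose low bit flags freq == 1 whenever term frequencies are kept, and a bare delta when they are omitted. The following is a minimal, self-contained sketch of just that encoding (a hypothetical helper, not part of the patch; it re-implements the VInt byte layout locally so it can run on its own):

import java.io.ByteArrayOutputStream;

// Hypothetical illustration of the .frq encoding used by FormatPostingsDocsWriter.addDoc.
class FreqEncodingSketch {
  // Writes a VInt the way IndexOutput.writeVInt does: 7 bits per byte, high bit means "more follows".
  static void writeVInt(ByteArrayOutputStream out, int i) {
    while ((i & ~0x7F) != 0) {
      out.write((i & 0x7F) | 0x80);
      i >>>= 7;
    }
    out.write(i);
  }

  // Encodes one (docID, termDocFreq) entry given the previous docID of the same term.
  static void addDoc(ByteArrayOutputStream out, int lastDocID, int docID, int termDocFreq, boolean omitTF) {
    final int delta = docID - lastDocID;
    if (omitTF)
      writeVInt(out, delta);            // no freq stored at all
    else if (termDocFreq == 1)
      writeVInt(out, (delta << 1) | 1); // low bit set: freq == 1 is implied
    else {
      writeVInt(out, delta << 1);       // low bit clear: the freq follows as its own VInt
      writeVInt(out, termDocFreq);
    }
  }

  public static void main(String[] args) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    addDoc(out, 0, 5, 1, false);   // doc 5, freq 1  -> single VInt 11
    addDoc(out, 5, 9, 3, false);   // doc 9, freq 3  -> VInt 8 then VInt 3
    System.out.println(java.util.Arrays.toString(out.toByteArray())); // [11, 8, 3]
  }
}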
@@ -0,0 +1,36 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

/** Abstract API that consumes terms, doc, freq, prox and
 *  payloads postings.  Concrete implementations of this
 *  actually do "something" with the postings (write it into
 *  the index in a specific format).
 *
 * NOTE: this API is experimental and will likely change
 */
abstract class FormatPostingsFieldsConsumer {

  /** Add a new field */
  abstract FormatPostingsTermsConsumer addField(FieldInfo field) throws IOException;

  /** Called when we are done adding everything. */
  abstract void finish() throws IOException;
}
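FormatPostingsFieldsConsumer is the top of a four-level push API: fields, then terms, then docs, then positions, with each addX call returning the consumer for the next level (the concrete chain is spelled out in a comment added to FreqProxTermsWriter later in this commit). A hedged sketch of how a caller drives that chain follows; the class is hypothetical, placed in org.apache.lucene.index only because the consumers are package-private, and the field, term, and posting values are made up:

package org.apache.lucene.index;

import java.io.IOException;

// Hypothetical driver showing the call order the new consumer API expects; it uses only
// methods declared on the abstract consumers added in this commit.
class PostingsChainSketch {

  static void writeOneField(FormatPostingsFieldsConsumer fields, FieldInfo fieldInfo) throws IOException {
    FormatPostingsTermsConsumer terms = fields.addField(fieldInfo);

    // One term with two docs; addTerm(String) appends the U+FFFF terminator itself.
    FormatPostingsDocsConsumer docs = terms.addTerm("lucene");

    // docID=0, freq=2; a null return means positions/payloads should be skipped.
    FormatPostingsPositionsConsumer positions = docs.addDoc(0, 2);
    if (positions != null) {
      positions.addPosition(3, null, 0, 0);   // position 3, no payload
      positions.addPosition(17, null, 0, 0);
      positions.finish();
    }

    positions = docs.addDoc(7, 1);            // docID=7, freq=1
    if (positions != null) {
      positions.addPosition(1, null, 0, 0);
      positions.finish();
    }

    docs.finish();   // done with this term
    terms.finish();  // done with this field
    // fields.finish() is called once, by the owner, after every field has been added.
  }
}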
@@ -0,0 +1,73 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.store.Directory;

final class FormatPostingsFieldsWriter extends FormatPostingsFieldsConsumer {

  final Directory dir;
  final String segment;
  final TermInfosWriter termsOut;
  final FieldInfos fieldInfos;
  final FormatPostingsTermsWriter termsWriter;
  final DefaultSkipListWriter skipListWriter;
  final int totalNumDocs;

  public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) throws IOException {
    super();

    dir = state.directory;
    segment = state.segmentName;
    totalNumDocs = state.numDocs;
    this.fieldInfos = fieldInfos;
    termsOut = new TermInfosWriter(dir,
                                   segment,
                                   fieldInfos,
                                   state.termIndexInterval);

    // TODO: this is a nasty abstraction violation (that we
    // peek down to find freqOut/proxOut) -- we need a
    // better abstraction here whereby these child consumers
    // can provide skip data or not
    skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
                                               termsOut.maxSkipLevels,
                                               totalNumDocs,
                                               null,
                                               null);

    state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION));
    state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));

    termsWriter = new FormatPostingsTermsWriter(state, this);
  }

  /** Add a new field */
  FormatPostingsTermsConsumer addField(FieldInfo field) {
    termsWriter.setField(field);
    return termsWriter;
  }

  /** Called when we are done adding everything. */
  void finish() throws IOException {
    termsOut.close();
    termsWriter.close();
  }
}
@@ -0,0 +1,32 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.store.IndexInput;

abstract class FormatPostingsPositionsConsumer {

  /** Add a new position & payload.  If payloadLength > 0
   *  you must read those bytes from the IndexInput. */
  abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException;

  /** Called when we are done adding positions & payloads */
  abstract void finish() throws IOException;
}
@@ -0,0 +1,87 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;

import java.io.IOException;

final class FormatPostingsPositionsWriter extends FormatPostingsPositionsConsumer {

  final FormatPostingsDocsWriter parent;
  final IndexOutput out;

  boolean omitTF;
  boolean storePayloads;
  int lastPayloadLength = -1;

  FormatPostingsPositionsWriter(SegmentWriteState state, FormatPostingsDocsWriter parent) throws IOException {
    this.parent = parent;
    omitTF = parent.omitTF;
    if (parent.parent.parent.fieldInfos.hasProx()) {
      // At least one field does not omit TF, so create the
      // prox file
      final String fileName = IndexFileNames.segmentFileName(parent.parent.parent.segment, IndexFileNames.PROX_EXTENSION);
      state.flushedFiles.add(fileName);
      out = parent.parent.parent.dir.createOutput(fileName);
      parent.skipListWriter.setProxOutput(out);
    } else
      // Every field omits TF so we will write no prox file
      out = null;
  }

  int lastPosition;

  /** Add a new position & payload */
  void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException {
    assert !omitTF: "omitTF is true";
    assert out != null;

    final int delta = position - lastPosition;
    lastPosition = position;

    if (storePayloads) {
      if (payloadLength != lastPayloadLength) {
        lastPayloadLength = payloadLength;
        out.writeVInt((delta<<1)|1);
        out.writeVInt(payloadLength);
      } else
        out.writeVInt(delta << 1);
      if (payloadLength > 0)
        out.writeBytes(payload, payloadLength);
    } else
      out.writeVInt(delta);
  }

  void setField(FieldInfo fieldInfo) {
    omitTF = fieldInfo.omitTf;
    storePayloads = omitTF ? false : fieldInfo.storePayloads;
  }

  /** Called when we are done adding positions & payloads */
  void finish() {
    lastPosition = 0;
    lastPayloadLength = -1;
  }

  void close() throws IOException {
    if (out != null)
      out.close();
  }
}
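The position side mirrors the existing .prx layout: a VInt position delta whose low bit, when payloads are stored, flags that a new payload length follows; an unchanged length reuses the previous one. A standalone illustration of that rule (a hypothetical class that collects the VInt values in a list instead of writing bytes to an IndexOutput):

import java.util.ArrayList;
import java.util.List;

// Hypothetical illustration of the .prx entry layout written by FormatPostingsPositionsWriter.
class ProxEncodingSketch {
  int lastPosition = 0;
  int lastPayloadLength = -1;
  final List<Integer> vints = new ArrayList<>(); // VInt values, before byte encoding

  void addPosition(int position, int payloadLength, boolean storePayloads) {
    final int delta = position - lastPosition;
    lastPosition = position;
    if (storePayloads) {
      if (payloadLength != lastPayloadLength) {
        lastPayloadLength = payloadLength;
        vints.add((delta << 1) | 1); // low bit set: a new payload length follows
        vints.add(payloadLength);
      } else {
        vints.add(delta << 1);       // low bit clear: reuse the previous payload length
      }
      // the payload bytes themselves would be written here when payloadLength > 0
    } else {
      vints.add(delta);              // plain position delta, no payload bookkeeping
    }
  }

  public static void main(String[] args) {
    ProxEncodingSketch p = new ProxEncodingSketch();
    p.addPosition(4, 0, true);
    p.addPosition(10, 0, true);
    System.out.println(p.vints); // [9, 0, 12]
  }
}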
@@ -0,0 +1,46 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.util.ArrayUtil;

/**
 * NOTE: this API is experimental and will likely change
 */

abstract class FormatPostingsTermsConsumer {

  /** Adds a new term in this field; term ends with U+FFFF
   *  char */
  abstract FormatPostingsDocsConsumer addTerm(char[] text, int start) throws IOException;

  char[] termBuffer;
  FormatPostingsDocsConsumer addTerm(String text) throws IOException {
    final int len = text.length();
    if (termBuffer == null || termBuffer.length < 1+len)
      termBuffer = new char[ArrayUtil.getNextSize(1+len)];
    text.getChars(0, len, termBuffer, 0);
    termBuffer[len] = 0xffff;
    return addTerm(termBuffer, 0);
  }

  /** Called when we are done adding terms to this field */
  abstract void finish() throws IOException;
}
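The String overload above is a convenience that copies the term into a shared char[] and appends U+FFFF, the end marker that the char[]-based addTerm and the downstream UTF-8 conversion scan for. A tiny standalone sketch of just that conversion (hypothetical; it uses an exact-size allocation where the patch uses ArrayUtil.getNextSize so the buffer can be reused across terms):

// Hypothetical sketch of the term-buffer handling in addTerm(String).
class TermBufferSketch {
  static char[] toTerminatedBuffer(String text) {
    final int len = text.length();
    final char[] buffer = new char[len + 1];   // the real code over-allocates for reuse
    text.getChars(0, len, buffer, 0);
    buffer[len] = 0xffff;                      // end marker the char[]-based addTerm expects
    return buffer;
  }

  public static void main(String[] args) {
    char[] t = toTerminatedBuffer("lucene");
    System.out.println((int) t[t.length - 1]); // 65535
  }
}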
@@ -0,0 +1,71 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

final class FormatPostingsTermsWriter extends FormatPostingsTermsConsumer {

  final FormatPostingsFieldsWriter parent;
  final FormatPostingsDocsWriter docsWriter;
  final TermInfosWriter termsOut;
  FieldInfo fieldInfo;

  FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent) throws IOException {
    super();
    this.parent = parent;
    termsOut = parent.termsOut;
    docsWriter = new FormatPostingsDocsWriter(state, this);
  }

  void setField(FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    docsWriter.setField(fieldInfo);
  }

  char[] currentTerm;
  int currentTermStart;

  long freqStart;
  long proxStart;

  /** Adds a new term in this field */
  FormatPostingsDocsConsumer addTerm(char[] text, int start) {
    currentTerm = text;
    currentTermStart = start;

    // TODO: this is abstraction violation -- ideally this
    // terms writer is not so "invasive", looking for file
    // pointers in its child consumers.
    freqStart = docsWriter.out.getFilePointer();
    if (docsWriter.posWriter.out != null)
      proxStart = docsWriter.posWriter.out.getFilePointer();

    parent.skipListWriter.resetSkip();

    return docsWriter;
  }

  /** Called when we are done adding terms to this field */
  void finish() {
  }

  void close() throws IOException {
    docsWriter.close();
  }
}
@@ -57,7 +57,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
     }
   }

   void closeDocStore(DocumentsWriter.FlushState state) {}
   void closeDocStore(SegmentWriteState state) {}
   void abort() {}

@@ -66,7 +66,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
   // under the same FieldInfo together, up into TermsHash*.
   // Other writers would presumably share alot of this...

   public void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException {
   public void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException {

     // Gather all FieldData's that have postings, across all
     // ThreadStates

@@ -92,22 +92,19 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
     Collections.sort(allFields);
     final int numAllFields = allFields.size();

     final TermInfosWriter termsOut = new TermInfosWriter(state.directory,
                                                          state.segmentName,
                                                          fieldInfos,
                                                          state.docWriter.writer.getTermIndexInterval());

     final IndexOutput freqOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.FREQ_EXTENSION));
     final IndexOutput proxOut;

     if (fieldInfos.hasProx())
       proxOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
     else
       proxOut = null;

     final DefaultSkipListWriter skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
                                                                            termsOut.maxSkipLevels,
                                                                            state.numDocsInRAM, freqOut, proxOut);
     // TODO: allow Lucene user to customize this consumer:
     final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
     /*
     Current writer chain:
       FormatPostingsFieldsConsumer
         -> IMPL: FormatPostingsFieldsWriter
           -> FormatPostingsTermsConsumer
             -> IMPL: FormatPostingsTermsWriter
               -> FormatPostingsDocConsumer
                 -> IMPL: FormatPostingsDocWriter
                   -> FormatPostingsPositionsConsumer
                     -> IMPL: FormatPostingsPositionsWriter
     */

     int start = 0;
     while(start < numAllFields) {

@@ -129,7 +126,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer {

       // If this field has postings then add them to the
       // segment
       appendPostings(state, fields, termsOut, freqOut, proxOut, skipListWriter);
       appendPostings(fields, consumer);

       for(int i=0;i<fields.length;i++) {
         TermsHashPerField perField = fields[i].termsHashPerField;

@@ -149,51 +146,18 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
       perThread.termsHashPerThread.reset(true);
     }

     freqOut.close();
     if (proxOut != null) {
       state.flushedFiles.add(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
       proxOut.close();
     }
     termsOut.close();

     // Record all files we have flushed
     state.flushedFiles.add(state.segmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION));
     state.flushedFiles.add(state.segmentFileName(IndexFileNames.FREQ_EXTENSION));
     state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION));
     state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));
     consumer.finish();
   }

   final byte[] copyByteBuffer = new byte[4096];

   /** Copy numBytes from srcIn to destIn */
   void copyBytes(IndexInput srcIn, IndexOutput destIn, long numBytes) throws IOException {
     // TODO: we could do this more efficiently (save a copy)
     // because it's always from a ByteSliceReader ->
     // IndexOutput
     while(numBytes > 0) {
       final int chunk;
       if (numBytes > 4096)
         chunk = 4096;
       else
         chunk = (int) numBytes;
       srcIn.readBytes(copyByteBuffer, 0, chunk);
       destIn.writeBytes(copyByteBuffer, 0, chunk);
       numBytes -= chunk;
     }
   }

   private byte[] payloadBuffer;

   /* Walk through all unique text tokens (Posting
    * instances) found in this field and serialize them
    * into a single RAM segment. */
   void appendPostings(final DocumentsWriter.FlushState flushState,
                       FreqProxTermsWriterPerField[] fields,
                       TermInfosWriter termsOut,
                       IndexOutput freqOut,
                       IndexOutput proxOut,
                       DefaultSkipListWriter skipListWriter)
   void appendPostings(FreqProxTermsWriterPerField[] fields,
                       FormatPostingsFieldsConsumer consumer)
     throws CorruptIndexException, IOException {

     final int fieldNumber = fields[0].fieldInfo.number;
     int numFields = fields.length;

     final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

@@ -208,15 +172,12 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
       assert result;
     }

     final int skipInterval = termsOut.skipInterval;
     final boolean currentFieldOmitTf = fields[0].fieldInfo.omitTf;
     final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);

     // If current field omits tf then it cannot store
     // payloads.  We silently drop the payloads in this case:
     final boolean currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;

     FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

     final boolean currentFieldOmitTf = fields[0].fieldInfo.omitTf;

     while(numFields > 0) {

       // Get the next term to merge

@@ -235,43 +196,21 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
           termStates[numToMerge++] = mergeStates[i];
       }

       int df = 0;
       int lastPayloadLength = -1;

       int lastDoc = 0;

       final char[] text = termStates[0].text;
       final int start = termStates[0].textOffset;

       final long freqPointer = freqOut.getFilePointer();
       final long proxPointer;
       if (proxOut != null)
         proxPointer = proxOut.getFilePointer();
       else
         proxPointer = 0;

       skipListWriter.resetSkip();
       final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset);

       // Now termStates has numToMerge FieldMergeStates
       // which all share the same term.  Now we must
       // interleave the docID streams.
       while(numToMerge > 0) {

         if ((++df % skipInterval) == 0) {
           skipListWriter.setSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
           skipListWriter.bufferSkip(df);
         }

         FreqProxFieldMergeState minState = termStates[0];
         for(int i=1;i<numToMerge;i++)
           if (termStates[i].docID < minState.docID)
             minState = termStates[i];

         final int doc = minState.docID;
         final int termDocFreq = minState.termFreq;

         assert doc < flushState.numDocsInRAM;
         assert doc > lastDoc || df == 1;
         final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq);

         final ByteSliceReader prox = minState.prox;

@@ -279,47 +218,32 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
         // changing the format to match Lucene's segment
         // format.
         if (!currentFieldOmitTf) {
           // omitTf == false so we do write positions & payload
           assert proxOut != null;
           // omitTf == false so we do write positions &
           // payload
           int position = 0;
           for(int j=0;j<termDocFreq;j++) {
             final int code = prox.readVInt();
             if (currentFieldStorePayloads) {
               final int payloadLength;
               if ((code & 1) != 0) {
                 // This position has a payload
                 payloadLength = prox.readVInt();
               } else
                 payloadLength = 0;
               if (payloadLength != lastPayloadLength) {
                 proxOut.writeVInt(code|1);
                 proxOut.writeVInt(payloadLength);
                 lastPayloadLength = payloadLength;
               } else
                 proxOut.writeVInt(code & (~1));
               if (payloadLength > 0)
                 copyBytes(prox, proxOut, payloadLength);
             } else {
               assert 0 == (code & 1);
               proxOut.writeVInt(code>>1);
             }
             position += code >> 1;

             final int payloadLength;
             if ((code & 1) != 0) {
               // This position has a payload
               payloadLength = prox.readVInt();

               if (payloadBuffer == null || payloadBuffer.length < payloadLength)
                 payloadBuffer = new byte[payloadLength];

               prox.readBytes(payloadBuffer, 0, payloadLength);

             } else
               payloadLength = 0;

             posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
           } //End for

           final int newDocCode = (doc-lastDoc)<<1;

           if (1 == termDocFreq) {
             freqOut.writeVInt(newDocCode|1);
           } else {
             freqOut.writeVInt(newDocCode);
             freqOut.writeVInt(termDocFreq);
           }
         } else {
           // omitTf==true: we store only the docs, without
           // term freq, positions, payloads
           freqOut.writeVInt(doc-lastDoc);
           posConsumer.finish();
         }

         lastDoc = doc;

         if (!minState.nextDoc()) {

           // Remove from termStates

@@ -345,26 +269,10 @@ final class FreqProxTermsWriter extends TermsHashConsumer {
           }
         }

         assert df > 0;

         // Done merging this term

         long skipPointer = skipListWriter.writeSkip(freqOut);

         // Write term
         termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));

         // TODO: we could do this incrementally
         UnicodeUtil.UTF16toUTF8(text, start, termsUTF8);

         // TODO: we could save O(n) re-scan of the term by
         // computing the shared prefix with the last term
         // while during the UTF8 encoding
         termsOut.add(fieldNumber,
                      termsUTF8.result,
                      termsUTF8.length,
                      termInfo);
         docConsumer.finish();
       }

       termsConsumer.finish();
     }

   private final TermInfo termInfo = new TermInfo(); // minimize consing
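The rewritten appendPostings keeps the same merge strategy as before: for one term it repeatedly picks, by linear scan, the per-thread posting stream with the smallest current docID, pushes that doc into the consumer chain, and drops streams as they are exhausted. A self-contained sketch of that interleaving step, with hypothetical stand-in types instead of FreqProxFieldMergeState:

import java.util.Arrays;

// Hypothetical illustration of the docID interleaving loop in FreqProxTermsWriter.appendPostings.
class DocInterleaveSketch {
  // Each "state" is one thread's posting stream for the same term; here just an array plus a cursor.
  static final class State {
    final int[] docIDs;
    int upto;
    State(int... docIDs) { this.docIDs = docIDs; }
    int docID() { return docIDs[upto]; }
    boolean nextDoc() { return ++upto < docIDs.length; }
  }

  public static void main(String[] args) {
    State[] states = { new State(0, 5, 9), new State(2, 3), new State(7) };
    int numToMerge = states.length;
    StringBuilder merged = new StringBuilder();
    while (numToMerge > 0) {
      // pick the state with the smallest current docID (linear scan, as in the patch)
      State min = states[0];
      for (int i = 1; i < numToMerge; i++)
        if (states[i].docID() < min.docID())
          min = states[i];
      merged.append(min.docID()).append(' ');
      if (!min.nextDoc()) {
        // exhausted: swap it out of the active prefix of the array
        int idx = Arrays.asList(states).indexOf(min);
        states[idx] = states[numToMerge - 1];
        states[numToMerge - 1] = min;
        numToMerge--;
      }
    }
    System.out.println(merged.toString().trim()); // 0 2 3 5 7 9
  }
}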
@@ -195,4 +195,8 @@ final class IndexFileNames {
       return true;
     return false;
   }

   static String segmentFileName(String segmentName, String ext) {
     return segmentName + "." + ext;
   }
 }
@@ -29,10 +29,10 @@ abstract class InvertedDocConsumer {
   abstract void abort();

   /** Flush a new segment */
   abstract void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException;
   abstract void flush(Map threadsAndFields, SegmentWriteState state) throws IOException;

   /** Close doc stores */
   abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
   abstract void closeDocStore(SegmentWriteState state) throws IOException;

   /** Attempt to free RAM, returning true if any RAM was
    *  freed */
@@ -22,8 +22,8 @@ import java.io.IOException;

 abstract class InvertedDocEndConsumer {
   abstract InvertedDocEndConsumerPerThread addThread(DocInverterPerThread docInverterPerThread);
   abstract void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException;
   abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
   abstract void flush(Map threadsAndFields, SegmentWriteState state) throws IOException;
   abstract void closeDocStore(SegmentWriteState state) throws IOException;
   abstract void abort();
   abstract void setFieldInfos(FieldInfos fieldInfos);
 }
@@ -54,7 +54,7 @@ final class NormsWriter extends InvertedDocEndConsumer {

   /** Produce _X.nrm if any document had a field with norms
    *  not disabled */
   public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
   public void flush(Map threadsAndFields, SegmentWriteState state) throws IOException {

     final Map byField = new HashMap();

@@ -133,7 +133,7 @@ final class NormsWriter extends InvertedDocEndConsumer {
           }
         }

         assert minDocID < state.numDocsInRAM;
         assert minDocID < state.numDocs;

         // Fill hole
         for(;upto<minDocID;upto++)

@@ -154,16 +154,16 @@ final class NormsWriter extends InvertedDocEndConsumer {
         }

         // Fill final hole with defaultNorm
         for(;upto<state.numDocsInRAM;upto++)
         for(;upto<state.numDocs;upto++)
           normsOut.writeByte(defaultNorm);
       } else if (fieldInfo.isIndexed && !fieldInfo.omitNorms) {
         normCount++;
         // Fill entire field with default norm:
         for(;upto<state.numDocsInRAM;upto++)
         for(;upto<state.numDocs;upto++)
           normsOut.writeByte(defaultNorm);
       }

       assert 4+normCount*state.numDocsInRAM == normsOut.getFilePointer() : ".nrm file size mismatch: expected=" + (4+normCount*state.numDocsInRAM) + " actual=" + normsOut.getFilePointer();
       assert 4+normCount*state.numDocs == normsOut.getFilePointer() : ".nrm file size mismatch: expected=" + (4+normCount*state.numDocs) + " actual=" + normsOut.getFilePointer();
     }

     } finally {

@@ -171,5 +171,5 @@ final class NormsWriter extends InvertedDocEndConsumer {
     }
   }

   void closeDocStore(DocumentsWriter.FlushState state) {}
   void closeDocStore(SegmentWriteState state) {}
 }
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.HashSet;
 import java.util.List;

 import org.apache.lucene.document.Document;

@@ -476,38 +477,28 @@ final class SegmentMerger {
       throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption");
   }

   private IndexOutput freqOutput = null;
   private IndexOutput proxOutput = null;
   private TermInfosWriter termInfosWriter = null;
   private int skipInterval;
   private int maxSkipLevels;
   private SegmentMergeQueue queue = null;
   private DefaultSkipListWriter skipListWriter = null;

   private final void mergeTerms() throws CorruptIndexException, IOException {

     SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval);

     final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);

     try {
       freqOutput = directory.createOutput(segment + ".frq");
       if (hasProx())
         proxOutput = directory.createOutput(segment + ".prx");
       termInfosWriter =
         new TermInfosWriter(directory, segment, fieldInfos,
                             termIndexInterval);
       skipInterval = termInfosWriter.skipInterval;
       maxSkipLevels = termInfosWriter.maxSkipLevels;
       skipListWriter = new DefaultSkipListWriter(skipInterval, maxSkipLevels, mergedDocs, freqOutput, proxOutput);
       queue = new SegmentMergeQueue(readers.size());

       mergeTermInfos();
       mergeTermInfos(consumer);

     } finally {
       if (freqOutput != null) freqOutput.close();
       if (proxOutput != null) proxOutput.close();
       if (termInfosWriter != null) termInfosWriter.close();
       consumer.finish();
       if (queue != null) queue.close();
     }
   }

   private final void mergeTermInfos() throws CorruptIndexException, IOException {
   boolean omitTF;

   private final void mergeTermInfos(final FormatPostingsFieldsConsumer consumer) throws CorruptIndexException, IOException {
     int base = 0;
     final int readerCount = readers.size();
     for (int i = 0; i < readerCount; i++) {

@@ -533,6 +524,9 @@ final class SegmentMerger {

     SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];

     String currentField = null;
     FormatPostingsTermsConsumer termsConsumer = null;

     while (queue.size() > 0) {
       int matchSize = 0;     // pop matching terms
       match[matchSize++] = (SegmentMergeInfo) queue.pop();

@@ -544,7 +538,16 @@ final class SegmentMerger {
         top = (SegmentMergeInfo) queue.top();
       }

       final int df = mergeTermInfo(match, matchSize);     // add new TermInfo
       if (currentField != term.field) {
         currentField = term.field;
         if (termsConsumer != null)
           termsConsumer.finish();
         final FieldInfo fieldInfo = fieldInfos.fieldInfo(currentField);
         termsConsumer = consumer.addField(fieldInfo);
         omitTF = fieldInfo.omitTf;
       }

       int df = appendPostings(termsConsumer, match, matchSize);     // add new TermInfo

       if (checkAbort != null)
         checkAbort.work(df/3.0);

@@ -559,44 +562,6 @@ final class SegmentMerger {
     }
   }

   private final TermInfo termInfo = new TermInfo(); // minimize consing

   /** Merge one term found in one or more segments. The array <code>smis</code>
    *  contains segments that are positioned at the same term. <code>N</code>
    *  is the number of cells in the array actually occupied.
    *
    * @param smis array of segments
    * @param n number of cells in the array actually occupied
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
    */
   private final int mergeTermInfo(SegmentMergeInfo[] smis, int n)
       throws CorruptIndexException, IOException {
     final long freqPointer = freqOutput.getFilePointer();
     final long proxPointer;
     if (proxOutput != null)
       proxPointer = proxOutput.getFilePointer();
     else
       proxPointer = 0;

     int df;
     if (fieldInfos.fieldInfo(smis[0].term.field).omitTf) { // append posting data
       df = appendPostingsNoTf(smis, n);
     } else{
       df = appendPostings(smis, n);
     }

     long skipPointer = skipListWriter.writeSkip(freqOutput);

     if (df > 0) {
       // add an entry to the dictionary with pointers to prox and freq files
       termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
       termInfosWriter.add(smis[0].term, termInfo);
     }

     return df;
   }

   private byte[] payloadBuffer;
   private int[][] docMaps;
   int[][] getDocMaps() {

@@ -617,13 +582,11 @@ final class SegmentMerger {
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
    */
   private final int appendPostings(SegmentMergeInfo[] smis, int n)
         throws CorruptIndexException, IOException {
     int lastDoc = 0;
     int df = 0;          // number of docs w/ term
     skipListWriter.resetSkip();
     boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
     int lastPayloadLength = -1;   // ensures that we write the first length
   private final int appendPostings(final FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
         throws CorruptIndexException, IOException {

     final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(smis[0].term.text);
     int df = 0;
     for (int i = 0; i < n; i++) {
       SegmentMergeInfo smi = smis[i];
       TermPositions postings = smi.getPositions();

@@ -631,114 +594,37 @@ final class SegmentMerger {
       int base = smi.base;
       int[] docMap = smi.getDocMap();
       postings.seek(smi.termEnum);

       while (postings.next()) {
         df++;
         int doc = postings.doc();
         if (docMap != null)
           doc = docMap[doc];                      // map around deletions
         doc += base;                              // convert to merged space

         if (doc < 0 || (df > 0 && doc <= lastDoc))
           throw new CorruptIndexException("docs out of order (" + doc +
               " <= " + lastDoc + " )");
         final int freq = postings.freq();
         final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(doc, freq);

         df++;

         if ((df % skipInterval) == 0) {
           skipListWriter.setSkipData(lastDoc, storePayloads, lastPayloadLength);
           skipListWriter.bufferSkip(df);
         }

         int docCode = (doc - lastDoc) << 1;      // use low bit to flag freq=1
         lastDoc = doc;

         int freq = postings.freq();
         if (freq == 1) {
           freqOutput.writeVInt(docCode | 1);      // write doc & freq=1
         } else {
           freqOutput.writeVInt(docCode);          // write doc
           freqOutput.writeVInt(freq);             // write frequency in doc
         }

         /** See {@link DocumentWriter#writePostings(Posting[], String)} for
          *  documentation about the encoding of positions and payloads
          */
         int lastPosition = 0;             // write position deltas
         for (int j = 0; j < freq; j++) {
           int position = postings.nextPosition();
           int delta = position - lastPosition;
           if (storePayloads) {
             int payloadLength = postings.getPayloadLength();
             if (payloadLength == lastPayloadLength) {
               proxOutput.writeVInt(delta * 2);
             } else {
               proxOutput.writeVInt(delta * 2 + 1);
               proxOutput.writeVInt(payloadLength);
               lastPayloadLength = payloadLength;
             }
         if (!omitTF) {
           for (int j = 0; j < freq; j++) {
             final int position = postings.nextPosition();
             final int payloadLength = postings.getPayloadLength();
             if (payloadLength > 0) {
               if (payloadBuffer == null || payloadBuffer.length < payloadLength) {
               if (payloadBuffer == null || payloadBuffer.length < payloadLength)
                 payloadBuffer = new byte[payloadLength];
               }
               postings.getPayload(payloadBuffer, 0);
               proxOutput.writeBytes(payloadBuffer, 0, payloadLength);
             }
           } else {
             proxOutput.writeVInt(delta);
             posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
           }
           lastPosition = position;
           posConsumer.finish();
         }
       }
     }
     docConsumer.finish();

     return df;
   }

   /** Process postings from multiple segments without tf, all positioned on the
    *  same term. Writes out merged entries only into freqOutput, proxOut is not written.
    *
    * @param smis array of segments
    * @param n number of cells in the array actually occupied
    * @return number of documents across all segments where this term was found
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
    */
   private final int appendPostingsNoTf(SegmentMergeInfo[] smis, int n)
         throws CorruptIndexException, IOException {
     int lastDoc = 0;
     int df = 0;          // number of docs w/ term
     skipListWriter.resetSkip();
     int lastPayloadLength = -1;   // ensures that we write the first length
     for (int i = 0; i < n; i++) {
       SegmentMergeInfo smi = smis[i];
       TermPositions postings = smi.getPositions();
       assert postings != null;
       int base = smi.base;
       int[] docMap = smi.getDocMap();
       postings.seek(smi.termEnum);
       while (postings.next()) {
         int doc = postings.doc();
         if (docMap != null)
           doc = docMap[doc];                      // map around deletions
         doc += base;                              // convert to merged space

         if (doc < 0 || (df > 0 && doc <= lastDoc))
           throw new CorruptIndexException("docs out of order (" + doc +
               " <= " + lastDoc + " )");

         df++;

         if ((df % skipInterval) == 0) {
           skipListWriter.setSkipData(lastDoc, false, lastPayloadLength);
           skipListWriter.bufferSkip(df);
         }

         int docCode = (doc - lastDoc);
         lastDoc = doc;
         freqOutput.writeVInt(docCode);      // write doc & freq=1
       }
     }
     return df;
   }

   private void mergeNorms() throws IOException {
     byte[] normBuffer = null;
     IndexOutput output = null;
@@ -0,0 +1,50 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashSet;
import java.util.Collection;

import org.apache.lucene.store.Directory;

class SegmentWriteState {
  DocumentsWriter docWriter;
  Directory directory;
  String segmentName;
  String docStoreSegmentName;
  int numDocs;
  int termIndexInterval;
  int numDocsInStore;
  Collection flushedFiles;

  public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, String docStoreSegmentName, int numDocs,
                           int numDocsInStore, int termIndexInterval) {
    this.docWriter = docWriter;
    this.directory = directory;
    this.segmentName = segmentName;
    this.docStoreSegmentName = docStoreSegmentName;
    this.numDocs = numDocs;
    this.numDocsInStore = numDocsInStore;
    this.termIndexInterval = termIndexInterval;
    flushedFiles = new HashSet();
  }

  public String segmentFileName(String ext) {
    return segmentName + "." + ext;
  }
}
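SegmentWriteState is mostly a value object: the directory and segment being written, the doc counts, the term index interval, and a flushedFiles set that every consumer adds its output files to via segmentFileName. A small hypothetical mirror of that bookkeeping, runnable on its own (the "frq"/"tis" extensions and the "_3" segment name are just example values):

import java.util.HashSet;
import java.util.Set;

// Hypothetical mirror of the bookkeeping SegmentWriteState provides to the consumers.
class SegmentWriteStateSketch {
  final String segmentName;
  final Set<String> flushedFiles = new HashSet<>();

  SegmentWriteStateSketch(String segmentName) { this.segmentName = segmentName; }

  String segmentFileName(String ext) { return segmentName + "." + ext; }

  public static void main(String[] args) {
    SegmentWriteStateSketch state = new SegmentWriteStateSketch("_3");
    // Each writer registers the files it creates, e.g. postings and terms dictionary files:
    state.flushedFiles.add(state.segmentFileName("frq"));
    state.flushedFiles.add(state.segmentFileName("tis"));
    System.out.println(state.flushedFiles); // e.g. [_3.tis, _3.frq] (set order is unspecified)
  }
}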
@@ -40,7 +40,7 @@ final class StoredFieldsWriter extends DocFieldConsumer {
     return new StoredFieldsWriterPerThread(docFieldProcessorPerThread, this);
   }

   synchronized public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
   synchronized public void flush(Map threadsAndFields, SegmentWriteState state) throws IOException {

     if (state.numDocsInStore > 0) {
       // It's possible that all documents seen in this segment

@@ -72,7 +72,7 @@ final class StoredFieldsWriter extends DocFieldConsumer {
     }
   }

   synchronized public void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
   synchronized public void closeDocStore(SegmentWriteState state) throws IOException {
     final int inc = state.numDocsInStore - lastDocID;
     if (inc > 0) {
       initFieldsWriter();
@@ -51,7 +51,7 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
       postings[i] = new PostingList();
   }

   synchronized void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException {
   synchronized void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException {

     if (tvx != null) {

@@ -80,7 +80,7 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
     }
   }

   synchronized void closeDocStore(final DocumentsWriter.FlushState state) throws IOException {
   synchronized void closeDocStore(final SegmentWriteState state) throws IOException {
     if (tvx != null) {
       // At least one doc in this run had term vectors
       // enabled
@@ -85,7 +85,7 @@ final class TermsHash extends InvertedDocConsumer {
       nextTermsHash.abort();
   }

   void shrinkFreePostings(Map threadsAndFields, DocumentsWriter.FlushState state) {
   void shrinkFreePostings(Map threadsAndFields, SegmentWriteState state) {

     assert postingsFreeCount == postingsAllocCount: Thread.currentThread().getName() + ": postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount + " consumer=" + consumer;

@@ -97,13 +97,13 @@ final class TermsHash extends InvertedDocConsumer {
     }
   }

   synchronized void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
   synchronized void closeDocStore(SegmentWriteState state) throws IOException {
     consumer.closeDocStore(state);
     if (nextTermsHash != null)
       nextTermsHash.closeDocStore(state);
   }

   synchronized void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException {
   synchronized void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException {
     Map childThreadsAndFields = new HashMap();
     Map nextThreadsAndFields;
@@ -24,9 +24,9 @@ abstract class TermsHashConsumer {
   abstract int bytesPerPosting();
   abstract void createPostings(RawPostingList[] postings, int start, int count);
   abstract TermsHashConsumerPerThread addThread(TermsHashPerThread perThread);
   abstract void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException;
   abstract void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException;
   abstract void abort();
   abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
   abstract void closeDocStore(SegmentWriteState state) throws IOException;

   FieldInfos fieldInfos;