mirror of https://github.com/apache/lucene.git
LUCENE-1212: factor DocumentsWriter into separate source files
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@636458 13f79535-47bb-0310-9956-ffa450edef68
parent a6258c76fd
commit e5f9b4e1cb
@@ -0,0 +1,29 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

// Used only internally to DW to call abort "up the stack"
class AbortException extends IOException {
  public AbortException(Throwable cause, DocumentsWriter docWriter) {
    super();
    initCause(cause);
    docWriter.setAborting();
  }
}
@@ -0,0 +1,146 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
import java.util.Map.Entry;

/** Holds buffered deletes, by docID, term or query.  We
 *  hold two instances of this class: one for the deletes
 *  prior to the last flush, the other for deletes after
 *  the last flush.  This is so if we need to abort
 *  (discard all buffered docs) we can also discard the
 *  buffered deletes yet keep the deletes done during
 *  previously flushed segments. */
class BufferedDeletes {
  int numTerms;
  HashMap terms = new HashMap();
  HashMap queries = new HashMap();
  List docIDs = new ArrayList();

  // Number of documents a delete term applies to.
  final static class Num {
    private int num;

    Num(int num) {
      this.num = num;
    }

    int getNum() {
      return num;
    }

    void setNum(int num) {
      // Only record the new number if it's greater than the
      // current one.  This is important because if multiple
      // threads are replacing the same doc at nearly the
      // same time, it's possible that one thread that got a
      // higher docID is scheduled before the other
      // threads.
      if (num > this.num)
        this.num = num;
    }
  }

  void update(BufferedDeletes in) {
    numTerms += in.numTerms;
    terms.putAll(in.terms);
    queries.putAll(in.queries);
    docIDs.addAll(in.docIDs);
    in.terms.clear();
    in.numTerms = 0;
    in.queries.clear();
    in.docIDs.clear();
  }

  void clear() {
    terms.clear();
    queries.clear();
    docIDs.clear();
    numTerms = 0;
  }

  boolean any() {
    return terms.size() > 0 || docIDs.size() > 0 || queries.size() > 0;
  }

  // Remaps all buffered deletes based on a completed
  // merge
  synchronized void remap(MergeDocIDRemapper mapper,
                          SegmentInfos infos,
                          int[][] docMaps,
                          int[] delCounts,
                          MergePolicy.OneMerge merge,
                          int mergeDocCount) {

    final HashMap newDeleteTerms;

    // Remap delete-by-term
    if (terms.size() > 0) {
      newDeleteTerms = new HashMap();
      Iterator iter = terms.entrySet().iterator();
      while(iter.hasNext()) {
        Entry entry = (Entry) iter.next();
        Num num = (Num) entry.getValue();
        newDeleteTerms.put(entry.getKey(),
                           new Num(mapper.remap(num.getNum())));
      }
    } else
      newDeleteTerms = null;

    // Remap delete-by-docID
    final List newDeleteDocIDs;

    if (docIDs.size() > 0) {
      newDeleteDocIDs = new ArrayList(docIDs.size());
      Iterator iter = docIDs.iterator();
      while(iter.hasNext()) {
        Integer num = (Integer) iter.next();
        newDeleteDocIDs.add(new Integer(mapper.remap(num.intValue())));
      }
    } else
      newDeleteDocIDs = null;

    // Remap delete-by-query
    final HashMap newDeleteQueries;

    if (queries.size() > 0) {
      newDeleteQueries = new HashMap(queries.size());
      Iterator iter = queries.entrySet().iterator();
      while(iter.hasNext()) {
        Entry entry = (Entry) iter.next();
        Integer num = (Integer) entry.getValue();
        newDeleteQueries.put(entry.getKey(),
                             new Integer(mapper.remap(num.intValue())));
      }
    } else
      newDeleteQueries = null;

    if (newDeleteTerms != null)
      terms = newDeleteTerms;
    if (newDeleteDocIDs != null)
      docIDs = newDeleteDocIDs;
    if (newDeleteQueries != null)
      queries = newDeleteQueries;
  }
}
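
The setNum rule above is easy to miss: concurrent updateDocument calls may try to record docID limits for the same delete term out of order, and only a larger value may ever replace the current one. A minimal sketch (not part of this commit; the classes are package-private, so this only compiles inside org.apache.lucene.index, and the Term key mirrors how DocumentsWriter buffers delete-by-term):

BufferedDeletes deletes = new BufferedDeletes();
BufferedDeletes.Num limit = new BufferedDeletes.Num(17);
limit.setNum(12);                  // ignored: a lower limit never overwrites a higher one
limit.setNum(23);                  // recorded
assert limit.getNum() == 23;
deletes.terms.put(new Term("id", "42"), limit);   // raw, pre-generics HashMap
assert deletes.any();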
@@ -0,0 +1,60 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.search.Similarity;

/* Stores norms, buffered in RAM, until they are flushed
 * to a partial segment. */
final class BufferedNorms {

  RAMOutputStream out;
  int upto;

  private static final byte defaultNorm = Similarity.encodeNorm(1.0f);

  BufferedNorms() {
    out = new RAMOutputStream();
  }

  void add(float norm) throws IOException {
    byte b = Similarity.encodeNorm(norm);
    out.writeByte(b);
    upto++;
  }

  void reset() {
    out.reset();
    upto = 0;
  }

  void fill(int docID) throws IOException {
    // Must now fill in docs that didn't have this
    // field.  Note that this is how norms can consume
    // tremendous storage when the docs have widely
    // varying different fields, because we are not
    // storing the norms sparsely (see LUCENE-830)
    if (upto < docID) {
      DocumentsWriter.fillBytes(out, defaultNorm, docID-upto);
      upto = docID;
    }
  }
}
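
A back-of-the-envelope illustration of the LUCENE-830 note in fill() above: because norms are not stored sparsely, every indexed field that keeps norms ends up paying one byte per document, whether or not the document actually contains that field. The numbers below are hypothetical.

long numDocs = 10000000L;         // hypothetical: 10 million documents
long fieldsWithNorms = 200;       // hypothetical: 200 distinct indexed fields with norms
long normBytes = numDocs * fieldsWithNorms;   // 2,000,000,000 bytes, roughly 1.9 GB of norms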
@@ -0,0 +1,142 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Class that Posting and PostingVector use to write byte
 * streams into shared fixed-size byte[] arrays.  The idea
 * is to allocate slices of increasing lengths.  For
 * example, the first slice is 5 bytes, the next slice is
 * 14, etc.  We start by writing our bytes into the first
 * 5 bytes.  When we hit the end of the slice, we allocate
 * the next slice and then write the address of the new
 * slice into the last 4 bytes of the previous slice (the
 * "forwarding address").
 *
 * Each slice is filled with 0's initially, and we mark
 * the end with a non-zero byte.  This way the methods
 * that are writing into the slice don't need to record
 * its length and instead allocate a new slice once they
 * hit a non-zero byte. */

import java.util.Arrays;

final class ByteBlockPool {

  public byte[][] buffers = new byte[10][];

  int bufferUpto = -1;                                       // Which buffer we are upto
  public int byteUpto = DocumentsWriter.BYTE_BLOCK_SIZE;     // Where we are in head buffer

  public byte[] buffer;                                      // Current head buffer
  public int byteOffset = -DocumentsWriter.BYTE_BLOCK_SIZE;  // Current head offset

  private boolean trackAllocations;
  DocumentsWriter docWriter;

  public ByteBlockPool(DocumentsWriter docWriter, boolean trackAllocations) {
    this.docWriter = docWriter;
    this.trackAllocations = trackAllocations;
  }

  public void reset() {
    if (bufferUpto != -1) {
      // We allocated at least one buffer

      for(int i=0;i<bufferUpto;i++)
        // Fully zero fill buffers that we fully used
        Arrays.fill(buffers[i], (byte) 0);

      // Partial zero fill the final buffer
      Arrays.fill(buffers[bufferUpto], 0, byteUpto, (byte) 0);

      if (bufferUpto > 0)
        // Recycle all but the first buffer
        docWriter.recycleByteBlocks(buffers, 1, 1+bufferUpto);

      // Re-use the first buffer
      bufferUpto = 0;
      byteUpto = 0;
      byteOffset = 0;
      buffer = buffers[0];
    }
  }

  public void nextBuffer() {
    if (1+bufferUpto == buffers.length) {
      byte[][] newBuffers = new byte[(int) (buffers.length*1.5)][];
      System.arraycopy(buffers, 0, newBuffers, 0, buffers.length);
      buffers = newBuffers;
    }
    buffer = buffers[1+bufferUpto] = docWriter.getByteBlock(trackAllocations);
    bufferUpto++;

    byteUpto = 0;
    byteOffset += DocumentsWriter.BYTE_BLOCK_SIZE;
  }

  public int newSlice(final int size) {
    if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-size)
      nextBuffer();
    final int upto = byteUpto;
    byteUpto += size;
    buffer[byteUpto-1] = 16;
    return upto;
  }

  // Size of each slice.  These arrays should be at most 16
  // elements.  First array is just a compact way to encode
  // X+1 with a max.  Second array is the length of each
  // slice, ie first slice is 5 bytes, next slice is 14
  // bytes, etc.
  final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
  final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200};
  final static int FIRST_LEVEL_SIZE = levelSizeArray[0];

  public int allocSlice(final byte[] slice, final int upto) {

    final int level = slice[upto] & 15;
    final int newLevel = nextLevelArray[level];
    final int newSize = levelSizeArray[newLevel];

    // Maybe allocate another block
    if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-newSize)
      nextBuffer();

    final int newUpto = byteUpto;
    final int offset = newUpto + byteOffset;
    byteUpto += newSize;

    // Copy forward the past 3 bytes (which we are about
    // to overwrite with the forwarding address):
    buffer[newUpto] = slice[upto-3];
    buffer[newUpto+1] = slice[upto-2];
    buffer[newUpto+2] = slice[upto-1];

    // Write forwarding address at end of last slice:
    slice[upto-3] = (byte) (offset >>> 24);
    slice[upto-2] = (byte) (offset >>> 16);
    slice[upto-1] = (byte) (offset >>> 8);
    slice[upto] = (byte) offset;

    // Write new level:
    buffer[byteUpto-1] = (byte) (16|newLevel);

    return newUpto+3;
  }
}
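
A minimal sketch (not part of this commit) of how a caller appends an open-ended byte stream to this pool; it mirrors DocumentsWriterFieldData.writeFreqByte later in this commit. The class name PoolWriter is illustrative only, and the sketch tracks just the in-buffer position, not the absolute byteOffset-based address that the real code records in each Posting.

final class PoolWriter {                       // illustrative name, same package
  private final ByteBlockPool pool;
  private byte[] block;                        // buffer currently being written
  private int upto;                            // write position inside that buffer

  PoolWriter(ByteBlockPool pool) {
    this.pool = pool;
    upto = pool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);   // first 5-byte slice
    block = pool.buffer;
  }

  void writeByte(byte b) {
    if (block[upto] != 0) {                    // non-zero byte == end-of-slice marker
      upto = pool.allocSlice(block, upto);     // grow the chain, write forwarding address
      block = pool.buffer;                     // allocSlice may have moved to a new buffer
    }
    block[upto++] = b;
  }
}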
@@ -0,0 +1,136 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import java.io.IOException;

/* IndexInput that knows how to read the byte slices written
 * by Posting and PostingVector.  We read the bytes in
 * each slice until we hit the end of that slice at which
 * point we read the forwarding address of the next slice
 * and then jump to it. */
final class ByteSliceReader extends IndexInput {
  ByteBlockPool pool;
  int bufferUpto;
  byte[] buffer;
  public int upto;
  int limit;
  int level;
  public int bufferOffset;

  public int endIndex;

  public void init(ByteBlockPool pool, int startIndex, int endIndex) {

    assert endIndex-startIndex > 0;

    this.pool = pool;
    this.endIndex = endIndex;

    level = 0;
    bufferUpto = startIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
    bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;
    buffer = pool.buffers[bufferUpto];
    upto = startIndex & DocumentsWriter.BYTE_BLOCK_MASK;

    final int firstSize = ByteBlockPool.levelSizeArray[0];

    if (startIndex+firstSize >= endIndex) {
      // There is only this one slice to read
      limit = endIndex & DocumentsWriter.BYTE_BLOCK_MASK;
    } else
      limit = upto+firstSize-4;
  }

  public byte readByte() {
    // Assert that we are not @ EOF
    assert upto + bufferOffset < endIndex;
    if (upto == limit)
      nextSlice();
    return buffer[upto++];
  }

  public long writeTo(IndexOutput out) throws IOException {
    long size = 0;
    while(true) {
      if (limit + bufferOffset == endIndex) {
        assert endIndex - bufferOffset >= upto;
        out.writeBytes(buffer, upto, limit-upto);
        size += limit-upto;
        break;
      } else {
        out.writeBytes(buffer, upto, limit-upto);
        size += limit-upto;
        nextSlice();
      }
    }

    return size;
  }

  public void nextSlice() {

    // Skip to our next slice
    final int nextIndex = ((buffer[limit]&0xff)<<24) + ((buffer[1+limit]&0xff)<<16) + ((buffer[2+limit]&0xff)<<8) + (buffer[3+limit]&0xff);

    level = ByteBlockPool.nextLevelArray[level];
    final int newSize = ByteBlockPool.levelSizeArray[level];

    bufferUpto = nextIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
    bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;

    buffer = pool.buffers[bufferUpto];
    upto = nextIndex & DocumentsWriter.BYTE_BLOCK_MASK;

    if (nextIndex + newSize >= endIndex) {
      // We are advancing to the final slice
      assert endIndex - nextIndex > 0;
      limit = endIndex - bufferOffset;
    } else {
      // This is not the final slice (subtract 4 for the
      // forwarding address at the end of this new slice)
      limit = upto+newSize-4;
    }
  }

  public void readBytes(byte[] b, int offset, int len) {
    while(len > 0) {
      final int numLeft = limit-upto;
      if (numLeft < len) {
        // Read entire slice
        System.arraycopy(buffer, upto, b, offset, numLeft);
        offset += numLeft;
        len -= numLeft;
        nextSlice();
      } else {
        // This slice is the last one
        System.arraycopy(buffer, upto, b, offset, len);
        upto += len;
        break;
      }
    }
  }

  public long getFilePointer() {throw new RuntimeException("not implemented");}
  public long length() {throw new RuntimeException("not implemented");}
  public void seek(long pos) {throw new RuntimeException("not implemented");}
  public void close() {throw new RuntimeException("not implemented");}
}
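
A small worked example (illustration only) of the hand-off between ByteBlockPool.allocSlice and nextSlice() above: the forwarding address is a plain 4-byte big-endian int holding the absolute address (byteOffset plus in-buffer position) of the next slice.

int offset = 70000;                            // absolute address of the newly allocated slice
byte[] slice = new byte[8];                    // stand-in for the tail of the previous slice
int upto = 7;                                  // position of the end marker that was just hit
slice[upto-3] = (byte) (offset >>> 24);        // exactly what allocSlice writes
slice[upto-2] = (byte) (offset >>> 16);
slice[upto-1] = (byte) (offset >>> 8);
slice[upto]   = (byte) offset;

int limit = upto-3;                            // nextSlice() reads the same four bytes back
int nextIndex = ((slice[limit]&0xff)<<24) + ((slice[1+limit]&0xff)<<16)
              + ((slice[2+limit]&0xff)<<8) + (slice[3+limit]&0xff);
// nextIndex == 70000; the reader then continues in
// pool.buffers[nextIndex / DocumentsWriter.BYTE_BLOCK_SIZE]
// at position nextIndex & DocumentsWriter.BYTE_BLOCK_MASK.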
@@ -0,0 +1,56 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

final class CharBlockPool {

  public char[][] buffers = new char[10][];
  int numBuffer;

  int bufferUpto = -1;                                       // Which buffer we are upto
  public int byteUpto = DocumentsWriter.CHAR_BLOCK_SIZE;     // Where we are in head buffer

  public char[] buffer;                                      // Current head buffer
  public int byteOffset = -DocumentsWriter.CHAR_BLOCK_SIZE;  // Current head offset
  private DocumentsWriter docWriter;

  public CharBlockPool(DocumentsWriter docWriter) {
    this.docWriter = docWriter;
  }

  public void reset() {
    docWriter.recycleCharBlocks(buffers, 1+bufferUpto);
    bufferUpto = -1;
    byteUpto = DocumentsWriter.CHAR_BLOCK_SIZE;
    byteOffset = -DocumentsWriter.CHAR_BLOCK_SIZE;
  }

  public void nextBuffer() {
    if (1+bufferUpto == buffers.length) {
      char[][] newBuffers = new char[(int) (buffers.length*1.5)][];
      System.arraycopy(buffers, 0, newBuffers, 0, buffers.length);
      buffers = newBuffers;
    }
    buffer = buffers[1+bufferUpto] = docWriter.getCharBlock();
    bufferUpto++;

    byteUpto = 0;
    byteOffset += DocumentsWriter.CHAR_BLOCK_SIZE;
  }
}
File diff suppressed because it is too large
@@ -0,0 +1,773 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.store.IndexOutput;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;

/** Used by DocumentsWriter to hold data associated with a
 *  single field in a single ThreadState, including the
 *  Postings hash.  A document may have many occurrences for
 *  a given field name; we gather all such occurrences here
 *  (in docFields) so that we can process the entire field
 *  at once. */

final class DocumentsWriterFieldData implements Comparable {

  final DocumentsWriterThreadState threadState;
  FieldInfo fieldInfo;

  int fieldCount;
  Fieldable[] docFields = new Fieldable[1];

  int lastGen = -1;
  DocumentsWriterFieldData next;

  boolean doNorms;
  boolean doVectors;
  boolean doVectorPositions;
  boolean doVectorOffsets;
  boolean postingsCompacted;

  int numPostings;

  Posting[] postingsHash;
  int postingsHashSize;
  int postingsHashHalfSize;
  int postingsHashMask;

  int position;
  int length;
  int offset;
  float boost;
  int postingsVectorsUpto;

  public DocumentsWriterFieldData(DocumentsWriterThreadState threadState, FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    this.threadState = threadState;
  }

  void resetPostingArrays() {
    if (!postingsCompacted)
      compactPostings();
    threadState.docWriter.recyclePostings(this.postingsHash, numPostings);
    Arrays.fill(postingsHash, 0, postingsHash.length, null);
    postingsCompacted = false;
    numPostings = 0;
  }

  void initPostingArrays() {
    // Target hash fill factor of <= 50%
    // NOTE: must be a power of two for hash collision
    // strategy to work correctly
    postingsHashSize = 4;
    postingsHashHalfSize = 2;
    postingsHashMask = postingsHashSize-1;
    postingsHash = new Posting[postingsHashSize];
  }

  public int compareTo(Object o) {
    return fieldInfo.name.compareTo(((DocumentsWriterFieldData) o).fieldInfo.name);
  }

  private void compactPostings() {
    int upto = 0;
    for(int i=0;i<postingsHashSize;i++)
      if (postingsHash[i] != null)
        postingsHash[upto++] = postingsHash[i];

    assert upto == numPostings;
    postingsCompacted = true;
  }

  /** Collapse the hash table & sort in-place. */
  public Posting[] sortPostings() {
    compactPostings();
    threadState.doPostingSort(postingsHash, numPostings);
    return postingsHash;
  }

  /** Process all occurrences of one field in the document. */
  public void processField(Analyzer analyzer) throws IOException, AbortException {
    length = 0;
    position = 0;
    offset = 0;
    boost = threadState.docBoost;

    final int maxFieldLength = threadState.docWriter.writer.getMaxFieldLength();

    final int limit = fieldCount;
    final Fieldable[] docFieldsFinal = docFields;

    boolean doWriteVectors = true;

    // Walk through all occurrences in this doc for this
    // field:
    try {
      for(int j=0;j<limit;j++) {
        Fieldable field = docFieldsFinal[j];

        if (field.isIndexed())
          invertField(field, analyzer, maxFieldLength);

        if (field.isStored()) {
          threadState.numStoredFields++;
          boolean success = false;
          try {
            threadState.localFieldsWriter.writeField(fieldInfo, field);
            success = true;
          } finally {
            // If we hit an exception inside
            // localFieldsWriter.writeField, the
            // contents of fdtLocal can be corrupt, so
            // we must discard all stored fields for
            // this document:
            if (!success)
              threadState.fdtLocal.reset();
          }
        }

        docFieldsFinal[j] = null;
      }
    } catch (AbortException ae) {
      doWriteVectors = false;
      throw ae;
    } finally {
      if (postingsVectorsUpto > 0) {
        try {
          if (doWriteVectors) {
            // Add term vectors for this field
            boolean success = false;
            try {
              writeVectors(fieldInfo);
              success = true;
            } finally {
              if (!success) {
                // If we hit an exception inside
                // writeVectors, the contents of tvfLocal
                // can be corrupt, so we must discard all
                // term vectors for this document:
                threadState.numVectorFields = 0;
                threadState.tvfLocal.reset();
              }
            }
          }
        } finally {
          if (postingsVectorsUpto > threadState.maxPostingsVectors)
            threadState.maxPostingsVectors = postingsVectorsUpto;
          postingsVectorsUpto = 0;
          threadState.vectorsPool.reset();
        }
      }
    }
  }

  int offsetEnd;
  Token localToken = new Token();

  /* Invert one occurrence of one field in the document */
  public void invertField(Fieldable field, Analyzer analyzer, final int maxFieldLength) throws IOException, AbortException {

    if (length>0)
      position += analyzer.getPositionIncrementGap(fieldInfo.name);

    if (!field.isTokenized()) {               // un-tokenized field
      String stringValue = field.stringValue();
      final int valueLength = stringValue.length();
      Token token = localToken;
      token.clear();
      char[] termBuffer = token.termBuffer();
      if (termBuffer.length < valueLength)
        termBuffer = token.resizeTermBuffer(valueLength);
      stringValue.getChars(0, valueLength, termBuffer, 0);
      token.setTermLength(valueLength);
      token.setStartOffset(offset);
      token.setEndOffset(offset + stringValue.length());
      addPosition(token);
      offset += stringValue.length();
      length++;
    } else {                                  // tokenized field
      final TokenStream stream;
      final TokenStream streamValue = field.tokenStreamValue();

      if (streamValue != null)
        stream = streamValue;
      else {
        // the field does not have a TokenStream,
        // so we have to obtain one from the analyzer
        final Reader reader;                  // find or make Reader
        final Reader readerValue = field.readerValue();

        if (readerValue != null)
          reader = readerValue;
        else {
          String stringValue = field.stringValue();
          if (stringValue == null)
            throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
          threadState.stringReader.init(stringValue);
          reader = threadState.stringReader;
        }

        // Tokenize field and add to postingTable
        stream = analyzer.reusableTokenStream(fieldInfo.name, reader);
      }

      // reset the TokenStream to the first token
      stream.reset();

      try {
        offsetEnd = offset-1;
        Token token;
        for(;;) {
          token = stream.next(localToken);
          if (token == null) break;
          position += (token.getPositionIncrement() - 1);
          addPosition(token);
          if (++length >= maxFieldLength) {
            if (threadState.docWriter.infoStream != null)
              threadState.docWriter.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
            break;
          }
        }
        offset = offsetEnd+1;
      } finally {
        stream.close();
      }
    }

    boost *= field.getBoost();
  }

  /** Only called when term vectors are enabled.  This
   *  is called the first time we see a given term for
   *  each document, to allocate a PostingVector
   *  instance that is used to record data needed to
   *  write the posting vectors. */
  private PostingVector addNewVector() {

    if (postingsVectorsUpto == threadState.postingsVectors.length) {
      final int newSize;
      if (threadState.postingsVectors.length < 2)
        newSize = 2;
      else
        newSize = (int) (1.5*threadState.postingsVectors.length);
      PostingVector[] newArray = new PostingVector[newSize];
      System.arraycopy(threadState.postingsVectors, 0, newArray, 0, threadState.postingsVectors.length);
      threadState.postingsVectors = newArray;
    }

    p.vector = threadState.postingsVectors[postingsVectorsUpto];
    if (p.vector == null)
      p.vector = threadState.postingsVectors[postingsVectorsUpto] = new PostingVector();

    postingsVectorsUpto++;

    final PostingVector v = p.vector;
    v.p = p;

    if (doVectorPositions) {
      final int upto = threadState.vectorsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
      v.posStart = v.posUpto = threadState.vectorsPool.byteOffset + upto;
    }

    if (doVectorOffsets) {
      final int upto = threadState.vectorsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
      v.offsetStart = v.offsetUpto = threadState.vectorsPool.byteOffset + upto;
    }

    return v;
  }

  int offsetStartCode;
  int offsetStart;

  // Current posting we are working on
  Posting p;
  PostingVector vector;

  /** Test whether the text for current Posting p equals
   *  current tokenText. */
  boolean postingEquals(final char[] tokenText, final int tokenTextLen) {

    final char[] text = threadState.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
    assert text != null;
    int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

    int tokenPos = 0;
    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
      if (tokenText[tokenPos] != text[pos])
        return false;
    return 0xffff == text[pos];
  }

  /** This is the hotspot of indexing: it's called once
   *  for every term of every document.  Its job is to
   *  update the postings byte stream (Postings hash)
   *  based on the occurrence of a single term. */
  private void addPosition(Token token) throws AbortException {

    final Payload payload = token.getPayload();

    // Get the text of this term.  Term can either
    // provide a String token or offset into a char[]
    // array
    final char[] tokenText = token.termBuffer();
    final int tokenTextLen = token.termLength();

    int code = 0;

    // Compute hashcode
    int downto = tokenTextLen;
    while (downto > 0)
      code = (code*31) + tokenText[--downto];

    // System.out.println("  addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);

    int hashPos = code & postingsHashMask;

    assert !postingsCompacted;

    // Locate Posting in hash
    p = postingsHash[hashPos];

    if (p != null && !postingEquals(tokenText, tokenTextLen)) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        p = postingsHash[hashPos];
      } while (p != null && !postingEquals(tokenText, tokenTextLen));
    }

    final int proxCode;

    // If we hit an exception below, it's possible the
    // posting list or term vectors data will be
    // partially written and thus inconsistent if
    // flushed, so we have to abort all documents
    // since the last flush:

    try {

      if (p != null) {       // term seen since last flush

        if (threadState.docID != p.lastDocID) { // term not yet seen in this doc

          // System.out.println("    seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto +" proxUpto=" + p.proxUpto);

          assert p.docFreq > 0;

          // Now that we know doc freq for previous doc,
          // write it & lastDocCode
          freqUpto = p.freqUpto & DocumentsWriter.BYTE_BLOCK_MASK;
          freq = threadState.postingsPool.buffers[p.freqUpto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
          if (1 == p.docFreq)
            writeFreqVInt(p.lastDocCode|1);
          else {
            writeFreqVInt(p.lastDocCode);
            writeFreqVInt(p.docFreq);
          }
          p.freqUpto = freqUpto + (p.freqUpto & DocumentsWriter.BYTE_BLOCK_NOT_MASK);

          if (doVectors) {
            vector = addNewVector();
            if (doVectorOffsets) {
              offsetStartCode = offsetStart = offset + token.startOffset();
              offsetEnd = offset + token.endOffset();
            }
          }

          proxCode = position;

          p.docFreq = 1;

          // Store code so we can write this after we're
          // done with this new doc
          p.lastDocCode = (threadState.docID-p.lastDocID) << 1;
          p.lastDocID = threadState.docID;

        } else {                                // term already seen in this doc
          // System.out.println("    seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);
          p.docFreq++;

          proxCode = position-p.lastPosition;

          if (doVectors) {
            vector = p.vector;
            if (vector == null)
              vector = addNewVector();
            if (doVectorOffsets) {
              offsetStart = offset + token.startOffset();
              offsetEnd = offset + token.endOffset();
              offsetStartCode = offsetStart-vector.lastOffset;
            }
          }
        }
      } else {                                  // term not seen before
        // System.out.println("    never seen docID=" + docID);

        // Refill?
        if (0 == threadState.postingsFreeCount) {
          threadState.docWriter.getPostings(threadState.postingsFreeList);
          threadState.postingsFreeCount = threadState.postingsFreeList.length;
        }

        final int textLen1 = 1+tokenTextLen;
        if (textLen1 + threadState.charPool.byteUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
          if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
            // Just skip this term, to remain as robust as
            // possible during indexing.  A TokenFilter
            // can be inserted into the analyzer chain if
            // other behavior is wanted (pruning the term
            // to a prefix, throwing an exception, etc).
            if (threadState.maxTermPrefix == null)
              threadState.maxTermPrefix = new String(tokenText, 0, 30);

            // Still increment position:
            position++;
            return;
          }
          threadState.charPool.nextBuffer();
        }

        final char[] text = threadState.charPool.buffer;
        final int textUpto = threadState.charPool.byteUpto;

        // Pull next free Posting from free list
        p = threadState.postingsFreeList[--threadState.postingsFreeCount];

        p.textStart = textUpto + threadState.charPool.byteOffset;
        threadState.charPool.byteUpto += textLen1;

        System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);

        text[textUpto+tokenTextLen] = 0xffff;

        assert postingsHash[hashPos] == null;

        postingsHash[hashPos] = p;
        numPostings++;

        if (numPostings == postingsHashHalfSize)
          rehashPostings(2*postingsHashSize);

        // Init first slice for freq & prox streams
        final int upto1 = threadState.postingsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        p.freqStart = p.freqUpto = threadState.postingsPool.byteOffset + upto1;

        final int upto2 = threadState.postingsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        p.proxStart = p.proxUpto = threadState.postingsPool.byteOffset + upto2;

        p.lastDocCode = threadState.docID << 1;
        p.lastDocID = threadState.docID;
        p.docFreq = 1;

        if (doVectors) {
          vector = addNewVector();
          if (doVectorOffsets) {
            offsetStart = offsetStartCode = offset + token.startOffset();
            offsetEnd = offset + token.endOffset();
          }
        }

        proxCode = position;
      }

      proxUpto = p.proxUpto & DocumentsWriter.BYTE_BLOCK_MASK;
      prox = threadState.postingsPool.buffers[p.proxUpto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
      assert prox != null;

      if (payload != null && payload.length > 0) {
        writeProxVInt((proxCode<<1)|1);
        writeProxVInt(payload.length);
        writeProxBytes(payload.data, payload.offset, payload.length);
        fieldInfo.storePayloads = true;
      } else
        writeProxVInt(proxCode<<1);

      p.proxUpto = proxUpto + (p.proxUpto & DocumentsWriter.BYTE_BLOCK_NOT_MASK);

      p.lastPosition = position++;

      if (doVectorPositions) {
        posUpto = vector.posUpto & DocumentsWriter.BYTE_BLOCK_MASK;
        pos = threadState.vectorsPool.buffers[vector.posUpto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
        writePosVInt(proxCode);
        vector.posUpto = posUpto + (vector.posUpto & DocumentsWriter.BYTE_BLOCK_NOT_MASK);
      }

      if (doVectorOffsets) {
        offsetUpto = vector.offsetUpto & DocumentsWriter.BYTE_BLOCK_MASK;
        offsets = threadState.vectorsPool.buffers[vector.offsetUpto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
        writeOffsetVInt(offsetStartCode);
        writeOffsetVInt(offsetEnd-offsetStart);
        vector.lastOffset = offsetEnd;
        vector.offsetUpto = offsetUpto + (vector.offsetUpto & DocumentsWriter.BYTE_BLOCK_NOT_MASK);
      }
    } catch (Throwable t) {
      throw new AbortException(t, threadState.docWriter);
    }
  }

  /** Write vInt into freq stream of current Posting */
  public void writeFreqVInt(int i) {
    while ((i & ~0x7F) != 0) {
      writeFreqByte((byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeFreqByte((byte) i);
  }

  /** Write vInt into prox stream of current Posting */
  public void writeProxVInt(int i) {
    while ((i & ~0x7F) != 0) {
      writeProxByte((byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeProxByte((byte) i);
  }

  /** Write byte into freq stream of current Posting */
  byte[] freq;
  int freqUpto;
  public void writeFreqByte(byte b) {
    assert freq != null;
    if (freq[freqUpto] != 0) {
      freqUpto = threadState.postingsPool.allocSlice(freq, freqUpto);
      freq = threadState.postingsPool.buffer;
      p.freqUpto = threadState.postingsPool.byteOffset;
    }
    freq[freqUpto++] = b;
  }

  /** Write byte into prox stream of current Posting */
  byte[] prox;
  int proxUpto;
  public void writeProxByte(byte b) {
    assert prox != null;
    if (prox[proxUpto] != 0) {
      proxUpto = threadState.postingsPool.allocSlice(prox, proxUpto);
      prox = threadState.postingsPool.buffer;
      p.proxUpto = threadState.postingsPool.byteOffset;
      assert prox != null;
    }
    prox[proxUpto++] = b;
    assert proxUpto != prox.length;
  }

  /** Currently only used to copy a payload into the prox
   *  stream. */
  public void writeProxBytes(byte[] b, int offset, int len) {
    final int offsetEnd = offset + len;
    while(offset < offsetEnd) {
      if (prox[proxUpto] != 0) {
        // End marker
        proxUpto = threadState.postingsPool.allocSlice(prox, proxUpto);
        prox = threadState.postingsPool.buffer;
        p.proxUpto = threadState.postingsPool.byteOffset;
      }

      prox[proxUpto++] = b[offset++];
      assert proxUpto != prox.length;
    }
  }

  /** Write vInt into offsets stream of current
   *  PostingVector */
  public void writeOffsetVInt(int i) {
    while ((i & ~0x7F) != 0) {
      writeOffsetByte((byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeOffsetByte((byte) i);
  }

  byte[] offsets;
  int offsetUpto;

  /** Write byte into offsets stream of current
   *  PostingVector */
  public void writeOffsetByte(byte b) {
    assert offsets != null;
    if (offsets[offsetUpto] != 0) {
      offsetUpto = threadState.vectorsPool.allocSlice(offsets, offsetUpto);
      offsets = threadState.vectorsPool.buffer;
      vector.offsetUpto = threadState.vectorsPool.byteOffset;
    }
    offsets[offsetUpto++] = b;
  }

  /** Write vInt into pos stream of current
   *  PostingVector */
  public void writePosVInt(int i) {
    while ((i & ~0x7F) != 0) {
      writePosByte((byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writePosByte((byte) i);
  }

  byte[] pos;
  int posUpto;

  /** Write byte into pos stream of current
   *  PostingVector */
  public void writePosByte(byte b) {
    assert pos != null;
    if (pos[posUpto] != 0) {
      posUpto = threadState.vectorsPool.allocSlice(pos, posUpto);
      pos = threadState.vectorsPool.buffer;
      vector.posUpto = threadState.vectorsPool.byteOffset;
    }
    pos[posUpto++] = b;
  }

  /** Called when postings hash is too small (> 50%
   *  occupied) or too large (< 20% occupied). */
  void rehashPostings(final int newSize) {

    final int newMask = newSize-1;

    Posting[] newHash = new Posting[newSize];
    for(int i=0;i<postingsHashSize;i++) {
      Posting p0 = postingsHash[i];
      if (p0 != null) {
        final int start = p0.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
        final char[] text = threadState.charPool.buffers[p0.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        int pos = start;
        while(text[pos] != 0xffff)
          pos++;
        int code = 0;
        while (pos > start)
          code = (code*31) + text[--pos];

        int hashPos = code & newMask;
        assert hashPos >= 0;
        if (newHash[hashPos] != null) {
          final int inc = ((code>>8)+code)|1;
          do {
            code += inc;
            hashPos = code & newMask;
          } while (newHash[hashPos] != null);
        }
        newHash[hashPos] = p0;
      }
    }

    postingsHashMask = newMask;
    postingsHash = newHash;
    postingsHashSize = newSize;
    postingsHashHalfSize = newSize >> 1;
  }

  final ByteSliceReader vectorSliceReader = new ByteSliceReader();

  /** Called once per field per document if term vectors
   *  are enabled, to write the vectors to RAMOutputStream,
   *  which is then quickly flushed to the real term
   *  vectors files in the Directory. */
  void writeVectors(FieldInfo fieldInfo) throws IOException {

    assert fieldInfo.storeTermVector;
    assert threadState.vectorFieldsInOrder(fieldInfo);

    threadState.vectorFieldNumbers[threadState.numVectorFields] = fieldInfo.number;
    threadState.vectorFieldPointers[threadState.numVectorFields] = threadState.tvfLocal.getFilePointer();
    threadState.numVectorFields++;

    final int numPostingsVectors = postingsVectorsUpto;
    final PostingVector[] postingsVectors = threadState.postingsVectors;

    final IndexOutput tvfLocal = threadState.tvfLocal;

    threadState.tvfLocal.writeVInt(numPostingsVectors);
    byte bits = 0x0;
    if (doVectorPositions)
      bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
    if (doVectorOffsets)
      bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
    threadState.tvfLocal.writeByte(bits);

    threadState.doVectorSort(postingsVectors, numPostingsVectors);

    Posting lastPosting = null;

    final ByteSliceReader reader = vectorSliceReader;
    final char[][] charBuffers = threadState.charPool.buffers;

    for(int j=0;j<numPostingsVectors;j++) {
      final PostingVector vector = postingsVectors[j];
      Posting posting = vector.p;
      final int freq = posting.docFreq;

      final int prefix;
      final char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
      final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
      int pos2 = start2;

      // Compute common prefix between last term and
      // this term
      if (lastPosting == null)
        prefix = 0;
      else {
        final char[] text1 = charBuffers[lastPosting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int start1 = lastPosting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
        int pos1 = start1;
        while(true) {
          final char c1 = text1[pos1];
          final char c2 = text2[pos2];
          if (c1 != c2 || c1 == 0xffff) {
            prefix = pos1-start1;
            break;
          }
          pos1++;
          pos2++;
        }
      }
      lastPosting = posting;

      // Compute length
      while(text2[pos2] != 0xffff)
        pos2++;

      final int suffix = pos2 - start2 - prefix;
      tvfLocal.writeVInt(prefix);
      tvfLocal.writeVInt(suffix);
      tvfLocal.writeChars(text2, start2 + prefix, suffix);
      tvfLocal.writeVInt(freq);

      if (doVectorPositions) {
        reader.init(threadState.vectorsPool, vector.posStart, vector.posUpto);
        reader.writeTo(tvfLocal);
      }

      if (doVectorOffsets) {
        reader.init(threadState.vectorsPool, vector.offsetStart, vector.offsetUpto);
        reader.writeTo(tvfLocal);
      }
    }
  }
}
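
The writeFreqVInt / writeProxVInt / writeOffsetVInt / writePosVInt methods above all emit the same variable-length int format used elsewhere in Lucene: 7 data bits per byte, least-significant group first, with the high bit set on every byte except the last. A standalone sketch (illustration only):

static byte[] toVInt(int i) {
  java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
  while ((i & ~0x7F) != 0) {
    out.write((i & 0x7f) | 0x80);            // low 7 bits, "more bytes follow" flag
    i >>>= 7;
  }
  out.write(i);                              // final byte, high bit clear
  return out.toByteArray();
}
// toVInt(5)   -> { 0x05 }
// toVInt(300) -> { (byte) 0xAC, 0x02 }      // 300 = 0b10_0101100 -> 0101100 with flag, then 10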
@@ -0,0 +1,89 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

/** Used by DocumentsWriter to merge the postings from
 *  multiple ThreadStates when creating a segment */
final class DocumentsWriterFieldMergeState {

  DocumentsWriterFieldData field;

  Posting[] postings;

  private Posting p;
  char[] text;
  int textOffset;

  private int postingUpto = -1;

  ByteSliceReader freq = new ByteSliceReader();
  ByteSliceReader prox = new ByteSliceReader();

  int docID;
  int termFreq;

  boolean nextTerm() throws IOException {
    postingUpto++;
    if (postingUpto == field.numPostings)
      return false;

    p = postings[postingUpto];
    docID = 0;

    text = field.threadState.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
    textOffset = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

    if (p.freqUpto > p.freqStart)
      freq.init(field.threadState.postingsPool, p.freqStart, p.freqUpto);
    else
      freq.bufferOffset = freq.upto = freq.endIndex = 0;

    prox.init(field.threadState.postingsPool, p.proxStart, p.proxUpto);

    // Should always be true
    boolean result = nextDoc();
    assert result;

    return true;
  }

  public boolean nextDoc() throws IOException {
    if (freq.bufferOffset + freq.upto == freq.endIndex) {
      if (p.lastDocCode != -1) {
        // Return last doc
        docID = p.lastDocID;
        termFreq = p.docFreq;
        p.lastDocCode = -1;
        return true;
      } else
        // EOF
        return false;
    }

    final int code = freq.readVInt();
    docID += code >>> 1;
    if ((code & 1) != 0)
      termFreq = 1;
    else
      termFreq = freq.readVInt();

    return true;
  }
}
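
A minimal sketch (not part of this commit) of the freq-stream entry that nextDoc() above decodes and that DocumentsWriterFieldData.addPosition produces: each entry is the doc-to-doc delta shifted left by one, with the low bit set when the term frequency is exactly 1 so the separate freq vInt can be skipped. writeVInt here stands in for the slice-backed writeFreqVInt; writeDocFreqEntry is an illustrative name.

static void writeDocFreqEntry(IndexOutput out, int docDelta, int termFreq) throws IOException {
  if (termFreq == 1)
    out.writeVInt((docDelta << 1) | 1);      // e.g. delta 5, freq 1 -> single value 11
  else {
    out.writeVInt(docDelta << 1);            // e.g. delta 4, freq 3 -> value 8 ...
    out.writeVInt(termFreq);                 // ... followed by 3
  }
  // nextDoc() reverses this: docID += code >>> 1, and it reads the extra
  // freq vInt only when the low bit is clear.
}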
@ -0,0 +1,719 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
||||
/** Used by DocumentsWriter to maintain per-thread state.
|
||||
* We keep a separate Posting hash and other state for each
|
||||
* thread and then merge postings hashes from all threads
|
||||
* when writing the segment. */
|
||||
final class DocumentsWriterThreadState {
|
||||
|
||||
Posting[] postingsFreeList; // Free Posting instances
|
||||
int postingsFreeCount;
|
||||
|
||||
RAMOutputStream tvfLocal = new RAMOutputStream(); // Term vectors for one doc
|
||||
RAMOutputStream fdtLocal = new RAMOutputStream(); // Stored fields for one doc
|
||||
FieldsWriter localFieldsWriter; // Fields for one doc
|
||||
|
||||
long[] vectorFieldPointers;
|
||||
int[] vectorFieldNumbers;
|
||||
|
||||
boolean isIdle = true; // Whether we are in use
|
||||
int numThreads = 1; // Number of threads that use this instance
|
||||
|
||||
int docID; // docID we are now working on
|
||||
int numStoredFields; // How many stored fields in current doc
|
||||
float docBoost; // Boost for current doc
|
||||
|
||||
DocumentsWriterFieldData[] fieldDataArray; // Fields touched by current doc
|
||||
int numFieldData; // How many fields in current doc
|
||||
int numVectorFields; // How many vector fields in current doc
|
||||
|
||||
DocumentsWriterFieldData[] allFieldDataArray = new DocumentsWriterFieldData[10]; // All FieldData instances
|
||||
int numAllFieldData;
|
||||
DocumentsWriterFieldData[] fieldDataHash; // Hash FieldData instances by field name
|
||||
int fieldDataHashMask;
|
||||
String maxTermPrefix; // Non-null prefix of a too-large term if this
|
||||
// doc has one
|
||||
|
||||
boolean doFlushAfter;
|
||||
|
||||
final DocumentsWriter docWriter;
|
||||
|
||||
final ByteBlockPool postingsPool;
|
||||
final ByteBlockPool vectorsPool;
|
||||
final CharBlockPool charPool;
|
||||
|
||||
public DocumentsWriterThreadState(DocumentsWriter docWriter) {
|
||||
this.docWriter = docWriter;
|
||||
fieldDataArray = new DocumentsWriterFieldData[8];
|
||||
|
||||
fieldDataHash = new DocumentsWriterFieldData[16];
|
||||
fieldDataHashMask = 15;
|
||||
|
||||
vectorFieldPointers = new long[10];
|
||||
vectorFieldNumbers = new int[10];
|
||||
postingsFreeList = new Posting[256];
|
||||
postingsFreeCount = 0;
|
||||
|
||||
postingsPool = new ByteBlockPool(docWriter ,true);
|
||||
vectorsPool = new ByteBlockPool(docWriter, false);
|
||||
charPool = new CharBlockPool(docWriter);
|
||||
}
|
||||
|
||||
/** Clear the postings hash and return objects back to
|
||||
* shared pool */
|
||||
public void resetPostings() throws IOException {
|
||||
fieldGen = 0;
|
||||
maxPostingsVectors = 0;
|
||||
doFlushAfter = false;
|
||||
if (localFieldsWriter != null) {
|
||||
localFieldsWriter.close();
|
||||
localFieldsWriter = null;
|
||||
}
|
||||
postingsPool.reset();
|
||||
charPool.reset();
|
||||
docWriter.recyclePostings(postingsFreeList, postingsFreeCount);
|
||||
postingsFreeCount = 0;
|
||||
for(int i=0;i<numAllFieldData;i++) {
|
||||
DocumentsWriterFieldData fp = allFieldDataArray[i];
|
||||
fp.lastGen = -1;
|
||||
if (fp.numPostings > 0)
|
||||
fp.resetPostingArrays();
|
||||
}
|
||||
}
|
||||
|
||||
/** Move all per-document state that was accumulated in
|
||||
* the ThreadState into the "real" stores. */
|
||||
public void writeDocument() throws IOException, AbortException {
|
||||
|
||||
// If we hit an exception while appending to the
|
||||
// stored fields or term vectors files, we have to
|
||||
// abort all documents since we last flushed because
|
||||
// it means those files are possibly inconsistent.
|
||||
try {
|
||||
|
||||
docWriter.numDocsInStore++;
|
||||
|
||||
// Append stored fields to the real FieldsWriter:
|
||||
docWriter.fieldsWriter.flushDocument(numStoredFields, fdtLocal);
|
||||
fdtLocal.reset();
|
||||
|
||||
// Append term vectors to the real outputs:
|
||||
final IndexOutput tvx = docWriter.tvx;
|
||||
final IndexOutput tvd = docWriter.tvd;
|
||||
final IndexOutput tvf = docWriter.tvf;
|
||||
if (tvx != null) {
|
||||
tvx.writeLong(tvd.getFilePointer());
|
||||
tvx.writeLong(tvf.getFilePointer());
|
||||
tvd.writeVInt(numVectorFields);
|
||||
if (numVectorFields > 0) {
|
||||
for(int i=0;i<numVectorFields;i++)
|
||||
tvd.writeVInt(vectorFieldNumbers[i]);
|
||||
assert 0 == vectorFieldPointers[0];
|
||||
long lastPos = vectorFieldPointers[0];
|
||||
for(int i=1;i<numVectorFields;i++) {
|
||||
long pos = vectorFieldPointers[i];
|
||||
tvd.writeVLong(pos-lastPos);
|
||||
lastPos = pos;
|
||||
}
|
||||
tvfLocal.writeTo(tvf);
|
||||
tvfLocal.reset();
|
||||
}
|
||||
}
|
||||
|
||||
// Append norms for the fields we saw:
|
||||
for(int i=0;i<numFieldData;i++) {
|
||||
DocumentsWriterFieldData fp = fieldDataArray[i];
|
||||
if (fp.doNorms) {
|
||||
BufferedNorms bn = docWriter.norms[fp.fieldInfo.number];
|
||||
assert bn != null;
|
||||
assert bn.upto <= docID;
|
||||
bn.fill(docID);
|
||||
float norm = fp.boost * docWriter.writer.getSimilarity().lengthNorm(fp.fieldInfo.name, fp.length);
|
||||
bn.add(norm);
|
||||
}
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
// Forcefully idle this threadstate -- its state will
|
||||
// be reset by abort()
|
||||
isIdle = true;
|
||||
throw new AbortException(t, docWriter);
|
||||
}
|
||||
|
||||
if (docWriter.bufferIsFull && !docWriter.flushPending) {
|
||||
docWriter.flushPending = true;
|
||||
doFlushAfter = true;
|
||||
}
|
||||
}
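// The term-vector index entries above are delta-encoded: for each field only
// the gap from the previous tvf file pointer is written, so the values stay
// small and pack tightly as VLongs.  A minimal sketch of the same idea as a
// stand-alone helper (the method name is hypothetical):
private static void writeDeltaEncodedPointers(IndexOutput out, long[] pointers, int count) throws IOException {
  long last = 0;
  for(int i=0;i<count;i++) {
    out.writeVLong(pointers[i] - last);    // write the gap, not the absolute position
    last = pointers[i];
  }
}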
|
||||
|
||||
int fieldGen;
|
||||
|
||||
/** Initializes shared state for this new document */
|
||||
void init(Document doc, int docID) throws IOException, AbortException {
|
||||
|
||||
assert !isIdle;
|
||||
assert docWriter.writer.testPoint("DocumentsWriter.ThreadState.init start");
|
||||
|
||||
this.docID = docID;
|
||||
docBoost = doc.getBoost();
|
||||
numStoredFields = 0;
|
||||
numFieldData = 0;
|
||||
numVectorFields = 0;
|
||||
maxTermPrefix = null;
|
||||
|
||||
assert 0 == fdtLocal.length();
|
||||
assert 0 == fdtLocal.getFilePointer();
|
||||
assert 0 == tvfLocal.length();
|
||||
assert 0 == tvfLocal.getFilePointer();
|
||||
final int thisFieldGen = fieldGen++;
|
||||
|
||||
List docFields = doc.getFields();
|
||||
final int numDocFields = docFields.size();
|
||||
boolean docHasVectors = false;
|
||||
|
||||
// Absorb any new fields first seen in this document.
|
||||
// Also absorb any changes to fields we had already
|
||||
// seen before (eg suddenly turning on norms or
|
||||
// vectors, etc.):
|
||||
|
||||
for(int i=0;i<numDocFields;i++) {
|
||||
Fieldable field = (Fieldable) docFields.get(i);
|
||||
|
||||
FieldInfo fi = docWriter.fieldInfos.add(field.name(), field.isIndexed(), field.isTermVectorStored(),
|
||||
field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(),
|
||||
field.getOmitNorms(), false);
|
||||
if (fi.isIndexed && !fi.omitNorms) {
|
||||
// Maybe grow our buffered norms
|
||||
if (docWriter.norms.length <= fi.number) {
|
||||
int newSize = (int) ((1+fi.number)*1.25);
|
||||
BufferedNorms[] newNorms = new BufferedNorms[newSize];
|
||||
System.arraycopy(docWriter.norms, 0, newNorms, 0, docWriter.norms.length);
|
||||
docWriter.norms = newNorms;
|
||||
}
|
||||
|
||||
if (docWriter.norms[fi.number] == null)
|
||||
docWriter.norms[fi.number] = new BufferedNorms();
|
||||
|
||||
docWriter.hasNorms = true;
|
||||
}
|
||||
|
||||
// Make sure we have a FieldData allocated
|
||||
int hashPos = fi.name.hashCode() & fieldDataHashMask;
|
||||
DocumentsWriterFieldData fp = fieldDataHash[hashPos];
|
||||
while(fp != null && !fp.fieldInfo.name.equals(fi.name))
|
||||
fp = fp.next;
|
||||
|
||||
if (fp == null) {
|
||||
|
||||
fp = new DocumentsWriterFieldData(this, fi);
|
||||
fp.next = fieldDataHash[hashPos];
|
||||
fieldDataHash[hashPos] = fp;
|
||||
|
||||
if (numAllFieldData == allFieldDataArray.length) {
|
||||
int newSize = (int) (allFieldDataArray.length*1.5);
|
||||
int newHashSize = fieldDataHash.length*2;
|
||||
|
||||
DocumentsWriterFieldData newArray[] = new DocumentsWriterFieldData[newSize];
|
||||
DocumentsWriterFieldData newHashArray[] = new DocumentsWriterFieldData[newHashSize];
|
||||
System.arraycopy(allFieldDataArray, 0, newArray, 0, numAllFieldData);
|
||||
|
||||
// Rehash
|
||||
fieldDataHashMask = newHashSize-1;   // mask must match the doubled hash table size
|
||||
for(int j=0;j<fieldDataHash.length;j++) {
|
||||
DocumentsWriterFieldData fp0 = fieldDataHash[j];
|
||||
while(fp0 != null) {
|
||||
hashPos = fp0.fieldInfo.name.hashCode() & fieldDataHashMask;
|
||||
DocumentsWriterFieldData nextFP0 = fp0.next;
|
||||
fp0.next = newHashArray[hashPos];
|
||||
newHashArray[hashPos] = fp0;
|
||||
fp0 = nextFP0;
|
||||
}
|
||||
}
|
||||
|
||||
allFieldDataArray = newArray;
|
||||
fieldDataHash = newHashArray;
|
||||
}
|
||||
allFieldDataArray[numAllFieldData++] = fp;
|
||||
} else {
|
||||
assert fp.fieldInfo == fi;
|
||||
}
|
||||
|
||||
if (thisFieldGen != fp.lastGen) {
|
||||
|
||||
// First time we're seeing this field for this doc
|
||||
fp.lastGen = thisFieldGen;
|
||||
fp.fieldCount = 0;
|
||||
fp.doVectors = fp.doVectorPositions = fp.doVectorOffsets = false;
|
||||
fp.doNorms = fi.isIndexed && !fi.omitNorms;
|
||||
|
||||
if (numFieldData == fieldDataArray.length) {
|
||||
int newSize = fieldDataArray.length*2;
|
||||
DocumentsWriterFieldData newArray[] = new DocumentsWriterFieldData[newSize];
|
||||
System.arraycopy(fieldDataArray, 0, newArray, 0, numFieldData);
|
||||
fieldDataArray = newArray;
|
||||
|
||||
}
|
||||
fieldDataArray[numFieldData++] = fp;
|
||||
}
|
||||
|
||||
if (field.isTermVectorStored()) {
|
||||
if (!fp.doVectors && numVectorFields++ == vectorFieldPointers.length) {
|
||||
final int newSize = (int) (numVectorFields*1.5);
|
||||
vectorFieldPointers = new long[newSize];
|
||||
vectorFieldNumbers = new int[newSize];
|
||||
}
|
||||
fp.doVectors = true;
|
||||
docHasVectors = true;
|
||||
|
||||
fp.doVectorPositions |= field.isStorePositionWithTermVector();
|
||||
fp.doVectorOffsets |= field.isStoreOffsetWithTermVector();
|
||||
}
|
||||
|
||||
if (fp.fieldCount == fp.docFields.length) {
|
||||
Fieldable[] newArray = new Fieldable[fp.docFields.length*2];
|
||||
System.arraycopy(fp.docFields, 0, newArray, 0, fp.docFields.length);
|
||||
fp.docFields = newArray;
|
||||
}
|
||||
|
||||
// Lazily allocate arrays for postings:
|
||||
if (field.isIndexed() && fp.postingsHash == null)
|
||||
fp.initPostingArrays();
|
||||
|
||||
fp.docFields[fp.fieldCount++] = field;
|
||||
}
|
||||
|
||||
// Maybe init the local & global fieldsWriter
|
||||
if (localFieldsWriter == null) {
|
||||
if (docWriter.fieldsWriter == null) {
|
||||
assert docWriter.docStoreSegment == null;
|
||||
assert docWriter.segment != null;
|
||||
docWriter.docStoreSegment = docWriter.segment;
|
||||
// If we hit an exception while init'ing the
|
||||
// fieldsWriter, we must abort this segment
|
||||
// because those files will be in an unknown
|
||||
// state:
|
||||
try {
|
||||
docWriter.fieldsWriter = new FieldsWriter(docWriter.directory, docWriter.docStoreSegment, docWriter.fieldInfos);
|
||||
} catch (Throwable t) {
|
||||
throw new AbortException(t, docWriter);
|
||||
}
|
||||
docWriter.files = null;
|
||||
}
|
||||
localFieldsWriter = new FieldsWriter(null, fdtLocal, docWriter.fieldInfos);
|
||||
}
|
||||
|
||||
// First time we see a doc that has field(s) with
|
||||
// stored vectors, we init our tvx writer
|
||||
if (docHasVectors) {
|
||||
if (docWriter.tvx == null) {
|
||||
assert docWriter.docStoreSegment != null;
|
||||
// If we hit an exception while init'ing the term
|
||||
// vector output files, we must abort this segment
|
||||
// because those files will be in an unknown
|
||||
// state:
|
||||
try {
|
||||
docWriter.tvx = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
|
||||
docWriter.tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
|
||||
docWriter.tvd = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
|
||||
docWriter.tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
|
||||
docWriter.tvf = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
|
||||
docWriter.tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
|
||||
|
||||
// We must "catch up" for all docs before us
|
||||
// that had no vectors:
|
||||
for(int i=0;i<docWriter.numDocsInStore;i++) {
|
||||
docWriter.tvx.writeLong(docWriter.tvd.getFilePointer());
|
||||
docWriter.tvd.writeVInt(0);
|
||||
docWriter.tvx.writeLong(0);
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
throw new AbortException(t, docWriter);
|
||||
}
|
||||
docWriter.files = null;
|
||||
}
|
||||
numVectorFields = 0;
|
||||
}
|
||||
}
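// Fields are looked up in a chained hash table whose size is always a power
// of two, so a slot is just (hashCode & mask) rather than a modulo; when the
// table doubles, every entry is re-linked under the new mask (the "Rehash"
// loop above).  The slot computation in isolation (hypothetical helper):
private static int hashSlot(String fieldName, int tableSize) {
  assert (tableSize & (tableSize-1)) == 0;       // table size must be a power of two
  return fieldName.hashCode() & (tableSize-1);   // mask == tableSize-1
}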
|
||||
|
||||
/** Do in-place sort of Posting array */
|
||||
void doPostingSort(Posting[] postings, int numPosting) {
|
||||
quickSort(postings, 0, numPosting-1);
|
||||
}
|
||||
|
||||
void quickSort(Posting[] postings, int lo, int hi) {
|
||||
if (lo >= hi)
|
||||
return;
|
||||
else if (hi == 1+lo) {
|
||||
if (comparePostings(postings[lo], postings[hi]) > 0) {
|
||||
final Posting tmp = postings[lo];
|
||||
postings[lo] = postings[hi];
|
||||
postings[hi] = tmp;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int mid = (lo + hi) >>> 1;
|
||||
|
||||
if (comparePostings(postings[lo], postings[mid]) > 0) {
|
||||
Posting tmp = postings[lo];
|
||||
postings[lo] = postings[mid];
|
||||
postings[mid] = tmp;
|
||||
}
|
||||
|
||||
if (comparePostings(postings[mid], postings[hi]) > 0) {
|
||||
Posting tmp = postings[mid];
|
||||
postings[mid] = postings[hi];
|
||||
postings[hi] = tmp;
|
||||
|
||||
if (comparePostings(postings[lo], postings[mid]) > 0) {
|
||||
Posting tmp2 = postings[lo];
|
||||
postings[lo] = postings[mid];
|
||||
postings[mid] = tmp2;
|
||||
}
|
||||
}
|
||||
|
||||
int left = lo + 1;
|
||||
int right = hi - 1;
|
||||
|
||||
if (left >= right)
|
||||
return;
|
||||
|
||||
Posting partition = postings[mid];
|
||||
|
||||
for (;;) {
|
||||
while (comparePostings(postings[right], partition) > 0)
|
||||
--right;
|
||||
|
||||
while (left < right && comparePostings(postings[left], partition) <= 0)
|
||||
++left;
|
||||
|
||||
if (left < right) {
|
||||
Posting tmp = postings[left];
|
||||
postings[left] = postings[right];
|
||||
postings[right] = tmp;
|
||||
--right;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
quickSort(postings, lo, left);
|
||||
quickSort(postings, left + 1, hi);
|
||||
}
|
||||
|
||||
/** Do in-place sort of PostingVector array */
|
||||
void doVectorSort(PostingVector[] postings, int numPosting) {
|
||||
quickSort(postings, 0, numPosting-1);
|
||||
}
|
||||
|
||||
void quickSort(PostingVector[] postings, int lo, int hi) {
|
||||
if (lo >= hi)
|
||||
return;
|
||||
else if (hi == 1+lo) {
|
||||
if (comparePostings(postings[lo].p, postings[hi].p) > 0) {
|
||||
final PostingVector tmp = postings[lo];
|
||||
postings[lo] = postings[hi];
|
||||
postings[hi] = tmp;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int mid = (lo + hi) >>> 1;
|
||||
|
||||
if (comparePostings(postings[lo].p, postings[mid].p) > 0) {
|
||||
PostingVector tmp = postings[lo];
|
||||
postings[lo] = postings[mid];
|
||||
postings[mid] = tmp;
|
||||
}
|
||||
|
||||
if (comparePostings(postings[mid].p, postings[hi].p) > 0) {
|
||||
PostingVector tmp = postings[mid];
|
||||
postings[mid] = postings[hi];
|
||||
postings[hi] = tmp;
|
||||
|
||||
if (comparePostings(postings[lo].p, postings[mid].p) > 0) {
|
||||
PostingVector tmp2 = postings[lo];
|
||||
postings[lo] = postings[mid];
|
||||
postings[mid] = tmp2;
|
||||
}
|
||||
}
|
||||
|
||||
int left = lo + 1;
|
||||
int right = hi - 1;
|
||||
|
||||
if (left >= right)
|
||||
return;
|
||||
|
||||
PostingVector partition = postings[mid];
|
||||
|
||||
for (;;) {
|
||||
while (comparePostings(postings[right].p, partition.p) > 0)
|
||||
--right;
|
||||
|
||||
while (left < right && comparePostings(postings[left].p, partition.p) <= 0)
|
||||
++left;
|
||||
|
||||
if (left < right) {
|
||||
PostingVector tmp = postings[left];
|
||||
postings[left] = postings[right];
|
||||
postings[right] = tmp;
|
||||
--right;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
quickSort(postings, lo, left);
|
||||
quickSort(postings, left + 1, hi);
|
||||
}
|
||||
|
||||
void quickSort(DocumentsWriterFieldData[] array, int lo, int hi) {
|
||||
if (lo >= hi)
|
||||
return;
|
||||
else if (hi == 1+lo) {
|
||||
if (array[lo].compareTo(array[hi]) > 0) {
|
||||
final DocumentsWriterFieldData tmp = array[lo];
|
||||
array[lo] = array[hi];
|
||||
array[hi] = tmp;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int mid = (lo + hi) >>> 1;
|
||||
|
||||
if (array[lo].compareTo(array[mid]) > 0) {
|
||||
DocumentsWriterFieldData tmp = array[lo];
|
||||
array[lo] = array[mid];
|
||||
array[mid] = tmp;
|
||||
}
|
||||
|
||||
if (array[mid].compareTo(array[hi]) > 0) {
|
||||
DocumentsWriterFieldData tmp = array[mid];
|
||||
array[mid] = array[hi];
|
||||
array[hi] = tmp;
|
||||
|
||||
if (array[lo].compareTo(array[mid]) > 0) {
|
||||
DocumentsWriterFieldData tmp2 = array[lo];
|
||||
array[lo] = array[mid];
|
||||
array[mid] = tmp2;
|
||||
}
|
||||
}
|
||||
|
||||
int left = lo + 1;
|
||||
int right = hi - 1;
|
||||
|
||||
if (left >= right)
|
||||
return;
|
||||
|
||||
DocumentsWriterFieldData partition = array[mid];
|
||||
|
||||
for (;;) {
|
||||
while (array[right].compareTo(partition) > 0)
|
||||
--right;
|
||||
|
||||
while (left < right && array[left].compareTo(partition) <= 0)
|
||||
++left;
|
||||
|
||||
if (left < right) {
|
||||
DocumentsWriterFieldData tmp = array[left];
|
||||
array[left] = array[right];
|
||||
array[right] = tmp;
|
||||
--right;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
quickSort(array, lo, left);
|
||||
quickSort(array, left + 1, hi);
|
||||
}
|
||||
|
||||
/** If there are fields we've seen but did not see again
|
||||
* in the last run, then free them up. Also reduce
|
||||
* postings hash size. */
|
||||
void trimFields() {
|
||||
|
||||
int upto = 0;
|
||||
for(int i=0;i<numAllFieldData;i++) {
|
||||
DocumentsWriterFieldData fp = allFieldDataArray[i];
|
||||
if (fp.lastGen == -1) {
|
||||
// This field was not seen since the previous
|
||||
// flush, so, free up its resources now
|
||||
|
||||
// Unhash
|
||||
final int hashPos = fp.fieldInfo.name.hashCode() & fieldDataHashMask;
|
||||
DocumentsWriterFieldData last = null;
|
||||
DocumentsWriterFieldData fp0 = fieldDataHash[hashPos];
|
||||
while(fp0 != fp) {
|
||||
last = fp0;
|
||||
fp0 = fp0.next;
|
||||
}
|
||||
assert fp0 != null;
|
||||
|
||||
if (last == null)
|
||||
fieldDataHash[hashPos] = fp.next;
|
||||
else
|
||||
last.next = fp.next;
|
||||
|
||||
if (docWriter.infoStream != null)
|
||||
docWriter.infoStream.println(" remove field=" + fp.fieldInfo.name);
|
||||
|
||||
} else {
|
||||
// Reset
|
||||
fp.lastGen = -1;
|
||||
allFieldDataArray[upto++] = fp;
|
||||
|
||||
if (fp.numPostings > 0 && ((float) fp.numPostings) / fp.postingsHashSize < 0.2) {
|
||||
int hashSize = fp.postingsHashSize;
|
||||
|
||||
// Reduce hash so it's between 25-50% full
|
||||
while (fp.numPostings < (hashSize>>1) && hashSize >= 2)
|
||||
hashSize >>= 1;
|
||||
hashSize <<= 1;
|
||||
|
||||
if (hashSize != fp.postingsHash.length)
|
||||
fp.rehashPostings(hashSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we didn't see any norms for this field since
|
||||
// last flush, free it
|
||||
for(int i=0;i<docWriter.norms.length;i++) {
|
||||
BufferedNorms n = docWriter.norms[i];
|
||||
if (n != null && n.upto == 0)
|
||||
docWriter.norms[i] = null;
|
||||
}
|
||||
|
||||
numAllFieldData = upto;
|
||||
|
||||
// Also pare back PostingsVectors if it's excessively
|
||||
// large
|
||||
if (maxPostingsVectors * 1.5 < postingsVectors.length) {
|
||||
final int newSize;
|
||||
if (0 == maxPostingsVectors)
|
||||
newSize = 1;
|
||||
else
|
||||
newSize = (int) (1.5*maxPostingsVectors);
|
||||
PostingVector[] newArray = new PostingVector[newSize];
|
||||
System.arraycopy(postingsVectors, 0, newArray, 0, newSize);
|
||||
postingsVectors = newArray;
|
||||
}
|
||||
}
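// The shrink step above picks the smallest power-of-two table that keeps the
// postings hash between 25% and 50% full, so a field that saw a burst of
// unique terms does not pin a huge table forever.  The same computation as a
// stand-alone helper (name hypothetical); e.g. 3 live postings in a table of
// 64 come back as 8:
private static int shrunkenPostingsHashSize(int numPostings, int hashSize) {
  while (numPostings < (hashSize>>1) && hashSize >= 2)
    hashSize >>= 1;
  return hashSize << 1;
}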
|
||||
|
||||
/** Tokenizes the fields of a document into Postings */
|
||||
void processDocument(Analyzer analyzer)
|
||||
throws IOException, AbortException {
|
||||
|
||||
final int numFields = numFieldData;
|
||||
assert clearLastVectorFieldName();
|
||||
|
||||
assert 0 == fdtLocal.length();
|
||||
|
||||
if (docWriter.tvx != null)
|
||||
// If we are writing vectors then we must visit
|
||||
// fields in sorted order so they are written in
|
||||
// sorted order. TODO: we actually only need to
|
||||
// sort the subset of fields that have vectors
|
||||
// enabled; we could save [small amount of] CPU
|
||||
// here.
|
||||
quickSort(fieldDataArray, 0, numFields-1);
|
||||
|
||||
// We process the document one field at a time
|
||||
for(int i=0;i<numFields;i++)
|
||||
fieldDataArray[i].processField(analyzer);
|
||||
|
||||
if (docWriter.infoStream != null && maxTermPrefix != null)
|
||||
docWriter.infoStream.println("WARNING: document contains at least one immense term (longer than the max length " + DocumentsWriter.MAX_TERM_LENGTH + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + maxTermPrefix + "...'");
|
||||
}
|
||||
|
||||
// USE ONLY FOR DEBUGGING!
|
||||
/*
|
||||
public String getPostingText(Posting p) {
|
||||
char[] text = charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
|
||||
int upto = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
|
||||
while(text[upto] != 0xffff)
|
||||
upto++;
|
||||
return new String(text, p.textStart & DocumentsWriter.CHAR_BLOCK_MASK, upto-(p.textStart & DocumentsWriter.CHAR_BLOCK_MASK));
|
||||
}
|
||||
*/
|
||||
|
||||
/** Compares term text for two Posting instances and
|
||||
* returns -1 if p1 < p2; 1 if p1 > p2; else 0.
|
||||
*/
|
||||
int comparePostings(Posting p1, Posting p2) {
|
||||
if (p1 == p2)
|
||||
return 0;
|
||||
final char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
|
||||
int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
|
||||
final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
|
||||
int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
|
||||
while(true) {
|
||||
final char c1 = text1[pos1++];
|
||||
final char c2 = text2[pos2++];
|
||||
if (c1 < c2)
|
||||
if (0xffff == c2)
|
||||
return 1;
|
||||
else
|
||||
return -1;
|
||||
else if (c2 < c1)
|
||||
if (0xffff == c1)
|
||||
return -1;
|
||||
else
|
||||
return 1;
|
||||
else if (0xffff == c1)
|
||||
return 0;
|
||||
}
|
||||
}
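// Term text lives in shared char[] blocks and is terminated by 0xffff, which
// the writer uses as an end-of-term marker, so the comparison needs no
// explicit lengths.  The special cases make the term that hits the sentinel
// first sort lower, i.e. "ab" < "abc".  The same comparison over plain
// sentinel-terminated arrays (hypothetical helper):
private static int compareSentinelTerminated(char[] t1, int pos1, char[] t2, int pos2) {
  while(true) {
    final char c1 = t1[pos1++];
    final char c2 = t2[pos2++];
    if (c1 != c2) {
      if (0xffff == c2)
        return 1;                          // t2 ended first: t1 is the longer/larger term
      if (0xffff == c1)
        return -1;                         // t1 ended first: t1 is the shorter/smaller term
      return c1 < c2 ? -1 : 1;
    } else if (0xffff == c1)
      return 0;                            // both ended: equal terms
  }
}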
|
||||
|
||||
String lastVectorFieldName;
|
||||
|
||||
// Called only by assert
|
||||
final boolean clearLastVectorFieldName() {
|
||||
lastVectorFieldName = null;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Called only by assert
|
||||
final boolean vectorFieldsInOrder(FieldInfo fi) {
|
||||
try {
|
||||
if (lastVectorFieldName != null)
|
||||
return lastVectorFieldName.compareTo(fi.name) < 0;
|
||||
else
|
||||
return true;
|
||||
} finally {
|
||||
lastVectorFieldName = fi.name;
|
||||
}
|
||||
}
|
||||
|
||||
PostingVector[] postingsVectors = new PostingVector[1];
|
||||
int maxPostingsVectors;
|
||||
|
||||
// Used to read a string value for a field
|
||||
ReusableStringReader stringReader = new ReusableStringReader();
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Used by DocumentsWriter to track postings for a single
|
||||
* term. One of these exists per unique term seen since the
|
||||
* last flush. */
|
||||
final class Posting {
|
||||
int textStart; // Address into char[] blocks where our text is stored
|
||||
int docFreq; // # times this term occurs in the current doc
|
||||
int freqStart; // Address of first byte[] slice for freq
|
||||
int freqUpto; // Next write address for freq
|
||||
int proxStart; // Address of first byte[] slice
|
||||
int proxUpto; // Next write address for prox
|
||||
int lastDocID; // Last docID where this term occurred
|
||||
int lastDocCode; // Code for prior doc
|
||||
int lastPosition; // Last position where this term occurred
|
||||
PostingVector vector; // Corresponding PostingVector instance
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Used by DocumentsWriter to track data for term vectors.
|
||||
* One of these exists per unique term seen in each field in
|
||||
* the document. */
|
||||
class PostingVector {
|
||||
Posting p; // Corresponding Posting instance for this term
|
||||
int lastOffset; // Last offset we saw
|
||||
int offsetStart; // Address of first slice for offsets
|
||||
int offsetUpto; // Next write address for offsets
|
||||
int posStart; // Address of first slice for positions
|
||||
int posUpto; // Next write address for positions
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/** Used by DocumentsWriter to implement a StringReader
|
||||
* that can be reset to a new string; we use this when
|
||||
* tokenizing the string value from a Field. */
|
||||
final class ReusableStringReader extends Reader {
|
||||
int upto;
|
||||
int left;
|
||||
String s;
|
||||
void init(String s) {
|
||||
this.s = s;
|
||||
left = s.length();
|
||||
this.upto = 0;
|
||||
}
|
||||
public int read(char[] c) {
|
||||
return read(c, 0, c.length);
|
||||
}
|
||||
public int read(char[] c, int off, int len) {
|
||||
if (left > len) {
|
||||
s.getChars(upto, upto+len, c, off);
|
||||
upto += len;
|
||||
left -= len;
|
||||
return len;
|
||||
} else if (0 == left) {
|
||||
return -1;
|
||||
} else {
|
||||
s.getChars(upto, upto+left, c, off);
|
||||
int r = left;
|
||||
left = 0;
|
||||
upto = s.length();
|
||||
return r;
|
||||
}
|
||||
}
|
||||
public void close() {}
|
||||
}
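// Typical use, sketched: DocumentsWriterThreadState keeps a single instance
// (its stringReader field) and re-initializes it for each String-valued
// field rather than allocating a new java.io.StringReader per field.  The
// variable names below are illustrative only:
//
//   ReusableStringReader reader = new ReusableStringReader();
//   char[] buf = new char[64];
//   reader.init("some field value");
//   int n;
//   while ((n = reader.read(buf, 0, buf.length)) != -1) {
//     // feed buf[0..n) to the analyzer's tokenizer
//   }
//   reader.init("next field value");     // reset in place, no new allocation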
|
||||
|