mirror of https://github.com/apache/lucene.git
LUCENE-866: Adds multi-level skip lists to the posting lists.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@543076 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f86b74623d
commit
87ba4a1258
|
@ -210,6 +210,12 @@ Optimizations
|
|||
these increases yield 10-18% overall performance gain vs the
|
||||
previous 1K defaults. (Mike McCandless)
|
||||
|
||||
7. LUCENE-866: Adds multi-level skip lists to the posting lists. This speeds
|
||||
up most queries that use skipTo(), especially on big indexes with large posting
|
||||
lists. For average AND queries the speedup is about 20%, for queries that
|
||||
contain very frequence and very unique terms the speedup can be over 80%.
|
||||
(Michael Busch)
|
||||
|
||||
Documentation
|
||||
|
||||
1. LUCENE 791 && INFRA-1173: Infrastructure moved the Wiki to
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
||||
/**
|
||||
* Implements the skip list reader for the default posting list format
|
||||
* that stores positions and payloads.
|
||||
*
|
||||
*/
|
||||
public class DefaultSkipListReader extends MultiLevelSkipListReader {
|
||||
private boolean currentFieldStoresPayloads;
|
||||
private long freqPointer[];
|
||||
private long proxPointer[];
|
||||
private int payloadLength[];
|
||||
|
||||
private long lastFreqPointer;
|
||||
private long lastProxPointer;
|
||||
private int lastPayloadLength;
|
||||
|
||||
|
||||
DefaultSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
|
||||
super(skipStream, maxSkipLevels, skipInterval);
|
||||
freqPointer = new long[maxSkipLevels];
|
||||
proxPointer = new long[maxSkipLevels];
|
||||
payloadLength = new int[maxSkipLevels];
|
||||
}
|
||||
|
||||
void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) {
|
||||
super.init(skipPointer, df);
|
||||
this.currentFieldStoresPayloads = storesPayloads;
|
||||
lastFreqPointer = freqBasePointer;
|
||||
lastProxPointer = proxBasePointer;
|
||||
|
||||
Arrays.fill(freqPointer, freqBasePointer);
|
||||
Arrays.fill(proxPointer, proxBasePointer);
|
||||
Arrays.fill(payloadLength, 0);
|
||||
}
|
||||
|
||||
/** Returns the freq pointer of the doc to which the last call of
|
||||
* {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
|
||||
long getFreqPointer() {
|
||||
return lastFreqPointer;
|
||||
}
|
||||
|
||||
/** Returns the prox pointer of the doc to which the last call of
|
||||
* {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
|
||||
long getProxPointer() {
|
||||
return lastProxPointer;
|
||||
}
|
||||
|
||||
/** Returns the payload length of the payload stored just before
|
||||
* the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)}
|
||||
* has skipped. */
|
||||
int getPayloadLength() {
|
||||
return lastPayloadLength;
|
||||
}
|
||||
|
||||
protected void seekChild(int level) throws IOException {
|
||||
super.seekChild(level);
|
||||
freqPointer[level] = lastFreqPointer;
|
||||
proxPointer[level] = lastProxPointer;
|
||||
payloadLength[level] = lastPayloadLength;
|
||||
}
|
||||
|
||||
protected void setLastSkipData(int level) {
|
||||
super.setLastSkipData(level);
|
||||
lastFreqPointer = freqPointer[level];
|
||||
lastProxPointer = proxPointer[level];
|
||||
lastPayloadLength = payloadLength[level];
|
||||
}
|
||||
|
||||
|
||||
protected int readSkipData(int level, IndexInput skipStream) throws IOException {
|
||||
int delta;
|
||||
if (currentFieldStoresPayloads) {
|
||||
// the current field stores payloads.
|
||||
// if the doc delta is odd then we have
|
||||
// to read the current payload length
|
||||
// because it differs from the length of the
|
||||
// previous payload
|
||||
delta = skipStream.readVInt();
|
||||
if ((delta & 1) != 0) {
|
||||
payloadLength[level] = skipStream.readVInt();
|
||||
}
|
||||
delta >>>= 1;
|
||||
} else {
|
||||
delta = skipStream.readVInt();
|
||||
}
|
||||
freqPointer[level] += skipStream.readVInt();
|
||||
proxPointer[level] += skipStream.readVInt();
|
||||
|
||||
return delta;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,124 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
|
||||
|
||||
/**
|
||||
* Implements the skip list writer for the default posting list format
|
||||
* that stores positions and payloads.
|
||||
*
|
||||
*/
|
||||
class DefaultSkipListWriter extends MultiLevelSkipListWriter {
|
||||
private int[] lastSkipDoc;
|
||||
private int[] lastSkipPayloadLength;
|
||||
private long[] lastSkipFreqPointer;
|
||||
private long[] lastSkipProxPointer;
|
||||
|
||||
private IndexOutput freqOutput;
|
||||
private IndexOutput proxOutput;
|
||||
|
||||
private int curDoc;
|
||||
private boolean curStorePayloads;
|
||||
private int curPayloadLength;
|
||||
private long curFreqPointer;
|
||||
private long curProxPointer;
|
||||
|
||||
DefaultSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, IndexOutput freqOutput, IndexOutput proxOutput) {
|
||||
super(skipInterval, numberOfSkipLevels, docCount);
|
||||
this.freqOutput = freqOutput;
|
||||
this.proxOutput = proxOutput;
|
||||
|
||||
lastSkipDoc = new int[numberOfSkipLevels];
|
||||
lastSkipPayloadLength = new int[numberOfSkipLevels];
|
||||
lastSkipFreqPointer = new long[numberOfSkipLevels];
|
||||
lastSkipProxPointer = new long[numberOfSkipLevels];
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the values for the current skip data.
|
||||
*/
|
||||
void setSkipData(int doc, boolean storePayloads, int payloadLength) {
|
||||
this.curDoc = doc;
|
||||
this.curStorePayloads = storePayloads;
|
||||
this.curPayloadLength = payloadLength;
|
||||
this.curFreqPointer = freqOutput.getFilePointer();
|
||||
this.curProxPointer = proxOutput.getFilePointer();
|
||||
}
|
||||
|
||||
protected void resetSkip() {
|
||||
super.resetSkip();
|
||||
Arrays.fill(lastSkipDoc, 0);
|
||||
Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list
|
||||
Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer());
|
||||
Arrays.fill(lastSkipProxPointer, proxOutput.getFilePointer());
|
||||
}
|
||||
|
||||
protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
|
||||
// To efficiently store payloads in the posting lists we do not store the length of
|
||||
// every payload. Instead we omit the length for a payload if the previous payload had
|
||||
// the same length.
|
||||
// However, in order to support skipping the payload length at every skip point must be known.
|
||||
// So we use the same length encoding that we use for the posting lists for the skip data as well:
|
||||
// Case 1: current field does not store payloads
|
||||
// SkipDatum --> DocSkip, FreqSkip, ProxSkip
|
||||
// DocSkip,FreqSkip,ProxSkip --> VInt
|
||||
// DocSkip records the document number before every SkipInterval th document in TermFreqs.
|
||||
// Document numbers are represented as differences from the previous value in the sequence.
|
||||
// Case 2: current field stores payloads
|
||||
// SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip
|
||||
// DocSkip,FreqSkip,ProxSkip --> VInt
|
||||
// PayloadLength --> VInt
|
||||
// In this case DocSkip/2 is the difference between
|
||||
// the current and the previous value. If DocSkip
|
||||
// is odd, then a PayloadLength encoded as VInt follows,
|
||||
// if DocSkip is even, then it is assumed that the
|
||||
// current payload length equals the length at the previous
|
||||
// skip point
|
||||
if (curStorePayloads) {
|
||||
int delta = curDoc - lastSkipDoc[level];
|
||||
if (curPayloadLength == lastSkipPayloadLength[level]) {
|
||||
// the current payload length equals the length at the previous skip point,
|
||||
// so we don't store the length again
|
||||
skipBuffer.writeVInt(delta * 2);
|
||||
} else {
|
||||
// the payload length is different from the previous one. We shift the DocSkip,
|
||||
// set the lowest bit and store the current payload length as VInt.
|
||||
skipBuffer.writeVInt(delta * 2 + 1);
|
||||
skipBuffer.writeVInt(curPayloadLength);
|
||||
lastSkipPayloadLength[level] = curPayloadLength;
|
||||
}
|
||||
} else {
|
||||
// current field does not store payloads
|
||||
skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
|
||||
}
|
||||
skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));
|
||||
skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level]));
|
||||
|
||||
lastSkipDoc[level] = curDoc;
|
||||
//System.out.println("write doc at level " + level + ": " + curDoc);
|
||||
|
||||
lastSkipFreqPointer[level] = curFreqPointer;
|
||||
lastSkipProxPointer[level] = curProxPointer;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,271 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.store.BufferedIndexInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
||||
/**
|
||||
* This abstract class reads skip lists with multiple levels.
|
||||
*
|
||||
* See {@link MultiLevelSkipListWriter} for the information about the encoding
|
||||
* of the multi level skip lists.
|
||||
*
|
||||
* Subclasses must implement the abstract method {@link #readSkipData(int, IndexInput)}
|
||||
* which defines the actual format of the skip data.
|
||||
*/
|
||||
abstract class MultiLevelSkipListReader {
|
||||
// the maximum number of skip levels possible for this index
|
||||
private int maxNumberOfSkipLevels;
|
||||
|
||||
// number of levels in this skip list
|
||||
private int numberOfSkipLevels;
|
||||
|
||||
// Expert: defines the number of top skip levels to buffer in memory.
|
||||
// Reducing this number results in less memory usage, but possibly
|
||||
// slower performance due to more random I/Os.
|
||||
// Please notice that the space each level occupies is limited by
|
||||
// the skipInterval. The top level can not contain more than
|
||||
// skipLevel entries, the second top level can not contain more
|
||||
// than skipLevel^2 entries and so forth.
|
||||
private int numberOfLevelsToBuffer = 1;
|
||||
|
||||
private int docCount;
|
||||
private boolean haveSkipped;
|
||||
|
||||
private IndexInput[] skipStream; // skipStream for each level
|
||||
private long skipPointer[]; // the start pointer of each skip level
|
||||
private int skipInterval[]; // skipInterval of each level
|
||||
private int[] numSkipped; // number of docs skipped per level
|
||||
|
||||
private int[] skipDoc; // doc id of current skip entry per level
|
||||
private int lastDoc; // doc id of last read skip entry with docId <= target
|
||||
private long[] childPointer; // child pointer of current skip entry per level
|
||||
private long lastChildPointer; // childPointer of last read skip entry with docId <= target
|
||||
|
||||
private boolean inputIsBuffered;
|
||||
|
||||
public MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
|
||||
this.skipStream = new IndexInput[maxSkipLevels];
|
||||
this.skipPointer = new long[maxSkipLevels];
|
||||
this.childPointer = new long[maxSkipLevels];
|
||||
this.numSkipped = new int[maxSkipLevels];
|
||||
this.maxNumberOfSkipLevels = maxSkipLevels;
|
||||
this.skipInterval = new int[maxSkipLevels];
|
||||
this.skipStream [0]= skipStream;
|
||||
this.inputIsBuffered = (skipStream instanceof BufferedIndexInput);
|
||||
this.skipInterval[0] = skipInterval;
|
||||
for (int i = 1; i < maxSkipLevels; i++) {
|
||||
// cache skip intervals
|
||||
this.skipInterval[i] = this.skipInterval[i - 1] * skipInterval;
|
||||
}
|
||||
skipDoc = new int[maxSkipLevels];
|
||||
}
|
||||
|
||||
|
||||
/** Returns the id of the doc to which the last call of {@link #skipTo(int)}
|
||||
* has skipped. */
|
||||
int getDoc() {
|
||||
return lastDoc;
|
||||
}
|
||||
|
||||
|
||||
/** Skips entries to the first beyond the current whose document number is
|
||||
* greater than or equal to <i>target</i>. Returns the current doc count.
|
||||
*/
|
||||
int skipTo(int target) throws IOException {
|
||||
if (!haveSkipped) {
|
||||
// first time, load skip levels
|
||||
loadSkipLevels();
|
||||
haveSkipped = true;
|
||||
}
|
||||
|
||||
// walk up the levels until highest level is found that has a skip
|
||||
// for this target
|
||||
int level = 0;
|
||||
while (level < numberOfSkipLevels - 1 && target > skipDoc[level + 1]) {
|
||||
level++;
|
||||
}
|
||||
|
||||
while (level >= 0) {
|
||||
if (target > skipDoc[level]) {
|
||||
if (!loadNextSkip(level)) {
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// no more skips on this level, go down one level
|
||||
if (level > 0 && lastChildPointer > skipStream[level - 1].getFilePointer()) {
|
||||
seekChild(level - 1);
|
||||
}
|
||||
level--;
|
||||
}
|
||||
}
|
||||
|
||||
return numSkipped[0] - skipInterval[0] - 1;
|
||||
}
|
||||
|
||||
private boolean loadNextSkip(int level) throws IOException {
|
||||
// we have to skip, the target document is greater than the current
|
||||
// skip list entry
|
||||
setLastSkipData(level);
|
||||
|
||||
numSkipped[level] += skipInterval[level];
|
||||
|
||||
if (numSkipped[level] > docCount) {
|
||||
// this skip list is exhausted
|
||||
skipDoc[level] = Integer.MAX_VALUE;
|
||||
if (numberOfSkipLevels > level) numberOfSkipLevels = level;
|
||||
return false;
|
||||
}
|
||||
|
||||
// read next skip entry
|
||||
skipDoc[level] += readSkipData(level, skipStream[level]);
|
||||
|
||||
if (level != 0) {
|
||||
// read the child pointer if we are not on the leaf level
|
||||
childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1];
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
/** Seeks the skip entry on the given level */
|
||||
protected void seekChild(int level) throws IOException {
|
||||
skipStream[level].seek(lastChildPointer);
|
||||
numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1];
|
||||
skipDoc[level] = lastDoc;
|
||||
if (level > 0) {
|
||||
childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1];
|
||||
}
|
||||
}
|
||||
|
||||
void close() throws IOException {
|
||||
for (int i = 1; i < skipStream.length; i++) {
|
||||
if (skipStream[i] != null) {
|
||||
skipStream[i].close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** initializes the reader */
|
||||
void init(long skipPointer, int df) {
|
||||
this.skipPointer[0] = skipPointer;
|
||||
this.docCount = df;
|
||||
Arrays.fill(skipDoc, 0);
|
||||
Arrays.fill(numSkipped, 0);
|
||||
haveSkipped = false;
|
||||
for (int i = 1; i < numberOfSkipLevels; i++) {
|
||||
skipStream[0] = null;
|
||||
}
|
||||
}
|
||||
|
||||
/** Loads the skip levels */
|
||||
private void loadSkipLevels() throws IOException {
|
||||
numberOfSkipLevels = docCount == 0 ? 0 : (int) Math.floor(Math.log(docCount) / Math.log(skipInterval[0]));
|
||||
if (numberOfSkipLevels > maxNumberOfSkipLevels) {
|
||||
numberOfSkipLevels = maxNumberOfSkipLevels;
|
||||
}
|
||||
|
||||
skipStream[0].seek(skipPointer[0]);
|
||||
|
||||
int toBuffer = numberOfLevelsToBuffer;
|
||||
|
||||
for (int i = numberOfSkipLevels - 1; i > 0; i--) {
|
||||
// the length of the current level
|
||||
long length = skipStream[0].readVLong();
|
||||
|
||||
// the start pointer of the current level
|
||||
skipPointer[i] = skipStream[0].getFilePointer();
|
||||
if (toBuffer > 0) {
|
||||
// buffer this level
|
||||
skipStream[i] = new SkipBuffer(skipStream[0], (int) length);
|
||||
toBuffer--;
|
||||
} else {
|
||||
// clone this stream, it is already at the start of the current level
|
||||
skipStream[i] = (IndexInput) skipStream[0].clone();
|
||||
if (inputIsBuffered && length < BufferedIndexInput.BUFFER_SIZE) {
|
||||
((BufferedIndexInput) skipStream[i]).setBufferSize((int) length);
|
||||
}
|
||||
|
||||
// move base stream beyond the current level
|
||||
skipStream[0].seek(skipStream[0].getFilePointer() + length);
|
||||
}
|
||||
}
|
||||
|
||||
// use base stream for the lowest level
|
||||
skipPointer[0] = skipStream[0].getFilePointer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must implement the actual skip data encoding in this method.
|
||||
*
|
||||
* @param level the level skip data shall be read from
|
||||
* @param skipStream the skip stream to read from
|
||||
*/
|
||||
protected abstract int readSkipData(int level, IndexInput skipStream) throws IOException;
|
||||
|
||||
/** Copies the values of the last read skip entry on this level */
|
||||
protected void setLastSkipData(int level) {
|
||||
lastDoc = skipDoc[level];
|
||||
lastChildPointer = childPointer[level];
|
||||
}
|
||||
|
||||
|
||||
/** used to buffer the top skip levels */
|
||||
private final static class SkipBuffer extends IndexInput {
|
||||
private byte[] data;
|
||||
private long pointer;
|
||||
private int pos;
|
||||
|
||||
SkipBuffer(IndexInput input, int length) throws IOException {
|
||||
data = new byte[length];
|
||||
pointer = input.getFilePointer();
|
||||
input.readBytes(data, 0, length);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
data = null;
|
||||
}
|
||||
|
||||
public long getFilePointer() {
|
||||
return pointer + pos;
|
||||
}
|
||||
|
||||
public long length() {
|
||||
return data.length;
|
||||
}
|
||||
|
||||
public byte readByte() throws IOException {
|
||||
return data[pos++];
|
||||
}
|
||||
|
||||
public void readBytes(byte[] b, int offset, int len) throws IOException {
|
||||
System.arraycopy(data, pos, b, offset, len);
|
||||
pos += len;
|
||||
}
|
||||
|
||||
public void seek(long pos) throws IOException {
|
||||
this.pos = (int) (pos - pointer);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,151 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
|
||||
/**
|
||||
* This abstract class writes skip lists with multiple levels.
|
||||
*
|
||||
* Example for skipInterval = 3:
|
||||
* c (skip level 2)
|
||||
* c c c (skip level 1)
|
||||
* x x x x x x x x x x (skip level 0)
|
||||
* d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list)
|
||||
* 3 6 9 12 15 18 21 24 27 30 (df)
|
||||
*
|
||||
* d - document
|
||||
* x - skip data
|
||||
* c - skip data with child pointer
|
||||
*
|
||||
* Skip level i contains every skipInterval-th entry from skip level i-1.
|
||||
* Therefore the number of entries on level i is: floor(df / ((skipInterval ^ (i + 1))).
|
||||
*
|
||||
* Each skip entry on a level i>0 contains a pointer to the corresponding skip entry in list i-1.
|
||||
* This guarantess a logarithmic amount of skips to find the target document.
|
||||
*
|
||||
* While this class takes care of writing the different skip levels,
|
||||
* subclasses must define the actual format of the skip data.
|
||||
*
|
||||
*/
|
||||
abstract class MultiLevelSkipListWriter {
|
||||
// number of levels in this skip list
|
||||
private int numberOfSkipLevels;
|
||||
|
||||
// the skip interval in the list with level = 0
|
||||
private int skipInterval;
|
||||
|
||||
// for every skip level a different buffer is used
|
||||
private RAMOutputStream[] skipBuffer;
|
||||
|
||||
protected MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df) {
|
||||
this.skipInterval = skipInterval;
|
||||
|
||||
// calculate the maximum number of skip levels for this document frequency
|
||||
numberOfSkipLevels = df == 0 ? 0 : (int) Math.floor(Math.log(df) / Math.log(skipInterval));
|
||||
|
||||
// make sure it does not exceed maxSkipLevels
|
||||
if (numberOfSkipLevels > maxSkipLevels) {
|
||||
numberOfSkipLevels = maxSkipLevels;
|
||||
}
|
||||
}
|
||||
|
||||
protected void init() {
|
||||
skipBuffer = new RAMOutputStream[numberOfSkipLevels];
|
||||
for (int i = 0; i < numberOfSkipLevels; i++) {
|
||||
skipBuffer[i] = new RAMOutputStream();
|
||||
}
|
||||
}
|
||||
|
||||
protected void resetSkip() {
|
||||
// creates new buffers or empties the existing ones
|
||||
if (skipBuffer == null) {
|
||||
init();
|
||||
} else {
|
||||
for (int i = 0; i < skipBuffer.length; i++) {
|
||||
skipBuffer[i].reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must implement the actual skip data encoding in this method.
|
||||
*
|
||||
* @param level the level skip data shall be writting for
|
||||
* @param skipBuffer the skip buffer to write to
|
||||
*/
|
||||
protected abstract void writeSkipData(int level, IndexOutput skipBuffer) throws IOException;
|
||||
|
||||
/**
|
||||
* Writes the current skip data to the buffers. The current document frequency determines
|
||||
* the max level is skip data is to be written to.
|
||||
*
|
||||
* @param df the current document frequency
|
||||
* @throws IOException
|
||||
*/
|
||||
void bufferSkip(int df) throws IOException {
|
||||
int numLevels;
|
||||
|
||||
// determine max level
|
||||
for (numLevels = 0; (df % skipInterval) == 0 && numLevels < numberOfSkipLevels; df /= skipInterval) {
|
||||
numLevels++;
|
||||
}
|
||||
|
||||
long childPointer = 0;
|
||||
|
||||
for (int level = 0; level < numLevels; level++) {
|
||||
writeSkipData(level, skipBuffer[level]);
|
||||
|
||||
long newChildPointer = skipBuffer[level].getFilePointer();
|
||||
|
||||
if (level != 0) {
|
||||
// store child pointers for all levels except the lowest
|
||||
skipBuffer[level].writeVLong(childPointer);
|
||||
}
|
||||
|
||||
//remember the childPointer for the next level
|
||||
childPointer = newChildPointer;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the buffered skip lists to the given output.
|
||||
*
|
||||
* @param output the IndexOutput the skip lists shall be written to
|
||||
* @return the pointer the skip list starts
|
||||
*/
|
||||
long writeSkip(IndexOutput output) throws IOException {
|
||||
long skipPointer = output.getFilePointer();
|
||||
if (skipBuffer == null || skipBuffer.length == 0) return skipPointer;
|
||||
|
||||
for (int level = numberOfSkipLevels - 1; level > 0; level--) {
|
||||
long length = skipBuffer[level].getFilePointer();
|
||||
if (length > 0) {
|
||||
output.writeVLong(length);
|
||||
skipBuffer[level].writeTo(output);
|
||||
}
|
||||
}
|
||||
skipBuffer[0].writeTo(output);
|
||||
|
||||
return skipPointer;
|
||||
}
|
||||
|
||||
}
|
|
@ -26,7 +26,6 @@ import org.apache.lucene.document.FieldSelector;
|
|||
import org.apache.lucene.document.FieldSelectorResult;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
|
||||
/**
|
||||
* The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add},
|
||||
|
@ -51,6 +50,8 @@ final class SegmentMerger {
|
|||
private Vector readers = new Vector();
|
||||
private FieldInfos fieldInfos;
|
||||
|
||||
private int mergedDocs;
|
||||
|
||||
/** This ctor used only by test code.
|
||||
*
|
||||
* @param dir The Directory to merge the other segments into
|
||||
|
@ -93,14 +94,14 @@ final class SegmentMerger {
|
|||
final int merge() throws CorruptIndexException, IOException {
|
||||
int value;
|
||||
|
||||
value = mergeFields();
|
||||
mergedDocs = mergeFields();
|
||||
mergeTerms();
|
||||
mergeNorms();
|
||||
|
||||
if (fieldInfos.hasVectors())
|
||||
mergeVectors();
|
||||
|
||||
return value;
|
||||
return mergedDocs;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -241,7 +242,9 @@ final class SegmentMerger {
|
|||
private IndexOutput proxOutput = null;
|
||||
private TermInfosWriter termInfosWriter = null;
|
||||
private int skipInterval;
|
||||
private int maxSkipLevels;
|
||||
private SegmentMergeQueue queue = null;
|
||||
private DefaultSkipListWriter skipListWriter = null;
|
||||
|
||||
private final void mergeTerms() throws CorruptIndexException, IOException {
|
||||
try {
|
||||
|
@ -251,6 +254,8 @@ final class SegmentMerger {
|
|||
new TermInfosWriter(directory, segment, fieldInfos,
|
||||
termIndexInterval);
|
||||
skipInterval = termInfosWriter.skipInterval;
|
||||
maxSkipLevels = termInfosWriter.maxSkipLevels;
|
||||
skipListWriter = new DefaultSkipListWriter(skipInterval, maxSkipLevels, mergedDocs, freqOutput, proxOutput);
|
||||
queue = new SegmentMergeQueue(readers.size());
|
||||
|
||||
mergeTermInfos();
|
||||
|
@ -319,7 +324,7 @@ final class SegmentMerger {
|
|||
|
||||
int df = appendPostings(smis, n); // append posting data
|
||||
|
||||
long skipPointer = writeSkip();
|
||||
long skipPointer = skipListWriter.writeSkip(freqOutput);
|
||||
|
||||
if (df > 0) {
|
||||
// add an entry to the dictionary with pointers to prox and freq files
|
||||
|
@ -344,7 +349,7 @@ final class SegmentMerger {
|
|||
throws CorruptIndexException, IOException {
|
||||
int lastDoc = 0;
|
||||
int df = 0; // number of docs w/ term
|
||||
resetSkip();
|
||||
skipListWriter.resetSkip();
|
||||
boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
|
||||
int lastPayloadLength = -1; // ensures that we write the first length
|
||||
for (int i = 0; i < n; i++) {
|
||||
|
@ -366,7 +371,8 @@ final class SegmentMerger {
|
|||
df++;
|
||||
|
||||
if ((df % skipInterval) == 0) {
|
||||
bufferSkip(lastDoc, storePayloads, lastPayloadLength);
|
||||
skipListWriter.setSkipData(lastDoc, storePayloads, lastPayloadLength);
|
||||
skipListWriter.bufferSkip(df);
|
||||
}
|
||||
|
||||
int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
|
||||
|
@ -413,75 +419,6 @@ final class SegmentMerger {
|
|||
return df;
|
||||
}
|
||||
|
||||
private RAMOutputStream skipBuffer = new RAMOutputStream();
|
||||
private int lastSkipDoc;
|
||||
private int lastSkipPayloadLength;
|
||||
private long lastSkipFreqPointer;
|
||||
private long lastSkipProxPointer;
|
||||
|
||||
private void resetSkip() {
|
||||
skipBuffer.reset();
|
||||
lastSkipDoc = 0;
|
||||
lastSkipPayloadLength = -1; // we don't have to write the first length in the skip list
|
||||
lastSkipFreqPointer = freqOutput.getFilePointer();
|
||||
lastSkipProxPointer = proxOutput.getFilePointer();
|
||||
}
|
||||
|
||||
private void bufferSkip(int doc, boolean storePayloads, int payloadLength) throws IOException {
|
||||
long freqPointer = freqOutput.getFilePointer();
|
||||
long proxPointer = proxOutput.getFilePointer();
|
||||
|
||||
// To efficiently store payloads in the posting lists we do not store the length of
|
||||
// every payload. Instead we omit the length for a payload if the previous payload had
|
||||
// the same length.
|
||||
// However, in order to support skipping the payload length at every skip point must be known.
|
||||
// So we use the same length encoding that we use for the posting lists for the skip data as well:
|
||||
// Case 1: current field does not store payloads
|
||||
// SkipDatum --> DocSkip, FreqSkip, ProxSkip
|
||||
// DocSkip,FreqSkip,ProxSkip --> VInt
|
||||
// DocSkip records the document number before every SkipInterval th document in TermFreqs.
|
||||
// Document numbers are represented as differences from the previous value in the sequence.
|
||||
// Case 2: current field stores payloads
|
||||
// SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip
|
||||
// DocSkip,FreqSkip,ProxSkip --> VInt
|
||||
// PayloadLength --> VInt
|
||||
// In this case DocSkip/2 is the difference between
|
||||
// the current and the previous value. If DocSkip
|
||||
// is odd, then a PayloadLength encoded as VInt follows,
|
||||
// if DocSkip is even, then it is assumed that the
|
||||
// current payload length equals the length at the previous
|
||||
// skip point
|
||||
if (storePayloads) {
|
||||
int delta = doc - lastSkipDoc;
|
||||
if (payloadLength == lastSkipPayloadLength) {
|
||||
// the current payload length equals the length at the previous skip point,
|
||||
// so we don't store the length again
|
||||
skipBuffer.writeVInt(delta * 2);
|
||||
} else {
|
||||
// the payload length is different from the previous one. We shift the DocSkip,
|
||||
// set the lowest bit and store the current payload length as VInt.
|
||||
skipBuffer.writeVInt(delta * 2 + 1);
|
||||
skipBuffer.writeVInt(payloadLength);
|
||||
lastSkipPayloadLength = payloadLength;
|
||||
}
|
||||
} else {
|
||||
// current field does not store payloads
|
||||
skipBuffer.writeVInt(doc - lastSkipDoc);
|
||||
}
|
||||
skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
|
||||
skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
|
||||
|
||||
lastSkipDoc = doc;
|
||||
lastSkipFreqPointer = freqPointer;
|
||||
lastSkipProxPointer = proxPointer;
|
||||
}
|
||||
|
||||
private long writeSkip() throws IOException {
|
||||
long skipPointer = freqOutput.getFilePointer();
|
||||
skipBuffer.writeTo(freqOutput);
|
||||
return skipPointer;
|
||||
}
|
||||
|
||||
private void mergeNorms() throws IOException {
|
||||
byte[] normBuffer = null;
|
||||
IndexOutput output = null;
|
||||
|
|
|
@ -31,16 +31,15 @@ class SegmentTermDocs implements TermDocs {
|
|||
int freq;
|
||||
|
||||
private int skipInterval;
|
||||
private int numSkips;
|
||||
private int skipCount;
|
||||
private IndexInput skipStream;
|
||||
private int skipDoc;
|
||||
private long freqPointer;
|
||||
private long proxPointer;
|
||||
private int maxSkipLevels;
|
||||
private DefaultSkipListReader skipListReader;
|
||||
|
||||
private long freqBasePointer;
|
||||
private long proxBasePointer;
|
||||
|
||||
private long skipPointer;
|
||||
private boolean haveSkipped;
|
||||
|
||||
private int payloadLengthAtLastSkip;
|
||||
protected boolean currentFieldStoresPayloads;
|
||||
|
||||
protected SegmentTermDocs(SegmentReader parent) {
|
||||
|
@ -48,6 +47,7 @@ class SegmentTermDocs implements TermDocs {
|
|||
this.freqStream = (IndexInput) parent.freqStream.clone();
|
||||
this.deletedDocs = parent.deletedDocs;
|
||||
this.skipInterval = parent.tis.getSkipInterval();
|
||||
this.maxSkipLevels = parent.tis.getMaxSkipLevels();
|
||||
}
|
||||
|
||||
public void seek(Term term) throws IOException {
|
||||
|
@ -74,7 +74,6 @@ class SegmentTermDocs implements TermDocs {
|
|||
|
||||
void seek(TermInfo ti, Term term) throws IOException {
|
||||
count = 0;
|
||||
payloadLengthAtLastSkip = 0;
|
||||
FieldInfo fi = parent.fieldInfos.fieldInfo(term.field);
|
||||
currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false;
|
||||
if (ti == null) {
|
||||
|
@ -82,21 +81,18 @@ class SegmentTermDocs implements TermDocs {
|
|||
} else {
|
||||
df = ti.docFreq;
|
||||
doc = 0;
|
||||
skipDoc = 0;
|
||||
skipCount = 0;
|
||||
numSkips = df / skipInterval;
|
||||
freqPointer = ti.freqPointer;
|
||||
proxPointer = ti.proxPointer;
|
||||
skipPointer = freqPointer + ti.skipOffset;
|
||||
freqStream.seek(freqPointer);
|
||||
freqBasePointer = ti.freqPointer;
|
||||
proxBasePointer = ti.proxPointer;
|
||||
skipPointer = freqBasePointer + ti.skipOffset;
|
||||
freqStream.seek(freqBasePointer);
|
||||
haveSkipped = false;
|
||||
}
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
freqStream.close();
|
||||
if (skipStream != null)
|
||||
skipStream.close();
|
||||
if (skipListReader != null)
|
||||
skipListReader.close();
|
||||
}
|
||||
|
||||
public final int doc() { return doc; }
|
||||
|
@ -157,64 +153,22 @@ class SegmentTermDocs implements TermDocs {
|
|||
/** Optimized implementation. */
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
if (df >= skipInterval) { // optimized case
|
||||
if (skipListReader == null)
|
||||
skipListReader = new DefaultSkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone
|
||||
|
||||
if (skipStream == null)
|
||||
skipStream = (IndexInput) freqStream.clone(); // lazily clone
|
||||
|
||||
if (!haveSkipped) { // lazily seek skip stream
|
||||
skipStream.seek(skipPointer);
|
||||
if (!haveSkipped) { // lazily initialize skip stream
|
||||
skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads);
|
||||
haveSkipped = true;
|
||||
}
|
||||
|
||||
// scan skip data
|
||||
int lastSkipDoc = skipDoc;
|
||||
int lastPayloadLength = 0;
|
||||
long lastFreqPointer = freqStream.getFilePointer();
|
||||
long lastProxPointer = -1;
|
||||
int numSkipped = -1 - (count % skipInterval);
|
||||
int newCount = skipListReader.skipTo(target);
|
||||
if (newCount > count) {
|
||||
freqStream.seek(skipListReader.getFreqPointer());
|
||||
skipProx(skipListReader.getProxPointer(), skipListReader.getPayloadLength());
|
||||
|
||||
while (target > skipDoc) {
|
||||
lastSkipDoc = skipDoc;
|
||||
lastFreqPointer = freqPointer;
|
||||
lastProxPointer = proxPointer;
|
||||
lastPayloadLength = payloadLengthAtLastSkip;
|
||||
|
||||
if (skipDoc != 0 && skipDoc >= doc)
|
||||
numSkipped += skipInterval;
|
||||
|
||||
if(skipCount >= numSkips)
|
||||
break;
|
||||
|
||||
if (currentFieldStoresPayloads) {
|
||||
// the current field stores payloads.
|
||||
// if the doc delta is odd then we have
|
||||
// to read the current payload length
|
||||
// because it differs from the length of the
|
||||
// previous payload
|
||||
int delta = skipStream.readVInt();
|
||||
if ((delta & 1) != 0) {
|
||||
payloadLengthAtLastSkip = skipStream.readVInt();
|
||||
doc = skipListReader.getDoc();
|
||||
count = newCount;
|
||||
}
|
||||
delta >>>= 1;
|
||||
skipDoc += delta;
|
||||
} else {
|
||||
skipDoc += skipStream.readVInt();
|
||||
}
|
||||
freqPointer += skipStream.readVInt();
|
||||
proxPointer += skipStream.readVInt();
|
||||
|
||||
skipCount++;
|
||||
}
|
||||
|
||||
// if we found something to skip, then skip it
|
||||
if (lastFreqPointer > freqStream.getFilePointer()) {
|
||||
freqStream.seek(lastFreqPointer);
|
||||
skipProx(lastProxPointer, lastPayloadLength);
|
||||
|
||||
doc = lastSkipDoc;
|
||||
count += numSkipped;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// done skipping, now just scan
|
||||
|
@ -224,5 +178,4 @@ class SegmentTermDocs implements TermDocs {
|
|||
} while (target > doc);
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -37,6 +37,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
|
|||
long indexPointer = 0;
|
||||
int indexInterval;
|
||||
int skipInterval;
|
||||
int maxSkipLevels;
|
||||
private int formatM1SkipInterval;
|
||||
|
||||
SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi)
|
||||
|
@ -44,6 +45,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
|
|||
input = i;
|
||||
fieldInfos = fis;
|
||||
isIndex = isi;
|
||||
maxSkipLevels = 1; // use single-level skip lists for formats > -3
|
||||
|
||||
int firstInt = input.readInt();
|
||||
if (firstInt >= 0) {
|
||||
|
@ -54,7 +56,6 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
|
|||
// back-compatible settings
|
||||
indexInterval = 128;
|
||||
skipInterval = Integer.MAX_VALUE; // switch off skipTo optimization
|
||||
|
||||
} else {
|
||||
// we have a format version number
|
||||
format = firstInt;
|
||||
|
@ -73,10 +74,13 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
|
|||
// switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in
|
||||
// skipTo implementation of these versions
|
||||
skipInterval = Integer.MAX_VALUE;
|
||||
}
|
||||
else{
|
||||
} else {
|
||||
indexInterval = input.readInt();
|
||||
skipInterval = input.readInt();
|
||||
if (format == -3) {
|
||||
// this new format introduces multi-level skipping
|
||||
maxSkipLevels = input.readInt();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -65,6 +65,10 @@ final class TermInfosReader {
|
|||
return origEnum.skipInterval;
|
||||
}
|
||||
|
||||
public int getMaxSkipLevels() {
|
||||
return origEnum.maxSkipLevels;
|
||||
}
|
||||
|
||||
final void close() throws IOException {
|
||||
if (origEnum != null)
|
||||
origEnum.close();
|
||||
|
|
|
@ -28,7 +28,7 @@ import org.apache.lucene.util.StringHelper;
|
|||
|
||||
final class TermInfosWriter {
|
||||
/** The file format version, a negative number. */
|
||||
public static final int FORMAT = -2;
|
||||
public static final int FORMAT = -3;
|
||||
|
||||
private FieldInfos fieldInfos;
|
||||
private IndexOutput output;
|
||||
|
@ -57,6 +57,11 @@ final class TermInfosWriter {
|
|||
* accelerable cases. More detailed experiments would be useful here. */
|
||||
int skipInterval = 16;
|
||||
|
||||
/** Expert: The maximum number of skip levels. Smaller values result in
|
||||
* slightly smaller indexes, but slower skipping in big posting lists.
|
||||
*/
|
||||
int maxSkipLevels = 10;
|
||||
|
||||
private long lastIndexPointer = 0;
|
||||
private boolean isIndex = false;
|
||||
|
||||
|
@ -85,6 +90,7 @@ final class TermInfosWriter {
|
|||
output.writeLong(0); // leave space for size
|
||||
output.writeInt(indexInterval); // write indexInterval
|
||||
output.writeInt(skipInterval); // write skipInterval
|
||||
output.writeInt(maxSkipLevels); // write maxSkipLevels
|
||||
}
|
||||
|
||||
/** Adds a new <Term, TermInfo> pair to the set.
|
||||
|
|
|
@ -1139,7 +1139,7 @@
|
|||
|
||||
<p>
|
||||
TermInfoFile (.tis)-->
|
||||
TIVersion, TermCount, IndexInterval, SkipInterval, TermInfos
|
||||
TIVersion, TermCount, IndexInterval, SkipInterval, MaxSkipLevels, TermInfos
|
||||
</p>
|
||||
<p>TIVersion -->
|
||||
UInt32
|
||||
|
@ -1153,6 +1153,9 @@
|
|||
<p>SkipInterval -->
|
||||
UInt32
|
||||
</p>
|
||||
<p>MaxSkipLevels -->
|
||||
UInt32
|
||||
</p>
|
||||
<p>TermInfos -->
|
||||
<TermInfo>
|
||||
<sup>TermCount</sup>
|
||||
|
@ -1209,7 +1212,8 @@
|
|||
particular, it is the number of bytes
|
||||
after TermFreqs that the SkipData starts.
|
||||
In other words, it is the length of the
|
||||
TermFreq data.
|
||||
TermFreq data. SkipDelta is only stored
|
||||
if DocFreq is not smaller than SkipInterval.
|
||||
</p>
|
||||
</li>
|
||||
<li>
|
||||
|
@ -1233,7 +1237,7 @@
|
|||
|
||||
<p>
|
||||
TermInfoIndex (.tii)-->
|
||||
TIVersion, IndexTermCount, IndexInterval, SkipInterval, TermIndices
|
||||
TIVersion, IndexTermCount, IndexInterval, SkipInterval, MaxSkipLevels, TermIndices
|
||||
</p>
|
||||
<p>TIVersion -->
|
||||
UInt32
|
||||
|
@ -1261,8 +1265,11 @@
|
|||
</p>
|
||||
<p>SkipInterval is the fraction of TermDocs stored in skip tables. It is used to accelerate TermDocs.skipTo(int).
|
||||
Larger values result in smaller indexes, greater acceleration, but fewer accelerable cases, while
|
||||
smaller values result in bigger indexes, less acceleration and more
|
||||
smaller values result in bigger indexes, less acceleration (in case of a small value for MaxSkipLevels) and more
|
||||
accelerable cases.</p>
|
||||
<p>MaxSkipLevels is the max. number of skip levels stored for each term in the .frq file. A low value results in
|
||||
smaller indexes but less acceleration, a larger value results in slighly larger indexes but greater acceleration.
|
||||
See format of .frq file for more information about skip levels.</p>
|
||||
</li>
|
||||
</ol>
|
||||
</section>
|
||||
|
@ -1286,15 +1293,23 @@
|
|||
DocDelta, Freq?
|
||||
</p>
|
||||
<p>SkipData -->
|
||||
<<SkipLevelLength, SkipLevel>
|
||||
<sup>NumSkipLevels-1</sup>, SkipLevel>
|
||||
<SkipDatum>
|
||||
<sup>DocFreq/SkipInterval</sup>
|
||||
</p>
|
||||
<p>SkipLevel -->
|
||||
<SkipDatum>
|
||||
<sup>DocFreq/(SkipInterval^(Level + 1))</sup>
|
||||
</p>
|
||||
<p>SkipDatum -->
|
||||
DocSkip,PayloadLength?,FreqSkip,ProxSkip
|
||||
DocSkip,PayloadLength?,FreqSkip,ProxSkip,SkipChildLevelPointer?
|
||||
</p>
|
||||
<p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip -->
|
||||
VInt
|
||||
</p>
|
||||
<p>SkipChildLevelPointer -->
|
||||
VLong
|
||||
</p>
|
||||
<p>TermFreqs
|
||||
are ordered by term (the term is implicit, from the .tis file).
|
||||
</p>
|
||||
|
@ -1362,6 +1377,18 @@
|
|||
<sup>nd</sup>
|
||||
starts.
|
||||
</p>
|
||||
<p>Lucene 2.2 introduces the notion of skip levels. Each term can have multiple skip levels.
|
||||
The amount of skip levels for a term is NumSkipLevels = Min(MaxSkipLevels, floor(log(DocFreq/log(SkipInterval)))).
|
||||
The number of SkipData entries for a skip level is DocFreq/(SkipInterval^(Level + 1)), whereas the lowest skip
|
||||
level is Level=0. <br></br>
|
||||
Example: SkipInterval = 4, MaxSkipLevels = 2, DocFreq = 35. Then skip level 0 has 8 SkipData entries,
|
||||
containing the 3<sup>rd</sup>, 7<sup>th</sup>, 11<sup>th</sup>, 15<sup>th</sup>, 19<sup>th</sup>, 23<sup>rd</sup>,
|
||||
27<sup>th</sup>, and 31<sup>st</sup> document numbers in TermFreqs. Skip level 1 has 2 SkipData entries, containing the
|
||||
15<sup>th</sup> and 31<sup>st</sup> document numbers in TermFreqs. <br></br>
|
||||
The SkipData entries on all upper levels > 0 contain a SkipChildLevelPointer referencing the corresponding SkipData
|
||||
entry in level-1. In the example has entry 15 on level 1 a pointer to entry 15 on level 0 and entry 31 on level 1 a pointer
|
||||
to entry 31 on level 0.
|
||||
</p>
|
||||
|
||||
</section>
|
||||
<section id="Positions"><title>Positions</title>
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Field.Index;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
||||
/**
|
||||
* This testcase tests whether multi-level skipping is being used
|
||||
* to reduce I/O while skipping through posting lists.
|
||||
*
|
||||
* Skipping in general is already covered by several other
|
||||
* testcases.
|
||||
*
|
||||
*/
|
||||
public class TestMultiLevelSkipList extends TestCase {
|
||||
public void testSimpleSkip() throws IOException {
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new PayloadAnalyzer(), true);
|
||||
Term term = new Term("test", "a");
|
||||
for (int i = 0; i < 5000; i++) {
|
||||
Document d1 = new Document();
|
||||
d1.add(new Field(term.field(), term.text(), Store.NO, Index.TOKENIZED));
|
||||
writer.addDocument(d1);
|
||||
}
|
||||
writer.flush();
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
IndexReader reader = IndexReader.open(dir);
|
||||
SegmentTermPositions tp = (SegmentTermPositions) reader.termPositions(term);
|
||||
tp.freqStream = new CountingStream(tp.freqStream);
|
||||
|
||||
tp.next();
|
||||
|
||||
checkSkipTo(tp, 14, 185); // no skips
|
||||
checkSkipTo(tp, 17, 190); // one skip on level 0
|
||||
checkSkipTo(tp, 287, 200); // one skip on level 1, two on level 0
|
||||
|
||||
// this test would fail if we had only one skip level,
|
||||
// because than more bytes would be read from the freqStream
|
||||
checkSkipTo(tp, 4800, 250);// one skip on level 2
|
||||
}
|
||||
|
||||
public void checkSkipTo(TermPositions tp, int target, int maxCounter) throws IOException {
|
||||
tp.skipTo(target);
|
||||
if (maxCounter < counter) {
|
||||
fail("Too many bytes read: " + counter);
|
||||
}
|
||||
|
||||
assertEquals("Wrong document " + tp.doc() + " after skipTo target " + target, target, tp.doc());
|
||||
assertEquals("Frequency is not 1: " + tp.freq(), 1,tp.freq());
|
||||
tp.nextPosition();
|
||||
byte[] b = new byte[1];
|
||||
tp.getPayload(b, 0);
|
||||
assertEquals("Wrong payload for the target " + target + ": " + b[0], (byte) target, b[0]);
|
||||
}
|
||||
|
||||
private static class PayloadAnalyzer extends Analyzer {
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new PayloadFilter(new LowerCaseTokenizer(reader));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class PayloadFilter extends TokenFilter {
|
||||
static int count = 0;
|
||||
|
||||
protected PayloadFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
Token t = input.next();
|
||||
if (t != null) {
|
||||
t.setPayload(new Payload(new byte[] { (byte) count++ }));
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private int counter = 0;
|
||||
|
||||
// Simply extends IndexInput in a way that we are able to count the number
|
||||
// of bytes read
|
||||
class CountingStream extends IndexInput {
|
||||
private IndexInput input;
|
||||
|
||||
CountingStream(IndexInput input) {
|
||||
this.input = input;
|
||||
}
|
||||
|
||||
public byte readByte() throws IOException {
|
||||
TestMultiLevelSkipList.this.counter++;
|
||||
return this.input.readByte();
|
||||
}
|
||||
|
||||
public void readBytes(byte[] b, int offset, int len) throws IOException {
|
||||
TestMultiLevelSkipList.this.counter += len;
|
||||
this.input.readBytes(b, offset, len);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
this.input.close();
|
||||
}
|
||||
|
||||
public long getFilePointer() {
|
||||
return this.input.getFilePointer();
|
||||
}
|
||||
|
||||
public void seek(long pos) throws IOException {
|
||||
this.input.seek(pos);
|
||||
}
|
||||
|
||||
public long length() {
|
||||
return this.input.length();
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
return new CountingStream((IndexInput) this.input.clone());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue