From 53d5541b20f27b32d4c6e25d962e8fe0edf8d784 Mon Sep 17 00:00:00 2001 From: easyice <80152403@qq.com> Date: Sat, 2 Mar 2024 13:04:48 +0800 Subject: [PATCH] use group-varint for positions only --- .../lucene/index/FreqProxTermsWriter.java | 156 +++++++----------- 1 file changed, 59 insertions(+), 97 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index e2d25b2300d..3c30a35f46f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -440,73 +440,46 @@ final class FreqProxTermsWriter extends TermsHash { this.postingInput = buffer.toDataInput(); } - private void writePositionsWithOffsets(final PostingsEnum in, final DataOutput out, int freq) - throws IOException { - int previousPosition = 0; - int previousEndOffset = 0; - for (int i = 0; i < freq; i++) { - final int pos = in.nextPosition(); - final BytesRef payload = in.getPayload(); - // The low-order bit of token is set only if there is a payload, the - // previous bits are the delta-encoded position. - final int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1); - previousPosition = pos; - - final int startOffset = in.startOffset(); - final int endOffset = in.endOffset(); - posDeltaBuffer[0] = token; - posDeltaBuffer[1] = startOffset - previousEndOffset; - posDeltaBuffer[2] = endOffset - startOffset; - posDeltaBuffer[3] = payload == null ? 0 : payload.length; - out.writeGroupVInts(posDeltaBuffer, 4); - previousEndOffset = endOffset; - if (payload != null) { - out.writeBytes(payload.bytes, payload.offset, payload.length); - } - } - } - - private void writePositionsWithOutOffsets(final PostingsEnum in, final DataOutput out, int freq) - throws IOException { - int previousPosition = 0; - if (storePayloads) { - for (int i = 0; i < freq; i++) { - final int pos = in.nextPosition(); - final BytesRef payload = in.getPayload(); - // The low-order bit of token is set only if there is a payload, the - // previous bits are the delta-encoded position. - final int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1); - out.writeVInt(token); - previousPosition = pos; - if (payload != null) { - out.writeVInt(payload.length); - out.writeBytes(payload.bytes, payload.offset, payload.length); - } - } - } else { // Only store token - int posWrite = 0; - for (int i = 0; i < freq; i++) { - final int pos = in.nextPosition(); - final int token = (pos - previousPosition) << 1; - posDeltaBuffer[posWrite++] = token; - previousPosition = pos; - if (posWrite == POS_BUFFER_SIZE) { - flushPositions(out, posWrite); - posWrite = 0; - } - } - flushPositions(out, posWrite); - } - } - private void addPositions(final PostingsEnum in, final DataOutput out) throws IOException { int freq = in.freq(); out.writeVInt(freq); if (storePositions) { - if (storeOffsets) { - writePositionsWithOffsets(in, out, freq); - } else { - writePositionsWithOutOffsets(in, out, freq); + int previousPosition = 0; + int previousEndOffset = 0; + if (storeOffsets || storePayloads) { + for (int i = 0; i < freq; i++) { + final int pos = in.nextPosition(); + final BytesRef payload = in.getPayload(); + // The low-order bit of token is set only if there is a payload, the + // previous bits are the delta-encoded position. + final int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1); + out.writeVInt(token); + previousPosition = pos; + if (storeOffsets) { // don't encode offsets if they are not stored + final int startOffset = in.startOffset(); + final int endOffset = in.endOffset(); + out.writeVInt(startOffset - previousEndOffset); + out.writeVInt(endOffset - startOffset); + previousEndOffset = endOffset; + } + if (payload != null) { + out.writeVInt(payload.length); + out.writeBytes(payload.bytes, payload.offset, payload.length); + } + } + } else { // Only store token + int posWrite = 0; + for (int i = 0; i < freq; i++) { + final int pos = in.nextPosition(); + final int token = (pos - previousPosition) << 1; + posDeltaBuffer[posWrite++] = token; + previousPosition = pos; + if (posWrite == POS_BUFFER_SIZE) { + flushPositions(out, posWrite); + posWrite = 0; + } + } + flushPositions(out, posWrite); } } } @@ -566,47 +539,36 @@ final class FreqProxTermsWriter extends TermsHash { out.writeGroupVInts(posDeltaBuffer, len); } - private void readPayload(int token, boolean readPayloadLength) throws IOException { - if ((token & 1) != 0) { - payload.offset = 0; - if (readPayloadLength) { - payload.length = postingInput.readVInt(); - } - if (payload.length > payload.bytes.length) { - payload.bytes = new byte[ArrayUtil.oversize(payload.length, 1)]; - } - postingInput.readBytes(payload.bytes, 0, payload.length); - } else { - payload.length = 0; - } - } - @Override public int nextPosition() throws IOException { if (storePositions == false) { return -1; } - if (storeOffsets) { - postingInput.readGroupVInts(posDeltaBuffer, 4); - final int token = (int) posDeltaBuffer[0]; + + if (storeOffsets || storePayloads) { + final int token = postingInput.readVInt(); pos += token >>> 1; - startOffset = endOffset + (int) posDeltaBuffer[1]; - endOffset = startOffset + (int) posDeltaBuffer[2]; - payload.length = (int) posDeltaBuffer[3]; - readPayload(token, false); - } else { - if (storePayloads) { - final int token = postingInput.readVInt(); - pos += token >>> 1; - readPayload(token, true); - } else { - // decode token from group-varint - if (posBufferUpto == POS_BUFFER_SIZE) { - refillPositions(); - } - pos += posDeltaBuffer[posBufferUpto] >>> 1; - posBufferUpto++; + if (storeOffsets) { + startOffset = endOffset + postingInput.readVInt(); + endOffset = startOffset + postingInput.readVInt(); } + if ((token & 1) != 0) { + payload.offset = 0; + payload.length = postingInput.readVInt(); + if (payload.length > payload.bytes.length) { + payload.bytes = new byte[ArrayUtil.oversize(payload.length, 1)]; + } + postingInput.readBytes(payload.bytes, 0, payload.length); + } else { + payload.length = 0; + } + } else { + // decode token from group-varint + if (posBufferUpto == POS_BUFFER_SIZE) { + refillPositions(); + } + pos += posDeltaBuffer[posBufferUpto] >>> 1; + posBufferUpto++; } return pos; }