diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index db80c93ffb7..54f01c5c278 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -83,7 +83,10 @@ Other API Changes
 ---------------------
-(No changes)
+
+* LUCENE-9093: Not an API change but a change in behavior of the UnifiedHighlighter's LengthGoalBreakIterator: it now
+  yields Passages sized a little differently because the sizing pivot is the center of the first match rather than its
+  left edge.
 
 New Features
 ---------------------
@@ -104,6 +107,12 @@ Improvements
 
 * LUCENE-9105: UniformSplit postings format detects corrupted index and better handles IO exceptions. (Bruno Roustant)
 
+* LUCENE-9106: UniformSplit postings format allows extension of block/line serializers. (Bruno Roustant)
+
+* LUCENE-9093: UnifiedHighlighter's LengthGoalBreakIterator has a new fragmentAlignment option to better center the
+  first match in the passage. Also, the sizing now pivots at the center of the first match term rather than its left
+  edge, so the resulting Passages differ from those produced previously. (Nándor Mátravölgyi, David Smiley)
+
 Optimizations
 ---------------------
 (No changes)
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockHeader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockHeader.java
index d512fdcc045..cfdf2e98503 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockHeader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockHeader.java
@@ -134,43 +134,49 @@ public class BlockHeader implements Accountable {
     return basePayloadsFP;
   }
 
-  public void write(DataOutput output) throws IOException {
-    assert linesCount > 0 : "block header does not seem to be initialized";
-    output.writeVInt(linesCount);
-
-    output.writeVLong(baseDocsFP);
-    output.writeVLong(basePositionsFP);
-    output.writeVLong(basePayloadsFP);
-
-    output.writeVInt(termStatesBaseOffset);
-    output.writeVInt(middleLineOffset);
-  }
-
-  public static BlockHeader read(DataInput input, BlockHeader reuse) throws IOException {
-    int linesCount = input.readVInt();
-    if (linesCount <= 0 || linesCount > UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES) {
-      throw new CorruptIndexException("Illegal number of lines in a block: " + linesCount, input);
-    }
-
-    long baseDocsFP = input.readVLong();
-    long basePositionsFP = input.readVLong();
-    long basePayloadsFP = input.readVLong();
-
-    int termStatesBaseOffset = input.readVInt();
-    if (termStatesBaseOffset < 0) {
-      throw new CorruptIndexException("Illegal termStatesBaseOffset= " + termStatesBaseOffset, input);
-    }
-    int middleTermOffset = input.readVInt();
-    if (middleTermOffset < 0) {
-      throw new CorruptIndexException("Illegal middleTermOffset= " + middleTermOffset, input);
-    }
-
-    BlockHeader blockHeader = reuse == null ? new BlockHeader() : reuse;
-    return blockHeader.reset(linesCount, baseDocsFP, basePositionsFP, basePayloadsFP, termStatesBaseOffset, middleTermOffset);
-  }
-
   @Override
   public long ramBytesUsed() {
     return RAM_USAGE;
   }
+
+  /**
+   * Reads/writes block header.
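+   * This is an extension point: a subclass may override {@code write} and {@code read} to
+   * extend the block header format (LUCENE-9106).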
+   */
+  public static class Serializer {
+
+    public void write(DataOutput output, BlockHeader blockHeader) throws IOException {
+      assert blockHeader.linesCount > 0 : "Block header is not initialized";
+      output.writeVInt(blockHeader.linesCount);
+
+      output.writeVLong(blockHeader.baseDocsFP);
+      output.writeVLong(blockHeader.basePositionsFP);
+      output.writeVLong(blockHeader.basePayloadsFP);
+
+      output.writeVInt(blockHeader.termStatesBaseOffset);
+      output.writeVInt(blockHeader.middleLineOffset);
+    }
+
+    public BlockHeader read(DataInput input, BlockHeader reuse) throws IOException {
+      int linesCount = input.readVInt();
+      if (linesCount <= 0 || linesCount > UniformSplitTermsWriter.MAX_NUM_BLOCK_LINES) {
+        throw new CorruptIndexException("Illegal number of lines in block: " + linesCount, input);
+      }
+
+      long baseDocsFP = input.readVLong();
+      long basePositionsFP = input.readVLong();
+      long basePayloadsFP = input.readVLong();
+
+      int termStatesBaseOffset = input.readVInt();
+      if (termStatesBaseOffset < 0) {
+        throw new CorruptIndexException("Illegal termStatesBaseOffset= " + termStatesBaseOffset, input);
+      }
+      int middleTermOffset = input.readVInt();
+      if (middleTermOffset < 0) {
+        throw new CorruptIndexException("Illegal middleTermOffset= " + middleTermOffset, input);
+      }
+
+      BlockHeader blockHeader = reuse == null ? new BlockHeader() : reuse;
+      return blockHeader.reset(linesCount, baseDocsFP, basePositionsFP, basePayloadsFP, termStatesBaseOffset, middleTermOffset);
+    }
+  }
 }
\ No newline at end of file
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockLine.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockLine.java
index cea8e932e72..e39cfc6d56d 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockLine.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockLine.java
@@ -107,7 +107,7 @@ public class BlockLine implements Accountable {
   }
 
   /**
-   * Reads block lines with terms encoded incrementally inside a block.
+   * Reads/writes block lines with terms encoded incrementally inside a block.
    * This class keeps a state of the previous term read to decode the next term.
    */
   public static class Serializer implements Accountable {
@@ -149,7 +149,7 @@
      * the incremental encoding. {@code true} for the first
      * and middle term, {@code false} for other terms.
      */
-    public static void writeLine(DataOutput blockOutput, BlockLine line, BlockLine previousLine,
+    public void writeLine(DataOutput blockOutput, BlockLine line, BlockLine previousLine,
                                  int termStateRelativeOffset, boolean isIncrementalEncodingSeed) throws IOException {
       blockOutput.writeVInt(termStateRelativeOffset);
       writeIncrementallyEncodedTerm(line.getTermBytes(), previousLine == null ? null : previousLine.getTermBytes(),
@@ -161,13 +161,13 @@
      *
      * @param termStatesOutput The output pointing to the details region.
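     * This method is non-static so that a subclass can override it to write custom term-state data.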
     */
-    protected static void writeLineTermState(DataOutput termStatesOutput, BlockLine line,
+    protected void writeLineTermState(DataOutput termStatesOutput, BlockLine line,
                                       FieldInfo fieldInfo, DeltaBaseTermStateSerializer encoder) throws IOException {
       assert line.termState != null;
       encoder.writeTermState(termStatesOutput, fieldInfo, line.termState);
     }
 
-    protected static void writeIncrementallyEncodedTerm(TermBytes termBytes, TermBytes previousTermBytes,
+    protected void writeIncrementallyEncodedTerm(TermBytes termBytes, TermBytes previousTermBytes,
                                                  boolean isIncrementalEncodingSeed, DataOutput blockOutput) throws IOException {
       BytesRef term = termBytes.getTerm();
       assert term.offset == 0;
@@ -240,7 +240,7 @@
      * Reads {@code length} bytes from the given {@link DataInput} and stores
      * them at {@code offset} in {@code bytes.bytes}.
      */
-    protected static void readBytes(DataInput input, BytesRef bytes, int offset, int length) throws IOException {
+    protected void readBytes(DataInput input, BytesRef bytes, int offset, int length) throws IOException {
       assert bytes.offset == 0;
       bytes.length = offset + length;
       bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length);
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockReader.java
index 5c9c840bd98..8d4bfc04328 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockReader.java
@@ -60,6 +60,7 @@ public class BlockReader extends BaseTermsEnum implements Accountable {
   protected final FieldMetadata fieldMetadata;
   protected final BlockDecoder blockDecoder;
 
+  protected BlockHeader.Serializer blockHeaderReader;
   protected BlockLine.Serializer blockLineReader;
   /**
    * In-memory read buffer for the current block.
@@ -406,14 +407,27 @@
   protected void initializeBlockReadLazily() throws IOException {
     if (blockStartFP == -1) {
       blockInput = blockInput.clone();
-      blockLineReader = new BlockLine.Serializer();
+      blockHeaderReader = createBlockHeaderSerializer();
+      blockLineReader = createBlockLineSerializer();
       blockReadBuffer = new ByteArrayDataInput();
       termStatesReadBuffer = new ByteArrayDataInput();
-      termStateSerializer = new DeltaBaseTermStateSerializer();
+      termStateSerializer = createDeltaBaseTermStateSerializer();
       scratchBlockBytes = new BytesRef();
     }
   }
 
+  protected BlockHeader.Serializer createBlockHeaderSerializer() {
+    return new BlockHeader.Serializer();
+  }
+
+  protected BlockLine.Serializer createBlockLineSerializer() {
+    return new BlockLine.Serializer();
+  }
+
+  protected DeltaBaseTermStateSerializer createDeltaBaseTermStateSerializer() {
+    return new DeltaBaseTermStateSerializer();
+  }
+
   /**
    * Reads the block header.
    * Sets {@link #blockHeader}.
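    * The header is decoded by the {@link BlockHeader.Serializer} obtained from
    * {@link #createBlockHeaderSerializer()}, which subclasses may override.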
@@ -428,7 +442,7 @@ public class BlockReader extends BaseTermsEnum implements Accountable {
     BytesRef blockBytesRef = decodeBlockBytesIfNeeded(numBlockBytes);
     blockReadBuffer.reset(blockBytesRef.bytes, blockBytesRef.offset, blockBytesRef.length);
     termStatesReadBuffer.reset(blockBytesRef.bytes, blockBytesRef.offset, blockBytesRef.length);
-    return blockHeader = BlockHeader.read(blockReadBuffer, blockHeader);
+    return blockHeader = blockHeaderReader.read(blockReadBuffer, blockHeader);
   }
 
   protected BytesRef decodeBlockBytesIfNeeded(int numBlockBytes) throws IOException {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockWriter.java
index acc397d6084..a1c3d70d6f1 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockWriter.java
@@ -60,6 +60,8 @@ public class BlockWriter {
   protected final ByteBuffersDataOutput blockLinesWriteBuffer;
   protected final ByteBuffersDataOutput termStatesWriteBuffer;
 
+  protected final BlockHeader.Serializer blockHeaderWriter;
+  protected final BlockLine.Serializer blockLineWriter;
   protected final DeltaBaseTermStateSerializer termStateSerializer;
   protected final BlockEncoder blockEncoder;
   protected final ByteBuffersDataOutput blockWriteBuffer;
@@ -81,7 +83,9 @@
     this.blockEncoder = blockEncoder;
 
     this.blockLines = new ArrayList<>(targetNumBlockLines);
-    this.termStateSerializer = new DeltaBaseTermStateSerializer();
+    this.blockHeaderWriter = createBlockHeaderSerializer();
+    this.blockLineWriter = createBlockLineSerializer();
+    this.termStateSerializer = createDeltaBaseTermStateSerializer();
 
     this.blockLinesWriteBuffer = ByteBuffersDataOutput.newResettableInstance();
     this.termStatesWriteBuffer = ByteBuffersDataOutput.newResettableInstance();
@@ -91,6 +95,18 @@
     this.scratchBytesRef = new BytesRef();
   }
 
+  protected BlockHeader.Serializer createBlockHeaderSerializer() {
+    return new BlockHeader.Serializer();
+  }
+
+  protected BlockLine.Serializer createBlockLineSerializer() {
+    return new BlockLine.Serializer();
+  }
+
+  protected DeltaBaseTermStateSerializer createDeltaBaseTermStateSerializer() {
+    return new DeltaBaseTermStateSerializer();
+  }
+
   /**
    * Adds a new {@link BlockLine} term for the current field.
    *
@@ -196,7 +212,7 @@
     reusableBlockHeader.reset(blockLines.size(), termStateSerializer.getBaseDocStartFP(), termStateSerializer.getBasePosStartFP(),
         termStateSerializer.getBasePayStartFP(), Math.toIntExact(blockLinesWriteBuffer.size()), middleOffset);
-    reusableBlockHeader.write(blockWriteBuffer);
+    blockHeaderWriter.write(blockWriteBuffer, reusableBlockHeader);
 
     blockLinesWriteBuffer.copyTo(blockWriteBuffer);
     termStatesWriteBuffer.copyTo(blockWriteBuffer);
@@ -236,8 +252,8 @@
   protected void writeBlockLine(boolean isIncrementalEncodingSeed, BlockLine line, BlockLine previousLine) throws IOException {
     assert fieldMetadata != null;
-    BlockLine.Serializer.writeLine(blockLinesWriteBuffer, line, previousLine, Math.toIntExact(termStatesWriteBuffer.size()), isIncrementalEncodingSeed);
-    BlockLine.Serializer.writeLineTermState(termStatesWriteBuffer, line, fieldMetadata.getFieldInfo(), termStateSerializer);
+    blockLineWriter.writeLine(blockLinesWriteBuffer, line, previousLine, Math.toIntExact(termStatesWriteBuffer.size()), isIncrementalEncodingSeed);
+    blockLineWriter.writeLineTermState(termStatesWriteBuffer, line, fieldMetadata.getFieldInfo(), termStateSerializer);
   }
 
   /**
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FieldMetadata.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FieldMetadata.java
index f4cede05f06..8e2fc84574b 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FieldMetadata.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FieldMetadata.java
@@ -194,88 +194,99 @@
         + (docsSeen == null ? 0 : docsSeen.ramBytesUsed());
   }
 
-  public static FieldMetadata read(DataInput input, FieldInfos fieldInfos, int maxNumDocs) throws IOException {
-    int fieldId = input.readVInt();
-    FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldId);
-    if (fieldInfo == null) {
-      throw new CorruptIndexException("Illegal field id= " + fieldId, input);
-    }
-    FieldMetadata fieldMetadata = new FieldMetadata(fieldInfo, 0, false);
+  /**
+   * Reads/writes field metadata.
+   */
+  public static class Serializer {
 
-    fieldMetadata.numTerms = input.readVInt();
-    if (fieldMetadata.numTerms <= 0) {
-      throw new CorruptIndexException("Illegal number of terms= " + fieldMetadata.numTerms + " for field= " + fieldId, input);
-    }
+    /**
+     * Stateless singleton.
+     */
+    public static final Serializer INSTANCE = new Serializer();
 
-    fieldMetadata.sumDocFreq = input.readVInt();
-    fieldMetadata.sumTotalTermFreq = fieldMetadata.sumDocFreq;
-    if (fieldMetadata.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
-      fieldMetadata.sumTotalTermFreq += input.readVInt();
-      if (fieldMetadata.sumTotalTermFreq < fieldMetadata.sumDocFreq) {
-        // #positions must be >= #postings.
-        throw new CorruptIndexException("Illegal sumTotalTermFreq= " + fieldMetadata.sumTotalTermFreq
-            + " sumDocFreq= " + fieldMetadata.sumDocFreq + " for field= " + fieldId, input);
+    public void write(DataOutput output, FieldMetadata fieldMetadata) throws IOException {
+      assert fieldMetadata.dictionaryStartFP >= 0;
+      assert fieldMetadata.firstBlockStartFP >= 0;
+      assert fieldMetadata.lastBlockStartFP >= 0;
+      assert fieldMetadata.numTerms > 0 : "There should be at least one term for field " + fieldMetadata.fieldInfo.name + ": " + fieldMetadata.numTerms;
+      assert fieldMetadata.firstBlockStartFP <= fieldMetadata.lastBlockStartFP : "start: " + fieldMetadata.firstBlockStartFP + " end: " + fieldMetadata.lastBlockStartFP;
+      assert fieldMetadata.lastTerm != null : "you must set the last term";
+
+      output.writeVInt(fieldMetadata.fieldInfo.number);
+
+      output.writeVInt(fieldMetadata.numTerms);
+      output.writeVInt(fieldMetadata.sumDocFreq);
+
+      if (fieldMetadata.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
+        assert fieldMetadata.sumTotalTermFreq >= fieldMetadata.sumDocFreq : "sumTotalFQ: " + fieldMetadata.sumTotalTermFreq + " sumDocFQ: " + fieldMetadata.sumDocFreq;
+        output.writeVInt(fieldMetadata.sumTotalTermFreq - fieldMetadata.sumDocFreq);
+      }
+
+      output.writeVInt(fieldMetadata.getDocCount());
+
+      output.writeVLong(fieldMetadata.dictionaryStartFP);
+      output.writeVLong(fieldMetadata.firstBlockStartFP);
+      output.writeVLong(fieldMetadata.lastBlockStartFP);
+
+      if (fieldMetadata.lastTerm.length > 0) {
+        output.writeVInt(fieldMetadata.lastTerm.length);
+        output.writeBytes(fieldMetadata.lastTerm.bytes, fieldMetadata.lastTerm.offset, fieldMetadata.lastTerm.length);
+      } else {
+        output.writeVInt(0);
       }
     }
 
-    fieldMetadata.docCount = input.readVInt();
-    if (fieldMetadata.docCount < 0 || fieldMetadata.docCount > maxNumDocs) {
-      // #docs with field must be <= #docs.
-      throw new CorruptIndexException("Illegal number of docs= " + fieldMetadata.docCount
-          + " maxNumDocs= " + maxNumDocs + " for field=" + fieldId, input);
-    }
-    if (fieldMetadata.sumDocFreq < fieldMetadata.docCount) {
-      // #postings must be >= #docs with field.
-      throw new CorruptIndexException("Illegal sumDocFreq= " + fieldMetadata.sumDocFreq
-          + " docCount= " + fieldMetadata.docCount + " for field= " + fieldId, input);
-    }
+    public FieldMetadata read(DataInput input, FieldInfos fieldInfos, int maxNumDocs) throws IOException {
+      int fieldId = input.readVInt();
+      FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldId);
+      if (fieldInfo == null) {
+        throw new CorruptIndexException("Illegal field id= " + fieldId, input);
+      }
+      FieldMetadata fieldMetadata = new FieldMetadata(fieldInfo, 0, false);
 
-    fieldMetadata.dictionaryStartFP = input.readVLong();
-    fieldMetadata.firstBlockStartFP = input.readVLong();
-    fieldMetadata.lastBlockStartFP = input.readVLong();
+      fieldMetadata.numTerms = input.readVInt();
+      if (fieldMetadata.numTerms <= 0) {
+        throw new CorruptIndexException("Illegal number of terms= " + fieldMetadata.numTerms + " for field= " + fieldId, input);
+      }
 
-    int lastTermLength = input.readVInt();
-    BytesRef lastTerm = new BytesRef(lastTermLength);
-    if (lastTermLength > 0) {
-      input.readBytes(lastTerm.bytes, 0, lastTermLength);
-      lastTerm.length = lastTermLength;
-    } else if (lastTermLength < 0) {
-      throw new CorruptIndexException("Illegal last term length= " + lastTermLength + " for field= " + fieldId, input);
-    }
-    fieldMetadata.setLastTerm(lastTerm);
+      fieldMetadata.sumDocFreq = input.readVInt();
+      fieldMetadata.sumTotalTermFreq = fieldMetadata.sumDocFreq;
+      if (fieldMetadata.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
+        fieldMetadata.sumTotalTermFreq += input.readVInt();
+        if (fieldMetadata.sumTotalTermFreq < fieldMetadata.sumDocFreq) {
+          // #positions must be >= #postings.
+          throw new CorruptIndexException("Illegal sumTotalTermFreq= " + fieldMetadata.sumTotalTermFreq
+              + " sumDocFreq= " + fieldMetadata.sumDocFreq + " for field= " + fieldId, input);
+        }
+      }
 
-    return fieldMetadata;
-  }
+      fieldMetadata.docCount = input.readVInt();
+      if (fieldMetadata.docCount < 0 || fieldMetadata.docCount > maxNumDocs) {
+        // #docs with field must be <= #docs.
+        throw new CorruptIndexException("Illegal number of docs= " + fieldMetadata.docCount
+            + " maxNumDocs= " + maxNumDocs + " for field=" + fieldId, input);
+      }
+      if (fieldMetadata.sumDocFreq < fieldMetadata.docCount) {
+        // #postings must be >= #docs with field.
+        throw new CorruptIndexException("Illegal sumDocFreq= " + fieldMetadata.sumDocFreq
+            + " docCount= " + fieldMetadata.docCount + " for field= " + fieldId, input);
+      }
 
-  public void write(DataOutput output) throws IOException {
-    assert dictionaryStartFP >= 0;
-    assert firstBlockStartFP >= 0;
-    assert lastBlockStartFP >= 0;
-    assert numTerms > 0 : "There should be at least one term for field " + fieldInfo.name + ": " + numTerms;
-    assert firstBlockStartFP <= lastBlockStartFP : "start: " + firstBlockStartFP + " end: " + lastBlockStartFP;
-    assert lastTerm != null : "you must set the last term";
+      fieldMetadata.dictionaryStartFP = input.readVLong();
+      fieldMetadata.firstBlockStartFP = input.readVLong();
+      fieldMetadata.lastBlockStartFP = input.readVLong();
 
-    output.writeVInt(fieldInfo.number);
+      int lastTermLength = input.readVInt();
+      BytesRef lastTerm = new BytesRef(lastTermLength);
+      if (lastTermLength > 0) {
+        input.readBytes(lastTerm.bytes, 0, lastTermLength);
+        lastTerm.length = lastTermLength;
+      } else if (lastTermLength < 0) {
+        throw new CorruptIndexException("Illegal last term length= " + lastTermLength + " for field= " + fieldId, input);
+      }
+      fieldMetadata.setLastTerm(lastTerm);
 
-    output.writeVInt(numTerms);
-    output.writeVInt(sumDocFreq);
-
-    if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) {
-      assert sumTotalTermFreq >= sumDocFreq : "sumTotalFQ: " + sumTotalTermFreq + " sumDocFQ: " + sumDocFreq;
-      output.writeVInt(sumTotalTermFreq - sumDocFreq);
-    }
-
-    output.writeVInt(getDocCount());
-
-    output.writeVLong(dictionaryStartFP);
-    output.writeVLong(firstBlockStartFP);
-    output.writeVLong(lastBlockStartFP);
-
-    if (lastTerm.length > 0) {
-      output.writeVInt(lastTerm.length);
-      output.writeBytes(lastTerm.bytes, lastTerm.offset, lastTerm.length);
-    } else {
-      output.writeVInt(0);
+      return fieldMetadata;
     }
   }
 }
\ No newline at end of file
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java
index f116eee7124..bda04063fee 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java
@@ -69,7 +69,7 @@
    * It can be used for decompression or decryption.
    */
   public UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder) throws IOException {
-    this(postingsReader, state, blockDecoder, NAME, VERSION_START, VERSION_CURRENT,
+    this(postingsReader, state, blockDecoder, FieldMetadata.Serializer.INSTANCE, NAME, VERSION_START, VERSION_CURRENT,
         TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION);
   }
 
@@ -77,8 +77,10 @@
    * @param blockDecoder Optional block decoder, may be null if none.
    *                     It can be used for decompression or decryption.
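+   * @param fieldMetadataReader Used to read the per-field metadata; a custom
+   *                            {@link FieldMetadata.Serializer} allows extending the metadata format.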
   */
-  protected UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder,
-                                    String codecName, int versionStart, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException {
+  protected UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state,
+                                    BlockDecoder blockDecoder, FieldMetadata.Serializer fieldMetadataReader,
+                                    String codecName, int versionStart, int versionCurrent,
+                                    String termsBlocksExtension, String dictionaryExtension) throws IOException {
     IndexInput dictionaryInput = null;
     IndexInput blockInput = null;
     boolean success = false;
@@ -100,7 +102,7 @@
       CodecUtil.retrieveChecksum(blockInput);
 
       seekFieldsMetadata(blockInput);
-      Collection<FieldMetadata> fieldMetadataCollection = parseFieldsMetadata(blockInput, state.fieldInfos, state.segmentInfo.maxDoc());
+      Collection<FieldMetadata> fieldMetadataCollection = parseFieldsMetadata(blockInput, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());
 
       fieldToTermsMap = new HashMap<>();
       this.blockInput = blockInput;
@@ -133,19 +135,19 @@
    * @param indexInput {@link IndexInput} must be positioned to the fields metadata
    *                   details by calling {@link #seekFieldsMetadata(IndexInput)} before this call.
    */
-  protected static Collection<FieldMetadata> parseFieldsMetadata(IndexInput indexInput, FieldInfos fieldInfos, int maxNumDocs) throws IOException {
+  protected static Collection<FieldMetadata> parseFieldsMetadata(IndexInput indexInput, FieldInfos fieldInfos,
+                                                                 FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
     int numFields = indexInput.readVInt();
     if (numFields < 0) {
      throw new CorruptIndexException("Illegal number of fields= " + numFields, indexInput);
     }
     Collection<FieldMetadata> fieldMetadataCollection = new ArrayList<>(numFields);
     for (int i = 0; i < numFields; i++) {
-      fieldMetadataCollection.add(FieldMetadata.read(indexInput, fieldInfos, maxNumDocs));
+      fieldMetadataCollection.add(fieldMetadataReader.read(indexInput, fieldInfos, maxNumDocs));
     }
     return fieldMetadataCollection;
   }
-
   @Override
   public void close() throws IOException {
     try {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java
index 5adf74de32d..101b6b5942f 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java
@@ -128,6 +128,7 @@ public class UniformSplitTermsWriter extends FieldsConsumer {
   protected final int deltaNumLines;
 
   protected final BlockEncoder blockEncoder;
+  protected final FieldMetadata.Serializer fieldMetadataWriter;
   protected final IndexOutput blockOutput;
   protected final IndexOutput dictionaryOutput;
@@ -146,7 +147,7 @@
    */
   public UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
                                  int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder) throws IOException {
-    this(postingsWriter, state, targetNumBlockLines, deltaNumLines, blockEncoder,
+    this(postingsWriter, state, targetNumBlockLines, deltaNumLines, blockEncoder, FieldMetadata.Serializer.INSTANCE,
         NAME, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION);
   }
 
@@ -164,7 +165,7 @@ public class UniformSplitTermsWriter extends FieldsConsumer {
    * It can be used for compression or encryption.
    */
   protected UniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
-                                    int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder,
+                                    int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, FieldMetadata.Serializer fieldMetadataWriter,
                                     String codecName, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException {
     validateSettings(targetNumBlockLines, deltaNumLines);
     IndexOutput blockOutput = null;
@@ -177,6 +178,7 @@ public class UniformSplitTermsWriter extends FieldsConsumer {
     this.targetNumBlockLines = targetNumBlockLines;
     this.deltaNumLines = deltaNumLines;
     this.blockEncoder = blockEncoder;
+    this.fieldMetadataWriter = fieldMetadataWriter;
 
     String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, termsBlocksExtension);
     blockOutput = state.directory.createOutput(termsName, state.context);
@@ -278,7 +280,7 @@
     if (fieldMetadata.getNumTerms() > 0) {
       fieldMetadata.setLastTerm(lastTerm);
-      fieldMetadata.write(fieldsOutput);
+      fieldMetadataWriter.write(fieldsOutput, fieldMetadata);
       writeDictionary(dictionaryBuilder);
       return 1;
     }
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockLine.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockLine.java
index 46f6ab7675b..7dfe312a8a2 100755
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockLine.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockLine.java
@@ -75,7 +75,7 @@ public class STBlockLine extends BlockLine {
     /**
      * Writes all the {@link BlockTermState} of the provided {@link STBlockLine} to the given output.
      */
-    public static void writeLineTermStates(DataOutput termStatesOutput, STBlockLine line,
+    public void writeLineTermStates(DataOutput termStatesOutput, STBlockLine line,
                                     DeltaBaseTermStateSerializer encoder) throws IOException {
 
       FieldMetadataTermState fieldMetadataTermState;
@@ -111,7 +111,7 @@
      * @return The {@link BlockTermState} corresponding to the provided field id; or null if the field
      * does not occur in the line.
      */
-    public static BlockTermState readTermStateForField(int fieldId, DataInput termStatesInput,
+    public BlockTermState readTermStateForField(int fieldId, DataInput termStatesInput,
                                                 DeltaBaseTermStateSerializer termStateSerializer,
                                                 BlockHeader blockHeader, FieldInfos fieldInfos,
                                                 BlockTermState reuse) throws IOException {
@@ -161,7 +161,7 @@
      * @param fieldTermStatesMap Map filled with the term states for each field. It is cleared first.
      * @see #readTermStateForField
      */
-    public static void readFieldTermStatesMap(DataInput termStatesInput,
+    public void readFieldTermStatesMap(DataInput termStatesInput,
                                        DeltaBaseTermStateSerializer termStateSerializer,
                                        BlockHeader blockHeader,
                                        FieldInfos fieldInfos,
@@ -183,7 +183,7 @@
     /**
      * Reads all the field ids in the current block line of the provided input.
      */
-    public static int[] readFieldIds(DataInput termStatesInput, int numFields) throws IOException {
+    public int[] readFieldIds(DataInput termStatesInput, int numFields) throws IOException {
       int[] fieldIds = new int[numFields];
       for (int i = 0; i < numFields; i++) {
         fieldIds[i] = termStatesInput.readVInt();
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReader.java
index 3f04d92eebc..6d7c79d6f65 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReader.java
@@ -117,6 +117,11 @@ public class STBlockReader extends BlockReader {
     return blockStartFP > fieldMetadata.getLastBlockStartFP() || super.isBeyondLastTerm(searchedTerm, blockStartFP);
   }
 
+  @Override
+  protected STBlockLine.Serializer createBlockLineSerializer() {
+    return new STBlockLine.Serializer();
+  }
+
   /**
    * Reads the {@link BlockTermState} on the current line for this reader's field.
    *
@@ -125,7 +130,7 @@
   @Override
   protected BlockTermState readTermState() throws IOException {
     termStatesReadBuffer.setPosition(blockFirstLineStart + blockHeader.getTermStatesBaseOffset() + blockLine.getTermStateRelativeOffset());
-    return termState = STBlockLine.Serializer.readTermStateForField(
+    return termState = ((STBlockLine.Serializer) blockLineReader).readTermStateForField(
         fieldMetadata.getFieldInfo().number,
         termStatesReadBuffer,
         termStateSerializer,
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockWriter.java
index 4c37a4ecb27..15e8d542de4 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockWriter.java
@@ -84,10 +84,15 @@
     super.finishLastBlock(dictionaryBuilder);
   }
 
+  @Override
+  protected BlockLine.Serializer createBlockLineSerializer() {
+    return new STBlockLine.Serializer();
+  }
+
   @Override
   protected void writeBlockLine(boolean isIncrementalEncodingSeed, BlockLine line, BlockLine previousLine) throws IOException {
-    STBlockLine.Serializer.writeLine(blockLinesWriteBuffer, line, previousLine, Math.toIntExact(termStatesWriteBuffer.size()), isIncrementalEncodingSeed);
-    STBlockLine.Serializer.writeLineTermStates(termStatesWriteBuffer, (STBlockLine) line, termStateSerializer);
+    blockLineWriter.writeLine(blockLinesWriteBuffer, line, previousLine, Math.toIntExact(termStatesWriteBuffer.size()), isIncrementalEncodingSeed);
+    ((STBlockLine.Serializer) blockLineWriter).writeLineTermStates(termStatesWriteBuffer, (STBlockLine) line, termStateSerializer);
     ((STBlockLine) line).collectFields(fieldsInBlock);
   }
 
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STIntersectBlockReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STIntersectBlockReader.java
index 577aae143d0..099b6c31cef 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STIntersectBlockReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STIntersectBlockReader.java
@@ -91,6 +91,11 @@ public class STIntersectBlockReader extends IntersectBlockReader {
     return super.nextBlockMatchingPrefix() && blockHeader != null;
   }
 
+  @Override
+  protected STBlockLine.Serializer createBlockLineSerializer() {
+    return new STBlockLine.Serializer();
+  }
+
   /**
    * Reads the {@link BlockTermState} on the current line for the specific field
    * corresponding to this reader.
   *
@@ -100,7 +105,7 @@ public class STIntersectBlockReader extends IntersectBlockReader {
   @Override
   protected BlockTermState readTermState() throws IOException {
     termStatesReadBuffer.setPosition(blockFirstLineStart + blockHeader.getTermStatesBaseOffset() + blockLine.getTermStateRelativeOffset());
-    return STBlockLine.Serializer.readTermStateForField(
+    return ((STBlockLine.Serializer) blockLineReader).readTermStateForField(
         fieldMetadata.getFieldInfo().number,
         termStatesReadBuffer,
         termStateSerializer,
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STMergingBlockReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STMergingBlockReader.java
index 5f135bbfac9..fe6bdbe01ff 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STMergingBlockReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STMergingBlockReader.java
@@ -98,7 +98,7 @@ public class STMergingBlockReader extends STBlockReader {
   public void readFieldTermStatesMap(Map<String, BlockTermState> fieldTermStatesMap) throws IOException {
     if (term() != null) {
       termStatesReadBuffer.setPosition(blockFirstLineStart + blockHeader.getTermStatesBaseOffset() + blockLine.getTermStateRelativeOffset());
-      STBlockLine.Serializer.readFieldTermStatesMap(
+      ((STBlockLine.Serializer) blockLineReader).readFieldTermStatesMap(
           termStatesReadBuffer,
           termStateSerializer,
           blockHeader,
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java
index 84360b9e2de..50a17bce8dd 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java
@@ -47,13 +47,15 @@ import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPo
 public class STUniformSplitTermsReader extends UniformSplitTermsReader {
 
   public STUniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder) throws IOException {
-    super(postingsReader, state, blockDecoder, NAME, VERSION_START,
-        VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION);
+    this(postingsReader, state, blockDecoder, FieldMetadata.Serializer.INSTANCE,
+        NAME, VERSION_START, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION);
   }
 
-  protected STUniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder,
-                                      String codecName, int versionStart, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException {
-    super(postingsReader, state, blockDecoder, codecName, versionStart, versionCurrent, termsBlocksExtension, dictionaryExtension);
+  protected STUniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state,
+                                      BlockDecoder blockDecoder, FieldMetadata.Serializer fieldMetadataReader,
+                                      String codecName, int versionStart, int versionCurrent,
+                                      String termsBlocksExtension, String dictionaryExtension) throws IOException {
+    super(postingsReader, state, blockDecoder, fieldMetadataReader, codecName, versionStart, versionCurrent, termsBlocksExtension, dictionaryExtension);
   }
 
   @Override
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsWriter.java
index d121301b7cb..ca15d6a74da 100755
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsWriter.java
@@ -88,13 +88,14 @@ public class STUniformSplitTermsWriter extends UniformSplitTermsWriter {
 
   public STUniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
                                    int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder) throws IOException {
-    this(postingsWriter, state, targetNumBlockLines, deltaNumLines, blockEncoder, NAME, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION);
+    this(postingsWriter, state, targetNumBlockLines, deltaNumLines, blockEncoder, FieldMetadata.Serializer.INSTANCE,
+        NAME, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION);
   }
 
   protected STUniformSplitTermsWriter(PostingsWriterBase postingsWriter, SegmentWriteState state,
-                                      int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder,
+                                      int targetNumBlockLines, int deltaNumLines, BlockEncoder blockEncoder, FieldMetadata.Serializer fieldMetadataWriter,
                                       String codecName, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException {
-    super(postingsWriter, state, targetNumBlockLines, deltaNumLines, blockEncoder, codecName, versionCurrent, termsBlocksExtension, dictionaryExtension);
+    super(postingsWriter, state, targetNumBlockLines, deltaNumLines, blockEncoder, fieldMetadataWriter, codecName, versionCurrent, termsBlocksExtension, dictionaryExtension);
   }
 
   @Override
@@ -200,7 +201,7 @@
     int fieldsNumber = 0;
     for (FieldMetadata fieldMetadata : fieldMetadataList) {
       if (fieldMetadata.getNumTerms() > 0) {
-        fieldMetadata.write(fieldsOutput);
+        fieldMetadataWriter.write(fieldsOutput, fieldMetadata);
         fieldsNumber++;
       }
     }
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
index f2a34dc9ea5..5e45e3f40a9 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
@@ -141,6 +141,7 @@ public class FieldHighlighter {
       }
     });
     Passage passage = new Passage(); // the current passage in-progress.  Will either get reset or added to queue.
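+    // end offset of the previous passage; a new passage's start offset is clamped so it does not backtrack past it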
+    int lastPassageEnd = 0;
 
     do {
       int start = off.startOffset();
@@ -158,9 +159,12 @@
         if (start >= contentLength) {
           break;
         }
+        // find fragment from the middle of the match, so the result's length may be closer to fragsize
+        final int center = start + (end - start) / 2;
         // advance breakIterator
-        passage.setStartOffset(Math.max(this.breakIterator.preceding(start + 1), 0));
-        passage.setEndOffset(Math.min(this.breakIterator.following(start), contentLength));
+        passage.setStartOffset(Math.min(start, Math.max(this.breakIterator.preceding(Math.max(start + 1, center)), lastPassageEnd)));
+        lastPassageEnd = Math.max(end, Math.min(this.breakIterator.following(Math.min(end - 1, center)), contentLength));
+        passage.setEndOffset(lastPassageEnd);
       }
       // Add this term to the passage.
       BytesRef term = off.getTerm();// a reference; safe to refer to
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
index 3134013ab35..a4b5a07f63b 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
@@ -33,22 +33,49 @@ public class LengthGoalBreakIterator extends BreakIterator {
 
   private final BreakIterator baseIter;
   private final int lengthGoal;
+  private final float fragmentAlignment; // how much text to align before match-fragment, valid in range [0, 1]
   private final boolean isMinimumLength; // if false then is "closest to" length
+  private int currentCache;
 
-  /** Breaks will be at least {@code minLength} apart (to the extent possible). */
+  /**
+   * Breaks will be at least {@code minLength} apart (to the extent possible),
+   * while trying to position the match inside the fragment according to {@code fragmentAlignment}.
+   */
+  public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength,
+                                                        float fragmentAlignment) {
+    return new LengthGoalBreakIterator(baseIter, minLength, fragmentAlignment, true, baseIter.current());
+  }
+
+  /** For backwards compatibility you can initialise the break iterator without fragmentAlignment. */
+  @Deprecated
   public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength) {
-    return new LengthGoalBreakIterator(baseIter, minLength, true);
+    return createMinLength(baseIter, minLength, 0.f);
   }
 
-  /** Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after)
-   * is chosen. */
+  /**
+   * Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after)
+   * is chosen. The match will be positioned according to {@code fragmentAlignment} as much as possible.
+   */
+  public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength,
+                                                              float fragmentAlignment) {
+    return new LengthGoalBreakIterator(baseIter, targetLength, fragmentAlignment, false, baseIter.current());
+  }
+
+  /** For backwards compatibility you can initialise the break iterator without fragmentAlignment.
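+   * Equivalent to a {@code fragmentAlignment} of zero, which aligns the match to the start of the fragment.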
+   */
+  @Deprecated
   public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength) {
-    return new LengthGoalBreakIterator(baseIter, targetLength, false);
+    return createClosestToLength(baseIter, targetLength, 0.f);
   }
 
-  private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, boolean isMinimumLength) {
+  private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, float fragmentAlignment,
+                                  boolean isMinimumLength, int currentCache) {
     this.baseIter = baseIter;
+    this.currentCache = currentCache;
     this.lengthGoal = lengthGoal;
+    if (fragmentAlignment < 0.f || fragmentAlignment > 1.f || !Float.isFinite(fragmentAlignment)) {
+      throw new IllegalArgumentException("fragmentAlignment must be >= zero and <= one");
+    }
+    this.fragmentAlignment = fragmentAlignment;
     this.isMinimumLength = isMinimumLength;
   }
 
@@ -60,12 +87,15 @@
   @Override
   public String toString() {
     String goalDesc = isMinimumLength ? "minLen" : "targetLen";
-    return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", baseIter=" + baseIter + "}";
+    return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", fragAlign=" + fragmentAlignment +
+        ", baseIter=" + baseIter + "}";
   }
 
   @Override
   public Object clone() {
-    return new LengthGoalBreakIterator((BreakIterator) baseIter.clone(), lengthGoal, isMinimumLength);
+    return new LengthGoalBreakIterator(
+        (BreakIterator) baseIter.clone(), lengthGoal, fragmentAlignment, isMinimumLength, currentCache
+    );
   }
 
   @Override
@@ -76,26 +106,28 @@
   @Override
   public void setText(String newText) {
     baseIter.setText(newText);
+    currentCache = baseIter.current();
   }
 
   @Override
   public void setText(CharacterIterator newText) {
     baseIter.setText(newText);
+    currentCache = baseIter.current();
   }
 
   @Override
   public int current() {
-    return baseIter.current();
+    return currentCache;
   }
 
   @Override
   public int first() {
-    return baseIter.first();
+    return currentCache = baseIter.first();
  }
 
   @Override
   public int last() {
-    return baseIter.last();
+    return currentCache = baseIter.last();
   }
 
   @Override
@@ -104,10 +136,10 @@
     return baseIter.next(n); // probably wrong
   }
 
-  // called by getSummaryPassagesNoHighlight to generate default summary.
+  // Called by getSummaryPassagesNoHighlight to generate default summary.
   @Override
   public int next() {
-    return following(current());
+    return following(currentCache, currentCache + lengthGoal);
   }
 
   @Override
@@ -116,65 +148,70 @@
     return baseIter.previous();
   }
 
-  // called while the current position is the start of a new passage; find end of passage
   @Override
-  public int following(int followingIdx) {
-    final int startIdx = current();
-    if (followingIdx < startIdx) {
-      assert false : "Not supported";
-      return baseIter.following(followingIdx);
-    }
-    final int targetIdx = startIdx + lengthGoal;
-    // When followingIdx >= targetIdx, we can simply delegate since it will be >= the target
-    if (followingIdx >= targetIdx - 1) {
-      return baseIter.following(followingIdx);
-    }
-    // If target exceeds the text length, return the last index.
-    if (targetIdx >= getText().getEndIndex()) {
-      return baseIter.last();
-    }
-
-    // Find closest break >= the target
-    final int afterIdx = baseIter.following(targetIdx - 1);
-    if (afterIdx == DONE) { // we're at the end; can this happen?
-      return current();
-    }
-    if (afterIdx == targetIdx) { // right on the money
-      return afterIdx;
-    }
-    if (isMinimumLength) { // thus never undershoot
-      return afterIdx;
-    }
-
-    // note: it is a shame that we invoke preceding() *in addition to* following(); BI's are sometimes expensive.
-
-    // Find closest break < target
-    final int beforeIdx = baseIter.preceding(targetIdx); // or could do baseIter.previous() but we hope the BI implements preceding()
-    if (beforeIdx <= followingIdx) { // too far back
-      return moveToBreak(afterIdx);
-    }
-
-    if (targetIdx - beforeIdx <= afterIdx - targetIdx) {
-      return beforeIdx;
-    }
-    return moveToBreak(afterIdx);
+  public int following(int matchEndIndex) {
+    return following(matchEndIndex, (matchEndIndex + 1) + (int)(lengthGoal * (1.f - fragmentAlignment)));
   }
 
-  private int moveToBreak(int idx) { // precondition: idx is a known break
-    // bi.isBoundary(idx) has side-effect of moving the position.  Not obvious!
-    //boolean moved = baseIter.isBoundary(idx); // probably not particularly expensive
-    //assert moved && current() == idx;
+  private int following(int matchEndIndex, int targetIdx) {
+    if (targetIdx >= getText().getEndIndex()) {
+      if (currentCache == baseIter.last()) {
+        return DONE;
+      }
+      return currentCache = baseIter.last();
+    }
+    final int afterIdx = baseIter.following(targetIdx - 1);
+    if (afterIdx == DONE) {
+      currentCache = baseIter.last();
+      return DONE;
+    }
+    if (afterIdx == targetIdx) { // right on the money
+      return currentCache = afterIdx;
+    }
+    if (isMinimumLength) { // thus never undershoot
+      return currentCache = afterIdx;
+    }
 
-    // TODO fix: Would prefer to do "- 1" instead of "- 2" but CustomSeparatorBreakIterator has a bug.
-    int current = baseIter.following(idx - 2);
-    assert current == idx : "following() didn't move us to the expected index.";
-    return idx;
+    // note: it is a shame that we invoke preceding() *one more time*; BI's are sometimes expensive.
+
+    // Find closest break to target
+    final int beforeIdx = baseIter.preceding(targetIdx);
+    if (targetIdx - beforeIdx < afterIdx - targetIdx && beforeIdx > matchEndIndex) {
+      return currentCache = beforeIdx;
+    }
+    return currentCache = afterIdx;
   }
 
   // called at start of new Passage given first word start offset
   @Override
-  public int preceding(int offset) {
-    return baseIter.preceding(offset); // no change needed
+  public int preceding(int matchStartIndex) {
+    final int targetIdx = (matchStartIndex - 1) - (int)(lengthGoal * fragmentAlignment);
+    if (targetIdx <= 0) {
+      if (currentCache == baseIter.first()) {
+        return DONE;
+      }
+      return currentCache = baseIter.first();
+    }
+    final int beforeIdx = baseIter.preceding(targetIdx + 1);
+    if (beforeIdx == DONE) {
+      currentCache = baseIter.first();
+      return DONE;
+    }
+    if (beforeIdx == targetIdx) { // right on the money
+      return currentCache = beforeIdx;
+    }
+    if (isMinimumLength) { // thus never undershoot
+      return currentCache = beforeIdx;
+    }
+
+    // note: it is a shame that we invoke following() *one more time*; BI's are sometimes expensive.
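+    // As in following(): prefer the break past the target only if it is closer and still precedes the match.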
+ + // Find closest break to target + final int afterIdx = baseIter.following(targetIdx - 1); + if (afterIdx - targetIdx < targetIdx - beforeIdx && afterIdx < matchStartIndex) { + return currentCache = afterIdx; + } + return currentCache = beforeIdx; } @Override diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java index 2434f5259e5..c2406637bd3 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java @@ -18,6 +18,7 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; +import java.text.BreakIterator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; @@ -28,6 +29,7 @@ import org.apache.lucene.util.QueryBuilder; public class LengthGoalBreakIteratorTest extends LuceneTestCase { private static final String FIELD = "body"; + private static final float[] ALIGNS = {0.f, 0.5f, 1.f}; // We test LengthGoalBreakIterator as it is used by the UnifiedHighlighter instead of directly, because it is // not a general purpose BreakIterator. A unit test of it directly wouldn't give as much confidence. @@ -39,65 +41,159 @@ public class LengthGoalBreakIteratorTest extends LuceneTestCase { // 0 1 // 01234567890123456789 static final String CONTENT = "Aa bb. Cc dd. Ee ff"; + static final String CONTENT2 = "Aa bb Cc dd X Ee ff Gg hh."; + static final String CONTENT3 = "Aa bbcc ddxyzee ffgg hh."; + + public void testFragmentAlignmentConstructor() throws IOException { + BreakIterator baseBI = new CustomSeparatorBreakIterator('.'); + // test fragmentAlignment validation + float[] valid_aligns = {0.f, 0.3333f, 0.5f, 0.99f, 1.f}; + for (float alignment : valid_aligns) { + LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment); + } + float[] invalid_aligns = {-0.01f, -1.f, 1.5f, Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY}; + for (float alignment : invalid_aligns) { + expectThrows(IllegalArgumentException.class, () -> { + LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment); + }); + } + // test backwards compatibility constructors + String backwardCompString = LengthGoalBreakIterator.createClosestToLength(baseBI, 50).toString(); + assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0")); + backwardCompString = LengthGoalBreakIterator.createMinLength(baseBI, 50).toString(); + assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0")); + } public void testTargetLen() throws IOException { // "goal" means target length goal to find closest break // at first word: Query query = query("aa"); - assertEquals("almost two sent", - "Aa bb.", highlightClosestToLen(CONTENT, query, 9)); - assertEquals( "barely two sent", - "Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 10)); - assertEquals("long goal", - "Aa bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20))); - + assertEquals("almost two sent A", + "Aa bb.", highlightClosestToLen(CONTENT, query, 7, 0.f)); + assertEquals("almost two sent B", + "Aa bb.", highlightClosestToLen(CONTENT, query, 15, 0.5f)); + assertEquals("almost two sent C", + "Aa bb.", highlightClosestToLen(CONTENT, query, 64, 1.f)); + assertEquals("barely two sent A", + "Aa bb. 
Cc dd.", highlightClosestToLen(CONTENT, query, 8, 0.f)); + assertEquals("barely two sent B", + "Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 16, 0.5f)); + assertEquals("long goal A", + "Aa bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 14 + random().nextInt(20), 0.f)); + assertEquals("long goal B", + "Aa bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 28 + random().nextInt(20), 0.5f)); // at some word not at start of passage query = query("dd"); - assertEquals("short goal", - " Cc dd.", highlightClosestToLen(CONTENT, query, random().nextInt(5))); - assertEquals("almost two sent", - " Cc dd.", highlightClosestToLen(CONTENT, query, 10)); - assertEquals("barely two sent", - " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 11)); - assertEquals("long goal", - " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 12 + random().nextInt(20))); + for (float align : ALIGNS) { + // alignment is not meaningful if fragsize is shorter than or closer to match-fragment boundaries + assertEquals("short goal " + align, + " Cc dd.", highlightClosestToLen(CONTENT, query, random().nextInt(4), align)); + } + // preceding/following inclusion by alignment parameter + assertEquals("barely two sent A", + " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.f)); + assertEquals("barely two sent B", + " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.5f)); + assertEquals("barely two sent C", + "Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 11, 1.f)); + assertEquals("long goal A", + " Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.f)); + assertEquals("long goal B", + "Aa bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.5f)); + assertEquals("long goal C", + "Aa bb. Cc dd.", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 1.f)); + + query = query("ddxyzee"); + assertEquals("test fragment search from the middle of the match; almost including", + "ddxyzee ", highlightClosestToLen(CONTENT3, query, 7, 0.5f, 1, ' ')); + assertEquals("test fragment search from the middle of the match; barely including", + "bbcc ddxyzee ffgg ", highlightClosestToLen(CONTENT3, query, 14, 0.5f, 1, ' ')); } public void testMinLen() throws IOException { // minLen mode is simpler than targetLen... just test a few cases Query query = query("dd"); - assertEquals("almost two sent", - " Cc dd.", highlightMinLen(CONTENT, query, 6)); - assertEquals("barely two sent", - " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 7)); + assertEquals("almost two sent A", + " Cc dd.", highlightMinLen(CONTENT, query, 0, 0.f)); + assertEquals("almost two sent B", + " Cc dd.", highlightMinLen(CONTENT, query, 1, 0.5f)); + assertEquals("almost two sent C", + " Cc dd.", highlightMinLen(CONTENT, query, 5, 1.f)); + + assertEquals("barely two sent A", + " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 1, 0.f)); + assertEquals("barely two sent B", + " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 2, 0.5f)); + assertEquals("barely two sent C", + "Aa bb. Cc dd.", highlightMinLen(CONTENT, query, 7, 1.f)); + assertEquals("barely two sent D/a", + " Cc dd.", highlightMinLen(CONTENT, query, 2, 0.55f)); + assertEquals("barely two sent D/b", + " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 3, 0.55f)); + assertEquals("barely two sent E/a", + " Cc dd. Ee ff", highlightMinLen(CONTENT, query, 10, 0.5f)); + assertEquals("barely two sent E/b", + "Aa bb. Cc dd. 
Ee ff", highlightMinLen(CONTENT, query, 10, 0.7f)); + assertEquals("barely two sent E/c", + "Aa bb. Cc dd.", highlightMinLen(CONTENT, query, 9, 0.9f)); + + query = query("ddxyzee"); + assertEquals("test fragment search from the middle of the match; almost including", + "ddxyzee ", highlightMinLen(CONTENT3, query, 7, 0.5f, ' ')); + assertEquals("test fragment search from the middle of the match; barely including", + "bbcc ddxyzee ffgg ", highlightMinLen(CONTENT3, query, 8, 0.5f, ' ')); + } + + public void testMinLenPrecision() throws IOException { + Query query = query("x"); + assertEquals("test absolute minimal length", + "X ", highlightMinLen(CONTENT2, query, 1, 0.5f, ' ')); + assertEquals("test slightly above minimal length", + "dd X Ee ", highlightMinLen(CONTENT2, query, 4, 0.5f, ' ')); } public void testDefaultSummaryTargetLen() throws IOException { Query query = query("zz"); - assertEquals("Aa bb.", - highlightClosestToLen(CONTENT, query, random().nextInt(10))); // < 10 + for (float align : ALIGNS) { // alignment is not used for creating default-summary + assertEquals("Aa bb.", + highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), align)); + assertEquals("Aa bb. Cc dd.", + highlightClosestToLen(CONTENT, query, 12 + random().nextInt(4), align)); + assertEquals("Aa bb. Cc dd. Ee ff", + highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), align)); + } assertEquals("Aa bb. Cc dd.", - highlightClosestToLen(CONTENT, query, 10 + 6)); // cusp of adding 3rd sentence - assertEquals("Aa bb. Cc dd. Ee ff", - highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20))); // >= 14 + highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), 0.f, 2)); } private Query query(String qStr) { return new QueryBuilder(analyzer).createBooleanQuery(FIELD, qStr); } - private String highlightClosestToLen(String content, Query query, int lengthGoal) throws IOException { - UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer); - highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator('.'), lengthGoal)); - return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString(); + private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException { + return highlightClosestToLen(content, query, lengthGoal, fragAlign, 1); } - private String highlightMinLen(String content, Query query, int lengthGoal) throws IOException { + private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages) throws IOException { + return highlightClosestToLen(content, query, lengthGoal, fragAlign, maxPassages, '.'); + } + + private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages, char separator) throws IOException { + UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer); + highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign)); + return highlighter.highlightWithoutSearcher(FIELD, query, content, maxPassages).toString(); + } + + private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException { + return highlightMinLen(content, query, lengthGoal, fragAlign, '.'); + } + + private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign, char separator) throws IOException { // 
     UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
-    highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator('.'), lengthGoal));
+    highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign));
     return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
   }
 }
\ No newline at end of file
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index f124612c148..a21cebadb73 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -184,6 +184,10 @@ Improvements
 
 * SOLR-14129: Reuse Jackson ObjectMapper in AuditLoggerPlugin (janhoy)
 
+* LUCENE-9093: The Unified highlighter has two new passage-sizing parameters, hl.fragAlignRatio and
+  hl.fragsizeIsMinimum, whose defaults aim to center matches in fragments better than previously. See the ref guide.
+  Regardless of the settings, passages may be sized differently than before. (Nándor Mátravölgyi, David Smiley)
+
 Optimizations
 ---------------------
 (No changes)
diff --git a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
index fc43687eaf6..90632146ecf 100644
--- a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
+++ b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
@@ -326,8 +326,11 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
     if (fragsize <= 1) { // no real minimum size
       return baseBI;
     }
-    return LengthGoalBreakIterator.createMinLength(baseBI, fragsize);
-    // TODO option for using createClosestToLength()
+    float fragalign = params.getFieldFloat(field, HighlightParams.FRAGALIGNRATIO, 0.5f);
+    if (params.getFieldBool(field, HighlightParams.FRAGSIZEISMINIMUM, false)) {
+      return LengthGoalBreakIterator.createMinLength(baseBI, fragsize, fragalign);
+    }
+    return LengthGoalBreakIterator.createClosestToLength(baseBI, fragsize, fragalign);
   }
 
   /**
diff --git a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
index ac3b7eaddcd..f6418cb8595 100644
--- a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
+++ b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
@@ -244,15 +244,24 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
     assertU(adoc("text", "This document contains # special characters, while the other document contains the same # special character.", "id", "103"));
     assertU(adoc("text", "While the other document contains the same # special character.", "id", "104"));
     assertU(commit());
-    assertQ("CUSTOM breakiterator", 
+    assertQ("CUSTOM breakiterator",
         req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
         "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains #'");
-    assertQ("different breakiterator", 
+    assertQ("different breakiterator",
         req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
         "//lst[@name='highlighting']/lst[@name='104']/arr[@name='text']/str='While the other document contains the same #'");
+    assertQ("CUSTOM breakiterator with fragsize 70 minimum",
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70", "hl.fragsizeIsMinimum", "true"),
"id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70", "hl.fragsizeIsMinimum", "true"), + "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains # special characters, while the other document contains the same #'"); assertQ("CUSTOM breakiterator with fragsize 70", req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70"), + "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains #'"); + assertQ("CUSTOM breakiterator with fragsize 90", + req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "90"), + "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains #'"); + assertQ("CUSTOM breakiterator with fragsize 100", + req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "100"), "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This document contains # special characters, while the other document contains the same #'"); } @@ -262,11 +271,17 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 { assertU(adoc("id", "10", "text", "This is a sentence just under seventy chars in length blah blah. Next sentence is here.")); assertU(commit()); assertQ("default fragsize", - req("q", "text:seventy", "hl", "true"), + req("q", "text:seventy", "hl", "true", "hl.fragsizeIsMinimum", "true"), "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. Next sentence is here.'"); - assertQ("smaller fragsize", - req("q", "text:seventy", "hl", "true", "hl.fragsize", "60"), // a bit smaller + assertQ("default fragsize", + req("q", "text:seventy", "hl", "true", "hl.fragsizeIsMinimum", "true", "hl.fragsize", "60"), "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. '"); + assertQ("smaller fragsize", + req("q", "text:seventy", "hl", "true"), + "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. '"); + assertQ("default fragsize", + req("q", "text:seventy", "hl", "true", "hl.fragsize", "90"), + "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under seventy chars in length blah blah. Next sentence is here.'"); } public void testEncoder() { diff --git a/solr/solr-ref-guide/src/highlighting.adoc b/solr/solr-ref-guide/src/highlighting.adoc index 8055b79d7de..ab6b5a6da0c 100644 --- a/solr/solr-ref-guide/src/highlighting.adoc +++ b/solr/solr-ref-guide/src/highlighting.adoc @@ -225,6 +225,14 @@ By default, the Unified Highlighter will usually pick the right offset source (s + The offset source can be explicitly configured to one of: `ANALYSIS`, `POSTINGS`, `POSTINGS_WITH_TERM_VECTORS`, or `TERM_VECTORS`. +`hl.fragAlignRatio`:: +Fragment alignment can influence where the match in a passage is positioned. This floating point value is used to break the `hl.fragsize` around the match. The default value of `0.5` means to align the match to the middle. A value of `0.0` would mean to align the match to the left, while `1.0` to align it to the right. _Note: there are situations where the requested alignment is not plausible. 
+
+`hl.fragsizeIsMinimum`::
+By default this parameter is `false`: the highlighter may return a fragment shorter than `hl.fragsize` if that length is closer to the goal than the next longer choice would be.
++
+If treating `hl.fragsize` as a strict minimum length is acceptable, setting this parameter to `true` has some performance benefits to consider.
+
 `hl.tag.ellipsis`::
 By default, each snippet is returned as a separate value (as is done with the other highlighters). Set this parameter to instead return one string with this text as the delimiter. _Note: this is likely to be removed in the future._
 
diff --git a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
index c4825b70627..c5d59800fb4 100644
--- a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
+++ b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
@@ -47,6 +47,8 @@ public interface HighlightParams {
 
   // sizing
   public static final String FRAGSIZE = HIGHLIGHT+".fragsize"; // OH, FVH, UH
+  public static final String FRAGSIZEISMINIMUM = HIGHLIGHT+".fragsizeIsMinimum"; // UH
+  public static final String FRAGALIGNRATIO = HIGHLIGHT+".fragAlignRatio"; // UH
   public static final String FRAGMENTER = HIGHLIGHT+".fragmenter"; // OH
   public static final String INCREMENT = HIGHLIGHT+".increment"; // OH
   public static final String REGEX = "regex"; // OH
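
---

Reviewer note: for anyone who wants to try the new Lucene-level API outside the test rig, here is a minimal standalone sketch. The factory signatures (createClosestToLength/createMinLength with the extra fragAlign float) and the expected passage spans are taken from the tests in this patch; the analyzer choice, field name, and import locations (these classes have moved between Lucene versions) are assumptions, not part of the patch.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.uhighlight.LengthGoalBreakIterator;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.util.QueryBuilder;

public class FragAlignDemo {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    String content = "Aa bb. Cc dd. Ee ff";
    Query query = new QueryBuilder(analyzer).createBooleanQuery("body", "dd");

    // Passage sized closest to 11 chars around the first match of "dd";
    // fragAlign 0.5f centers the match, so text on both sides is kept.
    // Per the "barely two sent B" test above, the passage spans " Cc dd. Ee ff"
    // (the match itself comes back wrapped in the highlighter's default tags).
    UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
    highlighter.setBreakIterator(() ->
        LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator('.'), 11, 0.5f));
    System.out.println(highlighter.highlightWithoutSearcher("body", query, content, 1));

    // Same text with the goal treated as a strict minimum; fragAlign 1.f
    // prefers preceding text, so per the "barely two sent C" test the
    // passage spans "Aa bb. Cc dd.".
    highlighter.setBreakIterator(() ->
        LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator('.'), 7, 1.f));
    System.out.println(highlighter.highlightWithoutSearcher("body", query, content, 1));
  }
}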
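On the Solr side, a hypothetical SolrJ request exercising the two new parameters could look like the sketch below. The query string and parameter values are placeholders; setting hl.method explicitly is only needed where unified is not already the default method. It mirrors the UnifiedSolrHighlighter logic above: hl.fragsizeIsMinimum=true selects createMinLength, otherwise createClosestToLength is used, and hl.fragAlignRatio is passed to either factory.

import org.apache.solr.client.solrj.SolrQuery;

public class FragAlignParamsDemo {
  public static void main(String[] args) {
    SolrQuery q = new SolrQuery("text:document");
    q.setHighlight(true);
    q.set("hl.method", "unified");
    q.set("hl.fragsize", "70");
    q.set("hl.fragAlignRatio", "0.5");     // default: center the first match
    q.set("hl.fragsizeIsMinimum", "true"); // treat fragsize as a strict minimum
    System.out.println(q); // prints the encoded request parameters
  }
}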