Deduplicate min and max term in single-term FieldReader (#13618)

I noticed that single-term readers are an edge case but not that uncommon in Elasticsearch heap dumps. It seems quite common to have a constant value for some field across a complete segment (e.g. a version value that is repeated endlessly in logs). Seems simple enough to deduplicate here to save a couple MB of heap.
2025-03-06 08:19:23 +00:00 · 2024-07-31 20:38:21 +02:00 · 2024-07-31 20:38:21 +02:00 · 47650a4314
commit 47650a4314
parent ca098e63b9
2 changed files with 5 additions and 2 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java
@ -200,6 +200,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
            final int docCount = metaIn.readVInt();
            BytesRef minTerm = readBytesRef(metaIn);
            BytesRef maxTerm = readBytesRef(metaIn);
+            if (numTerms == 1) {
+              assert maxTerm.equals(minTerm);
+              // save heap for edge case of a single term only so min == max
+              maxTerm = minTerm;
+            }
            if (docCount < 0
                || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
              throw new CorruptIndexException(
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java
@ -598,8 +598,6 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
  private final ByteBuffersDataOutput scratchBytes = ByteBuffersDataOutput.newResettableInstance();
  private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();

-  static final BytesRef EMPTY_BYTES_REF = new BytesRef();
-
  private static class StatsWriter {

    private final DataOutput out;