From e56faa30973df613decd7ac817bf38dfedfe5f54 Mon Sep 17 00:00:00 2001
From: Doug Cutting <cutting@apache.org>
Date: Fri, 8 Oct 2004 15:58:49 +0000
Subject: [PATCH] Optimize term dictionary lookup to allocate fewer terms.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150581 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |   5 +
 .../apache/lucene/index/SegmentTermEnum.java  |  53 ++++----
 .../org/apache/lucene/index/TermBuffer.java   | 119 ++++++++++++++++++
 .../apache/lucene/index/TermInfosReader.java  |   4 +-
 4 files changed, 151 insertions(+), 30 deletions(-)
 create mode 100644 src/java/org/apache/lucene/index/TermBuffer.java

diff --git a/CHANGES.txt b/CHANGES.txt
index 5a4efac6a1e..4fde622983c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -97,6 +97,11 @@ $Id$
 21. Add a serializable Parameter Class to standardize parameter enum
     classes in BooleanClause and Field. (Christoph)
 
+22. Optimize term-dictionary lookup to allocate far fewer terms when
+    scanning for the matching term.  This speeds searches involving
+    low-frequency terms, where the cost of dictionary lookup can be
+    significant. (cutting)
+
 
 1.4.1
 
diff --git a/src/java/org/apache/lucene/index/SegmentTermEnum.java b/src/java/org/apache/lucene/index/SegmentTermEnum.java
index 22c26fee35b..9afd6b47a01 100644
--- a/src/java/org/apache/lucene/index/SegmentTermEnum.java
+++ b/src/java/org/apache/lucene/index/SegmentTermEnum.java
@@ -25,7 +25,10 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
   long size;
   long position = -1;
 
-  private Term term = new Term("", "");
+  private TermBuffer termBuffer = new TermBuffer();
+  private TermBuffer prevBuffer = new TermBuffer();
+  private TermBuffer scratch;                     // used for scanning
+
   private TermInfo termInfo = new TermInfo();
 
   private int format;
@@ -34,9 +37,6 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
   int indexInterval;
   int skipInterval;
   private int formatM1SkipInterval;
-  Term prev;
-
-  private char[] buffer = {};
 
   SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi)
           throws IOException {
@@ -89,7 +89,10 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
 
     clone.input = (IndexInput) input.clone();
     clone.termInfo = new TermInfo(termInfo);
-    if (term != null) clone.growBuffer(term.text.length());
+
+    clone.termBuffer = (TermBuffer)termBuffer.clone();
+    clone.prevBuffer = (TermBuffer)prevBuffer.clone();
+    clone.scratch = null;
 
     return clone;
   }
@@ -98,21 +101,20 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
           throws IOException {
     input.seek(pointer);
     position = p;
-    term = t;
-    prev = null;
+    termBuffer.set(t);
+    prevBuffer.reset();
     termInfo.set(ti);
-    growBuffer(term.text.length());		  // copy term text into buffer
   }
 
   /** Increments the enumeration to the next element.  True if one exists.*/
   public final boolean next() throws IOException {
     if (position++ >= size - 1) {
-      term = null;
+      termBuffer.reset();
       return false;
     }
 
-    prev = term;
-    term = readTerm();
+    prevBuffer.set(termBuffer);
+    termBuffer.read(input, fieldInfos);
 
     termInfo.docFreq = input.readVInt();	  // read doc freq
     termInfo.freqPointer += input.readVLong();	  // read freq pointer
@@ -138,28 +140,23 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
     return true;
   }
 
-  private final Term readTerm() throws IOException {
-    int start = input.readVInt();
-    int length = input.readVInt();
-    int totalLength = start + length;
-    if (buffer.length < totalLength)
-      growBuffer(totalLength);
-
-    input.readChars(buffer, start, length);
-    return new Term(fieldInfos.fieldName(input.readVInt()),
-            new String(buffer, 0, totalLength), false);
-  }
-
-  private final void growBuffer(int length) {
-    buffer = new char[length];
-    for (int i = 0; i < term.text.length(); i++)  // copy contents
-      buffer[i] = term.text.charAt(i);
+  /** Optimized scan, without allocating new terms. */
+  final void scanTo(Term term) throws IOException {
+    if (scratch == null)
+      scratch = new TermBuffer();
+    scratch.set(term);
+    while (scratch.compareTo(termBuffer) > 0 && next()) {}
   }
 
   /** Returns the current Term in the enumeration.
    Initially invalid, valid after next() called for the first time.*/
   public final Term term() {
-    return term;
+    return termBuffer.toTerm();
+  }
+
+  /** Returns the previous Term enumerated. Initially null.*/
+  final Term prev() {
+    return prevBuffer.toTerm();
   }
 
   /** Returns the current TermInfo in the enumeration.
diff --git a/src/java/org/apache/lucene/index/TermBuffer.java b/src/java/org/apache/lucene/index/TermBuffer.java
new file mode 100644
index 00000000000..3bc2ec4726e
--- /dev/null
+++ b/src/java/org/apache/lucene/index/TermBuffer.java
@@ -0,0 +1,119 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import org.apache.lucene.store.IndexInput;
+
+final class TermBuffer implements Cloneable {
+  private static final char[] NO_CHARS = new char[0];
+
+  private String field;
+  private char[] text = NO_CHARS;
+  private int textLength;
+  private Term term;                            // cached
+
+  public final int compareTo(TermBuffer other) {
+    if (field == other.field)			  // fields are interned
+      return compareChars(text, textLength, other.text, other.textLength);
+    else
+      return field.compareTo(other.field);
+  }
+
+  private static final int compareChars(char[] v1, int len1,
+                                        char[] v2, int len2) {
+    int end = Math.min(len1, len2);
+    for (int k = 0; k < end; k++) {
+      char c1 = v1[k];
+      char c2 = v2[k];
+      if (c1 != c2) {
+        return c1 - c2;
+      }
+    }
+    return len1 - len2;
+  }
+
+  private final void setTextLength(int newLength) {
+    if (text.length < newLength) {
+      char[] newText = new char[newLength];
+      System.arraycopy(text, 0, newText, 0, textLength);
+      text = newText;
+    }
+    textLength = newLength;
+  }
+
+  public final void read(IndexInput input, FieldInfos fieldInfos)
+    throws IOException {
+    this.term = null;                           // invalidate cache
+    int start = input.readVInt();
+    int length = input.readVInt();
+    int totalLength = start + length;
+    setTextLength(totalLength);
+    input.readChars(this.text, start, length);
+    this.field = fieldInfos.fieldName(input.readVInt());
+  }
+
+  public final void set(Term term) {
+    if (term == null) {
+      reset();
+      return;
+    }
+
+    // copy text into the buffer
+    setTextLength(term.text().length());
+    term.text().getChars(0, term.text().length(), text, 0);
+
+    this.field = term.field();
+    this.term = term;
+  }
+
+  public final void set(TermBuffer other) {
+    setTextLength(other.textLength);
+    System.arraycopy(other.text, 0, text, 0, textLength);
+
+    this.field = other.field;
+    this.term = other.term;
+  }
+
+  public void reset() {
+    this.field = null;
+    this.textLength = 0;
+    this.term = null;
+  }
+
+  public Term toTerm() {
+    if (field == null)                            // unset
+      return null;
+
+    if (term == null)
+      term = new Term(field, new String(text, 0, textLength), false);
+
+    return term;
+  }
+
+  protected Object clone() {
+    TermBuffer clone = null;
+    try {
+      clone = (TermBuffer)super.clone();
+    } catch (CloneNotSupportedException e) {}
+
+    clone.text = new char[text.length];
+    System.arraycopy(text, 0, clone.text, 0, textLength);
+
+    return clone;
+  }
+}
diff --git a/src/java/org/apache/lucene/index/TermInfosReader.java b/src/java/org/apache/lucene/index/TermInfosReader.java
index 17ac4667067..3f394132b49 100644
--- a/src/java/org/apache/lucene/index/TermInfosReader.java
+++ b/src/java/org/apache/lucene/index/TermInfosReader.java
@@ -129,7 +129,7 @@ final class TermInfosReader {
     // optimize sequential access: first try scanning cached enum w/o seeking
     SegmentTermEnum enumerator = getEnum();
     if (enumerator.term() != null                 // term is at or past current
-	&& ((enumerator.prev != null && term.compareTo(enumerator.prev) > 0)
+	&& ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
 	    || term.compareTo(enumerator.term()) >= 0)) {
       int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1;
       if (indexTerms.length == enumOffset	  // but before end of block
@@ -145,7 +145,7 @@ final class TermInfosReader {
   /** Scans within block for matching term. */
   private final TermInfo scanEnum(Term term) throws IOException {
     SegmentTermEnum enumerator = getEnum();
-    while (term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}
+    enumerator.scanTo(term);
     if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0)
       return enumerator.termInfo();
     else