From e56faa30973df613decd7ac817bf38dfedfe5f54 Mon Sep 17 00:00:00 2001 From: Doug Cutting Date: Fri, 8 Oct 2004 15:58:49 +0000 Subject: [PATCH] Optimize term dictionary lookup to allocate fewer terms. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150581 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 + .../apache/lucene/index/SegmentTermEnum.java | 53 ++++---- .../org/apache/lucene/index/TermBuffer.java | 119 ++++++++++++++++++ .../apache/lucene/index/TermInfosReader.java | 4 +- 4 files changed, 151 insertions(+), 30 deletions(-) create mode 100644 src/java/org/apache/lucene/index/TermBuffer.java diff --git a/CHANGES.txt b/CHANGES.txt index 5a4efac6a1e..4fde622983c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -97,6 +97,11 @@ $Id$ 21. Add a serializable Parameter Class to standardize parameter enum classes in BooleanClause and Field. (Christoph) +22. Optimize term-dictionary lookup to allocate far fewer terms when + scanning for the matching term. This speeds searches involving + low-frequency terms, where the cost of dictionary lookup can be + significant. (cutting) + 1.4.1 diff --git a/src/java/org/apache/lucene/index/SegmentTermEnum.java b/src/java/org/apache/lucene/index/SegmentTermEnum.java index 22c26fee35b..9afd6b47a01 100644 --- a/src/java/org/apache/lucene/index/SegmentTermEnum.java +++ b/src/java/org/apache/lucene/index/SegmentTermEnum.java @@ -25,7 +25,10 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { long size; long position = -1; - private Term term = new Term("", ""); + private TermBuffer termBuffer = new TermBuffer(); + private TermBuffer prevBuffer = new TermBuffer(); + private TermBuffer scratch; // used for scanning + private TermInfo termInfo = new TermInfo(); private int format; @@ -34,9 +37,6 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { int indexInterval; int skipInterval; private int formatM1SkipInterval; - Term prev; - - private char[] buffer = {}; SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi) throws IOException { @@ -89,7 +89,10 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { clone.input = (IndexInput) input.clone(); clone.termInfo = new TermInfo(termInfo); - if (term != null) clone.growBuffer(term.text.length()); + + clone.termBuffer = (TermBuffer)termBuffer.clone(); + clone.prevBuffer = (TermBuffer)prevBuffer.clone(); + clone.scratch = null; return clone; } @@ -98,21 +101,20 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { throws IOException { input.seek(pointer); position = p; - term = t; - prev = null; + termBuffer.set(t); + prevBuffer.reset(); termInfo.set(ti); - growBuffer(term.text.length()); // copy term text into buffer } /** Increments the enumeration to the next element. True if one exists.*/ public final boolean next() throws IOException { if (position++ >= size - 1) { - term = null; + termBuffer.reset(); return false; } - prev = term; - term = readTerm(); + prevBuffer.set(termBuffer); + termBuffer.read(input, fieldInfos); termInfo.docFreq = input.readVInt(); // read doc freq termInfo.freqPointer += input.readVLong(); // read freq pointer @@ -138,28 +140,23 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { return true; } - private final Term readTerm() throws IOException { - int start = input.readVInt(); - int length = input.readVInt(); - int totalLength = start + length; - if (buffer.length < totalLength) - growBuffer(totalLength); - - input.readChars(buffer, start, length); - return new Term(fieldInfos.fieldName(input.readVInt()), - new String(buffer, 0, totalLength), false); - } - - private final void growBuffer(int length) { - buffer = new char[length]; - for (int i = 0; i < term.text.length(); i++) // copy contents - buffer[i] = term.text.charAt(i); + /** Optimized scan, without allocating new terms. */ + final void scanTo(Term term) throws IOException { + if (scratch == null) + scratch = new TermBuffer(); + scratch.set(term); + while (scratch.compareTo(termBuffer) > 0 && next()) {} } /** Returns the current Term in the enumeration. Initially invalid, valid after next() called for the first time.*/ public final Term term() { - return term; + return termBuffer.toTerm(); + } + + /** Returns the previous Term enumerated. Initially null.*/ + final Term prev() { + return prevBuffer.toTerm(); } /** Returns the current TermInfo in the enumeration. diff --git a/src/java/org/apache/lucene/index/TermBuffer.java b/src/java/org/apache/lucene/index/TermBuffer.java new file mode 100644 index 00000000000..3bc2ec4726e --- /dev/null +++ b/src/java/org/apache/lucene/index/TermBuffer.java @@ -0,0 +1,119 @@ +package org.apache.lucene.index; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import org.apache.lucene.store.IndexInput; + +final class TermBuffer implements Cloneable { + private static final char[] NO_CHARS = new char[0]; + + private String field; + private char[] text = NO_CHARS; + private int textLength; + private Term term; // cached + + public final int compareTo(TermBuffer other) { + if (field == other.field) // fields are interned + return compareChars(text, textLength, other.text, other.textLength); + else + return field.compareTo(other.field); + } + + private static final int compareChars(char[] v1, int len1, + char[] v2, int len2) { + int end = Math.min(len1, len2); + for (int k = 0; k < end; k++) { + char c1 = v1[k]; + char c2 = v2[k]; + if (c1 != c2) { + return c1 - c2; + } + } + return len1 - len2; + } + + private final void setTextLength(int newLength) { + if (text.length < newLength) { + char[] newText = new char[newLength]; + System.arraycopy(text, 0, newText, 0, textLength); + text = newText; + } + textLength = newLength; + } + + public final void read(IndexInput input, FieldInfos fieldInfos) + throws IOException { + this.term = null; // invalidate cache + int start = input.readVInt(); + int length = input.readVInt(); + int totalLength = start + length; + setTextLength(totalLength); + input.readChars(this.text, start, length); + this.field = fieldInfos.fieldName(input.readVInt()); + } + + public final void set(Term term) { + if (term == null) { + reset(); + return; + } + + // copy text into the buffer + setTextLength(term.text().length()); + term.text().getChars(0, term.text().length(), text, 0); + + this.field = term.field(); + this.term = term; + } + + public final void set(TermBuffer other) { + setTextLength(other.textLength); + System.arraycopy(other.text, 0, text, 0, textLength); + + this.field = other.field; + this.term = other.term; + } + + public void reset() { + this.field = null; + this.textLength = 0; + this.term = null; + } + + public Term toTerm() { + if (field == null) // unset + return null; + + if (term == null) + term = new Term(field, new String(text, 0, textLength), false); + + return term; + } + + protected Object clone() { + TermBuffer clone = null; + try { + clone = (TermBuffer)super.clone(); + } catch (CloneNotSupportedException e) {} + + clone.text = new char[text.length]; + System.arraycopy(text, 0, clone.text, 0, textLength); + + return clone; + } +} diff --git a/src/java/org/apache/lucene/index/TermInfosReader.java b/src/java/org/apache/lucene/index/TermInfosReader.java index 17ac4667067..3f394132b49 100644 --- a/src/java/org/apache/lucene/index/TermInfosReader.java +++ b/src/java/org/apache/lucene/index/TermInfosReader.java @@ -129,7 +129,7 @@ final class TermInfosReader { // optimize sequential access: first try scanning cached enum w/o seeking SegmentTermEnum enumerator = getEnum(); if (enumerator.term() != null // term is at or past current - && ((enumerator.prev != null && term.compareTo(enumerator.prev) > 0) + && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) || term.compareTo(enumerator.term()) >= 0)) { int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1; if (indexTerms.length == enumOffset // but before end of block @@ -145,7 +145,7 @@ final class TermInfosReader { /** Scans within block for matching term. */ private final TermInfo scanEnum(Term term) throws IOException { SegmentTermEnum enumerator = getEnum(); - while (term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} + enumerator.scanTo(term); if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) return enumerator.termInfo(); else