LUCENE-2098: speed up BaseCharFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@990161 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-08-27 14:33:22 +00:00
parent 2c8219cf43
commit 48dde8359f
2 changed files with 41 additions and 47 deletions

View File

@ -647,6 +647,9 @@ Optimizations
(getStrings, getStringIndex), consume quite a bit less RAM in most
cases. (Mike McCandless)
* LUCENE-2098: Improve the performance of BaseCharFilter, especially for
large documents. (Robin Wojciki, Koji Sekiguchi, Robert Muir)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -17,79 +17,70 @@
package org.apache.lucene.analysis.charfilter; package org.apache.lucene.analysis.charfilter;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.util.ArrayUtil;
/** /**
* Base utility class for implementing a {@link CharFilter}. * Base utility class for implementing a {@link CharFilter}.
* You subclass this, and then record mappings by calling * You subclass this, and then record mappings by calling
* {@link #addOffCorrectMap}, and then invoke the correct * {@link #addOffCorrectMap}, and then invoke the correct
* method to correct an offset. * method to correct an offset.
*
* <p><b>NOTE</b>: This class is not particularly efficient.
* For example, a new class instance is created for every
* call to {@link #addOffCorrectMap}, which is then appended
* to a private list.
*/ */
public abstract class BaseCharFilter extends CharFilter { public abstract class BaseCharFilter extends CharFilter {
private List<OffCorrectMap> pcmList; private int offsets[];
private int diffs[];
private int size = 0;
public BaseCharFilter(CharStream in) { public BaseCharFilter(CharStream in) {
super(in); super(in);
} }
/** Retrieve the corrected offset. Note that this method /** Retrieve the corrected offset. */
* is slow, if you correct positions far before the most
* recently added position, as it's a simple linear
* search backwards through all offset corrections added
* by {@link #addOffCorrectMap}. */
@Override @Override
protected int correct(int currentOff) { protected int correct(int currentOff) {
if (pcmList == null || pcmList.isEmpty()) { if (offsets == null || currentOff < offsets[0]) {
return currentOff; return currentOff;
} }
for (int i = pcmList.size() - 1; i >= 0; i--) {
if (currentOff >= pcmList.get(i).off) { int hi = size - 1;
return currentOff + pcmList.get(i).cumulativeDiff; if(currentOff >= offsets[hi])
} return currentOff + diffs[hi];
int lo = 0;
int mid = -1;
while (hi >= lo) {
mid = (lo + hi) >>> 1;
if (currentOff < offsets[mid])
hi = mid - 1;
else if (currentOff > offsets[mid])
lo = mid + 1;
else
return currentOff + diffs[mid];
} }
return currentOff;
if (currentOff < offsets[mid])
return mid == 0 ? currentOff : currentOff + diffs[mid-1];
else
return currentOff + diffs[mid];
} }
protected int getLastCumulativeDiff() { protected int getLastCumulativeDiff() {
return pcmList == null || pcmList.isEmpty() ? return offsets == null ?
0 : pcmList.get(pcmList.size() - 1).cumulativeDiff; 0 : diffs[size-1];
} }
protected void addOffCorrectMap(int off, int cumulativeDiff) { protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (pcmList == null) { if (offsets == null) {
pcmList = new ArrayList<OffCorrectMap>(); offsets = new int[64];
} diffs = new int[64];
pcmList.add(new OffCorrectMap(off, cumulativeDiff)); } else if (size == offsets.length) {
} offsets = ArrayUtil.grow(offsets);
diffs = ArrayUtil.grow(diffs);
static class OffCorrectMap {
int off;
int cumulativeDiff;
OffCorrectMap(int off, int cumulativeDiff) {
this.off = off;
this.cumulativeDiff = cumulativeDiff;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('(');
sb.append(off);
sb.append(',');
sb.append(cumulativeDiff);
sb.append(')');
return sb.toString();
} }
offsets[size] = off;
diffs[size++] = cumulativeDiff;
} }
} }