LUCENE-2098: speed up BaseCharFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@990161 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-08-27 14:33:22 +00:00
parent 2c8219cf43
commit 48dde8359f
2 changed files with 41 additions and 47 deletions

View File

@ -647,6 +647,9 @@ Optimizations
(getStrings, getStringIndex), consume quite a bit less RAM in most
cases. (Mike McCandless)
* LUCENE-2098: Improve the performance of BaseCharFilter, especially for
large documents. (Robin Wojciki, Koji Sekiguchi, Robert Muir)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -17,79 +17,70 @@
package org.apache.lucene.analysis.charfilter;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.util.ArrayUtil;
/**
* Base utility class for implementing a {@link CharFilter}.
* You subclass this, and then record mappings by calling
* {@link #addOffCorrectMap}, and then invoke the correct
* method to correct an offset.
*
* <p><b>NOTE</b>: This class is not particularly efficient.
* For example, a new class instance is created for every
* call to {@link #addOffCorrectMap}, which is then appended
* to a private list.
*/
public abstract class BaseCharFilter extends CharFilter {
private List<OffCorrectMap> pcmList;
private int offsets[];
private int diffs[];
private int size = 0;
public BaseCharFilter(CharStream in) {
super(in);
}
/** Retrieve the corrected offset. Note that this method
* is slow, if you correct positions far before the most
* recently added position, as it's a simple linear
* search backwards through all offset corrections added
* by {@link #addOffCorrectMap}. */
/** Retrieve the corrected offset. */
@Override
protected int correct(int currentOff) {
if (pcmList == null || pcmList.isEmpty()) {
if (offsets == null || currentOff < offsets[0]) {
return currentOff;
}
for (int i = pcmList.size() - 1; i >= 0; i--) {
if (currentOff >= pcmList.get(i).off) {
return currentOff + pcmList.get(i).cumulativeDiff;
}
int hi = size - 1;
if(currentOff >= offsets[hi])
return currentOff + diffs[hi];
int lo = 0;
int mid = -1;
while (hi >= lo) {
mid = (lo + hi) >>> 1;
if (currentOff < offsets[mid])
hi = mid - 1;
else if (currentOff > offsets[mid])
lo = mid + 1;
else
return currentOff + diffs[mid];
}
return currentOff;
if (currentOff < offsets[mid])
return mid == 0 ? currentOff : currentOff + diffs[mid-1];
else
return currentOff + diffs[mid];
}
protected int getLastCumulativeDiff() {
return pcmList == null || pcmList.isEmpty() ?
0 : pcmList.get(pcmList.size() - 1).cumulativeDiff;
return offsets == null ?
0 : diffs[size-1];
}
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (pcmList == null) {
pcmList = new ArrayList<OffCorrectMap>();
}
pcmList.add(new OffCorrectMap(off, cumulativeDiff));
}
static class OffCorrectMap {
int off;
int cumulativeDiff;
OffCorrectMap(int off, int cumulativeDiff) {
this.off = off;
this.cumulativeDiff = cumulativeDiff;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('(');
sb.append(off);
sb.append(',');
sb.append(cumulativeDiff);
sb.append(')');
return sb.toString();
if (offsets == null) {
offsets = new int[64];
diffs = new int[64];
} else if (size == offsets.length) {
offsets = ArrayUtil.grow(offsets);
diffs = ArrayUtil.grow(diffs);
}
offsets[size] = off;
diffs[size++] = cumulativeDiff;
}
}