diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1951c442b87..3e09bff7782 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -647,6 +647,9 @@ Optimizations
(getStrings, getStringIndex), consume quite a bit less RAM in most
cases. (Mike McCandless)
+* LUCENE-2098: Improve the performance of BaseCharFilter, especially for
+ large documents. (Robin Wojciki, Koji Sekiguchi, Robert Muir)
+
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
index b5357add3a0..0441ac378ed 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
@@ -17,79 +17,70 @@
package org.apache.lucene.analysis.charfilter;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.util.ArrayUtil;
/**
* Base utility class for implementing a {@link CharFilter}.
* You subclass this, and then record mappings by calling
* {@link #addOffCorrectMap}, and then invoke the correct
* method to correct an offset.
- *
- *
NOTE: This class is not particularly efficient.
- * For example, a new class instance is created for every
- * call to {@link #addOffCorrectMap}, which is then appended
- * to a private list.
*/
public abstract class BaseCharFilter extends CharFilter {
- private List pcmList;
+ private int offsets[];
+ private int diffs[];
+ private int size = 0;
public BaseCharFilter(CharStream in) {
super(in);
}
- /** Retrieve the corrected offset. Note that this method
- * is slow, if you correct positions far before the most
- * recently added position, as it's a simple linear
- * search backwards through all offset corrections added
- * by {@link #addOffCorrectMap}. */
+ /** Retrieve the corrected offset. */
@Override
protected int correct(int currentOff) {
- if (pcmList == null || pcmList.isEmpty()) {
+ if (offsets == null || currentOff < offsets[0]) {
return currentOff;
}
- for (int i = pcmList.size() - 1; i >= 0; i--) {
- if (currentOff >= pcmList.get(i).off) {
- return currentOff + pcmList.get(i).cumulativeDiff;
- }
+
+ int hi = size - 1;
+ if(currentOff >= offsets[hi])
+ return currentOff + diffs[hi];
+
+ int lo = 0;
+ int mid = -1;
+
+ while (hi >= lo) {
+ mid = (lo + hi) >>> 1;
+ if (currentOff < offsets[mid])
+ hi = mid - 1;
+ else if (currentOff > offsets[mid])
+ lo = mid + 1;
+ else
+ return currentOff + diffs[mid];
}
- return currentOff;
+
+ if (currentOff < offsets[mid])
+ return mid == 0 ? currentOff : currentOff + diffs[mid-1];
+ else
+ return currentOff + diffs[mid];
}
protected int getLastCumulativeDiff() {
- return pcmList == null || pcmList.isEmpty() ?
- 0 : pcmList.get(pcmList.size() - 1).cumulativeDiff;
+ return offsets == null ?
+ 0 : diffs[size-1];
}
protected void addOffCorrectMap(int off, int cumulativeDiff) {
- if (pcmList == null) {
- pcmList = new ArrayList();
- }
- pcmList.add(new OffCorrectMap(off, cumulativeDiff));
- }
-
- static class OffCorrectMap {
-
- int off;
- int cumulativeDiff;
-
- OffCorrectMap(int off, int cumulativeDiff) {
- this.off = off;
- this.cumulativeDiff = cumulativeDiff;
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append('(');
- sb.append(off);
- sb.append(',');
- sb.append(cumulativeDiff);
- sb.append(')');
- return sb.toString();
+ if (offsets == null) {
+ offsets = new int[64];
+ diffs = new int[64];
+ } else if (size == offsets.length) {
+ offsets = ArrayUtil.grow(offsets);
+ diffs = ArrayUtil.grow(diffs);
}
+
+ offsets[size] = off;
+ diffs[size++] = cumulativeDiff;
}
}