mirror of
https://github.com/apache/lucene.git
synced 2025-02-08 02:58:58 +00:00
LUCENE-2743: speed up SimpleText codec
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1031832 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
53a62306ff
commit
740d556ece
@ -33,7 +33,11 @@ import org.apache.lucene.util.StringHelper;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import java.util.SortedMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
class SimpleTextFieldsReader extends FieldsProducer {
|
class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
|
|
||||||
@ -78,7 +82,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||||||
private class SimpleTextFieldsEnum extends FieldsEnum {
|
private class SimpleTextFieldsEnum extends FieldsEnum {
|
||||||
private final IndexInput in;
|
private final IndexInput in;
|
||||||
private final BytesRef scratch = new BytesRef(10);
|
private final BytesRef scratch = new BytesRef(10);
|
||||||
private boolean omitTF;
|
private String current;
|
||||||
|
|
||||||
public SimpleTextFieldsEnum() {
|
public SimpleTextFieldsEnum() {
|
||||||
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||||
@ -89,11 +93,12 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||||||
while(true) {
|
while(true) {
|
||||||
readLine(in, scratch);
|
readLine(in, scratch);
|
||||||
if (scratch.equals(END)) {
|
if (scratch.equals(END)) {
|
||||||
|
current = null;
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
if (scratch.startsWith(FIELD)) {
|
if (scratch.startsWith(FIELD)) {
|
||||||
String field = StringHelper.intern(new String(scratch.bytes, scratch.offset + FIELD.length, scratch.length - FIELD.length, "UTF-8"));
|
String field = StringHelper.intern(new String(scratch.bytes, scratch.offset + FIELD.length, scratch.length - FIELD.length, "UTF-8"));
|
||||||
omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions;
|
current = field;
|
||||||
return field;
|
return field;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -101,7 +106,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TermsEnum terms() throws IOException {
|
public TermsEnum terms() throws IOException {
|
||||||
return new SimpleTextTermsEnum(in.getFilePointer(), omitTF);
|
return SimpleTextFieldsReader.this.terms(current).iterator();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,21 +114,42 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||||||
private final IndexInput in;
|
private final IndexInput in;
|
||||||
private final boolean omitTF;
|
private final boolean omitTF;
|
||||||
private BytesRef current;
|
private BytesRef current;
|
||||||
private final long fieldStart;
|
|
||||||
private final BytesRef scratch = new BytesRef(10);
|
|
||||||
private final BytesRef scratch2 = new BytesRef(10);
|
|
||||||
private int docFreq;
|
private int docFreq;
|
||||||
private long docsStart;
|
private long docsStart;
|
||||||
private boolean ended;
|
private boolean ended;
|
||||||
|
private final TreeMap<BytesRef,TermData> allTerms;
|
||||||
|
private Iterator<Map.Entry<BytesRef,TermData>> iter;
|
||||||
|
|
||||||
public SimpleTextTermsEnum(long offset, boolean omitTF) throws IOException {
|
public SimpleTextTermsEnum(TreeMap<BytesRef,TermData> allTerms, boolean omitTF) throws IOException {
|
||||||
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||||
this.in.seek(offset);
|
this.allTerms = allTerms;
|
||||||
this.omitTF = omitTF;
|
this.omitTF = omitTF;
|
||||||
fieldStart = offset;
|
iter = allTerms.entrySet().iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
|
public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {
|
||||||
|
|
||||||
|
final SortedMap<BytesRef,TermData> tailMap = allTerms.tailMap(text);
|
||||||
|
|
||||||
|
if (tailMap.isEmpty()) {
|
||||||
|
current = null;
|
||||||
|
return SeekStatus.END;
|
||||||
|
} else {
|
||||||
|
current = tailMap.firstKey();
|
||||||
|
final TermData td = tailMap.get(current);
|
||||||
|
docsStart = td.docsStart;
|
||||||
|
docFreq = td.docFreq;
|
||||||
|
iter = tailMap.entrySet().iterator();
|
||||||
|
assert iter.hasNext();
|
||||||
|
iter.next();
|
||||||
|
if (current.equals(text)) {
|
||||||
|
return SeekStatus.FOUND;
|
||||||
|
} else {
|
||||||
|
return SeekStatus.NOT_FOUND;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
if (current != null) {
|
if (current != null) {
|
||||||
final int cmp = current.compareTo(text);
|
final int cmp = current.compareTo(text);
|
||||||
if (cmp == 0) {
|
if (cmp == 0) {
|
||||||
@ -153,6 +179,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||||||
current = null;
|
current = null;
|
||||||
ended = true;
|
ended = true;
|
||||||
return SeekStatus.END;
|
return SeekStatus.END;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -162,6 +189,20 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||||||
@Override
|
@Override
|
||||||
public BytesRef next() throws IOException {
|
public BytesRef next() throws IOException {
|
||||||
assert !ended;
|
assert !ended;
|
||||||
|
|
||||||
|
if (iter.hasNext()) {
|
||||||
|
Map.Entry<BytesRef,TermData> ent = iter.next();
|
||||||
|
current = ent.getKey();
|
||||||
|
TermData td = ent.getValue();
|
||||||
|
docFreq = td.docFreq;
|
||||||
|
docsStart = td.docsStart;
|
||||||
|
return current;
|
||||||
|
} else {
|
||||||
|
current = null;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
readLine(in, scratch);
|
readLine(in, scratch);
|
||||||
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
|
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
|
||||||
ended = true;
|
ended = true;
|
||||||
@ -192,6 +233,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||||||
in.seek(lineStart);
|
in.seek(lineStart);
|
||||||
return current;
|
return current;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -447,20 +489,70 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static class TermData {
|
||||||
|
public long docsStart;
|
||||||
|
public int docFreq;
|
||||||
|
|
||||||
|
public TermData(long docsStart, int docFreq) {
|
||||||
|
this.docsStart = docsStart;
|
||||||
|
this.docFreq = docFreq;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private class SimpleTextTerms extends Terms {
|
private class SimpleTextTerms extends Terms {
|
||||||
private final String field;
|
private final String field;
|
||||||
private final long termsStart;
|
private final long termsStart;
|
||||||
private final boolean omitTF;
|
private final boolean omitTF;
|
||||||
|
|
||||||
public SimpleTextTerms(String field, long termsStart) {
|
// NOTE: horribly, horribly RAM consuming, but then
|
||||||
|
// SimpleText should never be used in production
|
||||||
|
private final TreeMap<BytesRef,TermData> allTerms = new TreeMap<BytesRef,TermData>();
|
||||||
|
|
||||||
|
private final BytesRef scratch = new BytesRef(10);
|
||||||
|
|
||||||
|
public SimpleTextTerms(String field, long termsStart) throws IOException {
|
||||||
this.field = StringHelper.intern(field);
|
this.field = StringHelper.intern(field);
|
||||||
this.termsStart = termsStart;
|
this.termsStart = termsStart;
|
||||||
omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions;
|
omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions;
|
||||||
|
loadTerms();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadTerms() throws IOException {
|
||||||
|
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||||
|
in.seek(termsStart);
|
||||||
|
final BytesRef lastTerm = new BytesRef(10);
|
||||||
|
long lastDocsStart = -1;
|
||||||
|
int docFreq = 0;
|
||||||
|
while(true) {
|
||||||
|
readLine(in, scratch);
|
||||||
|
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
|
||||||
|
if (lastDocsStart != -1) {
|
||||||
|
allTerms.put(new BytesRef(lastTerm),
|
||||||
|
new TermData(lastDocsStart, docFreq));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
} else if (scratch.startsWith(DOC)) {
|
||||||
|
docFreq++;
|
||||||
|
} else if (scratch.startsWith(TERM)) {
|
||||||
|
if (lastDocsStart != -1) {
|
||||||
|
allTerms.put(new BytesRef(lastTerm),
|
||||||
|
new TermData(lastDocsStart, docFreq));
|
||||||
|
}
|
||||||
|
lastDocsStart = in.getFilePointer();
|
||||||
|
final int len = scratch.length - TERM.length;
|
||||||
|
if (len > lastTerm.length) {
|
||||||
|
lastTerm.grow(len);
|
||||||
|
}
|
||||||
|
System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
|
||||||
|
lastTerm.length = len;
|
||||||
|
docFreq = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TermsEnum iterator() throws IOException {
|
public TermsEnum iterator() throws IOException {
|
||||||
return new SimpleTextTermsEnum(termsStart, omitTF);
|
return new SimpleTextTermsEnum(allTerms, omitTF);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
Loading…
x
Reference in New Issue
Block a user