diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java index 14c72b8a919..443be686892 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java @@ -33,7 +33,11 @@ import org.apache.lucene.util.StringHelper; import java.io.IOException; import java.util.Comparator; import java.util.Map; +import java.util.Set; import java.util.HashMap; +import java.util.TreeMap; +import java.util.SortedMap; +import java.util.Iterator; class SimpleTextFieldsReader extends FieldsProducer { @@ -78,7 +82,7 @@ class SimpleTextFieldsReader extends FieldsProducer { private class SimpleTextFieldsEnum extends FieldsEnum { private final IndexInput in; private final BytesRef scratch = new BytesRef(10); - private boolean omitTF; + private String current; public SimpleTextFieldsEnum() { this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); @@ -89,11 +93,12 @@ class SimpleTextFieldsReader extends FieldsProducer { while(true) { readLine(in, scratch); if (scratch.equals(END)) { + current = null; return null; } if (scratch.startsWith(FIELD)) { String field = StringHelper.intern(new String(scratch.bytes, scratch.offset + FIELD.length, scratch.length - FIELD.length, "UTF-8")); - omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions; + current = field; return field; } } @@ -101,7 +106,7 @@ class SimpleTextFieldsReader extends FieldsProducer { @Override public TermsEnum terms() throws IOException { - return new SimpleTextTermsEnum(in.getFilePointer(), omitTF); + return SimpleTextFieldsReader.this.terms(current).iterator(); } } @@ -109,21 +114,42 @@ class SimpleTextFieldsReader extends FieldsProducer { private final IndexInput in; private final boolean omitTF; private BytesRef current; - private final long fieldStart; - private final BytesRef scratch = new BytesRef(10); - private final BytesRef scratch2 = new BytesRef(10); private int docFreq; private long docsStart; private boolean ended; + private final TreeMap allTerms; + private Iterator> iter; - public SimpleTextTermsEnum(long offset, boolean omitTF) throws IOException { + public SimpleTextTermsEnum(TreeMap allTerms, boolean omitTF) throws IOException { this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); - this.in.seek(offset); + this.allTerms = allTerms; this.omitTF = omitTF; - fieldStart = offset; + iter = allTerms.entrySet().iterator(); } public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException { + + final SortedMap tailMap = allTerms.tailMap(text); + + if (tailMap.isEmpty()) { + current = null; + return SeekStatus.END; + } else { + current = tailMap.firstKey(); + final TermData td = tailMap.get(current); + docsStart = td.docsStart; + docFreq = td.docFreq; + iter = tailMap.entrySet().iterator(); + assert iter.hasNext(); + iter.next(); + if (current.equals(text)) { + return SeekStatus.FOUND; + } else { + return SeekStatus.NOT_FOUND; + } + } + + /* if (current != null) { final int cmp = current.compareTo(text); if (cmp == 0) { @@ -153,6 +179,7 @@ class SimpleTextFieldsReader extends FieldsProducer { current = null; ended = true; return SeekStatus.END; + */ } @Override @@ -162,6 +189,20 @@ class SimpleTextFieldsReader extends FieldsProducer { @Override public BytesRef next() throws IOException { assert !ended; + + if (iter.hasNext()) { + Map.Entry ent = iter.next(); + current = ent.getKey(); + TermData td = ent.getValue(); + docFreq = td.docFreq; + docsStart = td.docsStart; + return current; + } else { + current = null; + return null; + } + + /* readLine(in, scratch); if (scratch.equals(END) || scratch.startsWith(FIELD)) { ended = true; @@ -192,6 +233,7 @@ class SimpleTextFieldsReader extends FieldsProducer { in.seek(lineStart); return current; } + */ } @Override @@ -447,20 +489,70 @@ class SimpleTextFieldsReader extends FieldsProducer { } } + static class TermData { + public long docsStart; + public int docFreq; + + public TermData(long docsStart, int docFreq) { + this.docsStart = docsStart; + this.docFreq = docFreq; + } + } + private class SimpleTextTerms extends Terms { private final String field; private final long termsStart; private final boolean omitTF; - public SimpleTextTerms(String field, long termsStart) { + // NOTE: horribly, horribly RAM consuming, but then + // SimpleText should never be used in production + private final TreeMap allTerms = new TreeMap(); + + private final BytesRef scratch = new BytesRef(10); + + public SimpleTextTerms(String field, long termsStart) throws IOException { this.field = StringHelper.intern(field); this.termsStart = termsStart; omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions; + loadTerms(); + } + + private void loadTerms() throws IOException { + IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); + in.seek(termsStart); + final BytesRef lastTerm = new BytesRef(10); + long lastDocsStart = -1; + int docFreq = 0; + while(true) { + readLine(in, scratch); + if (scratch.equals(END) || scratch.startsWith(FIELD)) { + if (lastDocsStart != -1) { + allTerms.put(new BytesRef(lastTerm), + new TermData(lastDocsStart, docFreq)); + } + break; + } else if (scratch.startsWith(DOC)) { + docFreq++; + } else if (scratch.startsWith(TERM)) { + if (lastDocsStart != -1) { + allTerms.put(new BytesRef(lastTerm), + new TermData(lastDocsStart, docFreq)); + } + lastDocsStart = in.getFilePointer(); + final int len = scratch.length - TERM.length; + if (len > lastTerm.length) { + lastTerm.grow(len); + } + System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len); + lastTerm.length = len; + docFreq = 0; + } + } } @Override public TermsEnum iterator() throws IOException { - return new SimpleTextTermsEnum(termsStart, omitTF); + return new SimpleTextTermsEnum(allTerms, omitTF); } @Override