LUCENE-4854: DocTermOrds getOrdTermsEnum() buggy, lookupTerm/termsEnum is slow

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458303 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-03-19 14:21:16 +00:00
parent 52d2d3fd6f
commit f7ad4045e2
4 changed files with 118 additions and 9 deletions

View File

@ -145,11 +145,17 @@ Bug Fixes
* LUCENE-4826: PostingsHighlighter was not returning the top N best * LUCENE-4826: PostingsHighlighter was not returning the top N best
scoring passages. (Robert Muir, Mike McCandless) scoring passages. (Robert Muir, Mike McCandless)
* LUCENE-4854: Fix DocTermOrds.getOrdTermsEnum() to not return negative
ord on initial next(). (Robert Muir)
Optimizations Optimizations
* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the * LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the
default codec for improved enumeration performance. (Robert Muir) default codec for improved enumeration performance. (Robert Muir)
* LUCENE-4854: Speed up TermsEnum of FieldCache.getDocTermOrds.
(Robert Muir)
======================= Lucene 4.2.0 ======================= ======================= Lucene 4.2.0 =======================
Changes in backwards compatibility policy Changes in backwards compatibility policy

View File

@ -24,6 +24,7 @@ import java.util.Comparator;
import java.util.List; import java.util.List;
import org.apache.lucene.codecs.PostingsFormat; // javadocs import org.apache.lucene.codecs.PostingsFormat; // javadocs
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -632,7 +633,9 @@ public class DocTermOrds {
@Override @Override
public BytesRef next() throws IOException { public BytesRef next() throws IOException {
ord++; if (++ord < 0) {
ord = 0;
}
if (termsEnum.next() == null) { if (termsEnum.next() == null) {
term = null; term = null;
return null; return null;
@ -763,16 +766,17 @@ public class DocTermOrds {
} }
/** Returns a SortedSetDocValues view of this instance */ /** Returns a SortedSetDocValues view of this instance */
public SortedSetDocValues iterator(TermsEnum termsEnum) throws IOException { public SortedSetDocValues iterator(AtomicReader reader) throws IOException {
if (isEmpty()) { if (isEmpty()) {
return SortedSetDocValues.EMPTY; return SortedSetDocValues.EMPTY;
} else { } else {
return new Iterator(termsEnum); return new Iterator(reader);
} }
} }
private class Iterator extends SortedSetDocValues { private class Iterator extends SortedSetDocValues {
final TermsEnum te; final AtomicReader reader;
final TermsEnum te; // used internally for lookupOrd() and lookupTerm()
// currently we read 5 at a time (using the logic of the old iterator) // currently we read 5 at a time (using the logic of the old iterator)
final int buffer[] = new int[5]; final int buffer[] = new int[5];
int bufferUpto; int bufferUpto;
@ -782,8 +786,9 @@ public class DocTermOrds {
private int upto; private int upto;
private byte[] arr; private byte[] arr;
Iterator(TermsEnum te) { Iterator(AtomicReader reader) throws IOException {
this.te = te; this.reader = reader;
this.te = termsEnum();
} }
@Override @Override
@ -880,5 +885,27 @@ public class DocTermOrds {
public long getValueCount() { public long getValueCount() {
return numTerms(); return numTerms();
} }
@Override
public long lookupTerm(BytesRef key) {
  // Seek the shared TermsEnum to the smallest term >= key.  On an exact
  // match return its ord; otherwise return -insertionPoint-1, per the
  // SortedSetDocValues.lookupTerm contract.
  try {
    final SeekStatus status = te.seekCeil(key);
    return status == SeekStatus.FOUND ? te.ord() : -te.ord() - 1;
  } catch (IOException ioe) {
    // TermsEnum is an I/O-backed API but lookupTerm is not declared to
    // throw, so wrap and rethrow unchecked.
    throw new RuntimeException(ioe);
  }
}
@Override
public TermsEnum termsEnum() {
  // Build a fresh ord-aware TermsEnum over this DocTermOrds instance.
  try {
    return getOrdTermsEnum(reader);
  } catch (IOException e) {
    // Fix: chain the underlying IOException as the cause.  The original
    // threw a bare RuntimeException(), silently discarding 'e' and making
    // the failure impossible to diagnose.
    throw new RuntimeException(e);
  }
}
} }
} }

View File

@ -1388,7 +1388,7 @@ class FieldCacheImpl implements FieldCache {
} }
DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, null), false); DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, null), false);
return dto.iterator(dto.getOrdTermsEnum(reader)); return dto.iterator(reader);
} }
static final class DocTermOrdsCache extends Cache { static final class DocTermOrdsCache extends Cache {

View File

@ -17,18 +17,22 @@ package org.apache.lucene.index;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField; import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -64,7 +68,7 @@ public class TestDocTermOrds extends LuceneTestCase {
final AtomicReader ar = SlowCompositeReaderWrapper.wrap(r); final AtomicReader ar = SlowCompositeReaderWrapper.wrap(r);
final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field"); final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field");
SortedSetDocValues iter = dto.iterator(ar.terms("field").iterator(null)); SortedSetDocValues iter = dto.iterator(ar);
iter.setDocument(0); iter.setDocument(0);
assertEquals(0, iter.nextOrd()); assertEquals(0, iter.nextOrd());
@ -352,7 +356,7 @@ public class TestDocTermOrds extends LuceneTestCase {
} }
} }
SortedSetDocValues iter = dto.iterator(te); SortedSetDocValues iter = dto.iterator(r);
for(int docID=0;docID<r.maxDoc();docID++) { for(int docID=0;docID<r.maxDoc();docID++) {
if (VERBOSE) { if (VERBOSE) {
System.out.println("TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.get(docID) + ")"); System.out.println("TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.get(docID) + ")");
@ -402,4 +406,76 @@ public class TestDocTermOrds extends LuceneTestCase {
r2.close(); r2.close();
dir.close(); dir.close();
} }
// Exercises the ord-aware TermsEnum exposed by FieldCache.getDocTermOrds():
// indexes three single-term docs ("hello", "world", "beer"), force-merges to a
// single segment, then verifies next()/seekCeil()/seekExact(BytesRef)/
// seekExact(ord) all agree on the sorted term order (beer=0, hello=1, world=2).
public void testSortedTermsEnum() throws IOException {
Directory directory = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwconfig.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
Document doc = new Document();
doc.add(new StringField("field", "hello", Field.Store.NO));
iwriter.addDocument(doc);
doc = new Document();
doc.add(new StringField("field", "world", Field.Store.NO));
iwriter.addDocument(doc);
doc = new Document();
doc.add(new StringField("field", "beer", Field.Store.NO));
iwriter.addDocument(doc);
// Single segment so getOnlySegmentReader() below is valid.
iwriter.forceMerge(1);
DirectoryReader ireader = iwriter.getReader();
iwriter.close();
AtomicReader ar = getOnlySegmentReader(ireader);
SortedSetDocValues dv = FieldCache.DEFAULT.getDocTermOrds(ar, "field");
assertEquals(3, dv.getValueCount());
TermsEnum termsEnum = dv.termsEnum();
// next() — terms come back in sorted (BytesRef) order with ords 0..2.
assertEquals("beer", termsEnum.next().utf8ToString());
assertEquals(0, termsEnum.ord());
assertEquals("hello", termsEnum.next().utf8ToString());
assertEquals(1, termsEnum.ord());
assertEquals("world", termsEnum.next().utf8ToString());
assertEquals(2, termsEnum.ord());
// seekCeil() — NOT_FOUND positions on the next larger term; END past the last.
assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!")));
assertEquals("hello", termsEnum.term().utf8ToString());
assertEquals(1, termsEnum.ord());
assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer")));
assertEquals("beer", termsEnum.term().utf8ToString());
assertEquals(0, termsEnum.ord());
assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz")));
// seekExact() — true only for terms actually in the field.
assertTrue(termsEnum.seekExact(new BytesRef("beer"), true));
assertEquals("beer", termsEnum.term().utf8ToString());
assertEquals(0, termsEnum.ord());
assertTrue(termsEnum.seekExact(new BytesRef("hello"), true));
assertEquals("hello", termsEnum.term().utf8ToString());
assertEquals(1, termsEnum.ord());
assertTrue(termsEnum.seekExact(new BytesRef("world"), true));
assertEquals("world", termsEnum.term().utf8ToString());
assertEquals(2, termsEnum.ord());
assertFalse(termsEnum.seekExact(new BytesRef("bogus"), true));
// seek(ord) — positioning by ordinal round-trips to the same term/ord.
termsEnum.seekExact(0);
assertEquals("beer", termsEnum.term().utf8ToString());
assertEquals(0, termsEnum.ord());
termsEnum.seekExact(1);
assertEquals("hello", termsEnum.term().utf8ToString());
assertEquals(1, termsEnum.ord());
termsEnum.seekExact(2);
assertEquals("world", termsEnum.term().utf8ToString());
assertEquals(2, termsEnum.ord());
ireader.close();
directory.close();
}
} }