mirror of https://github.com/apache/lucene.git
LUCENE-4854: DocTermsOrd getOrdTermsEnum() buggy, lookupTerm/termsEnum is slow
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458303 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
52d2d3fd6f
commit
f7ad4045e2
|
@ -145,11 +145,17 @@ Bug Fixes
|
||||||
* LUCENE-4826: PostingsHighlighter was not returning the top N best
|
* LUCENE-4826: PostingsHighlighter was not returning the top N best
|
||||||
scoring passages. (Robert Muir, Mike McCandless)
|
scoring passages. (Robert Muir, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-4854: Fix DocTermOrds.getOrdTermsEnum() to not return negative
|
||||||
|
ord on initial next(). (Robert Muir)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the
|
* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the
|
||||||
default codec for improved enumeration performance. (Robert Muir)
|
default codec for improved enumeration performance. (Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-4854: Speed up TermsEnum of FieldCache.getDocTermOrds.
|
||||||
|
(Robert Muir)
|
||||||
|
|
||||||
======================= Lucene 4.2.0 =======================
|
======================= Lucene 4.2.0 =======================
|
||||||
|
|
||||||
Changes in backwards compatibility policy
|
Changes in backwards compatibility policy
|
||||||
|
|
|
@ -24,6 +24,7 @@ import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.PostingsFormat; // javadocs
|
import org.apache.lucene.codecs.PostingsFormat; // javadocs
|
||||||
|
import org.apache.lucene.index.TermsEnum.SeekStatus;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -632,7 +633,9 @@ public class DocTermOrds {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public BytesRef next() throws IOException {
|
public BytesRef next() throws IOException {
|
||||||
ord++;
|
if (++ord < 0) {
|
||||||
|
ord = 0;
|
||||||
|
}
|
||||||
if (termsEnum.next() == null) {
|
if (termsEnum.next() == null) {
|
||||||
term = null;
|
term = null;
|
||||||
return null;
|
return null;
|
||||||
|
@ -763,16 +766,17 @@ public class DocTermOrds {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns a SortedSetDocValues view of this instance */
|
/** Returns a SortedSetDocValues view of this instance */
|
||||||
public SortedSetDocValues iterator(TermsEnum termsEnum) throws IOException {
|
public SortedSetDocValues iterator(AtomicReader reader) throws IOException {
|
||||||
if (isEmpty()) {
|
if (isEmpty()) {
|
||||||
return SortedSetDocValues.EMPTY;
|
return SortedSetDocValues.EMPTY;
|
||||||
} else {
|
} else {
|
||||||
return new Iterator(termsEnum);
|
return new Iterator(reader);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class Iterator extends SortedSetDocValues {
|
private class Iterator extends SortedSetDocValues {
|
||||||
final TermsEnum te;
|
final AtomicReader reader;
|
||||||
|
final TermsEnum te; // used internally for lookupOrd() and lookupTerm()
|
||||||
// currently we read 5 at a time (using the logic of the old iterator)
|
// currently we read 5 at a time (using the logic of the old iterator)
|
||||||
final int buffer[] = new int[5];
|
final int buffer[] = new int[5];
|
||||||
int bufferUpto;
|
int bufferUpto;
|
||||||
|
@ -782,8 +786,9 @@ public class DocTermOrds {
|
||||||
private int upto;
|
private int upto;
|
||||||
private byte[] arr;
|
private byte[] arr;
|
||||||
|
|
||||||
Iterator(TermsEnum te) {
|
Iterator(AtomicReader reader) throws IOException {
|
||||||
this.te = te;
|
this.reader = reader;
|
||||||
|
this.te = termsEnum();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -880,5 +885,27 @@ public class DocTermOrds {
|
||||||
public long getValueCount() {
|
public long getValueCount() {
|
||||||
return numTerms();
|
return numTerms();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long lookupTerm(BytesRef key) {
|
||||||
|
try {
|
||||||
|
if (te.seekCeil(key) == SeekStatus.FOUND) {
|
||||||
|
return te.ord();
|
||||||
|
} else {
|
||||||
|
return -te.ord()-1;
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TermsEnum termsEnum() {
|
||||||
|
try {
|
||||||
|
return getOrdTermsEnum(reader);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1388,7 +1388,7 @@ class FieldCacheImpl implements FieldCache {
|
||||||
}
|
}
|
||||||
|
|
||||||
DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, null), false);
|
DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, null), false);
|
||||||
return dto.iterator(dto.getOrdTermsEnum(reader));
|
return dto.iterator(reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
static final class DocTermOrdsCache extends Cache {
|
static final class DocTermOrdsCache extends Cache {
|
||||||
|
|
|
@ -17,18 +17,22 @@ package org.apache.lucene.index;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.codecs.PostingsFormat;
|
import org.apache.lucene.codecs.PostingsFormat;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.IntField;
|
import org.apache.lucene.document.IntField;
|
||||||
|
import org.apache.lucene.document.StringField;
|
||||||
|
import org.apache.lucene.index.TermsEnum.SeekStatus;
|
||||||
import org.apache.lucene.search.FieldCache;
|
import org.apache.lucene.search.FieldCache;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -64,7 +68,7 @@ public class TestDocTermOrds extends LuceneTestCase {
|
||||||
|
|
||||||
final AtomicReader ar = SlowCompositeReaderWrapper.wrap(r);
|
final AtomicReader ar = SlowCompositeReaderWrapper.wrap(r);
|
||||||
final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field");
|
final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field");
|
||||||
SortedSetDocValues iter = dto.iterator(ar.terms("field").iterator(null));
|
SortedSetDocValues iter = dto.iterator(ar);
|
||||||
|
|
||||||
iter.setDocument(0);
|
iter.setDocument(0);
|
||||||
assertEquals(0, iter.nextOrd());
|
assertEquals(0, iter.nextOrd());
|
||||||
|
@ -352,7 +356,7 @@ public class TestDocTermOrds extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SortedSetDocValues iter = dto.iterator(te);
|
SortedSetDocValues iter = dto.iterator(r);
|
||||||
for(int docID=0;docID<r.maxDoc();docID++) {
|
for(int docID=0;docID<r.maxDoc();docID++) {
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println("TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.get(docID) + ")");
|
System.out.println("TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.get(docID) + ")");
|
||||||
|
@ -402,4 +406,76 @@ public class TestDocTermOrds extends LuceneTestCase {
|
||||||
r2.close();
|
r2.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSortedTermsEnum() throws IOException {
|
||||||
|
Directory directory = newDirectory();
|
||||||
|
Analyzer analyzer = new MockAnalyzer(random());
|
||||||
|
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
|
||||||
|
iwconfig.setMergePolicy(newLogMergePolicy());
|
||||||
|
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new StringField("field", "hello", Field.Store.NO));
|
||||||
|
iwriter.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new StringField("field", "world", Field.Store.NO));
|
||||||
|
iwriter.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new StringField("field", "beer", Field.Store.NO));
|
||||||
|
iwriter.addDocument(doc);
|
||||||
|
iwriter.forceMerge(1);
|
||||||
|
|
||||||
|
DirectoryReader ireader = iwriter.getReader();
|
||||||
|
iwriter.close();
|
||||||
|
|
||||||
|
AtomicReader ar = getOnlySegmentReader(ireader);
|
||||||
|
SortedSetDocValues dv = FieldCache.DEFAULT.getDocTermOrds(ar, "field");
|
||||||
|
assertEquals(3, dv.getValueCount());
|
||||||
|
|
||||||
|
TermsEnum termsEnum = dv.termsEnum();
|
||||||
|
|
||||||
|
// next()
|
||||||
|
assertEquals("beer", termsEnum.next().utf8ToString());
|
||||||
|
assertEquals(0, termsEnum.ord());
|
||||||
|
assertEquals("hello", termsEnum.next().utf8ToString());
|
||||||
|
assertEquals(1, termsEnum.ord());
|
||||||
|
assertEquals("world", termsEnum.next().utf8ToString());
|
||||||
|
assertEquals(2, termsEnum.ord());
|
||||||
|
|
||||||
|
// seekCeil()
|
||||||
|
assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!")));
|
||||||
|
assertEquals("hello", termsEnum.term().utf8ToString());
|
||||||
|
assertEquals(1, termsEnum.ord());
|
||||||
|
assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer")));
|
||||||
|
assertEquals("beer", termsEnum.term().utf8ToString());
|
||||||
|
assertEquals(0, termsEnum.ord());
|
||||||
|
assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz")));
|
||||||
|
|
||||||
|
// seekExact()
|
||||||
|
assertTrue(termsEnum.seekExact(new BytesRef("beer"), true));
|
||||||
|
assertEquals("beer", termsEnum.term().utf8ToString());
|
||||||
|
assertEquals(0, termsEnum.ord());
|
||||||
|
assertTrue(termsEnum.seekExact(new BytesRef("hello"), true));
|
||||||
|
assertEquals("hello", termsEnum.term().utf8ToString());
|
||||||
|
assertEquals(1, termsEnum.ord());
|
||||||
|
assertTrue(termsEnum.seekExact(new BytesRef("world"), true));
|
||||||
|
assertEquals("world", termsEnum.term().utf8ToString());
|
||||||
|
assertEquals(2, termsEnum.ord());
|
||||||
|
assertFalse(termsEnum.seekExact(new BytesRef("bogus"), true));
|
||||||
|
|
||||||
|
// seek(ord)
|
||||||
|
termsEnum.seekExact(0);
|
||||||
|
assertEquals("beer", termsEnum.term().utf8ToString());
|
||||||
|
assertEquals(0, termsEnum.ord());
|
||||||
|
termsEnum.seekExact(1);
|
||||||
|
assertEquals("hello", termsEnum.term().utf8ToString());
|
||||||
|
assertEquals(1, termsEnum.ord());
|
||||||
|
termsEnum.seekExact(2);
|
||||||
|
assertEquals("world", termsEnum.term().utf8ToString());
|
||||||
|
assertEquals(2, termsEnum.ord());
|
||||||
|
ireader.close();
|
||||||
|
directory.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue