From f7ad4045e2955300836e1eb68a860bc7893f66a1 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 19 Mar 2013 14:21:16 +0000 Subject: [PATCH] LUCENE-4854: DocTermsOrd getOrdTermsEnum() buggy, lookupTerm/termsEnum is slow git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1458303 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 6 ++ .../org/apache/lucene/index/DocTermOrds.java | 39 +++++++-- .../apache/lucene/search/FieldCacheImpl.java | 2 +- .../apache/lucene/index/TestDocTermOrds.java | 80 ++++++++++++++++++- 4 files changed, 118 insertions(+), 9 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9fe043bfb23..b1f3b17cc65 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -145,11 +145,17 @@ Bug Fixes * LUCENE-4826: PostingsHighlighter was not returning the top N best scoring passages. (Robert Muir, Mike McCandless) +* LUCENE-4854: Fix DocTermOrds.getOrdTermsEnum() to not return negative + ord on initial next(). (Robert Muir) + Optimizations * LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the default codec for improved enumeration performance. (Robert Muir) +* LUCENE-4854: Speed up TermsEnum of FieldCache.getDocTermOrds. + (Robert Muir) + ======================= Lucene 4.2.0 ======================= Changes in backwards compatibility policy diff --git a/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java b/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java index c4671cbff4c..4bd919fc7a2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java @@ -24,6 +24,7 @@ import java.util.Comparator; import java.util.List; import org.apache.lucene.codecs.PostingsFormat; // javadocs +import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -632,7 +633,9 @@ public class DocTermOrds { @Override public BytesRef next() throws IOException { - ord++; + if (++ord < 0) { + ord = 0; + } if (termsEnum.next() == null) { term = null; return null; @@ -763,16 +766,17 @@ public class DocTermOrds { } /** Returns a SortedSetDocValues view of this instance */ - public SortedSetDocValues iterator(TermsEnum termsEnum) throws IOException { + public SortedSetDocValues iterator(AtomicReader reader) throws IOException { if (isEmpty()) { return SortedSetDocValues.EMPTY; } else { - return new Iterator(termsEnum); + return new Iterator(reader); } } private class Iterator extends SortedSetDocValues { - final TermsEnum te; + final AtomicReader reader; + final TermsEnum te; // used internally for lookupOrd() and lookupTerm() // currently we read 5 at a time (using the logic of the old iterator) final int buffer[] = new int[5]; int bufferUpto; @@ -782,8 +786,9 @@ public class DocTermOrds { private int upto; private byte[] arr; - Iterator(TermsEnum te) { - this.te = te; + Iterator(AtomicReader reader) throws IOException { + this.reader = reader; + this.te = termsEnum(); } @Override @@ -880,5 +885,27 @@ public class DocTermOrds { public long getValueCount() { return numTerms(); } + + @Override + public long lookupTerm(BytesRef key) { + try { + if (te.seekCeil(key) == SeekStatus.FOUND) { + return te.ord(); + } else { + return -te.ord()-1; + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public TermsEnum termsEnum() { + try { + return getOrdTermsEnum(reader); + } catch (IOException e) { + throw new RuntimeException(); + } + } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java b/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java index c41fcab8063..b94ad6e7c43 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java +++ b/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java @@ -1388,7 +1388,7 @@ class FieldCacheImpl implements FieldCache { } DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, null), false); - return dto.iterator(dto.getOrdTermsEnum(reader)); + return dto.iterator(reader); } static final class DocTermOrdsCache extends Cache { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocTermOrds.java b/lucene/core/src/test/org/apache/lucene/index/TestDocTermOrds.java index 8e0f97ba664..d6bccca699d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocTermOrds.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocTermOrds.java @@ -17,18 +17,22 @@ package org.apache.lucene.index; * limitations under the License. */ +import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.ArrayList; import java.util.HashSet; import java.util.Set; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.search.FieldCache; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; @@ -64,7 +68,7 @@ public class TestDocTermOrds extends LuceneTestCase { final AtomicReader ar = SlowCompositeReaderWrapper.wrap(r); final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field"); - SortedSetDocValues iter = dto.iterator(ar.terms("field").iterator(null)); + SortedSetDocValues iter = dto.iterator(ar); iter.setDocument(0); assertEquals(0, iter.nextOrd()); @@ -352,7 +356,7 @@ public class TestDocTermOrds extends LuceneTestCase { } } - SortedSetDocValues iter = dto.iterator(te); + SortedSetDocValues iter = dto.iterator(r); for(int docID=0;docID