LUCENE-3260: fix wrong result from MultiTermsEnum.next() after seekExact

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1141593 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-06-30 16:05:42 +00:00
parent b5be90974b
commit 9285e08bce
4 changed files with 161 additions and 13 deletions

View File

@ -43,6 +43,7 @@ public final class MultiTermsEnum extends TermsEnum {
private final MultiDocsAndPositionsEnum.EnumWithSlice[] subDocsAndPositions; private final MultiDocsAndPositionsEnum.EnumWithSlice[] subDocsAndPositions;
private BytesRef lastSeek; private BytesRef lastSeek;
private boolean lastSeekExact;
private final BytesRef lastSeekScratch = new BytesRef(); private final BytesRef lastSeekScratch = new BytesRef();
private int numTop; private int numTop;
@ -149,6 +150,7 @@ public final class MultiTermsEnum extends TermsEnum {
} }
lastSeek = null; lastSeek = null;
lastSeekExact = true;
for(int i=0;i<numSubs;i++) { for(int i=0;i<numSubs;i++) {
final boolean status; final boolean status;
@ -179,6 +181,7 @@ public final class MultiTermsEnum extends TermsEnum {
if (status) { if (status) {
top[numTop++] = currentSubs[i]; top[numTop++] = currentSubs[i];
current = currentSubs[i].current = currentSubs[i].terms.term(); current = currentSubs[i].current = currentSubs[i].terms.term();
assert term.equals(currentSubs[i].current);
} }
} }
@ -191,6 +194,7 @@ public final class MultiTermsEnum extends TermsEnum {
public SeekStatus seekCeil(BytesRef term, boolean useCache) throws IOException { public SeekStatus seekCeil(BytesRef term, boolean useCache) throws IOException {
queue.clear(); queue.clear();
numTop = 0; numTop = 0;
lastSeekExact = false;
boolean seekOpt = false; boolean seekOpt = false;
if (lastSeek != null && termComp.compare(lastSeek, term) <= 0) { if (lastSeek != null && termComp.compare(lastSeek, term) <= 0) {
@ -293,6 +297,17 @@ public final class MultiTermsEnum extends TermsEnum {
@Override @Override
public BytesRef next() throws IOException { public BytesRef next() throws IOException {
if (lastSeekExact) {
// Must seekCeil at this point, so those subs that
// didn't have the term can find the following term.
// NOTE: we could save some CPU by only seekCeil the
// subs that didn't match the last exact seek... but
// most impls short-circuit if you seekCeil to term
// they are already on.
final SeekStatus status = seekCeil(current);
assert status == SeekStatus.FOUND;
lastSeekExact = false;
}
lastSeek = null; lastSeek = null;
// restore queue // restore queue

View File

@ -0,0 +1,143 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
/**
 * Randomized test that drives the {@link TermsEnum} returned by
 * {@link MultiFields#getTerms} through an interleaved sequence of
 * {@code next}, {@code seekCeil} and {@code seekExact} calls, checking every
 * result against a list of all terms of the "body" field collected up front.
 */
public class TestTermsEnum extends LuceneTestCase {

  /**
   * Indexes a handful of line-file documents, enumerates all "body" terms once
   * to build the expected list, then performs random next/seek operations on
   * the same enum and verifies each outcome against that list.
   */
  public void test() throws Exception {
    final LineFileDocs docs = new LineFileDocs(random);
    final Directory d = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random, d);

    final int numDocs = atLeast(10);
    for (int docCount = 0; docCount < numDocs; docCount++) {
      w.addDocument(docs.nextDoc());
    }
    final IndexReader r = w.getReader();
    w.close();

    // Expected terms, in the order the enum returns them.  The binarySearch
    // below assumes this order matches BytesRef's natural ordering.
    final List<BytesRef> terms = new ArrayList<BytesRef>();
    final TermsEnum termsEnum = MultiFields.getTerms(r, "body").iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      // Copy: the enum may reuse the returned BytesRef instance.
      terms.add(new BytesRef(term));
    }
    if (VERBOSE) {
      System.out.println("TEST: " + terms.size() + " terms");
    }

    // Index into terms of the term the enum is currently positioned on, or -1
    // when the enum is unpositioned (exhausted, or after a failed seekExact).
    int upto = -1;
    final int iters = atLeast(200);
    for (int iter = 0; iter < iters; iter++) {
      if (upto != -1 && random.nextBoolean()) {
        // next
        if (VERBOSE) {
          System.out.println("TEST: iter next");
        }
        final boolean isEnd = termsEnum.next() == null;
        upto++;
        if (isEnd) {
          if (VERBOSE) {
            System.out.println(" end");
          }
          assertEquals(terms.size(), upto);
          upto = -1;
        } else {
          // Check the bound before indexing into terms, so that an enum which
          // returns more terms than expected fails with an assertion rather
          // than an IndexOutOfBoundsException from the verbose output.
          assertTrue(upto < terms.size());
          if (VERBOSE) {
            System.out.println(" got term=" + termsEnum.term().utf8ToString() + " expected=" + terms.get(upto).utf8ToString());
          }
          assertEquals(terms.get(upto), termsEnum.term());
        }
      } else {
        final BytesRef target;
        final String exists;
        if (random.nextBoolean()) {
          // likely fake term
          if (random.nextBoolean()) {
            target = new BytesRef(_TestUtil.randomSimpleString(random));
          } else {
            target = new BytesRef(_TestUtil.randomRealisticUnicodeString(random));
          }
          exists = "likely not";
        } else {
          // real term
          target = terms.get(random.nextInt(terms.size()));
          exists = "yes";
        }

        // >= 0: index of target in terms; < 0: -(insertionPoint + 1).
        upto = Collections.binarySearch(terms, target);

        if (random.nextBoolean()) {
          // seekCeil
          if (VERBOSE) {
            System.out.println("TEST: iter seekCeil target=" + target.utf8ToString() + " exists=" + exists);
          }
          final TermsEnum.SeekStatus status = termsEnum.seekCeil(target, random.nextBoolean());
          if (VERBOSE) {
            System.out.println(" got " + status);
          }

          if (upto < 0) {
            // Target is absent: the enum must land on the insertion point,
            // or report END when the target sorts after every term.
            upto = -(upto + 1);
            if (upto >= terms.size()) {
              assertEquals(TermsEnum.SeekStatus.END, status);
              upto = -1;
            } else {
              assertEquals(TermsEnum.SeekStatus.NOT_FOUND, status);
              assertEquals(terms.get(upto), termsEnum.term());
            }
          } else {
            assertEquals(TermsEnum.SeekStatus.FOUND, status);
            assertEquals(terms.get(upto), termsEnum.term());
          }
        } else {
          // seekExact
          if (VERBOSE) {
            System.out.println("TEST: iter seekExact target=" + target.utf8ToString() + " exists=" + exists);
          }
          final boolean result = termsEnum.seekExact(target, false);
          if (VERBOSE) {
            System.out.println(" got " + result);
          }

          if (upto < 0) {
            assertFalse(result);
            // Enum is unpositioned after a failed exact seek; force the next
            // iteration to seek again instead of calling next().
            upto = -1;
          } else {
            assertTrue(result);
            assertEquals(target, termsEnum.term());
          }
        }
      }
    }

    r.close();
    d.close();
    // Release the document source as well; it was previously left open.
    docs.close();
  }
}

View File

@ -714,12 +714,12 @@ public class TestFSTs extends LuceneTestCase {
if (random.nextBoolean()) { if (random.nextBoolean()) {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" do advanceCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")"); System.out.println(" do seekCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
} }
isDone = fstEnum.seekCeil(pairs.get(upto).input) == null; isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
} else { } else {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" do advanceFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")"); System.out.println(" do seekFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
} }
isDone = fstEnum.seekFloor(pairs.get(upto).input) == null; isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
} }

View File

@ -27,10 +27,8 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogByteSizeMergePolicy; import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.LockObtainFailedException;
@ -798,6 +796,7 @@ public class LuceneTaxonomyWriter implements TaxonomyWriter {
// per step) // per step)
while (otherTaxonomiesLeft>0) { while (otherTaxonomiesLeft>0) {
// TODO: use a pq here
String first=null; String first=null;
for (int i=0; i<taxonomies.length; i++) { for (int i=0; i<taxonomies.length; i++) {
if (currentOthers[i]==null) continue; if (currentOthers[i]==null) continue;
@ -819,7 +818,6 @@ public class LuceneTaxonomyWriter implements TaxonomyWriter {
int newordinal = internalAddCategory(cp, cp.length()); int newordinal = internalAddCategory(cp, cp.length());
// TODO (Facet): we already had this term in our hands before, in nextTE... // TODO (Facet): we already had this term in our hands before, in nextTE...
// // TODO (Facet): no need to make this term? // // TODO (Facet): no need to make this term?
Term t = new Term(Consts.FULL, first);
for (int i=0; i<taxonomies.length; i++) { for (int i=0; i<taxonomies.length; i++) {
if (first.equals(currentOthers[i])) { if (first.equals(currentOthers[i])) {
// remember the remapping of this ordinal. Note how // remember the remapping of this ordinal. Note how
@ -828,8 +826,6 @@ public class LuceneTaxonomyWriter implements TaxonomyWriter {
// like Lucene's merge works, we hope there are few seeks. // like Lucene's merge works, we hope there are few seeks.
// TODO (Facet): is there a quicker way? E.g., not specifying the // TODO (Facet): is there a quicker way? E.g., not specifying the
// next term by name every time? // next term by name every time?
SeekStatus result = othertes[i].seekCeil(t.bytes(), false);
assert result == SeekStatus.FOUND;
otherdocsEnum[i] = othertes[i].docs(MultiFields.getDeletedDocs(otherreaders[i]), otherdocsEnum[i]); otherdocsEnum[i] = othertes[i].docs(MultiFields.getDeletedDocs(otherreaders[i]), otherdocsEnum[i]);
otherdocsEnum[i].nextDoc(); // TODO (Facet): check? otherdocsEnum[i].nextDoc(); // TODO (Facet): check?
int origordinal = otherdocsEnum[i].docID(); int origordinal = otherdocsEnum[i].docID();
@ -847,10 +843,6 @@ public class LuceneTaxonomyWriter implements TaxonomyWriter {
// to be added because it already existed in the main taxonomy. // to be added because it already existed in the main taxonomy.
// TODO (Facet): Again, is there a quicker way? // TODO (Facet): Again, is there a quicker way?
Term t = new Term(Consts.FULL, first);
// TODO: fix bug in MTE seekExact and use that instead.
SeekStatus result = mainte.seekCeil(t.bytes(), false);
assert result == SeekStatus.FOUND; // // TODO (Facet): explicit check / throw exception?
mainde = mainte.docs(MultiFields.getDeletedDocs(mainreader), mainde); mainde = mainte.docs(MultiFields.getDeletedDocs(mainreader), mainde);
mainde.nextDoc(); // TODO (Facet): check? mainde.nextDoc(); // TODO (Facet): check?
int newordinal = mainde.docID(); int newordinal = mainde.docID();
@ -859,8 +851,6 @@ public class LuceneTaxonomyWriter implements TaxonomyWriter {
for (int i=0; i<taxonomies.length; i++) { for (int i=0; i<taxonomies.length; i++) {
if (first.equals(currentOthers[i])) { if (first.equals(currentOthers[i])) {
// TODO (Facet): again, is there a quicker way? // TODO (Facet): again, is there a quicker way?
result = othertes[i].seekCeil(t.bytes(), false);
assert result == SeekStatus.FOUND; // TODO (Facet): explicit check / throw exception?
otherdocsEnum[i] = othertes[i].docs(MultiFields.getDeletedDocs(otherreaders[i]), otherdocsEnum[i]); otherdocsEnum[i] = othertes[i].docs(MultiFields.getDeletedDocs(otherreaders[i]), otherdocsEnum[i]);
otherdocsEnum[i].nextDoc(); // TODO (Facet): check? otherdocsEnum[i].nextDoc(); // TODO (Facet): check?
int origordinal = otherdocsEnum[i].docID(); int origordinal = otherdocsEnum[i].docID();