LUCENE-5667: add test case

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1594846 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2014-05-15 10:12:14 +00:00
parent db8a1c7338
commit 83d150aa21
1 changed file with 152 additions and 0 deletions


@@ -27,6 +27,7 @@ import org.apache.lucene.document.IntField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
@@ -884,4 +885,155 @@ public class TestTermsEnum extends LuceneTestCase {
    r.close();
    dir.close();
  }
  // LUCENE-5667
  public void testCommonPrefixTerms() throws Exception {
    Directory d = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), d);
    Set<String> terms = new HashSet<String>();
    //String prefix = TestUtil.randomSimpleString(random(), 1, 20);
    String prefix = TestUtil.randomRealisticUnicodeString(random(), 1, 20);
    int numTerms = atLeast(1000);
    if (VERBOSE) {
      System.out.println("TEST: " + numTerms + " terms; prefix=" + prefix);
    }
    while (terms.size() < numTerms) {
      //terms.add(prefix + TestUtil.randomSimpleString(random(), 1, 20));
      terms.add(prefix + TestUtil.randomRealisticUnicodeString(random(), 1, 20));
    }
    for(String term : terms) {
      Document doc = new Document();
      doc.add(newStringField("id", term, Field.Store.YES));
      w.addDocument(doc);
    }

    IndexReader r = w.getReader();
    if (VERBOSE) {
      System.out.println("\nTEST: reader=" + r);
    }

    TermsEnum termsEnum = MultiFields.getTerms(r, "id").iterator(null);
    DocsEnum docsEnum = null;
    PerThreadPKLookup pkLookup = new PerThreadPKLookup(r, "id");

    int iters = atLeast(numTerms*3);
    List<String> termsList = new ArrayList<>(terms);
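    // Half the lookups pick a term we know is in the index; the other
    // half probe with a random string under the same prefix, which may
    // or may not exist: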
    for(int iter=0;iter<iters;iter++) {
      String term;
      boolean shouldExist;
      if (random().nextBoolean()) {
        term = termsList.get(random().nextInt(terms.size()));
        shouldExist = true;
      } else {
        term = prefix + TestUtil.randomSimpleString(random(), 1, 20);
        shouldExist = terms.contains(term);
      }

      if (VERBOSE) {
        System.out.println("\nTEST: try term=" + term);
        System.out.println("  shouldExist?=" + shouldExist);
      }

      BytesRef termBytesRef = new BytesRef(term);

      boolean actualResult = termsEnum.seekExact(termBytesRef);
      assertEquals(shouldExist, actualResult);
      if (shouldExist) {
        docsEnum = termsEnum.docs(null, docsEnum, 0);
        int docID = docsEnum.nextDoc();
        assertTrue(docID != DocsEnum.NO_MORE_DOCS);
        assertEquals(docID, pkLookup.lookup(termBytesRef));

        StoredDocument doc = r.document(docID);
        assertEquals(term, doc.get("id"));

        if (random().nextInt(7) == 1) {
          termsEnum.next();
        }
      } else {
        assertEquals(-1, pkLookup.lookup(termBytesRef));
      }
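      // Once in a while, also exercise seekCeil on the same term: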
      if (random().nextInt(7) == 1) {
        TermsEnum.SeekStatus status = termsEnum.seekCeil(termBytesRef);
        if (shouldExist) {
          assertEquals(TermsEnum.SeekStatus.FOUND, status);
        } else {
          assertNotSame(TermsEnum.SeekStatus.FOUND, status);
        }
      }
    }

    r.close();
    w.close();
    d.close();
  }
  /** Utility class to do efficient primary-key (only 1 doc contains the
   *  given term) lookups by segment, re-using the enums.  This class is
   *  not thread safe, so it is the caller's job to create and use one
   *  instance of this per thread.  Do not use this if a term may appear
   *  in more than one document!  It will only return the first one it
   *  finds. */
  static class PerThreadPKLookup {

    private final TermsEnum[] termsEnums;
    private final DocsEnum[] docsEnums;
    private final Bits[] liveDocs;
    private final int[] docBases;
    private final int numSegs;
    private final boolean hasDeletions;

    public PerThreadPKLookup(IndexReader r, String idFieldName) throws IOException {

      List<AtomicReaderContext> leaves = new ArrayList<>(r.leaves());

      // Larger segments are more likely to have the id, so we sort largest to smallest by numDocs:
      Collections.sort(leaves, new Comparator<AtomicReaderContext>() {
          @Override
          public int compare(AtomicReaderContext c1, AtomicReaderContext c2) {
            return c2.reader().numDocs() - c1.reader().numDocs();
          }
        });

      termsEnums = new TermsEnum[leaves.size()];
      docsEnums = new DocsEnum[leaves.size()];
      liveDocs = new Bits[leaves.size()];
      docBases = new int[leaves.size()];
      int numSegs = 0;
      boolean hasDeletions = false;
      for(int i=0;i<leaves.size();i++) {
        Fields fields = leaves.get(i).reader().fields();
        if (fields != null) {
          Terms terms = fields.terms(idFieldName);
          if (terms != null) {
            termsEnums[numSegs] = terms.iterator(null);
            assert termsEnums[numSegs] != null;
            docBases[numSegs] = leaves.get(i).docBase;
            liveDocs[numSegs] = leaves.get(i).reader().getLiveDocs();
            hasDeletions |= leaves.get(i).reader().hasDeletions();
            numSegs++;
          }
        }
      }
      this.numSegs = numSegs;
      this.hasDeletions = hasDeletions;
    }

    /** Returns docID if found, else -1. */
    public int lookup(BytesRef id) throws IOException {
      for(int seg=0;seg<numSegs;seg++) {
        if (termsEnums[seg].seekExact(id)) {
          docsEnums[seg] = termsEnums[seg].docs(liveDocs[seg], docsEnums[seg], 0);
          int docID = docsEnums[seg].nextDoc();
          if (docID != DocsEnum.NO_MORE_DOCS) {
            return docBases[seg] + docID;
          }
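          // Term exists, but every document containing it in this
          // segment is deleted; fall through to the next segment: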
          assert hasDeletions;
        }
      }

      return -1;
    }

    // TODO: add reopen method to carry over re-used enums...?
  }
}
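
For context, a minimal usage sketch of PerThreadPKLookup. This is not part of the commit; it assumes the LuceneTestCase helpers (newDirectory, newStringField, random()) that this test file already uses:

// Hypothetical snippet, not part of this commit: index a few unique ids,
// then resolve each id to its global docID with PerThreadPKLookup.
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
for(String id : new String[] {"a", "b", "c"}) {
  Document doc = new Document();
  doc.add(newStringField("id", id, Field.Store.NO));
  w.addDocument(doc);
}
IndexReader r = w.getReader();

// One instance per thread; lookup() re-uses the per-segment enums across calls.
PerThreadPKLookup pk = new PerThreadPKLookup(r, "id");
assertTrue(pk.lookup(new BytesRef("b")) != -1);  // present id resolves to a global docID
assertEquals(-1, pk.lookup(new BytesRef("z")));  // absent id returns -1

r.close();
w.close();
dir.close();

Because the constructor sorts leaves largest-first by numDocs, a lookup usually terminates after the first seekExact hit, which is the common case for a primary key.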