SOLR-1900: override new methods in SolrIndexReader, convert facet.method=enum to flex API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940604 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2010-05-03 20:33:16 +00:00
parent 89c24fbe37
commit 1a9fab6982
2 changed files with 141 additions and 59 deletions

View File

@ -17,12 +17,11 @@
package org.apache.solr.request;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.RequiredSolrParams;
@ -498,67 +497,115 @@ public class SimpleFacets {
IndexReader r = searcher.getReader();
FieldType ft = schema.getFieldType(field);
boolean sortByCount = sort.equals("count") || sort.equals("true");
final int maxsize = limit>=0 ? offset+limit : Integer.MAX_VALUE-1;
final BoundedTreeSet<CountPair<String,Integer>> queue = (sort.equals("count") || sort.equals("true")) ? new BoundedTreeSet<CountPair<String,Integer>>(maxsize) : null;
final BoundedTreeSet<CountPair<BytesRef,Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef,Integer>>(maxsize) : null;
final NamedList res = new NamedList();
int min=mincount-1; // the smallest value in the top 'N' values
int off=offset;
int lim=limit>=0 ? limit : Integer.MAX_VALUE;
String startTerm = prefix==null ? "" : ft.toInternal(prefix);
TermEnum te = r.terms(new Term(field,startTerm));
TermDocs td = r.termDocs();
if (docs.size() >= mincount) {
do {
Term t = te.term();
if (null == t || ! t.field().equals(field))
break;
if (prefix!=null && !t.text().startsWith(prefix)) break;
int df = te.docFreq();
// If we are sorting, we can use df>min (rather than >=) since we
// are going in index order. For certain term distributions this can
// make a large difference (for example, many terms with df=1).
if (df>0 && df>min) {
int c;
if (df >= minDfFilterCache) {
// use the filter cache
c = searcher.numDocs(new TermQuery(t), docs);
} else {
// iterate over TermDocs to calculate the intersection
td.seek(te);
c=0;
while (td.next()) {
if (docs.exists(td.doc())) c++;
}
}
if (sort.equals("count") || sort.equals("true")) {
if (c>min) {
queue.add(new CountPair<String,Integer>(t.text(), c));
if (queue.size()>=maxsize) min=queue.last().val;
}
} else {
if (c >= mincount && --off<0) {
if (--lim<0) break;
res.add(ft.indexedToReadable(t.text()), c);
}
}
}
} while (te.next());
BytesRef startTermBytes = null;
if (prefix != null) {
String indexedPrefix = ft.toInternal(prefix);
startTermBytes = new BytesRef(indexedPrefix);
}
if (sort.equals("count") || sort.equals("true")) {
for (CountPair<String,Integer> p : queue) {
Fields fields = MultiFields.getFields(r);
Terms terms = fields==null ? null : fields.terms(field);
TermsEnum termsEnum = null;
if (terms != null) {
termsEnum = terms.iterator();
// TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
// facet.offset when sorting by index order.
if (startTermBytes != null) {
if (termsEnum.seek(startTermBytes, true) == TermsEnum.SeekStatus.END) {
termsEnum = null;
}
} else {
// position termsEnum on first term
termsEnum.next();
}
}
Term template = new Term(field);
DocsEnum docsEnum = null;
if (termsEnum != null && docs.size() >= mincount) {
for(;;) {
BytesRef term = termsEnum.term();
if (term == null)
break;
if (startTermBytes != null && !term.startsWith(startTermBytes))
break;
int df = termsEnum.docFreq();
// If we are sorting, we can use df>min (rather than >=) since we
// are going in index order. For certain term distributions this can
// make a large difference (for example, many terms with df=1).
if (df>0 && df>min) {
int c;
if (df >= minDfFilterCache) {
// use the filter cache
// TODO: not a big deal, but there are prob more efficient ways to go from utf8 to string
// TODO: need a term query that takes a BytesRef
Term t = template.createTerm(new String(term.utf8ToString()));
c = searcher.numDocs(new TermQuery(t), docs);
} else {
// iterate over TermDocs to calculate the intersection
// TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this?
// TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
docsEnum = termsEnum.docs(null, docsEnum);
// this should be the same bulk result object if sharing of the docsEnum succeeded
DocsEnum.BulkReadResult bulk = docsEnum.getBulkResult();
c=0;
for (;;) {
int nDocs = docsEnum.read();
if (nDocs == 0) break;
int[] docArr = bulk.docs.ints; // this might be movable outside the loop, but perhaps not worth the risk.
for (int i=0; i<nDocs; i++) {
if (docs.exists(docArr[i])) c++;
}
}
}
if (sortByCount) {
if (c>min) {
BytesRef termCopy = new BytesRef(term);
queue.add(new CountPair<BytesRef,Integer>(termCopy, c));
if (queue.size()>=maxsize) min=queue.last().val;
}
} else {
if (c >= mincount && --off<0) {
if (--lim<0) break;
BytesRef termCopy = new BytesRef(term);
String s = term.utf8ToString();
res.add(ft.indexedToReadable(s), c);
}
}
}
termsEnum.next();
}
}
if (sortByCount) {
for (CountPair<BytesRef,Integer> p : queue) {
if (--off>=0) continue;
if (--lim<0) break;
res.add(ft.indexedToReadable(p.key), p.val);
String s = p.key.utf8ToString();
res.add(ft.indexedToReadable(s), p.val);
}
}
@ -566,9 +613,6 @@ public class SimpleFacets {
res.add(null, getFieldMissingCount(searcher,docs,field));
}
te.close();
td.close();
return res;
}

View File

@ -23,6 +23,8 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.Collection;
@ -222,6 +224,11 @@ public class SolrIndexReader extends FilterIndexReader {
return in.directory();
}
/**
 * Exposes the wrapped reader's deleted-docs {@link Bits} (flex API)
 * by straight delegation to {@code in}.
 */
@Override
public Bits getDeletedDocs() throws IOException {
return in.getDeletedDocs();
}
@Override
public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
return in.getTermFreqVectors(docNumber);
@ -297,6 +304,11 @@ public class SolrIndexReader extends FilterIndexReader {
return in.terms();
}
/**
 * Delegates the flex-API {@link Fields} view of this reader's postings
 * to the wrapped reader {@code in}.
 */
@Override
public Fields fields() throws IOException {
return in.fields();
}
@Override
public TermEnum terms(Term t) throws IOException {
return in.terms(t);
@ -308,6 +320,11 @@ public class SolrIndexReader extends FilterIndexReader {
return in.docFreq(t);
}
/**
 * Flex-API document frequency: returns how many documents contain
 * {@code t} in {@code field}, delegating to the wrapped reader.
 */
@Override
public int docFreq(String field, BytesRef t) throws IOException {
return in.docFreq(field, t);
}
@Override
public TermDocs termDocs() throws IOException {
ensureOpen();
@ -320,6 +337,21 @@ public class SolrIndexReader extends FilterIndexReader {
return in.termDocs(term);
}
/**
 * Delegates the flex-API per-field {@link Terms} lookup to the wrapped
 * reader {@code in}.
 */
@Override
public Terms terms(String field) throws IOException {
return in.terms(field);
}
/**
 * Flex-API postings access: returns a {@link DocsEnum} for the given
 * field/term, honoring {@code skipDocs}; delegates to the wrapped reader.
 */
@Override
public DocsEnum termDocsEnum(Bits skipDocs, String field, BytesRef term) throws IOException {
return in.termDocsEnum(skipDocs, field, term);
}
/**
 * Flex-API positional postings access: returns a
 * {@link DocsAndPositionsEnum} for the given field/term, honoring
 * {@code skipDocs}; delegates to the wrapped reader.
 */
@Override
public DocsAndPositionsEnum termPositionsEnum(Bits skipDocs, String field, BytesRef term) throws IOException {
return in.termPositionsEnum(skipDocs, field, term);
}
@Override
public TermPositions termPositions() throws IOException {
ensureOpen();
@ -329,6 +361,7 @@ public class SolrIndexReader extends FilterIndexReader {
// Routes the protected delete hook to the wrapped reader's PUBLIC
// deleteDocument(n) (not its protected doDelete), so the inner reader
// runs its own full delete path.
@Override
protected void doDelete(int n) throws CorruptIndexException, IOException { in.deleteDocument(n); }
// Let FilterIndexReader handle commit()... we cannot override commit()
// or call in.commit() ourselves.
// protected void doCommit() throws IOException { in.commit(); }
@ -363,6 +396,11 @@ public class SolrIndexReader extends FilterIndexReader {
return subReaders;
}
/**
 * Returns the docid base offset of {@code subReader} within the wrapped
 * composite reader, by delegation to {@code in}.
 */
@Override
public int getSubReaderDocBase(IndexReader subReader) {
return in.getSubReaderDocBase(subReader);
}
@Override
public int hashCode() {
return in.hashCode();
@ -405,7 +443,7 @@ public class SolrIndexReader extends FilterIndexReader {
@Override
public long getUniqueTermCount() throws IOException {
return super.getUniqueTermCount();
return in.getUniqueTermCount();
}
@Override