SOLR-1900: override new methods in SolrIndexReader, convert facet.method=enum to flex API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940604 13f79535-47bb-0310-9956-ffa450edef68
Author: Yonik Seeley
Date:   2010-05-03 20:33:16 +00:00
parent 89c24fbe37
commit 1a9fab6982
2 changed files with 141 additions and 59 deletions
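
At a glance, the SimpleFacets change below swaps the pre-flex TermEnum/TermDocs walk for the flex API: look up the per-field Terms via MultiFields, then iterate raw BytesRef terms with a TermsEnum. A minimal self-contained sketch of that enumeration pattern, using only calls that appear in the diff (the class and helper names here are illustrative, not part of the commit):

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class FlexTermWalk {
  /** Visits every term of a field in index (byte) order and returns the count. */
  static long countTerms(IndexReader reader, String field) throws IOException {
    Fields fields = MultiFields.getFields(reader);  // merged view over all segments; null if no fields
    Terms terms = fields == null ? null : fields.terms(field);
    if (terms == null) return 0;                    // field absent from the index

    TermsEnum termsEnum = terms.iterator();
    long count = 0;
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
      count++;  // the term bytes are valid here, as are termsEnum.docFreq() and termsEnum.docs(...)
    }
    return count;
  }
}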

SimpleFacets.java

@@ -17,12 +17,11 @@
 package org.apache.solr.request;
 
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.*;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.*;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.FacetParams;
 import org.apache.solr.common.params.RequiredSolrParams;
@@ -498,28 +497,55 @@ public class SimpleFacets {
     IndexReader r = searcher.getReader();
     FieldType ft = schema.getFieldType(field);
 
+    boolean sortByCount = sort.equals("count") || sort.equals("true");
     final int maxsize = limit>=0 ? offset+limit : Integer.MAX_VALUE-1;
-    final BoundedTreeSet<CountPair<String,Integer>> queue = (sort.equals("count") || sort.equals("true")) ? new BoundedTreeSet<CountPair<String,Integer>>(maxsize) : null;
+    final BoundedTreeSet<CountPair<BytesRef,Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef,Integer>>(maxsize) : null;
     final NamedList res = new NamedList();
 
     int min=mincount-1;  // the smallest value in the top 'N' values
     int off=offset;
     int lim=limit>=0 ? limit : Integer.MAX_VALUE;
 
-    String startTerm = prefix==null ? "" : ft.toInternal(prefix);
-    TermEnum te = r.terms(new Term(field,startTerm));
-    TermDocs td = r.termDocs();
+    BytesRef startTermBytes = null;
+    if (prefix != null) {
+      String indexedPrefix = ft.toInternal(prefix);
+      startTermBytes = new BytesRef(indexedPrefix);
+    }
 
-    if (docs.size() >= mincount) {
-      do {
-        Term t = te.term();
-        if (null == t || ! t.field().equals(field))
+    Fields fields = MultiFields.getFields(r);
+    Terms terms = fields==null ? null : fields.terms(field);
+    TermsEnum termsEnum = null;
+    if (terms != null) {
+      termsEnum = terms.iterator();
+
+      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
+      // facet.offset when sorting by index order.
+
+      if (startTermBytes != null) {
+        if (termsEnum.seek(startTermBytes, true) == TermsEnum.SeekStatus.END) {
+          termsEnum = null;
+        }
+      } else {
+        // position termsEnum on first term
+        termsEnum.next();
+      }
+    }
+
+    Term template = new Term(field);
+    DocsEnum docsEnum = null;
+
+    if (termsEnum != null && docs.size() >= mincount) {
+      for(;;) {
+        BytesRef term = termsEnum.term();
+        if (term == null)
           break;
 
-        if (prefix!=null && !t.text().startsWith(prefix)) break;
+        if (startTermBytes != null && !term.startsWith(startTermBytes))
+          break;
 
-        int df = te.docFreq();
+        int df = termsEnum.docFreq();
 
         // If we are sorting, we can use df>min (rather than >=) since we
         // are going in index order.  For certain term distributions this can
@@ -529,36 +555,57 @@ public class SimpleFacets {
         if (df >= minDfFilterCache) {
           // use the filter cache
+          // TODO: not a big deal, but there are prob more efficient ways to go from utf8 to string
+          // TODO: need a term query that takes a BytesRef
+          Term t = template.createTerm(new String(term.utf8ToString()));
           c = searcher.numDocs(new TermQuery(t), docs);
         } else {
           // iterate over TermDocs to calculate the intersection
-          td.seek(te);
+          // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this?
+          // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
+          docsEnum = termsEnum.docs(null, docsEnum);
+          // this should be the same bulk result object if sharing of the docsEnum succeeded
+          DocsEnum.BulkReadResult bulk = docsEnum.getBulkResult();
           c=0;
-          while (td.next()) {
-            if (docs.exists(td.doc())) c++;
+          for (;;) {
+            int nDocs = docsEnum.read();
+            if (nDocs == 0) break;
+            int[] docArr = bulk.docs.ints;  // this might be movable outside the loop, but perhaps not worth the risk.
+            for (int i=0; i<nDocs; i++) {
+              if (docs.exists(docArr[i])) c++;
+            }
           }
         }
 
-        if (sort.equals("count") || sort.equals("true")) {
+        if (sortByCount) {
           if (c>min) {
-            queue.add(new CountPair<String,Integer>(t.text(), c));
+            BytesRef termCopy = new BytesRef(term);
+            queue.add(new CountPair<BytesRef,Integer>(termCopy, c));
             if (queue.size()>=maxsize) min=queue.last().val;
           }
         } else {
           if (c >= mincount && --off<0) {
             if (--lim<0) break;
-            res.add(ft.indexedToReadable(t.text()), c);
+            BytesRef termCopy = new BytesRef(term);
+            String s = term.utf8ToString();
+            res.add(ft.indexedToReadable(s), c);
           }
         }
 
-      } while (te.next());
-    }
+        termsEnum.next();
+      }
+    }
 
-    if (sort.equals("count") || sort.equals("true")) {
-      for (CountPair<String,Integer> p : queue) {
+    if (sortByCount) {
+      for (CountPair<BytesRef,Integer> p : queue) {
         if (--off>=0) continue;
         if (--lim<0) break;
-        res.add(ft.indexedToReadable(p.key), p.val);
+        String s = p.key.utf8ToString();
+        res.add(ft.indexedToReadable(s), p.val);
       }
     }
@@ -566,9 +613,6 @@ public class SimpleFacets {
       res.add(null, getFieldMissingCount(searcher,docs,field));
     }
 
-    te.close();
-    td.close();
-
     return res;
   }
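
The intersection loop above relies on the flex DocsEnum bulk-read interface: instead of one next() call per posting, read() fills a chunk of doc ids that are then tested against the base DocSet. A condensed sketch of just that idiom, assuming a DocsEnum already positioned on a term and a Solr DocSet as the base (the helper class and method names are hypothetical):

import java.io.IOException;

import org.apache.lucene.index.DocsEnum;
import org.apache.solr.search.DocSet;

class BulkIntersectionCount {
  /** Counts how many postings of the current term fall inside the base DocSet. */
  static int count(DocsEnum docsEnum, DocSet base) throws IOException {
    DocsEnum.BulkReadResult bulk = docsEnum.getBulkResult();
    int c = 0;
    for (;;) {
      int nDocs = docsEnum.read();        // fills bulk.docs with the next chunk of doc ids
      if (nDocs == 0) break;              // postings exhausted
      int[] docArr = bulk.docs.ints;
      for (int i = 0; i < nDocs; i++) {
        if (base.exists(docArr[i])) c++;  // one set-membership test per buffered doc
      }
    }
    return c;
  }
}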

SolrIndexReader.java

@@ -23,6 +23,8 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.LockObtainFailedException;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
 
 import java.io.IOException;
 import java.util.Collection;
@@ -222,6 +224,11 @@ public class SolrIndexReader extends FilterIndexReader {
     return in.directory();
   }
 
+  @Override
+  public Bits getDeletedDocs() throws IOException {
+    return in.getDeletedDocs();
+  }
+
   @Override
   public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
     return in.getTermFreqVectors(docNumber);
@@ -297,6 +304,11 @@ public class SolrIndexReader extends FilterIndexReader {
     return in.terms();
   }
 
+  @Override
+  public Fields fields() throws IOException {
+    return in.fields();
+  }
+
   @Override
   public TermEnum terms(Term t) throws IOException {
     return in.terms(t);
@@ -308,6 +320,11 @@ public class SolrIndexReader extends FilterIndexReader {
     return in.docFreq(t);
   }
 
+  @Override
+  public int docFreq(String field, BytesRef t) throws IOException {
+    return in.docFreq(field, t);
+  }
+
   @Override
   public TermDocs termDocs() throws IOException {
     ensureOpen();
@@ -320,6 +337,21 @@ public class SolrIndexReader extends FilterIndexReader {
     return in.termDocs(term);
   }
 
+  @Override
+  public Terms terms(String field) throws IOException {
+    return in.terms(field);
+  }
+
+  @Override
+  public DocsEnum termDocsEnum(Bits skipDocs, String field, BytesRef term) throws IOException {
+    return in.termDocsEnum(skipDocs, field, term);
+  }
+
+  @Override
+  public DocsAndPositionsEnum termPositionsEnum(Bits skipDocs, String field, BytesRef term) throws IOException {
+    return in.termPositionsEnum(skipDocs, field, term);
+  }
+
   @Override
   public TermPositions termPositions() throws IOException {
     ensureOpen();
@@ -329,6 +361,7 @@ public class SolrIndexReader extends FilterIndexReader {
   @Override
   protected void doDelete(int n) throws CorruptIndexException, IOException { in.deleteDocument(n); }
 
+  // Let FilterIndexReader handle commit()... we cannot override commit()
   // or call in.commit() ourselves.
   // protected void doCommit() throws IOException { in.commit(); }
@@ -363,6 +396,11 @@ public class SolrIndexReader extends FilterIndexReader {
     return subReaders;
   }
 
+  @Override
+  public int getSubReaderDocBase(IndexReader subReader) {
+    return in.getSubReaderDocBase(subReader);
+  }
+
   @Override
   public int hashCode() {
     return in.hashCode();
@@ -405,7 +443,7 @@ public class SolrIndexReader extends FilterIndexReader {
   @Override
   public long getUniqueTermCount() throws IOException {
-    return super.getUniqueTermCount();
+    return in.getUniqueTermCount();
   }
 
   @Override
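
The SolrIndexReader half of the commit exists because the class wraps another reader as a FilterIndexReader: any new IndexReader method it does not override resolves to the superclass default instead of reaching the wrapped reader, so each flex entry point must be forwarded to `in` explicitly. A stripped-down sketch of that delegation pattern, using only methods that appear in the diff (the wrapper class name is hypothetical):

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.Bits;

// Hypothetical wrapper illustrating the rule the commit applies to SolrIndexReader:
// every new flex accessor is forwarded to the wrapped reader `in`.
class DelegatingReader extends FilterIndexReader {
  DelegatingReader(IndexReader in) {
    super(in);
  }

  @Override
  public Fields fields() throws IOException {
    return in.fields();          // forward to the wrapped reader, never inherit the default
  }

  @Override
  public Bits getDeletedDocs() throws IOException {
    return in.getDeletedDocs();  // same pattern for the deleted-docs bitset
  }
}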