diff --git a/solr/src/java/org/apache/solr/request/SimpleFacets.java b/solr/src/java/org/apache/solr/request/SimpleFacets.java
index 80746dcc2da..6dd94ca693d 100644
--- a/solr/src/java/org/apache/solr/request/SimpleFacets.java
+++ b/solr/src/java/org/apache/solr/request/SimpleFacets.java
@@ -17,12 +17,11 @@
 package org.apache.solr.request;
 
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.*;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.*;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.FacetParams;
 import org.apache.solr.common.params.RequiredSolrParams;
@@ -498,67 +497,115 @@ public class SimpleFacets {
     IndexReader r = searcher.getReader();
     FieldType ft = schema.getFieldType(field);
 
-    final int maxsize = limit>=0 ? offset+limit : Integer.MAX_VALUE-1;
-    final BoundedTreeSet<CountPair<String,Integer>> queue = (sort.equals("count") || sort.equals("true")) ? new BoundedTreeSet<CountPair<String,Integer>>(maxsize) : null;
+    boolean sortByCount = sort.equals("count") || sort.equals("true");
+    final int maxsize = limit>=0 ? offset+limit : Integer.MAX_VALUE-1;
+    final BoundedTreeSet<CountPair<BytesRef,Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef,Integer>>(maxsize) : null;
     final NamedList res = new NamedList();
 
     int min=mincount-1;  // the smallest value in the top 'N' values
     int off=offset;
     int lim=limit>=0 ? limit : Integer.MAX_VALUE;
 
-    String startTerm = prefix==null ? "" : ft.toInternal(prefix);
-    TermEnum te = r.terms(new Term(field,startTerm));
-    TermDocs td = r.termDocs();
-
-    if (docs.size() >= mincount) {
-      do {
-        Term t = te.term();
-
-        if (null == t || ! t.field().equals(field))
-          break;
-
-        if (prefix!=null && !t.text().startsWith(prefix)) break;
-
-        int df = te.docFreq();
-
-        // If we are sorting, we can use df>min (rather than >=) since we
-        // are going in index order. For certain term distributions this can
-        // make a large difference (for example, many terms with df=1).
-        if (df>0 && df>min) {
-          int c;
-
-          if (df >= minDfFilterCache) {
-            // use the filter cache
-            c = searcher.numDocs(new TermQuery(t), docs);
-          } else {
-            // iterate over TermDocs to calculate the intersection
-            td.seek(te);
-            c=0;
-            while (td.next()) {
-              if (docs.exists(td.doc())) c++;
-            }
-          }
-
-          if (sort.equals("count") || sort.equals("true")) {
-            if (c>min) {
-              queue.add(new CountPair<String,Integer>(t.text(), c));
-              if (queue.size()>=maxsize) min=queue.last().val;
-            }
-          } else {
-            if (c >= mincount && --off<0) {
-              if (--lim<0) break;
-              res.add(ft.indexedToReadable(t.text()), c);
-            }
-          }
-        }
-      } while (te.next());
+    BytesRef startTermBytes = null;
+    if (prefix != null) {
+      String indexedPrefix = ft.toInternal(prefix);
+      startTermBytes = new BytesRef(indexedPrefix);
     }
 
-    if (sort.equals("count") || sort.equals("true")) {
-      for (CountPair<String,Integer> p : queue) {
+    Fields fields = MultiFields.getFields(r);
+    Terms terms = fields==null ? null : fields.terms(field);
+    TermsEnum termsEnum = null;
+
+    if (terms != null) {
+      termsEnum = terms.iterator();
+
+      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
+      // facet.offset when sorting by index order.
+
+      if (startTermBytes != null) {
+        if (termsEnum.seek(startTermBytes, true) == TermsEnum.SeekStatus.END) {
+          termsEnum = null;
+        }
+      } else {
+        // position termsEnum on first term
+        termsEnum.next();
+      }
+    }
+
+    Term template = new Term(field);
+    DocsEnum docsEnum = null;
+
+    if (termsEnum != null && docs.size() >= mincount) {
+      for (;;) {
+        BytesRef term = termsEnum.term();
+        if (term == null)
+          break;
+
+        if (startTermBytes != null && !term.startsWith(startTermBytes))
+          break;
+
+        int df = termsEnum.docFreq();
+
+        // If we are sorting, we can use df>min (rather than >=) since we
+        // are going in index order. For certain term distributions this can
+        // make a large difference (for example, many terms with df=1).
+        if (df>0 && df>min) {
+          int c;
+
+          if (df >= minDfFilterCache) {
+            // use the filter cache
+            // TODO: not a big deal, but there are prob more efficient ways to go from utf8 to string
+            // TODO: need a term query that takes a BytesRef
+            Term t = template.createTerm(term.utf8ToString());
+            c = searcher.numDocs(new TermQuery(t), docs);
+          } else {
+            // iterate over TermDocs to calculate the intersection
+
+            // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this?
+            // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
+            docsEnum = termsEnum.docs(null, docsEnum);
+
+            // this should be the same bulk result object if sharing of the docsEnum succeeded
+            DocsEnum.BulkReadResult bulk = docsEnum.getBulkResult();
+
+            c=0;
+            for (;;) {
+              int nDocs = docsEnum.read();
+              if (nDocs == 0) break;
+              int[] docArr = bulk.docs.ints;  // this might be movable outside the loop, but perhaps not worth the risk.
+              for (int i=0; i<nDocs; i++) {
+                if (docs.exists(docArr[i])) c++;
+              }
+            }
+          }
+
+          if (sortByCount) {
+            if (c>min) {
+              BytesRef termCopy = new BytesRef(term);
+              queue.add(new CountPair<BytesRef,Integer>(termCopy, c));
+              if (queue.size()>=maxsize) min=queue.last().val;
+            }
+          } else {
+            if (c >= mincount && --off<0) {
+              if (--lim<0) break;
+              String s = term.utf8ToString();
+              res.add(ft.indexedToReadable(s), c);
+            }
+          }
+        }
+
+        termsEnum.next();
+      }
+    }
+
+    if (sortByCount) {
+      for (CountPair<BytesRef,Integer> p : queue) {
         if (--off>=0) continue;
         if (--lim<0) break;
-        res.add(ft.indexedToReadable(p.key), p.val);
+        String s = p.key.utf8ToString();
+        res.add(ft.indexedToReadable(s), p.val);
       }
     }
@@ -566,9 +613,6 @@ public class SimpleFacets {
       res.add(null, getFieldMissingCount(searcher,docs,field));
     }
 
-    te.close();
-    td.close();
-
     return res;
   }
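
The replacement loop above is the standard flex enumeration shape: get a merged Fields view with MultiFields.getFields, seek the TermsEnum to the prefix bytes, then walk terms in byte order until a term no longer starts with the prefix. A minimal, self-contained sketch of that shape, using only calls that appear in this patch — the class and method names (PrefixTermCounts, docFreqByPrefix) are hypothetical, and raw docFreq() stands in for the DocSet intersection SimpleFacets performs:

    import java.io.IOException;
    import java.util.LinkedHashMap;
    import java.util.Map;
    import org.apache.lucene.index.Fields;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.MultiFields;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    public class PrefixTermCounts {
      /** Map each indexed term starting with 'prefix' to its raw docFreq. */
      public static Map<String,Integer> docFreqByPrefix(IndexReader r, String field, String prefix)
          throws IOException {
        Map<String,Integer> counts = new LinkedHashMap<String,Integer>();
        Fields fields = MultiFields.getFields(r);    // merged view over all segments
        Terms terms = fields == null ? null : fields.terms(field);
        if (terms == null) return counts;            // field has no indexed terms
        TermsEnum te = terms.iterator();
        BytesRef prefixBytes = new BytesRef(prefix);
        // seek to the first term >= prefix; END means nothing sorts at or after it
        if (te.seek(prefixBytes, true) == TermsEnum.SeekStatus.END) return counts;
        for (BytesRef t = te.term(); t != null; t = te.next()) {
          if (!t.startsWith(prefixBytes)) break;     // walked past the prefix range
          counts.put(t.utf8ToString(), te.docFreq());
        }
        return counts;
      }
    }

Reusing the BytesRef returned by the enum without copying is safe here only because each term is converted to a String before next() is called; the patch makes the same copy-before-queueing choice with termCopy when terms must outlive the iteration.
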
diff --git a/solr/src/java/org/apache/solr/search/SolrIndexReader.java b/solr/src/java/org/apache/solr/search/SolrIndexReader.java
index b59d1cf3db3..73c61c2ae2a 100755
--- a/solr/src/java/org/apache/solr/search/SolrIndexReader.java
+++ b/solr/src/java/org/apache/solr/search/SolrIndexReader.java
@@ -23,6 +23,8 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.LockObtainFailedException;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
 
 import java.io.IOException;
 import java.util.Collection;
@@ -222,6 +224,11 @@ public class SolrIndexReader extends FilterIndexReader {
     return in.directory();
   }
 
+  @Override
+  public Bits getDeletedDocs() throws IOException {
+    return in.getDeletedDocs();
+  }
+
   @Override
   public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
     return in.getTermFreqVectors(docNumber);
   }
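
getDeletedDocs() above is the flex replacement for per-document isDeleted() checks: it hands enumeration code a random-access Bits, with null meaning the reader has no deletions. A small sketch of that contract — LiveDocScan/countLive are hypothetical names, and the method is just numDocs() restated to show the null convention:

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.util.Bits;

    public class LiveDocScan {
      /** Count non-deleted documents; equivalent to reader.numDocs(). */
      public static int countLive(IndexReader reader) throws IOException {
        Bits delDocs = reader.getDeletedDocs();  // null means "no deletions"
        int live = 0;
        for (int doc = 0; doc < reader.maxDoc(); doc++) {
          if (delDocs == null || !delDocs.get(doc)) live++;
        }
        return live;
      }
    }
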
@@ -297,6 +304,11 @@ public class SolrIndexReader extends FilterIndexReader {
     return in.terms();
   }
 
+  @Override
+  public Fields fields() throws IOException {
+    return in.fields();
+  }
+
   @Override
   public TermEnum terms(Term t) throws IOException {
     return in.terms(t);
   }
@@ -308,6 +320,11 @@
     return in.docFreq(t);
   }
 
+  @Override
+  public int docFreq(String field, BytesRef t) throws IOException {
+    return in.docFreq(field, t);
+  }
+
   @Override
   public TermDocs termDocs() throws IOException {
     ensureOpen();
@@ -320,6 +337,21 @@
     return in.termDocs(term);
   }
 
+  @Override
+  public Terms terms(String field) throws IOException {
+    return in.terms(field);
+  }
+
+  @Override
+  public DocsEnum termDocsEnum(Bits skipDocs, String field, BytesRef term) throws IOException {
+    return in.termDocsEnum(skipDocs, field, term);
+  }
+
+  @Override
+  public DocsAndPositionsEnum termPositionsEnum(Bits skipDocs, String field, BytesRef term) throws IOException {
+    return in.termPositionsEnum(skipDocs, field, term);
+  }
+
   @Override
   public TermPositions termPositions() throws IOException {
     ensureOpen();
@@ -329,6 +361,7 @@
   @Override
   protected void doDelete(int n) throws CorruptIndexException, IOException { in.deleteDocument(n); }
 
+
   // Let FilterIndexReader handle commit()... we cannot override commit()
   // or call in.commit() ourselves.
   // protected void doCommit() throws IOException { in.commit(); }
@@ -363,6 +396,11 @@
     return subReaders;
   }
 
+  @Override
+  public int getSubReaderDocBase(IndexReader subReader) {
+    return in.getSubReaderDocBase(subReader);
+  }
+
   @Override
   public int hashCode() {
     return in.hashCode();
   }
@@ -405,7 +443,7 @@
 
   @Override
   public long getUniqueTermCount() throws IOException {
-    return super.getUniqueTermCount();
+    return in.getUniqueTermCount();
   }
 
   @Override
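
The getSubReaderDocBase() delegation added above pairs with the per-segment TODO left in SimpleFacets: enumerate postings one sub-reader at a time, then rebase each segment-relative docid by that sub-reader's doc base. A sketch of that pattern under the API snapshot in this patch — PerSegmentLookup/firstGlobalDoc are hypothetical names, and only methods shown or forwarded in this diff are used:

    import java.io.IOException;
    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.util.BytesRef;

    public class PerSegmentLookup {
      /** Top-level docid of the first live document containing the term, or -1. */
      public static int firstGlobalDoc(IndexReader top, String field, BytesRef term)
          throws IOException {
        IndexReader[] subs = top.getSequentialSubReaders();
        if (subs == null) {
          // atomic reader: a single "segment" whose doc base is 0
          DocsEnum docs = top.termDocsEnum(top.getDeletedDocs(), field, term);
          if (docs == null) return -1;
          int doc = docs.nextDoc();
          return doc == DocsEnum.NO_MORE_DOCS ? -1 : doc;
        }
        for (IndexReader sub : subs) {
          // per-segment enumeration, skipping this segment's deleted docs
          DocsEnum docs = sub.termDocsEnum(sub.getDeletedDocs(), field, term);
          if (docs == null) continue;  // term absent from this segment
          int doc = docs.nextDoc();
          if (doc != DocsEnum.NO_MORE_DOCS) {
            // rebase the segment-relative docid into the top-level docid space
            return top.getSubReaderDocBase(sub) + doc;
          }
        }
        return -1;
      }
    }
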