SOLR-1900: override new methods in SolrIndexReader, convert facet.method=enum to flex API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940604 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2010-05-03 20:33:16 +00:00
parent 89c24fbe37
commit 1a9fab6982
2 changed files with 141 additions and 59 deletions

View File

@ -17,12 +17,11 @@
package org.apache.solr.request;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.RequiredSolrParams;
@ -498,67 +497,115 @@ public class SimpleFacets {
IndexReader r = searcher.getReader();
FieldType ft = schema.getFieldType(field);
boolean sortByCount = sort.equals("count") || sort.equals("true");
final int maxsize = limit>=0 ? offset+limit : Integer.MAX_VALUE-1;
final BoundedTreeSet<CountPair<String,Integer>> queue = (sort.equals("count") || sort.equals("true")) ? new BoundedTreeSet<CountPair<String,Integer>>(maxsize) : null;
final BoundedTreeSet<CountPair<BytesRef,Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef,Integer>>(maxsize) : null;
final NamedList res = new NamedList();
int min=mincount-1; // the smallest value in the top 'N' values
int off=offset;
int lim=limit>=0 ? limit : Integer.MAX_VALUE;
String startTerm = prefix==null ? "" : ft.toInternal(prefix);
TermEnum te = r.terms(new Term(field,startTerm));
TermDocs td = r.termDocs();
if (docs.size() >= mincount) {
do {
Term t = te.term();
if (null == t || ! t.field().equals(field))
break;
if (prefix!=null && !t.text().startsWith(prefix)) break;
int df = te.docFreq();
// If we are sorting, we can use df>min (rather than >=) since we
// are going in index order. For certain term distributions this can
// make a large difference (for example, many terms with df=1).
if (df>0 && df>min) {
int c;
if (df >= minDfFilterCache) {
// use the filter cache
c = searcher.numDocs(new TermQuery(t), docs);
} else {
// iterate over TermDocs to calculate the intersection
td.seek(te);
c=0;
while (td.next()) {
if (docs.exists(td.doc())) c++;
}
}
if (sort.equals("count") || sort.equals("true")) {
if (c>min) {
queue.add(new CountPair<String,Integer>(t.text(), c));
if (queue.size()>=maxsize) min=queue.last().val;
}
} else {
if (c >= mincount && --off<0) {
if (--lim<0) break;
res.add(ft.indexedToReadable(t.text()), c);
}
}
}
} while (te.next());
BytesRef startTermBytes = null;
if (prefix != null) {
String indexedPrefix = ft.toInternal(prefix);
startTermBytes = new BytesRef(indexedPrefix);
}
if (sort.equals("count") || sort.equals("true")) {
for (CountPair<String,Integer> p : queue) {
Fields fields = MultiFields.getFields(r);
Terms terms = fields==null ? null : fields.terms(field);
TermsEnum termsEnum = null;
if (terms != null) {
termsEnum = terms.iterator();
// TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
// facet.offset when sorting by index order.
if (startTermBytes != null) {
if (termsEnum.seek(startTermBytes, true) == TermsEnum.SeekStatus.END) {
termsEnum = null;
}
} else {
// position termsEnum on first term
termsEnum.next();
}
}
Term template = new Term(field);
DocsEnum docsEnum = null;
if (termsEnum != null && docs.size() >= mincount) {
for(;;) {
BytesRef term = termsEnum.term();
if (term == null)
break;
if (startTermBytes != null && !term.startsWith(startTermBytes))
break;
int df = termsEnum.docFreq();
// If we are sorting, we can use df>min (rather than >=) since we
// are going in index order. For certain term distributions this can
// make a large difference (for example, many terms with df=1).
if (df>0 && df>min) {
int c;
if (df >= minDfFilterCache) {
// use the filter cache
// TODO: not a big deal, but there are prob more efficient ways to go from utf8 to string
// TODO: need a term query that takes a BytesRef
Term t = template.createTerm(new String(term.utf8ToString()));
c = searcher.numDocs(new TermQuery(t), docs);
} else {
// iterate over TermDocs to calculate the intersection
// TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this?
// TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
docsEnum = termsEnum.docs(null, docsEnum);
// this should be the same bulk result object if sharing of the docsEnum succeeded
DocsEnum.BulkReadResult bulk = docsEnum.getBulkResult();
c=0;
for (;;) {
int nDocs = docsEnum.read();
if (nDocs == 0) break;
int[] docArr = bulk.docs.ints; // this might be movable outside the loop, but perhaps not worth the risk.
for (int i=0; i<nDocs; i++) {
if (docs.exists(docArr[i])) c++;
}
}
}
if (sortByCount) {
if (c>min) {
BytesRef termCopy = new BytesRef(term);
queue.add(new CountPair<BytesRef,Integer>(termCopy, c));
if (queue.size()>=maxsize) min=queue.last().val;
}
} else {
if (c >= mincount && --off<0) {
if (--lim<0) break;
BytesRef termCopy = new BytesRef(term);
String s = term.utf8ToString();
res.add(ft.indexedToReadable(s), c);
}
}
}
termsEnum.next();
}
}
if (sortByCount) {
for (CountPair<BytesRef,Integer> p : queue) {
if (--off>=0) continue;
if (--lim<0) break;
res.add(ft.indexedToReadable(p.key), p.val);
String s = p.key.utf8ToString();
res.add(ft.indexedToReadable(s), p.val);
}
}
@ -566,9 +613,6 @@ public class SimpleFacets {
res.add(null, getFieldMissingCount(searcher,docs,field));
}
te.close();
td.close();
return res;
}

View File

@ -23,6 +23,8 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.Collection;
@ -222,6 +224,11 @@ public class SolrIndexReader extends FilterIndexReader {
return in.directory();
}
/**
 * Exposes the wrapped reader's deleted-docs {@link Bits} (flex API)
 * by straight delegation to {@code in}.
 */
@Override
public Bits getDeletedDocs() throws IOException {
return in.getDeletedDocs();
}
@Override
public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
return in.getTermFreqVectors(docNumber);
@ -297,6 +304,11 @@ public class SolrIndexReader extends FilterIndexReader {
return in.terms();
}
/**
 * Delegates the flex-API {@link Fields} view of this reader's postings
 * to the wrapped reader {@code in}.
 */
@Override
public Fields fields() throws IOException {
return in.fields();
}
@Override
public TermEnum terms(Term t) throws IOException {
return in.terms(t);
@ -308,6 +320,11 @@ public class SolrIndexReader extends FilterIndexReader {
return in.docFreq(t);
}
/**
 * Flex-API document frequency: returns how many documents contain
 * {@code t} in {@code field}, delegating to the wrapped reader.
 */
@Override
public int docFreq(String field, BytesRef t) throws IOException {
return in.docFreq(field, t);
}
@Override
public TermDocs termDocs() throws IOException {
ensureOpen();
@ -320,6 +337,21 @@ public class SolrIndexReader extends FilterIndexReader {
return in.termDocs(term);
}
/**
 * Delegates the flex-API per-field {@link Terms} lookup to the wrapped
 * reader {@code in}.
 */
@Override
public Terms terms(String field) throws IOException {
return in.terms(field);
}
/**
 * Flex-API postings access: returns a {@link DocsEnum} for the given
 * field/term, honoring {@code skipDocs}; delegates to the wrapped reader.
 */
@Override
public DocsEnum termDocsEnum(Bits skipDocs, String field, BytesRef term) throws IOException {
return in.termDocsEnum(skipDocs, field, term);
}
/**
 * Flex-API positional postings access: returns a
 * {@link DocsAndPositionsEnum} for the given field/term, honoring
 * {@code skipDocs}; delegates to the wrapped reader.
 */
@Override
public DocsAndPositionsEnum termPositionsEnum(Bits skipDocs, String field, BytesRef term) throws IOException {
return in.termPositionsEnum(skipDocs, field, term);
}
@Override
public TermPositions termPositions() throws IOException {
ensureOpen();
@ -329,6 +361,7 @@ public class SolrIndexReader extends FilterIndexReader {
// Routes the protected delete hook to the wrapped reader's PUBLIC
// deleteDocument(n) (not its protected doDelete), so the inner reader
// runs its own full delete path.
@Override
protected void doDelete(int n) throws CorruptIndexException, IOException { in.deleteDocument(n); }
// Let FilterIndexReader handle commit()... we cannot override commit()
// or call in.commit() ourselves.
// protected void doCommit() throws IOException { in.commit(); }
@ -363,6 +396,11 @@ public class SolrIndexReader extends FilterIndexReader {
return subReaders;
}
/**
 * Returns the docid base offset of {@code subReader} within the wrapped
 * composite reader, by delegation to {@code in}.
 */
@Override
public int getSubReaderDocBase(IndexReader subReader) {
return in.getSubReaderDocBase(subReader);
}
@Override
public int hashCode() {
return in.hashCode();
@ -405,7 +443,7 @@ public class SolrIndexReader extends FilterIndexReader {
@Override
public long getUniqueTermCount() throws IOException {
return super.getUniqueTermCount();
return in.getUniqueTermCount();
}
@Override