Fix for SOLR-3132

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1243774 13f79535-47bb-0310-9956-ffa450edef68
Erick Erickson 2012-02-14 03:09:02 +00:00
parent 26de35a65c
commit 6bb4ea097f
5 changed files with 164 additions and 151 deletions

View File

@@ -738,7 +738,7 @@ public class CoreAdminHandler extends RequestHandlerBase {
info.add("uptime", System.currentTimeMillis() - core.getStartTime());
RefCounted<SolrIndexSearcher> searcher = core.getSearcher();
try {
SimpleOrderedMap<Object> indexInfo = LukeRequestHandler.getIndexInfo(searcher.get().getIndexReader(), false);
SimpleOrderedMap<Object> indexInfo = LukeRequestHandler.getIndexInfo(searcher.get().getIndexReader());
long size = getIndexSize(core);
indexInfo.add("sizeInBytes", size);
indexInfo.add("size", NumberUtils.readableSize(size));

View File

@@ -82,25 +82,25 @@ public class LukeRequestHandler extends RequestHandlerBase
public static final String DOC_ID = "docId";
public static final String ID = "id";
public static final int DEFAULT_COUNT = 10;
static final int HIST_ARRAY_SIZE = 33;
private static enum ShowStyle {
ALL,
DOC,
SCHEMA,
INDEX;
public static ShowStyle get(String v) {
if(v==null) return null;
if("schema".equals(v)) return SCHEMA;
if("index".equals(v)) return INDEX;
if("doc".equals(v)) return DOC;
if("all".equals(v)) return ALL;
if("schema".equalsIgnoreCase(v)) return SCHEMA;
if("index".equalsIgnoreCase(v)) return INDEX;
if("doc".equalsIgnoreCase(v)) return DOC;
if("all".equalsIgnoreCase(v)) return ALL;
throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown Show Style: "+v);
}
};
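
Note: the only functional change in this enum is equals -> equalsIgnoreCase, so the show parameter is now matched case-insensitively. Illustrative behavior:

    ShowStyle.get("doc");    // DOC (worked before and after this change)
    ShowStyle.get("INDEX");  // INDEX (previously threw BAD_REQUEST)
    ShowStyle.get(null);     // null (parameter not supplied)
    ShowStyle.get("nope");   // throws SolrException: "Unknown Show Style: nope"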
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
@@ -109,29 +109,17 @@ public class LukeRequestHandler extends RequestHandlerBase
SolrIndexSearcher searcher = req.getSearcher();
DirectoryReader reader = searcher.getIndexReader();
SolrParams params = req.getParams();
int numTerms = params.getInt( NUMTERMS, DEFAULT_COUNT );
ShowStyle style = ShowStyle.get(params.get("show"));
// Always show the core lucene info
Map<String, TopTermQueue> topTerms = new TreeMap<String, TopTermQueue>();
// If no doc is given, show all fields and top terms
Set<String> fields = null;
String fl = params.get(CommonParams.FL);
if (fl != null) {
fields = new TreeSet<String>(Arrays.asList(fl.split( "[,\\s]+" )));
}
if( ShowStyle.SCHEMA == style ) {
numTerms = 0; // Abort any statistics gathering.
rsp.add("index", getIndexInfo(reader));
if(ShowStyle.INDEX==style) {
return; // that's all we need
}
rsp.add("index", getIndexInfo(reader, numTerms, topTerms, fields ));
if(ShowStyle.INDEX==style) {
return; // that's all we need
}
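
Note: with this rewrite, show=index returns only the top-level index block, and term statistics are no longer gathered up front for every field. Illustrative requests (handler path as registered in the tests below; host and core prefix omitted):

    /admin/luke?show=index          -> index section only, returns early
    /admin/luke?show=schema         -> schema section; per-field statistics are skipped
    /admin/luke?fl=solr_t,solr_s    -> per-field info restricted to the fl list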
Integer docId = params.getInt( DOC_ID );
if( docId == null && params.get( ID ) != null ) {
// Look for something with a given solr ID
@@ -170,7 +158,7 @@ public class LukeRequestHandler extends RequestHandlerBase
rsp.add( "schema", getSchemaInfo( req.getSchema() ) );
}
else {
rsp.add( "fields", getIndexedFieldsInfo( searcher, fields, numTerms, topTerms) ) ;
rsp.add( "fields", getIndexedFieldsInfo(req) ) ;
}
// Add some generally helpful information
@@ -255,7 +243,8 @@ public class LukeRequestHandler extends RequestHandlerBase
return key;
}
private static SimpleOrderedMap<Object> getDocumentFieldsInfo( Document doc, int docId, IndexReader reader, IndexSchema schema ) throws IOException
private static SimpleOrderedMap<Object> getDocumentFieldsInfo( Document doc, int docId, IndexReader reader,
IndexSchema schema ) throws IOException
{
final CharsRef spare = new CharsRef();
SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
@@ -311,13 +300,22 @@ public class LukeRequestHandler extends RequestHandlerBase
}
@SuppressWarnings("unchecked")
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(
final SolrIndexSearcher searcher, final Set<String> fields, final int numTerms, Map<String,TopTermQueue> ttinfo)
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(SolrQueryRequest req)
throws Exception {
SolrIndexSearcher searcher = req.getSearcher();
SolrParams params = req.getParams();
Set<String> fields = null;
String fl = params.get(CommonParams.FL);
if (fl != null) {
fields = new TreeSet<String>(Arrays.asList(fl.split( "[,\\s]+" )));
}
AtomicReader reader = searcher.getAtomicReader();
IndexSchema schema = searcher.getSchema();
// Don't be tempted to put this in the loop below; the whole point here is to alphabetize the fields!
Set<String> fieldNames = new TreeSet<String>();
for(FieldInfo fieldInfo : reader.getFieldInfos()) {
fieldNames.add(fieldInfo.name);
@@ -325,82 +323,90 @@ public class LukeRequestHandler extends RequestHandlerBase
// Walk the term enum and keep a priority queue for each map in our set
SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
Fields theFields = reader.fields();
for (String fieldName : fieldNames) {
if (fields != null && ! fields.contains(fieldName) && ! fields.contains("*")) {
continue; // we're not interested in this term
continue; // we're not interested in this field. Still an issue here?
}
SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();
SimpleOrderedMap<Object> fieldMap = new SimpleOrderedMap<Object>();
SchemaField sfield = schema.getFieldOrNull( fieldName );
FieldType ftype = (sfield==null)?null:sfield.getType();
f.add( "type", (ftype==null)?null:ftype.getTypeName() );
f.add( "schema", getFieldFlags( sfield ) );
fieldMap.add( "type", (ftype==null)?null:ftype.getTypeName() );
fieldMap.add("schema", getFieldFlags(sfield));
if (sfield != null && schema.isDynamicField(sfield.getName()) && schema.getDynamicPattern(sfield.getName()) != null) {
f.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
fieldMap.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
}
Terms terms = theFields.terms(fieldName);
Terms terms = reader.fields().terms(fieldName);
if (terms == null) { // Not indexed, so we need to report what we can (it made it through the fl param if specified)
finfo.add( fieldName, f );
finfo.add( fieldName, fieldMap );
continue;
}
TopTermQueue topTerms = ttinfo.get( fieldName );
// If numTerms==0, the call is just asking for a quick field list
if( ttinfo != null && sfield != null && sfield.indexed() ) {
if (numTerms > 0) { // Read the actual field from the index and report that too.
Document doc = null;
if (topTerms != null && topTerms.getTopTermInfo() != null) {
Term term = topTerms.getTopTermInfo().term;
DocsEnum docsEnum = reader.termDocsEnum(reader.getLiveDocs(),
term.field(),
new BytesRef(term.text()),
false);
if (docsEnum != null) {
int docId;
if ((docId = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
doc = reader.document(docId);
}
if(sfield != null && sfield.indexed() ) {
// In the pre-4.0 days, this did a veeeery expensive range query. But we can be much faster now,
// so just do this all the time.
Document doc = getFirstLiveDoc(reader, fieldName, terms);
if( doc != null ) {
// Found a document with this field
try {
IndexableField fld = doc.getField( fieldName );
if( fld != null ) {
fieldMap.add("index", getFieldFlags(fld));
}
else {
// it is a non-stored field...
fieldMap.add("index", "(unstored field)");
}
}
if( doc != null ) {
// Found a document with this field
try {
IndexableField fld = doc.getField( fieldName );
if( fld != null ) {
f.add( "index", getFieldFlags( fld ) );
}
else {
// it is a non-stored field...
f.add( "index", "(unstored field)" );
}
}
catch( Exception ex ) {
log.warn( "error reading field: "+fieldName );
}
f.add("docs", terms.getDocCount());
}
if( topTerms != null ) {
f.add( "distinct", topTerms.distinctTerms );
fieldMap.add("docs", terms.getDocCount());
// Include top terms
f.add( "topTerms", topTerms.toNamedList( searcher.getSchema() ) );
// Add a histogram
f.add( "histogram", topTerms.histogram.toNamedList() );
}
}
if (fields != null && (fields.contains(fieldName) || fields.contains("*"))) {
getDetailedFieldInfo(req, fieldName, fieldMap);
}
// Add the field
finfo.add( fieldName, f );
finfo.add( fieldName, fieldMap );
}
return finfo;
}
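
Note: the fl handling that used to live in handleRequestBody now sits at the top of this method. A small sketch of the [,\s]+ split; the field names are invented:

    String[] parts = "id,name title".split("[,\\s]+"); // -> {"id", "name", "title"}
    Set<String> fields = new TreeSet<String>(Arrays.asList(parts));
    fields.contains("name");  // true: "name" passes the filter above
    // a field that passes the fl filter (by name or via a literal "*")
    // also gets the expensive getDetailedFieldInfo() call shown above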
// Just get a document with the term in it, the first one will do!
// Is there a better way to do this? Shouldn't actually be very costly
// to do it this way.
private static Document getFirstLiveDoc(AtomicReader reader, String fieldName, Terms terms) throws IOException {
DocsEnum docsEnum = null;
TermsEnum termsEnum = terms.iterator(null);
BytesRef text;
// Deal with the chance that the first bunch of terms are in deleted documents. Is there a better way?
for (int idx = 0; idx < 1000 && docsEnum == null; ++idx) {
text = termsEnum.next();
if (text == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
return null;
}
Term term = new Term(fieldName, text);
docsEnum = reader.termDocsEnum(reader.getLiveDocs(),
term.field(),
new BytesRef(term.text()),
false);
if (docsEnum != null) {
int docId;
if ((docId = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
return reader.document(docId);
}
}
}
return null;
}
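
Note: a hypothetical caller of the helper above, mirroring its use in getIndexedFieldsInfo(); "title" is a made-up field name:

    Terms terms = reader.fields().terms("title");
    if (terms != null) {
      Document sample = getFirstLiveDoc(reader, "title", terms);
      // null here means the first 1000 terms matched only deleted documents
    }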
/**
* Return info from the index
*/
@@ -525,67 +531,24 @@ public class LukeRequestHandler extends RequestHandlerBase
v.add( f.getName() );
typeusemap.put( ft.getTypeName(), v );
}
public static SimpleOrderedMap<Object> getIndexInfo(DirectoryReader reader, boolean countTerms) throws IOException {
return getIndexInfo(reader, countTerms ? 1 : 0, null, null);
/**
* @deprecated use {@link #getIndexInfo(DirectoryReader)} since you now have to explicitly pass the "fl" parameter
* and this was always called with "false" anyway from CoreAdminHandler
*/
public static SimpleOrderedMap<Object> getIndexInfo(DirectoryReader reader, boolean detail) throws IOException {
return getIndexInfo(reader);
}
public static SimpleOrderedMap<Object> getIndexInfo( DirectoryReader reader, int numTerms,
Map<String, TopTermQueue> topTerms,
Set<String> fieldList) throws IOException {
// This method just gets the top-most level of information. This was conflated with getting detailed info
// for *all* the fields, called from CoreAdminHandler etc.
public static SimpleOrderedMap<Object> getIndexInfo(DirectoryReader reader) throws IOException {
Directory dir = reader.directory();
SimpleOrderedMap<Object> indexInfo = new SimpleOrderedMap<Object>();
indexInfo.add("numDocs", reader.numDocs());
indexInfo.add("maxDoc", reader.maxDoc());
final CharsRef spare = new CharsRef();
if( numTerms > 0 ) {
Fields fields = MultiFields.getFields(reader);
long totalTerms = 0;
if (fields != null) {
FieldsEnum fieldsEnum = fields.iterator();
String field;
while ((field = fieldsEnum.next()) != null) {
Terms terms = fieldsEnum.terms();
if (terms == null) {
continue;
}
totalTerms += terms.getUniqueTermCount();
if (fieldList != null && ! fieldList.contains(field) && ! fieldList.contains("*")) {
continue;
}
TermsEnum termsEnum = terms.iterator(null);
BytesRef text;
int[] buckets = new int[HIST_ARRAY_SIZE];
TopTermQueue tiq = topTerms.get(field);
if (tiq == null) {
tiq = new TopTermQueue(numTerms + 1); // Allocating slots for the top N terms to collect freqs.
topTerms.put(field, tiq);
}
while ((text = termsEnum.next()) != null) {
int freq = termsEnum.docFreq(); // This calculation seems odd, but it gives the same results as it used to.
int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
buckets[slot] = buckets[slot] + 1;
if (freq > tiq.minFreq) {
UnicodeUtil.UTF8toUTF16(text, spare);
String t = spare.toString();
tiq.distinctTerms = new Long(fieldsEnum.terms().getUniqueTermCount()).intValue();
tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
if (tiq.size() > numTerms) { // if tiq full
tiq.pop(); // remove lowest in tiq
tiq.minFreq = tiq.getTopTermInfo().docFreq;
}
}
}
tiq.histogram.add(buckets);
}
}
//Clumsy, but I'm tired.
indexInfo.add("numTerms", (new Long(totalTerms)).intValue());
}
indexInfo.add("version", reader.getVersion()); // TODO? Is this different then: IndexReader.getCurrentVersion( dir )?
indexInfo.add("segmentCount", reader.getSequentialSubReaders().length);
indexInfo.add("current", reader.isCurrent() );
@@ -598,6 +561,57 @@ public class LukeRequestHandler extends RequestHandlerBase
}
return indexInfo;
}
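
Note: using only the keys visible in this hunk (sizeInBytes/size are added separately by CoreAdminHandler), the index block of a response looks roughly like this; all values are invented:

    "index" : {
      "numDocs" : 2,
      "maxDoc" : 2,
      "version" : 1329188942,
      "segmentCount" : 1,
      "current" : true
    }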
// Get terribly detailed information about a particular field. This is a very expensive call, use it with caution
// especially on large indexes!
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
throws IOException {
SolrParams params = req.getParams();
int numTerms = params.getInt( NUMTERMS, DEFAULT_COUNT );
TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to collect the top N terms in.
final CharsRef spare = new CharsRef();
Fields fields = MultiFields.getFields(req.getSearcher().getIndexReader());
if (fields == null) { // No indexed fields
return;
}
Terms terms = fields.terms(field);
if (terms == null) { // No terms in the field.
return;
}
TermsEnum termsEnum = terms.iterator(null);
BytesRef text;
int[] buckets = new int[HIST_ARRAY_SIZE];
while ((text = termsEnum.next()) != null) {
int freq = termsEnum.docFreq(); // This calculation seems odd, but it gives the same results as it used to.
int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
buckets[slot] = buckets[slot] + 1;
if (freq > tiq.minFreq) {
UnicodeUtil.UTF8toUTF16(text, spare);
String t = spare.toString();
tiq.distinctTerms = new Long(terms.getUniqueTermCount()).intValue();
tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
if (tiq.size() > numTerms) { // if tiq full
tiq.pop(); // remove lowest in tiq
tiq.minFreq = tiq.getTopTermInfo().docFreq;
}
}
}
tiq.histogram.add(buckets);
fieldMap.add("distinct", tiq.distinctTerms);
// Include top terms
fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));
// Add a histogram
fieldMap.add("histogram", tiq.histogram.toNamedList());
}
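
Note: the histogram slot works out to ceil(log2(docFreq)); a worked example of the computation above, with arbitrary frequencies:

    // slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1))
    // freq = 1  -> max(0,0)  = 0,  nlz = 32, slot = 0
    // freq = 2  -> max(0,1)  = 1,  nlz = 31, slot = 1
    // freq = 16 -> max(0,15) = 15, nlz = 28, slot = 4
    // freq = 17 -> max(0,16) = 16, nlz = 27, slot = 5
    // so bucket N counts terms with docFreq in (2^(N-1), 2^N], and 33
    // buckets (HIST_ARRAY_SIZE) comfortably cover any int frequency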
//////////////////////// SolrInfoMBeans methods //////////////////////
@Override

View File

@@ -82,7 +82,6 @@ public class MinimalSchemaTest extends SolrTestCaseJ4 {
assertQ("basic luke request failed",
req("qt", "/admin/luke")
,"//int[@name='numDocs'][.='2']"
,"//int[@name='numTerms'][.='5']"
);
assertQ("luke show schema failed",

View File

@@ -108,35 +108,35 @@ public class LukeRequestHandlerTest extends AbstractSolrTestCase {
final int numFlags = EnumSet.allOf(FieldFlag.class).size();
assertQ("Not all flags ("+numFlags+") mentioned in info->key",
req("qt","/admin/luke"),
numFlags+"=count(//lst[@name='info']/lst[@name='key']/str)");
req("qt","/admin/luke"),
numFlags+"=count(//lst[@name='info']/lst[@name='key']/str)");
// code should be the same for all fields, but just in case do several
for (String f : Arrays.asList("solr_t","solr_s","solr_ti",
"solr_td","solr_pl","solr_dt","solr_b",
"solr_sS","solr_sI")) {
"solr_td","solr_pl","solr_dt","solr_b",
"solr_sS","solr_sI")) {
final String xp = getFieldXPathPrefix(f);
assertQ("Not as many schema flags as expected ("+numFlags+") for " + f,
req("qt","/admin/luke", "fl", f),
numFlags+"=string-length("+xp+"[@name='schema'])");
req("qt","/admin/luke", "fl", f),
numFlags+"=string-length("+xp+"[@name='schema'])");
}
// diff loop for checking 'index' flags,
// only valid for fields that are indexed & stored
for (String f : Arrays.asList("solr_t","solr_s","solr_ti",
"solr_td","solr_pl","solr_dt","solr_b")) {
"solr_td","solr_pl","solr_dt","solr_b")) {
final String xp = getFieldXPathPrefix(f);
assertQ("Not as many index flags as expected ("+numFlags+") for " + f,
req("qt","/admin/luke", "fl", f),
numFlags+"=string-length("+xp+"[@name='index'])");
req("qt","/admin/luke", "fl", f),
numFlags+"=string-length("+xp+"[@name='index'])");
final String hxp = getFieldXPathHistogram(f);
assertQ("Histogram field should be present for field "+f,
req("qt", "/admin/luke", "fl", f),
hxp+"[@name='histogram']");
}
}
@@ -149,7 +149,7 @@ public class LukeRequestHandlerTest extends AbstractSolrTestCase {
@Test
public void testFlParam() {
SolrQueryRequest req = req("qt", "/admin/luke", "fl", "solr_t solr_s");
SolrQueryRequest req = req("qt", "/admin/luke", "fl", "solr_t solr_s", "show", "all");
try {
// First, determine that the two fields ARE there
String response = h.query(req);

View File

@@ -114,7 +114,7 @@
//further populates the loaded schema with information gathered
// from the no argument LukeRequestHandler
loadFromLukeHandler: function(func) {
$.getJSON(solr.pathToLukeHandler+'?wt=json', function(data) {
$.getJSON(solr.pathToLukeHandler+'?wt=json&fl=*', function(data) {
$.each(data.fields, function(i, item) {
var field = solr.schemaFields[i];
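
Note: for the $.each loop above, each entry of data.fields carries the keys built by getIndexedFieldsInfo(); a schematic entry, assuming fl=* so the detailed keys are present:

    "fields" : {
      "<fieldName>" : {
        "type" : ...,       // ftype.getTypeName()
        "schema" : ...,     // getFieldFlags(sfield)
        "index" : ...,      // getFieldFlags(fld) or "(unstored field)"
        "docs" : ...,       // terms.getDocCount()
        "distinct" : ...,   // from getDetailedFieldInfo(), needs an fl match
        "topTerms" : ...,
        "histogram" : ...
      }
    }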