mirror of https://github.com/apache/lucene.git

SOLR-13512: Raw index data analysis tool (extension of COLSTATUS collection command).

commit 010466ec04 (parent 67104dd615)
@@ -110,6 +110,8 @@ New Features

* SOLR-13434: OpenTracing support for Solr (Cao Manh Dat)

* SOLR-13512: Raw index data analysis tool (extension of COLSTATUS collection command). (ab)

Bug Fixes
----------------------
@@ -57,9 +57,13 @@ public class ColStatus {
  private final ZkNodeProps props;
  private final SolrClientCache solrClientCache;

  public static final String CORE_INFO_PROP = SegmentsInfoRequestHandler.WITH_CORE_INFO;
  public static final String FIELD_INFO_PROP = SegmentsInfoRequestHandler.WITH_FIELD_INFO;
  public static final String SIZE_INFO_PROP = SegmentsInfoRequestHandler.WITH_SIZE_INFO;
  public static final String CORE_INFO_PROP = SegmentsInfoRequestHandler.CORE_INFO_PARAM;
  public static final String FIELD_INFO_PROP = SegmentsInfoRequestHandler.FIELD_INFO_PARAM;
  public static final String SIZE_INFO_PROP = SegmentsInfoRequestHandler.SIZE_INFO_PARAM;
  public static final String RAW_SIZE_PROP = SegmentsInfoRequestHandler.RAW_SIZE_PARAM;
  public static final String RAW_SIZE_SUMMARY_PROP = SegmentsInfoRequestHandler.RAW_SIZE_SUMMARY_PARAM;
  public static final String RAW_SIZE_DETAILS_PROP = SegmentsInfoRequestHandler.RAW_SIZE_DETAILS_PARAM;
  public static final String RAW_SIZE_SAMPLING_PERCENT_PROP = SegmentsInfoRequestHandler.RAW_SIZE_SAMPLING_PERCENT_PARAM;
  public static final String SEGMENTS_PROP = "segments";

  public ColStatus(HttpClient httpClient, ClusterState clusterState, ZkNodeProps props) {

@@ -80,6 +84,14 @@ public class ColStatus {
    boolean withSegments = props.getBool(SEGMENTS_PROP, false);
    boolean withCoreInfo = props.getBool(CORE_INFO_PROP, false);
    boolean withSizeInfo = props.getBool(SIZE_INFO_PROP, false);
    boolean withRawSizeInfo = props.getBool(RAW_SIZE_PROP, false);
    boolean withRawSizeSummary = props.getBool(RAW_SIZE_SUMMARY_PROP, false);
    boolean withRawSizeDetails = props.getBool(RAW_SIZE_DETAILS_PROP, false);
    Object samplingPercentVal = props.get(RAW_SIZE_SAMPLING_PERCENT_PROP);
    Float samplingPercent = samplingPercentVal != null ? Float.parseFloat(String.valueOf(samplingPercentVal)) : null;
    if (withRawSizeSummary || withRawSizeDetails) {
      withRawSizeInfo = true;
    }
    if (withFieldInfo || withSizeInfo) {
      withSegments = true;
    }

@@ -159,6 +171,12 @@ public class ColStatus {
        params.add(FIELD_INFO_PROP, "true");
        params.add(CORE_INFO_PROP, String.valueOf(withCoreInfo));
        params.add(SIZE_INFO_PROP, String.valueOf(withSizeInfo));
        params.add(RAW_SIZE_PROP, String.valueOf(withRawSizeInfo));
        params.add(RAW_SIZE_SUMMARY_PROP, String.valueOf(withRawSizeSummary));
        params.add(RAW_SIZE_DETAILS_PROP, String.valueOf(withRawSizeDetails));
        if (samplingPercent != null) {
          params.add(RAW_SIZE_SAMPLING_PERCENT_PROP, String.valueOf(samplingPercent));
        }
        QueryRequest req = new QueryRequest(params);
        NamedList<Object> rsp = client.request(req);
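        // drop the per-request response header; only the segments payload is merged into the collection status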
        rsp.remove("responseHeader");

@@ -530,6 +530,10 @@ public class CollectionsHandler extends RequestHandlerBase implements PermissionNameProvider {
        ColStatus.CORE_INFO_PROP,
        ColStatus.SEGMENTS_PROP,
        ColStatus.FIELD_INFO_PROP,
        ColStatus.RAW_SIZE_PROP,
        ColStatus.RAW_SIZE_SUMMARY_PROP,
        ColStatus.RAW_SIZE_DETAILS_PROP,
        ColStatus.RAW_SIZE_SAMPLING_PERCENT_PROP,
        ColStatus.SIZE_INFO_PROP);
    // make sure we can get the name if there's "name" but not "collection"
    if (props.containsKey(CoreAdminParams.NAME) && !props.containsKey(COLLECTION_PROP)) {

@@ -0,0 +1,711 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.admin;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;

import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StandardDirectoryReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SuppressForbidden;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.MapWriter;
import org.apache.solr.common.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Estimates the raw size of all uncompressed indexed data by scanning term, docValues and
 * stored fields data. This utility also provides detailed statistics about term, docValues,
 * postings and stored fields distributions.
 */
public class IndexSizeEstimator {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public static final String TERMS = "terms";
  public static final String STORED_FIELDS = "storedFields";
  public static final String NORMS = "norms";
  public static final String DOC_VALUES = "docValues";
  public static final String POINTS = "points";
  public static final String TERM_VECTORS = "termVectors";
  public static final String SUMMARY = "summary";
  public static final String DETAILS = "details";
  public static final String FIELDS_BY_SIZE = "fieldsBySize";
  public static final String TYPES_BY_SIZE = "typesBySize";

  public static final int DEFAULT_SAMPLING_THRESHOLD = 100_000;
  public static final float DEFAULT_SAMPLING_PERCENT = 5.0f;

  private final IndexReader reader;
  private final int topN;
  private final int maxLength;
  private final boolean withSummary;
  private final boolean withDetails;
  private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
  private float samplingPercent = DEFAULT_SAMPLING_PERCENT;
  private int samplingStep = 1;

  public static final class Estimate implements MapWriter {
    private final Map<String, Long> fieldsBySize;
    private final Map<String, Long> typesBySize;
    private final Map<String, Object> summary;
    private final Map<String, Object> details;

    public Estimate(Map<String, Long> fieldsBySize, Map<String, Long> typesBySize, Map<String, Object> summary, Map<String, Object> details) {
      Objects.requireNonNull(fieldsBySize);
      Objects.requireNonNull(typesBySize);
      this.fieldsBySize = fieldsBySize;
      this.typesBySize = typesBySize;
      this.summary = summary;
      this.details = details;
    }

    public Map<String, Long> getFieldsBySize() {
      return fieldsBySize;
    }

    public Map<String, Long> getTypesBySize() {
      return typesBySize;
    }

    public Map<String, String> getHumanReadableFieldsBySize() {
      LinkedHashMap<String, String> result = new LinkedHashMap<>();
      fieldsBySize.forEach((field, size) -> result.put(field, RamUsageEstimator.humanReadableUnits(size)));
      return result;
    }

    public Map<String, String> getHumanReadableTypesBySize() {
      LinkedHashMap<String, String> result = new LinkedHashMap<>();
      typesBySize.forEach((field, size) -> result.put(field, RamUsageEstimator.humanReadableUnits(size)));
      return result;
    }

    public Map<String, Object> getSummary() {
      return summary;
    }

    public Map<String, Object> getDetails() {
      return details;
    }

    @Override
    public void writeMap(EntryWriter ew) throws IOException {
      ew.put(FIELDS_BY_SIZE, fieldsBySize);
      ew.put(TYPES_BY_SIZE, typesBySize);
      if (summary != null) {
        ew.put(SUMMARY, summary);
      }
      if (details != null) {
        ew.put(DETAILS, details);
      }
    }
  }

  public IndexSizeEstimator(IndexReader reader, int topN, int maxLength, boolean withSummary, boolean withDetails) {
    this.reader = reader;
    this.topN = topN;
    this.maxLength = maxLength;
    this.withSummary = withSummary;
    this.withDetails = withDetails;
  }

  /**
   * Set the sampling threshold. If the index has more documents than this threshold
   * then only some values will be sampled and the totals will be extrapolated.
   * @param threshold size threshold (number of documents). Default value is {@link #DEFAULT_SAMPLING_THRESHOLD}.
   *                  Setting this to values <= 0 means no threshold (and no sampling).
   */
  public void setSamplingThreshold(int threshold) {
    if (threshold <= 0) {
      threshold = Integer.MAX_VALUE;
    }
    this.samplingThreshold = threshold;
  }

  /**
   * Sampling percent (a number greater than 0 and less or equal to 100). When index size exceeds
   * the threshold then approximately only this percent of data will be retrieved from the index and the
   * totals will be extrapolated.
   * @param percent sample percent. Default value is {@link #DEFAULT_SAMPLING_PERCENT}.
   * @throws IllegalArgumentException when value is less than or equal to 0.0 or greater than 100.0, or
   *                  the sampling percent is so small that less than 10 documents would be sampled.
   */
  public void setSamplingPercent(float percent) throws IllegalArgumentException {
    if (percent <= 0 || percent > 100) {
      throw new IllegalArgumentException("samplingPercent must be 0 < percent <= 100");
    }
    if (reader.maxDoc() > samplingThreshold) {
      samplingStep = Math.round(100.0f / samplingPercent);
      log.info("- number of documents {} larger than {}, sampling percent is {} and sampling step {}", reader.maxDoc(), samplingThreshold, samplingPercent, samplingStep);
      if (reader.maxDoc() / samplingStep < 10) {
        throw new IllegalArgumentException("Out of " + reader.maxDoc() + " less than 10 documents would be sampled, which is too unreliable. Increase the samplingPercent.");
      }
    }
    this.samplingPercent = percent;
  }
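
  // For example: with the default samplingPercent of 5.0 and an index larger than samplingThreshold,
  // samplingStep becomes Math.round(100.0f / 5.0f) = 20, i.e. roughly every 20th document is visited and
  // each sampled value is counted 20 times so that the reported totals are extrapolated to the whole index.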

  public Estimate estimate() throws Exception {
    Map<String, Object> details = new LinkedHashMap<>();
    Map<String, Object> summary = new LinkedHashMap<>();
    estimateStoredFields(details);
    estimateTerms(details);
    estimateNorms(details);
    estimatePoints(details);
    estimateTermVectors(details);
    estimateDocValues(details);
    estimateSummary(details, summary);
    if (samplingStep > 1) {
      details.put("samplingPercent", samplingPercent);
      details.put("samplingStep", samplingStep);
    }
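    // rank fields by the "totalSize" aggregated in estimateSummary(), largest first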
    ItemPriorityQueue fieldSizeQueue = new ItemPriorityQueue(summary.size());
    summary.forEach((field, perField) -> {
      long size = ((AtomicLong)((Map<String, Object>)perField).get("totalSize")).get();
      if (size > 0) {
        fieldSizeQueue.insertWithOverflow(new Item(field, size));
      }
    });
    Map<String, Long> fieldsBySize = new LinkedHashMap<>();
    fieldSizeQueue._forEachEntry((k, v) -> fieldsBySize.put((String)k, (Long)v));
    Map<String, AtomicLong> typeSizes = new HashMap<>();
    summary.forEach((field, perField) -> {
      Map<String, Object> perType = (Map<String, Object>)((Map<String, Object>)perField).get("perType");
      perType.forEach((type, size) -> {
        if (type.contains("_lengths")) {
          AtomicLong totalSize = typeSizes.computeIfAbsent(type.replace("_lengths", ""), t -> new AtomicLong());
          totalSize.addAndGet(((AtomicLong)size).get());
        }
      });
    });
    ItemPriorityQueue typesSizeQueue = new ItemPriorityQueue(typeSizes.size());
    typeSizes.forEach((type, size) -> {
      if (size.get() > 0) {
        typesSizeQueue.insertWithOverflow(new Item(type, size.get()));
      }
    });
    Map<String, Long> typesBySize = new LinkedHashMap<>();
    typesSizeQueue._forEachEntry((k, v) -> typesBySize.put((String)k, (Long)v));
    // sort summary by field size
    Map<String, Object> newSummary = new LinkedHashMap<>();
    fieldsBySize.keySet().forEach(k -> newSummary.put(String.valueOf(k), summary.get(k)));
    // convert everything to maps and primitives
    convert(newSummary);
    convert(details);
    return new Estimate(fieldsBySize, typesBySize, withSummary ? newSummary : null, withDetails ? details : null);
  }

  private void convert(Map<String, Object> result) {
    for (Map.Entry<String, Object> entry : result.entrySet()) {
      Object value = entry.getValue();
      if (value instanceof ItemPriorityQueue) {
        ItemPriorityQueue queue = (ItemPriorityQueue)value;
        Map<String, Object> map = new LinkedHashMap<>();
        queue.toMap(map);
        entry.setValue(map);
      } else if (value instanceof MapWriterSummaryStatistics) {
        MapWriterSummaryStatistics stats = (MapWriterSummaryStatistics)value;
        Map<String, Object> map = new LinkedHashMap<>();
        stats.toMap(map);
        entry.setValue(map);
      } else if (value instanceof AtomicLong) {
        entry.setValue(((AtomicLong)value).longValue());
      } else if (value instanceof Map) {
        // recurse
        convert((Map<String, Object>)value);
      }
    }
  }

  private void estimateSummary(Map<String, Object> details, Map<String, Object> summary) {
    log.info("- preparing summary...");
    details.forEach((type, perType) -> {
      ((Map<String, Object>)perType).forEach((field, perField) -> {
        Map<String, Object> perFieldSummary = (Map<String, Object>)summary.computeIfAbsent(field, f -> new HashMap<>());
        ((Map<String, Object>)perField).forEach((k, val) -> {
          if (val instanceof SummaryStatistics) {
            SummaryStatistics stats = (SummaryStatistics)val;
            if (k.startsWith("lengths")) {
              AtomicLong total = (AtomicLong)perFieldSummary.computeIfAbsent("totalSize", kt -> new AtomicLong());
              total.addAndGet((long)stats.getSum());
            }
            Map<String, Object> perTypeSummary = (Map<String, Object>)perFieldSummary.computeIfAbsent("perType", pt -> new HashMap<>());
            AtomicLong total = (AtomicLong)perTypeSummary.computeIfAbsent(type + "_" + k, t -> new AtomicLong());
            total.addAndGet((long)stats.getSum());
          }
        });
      });
    });
  }

  private void estimateNorms(Map<String, Object> result) throws IOException {
    log.info("- estimating norms...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext leafReaderContext : reader.leaves()) {
      LeafReader leafReader = leafReaderContext.reader();
      FieldInfos fieldInfos = leafReader.getFieldInfos();
      for (FieldInfo info : fieldInfos) {
        NumericDocValues norms = leafReader.getNormValues(info.name);
        if (norms == null) {
          continue;
        }
        Map<String, Object> perField = stats.computeIfAbsent(info.name, n -> new HashMap<>());
        SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
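        // norms are exposed as long values, so count 8 bytes per document that has a norm,
        // stepping by samplingStep and weighting each sample accordingly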
        while (norms.advance(norms.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) {
          for (int i = 0; i < samplingStep; i++) {
            lengthSummary.addValue(8);
          }
        }
      }
    }
    result.put(NORMS, stats);
  }

  private void estimatePoints(Map<String, Object> result) throws IOException {
    log.info("- estimating points...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext leafReaderContext : reader.leaves()) {
      LeafReader leafReader = leafReaderContext.reader();
      FieldInfos fieldInfos = leafReader.getFieldInfos();
      for (FieldInfo info : fieldInfos) {
        PointValues values = leafReader.getPointValues(info.name);
        if (values == null) {
          continue;
        }
        Map<String, Object> perField = stats.computeIfAbsent(info.name, n -> new HashMap<>());
        SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
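        // approximate size: number of point values * bytes per dimension * number of indexed dimensions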
        lengthSummary.addValue(values.size() * values.getBytesPerDimension() * values.getNumIndexDimensions());
      }
    }
    result.put(POINTS, stats);
  }

  private void estimateTermVectors(Map<String, Object> result) throws IOException {
    log.info("- estimating term vectors...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext leafReaderContext : reader.leaves()) {
      LeafReader leafReader = leafReaderContext.reader();
      Bits liveDocs = leafReader.getLiveDocs();
      for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
        if (liveDocs != null && !liveDocs.get(docId)) {
          continue;
        }
        Fields termVectors = leafReader.getTermVectors(docId);
        if (termVectors == null) {
          continue;
        }
        for (String field : termVectors) {
          Terms terms = termVectors.terms(field);
          if (terms == null) {
            continue;
          }
          estimateTermStats(field, terms, stats, true);
        }
      }
    }
    result.put(TERM_VECTORS, stats);
  }

  private void estimateDocValues(Map<String, Object> result) throws IOException {
    log.info("- estimating docValues...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      FieldInfos fieldInfos = leafReader.getFieldInfos();
      for (FieldInfo info : fieldInfos) {
        // binary
        countDocValues(stats, info.name, "binary", leafReader.getBinaryDocValues(info.name), values -> {
          try {
            BytesRef value = ((BinaryDocValues) values).binaryValue();
            return value.length;
          } catch (IOException e) {
            // ignore
          }
          return 0;
        });
        // numeric
        countDocValues(stats, info.name, "numeric", leafReader.getNumericDocValues(info.name), values -> 8);
        countDocValues(stats, info.name, "sorted", leafReader.getSortedDocValues(info.name), values -> {
          try {
            TermsEnum termsEnum = ((SortedDocValues) values).termsEnum();
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
              return term.length;
            }
          } catch (IOException e) {
            // ignore
          }
          return 0;
        });
        countDocValues(stats, info.name, "sortedNumeric", leafReader.getSortedNumericDocValues(info.name),
            values -> ((SortedNumericDocValues) values).docValueCount() * 8);
        countDocValues(stats, info.name, "sortedSet", leafReader.getSortedSetDocValues(info.name), values -> {
          try {
            TermsEnum termsEnum = ((SortedSetDocValues) values).termsEnum();
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
              return term.length;
            }
          } catch (IOException e) {
            // ignore
          }
          return 0;
        });
      }
    }
    result.put(DOC_VALUES, stats);
  }

  private void countDocValues(Map<String, Map<String, Object>> stats, String field, String type, DocIdSetIterator values,
                              Function<DocIdSetIterator, Integer> valueLength) throws IOException {
    if (values == null) {
      return;
    }
    Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>());
    SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_" + type, s -> new MapWriterSummaryStatistics());
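    // visit every samplingStep-th document and add each sampled length samplingStep times
    // so that the aggregated statistics are extrapolated to the whole index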
    while (values.advance(values.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) {
      int len = valueLength.apply(values);
      for (int i = 0; i < samplingStep; i++) {
        lengthSummary.addValue(len);
      }
    }
  }

  private void estimateTerms(Map<String, Object> result) throws IOException {
    log.info("- estimating terms...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      FieldInfos fieldInfos = leafReader.getFieldInfos();
      for (FieldInfo info : fieldInfos) {
        Terms terms = leafReader.terms(info.name);
        if (terms == null) {
          continue;
        }
        estimateTermStats(info.name, terms, stats, false);
      }
    }
    result.put(TERMS, stats);
  }

  private void estimateTermStats(String field, Terms terms, Map<String, Map<String, Object>> stats, boolean isSampling) throws IOException {
    Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>());
    SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_terms", s -> new MapWriterSummaryStatistics());
    SummaryStatistics docFreqSummary = (SummaryStatistics)perField.computeIfAbsent("docFreqs", s -> new MapWriterSummaryStatistics());
    SummaryStatistics totalFreqSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_postings", s -> new MapWriterSummaryStatistics());
    // TODO: add this at some point
    //SummaryStatistics impactsSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_impacts", s -> new MapWriterSummaryStatistics());
    SummaryStatistics payloadSummary = null;
    if (terms.hasPayloads()) {
      payloadSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_payloads", s -> new MapWriterSummaryStatistics());
    }
    ItemPriorityQueue topLen = (ItemPriorityQueue)perField.computeIfAbsent("topLen", s -> new ItemPriorityQueue(topN));
    ItemPriorityQueue topTotalFreq = (ItemPriorityQueue)perField.computeIfAbsent("topTotalFreq", s -> new ItemPriorityQueue(topN));
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    PostingsEnum postings = null;
    while ((term = termsEnum.next()) != null) {
      if (isSampling) {
        for (int i = 0; i < samplingStep; i++) {
          lengthSummary.addValue(term.length);
          docFreqSummary.addValue(termsEnum.docFreq());
          totalFreqSummary.addValue(termsEnum.totalTermFreq());
        }
      } else {
        lengthSummary.addValue(term.length);
        docFreqSummary.addValue(termsEnum.docFreq());
        totalFreqSummary.addValue(termsEnum.totalTermFreq());
      }
      if (terms.hasPayloads()) {
        postings = termsEnum.postings(postings, PostingsEnum.ALL);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int freq = postings.freq();
          for (int i = 0; i < freq; i++) {
            if (postings.nextPosition() < 0) {
              break;
            }
            BytesRef payload = postings.getPayload();
            if (payload != null) {
              if (isSampling) {
                for (int k = 0; k < samplingStep; k++) {
                  payloadSummary.addValue(payload.length);
                }
              } else {
                payloadSummary.addValue(payload.length);
              }
            }
          }
        }
      }
      String value = term.utf8ToString();
      if (value.length() > maxLength) {
        value = value.substring(0, maxLength);
      }
      topLen.insertWithOverflow(new Item(value, term.length));
      topTotalFreq.insertWithOverflow(new Item(value, termsEnum.totalTermFreq()));
    }
  }

  private void estimateStoredFields(Map<String, Object> result) throws IOException {
    log.info("- estimating stored fields...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      EstimatingVisitor visitor = new EstimatingVisitor(stats, topN, maxLength, samplingStep);
      Bits liveDocs = leafReader.getLiveDocs();
      if (leafReader instanceof CodecReader) {
        CodecReader codecReader = (CodecReader)leafReader;
        StoredFieldsReader storedFieldsReader = codecReader.getFieldsReader();
        // this instance may be faster for a full sequential pass
        storedFieldsReader = storedFieldsReader.getMergeInstance();
        for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
          if (liveDocs != null && !liveDocs.get(docId)) {
            continue;
          }
          storedFieldsReader.visitDocument(docId, visitor);
        }
        storedFieldsReader.close();
      } else {
        for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
          if (liveDocs != null && !liveDocs.get(docId)) {
            continue;
          }
          leafReader.document(docId, visitor);
        }
      }
    }
    result.put(STORED_FIELDS, stats);
  }

  public static class Item {
    Object value;
    long size;

    public Item(Object value, long size) {
      this.value = value;
      this.size = size;
    }

    public String toString() {
      return "size=" + size + ", value=" + value;
    }
  }

  public static class MapWriterSummaryStatistics extends SummaryStatistics implements MapWriter {

    @Override
    public void writeMap(EntryWriter ew) throws IOException {
      ew.put("n", getN());
      ew.put("min", getMin());
      ew.put("max", getMax());
      ew.put("sum", getSum());
      ew.put("mean", getMean());
      ew.put("geoMean", getGeometricMean());
      ew.put("variance", getVariance());
      ew.put("populationVariance", getPopulationVariance());
      ew.put("stddev", getStandardDeviation());
      ew.put("secondMoment", getSecondMoment());
      ew.put("sumOfSquares", getSumsq());
      ew.put("sumOfLogs", getSumOfLogs());
    }
  }

  public static class ItemPriorityQueue extends PriorityQueue<Item> implements MapWriter {

    public ItemPriorityQueue(int maxSize) {
      super(maxSize);
    }

    @Override
    protected boolean lessThan(Item a, Item b) {
      return a.size < b.size;
    }

    public String toString() {
      StringBuilder sb = new StringBuilder();
      Iterator<Item> it = iterator();
      while (it.hasNext()) {
        if (sb.length() > 0) {
          sb.append('\n');
        }
        sb.append(it.next());
      }
      return sb.toString();
    }

    // WARNING: destructive! empties the queue
    @Override
    public void writeMap(EntryWriter ew) throws IOException {
      Item[] items = new Item[size()];
      int pos = size() - 1;
      while (size() > 0) {
        items[pos] = pop();
        pos--;
      }
      for (Item item : items) {
        ew.put(String.valueOf(item.value), item.size);
      }
    }
  }

  private static class EstimatingVisitor extends StoredFieldVisitor {
    final Map<String, Map<String, Object>> stats;
    final int topN;
    final int maxLength;
    final int samplingStep;

    EstimatingVisitor(Map<String, Map<String, Object>> stats, int topN, int maxLength, int samplingStep) {
      this.stats = stats;
      this.topN = topN;
      this.maxLength = maxLength;
      this.samplingStep = samplingStep;
    }

    /** Process a binary field.
     * @param value newly allocated byte array with the binary contents.
     */
    public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
      // trim the value if needed
      int len = value != null ? value.length : 0;
      if (len > maxLength) {
        byte[] newValue = new byte[maxLength];
        System.arraycopy(value, 0, newValue, 0, maxLength);
        value = newValue;
      }
      String strValue = new BytesRef(value).toString();
      countItem(fieldInfo.name, strValue, len);
    }

    /** Process a string field. */
    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
      // trim the value if needed
      int len = value != null ? UnicodeUtil.calcUTF16toUTF8Length(value, 0, value.length()) : 0;
      if (value.length() > maxLength) {
        value = value.substring(0, maxLength);
      }
      countItem(fieldInfo.name, value, len);
    }

    /** Process an int numeric field. */
    public void intField(FieldInfo fieldInfo, int value) throws IOException {
      countItem(fieldInfo.name, String.valueOf(value), 4);
    }

    /** Process a long numeric field. */
    public void longField(FieldInfo fieldInfo, long value) throws IOException {
      countItem(fieldInfo.name, String.valueOf(value), 8);
    }

    /** Process a float numeric field. */
    public void floatField(FieldInfo fieldInfo, float value) throws IOException {
      countItem(fieldInfo.name, String.valueOf(value), 4);
    }

    /** Process a double numeric field. */
    public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
      countItem(fieldInfo.name, String.valueOf(value), 8);
    }

    private void countItem(String field, Object value, int size) {
      Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>());
      SummaryStatistics summary = (SummaryStatistics)perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
      for (int i = 0; i < samplingStep; i++) {
        summary.addValue(size);
      }
      ItemPriorityQueue topNqueue = (ItemPriorityQueue)perField.computeIfAbsent("topLen", s -> new ItemPriorityQueue(topN));
      topNqueue.insertWithOverflow(new Item(value, size));
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
      return Status.YES;
    }
  }

  @SuppressForbidden(reason = "System.err and System.out required for a command-line utility")
  public static void main(String[] args) throws Exception {
    if (args.length == 0) {
      System.err.println("Usage: " + IndexSizeEstimator.class.getName() + " [-topN NUM] [-maxLen NUM] [-summary] [-details] <indexDir>");
      System.err.println();
      System.err.println("\t<indexDir>\tpath to the index (parent path of 'segments_N' file)");
      System.err.println("\t-topN NUM\tnumber of top largest items to collect");
      System.err.println("\t-maxLen NUM\ttruncate the largest items to NUM bytes / characters");
      System.exit(-1);
    }
    String path = null;
    int topN = 20;
    int maxLen = 100;
    boolean details = false;
    boolean summary = false;
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-topN")) {
        topN = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-maxLen")) {
        maxLen = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-details")) {
        details = true;
      } else if (args[i].equals("-summary")) {
        summary = true;
      } else {
        path = args[i];
      }
    }
    if (path == null) {
      System.err.println("ERROR: <indexDir> argument is required.");
      System.exit(-2);
    }
    Directory dir = FSDirectory.open(Paths.get(path));
    DirectoryReader reader = StandardDirectoryReader.open(dir);
    IndexSizeEstimator stats = new IndexSizeEstimator(reader, topN, maxLen, summary, details);
    System.out.println(Utils.toJSONString(stats.estimate()));
    System.exit(0);
  }
}

@@ -74,9 +74,13 @@ import static org.apache.solr.common.params.CommonParams.NAME;
public class SegmentsInfoRequestHandler extends RequestHandlerBase {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public static final String WITH_FIELD_INFO = "fieldInfo";
  public static final String WITH_CORE_INFO = "coreInfo";
  public static final String WITH_SIZE_INFO = "sizeInfo";
  public static final String FIELD_INFO_PARAM = "fieldInfo";
  public static final String CORE_INFO_PARAM = "coreInfo";
  public static final String SIZE_INFO_PARAM = "sizeInfo";
  public static final String RAW_SIZE_PARAM = "rawSize";
  public static final String RAW_SIZE_SUMMARY_PARAM = "rawSizeSummary";
  public static final String RAW_SIZE_DETAILS_PARAM = "rawSizeDetails";
  public static final String RAW_SIZE_SAMPLING_PERCENT_PARAM = "rawSizeSamplingPercent";

  private static final List<String> FI_LEGEND;

@@ -106,9 +110,15 @@ public class SegmentsInfoRequestHandler extends RequestHandlerBase {

  private void getSegmentsInfo(SolrQueryRequest req, SolrQueryResponse rsp)
      throws Exception {
    boolean withFieldInfo = req.getParams().getBool(WITH_FIELD_INFO, false);
    boolean withCoreInfo = req.getParams().getBool(WITH_CORE_INFO, false);
    boolean withSizeInfo = req.getParams().getBool(WITH_SIZE_INFO, false);
    boolean withFieldInfo = req.getParams().getBool(FIELD_INFO_PARAM, false);
    boolean withCoreInfo = req.getParams().getBool(CORE_INFO_PARAM, false);
    boolean withSizeInfo = req.getParams().getBool(SIZE_INFO_PARAM, false);
    boolean withRawSizeInfo = req.getParams().getBool(RAW_SIZE_PARAM, false);
    boolean withRawSizeSummary = req.getParams().getBool(RAW_SIZE_SUMMARY_PARAM, false);
    boolean withRawSizeDetails = req.getParams().getBool(RAW_SIZE_DETAILS_PARAM, false);
    if (withRawSizeSummary || withRawSizeDetails) {
      withRawSizeInfo = true;
    }
    SolrIndexSearcher searcher = req.getSearcher();

    SegmentInfos infos =

@@ -187,6 +197,25 @@ public class SegmentsInfoRequestHandler extends RequestHandlerBase {
      rsp.add("fieldInfoLegend", FI_LEGEND);
    }
    rsp.add("segments", segmentInfos);
    if (withRawSizeInfo) {
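      // same defaults as the standalone IndexSizeEstimator tool: top 20 largest items per field,
      // values truncated to 100 bytes / characters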
      IndexSizeEstimator estimator = new IndexSizeEstimator(searcher.getRawReader(), 20, 100, withRawSizeSummary, withRawSizeDetails);
      Object samplingPercentVal = req.getParams().get(RAW_SIZE_SAMPLING_PERCENT_PARAM);
      if (samplingPercentVal != null) {
        estimator.setSamplingPercent(Float.parseFloat(String.valueOf(samplingPercentVal)));
      }
      IndexSizeEstimator.Estimate estimate = estimator.estimate();
      SimpleOrderedMap<Object> estimateMap = new SimpleOrderedMap<>();
      // make the units more user-friendly
      estimateMap.add(IndexSizeEstimator.FIELDS_BY_SIZE, estimate.getHumanReadableFieldsBySize());
      estimateMap.add(IndexSizeEstimator.TYPES_BY_SIZE, estimate.getHumanReadableTypesBySize());
      if (estimate.getSummary() != null) {
        estimateMap.add(IndexSizeEstimator.SUMMARY, estimate.getSummary());
      }
      if (estimate.getDetails() != null) {
        estimateMap.add(IndexSizeEstimator.DETAILS, estimate.getDetails());
      }
      rsp.add("rawSize", estimateMap);
    }
  }

  private SimpleOrderedMap<Object> getSegmentInfo(

@@ -0,0 +1,241 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.admin;

import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.cloud.SolrCloudTestCase;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.TimeOut;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 *
 */
public class IndexSizeEstimatorTest extends SolrCloudTestCase {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  private static CloudSolrClient solrClient;
  private static String collection = IndexSizeEstimator.class.getSimpleName() + "_collection";
  private static int NUM_DOCS = 2000;
  private static Set<String> fields;

  @BeforeClass
  public static void setupCluster() throws Exception {
    // create predictable field names
    System.setProperty("solr.tests.numeric.dv", "true");
    System.setProperty("solr.tests.numeric.points", "true");
    System.setProperty("solr.tests.numeric.points.dv", "true");
    configureCluster(2)
        .addConfig("conf", configset("cloud-dynamic"))
        .configure();
    solrClient = cluster.getSolrClient();
    CollectionAdminRequest.createCollection(collection, "conf", 2, 2)
        .setMaxShardsPerNode(2).process(solrClient);
    cluster.waitForActiveCollection(collection, 2, 4);
    SolrInputDocument lastDoc = addDocs(collection, NUM_DOCS);
    HashSet<String> docFields = new HashSet<>(lastDoc.keySet());
    docFields.add("_version_");
    docFields.add("_root_");
    docFields.add("point_0___double");
    docFields.add("point_1___double");
    fields = docFields;
  }

  @AfterClass
  public static void releaseClient() throws Exception {
    solrClient = null;
  }

  @Test
  public void testEstimator() throws Exception {
    JettySolrRunner jetty = cluster.getRandomJetty(random());
    String randomCoreName = jetty.getCoreContainer().getAllCoreNames().iterator().next();
    SolrCore core = jetty.getCoreContainer().getCore(randomCoreName);
    RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
    try {
      SolrIndexSearcher searcher = searcherRef.get();
      // limit the max length
      IndexSizeEstimator estimator = new IndexSizeEstimator(searcher.getRawReader(), 20, 50, true, true);
      IndexSizeEstimator.Estimate estimate = estimator.estimate();
      Map<String, Long> fieldsBySize = estimate.getFieldsBySize();
      assertFalse("empty fieldsBySize", fieldsBySize.isEmpty());
      assertEquals(fieldsBySize.toString(), fields.size(), fieldsBySize.size());
      fieldsBySize.forEach((k, v) -> assertTrue("unexpected size of " + k + ": " + v, v > 0));
      Map<String, Long> typesBySize = estimate.getTypesBySize();
      assertFalse("empty typesBySize", typesBySize.isEmpty());
      assertTrue("expected at least 8 types: " + typesBySize.toString(), typesBySize.size() >= 8);
      typesBySize.forEach((k, v) -> assertTrue("unexpected size of " + k + ": " + v, v > 0));
      Map<String, Object> summary = estimate.getSummary();
      assertNotNull("summary", summary);
      assertFalse("empty summary", summary.isEmpty());
      assertEquals(summary.keySet().toString(), fields.size(), summary.keySet().size());
      Map<String, Object> details = estimate.getDetails();
      assertNotNull("details", details);
      assertFalse("empty details", details.isEmpty());
      // by type
      assertEquals(details.keySet().toString(), 6, details.keySet().size());

      // check sampling
      estimator.setSamplingThreshold(searcher.getRawReader().maxDoc() / 2);
      IndexSizeEstimator.Estimate sampledEstimate = estimator.estimate();
      Map<String, Long> sampledFieldsBySize = sampledEstimate.getFieldsBySize();
      assertFalse("empty fieldsBySize", sampledFieldsBySize.isEmpty());
      // verify that the sampled values are within 50% of the original values
      fieldsBySize.forEach((field, size) -> {
        Long sampledSize = sampledFieldsBySize.get(field);
        assertNotNull("sampled size for " + field + " is missing in " + sampledFieldsBySize, sampledSize);
        double delta = (double) size * 0.5;
        assertEquals("sampled size of " + field + " is wildly off", (double)size, (double)sampledSize, delta);
      });
    } finally {
      searcherRef.decref();
      core.close();
    }
  }

  @Test
  public void testIntegration() throws Exception {
    CollectionAdminResponse rsp = CollectionAdminRequest.collectionStatus(collection)
        .setWithRawSizeInfo(true)
        .setWithRawSizeSummary(true)
        .setWithRawSizeDetails(true)
        .process(solrClient);
    CollectionAdminResponse sampledRsp = CollectionAdminRequest.collectionStatus(collection)
        .setWithRawSizeInfo(true)
        .setWithRawSizeSummary(true)
        .setWithRawSizeDetails(true)
        .setRawSizeSamplingPercent(5)
        .process(solrClient);
    assertEquals(0, rsp.getStatus());
    assertEquals(0, sampledRsp.getStatus());
    for (int i : Arrays.asList(1, 2)) {
      NamedList<Object> segInfos = (NamedList<Object>) rsp.getResponse().findRecursive(collection, "shards", "shard" + i, "leader", "segInfos");
      NamedList<Object> rawSize = (NamedList<Object>)segInfos.get("rawSize");
      assertNotNull("rawSize missing", rawSize);
      Map<String, Object> rawSizeMap = rawSize.asMap(10);
      Map<String, Object> fieldsBySize = (Map<String, Object>)rawSizeMap.get(IndexSizeEstimator.FIELDS_BY_SIZE);
      assertNotNull("fieldsBySize missing", fieldsBySize);
      assertEquals(fieldsBySize.toString(), fields.size(), fieldsBySize.size());
      fields.forEach(field -> assertNotNull("missing field " + field, fieldsBySize.get(field)));
      Map<String, Object> typesBySize = (Map<String, Object>)rawSizeMap.get(IndexSizeEstimator.TYPES_BY_SIZE);
      assertNotNull("typesBySize missing", typesBySize);
      assertTrue("expected at least 8 types: " + typesBySize.toString(), typesBySize.size() >= 8);
      Map<String, Object> summary = (Map<String, Object>)rawSizeMap.get(IndexSizeEstimator.SUMMARY);
      assertNotNull("summary missing", summary);
      assertEquals(summary.toString(), fields.size(), summary.size());
      fields.forEach(field -> assertNotNull("missing field " + field, summary.get(field)));
      Map<String, Object> details = (Map<String, Object>)rawSizeMap.get(IndexSizeEstimator.DETAILS);
      assertNotNull("details missing", details);
      assertEquals(details.keySet().toString(), 6, details.size());

      // compare with sampled
      NamedList<Object> sampledRawSize = (NamedList<Object>) sampledRsp.getResponse().findRecursive(collection, "shards", "shard" + i, "leader", "segInfos", "rawSize");
      assertNotNull("sampled rawSize missing", sampledRawSize);
      Map<String, Object> sampledRawSizeMap = sampledRawSize.asMap(10);
      Map<String, Object> sampledFieldsBySize = (Map<String, Object>)sampledRawSizeMap.get(IndexSizeEstimator.FIELDS_BY_SIZE);
      assertNotNull("sampled fieldsBySize missing", sampledFieldsBySize);
      fieldsBySize.forEach((k, v) -> {
        double size = fromHumanReadableUnits((String)v);
        double sampledSize = fromHumanReadableUnits((String)sampledFieldsBySize.get(k));
        assertNotNull("sampled size missing for field " + k + " in " + sampledFieldsBySize, sampledSize);
        double delta = size * 0.5;
        assertEquals("sampled size of " + k + " is wildly off", size, sampledSize, delta);
      });
    }

  }

  private static double fromHumanReadableUnits(String value) {
    String[] parts = value.split(" ");
    assertEquals("invalid value", 2, parts.length);
    double result = Double.parseDouble(parts[0]);
    if (parts[1].equals("GB")) {
      result = result * RamUsageEstimator.ONE_GB;
    } else if (parts[1].equals("MB")) {
      result = result * RamUsageEstimator.ONE_MB;
    } else if (parts[1].equals("KB")) {
      result = result * RamUsageEstimator.ONE_KB;
    } else if (parts[1].equals("bytes")) {
      // do nothing
    } else {
      fail("invalid unit in " + value);
    }
    return result;
  }

  private static SolrInputDocument addDocs(String collection, int n) throws Exception {
    UpdateRequest ureq = new UpdateRequest();
    SolrInputDocument doc = null;
    for (int i = 0; i < n; i++) {
      doc = new SolrInputDocument();
      doc.addField("id", "id-" + i);
      doc.addField("long_l", i);
      doc.addField("long_tl", i);
      doc.addField("multival_long_ll", i);
      doc.addField("multival_long_ll", i + 1);
      // indexed, not stored
      doc.addField("string_sI", TestUtil.randomAnalysisString(random(), 100, true));
      // stored, not indexed
      doc.addField("string_sS", TestUtil.randomAnalysisString(random(), 100, true));
      // multival, stored, indexed, tv, pos, offsets
      doc.addField("tv_mv_string", TestUtil.randomAnalysisString(random(), 100, true));
      doc.addField("tv_mv_string", TestUtil.randomAnalysisString(random(), 100, true));
      //binary
      doc.addField("payload", TestUtil.randomBinaryTerm(random()).bytes);
      // points
      doc.addField("point", random().nextInt(100) + "," + random().nextInt(100));
      ureq.add(doc);
    }
    solrClient.request(ureq, collection);
    solrClient.commit(collection);
    // verify the number of docs
    TimeOut timeOut = new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
    while (!timeOut.hasTimedOut()) {
      QueryResponse rsp = solrClient.query(collection, params("q", "*:*", "rows", "0"));
      if (rsp.getResults().getNumFound() == n) {
        break;
      }
      timeOut.sleep(500);
    }
    assertFalse("timed out waiting for documents to be added", timeOut.hasTimedOut());
    return doc;
  }

}

@@ -1566,6 +1566,78 @@ and their corresponding Solr schema types.
Optional boolean. If true then additional information about the index files
size and their RAM usage will be provided.

==== Index Size Analysis Tool

The `COLSTATUS` command also provides a tool for analyzing and estimating the composition of raw index data. Please note that
this tool should be used with care because it generates a significant IO load on all shard leaders of the
analyzed collections. The sampling threshold and sampling percent parameters can be adjusted to reduce this
load to some degree.

Size estimates produced by this tool are only approximate and represent the aggregated size of uncompressed
index data. In reality these values would never occur, because Lucene (and Solr) always stores data in a
compressed format - still, these values help to understand what occupies most of the space and the relative size
of each type of data and each field in the index.

In the following sections whenever "size" is mentioned it means an estimated aggregated size of
uncompressed (raw) data.

The following parameters are specific to this tool:

`rawSize`::
Optional boolean. If true then run the raw index data analysis tool (other boolean options below imply
this option if any of them are true). Command response will include sections that show estimated breakdown of
data size per field and per data type.

`rawSizeSummary`::
Optional boolean. If true then also include a more detailed breakdown of data size per field and per type.

`rawSizeDetails`::
Optional boolean. If true then provide exhaustive details that include statistical distribution of items per
field and per type as well as top 20 largest items per field.

`rawSizeSamplingPercent`::
Optional float. When the index is larger than a certain threshold (100k documents per shard) only a part of
the data is actually retrieved and analyzed in order to reduce the IO load, and then the final results are extrapolated.
Values must be greater than 0 and less than or equal to 100.0. Default value is 5.0. Very small values (between 0.0 and 1.0)
may introduce significant estimation errors. Also, values that would result in less than 10 documents being sampled
are rejected with an exception.
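
The same options can also be set programmatically through the `CollectionAdminRequest.ColStatus` helpers in SolrJ
that accompany this tool. A minimal sketch (the ZooKeeper address and collection name below are placeholders and not
part of this change):

[source,java]
----
import java.util.Collections;
import java.util.Optional;

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;

public class RawSizeExample {
  public static void main(String[] args) throws Exception {
    try (CloudSolrClient solrClient = new CloudSolrClient.Builder(
        Collections.singletonList("localhost:9983"), Optional.empty()).build()) {
      CollectionAdminResponse rsp = CollectionAdminRequest.collectionStatus("gettingstarted")
          .setWithRawSizeInfo(true)         // run the raw index data analysis tool
          .setWithRawSizeSummary(true)      // include the per-field / per-type summary
          .setRawSizeSamplingPercent(5.0f)  // sample roughly 5% of documents on large shards
          .process(solrClient);
      // estimates are reported per shard leader under segInfos/rawSize
      System.out.println(rsp.getResponse().findRecursive("gettingstarted", "shards"));
    }
  }
}
----
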
Response for this command always contains two sections:

* `fieldsBySize` is a map where field names are keys and values are estimated sizes of raw (uncompressed) data
that belongs to the field. The map is sorted by size so that it's easy to see what field occupies most space.

* `typesBySize` is a map where data types are the keys and values are estimated sizes of raw (uncompressed) data
of particular type. This map is also sorted by size.

Optional sections include:

* `summary` section containing a breakdown of data sizes for each field by data type.

* `details` section containing detailed statistical summary of size distribution within each field, per data type.
This section also shows `topN` values by size from each field.

Data types shown in the response can be roughly divided into the following groups:

* `storedFields` - represents the raw uncompressed data in stored fields. Eg. for UTF-8 strings this represents
the aggregated sum of the number of bytes in the strings' UTF-8 representation, for long numbers this is 8 bytes per value, etc.

* `terms_terms` - represents the aggregated size of the term dictionary. The size of this data is affected by
the number and length of unique terms, which in turn depends on the field size and the analysis chain.

* `terms_postings` - represents the aggregated size of all term position and offset information, if present.
This information may be absent if position-based searching, such as phrase queries, is not needed.

* `terms_payloads` - represents the aggregated size of all per-term payload data, if present.

* `norms` - represents the aggregated size of field norm information. This information may be omitted if a field
has an `omitNorms` flag in the schema, which is common for fields that don't need weighting or scoring by field length.

* `termVectors` - represents the aggregated size of term vectors.

* `docValues_*` - represents aggregated size of doc values, by type (eg. `docValues_numeric`, `docValues_binary`, etc).

* `points` - represents aggregated size of point values.

=== COLSTATUS Response
The response will include an overview of the collection status, the number of
active or inactive shards and replicas, and additional index information

@@ -1717,6 +1789,201 @@ http://localhost:8983/solr/admin/collections?action=COLSTATUS&collection=getting
}}}}}}}}}}}
----

Example of using the raw index data analysis tool:

*Input*

[source,text]
----
http://localhost:8983/solr/admin/collections?action=COLSTATUS&collection=gettingstarted&rawSize=true&rawSizeSamplingPercent=0.1
----

*Output*

[source,json]
----
{
  "responseHeader": {
    "status": 0,
    "QTime": 26812
  },
  "gettingstarted": {
    "stateFormat": 2,
    "znodeVersion": 33,
    "properties": {
      "autoAddReplicas": "false",
      "maxShardsPerNode": "-1",
      "nrtReplicas": "2",
      "pullReplicas": "0",
      "replicationFactor": "2",
      "router": {
        "name": "compositeId"
      },
      "tlogReplicas": "0"
    },
    "activeShards": 2,
    "inactiveShards": 0,
    "schemaNonCompliant": [
      "(NONE)"
    ],
    "shards": {
      "shard1": {
        "state": "active",
        "range": "80000000-ffffffff",
        "replicas": {
          "total": 2,
          "active": 2,
          "down": 0,
          "recovering": 0,
          "recovery_failed": 0
        },
        "leader": {
          "coreNode": "core_node5",
          "core": "gettingstarted_shard1_replica_n2",
          "base_url": "http://192.168.0.80:8983/solr",
          "node_name": "192.168.0.80:8983_solr",
          "state": "active",
          "type": "NRT",
          "force_set_state": "false",
          "leader": "true",
          "segInfos": {
            "info": {
              "minSegmentLuceneVersion": "9.0.0",
              "commitLuceneVersion": "9.0.0",
              "numSegments": 46,
              "segmentsFileName": "segments_4h",
              "totalMaxDoc": 3283741,
              "userData": {
                "commitCommandVer": "1635676266902323200",
                "commitTimeMSec": "1559902446318"
              }
            },
            "rawSize": {
              "fieldsBySize": {
                "revision.text": "7.9 GB",
                "revision.text_str": "734.7 MB",
                "revision.comment_str": "259.1 MB",
                "revision": "239.2 MB",
                "revision.sha1": "211.9 MB",
                "revision.comment": "201.3 MB",
                "title": "114.9 MB",
                "revision.contributor": "103.5 MB",
                "revision.sha1_str": "96.4 MB",
                "revision.id": "75.2 MB",
                "ns": "75.2 MB",
                "revision.timestamp": "75.2 MB",
                "revision.contributor.id": "74.7 MB",
                "revision.format": "69 MB",
                "id": "65 MB",
                "title_str": "26.8 MB",
                "revision.model_str": "25.4 MB",
                "_version_": "24.9 MB",
                "_root_": "24.7 MB",
                "revision.contributor.ip_str": "22 MB",
                "revision.contributor_str": "21.8 MB",
                "revision_str": "15.5 MB",
                "revision.contributor.ip": "13.5 MB",
                "restrictions_str": "428.7 KB",
                "restrictions": "164.2 KB",
                "name_str": "84 KB",
                "includes_str": "8.8 KB"
              },
              "typesBySize": {
                "storedFields": "7.8 GB",
                "docValues_sortedSet": "1.2 GB",
                "terms_postings": "788.8 MB",
                "terms_terms": "342.2 MB",
                "norms": "237 MB",
                "docValues_sortedNumeric": "124.3 MB",
                "points": "115.7 MB",
                "docValues_numeric": "24.9 MB",
                "docValues_sorted": "18.5 MB"
              }
            }
          }
        }
      },
      "shard2": {
        "state": "active",
        "range": "0-7fffffff",
        "replicas": {
          "total": 2,
          "active": 2,
          "down": 0,
          "recovering": 0,
          "recovery_failed": 0
        },
        "leader": {
          "coreNode": "core_node8",
          "core": "gettingstarted_shard2_replica_n6",
          "base_url": "http://192.168.0.80:8983/solr",
          "node_name": "192.168.0.80:8983_solr",
          "state": "active",
          "type": "NRT",
          "force_set_state": "false",
          "leader": "true",
          "segInfos": {
            "info": {
              "minSegmentLuceneVersion": "9.0.0",
              "commitLuceneVersion": "9.0.0",
              "numSegments": 55,
              "segmentsFileName": "segments_4d",
              "totalMaxDoc": 3284863,
              "userData": {
                "commitCommandVer": "1635676259742646272",
                "commitTimeMSec": "1559902445005"
              }
            },
            "rawSize": {
              "fieldsBySize": {
                "revision.text": "8.3 GB",
                "revision.text_str": "687.5 MB",
                "revision": "238.9 MB",
                "revision.sha1": "212 MB",
                "revision.comment_str": "211.5 MB",
                "revision.comment": "201.7 MB",
                "title": "115.9 MB",
                "revision.contributor": "103.4 MB",
                "revision.sha1_str": "96.3 MB",
                "ns": "75.2 MB",
                "revision.id": "75.2 MB",
                "revision.timestamp": "75.2 MB",
                "revision.contributor.id": "74.6 MB",
                "revision.format": "69 MB",
                "id": "67 MB",
                "title_str": "29.5 MB",
                "_version_": "24.8 MB",
                "revision.model_str": "24 MB",
                "revision.contributor_str": "21.7 MB",
                "revision.contributor.ip_str": "20.9 MB",
                "revision_str": "15.5 MB",
                "revision.contributor.ip": "13.8 MB",
                "restrictions_str": "411.1 KB",
                "restrictions": "132.9 KB",
                "name_str": "42 KB",
                "includes_str": "41 KB"
              },
              "typesBySize": {
                "storedFields": "8.2 GB",
                "docValues_sortedSet": "1.1 GB",
                "terms_postings": "787.4 MB",
                "terms_terms": "337.5 MB",
                "norms": "236.6 MB",
                "docValues_sortedNumeric": "124.1 MB",
                "points": "115.7 MB",
                "docValues_numeric": "24.9 MB",
                "docValues_sorted": "20.5 MB"
              }
            }
          }
        }
      }
    }
  }
}
----

[[migrate]]
== MIGRATE: Migrate Documents to Another Collection

@@ -911,6 +911,10 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
    protected Boolean withFieldInfo = null;
    protected Boolean withCoreInfo = null;
    protected Boolean withSizeInfo = null;
    protected Boolean withRawSizeInfo = null;
    protected Boolean withRawSizeSummary = null;
    protected Boolean withRawSizeDetails = null;
    protected Float rawSizeSamplingPercent = null;

    private ColStatus(String collection) {
      super(CollectionAction.COLSTATUS, collection);

@@ -936,6 +940,26 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
      return this;
    }

    public ColStatus setWithRawSizeInfo(boolean withRawSizeInfo) {
      this.withRawSizeInfo = withRawSizeInfo;
      return this;
    }

    public ColStatus setWithRawSizeSummary(boolean withRawSizeSummary) {
      this.withRawSizeSummary = withRawSizeSummary;
      return this;
    }

    public ColStatus setWithRawSizeDetails(boolean withRawSizeDetails) {
      this.withRawSizeDetails = withRawSizeDetails;
      return this;
    }

    public ColStatus setRawSizeSamplingPercent(float rawSizeSamplingPercent) {
      this.rawSizeSamplingPercent = rawSizeSamplingPercent;
      return this;
    }

    @Override
    public SolrParams getParams() {
      ModifiableSolrParams params = (ModifiableSolrParams)super.getParams();

@@ -943,6 +967,10 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
      params.setNonNull("fieldInfo", withFieldInfo);
      params.setNonNull("coreInfo", withCoreInfo);
      params.setNonNull("sizeInfo", withSizeInfo);
      params.setNonNull("rawSizeInfo", withRawSizeInfo);
      params.setNonNull("rawSizeSummary", withRawSizeSummary);
      params.setNonNull("rawSizeDetails", withRawSizeDetails);
      params.setNonNull("rawSizeSamplingPercent", rawSizeSamplingPercent);
      return params;
    }
  }