Merge remote-tracking branch 'apache/master' into quidem-kttm

Zoltan Haindrich 2024-09-06 10:10:15 +00:00
commit 3e15085007
872 changed files with 30913 additions and 14666 deletions

View File

@ -120,6 +120,7 @@
<inspection_tool class="MavenDuplicatePluginInspection" enabled="true" level="ERROR" enabled_by_default="true" />
<inspection_tool class="MavenModelInspection" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="MethodComplexity" enabled="false" level="WARNING" enabled_by_default="false" />
<inspection_tool class="MethodIsIdenticalToSuperMethod" enabled="true" level="ERROR" enabled_by_default="true" />
<inspection_tool class="MismatchedArrayReadWrite" enabled="true" level="ERROR" enabled_by_default="true" />
<inspection_tool class="MismatchedCollectionQueryUpdate" enabled="true" level="ERROR" enabled_by_default="true">
<option name="queryNames">

View File

@ -20,11 +20,9 @@
package org.apache.druid.benchmark;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.io.Closer;
import org.apache.druid.js.JavaScriptConfig;
import org.apache.druid.query.aggregation.BufferAggregator;
@ -35,9 +33,10 @@ import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
import org.apache.druid.segment.BaseFloatColumnValueSelector;
import org.apache.druid.segment.ColumnSelectorFactory;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexStorageAdapter;
import org.apache.druid.segment.VirtualColumns;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.generator.GeneratorColumnSchema;
import org.apache.druid.segment.generator.GeneratorSchemaInfo;
@ -60,7 +59,6 @@ import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
@ -165,32 +163,22 @@ public class ExpressionAggregationBenchmark
{
final QueryableIndexStorageAdapter adapter = new QueryableIndexStorageAdapter(index);
final Sequence<Cursor> cursors = adapter.makeCursors(
null,
index.getDataInterval(),
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
final List<Double> results = cursors
.map(cursor -> {
final BufferAggregator bufferAggregator = aggregatorFactory.apply(cursor.getColumnSelectorFactory());
bufferAggregator.init(aggregationBuffer, 0);
while (!cursor.isDone()) {
bufferAggregator.aggregate(aggregationBuffer, 0);
cursor.advance();
}
final Double dbl = (Double) bufferAggregator.get(aggregationBuffer, 0);
bufferAggregator.close();
return dbl;
})
.toList();
return Iterables.getOnlyElement(results);
try (final CursorHolder cursorHolder = adapter.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
final BufferAggregator bufferAggregator = aggregatorFactory.apply(cursor.getColumnSelectorFactory());
bufferAggregator.init(aggregationBuffer, 0);
while (!cursor.isDone()) {
bufferAggregator.aggregate(aggregationBuffer, 0);
cursor.advance();
}
final Double dbl = (Double) bufferAggregator.get(aggregationBuffer, 0);
bufferAggregator.close();
return dbl;
}
}
private static class NativeBufferAggregator implements BufferAggregator

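The hunk above shows the core of this migration: the removed code built a Sequence<Cursor> with StorageAdapter.makeCursors() and aggregated inside a map()/toList() pass, while the replacement opens a single closeable CursorHolder from a CursorBuildSpec and iterates one Cursor directly. A minimal sketch of the new pattern, using only classes this file already imports (the aggregateAll helper and its parameters are illustrative, not part of the benchmark):

import java.nio.ByteBuffer;
import java.util.function.Function;
import org.apache.druid.query.aggregation.BufferAggregator;
import org.apache.druid.segment.ColumnSelectorFactory;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexStorageAdapter;

public class CursorHolderAggregationSketch
{
  // Mirrors the rewritten benchmark body above: one cursor, opened and closed through the holder.
  static Double aggregateAll(
      final QueryableIndex index,
      final Function<ColumnSelectorFactory, BufferAggregator> aggregatorFactory,
      final ByteBuffer aggregationBuffer
  )
  {
    final QueryableIndexStorageAdapter adapter = new QueryableIndexStorageAdapter(index);
    try (final CursorHolder cursorHolder = adapter.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
      final Cursor cursor = cursorHolder.asCursor();
      final BufferAggregator agg = aggregatorFactory.apply(cursor.getColumnSelectorFactory());
      agg.init(aggregationBuffer, 0);
      while (!cursor.isDone()) {
        agg.aggregate(aggregationBuffer, 0);
        cursor.advance();
      }
      final Double result = (Double) agg.get(aggregationBuffer, 0);  // assumes a double-valued aggregator, as in the benchmark
      agg.close();
      return result;
    }
  }
}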
View File

@ -23,7 +23,6 @@ import com.google.common.collect.ImmutableList;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.io.Closer;
import org.apache.druid.math.expr.ExpressionProcessing;
import org.apache.druid.query.expression.TestExprMacroTable;
@ -33,9 +32,10 @@ import org.apache.druid.query.filter.ExpressionDimFilter;
import org.apache.druid.query.filter.SelectorDimFilter;
import org.apache.druid.segment.ColumnValueSelector;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexStorageAdapter;
import org.apache.druid.segment.VirtualColumns;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.generator.GeneratorColumnSchema;
import org.apache.druid.segment.generator.GeneratorSchemaInfo;
@ -58,7 +58,6 @@ import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@ -146,52 +145,34 @@ public class ExpressionFilterBenchmark
@Benchmark
public void expressionFilter(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
expressionFilter.toFilter(),
index.getDataInterval(),
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("x");
consumeString(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setFilter(expressionFilter.toFilter())
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("x");
while (!cursor.isDone()) {
blackhole.consume(selector.getObject());
cursor.advance();
}
}
}
@Benchmark
public void nativeFilter(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
nativeFilter.toFilter(),
index.getDataInterval(),
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("x");
consumeString(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
}
private void consumeString(final Cursor cursor, final ColumnValueSelector selector, final Blackhole blackhole)
{
while (!cursor.isDone()) {
blackhole.consume(selector.getLong());
cursor.advance();
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setFilter(nativeFilter.toFilter())
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("x");
while (!cursor.isDone()) {
blackhole.consume(selector.getObject());
cursor.advance();
}
}
}
}

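In the two filter benchmarks above, the filter moves from a makeCursors() argument onto the CursorBuildSpec. A short sketch of that builder usage, assuming only the methods visible in this hunk (countMatching is a hypothetical helper, not Druid API):

import org.apache.druid.query.filter.Filter;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexStorageAdapter;

public class FilteredScanSketch
{
  static long countMatching(final QueryableIndex index, final Filter filter)
  {
    final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
                                                     .setFilter(filter)
                                                     .build();
    try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
      final Cursor cursor = cursorHolder.asCursor();
      long rows = 0;
      while (!cursor.isDone()) {
        rows++;              // count rows that pass the filter
        cursor.advance();
      }
      return rows;
    }
  }
}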
View File

@ -25,8 +25,10 @@ import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.guava.Sequences;
import org.apache.druid.java.util.common.io.Closer;
import org.apache.druid.math.expr.ExpressionProcessing;
import org.apache.druid.query.CursorGranularizer;
import org.apache.druid.query.dimension.DefaultDimensionSpec;
import org.apache.druid.query.dimension.ExtractionDimensionSpec;
import org.apache.druid.query.expression.LookupEnabledTestExprMacroTable;
@ -35,9 +37,14 @@ import org.apache.druid.query.extraction.StrlenExtractionFn;
import org.apache.druid.query.extraction.TimeFormatExtractionFn;
import org.apache.druid.segment.ColumnValueSelector;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.Cursors;
import org.apache.druid.segment.DimensionSelector;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexStorageAdapter;
import org.apache.druid.segment.QueryableIndexTimeBoundaryInspector;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.VirtualColumns;
import org.apache.druid.segment.column.ColumnHolder;
import org.apache.druid.segment.column.ColumnType;
@ -62,8 +69,8 @@ import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
import java.util.BitSet;
import java.util.List;
import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@ -141,388 +148,281 @@ public class ExpressionSelectorBenchmark
@Benchmark
public void timeFloorUsingExpression(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"timestamp_floor(__time, 'PT1H')",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"timestamp_floor(__time, 'PT1H')",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
}
}
@Benchmark
public void timeFloorUsingExtractionFn(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final DimensionSelector selector = cursor
.getColumnSelectorFactory()
.makeDimensionSelector(
new ExtractionDimensionSpec(
ColumnHolder.TIME_COLUMN_NAME,
"v",
new TimeFormatExtractionFn(null, null, null, Granularities.HOUR, true)
)
);
consumeDimension(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
final DimensionSelector selector = cursor
.getColumnSelectorFactory()
.makeDimensionSelector(
new ExtractionDimensionSpec(
ColumnHolder.TIME_COLUMN_NAME,
"v",
new TimeFormatExtractionFn(null, null, null, Granularities.HOUR, true)
)
);
consumeDimension(cursor, selector, blackhole);
}
}
@Benchmark
public void timeFloorUsingCursor(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.EMPTY,
Granularities.HOUR,
false,
null
);
final List<Long> results = cursors
.map(cursor -> {
long count = 0L;
while (!cursor.isDone()) {
count++;
cursor.advance();
}
return count;
})
.toList();
long count = 0L;
for (Long result : results) {
count += result;
}
blackhole.consume(count);
final StorageAdapter adapter = new QueryableIndexStorageAdapter(index);
try (final CursorHolder cursorHolder = adapter.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
final CursorGranularizer granularizer = CursorGranularizer.create(
cursor,
QueryableIndexTimeBoundaryInspector.create(index),
Cursors.getTimeOrdering(index.getOrdering()),
Granularities.HOUR,
adapter.getInterval()
);
final Sequence<Long> results =
Sequences.simple(granularizer.getBucketIterable())
.map(bucketInterval -> {
if (!granularizer.advanceToBucket(bucketInterval)) {
return 0L;
}
long count = 0L;
while (!cursor.isDone()) {
count++;
if (!granularizer.advanceCursorWithinBucket()) {
break;
}
}
return count;
});
long count = 0L;
for (Long result : results.toList()) {
count += result;
}
blackhole.consume(count);
}
}
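The timeFloorUsingCursor rewrite above is where granularity handling changes shape: instead of asking makeCursors() for one cursor per Granularities.HOUR bucket, the new code takes one full-scan cursor and walks hour buckets with CursorGranularizer. A sketch of that bucket loop, assuming the CursorGranularizer methods exactly as they appear in this hunk and that getBucketIterable() yields Joda Intervals (as the bucketInterval lambda suggests):

    final StorageAdapter adapter = new QueryableIndexStorageAdapter(index);
    try (final CursorHolder cursorHolder = adapter.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
      final Cursor cursor = cursorHolder.asCursor();
      final CursorGranularizer granularizer = CursorGranularizer.create(
          cursor,
          QueryableIndexTimeBoundaryInspector.create(index),
          Cursors.getTimeOrdering(index.getOrdering()),
          Granularities.HOUR,
          adapter.getInterval()
      );
      long totalRows = 0;
      for (org.joda.time.Interval bucket : granularizer.getBucketIterable()) {
        if (!granularizer.advanceToBucket(bucket)) {
          continue;                                   // this hour bucket has no rows
        }
        while (!cursor.isDone()) {
          totalRows++;
          if (!granularizer.advanceCursorWithinBucket()) {
            break;                                    // reached the end of this hour bucket
          }
        }
      }
      blackhole.consume(totalRows);
    }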
@Benchmark
public void timeFormatUsingExpression(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"timestamp_format(__time, 'yyyy-MM-dd')",
ColumnType.STRING,
TestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final DimensionSelector selector = cursor.getColumnSelectorFactory().makeDimensionSelector(
DefaultDimensionSpec.of("v")
);
consumeDimension(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"timestamp_format(__time, 'yyyy-MM-dd')",
ColumnType.STRING,
TestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final DimensionSelector selector = cursor.getColumnSelectorFactory().makeDimensionSelector(
DefaultDimensionSpec.of("v")
);
consumeDimension(cursor, selector, blackhole);
}
}
@Benchmark
public void timeFormatUsingExtractionFn(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final DimensionSelector selector = cursor
.getColumnSelectorFactory()
.makeDimensionSelector(
new ExtractionDimensionSpec(
ColumnHolder.TIME_COLUMN_NAME,
"v",
new TimeFormatExtractionFn("yyyy-MM-dd", null, null, null, false)
)
);
consumeDimension(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
final DimensionSelector selector = cursor
.getColumnSelectorFactory()
.makeDimensionSelector(
new ExtractionDimensionSpec(
ColumnHolder.TIME_COLUMN_NAME,
"v",
new TimeFormatExtractionFn("yyyy-MM-dd", null, null, null, false)
)
);
consumeDimension(cursor, selector, blackhole);
}
}
@Benchmark
public void strlenUsingExpressionAsLong(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"strlen(s)",
ColumnType.STRING,
TestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"strlen(s)",
ColumnType.STRING,
TestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
}
}
@Benchmark
public void strlenUsingExpressionAsString(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"strlen(s)",
ColumnType.STRING,
TestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final DimensionSelector selector = cursor
.getColumnSelectorFactory()
.makeDimensionSelector(new DefaultDimensionSpec("v", "v", ColumnType.STRING));
consumeDimension(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"strlen(s)",
ColumnType.STRING,
TestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final DimensionSelector selector = cursor
.getColumnSelectorFactory()
.makeDimensionSelector(new DefaultDimensionSpec("v", "v", ColumnType.STRING));
consumeDimension(cursor, selector, blackhole);
}
}
@Benchmark
public void strlenUsingExtractionFn(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final DimensionSelector selector = cursor
.getColumnSelectorFactory()
.makeDimensionSelector(new ExtractionDimensionSpec("x", "v", StrlenExtractionFn.instance()));
consumeDimension(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
final DimensionSelector selector = cursor
.getColumnSelectorFactory()
.makeDimensionSelector(new ExtractionDimensionSpec("x", "v", StrlenExtractionFn.instance()));
consumeDimension(cursor, selector, blackhole);
}
}
@Benchmark
public void arithmeticOnLong(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"n + 1",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"n + 1",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
}
}
@Benchmark
public void stringConcatAndCompareOnLong(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"concat(n, ' is my favorite number') == '3 is my favorite number'",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"concat(n, ' is my favorite number') == '3 is my favorite number'",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
}
}
@Benchmark
public void caseSearched1(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"case_searched(s == 'asd' || isnull(s) || s == 'xxx', 1, s == 'foo' || s == 'bar', 2, 3)",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"case_searched(s == 'asd' || isnull(s) || s == 'xxx', 1, s == 'foo' || s == 'bar', 2, 3)",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
}
}
@Benchmark
public void caseSearched2(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"case_searched(s == 'asd' || isnull(s) || n == 1, 1, n == 2, 2, 3)",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"case_searched(s == 'asd' || isnull(s) || n == 1, 1, n == 2, 2, 3)",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
}
}
@Benchmark
public void caseSearched100(Blackhole blackhole)
{
StringBuilder caseBranches = new StringBuilder();
for (int i = 0; i < 100; i++) {
caseBranches.append(
@ -534,115 +434,90 @@ public class ExpressionSelectorBenchmark
);
}
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"case_searched(s == 'asd' || isnull(s) || n == 1, 1, " + caseBranches + " 3)",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"case_searched(s == 'asd' || isnull(s) || n == 1, 1, " + caseBranches + " 3)",
ColumnType.LONG,
TestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
}
}
@Benchmark
public void caseSearchedWithLookup(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"v",
"case_searched(n == 1001, -1, "
+ "lookup(s, 'lookyloo') == 'asd1', 1, "
+ "lookup(s, 'lookyloo') == 'asd2', 2, "
+ "lookup(s, 'lookyloo') == 'asd3', 3, "
+ "lookup(s, 'lookyloo') == 'asd4', 4, "
+ "lookup(s, 'lookyloo') == 'asd5', 5, "
+ "-2)",
ColumnType.LONG,
LookupEnabledTestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
new ExpressionVirtualColumn(
"v",
"case_searched(n == 1001, -1, "
+ "lookup(s, 'lookyloo') == 'asd1', 1, "
+ "lookup(s, 'lookyloo') == 'asd2', 2, "
+ "lookup(s, 'lookyloo') == 'asd3', 3, "
+ "lookup(s, 'lookyloo') == 'asd4', 4, "
+ "lookup(s, 'lookyloo') == 'asd5', 5, "
+ "-2)",
ColumnType.LONG,
LookupEnabledTestExprMacroTable.INSTANCE
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
}
}
@Benchmark
public void caseSearchedWithLookup2(Blackhole blackhole)
{
final Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"ll",
"lookup(s, 'lookyloo')",
ColumnType.STRING,
LookupEnabledTestExprMacroTable.INSTANCE
),
new ExpressionVirtualColumn(
"v",
"case_searched(n == 1001, -1, "
+ "ll == 'asd1', 1, "
+ "ll == 'asd2', 2, "
+ "ll == 'asd3', 3, "
+ "ll == 'asd4', 4, "
+ "ll == 'asd5', 5, "
+ "-2)",
ColumnType.LONG,
LookupEnabledTestExprMacroTable.INSTANCE
)
)
),
Granularities.ALL,
false,
null
);
final List<?> results = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
return null;
})
.toList();
blackhole.consume(results);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(
VirtualColumns.create(
ImmutableList.of(
new ExpressionVirtualColumn(
"ll",
"lookup(s, 'lookyloo')",
ColumnType.STRING,
LookupEnabledTestExprMacroTable.INSTANCE
),
new ExpressionVirtualColumn(
"v",
"case_searched(n == 1001, -1, "
+ "ll == 'asd1', 1, "
+ "ll == 'asd2', 2, "
+ "ll == 'asd3', 3, "
+ "ll == 'asd4', 4, "
+ "ll == 'asd5', 5, "
+ "-2)",
ColumnType.LONG,
LookupEnabledTestExprMacroTable.INSTANCE
)
)
)
)
.build();
try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
consumeLong(cursor, selector, blackhole);
}
}

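A smaller API change is also visible throughout this file: the new VirtualColumns.create(...) calls pass an ExpressionVirtualColumn directly instead of wrapping it in ImmutableList.of(...) (the multi-column case in caseSearchedWithLookup2 still uses the list form). A condensed sketch of the build-spec-plus-selector pattern these benchmarks repeat, assuming a LONG-typed expression column named "v":

    final CursorBuildSpec buildSpec =
        CursorBuildSpec.builder()
                       .setVirtualColumns(
                           VirtualColumns.create(
                               new ExpressionVirtualColumn("v", "n + 1", ColumnType.LONG, TestExprMacroTable.INSTANCE)
                           )
                       )
                       .build();
    try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
      final Cursor cursor = cursorHolder.asCursor();
      final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
      while (!cursor.isDone()) {
        blackhole.consume(selector.getLong());   // "v" is declared as LONG above
        cursor.advance();
      }
    }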
View File

@ -22,7 +22,6 @@ package org.apache.druid.benchmark;
import com.google.common.collect.ImmutableList;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.io.Closer;
import org.apache.druid.math.expr.Expr;
import org.apache.druid.math.expr.ExprMacroTable;
@ -33,6 +32,8 @@ import org.apache.druid.query.expression.TestExprMacroTable;
import org.apache.druid.segment.ColumnCache;
import org.apache.druid.segment.ColumnValueSelector;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexStorageAdapter;
import org.apache.druid.segment.VirtualColumns;
@ -150,15 +151,14 @@ public class ExpressionVectorSelectorBenchmark
)
)
);
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(virtualColumns)
.build();
final CursorHolder cursorHolder = closer.register(
new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)
);
if (vectorize) {
VectorCursor cursor = new QueryableIndexStorageAdapter(index).makeVectorCursor(
null,
index.getDataInterval(),
virtualColumns,
false,
512,
null
);
VectorCursor cursor = cursorHolder.asVectorCursor();
if (outputType.isNumeric()) {
VectorValueSelector selector = cursor.getColumnSelectorFactory().makeValueSelector("v");
if (outputType.is(ExprType.DOUBLE)) {
@ -174,29 +174,34 @@ public class ExpressionVectorSelectorBenchmark
cursor.advance();
}
}
closer.register(cursor);
}
} else {
Sequence<Cursor> cursors = new QueryableIndexStorageAdapter(index).makeCursors(
null,
index.getDataInterval(),
virtualColumns,
Granularities.ALL,
false,
null
);
int rowCount = cursors
.map(cursor -> {
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
int rows = 0;
while (!cursor.isDone()) {
blackhole.consume(selector.getObject());
rows++;
cursor.advance();
}
return rows;
}).accumulate(0, (acc, in) -> acc + in);
final Cursor cursor = cursorHolder.asCursor();
final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
int rowCount = 0;
if (outputType.isNumeric()) {
if (outputType.is(ExprType.DOUBLE)) {
while (!cursor.isDone()) {
blackhole.consume(selector.isNull());
blackhole.consume(selector.getDouble());
rowCount++;
cursor.advance();
}
} else {
while (!cursor.isDone()) {
blackhole.consume(selector.isNull());
blackhole.consume(selector.getLong());
rowCount++;
cursor.advance();
}
}
} else {
while (!cursor.isDone()) {
blackhole.consume(selector.getObject());
rowCount++;
cursor.advance();
}
}
blackhole.consume(rowCount);
}

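In this file the CursorHolder is registered with the benchmark's Closer rather than closed by a try-with-resources block, and the same holder serves both paths: asVectorCursor() when the benchmark is vectorized, asCursor() otherwise. A rough sketch of that branch; VectorValueSelector.getLongVector() is assumed for the vectorized read (it is not shown verbatim in this hunk), and "v" is assumed to be a LONG-typed virtual column:

    try (final CursorHolder cursorHolder = new QueryableIndexStorageAdapter(index).makeCursorHolder(buildSpec)) {
      if (vectorize) {
        final VectorCursor cursor = cursorHolder.asVectorCursor();
        final VectorValueSelector selector = cursor.getColumnSelectorFactory().makeValueSelector("v");
        while (!cursor.isDone()) {
          blackhole.consume(selector.getLongVector());  // one batch of values per iteration
          cursor.advance();
        }
      } else {
        final Cursor cursor = cursorHolder.asCursor();
        final ColumnValueSelector selector = cursor.getColumnSelectorFactory().makeColumnValueSelector("v");
        while (!cursor.isDone()) {
          blackhole.consume(selector.getLong());        // one row per iteration
          cursor.advance();
        }
      }
    }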
View File

@ -25,9 +25,6 @@ import org.apache.druid.common.config.NullHandling;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.java.util.common.FileUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.guava.Sequences;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.js.JavaScriptConfig;
import org.apache.druid.query.aggregation.hyperloglog.HyperUniquesSerde;
@ -48,6 +45,8 @@ import org.apache.druid.query.filter.SelectorDimFilter;
import org.apache.druid.query.ordering.StringComparators;
import org.apache.druid.segment.BaseLongColumnValueSelector;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.DimensionSelector;
import org.apache.druid.segment.IndexIO;
import org.apache.druid.segment.IndexMergerV9;
@ -55,7 +54,6 @@ import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexStorageAdapter;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.VirtualColumns;
import org.apache.druid.segment.column.ColumnConfig;
import org.apache.druid.segment.column.ColumnHolder;
import org.apache.druid.segment.data.IndexedInts;
@ -234,8 +232,10 @@ public class FilterPartitionBenchmark
public void stringRead(Blackhole blackhole)
{
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, null);
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, null)) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
@Benchmark
@ -244,9 +244,10 @@ public class FilterPartitionBenchmark
public void longRead(Blackhole blackhole)
{
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, null);
readCursorsLong(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, null)) {
final Cursor cursor = cursorHolder.asCursor();
readCursorLong(cursor, blackhole);
}
}
@Benchmark
@ -255,9 +256,10 @@ public class FilterPartitionBenchmark
public void timeFilterNone(Blackhole blackhole)
{
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, timeFilterNone);
readCursorsLong(cursors, blackhole);
try (CursorHolder cursorHolder = makeCursorHolder(sa, timeFilterNone)) {
final Cursor cursor = cursorHolder.asCursor();
readCursorLong(cursor, blackhole);
}
}
@Benchmark
@ -266,9 +268,10 @@ public class FilterPartitionBenchmark
public void timeFilterHalf(Blackhole blackhole)
{
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, timeFilterHalf);
readCursorsLong(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, timeFilterHalf)) {
final Cursor cursor = cursorHolder.asCursor();
readCursorLong(cursor, blackhole);
}
}
@Benchmark
@ -277,9 +280,10 @@ public class FilterPartitionBenchmark
public void timeFilterAll(Blackhole blackhole)
{
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, timeFilterAll);
readCursorsLong(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, timeFilterAll)) {
final Cursor cursor = cursorHolder.asCursor();
readCursorLong(cursor, blackhole);
}
}
@Benchmark
@ -290,8 +294,10 @@ public class FilterPartitionBenchmark
Filter filter = new SelectorFilter("dimSequential", "199");
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, filter);
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, filter)) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
@Benchmark
@ -302,8 +308,10 @@ public class FilterPartitionBenchmark
Filter filter = new NoBitmapSelectorFilter("dimSequential", "199");
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, filter);
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, filter)) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
@Benchmark
@ -314,8 +322,10 @@ public class FilterPartitionBenchmark
Filter filter = new SelectorDimFilter("dimSequential", "super-199", JS_EXTRACTION_FN).toFilter();
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, filter);
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, filter)) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
@Benchmark
@ -326,8 +336,10 @@ public class FilterPartitionBenchmark
Filter filter = new NoBitmapSelectorDimFilter("dimSequential", "super-199", JS_EXTRACTION_FN).toFilter();
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, filter);
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, filter)) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
@Benchmark
@ -343,8 +355,10 @@ public class FilterPartitionBenchmark
);
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, andFilter);
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, andFilter)) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
@Benchmark
@ -357,8 +371,10 @@ public class FilterPartitionBenchmark
Filter orFilter = new OrFilter(Arrays.asList(filter, filter2));
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, orFilter);
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, orFilter)) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
@Benchmark
@ -371,8 +387,10 @@ public class FilterPartitionBenchmark
Filter orFilter = new OrFilter(Arrays.asList(filter, filter2));
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, Filters.toCnf(orFilter));
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, Filters.toCnf(orFilter))) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
@Benchmark
@ -408,8 +426,10 @@ public class FilterPartitionBenchmark
);
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, dimFilter3.toFilter());
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, dimFilter3.toFilter())) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
@Benchmark
@ -445,55 +465,46 @@ public class FilterPartitionBenchmark
);
StorageAdapter sa = new QueryableIndexStorageAdapter(qIndex);
Sequence<Cursor> cursors = makeCursors(sa, Filters.toCnf(dimFilter3.toFilter()));
readCursors(cursors, blackhole);
try (final CursorHolder cursorHolder = makeCursorHolder(sa, Filters.toCnf(dimFilter3.toFilter()))) {
final Cursor cursor = cursorHolder.asCursor();
readCursor(cursor, blackhole);
}
}
private Sequence<Cursor> makeCursors(StorageAdapter sa, Filter filter)
{
return sa.makeCursors(filter, schemaInfo.getDataInterval(), VirtualColumns.EMPTY, Granularities.ALL, false, null);
}
private void readCursors(Sequence<Cursor> cursors, Blackhole blackhole)
{
final Sequence<Void> voids = Sequences.map(
cursors,
input -> {
List<DimensionSelector> selectors = new ArrayList<>();
selectors.add(
input.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimSequential", null))
);
while (!input.isDone()) {
for (DimensionSelector selector : selectors) {
IndexedInts row = selector.getRow();
blackhole.consume(selector.lookupName(row.get(0)));
}
input.advance();
}
return null;
}
);
blackhole.consume(voids.toList());
}
private void readCursorsLong(Sequence<Cursor> cursors, final Blackhole blackhole)
{
final Sequence<Void> voids = Sequences.map(
cursors,
input -> {
BaseLongColumnValueSelector selector = input.getColumnSelectorFactory()
.makeColumnValueSelector("sumLongSequential");
while (!input.isDone()) {
long rowval = selector.getLong();
blackhole.consume(rowval);
input.advance();
}
return null;
}
);
blackhole.consume(voids.toList());
}
private CursorHolder makeCursorHolder(StorageAdapter sa, Filter filter)
{
return sa.makeCursorHolder(
CursorBuildSpec.builder()
.setFilter(filter)
.setInterval(schemaInfo.getDataInterval())
.build()
);
}
private void readCursor(Cursor cursor, Blackhole blackhole)
{
List<DimensionSelector> selectors = new ArrayList<>();
selectors.add(
cursor.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimSequential", null))
);
while (!cursor.isDone()) {
for (DimensionSelector selector : selectors) {
IndexedInts row = selector.getRow();
blackhole.consume(selector.lookupName(row.get(0)));
}
cursor.advance();
}
}
private void readCursorLong(Cursor cursor, final Blackhole blackhole)
{
BaseLongColumnValueSelector selector = cursor.getColumnSelectorFactory()
.makeColumnValueSelector("sumLongSequential");
while (!cursor.isDone()) {
long rowval = selector.getLong();
blackhole.consume(rowval);
cursor.advance();
}
}
private static class NoBitmapSelectorFilter extends SelectorFilter

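The makeCursorHolder helper above is the shape most callers in this file now share: filter and interval travel in the CursorBuildSpec, and closing the holder is the caller's job. A small usage sketch (openAndCount is hypothetical; only the builder methods shown above are assumed):

    static long openAndCount(final StorageAdapter sa, @Nullable final Filter filter, final Interval interval)
    {
      final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
                                                       .setFilter(filter)
                                                       .setInterval(interval)
                                                       .build();
      // The holder owns the underlying column resources, so it must be closed when reading is done.
      try (final CursorHolder cursorHolder = sa.makeCursorHolder(buildSpec)) {
        final Cursor cursor = cursorHolder.asCursor();
        long rows = 0;
        while (!cursor.isDone()) {
          rows++;
          cursor.advance();
        }
        return rows;
      }
    }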
View File

@ -27,7 +27,6 @@ import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.io.Closer;
import org.apache.druid.math.expr.ExprMacroTable;
import org.apache.druid.query.QueryContexts;
@ -37,6 +36,8 @@ import org.apache.druid.query.dimension.DefaultDimensionSpec;
import org.apache.druid.segment.BaseObjectColumnValueSelector;
import org.apache.druid.segment.ColumnSelectorFactory;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.DimensionSelector;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexSegment;
@ -216,9 +217,11 @@ public class IndexedTableJoinCursorBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void hashJoinCursorColumnValueSelectors(Blackhole blackhole)
{
final Sequence<Cursor> cursors = makeCursors();
int rowCount = processRowsValueSelector(blackhole, cursors, projectionColumns);
blackhole.consume(rowCount);
try (final CursorHolder cursorHolder = makeCursorHolder()) {
final Cursor cursor = cursorHolder.asCursor();
int rowCount = processRowsValueSelector(blackhole, cursor, projectionColumns);
blackhole.consume(rowCount);
}
}
@Benchmark
@ -226,21 +229,16 @@ public class IndexedTableJoinCursorBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void hashJoinCursorDimensionSelectors(Blackhole blackhole)
{
final Sequence<Cursor> cursors = makeCursors();
int rowCount = processRowsDimensionSelectors(blackhole, cursors, projectionColumns);
blackhole.consume(rowCount);
try (final CursorHolder cursorHolder = makeCursorHolder()) {
final Cursor cursor = cursorHolder.asCursor();
int rowCount = processRowsDimensionSelectors(blackhole, cursor, projectionColumns);
blackhole.consume(rowCount);
}
}
private Sequence<Cursor> makeCursors()
private CursorHolder makeCursorHolder()
{
return hashJoinSegment.asStorageAdapter().makeCursors(
null,
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
return hashJoinSegment.asStorageAdapter().makeCursorHolder(CursorBuildSpec.FULL_SCAN);
}
@ -313,106 +311,96 @@ public class IndexedTableJoinCursorBenchmark
private static int processRowsDimensionSelectors(
final Blackhole blackhole,
final Sequence<Cursor> cursors,
final Cursor cursor,
final Set<String> columns
)
{
if (columns.size() == 1) {
return processRowsSingleDimensionSelector(blackhole, cursors, Iterables.getOnlyElement(columns));
return processRowsSingleDimensionSelector(blackhole, cursor, Iterables.getOnlyElement(columns));
}
return cursors.map(
cursor -> {
List<DimensionSelector> selectors = columns.stream().map(column -> {
ColumnSelectorFactory factory = cursor.getColumnSelectorFactory();
return factory.makeDimensionSelector(DefaultDimensionSpec.of(column));
}).collect(Collectors.toList());
List<DimensionSelector> selectors = columns.stream().map(column -> {
ColumnSelectorFactory factory = cursor.getColumnSelectorFactory();
return factory.makeDimensionSelector(DefaultDimensionSpec.of(column));
}).collect(Collectors.toList());
int rowCount = 0;
while (!cursor.isDone()) {
for (DimensionSelector selector : selectors) {
if (selector.getValueCardinality() < 0) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
blackhole.consume(selector.lookupName(row.get(i)));
}
} else {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
blackhole.consume(row.get(i));
}
}
}
rowCount++;
cursor.advance();
int rowCount = 0;
while (!cursor.isDone()) {
for (DimensionSelector selector : selectors) {
if (selector.getValueCardinality() < 0) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
blackhole.consume(selector.lookupName(row.get(i)));
}
return rowCount;
}).accumulate(0, (acc, in) -> acc + in);
} else {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
blackhole.consume(row.get(i));
}
}
}
rowCount++;
cursor.advance();
}
return rowCount;
}
private static int processRowsSingleDimensionSelector(
final Blackhole blackhole,
final Sequence<Cursor> cursors,
final Cursor cursor,
final String dimension
)
{
return cursors.map(
cursor -> {
final DimensionSelector selector = cursor.getColumnSelectorFactory()
.makeDimensionSelector(DefaultDimensionSpec.of(dimension));
final DimensionSelector selector = cursor.getColumnSelectorFactory()
.makeDimensionSelector(DefaultDimensionSpec.of(dimension));
int rowCount = 0;
if (selector.getValueCardinality() < 0) {
String lastValue;
while (!cursor.isDone()) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
lastValue = selector.lookupName(row.get(i));
blackhole.consume(lastValue);
}
rowCount++;
cursor.advance();
}
return rowCount;
} else {
int lastValue;
while (!cursor.isDone()) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
lastValue = row.get(i);
blackhole.consume(lastValue);
}
rowCount++;
cursor.advance();
}
return rowCount;
}
int rowCount = 0;
if (selector.getValueCardinality() < 0) {
String lastValue;
while (!cursor.isDone()) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
lastValue = selector.lookupName(row.get(i));
blackhole.consume(lastValue);
}
).accumulate(0, (acc, in) -> acc + in);
rowCount++;
cursor.advance();
}
return rowCount;
} else {
int lastValue;
while (!cursor.isDone()) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
lastValue = row.get(i);
blackhole.consume(lastValue);
}
rowCount++;
cursor.advance();
}
return rowCount;
}
}
private static int processRowsValueSelector(final Blackhole blackhole, final Sequence<Cursor> cursors, final Set<String> columns)
private static int processRowsValueSelector(final Blackhole blackhole, final Cursor cursor, final Set<String> columns)
{
return cursors.map(
cursor -> {
ColumnSelectorFactory factory = cursor.getColumnSelectorFactory();
ColumnSelectorFactory factory = cursor.getColumnSelectorFactory();
List<BaseObjectColumnValueSelector> selectors =
columns.stream().map(factory::makeColumnValueSelector).collect(Collectors.toList());
int rowCount = 0;
while (!cursor.isDone()) {
for (BaseObjectColumnValueSelector<?> selector : selectors) {
blackhole.consume(selector.getObject());
}
List<BaseObjectColumnValueSelector> selectors =
columns.stream().map(factory::makeColumnValueSelector).collect(Collectors.toList());
int rowCount = 0;
while (!cursor.isDone()) {
for (BaseObjectColumnValueSelector<?> selector : selectors) {
blackhole.consume(selector.getObject());
}
rowCount++;
cursor.advance();
}
return rowCount;
}).accumulate(0, (acc, in) -> acc + in);
rowCount++;
cursor.advance();
}
return rowCount;
}
}

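With a single Cursor instead of a Sequence<Cursor>, the row-processing helpers above no longer need map(...).accumulate(...); they loop and return the count directly. A compact sketch of that shape (countRows is a hypothetical stand-in for processRowsValueSelector):

    static int countRows(final Cursor cursor, final Set<String> columns, final Blackhole blackhole)
    {
      final ColumnSelectorFactory factory = cursor.getColumnSelectorFactory();
      final List<BaseObjectColumnValueSelector> selectors =
          columns.stream().map(factory::makeColumnValueSelector).collect(Collectors.toList());
      int rowCount = 0;
      while (!cursor.isDone()) {
        for (BaseObjectColumnValueSelector<?> selector : selectors) {
          blackhole.consume(selector.getObject());
        }
        rowCount++;
        cursor.advance();
      }
      return rowCount;
    }

    // Typical call site, mirroring the benchmarks above:
    // try (final CursorHolder cursorHolder = hashJoinSegment.asStorageAdapter().makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
    //   blackhole.consume(countRows(cursorHolder.asCursor(), projectionColumns, blackhole));
    // }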
View File

@ -23,18 +23,18 @@ import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.FileUtils;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.math.expr.ExprMacroTable;
import org.apache.druid.query.dimension.DefaultDimensionSpec;
import org.apache.druid.query.expression.LookupExprMacro;
import org.apache.druid.query.filter.Filter;
import org.apache.druid.query.filter.SelectorDimFilter;
import org.apache.druid.query.lookup.LookupExtractorFactoryContainer;
import org.apache.druid.query.lookup.LookupExtractorFactoryContainerProvider;
import org.apache.druid.query.lookup.MapLookupExtractorFactory;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.DimensionSelector;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexSegment;
@ -334,38 +334,34 @@ public class JoinAndLookupBenchmark
);
}
private static String getLastValue(final Sequence<Cursor> cursors, final String dimension)
{
return cursors.map(
cursor -> {
final DimensionSelector selector = cursor.getColumnSelectorFactory()
.makeDimensionSelector(DefaultDimensionSpec.of(dimension));
if (selector.getValueCardinality() < 0) {
String lastValue = null;
while (!cursor.isDone()) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
lastValue = selector.lookupName(row.get(i));
}
cursor.advance();
}
return lastValue;
} else {
int lastValue = -1;
while (!cursor.isDone()) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
lastValue = row.get(i);
}
cursor.advance();
}
return selector.lookupName(lastValue);
}
}
).accumulate(null, (acc, in) -> in);
}
private static String getLastValue(final Cursor cursor, final String dimension)
{
final DimensionSelector selector = cursor.getColumnSelectorFactory()
.makeDimensionSelector(DefaultDimensionSpec.of(dimension));
if (selector.getValueCardinality() < 0) {
String lastValue = null;
while (!cursor.isDone()) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
lastValue = selector.lookupName(row.get(i));
}
cursor.advance();
}
return lastValue;
} else {
int lastValue = -1;
while (!cursor.isDone()) {
final IndexedInts row = selector.getRow();
final int sz = row.size();
for (int i = 0; i < sz; i++) {
lastValue = row.get(i);
}
cursor.advance();
}
return selector.lookupName(lastValue);
}
}
@Benchmark
@ -373,16 +369,10 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void baseSegment(Blackhole blackhole)
{
final Sequence<Cursor> cursors = baseSegment.asStorageAdapter().makeCursors(
null,
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "countryIsoCode"));
try (final CursorHolder cursorHolder = baseSegment.asStorageAdapter().makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "countryIsoCode"));
}
}
@Benchmark
@ -390,16 +380,14 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void baseSegmentWithFilter(Blackhole blackhole)
{
final Sequence<Cursor> cursors = baseSegment.asStorageAdapter().makeCursors(
new SelectorDimFilter("countryIsoCode", "CA", null).toFilter(),
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "countryIsoCode"));
final Filter filter = new SelectorDimFilter("countryIsoCode", "CA", null).toFilter();
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setFilter(filter)
.build();
try (final CursorHolder cursorHolder = baseSegment.asStorageAdapter().makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "countryIsoCode"));
}
}
@Benchmark
@ -407,16 +395,11 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void joinLookupStringKey(Blackhole blackhole)
{
final Sequence<Cursor> cursors = hashJoinLookupStringKeySegment.asStorageAdapter().makeCursors(
null,
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "c.v"));
try (final CursorHolder cursorHolder = hashJoinLookupStringKeySegment.asStorageAdapter()
.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "c.v"));
}
}
@Benchmark
@ -424,16 +407,15 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void joinLookupStringKeyWithFilter(Blackhole blackhole)
{
final Sequence<Cursor> cursors = hashJoinLookupStringKeySegment.asStorageAdapter().makeCursors(
new SelectorDimFilter("c.v", "Canada", null).toFilter(),
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "c.v"));
final Filter filter = new SelectorDimFilter("c.v", "Canada", null).toFilter();
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setFilter(filter)
.build();
try (final CursorHolder cursorHolder = hashJoinLookupStringKeySegment.asStorageAdapter()
.makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "c.v"));
}
}
@Benchmark
@ -441,16 +423,11 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void joinLookupLongKey(Blackhole blackhole)
{
final Sequence<Cursor> cursors = hashJoinLookupLongKeySegment.asStorageAdapter().makeCursors(
null,
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "c.v"));
try (final CursorHolder cursorHolder = hashJoinLookupLongKeySegment.asStorageAdapter()
.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "c.v"));
}
}
@Benchmark
@ -458,16 +435,15 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void joinLookupLongKeyWithFilter(Blackhole blackhole)
{
final Sequence<Cursor> cursors = hashJoinLookupLongKeySegment.asStorageAdapter().makeCursors(
new SelectorDimFilter("c.v", "Canada", null).toFilter(),
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "c.v"));
final Filter filter = new SelectorDimFilter("c.v", "Canada", null).toFilter();
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setFilter(filter)
.build();
try (final CursorHolder cursorHolder = hashJoinLookupLongKeySegment.asStorageAdapter()
.makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "c.v"));
}
}
@Benchmark
@ -475,16 +451,11 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void joinIndexedTableLongKey(Blackhole blackhole)
{
final Sequence<Cursor> cursors = hashJoinIndexedTableLongKeySegment.asStorageAdapter().makeCursors(
null,
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "c.countryName"));
try (final CursorHolder cursorHolder = hashJoinIndexedTableLongKeySegment.asStorageAdapter()
.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "c.countryName"));
}
}
@Benchmark
@ -492,16 +463,15 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void joinIndexedTableLongKeyWithFilter(Blackhole blackhole)
{
final Sequence<Cursor> cursors = hashJoinIndexedTableLongKeySegment.asStorageAdapter().makeCursors(
new SelectorDimFilter("c.countryName", "Canada", null).toFilter(),
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "c.countryName"));
final Filter filter = new SelectorDimFilter("c.countryName", "Canada", null).toFilter();
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setFilter(filter)
.build();
try (final CursorHolder cursorHolder = hashJoinIndexedTableLongKeySegment.asStorageAdapter()
.makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "c.countryName"));
}
}
@Benchmark
@ -509,16 +479,11 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void joinIndexedTableStringKey(Blackhole blackhole)
{
final Sequence<Cursor> cursors = hashJoinIndexedTableStringKeySegment.asStorageAdapter().makeCursors(
null,
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "c.countryName"));
try (final CursorHolder cursorHolder = hashJoinIndexedTableStringKeySegment.asStorageAdapter()
.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "c.countryName"));
}
}
@Benchmark
@ -526,16 +491,15 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void joinIndexedTableStringKeyWithFilter(Blackhole blackhole)
{
final Sequence<Cursor> cursors = hashJoinIndexedTableStringKeySegment.asStorageAdapter().makeCursors(
new SelectorDimFilter("c.countryName", "Canada", null).toFilter(),
Intervals.ETERNITY,
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, "c.countryName"));
final Filter filter = new SelectorDimFilter("c.countryName", "Canada", null).toFilter();
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setFilter(filter)
.build();
try (final CursorHolder cursorHolder = hashJoinIndexedTableStringKeySegment.asStorageAdapter()
.makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, "c.countryName"));
}
}
@Benchmark
@ -543,16 +507,14 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void lookupVirtualColumnStringKey(Blackhole blackhole)
{
final Sequence<Cursor> cursors = baseSegment.asStorageAdapter().makeCursors(
null,
Intervals.ETERNITY,
lookupVirtualColumns,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, LOOKUP_COUNTRY_CODE_TO_NAME));
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(lookupVirtualColumns)
.build();
try (final CursorHolder cursorHolder = baseSegment.asStorageAdapter()
.makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, LOOKUP_COUNTRY_CODE_TO_NAME));
}
}
@Benchmark
@ -560,16 +522,16 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void lookupVirtualColumnStringKeyWithFilter(Blackhole blackhole)
{
final Sequence<Cursor> cursors = baseSegment.asStorageAdapter().makeCursors(
new SelectorDimFilter(LOOKUP_COUNTRY_CODE_TO_NAME, "Canada", null).toFilter(),
Intervals.ETERNITY,
lookupVirtualColumns,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, LOOKUP_COUNTRY_CODE_TO_NAME));
final Filter filter = new SelectorDimFilter(LOOKUP_COUNTRY_CODE_TO_NAME, "Canada", null).toFilter();
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setFilter(filter)
.setVirtualColumns(lookupVirtualColumns)
.build();
try (final CursorHolder cursorHolder = baseSegment.asStorageAdapter()
.makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, LOOKUP_COUNTRY_CODE_TO_NAME));
}
}
@Benchmark
@ -577,16 +539,13 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void lookupVirtualColumnLongKey(Blackhole blackhole)
{
final Sequence<Cursor> cursors = baseSegment.asStorageAdapter().makeCursors(
null,
Intervals.ETERNITY,
lookupVirtualColumns,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, LOOKUP_COUNTRY_NUMBER_TO_NAME));
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(lookupVirtualColumns)
.build();
try (final CursorHolder cursorHolder = baseSegment.asStorageAdapter().makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, LOOKUP_COUNTRY_NUMBER_TO_NAME));
}
}
@Benchmark
@ -594,15 +553,14 @@ public class JoinAndLookupBenchmark
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public void lookupVirtualColumnLongKeyWithFilter(Blackhole blackhole)
{
final Sequence<Cursor> cursors = baseSegment.asStorageAdapter().makeCursors(
new SelectorDimFilter(LOOKUP_COUNTRY_NUMBER_TO_NAME, "Canada", null).toFilter(),
Intervals.ETERNITY,
lookupVirtualColumns,
Granularities.ALL,
false,
null
);
blackhole.consume(getLastValue(cursors, LOOKUP_COUNTRY_NUMBER_TO_NAME));
final Filter filter = new SelectorDimFilter(LOOKUP_COUNTRY_NUMBER_TO_NAME, "Canada", null).toFilter();
final CursorBuildSpec buildSpec = CursorBuildSpec.builder()
.setVirtualColumns(lookupVirtualColumns)
.setFilter(filter)
.build();
try (final CursorHolder cursorHolder = baseSegment.asStorageAdapter().makeCursorHolder(buildSpec)) {
final Cursor cursor = cursorHolder.asCursor();
blackhole.consume(getLastValue(cursor, LOOKUP_COUNTRY_NUMBER_TO_NAME));
}
}
}

View File

@ -20,8 +20,6 @@
package org.apache.druid.benchmark.indexing;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.js.JavaScriptConfig;
import org.apache.druid.query.aggregation.hyperloglog.HyperUniquesSerde;
@ -36,8 +34,9 @@ import org.apache.druid.query.filter.SearchQueryDimFilter;
import org.apache.druid.query.ordering.StringComparators;
import org.apache.druid.query.search.ContainsSearchQuerySpec;
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.DimensionSelector;
import org.apache.druid.segment.VirtualColumns;
import org.apache.druid.segment.data.IndexedInts;
import org.apache.druid.segment.generator.DataGenerator;
import org.apache.druid.segment.generator.GeneratorBasicSchemas;
@ -149,22 +148,23 @@ public class IncrementalIndexReadBenchmark
public void read(Blackhole blackhole)
{
IncrementalIndexStorageAdapter sa = new IncrementalIndexStorageAdapter(incIndex);
Sequence<Cursor> cursors = makeCursors(sa, null);
Cursor cursor = cursors.limit(1).toList().get(0);
try (final CursorHolder cursorHolder = makeCursor(sa, null)) {
Cursor cursor = cursorHolder.asCursor();
List<DimensionSelector> selectors = new ArrayList<>();
selectors.add(makeDimensionSelector(cursor, "dimSequential"));
selectors.add(makeDimensionSelector(cursor, "dimZipf"));
selectors.add(makeDimensionSelector(cursor, "dimUniform"));
selectors.add(makeDimensionSelector(cursor, "dimSequentialHalfNull"));
List<DimensionSelector> selectors = new ArrayList<>();
selectors.add(makeDimensionSelector(cursor, "dimSequential"));
selectors.add(makeDimensionSelector(cursor, "dimZipf"));
selectors.add(makeDimensionSelector(cursor, "dimUniform"));
selectors.add(makeDimensionSelector(cursor, "dimSequentialHalfNull"));
cursor.reset();
while (!cursor.isDone()) {
for (DimensionSelector selector : selectors) {
IndexedInts row = selector.getRow();
blackhole.consume(selector.lookupName(row.get(0)));
cursor.reset();
while (!cursor.isDone()) {
for (DimensionSelector selector : selectors) {
IndexedInts row = selector.getRow();
blackhole.consume(selector.lookupName(row.get(0)));
}
cursor.advance();
}
cursor.advance();
}
}
@ -184,35 +184,34 @@ public class IncrementalIndexReadBenchmark
);
IncrementalIndexStorageAdapter sa = new IncrementalIndexStorageAdapter(incIndex);
Sequence<Cursor> cursors = makeCursors(sa, filter);
Cursor cursor = cursors.limit(1).toList().get(0);
try (final CursorHolder cursorHolder = makeCursor(sa, filter)) {
Cursor cursor = cursorHolder.asCursor();
List<DimensionSelector> selectors = new ArrayList<>();
selectors.add(makeDimensionSelector(cursor, "dimSequential"));
selectors.add(makeDimensionSelector(cursor, "dimZipf"));
selectors.add(makeDimensionSelector(cursor, "dimUniform"));
selectors.add(makeDimensionSelector(cursor, "dimSequentialHalfNull"));
List<DimensionSelector> selectors = new ArrayList<>();
selectors.add(makeDimensionSelector(cursor, "dimSequential"));
selectors.add(makeDimensionSelector(cursor, "dimZipf"));
selectors.add(makeDimensionSelector(cursor, "dimUniform"));
selectors.add(makeDimensionSelector(cursor, "dimSequentialHalfNull"));
cursor.reset();
while (!cursor.isDone()) {
for (DimensionSelector selector : selectors) {
IndexedInts row = selector.getRow();
blackhole.consume(selector.lookupName(row.get(0)));
cursor.reset();
while (!cursor.isDone()) {
for (DimensionSelector selector : selectors) {
IndexedInts row = selector.getRow();
blackhole.consume(selector.lookupName(row.get(0)));
}
cursor.advance();
}
cursor.advance();
}
}
private Sequence<Cursor> makeCursors(IncrementalIndexStorageAdapter sa, DimFilter filter)
private CursorHolder makeCursor(IncrementalIndexStorageAdapter sa, DimFilter filter)
{
return sa.makeCursors(
filter == null ? null : filter.toFilter(),
schemaInfo.getDataInterval(),
VirtualColumns.EMPTY,
Granularities.ALL,
false,
null
);
CursorBuildSpec.CursorBuildSpecBuilder builder = CursorBuildSpec.builder()
.setInterval(schemaInfo.getDataInterval());
if (filter != null) {
builder.setFilter(filter.toFilter());
}
return sa.makeCursorHolder(builder.build());
}
private static DimensionSelector makeDimensionSelector(Cursor cursor, String name)

View File

@ -121,12 +121,12 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@Fork(value = 2)
@Warmup(iterations = 10)
@Measurement(iterations = 25)
@Fork(value = 1)
@Warmup(iterations = 5)
@Measurement(iterations = 15)
public class GroupByBenchmark
{
@Param({"2", "4"})
@Param({"4"})
private int numProcessingThreads;
@Param({"-1"})
@ -141,7 +141,7 @@ public class GroupByBenchmark
@Param({"all", "day"})
private String queryGranularity;
@Param({"force", "false"})
@Param({"false", "force"})
private String vectorize;
private static final Logger log = new Logger(GroupByBenchmark.class);
@ -514,7 +514,7 @@ public class GroupByBenchmark
@State(Scope.Benchmark)
public static class IncrementalIndexState
{
@Param({"onheap", "offheap"})
@Param({"onheap"})
private String indexType;
IncrementalIndex incIndex;

View File

@ -34,6 +34,7 @@ import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.query.DefaultGenericQueryMetricsFactory;
import org.apache.druid.query.Druids;
import org.apache.druid.query.FinalizeResultsQueryRunner;
import org.apache.druid.query.Order;
import org.apache.druid.query.Query;
import org.apache.druid.query.QueryPlus;
import org.apache.druid.query.QueryRunner;
@ -116,7 +117,7 @@ public class ScanBenchmark
private int limit;
@Param({"NONE", "DESCENDING", "ASCENDING"})
private static ScanQuery.Order ordering;
private static Order ordering;
private static final Logger log = new Logger(ScanBenchmark.class);
private static final int RNG_SEED = 9999;

View File

@ -26,7 +26,7 @@ import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.java.util.common.FileUtils;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.concurrent.Execs;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.query.Druids;
@ -101,8 +101,8 @@ import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@Fork(value = 1)
@Warmup(iterations = 10)
@Measurement(iterations = 25)
@Warmup(iterations = 5)
@Measurement(iterations = 15)
public class TimeseriesBenchmark
{
@Param({"750000"})
@ -114,6 +114,9 @@ public class TimeseriesBenchmark
@Param({"true", "false"})
private boolean descending;
@Param({"all", "hour"})
private String queryGranularity;
private static final Logger log = new Logger(TimeseriesBenchmark.class);
private static final int RNG_SEED = 9999;
private static final IndexMergerV9 INDEX_MERGER_V9;
@ -162,7 +165,7 @@ public class TimeseriesBenchmark
TimeseriesQuery queryA =
Druids.newTimeseriesQueryBuilder()
.dataSource("blah")
.granularity(Granularities.ALL)
.granularity(Granularity.fromString(queryGranularity))
.intervals(intervalSpec)
.aggregators(queryAggs)
.descending(descending)
@ -182,7 +185,7 @@ public class TimeseriesBenchmark
TimeseriesQuery timeFilterQuery =
Druids.newTimeseriesQueryBuilder()
.dataSource("blah")
.granularity(Granularities.ALL)
.granularity(Granularity.fromString(queryGranularity))
.intervals(intervalSpec)
.aggregators(queryAggs)
.descending(descending)
@ -202,7 +205,7 @@ public class TimeseriesBenchmark
TimeseriesQuery timeFilterQuery =
Druids.newTimeseriesQueryBuilder()
.dataSource("blah")
.granularity(Granularities.ALL)
.granularity(Granularity.fromString(queryGranularity))
.intervals(intervalSpec)
.aggregators(queryAggs)
.descending(descending)
@ -219,7 +222,7 @@ public class TimeseriesBenchmark
TimeseriesQuery timeFilterQuery =
Druids.newTimeseriesQueryBuilder()
.dataSource("blah")
.granularity(Granularities.ALL)
.granularity(Granularity.fromString(queryGranularity))
.intervals(intervalSpec)
.aggregators(queryAggs)
.descending(descending)
@ -271,7 +274,7 @@ public class TimeseriesBenchmark
@State(Scope.Benchmark)
public static class IncrementalIndexState
{
@Param({"onheap", "offheap"})
@Param({"onheap"})
private String indexType;
IncrementalIndex incIndex;

View File

@ -26,7 +26,7 @@ import org.apache.druid.common.config.NullHandling;
import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.java.util.common.FileUtils;
import org.apache.druid.java.util.common.concurrent.Execs;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.offheap.OffheapBufferGenerator;
@ -98,8 +98,8 @@ import java.util.concurrent.TimeUnit;
@State(Scope.Benchmark)
@Fork(value = 1)
@Warmup(iterations = 10)
@Measurement(iterations = 25)
@Warmup(iterations = 5)
@Measurement(iterations = 15)
public class TopNBenchmark
{
@Param({"750000"})
@ -111,6 +111,9 @@ public class TopNBenchmark
@Param({"10"})
private int threshold;
@Param({"all", "hour"})
private String queryGranularity;
private static final Logger log = new Logger(TopNBenchmark.class);
private static final int RNG_SEED = 9999;
private static final IndexMergerV9 INDEX_MERGER_V9;
@ -159,7 +162,7 @@ public class TopNBenchmark
TopNQueryBuilder queryBuilderA = new TopNQueryBuilder()
.dataSource("blah")
.granularity(Granularities.ALL)
.granularity(Granularity.fromString(queryGranularity))
.dimension("dimSequential")
.metric("sumFloatNormal")
.intervals(intervalSpec)
@ -175,7 +178,7 @@ public class TopNBenchmark
TopNQueryBuilder queryBuilderA = new TopNQueryBuilder()
.dataSource("blah")
.granularity(Granularities.ALL)
.granularity(Granularity.fromString(queryGranularity))
.dimension("dimUniform")
.metric(new DimensionTopNMetricSpec(null, StringComparators.NUMERIC))
.intervals(intervalSpec)
@ -191,7 +194,7 @@ public class TopNBenchmark
TopNQueryBuilder queryBuilderA = new TopNQueryBuilder()
.dataSource("blah")
.granularity(Granularities.ALL)
.granularity(Granularity.fromString(queryGranularity))
.dimension("dimUniform")
.metric(new DimensionTopNMetricSpec(null, StringComparators.ALPHANUMERIC))
.intervals(intervalSpec)
@ -249,7 +252,7 @@ public class TopNBenchmark
@State(Scope.Benchmark)
public static class IncrementalIndexState
{
@Param({"onheap", "offheap"})
@Param({"onheap"})
private String indexType;
IncrementalIndex incIndex;

View File

@ -24,9 +24,11 @@ import com.google.common.collect.ImmutableMap;
import org.apache.druid.client.DataSourcesSnapshot;
import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.server.coordinator.compact.CompactionSegmentIterator;
import org.apache.druid.server.coordinator.compact.CompactionSegmentSearchPolicy;
import org.apache.druid.server.coordinator.compact.NewestSegmentFirstPolicy;
import org.apache.druid.server.compaction.CompactionCandidateSearchPolicy;
import org.apache.druid.server.compaction.CompactionSegmentIterator;
import org.apache.druid.server.compaction.CompactionStatusTracker;
import org.apache.druid.server.compaction.NewestSegmentFirstPolicy;
import org.apache.druid.server.compaction.PriorityBasedCompactionSegmentIterator;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.SegmentTimeline;
import org.apache.druid.timeline.partition.NumberedShardSpec;
@ -61,7 +63,7 @@ public class NewestSegmentFirstPolicyBenchmark
{
private static final String DATA_SOURCE_PREFIX = "dataSource_";
private final CompactionSegmentSearchPolicy policy = new NewestSegmentFirstPolicy(new DefaultObjectMapper());
private final CompactionCandidateSearchPolicy policy = new NewestSegmentFirstPolicy(null);
@Param("100")
private int numDataSources;
@ -132,7 +134,13 @@ public class NewestSegmentFirstPolicyBenchmark
@Benchmark
public void measureNewestSegmentFirstPolicy(Blackhole blackhole)
{
final CompactionSegmentIterator iterator = policy.createIterator(compactionConfigs, dataSources, Collections.emptyMap());
final CompactionSegmentIterator iterator = new PriorityBasedCompactionSegmentIterator(
policy,
compactionConfigs,
dataSources,
Collections.emptyMap(),
new CompactionStatusTracker(new DefaultObjectMapper())
);
for (int i = 0; i < numCompactionTaskSlots && iterator.hasNext(); i++) {
blackhole.consume(iterator.next());
}

View File

@ -46,6 +46,7 @@
<Or>
<Class name="org.apache.druid.jackson.DefaultTrueJsonIncludeFilter"/>
<Class name="org.apache.druid.java.util.common.StringEncodingDefaultUTF16LEJsonIncludeFilter"/>
<Class name="org.apache.druid.msq.kernel.LimitHintJsonIncludeFilter"/>
<Class name="org.apache.druid.query.scan.ScanQuery$ScanRowsLimitJsonIncludeFilter"/>
<Class name="org.apache.druid.query.scan.ScanQuery$ScanTimeOrderJsonIncludeFilter"/>
<Class name="org.apache.druid.query.scan.ScanQuery$BatchSizeJsonIncludeFilter"/>

View File

@ -45,7 +45,7 @@ You can use each endpoint with the ports for each type of service. The following
| Router|8888|
| Broker|8082|
| Historical|8083|
| MiddleManager|8091|
| Middle Manager|8091|
### Get service information
@ -791,11 +791,11 @@ Host: http://OVERLORD_IP:OVERLORD_PORT
</details>
## MiddleManager
## Middle Manager
### Get MiddleManager state status
### Get Middle Manager state status
Retrieves the enabled state of the MiddleManager. Returns JSON object keyed by the combined `druid.host` and `druid.port` with a boolean `true` or `false` state as the value.
Retrieves the enabled state of the Middle Manager process. Returns a JSON object keyed by the combined `druid.host` and `druid.port`, with a boolean `true` or `false` state as the value.
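For illustration, here is a minimal Python sketch of calling this endpoint; the endpoint path `/druid/worker/v1/enabled`, the placeholder host and port, and the sample response value are assumptions shown only to make the response shape concrete.

```python
import requests

# Assumed placeholders, following the convention used elsewhere in these docs.
url = "http://MIDDLEMANAGER_IP:MIDDLEMANAGER_PORT/druid/worker/v1/enabled"

response = requests.get(url)

# Expected shape: a JSON object keyed by "host:port" with a boolean state,
# for example {"localhost:8091": true}.
print(response.json())
```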
#### URL
@ -810,7 +810,7 @@ Retrieves the enabled state of the MiddleManager. Returns JSON object keyed by t
<br/>
*Successfully retrieved MiddleManager state*
*Successfully retrieved Middle Manager state*
</TabItem>
</Tabs>
@ -855,7 +855,7 @@ Host: http://MIDDLEMANAGER_IP:MIDDLEMANAGER_PORT
### Get active tasks
Retrieves a list of active tasks being run on MiddleManager. Returns JSON list of task ID strings. Note that for normal usage, you should use the `/druid/indexer/v1/tasks` [Tasks API](./tasks-api.md) endpoint or one of the task state specific variants instead.
Retrieves a list of active tasks being run on the Middle Manager. Returns a JSON list of task ID strings. Note that for normal usage, you should use the `/druid/indexer/v1/tasks` [Tasks API](./tasks-api.md) endpoint or one of the task-state-specific variants instead.
#### URL
@ -984,9 +984,9 @@ Host: http://MIDDLEMANAGER_IP:MIDDLEMANAGER_PORT
</details>
### Disable MiddleManager
### Disable Middle Manager
Disables a MiddleManager, causing it to stop accepting new tasks but complete all existing tasks. Returns a JSON object
Disables a Middle Manager, causing it to stop accepting new tasks but complete all existing tasks. Returns a JSON object
keyed by the combined `druid.host` and `druid.port`.
#### URL
@ -1002,7 +1002,7 @@ keyed by the combined `druid.host` and `druid.port`.
<br/>
*Successfully disabled MiddleManager*
*Successfully disabled Middle Manager*
</TabItem>
</Tabs>
@ -1043,9 +1043,9 @@ Host: http://MIDDLEMANAGER_IP:MIDDLEMANAGER_PORT
</details>
### Enable MiddleManager
### Enable Middle Manager
Enables a MiddleManager, allowing it to accept new tasks again if it was previously disabled. Returns a JSON object keyed by the combined `druid.host` and `druid.port`.
Enables a Middle Manager, allowing it to accept new tasks again if it was previously disabled. Returns a JSON object keyed by the combined `druid.host` and `druid.port`.
#### URL
@ -1060,7 +1060,7 @@ Enables a MiddleManager, allowing it to accept new tasks again if it was previou
<br/>
*Successfully enabled MiddleManager*
*Successfully enabled Middle Manager*
</TabItem>
</Tabs>

View File

@ -157,7 +157,7 @@ headers = {
'Content-Type': 'application/json'
}
response = requests.post(url, headers=headers, data=payload, auth=('USER', 'PASSWORD'))
response = requests.post(url, headers=headers, data=payload)
print(response.text)
@ -254,8 +254,9 @@ url = "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/task/query-3dc0c45d-34d7-4b
payload={}
headers = {}
response = requests.get(url, headers=headers, data=payload, auth=('USER', 'PASSWORD'))
response = requests.get(url, headers=headers, data=payload)
print(response.text)
print(response.text)
```
@ -363,7 +364,10 @@ import requests
url = "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/task/query-3dc0c45d-34d7-4b15-86c9-cdb2d3ebfc4e/reports"
headers = {}
response = requests.get(url, headers=headers, auth=('USER', 'PASSWORD'))
response = requests.get(url, headers=headers)
print(response.text)
print(response.text)
```
@ -826,11 +830,12 @@ import requests
url = "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/task/query-655efe33-781a-4c50-ae84-c2911b42d63c/shutdown"
payload={}
payload = {}
headers = {}
response = requests.post(url, headers=headers, data=payload, auth=('USER', 'PASSWORD'))
response = requests.post(url, headers=headers, data=payload)
print(response.text)
print(response.text)
```

File diff suppressed because one or more lines are too long

Image file changed (57 KiB before and after).

View File

@ -166,8 +166,8 @@ The indexing service also uses its own set of paths. These configs can be includ
|Property|Description|Default|
|--------|-----------|-------|
|`druid.zk.paths.indexer.base`|Base ZooKeeper path for |`${druid.zk.paths.base}/indexer`|
|`druid.zk.paths.indexer.announcementsPath`|MiddleManagers announce themselves here.|`${druid.zk.paths.indexer.base}/announcements`|
|`druid.zk.paths.indexer.tasksPath`|Used to assign tasks to MiddleManagers.|`${druid.zk.paths.indexer.base}/tasks`|
|`druid.zk.paths.indexer.announcementsPath`|Middle Managers announce themselves here.|`${druid.zk.paths.indexer.base}/announcements`|
|`druid.zk.paths.indexer.tasksPath`|Used to assign tasks to Middle Managers.|`${druid.zk.paths.indexer.base}/tasks`|
|`druid.zk.paths.indexer.statusPath`|Parent path for announcement of task statuses.|`${druid.zk.paths.indexer.base}/status`|
If `druid.zk.paths.base` and `druid.zk.paths.indexer.base` are both set, and none of the other `druid.zk.paths.*` or `druid.zk.paths.indexer.*` values are set, then the other properties will be evaluated relative to their respective `base`.
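As a quick illustration of that rule, the following Python sketch (with assumed example values) shows how the indexer paths from the table above resolve when only the base properties are set:

```python
# Assumed example values; only the base properties are set explicitly.
base = "/druid"                    # druid.zk.paths.base
indexer_base = base + "/indexer"   # druid.zk.paths.indexer.base (its default)

# The remaining indexer paths are evaluated relative to indexer_base.
derived = {
    "druid.zk.paths.indexer.announcementsPath": indexer_base + "/announcements",
    "druid.zk.paths.indexer.tasksPath": indexer_base + "/tasks",
    "druid.zk.paths.indexer.statusPath": indexer_base + "/status",
}

for name, value in derived.items():
    print(name, "=", value)
```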
@ -363,6 +363,8 @@ Coordinator and Overlord log changes to lookups, segment load/drop rules, and dy
|Property|Description|Default|
|--------|-----------|-------|
|`druid.audit.manager.type`|Type of audit manager used for handling audited events. Audited events are logged when set to `log` or persisted in metadata store when set to `sql`.|sql|
|`druid.audit.manager.logLevel`|Log level of audit events with possible values DEBUG, INFO, WARN. This property is used only when `druid.audit.manager.type` is set to `log`.|INFO|
|`druid.audit.manager.auditHistoryMillis`|Default duration for querying audit history.|1 week|
|`druid.audit.manager.includePayloadAsDimensionInMetric`|Boolean flag on whether to add `payload` column in service metric.|false|
|`druid.audit.manager.maxPayloadSizeBytes`|The maximum size of audit payload to store in Druid's metadata store audit table. If the size of audit payload exceeds this value, the audit log would be stored with a message indicating that the payload was omitted instead. Setting `maxPayloadSizeBytes` to -1 (default value) disables this check, meaning Druid will always store audit payload regardless of its size. Setting to any negative number other than `-1` is invalid. Human-readable format is supported, see [here](human-readable-byte.md).|-1|
@ -401,7 +403,7 @@ Metric monitoring is an essential part of Druid operations. The following monito
|`org.apache.druid.server.emitter.HttpEmittingMonitor`|Reports internal metrics of `http` or `parametrized` emitter (see below). Must not be used with another emitter type. See the description of the metrics here: <https://github.com/apache/druid/pull/4973>.|
|`org.apache.druid.server.metrics.TaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting and also the number of successful/failed tasks per emission period.|
|`org.apache.druid.server.metrics.TaskSlotCountStatsMonitor`|Reports metrics about task slot usage per emission period.|
|`org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by MiddleManager node types.|
|`org.apache.druid.server.metrics.WorkerTaskCountStatsMonitor`|Reports how many ingestion tasks are currently running/pending/waiting, the number of successful/failed tasks, and metrics about task slot usage for the reporting worker, per emission period. Only supported by Middle Manager node types.|
|`org.apache.druid.server.metrics.ServiceStatusMonitor`|Reports a heartbeat for the service.|
For example, you might configure monitors on all services for system and JVM information within `common.runtime.properties` as follows:
@ -592,9 +594,11 @@ need arises.
|Property|Description|Default|Required|
|-----|-----------|-------|--------|
|`druid.centralizedDatasourceSchema.enabled`|Boolean flag for enabling datasource schema building in the Coordinator. This should be specified in the common runtime properties.|false|No.|
|`druid.indexer.fork.property.druid.centralizedDatasourceSchema.enabled`| This config should be set when CentralizedDatasourceSchema feature is enabled. This should be specified in the MiddleManager runtime properties.|false|No.|
|`druid.indexer.fork.property.druid.centralizedDatasourceSchema.enabled`| This config should be set when CentralizedDatasourceSchema feature is enabled. This should be specified in the Middle Manager runtime properties.|false|No.|
For, stale schema cleanup configs, refer to properties with the prefix `druid.coordinator.kill.segmentSchema` in [Metadata Management](#metadata-management).
If you enable this feature, you can query datasources that are only stored in deep storage and are not loaded on a Historical. For more information, see [Query from deep storage](../querying/query-from-deep-storage.md).
For stale schema cleanup configs, refer to properties with the prefix `druid.coordinator.kill.segmentSchema` in [Metadata Management](#metadata-management).
### Ingestion security configuration
@ -887,8 +891,7 @@ These Coordinator static configurations can be defined in the `coordinator/runti
|`druid.coordinator.kill.ignoreDurationToRetain`|A way to override `druid.coordinator.kill.durationToRetain` and tell the coordinator that you do not care about the end date of unused segment intervals when it comes to killing them. If true, the coordinator considers all unused segments as eligible to be killed.|false|
|`druid.coordinator.kill.bufferPeriod`|The amount of time that a segment must be unused before it is able to be permanently removed from metadata and deep storage. This can serve as a buffer period to prevent data loss if data ends up being needed after being marked unused.|`P30D`|
|`druid.coordinator.kill.maxSegments`|The number of unused segments to kill per kill task. This number must be greater than 0. This only applies when `druid.coordinator.kill.on=true`.|100|
|`druid.coordinator.balancer.strategy`|Specify the type of balancing strategy for the Coordinator to use to distribute segments among the Historical services. `cachingCost` is logically equivalent to `cost` but is more CPU-efficient on large clusters. `diskNormalized` weights the costs according to the servers' disk usage ratios - there are known issues with this strategy distributing segments unevenly across the cluster. `random` distributes segments among services randomly.|`cost`|
|`druid.coordinator.balancer.cachingCost.awaitInitialization`|Whether to wait for segment view initialization before creating the `cachingCost` balancing strategy. This property is enabled only when `druid.coordinator.balancer.strategy` is `cachingCost`. If set to true, the Coordinator will not start to assign segments, until the segment view is initialized. If set to false, the Coordinator will fallback to use the `cost` balancing strategy only if the segment view is not initialized yet. It may take much time to wait for the initialization since the `cachingCost` balancing strategy involves much computing to build itself.|false|
|`druid.coordinator.balancer.strategy`|Specify the type of balancing strategy for the Coordinator to use to distribute segments among the Historical services. `diskNormalized` weights the costs according to the servers' disk usage ratios - there are known issues with this strategy distributing segments unevenly across the cluster. `random` distributes segments among services randomly.|`cost`|
|`druid.coordinator.loadqueuepeon.http.repeatDelay`|The start and repeat delay (in milliseconds) for the load queue peon, which manages the load/drop queue of segments for any server.|1 minute|
|`druid.coordinator.loadqueuepeon.http.batchSize`|Number of segment load/drop requests to batch in one HTTP request. Note that it must be smaller than `druid.segmentCache.numLoadingThreads` config on Historical service.|1|
|`druid.coordinator.asOverlord.enabled`|Boolean value for whether this Coordinator service should act like an Overlord as well. This configuration allows users to simplify a Druid cluster by not having to deploy any standalone Overlord services. If set to true, then Overlord console is available at `http://coordinator-host:port/console.html` and be sure to set `druid.coordinator.asOverlord.overlordService` also.|false|
@ -1129,17 +1132,17 @@ The following configs only apply if the Overlord is running in remote mode. For
|Property|Description|Default|
|--------|-----------|-------|
|`druid.indexer.runner.taskAssignmentTimeout`|How long to wait after a task has been assigned to a MiddleManager before throwing an error.|`PT5M`|
|`druid.indexer.runner.minWorkerVersion`|The minimum MiddleManager version to send tasks to. The version number is a string. This affects the expected behavior during certain operations like comparison against `druid.worker.version`. Specifically, the version comparison follows dictionary order. Use ISO8601 date format for the version to accommodate date comparisons. |"0"|
|`druid.indexer.runner.taskAssignmentTimeout`|How long to wait after a task has been assigned to a Middle Manager before throwing an error.|`PT5M`|
|`druid.indexer.runner.minWorkerVersion`|The minimum Middle Manager version to send tasks to. The version number is a string. This affects the expected behavior during certain operations like comparison against `druid.worker.version`. Specifically, the version comparison follows dictionary order. Use ISO8601 date format for the version to accommodate date comparisons. |"0"|
| `druid.indexer.runner.parallelIndexTaskSlotRatio`| The ratio of task slots available for parallel indexing supervisor tasks per worker. The specified value must be in the range `[0, 1]`. |1|
|`druid.indexer.runner.compressZnodes`|Indicates whether or not the Overlord should expect MiddleManagers to compress Znodes.|true|
|`druid.indexer.runner.compressZnodes`|Indicates whether or not the Overlord should expect Middle Managers to compress Znodes.|true|
|`druid.indexer.runner.maxZnodeBytes`|The maximum size Znode in bytes that can be created in ZooKeeper, should be in the range of `[10KiB, 2GiB)`. [Human-readable format](human-readable-byte.md) is supported.| 512 KiB |
|`druid.indexer.runner.taskCleanupTimeout`|How long to wait before failing a task after a MiddleManager is disconnected from ZooKeeper.|`PT15M`|
|`druid.indexer.runner.taskShutdownLinkTimeout`|How long to wait on a shutdown request to a MiddleManager before timing out|`PT1M`|
|`druid.indexer.runner.taskCleanupTimeout`|How long to wait before failing a task after a Middle Manager is disconnected from ZooKeeper.|`PT15M`|
|`druid.indexer.runner.taskShutdownLinkTimeout`|How long to wait on a shutdown request to a Middle Manager before timing out|`PT1M`|
|`druid.indexer.runner.pendingTasksRunnerNumThreads`|Number of threads to allocate pending-tasks to workers, must be at least 1.|1|
|`druid.indexer.runner.maxRetriesBeforeBlacklist`|Number of consecutive times the MiddleManager can fail tasks, before the worker is blacklisted, must be at least 1|5|
|`druid.indexer.runner.maxRetriesBeforeBlacklist`|Number of consecutive times the Middle Manager can fail tasks, before the worker is blacklisted, must be at least 1|5|
|`druid.indexer.runner.workerBlackListBackoffTime`|How long to wait before a task is whitelisted again. This value should be greater than the value set for taskBlackListCleanupPeriod.|`PT15M`|
|`druid.indexer.runner.workerBlackListCleanupPeriod`|A duration after which the cleanup thread will startup to clean blacklisted workers.|`PT5M`|
|`druid.indexer.runner.workerBlackListCleanupPeriod`|A duration after which the cleanup thread will start up to clean blacklisted workers.|`PT5M`|
|`druid.indexer.runner.maxPercentageBlacklistWorkers`|The maximum percentage of workers to blacklist, this must be between 0 and 100.|20|
If autoscaling is enabled, you can set these additional configs:
@ -1148,16 +1151,16 @@ If autoscaling is enabled, you can set these additional configs:
|--------|-----------|-------|
|`druid.indexer.autoscale.strategy`|Sets the strategy to run when autoscaling is required. One of `noop`, `ec2` or `gce`.|`noop`|
|`druid.indexer.autoscale.doAutoscale`|If set to true, autoscaling will be enabled.|false|
|`druid.indexer.autoscale.provisionPeriod`|How often to check whether or not new MiddleManagers should be added.|`PT1M`|
|`druid.indexer.autoscale.terminatePeriod`|How often to check when MiddleManagers should be removed.|`PT5M`|
|`druid.indexer.autoscale.provisionPeriod`|How often to check whether or not new Middle Managers should be added.|`PT1M`|
|`druid.indexer.autoscale.terminatePeriod`|How often to check when Middle Managers should be removed.|`PT5M`|
|`druid.indexer.autoscale.originTime`|The starting reference timestamp that the terminate period increments upon.|`2012-01-01T00:55:00.000Z`|
|`druid.indexer.autoscale.workerIdleTimeout`|How long a worker can be idle (not running a task) before it can be considered for termination.|`PT90M`|
|`druid.indexer.autoscale.maxScalingDuration`|How long the Overlord will wait around for a MiddleManager to show up before giving up.|`PT15M`|
|`druid.indexer.autoscale.maxScalingDuration`|How long the Overlord will wait around for a Middle Manager to show up before giving up.|`PT15M`|
|`druid.indexer.autoscale.numEventsToTrack`|The number of autoscaling related events (node creation and termination) to track.|10|
|`druid.indexer.autoscale.pendingTaskTimeout`|How long a task can be in "pending" state before the Overlord tries to scale up.|`PT30S`|
|`druid.indexer.autoscale.workerVersion`|If set, will only create nodes of set version during autoscaling. Overrides dynamic configuration. |null|
|`druid.indexer.autoscale.workerPort`|The port that MiddleManagers will run on.|8080|
|`druid.indexer.autoscale.workerCapacityHint`| An estimation of the number of task slots available for each worker launched by the auto scaler when there are no workers running. The auto scaler uses the worker capacity hint to launch workers with an adequate capacity to handle pending tasks. When unset or set to a value less than or equal to 0, the auto scaler scales workers equal to the value for `minNumWorkers` in autoScaler config instead. The auto scaler assumes that each worker, either a MiddleManager or indexer, has the same amount of task slots. Therefore, when all your workers have the same capacity (homogeneous capacity), set the value for `autoscale.workerCapacityHint` equal to `druid.worker.capacity`. If your workers have different capacities (heterogeneous capacity), set the value to the average of `druid.worker.capacity` across the workers. For example, if two workers have `druid.worker.capacity=10`, and one has `druid.worker.capacity=4`, set `autoscale.workerCapacityHint=8`. Only applies to `pendingTaskBased` provisioning strategy.|-1|
|`druid.indexer.autoscale.workerPort`|The port that Middle Managers will run on.|8080|
|`druid.indexer.autoscale.workerCapacityHint`| An estimation of the number of task slots available for each worker launched by the auto scaler when there are no workers running. The auto scaler uses the worker capacity hint to launch workers with an adequate capacity to handle pending tasks. When unset or set to a value less than or equal to 0, the auto scaler scales workers equal to the value for `minNumWorkers` in autoScaler config instead. The auto scaler assumes that each worker, either a Middle Manager or indexer, has the same amount of task slots. Therefore, when all your workers have the same capacity (homogeneous capacity), set the value for `autoscale.workerCapacityHint` equal to `druid.worker.capacity`. If your workers have different capacities (heterogeneous capacity), set the value to the average of `druid.worker.capacity` across the workers. For example, if two workers have `druid.worker.capacity=10`, and one has `druid.worker.capacity=4`, set `autoscale.workerCapacityHint=8`. Only applies to `pendingTaskBased` provisioning strategy.|-1|
##### Supervisors
@ -1183,7 +1186,7 @@ The following table shows the dynamic configuration properties for the Overlord.
|Property|Description|Default|
|--------|-----------|-------|
|`selectStrategy`| Describes how to assign tasks to MiddleManagers. The type can be `equalDistribution`, `equalDistributionWithCategorySpec`, `fillCapacity`, `fillCapacityWithCategorySpec`, and `javascript`. | `{"type":"equalDistribution"}` |
|`selectStrategy`| Describes how to assign tasks to Middle Managers. The type can be `equalDistribution`, `equalDistributionWithCategorySpec`, `fillCapacity`, `fillCapacityWithCategorySpec`, and `javascript`. | `{"type":"equalDistribution"}` |
|`autoScaler`| Only used if [autoscaling](#autoscaler) is enabled.| null |
The following is an example of an Overlord dynamic config:
@ -1230,7 +1233,7 @@ The following is an example of an Overlord dynamic config:
##### Worker select strategy
The select strategy controls how Druid assigns tasks to workers (MiddleManagers).
The select strategy controls how Druid assigns tasks to workers (Middle Managers).
At a high level, the select strategy determines the list of eligible workers for a given task using
either an `affinityConfig` or a `categorySpec`. Then, Druid assigns the task by either trying to distribute load equally
(`equalDistribution`) or to fill as many workers as possible to capacity (`fillCapacity`).
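As a hedged sketch of how a strategy is applied in practice, the snippet below submits an `equalDistribution` strategy with an `affinityConfig` as the Overlord dynamic worker configuration; the Overlord host and port, the datasource name, and the worker `host:port` value are placeholders, and the endpoint path `/druid/indexer/v1/worker` is an assumption for illustration.

```python
import json
import requests

# Assumed placeholders for the Overlord host/port; the path is assumed to be
# the dynamic worker configuration endpoint.
url = "http://OVERLORD_IP:OVERLORD_PORT/druid/indexer/v1/worker"

worker_config = {
    "selectStrategy": {
        "type": "equalDistribution",
        "affinityConfig": {
            # Hypothetical datasource and Middle Manager host:port for illustration.
            "affinity": {"exampleDatasource": ["middlemanager1.example.com:8091"]},
            "strong": False
        }
    }
}

response = requests.post(
    url,
    data=json.dumps(worker_config),
    headers={"Content-Type": "application/json"}
)
print(response.status_code)
```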
@ -1267,8 +1270,8 @@ not be assigned a category, and you want the work to be concentrated on the fewe
###### `equalDistribution`
Tasks are assigned to the MiddleManager with the most free slots at the time the task begins running.
This evenly distributes work across your MiddleManagers.
Tasks are assigned to the Middle Manager with the most free slots at the time the task begins running.
This evenly distributes work across your Middle Managers.
|Property|Description|Default|
|--------|-----------|-------|
@ -1278,7 +1281,7 @@ This evenly distributes work across your MiddleManagers.
###### `equalDistributionWithCategorySpec`
This strategy is a variant of `equalDistribution`, which supports `workerCategorySpec` field rather than `affinityConfig`.
By specifying `workerCategorySpec`, you can assign tasks to run on different categories of MiddleManagers based on the **type** and **dataSource** of the task.
By specifying `workerCategorySpec`, you can assign tasks to run on different categories of Middle Managers based on the **type** and **dataSource** of the task.
This strategy doesn't work with `AutoScaler` since the behavior is undefined.
|Property|Description|Default|
@ -1286,7 +1289,7 @@ This strategy doesn't work with `AutoScaler` since the behavior is undefined.
|`type`|`equalDistributionWithCategorySpec`|required; must be `equalDistributionWithCategorySpec`|
|`workerCategorySpec`|[`WorkerCategorySpec`](#workercategoryspec) object|null (no worker category spec)|
The following example shows tasks of type `index_kafka` that default to running on MiddleManagers of category `c1`, except for tasks that write to datasource `ds1`, which run on MiddleManagers of category `c2`.
The following example shows tasks of type `index_kafka` that default to running on Middle Managers of category `c1`, except for tasks that write to datasource `ds1`, which run on Middle Managers of category `c2`.
```json
{
@ -1310,11 +1313,11 @@ The following example shows tasks of type `index_kafka` that default to running
###### `fillCapacity`
Tasks are assigned to the worker with the most currently-running tasks. This is
useful when you are auto-scaling MiddleManagers since it tends to pack some full and
useful when you are auto-scaling Middle Managers since it tends to pack some full and
leave others empty. The empty ones can be safely terminated.
Note that if `druid.indexer.runner.pendingTasksRunnerNumThreads` is set to _N_ > 1, then this strategy will fill _N_
MiddleManagers up to capacity simultaneously, rather than a single MiddleManager.
Middle Managers up to capacity simultaneously, rather than a single Middle Manager.
|Property|Description|Default|
|--------|-----------|-------|
@ -1367,8 +1370,8 @@ If not provided, the default is to have no affinity.
|Property|Description|Default|
|--------|-----------|-------|
|`affinity`|JSON object mapping a datasource String name to a list of indexing service MiddleManager `host:port` values. Druid doesn't perform DNS resolution, so the 'host' value must match what is configured on the MiddleManager and what the MiddleManager announces itself as (examine the Overlord logs to see what your MiddleManager announces itself as).|`{}`|
|`strong`|When `true` tasks for a datasource must be assigned to affinity-mapped MiddleManagers. Tasks remain queued until a slot becomes available. When `false`, Druid may assign tasks for a datasource to other MiddleManagers when affinity-mapped MiddleManagers are unavailable to run queued tasks.|false|
|`affinity`|JSON object mapping a datasource String name to a list of indexing service Middle Manager `host:port` values. Druid doesn't perform DNS resolution, so the 'host' value must match what is configured on the Middle Manager and what the Middle Manager announces itself as (examine the Overlord logs to see what your Middle Manager announces itself as).|`{}`|
|`strong`|When `true`, tasks for a datasource must be assigned to affinity-mapped Middle Managers. Tasks remain queued until a slot becomes available. When `false`, Druid may assign tasks for a datasource to other Middle Managers when affinity-mapped Middle Managers are unavailable to run queued tasks.|false|
###### workerCategorySpec
@ -1378,14 +1381,14 @@ field. If not provided, the default is to not use it at all.
|Property|Description|Default|
|--------|-----------|-------|
|`categoryMap`|A JSON map object mapping a task type String name to a [CategoryConfig](#categoryconfig) object, by which you can specify category config for different task type.|`{}`|
|`strong`|With weak workerCategorySpec (the default), tasks for a dataSource may be assigned to other MiddleManagers if the MiddleManagers specified in `categoryMap` are not able to run all pending tasks in the queue for that dataSource. With strong workerCategorySpec, tasks for a dataSource will only ever be assigned to their specified MiddleManagers, and will wait in the pending queue if necessary.|false|
|`strong`|With weak workerCategorySpec (the default), tasks for a dataSource may be assigned to other Middle Managers if the Middle Managers specified in `categoryMap` are not able to run all pending tasks in the queue for that dataSource. With strong workerCategorySpec, tasks for a dataSource will only ever be assigned to their specified Middle Managers, and will wait in the pending queue if necessary.|false|
###### CategoryConfig
|Property|Description|Default|
|--------|-----------|-------|
|`defaultCategory`|Specify default category for a task type.|null|
|`categoryAffinity`|A JSON map object mapping a datasource String name to a category String name of the MiddleManager. If category isn't specified for a datasource, then using the `defaultCategory`. If no specified category and the `defaultCategory` is also null, then tasks can run on any available MiddleManagers.|null|
|`categoryAffinity`|A JSON map object mapping a datasource String name to a category String name of the Middle Manager. If no category is specified for a datasource, the `defaultCategory` is used. If no category is specified and the `defaultCategory` is also null, tasks can run on any available Middle Managers.|null|
##### Autoscaler
@ -1406,15 +1409,15 @@ For GCE's properties, please refer to the [gce-extensions](../development/extens
## Data server
This section contains the configuration options for the services that reside on Data servers (MiddleManagers/Peons and Historicals) in the suggested [three-server configuration](../design/architecture.md#druid-servers).
This section contains the configuration options for the services that reside on Data servers (Middle Managers/Peons and Historicals) in the suggested [three-server configuration](../design/architecture.md#druid-servers).
Configuration options for the [Indexer process](../design/indexer.md) are also provided here.
### MiddleManager and Peons
### Middle Manager and Peon
These MiddleManager and Peon configurations can be defined in the `middleManager/runtime.properties` file.
These Middle Manager and Peon configurations can be defined in the `middleManager/runtime.properties` file.
#### MiddleManager service config
#### Middle Manager service config
|Property|Description|Default|
|--------|-----------|-------|
@ -1424,14 +1427,14 @@ These MiddleManager and Peon configurations can be defined in the `middleManager
|`druid.tlsPort`|TLS port for HTTPS connector, if [druid.enableTlsPort](../operations/tls-support.md) is set then this config will be used. If `druid.host` contains port then that port will be ignored. This should be a non-negative Integer.|8291|
|`druid.service`|The name of the service. This is used as a dimension when emitting metrics and alerts to differentiate between the various services|`druid/middlemanager`|
#### MiddleManager configuration
#### Middle Manager configuration
MiddleManagers pass their configurations down to their child peons. The MiddleManager requires the following configs:
Middle Managers pass their configurations down to their child peons. The Middle Manager requires the following configs:
|Property|Description|Default|
|--------|-----------|-------|
|`druid.indexer.runner.allowedPrefixes`|Whitelist of prefixes for configs that can be passed down to child peons.|`com.metamx`, `druid`, `org.apache.druid`, `user.timezone`, `file.encoding`, `java.io.tmpdir`, `hadoop`|
|`druid.indexer.runner.compressZnodes`|Indicates whether or not the MiddleManagers should compress Znodes.|true|
|`druid.indexer.runner.compressZnodes`|Indicates whether or not the Middle Managers should compress Znodes.|true|
|`druid.indexer.runner.classpath`|Java classpath for the peon.|`System.getProperty("java.class.path")`|
|`druid.indexer.runner.javaCommand`|Command required to execute java.|java|
|`druid.indexer.runner.javaOpts`|_DEPRECATED_ A string of -X Java options to pass to the peon's JVM. Quotable parameters or parameters with spaces are encouraged to use javaOptsArray|`''`|
@ -1441,16 +1444,16 @@ MiddleManagers pass their configurations down to their child peons. The MiddleMa
|`druid.indexer.runner.endPort`|Ending port used for Peon services, should be greater than or equal to `druid.indexer.runner.startPort` and less than 65536.|65535|
|`druid.indexer.runner.ports`|A JSON array of integers to specify ports that are used for Peon services. If provided and non-empty, ports for Peon services will be chosen from these ports, and `druid.indexer.runner.startPort/druid.indexer.runner.endPort` will be completely ignored.|`[]`|
|`druid.worker.ip`|The IP of the worker.|`localhost`|
|`druid.worker.version`|Version identifier for the MiddleManager. The version number is a string. This affects the expected behavior during certain operations like comparison against `druid.indexer.runner.minWorkerVersion`. Specifically, the version comparison follows dictionary order. Use ISO8601 date format for the version to accommodate date comparisons.|0|
|`druid.worker.capacity`|Maximum number of tasks the MiddleManager can accept.|Number of CPUs on the machine - 1|
|`druid.worker.version`|Version identifier for the Middle Manager. The version number is a string. This affects the expected behavior during certain operations like comparison against `druid.indexer.runner.minWorkerVersion`. Specifically, the version comparison follows dictionary order. Use ISO8601 date format for the version to accommodate date comparisons.|0|
|`druid.worker.capacity`|Maximum number of tasks the Middle Manager can accept.|Number of CPUs on the machine - 1|
|`druid.worker.baseTaskDirs`|List of base temporary working directories, one of which is assigned per task in a round-robin fashion. This property can be used to allow usage of multiple disks for indexing. This property is recommended in place of and takes precedence over `${druid.indexer.task.baseTaskDir}`. If this configuration is not set, `${druid.indexer.task.baseTaskDir}` is used. For example, `druid.worker.baseTaskDirs=[\"PATH1\",\"PATH2\",...]`.|null|
|`druid.worker.baseTaskDirSize`|The total amount of bytes that can be used by tasks on any single task dir. This value is treated symmetrically across all directories, that is, if this is 500 GB and there are 3 `baseTaskDirs`, then each of those task directories is assumed to allow for 500 GB to be used and a total of 1.5 TB will potentially be available across all tasks. The actual amount of memory assigned to each task is discussed in [Configuring task storage sizes](../ingestion/tasks.md#configuring-task-storage-sizes)|`Long.MAX_VALUE`|
|`druid.worker.category`|A string to name the category that the MiddleManager node belongs to.|`_default_worker_category`|
|`druid.worker.category`|A string to name the category that the Middle Manager node belongs to.|`_default_worker_category`|
|`druid.indexer.fork.property.druid.centralizedDatasourceSchema.enabled`| This config should be set when [Centralized Datasource Schema](#centralized-datasource-schema) feature is enabled. |false|
#### Peon processing
Processing properties set on the MiddleManager are passed through to Peons.
Processing properties set on the Middle Manager are passed through to Peons.
|Property|Description|Default|
|--------|-----------|-------|
@ -1461,7 +1464,7 @@ Processing properties set on the MiddleManager are passed through to Peons.
|`druid.processing.numThreads`|The number of processing threads to have available for parallel processing of segments. Our rule of thumb is `num_cores - 1`, which means that even under heavy load there will still be one core available to do background tasks like talking with ZooKeeper and pulling down segments. If only one core is available, this property defaults to the value `1`.|Number of cores - 1 (or 1)|
|`druid.processing.fifo`|Enables the processing queue to treat tasks of equal priority in a FIFO manner.|`true`|
|`druid.processing.tmpDir`|Path where temporary files created while processing a query should be stored. If specified, this configuration takes priority over the default `java.io.tmpdir` path.|path represented by `java.io.tmpdir`|
|`druid.processing.intermediaryData.storage.type`|Storage type for intermediary segments of data shuffle between native parallel index tasks. <br />Set to `local` to store segment files in the local storage of the MiddleManager or Indexer. <br />Set to `deepstore` to use configured deep storage for better fault tolerance during rolling updates. When the storage type is `deepstore`, Druid stores the data in the `shuffle-data` directory under the configured deep storage path. Druid does not support automated cleanup for the `shuffle-data` directory. You can set up cloud storage lifecycle rules for automated cleanup of data at the `shuffle-data` prefix location.|`local`|
|`druid.processing.intermediaryData.storage.type`|Storage type for intermediary segments of data shuffle between native parallel index tasks. <br />Set to `local` to store segment files in the local storage of the Middle Manager or Indexer. <br />Set to `deepstore` to use configured deep storage for better fault tolerance during rolling updates. When the storage type is `deepstore`, Druid stores the data in the `shuffle-data` directory under the configured deep storage path. Druid does not support automated cleanup for the `shuffle-data` directory. You can set up cloud storage lifecycle rules for automated cleanup of data at the `shuffle-data` prefix location.|`local`|
The amount of direct memory needed by Druid is at least
`druid.processing.buffer.sizeBytes * (druid.processing.numMergeBuffers + druid.processing.numThreads + 1)`. You can
@ -1487,7 +1490,7 @@ See [cache configuration](#cache-configuration) for how to configure cache setti
#### Additional Peon configuration
Although Peons inherit the configurations of their parent MiddleManagers, explicit child Peon configs in MiddleManager can be set by prefixing them with:
Although Peons inherit the configurations of their parent Middle Managers, explicit child Peon configs in Middle Manager can be set by prefixing them with:
```properties
druid.indexer.fork.property
@ -1503,9 +1506,9 @@ Additional Peon configs include:
|`druid.indexer.task.defaultHadoopCoordinates`|Hadoop version to use with HadoopIndexTasks that do not request a particular version.|`org.apache.hadoop:hadoop-client-api:3.3.6`, `org.apache.hadoop:hadoop-client-runtime:3.3.6`|
|`druid.indexer.task.defaultRowFlushBoundary`|Highest row count before persisting to disk. Used for indexing generating tasks.|75000|
|`druid.indexer.task.directoryLockTimeout`|Wait this long for zombie Peons to exit before giving up on their replacements.|PT10M|
|`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on MiddleManager restart for restorable tasks to gracefully exit.|PT5M|
|`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on Middle Manager restart for restorable tasks to gracefully exit.|PT5M|
|`druid.indexer.task.hadoopWorkingPath`|Temporary working directory for Hadoop tasks.|`/tmp/druid-indexing`|
|`druid.indexer.task.restoreTasksOnRestart`|If true, MiddleManagers will attempt to stop tasks gracefully on shutdown and restore them on restart.|false|
|`druid.indexer.task.restoreTasksOnRestart`|If true, Middle Managers will attempt to stop tasks gracefully on shutdown and restore them on restart.|false|
|`druid.indexer.task.ignoreTimestampSpecForDruidInputSource`|If true, tasks using the [Druid input source](../ingestion/input-sources.md) will ignore the provided timestampSpec, and will use the `__time` column of the input datasource. This option is provided for compatibility with ingestion specs written before Druid 0.22.0.|false|
|`druid.indexer.task.storeEmptyColumns`|Boolean value for whether or not to store empty columns during ingestion. When set to true, Druid stores every column specified in the [`dimensionsSpec`](../ingestion/ingestion-spec.md#dimensionsspec). If you use the string-based schemaless ingestion and don't specify any dimensions to ingest, you must also set [`includeAllDimensions`](../ingestion/ingestion-spec.md#dimensionsspec) for Druid to store empty columns.<br/><br/>If you set `storeEmptyColumns` to false, Druid SQL queries referencing empty columns will fail. If you intend to leave `storeEmptyColumns` disabled, you should either ingest placeholder data for empty columns or else not query on empty columns.<br/><br/>You can overwrite this configuration by setting `storeEmptyColumns` in the [task context](../ingestion/tasks.md#context-parameters).|true|
|`druid.indexer.task.tmpStorageBytesPerTask`|Maximum number of bytes per task to be used to store temporary files on disk. This config is generally intended for internal usage. Attempts to set it are very likely to be overwritten by the TaskRunner that executes the task, so be sure of what you expect to happen before directly adjusting this configuration parameter. The config is documented here primarily to provide an understanding of what it means if/when someone sees that it has been set. A value of -1 disables this limit. |-1|
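As a hedged illustration, a few of the settings above might be combined in a Middle Manager `runtime.properties` like the following (the values are assumptions, not recommendations):

```properties
# Attempt to restore restorable tasks after a graceful restart
druid.indexer.task.restoreTasksOnRestart=true
# Allow restorable tasks up to 10 minutes to exit during a restart
druid.indexer.task.gracefulShutdownTimeout=PT10M
# Store every column listed in dimensionsSpec, even when empty
druid.indexer.task.storeEmptyColumns=true
```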
@ -1528,7 +1531,7 @@ The following types of medium exist for the buffers:
This type of medium may do unnecessary disk I/O and requires some disk space to be available.
* **Off-heap memory** (`offHeapMemory`) creates buffers in off-heap memory of a JVM process that is running a task.
This type of medium is preferred, but it may require to allow the JVM to have more off-heap memory, by changing `-XX:MaxDirectMemorySize` configuration. It is not yet understood how does the required off-heap memory size relates to the size of the segments being created. But definitely it doesn't make sense to add more extra off-heap memory, than the configured maximum _heap_ size (`-Xmx`) for the same JVM.
This type of medium is preferred, but it may require you to allow the JVM to have more off-heap memory by changing the `-XX:MaxDirectMemorySize` configuration. It's not understood yet how the required off-heap memory size relates to the size of the segments being created. But you shouldn't add more extra off-heap memory than the configured maximum _heap_ size (`-Xmx`) for the same JVM.
* **On-heap memory** (`onHeapMemory`) creates buffers using the allocated heap memory of the JVM process running a task. Using on-heap memory introduces garbage collection overhead and so is not recommended in most cases. This type of medium is most helpful for tasks run on external clusters where it may be difficult to allocate and work with direct memory effectively.
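As a sketch only, choosing a medium and sizing direct memory for task JVMs might look like the following; the property names `druid.peon.defaultSegmentWriteOutMediumFactory.type` and `druid.indexer.runner.javaOptsArray` are assumptions drawn from common Druid configuration, and the memory value is arbitrary:

```properties
# Prefer off-heap buffers for segment creation (assumed property name)
druid.peon.defaultSegmentWriteOutMediumFactory.type=offHeapMemory
# Give task JVMs enough direct memory to back those buffers (assumed value)
druid.indexer.runner.javaOptsArray=["-XX:MaxDirectMemorySize=2g"]
```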
@ -2004,9 +2007,9 @@ See [cache configuration](#cache-configuration) for how to configure cache setti
## Cache configuration
This section describes caching configuration that is common to Broker, Historical, and MiddleManager/Peon processes.
This section describes caching configuration that is common to Broker, Historical, and Middle Manager/Peon processes.
Caching could optionally be enabled on the Broker, Historical, and MiddleManager/Peon processes. See
Caching could optionally be enabled on the Broker, Historical, and Middle Manager/Peon processes. See
[Broker](#broker-caching), [Historical](#historical-caching), and [Peon](#peon-caching) configuration options for how to
enable it for different processes.
@ -2109,7 +2112,7 @@ If there is an L1 miss and L2 hit, it will also populate L1.
## General query configuration
This section describes configurations that control behavior of Druid's query types, applicable to Broker, Historical, and MiddleManager processes.
This section describes configurations that control behavior of Druid's query types, applicable to Broker, Historical, and Middle Manager processes.
### Overriding default query context values
@ -2160,7 +2163,7 @@ context). If query does have `maxQueuedBytes` in the context, then that value is
### GroupBy query config
This section describes the configurations for groupBy queries. You can set the runtime properties in the `runtime.properties` file on Broker, Historical, and MiddleManager processes. You can set the query context parameters through the [query context](../querying/query-context.md).
This section describes the configurations for groupBy queries. You can set the runtime properties in the `runtime.properties` file on Broker, Historical, and Middle Manager processes. You can set the query context parameters through the [query context](../querying/query-context.md).
Supported runtime properties:
View File
@ -40,8 +40,8 @@ Druid has several types of services:
* [Broker](../design/broker.md) handles queries from external clients.
* [Router](../design/router.md) routes requests to Brokers, Coordinators, and Overlords.
* [Historical](../design/historical.md) stores queryable data.
* [MiddleManager](../design/middlemanager.md) and [Peon](../design/peons.md) ingest data.
* [Indexer](../design/indexer.md) serves an alternative to the MiddleManager + Peon task execution system.
* [Middle Manager](../design/middlemanager.md) and [Peon](../design/peons.md) ingest data.
* [Indexer](../design/indexer.md) serves as an alternative to the Middle Manager + Peon task execution system.
You can view services in the **Services** tab in the web console:
@ -63,7 +63,7 @@ Master servers divide operations between Coordinator and Overlord services.
#### Overlord service
[Overlord](../design/overlord.md) services watch over the MiddleManager services on the Data servers and are the controllers of data ingestion into Druid. They are responsible for assigning ingestion tasks to MiddleManagers and for coordinating segment publishing.
[Overlord](../design/overlord.md) services watch over the Middle Manager services on the Data servers and are the controllers of data ingestion into Druid. They are responsible for assigning ingestion tasks to Middle Managers and for coordinating segment publishing.
### Query server
@ -73,7 +73,7 @@ Query servers divide operations between Broker and Router services.
#### Broker service
[Broker](../design/broker.md) services receive queries from external clients and forward those queries to Data servers. When Brokers receive results from those subqueries, they merge those results and return them to the caller. Typically, you query Brokers rather than querying Historical or MiddleManager services on Data servers directly.
[Broker](../design/broker.md) services receive queries from external clients and forward those queries to Data servers. When Brokers receive results from those subqueries, they merge those results and return them to the caller. Typically, you query Brokers rather than querying Historical or Middle Manager services on Data servers directly.
#### Router service
@ -85,30 +85,30 @@ The Router service also runs the [web console](../operations/web-console.md), a
A Data server executes ingestion jobs and stores queryable data.
Data servers divide operations between Historical and MiddleManager services.
Data servers divide operations between Historical and Middle Manager services.
#### Historical service
[**Historical**](../design/historical.md) services handle storage and querying on historical data, including any streaming data that has been in the system long enough to be committed. Historical services download segments from deep storage and respond to queries about these segments. They don't accept writes.
#### MiddleManager service
#### Middle Manager service
[**MiddleManager**](../design/middlemanager.md) services handle ingestion of new data into the cluster. They are responsible
[**Middle Manager**](../design/middlemanager.md) services handle ingestion of new data into the cluster. They are responsible
for reading from external data sources and publishing new Druid segments.
##### Peon service
[**Peon**](../design/peons.md) services are task execution engines spawned by MiddleManagers. Each Peon runs a separate JVM and is responsible for executing a single task. Peons always run on the same host as the MiddleManager that spawned them.
[**Peon**](../design/peons.md) services are task execution engines spawned by Middle Managers. Each Peon runs a separate JVM and is responsible for executing a single task. Peons always run on the same host as the Middle Manager that spawned them.
#### Indexer service (optional)
[**Indexer**](../design/indexer.md) services are an alternative to MiddleManagers and Peons. Instead of
[**Indexer**](../design/indexer.md) services are an alternative to Middle Managers and Peons. Instead of
forking separate JVM processes per-task, the Indexer runs tasks as individual threads within a single JVM process.
The Indexer is designed to be easier to configure and deploy compared to the MiddleManager + Peon system and to better enable resource sharing across tasks. The Indexer is a newer feature and is currently designated [experimental](../development/experimental.md) due to the fact that its memory management system is still under
The Indexer is designed to be easier to configure and deploy compared to the Middle Manager + Peon system and to better enable resource sharing across tasks. The Indexer is a newer feature and is currently designated [experimental](../development/experimental.md) due to the fact that its memory management system is still under
development. It will continue to mature in future versions of Druid.
Typically, you would deploy either MiddleManagers or Indexers, but not both.
Typically, you would deploy either Middle Managers or Indexers, but not both.
## Colocation of services
@ -126,11 +126,11 @@ In clusters with very high segment counts, it can make sense to separate the Coo
You can run the Coordinator and Overlord services as a single combined service by setting the `druid.coordinator.asOverlord.enabled` property.
For more information, see [Coordinator Operation](../configuration/index.md#coordinator-operation).
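A minimal sketch of enabling the combined mode in the Coordinator `runtime.properties` (the service name shown is an assumed default):

```properties
# Run the Overlord inside the Coordinator process
druid.coordinator.asOverlord.enabled=true
# Service name under which the embedded Overlord announces itself (assumed value)
druid.coordinator.asOverlord.overlordService=druid/overlord
```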
### Historicals and MiddleManagers
### Historicals and Middle Managers
With higher levels of ingestion or query load, it can make sense to deploy the Historical and MiddleManager services on separate hosts to to avoid CPU and memory contention.
With higher levels of ingestion or query load, it can make sense to deploy the Historical and Middle Manager services on separate hosts to avoid CPU and memory contention.
The Historical service also benefits from having free memory for memory mapped segments, which can be another reason to deploy the Historical and MiddleManager services separately.
The Historical service also benefits from having free memory for memory mapped segments, which can be another reason to deploy the Historical and Middle Manager services separately.
## External dependencies
View File
@ -28,9 +28,9 @@ sidebar_label: "Indexer"
Its memory management system is still under development and will be significantly enhanced in later releases.
:::
The Apache Druid Indexer service is an alternative to the MiddleManager + Peon task execution system. Instead of forking a separate JVM process per-task, the Indexer runs tasks as separate threads within a single JVM process.
The Apache Druid Indexer service is an alternative to the Middle Manager + Peon task execution system. Instead of forking a separate JVM process per-task, the Indexer runs tasks as separate threads within a single JVM process.
The Indexer is designed to be easier to configure and deploy compared to the MiddleManager + Peon system and to better enable resource sharing across tasks.
The Indexer is designed to be easier to configure and deploy compared to the Middle Manager + Peon system and to better enable resource sharing across tasks.
## Configuration
@ -38,7 +38,7 @@ For Apache Druid Indexer service configuration, see [Indexer Configuration](../c
## HTTP endpoints
The Indexer service shares the same HTTP endpoints as the [MiddleManager](../api-reference/service-status-api.md#middlemanager).
The Indexer service shares the same HTTP endpoints as the [Middle Manager](../api-reference/service-status-api.md#middle-manager).
## Running
@ -73,7 +73,7 @@ This global limit is evenly divided across the number of task slots configured b
To apply the per-task heap limit, the Indexer overrides `maxBytesInMemory` in task tuning configurations, ignoring the default value or any user-configured value. It also overrides `maxRowsInMemory` to an essentially unlimited value: the Indexer does not support row limits.
By default, `druid.worker.globalIngestionHeapLimitBytes` is set to 1/6th of the available JVM heap. This default is chosen to align with the default value of `maxBytesInMemory` in task tuning configs when using the MiddleManager + Peon system, which is also 1/6th of the JVM heap.
By default, `druid.worker.globalIngestionHeapLimitBytes` is set to 1/6th of the available JVM heap. This default is chosen to align with the default value of `maxBytesInMemory` in task tuning configs when using the Middle Manager + Peon system, which is also 1/6th of the JVM heap.
The peak usage for rows held in heap memory relates to the interaction between the `maxBytesInMemory` and `maxPendingPersists` properties in the task tuning configs. When the amount of row data held in-heap by a task reaches the limit specified by `maxBytesInMemory`, a task will persist the in-heap row data. After the persist has been started, the task can again ingest up to `maxBytesInMemory` bytes worth of row data while the persist is running.
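For illustration only, an Indexer `runtime.properties` sketch that sets an explicit global limit instead of relying on the 1/6th-of-heap default (both numbers are assumptions):

```properties
# Number of concurrent task slots on this Indexer (assumed value)
druid.worker.capacity=4
# Cap heap used for in-memory ingestion rows across all tasks at 1 GiB
druid.worker.globalIngestionHeapLimitBytes=1073741824
```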
View File
@ -27,8 +27,8 @@ The Apache Druid indexing service is a highly-available, distributed service tha
Indexing [tasks](../ingestion/tasks.md) are responsible for creating and [killing](../ingestion/tasks.md#kill) Druid [segments](../design/segments.md).
The indexing service is composed of three main components: [Peons](../design/peons.md) that can run a single task, [MiddleManagers](../design/middlemanager.md) that manage Peons, and an [Overlord](../design/overlord.md) that manages task distribution to MiddleManagers.
Overlords and MiddleManagers may run on the same process or across multiple processes, while MiddleManagers and Peons always run on the same process.
The indexing service is composed of three main components: [Peons](../design/peons.md) that can run a single task, [Middle Managers](../design/middlemanager.md) that manage Peons, and an [Overlord](../design/overlord.md) that manages task distribution to Middle Managers.
Overlords and Middle Managers may run on the same process or across multiple processes, while Middle Managers and Peons always run on the same process.
Tasks are managed using API endpoints on the Overlord service. Please see [Tasks API](../api-reference/tasks-api.md) for more information.
View File
@ -149,7 +149,7 @@ parameters across the cluster at runtime.
### Task-related tables
Task-related tables are created and used by the [Overlord](../design/overlord.md) and [MiddleManager](../design/middlemanager.md) when managing tasks.
Task-related tables are created and used by the [Overlord](../design/overlord.md) and [Middle Manager](../design/middlemanager.md) when managing tasks.
### Audit table
View File
@ -1,7 +1,7 @@
---
id: middlemanager
title: "MiddleManager service"
sidebar_label: "MiddleManager"
title: "Middle Manager service"
sidebar_label: "Middle Manager"
---
<!--
@ -23,18 +23,18 @@ sidebar_label: "MiddleManager"
~ under the License.
-->
The MiddleManager service is a worker service that executes submitted tasks. MiddleManagers forward tasks to [Peons](../design/peons.md) that run in separate JVMs.
Druid uses separate JVMs for tasks to isolate resources and logs. Each Peon is capable of running only one task at a time, wheres a MiddleManager may have multiple Peons.
The Middle Manager service is a worker service that executes submitted tasks. Middle Managers forward tasks to [Peons](../design/peons.md) that run in separate JVMs.
Druid uses separate JVMs for tasks to isolate resources and logs. Each Peon is capable of running only one task at a time, whereas a Middle Manager may have multiple Peons.
## Configuration
For Apache Druid MiddleManager service configuration, see [MiddleManager and Peons](../configuration/index.md#middlemanager-and-peons).
For Apache Druid Middle Manager service configuration, see [Middle Manager and Peons](../configuration/index.md#middle-manager-and-peon).
For basic tuning guidance for the MiddleManager service, see [Basic cluster tuning](../operations/basic-cluster-tuning.md#middlemanager).
For basic tuning guidance for the Middle Manager service, see [Basic cluster tuning](../operations/basic-cluster-tuning.md#middle-manager).
## HTTP endpoints
For a list of API endpoints supported by the MiddleManager, see the [Service status API reference](../api-reference/service-status-api.md#middlemanager).
For a list of API endpoints supported by the Middle Manager, see the [Service status API reference](../api-reference/service-status-api.md#middle-manager).
## Running
View File
@ -25,8 +25,8 @@ sidebar_label: "Overlord"
The Overlord service is responsible for accepting tasks, coordinating task distribution, creating locks around tasks, and returning statuses to callers. The Overlord can be configured to run in one of two modes: local or remote (local is the default).
In local mode, the Overlord is also responsible for creating Peons for executing tasks. When running the Overlord in local mode, all MiddleManager and Peon configurations must be provided as well.
Local mode is typically used for simple workflows. In remote mode, the Overlord and MiddleManager are run in separate services and you can run each on a different server.
In local mode, the Overlord is also responsible for creating Peons for executing tasks. When running the Overlord in local mode, all Middle Manager and Peon configurations must be provided as well.
Local mode is typically used for simple workflows. In remote mode, the Overlord and Middle Manager are run in separate services and you can run each on a different server.
This mode is recommended if you intend to use the indexing service as the single endpoint for all Druid indexing.
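As a hedged sketch, the mode is selected through the task runner type in the Overlord `runtime.properties`; the property name `druid.indexer.runner.type` is an assumption based on common Druid configuration:

```properties
# Run in remote mode so tasks execute on separate Middle Manager services
druid.indexer.runner.type=remote
```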
## Configuration
@ -41,7 +41,7 @@ For a list of API endpoints supported by the Overlord, please see the [Service s
## Blacklisted workers
If a MiddleManager has task failures above a threshold, the Overlord will blacklist these MiddleManagers. No more than 20% of the MiddleManagers can be blacklisted. Blacklisted MiddleManagers will be periodically whitelisted.
If a Middle Manager has task failures above a threshold, the Overlord will blacklist these Middle Managers. No more than 20% of the Middle Managers can be blacklisted. Blacklisted Middle Managers will be periodically whitelisted.
The following variables can be used to set the threshold and blacklist timeouts.
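A sketch of how these might be set in the Overlord `runtime.properties`; `druid.indexer.runner.maxPercentageBlacklistWorkers` comes from this section, while the retry property name and both values are assumptions:

```properties
# Blacklist a Middle Manager after this many consecutive task failures (assumed value)
druid.indexer.runner.maxRetriesBeforeBlacklist=5
# Never blacklist more than 20% of Middle Managers
druid.indexer.runner.maxPercentageBlacklistWorkers=20
```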
@ -54,6 +54,6 @@ druid.indexer.runner.maxPercentageBlacklistWorkers
## Autoscaling
The autoscaling mechanisms currently in place are tightly coupled with our deployment infrastructure but the framework should be in place for other implementations. We are highly open to new implementations or extensions of the existing mechanisms. In our own deployments, MiddleManager services are Amazon AWS EC2 nodes and they are provisioned to register themselves in a [galaxy](https://github.com/ning/galaxy) environment.
The autoscaling mechanisms currently in place are tightly coupled with our deployment infrastructure but the framework should be in place for other implementations. We are highly open to new implementations or extensions of the existing mechanisms. In our own deployments, Middle Manager services are Amazon AWS EC2 nodes and they are provisioned to register themselves in a [galaxy](https://github.com/ning/galaxy) environment.
If autoscaling is enabled, new MiddleManagers may be added when a task has been in pending state for too long. MiddleManagers may be terminated if they have not run any tasks for a period of time.
If autoscaling is enabled, new Middle Managers may be added when a task has been in pending state for too long. Middle Managers may be terminated if they have not run any tasks for a period of time.
View File
@ -23,22 +23,22 @@ sidebar_label: "Peon"
~ under the License.
-->
The Peon service is a task execution engine spawned by the MiddleManager. Each Peon runs a separate JVM and is responsible for executing a single task. Peons always run on the same host as the MiddleManager that spawned them.
The Peon service is a task execution engine spawned by the Middle Manager. Each Peon runs a separate JVM and is responsible for executing a single task. Peons always run on the same host as the Middle Manager that spawned them.
## Configuration
For Apache Druid Peon configuration, see [Peon Query Configuration](../configuration/index.md#peon-query-configuration) and [Additional Peon Configuration](../configuration/index.md#additional-peon-configuration).
For basic tuning guidance for MiddleManager tasks, see [Basic cluster tuning](../operations/basic-cluster-tuning.md#task-configurations).
For basic tuning guidance for Middle Manager tasks, see [Basic cluster tuning](../operations/basic-cluster-tuning.md#task-configurations).
## HTTP endpoints
Peons run a single task in a single JVM. The MiddleManager is responsible for creating Peons for running tasks.
Peons run a single task in a single JVM. The Middle Manager is responsible for creating Peons for running tasks.
Peons should rarely run on their own.
## Running
The Peon should seldom run separately from the MiddleManager, except for development purposes.
The Peon should seldom run separately from the Middle Manager, except for development purposes.
```
org.apache.druid.cli.Main internal peon <task_file> <status_file>
View File
@ -28,7 +28,7 @@ Druid stores data in datasources, which are similar to tables in a traditional R
![Segment timeline](../assets/druid-timeline.png)
A datasource may have anywhere from just a few segments, up to hundreds of thousands and even millions of segments. Each segment is created by a MiddleManager as mutable and uncommitted. Data is queryable as soon as it is added to an uncommitted segment. The segment building process accelerates later queries by producing a data file that is compact and indexed:
A datasource may have anywhere from just a few segments, up to hundreds of thousands and even millions of segments. Each segment is created by a Middle Manager as mutable and uncommitted. Data is queryable as soon as it is added to an uncommitted segment. The segment building process accelerates later queries by producing a data file that is compact and indexed:
- Conversion to columnar format
- Indexing with bitmap indexes
@ -37,7 +37,7 @@ A datasource may have anywhere from just a few segments, up to hundreds of thous
- Bitmap compression for bitmap indexes
- Type-aware compression for all columns
Periodically, segments are committed and published to [deep storage](deep-storage.md), become immutable, and move from MiddleManagers to the Historical services. An entry about the segment is also written to the [metadata store](metadata-storage.md). This entry is a self-describing bit of metadata about the segment, including things like the schema of the segment, its size, and its location on deep storage. These entries tell the Coordinator what data is available on the cluster.
Periodically, segments are committed and published to [deep storage](deep-storage.md), become immutable, and move from Middle Managers to the Historical services. An entry about the segment is also written to the [metadata store](metadata-storage.md). This entry is a self-describing bit of metadata about the segment, including things like the schema of the segment, its size, and its location on deep storage. These entries tell the Coordinator what data is available on the cluster.
For details on the segment file format, see [segment files](segments.md).
@ -67,7 +67,7 @@ On the Coordinator / Historical side:
Segments all have a four-part identifier with the following components:
- Datasource name.
- Time interval (for the time chunk containing the segment; this corresponds to the `segmentGranularity` specified at ingestion time).
- Time interval for the time chunk containing the segment; this corresponds to the `segmentGranularity` specified at ingestion time. Uses the same format as [query granularity](../querying/granularities.md).
- Version number (generally an ISO8601 timestamp corresponding to when the segment set was first started).
- Partition number (an integer, unique within a datasource+interval+version; may not necessarily be contiguous).
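Putting the four parts together, an identifier might look like the following hypothetical example (the datasource name, interval, version timestamp, and partition number are all made up):

```
wikipedia_2024-01-01T00:00:00.000Z_2024-01-02T00:00:00.000Z_2024-01-05T12:34:56.789Z_3
```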
View File
@ -41,7 +41,7 @@ The operations that happen over ZK are
1. [Coordinator](../design/coordinator.md) leader election
2. Segment "publishing" protocol from [Historical](../design/historical.md)
3. [Overlord](../design/overlord.md) leader election
4. [Overlord](../design/overlord.md) and [MiddleManager](../design/middlemanager.md) task management
4. [Overlord](../design/overlord.md) and [Middle Manager](../design/middlemanager.md) task management
## Coordinator Leader Election
View File
@ -32,7 +32,7 @@ of GCE (MIG from now on). This choice has been made to ease the configuration of
management.
For this reason, in order to use this extension, the user must have created
1. An instance template with the right machine type and image to bu used to run the MiddleManager
1. An instance template with the right machine type and image to be used to run the Middle Manager
2. A MIG that has been configured to use the instance template created in the point above
Moreover, in order to be able to rescale the machines in the MIG, the Overlord must run with a service account
@ -98,6 +98,6 @@ for parameters other than the ones specified here, such as `selectStrategy` etc.
- The module internally uses the [ListManagedInstances](https://cloud.google.com/compute/docs/reference/rest/v1/instanceGroupManagers/listManagedInstances)
call from the API and, while the documentation of the API states that the call can be paged through using the
`pageToken` argument, the responses to such call do not provide any `nextPageToken` to set such parameter. This means
that the extension can operate safely with a maximum of 500 MiddleManagers instances at any time (the maximum number
that the extension can operate safely with a maximum of 500 Middle Manager instances at any time (the maximum number
of instances to be returned for each call).
View File
@ -22,6 +22,9 @@ title: "Iceberg extension"
~ under the License.
-->
<!-- If the URL changes for this topic, make modifications
to Apache Iceberg docs: https://github.com/apache/iceberg/blob/main/docs/mkdocs.yml -->
## Iceberg Ingest extension
Apache Iceberg is an open table format for huge analytic datasets. [IcebergInputSource](../../ingestion/input-sources.md#iceberg-input-source) lets you ingest data stored in the Iceberg table format into Apache Druid. To use the Iceberg extension, add `druid-iceberg-extensions` to the list of loaded extensions. See [Loading extensions](../../configuration/extensions.md#loading-extensions) for more information.
View File
@ -51,7 +51,7 @@ All the configuration parameters for the Prometheus emitter are under `druid.emi
### Ports for colocated Druid processes
In certain instances, Druid processes may be colocated on the same host. For example, the Broker and Router may share the same server. Other colocated processes include the Historical and MiddleManager or the Coordinator and Overlord. When you have colocated processes, specify `druid.emitter.prometheus.port` separately for each process on each host. For example, even if the Broker and Router share the same host, the Broker runtime properties and the Router runtime properties each need to list `druid.emitter.prometheus.port`, and the port value for both must be different.
In certain instances, Druid processes may be colocated on the same host. For example, the Broker and Router may share the same server. Other colocated processes include the Historical and Middle Manager or the Coordinator and Overlord. When you have colocated processes, specify `druid.emitter.prometheus.port` separately for each process on each host. For example, even if the Broker and Router share the same host, the Broker runtime properties and the Router runtime properties each need to list `druid.emitter.prometheus.port`, and the port value for both must be different.
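For instance, a colocated Broker and Router might each carry their own value in their respective `runtime.properties` files (the port numbers are arbitrary assumptions):

```properties
# Broker runtime.properties
druid.emitter.prometheus.port=19091
# Router runtime.properties on the same host (must use a different port)
druid.emitter.prometheus.port=19092
```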
### Override properties for Peon Tasks
View File
@ -160,8 +160,6 @@ The `tuningConfig` is optional. If no `tuningConfig` is specified, default param
|`resetOffsetAutomatically`|Boolean|Controls behavior when Druid needs to read RabbitMQ messages that are no longer available. Not supported. |no (default == false)|
|`skipSequenceNumberAvailabilityCheck`|Boolean|Whether to enable checking if the current sequence number is still available in a particular RabbitMQ stream. If set to false, the indexing task will attempt to reset the current sequence number (or not), depending on the value of `resetOffsetAutomatically`.|no (default == false)|
|`workerThreads`|Integer|The number of threads that the supervisor uses to handle requests/responses for worker tasks, along with any other internal asynchronous operation.|no (default == min(10, taskCount))|
|`chatAsync`|Boolean| If true, use asynchronous communication with indexing tasks, and ignore the `chatThreads` parameter. If false, use synchronous communication in a thread pool of size `chatThreads`. | no (default == true) |
|`chatThreads`|Integer| The number of threads that will be used for communicating with indexing tasks. Ignored if `chatAsync` is `true` (the default).| no (default == min(10, taskCount * replicas))|
|`chatRetries`|Integer|The number of times HTTP requests to indexing tasks will be retried before considering tasks unresponsive.| no (default == 8)|
|`httpTimeout`|ISO8601 Period|How long to wait for a HTTP response from an indexing task.|no (default == PT10S)|
|`shutdownTimeout`|ISO8601 Period|How long to wait for the supervisor to attempt a graceful shutdown of tasks before exiting.|no (default == PT80S)|
View File
@ -55,8 +55,6 @@ To use S3 for Deep Storage, you must supply [connection information](#configurat
|`druid.storage.bucket`|Bucket to store in.|Must be set.|
|`druid.storage.baseKey`|A prefix string that will be prepended to the object names for the segments published to S3 deep storage|Must be set.|
|`druid.storage.type`|Global deep storage provider. Must be set to `s3` to make use of this extension.|Must be set (likely `s3`).|
|`druid.storage.archiveBucket`|S3 bucket name for archiving when running the *archive task*.|none|
|`druid.storage.archiveBaseKey`|S3 object key prefix for archiving.|none|
|`druid.storage.disableAcl`|Boolean flag for how object permissions are handled. To use ACLs, set this property to `false`. To use Object Ownership, set it to `true`. The permission requirements for ACLs and Object Ownership are different. For more information, see [S3 permissions settings](#s3-permissions-settings).|false|
|`druid.storage.useS3aSchema`|If true, use the "s3a" filesystem when using Hadoop-based ingestion. If false, the "s3n" filesystem will be used. Only affects Hadoop-based ingestion.|false|
@ -119,7 +117,7 @@ The AWS SDK requires that a target region be specified. You can set these by us
For example, to set the region to 'us-east-1' through system properties:
* Add `-Daws.region=us-east-1` to the `jvm.config` file for all Druid services.
* Add `-Daws.region=us-east-1` to `druid.indexer.runner.javaOpts` in [Middle Manager configuration](../../configuration/index.md#middlemanager-configuration) so that the property will be passed to Peon (worker) processes.
* Add `-Daws.region=us-east-1` to `druid.indexer.runner.javaOpts` in [Middle Manager configuration](../../configuration/index.md#middle-manager-configuration) so that the property will be passed to Peon (worker) processes.
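A sketch of the second option, passing the region through to Peon processes via the Middle Manager `runtime.properties` (the surrounding `-server` flag is a placeholder):

```properties
# Pass the AWS region to Peon (worker) processes spawned by this Middle Manager
druid.indexer.runner.javaOpts=-server -Daws.region=us-east-1
```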
### Connecting to S3 configuration
View File
@ -149,7 +149,7 @@ For example, using the static input paths:
```
You can also read from cloud storage such as Amazon S3 or Google Cloud Storage.
To do so, you need to install the necessary library under Druid's classpath in _all MiddleManager or Indexer processes_.
To do so, you need to install the necessary library under Druid's classpath in _all Middle Manager or Indexer processes_.
For S3, you can run the below command to install the [Hadoop AWS module](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/).
```bash
@ -157,7 +157,7 @@ java -classpath "${DRUID_HOME}lib/*" org.apache.druid.cli.Main tools pull-deps -
cp ${DRUID_HOME}/hadoop-dependencies/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar ${DRUID_HOME}/extensions/druid-hdfs-storage/
```
Once you install the Hadoop AWS module in all MiddleManager and Indexer processes, you can put
Once you install the Hadoop AWS module in all Middle Manager and Indexer processes, you can put
your S3 paths in the inputSpec with the below job properties.
For more configurations, see the [Hadoop AWS module](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/).
@ -175,8 +175,8 @@ For more configurations, see the [Hadoop AWS module](https://hadoop.apache.org/d
```
For Google Cloud Storage, you need to install [GCS connector jar](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md)
under `${DRUID_HOME}/hadoop-dependencies` in _all MiddleManager or Indexer processes_.
Once you install the GCS Connector jar in all MiddleManager and Indexer processes, you can put
under `${DRUID_HOME}/hadoop-dependencies` in _all Middle Manager or Indexer processes_.
Once you install the GCS Connector jar in all Middle Manager and Indexer processes, you can put
your Google Cloud Storage paths in the inputSpec with the below job properties.
For more configurations, see the [instructions to configure Hadoop](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md#configure-hadoop),
[GCS core default](https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/v2.0.0/gcs/conf/gcs-core-default.xml)
View File
@ -27,7 +27,7 @@ Loading data in Druid is called _ingestion_ or _indexing_. When you ingest data
your source system and stores it in data files called [_segments_](../design/segments.md).
In general, segment files contain a few million rows each.
For most ingestion methods, the Druid [MiddleManager](../design/middlemanager.md) processes or the
For most ingestion methods, the Druid [Middle Manager](../design/middlemanager.md) processes or the
[Indexer](../design/indexer.md) processes load your source data. The sole exception is Hadoop-based ingestion, which
uses a Hadoop MapReduce job on YARN.
View File
@ -232,7 +232,7 @@ A `dimensionsSpec` can have the following components:
| `spatialDimensions` | An array of [spatial dimensions](../querying/geo.md). | `[]` |
| `includeAllDimensions` | Note that this field only applies to string-based schema discovery where Druid ingests dimensions it discovers as strings. This is different from schema auto-discovery where Druid infers the type for data. You can set `includeAllDimensions` to true to ingest both explicit dimensions in the `dimensions` field and other dimensions that the ingestion task discovers from input data. In this case, the explicit dimensions will appear first in the order that you specify them, and the dimensions dynamically discovered will come after. This flag can be useful especially with auto schema discovery using [`flattenSpec`](./data-formats.md#flattenspec). If this is not set and the `dimensions` field is not empty, Druid will ingest only explicit dimensions. If this is not set and the `dimensions` field is empty, all discovered dimensions will be ingested. | false |
| `useSchemaDiscovery` | Configure Druid to use schema auto-discovery to discover some or all of the dimensions and types for your data. For any dimensions that aren't a uniform type, Druid ingests them as JSON. You can use this for native batch or streaming ingestion. | false |
| `forceSegmentSortByTime` | When set to true (the default), segments created by the ingestion job are sorted by `{__time, dimensions[0], dimensions[1], ...}`. When set to false, segments created by the ingestion job are sorted by `{dimensions[0], dimensions[1], ...}`. To include `__time` in the sort order when this parameter is set to `false`, you must include a dimension named `__time` with type `long` explicitly in the `dimensions` list.<br /><br />Setting this to `false` is an experimental feature; see [Sorting](partitioning.md#sorting) for details. | `true` |
#### Dimension objects
@ -301,15 +301,15 @@ An example `metricsSpec` is:
### `granularitySpec`
The `granularitySpec` is located in `dataSchema``granularitySpec` and is responsible for configuring
the following operations:
The `granularitySpec`, located in `dataSchema` → `granularitySpec`, specifies the following:
1. Partitioning a datasource into [time chunks](../design/storage.md) (via `segmentGranularity`).
2. Truncating the timestamp, if desired (via `queryGranularity`).
3. Specifying which time chunks of segments should be created, for batch ingestion (via `intervals`).
4. Specifying whether ingestion-time [rollup](./rollup.md) should be used or not (via `rollup`).
1. `segmentGranularity` to partition a datasource into [time chunks](../design/storage.md).
2. `queryGranularity` to optionally truncate the timestamp.
3. `intervals` to define the time chunks of segments to create for batch ingestion.
4. `rollup` to enable or disable ingestion-time [rollup](./rollup.md).
Other than `rollup`, these operations are all based on the [primary timestamp](./schema-model.md#primary-timestamp).
Use the format from [query granularities](../querying/granularities.md) to specify both `segmentGranularity` and `queryGranularity`.
An example `granularitySpec` is:
@ -520,6 +520,7 @@ For information on defining an `indexSpec` in a query context, see [SQL-based in
|stringDictionaryEncoding|Encoding format for string value dictionaries used by STRING and [COMPLEX&lt;json&gt;](../querying/nested-columns.md) columns. To enable front coding, set `stringDictionaryEncoding.type` to `frontCoded`. Optionally, you can specify the `bucketSize` and `formatVersion` properties. See [Front coding](#front-coding) for more information.|`{"type":"utf8"}`|
|metricCompression|Compression format for primitive type metric columns. Options are `lz4`, `lzf`, `zstd`, `uncompressed`, or `none` (which is more efficient than `uncompressed`, but not supported by older versions of Druid).|`lz4`|
|longEncoding|Encoding format for long-typed columns. Applies regardless of whether they are dimensions or metrics. Options are `auto` or `longs`. `auto` encodes the values using offset or lookup table depending on column cardinality, and store them with variable size. `longs` stores the value as-is with 8 bytes each.|`longs`|
|complexMetricCompression|Compression format for complex type metric columns. Options are `lz4`, `lzf`, `zstd`, `uncompressed`. Options other than `uncompressed` are not compatible with Druid versions older than 31, and they only apply to complex metrics that do not have specialized column formats.|`uncompressed`|
|jsonCompression|Compression format to use for nested column raw data. Options are `lz4`, `lzf`, `zstd`, or `uncompressed`.|`lz4`|
#### Front coding
@ -561,4 +562,4 @@ For example:
}
}
}
```
```
View File
@ -875,7 +875,7 @@ Each of the SQL queries will be run in its own sub-task and thus for the above e
Compared to the other native batch input sources, SQL input source behaves differently in terms of reading the input data. Therefore, consider the following points before using this input source in a production environment:
* During indexing, each sub-task would execute one of the SQL queries and the results are stored locally on disk. The sub-tasks then proceed to read the data from these local input files and generate segments. Presently, there isnt any restriction on the size of the generated files and this would require the MiddleManagers or Indexers to have sufficient disk capacity based on the volume of data being indexed.
* During indexing, each sub-task would execute one of the SQL queries and the results are stored locally on disk. The sub-tasks then proceed to read the data from these local input files and generate segments. Presently, there isn't any restriction on the size of the generated files, so the Middle Managers or Indexers must have sufficient disk capacity based on the volume of data being indexed.
* Filtering the SQL queries based on the intervals specified in the `granularitySpec` can avoid unwanted data being retrieved and stored locally by the indexing sub-tasks. For example, if the `intervals` specified in the `granularitySpec` is `["2013-01-01/2013-01-02"]` and the SQL query is `SELECT * FROM table1`, `SqlInputSource` will read all the data for `table1` based on the query, even though only data between the intervals specified will be indexed into Druid.
View File
@ -36,7 +36,7 @@ This topic contains configuration information for the Kafka indexing service sup
## Setup
To use the Kafka indexing service, you must first load the `druid-kafka-indexing-service` extension on both the Overlord and the MiddleManager. See [Loading extensions](../configuration/extensions.md) for more information.
To use the Kafka indexing service, you must first load the `druid-kafka-indexing-service` extension on both the Overlord and the Middle Manager. See [Loading extensions](../configuration/extensions.md) for more information.
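A hedged example of loading the extension in `runtime.properties` on both services (any additional extensions you already load would also belong in this list):

```properties
# Load the Kafka indexing service extension on the Overlord and the Middle Manager
druid.extensions.loadList=["druid-kafka-indexing-service"]
```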
## Supervisor spec configuration
@ -421,14 +421,12 @@ For configuration properties shared across all streaming ingestion methods, refe
|Property|Type|Description|Required|Default|
|--------|----|-----------|--------|-------|
|`numPersistThreads`|Integer|The number of threads to use to create and persist incremental segments on the disk. Higher ingestion data throughput results in a larger number of incremental segments, causing significant CPU time to be spent on the creation of the incremental segments on the disk. For datasources with number of columns running into hundreds or thousands, creation of the incremental segments may take up significant time, in the order of multiple seconds. In both of these scenarios, ingestion can stall or pause frequently, causing it to fall behind. You can use additional threads to parallelize the segment creation without blocking ingestion as long as there are sufficient CPU resources available.|No|1|
|`chatAsync`|Boolean|If `true`, use asynchronous communication with indexing tasks, and ignore the `chatThreads` parameter. If `false`, use synchronous communication in a thread pool of size `chatThreads`.|No|`true`|
|`chatThreads`|Integer|The number of threads to use for communicating with indexing tasks. Ignored if `chatAsync` is `true`.|No|`min(10, taskCount * replicas)`|
## Deployment notes on Kafka partitions and Druid segments
Druid assigns Kafka partitions to each Kafka indexing task. A task writes the events it consumes from Kafka into a single segment for the segment granularity interval until it reaches one of the following limits: `maxRowsPerSegment`, `maxTotalRows`, or `intermediateHandoffPeriod`. At this point, the task creates a new partition for this segment granularity to contain subsequent events.
The Kafka indexing task also does incremental hand-offs. Therefore, segments become available as they are ready and you don't have to wait for all segments until the end of the task duration. When the task reaches one of `maxRowsPerSegment`, `maxTotalRows`, or `intermediateHandoffPeriod`, it hands off all the segments and creates a new set of segments for further events. This allows the task to run for longer durations without accumulating old segments locally on MiddleManager services.
The Kafka indexing task also does incremental hand-offs. Therefore, segments become available as they are ready and you don't have to wait for all segments until the end of the task duration. When the task reaches one of `maxRowsPerSegment`, `maxTotalRows`, or `intermediateHandoffPeriod`, it hands off all the segments and creates a new set of segments for further events. This allows the task to run for longer durations without accumulating old segments locally on Middle Manager services.
The Kafka indexing service may still produce some small segments. For example, consider the following scenario:
- Task duration is 4 hours.
View File
@ -33,7 +33,7 @@ This topic contains configuration information for the Kinesis indexing service s
## Setup
To use the Kinesis indexing service, you must first load the `druid-kinesis-indexing-service` core extension on both the Overlord and the MiddleManager. See [Loading extensions](../configuration/extensions.md#loading-extensions) for more information.
To use the Kinesis indexing service, you must first load the `druid-kinesis-indexing-service` core extension on both the Overlord and the Middle Manager. See [Loading extensions](../configuration/extensions.md#loading-extensions) for more information.
Review [Known issues](#known-issues) before deploying the `druid-kinesis-indexing-service` extension to production.
@ -249,7 +249,7 @@ At this point, the task creates a new shard for this segment granularity to cont
The Kinesis indexing task also performs incremental hand-offs so that the segments created by the task are not held up until the task duration is over.
When the task reaches one of the `maxRowsPerSegment`, `maxTotalRows`, or `intermediateHandoffPeriod` limits, it hands off all the segments and creates a new set of segments for further events. This allows the task to run for longer durations
without accumulating old segments locally on MiddleManager services.
without accumulating old segments locally on Middle Manager services.
The Kinesis indexing service may still produce some small segments.
For example, consider the following scenario:
View File
@ -55,7 +55,7 @@ The parallel task type `index_parallel` is a task for multi-threaded batch index
The `index_parallel` task is a supervisor task that orchestrates
the whole indexing process. The supervisor task splits the input data and creates worker tasks to process the individual portions of data.
Druid issues the worker tasks to the Overlord. The Overlord schedules and runs the workers on MiddleManagers or Indexers. After a worker task successfully processes the assigned input portion, it reports the resulting segment list to the Supervisor task.
Druid issues the worker tasks to the Overlord. The Overlord schedules and runs the workers on Middle Managers or Indexers. After a worker task successfully processes the assigned input portion, it reports the resulting segment list to the Supervisor task.
The Supervisor task periodically checks the status of worker tasks. If a task fails, the Supervisor retries the task until the number of retries reaches the configured limit. If all worker tasks succeed, it publishes the reported segments at once and finalizes ingestion.
@ -369,11 +369,11 @@ the Parallel task splits the input data based on the split hint spec
and assigns each split to a worker task. Each worker task (type `partial_index_generate`) reads the assigned split, and partitions rows by the time chunk from `segmentGranularity` (primary partition key) in the `granularitySpec`
and then by the hash value of `partitionDimensions` (secondary partition key) in the `partitionsSpec`.
The partitioned data is stored in local storage of
the [middleManager](../design/middlemanager.md) or the [indexer](../design/indexer.md).
the [Middle Manager](../design/middlemanager.md) or the [Indexer](../design/indexer.md).
The `partial segment merge` phase is similar to the Reduce phase in MapReduce.
The Parallel task spawns a new set of worker tasks (type `partial_index_generic_merge`) to merge the partitioned data created in the previous phase. Here, the partitioned data is shuffled based on
the time chunk and the hash value of `partitionDimensions` to be merged; each worker task reads the data falling in the same time chunk and the same hash value from multiple MiddleManager/Indexer processes and merges them to create the final segments. Finally, they push the final segments to the deep storage at once.
the time chunk and the hash value of `partitionDimensions` to be merged; each worker task reads the data falling in the same time chunk and the same hash value from multiple Middle Manager/Indexer processes and merges them to create the final segments. Finally, they push the final segments to the deep storage at once.
##### Hash partition function
@ -426,12 +426,12 @@ to create partitioned data. Each worker task reads a split created as in the pre
partitions rows by the time chunk from the `segmentGranularity` (primary partition key) in the `granularitySpec`
and then by the range partitioning found in the previous phase.
The partitioned data is stored in local storage of
the [middleManager](../design/middlemanager.md) or the [indexer](../design/indexer.md).
the [Middle Manager](../design/middlemanager.md) or the [Indexer](../design/indexer.md).
In the `partial segment merge` phase, the parallel index task spawns a new set of worker tasks (type `partial_index_generic_merge`) to merge the partitioned
data created in the previous phase. Here, the partitioned data is shuffled based on
the time chunk and the value of `partitionDimension`; each worker task reads the segments
falling in the same partition of the same range from multiple MiddleManager/Indexer processes and merges
falling in the same partition of the same range from multiple Middle Manager/Indexer processes and merges
them to create the final segments. Finally, they push the final segments to the deep storage.
:::info
View File
@ -44,33 +44,60 @@ Partitioning by time is important for two reasons:
The most common choices to balance these considerations are `hour` and `day`. For streaming ingestion, `hour` is especially
common, because it allows compaction to follow ingestion with less of a time delay.
The following table describes how to configure time chunk partitioning.
|Method|Configuration|
|------|------------|
|[SQL](../multi-stage-query/index.md)|[`PARTITIONED BY`](../multi-stage-query/concepts.md#partitioning)|
|[Kafka](../ingestion/kafka-ingestion.md) or [Kinesis](../ingestion/kinesis-ingestion.md)|`segmentGranularity` inside the [`granularitySpec`](ingestion-spec.md#granularityspec)|
|[Native batch](native-batch.md) or [Hadoop](hadoop.md)|`segmentGranularity` inside the [`granularitySpec`](ingestion-spec.md#granularityspec)|
## Secondary partitioning
Druid can partition segments within a particular time chunk further depending upon options that vary based on the ingestion type you have chosen. In general, secondary partitioning on a particular dimension improves locality. This means that rows with the same value for that dimension are stored together, decreasing access time.
Druid further partitions each time chunk into immutable segments. Secondary partitioning on a particular dimension improves locality. This means that rows with the same value for that dimension are stored together, decreasing access time.
To achieve the best performance and smallest overall footprint, partition your data on a "natural"
dimension that you often use as a filter when possible. Such partitioning often improves compression and query performance. For example, some cases have yielded threefold storage size decreases.
To achieve the best performance and smallest overall footprint, partition your data on a "natural" dimension that
you often use as a filter, or that achieves some alignment within your data. Such partitioning can improve compression
and query performance by significant multiples.
## Partitioning and sorting
The following table describes how to configure secondary partitioning.
Partitioning and sorting work well together. If you do have a "natural" partitioning dimension, consider placing it first in the `dimensions` list of your `dimensionsSpec`. This way Druid sorts rows within each segment by that column. This sorting configuration frequently improves compression more than using partitioning alone.
Note that Druid always sorts rows within a segment by timestamp first, even before the first dimension listed in your `dimensionsSpec`. This sorting can preclude the efficacy of dimension sorting. To work around this limitation if necessary, set your `queryGranularity` equal to `segmentGranularity` in your [`granularitySpec`](./ingestion-spec.md#granularityspec). Druid will set all timestamps within the segment to the same value, letting you identify a [secondary timestamp](schema-design.md#secondary-timestamps) as the "real" timestamp.
## How to configure partitioning
Not all ingestion methods support an explicit partitioning configuration, and not all have equivalent levels of flexibility. If you are doing initial ingestion through a less-flexible method like
Kafka, you can use [reindexing](../data-management/update.md#reindex) or [compaction](../data-management/compaction.md) to repartition your data after initial ingestion. This is a powerful technique you can use to optimally partition any data older than a certain time threshold while you continuously add new data from a stream.
The following table shows how each ingestion method handles partitioning:
|Method|How it works|
|Method|Configuration|
|------|------------|
|[Native batch](native-batch.md)|Configured using [`partitionsSpec`](native-batch.md#partitionsspec) inside the `tuningConfig`.|
|[SQL](../multi-stage-query/index.md)|Configured using [`PARTITIONED BY`](../multi-stage-query/concepts.md#partitioning) and [`CLUSTERED BY`](../multi-stage-query/concepts.md#clustering).|
|[Hadoop](hadoop.md)|Configured using [`partitionsSpec`](hadoop.md#partitionsspec) inside the `tuningConfig`.|
|[Kafka indexing service](../ingestion/kafka-ingestion.md)|Kafka topic partitioning defines how Druid partitions the datasource. You can also [reindex](../data-management/update.md#reindex) or [compact](../data-management/compaction.md) to repartition after initial ingestion.|
|[Kinesis indexing service](../ingestion/kinesis-ingestion.md)|Kinesis stream sharding defines how Druid partitions the datasource. You can also [reindex](../data-management/update.md#reindex) or [compact](../data-management/compaction.md) to repartition after initial ingestion.|
|[SQL](../multi-stage-query/index.md)|[`CLUSTERED BY`](../multi-stage-query/concepts.md#clustering)|
|[Kafka](../ingestion/kafka-ingestion.md) or [Kinesis](../ingestion/kinesis-ingestion.md)|Upstream partitioning defines how Druid partitions the datasource. You can also alter clustering using [`REPLACE`](../multi-stage-query/concepts.md#replace) (with `CLUSTERED BY`) or [compaction](../data-management/compaction.md) after initial ingestion.|
|[Native batch](native-batch.md) or [Hadoop](hadoop.md)|[`partitionsSpec`](native-batch.md#partitionsspec) inside the `tuningConfig`|
## Sorting
Each segment is internally sorted to promote compression and locality.
Partitioning and sorting work well together. If you do have a "natural" partitioning dimension, consider placing it
first in your sort order as well. This way, Druid sorts rows within each segment by that column. This sorting configuration
frequently improves compression and performance more than using partitioning alone.
The following table describes how to configure sorting.
|Method|Configuration|
|------|------------|
|[SQL](../multi-stage-query/index.md)|Uses order of fields in [`CLUSTERED BY`](../multi-stage-query/concepts.md#clustering) or [`segmentSortOrder`](../multi-stage-query/reference.md#context) in the query context|
|[Kafka](../ingestion/kafka-ingestion.md) or [Kinesis](../ingestion/kinesis-ingestion.md)|Uses order of fields in [`dimensionsSpec`](ingestion-spec.md#dimensionsspec)|
|[Native batch](native-batch.md) or [Hadoop](hadoop.md)|Uses order of fields in [`dimensionsSpec`](ingestion-spec.md#dimensionsspec)|
:::info
Druid implicitly sorts rows within a segment by `__time` first before any `dimensions` or `CLUSTERED BY` fields, unless
you set `forceSegmentSortByTime` to `false` in your
[query context](../multi-stage-query/reference.md#context-parameters) (for SQL) or in your
[`dimensionsSpec`](ingestion-spec.md#dimensionsspec) (for other ingestion forms).
Setting `forceSegmentSortByTime` to `false` is an experimental feature. Segments created with sort orders that
do not start with `__time` can only be read by Druid 31 or later. Additionally, at this time, certain queries are not
supported on such segments, including:
- Native queries with `granularity` other than `all`.
- Native `scan` query with ascending or descending time order.
- SQL queries that plan into an unsupported native query.
:::
## Learn more
View File
@ -57,7 +57,7 @@ In Druid, on the other hand, it is common to use totally flat datasources that d
the example of the "sales" table, in Druid it would be typical to store "product_id", "product_name", and
"product_category" as dimensions directly in a Druid "sales" datasource, without using a separate "products" table.
Totally flat schemas substantially increase performance, since the need for joins is eliminated at query time. As an
an added speed boost, this also allows Druid's query layer to operate directly on compressed dictionary-encoded data.
added speed boost, this also allows Druid's query layer to operate directly on compressed dictionary-encoded data.
Perhaps counter-intuitively, this does _not_ substantially increase storage footprint relative to normalized schemas,
since Druid uses dictionary encoding to effectively store just a single integer per row for string columns.
@ -101,7 +101,7 @@ see [partitioning and sorting](./partitioning.md) below for details).
* Create other dimensions for attributes attached to your data points. These are often called "tags" in timeseries
database systems.
* Create [metrics](../querying/aggregations.md) corresponding to the types of aggregations that you want to be able
to query. Typically this includes "sum", "min", and "max" (in one of the long, float, or double flavors). If you want the ability
to query. Typically, this includes "sum", "min", and "max" (in one of the long, float, or double flavors). If you want the ability
to compute percentiles or quantiles, use Druid's [approximate aggregators](../querying/aggregations.md#approximate-aggregations).
* Consider enabling [rollup](./rollup.md), which will allow Druid to potentially combine multiple points into one
row in your Druid datasource. This can be useful if you want to store data at a different time granularity than it is
@ -160,7 +160,7 @@ approximate distinct counts, and you'll reduce your storage footprint.
Sketches reduce memory footprint at query time because they limit the amount of data that needs to be shuffled between
servers. For example, in a quantile computation, instead of needing to send all data points to a central location
so they can be sorted and the quantile can be computed, Druid instead only needs to send a sketch of the points. This
so that they can be sorted and the quantile can be computed, Druid instead only needs to send a sketch of the points. This
can reduce data transfer needs to mere kilobytes.
For details about the sketches available in Druid, see the
@ -255,7 +255,7 @@ Druid can infer the schema for your data in one of two ways:
You can have Druid infer the schema and types for your data partially or fully by setting `dimensionsSpec.useSchemaDiscovery` to `true` and defining some or no dimensions in the dimensions list.
When performing type-aware schema discovery, Druid can discover all of the columns of your input data (that aren't in
When performing type-aware schema discovery, Druid can discover all the columns of your input data (that are not present in
the exclusion list). Druid automatically chooses the most appropriate native Druid type among `STRING`, `LONG`,
`DOUBLE`, `ARRAY<STRING>`, `ARRAY<LONG>`, `ARRAY<DOUBLE>`, or `COMPLEX<json>` for nested data. For input formats with
native boolean types, Druid ingests these values as longs if `druid.expressions.useStrictBooleans` is set to `true`
@ -298,7 +298,7 @@ If you previously used string-based schema discovery and want to migrate to type
### Including the same column as a dimension and a metric
One workflow with unique IDs is to be able to filter on a particular ID, while still being able to do fast unique counts on the ID column.
If you are not using schema-less dimensions, this use case is supported by setting the `name` of the metric to something different than the dimension.
If you are not using schema-less dimensions, this use case is supported by setting the `name` of the metric to something different from the dimension.
If you are using schema-less dimensions, the best practice here is to include the same column twice, once as a dimension, and as a `hyperUnique` metric. This may involve
some work at ETL time.

View File

@ -28,7 +28,7 @@ Apache Druid can consume data streams from the following external streaming sour
* Amazon Kinesis through the bundled [Kinesis indexing service](kinesis-ingestion.md) extension.
Each indexing service provides real-time data ingestion with an exactly-once stream processing guarantee.
To use either of the streaming ingestion methods, you must first load the associated extension on both the Overlord and the MiddleManager. See [Loading extensions](../configuration/extensions.md#loading-extensions) for more information.
To use either of the streaming ingestion methods, you must first load the associated extension on both the Overlord and the Middle Manager. See [Loading extensions](../configuration/extensions.md#loading-extensions) for more information.
Streaming ingestion is controlled by a continuously running [supervisor](supervisor.md).
The supervisor oversees the state of indexing tasks to coordinate handoffs, manage failures, and ensure that scalability and replication requirements are maintained.

View File

@ -393,7 +393,7 @@ For information on how to terminate a supervisor by API, see [Supervisors: Termi
## Capacity planning
Indexing tasks run on MiddleManagers and are limited by the resources available in the MiddleManager cluster. In particular, you should make sure that you have sufficient worker capacity, configured using the
Indexing tasks run on Middle Managers and are limited by the resources available in the Middle Manager cluster. In particular, you should make sure that you have sufficient worker capacity, configured using the
`druid.worker.capacity` property, to handle the configuration in the supervisor spec. Note that worker capacity is
shared across all types of indexing tasks, so you should plan your worker capacity to handle your total indexing load, such as batch processing, streaming tasks, and merging tasks. If your workers run out of capacity, indexing tasks queue and wait for the next available worker. This may cause queries to return partial results but will not result in data loss, assuming the tasks run before the stream purges those sequence numbers.

View File

@ -239,6 +239,7 @@ The `ingestionState` shows what step of ingestion the task reached. Possible sta
- `NOT_STARTED`: The task has not begun reading any rows
- `DETERMINE_PARTITIONS`: The task is processing rows to determine partitioning
- `BUILD_SEGMENTS`: The task is processing rows to construct segments
- `SEGMENT_AVAILABILITY_WAIT`: The task has published its segments and is waiting for them to become available.
- `COMPLETED`: The task has finished its work.
Only batch tasks have the DETERMINE_PARTITIONS phase. Realtime tasks such as those created by the Kafka Indexing Service do not have a DETERMINE_PARTITIONS phase.
@ -457,7 +458,7 @@ To enable batched segment allocation on the overlord, set `druid.indexer.tasklo
The task context is used for various individual task configuration options.
Specify task context configurations in the `context` field of the ingestion spec.
When configuring [automatic compaction](../data-management/automatic-compaction.md), set the task context configurations in `taskContext` rather than in `context`.
The settings get passed into the `context` field of the compaction tasks issued to MiddleManagers.
The settings get passed into the `context` field of the compaction tasks issued to Middle Managers.
The following parameters apply to all task types.
@ -476,18 +477,18 @@ Logs are created by ingestion tasks as they run. You can configure Druid to push
Once the task has been submitted to the Overlord, it remains `WAITING` for locks to be acquired. Worker slot allocation is then `PENDING` until the task can actually start executing.
The task then starts creating logs in a local directory of the middle manager (or indexer) in a `log` directory for the specific `taskId` at [`druid.worker.baseTaskDirs`](../configuration/index.md#middlemanager-configuration).
The task then starts creating logs in a local directory of the middle manager (or indexer) in a `log` directory for the specific `taskId` at [`druid.worker.baseTaskDirs`](../configuration/index.md#middle-manager-configuration).
When the task completes - whether it succeeds or fails - the middle manager (or indexer) will push the task log file into the location specified in [`druid.indexer.logs`](../configuration/index.md#task-logging).
Task logs on the Druid web console are retrieved via an [API](../api-reference/service-status-api.md#overlord) on the Overlord. It automatically detects where the log file is, either in the middleManager / indexer or in long-term storage, and passes it back.
Task logs on the Druid web console are retrieved via an [API](../api-reference/service-status-api.md#overlord) on the Overlord. It automatically detects where the log file is, either in the Middle Manager / indexer or in long-term storage, and passes it back.
If you don't see the log file in long-term storage, it means either:
1. the middleManager / indexer failed to push the log file to deep storage or
2. the task did not complete.
- the Middle Manager / indexer failed to push the log file to deep storage or
- the task did not complete.
You can check the middleManager / indexer logs locally to see if there was a push failure. If there was not, check the Overlord's own process logs to see why the task failed before it started.
You can check the Middle Manager / indexer logs locally to see if there was a push failure. If there was not, check the Overlord's own process logs to see why the task failed before it started.
:::info
If you are running the indexing service in remote mode, the task logs must be stored in S3, Azure Blob Store, Google Cloud Storage or HDFS.

View File

@ -172,13 +172,13 @@ Clustering is important for two reasons:
1. Lower storage footprint due to improved locality, and therefore improved compressibility.
2. Better query performance due to dimension-based segment pruning, which removes segments from consideration when they
cannot possibly contain data matching a query's filter. This speeds up filters like `x = 'foo'` and `x IN ('foo',
'bar')`.
cannot possibly contain data matching a query's filter. This speeds up filters like `x = 'foo'` and
`x IN ('foo', 'bar')`.
To activate dimension-based pruning, these requirements must be met:
- Segments were generated by a `REPLACE` statement, not an `INSERT` statement.
- All `CLUSTERED BY` columns are single-valued string columns.
- `CLUSTERED BY` begins with single-valued string columns. These single-valued string columns are used for pruning.
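For example, the following hypothetical `REPLACE` query (the datasource and column names are assumptions) meets these requirements: it uses `REPLACE` rather than `INSERT`, and `CLUSTERED BY` begins with single-valued string columns, so those columns become eligible for dimension-based segment pruning.
```sql
-- Hypothetical reclustering of an existing datasource. "countryName" and
-- "cityName" are assumed to be single-valued string columns.
REPLACE INTO "wikipedia" OVERWRITE ALL
SELECT *
FROM "wikipedia"
PARTITIONED BY DAY
CLUSTERED BY "countryName", "cityName"
```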
If these requirements are _not_ met, Druid still clusters data during ingestion but will not be able to perform
dimension-based segment pruning at query time. You can tell if dimension-based segment pruning is possible by using the
@ -188,6 +188,10 @@ available in the **Segments** view under the **Partitioning** column.
For more information about syntax, see [`CLUSTERED BY`](./reference.md#clustered-by).
For more information about the mechanics of clustering, refer to
[Secondary partitioning](../ingestion/partitioning#secondary-partitioning) and
[Sorting](../ingestion/partitioning#sorting).
### Rollup
[Rollup](../ingestion/rollup.md) is a technique that pre-aggregates data during ingestion to reduce the amount of data

View File

@ -130,13 +130,13 @@ FROM <table>
##### S3
Export results to S3 by passing the function `S3()` as an argument to the `EXTERN` function. Note that this requires the `druid-s3-extensions`.
The `S3()` function is a Druid function that configures the connection. Arguments for `S3()` should be passed as named parameters with the value in single quotes like the following example:
Export results to S3 by passing the function `s3()` as an argument to the `EXTERN` function. Note that this requires the `druid-s3-extensions`.
The `s3()` function is a Druid function that configures the connection. Arguments for `s3()` should be passed as named parameters with the value in single quotes like the following example:
```sql
INSERT INTO
EXTERN(
S3(bucket => 'your_bucket', prefix => 'prefix/to/files')
s3(bucket => 'your_bucket', prefix => 'prefix/to/files')
)
AS CSV
SELECT
@ -362,8 +362,13 @@ For more information about partitioning, see [Partitioning](concepts.md#partitio
### `CLUSTERED BY`
The `CLUSTERED BY <column list>` clause is optional for [INSERT](#insert) and [REPLACE](#replace). It accepts a list of
column names or expressions. Druid's segment generation only supports ascending order, so an `INSERT` or `REPLACE` query with
`CLUSTERED BY` columns in `DESC` ordering is not allowed.
column names or expressions.
This column list is used for [secondary partitioning](../ingestion/partitioning.md#secondary-partitioning) of segments
within a time chunk, and [sorting](../ingestion/partitioning.md#sorting) of rows within a segment. For sorting purposes,
Druid implicitly prepends `__time` to the `CLUSTERED BY` column list, unless
[`forceSegmentSortByTime`](#context) is set to `false`
(an experimental feature; see [Sorting](../ingestion/partitioning.md#sorting) for details).
For more information about clustering, see [Clustering](concepts.md#clustering).
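For example, in the following hypothetical ingestion (the input URI and column names are placeholders), Druid partitions segments within each day by the clustering columns and, because `__time` is implicitly prepended, sorts rows within each segment by `__time`, then `country`, then `city`.
```sql
-- Hypothetical ingestion; the input URI and column names are placeholders.
REPLACE INTO "visits" OVERWRITE ALL
SELECT
  TIME_PARSE("ts") AS "__time",
  "country",
  "city",
  "session_length"
FROM TABLE(
  EXTERN(
    '{"type": "http", "uris": ["https://example.com/visits.json"]}',
    '{"type": "json"}',
    '[{"name": "ts", "type": "string"}, {"name": "country", "type": "string"}, {"name": "city", "type": "string"}, {"name": "session_length", "type": "long"}]'
  )
)
PARTITIONED BY DAY
CLUSTERED BY "country", "city"
-- Effective per-segment sort order: __time, country, city
```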
@ -397,14 +402,15 @@ The following table lists the context parameters for the MSQ task engine:
| `arrayIngestMode` | INSERT, REPLACE<br /><br /> Controls how ARRAY type values are stored in Druid segments. When set to `array` (recommended for SQL compliance), Druid will store all ARRAY typed values in [ARRAY typed columns](../querying/arrays.md), and supports storing both VARCHAR and numeric typed arrays. When set to `mvd` (the default, for backwards compatibility), Druid only supports VARCHAR typed arrays, and will store them as [multi-value string columns](../querying/multi-value-dimensions.md). See `arrayIngestMode` in the [Arrays](../querying/arrays.md) page for more details. | `mvd` (for backwards compatibility, recommended to use `array` for SQL compliance)|
| `sqlJoinAlgorithm` | SELECT, INSERT, REPLACE<br /><br />Algorithm to use for JOIN. Use `broadcast` (the default) for broadcast hash join or `sortMerge` for sort-merge join. Affects all JOIN operations in the query. This is a hint to the MSQ engine and the actual joins in the query may proceed in a different way than specified. See [Joins](#joins) for more details. | `broadcast` |
| `rowsInMemory` | INSERT or REPLACE<br /><br />Maximum number of rows to store in memory at once before flushing to disk during the segment generation process. Ignored for non-INSERT queries. In most cases, use the default value. You may need to override the default if you run into one of the [known issues](./known-issues.md) around memory usage. | 100,000 |
| `segmentSortOrder` | INSERT or REPLACE<br /><br />Normally, Druid sorts rows in individual segments using `__time` first, followed by the [CLUSTERED BY](#clustered-by) clause. When you set `segmentSortOrder`, Druid sorts rows in segments using this column list first, followed by the CLUSTERED BY order.<br /><br />You provide the column list as comma-separated values or as a JSON array in string form. If your query includes `__time`, then this list must begin with `__time`. For example, consider an INSERT query that uses `CLUSTERED BY country` and has `segmentSortOrder` set to `__time,city`. Within each time chunk, Druid assigns rows to segments based on `country`, and then within each of those segments, Druid sorts those rows by `__time` first, then `city`, then `country`. | empty list |
| `segmentSortOrder` | INSERT or REPLACE<br /><br />Normally, Druid sorts rows in individual segments using `__time` first, followed by the [CLUSTERED BY](#clustered-by) clause. When you set `segmentSortOrder`, Druid uses the order from this context parameter instead. Provide the column list as comma-separated values or as a JSON array in string form.<br /><br />For example, consider an INSERT query that uses `CLUSTERED BY country` and has `segmentSortOrder` set to `__time,city,country`. Within each time chunk, Druid assigns rows to segments based on `country`, and then within each of those segments, Druid sorts those rows by `__time` first, then `city`, then `country`. | empty list |
| `forceSegmentSortByTime` | INSERT or REPLACE<br /><br />When set to `true` (the default), Druid prepends `__time` to [CLUSTERED BY](#clustered-by) when determining the sort order for individual segments. Druid also requires that `segmentSortOrder`, if provided, starts with `__time`.<br /><br />When set to `false`, Druid uses the [CLUSTERED BY](#clustered-by) alone to determine the sort order for individual segments, and does not require that `segmentSortOrder` begin with `__time`. Setting this parameter to `false` is an experimental feature; see [Sorting](../ingestion/partitioning#sorting) for details. | `true` |
| `maxParseExceptions`| SELECT, INSERT, REPLACE<br /><br />Maximum number of parse exceptions that are ignored while executing the query before it stops with `TooManyWarningsFault`. To ignore all the parse exceptions, set the value to -1. | 0 |
| `rowsPerSegment` | INSERT or REPLACE<br /><br />The number of rows per segment to target. The actual number of rows per segment may be somewhat higher or lower than this number. In most cases, use the default. For general information about sizing rows per segment, see [Segment Size Optimization](../operations/segment-optimization.md). | 3,000,000 |
| `indexSpec` | INSERT or REPLACE<br /><br />An [`indexSpec`](../ingestion/ingestion-spec.md#indexspec) to use when generating segments. May be a JSON string or object. See [Front coding](../ingestion/ingestion-spec.md#front-coding) for details on configuring an `indexSpec` with front coding. | See [`indexSpec`](../ingestion/ingestion-spec.md#indexspec). |
| `durableShuffleStorage` | SELECT, INSERT, REPLACE <br /><br />Whether to use durable storage for shuffle mesh. To use this feature, configure the durable storage at the server level using `druid.msq.intermediate.storage.enable=true`. If these properties are not configured, any query with the context variable `durableShuffleStorage=true` fails with a configuration error. <br /><br /> | `false` |
| `faultTolerance` | SELECT, INSERT, REPLACE<br /><br /> Whether to turn on fault tolerance mode or not. Failed workers are retried based on [Limits](#limits). Cannot be used when `durableShuffleStorage` is explicitly set to false. | `false` |
| `selectDestination` | SELECT<br /><br /> Controls where the final result of the SELECT query is written. <br />Use `taskReport` (the default) to write SELECT results to the task report. <b>This is not scalable, since the task report grows very large for large result sets.</b> <br />Use `durableStorage` to write results to a durable storage location. <b>For large result sets, we recommend using `durableStorage`.</b> To configure durable storage, see the [durable storage](#durable-storage) section. | `taskReport` |
| `waitUntilSegmentsLoad` | INSERT, REPLACE<br /><br /> If set, the ingest query waits for the generated segment to be loaded before exiting, else the ingest query exits without waiting. The task and live reports contain the information about the status of loading segments if this flag is set. This will ensure that any future queries made after the ingestion exits will include results from the ingestion. The drawback is that the controller task will stall till the segments are loaded. | `false` |
| `waitUntilSegmentsLoad` | INSERT, REPLACE<br /><br /> If set, the ingest query waits for the generated segments to be loaded before exiting; otherwise, the ingest query exits without waiting. If this flag is set, the task and live reports contain information about the status of segment loading. This ensures that any queries made after the ingestion exits include results from the ingestion. The drawback is that the controller task stalls until the segments are loaded. | `false` |
| `includeSegmentSource` | SELECT, INSERT, REPLACE<br /><br /> Controls the sources that are queried for results, in addition to the segments present on deep storage. Can be `NONE` or `REALTIME`. If this value is `NONE`, only non-realtime (published and used) segments are downloaded from deep storage. If this value is `REALTIME`, results also include data from realtime tasks. `REALTIME` cannot be used while writing data into the same datasource it is read from.| `NONE` |
| `rowsPerPage` | SELECT<br /><br />The number of rows per page to target. The actual number of rows per page may be somewhat higher or lower than this number. In most cases, use the default.<br /> This property comes into effect only when `selectDestination` is set to `durableStorage` | 100000 |
| `skipTypeVerification` | INSERT or REPLACE<br /><br />During query validation, Druid validates that [string arrays](../querying/arrays.md) and [multi-value dimensions](../querying/multi-value-dimensions.md) are not mixed in the same column. If you are intentionally migrating from one to the other, use this context parameter to disable type validation.<br /><br />Provide the column list as comma-separated values or as a JSON array in string form.| empty list |

View File

@ -176,29 +176,29 @@ To estimate total memory usage of the Broker under these guidelines:
- Heap: allocated heap size
- Direct Memory: `(druid.processing.numMergeBuffers + 1) * druid.processing.buffer.sizeBytes`
### MiddleManager
### Middle Manager
The MiddleManager is a lightweight task controller/manager that launches Task processes, which perform ingestion work.
The Middle Manager is a lightweight task controller/manager that launches Task processes, which perform ingestion work.
#### MiddleManager heap sizing
#### Middle Manager heap sizing
The MiddleManager itself does not require much resources, you can set the heap to ~128MiB generally.
The Middle Manager itself does not require many resources; you can generally set its heap to ~128MiB.
#### SSD storage
We recommend using SSDs for storage on the MiddleManagers, as the Tasks launched by MiddleManagers handle segment data stored on disk.
We recommend using SSDs for storage on the Middle Managers, as the Tasks launched by Middle Managers handle segment data stored on disk.
#### Task Count
The number of tasks a MiddleManager can launch is controlled by the `druid.worker.capacity` setting.
The number of tasks a Middle Manager can launch is controlled by the `druid.worker.capacity` setting.
The number of workers needed in your cluster depends on how many concurrent ingestion tasks you need to run for your use cases. The number of workers that can be launched on a given machine depends on the size of resources allocated per worker and available system resources.
You can allocate more MiddleManager machines to your cluster to add task capacity.
You can allocate more Middle Manager machines to your cluster to add task capacity.
#### Task configurations
The following section below describes configuration for Tasks launched by the MiddleManager. The Tasks can be queried and perform ingestion workloads, so they require more resources than the MM.
The following section describes configuration for Tasks launched by the Middle Manager. The Tasks can be queried and perform ingestion workloads, so they require more resources than the Middle Manager itself.
##### Task heap sizing
@ -249,7 +249,7 @@ To estimate total memory usage of a Task under these guidelines:
- Heap: `1GiB + (2 * total size of lookup maps)`
- Direct Memory: `(druid.processing.numThreads + druid.processing.numMergeBuffers + 1) * druid.processing.buffer.sizeBytes`
The total memory usage of the MiddleManager + Tasks:
The total memory usage of the Middle Manager + Tasks:
`MM heap size + druid.worker.capacity * (single task memory usage)`
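For example, assuming a hypothetical configuration of a 128 MiB Middle Manager heap, `druid.worker.capacity` = 4, 1 GiB of task heap (no lookups), and task direct memory of `(2 threads + 2 merge buffers + 1) * 100MiB = 500MiB`, the total comes to roughly `128MiB + 4 * 1.5GiB`, or about 6.1 GiB.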
@ -435,7 +435,7 @@ Additionally, for large JVM heaps, here are a few Garbage Collection efficiency
- Mount /tmp on tmpfs. See [The Four Month Bug: JVM statistics cause garbage collection pauses](http://www.evanjones.ca/jvm-mmap-pause.html).
- On Disk-IO intensive processes (e.g., Historical and MiddleManager), GC and Druid logs should be written to a different disk than where data is written.
- On Disk-IO intensive processes (e.g., Historical and Middle Manager), GC and Druid logs should be written to a different disk than where data is written.
- Disable [Transparent Huge Pages](https://www.kernel.org/doc/html/latest/admin-guide/mm/transhuge.html).
- Try disabling biased locking by using `-XX:-UseBiasedLocking` JVM flag. See [Logging Stop-the-world Pauses in JVM](https://dzone.com/articles/logging-stop-world-pauses-jvm).
@ -447,7 +447,7 @@ We recommend using UTC timezone for all your events and across your hosts, not j
#### SSDs
SSDs are highly recommended for Historical, MiddleManager, and Indexer processes if you are not running a cluster that is entirely in memory. SSDs can greatly mitigate the time required to page data in and out of memory.
SSDs are highly recommended for Historical, Middle Manager, and Indexer processes if you are not running a cluster that is entirely in memory. SSDs can greatly mitigate the time required to page data in and out of memory.
#### JBOD vs RAID
@ -455,11 +455,11 @@ Historical processes store large number of segments on Disk and support specifyi
#### Swap space
We recommend _not_ using swap space for Historical, MiddleManager, and Indexer processes since due to the large number of memory mapped segment files can lead to poor and unpredictable performance.
We recommend _not_ using swap space for Historical, Middle Manager, and Indexer processes, since the large number of memory-mapped segment files can lead to poor and unpredictable performance.
#### Linux limits
For Historical, MiddleManager, and Indexer processes (and for really large clusters, Broker processes), you might need to adjust some Linux system limits to account for a large number of open files, a large number of network connections, or a large number of memory mapped files.
For Historical, Middle Manager, and Indexer processes (and for really large clusters, Broker processes), you might need to adjust some Linux system limits to account for a large number of open files, a large number of network connections, or a large number of memory mapped files.
##### ulimit
@ -467,4 +467,4 @@ The limit on the number of open files can be set permanently by editing `/etc/se
##### max_map_count
Historical processes and to a lesser extent, MiddleManager and Indexer processes memory map segment files, so depending on the number of segments per server, `/proc/sys/vm/max_map_count` might also need to be adjusted. Depending on the variant of Linux, this might be done via `sysctl` by placing a file in `/etc/sysctl.d/` that sets `vm.max_map_count`.
Historical processes, and to a lesser extent Middle Manager and Indexer processes, memory map segment files, so depending on the number of segments per server, `/proc/sys/vm/max_map_count` might also need to be adjusted. Depending on the variant of Linux, this might be done via `sysctl` by placing a file in `/etc/sysctl.d/` that sets `vm.max_map_count`.

View File

@ -119,6 +119,7 @@ Most metric values reset each emission period, as specified in `druid.monitoring
|`query/failed/count`|Number of failed queries.|This metric is only available if the `QueryCountStatsMonitor` module is included.||
|`query/interrupted/count`|Number of queries interrupted due to cancellation.|This metric is only available if the `QueryCountStatsMonitor` module is included.||
|`query/timeout/count`|Number of timed out queries.|This metric is only available if the `QueryCountStatsMonitor` module is included.||
|`mergeBuffer/pendingRequests`|Number of requests waiting to acquire a batch of buffers from the merge buffer pool.|This metric is only available if the `QueryCountStatsMonitor` module is included.||
### Jetty
@ -382,6 +383,9 @@ These metrics are emitted by the Druid Coordinator in every run of the correspon
|`metadatacache/finalizedSchemaPayload/count`|Number of finalized segment schema cached.||Depends on the number of distinct schema in the cluster.|
|`metadatacache/temporaryMetadataQueryResults/count`|Number of segments for which schema was fetched by executing segment metadata query.||Eventually it should be 0.|
|`metadatacache/temporaryPublishedMetadataQueryResults/count`|Number of segments for which schema is cached after back filling in the database.||This value gets reset after each database poll. Eventually it should be 0.|
|`metadatacache/deepStorageOnly/segment/count`|Number of available segments present only in deep storage.|`dataSource`||
|`metadatacache/deepStorageOnly/refresh/count`|Number of deep storage only segments with cached schema.|`dataSource`||
|`metadatacache/deepStorageOnly/process/time`|Time taken in milliseconds to process deep storage only segment schema.||Under a minute|
## General Health

View File

@ -55,7 +55,7 @@ Generally, you should only set one of these parameters, not both.
These properties can be set in either one of the following ways:
- Using the task definition, e.g. add `"mapreduce.job.classloader": "true"` to the `jobProperties` of the `tuningConfig` of your indexing task (see the [Hadoop batch ingestion documentation](../ingestion/hadoop.md)).
- Using system properties, e.g. on the MiddleManager set `druid.indexer.runner.javaOpts=... -Dhadoop.mapreduce.job.classloader=true` in [Middle Manager configuration](../configuration/index.md#middlemanager-configuration).
- Using system properties, e.g. on the Middle Manager set `druid.indexer.runner.javaOpts=... -Dhadoop.mapreduce.job.classloader=true` in [Middle Manager configuration](../configuration/index.md#middle-manager-configuration).
### Overriding specific classes

View File

@ -30,7 +30,7 @@ following order:
2. Middle Manager and Indexer (if any)
3. Broker
4. Router
5. Overlord (Note that you can upgrade the Overlord before any MiddleManager processes if you use [autoscaling-based replacement](#autoscaling-based-replacement).)
5. Overlord (Note that you can upgrade the Overlord before any Middle Manager processes if you use [autoscaling-based replacement](#autoscaling-based-replacement).)
6. Coordinator ( or merged Coordinator+Overlord )
If you need to do a rolling downgrade, reverse the order and start with the Coordinator processes.
@ -70,14 +70,14 @@ Middle Managers can be gracefully terminated using the "disable" API. This works
even tasks that are not restorable.
To prepare a Middle Manager for update, send a POST request to
`<MiddleManager_IP:PORT>/druid/worker/v1/disable`. The Overlord will now no longer send tasks to
`<Middle_Manager_IP:PORT>/druid/worker/v1/disable`. The Overlord will now no longer send tasks to
this Middle Manager. Tasks that have already started will run to completion. Current state can be checked
using `<MiddleManager_IP:PORT>/druid/worker/v1/enabled` .
using `<Middle_Manager_IP:PORT>/druid/worker/v1/enabled`.
To view all existing tasks, send a GET request to `<MiddleManager_IP:PORT>/druid/worker/v1/tasks`.
To view all existing tasks, send a GET request to `<Middle_Manager_IP:PORT>/druid/worker/v1/tasks`.
When this list is empty, you can safely update the Middle Manager. After the Middle Manager starts
back up, it is automatically enabled again. You can also manually enable Middle Managers by POSTing
to `<MiddleManager_IP:PORT>/druid/worker/v1/enable`.
to `<Middle_Manager_IP:PORT>/druid/worker/v1/enable`.
### Autoscaling-based replacement

View File

@ -277,7 +277,9 @@ Set the following property:
## Broadcast rules
Druid extensions use broadcast rules to load segment data onto all brokers in the cluster. Apply broadcast rules in a test environment, not in production.
Druid extensions use broadcast rules to load segment data onto all Brokers in the cluster. Apply broadcast rules in a test environment, not in production.
To use broadcast rules, ensure that `druid.segmentCache.locations` is configured on both Brokers and Historicals.
This ensures that Druid can load the segments onto those servers. For more information, see [Segment cache size](../operations/basic-cluster-tuning.md#segment-cache-size).
### Forever broadcast rule

View File

@ -87,7 +87,7 @@ It is equivalent to the **Task** view in the **Ingestion** view with the filter
9. The **Preview** button appears when you enter an INSERT/REPLACE query. It runs the query inline without the INSERT/REPLACE clause and with an added LIMIT to give you a preview of the data that would be ingested if you click **Run**.
The added LIMIT makes the query run faster but provides incomplete results.
10. The engine selector lets you choose which engine (API endpoint) to send a query to. By default, it automatically picks which endpoint to use based on an analysis of the query, but you can select a specific engine explicitly. You can also configure the engine-specific context parameters from this menu.
11. The **Max tasks** picker appears when you have the **sql-msq-task** engine selected. It lets you configure the degree of parallelism.
11. The **Max tasks** picker appears when you have the **SQL MSQ-task** engine selected. It lets you configure the degree of parallelism.
12. The More menu (**...**) contains the following helpful tools:
- **Explain SQL query** shows you the logical plan returned by `EXPLAIN PLAN FOR` for a SQL query.
- **Query history** shows you previously executed queries.

View File

@ -30,7 +30,8 @@ sidebar_label: "Granularities"
[SQL documentation](sql-scalar.md#date-and-time-functions).
:::
Granularity determines how to bucket data across the time dimension, or how to aggregate data by hour, day, minute, etc.
Granularity determines how to bucket data across the time dimension, or how to aggregate data by hour, day, minute, and so on, and defines how the data is stored.
The granularity formats here also apply to `segmentGranularity` and `queryGranularity` in the `granularitySpec` section of the [ingestion spec](../ingestion/ingestion-spec.md#granularityspec).
For example, use time granularities in [native queries](querying.md) to bucket results by time, and in the `dataSchema` \\ [`granularitySpec`](../ingestion/ingestion-spec.md#granularityspec) section of ingestion specifications to segment incoming data.

View File

@ -337,7 +337,7 @@ dictionary that can spill to disk. The outer query is run on the Broker in a sin
### Configurations
This section describes the configurations for groupBy queries. You can set the runtime properties in the `runtime.properties` file on Broker, Historical, and MiddleManager processes. You can set the query context parameters through the [query context](query-context.md).
This section describes the configurations for groupBy queries. You can set the runtime properties in the `runtime.properties` file on Broker, Historical, and Middle Manager processes. You can set the query context parameters through the [query context](query-context.md).
Supported runtime properties:

View File

@ -47,7 +47,7 @@ by range using the [`single_dim` partitionsSpec](../ingestion/native-batch.md#pa
the dimension used for partitioning.
3. The Broker, having pruned the list of segments for the query, forwards the query to data servers (like Historicals
and tasks running on MiddleManagers) that are currently serving those segments.
and tasks running on Middle Managers) that are currently serving those segments.
4. For all query types except [Scan](scan-query.md), data servers process each segment in parallel and generate partial
results for each segment. The specific processing that is done depends on the query type. These partial results may be

View File

@ -28,13 +28,20 @@ Druid can query segments that are only stored in deep storage. Running a query f
Query from deep storage requires the Multi-stage query (MSQ) task engine. Load the extension for it if you don't already have it enabled before you begin. See [enable MSQ](../multi-stage-query/index.md#load-the-extension) for more information.
To be queryable, your datasource must meet one of the following conditions:
- At least one segment from the datasource is loaded onto a Historical service for Druid to plan the query. This segment can be any segment from the datasource. You can verify that a datasource has at least one segment on a Historical service by checking whether the datasource is visible in the Druid console.
- You have the centralized datasource schema feature enabled. For more information, see [Centralized datasource schema](../configuration/index.md#centralized-datasource-schema).
If you use centralized datasource schema, any datasource created before you enabled the feature requires an additional step to make it queryable from deep storage: load its segments from deep storage onto a Historical so that the schema can be backfilled in the metadata database. You can load some or all of the segments that are only in deep storage. If you don't load all the segments, any dimensions that exist only in the segments you didn't load will not be in the queryable datasource schema and won't be queryable from deep storage. That is, only the dimensions that are present in the segment schema in the metadata database are queryable. Once that process is complete, you can unload all the segments from the Historical and keep the data only in deep storage.
## Keep segments in deep storage only
Any data you ingest into Druid is already stored in deep storage, so you don't need to perform any additional configuration from that perspective. However, to take advantage of the cost savings that querying from deep storage provides, make sure not all your segments get loaded onto Historical processes.
Any data you ingest into Druid is already stored in deep storage, so you don't need to perform any additional configuration from that perspective. However, to take advantage of the cost savings that querying from deep storage provides, make sure not all your segments get loaded onto Historical processes. If you use centralized datasource schema, a datasource can be kept only in deep storage but remain queryable.
To do this, configure [load rules](../operations/rule-configuration.md#load-rules) to manage the which segments are only in deep storage and which get loaded onto Historical processes.
To manage which segments are kept only in deep storage and which get loaded onto Historical processes, configure [load rules](../operations/rule-configuration.md#load-rules).
The easiest way to do this is to explicitly configure the segments that don't get loaded onto Historical processes. Set `tieredReplicants` to an empty array and `useDefaultTierForNull` to `false`. For example, if you configure the following rule for a datasource:
The easiest way to keep segments only in deep storage is to explicitly configure the segments that don't get loaded onto Historical processes. Set `tieredReplicants` to an empty array and `useDefaultTierForNull` to `false`. For example, if you configure the following rule for a datasource:
```json
[
@ -64,10 +71,7 @@ Segments with a `replication_factor` of `0` are not assigned to any Historical t
You can also confirm this through the Druid console. On the **Segments** page, see the **Replication factor** column.
Keep the following in mind when working with load rules to control what exists only in deep storage:
- At least one of the segments in a datasource must be loaded onto a Historical process so that Druid can plan the query. The segment on the Historical process can be any segment from the datasource. It does not need to be a specific segment. One way to verify that a datasource has at least one segment on a Historical process is if it's visible in the Druid console.
- The actual number of replicas may differ from the replication factor temporarily as Druid processes your load rules.
Note that the actual number of replicas may differ from the replication factor temporarily as Druid processes your load rules.
## Run a query from deep storage

View File

@ -27,8 +27,8 @@ This topic provides a high-level overview of how Apache Druid distributes and pr
The general flow is as follows:
1. A query enters the [Broker](../design/broker.md) service, which identifies the segments with data that may pertain to that query. The list of segments is always pruned by time, and may also be pruned by other attributes depending on how the datasource is partitioned.
2. The Broker identifies which [Historical](../design/historical.md) and [MiddleManager](../design/middlemanager.md) services are serving those segments and distributes a rewritten subquery to each of the services.
3. The Historical and MiddleManager services execute each subquery and return results to the Broker.
2. The Broker identifies which [Historical](../design/historical.md) and [Middle Manager](../design/middlemanager.md) services are serving those segments and distributes a rewritten subquery to each of the services.
3. The Historical and Middle Manager services execute each subquery and return results to the Broker.
4. The Broker merges the partial results to get the final answer, which it returns to the original caller.
Druid uses time and attribute pruning to minimize the data it must scan for each query.

File diff suppressed because it is too large Load Diff

View File

@ -50,11 +50,11 @@ Configure Druid SQL query planning using the parameters in the table below.
|`sqlReverseLookup`|Whether to consider the [reverse-lookup rewrite](lookups.md#reverse-lookup) of the `LOOKUP` function during SQL planning.<br /><br />Calls to `LOOKUP` are only reversed when the number of matching keys is lower than both `inSubQueryThreshold` and `sqlReverseLookupThreshold`.|true|
|`sqlReverseLookupThreshold`|Maximum size of `IN` filter to create when applying a [reverse-lookup rewrite](lookups.md#reverse-lookup). If a `LOOKUP` call matches more keys than this threshold, it is left as-is.<br /><br />If `inSubQueryThreshold` is lower than `sqlReverseLookupThreshold`, the `inSubQueryThreshold` is used as the threshold instead.|10000|
|`sqlPullUpLookup`|Whether to consider the [pull-up rewrite](lookups.md#pull-up) of the `LOOKUP` function during SQL planning.|true|
|`enableJoinLeftTableScanDirect`|`false`|This flag applies to queries which have joins. For joins, where left child is a simple scan with a filter, by default, druid will run the scan as a query and the join the results to the right child on broker. Setting this flag to true overrides that behavior and druid will attempt to push the join to data servers instead. Please note that the flag could be applicable to queries even if there is no explicit join. since queries can internally translated into a join by the SQL planner.|
|`maxNumericInFilters`|`-1`|Max limit for the amount of numeric values that can be compared for a string type dimension when the entire SQL WHERE clause of a query translates only to an [OR](../querying/filters.md#or) of [Bound filter](../querying/filters.md#bound-filter). By default, Druid does not restrict the amount of of numeric Bound Filters on String columns, although this situation may block other queries from running. Set this parameter to a smaller value to prevent Druid from running queries that have prohibitively long segment processing times. The optimal limit requires some trial and error; we recommend starting with 100. Users who submit a query that exceeds the limit of `maxNumericInFilters` should instead rewrite their queries to use strings in the `WHERE` clause instead of numbers. For example, `WHERE someString IN (123, 456)`. This value cannot exceed the set system configuration `druid.sql.planner.maxNumericInFilters`. This value is ignored if `druid.sql.planner.maxNumericInFilters` is not set explicitly.|
|`inFunctionThreshold`|`100`| At or beyond this threshold number of values, SQL `IN` is converted to [`SCALAR_IN_ARRAY`](sql-functions.md#scalar_in_array). A threshold of 0 forces this conversion in all cases. A threshold of [Integer.MAX_VALUE] disables this conversion. The converted function is eligible for fewer planning-time optimizations, which speeds up planning, but may prevent certain planning-time optimizations.|
|`inFunctionExprThreshold`|`2`| At or beyond this threshold number of values, SQL `IN` is eligible for execution using the native function `scalar_in_array` rather than an <code>&#124;&#124;</code> of `==`, even if the number of values is below `inFunctionThreshold`. This property only affects translation of SQL `IN` to a [native expression](math-expr.md). It does not affect translation of SQL `IN` to a [native filter](filters.md). This property is provided for backwards compatibility purposes, and may be removed in a future release.|
|`inSubQueryThreshold`|`2147483647`| At or beyond this threshold number of values, SQL `IN` is converted to `JOIN` on an inline table. `inFunctionThreshold` takes priority over this setting. A threshold of 0 forces usage of an inline table in all cases where the size of a SQL `IN` is larger than `inFunctionThreshold`. A threshold of `2147483647` disables the rewrite of SQL `IN` to `JOIN`. |
|`enableJoinLeftTableScanDirect`|This flag applies to queries that have joins. For joins where the left child is a simple scan with a filter, by default Druid runs the scan as a query and then joins the results to the right child on the Broker. Setting this flag to true overrides that behavior, and Druid attempts to push the join down to the data servers instead. Note that the flag can apply to queries even if there is no explicit join, since queries can be internally translated into a join by the SQL planner.|`false`|
|`maxNumericInFilters`|Maximum number of numeric values that can be compared for a string type dimension when the entire SQL WHERE clause of a query translates only to an [OR](../querying/filters.md#or) of [Bound filters](../querying/filters.md#bound-filter). By default, Druid does not restrict the number of numeric Bound Filters on String columns, although this situation may block other queries from running. Set this parameter to a smaller value to prevent Druid from running queries that have prohibitively long segment processing times. The optimal limit requires some trial and error; we recommend starting with 100. Users who submit a query that exceeds the limit of `maxNumericInFilters` should instead rewrite their queries to use strings in the `WHERE` clause instead of numbers. For example, `WHERE someString IN ('123', '456')`. This value cannot exceed the set system configuration `druid.sql.planner.maxNumericInFilters`. This value is ignored if `druid.sql.planner.maxNumericInFilters` is not set explicitly.|`-1`|
|`inFunctionThreshold`|At or beyond this threshold number of values, SQL `IN` is converted to [`SCALAR_IN_ARRAY`](sql-functions.md#scalar_in_array). A threshold of 0 forces this conversion in all cases. A threshold of `Integer.MAX_VALUE` disables this conversion. The converted function is eligible for fewer planning-time optimizations, which speeds up planning but may prevent certain planning-time optimizations from being applied.|`100`|
|`inFunctionExprThreshold`|At or beyond this threshold number of values, SQL `IN` is eligible for execution using the native function `scalar_in_array` rather than an <code>&#124;&#124;</code> of `==`, even if the number of values is below `inFunctionThreshold`. This property only affects translation of SQL `IN` to a [native expression](math-expr.md). It does not affect translation of SQL `IN` to a [native filter](filters.md). This property is provided for backwards compatibility purposes, and may be removed in a future release.|`2`|
|`inSubQueryThreshold`|At or beyond this threshold number of values, SQL `IN` is converted to `JOIN` on an inline table. `inFunctionThreshold` takes priority over this setting. A threshold of 0 forces usage of an inline table in all cases where the size of a SQL `IN` is larger than `inFunctionThreshold`. A threshold of `2147483647` disables the rewrite of SQL `IN` to `JOIN`. |`2147483647`|
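For example, with the default `inFunctionThreshold` of 100, a filter like the following hypothetical query (datasource and column names are placeholders, and the value list is abbreviated) is planned using `SCALAR_IN_ARRAY` once the `IN` list reaches 100 values:
```sql
-- Abbreviated for illustration; imagine 100 or more values in the IN list.
SELECT "channel", COUNT(*) AS "edits"
FROM "wikipedia"
WHERE "countryIsoCode" IN ('US', 'GB', 'DE', 'FR', 'JP')
GROUP BY "channel"
```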
## Setting the query context
The query context parameters can be specified as a "context" object in the [JSON API](../api-reference/sql-api.md) or as a [JDBC connection properties object](../api-reference/sql-jdbc.md).

View File

@ -102,22 +102,22 @@ String functions accept strings and return a type appropriate to the function.
|`CHAR_LENGTH(expr)`|Alias for `LENGTH`.|
|`CHARACTER_LENGTH(expr)`|Alias for `LENGTH`.|
|`STRLEN(expr)`|Alias for `LENGTH`.|
|`LOOKUP(expr, lookupName[, replaceMissingValueWith])`|Looks up `expr` in an existing [query-time lookup table](lookups.md) that has the name `lookupName`. Returns the optional constant `replaceMissingValueWith` when `expr` is null or when the lookup does not contain a value for `expr`.<br /><br />You can query lookups directly using the [`lookup` schema](sql.md#from).|
|`LOOKUP(expr, lookupName[, replaceMissingValueWith])`|Searches for `expr` in a registered [query-time lookup table](lookups.md) named `lookupName` and returns the mapped value. If `expr` is null or not contained in the lookup, returns `replaceMissingValueWith` if supplied, otherwise returns null.<br /><br />You can query lookups directly using the [`lookup` schema](sql.md#from).|
|`LOWER(expr)`|Returns `expr` in all lowercase.|
|`UPPER(expr)`|Returns `expr` in all uppercase.|
|`LPAD(expr, length[, chars])`|Returns a string of `length` from `expr` left-padded with `chars`. If `length` is shorter than the length of `expr`, the result is `expr` which is truncated to `length`. The result is null if either `expr` or `chars` is null. If `chars` is an empty string, no padding is added; however, `expr` may be trimmed if necessary.|
|`RPAD(expr, length[, chars])`|Returns a string of `length` from `expr` right-padded with `chars`. If `length` is shorter than the length of `expr`, the result is `expr` which is truncated to `length`. The result is null if either `expr` or `chars` is null. If `chars` is an empty string, no padding is added; however, `expr` may be trimmed if necessary.|
|`PARSE_LONG(string[, radix])`|Parses a string into a long (BIGINT) with the given radix or 10 (decimal) if a radix is not provided.|
|`LPAD(expr, length[, chars])`|Returns a string of `length` from `expr`. If `expr` is shorter than `length`, left pads `expr` with `chars`, which defaults to space characters. If `expr` exceeds `length`, truncates `expr` to equal `length`. If `chars` is an empty string, no padding is added. Returns `null` if either `expr` or `chars` is null.|
|`RPAD(expr, length[, chars])`|Returns a string of `length` from `expr`. If `expr` is shorter than `length`, right pads `expr` with `chars`, which defaults to space characters. If `expr` exceeds `length`, truncates `expr` to equal `length`. If `chars` is an empty string, no padding is added. Returns `null` if either `expr` or `chars` is null.|
|`PARSE_LONG(string[, radix])`|Parses a string into a long (BIGINT) with the given radix, or 10 (decimal) if a radix is not provided.|
|`POSITION(substring IN expr [FROM startingIndex])`|Returns the index of `substring` within `expr` with indexes starting from 1. The search begins at `startingIndex`. If `startingIndex` is not specified, the default is 1. If `substring` is not found, returns 0.|
|`REGEXP_EXTRACT(expr, pattern[, index])`|Apply regular expression `pattern` to `expr` and extract a capture group or `NULL` if there is no match. If `index` is unspecified or zero, returns the first substring that matched the pattern. The pattern may match anywhere inside `expr`. To match the entire string, use the `^` and `$` markers at the start and end of your pattern. When `druid.generic.useDefaultValueForNull = true`, it is not possible to differentiate an empty-string match from a non-match (both return `NULL`).|
|`REGEXP_EXTRACT(expr, pattern[, index])`|Apply regular expression `pattern` to `expr` and extract a capture group or `NULL` if there is no match. If `index` is unspecified or zero, returns the first substring that matches the pattern. The pattern may match anywhere inside `expr`. To match the entire string, use the `^` and `$` markers at the start and end of your pattern. When `druid.generic.useDefaultValueForNull = true`, it is not possible to differentiate an empty-string match from a non-match (both return `NULL`).|
|`REGEXP_LIKE(expr, pattern)`|Returns whether `expr` matches regular expression `pattern`. The pattern may match anywhere inside `expr`; if you want to match the entire string instead, use the `^` and `$` markers at the start and end of your pattern. Similar to [`LIKE`](sql-operators.md#logical-operators), but uses regexps instead of LIKE patterns. Especially useful in WHERE clauses.|
|`REGEXP_REPLACE(expr, pattern, replacement)`|Replaces all occurrences of regular expression `pattern` within `expr` with `replacement`. The replacement string may refer to capture groups using `$1`, `$2`, etc. The pattern may match anywhere inside `expr`; if you want to match the entire string instead, use the `^` and `$` markers at the start and end of your pattern.|
|`REPLACE(expr, pattern, replacement)`|Replaces pattern with replacement in `expr`, and returns the result.|
|`REPLACE(expr, substring, replacement)`|Replaces instances of `substring` in `expr` with `replacement` and returns the result.|
|`REPEAT(expr, N)`|Repeats `expr` `N` times.|
|`REVERSE(expr)`|Reverses `expr`.|
|`STRING_FORMAT(pattern[, args...])`|Returns a string formatted in the manner of Java's [String.format](https://docs.oracle.com/javase/8/docs/api/java/lang/String.html#format-java.lang.String-java.lang.Object...-).|
|`STRPOS(haystack, needle)`|Returns the index of `needle` within `haystack`, with indexes starting from 1. If `needle` is not found, returns 0.|
|`SUBSTRING(expr, index[, length])`|Returns a substring of `expr` starting at `index`, with a max `length`, both measured in UTF-16 code units.|
|`STRPOS(expr, substring)`|Returns the index of `substring` within `expr`, with indexes starting from 1. If `substring` is not found, returns 0.|
|`SUBSTRING(expr, index[, length])`|Returns a substring of `expr` starting at a given one-based index. If `length` is omitted, extracts characters to the end of the string, otherwise returns a substring of `length` UTF-16 characters.|
|`SUBSTR(expr, index[, length])`|Alias for `SUBSTRING`.|
|`TRIM([BOTH `<code>&#124;</code>` LEADING `<code>&#124;</code>` TRAILING] [chars FROM] expr)`|Returns `expr` with characters removed from the leading, trailing, or both ends of `expr` if they are in `chars`. If `chars` is not provided, it defaults to `' '` (a space). If the directional argument is not provided, it defaults to `BOTH`.|
|`BTRIM(expr[, chars])`|Alternate form of `TRIM(BOTH chars FROM expr)`.|
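As a quick illustration of a few of these functions, consider the following hypothetical query; the `wikipedia` datasource, the `page` column, and the `page_to_team` lookup are assumptions.
```sql
-- Illustrative only; "page" is assumed to be a VARCHAR column and
-- 'page_to_team' an existing lookup.
SELECT
  LOOKUP("page", 'page_to_team', 'unknown') AS "team",       -- lookup with a fallback value
  LPAD("page", 10, '*') AS "padded",                         -- pad or truncate to 10 characters
  REGEXP_EXTRACT("page", '^([^/]+)', 1) AS "first_segment",  -- first capture group
  SUBSTRING("page", 1, 3) AS "prefix"                        -- three characters starting at index 1
FROM "wikipedia"
LIMIT 5
```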
@ -143,11 +143,11 @@ Literal timestamps in the connection time zone can be written using `TIMESTAMP '
simplest way to write literal timestamps in other time zones is to use TIME_PARSE, like
`TIME_PARSE('2000-02-01 00:00:00', NULL, 'America/Los_Angeles')`.
The best ways to filter based on time are by using ISO8601 intervals, like
The best way to filter based on time is by using ISO 8601 intervals, like
`TIME_IN_INTERVAL(__time, '2000-01-01/2000-02-01')`, or by using literal timestamps with the `>=` and `<` operators, like
`__time >= TIMESTAMP '2000-01-01 00:00:00' AND __time < TIMESTAMP '2000-02-01 00:00:00'`.
Druid supports the standard SQL BETWEEN operator, but we recommend avoiding it for time filters. BETWEEN is inclusive
Druid supports the standard SQL `BETWEEN` operator, but we recommend avoiding it for time filters. `BETWEEN` is inclusive
of its upper bound, which makes it awkward to write time filters correctly. For example, the equivalent of
`TIME_IN_INTERVAL(__time, '2000-01-01/2000-02-01')` is
`__time BETWEEN TIMESTAMP '2000-01-01 00:00:00' AND TIMESTAMP '2000-01-31 23:59:59.999'`.
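For example, both of the following hypothetical queries (the `wikipedia` datasource is a placeholder) select exactly one month of data without the inclusive-upper-bound pitfall of `BETWEEN`:
```sql
-- Interval form: the start instant is inclusive, the end instant is exclusive.
SELECT COUNT(*) FROM "wikipedia"
WHERE TIME_IN_INTERVAL("__time", '2000-01-01/2000-02-01');

-- Equivalent comparison form using >= and <.
SELECT COUNT(*) FROM "wikipedia"
WHERE "__time" >= TIMESTAMP '2000-01-01 00:00:00'
  AND "__time" < TIMESTAMP '2000-02-01 00:00:00';
```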
@ -160,16 +160,16 @@ overhead.
|Function|Notes|
|--------|-----|
|`CURRENT_TIMESTAMP`|Current timestamp in the connection's time zone.|
|`CURRENT_DATE`|Current date in the connection's time zone.|
|`CURRENT_TIMESTAMP`|Current timestamp in UTC time, unless you specify a different timezone in the query context.|
|`CURRENT_DATE`|Current date in UTC time, unless you specify a different timezone in the query context.|
|`DATE_TRUNC(unit, timestamp_expr)`|Rounds down a timestamp, returning it as a new timestamp. Unit can be 'milliseconds', 'second', 'minute', 'hour', 'day', 'week', 'month', 'quarter', 'year', 'decade', 'century', or 'millennium'.|
|`TIME_CEIL(timestamp_expr, period[, origin[, timezone]])`|Rounds up a timestamp, returning it as a new timestamp. Period can be any ISO8601 period, like P3M (quarters) or PT12H (half-days). Specify `origin` as a timestamp to set the reference time for rounding. For example, `TIME_CEIL(__time, 'PT1H', TIMESTAMP '2016-06-27 00:30:00')` measures an hourly period from 00:30-01:30 instead of 00:00-01:00. See [Period granularities](granularities.md) for details on the default starting boundaries. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00". This function is similar to `CEIL` but is more flexible.|
|`TIME_FLOOR(timestamp_expr, period[, origin[, timezone]])`|Rounds down a timestamp, returning it as a new timestamp. Period can be any ISO8601 period, like P3M (quarters) or PT12H (half-days). Specify `origin` as a timestamp to set the reference time for rounding. For example, `TIME_FLOOR(__time, 'PT1H', TIMESTAMP '2016-06-27 00:30:00')` measures an hourly period from 00:30-01:30 instead of 00:00-01:00. See [Period granularities](granularities.md) for details on the default starting boundaries. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00". This function is similar to `FLOOR` but is more flexible.|
|`TIME_SHIFT(timestamp_expr, period, step[, timezone])`|Shifts a timestamp by a period (step times), returning it as a new timestamp. Period can be any ISO8601 period. Step may be negative. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00".|
|`TIME_EXTRACT(timestamp_expr[, unit[, timezone]])`|Extracts a time part from `expr`, returning it as a number. Unit can be EPOCH, SECOND, MINUTE, HOUR, DAY (day of month), DOW (day of week), DOY (day of year), WEEK (week of [week year](https://en.wikipedia.org/wiki/ISO_week_date)), MONTH (1 through 12), QUARTER (1 through 4), or YEAR. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00". This function is similar to `EXTRACT` but is more flexible. Unit and time zone must be literals, and must be provided quoted, like `TIME_EXTRACT(__time, 'HOUR')` or `TIME_EXTRACT(__time, 'HOUR', 'America/Los_Angeles')`.|
|`TIME_PARSE(string_expr[, pattern[, timezone]])`|Parses a string into a timestamp using a given [Joda DateTimeFormat pattern](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html), or ISO8601 (e.g. `2000-01-02T03:04:05Z`) if the pattern is not provided. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00", and will be used as the time zone for strings that do not include a time zone offset. Pattern and time zone must be literals. Strings that cannot be parsed as timestamps will be returned as NULL.|
|`TIME_FORMAT(timestamp_expr[, pattern[, timezone]])`|Formats a timestamp as a string with a given [Joda DateTimeFormat pattern](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html), or ISO8601 (e.g. `2000-01-02T03:04:05Z`) if the pattern is not provided. The time zone, if provided, should be a time zone name like "America/Los_Angeles" or offset like "-08:00". Pattern and time zone must be literals.|
|`TIME_IN_INTERVAL(timestamp_expr, interval)`|Returns whether a timestamp is contained within a particular interval. The interval must be a literal string containing any ISO8601 interval, such as `'2001-01-01/P1D'` or `'2001-01-01T01:00:00/2001-01-02T01:00:00'`. The start instant of the interval is inclusive and the end instant is exclusive.|
|`TIME_CEIL(timestamp_expr, period[, origin[, timezone]])`|Rounds up a timestamp, returning it as a new timestamp. Period can be any ISO 8601 period, like P3M (quarters) or PT12H (half-days). Specify `origin` as a timestamp to set the reference time for rounding. For example, `TIME_CEIL(__time, 'PT1H', TIMESTAMP '2016-06-27 00:30:00')` measures an hourly period from 00:30-01:30 instead of 00:00-01:00. See [Period granularities](granularities.md) for details on the default starting boundaries. The time zone, if provided, should be a time zone name like `America/Los_Angeles` or an offset like `-08:00`. This function is similar to `CEIL` but is more flexible.|
|`TIME_FLOOR(timestamp_expr, period[, origin[, timezone]])`|Rounds down a timestamp, returning it as a new timestamp. Period can be any ISO 8601 period, like P3M (quarters) or PT12H (half-days). Specify `origin` as a timestamp to set the reference time for rounding. For example, `TIME_FLOOR(__time, 'PT1H', TIMESTAMP '2016-06-27 00:30:00')` measures an hourly period from 00:30-01:30 instead of 00:00-01:00. See [Period granularities](granularities.md) for details on the default starting boundaries. The time zone, if provided, should be a time zone name like `America/Los_Angeles` or an offset like `-08:00`. This function is similar to `FLOOR` but is more flexible.|
|`TIME_SHIFT(timestamp_expr, period, step[, timezone])`|Shifts a timestamp by a period (step times), returning it as a new timestamp. The `period` parameter can be any ISO 8601 period. The `step` parameter can be negative. The time zone, if provided, should be a time zone name like `America/Los_Angeles` or an offset like `-08:00`.|
|`TIME_EXTRACT(timestamp_expr, unit[, timezone])`|Extracts a time part from `expr`, returning it as a number. Unit can be EPOCH, SECOND, MINUTE, HOUR, DAY (day of month), DOW (day of week), DOY (day of year), WEEK (week of [week year](https://en.wikipedia.org/wiki/ISO_week_date)), MONTH (1 through 12), QUARTER (1 through 4), or YEAR. The time zone, if provided, should be a time zone name like `America/Los_Angeles` or an offset like `-08:00`. The `unit` and `timezone` parameters must be provided as quoted literals, such as `TIME_EXTRACT(__time, 'HOUR')` or `TIME_EXTRACT(__time, 'HOUR', 'America/Los_Angeles')`. This function is similar to `EXTRACT` but is more flexible. |
|`TIME_PARSE(string_expr[, pattern[, timezone]])`|Parses a string into a timestamp using a given [Joda DateTimeFormat pattern](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html), or ISO 8601 (e.g. `2000-01-02T03:04:05Z`) if the pattern is not provided. The `timezone` parameter is used as the time zone for strings that do not already include a time zone offset. If provided, `timezone` should be a time zone name like `America/Los_Angeles` or an offset like `-08:00`. The `pattern` and `timezone` parameters must be literals. Strings that cannot be parsed as timestamps return NULL.|
|`TIME_FORMAT(timestamp_expr[, pattern[, timezone]])`|Formats a timestamp as a string with a given [Joda DateTimeFormat pattern](http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html), or ISO 8601 (e.g. `2000-01-02T03:04:05Z`) if the pattern is not provided. If provided, the `timezone` parameter should be a time zone name like `America/Los_Angeles` or an offset like `-08:00`. The `pattern` and `timezone` parameters must be literals.|
|`TIME_IN_INTERVAL(timestamp_expr, interval)`|Returns whether a timestamp is contained within a particular interval. The interval must be a literal string containing any ISO 8601 interval, such as `'2001-01-01/P1D'` or `'2001-01-01T01:00:00/2001-01-02T01:00:00'`. The start instant of the interval is inclusive and the end instant is exclusive.|
|`MILLIS_TO_TIMESTAMP(millis_expr)`|Converts a number of milliseconds since the epoch (1970-01-01 00:00:00 UTC) into a timestamp.|
|`TIMESTAMP_TO_MILLIS(timestamp_expr)`|Converts a timestamp into a number of milliseconds since the epoch.|
|`EXTRACT(unit FROM timestamp_expr)`|Extracts a time part from `expr`, returning it as a number. Unit can be EPOCH, MICROSECOND, MILLISECOND, SECOND, MINUTE, HOUR, DAY (day of month), DOW (day of week), ISODOW (ISO day of week), DOY (day of year), WEEK (week of year), MONTH, QUARTER, YEAR, ISOYEAR, DECADE, CENTURY or MILLENNIUM. Units must be provided unquoted, like `EXTRACT(HOUR FROM __time)`.|
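The following minimal sketch combines several of these functions on a literal timestamp, assuming the default UTC time zone; the expected values shown in comments follow from the definitions above:
```sql
-- Illustrative only: all inputs are literals, so no datasource is required.
SELECT
  TIME_FLOOR(TIMESTAMP '2000-01-02 03:04:05', 'P1D')         AS day_start,     -- 2000-01-02T00:00:00Z
  TIME_SHIFT(TIMESTAMP '2000-01-02 03:04:05', 'P1M', -1)     AS one_month_ago, -- 1999-12-02T03:04:05Z
  TIME_EXTRACT(TIMESTAMP '2000-01-02 03:04:05', 'HOUR')      AS hour_of_day,   -- 3
  TIME_FORMAT(TIMESTAMP '2000-01-02 03:04:05', 'yyyy-MM-dd') AS formatted,     -- '2000-01-02'
  TIMESTAMP_TO_MILLIS(TIMESTAMP '2000-01-02 03:04:05')       AS epoch_millis,  -- milliseconds since 1970-01-01 00:00:00 UTC
  EXTRACT(YEAR FROM TIMESTAMP '2000-01-02 03:04:05')         AS year_part      -- 2000
```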

View File

@ -398,6 +398,21 @@ at execution time. To use dynamic parameters, replace any literal in the query w
corresponding parameter value when you execute the query. Parameters are bound to the placeholders in the order in
which they are passed. Parameters are supported in both the [HTTP POST](../api-reference/sql-api.md) and [JDBC](../api-reference/sql-jdbc.md) APIs.
Druid supports double and null values in arrays for dynamic queries.
The following example query uses the [ARRAY_CONTAINS](./sql-functions.md#array_contains) function to return `doubleArrayColumn` when the reference array `[-25.7, null, 36.85]` contains all elements of the value of `doubleArrayColumn`:
```json
{
"query": "SELECT doubleArrayColumn from druid.table where ARRAY_CONTAINS(?, doubleArrayColumn)",
"parameters": [
{
"type": "ARRAY",
"value": [-25.7, null, 36.85]
}
]
}
```
In certain cases, using dynamic parameters in expressions can cause type inference issues that make your query fail, for example:
```sql
@ -423,3 +438,16 @@ SELECT count(city) from druid.table where SCALAR_IN_ARRAY(city, ?)
```
Sample Java code using dynamic parameters is provided [here](../api-reference/sql-jdbc.md#dynamic-parameters).
## Reserved keywords
Druid SQL reserves certain keywords which are used in its query language. Apache Druid inherits all of the reserved keywords from [Apache Calcite](https://calcite.apache.org/docs/reference.html#keywords). In addition to these, the following reserved keywords are unique to Apache Druid:
* **CLUSTERED**
* **PARTITIONED**
To use the reserved keywords in queries, enclose them in double quotation marks. For example, the reserved keyword **PARTITIONED** can be used in a query if and only if it is correctly quoted:
```sql
SELECT "PARTITIONED" from druid.table
```

View File

@ -184,7 +184,7 @@ PARTITIONED BY MONTH
Druid ingests the data with no null values as follows:
| `__time` | `string_examle` | `number_example`|
| `__time` | `string_example` | `number_example`|
| -- | -- | -- |
| `2024-01-01T00:00:00.000Z`| `my_string`| 99 |
| `2024-01-02T00:00:00.000Z`| `empty`| 0 |
@ -376,7 +376,7 @@ Druid returns the following:
| -- |
| 100 |
| 1 |
| null |
| 1 |
## Learn more

View File

@ -74,7 +74,7 @@ The following are the changes to the default values for the Coordinator service:
#### `GoogleTaskLogs` upload buffer size
Changed the upload buffer size in `GoogleTaskLogs` to 1 MB instead of 15 MB to allow more uploads in parallel and prevent the MiddleManager service from running out of memory.
Changed the upload buffer size in `GoogleTaskLogs` to 1 MB instead of 15 MB to allow more uploads in parallel and prevent the Middle Manager service from running out of memory.
[#16236](https://github.com/apache/druid/pull/16236)

View File

@ -32,7 +32,7 @@ your needs.
This simple cluster will feature:
- A Master server to host the Coordinator and Overlord processes
- Two scalable, fault-tolerant Data servers running Historical and MiddleManager processes
- Two scalable, fault-tolerant Data servers running Historical and Middle Manager processes
- A query server, hosting the Druid Broker and Router processes
In production, we recommend deploying multiple Master servers and multiple Query servers in a fault-tolerant configuration based on your specific fault-tolerance needs, but you can get started quickly with one Master and one Query server and add more servers later.
@ -58,7 +58,7 @@ Example Master server configurations that have been sized for this hardware can
#### Data server
Historicals and MiddleManagers can be colocated on the same server to handle the actual data in your cluster. These servers benefit greatly from CPU, RAM,
Historicals and Middle Managers can be colocated on the same server to handle the actual data in your cluster. These servers benefit greatly from CPU, RAM,
and SSDs.
In this example, we will be deploying the equivalent of two AWS [i3.4xlarge](https://aws.amazon.com/ec2/instance-types/i3/) instances.
@ -117,7 +117,7 @@ In a clustered deployment, having multiple Data servers is a good idea for fault
When choosing the Data server hardware, you can choose a split factor `N`, divide the original CPU/RAM of the single-server deployment by `N`, and deploy `N` Data servers of reduced size in the new cluster.
Instructions for adjusting the Historical/MiddleManager configs for the split are described in a later section in this guide.
Instructions for adjusting the Historical/Middle Manager configs for the split are described in a later section in this guide.
#### Query server
@ -324,7 +324,7 @@ You can copy your existing `coordinator-overlord` configs from the single-server
#### Data
Suppose we are migrating from a single-server deployment that had 32 CPU and 256GiB RAM. In the old deployment, the following configurations for Historicals and MiddleManagers were applied:
Suppose we are migrating from a single-server deployment that had 32 CPU and 256GiB RAM. In the old deployment, the following configurations for Historicals and Middle Managers were applied:
Historical (Single-server)
@ -334,7 +334,7 @@ druid.processing.numMergeBuffers=8
druid.processing.numThreads=31
```
MiddleManager (Single-server)
Middle Manager (Single-server)
```
druid.worker.capacity=8
@ -351,7 +351,7 @@ Historical
- `druid.processing.numMergeBuffers`: Divide the old value from the single-server deployment by the split factor
- `druid.processing.buffer.sizeBytes`: Keep this unchanged
MiddleManager:
Middle Manager:
- `druid.worker.capacity`: Divide the old value from the single-server deployment by the split factor
- `druid.indexer.fork.property.druid.processing.numMergeBuffers`: Keep this unchanged
@ -368,7 +368,7 @@ druid.processing.numMergeBuffers=4
druid.processing.numThreads=15
```
New MiddleManager (on 2 Data servers)
New Middle Manager (on 2 Data servers)
```
druid.worker.capacity=4
@ -460,8 +460,8 @@ bin/start-cluster-data-server
You can add more Data servers as needed.
:::info
For clusters with complex resource allocation needs, you can break apart Historicals and MiddleManagers and scale the components individually.
This also allows you take advantage of Druid's built-in MiddleManager autoscaling facility.
For clusters with complex resource allocation needs, you can break apart Historicals and Middle Managers and scale the components individually.
This also allows you to take advantage of Druid's built-in Middle Manager autoscaling facility.
:::
## Start Query Server

View File

@ -590,24 +590,18 @@ bin/post-index-task --file quickstart/ingestion-tutorial-index.json --url http:/
After the script completes, we will query the data.
Let's run `bin/dsql` and issue a `select * from "ingestion-tutorial";` query to see what data was ingested.
In the web console, open a new tab in the **Query** view. Run the following query to view the ingested data:
```bash
$ bin/dsql
Welcome to dsql, the command-line client for Druid SQL.
Type "\h" for help.
dsql> select * from "ingestion-tutorial";
┌──────────────────────────┬───────┬──────┬───────┬─────────┬─────────┬─────────┬──────────┬─────────┬─────────┐
│ __time │ bytes │ cost │ count │ dstIP │ dstPort │ packets │ protocol │ srcIP │ srcPort │
├──────────────────────────┼───────┼──────┼───────┼─────────┼─────────┼─────────┼──────────┼─────────┼─────────┤
│ 2018-01-01T01:01:00.000Z │ 6000 │ 4.9 │ 3 │ 2.2.2.2 │ 3000 │ 60 │ 6 │ 1.1.1.1 │ 2000 │
│ 2018-01-01T01:02:00.000Z │ 9000 │ 18.1 │ 2 │ 2.2.2.2 │ 7000 │ 90 │ 6 │ 1.1.1.1 │ 5000 │
│ 2018-01-01T01:03:00.000Z │ 6000 │ 4.3 │ 1 │ 2.2.2.2 │ 7000 │ 60 │ 6 │ 1.1.1.1 │ 5000 │
│ 2018-01-01T02:33:00.000Z │ 30000 │ 56.9 │ 2 │ 8.8.8.8 │ 5000 │ 300 │ 17 │ 7.7.7.7 │ 4000 │
│ 2018-01-01T02:35:00.000Z │ 30000 │ 46.3 │ 1 │ 8.8.8.8 │ 5000 │ 300 │ 17 │ 7.7.7.7 │ 4000 │
└──────────────────────────┴───────┴──────┴───────┴─────────┴─────────┴─────────┴──────────┴─────────┴─────────┘
Retrieved 5 rows in 0.12s.
dsql>
```sql
select * from "ingestion-tutorial"
```
Returns the following:
| `__time` | `bytes` | `cost` | `count` | `dstIP` | `dstPort` | `packets` | `protocol` | `srcIP` | `srcPort` |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| `2018-01-01T01:01:00.000Z` | `6000` | `4.9` | `3` | `2.2.2.2` | `3000` | `60` | `6` | `1.1.1.1` | `2000` |
| `2018-01-01T01:02:00.000Z` | `9000` | `18.1` | `2` | `2.2.2.2` | `7000` | `90` | `6` | `1.1.1.1` | `5000` |
| `2018-01-01T01:03:00.000Z` | `6000` | `4.3` | `1` | `2.2.2.2` | `7000` | `60` | `6` | `1.1.1.1` | `5000` |
| `2018-01-01T02:33:00.000Z` | `30000` | `56.9` | `2` | `8.8.8.8` | `5000` | `300` | `17` | `7.7.7.7` | `4000` |
| `2018-01-01T02:35:00.000Z` | `30000` | `46.3` | `1` | `8.8.8.8` | `5000` | `300` | `17` | `7.7.7.7` | `4000` |

View File

@ -25,7 +25,7 @@ sidebar_label: "Query from deep storage"
Query from deep storage allows you to query segments that are stored only in deep storage, which provides lower costs than if you were to load everything onto Historical processes. The tradeoff is that queries from deep storage may take longer to complete.
This tutorial walks you through loading example data, configuring load rules so that not all the segments get loaded onto Historical processes, and querying data from deep storage.
This tutorial walks you through loading example data, configuring load rules so that not all the segments get loaded onto Historical services, and querying data from deep storage. If you have [centralized datasource schema enabled](../configuration/index.md#centralized-datasource-schema), you can query datasources that are only in deep storage without having any segments available on Historical services.
To run the queries in this tutorial, replace `ROUTER:PORT` with the location of the Router process and its port number. For example, use `localhost:8888` for the quickstart deployment.

View File

@ -100,9 +100,9 @@ In this section you run some queries using aggregate functions and perform some
![aggregate-query](../assets/tutorial-sql-aggregate-query.png)
7. Click **Engine: auto (sql-native)** to display the engine options&mdash;**native** for native (JSON-based) queries, **sql-native** for Druid SQL queries, and **sql-msq-task** for SQL-based ingestion.
7. Click **Engine: Auto (SQL native)** to display the engine options&mdash;**Native** for native (JSON-based) queries, **SQL native** for Druid SQL queries, and **SQL MSQ-task** for SQL-based ingestion.
Select **auto** to let Druid select the most efficient engine based on your query input.
Select **Auto** to let Druid select the most efficient engine based on your query input.
8. From the engine menu you can also edit the query context and turn off some query defaults.

View File

@ -1,156 +0,0 @@
---
id: tutorial-transform-spec
title: Transform input data
sidebar_label: Transform input data
---
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
This tutorial will demonstrate how to use transform specs to filter and transform input data during ingestion.
For this tutorial, we'll assume you've already downloaded Apache Druid as described in
the [single-machine quickstart](index.md) and have it running on your local machine.
It will also be helpful to have finished [Load a file](../tutorials/tutorial-batch.md) and [Query data](../tutorials/tutorial-query.md) tutorials.
## Sample data
We've included sample data for this tutorial at `quickstart/tutorial/transform-data.json`, reproduced here for convenience:
```json
{"timestamp":"2018-01-01T07:01:35Z","animal":"octopus", "location":1, "number":100}
{"timestamp":"2018-01-01T05:01:35Z","animal":"mongoose", "location":2,"number":200}
{"timestamp":"2018-01-01T06:01:35Z","animal":"snake", "location":3, "number":300}
{"timestamp":"2018-01-01T01:01:35Z","animal":"lion", "location":4, "number":300}
```
## Load data with transform specs
We will ingest the sample data using the following spec, which demonstrates the use of transform specs:
```json
{
"type" : "index_parallel",
"spec" : {
"dataSchema" : {
"dataSource" : "transform-tutorial",
"timestampSpec": {
"column": "timestamp",
"format": "iso"
},
"dimensionsSpec" : {
"dimensions" : [
"animal",
{ "name": "location", "type": "long" }
]
},
"metricsSpec" : [
{ "type" : "count", "name" : "count" },
{ "type" : "longSum", "name" : "number", "fieldName" : "number" },
{ "type" : "longSum", "name" : "triple-number", "fieldName" : "triple-number" }
],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "week",
"queryGranularity" : "minute",
"intervals" : ["2018-01-01/2018-01-03"],
"rollup" : true
},
"transformSpec": {
"transforms": [
{
"type": "expression",
"name": "animal",
"expression": "concat('super-', animal)"
},
{
"type": "expression",
"name": "triple-number",
"expression": "number * 3"
}
],
"filter": {
"type":"or",
"fields": [
{ "type": "selector", "dimension": "animal", "value": "super-mongoose" },
{ "type": "selector", "dimension": "triple-number", "value": "300" },
{ "type": "selector", "dimension": "location", "value": "3" }
]
}
}
},
"ioConfig" : {
"type" : "index_parallel",
"inputSource" : {
"type" : "local",
"baseDir" : "quickstart/tutorial",
"filter" : "transform-data.json"
},
"inputFormat" : {
"type" :"json"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index_parallel",
"partitionsSpec": {
"type": "dynamic"
},
"maxRowsInMemory" : 25000
}
}
}
```
In the transform spec, we have two expression transforms:
* `super-animal`: prepends "super-" to the values in the `animal` column. This will override the `animal` column with the transformed version, since the transform's name is `animal`.
* `triple-number`: multiplies the `number` column by 3. This will create a new `triple-number` column. Note that we are ingesting both the original and the transformed column.
Additionally, we have an OR filter with three clauses:
* `super-animal` values that match "super-mongoose"
* `triple-number` values that match 300
* `location` values that match 3
This filter selects the first 3 rows, and it will exclude the final "lion" row in the input data. Note that the filter is applied after the transformation.
Let's submit this task now, which has been included at `quickstart/tutorial/transform-index.json`:
```bash
bin/post-index-task --file quickstart/tutorial/transform-index.json --url http://localhost:8081
```
## Query the transformed data
Let's run `bin/dsql` and issue a `select * from "transform-tutorial";` query to see what was ingested:
```bash
dsql> select * from "transform-tutorial";
┌──────────────────────────┬────────────────┬───────┬──────────┬────────┬───────────────┐
│ __time │ animal │ count │ location │ number │ triple-number │
├──────────────────────────┼────────────────┼───────┼──────────┼────────┼───────────────┤
│ 2018-01-01T05:01:00.000Z │ super-mongoose │ 1 │ 2 │ 200 │ 600 │
│ 2018-01-01T06:01:00.000Z │ super-snake │ 1 │ 3 │ 300 │ 900 │
│ 2018-01-01T07:01:00.000Z │ super-octopus │ 1 │ 1 │ 100 │ 300 │
└──────────────────────────┴────────────────┴───────┴──────────┴────────┴───────────────┘
Retrieved 3 rows in 0.03s.
```
The "lion" row has been discarded, the `animal` column has been transformed, and we have both the original and transformed `number` column.

View File

@ -0,0 +1,103 @@
---
id: tutorial-transform
title: Transform input data
sidebar_label: Transform input data
---
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
This tutorial demonstrates how to transform input data during ingestion.
## Prerequisite
Before proceeding, download Apache Druid&circledR; as described in [Quickstart (local)](index.md) and have it running on your local machine. You don't need to load any data into the Druid cluster.
You should be familiar with data querying in Druid. If you haven't already, go through the [Query data](../tutorials/tutorial-query.md) tutorial first.
## Sample data
For this tutorial, you use the following sample data:
```json
{"timestamp":"2018-01-01T07:01:35Z", "animal":"octopus", "location":1, "number":100}
{"timestamp":"2018-01-01T05:01:35Z", "animal":"mongoose", "location":2,"number":200}
{"timestamp":"2018-01-01T06:01:35Z", "animal":"snake", "location":3, "number":300}
{"timestamp":"2018-01-01T01:01:35Z", "animal":"lion", "location":4, "number":300}
```
## Transform data during ingestion
Load the sample dataset using the [`INSERT INTO`](../multi-stage-query/reference.md/#insert) statement and the [`EXTERN`](../multi-stage-query/reference.md/#extern-function) function to ingest the data inline. In the [Druid web console](../operations/web-console.md), go to the **Query** view and run the following query:
```sql
INSERT INTO "transform_tutorial"
WITH "ext" AS (
SELECT *
FROM TABLE(EXTERN('{"type":"inline","data":"{\"timestamp\":\"2018-01-01T07:01:35Z\",\"animal\":\"octopus\", \"location\":1, \"number\":100}\n{\"timestamp\":\"2018-01-01T05:01:35Z\",\"animal\":\"mongoose\", \"location\":2,\"number\":200}\n{\"timestamp\":\"2018-01-01T06:01:35Z\",\"animal\":\"snake\", \"location\":3, \"number\":300}\n{\"timestamp\":\"2018-01-01T01:01:35Z\",\"animal\":\"lion\", \"location\":4, \"number\":300}"}', '{"type":"json"}')) EXTEND ("timestamp" VARCHAR, "animal" VARCHAR, "location" BIGINT, "number" BIGINT)
)
SELECT
TIME_PARSE("timestamp") AS "__time",
TEXTCAT('super-', "animal") AS "animal",
"location",
"number",
"number" * 3 AS "triple-number"
FROM "ext"
WHERE (TEXTCAT('super-', "animal") = 'super-mongoose' OR "location" = 3 OR "number" = 100)
PARTITIONED BY DAY
```
In the `SELECT` clause, you specify the following transformations:
* `animal`: prepends "super-" to the values in the `animal` column using the [`TEXTCAT`](../querying/sql-functions.md/#textcat) function. Note that it only ingests the transformed data.
* `triple-number`: multiplies the `number` column by three and stores the results in a column named `triple-number`. Note that the query ingests both the original and the transformed data.
Additionally, the `WHERE` clause combines the following three conditions with `OR`, so the query only ingests the rows where at least one of them is `true`:
* `TEXTCAT('super-', "animal")` matches "super-mongoose"
* `location` matches 3
* `number` matches 100
Once a row passes the filter, the ingestion job applies the transformations. In this example, the filter selects the first three rows because each row meets at least one of the required OR conditions. For the selected rows, the ingestion job ingests the transformed `animal` column, the `location` column, and both the original `number` and the transformed `triple-number` column. The "lion" row doesn't meet any of the conditions, so it is not ingested or transformed.
## Query the transformed data
In the web console, open a new tab in the **Query** view. Run the following query to view the ingested data:
```sql
SELECT * FROM "transform_tutorial"
```
Returns the following:
| `__time` | `animal` | `location` | `number` | `triple-number` |
| -- | -- | -- | -- | -- |
| `2018-01-01T05:01:35.000Z` | `super-mongoose` | `2` | `200` | `600` |
| `2018-01-01T06:01:35.000Z` | `super-snake` | `3` | `300` | `900` |
| `2018-01-01T07:01:35.000Z` | `super-octopus` | `1` | `100` | `300` |
Notice how the "lion" row is missing, and how the other three rows that were ingested have transformations applied to them.
## Learn more
See the following topics for more information:
* [All functions](../querying/sql-functions.md) for a list of functions that can be used to transform data.
* [Transform spec reference](../ingestion/ingestion-spec.md/#transformspec) to learn more about transforms in JSON-based batch ingestion.
* [WHERE clause](../querying/sql.md#where) to learn how to specify filters in Druid SQL.

View File

@ -19,6 +19,11 @@ FROM centos:7
USER root
# CentOS is EOL, have to use vault
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo && \
sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo && \
sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
# install dev tools
RUN yum clean all \
&& rpm --rebuilddb \

View File

@ -69,6 +69,7 @@ public class CompressedBigDecimalColumn implements ComplexColumn
}
@Override
@Nullable
public CompressedBigDecimal getRowValue(int rowNum)
{
int s = scale.get(rowNum);
@ -96,7 +97,8 @@ public class CompressedBigDecimalColumn implements ComplexColumn
{
return new ObjectColumnSelector<CompressedBigDecimal>()
{
@Override @Nullable
@Override
@Nullable
public CompressedBigDecimal getObject()
{
return getRowValue(offset.getOffset());

View File

@ -20,6 +20,7 @@
package org.apache.druid.compressedbigdecimal;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.segment.IndexSpec;
import org.apache.druid.segment.column.ColumnBuilder;
import org.apache.druid.segment.data.ObjectStrategy;
import org.apache.druid.segment.serde.ComplexMetricExtractor;
@ -73,7 +74,8 @@ public class CompressedBigDecimalMetricSerde extends ComplexMetricSerde
public void deserializeColumn(ByteBuffer buffer, ColumnBuilder builder)
{
builder.setComplexColumnSupplier(
CompressedBigDecimalColumnPartSupplier.fromByteBuffer(buffer));
CompressedBigDecimalColumnPartSupplier.fromByteBuffer(buffer)
);
}
/* (non-Javadoc)
@ -83,7 +85,8 @@ public class CompressedBigDecimalMetricSerde extends ComplexMetricSerde
@Override
public CompressedBigDecimalLongColumnSerializer getSerializer(
SegmentWriteOutMedium segmentWriteOutMedium,
String column
String column,
IndexSpec indexSpec
)
{
return CompressedBigDecimalLongColumnSerializer.create(segmentWriteOutMedium, column);

View File

@ -91,4 +91,10 @@ public class CompressedBigDecimalObjectStrategy implements ObjectStrategy<Compre
return buf.array();
}
@Override
public boolean readRetainsBufferReference()
{
return false;
}
}

View File

@ -22,7 +22,6 @@ package org.apache.druid.compressedbigdecimal;
import com.fasterxml.jackson.databind.MapperFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.query.aggregation.AggregateCombiner;
import org.apache.druid.query.aggregation.Aggregator;
import org.apache.druid.query.aggregation.AggregatorFactory;
@ -151,8 +150,7 @@ public abstract class CompressedBigDecimalFactoryTestBase
{
TestColumnValueSelector<CompressedBigDecimal> columnValueSelector = TestColumnValueSelector.of(
CompressedBigDecimal.class,
ImmutableList.of(ArrayCompressedBigDecimal.wrap(new int[]{67, 0}, 0)),
DateTimes.of("2020-01-01")
ImmutableList.of(ArrayCompressedBigDecimal.wrap(new int[]{67, 0}, 0))
);
columnValueSelector.advance();
@ -167,8 +165,7 @@ public abstract class CompressedBigDecimalFactoryTestBase
ImmutableList.of(
ArrayCompressedBigDecimal.wrap(new int[]{1, 0}, 0),
ArrayCompressedBigDecimal.wrap(new int[]{10, 0}, 0)
),
DateTimes.of("2020-01-01")
)
);
columnValueSelector.advance();

View File

@ -22,17 +22,9 @@ package org.apache.druid.query.aggregation.ddsketch;
import com.datadoghq.sketch.ddsketch.DDSketch;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.segment.GenericColumnSerializer;
import org.apache.druid.segment.column.ColumnBuilder;
import org.apache.druid.segment.data.GenericIndexed;
import org.apache.druid.segment.data.ObjectStrategy;
import org.apache.druid.segment.serde.ComplexColumnPartSupplier;
import org.apache.druid.segment.serde.ComplexMetricExtractor;
import org.apache.druid.segment.serde.ComplexMetricSerde;
import org.apache.druid.segment.serde.LargeColumnSupportedComplexColumnSerializer;
import org.apache.druid.segment.writeout.SegmentWriteOutMedium;
import java.nio.ByteBuffer;
public class DDSketchComplexMetricSerde extends ComplexMetricSerde
@ -84,31 +76,9 @@ public class DDSketchComplexMetricSerde extends ComplexMetricSerde
};
}
@Override
public void deserializeColumn(ByteBuffer buffer, ColumnBuilder builder)
{
final GenericIndexed<DDSketch> column = GenericIndexed.read(
buffer,
STRATEGY,
builder.getFileMapper()
);
builder.setComplexColumnSupplier(new ComplexColumnPartSupplier(getTypeName(), column));
}
@Override
public ObjectStrategy<DDSketch> getObjectStrategy()
{
return STRATEGY;
}
@Override
public GenericColumnSerializer getSerializer(SegmentWriteOutMedium segmentWriteOutMedium, String column)
{
return LargeColumnSupportedComplexColumnSerializer.create(
segmentWriteOutMedium,
column,
this.getObjectStrategy()
);
}
}

View File

@ -70,4 +70,10 @@ public class DDSketchObjectStrategy implements ObjectStrategy<DDSketch>
{
return DDSketchAggregatorFactory.COMPARATOR.compare(o1, o2);
}
@Override
public boolean readRetainsBufferReference()
{
return false;
}
}

View File

@ -17,18 +17,17 @@
* under the License.
*/
package org.apache.druid.query.aggregation.ddsketch;
package org.apache.druid.indexing.overlord.supervisor;
import org.junit.Assert;
import org.junit.Test;
import com.google.inject.Binder;
import com.google.inject.Module;
import org.apache.druid.guice.JsonConfigProvider;
public class SupervisorModule implements Module
public class DDSketchObjectStrategyTest
{
@Override
public void configure(Binder binder)
@Test
public void testReadRetainsBufferReference()
{
JsonConfigProvider.bind(binder, "druid.supervisor", SupervisorStateManagerConfig.class);
DDSketchObjectStrategy strategy = new DDSketchObjectStrategy();
Assert.assertFalse(strategy.readRetainsBufferReference());
}
}

View File

@ -32,6 +32,7 @@ import org.apache.druid.query.timeseries.DefaultTimeseriesQueryMetrics;
import org.apache.druid.query.timeseries.TimeseriesQuery;
import org.apache.druid.query.timeseries.TimeseriesQueryEngine;
import org.apache.druid.query.timeseries.TimeseriesResultValue;
import org.apache.druid.segment.IncrementalIndexTimeBoundaryInspector;
import org.apache.druid.segment.TestHelper;
import org.apache.druid.segment.incremental.IncrementalIndex;
import org.apache.druid.segment.incremental.IncrementalIndexSchema;
@ -101,7 +102,12 @@ public class DistinctCountTimeseriesQueryTest extends InitializedNullHandlingTes
.build();
final Iterable<Result<TimeseriesResultValue>> results =
engine.process(query, new IncrementalIndexStorageAdapter(index), new DefaultTimeseriesQueryMetrics()).toList();
engine.process(
query,
new IncrementalIndexStorageAdapter(index),
new IncrementalIndexTimeBoundaryInspector(index),
new DefaultTimeseriesQueryMetrics()
).toList();
List<Result<TimeseriesResultValue>> expectedResults = Collections.singletonList(
new Result<>(

View File

@ -33,6 +33,7 @@ import org.apache.druid.query.topn.TopNQuery;
import org.apache.druid.query.topn.TopNQueryBuilder;
import org.apache.druid.query.topn.TopNQueryEngine;
import org.apache.druid.query.topn.TopNResultValue;
import org.apache.druid.segment.IncrementalIndexTimeBoundaryInspector;
import org.apache.druid.segment.TestHelper;
import org.apache.druid.segment.incremental.IncrementalIndex;
import org.apache.druid.segment.incremental.IncrementalIndexSchema;
@ -130,7 +131,12 @@ public class DistinctCountTopNQueryTest extends InitializedNullHandlingTest
.build();
final Iterable<Result<TopNResultValue>> results =
engine.query(query, new IncrementalIndexStorageAdapter(index), null).toList();
engine.query(
query,
new IncrementalIndexStorageAdapter(index),
new IncrementalIndexTimeBoundaryInspector(index),
null
).toList();
List<Result<TopNResultValue>> expectedResults = Collections.singletonList(
new Result<>(

View File

@ -19,6 +19,10 @@
package org.apache.druid.delta.input;
import io.delta.kernel.data.ArrayValue;
import io.delta.kernel.data.MapValue;
import io.delta.kernel.internal.util.VectorUtils;
import io.delta.kernel.types.ArrayType;
import io.delta.kernel.types.BinaryType;
import io.delta.kernel.types.BooleanType;
import io.delta.kernel.types.ByteType;
@ -29,6 +33,7 @@ import io.delta.kernel.types.DoubleType;
import io.delta.kernel.types.FloatType;
import io.delta.kernel.types.IntegerType;
import io.delta.kernel.types.LongType;
import io.delta.kernel.types.MapType;
import io.delta.kernel.types.ShortType;
import io.delta.kernel.types.StringType;
import io.delta.kernel.types.StructField;
@ -197,6 +202,15 @@ public class DeltaInputRow implements InputRow
return String.valueOf(charArray);
} else if (dataType instanceof DecimalType) {
return dataRow.getDecimal(columnOrdinal).longValue();
} else if (dataType instanceof StructType) {
final io.delta.kernel.data.Row structRow = dataRow.getStruct(columnOrdinal);
return RowSerde.convertRowToJsonObject(structRow);
} else if (dataType instanceof ArrayType) {
final ArrayValue arrayRow = dataRow.getArray(columnOrdinal);
return VectorUtils.toJavaList(arrayRow);
} else if (dataType instanceof MapType) {
final MapValue map = dataRow.getMap(columnOrdinal);
return VectorUtils.toJavaMap(map);
} else {
throw InvalidInput.exception(
"Unsupported data type[%s] for fieldName[%s].",

View File

@ -0,0 +1,135 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.delta.input;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.druid.data.input.ColumnsFilter;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.impl.DimensionsSpec;
import org.apache.druid.data.input.impl.TimestampSpec;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.segment.AutoTypeColumnSchema;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
* Refer to extensions-contrib/druid-deltalake-extensions/src/test/resources/README.md to generate the
* sample complex types Delta Lake table used in the unit tests.
*
*/
public class ComplexTypesDeltaTable
{
/**
* The Delta table path used by unit tests.
*/
public static final String DELTA_TABLE_PATH = "src/test/resources/complex-types-table";
/**
* The list of dimensions in the Delta table {@link #DELTA_TABLE_PATH}.
*/
public static final List<String> DIMENSIONS = ImmutableList.of(
"id",
"array_info",
"struct_info",
"nested_struct_info",
"map_info"
);
/**
* The expected set of rows from the first checkpoint file {@code {@link #DELTA_TABLE_PATH}/_delta_log/00000000000000000000.json}
*/
private static final List<Map<String, Object>> SPLIT_0_EXPECTED_ROWS = new ArrayList<>(
ImmutableList.of(
ImmutableMap.of(
"id", 0L,
"array_info", ImmutableList.of(0, 1, 2, 3),
"struct_info", ImmutableMap.of("id", 0L, "name", "0"),
"nested_struct_info", ImmutableMap.of("id", 0L, "name", "0", "nested", ImmutableMap.of("nested_int", 0, "nested_double", 1.0)),
"map_info", ImmutableMap.of("key1", 1.0f, "key2", 1.0f)
),
ImmutableMap.of(
"id", 1L,
"array_info", ImmutableList.of(1, 2, 3, 4),
"struct_info", ImmutableMap.of("id", 1L, "name", "1"),
"nested_struct_info", ImmutableMap.of("id", 1L, "name", "1", "nested", ImmutableMap.of("nested_int", 1, "nested_double", 2.0)),
"map_info", ImmutableMap.of("key1", 2.0f, "key2", 2.0f)
),
ImmutableMap.of(
"id", 2L,
"array_info", ImmutableList.of(2, 3, 4, 5),
"struct_info", ImmutableMap.of("id", 2L, "name", "2"),
"nested_struct_info", ImmutableMap.of("id", 2L, "name", "2", "nested", ImmutableMap.of("nested_int", 2, "nested_double", 3.0)),
"map_info", ImmutableMap.of("key1", 3.0f, "key2", 3.0f)
),
ImmutableMap.of(
"id", 3L,
"array_info", ImmutableList.of(3, 4, 5, 6),
"struct_info", ImmutableMap.of("id", 3L, "name", "3"),
"nested_struct_info", ImmutableMap.of("id", 3L, "name", "3", "nested", ImmutableMap.of("nested_int", 3, "nested_double", 4.0)),
"map_info", ImmutableMap.of("key1", 4.0f, "key2", 4.0f)
),
ImmutableMap.of(
"id", 4L,
"array_info", ImmutableList.of(4, 5, 6, 7),
"struct_info", ImmutableMap.of("id", 4L, "name", "4"),
"nested_struct_info", ImmutableMap.of("id", 4L, "name", "4", "nested", ImmutableMap.of("nested_int", 4, "nested_double", 5.0)),
"map_info", ImmutableMap.of("key1", 5.0f, "key2", 5.0f)
)
)
);
/**
* Mapping of checkpoint file identifier to the list of expected rows in that checkpoint.
*/
public static final Map<Integer, List<Map<String, Object>>> SPLIT_TO_EXPECTED_ROWS = new HashMap<>(
ImmutableMap.of(
0, SPLIT_0_EXPECTED_ROWS
)
);
/**
* Complete set of expected rows across all checkpoint files for {@link #DELTA_TABLE_PATH}.
*/
public static final List<Map<String, Object>> EXPECTED_ROWS = SPLIT_TO_EXPECTED_ROWS.values().stream()
.flatMap(List::stream)
.collect(Collectors.toList());
/**
* The Druid schema used for ingestion of {@link #DELTA_TABLE_PATH}.
*/
public static final InputRowSchema FULL_SCHEMA = new InputRowSchema(
new TimestampSpec("na", "posix", DateTimes.of("2024-01-01")),
new DimensionsSpec(
ImmutableList.of(
new AutoTypeColumnSchema("id", null),
new AutoTypeColumnSchema("array_info", null),
new AutoTypeColumnSchema("struct_info", null),
new AutoTypeColumnSchema("nested_struct_info", null),
new AutoTypeColumnSchema("map_info", null)
)
),
ColumnsFilter.all()
);
}

View File

@ -54,7 +54,8 @@ public class DeltaInputRowTest
{
Object[][] data = new Object[][]{
{NonPartitionedDeltaTable.DELTA_TABLE_PATH, NonPartitionedDeltaTable.FULL_SCHEMA, NonPartitionedDeltaTable.DIMENSIONS, NonPartitionedDeltaTable.EXPECTED_ROWS},
{PartitionedDeltaTable.DELTA_TABLE_PATH, PartitionedDeltaTable.FULL_SCHEMA, PartitionedDeltaTable.DIMENSIONS, PartitionedDeltaTable.EXPECTED_ROWS}
{PartitionedDeltaTable.DELTA_TABLE_PATH, PartitionedDeltaTable.FULL_SCHEMA, PartitionedDeltaTable.DIMENSIONS, PartitionedDeltaTable.EXPECTED_ROWS},
{ComplexTypesDeltaTable.DELTA_TABLE_PATH, ComplexTypesDeltaTable.FULL_SCHEMA, ComplexTypesDeltaTable.DIMENSIONS, ComplexTypesDeltaTable.EXPECTED_ROWS}
};
return Arrays.asList(data);
}
@ -116,7 +117,7 @@ public class DeltaInputRowTest
}
}
}
Assert.assertEquals(NonPartitionedDeltaTable.EXPECTED_ROWS.size(), totalRecordCount);
Assert.assertEquals(expectedRows.size(), totalRecordCount);
}
@MethodSource("data")

View File

@ -84,6 +84,11 @@ public class DeltaInputSourceTest
PartitionedDeltaTable.DELTA_TABLE_PATH,
PartitionedDeltaTable.FULL_SCHEMA,
PartitionedDeltaTable.EXPECTED_ROWS
},
{
ComplexTypesDeltaTable.DELTA_TABLE_PATH,
ComplexTypesDeltaTable.FULL_SCHEMA,
ComplexTypesDeltaTable.EXPECTED_ROWS
}
};
}

View File

@ -84,3 +84,14 @@ python3 create_delta_table.py --save_path=employee-delta-table-partitioned-name
The resulting Delta table is checked in to the repo. The expected rows to be used in tests are updated in
`PartitionedDeltaTable.java` accordingly.
### Complex types table `complex-types-table`:
The test data in `resources/complex-types-table` contains 5 Delta records generated with 1 snapshot.
The table was generated by running the following command:
```shell
python3 create_delta_table.py --save_path=complex-types-table --num_records=5 --gen_complex_types=True
```
The resulting Delta table is checked in to the repo. The expected rows to be used in tests are updated in
`ComplexTypesDeltaTable.java` accordingly.

View File

@ -0,0 +1,8 @@
{"commitInfo":{"timestamp":1723511561738,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"6","numOutputRows":"5","numOutputBytes":"17937"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.1.0","txnId":"b9eae5f4-d55b-4c38-b365-8228ec09248e"}}
{"metaData":{"id":"ce998219-9bde-4831-b78c-14b11f919fbe","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"array_info\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"struct_info\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct_info\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"nested\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"nested_int\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_info\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"float\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1723511559184}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"add":{"path":"part-00001-01efecb8-5771-4e91-834e-2a1cb6601eb8-c000.snappy.parquet","partitionValues":{},"size":3288,"modificationTime":1723511561689,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":0,\"struct_info\":{\"id\":0,\"name\":\"0\"},\"nested_struct_info\":{\"id\":0,\"name\":\"0\",\"nested\":{\"nested_int\":0,\"nested_double\":1.0}}},\"maxValues\":{\"id\":0,\"struct_info\":{\"id\":0,\"name\":\"0\"},\"nested_struct_info\":{\"id\":0,\"name\":\"0\",\"nested\":{\"nested_int\":0,\"nested_double\":1.0}}},\"nullCount\":{\"id\":0,\"array_info\":0,\"struct_info\":{\"id\":0,\"name\":0},\"nested_struct_info\":{\"id\":0,\"name\":0,\"nested\":{\"nested_int\":0,\"nested_double\":0}},\"map_info\":0}}"}}
{"add":{"path":"part-00003-383f5a97-c624-4ef3-82a4-f3f273308e53-c000.snappy.parquet","partitionValues":{},"size":3291,"modificationTime":1723511561689,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"struct_info\":{\"id\":1,\"name\":\"1\"},\"nested_struct_info\":{\"id\":1,\"name\":\"1\",\"nested\":{\"nested_int\":1,\"nested_double\":2.0}}},\"maxValues\":{\"id\":1,\"struct_info\":{\"id\":1,\"name\":\"1\"},\"nested_struct_info\":{\"id\":1,\"name\":\"1\",\"nested\":{\"nested_int\":1,\"nested_double\":2.0}}},\"nullCount\":{\"id\":0,\"array_info\":0,\"struct_info\":{\"id\":0,\"name\":0},\"nested_struct_info\":{\"id\":0,\"name\":0,\"nested\":{\"nested_int\":0,\"nested_double\":0}},\"map_info\":0}}"}}
{"add":{"path":"part-00005-febee455-5e89-404a-bb38-f627c47eb20b-c000.snappy.parquet","partitionValues":{},"size":3289,"modificationTime":1723511561689,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"struct_info\":{\"id\":2,\"name\":\"2\"},\"nested_struct_info\":{\"id\":2,\"name\":\"2\",\"nested\":{\"nested_int\":2,\"nested_double\":3.0}}},\"maxValues\":{\"id\":2,\"struct_info\":{\"id\":2,\"name\":\"2\"},\"nested_struct_info\":{\"id\":2,\"name\":\"2\",\"nested\":{\"nested_int\":2,\"nested_double\":3.0}}},\"nullCount\":{\"id\":0,\"array_info\":0,\"struct_info\":{\"id\":0,\"name\":0},\"nested_struct_info\":{\"id\":0,\"name\":0,\"nested\":{\"nested_int\":0,\"nested_double\":0}},\"map_info\":0}}"}}
{"add":{"path":"part-00007-07d88387-16f9-4141-bc77-0106e7f28f7a-c000.snappy.parquet","partitionValues":{},"size":3290,"modificationTime":1723511561689,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"struct_info\":{\"id\":3,\"name\":\"3\"},\"nested_struct_info\":{\"id\":3,\"name\":\"3\",\"nested\":{\"nested_int\":3,\"nested_double\":4.0}}},\"maxValues\":{\"id\":3,\"struct_info\":{\"id\":3,\"name\":\"3\"},\"nested_struct_info\":{\"id\":3,\"name\":\"3\",\"nested\":{\"nested_int\":3,\"nested_double\":4.0}}},\"nullCount\":{\"id\":0,\"array_info\":0,\"struct_info\":{\"id\":0,\"name\":0},\"nested_struct_info\":{\"id\":0,\"name\":0,\"nested\":{\"nested_int\":0,\"nested_double\":0}},\"map_info\":0}}"}}
{"add":{"path":"part-00009-73760316-7ace-43fe-b605-506c942cd969-c000.snappy.parquet","partitionValues":{},"size":3291,"modificationTime":1723511561689,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"struct_info\":{\"id\":4,\"name\":\"4\"},\"nested_struct_info\":{\"id\":4,\"name\":\"4\",\"nested\":{\"nested_int\":4,\"nested_double\":5.0}}},\"maxValues\":{\"id\":4,\"struct_info\":{\"id\":4,\"name\":\"4\"},\"nested_struct_info\":{\"id\":4,\"name\":\"4\",\"nested\":{\"nested_int\":4,\"nested_double\":5.0}}},\"nullCount\":{\"id\":0,\"array_info\":0,\"struct_info\":{\"id\":0,\"name\":0},\"nested_struct_info\":{\"id\":0,\"name\":0,\"nested\":{\"nested_int\":0,\"nested_double\":0}},\"map_info\":0}}"}}

View File

@ -15,12 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
from delta import *
import pyspark
from pyspark.sql.types import StructType, StructField, ShortType, StringType, TimestampType, LongType, IntegerType, DoubleType, FloatType, DateType, BooleanType
from pyspark.sql.types import MapType, StructType, StructField, ShortType, StringType, TimestampType, LongType, IntegerType, DoubleType, FloatType, DateType, BooleanType, ArrayType
from datetime import datetime, timedelta
import random
@ -39,6 +37,55 @@ def config_spark_with_delta_lake():
return spark
def create_dataset_with_complex_types(num_records):
"""
Create a mock dataset with records containing complex types like arrays, structs and maps.
Parameters:
- num_records (int): Number of records to generate.
Returns:
- Tuple: A tuple containing a list of records and the corresponding schema.
- List of Records: Each record is a tuple representing a row of data.
- StructType: The schema defining the structure of the records.
Example:
```python
data, schema = create_dataset_with_complex_types(10)
```
"""
schema = StructType([
StructField("id", LongType(), False),
StructField("array_info", ArrayType(IntegerType(), True), True),
StructField("struct_info", StructType([
StructField("id", LongType(), False),
StructField("name", StringType(), True)
])),
StructField("nested_struct_info", StructType([
StructField("id", LongType(), False),
StructField("name", StringType(), True),
StructField("nested", StructType([
StructField("nested_int", IntegerType(), False),
StructField("nested_double", DoubleType(), True),
]))
])),
StructField("map_info", MapType(StringType(), FloatType()))
])
data = []
for idx in range(num_records):
record = (
idx,
(idx, idx + 1, idx + 2, idx + 3),
(idx, f"{idx}"),
(idx, f"{idx}", (idx, idx + 1.0)),
{"key1": idx + 1.0, "key2": idx + 1.0}
)
data.append(record)
return data, schema
def create_dataset(num_records):
"""
Generate a mock employee dataset with different datatypes for testing purposes.
@ -94,6 +141,9 @@ def main():
parser = argparse.ArgumentParser(description="Script to write a Delta Lake table.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--gen_complex_types", type=bool, default=False, help="Generate a Delta table with records"
" containing complex types like structs,"
" maps and arrays.")
parser.add_argument('--save_path', default=None, required=True, help="Save path for Delta table")
parser.add_argument('--save_mode', choices=('append', 'overwrite'), default="append",
help="Specify write mode (append/overwrite)")
@ -103,6 +153,7 @@ def main():
args = parser.parse_args()
is_gen_complex_types = args.gen_complex_types
save_mode = args.save_mode
save_path = args.save_path
num_records = args.num_records
@ -110,7 +161,11 @@ def main():
spark = config_spark_with_delta_lake()
data, schema = create_dataset(num_records=num_records)
if is_gen_complex_types:
data, schema = create_dataset_with_complex_types(num_records=num_records)
else:
data, schema = create_dataset(num_records=num_records)
df = spark.createDataFrame(data, schema=schema)
if not partitioned_by:
df.write.format("delta").mode(save_mode).save(save_path)

Some files were not shown because too many files have changed in this diff.