Improve query error logging (#11519)

* Improve query error logging

* add docs

* address comments

* address comments
Maytas Monsereenusorn 2021-08-05 22:51:09 +07:00 committed by GitHub
parent 4470ca6a92
commit 3257913737
4 changed files with 38 additions and 1 deletion

View File

@ -61,6 +61,7 @@ Unless otherwise noted, the following parameters apply to all query types.
|useFilterCNF|`false`| If true, Druid will attempt to convert the query filter to Conjunctive Normal Form (CNF). During query processing, columns can be pre-filtered by intersecting the bitmap indexes of all values that match the eligible filters, often greatly reducing the raw number of rows which need to be scanned. But this effect only happens for the top level filter, or individual clauses of a top level 'and' filter. As such, filters in CNF potentially have a higher chance to utilize a large amount of bitmap indexes on string columns during pre-filtering. However, this setting should be used with great caution, as it can sometimes have a negative effect on performance, and in some cases, the act of computing CNF of a filter can be expensive. We recommend hand tuning your filters to produce an optimal form if possible, or at least verifying through experimentation that using this parameter actually improves your query performance with no ill-effects.|
|secondaryPartitionPruning|`true`|Enable secondary partition pruning on the Broker. The Broker will always prune unnecessary segments from the input scan based on a filter on time intervals, but if the data is further partitioned with hash or range partitioning, this option will enable additional pruning based on a filter on secondary partition dimensions.|
|enableJoinLeftTableScanDirect|`false`|This flag applies to queries that have joins. For joins where the left child is a simple scan with a filter, by default Druid runs the scan as a query and then joins the results to the right child on the Broker. Setting this flag to true overrides that behavior, and Druid will attempt to push the join to data servers instead. Note that the flag can apply to queries even if there is no explicit join, since queries can be internally translated into a join by the SQL planner.|
|debug| `false` | Flag indicating whether to enable debugging outputs for the query. When set to false, no additional logs will be produced (logs produced will be entirely dependent on your logging level). When set to true, the following additional logs will be produced:<br />- Log the stack trace of the exception (if any) produced by the query (see the sketch after this table) |
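
A minimal, hypothetical sketch of how the new `debug` flag can be set and read programmatically. It assumes an existing `Query` instance; the class `DebugContextExample` and its method names are illustrative only, while `QueryContexts.ENABLE_DEBUG`, `QueryContexts.isDebug`, and `Query.withOverriddenContext` are Druid APIs (the first two are added by this patch).

```java
import com.google.common.collect.ImmutableMap;
import org.apache.druid.query.Query;
import org.apache.druid.query.QueryContexts;

// Illustrative only: shows how the "debug" context key travels with a query.
public class DebugContextExample
{
  // Returns a copy of the query with debug output enabled.
  public static <T> Query<T> withDebugEnabled(Query<T> query)
  {
    // withOverriddenContext merges the given entries into the query's context map.
    return query.withOverriddenContext(ImmutableMap.of(QueryContexts.ENABLE_DEBUG, true));
  }

  // Reads the flag back; it defaults to false when the key is absent, so existing
  // queries keep the current stack-trace-free error logging.
  public static <T> boolean debugRequested(Query<T> query)
  {
    return QueryContexts.isDebug(query);
  }
}
```

For native JSON queries submitted over HTTP, the same key is set under the query's `context` object.
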
## Query-type-specific parameters

View File

@ -65,6 +65,7 @@ public class QueryContexts
public static final String RETURN_PARTIAL_RESULTS_KEY = "returnPartialResults";
public static final String USE_CACHE_KEY = "useCache";
public static final String SECONDARY_PARTITION_PRUNING_KEY = "secondaryPartitionPruning";
public static final String ENABLE_DEBUG = "debug";
public static final String BY_SEGMENT_KEY = "bySegment";
public static final String BROKER_SERVICE_NAME = "brokerService";
@ -88,6 +89,7 @@ public class QueryContexts
public static final boolean DEFAULT_ENABLE_SQL_JOIN_LEFT_SCAN_DIRECT = false;
public static final boolean DEFAULT_USE_FILTER_CNF = false;
public static final boolean DEFAULT_SECONDARY_PARTITION_PRUNING = true;
public static final boolean DEFAULT_ENABLE_DEBUG = false;
@SuppressWarnings("unused") // Used by Jackson serialization
public enum Vectorize
@ -322,6 +324,11 @@ public class QueryContexts
return parseBoolean(query, SECONDARY_PARTITION_PRUNING_KEY, DEFAULT_SECONDARY_PARTITION_PRUNING);
}
public static <T> boolean isDebug(Query<T> query)
{
return parseBoolean(query, ENABLE_DEBUG, DEFAULT_ENABLE_DEBUG);
}
public static <T> Query<T> withMaxScatterGatherBytes(Query<T> query, long maxScatterGatherBytesLimit)
{
Object obj = query.getContextValue(MAX_SCATTER_GATHER_BYTES_KEY);

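A condensed, self-contained sketch of how a caller is expected to branch on the new `QueryContexts.isDebug` helper; it previews the `QueryLifecycle` hunk further down. The wrapper class and method here are illustrative, while `Logger.noStackTrace()` is the existing Druid logging call already used in that hunk.

```java
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.query.Query;
import org.apache.druid.query.QueryContexts;

// Illustrative only: error logging that branches on the new "debug" context flag.
public class DebugAwareErrorLogging
{
  private static final Logger log = new Logger(DebugAwareErrorLogging.class);

  public static <T> void logQueryError(Query<T> query, Exception e)
  {
    if (QueryContexts.isDebug(query)) {
      // debug=true: emit the full stack trace to help diagnose the failure.
      log.error(e, "Exception while processing queryId [%s]", query.getId());
    } else {
      // Default: keep logs compact by suppressing the stack trace.
      log.noStackTrace().error(e, "Exception while processing queryId [%s]", query.getId());
    }
  }
}
```
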
View File

@ -177,4 +177,28 @@ public class QueryContextsTest
exception.expect(ClassCastException.class);
QueryContexts.getBrokerServiceName(query);
}
@Test
public void testDefaultEnableQueryDebugging()
{
Query<?> query = new TestQuery(
new TableDataSource("test"),
new MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.of("0/100"))),
false,
ImmutableMap.of()
);
Assert.assertFalse(QueryContexts.isDebug(query));
}
@Test
public void testEnableQueryDebuggingSetToTrue()
{
Query<?> query = new TestQuery(
new TableDataSource("test"),
new MultipleIntervalSegmentSpec(ImmutableList.of(Intervals.of("0/100"))),
false,
ImmutableMap.of(QueryContexts.ENABLE_DEBUG, true)
);
Assert.assertTrue(QueryContexts.isDebug(query));
}
}

View File

@ -36,6 +36,7 @@ import org.apache.druid.query.DefaultQueryConfig;
import org.apache.druid.query.DruidMetrics;
import org.apache.druid.query.GenericQueryMetricsFactory;
import org.apache.druid.query.Query;
import org.apache.druid.query.QueryContexts;
import org.apache.druid.query.QueryInterruptedException;
import org.apache.druid.query.QueryMetrics;
import org.apache.druid.query.QueryPlus;
@ -316,7 +317,11 @@ public class QueryLifecycle
if (e != null) {
statsMap.put("exception", e.toString());
log.noStackTrace().warn(e, "Exception while processing queryId [%s]", baseQuery.getId());
if (QueryContexts.isDebug(baseQuery)) {
log.error(e, "Exception while processing queryId [%s]", baseQuery.getId());
} else {
log.noStackTrace().error(e, "Exception while processing queryId [%s]", baseQuery.getId());
}
if (e instanceof QueryInterruptedException || e instanceof QueryTimeoutException) {
// Mimic behavior from QueryResource, where this code was originally taken from.
statsMap.put("interrupted", true);