druid/examples/bin/dsql-main

#!/usr/bin/env python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import print_function

import argparse
import base64
import collections
import csv
import errno
import json
import numbers
import os
import re
import readline
import ssl
import sys
import time
import unicodedata
import urllib2

class DruidSqlException(Exception):
  """Error raised when a Druid SQL query cannot be completed.

  The message, if any, is the single positional argument given to the
  constructor.
  """

  def friendly_message(self):
    """Return the message to show the user, or "Query failed" if there is none."""
    # Exception.message is deprecated since Python 2.6 and does not exist in
    # Python 3, so read the message from args instead. As with the old
    # attribute, only a lone constructor argument counts as the message.
    message = self.args[0] if len(self.args) == 1 else ''
    return message if message else "Query failed"

  def write_to(self, f):
    """Write the friendly message to file object f in red, followed by a newline, then flush."""
    # ANSI escape: switch foreground to red.
    f.write('\x1b[31m')
    f.write(self.friendly_message())
    # ANSI escape: reset attributes.
    f.write('\x1b[0m')
    f.write('\n')
    f.flush()

def do_query_with_args(url, sql, context, args):
  """Run a query using the connection settings stored on the parsed arguments.

  url, sql and context are forwarded unchanged; the timeout, user and SSL
  settings are read from args. Returns whatever do_query returns.
  """
  return do_query(
    url,
    sql,
    context,
    timeout=args.timeout,
    user=args.user,
    ignore_ssl_verification=args.ignore_ssl_verification,
    ca_file=args.cafile,
    ca_path=args.capath
  )

def do_query(url, sql, context, timeout, user, ignore_ssl_verification, ca_file, ca_path):
  """POST a SQL query to url and yield the elements of the JSON array it returns.

  This is a generator: the request is only sent once the first item is asked
  for, and the response is decoded incrementally while it is read in
  8192-byte chunks.

  Args:
    url: URL the request is sent to.
    sql: SQL text, sent as the 'query' field of the JSON request body.
    context: Dict sent as the 'context' field. It is never modified; a copy
      is made when its 'timeout' entry has to be raised.
    timeout: Timeout in seconds, passed to urlopen. Values <= 0 mean no
      timeout. When positive and larger than context['timeout'] / 1000, the
      request's context 'timeout' is set to timeout * 1000.
    user: Optional credentials string, base64-encoded into a Basic
      Authorization header.
    ignore_ssl_verification: If true, disable hostname and certificate checks.
    ca_file: Optional CA file passed to load_verify_locations.
    ca_path: Optional CA directory passed to load_verify_locations.

  Yields:
    Each decoded element of the response array; JSON objects are decoded as
    collections.OrderedDict so key order is preserved.

  Raises:
    DruidSqlException: via raise_friendly_error when urllib2 reports a URLError.
    ValueError: if the buffered data cannot be decoded at end of stream or
      after more than 256KB has accumulated without a complete element.
  """
  json_decoder = json.JSONDecoder(object_pairs_hook=collections.OrderedDict)
  response = None
  try:
    # Start from the caller's context so query_context is bound on every path,
    # including a positive timeout that the context timeout already covers.
    query_context = context
    if timeout <= 0:
      timeout = None
    elif int(context.get('timeout', 0)) / 1000. < timeout:
      query_context = context.copy()
      query_context['timeout'] = timeout * 1000

    sql_json = json.dumps({'query' : sql, 'context' : query_context})

    # SSL stuff
    ssl_context = None
    if ignore_ssl_verification or ca_file is not None or ca_path is not None:
      ssl_context = ssl.create_default_context()
      if ignore_ssl_verification:
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
      else:
        ssl_context.load_verify_locations(cafile=ca_file, capath=ca_path)

    req = urllib2.Request(url, sql_json, {'Content-Type' : 'application/json'})

    if user:
      req.add_header("Authorization", "Basic %s" % base64.b64encode(user))

    response = urllib2.urlopen(req, None, timeout, context=ssl_context)

    first_chunk = True
    eof = False
    buf = ''

    while not eof or len(buf) > 0:
      while True:
        try:
          # Remove starting ','
          buf = buf.lstrip(',')
          obj, sz = json_decoder.raw_decode(buf)
          yield obj
          buf = buf[sz:]
        except ValueError as e:
          # Maybe invalid JSON, maybe partial object; it's hard to tell with this library.
          if eof and buf.rstrip() == ']':
            # Stream done and all objects read.
            buf = ''
            break
          elif eof or len(buf) > 256 * 1024:
            # If we read more than 256KB or if it's eof then report the parse error.
            raise
          else:
            # Stop reading objects, get more from the stream instead.
            break

      # Read more from the http stream
      if not eof:
        chunk = response.read(8192)
        if chunk:
          buf = buf + chunk
          if first_chunk:
            # Remove starting '[' (only the array opener at the very start of the stream)
            buf = buf.lstrip('[')
            first_chunk = False
        else:
          # Stream done. Keep reading objects out of buf though.
          eof = True

  except urllib2.URLError as e:
    raise_friendly_error(e)
  finally:
    # Release the connection whether the stream was exhausted, the consumer
    # stopped early, or an error occurred.
    if response is not None:
      response.close()

def raise_friendly_error(e):
  """Convert a urllib2 error into a DruidSqlException with a readable message.

  This function never returns normally.

  Args:
    e: The caught exception. urllib2.HTTPError instances have their response
      body read and, when it is a JSON object, used to build the message;
      anything else is reported as str(e).

  Raises:
    DruidSqlException: always.
  """
  if not isinstance(e, urllib2.HTTPError):
    raise DruidSqlException(str(e))

  text = e.read().strip()
  error_obj = {}
  try:
    error_obj = dict(json.loads(text))
  except (ValueError, TypeError):
    # Body is not JSON, or is JSON that is not an object; fall back to the
    # generic HTTP error text below.
    pass

  if e.code == 500 and 'errorMessage' in error_obj:
    # Only 'errorMessage' is known to be present; the other fields are
    # looked up leniently so a partial error payload still yields a message.
    error_text = ''
    error = error_obj.get('error')
    if error and error != 'Unknown exception':
      error_text = error_text + error + ': '
    error_class = error_obj.get('errorClass')
    if error_class:
      error_text = error_text + str(error_class) + ': '
    error_text = error_text + str(error_obj['errorMessage'])
    host = error_obj.get('host')
    if host:
      error_text = error_text + ' (' + str(host) + ')'
    raise DruidSqlException(error_text)

  if e.code == 405:
    reason = (e.reason + " - Are you using the correct broker URL and " +
              "is druid.sql.enabled set to true on your broker?")
    raise DruidSqlException('HTTP Error {0}: {1}\n{2}'.format(e.code, reason, text))

  raise DruidSqlException("HTTP Error {0}: {1}\n{2}".format(e.code, e.reason, text))

def to_utf8(value):
  """Return value as a byte string: "" for None, UTF-8 for unicode, str(value) otherwise."""
  if value is None:
    return ""
  if isinstance(value, unicode):
    return value.encode("utf-8")
  return str(value)

def to_tsv(values, delimiter):
  """Join values into one delimiter-separated line.

  Each value is converted with to_utf8, and any occurrence of the delimiter
  inside a value is removed so it cannot split the field.
  """
  fields = []
  for item in values:
    fields.append(to_utf8(item).replace(delimiter, ''))
  return delimiter.join(fields)

def print_csv(rows, header):
  """Write rows to stdout as CSV.

  When header is true, the keys of the first row are written as a header
  line before that row. Nothing is written if rows is empty.
  """
  writer = csv.writer(sys.stdout)
  need_header = header
  for row in rows:
    if need_header:
      writer.writerow([to_utf8(name) for name in row.keys()])
      need_header = False

    writer.writerow([to_utf8(cell) for _, cell in row.iteritems()])

def print_tsv(rows, header, tsv_delimiter):
  """Print rows to stdout, one line per row, fields joined by tsv_delimiter.

  When header is true, the keys of the first row are printed as a header
  line before that row. Nothing is printed if rows is empty.
  """
  need_header = header
  for row in rows:
    if need_header:
      print(to_tsv(row.keys(), tsv_delimiter))
      need_header = False

    cells = [cell for _, cell in row.iteritems()]
    print(to_tsv(cells, tsv_delimiter))

def print_json(rows):
  """Print each row to stdout as its own JSON document, one per line."""
  for record in rows:
    encoded = json.dumps(record)
    print(encoded)

def table_to_printable_value(value):
  """Return value as a unicode string suitable for a table cell.

  None is shown as NULL. Anything else is converted with to_utf8, stripped of
  surrounding whitespace, and has code points 0-31 removed.
  """
  if value is None:
    return u"NULL"

  # Mapping a code point to None makes translate() delete that character.
  control_chars = dict.fromkeys(range(32))
  trimmed = to_utf8(value).strip()
  return trimmed.decode('utf-8').translate(control_chars)

def table_compute_string_width(v):
  """Return the number of terminal columns the unicode string v occupies.

  The string is NFC-normalized first. Format characters (category Cf) count
  as zero columns, fullwidth/wide East Asian characters as two, and every
  other character as one.
  """
  def columns(ch):
    if unicodedata.category(ch) == 'Cf':
      # Formatting control, zero width
      return 0
    if unicodedata.east_asian_width(ch) in ('F', 'W'):
      # Double-wide character, prints in two columns
      return 2
    return 1

  return sum(columns(ch) for ch in unicodedata.normalize('NFC', v))

def table_compute_column_widths(row_buffer):
  """Return, per column, the widest display width found in row_buffer.

  row_buffer is a sequence of rows, each a sequence of unicode cells.
  Returns None when row_buffer is empty.
  """
  widest = None
  for cells in row_buffer:
    cell_widths = [table_compute_string_width(cell) for cell in cells]
    if not widest:
      # First row seen so far: its widths are the starting point.
      widest = cell_widths
      continue
    for column, cell_width in enumerate(cell_widths):
      if cell_width > widest[column]:
        widest[column] = cell_width
  return widest

def table_print_row(values, column_widths, column_types):
  """Print one table row, padding every cell to its column width.

  Cells whose column type is 'n' are right-aligned; all others are
  left-aligned. column_types may be None, in which case everything is
  left-aligned.
  """
  bar = u'\u2502'.encode('utf-8')
  for index, cell in enumerate(values):
    gap = ' ' * max(0, column_widths[index] - table_compute_string_width(cell))
    right_align = column_types and column_types[index] == 'n'
    encoded = cell.encode('utf-8')
    if right_align:
      body = gap + encoded
    else:
      body = encoded + gap
    print(bar + ' ' + body + ' ', end="")
  print(bar)

def table_print_header(values, column_widths):
  """Print the top border, the header row and the separator under it.

  values holds the header cells; column_widths gives the content width of
  each column (borders add one space of padding on either side).
  """
  dash = u'\u2500'.encode('utf-8')
  segments = [dash * max(0, width + 2) for width in column_widths]

  # Line 1: top border
  top_left = u'\u250C'.encode('utf-8')
  top_joint = u'\u252C'.encode('utf-8')
  top_right = u'\u2510'.encode('utf-8')
  print(top_left + top_joint.join(segments) + top_right)

  # Line 2: header cells, always left-aligned
  table_print_row(values, column_widths, None)

  # Line 3: separator between header and data
  mid_left = u'\u251C'.encode('utf-8')
  mid_joint = u'\u253C'.encode('utf-8')
  mid_right = u'\u2524'.encode('utf-8')
  print(mid_left + mid_joint.join(segments) + mid_right)

def table_print_bottom(column_widths):
  """Print the bottom border of a table whose columns have the given content widths."""
  dash = u'\u2500'.encode('utf-8')
  joint = u'\u2534'.encode('utf-8')
  segments = [dash * max(0, width + 2) for width in column_widths]
  print(u'\u2514'.encode('utf-8') + joint.join(segments) + u'\u2518'.encode('utf-8'))

def table_print_row_buffer(row_buffer, column_widths, column_types):
  """Print buffered rows: the first entry as the table header, the rest as data rows."""
  for position, cells in enumerate(row_buffer):
    if position == 0:
      table_print_header(cells, column_widths)
      continue
    table_print_row(cells, column_widths, column_types)

def print_table(rows):
  """Print rows as a box-drawn table followed by a row count and elapsed time.

  Up to 500 rows are buffered before anything is printed so that column
  widths can be computed from them; later rows are printed as they arrive
  using those widths. Column alignment is decided from the first row: a
  column is numeric ('n') if its first value is a numbers.Number, otherwise
  it is a string column ('s').
  """
  started_at = time.time()
  buffer_limit = 500
  row_count = 0
  pending = []
  column_types = []
  column_widths = None

  for row in rows:
    row_count += 1

    if row_count == 1:
      # The header line travels through the buffer as its first entry.
      names = row.keys()
      pending.append([table_to_printable_value(name) for name in names])
      for name in names:
        column_types.append('n' if isinstance(row[name], numbers.Number) else 's')

    cells = [table_to_printable_value(cell) for _, cell in row.iteritems()]
    if row_count <= buffer_limit:
      pending.append(cells)
      continue

    # Past the buffer limit: flush what was held back (once), then stream.
    if pending:
      column_widths = table_compute_column_widths(pending)
      table_print_row_buffer(pending, column_widths, column_types)
      del pending[:]
    table_print_row(cells, column_widths, column_types)

  # Fewer rows than the buffer limit: everything is still waiting to be printed.
  if pending:
    column_widths = table_compute_column_widths(pending)
    table_print_row_buffer(pending, column_widths, column_types)

  if column_widths:
    table_print_bottom(column_widths)

  elapsed = time.time() - started_at
  plural = '' if row_count == 1 else 's'
  print("Retrieved {0:,d} row{1:s} in {2:.2f}s.".format(row_count, plural, elapsed))
  print("")

def display_query(url, sql, context, args):
  """Run sql and print the result in the format named by args.format.

  Recognized formats are 'csv', 'tsv', 'json' and 'table'; any other value
  prints nothing.
  """
  rows = do_query_with_args(url, sql, context, args)
  output_format = args.format

  if output_format == 'table':
    print_table(rows)
  elif output_format == 'json':
    print_json(rows)
  elif output_format == 'tsv':
    print_tsv(rows, args.header, args.tsv_delimiter)
  elif output_format == 'csv':
    print_csv(rows, args.header)

def sql_literal_escape(s):
  """Return s as a SQL string literal in U&'...' form.

  None becomes the empty literal ''. Letters, numbers and spaces are kept as
  they are; every other character is written as a backslash followed by its
  code point in at least four hex digits.
  """
  if s is None:
    return "''"

  text = s if isinstance(s, unicode) else str(s).decode('utf-8')

  def escape_char(ch):
    # Unicode categories starting with L (letter) or N (number) are safe as-is.
    if unicodedata.category(ch)[0] in ('L', 'N') or ch == ' ':
      return ch
    return u'\\' + ('%04x' % ord(ch))

  body = ''.join([escape_char(ch) for ch in text])
  return u"U&'" + body + "'"

def make_readline_completer(url, context, args):
  """Build the tab-completion callback for the interactive prompt.

  Also prints a "Connected to [host]." banner using args.host. The url and
  context parameters are accepted but not used.

  Returns:
    A function (text, state) for readline.set_completer. It offers statement
    keywords when the word being completed starts at column 0 and clause
    keywords otherwise, each followed by a space, and returns None once
    state runs past the last match.
  """
  starters = [
    'EXPLAIN PLAN FOR',
    'SELECT'
  ]

  middlers = [
    'FROM',
    'WHERE',
    'GROUP BY',
    'ORDER BY',
    'LIMIT'
  ]

  def readline_completer(text, state):
    candidates = starters if readline.get_begidx() == 0 else middlers
    prefix = text.upper()
    matches = [x for x in candidates if x.startswith(prefix)]

    # Signal "no more matches" with None instead of failing on None + " "
    # (or indexing past the end of the list).
    if 0 <= state < len(matches):
      return matches[state] + " "
    return None

  print("Connected to [" + args.host + "].")
  print("")

  return readline_completer

def main():
  """Parse command-line options, then run one query or start the interactive prompt.

  With --execute the given SQL is run once and its result printed. Otherwise
  an interactive loop reads statements terminated by ';' (or backslash
  commands) until \\q or end of input, keeping readline history in
  ~/.dsql_history.

  Raises:
    ValueError: if a --context-option value is not of the form key=value.
    DruidSqlException: from a failed query in --execute mode (in interactive
      mode it is caught and printed instead).
  """
  parser = argparse.ArgumentParser(description='Druid SQL command-line client.')
  parser_cnn = parser.add_argument_group('Connection options')
  parser_fmt = parser.add_argument_group('Formatting options')
  parser_oth = parser.add_argument_group('Other options')
  parser_cnn.add_argument('--host', '-H', type=str, default='http://localhost:8082/', help='Druid query host or url, like https://localhost:8282/')
  parser_cnn.add_argument('--user', '-u', type=str, help='HTTP basic authentication credentials, like user:password')
  parser_cnn.add_argument('--timeout', type=int, default=0, help='Timeout in seconds')
  parser_cnn.add_argument('--cafile', type=str, help='Path to SSL CA file for validating server certificates. See load_verify_locations() in https://docs.python.org/2/library/ssl.html#ssl.SSLContext.')
  parser_cnn.add_argument('--capath', type=str, help='SSL CA path for validating server certificates. See load_verify_locations() in https://docs.python.org/2/library/ssl.html#ssl.SSLContext.')
  parser_cnn.add_argument('--ignore-ssl-verification', '-k', action='store_true', default=False, help='Skip verification of SSL certificates.')
  parser_fmt.add_argument('--format', type=str, default='table', choices=('csv', 'tsv', 'json', 'table'), help='Result format')
  parser_fmt.add_argument('--header', action='store_true', help='Include header row for formats "csv" and "tsv"')
  parser_fmt.add_argument('--tsv-delimiter', type=str, default='\t', help='Delimiter for format "tsv"')
  parser_oth.add_argument('--context-option', '-c', type=str, action='append', help='Set context option for this connection, see https://docs.imply.io/on-prem/query-data/sql for options')
  parser_oth.add_argument('--execute', '-e', type=str, help='Execute single SQL query')
  args = parser.parse_args()

  # Build broker URL
  # The SQL endpoint path is appended to the host; http:// is assumed when no
  # http: or https: scheme was given.
  url = args.host.rstrip('/') + '/druid/v2/sql/'
  if not url.startswith('http:') and not url.startswith('https:'):
    url = 'http://' + url

  # Build context
  # Each option is split on the first '='. All-digit values are stored as
  # integers, everything else as strings.
  context = {}
  if args.context_option:
    for opt in args.context_option:
      kv = opt.split("=", 1)
      if len(kv) != 2:
        raise ValueError('Invalid context option, should be key=value: ' + opt)
      if re.match(r"^\d+$", kv[1]):
        context[kv[0]] = long(kv[1])
      else:
        context[kv[0]] = kv[1]

  if args.execute:
    display_query(url, args.execute, context, args)
  else:
    # interactive mode
    print("Welcome to dsql, the command-line client for Druid SQL.")

    readline_history_file = os.path.expanduser("~/.dsql_history")
    readline.parse_and_bind('tab: complete')
    readline.set_history_length(500)
    readline.set_completer(make_readline_completer(url, context, args))

    try:
      readline.read_history_file(readline_history_file)
    except IOError:
      # IOError can happen if the file doesn't exist.
      pass

    print("Type \"\\h\" for help.")

    # Outer loop: one iteration per executed statement.
    while True:
      sql = ''
      # Inner loop: accumulate input lines until the statement ends with ';'.
      # Backslash commands are only recognized on the first line of a statement.
      while not sql.endswith(';'):
        prompt = "dsql> " if sql == '' else 'more> '
        try:
          more_sql = raw_input(prompt)
        except EOFError:
          # End of input: finish the prompt line and exit with status 1.
          sys.stdout.write('\n')
          sys.exit(1)
        if sql == '' and more_sql.startswith('\\'):
          # backslash command
          # \d with optional 'S' (group 1), optional '+' (group 2) and an
          # optional table name (group 3).
          dmatch = re.match(r'^\\d(S?)(\+?)(\s+.*?|)\s*$', more_sql)
          if dmatch:
            include_system = dmatch.group(1)
            # NOTE: extra_info (the '+' flag) is captured but not used below.
            extra_info = dmatch.group(2)
            arg = dmatch.group(3).strip()
            if arg:
              # Describe one table: list its columns.
              sql = "SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = " + sql_literal_escape(arg)
              if not include_system:
                sql = sql + " AND TABLE_SCHEMA = 'druid'"
              # break to execute sql
              break
            else:
              # No table name: list tables.
              sql = "SELECT TABLE_SCHEMA, TABLE_NAME FROM INFORMATION_SCHEMA.TABLES"
              if not include_system:
                sql = sql + " WHERE TABLE_SCHEMA = 'druid'"
              # break to execute sql
              break

          hmatch = re.match(r'^\\h\s*$', more_sql)
          if hmatch:
            print("Commands:")
            print("  \\d             show tables")
            print("  \\dS            show tables, including system tables")
            print("  \\d table_name  describe table")
            print("  \\h             show this help")
            print("  \\q             exit this program")
            print("Or enter a SQL query ending with a semicolon (;).")
            continue

          qmatch = re.match(r'^\\q\s*$', more_sql)
          if qmatch:
            sys.exit(0)

          # Unrecognized backslash command; sql is still empty, so the prompt repeats.
          print("No such command: " + more_sql)
        else:
          sql = (sql + ' ' + more_sql).strip()

      try:
        # History is saved before the query runs.
        readline.write_history_file(readline_history_file)
        display_query(url, sql.rstrip(';'), context, args)
      except DruidSqlException as e:
        # Query errors are shown and the prompt continues.
        e.write_to(sys.stdout)
      except KeyboardInterrupt:
        # Ctrl-C while a query is running returns to the prompt instead of exiting.
        sys.stdout.write("Query interrupted\n")
        sys.stdout.flush()

# Script entry point: run the client and map expected failures to exit status 1.
try:
  main()
except KeyboardInterrupt:
  sys.exit(1)
except DruidSqlException as e:
  # Query failures are reported on stderr.
  e.write_to(sys.stderr)
  sys.exit(1)
except IOError as e:
  # A broken pipe (EPIPE) exits quietly; any other I/O error propagates.
  if e.errno != errno.EPIPE:
    raise
  sys.exit(1)