OpenSearch/dev-tools/upgrade-tests.py

# Licensed to Elasticsearch under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance  with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on
# an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import random
import os
import tempfile
import shutil
import subprocess
import time
import argparse
import logging
import sys
import re

from datetime import datetime
try:
  from elasticsearch import Elasticsearch
  from elasticsearch.exceptions import ConnectionError
  from elasticsearch.exceptions import TransportError
except ImportError as e:
  print('Can\'t import elasticsearch please install `sudo pip install elasticsearch`')
  raise e


'''This file executes a basic upgrade test by running a full cluster restart.

The upgrade test starts 2 or more nodes of an old elasticsearch version, indexes
a random number of documents into the running nodes and executes a full cluster restart.
After the nodes are recovered a small set of basic checks are executed to ensure all
documents are still searchable and field data can be loaded etc.

NOTE: This script requires the elasticsearch python client `elasticsearch-py` run the following command to install:

  `sudo pip install elasticsearch`

if you are running python3 you need to install the client using pip3. On OSX `pip3` will be included in the Python 3.4
release available on `https://www.python.org/download/`:

  `sudo pip3 install elasticsearch`

See `https://github.com/elasticsearch/elasticsearch-py` for details

In order to run this test two different version of elasticsearch are required. Both need to be unpacked into
the same directory:

```
   $ cd /path/to/elasticsearch/clone
   $ mkdir backwards && cd backwards
   $ wget  https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-1.3.1.tar.gz
   $ wget  https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-0.90.13.tar.gz
   $ tar -zxvf elasticsearch-1.3.1.tar.gz && tar -zxvf elasticsearch-0.90.13.tar.gz
   $ cd ..
   $ python dev-tools/upgrade-tests.py --version.backwards 0.90.13 --version.current 1.3.1
```
'''

BLACK_LIST = {'1.2.0' : { 'reason': 'Contains a major bug where routing hashes are not consistent with previous version',
                          'issue': 'https://github.com/elasticsearch/elasticsearch/pull/6393'},
              '1.3.0' : { 'reason': 'Lucene Related bug prevents upgrades from 0.90.7 and some earlier versions ',
                          'issue' : 'https://github.com/elasticsearch/elasticsearch/pull/7055'}}
# sometimes returns True
def rarely():
  return random.randint(0, 10) == 0

# usually returns True
def frequently():
  return not rarely()

# asserts the correctness of the given hits given they are sorted asc
def assert_sort(hits):
  values = [hit['sort'] for hit in hits['hits']['hits']]
  assert len(values) > 0, 'expected non emtpy result'
  val = min(values)
  for x in values:
    assert x >= val, '%s >= %s' % (x, val)
    val = x

# asserts that the cluster health didn't timeout etc.
def assert_health(cluster_health, num_shards, num_replicas):
  assert cluster_health['timed_out'] == False, 'cluster health timed out %s' % cluster_health


# Starts a new elasticsearch node from a released & untared version.
# This node uses unicast discovery with the provided unicast host list and starts
# the nodes with the given data directory. This allows shutting down and starting up
# nodes on the same data dir simulating a full cluster restart.
def start_node(version, data_dir, node_dir, unicast_host_list, tcp_port, http_port):
  es_run_path = os.path.join(node_dir, 'elasticsearch-%s' % (version), 'bin/elasticsearch')
  if version.startswith('0.90.'):
    foreground = '-f' # 0.90.x starts in background automatically
  else:
    foreground = ''
  return subprocess.Popen([es_run_path,
    '-Des.path.data=%s' % data_dir, '-Des.cluster.name=upgrade_test',
    '-Des.discovery.zen.ping.unicast.hosts=%s' % unicast_host_list,
    '-Des.transport.tcp.port=%s' % tcp_port,
    '-Des.http.port=%s' % http_port,
    foreground], stdout=subprocess.PIPE, stderr=subprocess.PIPE)


# Indexes the given number of document into the given index
# and randomly runs refresh, optimize and flush commands
def index_documents(es, index_name, type, num_docs):
  logging.info('Indexing %s docs' % num_docs)
  for id in range(0, num_docs):
    es.index(index=index_name, doc_type=type, id=id, body={'string': str(random.randint(0, 100)),
                                                           'long_sort': random.randint(0, 100),
                                                           'double_sort' : float(random.randint(0, 100))})
    if rarely():
      es.indices.refresh(index=index_name)
    if rarely():
      es.indices.flush(index=index_name, force=frequently())
  if rarely():
      es.indices.optimize(index=index_name)
  es.indices.refresh(index=index_name)

# Runs a basic number of assertions including:
#  - document counts
#  - match all search with sort on double / long
#  - Realtime GET operations
# TODO(simonw): we should add stuff like:
#  - dates including sorting
#  - string sorting
#  - docvalues if available
#  - global ordinal if available
def run_basic_asserts(es, index_name, type, num_docs):
  count = es.count(index=index_name)['count']
  assert count == num_docs, 'Expected %r but got %r documents' % (num_docs, count)
  for _ in range(0, num_docs):
    random_doc_id = random.randint(0, num_docs-1)
    doc = es.get(index=index_name, doc_type=type, id=random_doc_id)
    assert doc, 'Expected document for id %s but got %s' % (random_doc_id, doc)

  assert_sort(es.search(index=index_name,
                  body={
                    'sort': [
                      {'double_sort': {'order': 'asc'}}
                    ]
                  }))

  assert_sort(es.search(index=index_name,
                  body={
                    'sort': [
                      {'long_sort': {'order': 'asc'}}
                    ]
                  }))


# picks a random version or and entire random version tuple from the directory
# to run the backwards tests against.
def pick_random_upgrade_version(directory, lower_version=None, upper_version=None):
  if lower_version and upper_version:
    return lower_version, upper_version
  assert os.path.isdir(directory), 'No such directory %s' % directory
  versions = []
  for version in map(lambda x : x[len('elasticsearch-'):], filter(lambda x : re.match(r'^elasticsearch-\d+[.]\d+[.]\d+$', x), os.listdir(directory))):
    if not version in BLACK_LIST:
      versions.append(build_tuple(version))
  versions.sort()

  if lower_version: # lower version is set - picking a higher one
    versions = filter(lambda x : x > build_tuple(lower_version), versions)
    assert len(versions) >= 1, 'Expected at least 1 higher version than %s version in %s ' % (lower_version, directory)
    random.shuffle(versions)
    return lower_version, build_version(versions[0])
  if upper_version:
    versions = filter(lambda x : x < build_tuple(upper_version), versions)
    assert len(versions) >= 1, 'Expected at least 1 lower version than %s version in %s ' % (upper_version, directory)
    random.shuffle(versions)
    return build_version(versions[0]), upper_version
  assert len(versions) >= 2, 'Expected at least 2 different version in %s but found %s' % (directory, len(versions))
  random.shuffle(versions)
  versions = versions[0:2]
  versions.sort()
  return build_version(versions[0]), build_version(versions[1])

def build_version(version_tuple):
  return '.'.join([str(x) for x in version_tuple])

def build_tuple(version_string):
  return [int(x) for x in version_string.split('.')]

# returns a new elasticsearch client and ensures the all nodes have joined the cluster
# this method waits at most 30 seconds for all nodes to join
def new_es_instance(num_nodes, http_port, timeout = 30):
  logging.info('Waiting for %s nodes to join the cluster' % num_nodes)
  for _ in range(0, timeout):
    # TODO(simonw): ask Honza if there is a better way to do this?
    try:
      es = Elasticsearch([
      {'host': '127.0.0.1', 'port': http_port + x}
        for x in range(0, num_nodes)])
      es.cluster.health(wait_for_nodes=num_nodes)
      es.count() # can we actually search or do we get a 503? -- anyway retry
      return es
    except (ConnectionError, TransportError):
      pass
    time.sleep(1)
  assert False, 'Timed out waiting for %s nodes for %s seconds' % (num_nodes, timeout)

def assert_versions(bwc_version, current_version, node_dir):
  assert [int(x) for x in bwc_version.split('.')] < [int(x) for x in current_version.split('.')],\
      '[%s] must be < than [%s]' % (bwc_version, current_version)
  for version in [bwc_version, current_version]:
    assert not version in BLACK_LIST, 'Version %s is blacklisted - %s, see %s' \
                                          % (version, BLACK_LIST[version]['reason'],
                                             BLACK_LIST[version]['issue'])
    dir = os.path.join(node_dir, 'elasticsearch-%s' % current_version)
    assert os.path.isdir(dir), 'Expected elasticsearch-%s install directory does not exists: %s' % (version, dir)

def full_cluster_restart(node_dir, current_version, bwc_version, tcp_port, http_port):
  assert_versions(bwc_version, current_version, node_dir)
  num_nodes = random.randint(2, 3)
  nodes = []
  data_dir = tempfile.mkdtemp()
  logging.info('Running upgrade test from [%s] to [%s] seed: [%s] es.path.data: [%s] es.http.port [%s] es.tcp.port [%s]'
        % (bwc_version, current_version, seed, data_dir, http_port, tcp_port))
  try:
    logging.info('Starting %s BWC nodes of version %s' % (num_nodes, bwc_version))
    unicast_addresses = ','.join(['127.0.0.1:%s' % (tcp_port+x) for x in range(0, num_nodes)])
    for id in range(0, num_nodes):
      nodes.append(start_node(bwc_version, data_dir, node_dir, unicast_addresses, tcp_port+id, http_port+id))
    es = new_es_instance(num_nodes, http_port)
    es.indices.delete(index='test_index', ignore=404)
    num_shards = random.randint(1, 10)
    num_replicas = random.randint(0, 1)
    logging.info('Create index with [%s] shards and [%s] replicas' % (num_shards, num_replicas))
    es.indices.create(index='test_index', body={
        # TODO(simonw): can we do more here in terms of randomization - seems hard due to all the different version
        'settings': {
            'number_of_shards': num_shards,
            'number_of_replicas': num_replicas
        }
    })
    logging.info('Nodes joined, waiting for green status')
    health = es.cluster.health(wait_for_status='green', wait_for_relocating_shards=0)
    assert_health(health, num_shards, num_replicas)
    num_docs = random.randint(10, 100)
    index_documents(es, 'test_index', 'test_type', num_docs)
    logging.info('Run basic asserts before full cluster restart')
    run_basic_asserts(es, 'test_index', 'test_type', num_docs)
    logging.info('kill bwc nodes -- prepare upgrade')
    for node in nodes:
      node.terminate()

    # now upgrade the nodes and rerun the checks
    tcp_port = tcp_port + len(nodes) # bump up port to make sure we can claim them
    http_port = http_port + len(nodes)
    logging.info('Full Cluster restart starts upgrading to version [elasticsearch-%s] es.http.port [%s] es.tcp.port [%s]'
                 % (current_version, http_port, tcp_port))
    nodes = []
    unicast_addresses = ','.join(['127.0.0.1:%s' % (tcp_port+x) for x in range(0, num_nodes)])
    for id in range(0, num_nodes+1): # one more to trigger relocation
      nodes.append(start_node(current_version, data_dir, node_dir, unicast_addresses, tcp_port+id, http_port+id))
    es = new_es_instance(num_nodes+1, http_port)
    logging.info('Nodes joined, waiting for green status')
    health = es.cluster.health(wait_for_status='green', wait_for_relocating_shards=0)
    assert_health(health, num_shards, num_replicas)
    run_basic_asserts(es, 'test_index', 'test_type', num_docs)
    # by running the indexing again we try to catch possible mapping problems after the upgrade
    index_documents(es, 'test_index', 'test_type', num_docs)
    run_basic_asserts(es, 'test_index', 'test_type', num_docs)
    logging.info("[SUCCESS] - all test passed upgrading from version [%s] to version [%s]" % (bwc_version, current_version))
  finally:
    for node in nodes:
      node.terminate()
    time.sleep(1) # wait a second until removing the data dirs to give the nodes a chance to shutdown
    shutil.rmtree(data_dir) # remove the temp data dir

if __name__ == '__main__':
  logging.basicConfig(format='[%(levelname)s] [%(asctime)s] %(message)s', level=logging.INFO,
                      datefmt='%Y-%m-%d %I:%M:%S %p')
  logging.getLogger('elasticsearch').setLevel(logging.ERROR)
  logging.getLogger('urllib3').setLevel(logging.WARN)
  parser = argparse.ArgumentParser(description='Tests Full Cluster Restarts across major version')
  parser.add_argument('--version.backwards', '-b', dest='backwards_version', metavar='V',
                      help='The elasticsearch version to upgrade from')
  parser.add_argument('--version.current', '-c', dest='current_version', metavar='V',
                      help='The elasticsearch version to upgrade to')
  parser.add_argument('--seed', '-s', dest='seed', metavar='N', type=int,
                      help='The random seed to use')
  parser.add_argument('--backwards.dir', '-d', dest='bwc_directory', default='backwards', metavar='dir',
                      help='The directory to the backwards compatibility sources')

  parser.add_argument('--tcp.port', '-p', dest='tcp_port', default=9300, metavar='port', type=int,
                      help='The port to use as the minimum port for TCP communication')
  parser.add_argument('--http.port', '-t', dest='http_port', default=9200, metavar='port', type=int,
                      help='The port to use as the minimum port for HTTP communication')

  parser.set_defaults(bwc_directory='backwards')
  parser.set_defaults(seed=int(time.time()))
  args = parser.parse_args()
  node_dir = args.bwc_directory
  current_version = args.current_version
  bwc_version = args.backwards_version
  seed = args.seed
  random.seed(seed)
  bwc_version, current_version = pick_random_upgrade_version(node_dir, bwc_version, current_version)
  tcp_port = args.tcp_port
  http_port = args.http_port
  try:
    full_cluster_restart(node_dir, current_version, bwc_version, tcp_port, http_port)
  except:
    logging.warn('REPRODUCE WITH: \n\t`python %s --version.backwards %s --version.current %s --seed %s --tcp.port %s --http.port %s`'
                   % (sys.argv[0], bwc_version, current_version, seed, tcp_port, http_port))
    raise