druid/distribution/bin/check-licenses.py

446 lines
20 KiB
Python
Executable File

#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import os
import sys
from html.parser import HTMLParser
import argparse
class DependencyReportParser(HTMLParser):
# This class parses the given html file to find all dependency reports under "Project dependencies"
# and "Projection transparent dependencies" sections.
# The parser works based on the state machine and its state is updated whenever it reads a new tag.
# The state changes as below:
#
# none -> h2_start -> project_dependencies_start -> h3_start -> compile_start -> table_start -> row_start -> th_start / td_start -> th_end / td_end -> row_end -> table_end -> compile_end -> h3_end -> project_dependencies_end -> h2_end -> none
attr_index = 0
group_id = None
artifact_id = None
version = None
classifier = None
dep_type = None
license = None
state = "none"
dep_to_license = None
compatible_license_names = None
include_classifier = False
druid_module_name = None
def __init__(self, druid_module_name, compatible_license_names):
HTMLParser.__init__(self)
self.state = "none"
self.druid_module_name = druid_module_name
self.compatible_license_names = compatible_license_names
def parse(self, f):
self.dep_to_license = {}
self.feed(f.read())
return self.dep_to_license
def handle_starttag(self, tag, attrs):
# print("current: {}, start tag: {}, attrs:{} ".format(self.state, tag, attrs))
if self.state == "none":
if tag == "h2":
self.state = "h2_start"
if self.state == "h2_start":
if tag == "a":
for attr in attrs:
if attr[0] == "name" and (attr[1] == "Project_Dependencies" or attr[1] == "Project_Transitive_Dependencies"):
self.state = "project_dependencies_start"
self.include_classifier = False
if self.state == "h2_end":
if tag == "h3":
self.state = "h3_start"
if self.state == "h3_start":
if tag == "a":
for attr in attrs:
if attr[0] == "name" and attr[1] == "compile":
self.state = "compile_start"
if self.state == "h3_end":
if tag == "table":
self.state = "table_start"
if self.state == "table_start":
if tag == "tr":
self.state = "row_start"
self.clear_attr()
if self.state == "row_end":
if tag == "tr":
self.state = "row_start"
self.clear_attr()
if self.state == "row_start":
if tag == "td":
self.state = "td_start"
elif tag == "th":
self.state = "th_start"
if self.state == "th_end":
if tag == "th":
self.state = "th_start"
if self.state == "td_end":
if tag == "td":
self.state = "td_start"
def handle_endtag(self, tag):
# print("current: {}, end tag: {}".format(self.state, tag))
if self.state == "project_dependencies_start":
if tag == "a":
self.state = "project_dependencies_end"
if self.state == "h2_start":
if tag == "h2":
self.state = "h2_end"
if self.state == "project_dependencies_end":
if tag == "h2":
self.state = "h2_end"
if self.state == "compile_start":
if tag == "a":
self.state = "compile_end"
if self.state == "compile_end":
if tag == "h3":
self.state = "h3_end"
if self.state == "table_start":
if tag == "table":
self.state = "none"
if self.state == "td_start":
if tag == "td":
self.state = "td_end"
self.attr_index = self.attr_index + 1
if self.state == "th_start":
if tag == "th":
self.state = "th_end"
if self.state == "row_start":
if tag == "tr":
self.state = "row_end"
if self.state == "th_end":
if tag == "tr":
self.state = "row_end"
if self.state == "td_end":
if tag == "tr":
self.state = "row_end"
# print(json.dumps({"groupId": self.group_id, "artifactId": self.artifact_id, "version": self.version, "classifier": self.classifier, "type": self.dep_type, "license": self.license}))
if self.group_id.find("org.apache.druid") < 0:
self.dep_to_license[get_dep_key(self.group_id, self.artifact_id, self.version)] = (self.license, self.druid_module_name)
if self.state == "row_end":
if tag == "table":
self.state = "none"
def handle_data(self, data):
if self.state == "td_start":
self.set_attr(data)
elif self.state == "th_start":
if data.lower() == "classifier":
self.include_classifier = True
def clear_attr(self):
self.group_id = None
self.artifact_id = None
self.version = None
self.classifier = None
self.dep_type = None
self.license = None
self.attr_index = 0
def set_attr(self, data):
#print("set data: {}".format(data))
if self.attr_index == 0:
self.group_id = data
elif self.attr_index == 1:
self.artifact_id = data
elif self.attr_index == 2:
self.version = get_version_string(data)
elif self.attr_index == 3:
if self.include_classifier:
self.classifier = data
else:
self.dep_type = data
elif self.attr_index == 4:
if self.include_classifier:
self.dep_type = data
else:
self.set_license(data)
elif self.attr_index == 5:
if self.include_classifier:
self.set_license(data)
else:
raise Exception("Unknown attr_index [{}]".format(self.attr_index))
else:
raise Exception("Unknown attr_index [{}]".format(self.attr_index))
def set_license(self, data):
if data.upper().find("GPL") < 0:
if self.license != 'Apache License version 2.0':
self.license = self.compatible_license_names[data]
def print_log_to_stderr(string):
print(string, file=sys.stderr)
def build_compatible_license_names():
compatible_licenses = {}
compatible_licenses['Apache License, Version 2.0'] = 'Apache License version 2.0'
compatible_licenses['The Apache Software License, Version 2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache 2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache-2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache 2'] = 'Apache License version 2.0'
compatible_licenses['Apache License 2'] = 'Apache License version 2.0'
compatible_licenses['Apache License 2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache Software License - Version 2.0'] = 'Apache License version 2.0'
compatible_licenses['The Apache License, Version 2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache License version 2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache License Version 2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache License Version 2'] = 'Apache License version 2.0'
compatible_licenses['Apache License v2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache License, 2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache License, version 2.0'] = 'Apache License version 2.0'
compatible_licenses['Apache 2.0 License'] = 'Apache License version 2.0'
compatible_licenses['Apache License, 2.0'] = 'Apache License version 2.0'
compatible_licenses['Public Domain'] = 'Public Domain'
compatible_licenses['BSD-2-Clause License'] = 'BSD-2-Clause License'
compatible_licenses['BSD-2-Clause'] = 'BSD-2-Clause License'
compatible_licenses['BSD 2-Clause license'] = 'BSD-2-Clause License'
compatible_licenses['BSD 2-Clause License'] = 'BSD-2-Clause License'
compatible_licenses['BSD-3-Clause License'] = 'BSD-3-Clause License'
compatible_licenses['New BSD license'] = 'BSD-3-Clause License'
compatible_licenses['BSD'] = 'BSD-3-Clause License'
compatible_licenses['The BSD License'] = 'BSD-3-Clause License'
compatible_licenses['BSD licence'] = 'BSD-3-Clause License'
compatible_licenses['BSD License'] = 'BSD-3-Clause License'
compatible_licenses['BSD-like'] = 'BSD-3-Clause License'
compatible_licenses['BSD 3-clause'] = 'BSD-3-Clause License'
compatible_licenses['The BSD 3-Clause License'] = 'BSD-3-Clause License'
compatible_licenses['Revised BSD'] = 'BSD-3-Clause License'
compatible_licenses['New BSD License'] = 'BSD-3-Clause License'
compatible_licenses['3-Clause BSD License'] = 'BSD-3-Clause License'
compatible_licenses['BSD 3-Clause'] = 'BSD-3-Clause License'
compatible_licenses['BSD-3-Clause'] = 'BSD-3-Clause License'
compatible_licenses['Unicode/ICU License'] = 'Unicode/ICU License'
compatible_licenses['SIL Open Font License 1.1'] = 'SIL Open Font License 1.1'
compatible_licenses['CDDL 1.1'] = 'CDDL 1.1'
compatible_licenses['CDDL/GPLv2+CE'] = 'CDDL 1.1'
compatible_licenses['CDDL + GPLv2 with classpath exception'] = 'CDDL 1.1'
compatible_licenses['CDDL License'] = 'CDDL 1.1'
compatible_licenses['COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0'] = 'CDDL 1.0'
compatible_licenses['Eclipse Public License 1.0'] = 'Eclipse Public License 1.0'
compatible_licenses['The Eclipse Public License, Version 1.0'] = 'Eclipse Public License 1.0'
compatible_licenses['Eclipse Public License - Version 1.0'] = 'Eclipse Public License 1.0'
compatible_licenses['Eclipse Public License, Version 1.0'] = 'Eclipse Public License 1.0'
compatible_licenses['Eclipse Public License v1.0'] = 'Eclipse Public License 1.0'
compatible_licenses['Eclipse Public License - v1.0'] = 'Eclipse Public License 1.0'
compatible_licenses['Eclipse Public License - v 1.0'] = 'Eclipse Public License 1.0'
compatible_licenses['EPL 1.0'] = 'Eclipse Public License 1.0'
compatible_licenses['Eclipse Public License 2.0'] = 'Eclipse Public License 2.0'
compatible_licenses['The Eclipse Public License, Version 2.0'] = 'Eclipse Public License 2.0'
compatible_licenses['Eclipse Public License - Version 2.0'] = 'Eclipse Public License 2.0'
compatible_licenses['Eclipse Public License, Version 2.0'] = 'Eclipse Public License 2.0'
compatible_licenses['Eclipse Public License v2.0'] = 'Eclipse Public License 2.0'
compatible_licenses['EPL 2.0'] = 'Eclipse Public License 2.0'
compatible_licenses['Eclipse Distribution License 1.0'] = 'Eclipse Distribution License 1.0'
compatible_licenses['Eclipse Distribution License - v 1.0'] = 'Eclipse Distribution License 1.0'
compatible_licenses['Eclipse Distribution License v. 1.0'] = 'Eclipse Distribution License 1.0'
compatible_licenses['EDL 1.0'] = 'Eclipse Distribution License 1.0'
compatible_licenses['Mozilla Public License Version 2.0'] = 'Mozilla Public License Version 2.0'
compatible_licenses['Mozilla Public License, Version 2.0'] = 'Mozilla Public License Version 2.0'
compatible_licenses['Creative Commons Attribution 2.5'] = 'Creative Commons Attribution 2.5'
compatible_licenses['Creative Commons CC0'] = 'Creative Commons CC0'
compatible_licenses['CC0'] = 'Creative Commons CC0'
compatible_licenses['Public Domain, per Creative Commons CC0'] = 'Creative Commons CC0'
compatible_licenses['The MIT License'] = 'MIT License'
compatible_licenses['MIT License'] = 'MIT License'
compatible_licenses['The MIT License (MIT)'] = 'MIT License'
compatible_licenses['Bouncy Castle Licence'] = 'MIT License'
compatible_licenses['SPDX-License-Identifier: MIT'] = 'MIT License'
compatible_licenses['The Go license'] = 'The Go license'
compatible_licenses['-'] = '-'
return compatible_licenses
def get_dep_key(group_id, artifact_id, version):
return (group_id, artifact_id, version)
def get_version_string(version):
if type(version) == str:
return version
else:
return str(version)
def find_druid_module_name(dirpath):
ext_start = dirpath.find("/ext/")
if ext_start > 0:
# Found an extension
subpath = dirpath[(len("/ext/") + ext_start):]
ext_name_end = subpath.find("/")
if ext_name_end < 0:
raise Exception("Can't determine extension name from [{}]".format(dirpath))
else:
return subpath[0:ext_name_end]
else:
# Druid core
return "core"
def check_licenses(license_yaml, dependency_reports_root):
# Build a dictionary to facilitate comparing reported licenses and registered ones.
# These dictionaries are the mapping of (group_id, artifact_id, version) to license_name.
# Build reported license dictionary.
reported_dep_to_licenses = {}
compatible_license_names = build_compatible_license_names()
for dirpath, dirnames, filenames in os.walk(dependency_reports_root):
for filename in filenames:
if filename == "dependencies.html":
full_path = os.path.join(dirpath, filename)
# Determine if it's druid core or an extension
druid_module_name = find_druid_module_name(dirpath)
print_log_to_stderr("Parsing {}".format(full_path))
with open(full_path, encoding="utf-8") as report_file:
parser = DependencyReportParser(druid_module_name, compatible_license_names)
reported_dep_to_licenses.update(parser.parse(report_file))
if len(reported_dep_to_licenses) == 0:
raise Exception("No dependency reports are found")
print_log_to_stderr("Found {} reported licenses\n".format(len(reported_dep_to_licenses)))
# Build registered license dictionary.
registered_dep_to_licenses = {}
skipping_licenses = {}
with open(license_yaml, encoding='utf-8') as registry_file:
licenses_list = list(yaml.load_all(registry_file, Loader=yaml.FullLoader))
for license in licenses_list:
if 'libraries' in license:
for library in license['libraries']:
if type(library) is not dict:
raise Exception("Expected dict but got {}[{}]".format(type(library), library))
if len(library) > 1:
raise Exception("Expected 1 groupId and artifactId, but got [{}]".format(library))
for group_id, artifact_id in library.items():
if 'version' not in license:
raise Exception("version is missing in {}".format(license))
if 'license_name' not in license:
raise Exception("name is missing in {}".format(license))
if 'skip_dependency_report_check' in license and license['skip_dependency_report_check']:
if 'version' not in license:
version = "-"
else:
version = get_version_string(license['version'])
skipping_licenses[get_dep_key(group_id, artifact_id, version)] = license
else:
registered_dep_to_licenses[get_dep_key(group_id, artifact_id, get_version_string(license['version']))] = compatible_license_names[license['license_name']]
if len(registered_dep_to_licenses) == 0:
raise Exception("No registered licenses are found")
# Compare licenses in registry and those in dependency reports.
mismatched_licenses = []
missing_licenses = []
unchecked_licenses = []
# Iterate through registered licenses and check if its license is same with the reported one.
for key, registered_license in registered_dep_to_licenses.items():
if key in reported_dep_to_licenses: # key is (group_id, artifact_id, version)
reported_license_druid_module = reported_dep_to_licenses[key]
reported_license = reported_license_druid_module[0]
druid_module = reported_license_druid_module[1]
if reported_license is not None and reported_license != "-" and reported_license != registered_license:
group_id = key[0]
artifact_id = key[1]
version = key[2]
mismatched_licenses.append((druid_module, group_id, artifact_id, version, reported_license, registered_license))
# If we find any mismatched license, stop immediately.
if len(mismatched_licenses) > 0:
print_log_to_stderr("Error: found {} mismatches between reported licenses and registered licenses".format(len(mismatched_licenses)))
for mismatched_license in mismatched_licenses:
print_log_to_stderr("druid_module: {}, groupId: {}, artifactId: {}, version: {}, reported_license: {}, registered_license: {}".format(mismatched_license[0], mismatched_license[1], mismatched_license[2], mismatched_license[3], mismatched_license[4], mismatched_license[5]))
print_log_to_stderr("")
# Let's find missing licenses, which are reported but missing in the registry.
for key, reported_license_druid_module in reported_dep_to_licenses.items():
if reported_license_druid_module[0] != "-" and key not in registered_dep_to_licenses and key not in skipping_licenses:
missing_licenses.append((reported_license_druid_module[1], key[0], key[1], key[2], reported_license_druid_module[0]))
if len(missing_licenses) > 0:
print_log_to_stderr("Error: found {} missing licenses. These licenses are reported, but missing in the registry".format(len(missing_licenses)))
for missing_license in missing_licenses:
print_log_to_stderr("druid_module: {}, groupId: {}, artifactId: {}, version: {}, license: {}".format(missing_license[0], missing_license[1], missing_license[2], missing_license[3], missing_license[4]))
print_log_to_stderr("")
# Let's find unchecked licenses, which are registered but missing in the report.
# These licenses should be checked manually.
for key, registered_license in registered_dep_to_licenses.items():
if key not in reported_dep_to_licenses:
unchecked_licenses.append((key[0], key[1], key[2], registered_license))
elif reported_dep_to_licenses[key][0] == "-":
unchecked_licenses.append((key[0], key[1], key[2], registered_license))
if len(unchecked_licenses) > 0:
print_log_to_stderr("Warn: found {} unchecked licenses. These licenses are registered, but not found in dependency reports.".format(len(unchecked_licenses)))
print_log_to_stderr("These licenses must be checked manually.")
for unchecked_license in unchecked_licenses:
print_log_to_stderr("groupId: {}, artifactId: {}, version: {}, reported_license: {}".format(unchecked_license[0], unchecked_license[1], unchecked_license[2], unchecked_license[3]))
print_log_to_stderr("")
if len(mismatched_licenses) > 0 or len(missing_licenses) > 0:
sys.exit(1)
if __name__ == "__main__":
try:
parser = argparse.ArgumentParser(description='Check and generate license file.')
parser.add_argument('license_yaml', metavar='<path to license.yaml>', type=str)
parser.add_argument('dependency_reports_root', metavar='<root to maven dependency reports>', type=str)
args = parser.parse_args()
license_yaml = args.license_yaml
dependency_reports_root = args.dependency_reports_root
check_licenses(license_yaml, dependency_reports_root)
except KeyboardInterrupt:
print('Interrupted, closing.')