# python-peps/pep-0465/scan-ops.py

#!/usr/bin/env python3
# https://peps.python.org/pep-0465/
# https://gist.github.com/njsmith/9157645

# usage:
#   python3 scan-ops.py stdlib_path sklearn_path nipy_path

import sys
import os
import os.path
import tokenize
from collections import OrderedDict

# Token types that never make a line count as a source line (sloc).
# ENDMARKER belongs here too: tokenize reports it on the line *after* the
# last line of the file, so leaving it out adds one phantom source line to
# every file (and makes an empty file count as one line).
NON_SOURCE_TOKENS = [
    tokenize.COMMENT, tokenize.NL, tokenize.ENCODING, tokenize.NEWLINE,
    tokenize.INDENT, tokenize.DEDENT, tokenize.ENDMARKER,
    ]

# Operator strings whose rows are left out of the summary table.
SKIP_OPS = list("(),.:[]{}@;") + ["->", "..."]

class TokenCounts(object):
    """Running tally of operator tokens and source lines over many files.

    Attributes:
        counts: dict mapping an operator string to how often it was seen.
            The extra key ``"dot"`` tallies tokens whose text is listed in
            ``dot_names``.
        sloc: number of lines holding at least one token whose type is not
            in ``NON_SOURCE_TOKENS``, summed over all counted files.
        dot_names: token strings to tally under the ``"dot"`` key.
    """

    def __init__(self, dot_names=None):
        """Start an empty tally.

        Args:
            dot_names: optional list of token strings to count as "dot".
                Defaults to a fresh empty list for every instance, so
                instances never share (and cannot pollute) one default.
        """
        self.counts = {}
        self.sloc = 0
        self.dot_names = [] if dot_names is None else dot_names

    def count(self, path):
        """Tokenize the file at ``path`` and add its tokens to the tally.

        Errors from ``open`` or ``tokenize`` propagate to the caller.  If
        tokenizing fails part-way, the tokens seen so far stay in ``counts``
        but the file's lines are not added to ``sloc``.
        """
        sloc_idxes = set()
        # The file is opened in binary mode because tokenize.tokenize()
        # detects the encoding itself; ``with`` closes it even on error.
        with open(path, "rb") as source:
            for token in tokenize.tokenize(source.readline):
                if token.type == tokenize.OP:
                    self.counts[token.string] = (
                        self.counts.get(token.string, 0) + 1)
                if token.string in self.dot_names:
                    self.counts["dot"] = self.counts.get("dot", 0) + 1
                if token.type not in NON_SOURCE_TOKENS:
                    # Only the line a token starts on is recorded.
                    sloc_idxes.add(token.start[0])
        self.sloc += len(sloc_idxes)

    @classmethod
    def combine(cls, objs):
        """Return a new tally holding the summed counts and sloc of ``objs``.

        The result is built with ``cls()``, so its ``dot_names`` is empty.
        """
        combined = cls()
        for obj in objs:
            for op, count in obj.counts.items():
                combined.counts[op] = combined.counts.get(op, 0) + count
            combined.sloc += obj.sloc
        return combined

def count_tree(root, **kwargs):
    """Tally every ``.py`` file found under ``root``.

    ``kwargs`` are passed straight to the ``TokenCounts`` constructor.  A
    dot is written to stderr for each file counted; a file that raises is
    reported on stderr and skipped.  Returns the ``TokenCounts`` tally.
    """
    tally = TokenCounts(**kwargs)
    python_files = (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root)
        for name in names
        if name.endswith(".py")
    )
    for path in python_files:
        try:
            tally.count(path)
            # Progress marker, flushed so it shows up immediately.
            sys.stderr.write(".")
            sys.stderr.flush()
        except Exception as err:
            # Best effort: one unreadable file must not stop the scan.
            sys.stderr.write("\nFailed to read %s: %s\n" % (path, err))
    return tally

# count_objs is OrderedDict (name -> TokenCounts)
def summarize(count_objs, out):
    """Write a table of operator usage rates to ``out``.

    Args:
        count_objs: mapping of column title to an object with ``counts``
            (token string -> occurrences) and ``sloc`` attributes.  Its
            iteration order gives the column order.
        out: object with a ``write(str)`` method that receives the table.

    One row is written per op seen in any column, except ops listed in
    ``SKIP_OPS``.  Each cell is the number of uses per 10,000 sloc, rounded
    to an integer.  Rows are ordered by the last column, largest first.
    """
    ops = {}
    for count_obj in count_objs.values():
        for op in count_obj.counts:
            ops[op] = []
    for count_obj in count_objs.values():
        sloc = count_obj.sloc
        for op, row in ops.items():
            count = count_obj.counts.get(op, 0)
            # A column with no source lines (e.g. a tree without any .py
            # files) has a rate of zero rather than dividing by zero.
            row.append(count / sloc if sloc else 0.0)
    titles = ["Op"] + list(count_objs)
    # 4 chars is enough for ops and all numbers.
    column_widths = [max(len(title), 4) for title in titles]

    rows = [[op] + row for op, row in ops.items()]

    # Ascending stable sort followed by a reversal: largest last-column
    # value first, with equal values in reverse insertion order.
    rows.sort(key=lambda row: row[-1])
    rows.reverse()

    def write_row(entries):
        out.write("  ".join(entries))
        out.write("\n")

    def lines():
        # Horizontal rule made of "=" runs, one per column.
        write_row("=" * w for w in column_widths)

    lines()
    write_row(t.rjust(w) for w, t in zip(column_widths, titles))
    lines()
    for row in rows:
        op = row[0]
        if op in SKIP_OPS:
            continue
        # numbers here are avg number of uses per sloc, which is
        # inconveniently small. convert to uses/1e4 sloc
        numbers = row[1:]
        number_strs = [str(int(round(x * 10000))) for x in numbers]
        formatted_row = [op] + number_strs
        write_row(str(e).rjust(w)
                  for w, e in zip(column_widths, formatted_row))
    lines()

def run_projects(names, dot_names, dirs, out):
    """Count each project tree, then write the sloc totals and summary table.

    Args:
        names: project titles, used as table column headings.
        dot_names: one list per project of token strings to count as "dot".
        dirs: one root directory per project.
        out: object with a ``write(str)`` method that receives all output.

    Raises:
        ValueError: if the three sequences differ in length.  An explicit
            check is used instead of ``assert`` so it still runs under
            ``python -O``, where ``zip`` would otherwise silently drop the
            unmatched projects.
    """
    if not (len(names) == len(dot_names) == len(dirs)):
        raise ValueError(
            "names, dot_names and dirs must have the same length; "
            "got %d, %d and %d" % (len(names), len(dot_names), len(dirs)))
    count_objs = OrderedDict()
    for name, dot_name, root in zip(names, dot_names, dirs):
        counts = count_tree(root, dot_names=dot_name)
        count_objs[name] = counts
        out.write("%s: %s sloc\n" % (name, counts.sloc))
    # Extra final column holding the sum over all projects.
    count_objs["combined"] = TokenCounts.combine(count_objs.values())
    summarize(count_objs, out)

if __name__ == "__main__":
    # One entry per project, matched by position with the tree paths given
    # on the command line.
    project_names = ["stdlib", "scikit-learn", "nipy"]

    # https://github.com/numpy/numpy/pull/4351#discussion_r9977913
    # sklearn's fast_dot works around optimizations that are missing from
    # older numpy versions, but with a modern numpy it is exactly the same
    # as dot, so it is fair to count. safe_sparse_dot has hacks to work
    # around some quirks in scipy.sparse matrices, but those quirks are
    # already fixed too, so counting these calls is also fair.
    sklearn_dot_names = ["dot", "fast_dot", "safe_sparse_dot"]

    dot_name_lists = [[], sklearn_dot_names, ["dot"]]
    run_projects(project_names, dot_name_lists, sys.argv[1:], sys.stdout)