# gem5/util/find_copyrights.py

#!/usr/bin/env python

import os
import re
import sys

from file_types import lang_type, find_files

# Editor mode line such as "-*- mode:python -*-".
mode_line = re.compile(r'(-\*- *mode:.* *-\*-)')
# A line whose first non-blank character is '#'.
shell_comment = re.compile(r'^\s*#')
# Any ';' on the line (not anchored to the start of the line).
lisp_comment = re.compile(r';')
# Any '//' on the line.
cpp_comment = re.compile(r'//')
c_comment_start = re.compile(r'/\*')
c_comment_end   = re.compile(r'\*/')


def _line_comment_blocks(lines, comment_re, allow_shebang):
    """Yield (start, end) for leading runs of line-comment lines.

    A run is a maximal sequence of consecutive lines matching comment_re.
    Scanning stops at the first non-blank, non-comment line that is seen
    while no run is open.  Line 0 is skipped when it is an editor mode
    line, or (if allow_shebang is true) when it starts with '#!'.
    """
    start = None
    for i, line in enumerate(lines):
        if i == 0 and ((allow_shebang and line.startswith('#!')) or
                       mode_line.search(line)):
            continue

        if comment_re.search(line):
            if start is None:
                start = i
        elif start is not None:
            # First non-comment line after a run closes the run.
            yield start, i - 1
            start = None
        elif line.strip():
            # Real content outside of any comment run: header is over.
            return

    # The file ended while a comment run was still open.
    if start is not None:
        yield start, len(lines) - 1


def _c_style_blocks(lines):
    """Yield (start, end) for leading '/* */' and '//' comment blocks.

    mode is None outside a comment, 'C' inside a multi-line '/* */'
    comment and 'CPP' inside a run of '//' lines.  Within a '//' run a
    line whose text starts with "Copyright" begins a new block.
    """
    start = None
    mode = None
    for i, line in enumerate(lines):
        if i == 0 and mode_line.search(line):
            continue

        if mode == 'C':
            assert start is not None, 'on line %d' % (i + 1)
            if c_comment_end.search(line):
                yield start, i
                mode = None
            continue

        cpp_match = cpp_comment.search(line)
        c_match = c_comment_start.search(line)

        if cpp_match:
            assert not c_match, 'on line %d' % (i + 1)
            if line[:cpp_match.start()].strip():
                # Code precedes the '//': the header is over.  Emit the
                # '//' run collected so far instead of dropping it.
                if mode == 'CPP':
                    yield start, i - 1
                return
            if mode is None:
                mode = 'CPP'
                start = i
            else:
                text = line[cpp_match.end():].lstrip()
                if text.startswith("Copyright"):
                    # A new notice inside the same '//' run.
                    yield start, i - 1
                    start = i
            continue
        elif mode == 'CPP':
            assert start is not None, 'on line %d' % (i + 1)
            if not line.strip():
                # Blank lines do not end a '//' run.
                continue
            yield start, i - 1
            mode = None
            if not c_match:
                return

        if c_match:
            assert mode is None, 'on line %d' % (i + 1)
            start = i
            if c_comment_end.search(line, c_match.end()):
                # Comment opens and closes on this line; do not enter
                # 'C' mode, which would swallow the following lines.
                yield start, i
                continue
            mode = 'C'

        if mode is None and line.strip():
            return

    # The file ended inside a '//' run.
    if mode == 'CPP':
        yield start, len(lines) - 1


def find_copyright_block(lines, lang_type):
    """Generate the comment blocks found at the top of a source file.

    lines     -- sequence of the file's lines
    lang_type -- language name selecting the comment syntax

    Yields (start, end) pairs of 0-based, inclusive indices into lines.
    Raises AttributeError (on first iteration) for an unknown lang_type.
    """
    if lang_type in ('python', 'make', 'shell', 'perl', 'scons'):
        blocks = _line_comment_blocks(lines, shell_comment, True)
    elif lang_type in ('lisp', ):
        blocks = _line_comment_blocks(lines, lisp_comment, False)
    elif lang_type in ('C', 'C++', 'swig', 'isa', 'asm', 'slicc',
                       'lex', 'yacc'):
        blocks = _c_style_blocks(lines)
    else:
        raise AttributeError("Could not handle language %s" % lang_type)

    for block in blocks:
        yield block

# Two four-digit years separated by '-', e.g. "2001-2005" or "2001 - 2005".
date_range_re = re.compile(r'([0-9]{4})\s*-\s*([0-9]{4})')

def process_dates(dates):
    """Expand a comma-separated year list into a set of ints.

    dates -- string such as "2001, 2003-2005"

    Each comma-separated item is either a "YYYY-YYYY" range (expanded
    inclusively) or a single integer.  Items that are neither are
    ignored.  Returns a set of ints.
    """
    years = set()
    for date in dates.split(','):
        date = date.strip()
        match = date_range_re.match(date)
        if match:
            first, last = [ int(y) for y in match.groups() ]
            years.update(range(first, last + 1))
            continue

        try:
            years.add(int(date))
        except ValueError:
            # Deliberately best-effort: skip anything that is not a year.
            pass

    return years

# "Copyright (c) <dates> <owner>".  Groups: the "(c)" marker, the raw
# date list, and the owner name.  Comment characters and whitespace
# between the dates and the owner are skipped.  The letter class is
# spelled A-Za-z because the range A-z would also admit [ \ ] ^ _ and `.
copyright_re = \
    re.compile(r'Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-Za-z,. -]+)',
               re.DOTALL)

# First author line: optional comment prefix, "Authors:", then a name.
authors_re = re.compile(r'^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$')
# Continuation author line: optional comment prefix, then only a name.
more_authors_re = re.compile(r'^[\s*#/]*([A-Za-z .]+)\s*$')

# Every owner name seen by get_data() so far.
all_owners = set()
def get_data(lang_type, lines):
    """Extract copyright information from the header comments of a file.

    lang_type -- language name passed through to find_copyright_block()
    lines     -- sequence of the file's lines

    Returns a list of (owner, dates, authors, start, end) tuples, one per
    comment block that contains a copyright notice.  dates is the set
    returned by process_dates(), authors is a list of name strings, and
    start/end are inclusive indices into lines.  Each owner is also added
    to the module-level all_owners set.
    """
    data = []
    for start,end in find_copyright_block(lines, lang_type):
        joined = ''.join(lines[start:end+1])
        match = copyright_re.search(joined)
        if not match:
            continue

        _, dates, owner = match.groups()
        dates = dates.strip()
        owner = owner.strip()

        all_owners.add(owner)
        try:
            dates = process_dates(dates)
        except Exception:
            # Show what was being parsed, then let the error propagate.
            print(dates)
            print(owner)
            raise

        authors = []
        # The range is fixed here, so shrinking `end` below only affects
        # the value stored in the result, not this loop's extent.
        for i in range(start, end+1):
            line = lines[i]
            if not authors:
                # Still looking for the "Authors:" line.
                match = authors_re.search(line)
                if match:
                    authors.append(match.group(1).strip())
            else:
                match = more_authors_re.search(line)
                if not match:
                    # The author list is over.  Trim `end` to the first
                    # blank line, or to just before the first '//' line
                    # that carries text, whichever comes first.
                    for j in range(i, end+1):
                        line = lines[j].strip()
                        if not line:
                            end = j
                            break
                        if line.startswith('//'):
                            line = line[2:].lstrip()
                            if line:
                                end = j - 1
                                break
                    break
                authors.append(match.group(1).strip())

        info = (owner, dates, authors, start, end)
        data.append(info)

    return data

def datestr(dates):
    """Format a collection of years as a compact comma-separated string.

    dates -- iterable of ints (duplicates are ignored)

    Consecutive years are collapsed into "first-last" ranges, so
    {2001, 2002, 2003, 2005} becomes "2001-2003,2005".  Returns an empty
    string when dates is empty.
    """
    years = sorted(set(dates))
    if not years:
        return ''

    output = []
    def add_output(first, second):
        if first == second:
            output.append('%d' % (first))
        else:
            output.append('%d-%d' % (first, second))

    # [first, second] is the run of consecutive years being built.
    first = second = years[0]
    for year in years[1:]:
        if year == second + 1:
            second = year
        else:
            add_output(first, second)
            first = second = year

    add_output(first, second)

    return ','.join(output)

# Lists every option accepted by the getopt spec ("ci:v") in the main
# script, and reflects that several files or directories may be given.
usage_str = """usage:
%s [-c] [-i <name>] [-v] <file-or-directory>..."""

def usage(exitcode):
    """Print the usage message; exit with exitcode unless it is None."""
    print(usage_str % sys.argv[0])
    if exitcode is not None:
        sys.exit(exitcode)

if __name__ == '__main__':
    # Scan the given files/directories and print one merged
    # "Copyright (c) <years> <owner>" line per distinct owner.
    import getopt

    show_counts = False
    # NOTE(review): values given with -i are collected here but nothing
    # below consults this set -- confirm the intended meaning of -i.
    ignore = set()
    verbose = False
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ci:v")
    except getopt.GetoptError:
        usage(1)

    for o,a in opts:
        if o == '-c':
            show_counts = True
        if o == '-i':
            ignore.add(a)
        if o == '-v':
            verbose = True

    files = []

    for base in args:
        if os.path.isfile(base):
            files += [ (base, lang_type(base)) ]
        elif os.path.isdir(base):
            files += find_files(base)
        else:
            raise AttributeError("can't access '%s'" % base)

    copyrights = {}     # owner -> set of years
    counts = {}         # owner -> number of copyright blocks seen

    for filename, lang in files:
        # Close the file promptly; many files may be scanned.
        with open(filename, 'r') as f:
            lines = f.readlines()
        if not lines:
            continue

        lines = [ line.rstrip('\r\n') for line in lines ]

        # The language is determined again here, this time with the
        # first line of the file available to lang_type().
        lt = lang_type(filename, lines[0])
        try:
            data = get_data(lt, lines)
        except Exception as e:
            # Unparseable files are skipped; only reported with -v.
            if verbose:
                if len(e.args) == 1:
                    e.args = ('%s (%s)' % (e, filename), )
                print("could not parse %s: %s" % (filename, e))
            continue

        for owner, dates, authors, start, end in data:
            if owner not in copyrights:
                copyrights[owner] = set()
            if owner not in counts:
                counts[owner] = 0

            copyrights[owner] |= dates
            counts[owner] += 1

    info = [ (counts[o], o, d) for o,d in copyrights.items() ]

    # Most frequent owner first.  Ties are broken by owner name only, so
    # the sets of years are never compared (set ordering is a subset
    # test and does not give a stable sort).
    info.sort(key=lambda entry: (entry[0], entry[1]), reverse=True)

    for count,owner,dates in info:
        if show_counts:
            owner = '%s (%s files)' % (owner, count)
        print('Copyright (c) %s %s' % (datestr(dates), owner))