Auto-generate a complete NOTICE file.

Remove the hand-collated ones, and switch to a script that pulls the copyright headers out of every file and collects the unique ones. Change-Id: Ied3b98b3f56241df97166c410ff81de4e0157c9d
2012-08-09 15:17:46 -07:00
parent 709bb0f79c
commit 387d4b7de9
9 changed files with 5126 additions and 831 deletions
--- a/libc/tools/generate-NOTICE.py
+++ b/libc/tools/generate-NOTICE.py
@@ -0,0 +1,147 @@
+#!/usr/bin/python
+# Run with directory arguments from any directory, with no special setup required.
+
+import ftplib
+import hashlib
+import os
+import re
+import shutil
+import string
+import subprocess
+import sys
+import tarfile
+import tempfile
+
+def IsUninteresting(path):
+    path = path.lower()
+    if path.endswith(".mk") or path.endswith(".py") or path.endswith(".pyc") or path.endswith(".txt") or path.endswith(".3"):
+        return True
+    if path.endswith("/notice") or path.endswith("/readme") or path.endswith("/caveats"):
+        return True
+    if path.endswith("/zoneinfo.dat") or path.endswith("/zoneinfo.idx") or path.endswith("/zoneinfo.version") or path.endswith("/zoneinfo/generate"):
+        return True
+    return False
+
+def IsAutoGenerated(content):
+    if "generated by gensyscalls.py" in content or "generated by genserv.py" in content:
+        return True
+    if "This header was automatically generated from a Linux kernel header" in content:
+        return True
+    return False
+
+copyrights = set()
+
+def ExtractCopyrightAt(lines, i):
+    hash = lines[i].startswith("#")
+
+    # Read comment lines until we hit something that terminates a
+    # copyright header.
+    start = i
+    while i < len(lines):
+        if "*/" in lines[i]:
+            break
+        if hash and len(lines[i]) == 0:
+            break
+        if "\t@(#)" in lines[i] or "\tfrom: @(#)" in lines[i] or "From: @(#)" in lines[i] or "from OpenBSD:" in lines[i]:
+            break
+        if "\tcitrus Id: " in lines[i]:
+            break
+        if "\t$OpenBSD: " in lines[i] or " $FreeBSD: " in lines[i] or "\t$NetBSD: " in lines[i]:
+            break
+        if "$FreeBSD$" in lines[i] or "$Citrus$" in lines[i]:
+            break
+        i += 1
+
+    end = i
+
+    # Trim trailing cruft.
+    while end > 0:
+        if lines[end - 1] != " *" and lines[end - 1] != " * ====================================================":
+            break
+        end -= 1
+
+    # Remove C/assembler comment formatting, pulling out just the text.
+    clean_lines = []
+    for line in lines[start:end]:
+        line = line.replace("\t", "    ")
+        line = line.replace("/* ", "")
+        line = line.replace(" * ", "")
+        line = line.replace("** ", "")
+        line = line.replace("# ", "")
+        if line.startswith("++Copyright++"):
+            continue
+        line = line.replace("--Copyright--", "")
+        line = line.rstrip()
+        # These come last and take care of "blank" comment lines.
+        if line == "#" or line == " *" or line == "**" or line == "-":
+            line = ""
+        clean_lines.append(line)
+
+    # Trim blank lines from head and tail.
+    while clean_lines[0] == "":
+        clean_lines = clean_lines[1:]
+    while clean_lines[len(clean_lines) - 1] == "":
+        clean_lines = clean_lines[0:(len(clean_lines) - 1)]
+
+    copyright = "\n".join(clean_lines)
+    copyrights.add(copyright)
+
+    return i
+
+args = sys.argv[1:]
+if len(args) == 0:
+    args = [ "." ]
+
+for arg in args:
+    sys.stderr.write('Searching for source files in "%s"...\n' % arg)
+
+    for directory, sub_directories, filenames in os.walk(arg):
+        if ".git" in sub_directories:
+            sub_directories.remove(".git")
+        sub_directories = sorted(sub_directories)
+
+        for filename in sorted(filenames):
+            path = os.path.join(directory, filename)
+            if IsUninteresting(path):
+                #print "ignoring uninteresting file %s" % path
+                continue
+
+            try:
+                content = open(path, 'r').read().decode('utf-8')
+            except:
+                # TODO: update hash.h, md5.c, and md5.h; upstream is probably UTF-8 already.
+                sys.stderr.write('warning: bad UTF-8 in %s\n' % path)
+                content = open(path, 'r').read().decode('iso-8859-1')
+
+            lines = content.split("\n")
+
+            if len(lines) <= 4:
+                #print "ignoring short file %s" % path
+                continue
+
+            if IsAutoGenerated(content):
+                #print "ignoring auto-generated file %s" % path
+                continue
+
+            if not "Copyright" in content:
+                if "public domain" in content.lower():
+                    #print "ignoring public domain file %s" % path
+                    continue
+                sys.stderr.write('warning: no copyright notice found in "%s" (%d lines)\n' % (path, len(lines)))
+                continue
+
+            i = 0
+            while i < len(lines):
+                if "Copyright" in lines[i]:
+                    i = ExtractCopyrightAt(lines, i)
+                i += 1
+
+            #print path
+
+for copyright in copyrights:
+    print copyright.encode('utf-8')
+    print
+    print '-------------------------------------------------------------------'
+    print
+
+sys.exit(0)