Adding upstream version 1.4.13.

Signed-off-by: Daniel Baumann <daniel@debian.org>

parent afaf4643e1
commit 03367abfa8

25 changed files with 7987 additions and 0 deletions
identify/__init__.py (new file, 0 lines)
identify/cli.py (new file, 36 lines)
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals

import argparse
import json

from identify import identify


def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename-only', action='store_true')
    parser.add_argument('path')
    args = parser.parse_args(argv)

    if args.filename_only:
        func = identify.tags_from_filename
    else:
        func = identify.tags_from_path

    try:
        tags = sorted(func(args.path))
    except ValueError as e:
        print(e)
        return 1

    if not tags:
        return 1
    else:
        print(json.dumps(tags))
        return 0


if __name__ == '__main__':
    exit(main())
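A quick sketch, not part of the upstream diff, of how this entry point behaves; 'setup.py' is only an illustrative argument, and the printed tags assume such a file exists and is not executable:

    from identify import cli

    # Prints a JSON list such as ["file", "non-executable", "python", "text"]
    # and returns 0; a missing path or an empty tag set returns 1.
    cli.main(['setup.py'])

    # --filename-only classifies by name alone, without touching the filesystem.
    cli.main(['--filename-only', 'setup.py'])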
identify/extensions.py (new file, 224 lines)
@@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals


EXTENSIONS = {
    'apinotes': {'text', 'apinotes'},
    'asar': {'binary', 'asar'},
    'bash': {'text', 'shell', 'bash'},
    'bat': {'text', 'batch'},
    'bmp': {'binary', 'image', 'bitmap'},
    'bz2': {'binary', 'bzip2'},
    'c': {'text', 'c'},
    'cc': {'text', 'c++'},
    'cu': {'text', 'cuda'},
    'cfg': {'text'},
    'cmake': {'text', 'cmake'},
    'cnf': {'text'},
    'coffee': {'text', 'coffee'},
    'conf': {'text'},
    'cpp': {'text', 'c++'},
    'crt': {'text', 'pem'},
    'cs': {'text', 'c#'},
    'cson': {'text', 'cson'},
    'css': {'text', 'css'},
    'csv': {'text', 'csv'},
    'cxx': {'text', 'c++'},
    'dart': {'text', 'dart'},
    'def': {'text', 'def'},
    'dtd': {'text', 'dtd'},
    'ear': {'binary', 'zip', 'jar'},
    'ejs': {'text', 'ejs'},
    'eot': {'binary', 'eot'},
    'eps': {'binary', 'eps'},
    'erb': {'text', 'erb'},
    'exe': {'binary'},
    'eyaml': {'text', 'yaml'},
    'feature': {'text', 'gherkin'},
    'fish': {'text', 'fish'},
    'gemspec': {'text', 'ruby'},
    'gif': {'binary', 'image', 'gif'},
    'go': {'text', 'go'},
    'gotmpl': {'text', 'gotmpl'},
    'gpx': {'text', 'gpx', 'xml'},
    'gradle': {'text', 'groovy'},
    'groovy': {'text', 'groovy'},
    'gyb': {'text', 'gyb'},
    'gyp': {'text', 'gyp', 'python'},
    'gypi': {'text', 'gyp', 'python'},
    'gz': {'binary', 'gzip'},
    'h': {'text', 'header', 'c', 'c++'},
    'hpp': {'text', 'header', 'c++'},
    'htm': {'text', 'html'},
    'html': {'text', 'html'},
    'hxx': {'text', 'header', 'c++'},
    'icns': {'binary', 'icns'},
    'ico': {'binary', 'icon'},
    'ics': {'text', 'icalendar'},
    'idl': {'text', 'idl'},
    'idr': {'text', 'idris'},
    'inc': {'text', 'inc'},
    'ini': {'text', 'ini'},
    'j2': {'text', 'jinja'},
    'jade': {'text', 'jade'},
    'jar': {'binary', 'zip', 'jar'},
    'java': {'text', 'java'},
    'jenkinsfile': {'text', 'groovy'},
    'jinja': {'text', 'jinja'},
    'jinja2': {'text', 'jinja'},
    'jpeg': {'binary', 'image', 'jpeg'},
    'jpg': {'binary', 'image', 'jpeg'},
    'js': {'text', 'javascript'},
    'json': {'text', 'json'},
    'jsonnet': {'text', 'jsonnet'},
    'jsx': {'text', 'jsx'},
    'key': {'text', 'pem'},
    'kml': {'text', 'kml', 'xml'},
    'kt': {'text', 'kotlin'},
    'less': {'text', 'less'},
    'lidr': {'text', 'idris'},
    'lua': {'text', 'lua'},
    'm': {'text', 'c', 'objective-c'},
    'manifest': {'text', 'manifest'},
    'map': {'text', 'map'},
    'markdown': {'text', 'markdown'},
    'md': {'text', 'markdown'},
    'mib': {'text', 'mib'},
    'mk': {'text', 'makefile'},
    'mm': {'text', 'c++', 'objective-c++'},
    'modulemap': {'text', 'modulemap'},
    'ngdoc': {'text', 'ngdoc'},
    'nim': {'text', 'nim'},
    'nims': {'text', 'nim'},
    'nimble': {'text', 'nimble'},
    'nix': {'text', 'nix'},
    'otf': {'binary', 'otf'},
    'p12': {'binary', 'p12'},
    'patch': {'text', 'diff'},
    'pdf': {'binary', 'pdf'},
    'pem': {'text', 'pem'},
    'php': {'text', 'php'},
    'php4': {'text', 'php'},
    'php5': {'text', 'php'},
    'phtml': {'text', 'php'},
    'pl': {'text', 'perl'},
    'plantuml': {'text', 'plantuml'},
    'pm': {'text', 'perl'},
    'png': {'binary', 'image', 'png'},
    'po': {'text', 'pofile'},
    'pp': {'text', 'puppet'},
    'properties': {'text', 'java-properties'},
    'proto': {'text', 'proto'},
    'puml': {'text', 'plantuml'},
    'purs': {'text', 'purescript'},
    'py': {'text', 'python'},
    'pyi': {'text', 'pyi'},
    'pyx': {'text', 'cython'},
    'pxd': {'text', 'cython'},
    'pxi': {'text', 'cython'},
    'r': {'text', 'r'},
    'rb': {'text', 'ruby'},
    'rs': {'text', 'rust'},
    'rst': {'text', 'rst'},
    's': {'text', 'asm'},
    'sbt': {'text', 'sbt', 'scala'},
    'sc': {'text', 'scala'},
    'scala': {'text', 'scala'},
    'scss': {'text', 'scss'},
    'scm': {'text', 'scheme'},
    'sh': {'text', 'shell'},
    'sls': {'text', 'salt'},
    'so': {'binary'},
    'sol': {'text', 'solidity'},
    'spec': {'text', 'spec'},
    'ss': {'text', 'scheme'},
    'styl': {'text', 'stylus'},
    'sql': {'text', 'sql'},
    'svg': {'text', 'image', 'svg'},
    'swf': {'binary', 'swf'},
    'swift': {'text', 'swift'},
    'swiftdeps': {'text', 'swiftdeps'},
    'tac': {'text', 'twisted', 'python'},
    'tar': {'binary', 'tar'},
    'tgz': {'binary', 'gzip'},
    'thrift': {'text', 'thrift'},
    'tiff': {'binary', 'image', 'tiff'},
    'toml': {'text', 'toml'},
    'tf': {'text', 'terraform'},
    'ts': {'text', 'ts'},
    'tsx': {'text', 'tsx'},
    'ttf': {'binary', 'ttf'},
    'txt': {'text', 'plain-text'},
    'vdx': {'text', 'vdx'},
    'vim': {'text', 'vim'},
    'vue': {'text', 'vue'},
    'war': {'binary', 'zip', 'jar'},
    'wav': {'binary', 'audio', 'wav'},
    'wkt': {'text', 'wkt'},
    'whl': {'binary', 'wheel', 'zip'},
    'woff': {'binary', 'woff'},
    'woff2': {'binary', 'woff2'},
    'wsgi': {'text', 'wsgi', 'python'},
    'xml': {'text', 'xml'},
    'xq': {'text', 'xquery'},
    'xql': {'text', 'xquery'},
    'xqm': {'text', 'xquery'},
    'xqu': {'text', 'xquery'},
    'xquery': {'text', 'xquery'},
    'xqy': {'text', 'xquery'},
    'xsd': {'text', 'xml', 'xsd'},
    'xsl': {'text', 'xml', 'xsl'},
    'yaml': {'text', 'yaml'},
    'yang': {'text', 'yang'},
    'yin': {'text', 'xml', 'yin'},
    'yml': {'text', 'yaml'},
    'zig': {'text', 'zig'},
    'zip': {'binary', 'zip'},
    'zsh': {'text', 'shell', 'zsh'},
}
EXTENSIONS_NEED_BINARY_CHECK = {
    'plist': {'plist'},
}

NAMES = {
    '.babelrc': EXTENSIONS['json'] | {'babelrc'},
    '.bashrc': EXTENSIONS['bash'],
    '.bash_aliases': EXTENSIONS['bash'],
    '.bash_profile': EXTENSIONS['bash'],
    '.bowerrc': EXTENSIONS['json'] | {'bowerrc'},
    '.coveragerc': EXTENSIONS['ini'] | {'coveragerc'},
    '.dockerignore': {'text', 'dockerignore'},
    '.editorconfig': {'text', 'editorconfig'},
    '.gitconfig': EXTENSIONS['ini'] | {'gitconfig'},
    '.hgrc': EXTENSIONS['ini'] | {'hgrc'},
    '.gitattributes': {'text', 'gitattributes'},
    '.gitignore': {'text', 'gitignore'},
    '.gitmodules': {'text', 'gitmodules'},
    '.jshintrc': EXTENSIONS['json'] | {'jshintrc'},
    '.mailmap': {'text', 'mailmap'},
    '.mention-bot': EXTENSIONS['json'] | {'mention-bot'},
    '.npmignore': {'text', 'npmignore'},
    '.pdbrc': EXTENSIONS['py'] | {'pdbrc'},
    '.pypirc': EXTENSIONS['ini'] | {'pypirc'},
    '.yamllint': EXTENSIONS['yaml'] | {'yamllint'},
    '.zshrc': EXTENSIONS['zsh'],
    'AUTHORS': EXTENSIONS['txt'],
    'BUILD.bazel': {'text', 'bazel'},
    'BUILD': {'text', 'bazel'},
    'CMakeLists.txt': EXTENSIONS['cmake'],
    'COPYING': EXTENSIONS['txt'],
    'Dockerfile': {'text', 'dockerfile'},
    'Gemfile': EXTENSIONS['rb'],
    'Jenkinsfile': {'text', 'groovy'},
    'LICENSE': EXTENSIONS['txt'],
    'MAINTAINERS': EXTENSIONS['txt'],
    'Makefile': EXTENSIONS['mk'],
    'NOTICE': EXTENSIONS['txt'],
    'PATENTS': EXTENSIONS['txt'],
    'Pipfile': EXTENSIONS['toml'],
    'Pipfile.lock': EXTENSIONS['json'],
    'README': EXTENSIONS['txt'],
    'Rakefile': EXTENSIONS['rb'],
    'setup.cfg': EXTENSIONS['ini'],
}
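As an aside, not part of the diff: NAMES composes entries out of EXTENSIONS with plain set union, so a recognized filename carries both its format tags and its own specific tag. A minimal illustration:

    from identify import extensions

    # '.babelrc' inherits the JSON tags plus its own marker.
    assert extensions.NAMES['.babelrc'] == {'text', 'json', 'babelrc'}

    # Extension keys are bare suffixes; identify.py lowercases the
    # extension and strips the leading dot before looking it up here.
    assert extensions.EXTENSIONS['py'] == {'text', 'python'}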
identify/identify.py (new file, 230 lines)
@@ -0,0 +1,230 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import io
import os.path
import re
import shlex
import string
import sys

from identify import extensions
from identify import interpreters
from identify.vendor import licenses


printable = frozenset(string.printable)

DIRECTORY = 'directory'
SYMLINK = 'symlink'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'

ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
ALL_TAGS.update(*extensions.EXTENSIONS.values())
ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
ALL_TAGS.update(*extensions.NAMES.values())
ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(ALL_TAGS)


def tags_from_path(path):
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if os.path.isdir(path):
        return {DIRECTORY}
    if os.path.islink(path):
        return {SYMLINK}

    tags = {FILE}

    executable = os.access(path, os.X_OK)
    if executable:
        tags.add(EXECUTABLE)
    else:
        tags.add(NON_EXECUTABLE)

    # As an optimization, if we're able to read tags from the filename, then we
    # don't peek at the file contents.
    t = tags_from_filename(os.path.basename(path))
    if len(t) > 0:
        tags.update(t)
    else:
        if executable:
            shebang = parse_shebang_from_file(path)
            if len(shebang) > 0:
                tags.update(tags_from_interpreter(shebang[0]))

    # some extensions can be both binary and text
    # see EXTENSIONS_NEED_BINARY_CHECK
    if not {TEXT, BINARY} & tags:
        if file_is_text(path):
            tags.add(TEXT)
        else:
            tags.add(BINARY)

    assert {TEXT, BINARY} & tags, tags
    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
    return tags


def tags_from_filename(filename):
    _, filename = os.path.split(filename)
    _, ext = os.path.splitext(filename)

    ret = set()

    # Allow e.g. "Dockerfile.xenial" to match "Dockerfile"
    for part in [filename] + filename.split('.'):
        if part in extensions.NAMES:
            ret.update(extensions.NAMES[part])
            break

    if len(ext) > 0:
        ext = ext[1:].lower()
        if ext in extensions.EXTENSIONS:
            ret.update(extensions.EXTENSIONS[ext])
        elif ext in extensions.EXTENSIONS_NEED_BINARY_CHECK:
            ret.update(extensions.EXTENSIONS_NEED_BINARY_CHECK[ext])

    return ret


def tags_from_interpreter(interpreter):
    _, _, interpreter = interpreter.rpartition('/')

    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
    while interpreter:
        if interpreter in interpreters.INTERPRETERS:
            return interpreters.INTERPRETERS[interpreter]
        else:
            interpreter, _, _ = interpreter.rpartition('.')

    return set()


def is_text(bytesio):
    """Return whether the first KB of contents seems to be binary.

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
    """
    text_chars = (
        bytearray([7, 8, 9, 10, 11, 12, 13, 27]) +
        bytearray(range(0x20, 0x7F)) +
        bytearray(range(0x80, 0X100))
    )
    return not bool(bytesio.read(1024).translate(None, text_chars))


def file_is_text(path):
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    with open(path, 'rb') as f:
        return is_text(f)


def _shebang_split(line):
    try:
        # shebangs aren't supposed to be quoted, though some tools such as
        # setuptools will write them with quotes so we'll best-guess parse
        # with shlex first
        return shlex.split(line)
    except ValueError:
        # failing that, we'll do a more "traditional" shebang parsing which
        # just involves splitting by whitespace
        return line.split()


def parse_shebang(bytesio):
    """Parse the shebang from a file opened for reading binary."""
    if bytesio.read(2) != b'#!':
        return ()
    first_line = bytesio.readline()
    try:
        first_line = first_line.decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    for c in first_line:
        if c not in printable:
            return ()

    cmd = tuple(_shebang_split(first_line.strip()))
    if cmd and cmd[0] == '/usr/bin/env':
        cmd = cmd[1:]
    return cmd


def parse_shebang_from_file(path):
    """Parse the shebang given a file path."""
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if not os.access(path, os.X_OK):
        return ()

    with open(path, 'rb') as f:
        return parse_shebang(f)


COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')


def _norm_license(s):
    s = COPYRIGHT_RE.sub('', s)
    s = WS_RE.sub(' ', s)
    return s.strip()


def license_id(filename):
    """Return the spdx id for the license contained in `filename`. If no
    license is detected, returns `None`.

    spdx: https://spdx.org/licenses/
    licenses from choosealicense.com: https://github.com/choosealicense.com

    Approximate algorithm:

    1. strip copyright line
    2. normalize whitespace (replace all whitespace with a single space)
    3. check exact text match with existing licenses
    4. failing that use edit distance
    """
    import editdistance  # `pip install identify[license]`

    with io.open(filename, encoding='UTF-8') as f:
        contents = f.read()

    norm = _norm_license(contents)

    min_edit_dist = sys.maxsize
    min_edit_dist_spdx = ''

    # try exact matches
    for spdx, text in licenses.LICENSES:
        norm_license = _norm_license(text)
        if norm == norm_license:
            return spdx

        # skip the slow calculation if the lengths are very different
        if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
            continue

        edit_dist = editdistance.eval(norm, norm_license)
        if edit_dist < min_edit_dist:
            min_edit_dist = edit_dist
            min_edit_dist_spdx = spdx

    # if there's less than 5% edited from the license, we found our match
    if norm and min_edit_dist / len(norm) < .05:
        return min_edit_dist_spdx
    else:
        # no matches :'(
        return None
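A brief usage sketch, not part of the diff, tying the functions above together; the example paths are illustrative and the results assume they exist with the obvious contents:

    import io
    from identify import identify

    # Name-based classification needs no filesystem access.
    identify.tags_from_filename('Dockerfile.dev')  # {'text', 'dockerfile'}

    # Path-based classification adds directory/symlink/file plus
    # executable/non-executable tags, and falls back to the shebang or a
    # binary-vs-text sniff of the first 1KB when the name is unknown.
    identify.tags_from_path('/usr/bin/env')  # e.g. {'file', 'executable', 'binary'}

    # Shebang parsing works on any file object opened in binary mode.
    identify.parse_shebang(io.BytesIO(b'#!/usr/bin/env python3\n'))  # ('python3',)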
identify/interpreters.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals

INTERPRETERS = {
    'bash': {'shell', 'bash'},
    'dash': {'shell', 'dash'},
    'node': {'javascript'},
    'nodejs': {'javascript'},
    'perl': {'perl'},
    'python': {'python'},
    'python2': {'python', 'python2'},
    'python3': {'python', 'python3'},
    'ruby': {'ruby'},
    'sh': {'shell', 'sh'},
    'tcsh': {'shell', 'tcsh'},
    'zsh': {'shell', 'zsh'},
}
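For context, not part of the diff: tags_from_interpreter() in identify.py strips any directory prefix and then drops trailing version components until a key in this table matches, so versioned shebangs still resolve:

    from identify import identify

    identify.tags_from_interpreter('/usr/bin/python3.6')  # {'python', 'python3'}
    identify.tags_from_interpreter('nodejs')              # {'javascript'}
    identify.tags_from_interpreter('not-a-known-interpreter')  # set()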
identify/vendor/__init__.py (new vendored file, 0 lines)
identify/vendor/licenses.py (new vendored file, 6749 lines)
File diff suppressed because it is too large.