identify/identify/identify.py

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import io
import os.path
import re
import shlex
import string
import sys

from identify import extensions
from identify import interpreters
from identify.vendor import licenses


printable = frozenset(string.printable)

DIRECTORY = 'directory'
SYMLINK = 'symlink'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'

ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
ALL_TAGS.update(*extensions.EXTENSIONS.values())
ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
ALL_TAGS.update(*extensions.NAMES.values())
ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(ALL_TAGS)


def tags_from_path(path):
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if os.path.isdir(path):
        return {DIRECTORY}
    if os.path.islink(path):
        return {SYMLINK}

    tags = {FILE}

    executable = os.access(path, os.X_OK)
    if executable:
        tags.add(EXECUTABLE)
    else:
        tags.add(NON_EXECUTABLE)

    # As an optimization, if we're able to read tags from the filename, then we
    # don't peek at the file contents.
    t = tags_from_filename(os.path.basename(path))
    if len(t) > 0:
        tags.update(t)
    else:
        if executable:
            shebang = parse_shebang_from_file(path)
            if len(shebang) > 0:
                tags.update(tags_from_interpreter(shebang[0]))

    # some extensions can be both binary and text
    # see EXTENSIONS_NEED_BINARY_CHECK
    if not {TEXT, BINARY} & tags:
        if file_is_text(path):
            tags.add(TEXT)
        else:
            tags.add(BINARY)

    assert {TEXT, BINARY} & tags, tags
    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
    return tags


def tags_from_filename(filename):
    _, filename = os.path.split(filename)
    _, ext = os.path.splitext(filename)

    ret = set()

    # Allow e.g. "Dockerfile.xenial" to match "Dockerfile"
    for part in [filename] + filename.split('.'):
        if part in extensions.NAMES:
            ret.update(extensions.NAMES[part])
            break

    if len(ext) > 0:
        ext = ext[1:].lower()
        if ext in extensions.EXTENSIONS:
            ret.update(extensions.EXTENSIONS[ext])
        elif ext in extensions.EXTENSIONS_NEED_BINARY_CHECK:
            ret.update(extensions.EXTENSIONS_NEED_BINARY_CHECK[ext])

    return ret


def tags_from_interpreter(interpreter):
    _, _, interpreter = interpreter.rpartition('/')

    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
    while interpreter:
        if interpreter in interpreters.INTERPRETERS:
            return interpreters.INTERPRETERS[interpreter]
        else:
            interpreter, _, _ = interpreter.rpartition('.')

    return set()


def is_text(bytesio):
    """Return whether the first KB of contents seems to be binary.

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
    """
    text_chars = (
        bytearray([7, 8, 9, 10, 11, 12, 13, 27]) +
        bytearray(range(0x20, 0x7F)) +
        bytearray(range(0x80, 0X100))
    )
    return not bool(bytesio.read(1024).translate(None, text_chars))


def file_is_text(path):
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    with open(path, 'rb') as f:
        return is_text(f)


def _shebang_split(line):
    try:
        # shebangs aren't supposed to be quoted, though some tools such as
        # setuptools will write them with quotes so we'll best-guess parse
        # with shlex first
        return shlex.split(line)
    except ValueError:
        # failing that, we'll do a more "traditional" shebang parsing which
        # just involves splitting by whitespace
        return line.split()


def parse_shebang(bytesio):
    """Parse the shebang from a file opened for reading binary."""
    if bytesio.read(2) != b'#!':
        return ()
    first_line = bytesio.readline()
    try:
        first_line = first_line.decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    for c in first_line:
        if c not in printable:
            return ()

    cmd = tuple(_shebang_split(first_line.strip()))
    if cmd and cmd[0] == '/usr/bin/env':
        cmd = cmd[1:]
    return cmd


def parse_shebang_from_file(path):
    """Parse the shebang given a file path."""
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if not os.access(path, os.X_OK):
        return ()

    with open(path, 'rb') as f:
        return parse_shebang(f)


COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')


def _norm_license(s):
    s = COPYRIGHT_RE.sub('', s)
    s = WS_RE.sub(' ', s)
    return s.strip()


def license_id(filename):
    """Return the spdx id for the license contained in `filename`.  If no
    license is detected, returns `None`.

    spdx: https://spdx.org/licenses/
    licenses from choosealicense.com: https://github.com/choosealicense.com

    Approximate algorithm:

    1. strip copyright line
    2. normalize whitespace (replace all whitespace with a single space)
    3. check exact text match with existing licenses
    4. failing that use edit distance
    """
    import editdistance  # `pip install identify[license]`

    with io.open(filename, encoding='UTF-8') as f:
        contents = f.read()

    norm = _norm_license(contents)

    min_edit_dist = sys.maxsize
    min_edit_dist_spdx = ''

    # try exact matches
    for spdx, text in licenses.LICENSES:
        norm_license = _norm_license(text)
        if norm == norm_license:
            return spdx

        # skip the slow calculation if the lengths are very different
        if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
            continue

        edit_dist = editdistance.eval(norm, norm_license)
        if edit_dist < min_edit_dist:
            min_edit_dist = edit_dist
            min_edit_dist_spdx = spdx

    # if there's less than 5% edited from the license, we found our match
    if norm and min_edit_dist / len(norm) < .05:
        return min_edit_dist_spdx
    else:
        # no matches :'(
        return None