Adding upstream version 1.4.13.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent afaf4643e1
commit 03367abfa8
25 changed files with 7987 additions and 0 deletions
230  identify/identify.py  Normal file
@@ -0,0 +1,230 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import io
import os.path
import re
import shlex
import string
import sys

from identify import extensions
from identify import interpreters
from identify.vendor import licenses


printable = frozenset(string.printable)

DIRECTORY = 'directory'
SYMLINK = 'symlink'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'

ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
ALL_TAGS.update(*extensions.EXTENSIONS.values())
ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
ALL_TAGS.update(*extensions.NAMES.values())
ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(ALL_TAGS)

def tags_from_path(path):
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if os.path.isdir(path):
        return {DIRECTORY}
    if os.path.islink(path):
        return {SYMLINK}

    tags = {FILE}

    executable = os.access(path, os.X_OK)
    if executable:
        tags.add(EXECUTABLE)
    else:
        tags.add(NON_EXECUTABLE)

    # As an optimization, if we're able to read tags from the filename, then we
    # don't peek at the file contents.
    t = tags_from_filename(os.path.basename(path))
    if len(t) > 0:
        tags.update(t)
    else:
        if executable:
            shebang = parse_shebang_from_file(path)
            if len(shebang) > 0:
                tags.update(tags_from_interpreter(shebang[0]))

    # some extensions can be both binary and text
    # see EXTENSIONS_NEED_BINARY_CHECK
    if not {TEXT, BINARY} & tags:
        if file_is_text(path):
            tags.add(TEXT)
        else:
            tags.add(BINARY)

    assert {TEXT, BINARY} & tags, tags
    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
    return tags

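# Illustrative examples (editorial addition, not part of the upstream file);
# exact tag sets depend on identify's bundled mapping tables:
#
#     tags_from_path('setup.py')
#     # -> roughly {'file', 'non-executable', 'python', 'text'} for a plain script
#     tags_from_path('/bin/ls')
#     # -> roughly {'file', 'executable', 'binary'} for a compiled executable
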
def tags_from_filename(filename):
    _, filename = os.path.split(filename)
    _, ext = os.path.splitext(filename)

    ret = set()

    # Allow e.g. "Dockerfile.xenial" to match "Dockerfile"
    for part in [filename] + filename.split('.'):
        if part in extensions.NAMES:
            ret.update(extensions.NAMES[part])
            break

    if len(ext) > 0:
        ext = ext[1:].lower()
        if ext in extensions.EXTENSIONS:
            ret.update(extensions.EXTENSIONS[ext])
        elif ext in extensions.EXTENSIONS_NEED_BINARY_CHECK:
            ret.update(extensions.EXTENSIONS_NEED_BINARY_CHECK[ext])

    return ret

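# Illustrative example (editorial addition): with the usual NAMES / EXTENSIONS
# tables, tags_from_filename('Dockerfile.xenial') is expected to match the
# "Dockerfile" name entry, and tags_from_filename('setup.py') the "py"
# extension entry (e.g. {'python', 'text'}).
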
def tags_from_interpreter(interpreter):
    _, _, interpreter = interpreter.rpartition('/')

    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
    while interpreter:
        if interpreter in interpreters.INTERPRETERS:
            return interpreters.INTERPRETERS[interpreter]
        else:
            interpreter, _, _ = interpreter.rpartition('.')

    return set()

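# Illustrative example (editorial addition): tags_from_interpreter('python3.6')
# is expected to miss an exact table entry and fall back to 'python3'
# (e.g. {'python', 'python3'}); unknown interpreters yield an empty set.
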
def is_text(bytesio):
    """Return whether the first KB of contents seems to be text (not binary).

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
    """
    text_chars = (
        bytearray([7, 8, 9, 10, 11, 12, 13, 27]) +
        bytearray(range(0x20, 0x7F)) +
        bytearray(range(0x80, 0x100))
    )
    return not bool(bytesio.read(1024).translate(None, text_chars))

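# Illustrative examples (editorial addition):
#
#     is_text(io.BytesIO(b'hello world\n'))        # True: only printable ASCII
#     is_text(io.BytesIO(b'\x7fELF\x01\x01\x01'))  # False: control bytes remain
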
def file_is_text(path):
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    with open(path, 'rb') as f:
        return is_text(f)

def _shebang_split(line):
    try:
        # shebangs aren't supposed to be quoted, though some tools such as
        # setuptools will write them with quotes so we'll best-guess parse
        # with shlex first
        return shlex.split(line)
    except ValueError:
        # failing that, we'll do a more "traditional" shebang parsing which
        # just involves splitting by whitespace
        return line.split()


def parse_shebang(bytesio):
    """Parse the shebang from a file opened for reading binary."""
    if bytesio.read(2) != b'#!':
        return ()
    first_line = bytesio.readline()
    try:
        first_line = first_line.decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    for c in first_line:
        if c not in printable:
            return ()

    cmd = tuple(_shebang_split(first_line.strip()))
    if cmd and cmd[0] == '/usr/bin/env':
        cmd = cmd[1:]
    return cmd

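# Illustrative examples (editorial addition):
#
#     parse_shebang(io.BytesIO(b'#!/usr/bin/env python3\n'))  # ('python3',)
#     parse_shebang(io.BytesIO(b'#!/bin/sh -e\n'))            # ('/bin/sh', '-e')
#     parse_shebang(io.BytesIO(b'print("no shebang")\n'))     # ()
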
def parse_shebang_from_file(path):
    """Parse the shebang given a file path."""
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if not os.access(path, os.X_OK):
        return ()

    with open(path, 'rb') as f:
        return parse_shebang(f)

COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')


def _norm_license(s):
    s = COPYRIGHT_RE.sub('', s)
    s = WS_RE.sub(' ', s)
    return s.strip()

def license_id(filename):
    """Return the spdx id for the license contained in `filename`. If no
    license is detected, returns `None`.

    spdx: https://spdx.org/licenses/
    licenses from choosealicense.com: https://github.com/choosealicense.com

    Approximate algorithm:

    1. strip copyright line
    2. normalize whitespace (replace all whitespace with a single space)
    3. check exact text match with existing licenses
    4. failing that use edit distance
    """
    import editdistance  # `pip install identify[license]`

    with io.open(filename, encoding='UTF-8') as f:
        contents = f.read()

    norm = _norm_license(contents)

    min_edit_dist = sys.maxsize
    min_edit_dist_spdx = ''

    # try exact matches
    for spdx, text in licenses.LICENSES:
        norm_license = _norm_license(text)
        if norm == norm_license:
            return spdx

        # skip the slow calculation if the lengths are very different
        if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
            continue

        edit_dist = editdistance.eval(norm, norm_license)
        if edit_dist < min_edit_dist:
            min_edit_dist = edit_dist
            min_edit_dist_spdx = spdx

    # if there's less than 5% edited from the license, we found our match
    if norm and min_edit_dist / len(norm) < .05:
        return min_edit_dist_spdx
    else:
        # no matches :'(
        return None
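For reference, a minimal usage sketch of the module added above (an editorial illustration, not part of this commit). It assumes identify 1.4.13 is installed as a package and exercises only the helpers that do not touch the filesystem; exact tag sets depend on the bundled mapping tables.

import io

from identify import identify

print(identify.tags_from_filename('setup.py'))      # e.g. {'python', 'text'}
print(identify.tags_from_interpreter('python3.6'))  # e.g. {'python', 'python3'}
print(identify.parse_shebang(io.BytesIO(b'#!/usr/bin/env bash\n')))  # ('bash',)

# license_id('LICENSE.txt') would additionally require the optional
# dependency installed via `pip install identify[license]`.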