1
0
Fork 0
identify/identify/identify.py
Daniel Baumann 03367abfa8
Adding upstream version 1.4.13.
Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-02-12 16:43:50 +01:00

230 lines
6.4 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import io
import os.path
import re
import shlex
import string
import sys
from identify import extensions
from identify import interpreters
from identify.vendor import licenses
# Set of characters accepted when validating a decoded shebang line.
printable = frozenset(string.printable)

# Tags for the kind of filesystem entry.
DIRECTORY = 'directory'
SYMLINK = 'symlink'
FILE = 'file'

# Tags for the executable bit of a regular file.
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'

# Tags for the content encoding of a regular file.
TEXT = 'text'
BINARY = 'binary'

# Every tag this module can produce: the built-in tags above plus every tag
# listed in the extension, filename and interpreter lookup tables.
ALL_TAGS = frozenset((
    DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY,
)).union(
    *extensions.EXTENSIONS.values()
).union(
    *extensions.EXTENSIONS_NEED_BINARY_CHECK.values()
).union(
    *extensions.NAMES.values()
).union(
    *interpreters.INTERPRETERS.values()
)
def tags_from_path(path):
    """Return the set of tags describing the filesystem entry at `path`.

    A symlink yields exactly ``{SYMLINK}`` and a directory exactly
    ``{DIRECTORY}``.  Anything else is tagged ``FILE``, one of ``EXECUTABLE``
    / ``NON_EXECUTABLE``, whatever tags can be derived from its name (or,
    failing that, from its shebang if it is executable), and one of ``TEXT``
    / ``BINARY``.

    Raises `ValueError` if `path` does not exist.
    """
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    # The symlink check must come first: `os.path.isdir` follows symlinks, so
    # a link pointing at a directory would otherwise be tagged as a directory
    # while every other link (to a file, or broken) is tagged as a symlink.
    if os.path.islink(path):
        return {SYMLINK}
    if os.path.isdir(path):
        return {DIRECTORY}

    tags = {FILE}

    executable = os.access(path, os.X_OK)
    if executable:
        tags.add(EXECUTABLE)
    else:
        tags.add(NON_EXECUTABLE)

    # As an optimization, if we're able to read tags from the filename, then
    # we don't peek at the file contents.
    t = tags_from_filename(os.path.basename(path))
    if t:
        tags.update(t)
    elif executable:
        shebang = parse_shebang_from_file(path)
        if shebang:
            tags.update(tags_from_interpreter(shebang[0]))

    # some extensions can be both binary and text
    # see EXTENSIONS_NEED_BINARY_CHECK
    if not {TEXT, BINARY} & tags:
        if file_is_text(path):
            tags.add(TEXT)
        else:
            tags.add(BINARY)

    # Internal invariants: every regular file carries exactly these two
    # classifications by the time we return.
    assert {TEXT, BINARY} & tags, tags
    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
    return tags
def tags_from_filename(filename):
    """Return the tags implied by the name and extension of `filename`.

    Only the final path component is considered.  The result is an empty set
    when neither the name nor the extension is recognised.
    """
    basename = os.path.basename(filename)
    suffix = os.path.splitext(basename)[1]
    tags = set()

    # Try the whole name first, then each dot-separated piece, so that e.g.
    # "Dockerfile.xenial" matches "Dockerfile".  Only the first hit is used.
    candidates = [basename]
    candidates.extend(basename.split('.'))
    for candidate in candidates:
        if candidate in extensions.NAMES:
            tags.update(extensions.NAMES[candidate])
            break

    if suffix:
        # drop the leading dot; extension lookups are case-insensitive
        key = suffix[1:].lower()
        if key in extensions.EXTENSIONS:
            tags.update(extensions.EXTENSIONS[key])
        elif key in extensions.EXTENSIONS_NEED_BINARY_CHECK:
            tags.update(extensions.EXTENSIONS_NEED_BINARY_CHECK[key])

    return tags
def tags_from_interpreter(interpreter):
    """Return the tags registered for `interpreter`, or an empty set.

    Any leading directory is ignored, and trailing dot-separated version
    components are stripped one at a time until a known name is found.
    """
    name = interpreter.rpartition('/')[2]
    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
    while name:
        if name in interpreters.INTERPRETERS:
            return interpreters.INTERPRETERS[name]
        # with no '.' left this yields '' and the loop ends
        name = name.rpartition('.')[0]
    return set()
def is_text(bytesio):
    """Return whether the first KB of contents seems to be text.

    `bytesio` is a file-like object opened for reading bytes.  The result is
    `True` when every byte read is a "text" byte (including when nothing
    could be read), `False` otherwise.

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
    """
    # Bytes considered textual: a few control characters, ASCII 0x20-0x7E,
    # and everything from 0x80 upwards.
    allowed = bytearray((7, 8, 9, 10, 11, 12, 13, 27))
    allowed.extend(range(0x20, 0x7F))
    allowed.extend(range(0x80, 0x100))

    head = bytesio.read(1024)
    # Deleting every allowed byte leaves only the non-text ones.
    leftover = head.translate(None, allowed)
    return len(leftover) == 0
def file_is_text(path):
    """Return whether the file at `path` looks like text (see `is_text`).

    Raises `ValueError` if `path` does not exist.
    """
    if os.path.lexists(path):
        with open(path, 'rb') as stream:
            return is_text(stream)
    raise ValueError('{} does not exist.'.format(path))
def _shebang_split(line):
    """Split a shebang line (without the leading `#!`) into its arguments."""
    try:
        # shebangs aren't supposed to be quoted, though some tools such as
        # setuptools will write them with quotes so we'll best-guess parse
        # with shlex first
        parts = shlex.split(line)
    except ValueError:
        # failing that, we'll do a more "traditional" shebang parsing which
        # just involves splitting by whitespace
        parts = line.split()
    return parts
def parse_shebang(bytesio):
    """Parse the shebang from a file opened for reading binary.

    Returns the command as a tuple of strings with a leading `/usr/bin/env`
    removed, or an empty tuple when there is no usable shebang.
    """
    if bytesio.read(2) != b'#!':
        return ()

    raw = bytesio.readline()
    try:
        line = raw.decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    if any(ch not in printable for ch in line):
        return ()

    argv = tuple(_shebang_split(line.strip()))
    if argv and argv[0] == '/usr/bin/env':
        return argv[1:]
    return argv
def parse_shebang_from_file(path):
    """Parse the shebang given a file path.

    Returns an empty tuple for files that are not executable.  Raises
    `ValueError` if `path` does not exist.
    """
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if os.access(path, os.X_OK):
        with open(path, 'rb') as stream:
            return parse_shebang(stream)
    return ()
# Matches a whole line starting (after optional whitespace) with "Copyright"
# or "(C)" followed by a space, case-insensitively.
COPYRIGHT_RE = re.compile(
    r'^\s*(Copyright|\(C\)) .*$',
    re.MULTILINE | re.IGNORECASE,
)
# Matches any run of whitespace.
WS_RE = re.compile(r'\s+')
def _norm_license(s):
    """Normalize license text for comparison.

    Copyright lines are removed, every run of whitespace is collapsed to a
    single space, and leading / trailing whitespace is stripped.
    """
    without_copyright = COPYRIGHT_RE.sub('', s)
    return WS_RE.sub(' ', without_copyright).strip()
def license_id(filename):
    """Return the spdx id for the license contained in `filename`.  If no
    license is detected, returns `None`.

    spdx: https://spdx.org/licenses/
    licenses from choosealicense.com: https://github.com/choosealicense.com

    Approximate algorithm:

    1. strip copyright line
    2. normalize whitespace (replace all whitespace with a single space)
    3. check exact text match with existing licenses
    4. failing that use edit distance
    """
    import editdistance  # `pip install identify[license]`

    with io.open(filename, encoding='UTF-8') as f:
        norm = _norm_license(f.read())

    best_dist = sys.maxsize
    best_spdx = ''

    for spdx, text in licenses.LICENSES:
        candidate = _norm_license(text)

        # an exact match wins immediately
        if candidate == norm:
            return spdx

        # skip the slow calculation if the lengths are very different
        if norm and abs(len(norm) - len(candidate)) / len(norm) > .05:
            continue

        # otherwise remember the closest license seen so far (first one wins
        # on ties)
        dist = editdistance.eval(norm, candidate)
        if dist < best_dist:
            best_dist, best_spdx = dist, spdx

    # no matches :'(
    if not norm:
        return None

    # if there's less than 5% edited from the license, we found our match
    if best_dist / len(norm) < .05:
        return best_spdx
    return None