Adding upstream version 1.4.13.

Signed-off-by: Daniel Baumann <daniel@debian.org>

parent afaf4643e1
commit 03367abfa8

25 changed files with 7987 additions and 0 deletions
identify/__init__.py (new file, 0 lines)
identify/cli.py (new file, 36 lines)
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals

import argparse
import json

from identify import identify


def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename-only', action='store_true')
    parser.add_argument('path')
    args = parser.parse_args(argv)

    if args.filename_only:
        func = identify.tags_from_filename
    else:
        func = identify.tags_from_path

    try:
        tags = sorted(func(args.path))
    except ValueError as e:
        print(e)
        return 1

    if not tags:
        return 1
    else:
        print(json.dumps(tags))
        return 0


if __name__ == '__main__':
    exit(main())
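A quick sketch, not part of the upstream diff, of how this entry point behaves; 'setup.py' is only an illustrative argument, and the printed tags assume such a file exists and is not executable:

    from identify import cli

    # Prints a JSON list such as ["file", "non-executable", "python", "text"]
    # and returns 0; a missing path or an empty tag set returns 1.
    cli.main(['setup.py'])

    # --filename-only classifies by name alone, without touching the filesystem.
    cli.main(['--filename-only', 'setup.py'])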
identify/extensions.py (new file, 224 lines)
@@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals


EXTENSIONS = {
    'apinotes': {'text', 'apinotes'},
    'asar': {'binary', 'asar'},
    'bash': {'text', 'shell', 'bash'},
    'bat': {'text', 'batch'},
    'bmp': {'binary', 'image', 'bitmap'},
    'bz2': {'binary', 'bzip2'},
    'c': {'text', 'c'},
    'cc': {'text', 'c++'},
    'cu': {'text', 'cuda'},
    'cfg': {'text'},
    'cmake': {'text', 'cmake'},
    'cnf': {'text'},
    'coffee': {'text', 'coffee'},
    'conf': {'text'},
    'cpp': {'text', 'c++'},
    'crt': {'text', 'pem'},
    'cs': {'text', 'c#'},
    'cson': {'text', 'cson'},
    'css': {'text', 'css'},
    'csv': {'text', 'csv'},
    'cxx': {'text', 'c++'},
    'dart': {'text', 'dart'},
    'def': {'text', 'def'},
    'dtd': {'text', 'dtd'},
    'ear': {'binary', 'zip', 'jar'},
    'ejs': {'text', 'ejs'},
    'eot': {'binary', 'eot'},
    'eps': {'binary', 'eps'},
    'erb': {'text', 'erb'},
    'exe': {'binary'},
    'eyaml': {'text', 'yaml'},
    'feature': {'text', 'gherkin'},
    'fish': {'text', 'fish'},
    'gemspec': {'text', 'ruby'},
    'gif': {'binary', 'image', 'gif'},
    'go': {'text', 'go'},
    'gotmpl': {'text', 'gotmpl'},
    'gpx': {'text', 'gpx', 'xml'},
    'gradle': {'text', 'groovy'},
    'groovy': {'text', 'groovy'},
    'gyb': {'text', 'gyb'},
    'gyp': {'text', 'gyp', 'python'},
    'gypi': {'text', 'gyp', 'python'},
    'gz': {'binary', 'gzip'},
    'h': {'text', 'header', 'c', 'c++'},
    'hpp': {'text', 'header', 'c++'},
    'htm': {'text', 'html'},
    'html': {'text', 'html'},
    'hxx': {'text', 'header', 'c++'},
    'icns': {'binary', 'icns'},
    'ico': {'binary', 'icon'},
    'ics': {'text', 'icalendar'},
    'idl': {'text', 'idl'},
    'idr': {'text', 'idris'},
    'inc': {'text', 'inc'},
    'ini': {'text', 'ini'},
    'j2': {'text', 'jinja'},
    'jade': {'text', 'jade'},
    'jar': {'binary', 'zip', 'jar'},
    'java': {'text', 'java'},
    'jenkinsfile': {'text', 'groovy'},
    'jinja': {'text', 'jinja'},
    'jinja2': {'text', 'jinja'},
    'jpeg': {'binary', 'image', 'jpeg'},
    'jpg': {'binary', 'image', 'jpeg'},
    'js': {'text', 'javascript'},
    'json': {'text', 'json'},
    'jsonnet': {'text', 'jsonnet'},
    'jsx': {'text', 'jsx'},
    'key': {'text', 'pem'},
    'kml': {'text', 'kml', 'xml'},
    'kt': {'text', 'kotlin'},
    'less': {'text', 'less'},
    'lidr': {'text', 'idris'},
    'lua': {'text', 'lua'},
    'm': {'text', 'c', 'objective-c'},
    'manifest': {'text', 'manifest'},
    'map': {'text', 'map'},
    'markdown': {'text', 'markdown'},
    'md': {'text', 'markdown'},
    'mib': {'text', 'mib'},
    'mk': {'text', 'makefile'},
    'mm': {'text', 'c++', 'objective-c++'},
    'modulemap': {'text', 'modulemap'},
    'ngdoc': {'text', 'ngdoc'},
    'nim': {'text', 'nim'},
    'nims': {'text', 'nim'},
    'nimble': {'text', 'nimble'},
    'nix': {'text', 'nix'},
    'otf': {'binary', 'otf'},
    'p12': {'binary', 'p12'},
    'patch': {'text', 'diff'},
    'pdf': {'binary', 'pdf'},
    'pem': {'text', 'pem'},
    'php': {'text', 'php'},
    'php4': {'text', 'php'},
    'php5': {'text', 'php'},
    'phtml': {'text', 'php'},
    'pl': {'text', 'perl'},
    'plantuml': {'text', 'plantuml'},
    'pm': {'text', 'perl'},
    'png': {'binary', 'image', 'png'},
    'po': {'text', 'pofile'},
    'pp': {'text', 'puppet'},
    'properties': {'text', 'java-properties'},
    'proto': {'text', 'proto'},
    'puml': {'text', 'plantuml'},
    'purs': {'text', 'purescript'},
    'py': {'text', 'python'},
    'pyi': {'text', 'pyi'},
    'pyx': {'text', 'cython'},
    'pxd': {'text', 'cython'},
    'pxi': {'text', 'cython'},
    'r': {'text', 'r'},
    'rb': {'text', 'ruby'},
    'rs': {'text', 'rust'},
    'rst': {'text', 'rst'},
    's': {'text', 'asm'},
    'sbt': {'text', 'sbt', 'scala'},
    'sc': {'text', 'scala'},
    'scala': {'text', 'scala'},
    'scss': {'text', 'scss'},
    'scm': {'text', 'scheme'},
    'sh': {'text', 'shell'},
    'sls': {'text', 'salt'},
    'so': {'binary'},
    'sol': {'text', 'solidity'},
    'spec': {'text', 'spec'},
    'ss': {'text', 'scheme'},
    'styl': {'text', 'stylus'},
    'sql': {'text', 'sql'},
    'svg': {'text', 'image', 'svg'},
    'swf': {'binary', 'swf'},
    'swift': {'text', 'swift'},
    'swiftdeps': {'text', 'swiftdeps'},
    'tac': {'text', 'twisted', 'python'},
    'tar': {'binary', 'tar'},
    'tgz': {'binary', 'gzip'},
    'thrift': {'text', 'thrift'},
    'tiff': {'binary', 'image', 'tiff'},
    'toml': {'text', 'toml'},
    'tf': {'text', 'terraform'},
    'ts': {'text', 'ts'},
    'tsx': {'text', 'tsx'},
    'ttf': {'binary', 'ttf'},
    'txt': {'text', 'plain-text'},
    'vdx': {'text', 'vdx'},
    'vim': {'text', 'vim'},
    'vue': {'text', 'vue'},
    'war': {'binary', 'zip', 'jar'},
    'wav': {'binary', 'audio', 'wav'},
    'wkt': {'text', 'wkt'},
    'whl': {'binary', 'wheel', 'zip'},
    'woff': {'binary', 'woff'},
    'woff2': {'binary', 'woff2'},
    'wsgi': {'text', 'wsgi', 'python'},
    'xml': {'text', 'xml'},
    'xq': {'text', 'xquery'},
    'xql': {'text', 'xquery'},
    'xqm': {'text', 'xquery'},
    'xqu': {'text', 'xquery'},
    'xquery': {'text', 'xquery'},
    'xqy': {'text', 'xquery'},
    'xsd': {'text', 'xml', 'xsd'},
    'xsl': {'text', 'xml', 'xsl'},
    'yaml': {'text', 'yaml'},
    'yang': {'text', 'yang'},
    'yin': {'text', 'xml', 'yin'},
    'yml': {'text', 'yaml'},
    'zig': {'text', 'zig'},
    'zip': {'binary', 'zip'},
    'zsh': {'text', 'shell', 'zsh'},
}
EXTENSIONS_NEED_BINARY_CHECK = {
    'plist': {'plist'},
}

NAMES = {
    '.babelrc': EXTENSIONS['json'] | {'babelrc'},
    '.bashrc': EXTENSIONS['bash'],
    '.bash_aliases': EXTENSIONS['bash'],
    '.bash_profile': EXTENSIONS['bash'],
    '.bowerrc': EXTENSIONS['json'] | {'bowerrc'},
    '.coveragerc': EXTENSIONS['ini'] | {'coveragerc'},
    '.dockerignore': {'text', 'dockerignore'},
    '.editorconfig': {'text', 'editorconfig'},
    '.gitconfig': EXTENSIONS['ini'] | {'gitconfig'},
    '.hgrc': EXTENSIONS['ini'] | {'hgrc'},
    '.gitattributes': {'text', 'gitattributes'},
    '.gitignore': {'text', 'gitignore'},
    '.gitmodules': {'text', 'gitmodules'},
    '.jshintrc': EXTENSIONS['json'] | {'jshintrc'},
    '.mailmap': {'text', 'mailmap'},
    '.mention-bot': EXTENSIONS['json'] | {'mention-bot'},
    '.npmignore': {'text', 'npmignore'},
    '.pdbrc': EXTENSIONS['py'] | {'pdbrc'},
    '.pypirc': EXTENSIONS['ini'] | {'pypirc'},
    '.yamllint': EXTENSIONS['yaml'] | {'yamllint'},
    '.zshrc': EXTENSIONS['zsh'],
    'AUTHORS': EXTENSIONS['txt'],
    'BUILD.bazel': {'text', 'bazel'},
    'BUILD': {'text', 'bazel'},
    'CMakeLists.txt': EXTENSIONS['cmake'],
    'COPYING': EXTENSIONS['txt'],
    'Dockerfile': {'text', 'dockerfile'},
    'Gemfile': EXTENSIONS['rb'],
    'Jenkinsfile': {'text', 'groovy'},
    'LICENSE': EXTENSIONS['txt'],
    'MAINTAINERS': EXTENSIONS['txt'],
    'Makefile': EXTENSIONS['mk'],
    'NOTICE': EXTENSIONS['txt'],
    'PATENTS': EXTENSIONS['txt'],
    'Pipfile': EXTENSIONS['toml'],
    'Pipfile.lock': EXTENSIONS['json'],
    'README': EXTENSIONS['txt'],
    'Rakefile': EXTENSIONS['rb'],
    'setup.cfg': EXTENSIONS['ini'],
}
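As an aside, not part of the diff: NAMES composes entries out of EXTENSIONS with plain set union, so a recognized filename carries both its format tags and its own specific tag. A minimal illustration:

    from identify import extensions

    # '.babelrc' inherits the JSON tags plus its own marker.
    assert extensions.NAMES['.babelrc'] == {'text', 'json', 'babelrc'}

    # Extension keys are bare suffixes; identify.py lowercases the
    # extension and strips the leading dot before looking it up here.
    assert extensions.EXTENSIONS['py'] == {'text', 'python'}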
identify/identify.py (new file, 230 lines)
@@ -0,0 +1,230 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import io
import os.path
import re
import shlex
import string
import sys

from identify import extensions
from identify import interpreters
from identify.vendor import licenses


printable = frozenset(string.printable)

DIRECTORY = 'directory'
SYMLINK = 'symlink'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'

ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
ALL_TAGS.update(*extensions.EXTENSIONS.values())
ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
ALL_TAGS.update(*extensions.NAMES.values())
ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(ALL_TAGS)


def tags_from_path(path):
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if os.path.isdir(path):
        return {DIRECTORY}
    if os.path.islink(path):
        return {SYMLINK}

    tags = {FILE}

    executable = os.access(path, os.X_OK)
    if executable:
        tags.add(EXECUTABLE)
    else:
        tags.add(NON_EXECUTABLE)

    # As an optimization, if we're able to read tags from the filename, then we
    # don't peek at the file contents.
    t = tags_from_filename(os.path.basename(path))
    if len(t) > 0:
        tags.update(t)
    else:
        if executable:
            shebang = parse_shebang_from_file(path)
            if len(shebang) > 0:
                tags.update(tags_from_interpreter(shebang[0]))

    # some extensions can be both binary and text
    # see EXTENSIONS_NEED_BINARY_CHECK
    if not {TEXT, BINARY} & tags:
        if file_is_text(path):
            tags.add(TEXT)
        else:
            tags.add(BINARY)

    assert {TEXT, BINARY} & tags, tags
    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
    return tags


def tags_from_filename(filename):
    _, filename = os.path.split(filename)
    _, ext = os.path.splitext(filename)

    ret = set()

    # Allow e.g. "Dockerfile.xenial" to match "Dockerfile"
    for part in [filename] + filename.split('.'):
        if part in extensions.NAMES:
            ret.update(extensions.NAMES[part])
            break

    if len(ext) > 0:
        ext = ext[1:].lower()
        if ext in extensions.EXTENSIONS:
            ret.update(extensions.EXTENSIONS[ext])
        elif ext in extensions.EXTENSIONS_NEED_BINARY_CHECK:
            ret.update(extensions.EXTENSIONS_NEED_BINARY_CHECK[ext])

    return ret


def tags_from_interpreter(interpreter):
    _, _, interpreter = interpreter.rpartition('/')

    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
    while interpreter:
        if interpreter in interpreters.INTERPRETERS:
            return interpreters.INTERPRETERS[interpreter]
        else:
            interpreter, _, _ = interpreter.rpartition('.')

    return set()


def is_text(bytesio):
    """Return whether the first KB of contents seems to be binary.

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
    """
    text_chars = (
        bytearray([7, 8, 9, 10, 11, 12, 13, 27]) +
        bytearray(range(0x20, 0x7F)) +
        bytearray(range(0x80, 0X100))
    )
    return not bool(bytesio.read(1024).translate(None, text_chars))


def file_is_text(path):
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    with open(path, 'rb') as f:
        return is_text(f)


def _shebang_split(line):
    try:
        # shebangs aren't supposed to be quoted, though some tools such as
        # setuptools will write them with quotes so we'll best-guess parse
        # with shlex first
        return shlex.split(line)
    except ValueError:
        # failing that, we'll do a more "traditional" shebang parsing which
        # just involves splitting by whitespace
        return line.split()


def parse_shebang(bytesio):
    """Parse the shebang from a file opened for reading binary."""
    if bytesio.read(2) != b'#!':
        return ()
    first_line = bytesio.readline()
    try:
        first_line = first_line.decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    for c in first_line:
        if c not in printable:
            return ()

    cmd = tuple(_shebang_split(first_line.strip()))
    if cmd and cmd[0] == '/usr/bin/env':
        cmd = cmd[1:]
    return cmd


def parse_shebang_from_file(path):
    """Parse the shebang given a file path."""
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if not os.access(path, os.X_OK):
        return ()

    with open(path, 'rb') as f:
        return parse_shebang(f)


COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')


def _norm_license(s):
    s = COPYRIGHT_RE.sub('', s)
    s = WS_RE.sub(' ', s)
    return s.strip()


def license_id(filename):
    """Return the spdx id for the license contained in `filename`. If no
    license is detected, returns `None`.

    spdx: https://spdx.org/licenses/
    licenses from choosealicense.com: https://github.com/choosealicense.com

    Approximate algorithm:

    1. strip copyright line
    2. normalize whitespace (replace all whitespace with a single space)
    3. check exact text match with existing licenses
    4. failing that use edit distance
    """
    import editdistance  # `pip install identify[license]`

    with io.open(filename, encoding='UTF-8') as f:
        contents = f.read()

    norm = _norm_license(contents)

    min_edit_dist = sys.maxsize
    min_edit_dist_spdx = ''

    # try exact matches
    for spdx, text in licenses.LICENSES:
        norm_license = _norm_license(text)
        if norm == norm_license:
            return spdx

        # skip the slow calculation if the lengths are very different
        if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
            continue

        edit_dist = editdistance.eval(norm, norm_license)
        if edit_dist < min_edit_dist:
            min_edit_dist = edit_dist
            min_edit_dist_spdx = spdx

    # if there's less than 5% edited from the license, we found our match
    if norm and min_edit_dist / len(norm) < .05:
        return min_edit_dist_spdx
    else:
        # no matches :'(
        return None
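A brief usage sketch, not part of the diff, tying the functions above together; the example paths are illustrative and the results assume they exist with the obvious contents:

    import io
    from identify import identify

    # Name-based classification needs no filesystem access.
    identify.tags_from_filename('Dockerfile.dev')  # {'text', 'dockerfile'}

    # Path-based classification adds directory/symlink/file plus
    # executable/non-executable tags, and falls back to the shebang or a
    # binary-vs-text sniff of the first 1KB when the name is unknown.
    identify.tags_from_path('/usr/bin/env')  # e.g. {'file', 'executable', 'binary'}

    # Shebang parsing works on any file object opened in binary mode.
    identify.parse_shebang(io.BytesIO(b'#!/usr/bin/env python3\n'))  # ('python3',)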
identify/interpreters.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals

INTERPRETERS = {
    'bash': {'shell', 'bash'},
    'dash': {'shell', 'dash'},
    'node': {'javascript'},
    'nodejs': {'javascript'},
    'perl': {'perl'},
    'python': {'python'},
    'python2': {'python', 'python2'},
    'python3': {'python', 'python3'},
    'ruby': {'ruby'},
    'sh': {'shell', 'sh'},
    'tcsh': {'shell', 'tcsh'},
    'zsh': {'shell', 'zsh'},
}
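For context, not part of the diff: tags_from_interpreter() in identify.py strips any directory prefix and then drops trailing version components until a key in this table matches, so versioned shebangs still resolve:

    from identify import identify

    identify.tags_from_interpreter('/usr/bin/python3.6')  # {'python', 'python3'}
    identify.tags_from_interpreter('nodejs')              # {'javascript'}
    identify.tags_from_interpreter('not-a-known-interpreter')  # set()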
identify/vendor/__init__.py (new vendored file, 0 lines)
identify/vendor/licenses.py (new vendored file, 6749 lines)
File diff suppressed because it is too large.