1
0
Fork 0

Adding upstream version 1.4.13.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-12 16:43:50 +01:00
parent afaf4643e1
commit 03367abfa8
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
25 changed files with 7987 additions and 0 deletions

0
identify/__init__.py Normal file
View file

36
identify/cli.py Normal file
View file

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import argparse
import json
from identify import identify
def main(argv=None):
    """Command-line entry point: print the tags of a path as a JSON list.

    :param argv: argument list to parse; ``None`` lets argparse fall back
        to ``sys.argv[1:]``.
    :returns: ``0`` when at least one tag was printed, ``1`` when the
        lookup raised ``ValueError`` or produced no tags.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--filename-only', action='store_true')
    cli.add_argument('path')
    options = cli.parse_args(argv)

    # --filename-only never touches the filesystem; the default inspects
    # the actual path.
    tagger = (
        identify.tags_from_filename if options.filename_only
        else identify.tags_from_path
    )

    try:
        found = sorted(tagger(options.path))
    except ValueError as error:
        print(error)
        return 1

    if found:
        print(json.dumps(found))
        return 0
    return 1
if __name__ == '__main__':
    # `exit` is a convenience object installed by the `site` module for
    # interactive sessions; it is missing under `python -S`.  Raising
    # SystemExit directly is always available and sets the same exit status.
    raise SystemExit(main())

224
identify/extensions.py Normal file
View file

@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
# Maps a file extension to the set of tags implied by that extension.
# Keys are lowercase and exclude the leading dot: `tags_from_filename`
# strips the dot and lowercases the extension before looking it up here.
# Every entry carries either 'text' or 'binary', which is why a match in this
# table lets `tags_from_path` skip reading the file contents.
EXTENSIONS = {
'apinotes': {'text', 'apinotes'},
'asar': {'binary', 'asar'},
'bash': {'text', 'shell', 'bash'},
'bat': {'text', 'batch'},
'bmp': {'binary', 'image', 'bitmap'},
'bz2': {'binary', 'bzip2'},
'c': {'text', 'c'},
'cc': {'text', 'c++'},
'cu': {'text', 'cuda'},
'cfg': {'text'},
'cmake': {'text', 'cmake'},
'cnf': {'text'},
'coffee': {'text', 'coffee'},
'conf': {'text'},
'cpp': {'text', 'c++'},
'crt': {'text', 'pem'},
'cs': {'text', 'c#'},
'cson': {'text', 'cson'},
'css': {'text', 'css'},
'csv': {'text', 'csv'},
'cxx': {'text', 'c++'},
'dart': {'text', 'dart'},
'def': {'text', 'def'},
'dtd': {'text', 'dtd'},
'ear': {'binary', 'zip', 'jar'},
'ejs': {'text', 'ejs'},
'eot': {'binary', 'eot'},
'eps': {'binary', 'eps'},
'erb': {'text', 'erb'},
'exe': {'binary'},
'eyaml': {'text', 'yaml'},
'feature': {'text', 'gherkin'},
'fish': {'text', 'fish'},
'gemspec': {'text', 'ruby'},
'gif': {'binary', 'image', 'gif'},
'go': {'text', 'go'},
'gotmpl': {'text', 'gotmpl'},
'gpx': {'text', 'gpx', 'xml'},
'gradle': {'text', 'groovy'},
'groovy': {'text', 'groovy'},
'gyb': {'text', 'gyb'},
'gyp': {'text', 'gyp', 'python'},
'gypi': {'text', 'gyp', 'python'},
'gz': {'binary', 'gzip'},
'h': {'text', 'header', 'c', 'c++'},
'hpp': {'text', 'header', 'c++'},
'htm': {'text', 'html'},
'html': {'text', 'html'},
'hxx': {'text', 'header', 'c++'},
'icns': {'binary', 'icns'},
'ico': {'binary', 'icon'},
'ics': {'text', 'icalendar'},
'idl': {'text', 'idl'},
'idr': {'text', 'idris'},
'inc': {'text', 'inc'},
'ini': {'text', 'ini'},
'j2': {'text', 'jinja'},
'jade': {'text', 'jade'},
'jar': {'binary', 'zip', 'jar'},
'java': {'text', 'java'},
'jenkinsfile': {'text', 'groovy'},
'jinja': {'text', 'jinja'},
'jinja2': {'text', 'jinja'},
'jpeg': {'binary', 'image', 'jpeg'},
'jpg': {'binary', 'image', 'jpeg'},
'js': {'text', 'javascript'},
'json': {'text', 'json'},
'jsonnet': {'text', 'jsonnet'},
'jsx': {'text', 'jsx'},
'key': {'text', 'pem'},
'kml': {'text', 'kml', 'xml'},
'kt': {'text', 'kotlin'},
'less': {'text', 'less'},
'lidr': {'text', 'idris'},
'lua': {'text', 'lua'},
'm': {'text', 'c', 'objective-c'},
'manifest': {'text', 'manifest'},
'map': {'text', 'map'},
'markdown': {'text', 'markdown'},
'md': {'text', 'markdown'},
'mib': {'text', 'mib'},
'mk': {'text', 'makefile'},
'mm': {'text', 'c++', 'objective-c++'},
'modulemap': {'text', 'modulemap'},
'ngdoc': {'text', 'ngdoc'},
'nim': {'text', 'nim'},
'nims': {'text', 'nim'},
'nimble': {'text', 'nimble'},
'nix': {'text', 'nix'},
'otf': {'binary', 'otf'},
'p12': {'binary', 'p12'},
'patch': {'text', 'diff'},
'pdf': {'binary', 'pdf'},
'pem': {'text', 'pem'},
'php': {'text', 'php'},
'php4': {'text', 'php'},
'php5': {'text', 'php'},
'phtml': {'text', 'php'},
'pl': {'text', 'perl'},
'plantuml': {'text', 'plantuml'},
'pm': {'text', 'perl'},
'png': {'binary', 'image', 'png'},
'po': {'text', 'pofile'},
'pp': {'text', 'puppet'},
'properties': {'text', 'java-properties'},
'proto': {'text', 'proto'},
'puml': {'text', 'plantuml'},
'purs': {'text', 'purescript'},
'py': {'text', 'python'},
'pyi': {'text', 'pyi'},
'pyx': {'text', 'cython'},
'pxd': {'text', 'cython'},
'pxi': {'text', 'cython'},
'r': {'text', 'r'},
'rb': {'text', 'ruby'},
'rs': {'text', 'rust'},
'rst': {'text', 'rst'},
's': {'text', 'asm'},
'sbt': {'text', 'sbt', 'scala'},
'sc': {'text', 'scala'},
'scala': {'text', 'scala'},
'scss': {'text', 'scss'},
'scm': {'text', 'scheme'},
'sh': {'text', 'shell'},
'sls': {'text', 'salt'},
'so': {'binary'},
'sol': {'text', 'solidity'},
'spec': {'text', 'spec'},
'ss': {'text', 'scheme'},
'styl': {'text', 'stylus'},
'sql': {'text', 'sql'},
'svg': {'text', 'image', 'svg'},
'swf': {'binary', 'swf'},
'swift': {'text', 'swift'},
'swiftdeps': {'text', 'swiftdeps'},
'tac': {'text', 'twisted', 'python'},
'tar': {'binary', 'tar'},
'tgz': {'binary', 'gzip'},
'thrift': {'text', 'thrift'},
'tiff': {'binary', 'image', 'tiff'},
'toml': {'text', 'toml'},
'tf': {'text', 'terraform'},
'ts': {'text', 'ts'},
'tsx': {'text', 'tsx'},
'ttf': {'binary', 'ttf'},
'txt': {'text', 'plain-text'},
'vdx': {'text', 'vdx'},
'vim': {'text', 'vim'},
'vue': {'text', 'vue'},
'war': {'binary', 'zip', 'jar'},
'wav': {'binary', 'audio', 'wav'},
'wkt': {'text', 'wkt'},
'whl': {'binary', 'wheel', 'zip'},
'woff': {'binary', 'woff'},
'woff2': {'binary', 'woff2'},
'wsgi': {'text', 'wsgi', 'python'},
'xml': {'text', 'xml'},
'xq': {'text', 'xquery'},
'xql': {'text', 'xquery'},
'xqm': {'text', 'xquery'},
'xqu': {'text', 'xquery'},
'xquery': {'text', 'xquery'},
'xqy': {'text', 'xquery'},
'xsd': {'text', 'xml', 'xsd'},
'xsl': {'text', 'xml', 'xsl'},
'yaml': {'text', 'yaml'},
'yang': {'text', 'yang'},
'yin': {'text', 'xml', 'yin'},
'yml': {'text', 'yaml'},
'zig': {'text', 'zig'},
'zip': {'binary', 'zip'},
'zsh': {'text', 'shell', 'zsh'},
}
# Extensions whose files can be either text or binary.  Entries deliberately
# omit both 'text' and 'binary', so `tags_from_path` falls through to
# inspecting the file contents to decide which of the two applies.
EXTENSIONS_NEED_BINARY_CHECK = {
'plist': {'plist'},
}
# Maps well-known file names to their tags.  `tags_from_filename` tries the
# whole base name first and then each dot-separated part, so e.g.
# "Dockerfile.xenial" matches the 'Dockerfile' entry.  Keys are matched
# case-sensitively.
# Values written as `EXTENSIONS[...]` alone are the very same set objects
# stored in EXTENSIONS (not copies); `EXTENSIONS[...] | {...}` builds a new set.
NAMES = {
'.babelrc': EXTENSIONS['json'] | {'babelrc'},
'.bashrc': EXTENSIONS['bash'],
'.bash_aliases': EXTENSIONS['bash'],
'.bash_profile': EXTENSIONS['bash'],
'.bowerrc': EXTENSIONS['json'] | {'bowerrc'},
'.coveragerc': EXTENSIONS['ini'] | {'coveragerc'},
'.dockerignore': {'text', 'dockerignore'},
'.editorconfig': {'text', 'editorconfig'},
'.gitconfig': EXTENSIONS['ini'] | {'gitconfig'},
'.hgrc': EXTENSIONS['ini'] | {'hgrc'},
'.gitattributes': {'text', 'gitattributes'},
'.gitignore': {'text', 'gitignore'},
'.gitmodules': {'text', 'gitmodules'},
'.jshintrc': EXTENSIONS['json'] | {'jshintrc'},
'.mailmap': {'text', 'mailmap'},
'.mention-bot': EXTENSIONS['json'] | {'mention-bot'},
'.npmignore': {'text', 'npmignore'},
'.pdbrc': EXTENSIONS['py'] | {'pdbrc'},
'.pypirc': EXTENSIONS['ini'] | {'pypirc'},
'.yamllint': EXTENSIONS['yaml'] | {'yamllint'},
'.zshrc': EXTENSIONS['zsh'],
'AUTHORS': EXTENSIONS['txt'],
'BUILD.bazel': {'text', 'bazel'},
'BUILD': {'text', 'bazel'},
'CMakeLists.txt': EXTENSIONS['cmake'],
'COPYING': EXTENSIONS['txt'],
'Dockerfile': {'text', 'dockerfile'},
'Gemfile': EXTENSIONS['rb'],
'Jenkinsfile': {'text', 'groovy'},
'LICENSE': EXTENSIONS['txt'],
'MAINTAINERS': EXTENSIONS['txt'],
'Makefile': EXTENSIONS['mk'],
'NOTICE': EXTENSIONS['txt'],
'PATENTS': EXTENSIONS['txt'],
'Pipfile': EXTENSIONS['toml'],
'Pipfile.lock': EXTENSIONS['json'],
'README': EXTENSIONS['txt'],
'Rakefile': EXTENSIONS['rb'],
'setup.cfg': EXTENSIONS['ini'],
}

230
identify/identify.py Normal file
View file

@ -0,0 +1,230 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import io
import os.path
import re
import shlex
import string
import sys
from identify import extensions
from identify import interpreters
from identify.vendor import licenses
# Characters accepted in a shebang line; `parse_shebang` rejects any line
# containing a character outside this set.
printable = frozenset(string.printable)

# Tags describing what kind of filesystem entry a path is.
DIRECTORY = 'directory'
SYMLINK = 'symlink'
FILE = 'file'
# Tags describing the mode and encoding of a regular file.
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'

# Every tag this module can emit: the fixed tags above plus every tag that
# appears in any of the extension / name / interpreter lookup tables.
ALL_TAGS = frozenset(
    {
        DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY,
    }.union(
        *extensions.EXTENSIONS.values()
    ).union(
        *extensions.EXTENSIONS_NEED_BINARY_CHECK.values()
    ).union(
        *extensions.NAMES.values()
    ).union(
        *interpreters.INTERPRETERS.values()
    ),
)
def tags_from_path(path):
    """Return the set of tags describing the filesystem entry at `path`.

    A directory yields ``{DIRECTORY}`` and a symlink that is not a directory
    yields ``{SYMLINK}``.  Anything else is tagged ``FILE``, one of
    ``EXECUTABLE`` / ``NON_EXECUTABLE``, one of ``TEXT`` / ``BINARY`` and
    whatever the file name or shebang implies.

    :raises ValueError: if `path` does not exist.
    """
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if os.path.isdir(path):
        return {DIRECTORY}
    if os.path.islink(path):
        return {SYMLINK}

    is_exec = os.access(path, os.X_OK)
    tags = {FILE, EXECUTABLE if is_exec else NON_EXECUTABLE}

    # Optimization: when the file name alone yields tags, the shebang is
    # never read.
    name_tags = tags_from_filename(os.path.basename(path))
    if name_tags:
        tags.update(name_tags)
    elif is_exec:
        shebang = parse_shebang_from_file(path)
        if shebang:
            tags.update(tags_from_interpreter(shebang[0]))

    # Some extensions can be both binary and text (see
    # EXTENSIONS_NEED_BINARY_CHECK), and unknown files carry neither tag
    # yet: only then are the file contents sniffed.
    if tags.isdisjoint({TEXT, BINARY}):
        tags.add(TEXT if file_is_text(path) else BINARY)

    assert {TEXT, BINARY} & tags, tags
    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
    return tags
def tags_from_filename(filename):
    """Return the tags implied by a file name alone (no filesystem access).

    Only the final path component is considered.  Tags come from a
    well-known name in ``extensions.NAMES`` and/or the extension; the
    result is an empty set when neither is recognized.
    """
    basename = os.path.basename(filename)
    tags = set()

    # Allow e.g. "Dockerfile.xenial" to match "Dockerfile": try the whole
    # name first, then each dot-separated part, and stop at the first hit.
    candidates = [basename] + basename.split('.')
    known = next((c for c in candidates if c in extensions.NAMES), None)
    if known is not None:
        tags.update(extensions.NAMES[known])

    suffix = os.path.splitext(basename)[1]
    if suffix:
        suffix = suffix[1:].lower()
        # EXTENSIONS takes precedence over EXTENSIONS_NEED_BINARY_CHECK.
        for table in (
                extensions.EXTENSIONS,
                extensions.EXTENSIONS_NEED_BINARY_CHECK,
        ):
            if suffix in table:
                tags.update(table[suffix])
                break

    return tags
def tags_from_interpreter(interpreter):
    """Return the tags implied by a shebang interpreter.

    :param interpreter: interpreter name or path, e.g. ``/usr/bin/python3.5``;
        everything up to the last ``/`` is ignored.
    :returns: a new set of tags (safe for the caller to mutate); empty when
        the interpreter is not recognized.
    """
    _, _, name = interpreter.rpartition('/')

    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
    while name:
        if name in interpreters.INTERPRETERS:
            # Return a copy: handing out the table's own set would let a
            # caller's in-place mutation corrupt later lookups.
            return set(interpreters.INTERPRETERS[name])
        name, _, _ = name.rpartition('.')

    return set()
def is_text(bytesio):
    """Return whether the first KB of contents seems to be text.

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228

    :param bytesio: a binary stream; up to 1024 bytes are read from its
        current position.
    :returns: ``True`` when every byte read is a "text" byte (an empty read
        counts as text), ``False`` otherwise.
    """
    # Bytes considered text: selected control characters (BEL, BS, TAB, LF,
    # VT, FF, CR, ESC), printable ASCII excluding DEL, and all high bytes.
    text_chars = (
        bytearray([7, 8, 9, 10, 11, 12, 13, 27]) +
        bytearray(range(0x20, 0x7F)) +
        bytearray(range(0x80, 0X100))
    )
    # Delete every text byte; anything left over is a non-text byte.
    return not bytesio.read(1024).translate(None, text_chars)
def file_is_text(path):
    """Return whether the file at `path` looks like text (see `is_text`).

    :raises ValueError: if `path` does not exist.
    """
    if os.path.lexists(path):
        with open(path, 'rb') as stream:
            return is_text(stream)
    raise ValueError('{} does not exist.'.format(path))
def _shebang_split(line):
    """Split a shebang command line into its arguments.

    Shebangs aren't supposed to be quoted, though some tools such as
    setuptools write them with quotes, so shell-style splitting is tried
    first.  When that fails (``shlex`` raises ``ValueError``), the line is
    split on whitespace, which is the "traditional" shebang parsing.
    """
    try:
        tokens = shlex.split(line)
    except ValueError:
        tokens = line.split()
    return tokens
def parse_shebang(bytesio):
    """Parse the shebang from a file opened for reading binary.

    :returns: a tuple with the interpreter command and its arguments, with
        a leading ``/usr/bin/env`` removed; ``()`` when the stream does not
        start with ``#!``, the first line is not valid UTF-8, or it contains
        a character outside `printable`.
    """
    if bytesio.read(2) != b'#!':
        return ()

    raw_line = bytesio.readline()
    try:
        line = raw_line.decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    if any(char not in printable for char in line):
        return ()

    parts = tuple(_shebang_split(line.strip()))
    # `env` merely locates the real interpreter, so drop it.
    if parts[:1] == ('/usr/bin/env',):
        return parts[1:]
    return parts
def parse_shebang_from_file(path):
    """Parse the shebang given a file path.

    :returns: the parsed shebang tuple (see `parse_shebang`), or ``()``
        when the file is not executable.
    :raises ValueError: if `path` does not exist.
    """
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))

    # Files that cannot be executed are never opened.
    if os.access(path, os.X_OK):
        with open(path, 'rb') as stream:
            return parse_shebang(stream)
    return ()
# Matches a whole line that starts (after optional whitespace) with
# "Copyright" or "(C)" followed by a space, case-insensitively.
COPYRIGHT_RE = re.compile(
    r'^\s*(Copyright|\(C\)) .*$',
    re.IGNORECASE | re.MULTILINE,
)
# Matches any run of whitespace, newlines included.
WS_RE = re.compile(r'\s+')
def _norm_license(s):
    """Normalize license text for comparison.

    Copyright lines are removed, every run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is stripped.
    """
    without_copyright = COPYRIGHT_RE.sub('', s)
    return WS_RE.sub(' ', without_copyright).strip()
def license_id(filename):
    """Return the spdx id for the license contained in `filename`.  If no
    license is detected, returns `None`.

    spdx: https://spdx.org/licenses/
    licenses from choosealicense.com: https://github.com/choosealicense.com

    Approximate algorithm:

    1. strip copyright line
    2. normalize whitespace (replace all whitespace with a single space)
    3. check exact text match with existing licenses
    4. failing that use edit distance
    """
    import editdistance  # `pip install identify[license]`

    with io.open(filename, encoding='UTF-8') as license_file:
        raw_text = license_file.read()
    candidate = _norm_license(raw_text)

    best_dist, best_spdx = sys.maxsize, ''

    for spdx, text in licenses.LICENSES:
        known = _norm_license(text)

        # an exact match wins immediately
        if known == candidate:
            return spdx

        # skip the slow calculation if the lengths are very different
        if candidate:
            length_gap = abs(len(candidate) - len(known)) / len(candidate)
            if length_gap > .05:
                continue

        dist = editdistance.eval(candidate, known)
        if dist < best_dist:
            best_dist, best_spdx = dist, spdx

    # if there's less than 5% edited from the license, we found our match
    if candidate and best_dist / len(candidate) < .05:
        return best_spdx

    # no matches :'(
    return None

18
identify/interpreters.py Normal file
View file

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
# Maps an interpreter name, as found in a shebang line, to the tags it implies.
# Keys are bare names without a directory or version suffix.
INTERPRETERS = {
'bash': {'shell', 'bash'},
'dash': {'shell', 'dash'},
'node': {'javascript'},
'nodejs': {'javascript'},
'perl': {'perl'},
'python': {'python'},
'python2': {'python', 'python2'},
'python3': {'python', 'python3'},
'ruby': {'ruby'},
'sh': {'shell', 'sh'},
'tcsh': {'shell', 'tcsh'},
'zsh': {'shell', 'zsh'},
}

0
identify/vendor/__init__.py vendored Normal file
View file

6749
identify/vendor/licenses.py vendored Normal file

File diff suppressed because it is too large Load diff