1
0
Fork 0

Adding upstream version 1.4.13.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-12 16:43:50 +01:00
parent afaf4643e1
commit 03367abfa8
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
25 changed files with 7987 additions and 0 deletions

1
.activate.sh Symbolic link
View file

@ -0,0 +1 @@
venv/bin/activate

29
.coveragerc Normal file
View file

@ -0,0 +1,29 @@
[run]
branch = True
source =
.
omit =
.tox/*
/usr/*
setup.py
[report]
show_missing = True
exclude_lines =
# Have to re-enable the standard pragma
\#\s*pragma: no cover
# Don't complain if tests don't hit defensive assertion code:
^\s*raise AssertionError\b
^\s*raise NotImplementedError\b
^\s*return NotImplemented\b
^\s*raise$
# Don't complain if non-runnable code isn't run:
^if __name__ == ['"]__main__['"]:$
[html]
directory = coverage-html
# vim:ft=dosini

1
.deactivate.sh Normal file
View file

@ -0,0 +1 @@
deactivate

8
.gitignore vendored Normal file
View file

@ -0,0 +1,8 @@
*.egg-info
*.py[co]
/.coverage
/.pytest_cache
/.tox
/coverage-html
/dist
/venv

36
.pre-commit-config.yaml Normal file
View file

@ -0,0 +1,36 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.1.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-docstring-first
- id: check-merge-conflict
- id: check-yaml
- id: debug-statements
- id: double-quote-string-fixer
- id: name-tests-test
- id: check-added-large-files
- id: check-byte-order-marker
- id: fix-encoding-pragma
- repo: https://gitlab.com/pycqa/flake8
rev: 3.7.7
hooks:
- id: flake8
exclude: ^identify/vendor/licenses\.py$
- repo: https://github.com/pre-commit/mirrors-autopep8
rev: v1.4.3
hooks:
- id: autopep8
- repo: https://github.com/asottile/reorder_python_imports
rev: v1.4.0
hooks:
- id: reorder-python-imports
args: [
'--add-import', 'from __future__ import absolute_import',
'--add-import', 'from __future__ import unicode_literals',
]
- repo: https://github.com/asottile/add-trailing-comma
rev: v1.0.0
hooks:
- id: add-trailing-comma

17
.travis.yml Normal file
View file

@ -0,0 +1,17 @@
language: python
matrix:
include:
- env: TOXENV=py27
- env: TOXENV=py35
python: 3.5
- env: TOXENV=py36
python: 3.6
- env: TOXENV=pypy
python: pypy
install: pip install coveralls tox
script: tox
after_success: coveralls
cache:
directories:
- $HOME/.cache/pip
- $HOME/.cache/pre-commit

19
LICENSE Normal file
View file

@ -0,0 +1,19 @@
Copyright (c) 2017 Chris Kuehl, Anthony Sottile
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

16
Makefile Normal file
View file

@ -0,0 +1,16 @@
.PHONY: minimal
minimal: venv
venv: setup.py requirements-dev.txt tox.ini
tox -e venv
.PHONY: test
test:
tox
.PHONY: clean
clean:
find -name '*.pyc' -delete
find -name '__pycache__' -delete
rm -rf .tox
rm -rf venv

131
README.md Normal file
View file

@ -0,0 +1,131 @@
identify
========
[![Build Status](https://travis-ci.org/chriskuehl/identify.svg?branch=master)](https://travis-ci.org/chriskuehl/identify)
[![Coverage Status](https://coveralls.io/repos/github/chriskuehl/identify/badge.svg?branch=master)](https://coveralls.io/github/chriskuehl/identify?branch=master)
[![PyPI version](https://badge.fury.io/py/identify.svg)](https://pypi.python.org/pypi/identify)
File identification library for Python.
Given a file (or some information about a file), return a set of standardized
tags identifying what the file is.
## Usage
### With a file on disk
If you have an actual file on disk, you can get the most information possible
(a superset of all other methods):
```python
>>> identify.tags_from_path('/path/to/file.py')
{'file', 'text', 'python', 'non-executable'}
>>> identify.tags_from_path('/path/to/file-with-shebang')
{'file', 'text', 'shell', 'bash', 'executable'}
>>> identify.tags_from_path('/bin/bash')
{'file', 'binary', 'executable'}
>>> identify.tags_from_path('/path/to/directory')
{'directory'}
>>> identify.tags_from_path('/path/to/symlink')
{'symlink'}
```
When using a file on disk, the checks performed are:
* File type (file, symlink, directory)
* Mode (is it executable?)
* File name (mostly based on extension)
* If executable, the shebang is read and the interpreter interpreted
### If you only have the filename
```python
>>> identify.tags_from_filename('file.py')
{'text', 'python'}
```
### If you only have the interpreter
```python
>>> identify.tags_from_interpreter('python3.5')
{'python', 'python3'}
>>> identify.tags_from_interpreter('bash')
{'shell', 'bash'}
>>> identify.tags_from_interpreter('some-unrecognized-thing')
set()
```
### As a cli
```
$ identify-cli --help
usage: identify-cli [-h] [--filename-only] path
positional arguments:
path
optional arguments:
-h, --help show this help message and exit
--filename-only
```
```bash
$ identify-cli setup.py; echo $?
["file", "non-executable", "python", "text"]
0
$ identify-cli setup.py --filename-only; echo $?
["python", "text"]
0
$ identify-cli wat.wat; echo $?
wat.wat does not exist.
1
$ identify-cli wat.wat --filename-only; echo $?
1
```
### Identifying LICENSE files
`identify` also has an api for determining what type of license is contained
in a file. This routine is roughly based on the approaches used by
[licensee] (the ruby gem that github uses to figure out the license for a
repo).
The approach that `identify` uses is as follows:
1. Strip the copyright line
2. Normalize all whitespace
3. Return any exact matches
4. Return the closest by edit distance (where edit distance < 5%)
To use the api, install via `pip install identify[license]`
```pycon
>>> from identify import identify
>>> identify.license_id('LICENSE')
'MIT'
```
The return value of the `license_id` function is an [SPDX] id. Currently
licenses are sourced from [choosealicense.com].
[licensee]: https://github.com/benbalter/licensee
[SPDX]: https://spdx.org/licenses/
[choosealicense.com]: https://github.com/github/choosealicense.com
## How it works
A call to `tags_from_path` does this:
1. What is the type: file, symlink, directory? If it's not file, stop here.
2. Is it executable? Add the appropriate tag.
3. Do we recognize the file extension? If so, add the appropriate tags, stop
here. These tags would include binary/text.
4. Peek at the first X bytes of the file. Use these to determine whether it is
binary or text, add the appropriate tag.
5. If identified as text above, try to read and interpret the shebang, and add
appropriate tags.
By design, this means we don't need to partially read files where we recognize
the file extension.

63
bin/vendor-licenses Executable file
View file

@ -0,0 +1,63 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Usage:
./bin/vendor-licenses > identify/vendor/licenses.py
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import argparse
import os.path
import subprocess
import tempfile
def main():
    """Print a regenerated ``identify/vendor/licenses.py`` to stdout.

    Clones choosealicense.com into a temporary directory, checks out the
    ``_licenses`` directory at ``--revision`` (default ``HEAD``), extracts
    the ``spdx-id`` and the license body from every file found there, and
    prints a Python module defining ``LICENSES`` as a tuple of
    ``(spdx, text)`` pairs sorted by spdx id.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--revision', default='HEAD')
    args = parser.parse_args()

    licenses = []

    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.check_call((
            'git', 'clone', '--no-checkout', '--quiet',
            'https://github.com/github/choosealicense.com', tmpdir,
        ))
        subprocess.check_call((
            'git', '-C', tmpdir, 'checkout', args.revision, '--', '_licenses',
        ))

        for filename in os.listdir(os.path.join(tmpdir, '_licenses')):
            filename = os.path.join(tmpdir, '_licenses', filename)

            # Decode explicitly as UTF-8: the generated module is declared
            # UTF-8, so reading must not depend on the locale's default
            # encoding.
            with open(filename, encoding='UTF-8') as f:
                contents = f.read()

            # Each file is `---\n<front matter>---\n<license text>`; the
            # first split element is whatever precedes the opening marker.
            _, data, license_text = contents.split('---\n', 2)

            # Exactly one `spdx-id:` line is expected; the single-element
            # unpacking raises ValueError otherwise.
            spdx, = [
                line[len('spdx-id:'):].strip()
                for line in data.splitlines()
                if line.startswith('spdx-id:')
            ]

            licenses.append((spdx, license_text))

    print('# -*- coding: utf-8 -*-')
    print('from __future__ import absolute_import')
    print('from __future__ import unicode_literals')
    print('LICENSES = (')
    for spdx, text in sorted(licenses):
        print(' (')
        print(' {!r},'.format(spdx))
        print(" '''\\")
        print(text.replace('\t', ' ').replace(' \n', '').strip())
        print("''',")
        print(' ),')
    print(')')
if __name__ == '__main__':
exit(main())

0
identify/__init__.py Normal file
View file

36
identify/cli.py Normal file
View file

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import argparse
import json
from identify import identify
def main(argv=None):
    """Command-line entry point: print the tags for a path as sorted JSON.

    With ``--filename-only`` only the file name is examined; otherwise the
    path on disk is inspected.  Returns 0 after printing a non-empty tag
    list.  Returns 1 when no tags were found (nothing is printed) or when
    the lookup raised ``ValueError`` (its message is printed).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename-only', action='store_true')
    parser.add_argument('path')
    options = parser.parse_args(argv)

    lookup = (
        identify.tags_from_filename
        if options.filename_only
        else identify.tags_from_path
    )

    try:
        found = sorted(lookup(options.path))
    except ValueError as error:
        print(error)
        return 1

    if found:
        print(json.dumps(found))
        return 0
    return 1
if __name__ == '__main__':
exit(main())

224
identify/extensions.py Normal file
View file

@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
EXTENSIONS = {
'apinotes': {'text', 'apinotes'},
'asar': {'binary', 'asar'},
'bash': {'text', 'shell', 'bash'},
'bat': {'text', 'batch'},
'bmp': {'binary', 'image', 'bitmap'},
'bz2': {'binary', 'bzip2'},
'c': {'text', 'c'},
'cc': {'text', 'c++'},
'cu': {'text', 'cuda'},
'cfg': {'text'},
'cmake': {'text', 'cmake'},
'cnf': {'text'},
'coffee': {'text', 'coffee'},
'conf': {'text'},
'cpp': {'text', 'c++'},
'crt': {'text', 'pem'},
'cs': {'text', 'c#'},
'cson': {'text', 'cson'},
'css': {'text', 'css'},
'csv': {'text', 'csv'},
'cxx': {'text', 'c++'},
'dart': {'text', 'dart'},
'def': {'text', 'def'},
'dtd': {'text', 'dtd'},
'ear': {'binary', 'zip', 'jar'},
'ejs': {'text', 'ejs'},
'eot': {'binary', 'eot'},
'eps': {'binary', 'eps'},
'erb': {'text', 'erb'},
'exe': {'binary'},
'eyaml': {'text', 'yaml'},
'feature': {'text', 'gherkin'},
'fish': {'text', 'fish'},
'gemspec': {'text', 'ruby'},
'gif': {'binary', 'image', 'gif'},
'go': {'text', 'go'},
'gotmpl': {'text', 'gotmpl'},
'gpx': {'text', 'gpx', 'xml'},
'gradle': {'text', 'groovy'},
'groovy': {'text', 'groovy'},
'gyb': {'text', 'gyb'},
'gyp': {'text', 'gyp', 'python'},
'gypi': {'text', 'gyp', 'python'},
'gz': {'binary', 'gzip'},
'h': {'text', 'header', 'c', 'c++'},
'hpp': {'text', 'header', 'c++'},
'htm': {'text', 'html'},
'html': {'text', 'html'},
'hxx': {'text', 'header', 'c++'},
'icns': {'binary', 'icns'},
'ico': {'binary', 'icon'},
'ics': {'text', 'icalendar'},
'idl': {'text', 'idl'},
'idr': {'text', 'idris'},
'inc': {'text', 'inc'},
'ini': {'text', 'ini'},
'j2': {'text', 'jinja'},
'jade': {'text', 'jade'},
'jar': {'binary', 'zip', 'jar'},
'java': {'text', 'java'},
'jenkinsfile': {'text', 'groovy'},
'jinja': {'text', 'jinja'},
'jinja2': {'text', 'jinja'},
'jpeg': {'binary', 'image', 'jpeg'},
'jpg': {'binary', 'image', 'jpeg'},
'js': {'text', 'javascript'},
'json': {'text', 'json'},
'jsonnet': {'text', 'jsonnet'},
'jsx': {'text', 'jsx'},
'key': {'text', 'pem'},
'kml': {'text', 'kml', 'xml'},
'kt': {'text', 'kotlin'},
'less': {'text', 'less'},
'lidr': {'text', 'idris'},
'lua': {'text', 'lua'},
'm': {'text', 'c', 'objective-c'},
'manifest': {'text', 'manifest'},
'map': {'text', 'map'},
'markdown': {'text', 'markdown'},
'md': {'text', 'markdown'},
'mib': {'text', 'mib'},
'mk': {'text', 'makefile'},
'mm': {'text', 'c++', 'objective-c++'},
'modulemap': {'text', 'modulemap'},
'ngdoc': {'text', 'ngdoc'},
'nim': {'text', 'nim'},
'nims': {'text', 'nim'},
'nimble': {'text', 'nimble'},
'nix': {'text', 'nix'},
'otf': {'binary', 'otf'},
'p12': {'binary', 'p12'},
'patch': {'text', 'diff'},
'pdf': {'binary', 'pdf'},
'pem': {'text', 'pem'},
'php': {'text', 'php'},
'php4': {'text', 'php'},
'php5': {'text', 'php'},
'phtml': {'text', 'php'},
'pl': {'text', 'perl'},
'plantuml': {'text', 'plantuml'},
'pm': {'text', 'perl'},
'png': {'binary', 'image', 'png'},
'po': {'text', 'pofile'},
'pp': {'text', 'puppet'},
'properties': {'text', 'java-properties'},
'proto': {'text', 'proto'},
'puml': {'text', 'plantuml'},
'purs': {'text', 'purescript'},
'py': {'text', 'python'},
'pyi': {'text', 'pyi'},
'pyx': {'text', 'cython'},
'pxd': {'text', 'cython'},
'pxi': {'text', 'cython'},
'r': {'text', 'r'},
'rb': {'text', 'ruby'},
'rs': {'text', 'rust'},
'rst': {'text', 'rst'},
's': {'text', 'asm'},
'sbt': {'text', 'sbt', 'scala'},
'sc': {'text', 'scala'},
'scala': {'text', 'scala'},
'scss': {'text', 'scss'},
'scm': {'text', 'scheme'},
'sh': {'text', 'shell'},
'sls': {'text', 'salt'},
'so': {'binary'},
'sol': {'text', 'solidity'},
'spec': {'text', 'spec'},
'ss': {'text', 'scheme'},
'styl': {'text', 'stylus'},
'sql': {'text', 'sql'},
'svg': {'text', 'image', 'svg'},
'swf': {'binary', 'swf'},
'swift': {'text', 'swift'},
'swiftdeps': {'text', 'swiftdeps'},
'tac': {'text', 'twisted', 'python'},
'tar': {'binary', 'tar'},
'tgz': {'binary', 'gzip'},
'thrift': {'text', 'thrift'},
'tiff': {'binary', 'image', 'tiff'},
'toml': {'text', 'toml'},
'tf': {'text', 'terraform'},
'ts': {'text', 'ts'},
'tsx': {'text', 'tsx'},
'ttf': {'binary', 'ttf'},
'txt': {'text', 'plain-text'},
'vdx': {'text', 'vdx'},
'vim': {'text', 'vim'},
'vue': {'text', 'vue'},
'war': {'binary', 'zip', 'jar'},
'wav': {'binary', 'audio', 'wav'},
'wkt': {'text', 'wkt'},
'whl': {'binary', 'wheel', 'zip'},
'woff': {'binary', 'woff'},
'woff2': {'binary', 'woff2'},
'wsgi': {'text', 'wsgi', 'python'},
'xml': {'text', 'xml'},
'xq': {'text', 'xquery'},
'xql': {'text', 'xquery'},
'xqm': {'text', 'xquery'},
'xqu': {'text', 'xquery'},
'xquery': {'text', 'xquery'},
'xqy': {'text', 'xquery'},
'xsd': {'text', 'xml', 'xsd'},
'xsl': {'text', 'xml', 'xsl'},
'yaml': {'text', 'yaml'},
'yang': {'text', 'yang'},
'yin': {'text', 'xml', 'yin'},
'yml': {'text', 'yaml'},
'zig': {'text', 'zig'},
'zip': {'binary', 'zip'},
'zsh': {'text', 'shell', 'zsh'},
}
# Extensions whose files may be either text or binary.  The tag sets here
# deliberately contain neither 'text' nor 'binary'; identify.tags_from_path
# adds one of them after inspecting the file contents.
EXTENSIONS_NEED_BINARY_CHECK = {
    'plist': {'plist'},
}
NAMES = {
'.babelrc': EXTENSIONS['json'] | {'babelrc'},
'.bashrc': EXTENSIONS['bash'],
'.bash_aliases': EXTENSIONS['bash'],
'.bash_profile': EXTENSIONS['bash'],
'.bowerrc': EXTENSIONS['json'] | {'bowerrc'},
'.coveragerc': EXTENSIONS['ini'] | {'coveragerc'},
'.dockerignore': {'text', 'dockerignore'},
'.editorconfig': {'text', 'editorconfig'},
'.gitconfig': EXTENSIONS['ini'] | {'gitconfig'},
'.hgrc': EXTENSIONS['ini'] | {'hgrc'},
'.gitattributes': {'text', 'gitattributes'},
'.gitignore': {'text', 'gitignore'},
'.gitmodules': {'text', 'gitmodules'},
'.jshintrc': EXTENSIONS['json'] | {'jshintrc'},
'.mailmap': {'text', 'mailmap'},
'.mention-bot': EXTENSIONS['json'] | {'mention-bot'},
'.npmignore': {'text', 'npmignore'},
'.pdbrc': EXTENSIONS['py'] | {'pdbrc'},
'.pypirc': EXTENSIONS['ini'] | {'pypirc'},
'.yamllint': EXTENSIONS['yaml'] | {'yamllint'},
'.zshrc': EXTENSIONS['zsh'],
'AUTHORS': EXTENSIONS['txt'],
'BUILD.bazel': {'text', 'bazel'},
'BUILD': {'text', 'bazel'},
'CMakeLists.txt': EXTENSIONS['cmake'],
'COPYING': EXTENSIONS['txt'],
'Dockerfile': {'text', 'dockerfile'},
'Gemfile': EXTENSIONS['rb'],
'Jenkinsfile': {'text', 'groovy'},
'LICENSE': EXTENSIONS['txt'],
'MAINTAINERS': EXTENSIONS['txt'],
'Makefile': EXTENSIONS['mk'],
'NOTICE': EXTENSIONS['txt'],
'PATENTS': EXTENSIONS['txt'],
'Pipfile': EXTENSIONS['toml'],
'Pipfile.lock': EXTENSIONS['json'],
'README': EXTENSIONS['txt'],
'Rakefile': EXTENSIONS['rb'],
'setup.cfg': EXTENSIONS['ini'],
}

230
identify/identify.py Normal file
View file

@ -0,0 +1,230 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import io
import os.path
import re
import shlex
import string
import sys
from identify import extensions
from identify import interpreters
from identify.vendor import licenses
printable = frozenset(string.printable)
DIRECTORY = 'directory'
SYMLINK = 'symlink'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'
ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
ALL_TAGS.update(*extensions.EXTENSIONS.values())
ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
ALL_TAGS.update(*extensions.NAMES.values())
ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(ALL_TAGS)
def tags_from_path(path):
    """Return the set of tags identifying the file at ``path``.

    A symbolic link yields ``{'symlink'}`` and a directory yields
    ``{'directory'}``; neither is inspected further.  Anything else is
    tagged ``'file'`` plus:

    * ``'executable'`` or ``'non-executable'``,
    * the tags derived from its file name, or -- when the name yields
      nothing and the file is executable -- the tags of its shebang
      interpreter,
    * ``'text'`` or ``'binary'``, sniffed from the contents when the
      previous steps did not already decide it.

    Raises ``ValueError`` if ``path`` does not exist (a broken symlink
    counts as existing).
    """
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    # Test for a link before testing for a directory: `os.path.isdir`
    # follows symlinks, so a link pointing at a directory would otherwise
    # be reported as a directory instead of a symlink.
    if os.path.islink(path):
        return {SYMLINK}
    if os.path.isdir(path):
        return {DIRECTORY}

    tags = {FILE}

    executable = os.access(path, os.X_OK)
    if executable:
        tags.add(EXECUTABLE)
    else:
        tags.add(NON_EXECUTABLE)

    # As an optimization, if we're able to read tags from the filename, then
    # we don't peek at the file contents.
    t = tags_from_filename(os.path.basename(path))
    if len(t) > 0:
        tags.update(t)
    else:
        if executable:
            shebang = parse_shebang_from_file(path)
            if len(shebang) > 0:
                tags.update(tags_from_interpreter(shebang[0]))

    # some extensions can be both binary and text
    # see EXTENSIONS_NEED_BINARY_CHECK
    if not {TEXT, BINARY} & tags:
        if file_is_text(path):
            tags.add(TEXT)
        else:
            tags.add(BINARY)

    assert {TEXT, BINARY} & tags, tags
    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
    return tags
def tags_from_filename(filename):
    """Return the tags implied by a file name alone (no disk access).

    Only the final path component is considered.  Tags come from two
    sources, which are combined:

    * the first of "whole name, then each dot-separated part" that is a
      known name (so ``Dockerfile.xenial`` matches ``Dockerfile``);
    * the lower-cased extension, looked up among the known extensions.

    Returns an empty set when nothing is recognised.
    """
    basename = os.path.basename(filename)
    suffix = os.path.splitext(basename)[1]

    tags = set()

    # Allow e.g. "Dockerfile.xenial" to match "Dockerfile"
    candidates = [basename]
    candidates.extend(basename.split('.'))
    for candidate in candidates:
        if candidate in extensions.NAMES:
            tags.update(extensions.NAMES[candidate])
            break

    if suffix:
        key = suffix[1:].lower()
        # The first table containing the extension wins.
        for table in (
                extensions.EXTENSIONS,
                extensions.EXTENSIONS_NEED_BINARY_CHECK,
        ):
            if key in table:
                tags.update(table[key])
                break

    return tags
def tags_from_interpreter(interpreter):
    """Return the tags for an interpreter name or path.

    Any directory prefix is dropped, then trailing dot-separated pieces
    are removed one at a time until a known interpreter is found, e.g.
    "python3.5.2" => "python3.5" => "python3".  Returns an empty set when
    nothing matches.
    """
    name = interpreter.rpartition('/')[2]

    while name:
        if name in interpreters.INTERPRETERS:
            return interpreters.INTERPRETERS[name]
        # Drop the last ".suffix"; with no dot left this yields '' and
        # ends the loop.
        name = name.rpartition('.')[0]

    return set()
def is_text(bytesio):
    """Return whether the first KB of contents seems to be text.

    Reads up to 1024 bytes from ``bytesio`` and returns ``True`` when every
    byte read belongs to the allowed set below (empty input is text).

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
    """
    # Byte values accepted as text: selected control characters, 0x20-0x7E
    # and every byte from 0x80 upwards.
    allowed = bytearray([7, 8, 9, 10, 11, 12, 13, 27])
    allowed.extend(range(0x20, 0x7F))
    allowed.extend(range(0x80, 0x100))

    head = bytesio.read(1024)
    # Delete every allowed byte; anything left over marks the data binary.
    leftover = head.translate(None, allowed)
    return len(leftover) == 0
def file_is_text(path):
    """Return whether the file at ``path`` looks like text (see `is_text`).

    Raises ``ValueError`` if ``path`` does not exist.
    """
    if os.path.lexists(path):
        with open(path, 'rb') as stream:
            return is_text(stream)
    raise ValueError('{} does not exist.'.format(path))
def _shebang_split(line):
try:
# shebangs aren't supposed to be quoted, though some tools such as
# setuptools will write them with quotes so we'll best-guess parse
# with shlex first
return shlex.split(line)
except ValueError:
# failing that, we'll do a more "traditional" shebang parsing which
# just involves splitting by whitespace
return line.split()
def parse_shebang(bytesio):
    """Parse the shebang from a file opened for reading binary.

    Returns the command as a tuple of strings, with a leading
    ``/usr/bin/env`` removed.  Returns ``()`` when the stream does not
    start with ``#!``, when the first line is not valid UTF-8, or when it
    contains a character outside ``printable``.
    """
    if bytesio.read(2) != b'#!':
        return ()

    raw_line = bytesio.readline()
    try:
        line = raw_line.decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    if any(ch not in printable for ch in line):
        return ()

    argv = tuple(_shebang_split(line.strip()))
    # `argv[:1]` is empty for an empty command, so this is safe for `()`.
    if argv[:1] == ('/usr/bin/env',):
        argv = argv[1:]
    return argv
def parse_shebang_from_file(path):
    """Parse the shebang given a file path.

    Returns ``()`` for files that are not executable, without reading
    them.  Raises ``ValueError`` if ``path`` does not exist.
    """
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))

    if os.access(path, os.X_OK):
        with open(path, 'rb') as stream:
            return parse_shebang(stream)
    return ()
COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')
def _norm_license(s):
s = COPYRIGHT_RE.sub('', s)
s = WS_RE.sub(' ', s)
return s.strip()
def license_id(filename):
    """Return the spdx id for the license contained in `filename`.  If no
    license is detected, returns `None`.

    spdx: https://spdx.org/licenses/
    licenses from choosealicense.com:
    https://github.com/github/choosealicense.com

    Approximate algorithm:

    1. strip copyright line
    2. normalize whitespace (replace all whitespace with a single space)
    3. check exact text match with existing licenses
    4. failing that use edit distance

    `filename` is read as UTF-8.  Requires the optional `editdistance`
    dependency.
    """
    import editdistance  # `pip install identify[license]`

    with io.open(filename, encoding='UTF-8') as f:
        contents = f.read()

    norm = _norm_license(contents)

    min_edit_dist = sys.maxsize
    min_edit_dist_spdx = ''

    for spdx, text in licenses.LICENSES:
        norm_license = _norm_license(text)

        # try exact matches
        if norm == norm_license:
            return spdx

        # An empty document can never pass the 5% threshold below, so do
        # not spend time computing edit distances for it.
        if not norm:
            continue

        # skip the slow calculation if the lengths are very different
        if abs(len(norm) - len(norm_license)) / len(norm) > .05:
            continue

        edit_dist = editdistance.eval(norm, norm_license)
        if edit_dist < min_edit_dist:
            min_edit_dist = edit_dist
            min_edit_dist_spdx = spdx

    # if there's less than 5% edited from the license, we found our match
    if norm and min_edit_dist / len(norm) < .05:
        return min_edit_dist_spdx
    else:
        # no matches :'(
        return None

18
identify/interpreters.py Normal file
View file

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
# Maps an interpreter name to the set of tags it implies.
# identify.tags_from_interpreter looks names up here after dropping any
# directory prefix, and retries with trailing ".suffix" parts removed, so
# e.g. "python3.5" resolves through the 'python3' entry.
INTERPRETERS = {
    'bash': {'shell', 'bash'},
    'dash': {'shell', 'dash'},
    'node': {'javascript'},
    'nodejs': {'javascript'},
    'perl': {'perl'},
    'python': {'python'},
    'python2': {'python', 'python2'},
    'python3': {'python', 'python3'},
    'ruby': {'ruby'},
    'sh': {'shell', 'sh'},
    'tcsh': {'shell', 'tcsh'},
    'zsh': {'shell', 'zsh'},
}

0
identify/vendor/__init__.py vendored Normal file
View file

6749
identify/vendor/licenses.py vendored Normal file

File diff suppressed because it is too large Load diff

3
requirements-dev.txt Normal file
View file

@ -0,0 +1,3 @@
coverage
pre-commit>=0.12.0
pytest

41
setup.cfg Normal file
View file

@ -0,0 +1,41 @@
[metadata]
name = identify
version = 1.4.13
description = File identification library for Python
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/chriskuehl/identify
author = Chris Kuehl
author_email = ckuehl@ocf.berkeley.edu
license = MIT
license_file = LICENSE
classifiers =
License :: OSI Approved :: MIT License
Programming Language :: Python :: 2
Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Programming Language :: Python :: 3.4
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: Implementation :: CPython
Programming Language :: Python :: Implementation :: PyPy
[options]
packages = find:
python_requires = >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
[options.entry_points]
console_scripts =
identify-cli=identify.cli:main
[options.extras_require]
license = editdistance
[options.packages.find]
exclude =
tests*
testing*
[wheel]
universal = True

6
setup.py Normal file
View file

@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
from setuptools import setup
setup()

0
tests/__init__.py Normal file
View file

33
tests/cli_test.py Normal file
View file

@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
from identify import cli
def test_identify_cli(capsys):
ret = cli.main(('setup.py',))
out, _ = capsys.readouterr()
assert ret == 0
assert out == '["file", "non-executable", "python", "text"]\n'
def test_identify_cli_filename_only(capsys):
ret = cli.main(('setup.py', '--filename-only'))
out, _ = capsys.readouterr()
assert ret == 0
assert out == '["python", "text"]\n'
def test_identify_cli_filename_only_unidentified(capsys):
ret = cli.main(('x.unknown', '--filename-only'))
out, _ = capsys.readouterr()
assert ret == 1
assert out == ''
def test_file_not_found(capsys):
ret = cli.main(('x.unknown',))
out, _ = capsys.readouterr()
assert ret == 1
assert out == 'x.unknown does not exist.\n'

26
tests/extensions_test.py Normal file
View file

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import pytest
from identify import extensions
@pytest.mark.parametrize('extension', extensions.EXTENSIONS)
def test_extensions_have_binary_or_text(extension):
tags = extensions.EXTENSIONS[extension]
assert len({'text', 'binary'} & tags) == 1, tags
@pytest.mark.parametrize('extension', extensions.EXTENSIONS_NEED_BINARY_CHECK)
def test_need_binary_check_do_not_specify_text_binary(extension):
tags = extensions.EXTENSIONS_NEED_BINARY_CHECK[extension]
assert len({'text', 'binary'} & tags) == 0, tags
def test_mutually_exclusive_check_types():
assert not (
set(extensions.EXTENSIONS) &
set(extensions.EXTENSIONS_NEED_BINARY_CHECK)
)

281
tests/identify_test.py Normal file
View file

@ -0,0 +1,281 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import io
import os
import stat
import pytest
from identify import identify
def test_all_tags_includes_basic_ones():
assert 'file' in identify.ALL_TAGS
assert 'directory' in identify.ALL_TAGS
def test_all_tags_contains_each_type():
assert 'xml' in identify.ALL_TAGS # extension
assert 'plist' in identify.ALL_TAGS # extension, needs binary check
assert 'dockerfile' in identify.ALL_TAGS # by file convention
assert 'python3' in identify.ALL_TAGS # by shebang
def test_tags_from_path_does_not_exist(tmpdir):
x = tmpdir.join('foo')
with pytest.raises(ValueError):
identify.tags_from_path(x.strpath)
def test_tags_from_path_directory(tmpdir):
x = tmpdir.join('foo')
x.mkdir()
assert identify.tags_from_path(x.strpath) == {'directory'}
def test_tags_from_path_symlink(tmpdir):
x = tmpdir.join('foo')
x.mksymlinkto(tmpdir.join('lol').ensure())
assert identify.tags_from_path(x.strpath) == {'symlink'}
def test_tags_from_path_broken_symlink(tmpdir):
x = tmpdir.join('foo')
x.mksymlinkto(tmpdir.join('lol'))
assert identify.tags_from_path(x.strpath) == {'symlink'}
def test_tags_from_path_simple_file(tmpdir):
x = tmpdir.join('test.py').ensure()
assert identify.tags_from_path(x.strpath) == {
'file', 'text', 'non-executable', 'python',
}
def test_tags_from_path_file_with_incomplete_shebang(tmpdir):
x = tmpdir.join('test')
x.write_text('#! \n', encoding='UTF-8')
make_executable(x.strpath)
assert identify.tags_from_path(x.strpath) == {
'file', 'text', 'executable',
}
def test_tags_from_path_file_with_shebang_non_executable(tmpdir):
x = tmpdir.join('test')
x.write_text('#!/usr/bin/env python\nimport sys\n', encoding='UTF-8')
assert identify.tags_from_path(x.strpath) == {
'file', 'text', 'non-executable',
}
def test_tags_from_path_file_with_shebang_executable(tmpdir):
x = tmpdir.join('test')
x.write_text('#!/usr/bin/env python\nimport sys\n', encoding='UTF-8')
make_executable(x.strpath)
assert identify.tags_from_path(x.strpath) == {
'file', 'text', 'executable', 'python',
}
def test_tags_from_path_binary(tmpdir):
x = tmpdir.join('test')
x.write(b'\x7f\x45\x4c\x46\x02\x01\x01')
make_executable(x.strpath)
assert identify.tags_from_path(x.strpath) == {
'file', 'binary', 'executable',
}
def test_tags_from_path_plist_binary(tmpdir):
x = tmpdir.join('t.plist')
x.write_binary(
b'bplist00\xd1\x01\x02_\x10\x0fLast Login NameWDefault\x08\x0b\x1d\x00'
b'\x00\x00\x00\x00\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00'
b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00%',
)
assert identify.tags_from_path(x.strpath) == {
'file', 'plist', 'binary', 'non-executable',
}
def test_tags_from_path_plist_text(tmpdir):
x = tmpdir.join('t.plist')
x.write(
'<?xml version="1.0" encoding="UTF-8"?>\n'
'<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n'
'<plist version="1.0">\n'
'<dict>\n'
'\t<key>Last Login Name</key>\n'
'\t<string>Default</string>\n'
'</dict>\n'
'</plist>\n',
)
assert identify.tags_from_path(x.strpath) == {
'file', 'plist', 'text', 'non-executable',
}
@pytest.mark.parametrize(
('filename', 'expected'),
(
('test.py', {'text', 'python'}),
('test.mk', {'text', 'makefile'}),
('Makefile', {'text', 'makefile'}),
('Dockerfile', {'text', 'dockerfile'}),
('Dockerfile.xenial', {'text', 'dockerfile'}),
('xenial.Dockerfile', {'text', 'dockerfile'}),
('Pipfile', {'text', 'toml'}),
('Pipfile.lock', {'text', 'json'}),
('mod/test.py', {'text', 'python'}),
('mod/Dockerfile', {'text', 'dockerfile'}),
# does not set binary / text
('f.plist', {'plist'}),
# case of extension should be ignored
('f.JPG', {'binary', 'image', 'jpeg'}),
# but case of name checks should still be honored
('dockerfile.py', {'text', 'python'}),
# full filename tests should take precedence over extension tests
('test.cfg', {'text'}),
('setup.cfg', {'text', 'ini'}),
# Filename matches should still include extensions if applicable
('README.md', {'text', 'markdown', 'plain-text'}),
('test.weird-unrecognized-extension', set()),
('test', set()),
('', set()),
),
)
def test_tags_from_filename(filename, expected):
assert identify.tags_from_filename(filename) == expected
@pytest.mark.parametrize(
('interpreter', 'expected'),
(
('python', {'python'}),
('python3', {'python3', 'python'}),
('python3.5.2', {'python3', 'python'}),
('/usr/bin/python3.5.2', {'python3', 'python'}),
('/usr/bin/herpderpderpderpderp', set()),
('something-random', set()),
('', set()),
),
)
def test_tags_from_interpreter(interpreter, expected):
assert identify.tags_from_interpreter(interpreter) == expected
@pytest.mark.parametrize(
('data', 'expected'),
(
(b'hello world', True),
(b'', True),
('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)'.encode('utf8'), True),
(r'¯\_(ツ)_/¯'.encode('utf8'), True),
('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪┏(・o・)┛♪'.encode('utf8'), True),
('éóñå'.encode('latin1'), True),
(b'hello world\x00', False),
(b'\x7f\x45\x4c\x46\x02\x01\x01', False), # first few bytes of /bin/bash
(b'\x43\x92\xd9\x0f\xaf\x32\x2c', False), # some /dev/urandom output
),
)
def test_is_text(data, expected):
assert identify.is_text(io.BytesIO(data)) is expected
def test_file_is_text_simple(tmpdir):
x = tmpdir.join('f')
x.write_text('hello there\n', encoding='UTF-8')
assert identify.file_is_text(x.strpath) is True
def test_file_is_text_does_not_exist(tmpdir):
x = tmpdir.join('f')
with pytest.raises(ValueError):
identify.file_is_text(x.strpath)
@pytest.mark.parametrize(
('s', 'expected'),
(
(b'', ()),
(b'#!/usr/bin/python', ('/usr/bin/python',)),
(b'#!/usr/bin/env python', ('python',)),
(b'#! /usr/bin/python', ('/usr/bin/python',)),
(b'#!/usr/bin/foo python', ('/usr/bin/foo', 'python')),
# despite this being invalid, setuptools will write shebangs like this
(b'#!"/path/with spaces/x" y', ('/path/with spaces/x', 'y')),
# this is apparently completely ok to embed quotes
(b"#!/path'with/quotes y", ("/path'with/quotes", 'y')),
# Don't regress on leading/trailing ws
(b"#! /path'with/quotes y ", ("/path'with/quotes", 'y')),
(b'\xf9\x93\x01\x42\xcd', ()),
(b'#!\xf9\x93\x01\x42\xcd', ()),
(b'#!\x00\x00\x00\x00', ()),
),
)
def test_parse_shebang(s, expected):
assert identify.parse_shebang(io.BytesIO(s)) == expected
def test_parse_shebang_from_file_does_not_exist():
with pytest.raises(ValueError):
identify.parse_shebang_from_file('herp derp derp')
def test_parse_shebang_from_file_nonexecutable(tmpdir):
x = tmpdir.join('f')
x.write_text('#!/usr/bin/env python', encoding='UTF-8')
assert identify.parse_shebang_from_file(x.strpath) == ()
def test_parse_shebang_from_file_simple(tmpdir):
x = tmpdir.join('f')
x.write_text('#!/usr/bin/env python', encoding='UTF-8')
make_executable(x.strpath)
assert identify.parse_shebang_from_file(x.strpath) == ('python',)
def make_executable(filename):
    """Add the execute permission for user, group and other to `filename`.

    All permission bits already set on the file are kept.
    """
    exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    current_mode = os.stat(filename).st_mode
    os.chmod(filename, current_mode | exec_bits)
def test_license_identification():
assert identify.license_id('LICENSE') == 'MIT'
def test_license_exact_identification(tmpdir):
wtfpl = '''\
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.
'''
f = tmpdir.join('LICENSE')
f.write(wtfpl)
assert identify.license_id(f.strpath) == 'WTFPL'
def test_license_not_identified():
assert identify.license_id(os.devnull) is None

19
tox.ini Normal file
View file

@ -0,0 +1,19 @@
[tox]
envlist = py27,py35,py36,pypy
tox_pip_extensions_ext_venv_update = true
[testenv]
deps = -rrequirements-dev.txt
extras = license
commands =
coverage erase
coverage run -m pytest {posargs:tests}
coverage report --fail-under 100
pre-commit install -f --install-hooks
pre-commit run --all-files
[flake8]
max-line-length = 119
[pep8]
ignore = E265,E501,W504