
Adding upstream version 2.1.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Author: Daniel Baumann, 2025-02-12 21:44:15 +01:00
parent c3f707bfbc
commit 085459798b
Signed by: daniel (GPG key ID: FBB4F0E80A80222F)
14 changed files with 132 additions and 104 deletions

View file

@@ -2,22 +2,24 @@ repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-docstring-first
- id: check-merge-conflict
- id: check-yaml
- id: debug-statements
- id: double-quote-string-fixer
- id: end-of-file-fixer
- id: name-tests-test
- id: check-added-large-files
- id: check-byte-order-marker
- id: fix-encoding-pragma
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://github.com/asottile/setup-cfg-fmt
rev: v1.16.0
hooks:
- id: setup-cfg-fmt
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.4
hooks:
- id: flake8
exclude: ^identify/vendor/licenses\.py$
additional_dependencies: [flake8-typing-imports==1.10.1]
- repo: https://github.com/pre-commit/mirrors-autopep8
rev: v1.5.4
hooks:
@@ -26,11 +28,18 @@ repos:
rev: v2.4.0
hooks:
- id: reorder-python-imports
args: [
'--add-import', 'from __future__ import absolute_import',
'--add-import', 'from __future__ import unicode_literals',
]
args: [--py3-plus]
- repo: https://github.com/asottile/add-trailing-comma
rev: v2.1.0
hooks:
- id: add-trailing-comma
args: [--py36-plus]
- repo: https://github.com/asottile/pyupgrade
rev: v2.10.0
hooks:
- id: pyupgrade
args: [--py36-plus]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.812
hooks:
- id: mypy

View file

@@ -37,7 +37,7 @@ If you have an actual file on disk, you can get the most information possible
When using a file on disk, the checks performed are:
* File type (file, symlink, directory)
* File type (file, symlink, directory, socket)
* Mode (is it executable?)
* File name (mostly based on extension)
* If executable, the shebang is read and the interpreter interpreted
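For readers skimming the diff, here is a minimal Python sketch (not part of this commit) of the library calls behind those checks; the expected tag sets mirror the `identify-cli` examples shown below:

```python
# Minimal sketch: the library API behind the CLI; outputs mirror the README examples.
from identify import identify

# Full inspection requires a real file on disk (type, mode, name, shebang).
print(identify.tags_from_path('setup.py'))
# -> {'file', 'non-executable', 'python', 'text'}

# Name-only inspection never touches the filesystem.
print(identify.tags_from_filename('setup.py'))
# -> {'python', 'text'}
```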
@@ -76,11 +76,11 @@ optional arguments:
--filename-only
```
```bash
```console
$ identify-cli setup.py; echo $?
["file", "non-executable", "python", "text"]
0
identify setup.py --filename-only; echo $?
$ identify setup.py --filename-only; echo $?
["python", "text"]
0
$ identify-cli wat.wat; echo $?

View file

@@ -1,19 +1,15 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Usage:
./bin/vendor-licenses > identify/vendor/licenses.py
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import argparse
import os.path
import subprocess
import tempfile
def main():
def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument('--revision', default='HEAD')
args = parser.parse_args()
@@ -45,18 +41,16 @@ def main():
licenses.append((spdx, license_text))
print('# -*- coding: utf-8 -*-')
print('from __future__ import absolute_import')
print('from __future__ import unicode_literals')
print('LICENSES = (')
for spdx, text in sorted(licenses):
print(' (')
print(' {!r},'.format(spdx))
print(f' {spdx!r},')
print(" '''\\")
print(text.replace('\t', ' ').replace(' \n', '').strip())
print("''',")
print(' ),')
print(')')
return 0
if __name__ == '__main__':

View file

@@ -1,14 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import argparse
import json
from typing import Optional
from typing import Sequence
from identify import identify
def main(argv=None):
def main(argv: Optional[Sequence[str]] = None) -> int:
parser = argparse.ArgumentParser()
parser.add_argument('--filename-only', action='store_true')
parser.add_argument('path')

View file

@@ -1,13 +1,9 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
EXTENSIONS = {
'adoc': {'text', 'asciidoc'},
'asciidoc': {'text', 'asciidoc'},
'apinotes': {'text', 'apinotes'},
'asar': {'binary', 'asar'},
'avif': {'binary', 'image', 'avif'},
'bash': {'text', 'shell', 'bash'},
'bat': {'text', 'batch'},
'bib': {'text', 'bib'},

View file

@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import io
import os.path
import re
import shlex
import stat
import string
import sys
from typing import IO
from typing import List
from typing import Optional
from typing import Set
from typing import Tuple
from identify import extensions
from identify import interpreters
@@ -19,27 +19,37 @@ printable = frozenset(string.printable)
DIRECTORY = 'directory'
SYMLINK = 'symlink'
SOCKET = 'socket'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'
ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
ALL_TAGS.update(*extensions.EXTENSIONS.values())
ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
ALL_TAGS.update(*extensions.NAMES.values())
ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(ALL_TAGS)
TYPE_TAGS = frozenset((DIRECTORY, FILE, SYMLINK, SOCKET))
MODE_TAGS = frozenset((EXECUTABLE, NON_EXECUTABLE))
ENCODING_TAGS = frozenset((BINARY, TEXT))
_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
_ALL_TAGS.update(*extensions.EXTENSIONS.values())
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
_ALL_TAGS.update(*extensions.NAMES.values())
_ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(_ALL_TAGS)
def tags_from_path(path):
if not os.path.lexists(path):
raise ValueError('{} does not exist.'.format(path))
if os.path.isdir(path):
def tags_from_path(path: str) -> Set[str]:
try:
sr = os.lstat(path)
except (OSError, ValueError): # same error-handling as `os.lexists()`
raise ValueError(f'{path} does not exist.')
mode = sr.st_mode
if stat.S_ISDIR(mode):
return {DIRECTORY}
if os.path.islink(path):
if stat.S_ISLNK(mode):
return {SYMLINK}
if stat.S_ISSOCK(mode):
return {SOCKET}
tags = {FILE}
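As an aside (not part of the diff), the new fixed tag groups are exported alongside `ALL_TAGS`; a quick sanity sketch, mirroring the test added further down:

```python
from identify import identify

# Each fixed group is a strict subset of the full tag universe.
for group in (identify.TYPE_TAGS, identify.MODE_TAGS, identify.ENCODING_TAGS):
    assert group < identify.ALL_TAGS

assert 'socket' in identify.TYPE_TAGS  # the new type tag added in this release
```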
@@ -62,19 +72,19 @@ def tags_from_path(path):
# some extensions can be both binary and text
# see EXTENSIONS_NEED_BINARY_CHECK
if not {TEXT, BINARY} & tags:
if not ENCODING_TAGS & tags:
if file_is_text(path):
tags.add(TEXT)
else:
tags.add(BINARY)
assert {TEXT, BINARY} & tags, tags
assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
assert ENCODING_TAGS & tags, tags
assert MODE_TAGS & tags, tags
return tags
def tags_from_filename(filename):
_, filename = os.path.split(filename)
def tags_from_filename(path: str) -> Set[str]:
_, filename = os.path.split(path)
_, ext = os.path.splitext(filename)
ret = set()
@@ -95,7 +105,7 @@ def tags_from_filename(filename):
return ret
def tags_from_interpreter(interpreter):
def tags_from_interpreter(interpreter: str) -> Set[str]:
_, _, interpreter = interpreter.rpartition('/')
# Try "python3.5.2" => "python3.5" => "python3" until one matches.
@@ -108,7 +118,7 @@ def tags_from_interpreter(interpreter):
return set()
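A brief usage sketch for `tags_from_interpreter` (not part of the diff; the fallback result on the last line is an expectation, not taken from the commit):

```python
from identify import identify

# Leading path components are stripped via rpartition('/').
print(identify.tags_from_interpreter('/usr/bin/awk'))  # {'awk'}

# Versioned names fall back: "python3.5.2" -> "python3.5" -> "python3".
print(identify.tags_from_interpreter('python3.5.2'))   # expected: {'python', 'python3'}
```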
def is_text(bytesio):
def is_text(bytesio: IO[bytes]) -> bool:
"""Return whether the first KB of contents seems to be binary.
This is roughly based on libmagic's binary/text detection:
@@ -122,14 +132,14 @@ def is_text(bytesio):
return not bool(bytesio.read(1024).translate(None, text_chars))
def file_is_text(path):
def file_is_text(path: str) -> bool:
if not os.path.lexists(path):
raise ValueError('{} does not exist.'.format(path))
raise ValueError(f'{path} does not exist.')
with open(path, 'rb') as f:
return is_text(f)
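For reference, a small sketch (not part of the diff) of how `is_text` is typically driven; the expected results match the test data further down:

```python
import io

from identify import identify

print(identify.is_text(io.BytesIO(b'hello world')))      # True
print(identify.is_text(io.BytesIO(b'hello world\x00')))  # False: NUL byte means binary
```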
def _shebang_split(line):
def _shebang_split(line: str) -> List[str]:
try:
# shebangs aren't supposed to be quoted, though some tools such as
# setuptools will write them with quotes so we'll best-guess parse
@@ -141,11 +151,14 @@ def _shebang_split(line):
return line.split()
def _parse_nix_shebang(bytesio, cmd):
def _parse_nix_shebang(
bytesio: IO[bytes],
cmd: Tuple[str, ...],
) -> Tuple[str, ...]:
while bytesio.read(2) == b'#!':
next_line = bytesio.readline()
next_line_b = bytesio.readline()
try:
next_line = next_line.decode('UTF-8')
next_line = next_line_b.decode('UTF-8')
except UnicodeDecodeError:
return cmd
@@ -162,13 +175,13 @@ def _parse_nix_shebang(bytesio, cmd):
return cmd
def parse_shebang(bytesio):
def parse_shebang(bytesio: IO[bytes]) -> Tuple[str, ...]:
"""Parse the shebang from a file opened for reading binary."""
if bytesio.read(2) != b'#!':
return ()
first_line = bytesio.readline()
first_line_b = bytesio.readline()
try:
first_line = first_line.decode('UTF-8')
first_line = first_line_b.decode('UTF-8')
except UnicodeDecodeError:
return ()
@@ -185,10 +198,10 @@ def parse_shebang(bytesio):
return cmd
def parse_shebang_from_file(path):
def parse_shebang_from_file(path: str) -> Tuple[str, ...]:
"""Parse the shebang given a file path."""
if not os.path.lexists(path):
raise ValueError('{} does not exist.'.format(path))
raise ValueError(f'{path} does not exist.')
if not os.access(path, os.X_OK):
return ()
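A usage sketch for the shebang helpers touched here (not part of the diff; `hello.sh` is a made-up throwaway path):

```python
import os

from identify import identify

# hello.sh is a hypothetical example script created just for illustration.
with open('hello.sh', 'w') as f:
    f.write('#!/usr/bin/env bash\necho hi\n')

# Non-executable files short-circuit to an empty tuple (the os.X_OK check above).
print(identify.parse_shebang_from_file('hello.sh'))  # ()

os.chmod('hello.sh', 0o755)
# With the executable bit set the shebang is parsed; /usr/bin/env is stripped,
# so this is expected to yield ('bash',).
print(identify.parse_shebang_from_file('hello.sh'))
```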
@@ -200,13 +213,13 @@ COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')
def _norm_license(s):
def _norm_license(s: str) -> str:
s = COPYRIGHT_RE.sub('', s)
s = WS_RE.sub(' ', s)
return s.strip()
def license_id(filename):
def license_id(filename: str) -> Optional[str]:
"""Return the spdx id for the license contained in `filename`. If no
license is detected, returns `None`.
@@ -222,7 +235,7 @@ def license_id(filename):
"""
import editdistance # `pip install identify[license]`
with io.open(filename, encoding='UTF-8') as f:
with open(filename, encoding='UTF-8') as f:
contents = f.read()
norm = _norm_license(contents)
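And a brief sketch for `license_id` (not part of the diff; `LICENSE` below refers to this repository's MIT license file, and the optional `editdistance` dependency comes from `pip install identify[license]` as noted in the code):

```python
from identify import identify

# Returns an SPDX id such as 'MIT', or None when no vendored license
# text is a close enough match. Requires: pip install identify[license]
print(identify.license_id('LICENSE'))
```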

View file

@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
INTERPRETERS = {
'ash': {'shell', 'ash'},
'awk': {'awk'},

View file

@@ -1,6 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
LICENSES = (
(
'0BSD',

View file

@@ -1,6 +1,6 @@
[metadata]
name = identify
version = 1.5.14
version = 2.1.0
description = File identification library for Python
long_description = file: README.md
long_description_content_type = text/markdown
@@ -11,26 +11,26 @@ license = MIT
license_file = LICENSE
classifiers =
License :: OSI Approved :: MIT License
Programming Language :: Python :: 2
Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Programming Language :: Python :: 3.4
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: Implementation :: CPython
Programming Language :: Python :: Implementation :: PyPy
[options]
packages = find:
python_requires = >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
python_requires = >=3.6.1
[options.entry_points]
console_scripts =
identify-cli=identify.cli:main
[options.extras_require]
license = editdistance
license =
editdistance
[options.packages.find]
exclude =
@@ -42,3 +42,16 @@ universal = True
[coverage:run]
plugins = covdefaults
[mypy]
check_untyped_defs = true
disallow_any_generics = true
disallow_incomplete_defs = true
disallow_untyped_defs = true
no_implicit_optional = true
[mypy-testing.*]
disallow_untyped_defs = false
[mypy-tests.*]
disallow_untyped_defs = false

View file

@@ -1,6 +1,2 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
from setuptools import setup
setup()

View file

@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
from identify import cli

View file

@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import pytest
from identify import extensions

View file

@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import io
import os
import socket
import stat
from tempfile import TemporaryDirectory
import pytest
@@ -14,6 +12,21 @@ from identify import identify
def test_all_tags_includes_basic_ones():
assert 'file' in identify.ALL_TAGS
assert 'directory' in identify.ALL_TAGS
assert 'executable' in identify.ALL_TAGS
assert 'text' in identify.ALL_TAGS
assert 'socket' in identify.ALL_TAGS
@pytest.mark.parametrize(
'tag_group',
(
identify.TYPE_TAGS,
identify.MODE_TAGS,
identify.ENCODING_TAGS,
),
)
def test_all_tags_contains_all_groups(tag_group):
assert tag_group < identify.ALL_TAGS
def test_all_tags_contains_each_type():
@@ -41,6 +54,17 @@ def test_tags_from_path_symlink(tmpdir):
assert identify.tags_from_path(x.strpath) == {'symlink'}
def test_tags_from_path_socket():
tmproot = '/tmp' # short path avoids `OSError: AF_UNIX path too long`
with TemporaryDirectory(dir=tmproot) as tmpdir:
socket_path = os.path.join(tmpdir, 'socket')
with socket.socket(socket.AF_UNIX) as sock:
sock.bind(socket_path)
tags = identify.tags_from_path(socket_path)
assert tags == {'socket'}
def test_tags_from_path_broken_symlink(tmpdir):
x = tmpdir.join('foo')
x.mksymlinkto(tmpdir.join('lol'))
@@ -177,9 +201,9 @@ def test_tags_from_interpreter(interpreter, expected):
(
(b'hello world', True),
(b'', True),
('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)'.encode('utf8'), True),
(r'¯\_(ツ)_/¯'.encode('utf8'), True),
('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode('utf8'), True),
('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)'.encode(), True),
(r'¯\_(ツ)_/¯'.encode(), True),
('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode(), True),
('éóñå'.encode('latin1'), True),
(b'hello world\x00', False),

View file

@@ -1,5 +1,5 @@
[tox]
envlist = py27,py35,py36,pypy,pre-commit
envlist = py36,pypy3,pre-commit
[testenv]
deps = -rrequirements-dev.txt