
Adding upstream version 2.1.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Author: Daniel Baumann, 2025-02-12 21:44:15 +01:00
parent c3f707bfbc
commit 085459798b
Signed by: daniel (GPG key ID: FBB4F0E80A80222F)
14 changed files with 132 additions and 104 deletions

View file

@@ -2,22 +2,24 @@ repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-docstring-first
- id: check-merge-conflict
- id: check-yaml
- id: debug-statements
- id: double-quote-string-fixer
- id: end-of-file-fixer
- id: name-tests-test
- id: check-added-large-files
- id: check-byte-order-marker
- id: fix-encoding-pragma
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://github.com/asottile/setup-cfg-fmt
rev: v1.16.0
hooks:
- id: setup-cfg-fmt
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.4
hooks:
- id: flake8
exclude: ^identify/vendor/licenses\.py$
additional_dependencies: [flake8-typing-imports==1.10.1]
- repo: https://github.com/pre-commit/mirrors-autopep8
rev: v1.5.4
hooks:
@@ -26,11 +28,18 @@ repos:
rev: v2.4.0
hooks:
- id: reorder-python-imports
args: [
'--add-import', 'from __future__ import absolute_import',
'--add-import', 'from __future__ import unicode_literals',
]
args: [--py3-plus]
- repo: https://github.com/asottile/add-trailing-comma
rev: v2.1.0
hooks:
- id: add-trailing-comma
args: [--py36-plus]
- repo: https://github.com/asottile/pyupgrade
rev: v2.10.0
hooks:
- id: pyupgrade
args: [--py36-plus]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.812
hooks:
- id: mypy

View file

@@ -37,7 +37,7 @@ If you have an actual file on disk, you can get the most information possible
When using a file on disk, the checks performed are:
* File type (file, symlink, directory)
* File type (file, symlink, directory, socket)
* Mode (is it executable?)
* File name (mostly based on extension)
* If executable, the shebang is read and the interpreter interpreted
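For readers skimming the diff, here is a minimal Python sketch (not part of this commit) of the library calls behind those checks; the expected tag sets mirror the `identify-cli` examples shown below:

```python
# Minimal sketch: the library API behind the CLI; outputs mirror the README examples.
from identify import identify

# Full inspection requires a real file on disk (type, mode, name, shebang).
print(identify.tags_from_path('setup.py'))
# -> {'file', 'non-executable', 'python', 'text'}

# Name-only inspection never touches the filesystem.
print(identify.tags_from_filename('setup.py'))
# -> {'python', 'text'}
```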
@@ -76,11 +76,11 @@ optional arguments:
--filename-only
```
```bash
```console
$ identify-cli setup.py; echo $?
["file", "non-executable", "python", "text"]
0
identify setup.py --filename-only; echo $?
$ identify setup.py --filename-only; echo $?
["python", "text"]
0
$ identify-cli wat.wat; echo $?

View file

@@ -1,19 +1,15 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Usage:
./bin/vendor-licenses > identify/vendor/licenses.py
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import argparse
import os.path
import subprocess
import tempfile
def main():
def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument('--revision', default='HEAD')
args = parser.parse_args()
@@ -45,18 +41,16 @@ def main():
licenses.append((spdx, license_text))
print('# -*- coding: utf-8 -*-')
print('from __future__ import absolute_import')
print('from __future__ import unicode_literals')
print('LICENSES = (')
for spdx, text in sorted(licenses):
print(' (')
print(' {!r},'.format(spdx))
print(f' {spdx!r},')
print(" '''\\")
print(text.replace('\t', ' ').replace(' \n', '').strip())
print("''',")
print(' ),')
print(')')
return 0
if __name__ == '__main__':

View file

@@ -1,14 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import argparse
import json
from typing import Optional
from typing import Sequence
from identify import identify
def main(argv=None):
def main(argv: Optional[Sequence[str]] = None) -> int:
parser = argparse.ArgumentParser()
parser.add_argument('--filename-only', action='store_true')
parser.add_argument('path')

View file

@@ -1,13 +1,9 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
EXTENSIONS = {
'adoc': {'text', 'asciidoc'},
'asciidoc': {'text', 'asciidoc'},
'apinotes': {'text', 'apinotes'},
'asar': {'binary', 'asar'},
'avif': {'binary', 'image', 'avif'},
'bash': {'text', 'shell', 'bash'},
'bat': {'text', 'batch'},
'bib': {'text', 'bib'},

View file

@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import io
import os.path
import re
import shlex
import stat
import string
import sys
from typing import IO
from typing import List
from typing import Optional
from typing import Set
from typing import Tuple
from identify import extensions
from identify import interpreters
@@ -19,27 +19,37 @@ printable = frozenset(string.printable)
DIRECTORY = 'directory'
SYMLINK = 'symlink'
SOCKET = 'socket'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'
ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
ALL_TAGS.update(*extensions.EXTENSIONS.values())
ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
ALL_TAGS.update(*extensions.NAMES.values())
ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(ALL_TAGS)
TYPE_TAGS = frozenset((DIRECTORY, FILE, SYMLINK, SOCKET))
MODE_TAGS = frozenset((EXECUTABLE, NON_EXECUTABLE))
ENCODING_TAGS = frozenset((BINARY, TEXT))
_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
_ALL_TAGS.update(*extensions.EXTENSIONS.values())
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
_ALL_TAGS.update(*extensions.NAMES.values())
_ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(_ALL_TAGS)
def tags_from_path(path):
if not os.path.lexists(path):
raise ValueError('{} does not exist.'.format(path))
if os.path.isdir(path):
def tags_from_path(path: str) -> Set[str]:
try:
sr = os.lstat(path)
except (OSError, ValueError): # same error-handling as `os.lexists()`
raise ValueError(f'{path} does not exist.')
mode = sr.st_mode
if stat.S_ISDIR(mode):
return {DIRECTORY}
if os.path.islink(path):
if stat.S_ISLNK(mode):
return {SYMLINK}
if stat.S_ISSOCK(mode):
return {SOCKET}
tags = {FILE}
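As an aside (not part of the diff), the new fixed tag groups are exported alongside `ALL_TAGS`; a quick sanity sketch, mirroring the test added further down:

```python
from identify import identify

# Each fixed group is a strict subset of the full tag universe.
for group in (identify.TYPE_TAGS, identify.MODE_TAGS, identify.ENCODING_TAGS):
    assert group < identify.ALL_TAGS

assert 'socket' in identify.TYPE_TAGS  # the new type tag added in this release
```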
@@ -62,19 +72,19 @@ def tags_from_path(path):
# some extensions can be both binary and text
# see EXTENSIONS_NEED_BINARY_CHECK
if not {TEXT, BINARY} & tags:
if not ENCODING_TAGS & tags:
if file_is_text(path):
tags.add(TEXT)
else:
tags.add(BINARY)
assert {TEXT, BINARY} & tags, tags
assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
assert ENCODING_TAGS & tags, tags
assert MODE_TAGS & tags, tags
return tags
def tags_from_filename(filename):
_, filename = os.path.split(filename)
def tags_from_filename(path: str) -> Set[str]:
_, filename = os.path.split(path)
_, ext = os.path.splitext(filename)
ret = set()
@@ -95,7 +105,7 @@ def tags_from_filename(filename):
return ret
def tags_from_interpreter(interpreter):
def tags_from_interpreter(interpreter: str) -> Set[str]:
_, _, interpreter = interpreter.rpartition('/')
# Try "python3.5.2" => "python3.5" => "python3" until one matches.
@@ -108,7 +118,7 @@ def tags_from_interpreter(interpreter):
return set()
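A brief usage sketch for `tags_from_interpreter` (not part of the diff; the fallback result on the last line is an expectation, not taken from the commit):

```python
from identify import identify

# Leading path components are stripped via rpartition('/').
print(identify.tags_from_interpreter('/usr/bin/awk'))  # {'awk'}

# Versioned names fall back: "python3.5.2" -> "python3.5" -> "python3".
print(identify.tags_from_interpreter('python3.5.2'))   # expected: {'python', 'python3'}
```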
def is_text(bytesio):
def is_text(bytesio: IO[bytes]) -> bool:
"""Return whether the first KB of contents seems to be binary.
This is roughly based on libmagic's binary/text detection:
@@ -122,14 +132,14 @@ def is_text(bytesio):
return not bool(bytesio.read(1024).translate(None, text_chars))
def file_is_text(path):
def file_is_text(path: str) -> bool:
if not os.path.lexists(path):
raise ValueError('{} does not exist.'.format(path))
raise ValueError(f'{path} does not exist.')
with open(path, 'rb') as f:
return is_text(f)
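For reference, a small sketch (not part of the diff) of how `is_text` is typically driven; the expected results match the test data further down:

```python
import io

from identify import identify

print(identify.is_text(io.BytesIO(b'hello world')))      # True
print(identify.is_text(io.BytesIO(b'hello world\x00')))  # False: NUL byte means binary
```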
def _shebang_split(line):
def _shebang_split(line: str) -> List[str]:
try:
# shebangs aren't supposed to be quoted, though some tools such as
# setuptools will write them with quotes so we'll best-guess parse
@@ -141,11 +151,14 @@ def _shebang_split(line):
return line.split()
def _parse_nix_shebang(bytesio, cmd):
def _parse_nix_shebang(
bytesio: IO[bytes],
cmd: Tuple[str, ...],
) -> Tuple[str, ...]:
while bytesio.read(2) == b'#!':
next_line = bytesio.readline()
next_line_b = bytesio.readline()
try:
next_line = next_line.decode('UTF-8')
next_line = next_line_b.decode('UTF-8')
except UnicodeDecodeError:
return cmd
@@ -162,13 +175,13 @@ def _parse_nix_shebang(bytesio, cmd):
return cmd
def parse_shebang(bytesio):
def parse_shebang(bytesio: IO[bytes]) -> Tuple[str, ...]:
"""Parse the shebang from a file opened for reading binary."""
if bytesio.read(2) != b'#!':
return ()
first_line = bytesio.readline()
first_line_b = bytesio.readline()
try:
first_line = first_line.decode('UTF-8')
first_line = first_line_b.decode('UTF-8')
except UnicodeDecodeError:
return ()
@@ -185,10 +198,10 @@ def parse_shebang(bytesio):
return cmd
def parse_shebang_from_file(path):
def parse_shebang_from_file(path: str) -> Tuple[str, ...]:
"""Parse the shebang given a file path."""
if not os.path.lexists(path):
raise ValueError('{} does not exist.'.format(path))
raise ValueError(f'{path} does not exist.')
if not os.access(path, os.X_OK):
return ()
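A usage sketch for the shebang helpers touched here (not part of the diff; `hello.sh` is a made-up throwaway path):

```python
import os

from identify import identify

# hello.sh is a hypothetical example script created just for illustration.
with open('hello.sh', 'w') as f:
    f.write('#!/usr/bin/env bash\necho hi\n')

# Non-executable files short-circuit to an empty tuple (the os.X_OK check above).
print(identify.parse_shebang_from_file('hello.sh'))  # ()

os.chmod('hello.sh', 0o755)
# With the executable bit set the shebang is parsed; /usr/bin/env is stripped,
# so this is expected to yield ('bash',).
print(identify.parse_shebang_from_file('hello.sh'))
```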
@@ -200,13 +213,13 @@ COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')
def _norm_license(s):
def _norm_license(s: str) -> str:
s = COPYRIGHT_RE.sub('', s)
s = WS_RE.sub(' ', s)
return s.strip()
def license_id(filename):
def license_id(filename: str) -> Optional[str]:
"""Return the spdx id for the license contained in `filename`. If no
license is detected, returns `None`.
@@ -222,7 +235,7 @@ def license_id(filename):
"""
import editdistance # `pip install identify[license]`
with io.open(filename, encoding='UTF-8') as f:
with open(filename, encoding='UTF-8') as f:
contents = f.read()
norm = _norm_license(contents)
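And a brief sketch for `license_id` (not part of the diff; `LICENSE` below refers to this repository's MIT license file, and the optional `editdistance` dependency comes from `pip install identify[license]` as noted in the code):

```python
from identify import identify

# Returns an SPDX id such as 'MIT', or None when no vendored license
# text is a close enough match. Requires: pip install identify[license]
print(identify.license_id('LICENSE'))
```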

View file

@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
INTERPRETERS = {
'ash': {'shell', 'ash'},
'awk': {'awk'},

View file

@@ -1,6 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
LICENSES = (
(
'0BSD',

View file

@@ -1,6 +1,6 @@
[metadata]
name = identify
version = 1.5.14
version = 2.1.0
description = File identification library for Python
long_description = file: README.md
long_description_content_type = text/markdown
@@ -11,26 +11,26 @@ license = MIT
license_file = LICENSE
classifiers =
License :: OSI Approved :: MIT License
Programming Language :: Python :: 2
Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Programming Language :: Python :: 3.4
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: Implementation :: CPython
Programming Language :: Python :: Implementation :: PyPy
[options]
packages = find:
python_requires = >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
python_requires = >=3.6.1
[options.entry_points]
console_scripts =
identify-cli=identify.cli:main
[options.extras_require]
license = editdistance
license =
editdistance
[options.packages.find]
exclude =
@@ -42,3 +42,16 @@ universal = True
[coverage:run]
plugins = covdefaults
[mypy]
check_untyped_defs = true
disallow_any_generics = true
disallow_incomplete_defs = true
disallow_untyped_defs = true
no_implicit_optional = true
[mypy-testing.*]
disallow_untyped_defs = false
[mypy-tests.*]
disallow_untyped_defs = false

View file

@@ -1,6 +1,2 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
from setuptools import setup
setup()

View file

@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
from identify import cli

View file

@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import pytest
from identify import extensions

View file

@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import io
import os
import socket
import stat
from tempfile import TemporaryDirectory
import pytest
@@ -14,6 +12,21 @@ from identify import identify
def test_all_tags_includes_basic_ones():
assert 'file' in identify.ALL_TAGS
assert 'directory' in identify.ALL_TAGS
assert 'executable' in identify.ALL_TAGS
assert 'text' in identify.ALL_TAGS
assert 'socket' in identify.ALL_TAGS
@pytest.mark.parametrize(
'tag_group',
(
identify.TYPE_TAGS,
identify.MODE_TAGS,
identify.ENCODING_TAGS,
),
)
def test_all_tags_contains_all_groups(tag_group):
assert tag_group < identify.ALL_TAGS
def test_all_tags_contains_each_type():
@@ -41,6 +54,17 @@ def test_tags_from_path_symlink(tmpdir):
assert identify.tags_from_path(x.strpath) == {'symlink'}
def test_tags_from_path_socket():
tmproot = '/tmp' # short path avoids `OSError: AF_UNIX path too long`
with TemporaryDirectory(dir=tmproot) as tmpdir:
socket_path = os.path.join(tmpdir, 'socket')
with socket.socket(socket.AF_UNIX) as sock:
sock.bind(socket_path)
tags = identify.tags_from_path(socket_path)
assert tags == {'socket'}
def test_tags_from_path_broken_symlink(tmpdir):
x = tmpdir.join('foo')
x.mksymlinkto(tmpdir.join('lol'))
@@ -177,9 +201,9 @@ def test_tags_from_interpreter(interpreter, expected):
(
(b'hello world', True),
(b'', True),
('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)'.encode('utf8'), True),
(r'¯\_(ツ)_/¯'.encode('utf8'), True),
('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode('utf8'), True),
('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)'.encode(), True),
(r'¯\_(ツ)_/¯'.encode(), True),
('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode(), True),
('éóñå'.encode('latin1'), True),
(b'hello world\x00', False),

View file

@@ -1,5 +1,5 @@
[tox]
envlist = py27,py35,py36,pypy,pre-commit
envlist = py36,pypy3,pre-commit
[testenv]
deps = -rrequirements-dev.txt