Adding upstream version 2.1.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-02-12 21:44:15 +01:00 · 2025-02-12 21:44:15 +01:00 · 085459798b
commit 085459798b
parent c3f707bfbc
14 changed files with 132 additions and 104 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -2,22 +2,24 @@ repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.4.0
    hooks:
    -   id: trailing-whitespace
    -   id: end-of-file-fixer
    -   id: check-docstring-first
    -   id: check-merge-conflict
    -   id: check-yaml
    -   id: debug-statements
    -   id: double-quote-string-fixer
    -   id: end-of-file-fixer
    -   id: name-tests-test
-    -   id: check-added-large-files
+    -   id: requirements-txt-fixer
-    -   id: check-byte-order-marker
+    -   id: trailing-whitespace
-    -   id: fix-encoding-pragma
+-   repo: https://github.com/asottile/setup-cfg-fmt
    rev: v1.16.0
    hooks:
    -   id: setup-cfg-fmt
 -   repo: https://gitlab.com/pycqa/flake8
    rev: 3.8.4
    hooks:
    -   id: flake8
        exclude: ^identify/vendor/licenses\.py$
        additional_dependencies: [flake8-typing-imports==1.10.1]
 -   repo: https://github.com/pre-commit/mirrors-autopep8
    rev: v1.5.4
    hooks:
@ -26,11 +28,18 @@ repos:
    rev: v2.4.0
    hooks:
    -   id: reorder-python-imports
-        args: [
+        args: [--py3-plus]
            '--add-import', 'from __future__ import absolute_import',
            '--add-import', 'from __future__ import unicode_literals',
        ]
 -   repo: https://github.com/asottile/add-trailing-comma
    rev: v2.1.0
    hooks:
    -   id: add-trailing-comma
        args: [--py36-plus]
 -   repo: https://github.com/asottile/pyupgrade
    rev: v2.10.0
    hooks:
    -   id: pyupgrade
        args: [--py36-plus]
 -   repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.812
    hooks:
    -   id: mypy
--- a/README.md
+++ b/README.md
@ -37,7 +37,7 @@ If you have an actual file on disk, you can get the most information possible
 When using a file on disk, the checks performed are:
-* File type (file, symlink, directory)
+* File type (file, symlink, directory, socket)
 * Mode (is it executable?)
 * File name (mostly based on extension)
 * If executable, the shebang is read and the interpreter interpreted
@ -76,11 +76,11 @@ optional arguments:
  --filename-only
 ```
-```bash
+```console
 $ identify-cli setup.py; echo $?
 ["file", "non-executable", "python", "text"]
 0
-identify setup.py --filename-only; echo $?
+$ identify setup.py --filename-only; echo $?
 ["python", "text"]
 0
 $ identify-cli wat.wat; echo $?
--- a/bin/vendor-licenses
+++ b/bin/vendor-licenses
@ -1,19 +1,15 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """Usage:
    ./bin/vendor-licenses > identify/vendor/licenses.py
 """
 from __future__ import absolute_import
 from __future__ import unicode_literals
 import argparse
 import os.path
 import subprocess
 import tempfile
-def main():
+def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument('--revision', default='HEAD')
    args = parser.parse_args()
@ -45,18 +41,16 @@ def main():
            licenses.append((spdx, license_text))
        print('# -*- coding: utf-8 -*-')
        print('from __future__ import absolute_import')
        print('from __future__ import unicode_literals')
        print('LICENSES = (')
        for spdx, text in sorted(licenses):
            print('    (')
-            print('        {!r},'.format(spdx))
+            print(f'        {spdx!r},')
            print("        '''\\")
            print(text.replace('\t', '    ').replace(' \n', '').strip())
            print("''',")
            print('    ),')
        print(')')
    return 0
 if __name__ == '__main__':
--- a/identify/cli.py
+++ b/identify/cli.py
@ -1,14 +1,12 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import unicode_literals
 import argparse
 import json
 from typing import Optional
 from typing import Sequence
 from identify import identify
-def main(argv=None):
+def main(argv: Optional[Sequence[str]] = None) -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename-only', action='store_true')
    parser.add_argument('path')
--- a/identify/extensions.py
+++ b/identify/extensions.py
@ -1,13 +1,9 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import unicode_literals
 EXTENSIONS = {
    'adoc': {'text', 'asciidoc'},
    'asciidoc': {'text', 'asciidoc'},
    'apinotes': {'text', 'apinotes'},
    'asar': {'binary', 'asar'},
    'avif': {'binary', 'image', 'avif'},
    'bash': {'text', 'shell', 'bash'},
    'bat': {'text', 'batch'},
    'bib': {'text', 'bib'},
--- a/identify/identify.py
+++ b/identify/identify.py
@ -1,14 +1,14 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import unicode_literals
 import io
 import os.path
 import re
 import shlex
 import stat
 import string
 import sys
 from typing import IO
 from typing import List
 from typing import Optional
 from typing import Set
 from typing import Tuple
 from identify import extensions
 from identify import interpreters
@ -19,27 +19,37 @@ printable = frozenset(string.printable)
 DIRECTORY = 'directory'
 SYMLINK = 'symlink'
 SOCKET = 'socket'
 FILE = 'file'
 EXECUTABLE = 'executable'
 NON_EXECUTABLE = 'non-executable'
 TEXT = 'text'
 BINARY = 'binary'
-ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
+TYPE_TAGS = frozenset((DIRECTORY, FILE, SYMLINK, SOCKET))
-ALL_TAGS.update(*extensions.EXTENSIONS.values())
+MODE_TAGS = frozenset((EXECUTABLE, NON_EXECUTABLE))
-ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
+ENCODING_TAGS = frozenset((BINARY, TEXT))
-ALL_TAGS.update(*extensions.NAMES.values())
+_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
-ALL_TAGS.update(*interpreters.INTERPRETERS.values())
+_ALL_TAGS.update(*extensions.EXTENSIONS.values())
-ALL_TAGS = frozenset(ALL_TAGS)
+_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
 _ALL_TAGS.update(*extensions.NAMES.values())
 _ALL_TAGS.update(*interpreters.INTERPRETERS.values())
 ALL_TAGS = frozenset(_ALL_TAGS)
-def tags_from_path(path):
+def tags_from_path(path: str) -> Set[str]:
-    if not os.path.lexists(path):
+    try:
-        raise ValueError('{} does not exist.'.format(path))
+        sr = os.lstat(path)
-    if os.path.isdir(path):
+    except (OSError, ValueError):  # same error-handling as `os.lexists()`
        raise ValueError(f'{path} does not exist.')
    mode = sr.st_mode
    if stat.S_ISDIR(mode):
        return {DIRECTORY}
-    if os.path.islink(path):
+    if stat.S_ISLNK(mode):
        return {SYMLINK}
    if stat.S_ISSOCK(mode):
        return {SOCKET}
    tags = {FILE}
@ -62,19 +72,19 @@ def tags_from_path(path):
    # some extensions can be both binary and text
    # see EXTENSIONS_NEED_BINARY_CHECK
-    if not {TEXT, BINARY} & tags:
+    if not ENCODING_TAGS & tags:
        if file_is_text(path):
            tags.add(TEXT)
        else:
            tags.add(BINARY)
-    assert {TEXT, BINARY} & tags, tags
+    assert ENCODING_TAGS & tags, tags
-    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
+    assert MODE_TAGS & tags, tags
    return tags
-def tags_from_filename(filename):
+def tags_from_filename(path: str) -> Set[str]:
-    _, filename = os.path.split(filename)
+    _, filename = os.path.split(path)
    _, ext = os.path.splitext(filename)
    ret = set()
@ -95,7 +105,7 @@ def tags_from_filename(filename):
    return ret
-def tags_from_interpreter(interpreter):
+def tags_from_interpreter(interpreter: str) -> Set[str]:
    _, _, interpreter = interpreter.rpartition('/')
    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
@ -108,7 +118,7 @@ def tags_from_interpreter(interpreter):
    return set()
-def is_text(bytesio):
+def is_text(bytesio: IO[bytes]) -> bool:
    """Return whether the first KB of contents seems to be binary.
    This is roughly based on libmagic's binary/text detection:
@ -122,14 +132,14 @@ def is_text(bytesio):
    return not bool(bytesio.read(1024).translate(None, text_chars))
-def file_is_text(path):
+def file_is_text(path: str) -> bool:
    if not os.path.lexists(path):
-        raise ValueError('{} does not exist.'.format(path))
+        raise ValueError(f'{path} does not exist.')
    with open(path, 'rb') as f:
        return is_text(f)
-def _shebang_split(line):
+def _shebang_split(line: str) -> List[str]:
    try:
        # shebangs aren't supposed to be quoted, though some tools such as
        # setuptools will write them with quotes so we'll best-guess parse
@ -141,11 +151,14 @@ def _shebang_split(line):
        return line.split()
-def _parse_nix_shebang(bytesio, cmd):
+def _parse_nix_shebang(
        bytesio: IO[bytes],
        cmd: Tuple[str, ...],
 ) -> Tuple[str, ...]:
    while bytesio.read(2) == b'#!':
-        next_line = bytesio.readline()
+        next_line_b = bytesio.readline()
        try:
-            next_line = next_line.decode('UTF-8')
+            next_line = next_line_b.decode('UTF-8')
        except UnicodeDecodeError:
            return cmd
@ -162,13 +175,13 @@ def _parse_nix_shebang(bytesio, cmd):
    return cmd
-def parse_shebang(bytesio):
+def parse_shebang(bytesio: IO[bytes]) -> Tuple[str, ...]:
    """Parse the shebang from a file opened for reading binary."""
    if bytesio.read(2) != b'#!':
        return ()
-    first_line = bytesio.readline()
+    first_line_b = bytesio.readline()
    try:
-        first_line = first_line.decode('UTF-8')
+        first_line = first_line_b.decode('UTF-8')
    except UnicodeDecodeError:
        return ()
@ -185,10 +198,10 @@ def parse_shebang(bytesio):
    return cmd
-def parse_shebang_from_file(path):
+def parse_shebang_from_file(path: str) -> Tuple[str, ...]:
    """Parse the shebang given a file path."""
    if not os.path.lexists(path):
-        raise ValueError('{} does not exist.'.format(path))
+        raise ValueError(f'{path} does not exist.')
    if not os.access(path, os.X_OK):
        return ()
@ -200,13 +213,13 @@ COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
 WS_RE = re.compile(r'\s+')
-def _norm_license(s):
+def _norm_license(s: str) -> str:
    s = COPYRIGHT_RE.sub('', s)
    s = WS_RE.sub(' ', s)
    return s.strip()
-def license_id(filename):
+def license_id(filename: str) -> Optional[str]:
    """Return the spdx id for the license contained in `filename`.  If no
    license is detected, returns `None`.
@ -222,7 +235,7 @@ def license_id(filename):
    """
    import editdistance  # `pip install identify[license]`
-    with io.open(filename, encoding='UTF-8') as f:
+    with open(filename, encoding='UTF-8') as f:
        contents = f.read()
    norm = _norm_license(contents)
--- a/identify/interpreters.py
+++ b/identify/interpreters.py
@ -1,7 +1,3 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import unicode_literals
 INTERPRETERS = {
    'ash': {'shell', 'ash'},
    'awk': {'awk'},
--- a/identify/vendor/licenses.py
+++ b/identify/vendor/licenses.py
@ -1,6 +1,3 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import unicode_literals
 LICENSES = (
    (
        '0BSD',
--- a/setup.cfg
+++ b/setup.cfg
@ -1,6 +1,6 @@
 [metadata]
 name = identify
-version = 1.5.14
+version = 2.1.0
 description = File identification library for Python
 long_description = file: README.md
 long_description_content_type = text/markdown
@ -11,26 +11,26 @@ license = MIT
 license_file = LICENSE
 classifiers =
    License :: OSI Approved :: MIT License
    Programming Language :: Python :: 2
    Programming Language :: Python :: 2.7
    Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.4
+    Programming Language :: Python :: 3 :: Only
    Programming Language :: Python :: 3.5
    Programming Language :: Python :: 3.6
    Programming Language :: Python :: 3.7
    Programming Language :: Python :: 3.8
    Programming Language :: Python :: 3.9
    Programming Language :: Python :: Implementation :: CPython
    Programming Language :: Python :: Implementation :: PyPy
 [options]
 packages = find:
-python_requires = >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
+python_requires = >=3.6.1
 [options.entry_points]
 console_scripts =
    identify-cli=identify.cli:main
 [options.extras_require]
-license = editdistance
+license =
    editdistance
 [options.packages.find]
 exclude =
@ -42,3 +42,16 @@ universal = True
 [coverage:run]
 plugins = covdefaults
 [mypy]
 check_untyped_defs = true
 disallow_any_generics = true
 disallow_incomplete_defs = true
 disallow_untyped_defs = true
 no_implicit_optional = true
 [mypy-testing.*]
 disallow_untyped_defs = false
 [mypy-tests.*]
 disallow_untyped_defs = false
--- a/setup.py
+++ b/setup.py
@ -1,6 +1,2 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import unicode_literals
 from setuptools import setup
 setup()
--- a/tests/cli_test.py
+++ b/tests/cli_test.py
@ -1,7 +1,3 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import unicode_literals
 from identify import cli
--- a/tests/extensions_test.py
+++ b/tests/extensions_test.py
@ -1,7 +1,3 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import unicode_literals
 import pytest
 from identify import extensions
--- a/tests/identify_test.py
+++ b/tests/identify_test.py
@ -1,10 +1,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import unicode_literals
 import io
 import os
 import socket
 import stat
 from tempfile import TemporaryDirectory
 import pytest
@ -14,6 +12,21 @@ from identify import identify
 def test_all_tags_includes_basic_ones():
    assert 'file' in identify.ALL_TAGS
    assert 'directory' in identify.ALL_TAGS
    assert 'executable' in identify.ALL_TAGS
    assert 'text' in identify.ALL_TAGS
    assert 'socket' in identify.ALL_TAGS
@pytest.mark.parametrize(
    'tag_group',
    (
        identify.TYPE_TAGS,
        identify.MODE_TAGS,
        identify.ENCODING_TAGS,
    ),
 )
 def test_all_tags_contains_all_groups(tag_group):
    assert tag_group < identify.ALL_TAGS
 def test_all_tags_contains_each_type():
@ -41,6 +54,17 @@ def test_tags_from_path_symlink(tmpdir):
    assert identify.tags_from_path(x.strpath) == {'symlink'}
 def test_tags_from_path_socket():
    tmproot = '/tmp'  # short path avoids `OSError: AF_UNIX path too long`
    with TemporaryDirectory(dir=tmproot) as tmpdir:
        socket_path = os.path.join(tmpdir, 'socket')
        with socket.socket(socket.AF_UNIX) as sock:
            sock.bind(socket_path)
            tags = identify.tags_from_path(socket_path)
    assert tags == {'socket'}
 def test_tags_from_path_broken_symlink(tmpdir):
    x = tmpdir.join('foo')
    x.mksymlinkto(tmpdir.join('lol'))
@ -177,9 +201,9 @@ def test_tags_from_interpreter(interpreter, expected):
    (
        (b'hello world', True),
        (b'', True),
-        ('éóñəå  ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode('utf8'), True),
+        ('éóñəå  ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode(), True),
-        (r'¯\_(ツ)_/¯'.encode('utf8'), True),
+        (r'¯\_(ツ)_/¯'.encode(), True),
-        ('♪┏(・o･)┛♪┗ ( ･o･) ┓♪┏ ( ) ┛♪┗ (･o･ ) ┓♪'.encode('utf8'), True),
+        ('♪┏(・o･)┛♪┗ ( ･o･) ┓♪┏ ( ) ┛♪┗ (･o･ ) ┓♪'.encode(), True),
        ('éóñå'.encode('latin1'), True),
        (b'hello world\x00', False),
--- a/tox.ini
+++ b/tox.ini
@ -1,5 +1,5 @@
 [tox]
-envlist = py27,py35,py36,pypy,pre-commit
+envlist = py36,pypy3,pre-commit
 [testenv]
 deps = -rrequirements-dev.txt