1
0
Fork 0

Merging upstream version 2.1.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-12 21:44:30 +01:00
parent cbed83fde7
commit ae97967170
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
14 changed files with 132 additions and 104 deletions

View file

@@ -1,14 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import argparse
import json
from typing import Optional
from typing import Sequence
from identify import identify
def main(argv=None):
def main(argv: Optional[Sequence[str]] = None) -> int:
parser = argparse.ArgumentParser()
parser.add_argument('--filename-only', action='store_true')
parser.add_argument('path')

View file

@@ -1,13 +1,9 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
EXTENSIONS = {
'adoc': {'text', 'asciidoc'},
'asciidoc': {'text', 'asciidoc'},
'apinotes': {'text', 'apinotes'},
'asar': {'binary', 'asar'},
'avif': {'binary', 'image', 'avif'},
'bash': {'text', 'shell', 'bash'},
'bat': {'text', 'batch'},
'bib': {'text', 'bib'},

View file

@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import io
import os.path
import re
import shlex
import stat
import string
import sys
from typing import IO
from typing import List
from typing import Optional
from typing import Set
from typing import Tuple
from identify import extensions
from identify import interpreters
@@ -19,27 +19,37 @@ printable = frozenset(string.printable)
DIRECTORY = 'directory'
SYMLINK = 'symlink'
SOCKET = 'socket'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'
ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
ALL_TAGS.update(*extensions.EXTENSIONS.values())
ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
ALL_TAGS.update(*extensions.NAMES.values())
ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(ALL_TAGS)
TYPE_TAGS = frozenset((DIRECTORY, FILE, SYMLINK, SOCKET))
MODE_TAGS = frozenset((EXECUTABLE, NON_EXECUTABLE))
ENCODING_TAGS = frozenset((BINARY, TEXT))
_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
_ALL_TAGS.update(*extensions.EXTENSIONS.values())
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
_ALL_TAGS.update(*extensions.NAMES.values())
_ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(_ALL_TAGS)
def tags_from_path(path):
if not os.path.lexists(path):
raise ValueError('{} does not exist.'.format(path))
if os.path.isdir(path):
def tags_from_path(path: str) -> Set[str]:
try:
sr = os.lstat(path)
except (OSError, ValueError): # same error-handling as `os.lexists()`
raise ValueError(f'{path} does not exist.')
mode = sr.st_mode
if stat.S_ISDIR(mode):
return {DIRECTORY}
if os.path.islink(path):
if stat.S_ISLNK(mode):
return {SYMLINK}
if stat.S_ISSOCK(mode):
return {SOCKET}
tags = {FILE}
@@ -62,19 +72,19 @@ def tags_from_path(path):
# some extensions can be both binary and text
# see EXTENSIONS_NEED_BINARY_CHECK
if not {TEXT, BINARY} & tags:
if not ENCODING_TAGS & tags:
if file_is_text(path):
tags.add(TEXT)
else:
tags.add(BINARY)
assert {TEXT, BINARY} & tags, tags
assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
assert ENCODING_TAGS & tags, tags
assert MODE_TAGS & tags, tags
return tags
def tags_from_filename(filename):
_, filename = os.path.split(filename)
def tags_from_filename(path: str) -> Set[str]:
_, filename = os.path.split(path)
_, ext = os.path.splitext(filename)
ret = set()
@@ -95,7 +105,7 @@ def tags_from_filename(filename):
return ret
def tags_from_interpreter(interpreter):
def tags_from_interpreter(interpreter: str) -> Set[str]:
_, _, interpreter = interpreter.rpartition('/')
# Try "python3.5.2" => "python3.5" => "python3" until one matches.
@@ -108,7 +118,7 @@ def tags_from_interpreter(interpreter):
return set()
def is_text(bytesio):
def is_text(bytesio: IO[bytes]) -> bool:
"""Return whether the first KB of contents seems to be binary.
This is roughly based on libmagic's binary/text detection:
@@ -122,14 +132,14 @@ def is_text(bytesio):
return not bool(bytesio.read(1024).translate(None, text_chars))
def file_is_text(path):
def file_is_text(path: str) -> bool:
if not os.path.lexists(path):
raise ValueError('{} does not exist.'.format(path))
raise ValueError(f'{path} does not exist.')
with open(path, 'rb') as f:
return is_text(f)
def _shebang_split(line):
def _shebang_split(line: str) -> List[str]:
try:
# shebangs aren't supposed to be quoted, though some tools such as
# setuptools will write them with quotes so we'll best-guess parse
@@ -141,11 +151,14 @@ def _shebang_split(line):
return line.split()
def _parse_nix_shebang(bytesio, cmd):
def _parse_nix_shebang(
bytesio: IO[bytes],
cmd: Tuple[str, ...],
) -> Tuple[str, ...]:
while bytesio.read(2) == b'#!':
next_line = bytesio.readline()
next_line_b = bytesio.readline()
try:
next_line = next_line.decode('UTF-8')
next_line = next_line_b.decode('UTF-8')
except UnicodeDecodeError:
return cmd
@@ -162,13 +175,13 @@ def _parse_nix_shebang(bytesio, cmd):
return cmd
def parse_shebang(bytesio):
def parse_shebang(bytesio: IO[bytes]) -> Tuple[str, ...]:
"""Parse the shebang from a file opened for reading binary."""
if bytesio.read(2) != b'#!':
return ()
first_line = bytesio.readline()
first_line_b = bytesio.readline()
try:
first_line = first_line.decode('UTF-8')
first_line = first_line_b.decode('UTF-8')
except UnicodeDecodeError:
return ()
@@ -185,10 +198,10 @@ def parse_shebang(bytesio):
return cmd
def parse_shebang_from_file(path):
def parse_shebang_from_file(path: str) -> Tuple[str, ...]:
"""Parse the shebang given a file path."""
if not os.path.lexists(path):
raise ValueError('{} does not exist.'.format(path))
raise ValueError(f'{path} does not exist.')
if not os.access(path, os.X_OK):
return ()
@@ -200,13 +213,13 @@ COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')
def _norm_license(s):
def _norm_license(s: str) -> str:
s = COPYRIGHT_RE.sub('', s)
s = WS_RE.sub(' ', s)
return s.strip()
def license_id(filename):
def license_id(filename: str) -> Optional[str]:
"""Return the spdx id for the license contained in `filename`. If no
license is detected, returns `None`.
@@ -222,7 +235,7 @@ def license_id(filename):
"""
import editdistance # `pip install identify[license]`
with io.open(filename, encoding='UTF-8') as f:
with open(filename, encoding='UTF-8') as f:
contents = f.read()
norm = _norm_license(contents)

View file

@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
INTERPRETERS = {
'ash': {'shell', 'ash'},
'awk': {'awk'},

View file

@@ -1,6 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
LICENSES = (
(
'0BSD',