1
0
Fork 0

Merging upstream version 2.4.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-12 21:54:34 +01:00
parent c351810ea0
commit dc4cf4c101
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
3 changed files with 12 additions and 6 deletions

View file

@@ -1,6 +1,7 @@
EXTENSIONS = {
'adoc': {'text', 'asciidoc'},
'ai': {'binary', 'adobe-illustrator'},
'aj': {'text', 'aspectj'},
'asciidoc': {'text', 'asciidoc'},
'apinotes': {'text', 'apinotes'},
'asar': {'binary', 'asar'},
@@ -287,6 +288,8 @@ NAMES = {
'Gemfile': EXTENSIONS['rb'],
'Gemfile.lock': {'text'},
'GNUmakefile': EXTENSIONS['mk'],
'go.mod': {'text', 'go-mod'},
'go.sum': {'text', 'go-sum'},
'Jenkinsfile': EXTENSIONS['jenkins'],
'LICENSE': EXTENSIONS['txt'],
'MAINTAINERS': EXTENSIONS['txt'],

View file

@@ -1,4 +1,5 @@
import errno
import math
import os.path
import re
import shlex
@@ -244,7 +245,7 @@ def license_id(filename: str) -> Optional[str]:
3. check exact text match with existing licenses
4. failing that use edit distance
"""
import editdistance_s # `pip install identify[license]`
import ukkonen # `pip install identify[license]`
with open(filename, encoding='UTF-8') as f:
contents = f.read()
@@ -254,6 +255,8 @@ def license_id(filename: str) -> Optional[str]:
min_edit_dist = sys.maxsize
min_edit_dist_spdx = ''
cutoff = math.ceil(.05 * len(norm))
# try exact matches
for spdx, text in licenses.LICENSES:
norm_license = _norm_license(text)
@@ -264,13 +267,13 @@ def license_id(filename: str) -> Optional[str]:
if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
continue
edit_dist = editdistance_s.distance(norm, norm_license)
if edit_dist < min_edit_dist:
edit_dist = ukkonen.distance(norm, norm_license, cutoff)
if edit_dist < cutoff and edit_dist < min_edit_dist:
min_edit_dist = edit_dist
min_edit_dist_spdx = spdx
# if there's less than 5% edited from the license, we found our match
if norm and min_edit_dist / len(norm) < .05:
if norm and min_edit_dist < cutoff:
return min_edit_dist_spdx
else:
# no matches :'(

View file

@@ -1,6 +1,6 @@
[metadata]
name = identify
version = 2.3.5
version = 2.4.0
description = File identification library for Python
long_description = file: README.md
long_description_content_type = text/markdown
@@ -36,7 +36,7 @@ console_scripts =
[options.extras_require]
license =
editdistance-s
ukkonen
[bdist_wheel]
universal = True