1
0
Fork 0

Merging upstream version 2.4.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-12 21:54:34 +01:00
parent c351810ea0
commit dc4cf4c101
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
3 changed files with 12 additions and 6 deletions

View file

@@ -1,6 +1,7 @@
EXTENSIONS = { EXTENSIONS = {
'adoc': {'text', 'asciidoc'}, 'adoc': {'text', 'asciidoc'},
'ai': {'binary', 'adobe-illustrator'}, 'ai': {'binary', 'adobe-illustrator'},
'aj': {'text', 'aspectj'},
'asciidoc': {'text', 'asciidoc'}, 'asciidoc': {'text', 'asciidoc'},
'apinotes': {'text', 'apinotes'}, 'apinotes': {'text', 'apinotes'},
'asar': {'binary', 'asar'}, 'asar': {'binary', 'asar'},
@@ -287,6 +288,8 @@ NAMES = {
'Gemfile': EXTENSIONS['rb'], 'Gemfile': EXTENSIONS['rb'],
'Gemfile.lock': {'text'}, 'Gemfile.lock': {'text'},
'GNUmakefile': EXTENSIONS['mk'], 'GNUmakefile': EXTENSIONS['mk'],
'go.mod': {'text', 'go-mod'},
'go.sum': {'text', 'go-sum'},
'Jenkinsfile': EXTENSIONS['jenkins'], 'Jenkinsfile': EXTENSIONS['jenkins'],
'LICENSE': EXTENSIONS['txt'], 'LICENSE': EXTENSIONS['txt'],
'MAINTAINERS': EXTENSIONS['txt'], 'MAINTAINERS': EXTENSIONS['txt'],

View file

@@ -1,4 +1,5 @@
import errno import errno
import math
import os.path import os.path
import re import re
import shlex import shlex
@@ -244,7 +245,7 @@ def license_id(filename: str) -> Optional[str]:
3. check exact text match with existing licenses 3. check exact text match with existing licenses
4. failing that use edit distance 4. failing that use edit distance
""" """
import editdistance_s # `pip install identify[license]` import ukkonen # `pip install identify[license]`
with open(filename, encoding='UTF-8') as f: with open(filename, encoding='UTF-8') as f:
contents = f.read() contents = f.read()
@@ -254,6 +255,8 @@ def license_id(filename: str) -> Optional[str]:
min_edit_dist = sys.maxsize min_edit_dist = sys.maxsize
min_edit_dist_spdx = '' min_edit_dist_spdx = ''
cutoff = math.ceil(.05 * len(norm))
# try exact matches # try exact matches
for spdx, text in licenses.LICENSES: for spdx, text in licenses.LICENSES:
norm_license = _norm_license(text) norm_license = _norm_license(text)
@@ -264,13 +267,13 @@ def license_id(filename: str) -> Optional[str]:
if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05: if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
continue continue
edit_dist = editdistance_s.distance(norm, norm_license) edit_dist = ukkonen.distance(norm, norm_license, cutoff)
if edit_dist < min_edit_dist: if edit_dist < cutoff and edit_dist < min_edit_dist:
min_edit_dist = edit_dist min_edit_dist = edit_dist
min_edit_dist_spdx = spdx min_edit_dist_spdx = spdx
# if there's less than 5% edited from the license, we found our match # if there's less than 5% edited from the license, we found our match
if norm and min_edit_dist / len(norm) < .05: if norm and min_edit_dist < cutoff:
return min_edit_dist_spdx return min_edit_dist_spdx
else: else:
# no matches :'( # no matches :'(

View file

@@ -1,6 +1,6 @@
[metadata] [metadata]
name = identify name = identify
version = 2.3.5 version = 2.4.0
description = File identification library for Python description = File identification library for Python
long_description = file: README.md long_description = file: README.md
long_description_content_type = text/markdown long_description_content_type = text/markdown
@@ -36,7 +36,7 @@ console_scripts =
[options.extras_require] [options.extras_require]
license = license =
editdistance-s ukkonen
[bdist_wheel] [bdist_wheel]
universal = True universal = True