Merging upstream version 2.4.0.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c351810ea0
commit
dc4cf4c101
3 changed files with 12 additions and 6 deletions
|
@@ -1,6 +1,7 @@
|
|||
EXTENSIONS = {
|
||||
'adoc': {'text', 'asciidoc'},
|
||||
'ai': {'binary', 'adobe-illustrator'},
|
||||
'aj': {'text', 'aspectj'},
|
||||
'asciidoc': {'text', 'asciidoc'},
|
||||
'apinotes': {'text', 'apinotes'},
|
||||
'asar': {'binary', 'asar'},
|
||||
|
@@ -287,6 +288,8 @@ NAMES = {
|
|||
'Gemfile': EXTENSIONS['rb'],
|
||||
'Gemfile.lock': {'text'},
|
||||
'GNUmakefile': EXTENSIONS['mk'],
|
||||
'go.mod': {'text', 'go-mod'},
|
||||
'go.sum': {'text', 'go-sum'},
|
||||
'Jenkinsfile': EXTENSIONS['jenkins'],
|
||||
'LICENSE': EXTENSIONS['txt'],
|
||||
'MAINTAINERS': EXTENSIONS['txt'],
|
||||
|
|
|
@@ -1,4 +1,5 @@
|
|||
import errno
|
||||
import math
|
||||
import os.path
|
||||
import re
|
||||
import shlex
|
||||
|
@@ -244,7 +245,7 @@ def license_id(filename: str) -> Optional[str]:
|
|||
3. check exact text match with existing licenses
|
||||
4. failing that use edit distance
|
||||
"""
|
||||
import editdistance_s # `pip install identify[license]`
|
||||
import ukkonen # `pip install identify[license]`
|
||||
|
||||
with open(filename, encoding='UTF-8') as f:
|
||||
contents = f.read()
|
||||
|
@@ -254,6 +255,8 @@ def license_id(filename: str) -> Optional[str]:
|
|||
min_edit_dist = sys.maxsize
|
||||
min_edit_dist_spdx = ''
|
||||
|
||||
cutoff = math.ceil(.05 * len(norm))
|
||||
|
||||
# try exact matches
|
||||
for spdx, text in licenses.LICENSES:
|
||||
norm_license = _norm_license(text)
|
||||
|
@@ -264,13 +267,13 @@ def license_id(filename: str) -> Optional[str]:
|
|||
if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
|
||||
continue
|
||||
|
||||
edit_dist = editdistance_s.distance(norm, norm_license)
|
||||
if edit_dist < min_edit_dist:
|
||||
edit_dist = ukkonen.distance(norm, norm_license, cutoff)
|
||||
if edit_dist < cutoff and edit_dist < min_edit_dist:
|
||||
min_edit_dist = edit_dist
|
||||
min_edit_dist_spdx = spdx
|
||||
|
||||
# if there's less than 5% edited from the license, we found our match
|
||||
if norm and min_edit_dist / len(norm) < .05:
|
||||
if norm and min_edit_dist < cutoff:
|
||||
return min_edit_dist_spdx
|
||||
else:
|
||||
# no matches :'(
|
||||
|
|
|
@@ -1,6 +1,6 @@
|
|||
[metadata]
|
||||
name = identify
|
||||
version = 2.3.5
|
||||
version = 2.4.0
|
||||
description = File identification library for Python
|
||||
long_description = file: README.md
|
||||
long_description_content_type = text/markdown
|
||||
|
@@ -36,7 +36,7 @@ console_scripts =
|
|||
|
||||
[options.extras_require]
|
||||
license =
|
||||
editdistance-s
|
||||
ukkonen
|
||||
|
||||
[bdist_wheel]
|
||||
universal = True
|
||||
|
|
Loading…
Add table
Reference in a new issue