Merging upstream version 10.0.8.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent 407314e8d2
commit efc1e37108
67 changed files with 2461 additions and 840 deletions
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+import typing as t
 from collections import defaultdict
 from dataclasses import dataclass
 from heapq import heappop, heappush
@@ -6,6 +9,10 @@ from sqlglot import Dialect
 from sqlglot import expressions as exp
 from sqlglot.helper import ensure_collection
 
+if t.TYPE_CHECKING:
+    T = t.TypeVar("T")
+    Edit = t.Union[Insert, Remove, Move, Update, Keep]
+
 
 @dataclass(frozen=True)
 class Insert:
@@ -44,7 +51,7 @@ class Keep:
     target: exp.Expression
 
 
-def diff(source, target):
+def diff(source: exp.Expression, target: exp.Expression) -> t.List[Edit]:
     """
     Returns the list of changes between the source and the target expressions.
 
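
For reference, the module-level diff() annotated in this hunk is the public entry point of sqlglot's semantic diff. A minimal usage sketch, not part of the commit, assuming sqlglot is installed and parse_one is used to build the two ASTs:

    # Hypothetical example: compare two parsed queries and print the edit script.
    from sqlglot import parse_one
    from sqlglot.diff import diff

    source = parse_one("SELECT a, b FROM x")
    target = parse_one("SELECT a, c FROM x")

    # Each edit is an Insert, Remove, Move, Update or Keep instance,
    # matching the Edit union introduced in this commit.
    for edit in diff(source, target):
        print(type(edit).__name__, edit)
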
@@ -89,25 +96,25 @@ class ChangeDistiller:
     Chawathe et al. described in http://ilpubs.stanford.edu:8090/115/1/1995-46.pdf.
     """
 
-    def __init__(self, f=0.6, t=0.6):
+    def __init__(self, f: float = 0.6, t: float = 0.6) -> None:
         self.f = f
         self.t = t
         self._sql_generator = Dialect().generator()
 
-    def diff(self, source, target):
+    def diff(self, source: exp.Expression, target: exp.Expression) -> t.List[Edit]:
         self._source = source
         self._target = target
         self._source_index = {id(n[0]): n[0] for n in source.bfs()}
         self._target_index = {id(n[0]): n[0] for n in target.bfs()}
         self._unmatched_source_nodes = set(self._source_index)
         self._unmatched_target_nodes = set(self._target_index)
-        self._bigram_histo_cache = {}
+        self._bigram_histo_cache: t.Dict[int, t.DefaultDict[str, int]] = {}
 
         matching_set = self._compute_matching_set()
         return self._generate_edit_script(matching_set)
 
-    def _generate_edit_script(self, matching_set):
-        edit_script = []
+    def _generate_edit_script(self, matching_set: t.Set[t.Tuple[int, int]]) -> t.List[Edit]:
+        edit_script: t.List[Edit] = []
         for removed_node_id in self._unmatched_source_nodes:
             edit_script.append(Remove(self._source_index[removed_node_id]))
         for inserted_node_id in self._unmatched_target_nodes:
@@ -125,7 +132,9 @@ class ChangeDistiller:
 
         return edit_script
 
-    def _generate_move_edits(self, source, target, matching_set):
+    def _generate_move_edits(
+        self, source: exp.Expression, target: exp.Expression, matching_set: t.Set[t.Tuple[int, int]]
+    ) -> t.List[Move]:
         source_args = [id(e) for e in _expression_only_args(source)]
         target_args = [id(e) for e in _expression_only_args(target)]
 
@@ -138,7 +147,7 @@ class ChangeDistiller:
 
         return move_edits
 
-    def _compute_matching_set(self):
+    def _compute_matching_set(self) -> t.Set[t.Tuple[int, int]]:
         leaves_matching_set = self._compute_leaf_matching_set()
         matching_set = leaves_matching_set.copy()
 
@@ -183,8 +192,8 @@ class ChangeDistiller:
 
         return matching_set
 
-    def _compute_leaf_matching_set(self):
-        candidate_matchings = []
+    def _compute_leaf_matching_set(self) -> t.Set[t.Tuple[int, int]]:
+        candidate_matchings: t.List[t.Tuple[float, int, exp.Expression, exp.Expression]] = []
         source_leaves = list(_get_leaves(self._source))
         target_leaves = list(_get_leaves(self._target))
         for source_leaf in source_leaves:
@@ -216,7 +225,7 @@ class ChangeDistiller:
 
         return matching_set
 
-    def _dice_coefficient(self, source, target):
+    def _dice_coefficient(self, source: exp.Expression, target: exp.Expression) -> float:
         source_histo = self._bigram_histo(source)
         target_histo = self._bigram_histo(target)
 
@@ -231,13 +240,13 @@ class ChangeDistiller:
 
         return 2 * overlap_len / total_grams
 
-    def _bigram_histo(self, expression):
+    def _bigram_histo(self, expression: exp.Expression) -> t.DefaultDict[str, int]:
         if id(expression) in self._bigram_histo_cache:
             return self._bigram_histo_cache[id(expression)]
 
         expression_str = self._sql_generator.generate(expression)
         count = max(0, len(expression_str) - 1)
-        bigram_histo = defaultdict(int)
+        bigram_histo: t.DefaultDict[str, int] = defaultdict(int)
         for i in range(count):
             bigram_histo[expression_str[i : i + 2]] += 1
 
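
The _dice_coefficient and _bigram_histo methods annotated in this hunk compute bigram Dice similarity: each expression's generated SQL is turned into a histogram of adjacent character pairs, and similarity is 2 * overlap / total bigrams. A simplified standalone sketch of that idea over plain strings (illustrative only, not the committed method, which also caches histograms by expression id):

    from collections import defaultdict
    from typing import DefaultDict

    def bigram_histo(text: str) -> DefaultDict[str, int]:
        # Count adjacent character pairs, as _bigram_histo does for generated SQL.
        histo: DefaultDict[str, int] = defaultdict(int)
        for i in range(max(0, len(text) - 1)):
            histo[text[i : i + 2]] += 1
        return histo

    def dice(a: str, b: str) -> float:
        # 2 * shared bigrams / total bigrams, the same quantity _dice_coefficient returns.
        histo_a, histo_b = bigram_histo(a), bigram_histo(b)
        total_grams = sum(histo_a.values()) + sum(histo_b.values())
        if not total_grams:
            return 1.0  # simplifying choice: treat two empty strings as identical
        overlap_len = sum(min(count, histo_b[bigram]) for bigram, count in histo_a.items())
        return 2 * overlap_len / total_grams

    print(dice("SELECT a FROM x", "SELECT b FROM x"))  # 6/7 ≈ 0.857 for these strings
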
@@ -245,7 +254,7 @@ class ChangeDistiller:
         return bigram_histo
 
 
-def _get_leaves(expression):
+def _get_leaves(expression: exp.Expression) -> t.Generator[exp.Expression, None, None]:
    has_child_exprs = False
 
    for a in expression.args.values():
@@ -258,7 +267,7 @@ def _get_leaves(expression):
         yield expression
 
 
-def _is_same_type(source, target):
+def _is_same_type(source: exp.Expression, target: exp.Expression) -> bool:
     if type(source) is type(target):
         if isinstance(source, exp.Join):
             return source.args.get("side") == target.args.get("side")
@@ -271,15 +280,17 @@ def _is_same_type(source, target):
     return False
 
 
-def _expression_only_args(expression):
-    args = []
+def _expression_only_args(expression: exp.Expression) -> t.List[exp.Expression]:
+    args: t.List[t.Union[exp.Expression, t.List]] = []
     if expression:
         for a in expression.args.values():
             args.extend(ensure_collection(a))
     return [a for a in args if isinstance(a, exp.Expression)]
 
 
-def _lcs(seq_a, seq_b, equal):
+def _lcs(
+    seq_a: t.Sequence[T], seq_b: t.Sequence[T], equal: t.Callable[[T, T], bool]
+) -> t.Sequence[t.Optional[T]]:
     """Calculates the longest common subsequence"""
 
     len_a = len(seq_a)
@@ -289,14 +300,14 @@ def _lcs(seq_a, seq_b, equal):
     for i in range(len_a + 1):
         for j in range(len_b + 1):
             if i == 0 or j == 0:
-                lcs_result[i][j] = []
+                lcs_result[i][j] = []  # type: ignore
             elif equal(seq_a[i - 1], seq_b[j - 1]):
-                lcs_result[i][j] = lcs_result[i - 1][j - 1] + [seq_a[i - 1]]
+                lcs_result[i][j] = lcs_result[i - 1][j - 1] + [seq_a[i - 1]]  # type: ignore
             else:
                 lcs_result[i][j] = (
                     lcs_result[i - 1][j]
-                    if len(lcs_result[i - 1][j]) > len(lcs_result[i][j - 1])
+                    if len(lcs_result[i - 1][j]) > len(lcs_result[i][j - 1])  # type: ignore
                     else lcs_result[i][j - 1]
                 )
 
-    return lcs_result[len_a][len_b]
+    return lcs_result[len_a][len_b]  # type: ignore
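
The _lcs helper annotated in this final hunk computes a longest common subsequence under a caller-supplied equality predicate. A hedged usage sketch over plain strings (this is a private helper, so the import path is shown purely for illustration):

    from sqlglot.diff import _lcs

    # For these textbook inputs the longest common subsequence has length 4.
    subsequence = _lcs("ABCBDAB", "BDCABA", lambda a, b: a == b)
    print(len(subsequence), subsequence)
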