From 14a882e9d32a2777f8560bb59c4e326f28b9a614 Mon Sep 17 00:00:00 2001 From: raj_mathe Date: Thu, 9 Jun 2022 01:51:08 +0200 Subject: [PATCH] master > master: code py - hirschberg --- code/python/src/main.py | 27 +- code/python/src/string_alignment/__init__.py | 0 .../python/src/string_alignment/hirschberg.py | 411 ++++++++++++++++++ 3 files changed, 428 insertions(+), 10 deletions(-) create mode 100644 code/python/src/string_alignment/__init__.py create mode 100644 code/python/src/string_alignment/hirschberg.py diff --git a/code/python/src/main.py b/code/python/src/main.py index b541ccd..10e90b6 100644 --- a/code/python/src/main.py +++ b/code/python/src/main.py @@ -16,6 +16,7 @@ from src.local.maths import *; from src.graphs.graph import *; from src.graphs.tarjan import *; from src.travel.naive import *; +from src.string_alignment.hirschberg import *; # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # GLOBAL CONSTANTS/VARIABLES @@ -28,16 +29,22 @@ from src.travel.naive import *; # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def enter(): - ## Beispiel aus Seminarblatt 8 - tsp_naive_algorithm( - dist = np.asarray([ - [0, 7, 2, 5], - [7, 0, 5, 6], - [2, 5, 0, 5], - [2, 7, 4, 0], - ], dtype=float), - optimise=max, - verbose=True, + # ## Beispiel für Seminarwoche 9 (Blatt 8): + # tsp_naive_algorithm( + # dist = np.asarray([ + # [0, 7, 4, 3], + # [7, 0, 5, 6], + # [2, 5, 0, 5], + # [2, 7, 4, 0], + # ], dtype=float), + # optimise=min, + # verbose=True, + # ); + ## Beispiel für Seminarwoche 10 (Blatt 9): + hirschberg_algorithm_full( + X = 'ACGAAG', + Y = 'AGAT', + verbose = True, ); return; diff --git a/code/python/src/string_alignment/__init__.py b/code/python/src/string_alignment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/code/python/src/string_alignment/hirschberg.py b/code/python/src/string_alignment/hirschberg.py new file mode 100644 index 0000000..71362b4 --- /dev/null +++ b/code/python/src/string_alignment/hirschberg.py @@ -0,0 +1,411 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# IMPORTS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +from __future__ import annotations; +from src.local.typing import *; +from src.local.maths import *; + + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# EXPORTS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +__all__ = [ + 'hirschberg_algorithm', + 'hirschberg_algorithm_full', +]; + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# CONSTANTS / SETUP +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +class Directions(Enum): + UNSET = -1; + DIAGONAL = 1; + HORIZONTAL = 0; + VERTICAL = 2; + +def gap_penalty(x: str): + return 1; + +def missmatch_penalty(x: str, y: str): + return 0 if x == y else 1; + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# METHOD hirschberg_algorithm +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +def hirschberg_algorithm( + X: str, + Y: str, + verbose: bool = False, +) -> Tuple[str, str]: + Costs, Moves = hirschberg_match_matrix(X = '-' + X, Y = '-' + Y); + path = reconstruct_optimal_path(Moves=Moves); + word_x, word_y = reconstruct_words(X = '-' + X, Y = '-' + Y, Moves=Moves, path=path); + if verbose: + L = len(word_x); + costs_repr, moves_repr = display_cost_matrix(Costs=Costs, path=path, X = '-' + X, Y = '-' + Y); + print(''); + print('\x1b[1mAlignment:\x1b[0m'); + print(f' {word_y}'); + print(f' {L*"-"}'); + print(f' {word_x}'); + print(''); + print(costs_repr); + print(''); + print(moves_repr); + return word_x, word_y; + +def hirschberg_algorithm_full( + X: str, + Y: str, + depth: int = 0, + verbose: bool = False, +) -> Tuple[str, str]: + n = len(Y); + if n > 1: + n = int(np.ceil(n/2)); + + # bilde linke Hälfte vom horizontalen Wort: + Y1 = Y[:n]; + X1 = X; + + # bilde rechte Hälfte vom horizontalen Wort (und kehre h. + v. um): + Y2 = Y[n:][::-1]; + X2 = X[::-1]; + + # Löse Teilprobleme: + Costs1, Moves1 = hirschberg_match_matrix(X = '-' + X1, Y = '-' + Y1); + Costs2, Moves2 = hirschberg_match_matrix(X = '-' + X2, Y = '-' + Y2); + path1, path2 = reconstruct_optimal_path_halves( + Costs1=Costs1, + Costs2=Costs2, + Moves1=Moves1, + Moves2=Moves2, + ); + word_x_1, word_y_1 = reconstruct_words(X = '-' + X1, Y = '-' + Y1, Moves=Moves1, path=path1); + word_x_2, word_y_2 = reconstruct_words(X = '-' + X2, Y = '-' + Y2, Moves=Moves2, path=path2); + + if verbose: + L = len(word_x_1) + len(word_x_2); + costs_repr, moves_repr = display_cost_matrix_halves( + Costs1 = Costs1, + Costs2 = Costs2, + path1 = path1, + path2 = path2, + X1 = '-' + X1, + X2 = '-' + X2, + Y1 = '-' + Y1, + Y2 = '-' + Y2, + ); + print(''); + print(f'\x1b[1mRekursionstiefe: {depth}\x1b[0m') + print(''); + print('\x1b[1mAlignment:\x1b[0m'); + print(f' {word_y_1} {word_y_2[::-1]}'); + print(f' {(L+1)*"-"}'); + print(f' {word_x_1} {word_x_2[::-1]}'); + print(''); + print(moves_repr); + + coord = path1[-1]; + m = coord[0]; + word_x_1, word_y_1 = hirschberg_algorithm_full(X=X[:m], Y=Y[:n], depth=depth+1, verbose=True); + word_x_2, word_y_2 = hirschberg_algorithm_full(X=X[m:], Y=Y[n:], depth=depth+1, verbose=True); + word_x = word_x_1 + word_x_2; + word_y = word_y_1 + word_y_2; + else: + word_x, word_y = hirschberg_algorithm(X=X, Y=Y, verbose=False); + if depth == 0: + L = len(word_x); + print(''); + print('\x1b[1mAlignment:\x1b[0m'); + print(f' {word_y}'); + print(f' {L*"-"}'); + print(f' {word_x}'); + print(''); + return word_x, word_y; + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# METHODS cost matrix + optimal paths +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +def hirschberg_match_matrix( + X: str, + Y: str, +) -> Tuple[NDArray[(Any, Any), int], NDArray[(Any, Any), Directions]]: + ''' + Berechnet Hirschberg-Costs-Matrix (ohne Rekursion). + + Annahmen: + - X[0] = gap + - Y[0] = gap + ''' + m = len(X); # display vertically + n = len(Y); # display horizontally + Costs = np.full(shape=(m, n), dtype=int, fill_value=0); + Moves = np.full(shape=(m, n), dtype=Directions, fill_value=Directions.UNSET); + + # zuerst 0. Spalte und 0. Zeile ausfüllen: + for i, x in list(enumerate(X))[1:]: + update_cost_matrix(Costs, Moves, x, '', i, 0); + + for j, y in list(enumerate(Y))[1:]: + update_cost_matrix(Costs, Moves, '', y, 0, j); + + # jetzt alle »inneren« Werte bestimmen: + for i, x in list(enumerate(X))[1:]: + for j, y in list(enumerate(Y))[1:]: + update_cost_matrix(Costs, Moves, x, y, i, j); + return Costs, Moves; + +def update_cost_matrix( + Costs: NDArray[(Any, Any), int], + Moves: NDArray[(Any, Any), Directions], + x: str, + y: str, + i: int, + j: int, +): + ''' + Schrittweise Funktion zur Aktualisierung vom Eintrag `(i,j)` in der Kostenmatrix. + + Annahme: + - alle »Vorgänger« von `(i,j)` in der Matrix sind bereits optimiert. + + @inputs + - `Costs` - bisher berechnete Kostenmatrix + - `Moves` - bisher berechnete optimale Schritte + - `i`, `x` - Position und Wert in String `X` (»vertical« dargestellt) + - `j`, `y` - Position und Wert in String `Y` (»horizontal« dargestellt) + ''' + + # nichts zu tun, wenn (i, j) == (0, 0): + if i == 0 and j == 0: + Costs[0, 0] = 0; + return; + + ################################ + # NOTE: Berechnung von möglichen Moves wie folgt. + # + # Fall 1: (i-1,j-1) ---> (i,j) + # ==> Stringvergleich ändert sich wie folgt: + # s1 s1 x + # ---- ---> ------ + # s2 s2 y + # + # Fall 2: (i,j-1) ---> (i,j) + # ==> Stringvergleich ändert sich wie folgt: + # s1 s1 GAP + # ---- ---> ------- + # s2 s2 y + # + # Fall 3: (i-1,j) ---> (i,j) + # ==> Stringvergleich ändert sich wie folgt: + # s1 s1 x + # ---- ---> ------- + # s2 s2 GAP + # + # Diese Fälle berücksichtigen wir: + ################################ + edges = []; + if i > 0 and j > 0: + edges.append(( + Directions.DIAGONAL, + Costs[i-1, j-1] + missmatch_penalty(x, y), + )); + if j > 0: + edges.append(( + Directions.HORIZONTAL, + Costs[i, j-1] + gap_penalty(y), + )); + if i > 0: + edges.append(( + Directions.VERTICAL, + Costs[i-1, j] + gap_penalty(x), + )); + + if len(edges) > 0: + # Sortiere nach Priorität (festgelegt in Enum): + edges = sorted(edges, key=lambda x: x[0].value); + # Wähle erste Möglichkeit mit minimalen Kosten: + index = np.argmin([ cost for _, cost in edges]); + Moves[i, j], Costs[i, j] = edges[index]; + return; + +def reconstruct_words( + X: str, + Y: str, + Moves: NDArray[(Any, Any), Directions], + path: List[Tuple[int, int]], +) -> Tuple[str, str]: + word_x = ''; + word_y = ''; + for (i, j) in path: + x = X[i]; + y = Y[j]; + match Moves[i, j]: + case Directions.DIAGONAL: + word_x += x; + word_y += y; + case Directions.HORIZONTAL: + word_x += '-'; + word_y += y; + case Directions.VERTICAL: + word_x += x; + word_y += '-'; + return word_x, word_y; + +def reconstruct_optimal_path( + Moves: NDArray[(Any, Any), Directions], + coord: Optional[Tuple[int, int]] = None, +) -> List[Tuple[int, int]]: + ''' + Liest Matrix mit optimalen Schritten den optimalen Pfad aus, + angenfangen von Endkoordinaten. + ''' + if coord is None: + m, n = Moves.shape; + (i, j) = (m-1, n-1); + else: + (i, j) = coord; + path = [(i, j)]; + while (i, j) != (0, 0): + match Moves[i, j]: + case Directions.DIAGONAL: + (i, j) = (i - 1, j - 1); + case Directions.HORIZONTAL: + (i, j) = (i, j - 1); + case Directions.VERTICAL: + (i, j) = (i - 1, j); + case _: + break; + path.append((i, j)); + return path[::-1]; + +def reconstruct_optimal_path_halves( + Costs1: NDArray[(Any, Any), int], + Costs2: NDArray[(Any, Any), int], + Moves1: NDArray[(Any, Any), Directions], + Moves2: NDArray[(Any, Any), Directions], +) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]: + (m, n1) = Costs1.shape; + (m, n2) = Costs2.shape; + info = [ + ( + Costs1[i, n1-1] + Costs2[m-1-i, n2-1], + (i, n1-1), + (m-1-i, n2-1), + ) + for i in range(m) + ]; + index = np.argmin([ cost for cost, _, _ in info ]); + path1 = reconstruct_optimal_path(Moves1, coord=info[index][1]); + path2 = reconstruct_optimal_path(Moves2, coord=info[index][2]); + return path1, path2; + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# AUXILIARY METHODS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +def represent_cost_matrix( + Costs: NDArray[(Any, Any), int], + path: List[Tuple[int, int]], + X: str, + Y: str, + pad: bool = False, +) -> Tuple[NDArray[(Any, Any), Any], NDArray[(Any, Any), Any]]: + m = len(X); # display vertically + n = len(Y); # display horizontally + + # erstelle string-Array: + if pad: + table = np.full(shape=(3 + m + 3, 3 + n + 1), dtype=object, fill_value=''); + else: + table = np.full(shape=(3 + m, 3 + n), dtype=object, fill_value=''); + + # topmost rows: + table[0, 3:(3+n)] = [str(j) for j in range(n)]; + table[1, 3:(3+n)] = [y for y in Y]; + table[2, 3:(3+n)] = '--'; + # leftmost columns: + table[3:(3+m), 0] = [str(i) for i in range(m)]; + table[3:(3+m), 1] = [x for x in X]; + table[3:(3+m), 2] = '|'; + + if pad: + table[-3, 3:(3+n)] = '--'; + table[3:(3+m), -1] = '|'; + + table_costs = table.copy(); + table_moves = table.copy(); + table_costs[3:(3+m), 3:(3+n)] = Costs; + table_moves[3:(3+m), 3:(3+n)] = '.'; + for (i, j) in path: + # table_costs[3 + i, 3 + j] = f'\x1b[92;1m{table_costs[3 + i, 3 + j]}\x1b[0m'; + table_moves[3 + i, 3 + j] = '@'; + + return table_costs, table_moves; + +def display_cost_matrix( + Costs: NDArray[(Any, Any), int], + path: List[Tuple[int, int]], + X: str, + Y: str, +) -> Tuple[str, str]: + ''' + Zeigt Kostenmatrix + optimalen Pfad. + + @inputs + - `Costs` - Kostenmatrix + - `Moves` - Kodiert die optimalen Schritte + - `X`, `Y` - Strings + + @returns + - eine 'printable' Darstellung der Matrix mit den Strings X, Y + Indexes. + ''' + table_costs, table_moves = represent_cost_matrix(Costs=Costs, path=path, X=X, Y=Y); + # benutze pandas-Dataframe, um schöner darzustellen: + costs_repr = pd.DataFrame(table_costs).to_string(index=False, header=False); + moves_repr = pd.DataFrame(table_moves).to_string(index=False, header=False); + return costs_repr, moves_repr; + +def display_cost_matrix_halves( + Costs1: NDArray[(Any, Any), int], + Costs2: NDArray[(Any, Any), int], + path1: List[Tuple[int, int]], + path2: List[Tuple[int, int]], + X1: str, + X2: str, + Y1: str, + Y2: str, +) -> Tuple[str, str]: + ''' + Zeigt Kostenmatrix + optimalen Pfad für Schritt im D & C Hirschberg-Algorithmus + + @inputs + - `Costs1`, `Costs2` - Kostenmatrizen + - `Moves1`, `Moves2` - Kodiert die optimalen Schritte + - `X1`, `X2`, `Y1`, `Y2` - Strings + + @returns + - eine 'printable' Darstellung der Matrix mit den Strings X, Y + Indexes. + ''' + table_costs1, table_moves1 = represent_cost_matrix(Costs=Costs1, path=path1, X=X1, Y=Y1, pad=True); + table_costs2, table_moves2 = represent_cost_matrix(Costs=Costs2, path=path2, X=X2, Y=Y2, pad=True); + + # merge Taellen: + table_costs = np.concatenate([table_costs1, table_costs2[::-1, ::-1]], axis=1); + table_moves = np.concatenate([table_moves1, table_moves2[::-1, ::-1]], axis=1); + + # benutze pandas-Dataframe, um schöner darzustellen: + costs_repr = pd.DataFrame(table_costs).to_string(index=False, header=False); + moves_repr = pd.DataFrame(table_moves).to_string(index=False, header=False); + + return costs_repr, moves_repr;