master > master: code py - hirschberg

2022-06-09 01:51:08 +02:00 · 2022-06-09 01:51:08 +02:00 · 14a882e9d3
commit 14a882e9d3
parent 53b2066e0d
3 changed files with 428 additions and 10 deletions
--- a/code/python/src/main.py
+++ b/code/python/src/main.py
@ -16,6 +16,7 @@ from src.local.maths import *;
 from src.graphs.graph import *;
 from src.graphs.tarjan import *;
 from src.travel.naive import *;
+from src.string_alignment.hirschberg import *;

 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # GLOBAL CONSTANTS/VARIABLES
@ -28,16 +29,22 @@ from src.travel.naive import *;
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 def enter():
-    ## Beispiel aus Seminarblatt 8
-    tsp_naive_algorithm(
-        dist = np.asarray([
-            [0, 7, 2, 5],
-            [7, 0, 5, 6],
-            [2, 5, 0, 5],
-            [2, 7, 4, 0],
-        ], dtype=float),
-        optimise=max,
-        verbose=True,
+    # ## Beispiel für Seminarwoche 9 (Blatt 8):
+    # tsp_naive_algorithm(
+    #     dist = np.asarray([
+    #         [0, 7, 4, 3],
+    #         [7, 0, 5, 6],
+    #         [2, 5, 0, 5],
+    #         [2, 7, 4, 0],
+    #     ], dtype=float),
+    #     optimise=min,
+    #     verbose=True,
+    # );
+    ## Beispiel für Seminarwoche 10 (Blatt 9):
+    hirschberg_algorithm_full(
+        X = 'ACGAAG',
+        Y = 'AGAT',
+        verbose = True,
    );
    return;

--- a/code/python/src/string_alignment/init.py
+++ b/code/python/src/string_alignment/init.py
--- a/code/python/src/string_alignment/hirschberg.py
+++ b/code/python/src/string_alignment/hirschberg.py
@ -0,0 +1,411 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# IMPORTS
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+from __future__ import annotations;
+from src.local.typing import *;
+from src.local.maths import *;
+
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# EXPORTS
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+__all__ = [
+    'hirschberg_algorithm',
+    'hirschberg_algorithm_full',
+];
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# CONSTANTS / SETUP
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+class Directions(Enum):
+    UNSET = -1;
+    DIAGONAL = 1;
+    HORIZONTAL = 0;
+    VERTICAL = 2;
+
+def gap_penalty(x: str):
+    return 1;
+
+def missmatch_penalty(x: str, y: str):
+    return 0 if x == y else 1;
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# METHOD hirschberg_algorithm
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+def hirschberg_algorithm(
+    X: str,
+    Y: str,
+    verbose: bool = False,
+) -> Tuple[str, str]:
+    Costs, Moves = hirschberg_match_matrix(X = '-' + X, Y = '-' + Y);
+    path = reconstruct_optimal_path(Moves=Moves);
+    word_x, word_y = reconstruct_words(X = '-' + X, Y = '-' + Y, Moves=Moves, path=path);
+    if verbose:
+        L = len(word_x);
+        costs_repr, moves_repr = display_cost_matrix(Costs=Costs, path=path, X = '-' + X, Y = '-' + Y);
+        print('');
+        print('\x1b[1mAlignment:\x1b[0m');
+        print(f'  {word_y}');
+        print(f'  {L*"-"}');
+        print(f'  {word_x}');
+        print('');
+        print(costs_repr);
+        print('');
+        print(moves_repr);
+    return word_x, word_y;
+
+def hirschberg_algorithm_full(
+    X: str,
+    Y: str,
+    depth: int = 0,
+    verbose: bool = False,
+) -> Tuple[str, str]:
+    n = len(Y);
+    if n > 1:
+        n = int(np.ceil(n/2));
+
+        # bilde linke Hälfte vom horizontalen Wort:
+        Y1 = Y[:n];
+        X1 = X;
+
+        # bilde rechte Hälfte vom horizontalen Wort (und kehre h. + v. um):
+        Y2 = Y[n:][::-1];
+        X2 = X[::-1];
+
+        # Löse Teilprobleme:
+        Costs1, Moves1 = hirschberg_match_matrix(X = '-' + X1, Y = '-' + Y1);
+        Costs2, Moves2 = hirschberg_match_matrix(X = '-' + X2, Y = '-' + Y2);
+        path1, path2 = reconstruct_optimal_path_halves(
+            Costs1=Costs1,
+            Costs2=Costs2,
+            Moves1=Moves1,
+            Moves2=Moves2,
+        );
+        word_x_1, word_y_1 = reconstruct_words(X = '-' + X1, Y = '-' + Y1, Moves=Moves1, path=path1);
+        word_x_2, word_y_2 = reconstruct_words(X = '-' + X2, Y = '-' + Y2, Moves=Moves2, path=path2);
+
+        if verbose:
+            L = len(word_x_1) + len(word_x_2);
+            costs_repr, moves_repr = display_cost_matrix_halves(
+                Costs1 = Costs1,
+                Costs2 = Costs2,
+                path1  = path1,
+                path2  = path2,
+                X1     =  '-' + X1,
+                X2     =  '-' + X2,
+                Y1     =  '-' + Y1,
+                Y2     =  '-' + Y2,
+            );
+            print('');
+            print(f'\x1b[1mRekursionstiefe: {depth}\x1b[0m')
+            print('');
+            print('\x1b[1mAlignment:\x1b[0m');
+            print(f'  {word_y_1} {word_y_2[::-1]}');
+            print(f'  {(L+1)*"-"}');
+            print(f'  {word_x_1} {word_x_2[::-1]}');
+            print('');
+            print(moves_repr);
+
+        coord = path1[-1];
+        m = coord[0];
+        word_x_1, word_y_1 = hirschberg_algorithm_full(X=X[:m], Y=Y[:n], depth=depth+1, verbose=True);
+        word_x_2, word_y_2 = hirschberg_algorithm_full(X=X[m:], Y=Y[n:], depth=depth+1, verbose=True);
+        word_x = word_x_1 + word_x_2;
+        word_y = word_y_1 + word_y_2;
+    else:
+        word_x, word_y = hirschberg_algorithm(X=X, Y=Y, verbose=False);
+    if depth == 0:
+        L = len(word_x);
+        print('');
+        print('\x1b[1mAlignment:\x1b[0m');
+        print(f'  {word_y}');
+        print(f'  {L*"-"}');
+        print(f'  {word_x}');
+        print('');
+    return word_x, word_y;
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# METHODS cost matrix + optimal paths
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+def hirschberg_match_matrix(
+    X: str,
+    Y: str,
+) -> Tuple[NDArray[(Any, Any), int], NDArray[(Any, Any), Directions]]:
+    '''
+    Berechnet Hirschberg-Costs-Matrix (ohne Rekursion).
+
+    Annahmen:
+    - X[0] = gap
+    - Y[0] = gap
+    '''
+    m = len(X); # display vertically
+    n = len(Y); # display horizontally
+    Costs = np.full(shape=(m, n), dtype=int, fill_value=0);
+    Moves = np.full(shape=(m, n), dtype=Directions, fill_value=Directions.UNSET);
+
+    # zuerst 0. Spalte und 0. Zeile ausfüllen:
+    for i, x in list(enumerate(X))[1:]:
+        update_cost_matrix(Costs, Moves, x, '', i, 0);
+
+    for j, y in list(enumerate(Y))[1:]:
+        update_cost_matrix(Costs, Moves, '', y, 0, j);
+
+    # jetzt alle »inneren« Werte bestimmen:
+    for i, x in list(enumerate(X))[1:]:
+        for j, y in list(enumerate(Y))[1:]:
+            update_cost_matrix(Costs, Moves, x, y, i, j);
+    return Costs, Moves;
+
+def update_cost_matrix(
+    Costs: NDArray[(Any, Any), int],
+    Moves: NDArray[(Any, Any), Directions],
+    x: str,
+    y: str,
+    i: int,
+    j: int,
+):
+    '''
+    Schrittweise Funktion zur Aktualisierung vom Eintrag `(i,j)` in der Kostenmatrix.
+
+    Annahme:
+    - alle »Vorgänger« von `(i,j)` in der Matrix sind bereits optimiert.
+
+    @inputs
+    - `Costs` - bisher berechnete Kostenmatrix
+    - `Moves` - bisher berechnete optimale Schritte
+    - `i`, `x` - Position und Wert in String `X` (»vertical« dargestellt)
+    - `j`, `y` - Position und Wert in String `Y` (»horizontal« dargestellt)
+    '''
+
+    # nichts zu tun, wenn (i, j) == (0, 0):
+    if i == 0 and j == 0:
+        Costs[0, 0] = 0;
+        return;
+
+    ################################
+    # NOTE: Berechnung von möglichen Moves wie folgt.
+    #
+    # Fall 1: (i-1,j-1) ---> (i,j)
+    # ==> Stringvergleich ändert sich wie folgt:
+    #     s1          s1 x
+    #    ----  --->  ------
+    #     s2          s2 y
+    #
+    # Fall 2: (i,j-1) ---> (i,j)
+    # ==> Stringvergleich ändert sich wie folgt:
+    #     s1          s1 GAP
+    #    ----  --->  -------
+    #     s2          s2  y
+    #
+    # Fall 3: (i-1,j) ---> (i,j)
+    # ==> Stringvergleich ändert sich wie folgt:
+    #     s1          s1  x
+    #    ----  --->  -------
+    #     s2          s2 GAP
+    #
+    # Diese Fälle berücksichtigen wir:
+    ################################
+    edges = [];
+    if i > 0 and j > 0:
+        edges.append((
+            Directions.DIAGONAL,
+            Costs[i-1, j-1] + missmatch_penalty(x, y),
+        ));
+    if j > 0:
+        edges.append((
+            Directions.HORIZONTAL,
+            Costs[i, j-1] + gap_penalty(y),
+        ));
+    if i > 0:
+        edges.append((
+            Directions.VERTICAL,
+            Costs[i-1, j] + gap_penalty(x),
+        ));
+
+    if len(edges) > 0:
+        # Sortiere nach Priorität (festgelegt in Enum):
+        edges = sorted(edges, key=lambda x: x[0].value);
+        # Wähle erste Möglichkeit mit minimalen Kosten:
+        index = np.argmin([ cost for _, cost in edges]);
+        Moves[i, j], Costs[i, j] = edges[index];
+    return;
+
+def reconstruct_words(
+    X: str,
+    Y: str,
+    Moves: NDArray[(Any, Any), Directions],
+    path: List[Tuple[int, int]],
+) -> Tuple[str, str]:
+    word_x = '';
+    word_y = '';
+    for (i, j) in path:
+        x = X[i];
+        y = Y[j];
+        match Moves[i, j]:
+            case Directions.DIAGONAL:
+                word_x += x;
+                word_y += y;
+            case Directions.HORIZONTAL:
+                word_x += '-';
+                word_y += y;
+            case Directions.VERTICAL:
+                word_x += x;
+                word_y += '-';
+    return word_x, word_y;
+
+def reconstruct_optimal_path(
+    Moves: NDArray[(Any, Any), Directions],
+    coord: Optional[Tuple[int, int]] = None,
+) -> List[Tuple[int, int]]:
+    '''
+    Liest Matrix mit optimalen Schritten den optimalen Pfad aus,
+    angenfangen von Endkoordinaten.
+    '''
+    if coord is None:
+        m, n = Moves.shape;
+        (i, j) = (m-1, n-1);
+    else:
+        (i, j) = coord;
+    path = [(i, j)];
+    while (i, j) != (0, 0):
+        match Moves[i, j]:
+            case Directions.DIAGONAL:
+                (i, j) = (i - 1, j - 1);
+            case Directions.HORIZONTAL:
+                (i, j) = (i, j - 1);
+            case Directions.VERTICAL:
+                (i, j) = (i - 1, j);
+            case _:
+                break;
+        path.append((i, j));
+    return path[::-1];
+
+def reconstruct_optimal_path_halves(
+    Costs1: NDArray[(Any, Any), int],
+    Costs2: NDArray[(Any, Any), int],
+    Moves1: NDArray[(Any, Any), Directions],
+    Moves2: NDArray[(Any, Any), Directions],
+) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
+    (m, n1) = Costs1.shape;
+    (m, n2) = Costs2.shape;
+    info = [
+        (
+            Costs1[i, n1-1] + Costs2[m-1-i, n2-1],
+            (i, n1-1),
+            (m-1-i, n2-1),
+        )
+        for i in range(m)
+    ];
+    index = np.argmin([ cost for cost, _, _ in info ]);
+    path1 = reconstruct_optimal_path(Moves1, coord=info[index][1]);
+    path2 = reconstruct_optimal_path(Moves2, coord=info[index][2]);
+    return path1, path2;
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# AUXILIARY METHODS
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+def represent_cost_matrix(
+    Costs: NDArray[(Any, Any), int],
+    path: List[Tuple[int, int]],
+    X: str,
+    Y: str,
+    pad: bool = False,
+) -> Tuple[NDArray[(Any, Any), Any], NDArray[(Any, Any), Any]]:
+    m = len(X); # display vertically
+    n = len(Y); # display horizontally
+
+    # erstelle string-Array:
+    if pad:
+        table = np.full(shape=(3 + m + 3, 3 + n + 1), dtype=object, fill_value='');
+    else:
+        table = np.full(shape=(3 + m, 3 + n), dtype=object, fill_value='');
+
+    # topmost rows:
+    table[0, 3:(3+n)] = [str(j) for j in range(n)];
+    table[1, 3:(3+n)] = [y for y in Y];
+    table[2, 3:(3+n)] = '--';
+    # leftmost columns:
+    table[3:(3+m), 0] = [str(i) for i in range(m)];
+    table[3:(3+m), 1] = [x for x in X];
+    table[3:(3+m), 2] = '|';
+
+    if pad:
+        table[-3, 3:(3+n)] = '--';
+        table[3:(3+m), -1] = '|';
+
+    table_costs = table.copy();
+    table_moves = table.copy();
+    table_costs[3:(3+m), 3:(3+n)] = Costs;
+    table_moves[3:(3+m), 3:(3+n)] = '.';
+    for (i, j) in path:
+        # table_costs[3 + i, 3 + j] = f'\x1b[92;1m{table_costs[3 + i, 3 + j]}\x1b[0m';
+        table_moves[3 + i, 3 + j] = '@';
+
+    return table_costs, table_moves;
+
+def display_cost_matrix(
+    Costs: NDArray[(Any, Any), int],
+    path: List[Tuple[int, int]],
+    X: str,
+    Y: str,
+) -> Tuple[str, str]:
+    '''
+    Zeigt Kostenmatrix + optimalen Pfad.
+
+    @inputs
+    - `Costs` - Kostenmatrix
+    - `Moves` - Kodiert die optimalen Schritte
+    - `X`, `Y` - Strings
+
+    @returns
+    - eine 'printable' Darstellung der Matrix mit den Strings X, Y + Indexes.
+    '''
+    table_costs, table_moves = represent_cost_matrix(Costs=Costs, path=path, X=X, Y=Y);
+    # benutze pandas-Dataframe, um schöner darzustellen:
+    costs_repr = pd.DataFrame(table_costs).to_string(index=False, header=False);
+    moves_repr = pd.DataFrame(table_moves).to_string(index=False, header=False);
+    return costs_repr, moves_repr;
+
+def display_cost_matrix_halves(
+    Costs1: NDArray[(Any, Any), int],
+    Costs2: NDArray[(Any, Any), int],
+    path1: List[Tuple[int, int]],
+    path2: List[Tuple[int, int]],
+    X1: str,
+    X2: str,
+    Y1: str,
+    Y2: str,
+) -> Tuple[str, str]:
+    '''
+    Zeigt Kostenmatrix + optimalen Pfad für Schritt im D & C Hirschberg-Algorithmus
+
+    @inputs
+    - `Costs1`, `Costs2` - Kostenmatrizen
+    - `Moves1`, `Moves2` - Kodiert die optimalen Schritte
+    - `X1`, `X2`, `Y1`, `Y2` - Strings
+
+    @returns
+    - eine 'printable' Darstellung der Matrix mit den Strings X, Y + Indexes.
+    '''
+    table_costs1, table_moves1 = represent_cost_matrix(Costs=Costs1, path=path1, X=X1, Y=Y1, pad=True);
+    table_costs2, table_moves2 = represent_cost_matrix(Costs=Costs2, path=path2, X=X2, Y=Y2, pad=True);
+
+    # merge Taellen:
+    table_costs = np.concatenate([table_costs1, table_costs2[::-1, ::-1]], axis=1);
+    table_moves = np.concatenate([table_moves1, table_moves2[::-1, ::-1]], axis=1);
+
+    # benutze pandas-Dataframe, um schöner darzustellen:
+    costs_repr = pd.DataFrame(table_costs).to_string(index=False, header=False);
+    moves_repr = pd.DataFrame(table_moves).to_string(index=False, header=False);
+
+    return costs_repr, moves_repr;