ads2_2022/code/python/src/string_alignment/hirschberg.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IMPORTS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

from __future__ import annotations;
from src.local.typing import *;
from src.local.maths import *;


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# EXPORTS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

__all__ = [
    'hirschberg_algorithm',
    'hirschberg_algorithm_once',
    'DisplayMode'
];

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CONSTANTS / SETUP
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

class DisplayMode(Enum):
    NONE = -1;
    COSTS = 0;
    MOVES = 1;
    COSTS_AND_MOVES = 2;

class Directions(Enum):
    UNSET = -1;
    # Prioritäten hier setzen
    DIAGONAL = 0;
    HORIZONTAL = 1;
    VERTICAL = 2;

def gap_penalty(x: str):
    return 1;

def missmatch_penalty(x: str, y: str):
    return 0 if x == y else 1;

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# METHOD hirschberg_algorithm
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def hirschberg_algorithm_once(
    X: str,
    Y: str,
    mode: DisplayMode = DisplayMode.NONE,
) -> Tuple[str, str]:
    Costs, Moves = compute_cost_matrix(X = '-' + X, Y = '-' + Y);
    path = reconstruct_optimal_path(Moves=Moves);
    word_x, word_y = reconstruct_words(X = '-' + X, Y = '-' + Y, moves=[Moves[coord] for coord in path], path=path);
    if mode != DisplayMode.NONE:
        repr = display_cost_matrix(Costs=Costs, path=path, X = '-' + X, Y = '-' + Y, mode=mode);
        print(f'\n{repr}');
        print(f'\n\x1b[1mOptimales Alignment:\x1b[0m');
        print('');
        print(word_y);
        print(len(word_x) * '-');
        print(word_x);
        print('');
    return word_x, word_y;

def hirschberg_algorithm(
    X: str,
    Y: str,
    mode: DisplayMode = DisplayMode.NONE,
) -> Tuple[str, str]:
    alignments_x, alignments_y = hirschberg_algorithm_step(X=X, Y=Y, depth=1, mode=mode);
    word_x = ''.join(alignments_x);
    word_y = ''.join(alignments_y);
    if mode != DisplayMode.NONE:
        display_x = f'[{"][".join(alignments_x)}]';
        display_y = f'[{"][".join(alignments_y)}]';
        print(f'\n\x1b[1mOptimales Alignment:\x1b[0m');
        print('');
        print(display_y);
        print(len(display_x) * '-');
        print(display_x);
        print('');
    return word_x, word_y;

def hirschberg_algorithm_step(
    X: str,
    Y: str,
    depth: int = 0,
    mode: DisplayMode = DisplayMode.NONE,
) -> Tuple[List[str], List[str]]:
    n = len(Y);
    if n == 1:
        Costs, Moves = compute_cost_matrix(X = '-' + X, Y = '-' + Y);
        path = reconstruct_optimal_path(Moves=Moves);
        word_x, word_y = reconstruct_words(X = '-' + X, Y = '-' + Y, moves=[Moves[coord] for coord in path], path=path);
        # if verbose:
        #     repr = display_cost_matrix(Costs=Costs, path=path, X = '-' + X, Y = '-' + Y);
        #     print(f'\n\x1b[1mRekursionstiefe: {depth}\x1b[0m\n\n{repr}')
        return [word_x], [word_y];
    else:
        n = int(np.ceil(n/2));

        # bilde linke Hälfte vom horizontalen Wort:
        Y1 = Y[:n];
        X1 = X;

        # bilde rechte Hälfte vom horizontalen Wort (und kehre h. + v. um):
        Y2 = Y[n:][::-1];
        X2 = X[::-1];

        # Löse Teilprobleme:
        Costs1, Moves1 = compute_cost_matrix(X = '-' + X1, Y = '-' + Y1);
        Costs2, Moves2 = compute_cost_matrix(X = '-' + X2, Y = '-' + Y2);

        if mode != DisplayMode.NONE:
            path1, path2 = reconstruct_optimal_path_halves(Costs1=Costs1, Costs2=Costs2, Moves1=Moves1, Moves2=Moves2);
            repr = display_cost_matrix_halves(
                Costs1 = Costs1,
                Costs2 = Costs2,
                path1  = path1,
                path2  = path2,
                X1     =  '-' + X1,
                X2     =  '-' + X2,
                Y1     =  '-' + Y1,
                Y2     =  '-' + Y2,
                mode   = mode,
            );
            print(f'\n\x1b[1mRekursionstiefe: {depth}\x1b[0m\n\n{repr}')

        # Koordinaten des optimalen Übergangs berechnen:
        coord1, coord2 = get_optimal_transition(Costs1=Costs1, Costs2=Costs2);
        p = coord1[0];
        # Divide and Conquer ausführen:
        alignments_x_1, alignments_y_1 = hirschberg_algorithm_step(X=X[:p], Y=Y[:n], depth=depth+1, verbose=verbose, mode=mode);
        alignments_x_2, alignments_y_2 = hirschberg_algorithm_step(X=X[p:], Y=Y[n:], depth=depth+1, verbose=verbose, mode=mode);

        # Resultate zusammensetzen:
        alignments_x = alignments_x_1 + alignments_x_2;
        alignments_y = alignments_y_1 + alignments_y_2;
        if len(Y[:n]) <= 1 and len(Y[n:]) <= 1:
            # falls linke + rechte Hälfte nur aus <= 1 Buchstsaben bestehen, bestehen Alignment aus nur einem Teil ---> führe zusammen:
            alignments_x = [ ''.join(alignments_x) ];
            alignments_y = [ ''.join(alignments_y) ];
        return alignments_x, alignments_y;

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# METHODS cost matrix + optimal paths
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def compute_cost_matrix(
    X: str,
    Y: str,
) -> Tuple[NDArray[(Any, Any), int], NDArray[(Any, Any), Directions]]:
    '''
    Berechnet Hirschberg-Costs-Matrix (ohne Rekursion).

    Annahmen:
    - X[0] = gap
    - Y[0] = gap
    '''
    m = len(X); # display vertically
    n = len(Y); # display horizontally
    Costs = np.full(shape=(m, n), dtype=int, fill_value=0);
    Moves = np.full(shape=(m, n), dtype=Directions, fill_value=Directions.UNSET);

    # zuerst 0. Spalte und 0. Zeile ausfüllen:
    for i, x in list(enumerate(X))[1:]:
        update_cost_matrix(Costs, Moves, x, '', i, 0);

    for j, y in list(enumerate(Y))[1:]:
        update_cost_matrix(Costs, Moves, '', y, 0, j);

    # jetzt alle »inneren« Werte bestimmen:
    for i, x in list(enumerate(X))[1:]:
        for j, y in list(enumerate(Y))[1:]:
            update_cost_matrix(Costs, Moves, x, y, i, j);
    return Costs, Moves;

def update_cost_matrix(
    Costs: NDArray[(Any, Any), int],
    Moves: NDArray[(Any, Any), Directions],
    x: str,
    y: str,
    i: int,
    j: int,
):
    '''
    Schrittweise Funktion zur Aktualisierung vom Eintrag `(i,j)` in der Kostenmatrix.

    Annahme:
    - alle »Vorgänger« von `(i,j)` in der Matrix sind bereits optimiert.

    @inputs
    - `Costs` - bisher berechnete Kostenmatrix
    - `Moves` - bisher berechnete optimale Schritte
    - `i`, `x` - Position und Wert in String `X` (»vertical« dargestellt)
    - `j`, `y` - Position und Wert in String `Y` (»horizontal« dargestellt)
    '''

    # nichts zu tun, wenn (i, j) == (0, 0):
    if i == 0 and j == 0:
        Costs[0, 0] = 0;
        return;

    ################################
    # NOTE: Berechnung von möglichen Moves wie folgt.
    #
    # Fall 1: (i-1,j-1) ---> (i,j)
    # ==> Stringvergleich ändert sich wie folgt:
    #     s1          s1 x
    #    ----  --->  ------
    #     s2          s2 y
    #
    # Fall 2: (i,j-1) ---> (i,j)
    # ==> Stringvergleich ändert sich wie folgt:
    #     s1          s1 GAP
    #    ----  --->  -------
    #     s2          s2  y
    #
    # Fall 3: (i-1,j) ---> (i,j)
    # ==> Stringvergleich ändert sich wie folgt:
    #     s1          s1  x
    #    ----  --->  -------
    #     s2          s2 GAP
    #
    # Diese Fälle berücksichtigen wir:
    ################################
    edges = [];
    if i > 0 and j > 0:
        edges.append((
            Directions.DIAGONAL,
            Costs[i-1, j-1] + missmatch_penalty(x, y),
        ));
    if j > 0:
        edges.append((
            Directions.HORIZONTAL,
            Costs[i, j-1] + gap_penalty(y),
        ));
    if i > 0:
        edges.append((
            Directions.VERTICAL,
            Costs[i-1, j] + gap_penalty(x),
        ));

    if len(edges) > 0:
        # Sortiere nach Priorität (festgelegt in Enum):
        edges = sorted(edges, key=lambda x: x[0].value);
        # Wähle erste Möglichkeit mit minimalen Kosten:
        index = np.argmin([ cost for _, cost in edges]);
        Moves[i, j], Costs[i, j] = edges[index];
    return;

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# METHODS optimaler treffpunkt
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def get_optimal_transition(
    Costs1: NDArray[(Any, Any), int],
    Costs2: NDArray[(Any, Any), int],
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    '''
    Rekonstruiere »Treffpunkt«, wo die Gesamtkosten minimiert sind.
    Dieser Punkt stellt einen optimal Übergang für den Rekursionsschritt dar.
    '''
    (m, n1) = Costs1.shape;
    (m, n2) = Costs2.shape;
    info = [
        (
            Costs1[i, n1-1] + Costs2[m-1-i, n2-1],
            (i, n1-1),
            (m-1-i, n2-1),
        )
        for i in range(m)
    ];
    index = np.argmin([ cost for cost, _, _ in info ]);
    coord1 = info[index][1];
    coord2 = info[index][2];
    return coord1, coord2;

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# METHODS reconstruction von words/paths
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def reconstruct_optimal_path(
    Moves: NDArray[(Any, Any), Directions],
    coord: Optional[Tuple[int, int]] = None,
) -> List[Tuple[int, int]]:
    '''
    Liest Matrix mit optimalen Schritten den optimalen Pfad aus,
    angenfangen von Endkoordinaten.
    '''
    if coord is None:
        m, n = Moves.shape;
        (i, j) = (m-1, n-1);
    else:
        (i, j) = coord;
    path = [(i, j)];
    while (i, j) != (0, 0):
        match Moves[i, j]:
            case Directions.DIAGONAL:
                (i, j) = (i - 1, j - 1);
            case Directions.HORIZONTAL:
                (i, j) = (i, j - 1);
            case Directions.VERTICAL:
                (i, j) = (i - 1, j);
            case _:
                break;
        path.append((i, j));
    return path[::-1];

def reconstruct_optimal_path_halves(
    Costs1: NDArray[(Any, Any), int],
    Costs2: NDArray[(Any, Any), int],
    Moves1: NDArray[(Any, Any), Directions],
    Moves2: NDArray[(Any, Any), Directions],
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
    '''
    Rekonstruiere optimale Pfad für Rekursionsschritt,
    wenn horizontales Wort in 2 aufgeteilt wird.
    '''
    coord1, coord2 = get_optimal_transition(Costs1=Costs1, Costs2=Costs2);
    path1 = reconstruct_optimal_path(Moves1, coord=coord1);
    path2 = reconstruct_optimal_path(Moves2, coord=coord2);
    return path1, path2;

def reconstruct_words(
    X: str,
    Y: str,
    moves: List[Directions],
    path: List[Tuple[int, int]],
) -> Tuple[str, str]:
    word_x = '';
    word_y = '';
    for ((i, j), move) in zip(path, moves):
        x = X[i];
        y = Y[j];
        match move:
            case Directions.DIAGONAL:
                word_x += x;
                word_y += y;
            case Directions.HORIZONTAL:
                word_x += '-';
                word_y += y;
            case Directions.VERTICAL:
                word_x += x;
                word_y += '-';
    return word_x, word_y;

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# AUXILIARY METHODS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def represent_cost_matrix(
    Costs: NDArray[(Any, Any), int],
    path: List[Tuple[int, int]],
    X: str,
    Y: str,
    mode: DisplayMode,
    pad: bool = False,
) -> NDArray[(Any, Any), Any]:
    m = len(X); # display vertically
    n = len(Y); # display horizontally

    # erstelle string-Array:
    if pad:
        table = np.full(shape=(3 + m + 3, 3 + n + 1), dtype=object, fill_value='');
    else:
        table = np.full(shape=(3 + m, 3 + n), dtype=object, fill_value='');

    # topmost rows:
    table[0, 3:(3+n)] = [str(j) for j in range(n)];
    table[1, 3:(3+n)] = [y for y in Y];
    table[2, 3:(3+n)] = '--';
    # leftmost columns:
    table[3:(3+m), 0] = [str(i) for i in range(m)];
    table[3:(3+m), 1] = [x for x in X];
    table[3:(3+m), 2] = '|';

    if pad:
        table[-3, 3:(3+n)] = '--';
        table[3:(3+m), -1] = '|';

    match mode:
        case DisplayMode.MOVES:
            table[3:(3+m), 3:(3+n)] = '.';
            for (i, j) in path:
                table[3 + i, 3 + j] = '*';
        case DisplayMode.COSTS | DisplayMode.COSTS_AND_MOVES:
            table[3:(3+m), 3:(3+n)] = Costs.copy();
            if mode == DisplayMode.COSTS_AND_MOVES:
                for (i, j) in path:
                    table[3 + i, 3 + j] = f'{{{table[3 + i, 3 + j]}}}';

    return table;

def display_cost_matrix(
    Costs: NDArray[(Any, Any), int],
    path: List[Tuple[int, int]],
    X: str,
    Y: str,
    mode: DisplayMode,
) -> str:
    '''
    Zeigt Kostenmatrix + optimalen Pfad.

    @inputs
    - `Costs` - Kostenmatrix
    - `Moves` - Kodiert die optimalen Schritte
    - `X`, `Y` - Strings

    @returns
    - eine 'printable' Darstellung der Matrix mit den Strings X, Y + Indexes.
    '''
    table = represent_cost_matrix(Costs=Costs, path=path, X=X, Y=Y, mode=mode);
    # benutze pandas-Dataframe + tabulate, um schöner darzustellen:
    repr = tabulate(pd.DataFrame(table), showindex=False, stralign='center', tablefmt='plain');
    return repr;

def display_cost_matrix_halves(
    Costs1: NDArray[(Any, Any), int],
    Costs2: NDArray[(Any, Any), int],
    path1: List[Tuple[int, int]],
    path2: List[Tuple[int, int]],
    X1: str,
    X2: str,
    Y1: str,
    Y2: str,
    mode: DisplayMode,
) -> str:
    '''
    Zeigt Kostenmatrix + optimalen Pfad für Schritt im D & C Hirschberg-Algorithmus

    @inputs
    - `Costs1`, `Costs2` - Kostenmatrizen
    - `Moves1`, `Moves2` - Kodiert die optimalen Schritte
    - `X1`, `X2`, `Y1`, `Y2` - Strings

    @returns
    - eine 'printable' Darstellung der Matrix mit den Strings X, Y + Indexes.
    '''
    table1 = represent_cost_matrix(Costs=Costs1, path=path1, X=X1, Y=Y1, mode=mode, pad=True);
    table2 = represent_cost_matrix(Costs=Costs2, path=path2, X=X2, Y=Y2, mode=mode, pad=True);

    # merge Taellen:
    table = np.concatenate([table1[:, :-1], table2[::-1, ::-1]], axis=1);

    # benutze pandas-Dataframe + tabulate, um schöner darzustellen:
    repr = tabulate(pd.DataFrame(table), showindex=False, stralign='center', tablefmt='plain');
    return repr;