ads2_2022/code/python/src/string_alignment/hirschberg.py

454 lines
15 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IMPORTS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from __future__ import annotations;
from src.local.typing import *;
from src.local.maths import *;
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# EXPORTS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
__all__ = [
'hirschberg_algorithm',
'hirschberg_algorithm_once',
'DisplayMode'
];
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CONSTANTS / SETUP
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class DisplayMode(Enum):
NONE = -1;
COSTS = 0;
MOVES = 1;
COSTS_AND_MOVES = 2;
class Directions(Enum):
UNSET = -1;
# Prioritäten hier setzen
DIAGONAL = 0;
HORIZONTAL = 1;
VERTICAL = 2;
def gap_penalty(x: str):
return 1;
def missmatch_penalty(x: str, y: str):
return 0 if x == y else 1;
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# METHOD hirschberg_algorithm
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def hirschberg_algorithm_once(
X: str,
Y: str,
mode: DisplayMode = DisplayMode.NONE,
) -> Tuple[str, str]:
Costs, Moves = compute_cost_matrix(X = '-' + X, Y = '-' + Y);
path = reconstruct_optimal_path(Moves=Moves);
word_x, word_y = reconstruct_words(X = '-' + X, Y = '-' + Y, moves=[Moves[coord] for coord in path], path=path);
if mode != DisplayMode.NONE:
repr = display_cost_matrix(Costs=Costs, path=path, X = '-' + X, Y = '-' + Y, mode=mode);
print(f'\n{repr}');
print(f'\n\x1b[1mOptimales Alignment:\x1b[0m');
print('');
print(word_y);
print(len(word_x) * '-');
print(word_x);
print('');
return word_x, word_y;
def hirschberg_algorithm(
X: str,
Y: str,
mode: DisplayMode = DisplayMode.NONE,
) -> Tuple[str, str]:
alignments_x, alignments_y = hirschberg_algorithm_step(X=X, Y=Y, depth=1, mode=mode);
word_x = ''.join(alignments_x);
word_y = ''.join(alignments_y);
if mode != DisplayMode.NONE:
display_x = f'[{"][".join(alignments_x)}]';
display_y = f'[{"][".join(alignments_y)}]';
print(f'\n\x1b[1mOptimales Alignment:\x1b[0m');
print('');
print(display_y);
print(len(display_x) * '-');
print(display_x);
print('');
return word_x, word_y;
def hirschberg_algorithm_step(
X: str,
Y: str,
depth: int = 0,
mode: DisplayMode = DisplayMode.NONE,
) -> Tuple[List[str], List[str]]:
n = len(Y);
if n == 1:
Costs, Moves = compute_cost_matrix(X = '-' + X, Y = '-' + Y);
path = reconstruct_optimal_path(Moves=Moves);
word_x, word_y = reconstruct_words(X = '-' + X, Y = '-' + Y, moves=[Moves[coord] for coord in path], path=path);
# if verbose:
# repr = display_cost_matrix(Costs=Costs, path=path, X = '-' + X, Y = '-' + Y);
# print(f'\n\x1b[1mRekursionstiefe: {depth}\x1b[0m\n\n{repr}')
return [word_x], [word_y];
else:
n = int(np.ceil(n/2));
# bilde linke Hälfte vom horizontalen Wort:
Y1 = Y[:n];
X1 = X;
# bilde rechte Hälfte vom horizontalen Wort (und kehre h. + v. um):
Y2 = Y[n:][::-1];
X2 = X[::-1];
# Löse Teilprobleme:
Costs1, Moves1 = compute_cost_matrix(X = '-' + X1, Y = '-' + Y1);
Costs2, Moves2 = compute_cost_matrix(X = '-' + X2, Y = '-' + Y2);
if mode != DisplayMode.NONE:
path1, path2 = reconstruct_optimal_path_halves(Costs1=Costs1, Costs2=Costs2, Moves1=Moves1, Moves2=Moves2);
repr = display_cost_matrix_halves(
Costs1 = Costs1,
Costs2 = Costs2,
path1 = path1,
path2 = path2,
X1 = '-' + X1,
X2 = '-' + X2,
Y1 = '-' + Y1,
Y2 = '-' + Y2,
mode = mode,
);
print(f'\n\x1b[1mRekursionstiefe: {depth}\x1b[0m\n\n{repr}')
# Koordinaten des optimalen Übergangs berechnen:
coord1, coord2 = get_optimal_transition(Costs1=Costs1, Costs2=Costs2);
p = coord1[0];
# Divide and Conquer ausführen:
alignments_x_1, alignments_y_1 = hirschberg_algorithm_step(X=X[:p], Y=Y[:n], depth=depth+1, verbose=verbose, mode=mode);
alignments_x_2, alignments_y_2 = hirschberg_algorithm_step(X=X[p:], Y=Y[n:], depth=depth+1, verbose=verbose, mode=mode);
# Resultate zusammensetzen:
alignments_x = alignments_x_1 + alignments_x_2;
alignments_y = alignments_y_1 + alignments_y_2;
if len(Y[:n]) <= 1 and len(Y[n:]) <= 1:
# falls linke + rechte Hälfte nur aus <= 1 Buchstsaben bestehen, bestehen Alignment aus nur einem Teil ---> führe zusammen:
alignments_x = [ ''.join(alignments_x) ];
alignments_y = [ ''.join(alignments_y) ];
return alignments_x, alignments_y;
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# METHODS cost matrix + optimal paths
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def compute_cost_matrix(
X: str,
Y: str,
) -> Tuple[NDArray[(Any, Any), int], NDArray[(Any, Any), Directions]]:
'''
Berechnet Hirschberg-Costs-Matrix (ohne Rekursion).
Annahmen:
- X[0] = gap
- Y[0] = gap
'''
m = len(X); # display vertically
n = len(Y); # display horizontally
Costs = np.full(shape=(m, n), dtype=int, fill_value=0);
Moves = np.full(shape=(m, n), dtype=Directions, fill_value=Directions.UNSET);
# zuerst 0. Spalte und 0. Zeile ausfüllen:
for i, x in list(enumerate(X))[1:]:
update_cost_matrix(Costs, Moves, x, '', i, 0);
for j, y in list(enumerate(Y))[1:]:
update_cost_matrix(Costs, Moves, '', y, 0, j);
# jetzt alle »inneren« Werte bestimmen:
for i, x in list(enumerate(X))[1:]:
for j, y in list(enumerate(Y))[1:]:
update_cost_matrix(Costs, Moves, x, y, i, j);
return Costs, Moves;
def update_cost_matrix(
Costs: NDArray[(Any, Any), int],
Moves: NDArray[(Any, Any), Directions],
x: str,
y: str,
i: int,
j: int,
):
'''
Schrittweise Funktion zur Aktualisierung vom Eintrag `(i,j)` in der Kostenmatrix.
Annahme:
- alle »Vorgänger« von `(i,j)` in der Matrix sind bereits optimiert.
@inputs
- `Costs` - bisher berechnete Kostenmatrix
- `Moves` - bisher berechnete optimale Schritte
- `i`, `x` - Position und Wert in String `X` (»vertical« dargestellt)
- `j`, `y` - Position und Wert in String `Y` (»horizontal« dargestellt)
'''
# nichts zu tun, wenn (i, j) == (0, 0):
if i == 0 and j == 0:
Costs[0, 0] = 0;
return;
################################
# NOTE: Berechnung von möglichen Moves wie folgt.
#
# Fall 1: (i-1,j-1) ---> (i,j)
# ==> Stringvergleich ändert sich wie folgt:
# s1 s1 x
# ---- ---> ------
# s2 s2 y
#
# Fall 2: (i,j-1) ---> (i,j)
# ==> Stringvergleich ändert sich wie folgt:
# s1 s1 GAP
# ---- ---> -------
# s2 s2 y
#
# Fall 3: (i-1,j) ---> (i,j)
# ==> Stringvergleich ändert sich wie folgt:
# s1 s1 x
# ---- ---> -------
# s2 s2 GAP
#
# Diese Fälle berücksichtigen wir:
################################
edges = [];
if i > 0 and j > 0:
edges.append((
Directions.DIAGONAL,
Costs[i-1, j-1] + missmatch_penalty(x, y),
));
if j > 0:
edges.append((
Directions.HORIZONTAL,
Costs[i, j-1] + gap_penalty(y),
));
if i > 0:
edges.append((
Directions.VERTICAL,
Costs[i-1, j] + gap_penalty(x),
));
if len(edges) > 0:
# Sortiere nach Priorität (festgelegt in Enum):
edges = sorted(edges, key=lambda x: x[0].value);
# Wähle erste Möglichkeit mit minimalen Kosten:
index = np.argmin([ cost for _, cost in edges]);
Moves[i, j], Costs[i, j] = edges[index];
return;
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# METHODS optimaler treffpunkt
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def get_optimal_transition(
Costs1: NDArray[(Any, Any), int],
Costs2: NDArray[(Any, Any), int],
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
'''
Rekonstruiere »Treffpunkt«, wo die Gesamtkosten minimiert sind.
Dieser Punkt stellt einen optimal Übergang für den Rekursionsschritt dar.
'''
(m, n1) = Costs1.shape;
(m, n2) = Costs2.shape;
info = [
(
Costs1[i, n1-1] + Costs2[m-1-i, n2-1],
(i, n1-1),
(m-1-i, n2-1),
)
for i in range(m)
];
index = np.argmin([ cost for cost, _, _ in info ]);
coord1 = info[index][1];
coord2 = info[index][2];
return coord1, coord2;
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# METHODS reconstruction von words/paths
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def reconstruct_optimal_path(
Moves: NDArray[(Any, Any), Directions],
coord: Optional[Tuple[int, int]] = None,
) -> List[Tuple[int, int]]:
'''
Liest Matrix mit optimalen Schritten den optimalen Pfad aus,
angenfangen von Endkoordinaten.
'''
if coord is None:
m, n = Moves.shape;
(i, j) = (m-1, n-1);
else:
(i, j) = coord;
path = [(i, j)];
while (i, j) != (0, 0):
match Moves[i, j]:
case Directions.DIAGONAL:
(i, j) = (i - 1, j - 1);
case Directions.HORIZONTAL:
(i, j) = (i, j - 1);
case Directions.VERTICAL:
(i, j) = (i - 1, j);
case _:
break;
path.append((i, j));
return path[::-1];
def reconstruct_optimal_path_halves(
Costs1: NDArray[(Any, Any), int],
Costs2: NDArray[(Any, Any), int],
Moves1: NDArray[(Any, Any), Directions],
Moves2: NDArray[(Any, Any), Directions],
) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
'''
Rekonstruiere optimale Pfad für Rekursionsschritt,
wenn horizontales Wort in 2 aufgeteilt wird.
'''
coord1, coord2 = get_optimal_transition(Costs1=Costs1, Costs2=Costs2);
path1 = reconstruct_optimal_path(Moves1, coord=coord1);
path2 = reconstruct_optimal_path(Moves2, coord=coord2);
return path1, path2;
def reconstruct_words(
X: str,
Y: str,
moves: List[Directions],
path: List[Tuple[int, int]],
) -> Tuple[str, str]:
word_x = '';
word_y = '';
for ((i, j), move) in zip(path, moves):
x = X[i];
y = Y[j];
match move:
case Directions.DIAGONAL:
word_x += x;
word_y += y;
case Directions.HORIZONTAL:
word_x += '-';
word_y += y;
case Directions.VERTICAL:
word_x += x;
word_y += '-';
return word_x, word_y;
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# AUXILIARY METHODS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def represent_cost_matrix(
Costs: NDArray[(Any, Any), int],
path: List[Tuple[int, int]],
X: str,
Y: str,
mode: DisplayMode,
pad: bool = False,
) -> NDArray[(Any, Any), Any]:
m = len(X); # display vertically
n = len(Y); # display horizontally
# erstelle string-Array:
if pad:
table = np.full(shape=(3 + m + 3, 3 + n + 1), dtype=object, fill_value='');
else:
table = np.full(shape=(3 + m, 3 + n), dtype=object, fill_value='');
# topmost rows:
table[0, 3:(3+n)] = [str(j) for j in range(n)];
table[1, 3:(3+n)] = [y for y in Y];
table[2, 3:(3+n)] = '--';
# leftmost columns:
table[3:(3+m), 0] = [str(i) for i in range(m)];
table[3:(3+m), 1] = [x for x in X];
table[3:(3+m), 2] = '|';
if pad:
table[-3, 3:(3+n)] = '--';
table[3:(3+m), -1] = '|';
match mode:
case DisplayMode.MOVES:
table[3:(3+m), 3:(3+n)] = '.';
for (i, j) in path:
table[3 + i, 3 + j] = '*';
case DisplayMode.COSTS | DisplayMode.COSTS_AND_MOVES:
table[3:(3+m), 3:(3+n)] = Costs.copy();
if mode == DisplayMode.COSTS_AND_MOVES:
for (i, j) in path:
table[3 + i, 3 + j] = f'{{{table[3 + i, 3 + j]}}}';
return table;
def display_cost_matrix(
Costs: NDArray[(Any, Any), int],
path: List[Tuple[int, int]],
X: str,
Y: str,
mode: DisplayMode,
) -> str:
'''
Zeigt Kostenmatrix + optimalen Pfad.
@inputs
- `Costs` - Kostenmatrix
- `Moves` - Kodiert die optimalen Schritte
- `X`, `Y` - Strings
@returns
- eine 'printable' Darstellung der Matrix mit den Strings X, Y + Indexes.
'''
table = represent_cost_matrix(Costs=Costs, path=path, X=X, Y=Y, mode=mode);
# benutze pandas-Dataframe + tabulate, um schöner darzustellen:
repr = tabulate(pd.DataFrame(table), showindex=False, stralign='center', tablefmt='plain');
return repr;
def display_cost_matrix_halves(
Costs1: NDArray[(Any, Any), int],
Costs2: NDArray[(Any, Any), int],
path1: List[Tuple[int, int]],
path2: List[Tuple[int, int]],
X1: str,
X2: str,
Y1: str,
Y2: str,
mode: DisplayMode,
) -> str:
'''
Zeigt Kostenmatrix + optimalen Pfad für Schritt im D & C Hirschberg-Algorithmus
@inputs
- `Costs1`, `Costs2` - Kostenmatrizen
- `Moves1`, `Moves2` - Kodiert die optimalen Schritte
- `X1`, `X2`, `Y1`, `Y2` - Strings
@returns
- eine 'printable' Darstellung der Matrix mit den Strings X, Y + Indexes.
'''
table1 = represent_cost_matrix(Costs=Costs1, path=path1, X=X1, Y=Y1, mode=mode, pad=True);
table2 = represent_cost_matrix(Costs=Costs2, path=path2, X=X2, Y=Y2, mode=mode, pad=True);
# merge Taellen:
table = np.concatenate([table1[:, :-1], table2[::-1, ::-1]], axis=1);
# benutze pandas-Dataframe + tabulate, um schöner darzustellen:
repr = tabulate(pd.DataFrame(table), showindex=False, stralign='center', tablefmt='plain');
return repr;