1import numpy as np
2import seaborn as sns
3import matplotlib.pyplot as plt
4import matplotlib as mpl
5import pandas as pd
6
7mpl.rcParams["font.family"] = "Segoe UI Emoji"
8
9text1 = """
10 🧁🧯 🤇🥌🦟 🤇🤫🥌😼🤫 🤇🤟 🦘🦊🥾
11🤯🤯🧛😎😼 🥌🥦🤇🤇🦟😒 🧁🧛🧁🦊🧯 🦘😎🦘🦟🤯🦺 🥾🥾🥑🥑🥌
12🤟🧁🥦 🥦😼🦊🥾🦺🤇 🧯🦟🦘🤶🦺🦟🥷🧯 😎🦟🦊🤟
13🧁😎🦺🦊🤫🥌🧯 🦺😎 🦘😼🥦😎🥑 🤯🦘🦘🦺🦺 😼🤟🥾🤯🥾🤶🥦🦘
14🦟😎🤟🦺😼🤶🤟 🧁🦟😎🤯🧛🤇🤟 🧯🤟 😎🦘🥌🧁🦘🤶🦺🦊 🤶🤶🤫🤟🦺
15🤇🥌😎🥌🤟🤇 🥦🦺🦘🤇🥾🥷 🤯🦘🦺😒 🧯🦟🦊 🤇🧯🥌 🦟🤇🧛🧛🤟🤯🧛
16🤫🦟 🥦🤶🥑🤇🥑 🥦🦊🥦 😼🤇🦟😎🥑 🧁🦊🤯🤫🦘🥾🦊
17😎😼 🧛😎🧛🦊🤇🦊🦺🥷 🤯🦘🤶🧁🧁🧁🥌 🥷🤶🦊🦊😒🧛 😼😎🤫🧁"""
18
19
20text2 = """
21 🦺🤶🦺🦟 🦘🧯 🧛🥑🤯 🤶🥌🤇🥷😒
22🧁🦘🥾 😎🤯 🤫🧁🤟🦺🤟🧯 🦺🤟🦟🤫😼
23🤶😒🦺🥾 🧁🤶🥷🧛🧛🤫🧛 🤇🥾🤫🤇😒🦟🦟 🤇🥦🥷🧁🥑
24😎🧯🦘🦟 🧛🦟🤇🧛🤫 😼🤟🦘 😎🦟🤯
25🥷🤯🤟🧯🧁 🧁🤶🧁🧛🦘 🧁🤶🧯🥑🤯😒 🥦🥾🤶🤫😎 😎😎🤯😼🤶🧁🥷🦟
26🧛😼🤟 🥑🤇🥾 🥾😒🤫 🧁😎🧛🤟🦺🥑🤯🤇 🦟🧯🥷 🥾🤇🥾
27🥦🧁🦊😎🧁😒 😎🤇😼🤯🦊🥾🥷 😎🥌🤯🦟🧛🧁🧛 🦘🧁 😎😒🦊🧛 🥾🥾🤶🤯
28😎😒🧛🤯🧁 🤟🦺🤯🧛 🦊🤶🥾🥑🥷🦘🦺 🦟😼🦘 🦘🤶😼🤫 🦟🦘😼🤯🤯"""
29
30
31text3 = """
32 🧯🧛🦟🤫🥦🤇🦟 😎🥑🤟🥷🧯🤫🤫 🥷🧯🦺🦺 🧛🥑🥾🦺🧁
33🤇🦟🧁🥌🤇🥾🤶🥑 🥾🤫 😼🥑🦺 🤶🦺🦟 🤫🦺 🥦😼🧯🤫🦺🤶
34🤟🥾🦺🤇 🤯🥦🤫🦟🤯🤟🤟🤶 🧛🥦🧁 🦘🧯🥾🧛🧯 🥑🤶😼
35😎😎🥾🧛🧛 🦘🥑🧁😼🥌🤫🤟 🤟🤇🤶🦊🥾 🤫😼🤫🥾🦟🧁 🤇🦘🦺🧛😼 🥾🤇🥌🧁🤫🥷🦟🤫
36🥑🧁🧁🤫🥑🦟🤟🥑 🧛🥌🥾 😼😼🤶🤟🦘 🤟🦺🤇🦘🧛🤯🥌 🧁🦟🤯🥾
37🦘🥑 😼🥦🦟🧛🥑🧯🥌 🤇😼 😼🤫
38🥾🦊🥷 🦘🤟 🦊🥑🤯🤫🥌🧛 🤇🤫😎🦺🥌 🥾🥦🥑🦊🧁 🥌😼😎🤫🤯
39🥦🤶🧛🤟😒🦟🧁🧯 🤇🤫🧛🥦🦺🤟 🥦🤶🥑🦟 🦺🧛🦘🥦🧯🦺"""
40
41
42text4 = """
43 😎🧁🥦🦊🦟🥦 🤶🧯🥌🧯🤟 🧯🦺🧯😒🧁🧛 🦊🧁🥑🤫🤶 🤶🧁🦺🦊😎🤫🧛🥷 😒🤇🥌🤯🤇🤶🤶🥾
44🦊😼🦟🦘😎🧛🤟 😎🤇🤯🧛🤇🦘🦘😼 🥑🥷🥑🤇🤟🤇 🦊🥑🥌😼🦘🤇 😼🤟 🦺🦟🥷😒🥦🤯
45🤇🦟 🧛🤟🥾🤯🥦 🥦😒😎🥑🥷🤟🥾 🥑🤶 🦺🤫🥷😒🦺🤟 🥦😒🧛🧯🤇🥑🤶🥾
46😎🤟🦺😒🥾🤇🤯🤶 🥦🦟🥦🤟🤫🤯🥑🤇 🤟🦊🥾🤟🤶🧯 😼🤯😒 🥦🤇🧁
47😒🤟🥌🥌🤫🧛😼 🤯🦟🥑🤇🥑 🥑🦟😎😼😎 🦘🦟🥦🤇🦘
48🤯😎🥦🦊😼😎🤶🤇 😼🤯🤯😎🥾😼🥷 🤯🥑🦟😼 🥷😎🤇🥌 🦺🦺🦺🥌🦘🧛🤫🤇
49🥷🧁🥦😎 🧁🤇😒🦊🤯🤯😎😼 🧯🧁🥦 😎🤯 🦟🤫🥑🤟🤫🦟 😼🧁🦺😒😒🦺🤫🥦
50🧛🥦🦟😼😒😒🦘 🤇🥾 🥌🥦🧛🤯🤶 🦟🥷🤇🥌😒🤶 🦟🤯😼😼"""
51
52
53text5 = """
54 🥌🦊🥾🦊🧁🧯🧛🤯 🧛😎🦟🥌🦘🥾🤯 😎🤶🧁🥌🦘🤇 🧛🦟🤯 🦊🦘🤶🦘🦊🥌 🥷🦊🧛
55🧯😒🤫 😼🥌🧛🥾😼😎🤶 🧁🤫🧛🥾🤫 🦊🦟🦘🤶🥌😼
56🧛🦊🤯🥾 🥑🦟🥑🤇😼 😒🦟 🦘🧛😎 🤟😼
57😒🥾🤶😒🥾 🤇🦘 🥾😼 🥦🦊🥌🦘🦟
58🤶🦘🦟 🧁🧯 🤫🤟😒😒🥦 🥦🧁🥌🦘😼🤟🤶
59🤫🥑🦘🤇🥷🧁 🥑🧁 🤶🧛😒 🥷🥦
60🧯🦊🧛 😼🤯🥑🤟 🧯😼🧯🦊😒🥦 🥾😒🦘🤫🦺🤟🧯 🧛😎🧛🧛😼🤯 🧁😒🦺🦺😒🦺
61🧯😼🤯🧁🥌🤇🧛🦘 🧯🤯 😒🤟😼🥾🤟🤫 😎🤫"""
62
63sequence = "😎😎🥦🦊🦺🥑🤇🧛🥦🦟🦘😼🥾🥦🤇🥌🦺🤶🦊😎🦟🥷🥷🥌😒🥑🦟🦺🤶🤶🥾🥾😼🥑😎🤫😎🦘🥷🦘🤯🤯🦟🤟🤯😎🥷🦊🥾🦟"
64
65
66def clean_text(text):
67 """
68 **TODO**:
69 Clean the text by removing all white spaces and new line character (\\\\n)
70
71 :param text: The text to clean
72 :return: The same text witout white spaces and new line characters
73 """
74 return text.replace(" ", "").replace("\n", "")
75
76
77def character_propabilities(text, all_chars):
78 """
79 **TODO**:
80 Given a text, calculate the empirical observation propability of
81 all characters from the "all_chars" list.
82
83 The observation propability for character c
84 is given as the number of occurrences of that character divided by the total
85 number of characters in the string.
86
87 :param text: The text for which character observation propabilities are to be calculated
88 :param all_chars: A set of unique characters. The propability for each such character is to be calcualted.
89 :return: A dictionary mapping all characters within the all_chars parameter to its respective observation propability.
90 """
91 text = clean_text(text)
92 return {char: text.count(char) / len(text) for char in all_chars}
93
94
95def get_emmision_propabilities(all_texts):
96 """
97 **TODO**:
98 Return the emmision propabilities for each character in all the sets.
99 This is essentially a list of dictionaries provided by :py:meth:`forward.character_propabilities`
100
101 * Join all the texts together and clean the result (call :py:meth:`clean_text`).
102 * Convert the joined string into a set to retrieve the unique characters (call `set <https://www.w3schools.com/python/python_sets.asp>`_)
103 * Return a list of emmision propabilities dictionaries for all the texts (call :py:meth:`forward.character_propabilities`)
104
105 :param all_texts: A list of texts
106 :return: A list of dictionaries with emmision propabilities for each text
107 """
108 # Join all texts and clean them
109 joined_text = clean_text("".join(all_texts))
110
111 # Get a unique list of all characters across all five texts
112 all_chars = set(joined_text)
113
114 # Now get the character emmision propabilities for each text
115 return [character_propabilities(text, all_chars) for text in all_texts]
116
117
118def get_initial_alpha():
119 """
120 **TODO**:
121 Return the initial alpha vector for the forward algorithm.
122
123 Hint: In the beginning, all states are equally likely
124
125 :return: np.array of shape 5x1 with the initial (equally likely) alpha values.
126 """
127 # In the begining, we don´t know which text our colleague choose
128 # to start with, so all texts are equally likely
129 return np.array([1.0, 1.0, 1.0, 1.0, 1.0])
130
131
132def get_state_transition_matrix():
133 """
134 **TODO**:
135 Return the state transition matrix for the forward algorithm.
136
137 Hint: With 90% chance the state stays the same while the remaining 10% shall be equally divided between the four other states.
138
139 :return: np.array of shape 5x5 with the correct state transition propabilities
140 """
141 return np.array(
142 [
143 [0.900, 0.025, 0.025, 0.025, 0.025],
144 [0.025, 0.900, 0.025, 0.025, 0.025],
145 [0.025, 0.025, 0.900, 0.025, 0.025],
146 [0.025, 0.025, 0.025, 0.900, 0.025],
147 [0.025, 0.025, 0.025, 0.025, 0.900],
148 ]
149 )
150
151
152def forward(alpha, character, state_transition_matrix, emmision_propabilities):
153 """
154 **TODO**: Implement one step of the forward algorithm.
155
156 * Given the past alpha-values and the newly read character, use the state_transition_matrix
157 to first predict the new state propabilities (new alpha values) according to the script.
158
159 * Then multiply the state propabilities with the emmision propabilities of the observed character
160 for each alphabet to retrieve the new alpha values.
161
162 * Normalize the alpha vector after each step by diving by its sum. This helps to achieve numerically more stable results
163 and allows for better interpretation of the results.
164
165 :param alpha: np.array of shape (5,1) holding the past alpha values
166 :param character: Observed character in this step
167 :param state_transition_matrix: np.array of shape (5,5) holding the state transition propabilities
168 :param emmision_propabilities: List of dictionaries holding the character emmision propabilities for each alphabet.
169 :return: New alpha-vector after state transition and observation update (np.array of shape 5,1)
170 """
171 # TODO: Implement state transition and update the alpha vector accordingly
172 alpha = state_transition_matrix @ alpha
173
174 # TODO: Retrieve symbol emmision propabilties for the given character and update the alpha vector
175 Y = np.array([alphabet[character] for alphabet in emmision_propabilities])
176 alpha = Y * alpha
177
178 # TODO: Normalize alpha for better visualization (divide by sum)
179 alpha /= alpha.sum()
180
181 # TODO: Return alpha
182 return alpha
183
184
185if __name__ == "__main__":
186 # Get initial alpha values
187 alpha = get_initial_alpha()
188
189 # Estimate the emmision propabilities for the five texts
190 emmision_propabilities = get_emmision_propabilities(
191 [text1, text2, text3, text4, text5]
192 )
193
194 # Build the state transition matrix
195 state_transition_matrix = get_state_transition_matrix()
196
197 alpha_matrix = np.zeros((50, 5)) # shape: (T, num_states)
198
199 # Clean the sequence
200 sequence = clean_text(sequence)
201
202 # Iterate over whole sequence
203 for t, character in enumerate(sequence):
204 # Run forward algorithm
205 alpha = forward(
206 alpha, character, state_transition_matrix, emmision_propabilities
207 )
208
209 # Store current alpha for later
210 alpha_matrix[t, :] = alpha
211
212 # Visualize alpha vectors as heat map
213 states = ["Text 1", "Text 2", "Text 3", "Text 4", "Text 5"]
214 df = pd.DataFrame(
215 alpha_matrix.T, index=states, columns=[f"t{t+1}" for t in range(len(sequence))]
216 )
217
218 plt.ioff()
219 plt.figure(figsize=(10, 4))
220 sns.heatmap(df, annot=False, cmap="YlGnBu", cbar=True)
221 plt.title("Alpha-Werte pro Zustand über die Zeit")
222 plt.xlabel("Zeit (Position in Sequenz)")
223 plt.ylabel("Zustand")
224 plt.tight_layout()
225 plt.show()