alan-turing-institute · boykovdn · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024 · jack89roberts
diff --git a/README.md b/README.md
@@ -21,6 +21,12 @@ python -m pip install .
 
 `typst compile notes.typ`
 
+## CallHome Dataset
+
+Go [here](https://ca.talkbank.org/access/CallHome), select the conversation language, and create an account; you can then download the "media folder". There you can find the .cha files, which contain the transcriptions.
+
+To load the transcriptions as a bag of sentences, use `m4st.parse.TranscriptParser.from_folder` to load all conversation lines. This class does not group them by participant or by conversation; it simply loads every line as an entry in a list (after some pre-processing).
+
 
 ## License
 

diff --git a/doc/notes.typ b/doc/notes.typ
@@ -24,7 +24,7 @@ Either way, the translation will be influenced by the domain shift due to filler
 //We don't know the real world distribution of filler words, but we could use a LLM to sample from $bb(P)(hat(x) | x)$, where $x$ is the clean input, and $hat(x)$ is the filler-word-corrupted input.
 
 The translation model can be defined as $cal(T): x arrow x^prime$, where $x^prime$ is the translated text.
-The metric can be defined as $cal(M): x^prime, x, {y_i}_(i=1)^N arrow bb(R)$, where $y_i$ are reference translations provided by $N$ translators.
+The metric can be defined as $cal(M): [a], x^prime, x, {y_i}_(i=1)^N arrow bb(R)$, where $y_i$ are reference translations provided by $N$ translators, and $a$ is the source audio (written in brackets to mark it as optional, since some metrics don't accept it).
 In our use case $N=1$.
 
 We are generally not interested in benchmarking different models, so we can assume that $cal(T)$ is given.

diff --git a/pyproject.toml b/pyproject.toml
@@ -26,7 +26,9 @@ classifiers = [
   "Topic :: Scientific/Engineering",
   "Typing :: Typed",
 ]
-dependencies = []
+dependencies = [
+  "tqdm"
+]
 
 [project.optional-dependencies]
 dev = [
@@ -70,6 +72,7 @@ disallow_untyped_defs = false
 disallow_incomplete_defs = false
 check_untyped_defs = true
 strict = false
+ignore_missing_imports = true
 
 
 [tool.ruff]

diff --git a/src/m4st/parse.py b/src/m4st/parse.py
@@ -0,0 +1,70 @@
+import glob
+import os
+import re
+
+from tqdm import tqdm
+
+
+class TranscriptParser:
+    r"""
+    Provides a bag of conversational lines.
+
+    Instantiate this with the `from_folder` class method, pointing it
+    at a folder from the CallHome dataset (for example the 'deu' folder
+    for transcriptions in German). The parser strips the .cha format
+    specifics and keeps only the spoken text, so that the result can be
+    used for downstream translation.
+
+    Utterances are not grouped by participant or by conversation: every
+    utterance that still contains text after cleaning is appended to
+    `lines` in the order it is read.
+    """
+
+    # A speaker tier: "*<code>:" followed by whitespace and the utterance.
+    # The speaker code may be any number of word characters.
+    _UTTERANCE_RE = re.compile(r"\*\w+:\s+(.*)")
+
+    # Artefacts removed from the utterance text, applied in this order:
+    #   1. timestamps delimited by \x15 (often rendered as a bullet,
+    #      e.g. •50770_51060•),
+    #   2. "&"-prefixed tokens (this also covers "&=..." tokens),
+    #   3. the "+/" marker,
+    #   4. any remaining "+" characters.
+    _ARTEFACT_RES = (
+        re.compile(r"\x15\d+_\d+\x15"),
+        re.compile(r"&+\S+"),
+        re.compile(r"\+/"),
+        re.compile(r"\+"),
+    )
+
+    def __init__(self):
+        # Cleaned utterances, one entry per speaker tier that had text.
+        self.lines: list[str] = []
+
+    @classmethod
+    def from_folder(cls, folder_path: str) -> "TranscriptParser":
+        """
+        Build a parser from every `*.cha` file directly inside a folder.
+
+        Args:
+            folder_path: Folder containing the .cha transcription files.
+
+        Returns:
+            A parser whose `lines` holds the cleaned utterances of all
+            files, visited in sorted path order.
+        """
+        parser = cls()
+        # glob returns paths in arbitrary order; sort them so that the
+        # order of `lines` is reproducible across runs and machines.
+        file_paths = sorted(glob.glob(os.path.join(folder_path, "*.cha")))
+        for file_path in tqdm(file_paths, desc=f"Parsing {folder_path}"):
+            # Decode explicitly as UTF-8 instead of relying on the
+            # platform default encoding.
+            with open(file_path, encoding="utf-8") as file:
+                data = file.read()
+            parser.parse_transcription(data)
+
+        return parser
+
+    def parse_line(self, line: str) -> None:
+        """
+        Clean one speaker tier and append its text to `lines`.
+
+        Lines that are not speaker tiers, and tiers that contain no
+        letters or digits once the artefacts are removed (for example
+        only a terminator such as "."), are ignored.
+
+        Args:
+            line: A single logical line, e.g. "*A:\\thallo ."; any
+                continuation lines must already be joined onto it.
+        """
+        match = self._UTTERANCE_RE.match(line)
+        if match is None:
+            return
+
+        text = match.group(1)
+        for pattern in self._ARTEFACT_RES:
+            text = pattern.sub("", text)
+
+        # Removing artefacts from the middle of an utterance leaves runs
+        # of whitespace behind; collapse them and trim both ends.
+        clean_text = " ".join(text.split())
+
+        if not any(char.isalnum() for char in clean_text):
+            # Nothing but punctuation (or nothing at all) is remaining
+            return
+
+        self.lines.append(clean_text)
+
+    def parse_transcription(self, data: str) -> None:
+        """
+        Parse the full text of one .cha file.
+
+        Speaker tiers start with "*". A following line that starts with
+        a tab is treated as the continuation of that tier and is joined
+        onto it with a single space before cleaning. Every other line
+        (headers such as "@Begin", "@UTF8" and "@End", other tiers and
+        blank lines) is skipped.
+
+        Args:
+            data: The contents of a .cha file.
+        """
+        pending = None  # speaker tier currently being assembled, if any
+        for line in data.split("\n"):
+            if line.startswith("*"):
+                # A new speaker tier begins; emit the previous one first.
+                if pending is not None:
+                    self.parse_line(pending)
+                pending = line
+            elif line.startswith("\t") and pending is not None:
+                # Continuation of the speaker tier being assembled.
+                pending = f"{pending} {line.strip()}"
+            else:
+                # Any other line carries no utterance text and closes
+                # the tier that was being assembled.
+                if pending is not None:
+                    self.parse_line(pending)
+                    pending = None
+
+        if pending is not None:
+            self.parse_line(pending)
+
+
+if __name__ == "__main__":
+    # Small command-line entry point: parse one CallHome language folder
+    # (for example the 'deu' folder) and print how many utterances it
+    # yielded. The folder is taken from the command line so that the
+    # script does not depend on a path on one particular machine.
+    import sys
+
+    if len(sys.argv) != 2:
+        sys.exit(f"usage: {sys.argv[0]} <folder with .cha files>")
+
+    folder_path = sys.argv[1]
+    tp = TranscriptParser.from_folder(folder_path)
+    print(len(tp.lines))