0.0.1 of the csvparser

Kentoseth · Jul 2, 2019 · 38ed1a8 · 38ed1a8
1 parent baa7fce
commit 38ed1a8
Show file tree

Hide file tree

Showing 3 changed files with 194 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -1,2 +1,87 @@
 # tatoeba-sentence-pairs
-Generates sentence pairs from the Tatoeba corpus and exports as JSON
+
+This project is a high-performance and self-contained CLI clone of the [sentence-pairs](https://github.com/kmicklas/sentence-pairs) project. The Nim programming language was used for 2 reasons:
+
+ - To increase the performance of execution on the large CSV/raw-data files containing all the Tatoeba languages and links
+ - To provide a self-contained binary CLI application that can be executed without dependencies or runtimes
+
+This application generates sentence pairs from the Tatoeba corpus and exports as JSON.
+
+## Contents
+ * [Requirements](#requirements)
+ * [Compiling](#compile)
+ * [Installation](#installation)
+ * [Releases](#releases)
+ * [Usage](#usage)
+ * [Contribute](#contribute)
+ * [License](#license)
+ * [Queries](#queries)
+
+## Requirements
+
+ * csvparser binary (available under "Releases")
+ * The sentence-output file available [here](https://tatoeba.org/eng/downloads)
+ * The links file matching sentences with their translations available [here](https://tatoeba.org/eng/downloads)
+
+## Compile
+
+ * Run the `csvparser.nimble` file to install the dependencies
+ * Dependencies: cligen
+ * `cd src`
+ * `nim c -d:release csvparser.nim`
+ * (this program was compiled on the `devel` branch of Nim, which is/was `0.20.99` at time of compilation)
+
+## Installation
+
+ * Create a folder to store the binary file in, example: `mkdir ~/tatoeba-bin`
+ * Place the binary file in `~/tatoeba-bin`
+ * Export the PATH in your .profile or other file: `export PATH=/home/user/tatoeba-bin:$PATH`
+ * Refresh by starting a new terminal session or using: `source .profile`
+ * The CLI should now be globally executable
+
+(if you don't want to make the program globally executable, just make sure the binary file is in the same directory as the CSV/raw-data files)
+
+## Releases
+
+Available at: https://github.com/Kentoseth/tatoeba-sentence-pairs/releases
+
+Release will contain:
+
+ * `csvparser` binary
+ * This README file
+
+**NOTE:** The released binary only supports x64 Linux distributions. It is untested on macOS; to use it on macOS or Windows, compile it from source on that operating system.
+
+## Usage
+
+csvparser currently has 1 supported command: main
+
+ * `./csvparser main -t=cmn -o=eng -s=sentences_file.csv -l=links_file.csv` - fetches all `cmn` sentences, finds the corresponding `eng` translations, removes any sentences that have no translation, and exports the result as JSON to `output-cmn-eng.json`
+
+Use `./csvparser --help` or `./csvparser main --help` for more information.
+
+## Contribute
+
+You can create a PR or discuss it first by opening an Issue
+
+PR Rules:
+
+ * Make sure the program compiles
+ * Test it locally to make sure the program runs
+ * If the code is not readable, the PR may be rejected
+
+This project follows the UNIX philosophy of doing one thing only and (hopefully) doing it well.
+
+## License
+
+This package is licensed under the open-source "GNU GPL, Version 3".
+
+The full license text is available in the file LICENSE
+
+## Queries
+
+Open an Issue to discuss
+
+-----
+
+If you find this project interesting or useful, please star it and share it with colleagues and friends.
diff --git a/csvparser.nimble b/csvparser.nimble
@@ -0,0 +1,14 @@
+# Package
+
+version       = "0.0.1"
+author        = "Kentoseth"
+description   = "A CLI that generates sentence pairs from the Tatoeba corpus and exports as JSON"
+license       = "GPL-3.0"
+srcDir        = "src"
+bin           = @["csvparser"]
+
+
+# Dependencies
+
+requires "nim >= 0.20.99"
+requires "cligen >= 0.9.31"
diff --git a/src/csvparser.nim b/src/csvparser.nim
@@ -0,0 +1,94 @@
+import parsecsv
+import os
+import strutils
+import tables
+import json
+
+# One output entry: a sentence in the target language (f0) together with
+# every translation that the links file associates with it (f1).
+type
+  main_table_data = object
+    f0* : string
+    f1* : seq[string]
+
+var
+  # parser over the sentences file (the main language file)
+  p: CsvParser
+
+  # parser over the links file that matches sentences to their translations
+  y: CsvParser
+
+  # first column of a target-language row -> its sentence (f0) and the
+  # translations collected for it (f1)
+  main_table = newTable[string, main_table_data]()
+
+  # first column of a translation-language row -> the translation text
+  second_table = newTable[string, string]()
+
+  # numeric keys of target sentences that ended up without any translation
+  counter: seq[int] = @[]
+
+# the main function that executes the filtering; requires 4 inputs
+proc main(target: string = "", translation: string = "", sentences_file: string = "", links_file: string ="") =
+
+  ## >>example: ./csvparser main -t=cmn -o=eng -s=sentences_file.csv -l=links_file.csv
+
+  # Pairs every `target`-language sentence with its `translation`-language
+  # translations and writes the result as JSON to
+  # "output-<target>-<translation>.json" in the current directory.
+  #
+  # target:         language code compared against column 2 of the sentences file
+  # translation:    language code of the sentences collected as translations
+  # sentences_file: tab-separated file; columns 1-3 are read (key, language, text)
+  # links_file:     tab-separated file; columns 1-2 are read (sentence key,
+  #                 translation key)
+  #
+  # Errors raised while opening/parsing the inputs or writing the output are
+  # not caught here. (The `##` line above is kept verbatim because cligen
+  # shows it as the command's help text.)
+
+  # quote = '\0' turns off quote parsing so quotation marks inside sentences
+  # are read as ordinary characters
+  p.open(sentences_file, separator='\t', quote = '\0')
+  # close the parser on every exit path, including when a later step raises
+  defer: p.close()
+
+  y.open(links_file, separator='\t')
+  defer: y.close()
+
+  # first pass: split the sentences file into target sentences and candidate
+  # translations, both keyed by the first column
+  while p.readRow():
+
+    # skip blank or truncated lines instead of failing on a missing column
+    if p.row.len < 3:
+      continue
+
+    if p.row[1] == target:
+
+      main_table[p.row[0]] = main_table_data(f0: p.row[2])
+
+    elif p.row[1] == translation:
+      second_table[p.row[0]] =  p.row[2]
+
+
+  # second pass: walk the links file and attach each known translation to
+  # its target sentence
+  while y.readRow():
+
+    if y.row.len < 2:
+      continue
+
+    # only links whose first value is a target sentence and whose second
+    # value is a collected translation are relevant
+    if y.row[0] in main_table and y.row[1] in second_table:
+
+      main_table[y.row[0]].f1.add(second_table[y.row[1]]) # adding multiple translations associated with each target sentence
+
+
+  # Collect the keys of target sentences without translations. The keys are
+  # kept as strings: converting them to int and back would raise on a
+  # non-numeric key and would alter keys with leading zeros, so the later
+  # deletion could miss them. Deletion happens in a separate loop because a
+  # table must not be modified while it is being iterated.
+  var untranslated: seq[string] = @[]
+
+  for key, entry in main_table:
+
+    if entry.f1.len == 0:
+
+      untranslated.add(key)
+
+  # removes empty elements from main_table
+  for key in untranslated:
+    main_table.del(key)
+
+
+  let myjs = $(%main_table) #converts main_table from a table to json
+
+  let file_name: string = "output-" & target & "-" & translation & ".json" #generates filename for output
+  # no checking for existing filenames, be careful of file overwrites!
+
+  writeFile(file_name, myjs)
+
+# Unneeded test function to make sure cligen is working correctly
+proc upath(run: seq[string]) =
+  ## for testing purposes. Can ignore
+
+  # `run` is accepted but never read; the proc only prints a fixed marker so
+  # the multi-command dispatch has a second subcommand to exercise.
+  echo("Test")
+
+# Command-line entry point: expose `main` and `upath` as cligen subcommands.
+when isMainModule:
+  import cligen
+  dispatchMulti(
+    # one-letter flags for the parameters of `main`
+    [ main,
+      short = { "target": 't',
+                "translation": 'o',
+                "sentences_file": 's',
+                "links_file": 'l' } ],
+    [ upath ] )