-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Kentoseth
committed
Jul 2, 2019
1 parent
baa7fce
commit 38ed1a8
Showing
3 changed files
with
194 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,87 @@ | ||
# tatoeba-sentence-pairs | ||
Generates sentence pairs from the Tatoeba corpus and exports as JSON | ||
|
||
This project is a high-performance and self-contained CLI clone of the [sentence-pairs](https://github.com/kmicklas/sentence-pairs) project. The Nim programming language was used for 2 reasons: | ||
|
||
- To increase the performance of execution on the large CSV/raw-data files containing all the Tatoeba languages and links | ||
- To provide a self-contained binary CLI application that can be executed without dependencies or runtimes | ||
|
||
This application generates sentence pairs from the Tatoeba corpus and exports as JSON. | ||
|
||
## Contents | ||
* [Requirements](#requirements) | ||
* [Compiling](#compile) | ||
* [Installation](#installation) | ||
* [Releases](#releases) | ||
* [Usage](#usage) | ||
* [Contribute](#contribute) | ||
* [License](#license) | ||
* [Queries](#queries) | ||
|
||
## Requirements | ||
|
||
* csvparser binary (available under "Releases") | ||
* The sentence-output file available [here](https://tatoeba.org/eng/downloads) | ||
* The links file matching sentences with their translations available [here](https://tatoeba.org/eng/downloads) | ||
|
||
## Compile | ||
|
||
* Run the `csvparser.nimble` file to install the dependencies | ||
* Dependencies: cligen | ||
* `cd src` | ||
* `nim c -d:release csvparser.nim` | ||
* (this program was compiled on the `devel` branch of Nim, which is/was `0.20.99` at time of compilation) | ||
|
||
## Installation | ||
|
||
* Create a folder to store the binary file in, example: `mkdir ~/tatoeba-bin` | ||
* Place the binary file in `~/tatoeba-bin` | ||
* Export the PATH in your .profile or other file: `export PATH=/home/user/tatoeba-bin:$PATH` | ||
* Refresh by starting a new terminal session or using: `source .profile` | ||
* The CLI should now be globally executable | ||
|
||
(if you don't want to make the program globally executable, just make sure the binary file is in the same directory as the CSV/raw-data files) | ||
|
||
## Releases | ||
|
||
Available at: https://github.com/Kentoseth/tatoeba-sentence-pairs/releases | ||
|
||
Release will contain: | ||
|
||
* `csvparser` binary | ||
* This README file | ||
|
||
**NOTE** This binary only supports Linux x64 distros. It is untested on Mac; to use it on Mac or Windows you will need to compile it yourself on that operating system. | ||
|
||
## Usage | ||
|
||
csvparser currently has 1 supported command: main | ||
|
||
* `./csvparser main -t=cmn -o=eng -s=sentences_file.csv -l=links_file.csv` - fetches all `cmn` sentences, finds their corresponding `eng` translations, drops any sentence that has no translation, and exports the result as JSON to `output-cmn-eng.json` | ||
|
||
Use `./csvparser --help` or `./csvparser main --help` for more information. | ||
|
||
## Contribute | ||
|
||
You can create a PR or discuss it first by opening an Issue | ||
|
||
PR Rules: | ||
|
||
* Make sure the program compiles | ||
* Test it locally to make sure the program runs | ||
* If the code is not readable, the PR may be rejected | ||
|
||
This project follows the UNIX philosophy of doing one thing only and (hopefully) doing it well. | ||
|
||
## License | ||
|
||
This package is licensed under the open-source "GNU GPL, Version 3". | ||
|
||
The full license text is available in the file LICENSE | ||
|
||
## Queries | ||
|
||
Open an Issue to discuss | ||
|
||
----- | ||
|
||
If you find this project interesting or useful, please star it and share it with colleagues and friends. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Package metadata

version = "0.0.1"
author = "Kentoseth"
description = "A CLI that generates sentence pairs from the Tatoeba corpus and exports as JSON"
license = "GPL-3.0"
srcDir = "src"
bin = @["csvparser"]

# Dependencies

requires "nim >= 0.20.99", "cligen >= 0.9.31"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import parsecsv | ||
import os | ||
import strutils | ||
import tables | ||
import json | ||
|
||
type
  # One entry of main_table: a target-language sentence together with the
  # translations that have been linked to it.
  main_table_data = object
    f0*: string       # text of the target sentence
    f1*: seq[string]  # texts of every translation linked to that sentence

var
  # Parser over the sentences file (opened inside main).
  p: CsvParser

  # Parser over the links file that pairs sentences with translations
  # (opened inside main).
  y: CsvParser

  # Target sentences, keyed by the first column of the sentences file.
  main_table = newTable[string, main_table_data]()

  # Translation sentences, keyed the same way, holding only the text.
  second_table = newTable[string, string]()

  # Scratch list for targets that ended up with no translations.
  counter: seq[int] = @[]
|
||
# Entry point of the CLI: builds the sentence pairs and writes them as JSON.
#
# target         - language code of the sentences to collect (column 1 of the
#                  sentences file is compared against it)
# translation    - language code of the translations to attach
# sentences_file - tab-separated file read as: key, language code, text
# links_file     - tab-separated file read as: target key, translation key
#
# The result is written to "output-<target>-<translation>.json" in the
# current directory. An existing file with that name is overwritten.
# Opening either input file may raise; both parsers are closed on every exit
# path once they have been opened.
proc main(target: string = "", translation: string = "",
          sentences_file: string = "", links_file: string = "") =
  ## >>example: ./csvparser main -t=cmn -o=eng -s=sentences_file.csv -l=links_file.csv

  # Quoting is disabled for the sentences file so that quote characters
  # inside a sentence are treated as ordinary text.
  p.open(sentences_file, separator = '\t', quote = '\0')
  defer: p.close()

  y.open(links_file, separator = '\t')
  defer: y.close()

  # Pass 1: split the sentences file into targets and candidate translations.
  # parsecsv only exposes a streaming readRow, hence the explicit loop.
  while p.readRow():
    # A row needs key, language and text; skip anything shorter instead of
    # failing with an index error on a malformed line.
    if p.row.len < 3:
      continue

    if p.row[1] == target:
      main_table[p.row[0]] = main_table_data(f0: p.row[2])
    elif p.row[1] == translation:
      second_table[p.row[0]] = p.row[2]

  # Pass 2: walk the links file (streamed row by row for the same reason) and
  # attach every known translation to its target sentence. One target may
  # collect several translations.
  while y.readRow():
    if y.row.len < 2:
      continue

    let target_key = y.row[0]
    let translation_key = y.row[1]
    if target_key in main_table and translation_key in second_table:
      main_table[target_key].f1.add(second_table[translation_key])

  # Collect the targets that received no translation. They are removed in a
  # second step because a table must not be modified while it is iterated.
  # Keys stay strings so the exact key that was inserted is the one deleted.
  var untranslated_keys: seq[string] = @[]
  for key, entry in main_table:
    if entry.f1.len == 0:
      untranslated_keys.add(key)

  for key in untranslated_keys:
    main_table.del(key)

  # Serialise the remaining pairs and write them out.
  let myjs = $(%main_table)
  let file_name = "output-" & target & "-" & translation & ".json"
  # no checking for existing filenames, be careful of file overwrites!
  writeFile(file_name, myjs)
|
||
# Placeholder second sub-command; it exists only to confirm that cligen's
# multi-command dispatch is wired up. The `run` argument is not used.
proc upath(run: seq[string]) =
  ## for testing purposes. Can ignore
  echo("Test")
|
||
when isMainModule:
  import cligen

  # Expose `main` and `upath` as sub-commands. The `short` table assigns the
  # one-letter flag for each parameter of `main`.
  dispatchMulti(
    [
      main,
      short = {
        "target": 't',
        "translation": 'o',
        "sentences_file": 's',
        "links_file": 'l'
      }
    ],
    [upath]
  )