Skip to content

Commit

Permalink
0.0.1 of the csvparser
Browse files Browse the repository at this point in the history
  • Loading branch information
Kentoseth committed Jul 2, 2019
1 parent baa7fce commit 38ed1a8
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 1 deletion.
87 changes: 86 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,87 @@
# tatoeba-sentence-pairs
Generates sentence pairs from the Tatoeba corpus and exports as JSON

This project is a high-performance and self-contained CLI clone of the [sentence-pairs](https://github.com/kmicklas/sentence-pairs) project. The Nim programming language was used for 2 reasons:

- To increase the performance of execution on the large CSV/raw-data files containing all the Tatoeba languages and links
- To provide a self-contained binary CLI application that can be executed without dependencies or runtimes

This application generates sentence pairs from the Tatoeba corpus and exports as JSON.

## Contents
* [Requirements](#requirements)
* [Compile](#compile)
* [Installation](#installation)
* [Releases](#releases)
* [Usage](#usage)
* [Contribute](#contribute)
* [License](#license)
* [Queries](#queries)

## Requirements

* csvparser binary (available under "Releases")
* The sentence-output file available [here](https://tatoeba.org/eng/downloads)
* The links file matching sentences with their translations available [here](https://tatoeba.org/eng/downloads)

## Compile

* Install the dependencies declared in `csvparser.nimble` using Nimble (e.g. `nimble install cligen`)
* Dependencies: cligen
* `cd src`
* `nim c -d:release csvparser.nim`
* (this program was compiled on the `devel` branch of Nim, which is/was `0.20.99` at time of compilation)

## Installation

* Create a folder to store the binary file in, example: `mkdir ~/tatoeba-bin`
* Place the binary file in `~/tatoeba-bin`
* Export the PATH in your `~/.profile` (or equivalent shell startup file): `export PATH=$HOME/tatoeba-bin:$PATH`
* Refresh by starting a new terminal session or using: `source ~/.profile`
* The CLI should now be globally executable

(if you don't want to make the program globally executable, just make sure the binary file is in the same directory as the CSV/raw-data files)

## Releases

Available at: https://github.com/Kentoseth/tatoeba-sentence-pairs/releases

Release will contain:

* `csvparser` binary
* This README file

**NOTE:** This binary only supports Linux x64 distros. It is untested on Mac, and the program must be compiled from source on Mac/Windows to work on those operating systems.

## Usage

csvparser currently has 1 supported command: main

* `./csvparser main -t=cmn -o=eng -s=sentences_file.csv -l=links_file.csv` - fetches all `cmn` sentences, finds the corresponding `eng` translations, drops every sentence that has no translation, and exports the result as JSON to `output-cmn-eng.json` (named `output-<target>-<translation>.json`) in the current directory

Use `./csvparser --help` or `./csvparser main --help` for more information.

## Contribute

You can create a PR or discuss it first by opening an Issue

PR Rules:

* Make sure the program compiles
* Test it locally to make sure the program runs
* If the code is not readable, the PR may be rejected

This project follows the UNIX philosophy of doing one thing only and (hopefully) doing it well.

## License

This package is licensed under the open-source "GNU GPL, Version 3".

The full license text is available in the file LICENSE

## Queries

Open an Issue to discuss

-----

If you find this project interesting or useful, please star it and share it with colleagues and friends.
14 changes: 14 additions & 0 deletions csvparser.nimble
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Package
# Metadata that describes the csvparser package to Nimble.

version = "0.0.1"
author = "Kentoseth"
description = "A CLI that generates sentence pairs from the Tatoeba corpus and exports as JSON"
license = "GPL-3.0"
# The Nim sources live in the src/ directory.
srcDir = "src"
# Name of the executable built from this package.
bin = @["csvparser"]


# Dependencies

# The program was compiled on Nim's devel branch, which reported 0.20.99 at
# the time, hence this lower bound.
requires "nim >= 0.20.99"
# cligen is imported by src/csvparser.nim to generate the command-line
# interface.
requires "cligen >= 0.9.31"
94 changes: 94 additions & 0 deletions src/csvparser.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import parsecsv
import os
import strutils
import tables
import json

# One target-language sentence together with every translation linked to it.
# The terse field names are kept because they become the keys ("f0", "f1") of
# each entry in the exported JSON.
type
  main_table_data = object
    f0*: string      # text of the target-language sentence
    f1*: seq[string] # texts of the translations linked to that sentence

# Builds the sentence pairs for one language pair and writes them as JSON to
# "output-<target>-<translation>.json" in the current working directory.
# An existing file with that name is overwritten without warning.
#
#   target          language code of the sentences to export, e.g. "cmn"
#   translation     language code of the wanted translations, e.g. "eng"
#   sentences_file  tab-separated file; rows are read as: id, language, text
#   links_file      tab-separated file; rows are read as: sentence id,
#                   translation id
#
# Target sentences that end up with no translation are left out of the output.
# Rows with too few columns are skipped instead of aborting the whole run.
# A file that cannot be opened or written surfaces as an exception raised by
# parsecsv / writeFile.
#
# All working state (parsers and lookup tables) is local to the proc, so it
# does not leak between calls.
proc main(target: string = "", translation: string = "",
          sentences_file: string = "", links_file: string = "") =
  ## >>example: ./csvparser main -t=cmn -o=eng -s=sentences_file.csv -l=links_file.csv

  # NOTE: the "##" line above is kept verbatim; cligen is expected to show it
  # as the help text of the `main` subcommand.

  var sentences: CsvParser # parser over the sentences file
  var links: CsvParser     # parser over the links file

  # Both inputs are opened before any work is done so that a bad path fails
  # early. `defer` closes each parser on every exit path, exceptions included.
  # quote = '\0' disables quote handling, so a '"' inside a sentence is read
  # as ordinary text.
  sentences.open(sentences_file, separator = '\t', quote = '\0')
  defer: sentences.close()
  links.open(links_file, separator = '\t')
  defer: links.close()

  # sentence id -> target sentence plus the translations collected for it
  var targets = initTable[string, main_table_data]()
  # sentence id -> text, for every sentence written in the translation language
  var translations = initTable[string, string]()

  # Pass 1: sort the sentences of the two requested languages into the tables.
  while sentences.readRow():
    if sentences.row.len < 3:
      continue # malformed row: id, language or text is missing
    if sentences.row[1] == target:
      targets[sentences.row[0]] = main_table_data(f0: sentences.row[2])
    elif sentences.row[1] == translation:
      translations[sentences.row[0]] = sentences.row[2]

  # Pass 2: attach each linked translation to its target sentence. Links whose
  # ends are not both in the requested languages are ignored.
  while links.readRow():
    if links.row.len < 2:
      continue # malformed row: one side of the link is missing
    if links.row[0] in targets and links.row[1] in translations:
      targets[links.row[0]].f1.add(translations[links.row[1]])

  # Collect the ids first and delete afterwards: a table must not be modified
  # while it is being iterated. The ids stay strings, so non-numeric ids or
  # ids with leading zeros are handled correctly.
  var untranslated: seq[string] = @[]
  for sentenceId, entry in targets:
    if entry.f1.len == 0:
      untranslated.add(sentenceId)
  for sentenceId in untranslated:
    targets.del(sentenceId)

  # No check for an existing file of the same name: it is overwritten.
  let outputName = "output-" & target & "-" & translation & ".json"
  writeFile(outputName, $(%targets))

# Placeholder subcommand kept only to check that cligen dispatches correctly.
# It ignores the arguments it receives and prints a fixed marker.
proc upath(run: seq[string]) =
  ## for testing purposes. Can ignore
  echo("Test")

# Entry point: this branch is only compiled when the file is built as the
# main program.
when isMainModule:
  import cligen

  # Register `main` (with one-letter flags for its four options) and the
  # `upath` test command as subcommands of the CLI.
  dispatchMulti(
    [main, short = {"target": 't', "translation": 'o',
                    "sentences_file": 's', "links_file": 'l'}],
    [upath]
  )

0 comments on commit 38ed1a8

Please sign in to comment.