-
Notifications
You must be signed in to change notification settings - Fork 3
/
convert.py
30 lines (25 loc) · 834 Bytes
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import re
import pandas as pd
import numpy as np
# from exporter import text_to_textline
"""
Generate a doccano-import-compatible text file that contains
a random sample of the report statements, one statement per line
"""
# Script settings.
# NOTE: `input` shadows the builtin of the same name; the name is kept so that
# anything referring to these module-level settings keeps working.
input = "data/ProcessedData.csv"     # CSV file that is read
output = "data/processed_data.text"  # text file that is written, one value per line
column = "statement"                 # CSV column whose values are exported
sample_size = 9000                   # upper bound on the number of rows sampled
random_state = 2                     # fixed seed, so repeated runs pick the same rows

# Earlier variant, kept for reference. It depends on `text_to_textline`, whose
# import from `exporter` is commented out at the top of the file.
# def convert(reports):
#     textline = []
#     for index, row in reports.iterrows():
#         textline.extend(text_to_textline(row['text']))
#     with open(output, 'w') as outfile:
#         for e in textline:
#             outfile.write(e+'\n')

# Read the CSV, draw a reproducible random sample from `column` and write the
# sampled values to `output`, one per line. Problems with the input file or the
# column are reported with a message instead of a traceback.
try:
    reports = pd.read_csv(input)
    statements = reports[column]
    print(f"Converting '{input}' into '{output}'...")
    # Series.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so never request more than are available.
    sampled = statements.sample(
        n=min(sample_size, len(statements)),
        random_state=random_state,
    )
    # Pass the encoding explicitly: numpy documents 'latin1' as the savetxt
    # default, which is not safe for text containing non-Latin-1 characters.
    np.savetxt(output, sampled.values, fmt='%s', encoding='utf-8')
    print('Conversion completed.')
except FileNotFoundError:
    print(f"File '{input}' could not be found.")
except KeyError:
    print(f"Column '{column}' could not be found in '{input}'.")