-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsplit_by_authors.py
73 lines (48 loc) · 1.1 KB
/
split_by_authors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
'like split, but take writers from the original file into account'
'input file with headers, output files without headers'
import csv
import sys
import random
# Command-line arguments (positional):
#   1. orig_train_file - CSV whose first column identifies the writer of each row
#   2. input_file      - CSV to split; read row-by-row in lockstep with orig_train_file
#   3. output_file1    - receives a writer's rows when the random draw is <= P
#   4. output_file2    - receives a writer's rows when the random draw is > P
#   5. P (optional)    - threshold compared against random.random(); defaults to 0.9
#   6. seed (optional) - passed to random.seed() to make the split reproducible
orig_train_file = sys.argv[1]
input_file = sys.argv[2]
output_file1 = sys.argv[3]
output_file2 = sys.argv[4]

try:
    P = float(sys.argv[5])
except IndexError:
    P = 0.9

try:
    seed = sys.argv[6]
except IndexError:
    seed = None

# Call form of print emits the same text on Python 2 and also parses on Python 3.
print("P = %s" % (P))

# Compare against None rather than truthiness so that any seed the user
# actually supplied (even an empty string) is honoured.
if seed is not None:
    random.seed(seed)
def _open_csv(path, writing=False):
    """Open *path* in the mode the csv module expects on this interpreter.

    On Python 2 the original modes are kept (default read mode, 'wb' for
    output). On Python 3 the csv module needs text-mode files opened with
    newline='' so that it controls line endings itself.
    """
    if sys.version_info[0] < 3:
        return open(path, 'wb') if writing else open(path)
    return open(path, 'w' if writing else 'r', newline='')


# Context managers make sure both outputs are flushed and every handle is
# closed, including when the loop below stops early.
with _open_csv(orig_train_file) as i_orig, _open_csv(input_file) as i:
    with _open_csv(output_file1, writing=True) as o1, \
            _open_csv(output_file2, writing=True) as o2:
        orig_reader = csv.reader(i_orig)
        reader = csv.reader(i)
        writer1 = csv.writer(o1)
        writer2 = csv.writer(o2)

        # Both inputs start with a header row; skip them so the data rows
        # stay aligned. Headers are deliberately not written to the outputs.
        next(reader)
        next(orig_reader)

        counter = 0
        current_author = None
        for line in reader:
            # The two inputs are consumed in lockstep: row N of the original
            # file supplies the writer for row N of the input file.
            orig_line = next(orig_reader, None)
            if orig_line is None:
                sys.exit("%s has fewer data rows than %s"
                         % (orig_train_file, input_file))
            author = orig_line[0]

            # Draw a new destination only when the writer changes, so a run
            # of consecutive rows by one writer lands in a single output.
            # A writer that reappears later in a separate run is drawn again.
            if author != current_author:
                current_author = author
                target = writer2 if random.random() > P else writer1

            target.writerow(line)

            counter += 1
            if counter % 100000 == 0:
                # Progress indicator for large inputs.
                print(counter)