Skip to content

Commit c6f1d96

Browse files
committed
e01_working_with_data.py
1 parent 860467d commit c6f1d96

File tree

7 files changed

+89
-105
lines changed

7 files changed

+89
-105
lines changed

src/data-scratch-library/dsl/c10_working_with_data/e01_working_with_data.py

Lines changed: 3 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import csv
2+
import json
23
import logging
34
import os
45
from collections import defaultdict
56
from datetime import datetime
67
from logging.config import dictConfig
7-
from os.path import abspath
8+
from os.path import abspath, dirname
89

910
from dsl.c05_statistics.e0503_correlation import correlation
1011
from dsl.c06_probability.e0603_normal import random_normal
@@ -60,109 +61,6 @@ def group_by_symbol(_data, column_name="symbol"):
6061
return by_symbol
6162

6263

63-
x_matrix_list = [
64-
[20.9666776351559, -13.1138080189357],
65-
[22.7719907680008, -19.8890894944696],
66-
[25.6687103160153, -11.9956004517219],
67-
[18.0019794950564, -18.1989191165133],
68-
[21.3967402102156, -10.8893126308196],
69-
[0.443696899177716, -19.7221132386308],
70-
[29.9198322142127, -14.0958668502427],
71-
[19.0805843080126, -13.7888747608312],
72-
[16.4685063521314, -11.2612927034291],
73-
[21.4597664701884, -12.4740034586705],
74-
[3.87655283720532, -17.575162461771],
75-
[34.5713920556787, -10.705185165378],
76-
[13.3732115747722, -16.7270274494424],
77-
[20.7281704141919, -8.81165591556553],
78-
[24.839851437942, -12.1240962157419],
79-
[20.3019544741252, -12.8725060780898],
80-
[21.9021426929599, -17.3225432396452],
81-
[23.2285885715486, -12.2676568419045],
82-
[28.5749111681851, -13.2616470619453],
83-
[29.2957424128701, -14.6299928678996],
84-
[15.2495527798625, -18.4649714274207],
85-
[26.5567257400476, -9.19794350561966],
86-
[30.1934232346361, -12.6272709845971],
87-
[36.8267446011057, -7.25409849336718],
88-
[32.157416823084, -10.4729534347553],
89-
[5.85964365291694, -22.6573731626132],
90-
[25.7426190674693, -14.8055803854566],
91-
[16.237602636139, -16.5920595763719],
92-
[14.7408608850568, -20.0537715298403],
93-
[6.85907008242544, -18.3965586884781],
94-
[26.5918329233128, -8.92664811750842],
95-
[-11.2216019958228, -27.0519081982856],
96-
[8.93593745011035, -20.8261235122575],
97-
[24.4481258671796, -18.0324012215159],
98-
[2.82048515404903, -22.4208457598703],
99-
[30.8803004755948, -11.455358009593],
100-
[15.4586738236098, -11.1242825084309],
101-
[28.5332537090494, -14.7898744423126],
102-
[40.4830293441052, -2.41946428697183],
103-
[15.7563759125684, -13.5771266003795],
104-
[19.3635588851727, -20.6224770470434],
105-
[13.4212840786467, -19.0238227375766],
106-
[7.77570680426702, -16.6385739839089],
107-
[21.4865983854408, -15.290799330002],
108-
[12.6392705930724, -23.6433305964301],
109-
[12.4746151388128, -17.9720169566614],
110-
[23.4572410437998, -14.602080545086],
111-
[13.6878189833565, -18.9687408182414],
112-
[15.4077465943441, -14.5352487124086],
113-
[20.3356581548895, -10.0883159703702],
114-
[20.7093833689359, -12.6939091236766],
115-
[11.1032293684441, -14.1383848928755],
116-
[17.5048321498308, -9.2338593361801],
117-
[16.3303688220188, -15.1054735529158],
118-
[26.6929062710726, -13.306030567991],
119-
[34.4985678099711, -9.86199941278607],
120-
[39.1374291499406, -10.5621430853401],
121-
[21.9088956482146, -9.95198845621849],
122-
[22.2367457578087, -17.2200123442707],
123-
[10.0032784145577, -19.3557700653426],
124-
[14.045833906665, -15.871937521131],
125-
[15.5640911917607, -18.3396956121887],
126-
[24.4771926581586, -14.8715313479137],
127-
[26.533415556629, -14.693883922494],
128-
[12.8722580202544, -21.2750596021509],
129-
[24.4768291376862, -15.9592080959207],
130-
[18.2230748567433, -14.6541444069985],
131-
[4.1902148367447, -20.6144032528762],
132-
[12.4332594022086, -16.6079789231489],
133-
[20.5483758651873, -18.8512560786321],
134-
[17.8180560451358, -12.5451990696752],
135-
[11.0071081078049, -20.3938092335862],
136-
[8.30560561422449, -22.9503944138682],
137-
[33.9857852657284, -4.8371294974382],
138-
[17.4376502239652, -14.5095976075022],
139-
[29.0379635148943, -14.8461553663227],
140-
[29.1344666599319, -7.70862921632672],
141-
[32.9730697624544, -15.5839178785654],
142-
[13.4211493998212, -20.150199857584],
143-
[11.380538260355, -12.8619410359766],
144-
[28.672631499186, -8.51866271785711],
145-
[16.4296061111902, -23.3326051279759],
146-
[25.7168371582585, -13.8899296143829],
147-
[13.3185154732595, -17.8959160024249],
148-
[3.60832478605376, -25.4023343597712],
149-
[39.5445949652652, -11.466377647931],
150-
[25.1693484426101, -12.2752652925707],
151-
[25.2884257196471, -7.06710309184533],
152-
[6.77665715793125, -22.3947299635571],
153-
[20.1844223778907, -16.0427471125407],
154-
[25.5506805272535, -9.33856532270204],
155-
[25.1495682602477, -7.17350567090738],
156-
[15.6978431006492, -17.5979197162642],
157-
[37.42780451491, -10.843637288504],
158-
[22.974620174842, -10.6171162611686],
159-
[34.6327117468934, -9.26182440487384],
160-
[34.7042513789061, -6.9630753351114],
161-
[15.6563953929008, -17.2196961218915],
162-
[25.2049825789225, -14.1592086208169],
163-
]
164-
165-
16664
def main1(path_to_csv_data):
16765
logging.info("safe parsing")
16866
_data = read_comma_delimited_stock_prices(path_to_csv_data)
@@ -222,6 +120,7 @@ def main3():
222120

223121
def main4():
224122
logging.info("PCA")
123+
x_matrix_list = json.load(open(abspath(f"{dirname(__file__)}/../../../../data/x_matrix.json"), "r"))
225124
x_matrix_demeaned = de_mean_matrix(x_matrix_list)
226125
components_p = principal_component_analysis(x_matrix_demeaned, 2)
227126
transform_demeaned = transform_vector(x_matrix_demeaned[0], components_p)

src/data-scratch-library/tests/test_c10_working_with_data/test_e01_working_with_data.py

Whitespace-only changes.

src/data-scratch-library/tests/test_c10_working_with_data/test_manipulation.py

Whitespace-only changes.

src/data-scratch-library/tests/test_c10_working_with_data/test_working_with_data.py

Whitespace-only changes.
Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,33 @@
1+
import random
2+
from typing import List
3+
4+
from matplotlib import pyplot as plt
5+
6+
from dsl.c06_probability.e0603_normal import inverse_normal_cdf
7+
from dsl.c10_working_with_data.e1001_univariate import make_histogram
8+
9+
110
def plot_histogram(points: List[float], bucket_size: float, title: str = ""):
    """Draw a bar chart of ``points`` grouped into ``bucket_size``-wide buckets.

    The bucket -> count mapping comes from ``make_histogram``. Its keys are
    used as the bar positions, its values as the bar heights, and every bar
    is drawn ``bucket_size`` wide. ``title`` becomes the chart title.

    Nothing is shown or saved here; the caller decides what to do with the
    current pyplot figure.
    """
    bucket_counts = make_histogram(points, bucket_size)
    plt.bar(
        bucket_counts.keys(),
        bucket_counts.values(),
        width=bucket_size,
    )
    plt.title(title)
14+
15+
16+
if __name__ == "__main__":
    # Fixed seed so both sample sets (and therefore both images) are
    # reproducible from run to run.
    random.seed(0)
    sample_count = 10000

    # uniform between -100 and 100
    uniform = [200 * random.random() - 100 for _ in range(sample_count)]

    # normal distribution with mean 0, standard deviation 57
    normal = [
        57 * inverse_normal_cdf(random.random()) for _ in range(sample_count)
    ]

    # Render each sample set as its own histogram image, closing the figure
    # in between so the second chart does not draw on top of the first.
    charts = (
        (uniform, "Uniform Histogram", "uniform-histogram.png"),
        (normal, "Normal Histogram", "normal-histogram.png"),
    )
    for sample, chart_title, image_name in charts:
        plot_histogram(sample, 10, chart_title)
        plt.savefig(image_name)
        plt.close()
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,18 @@
1+
from matplotlib import pyplot as plt
12

3+
from dsl.c05_statistics.e0503_correlation import correlation
4+
from dsl.c10_working_with_data.e1002_bivariate import random_normal
5+
6+
# One base series of 1000 draws, plus two derived series: ys1 follows xs and
# ys2 follows -xs, each with an extra half-scaled random_normal() draw added.
xs = [random_normal() for _ in range(1000)]
ys1 = [x + random_normal() / 2 for x in xs]
ys2 = [-x + random_normal() / 2 for x in xs]

# Overlay both series on one scatter plot, told apart by colour and legend.
for ys, shade, series_label in ((ys1, 'black', 'ys1'), (ys2, 'gray', 'ys2')):
    plt.scatter(xs, ys, marker='.', color=shade, label=series_label)

plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc=9)
plt.title("Very Different Joint Distributions")
plt.show()

# Prints about 0.9 for ys1, then about -0.9 for ys2.
for ys in (ys1, ys2):
    print(correlation(xs, ys))
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,40 @@
1+
# corr_data is a list of four 100-d vectors
2+
from matplotlib import pyplot as plt
13

4+
from dsl.c06_probability.e0603_normal import random_normal
5+
6+
# Build four 100-point series: the base draws, two noisy linear functions of
# them (positive and negative slope), and a 0/1 indicator series.
xs = [random_normal() for _ in range(100)]
corr_data = [
    xs,
    [x + random_normal() / 2 for x in xs],
    [-x + random_normal() / 2 for x in xs],
    # The draws are floats, so the earlier parity test (`x % 2 == 0`) was
    # almost never true and this series collapsed to a constant 1. Split on
    # the sign instead so both values actually occur.
    # NOTE(review): sign split is an assumed intent; confirm the threshold.
    [0 if x < 0 else 1 for x in xs],
]

# One subplot per (row series, column series) pair.
num_vectors = len(corr_data)
fig, ax = plt.subplots(num_vectors, num_vectors)
for i in range(num_vectors):
    for j in range(num_vectors):
        cell = ax[i][j]

        # Scatter column_j on the x-axis vs. column_i on the y-axis
        if i != j:
            cell.scatter(corr_data[j], corr_data[i])

        # unless i == j, in which case show the series name
        else:
            cell.annotate(
                "series " + str(i), (0.5, 0.5),
                xycoords='axes fraction',
                ha="center", va="center",
            )

        # Then hide axis labels except left and bottom charts
        if i < num_vectors - 1:
            cell.xaxis.set_visible(False)
        if j > 0:
            cell.yaxis.set_visible(False)

# Fix the bottom-right and top-left axis labels, which are wrong because
# their charts only have text in them
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_ylim(ax[0][1].get_ylim())
plt.show()

0 commit comments

Comments
 (0)