Skip to content

Commit 8b109f1

Browse files
committed
finished test regression
1 parent 0381490 commit 8b109f1

File tree

4 files changed

+191
-5
lines changed

4 files changed

+191
-5
lines changed

.gitignore

+5-1
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,11 @@ data/raw
164164
data/shape_files
165165
data/interim
166166
data/external
167-
notebooks/
167+
*.ipynb
168+
*.css
169+
*.woff
170+
*.js
171+
*.html
168172
#force to keep the pnb.csv file in data/external
169173

170174
.DS_Store

notebooks/regresion.qmd

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
---
2+
title: "Graph development"
3+
format:
4+
html:
5+
code-fold: true
6+
jupyter: python3
7+
---
8+
9+
```{python}
10+
import os
11+
os.chdir("..")
12+
```
13+
14+
```{python}
15+
import pandas as pd
16+
import polars as pl
17+
import numpy as np
18+
import geopandas as gpd
19+
import spreg
20+
import numpy
21+
import libpysal
22+
import spreg
23+
from pysal.lib import weights
24+
```
25+
26+
```{python}
27+
puma = gpd.read_file("data/interim/puma_06.gpkg", engine="pyogrio")
28+
df_roads = pd.read_parquet("data/processed/roads_final_06.parquet")
29+
df_acs = pd.read_parquet("data/processed/acs.parquet")
30+
```
31+
32+
```{python}
33+
# Build the 7-character PUMA key: 2-digit zero-padded state code followed by
# the 5-digit zero-padded PUMA code.
state_code = df_acs["state"].astype(str).str.zfill(2)
puma_code = df_acs["PUMA"].astype(str).str.zfill(5)
df_acs["puma_id"] = state_code + puma_code

# Keep only state 6, the "ALL" race aggregate, and the years 2012-2018.
keep_rows = (
    (df_acs["state"] == 6)
    & (df_acs["year"] >= 2012)
    & (df_acs["race"] == "ALL")
    & (df_acs["year"] <= 2018)
)
df_acs = df_acs[keep_rows].reset_index(drop=True)
df_acs
36+
```
37+
38+
```{python}
39+
# Attach the road lengths to the ACS rows; the left join keeps every ACS
# observation even when no road record matches.
merge_keys = ["puma_id", "year"]
master_df = df_acs.merge(df_roads, on=merge_keys, how="left")

# Reduce to the model columns and order the panel by year, then PUMA.
model_columns = ["year", "puma_id", "avg_time", "leangth"]
master_df = master_df[model_columns]
master_df = master_df.sort_values(by=["year", "puma_id"], ascending=True)
master_df = master_df.reset_index(drop=True)
master_df
42+
```
43+
44+
```{python}
45+
# Select the PUMAs whose geo_id starts with "06" and expose the identifier
# under the same name ("puma_id") used by the tabular data.
in_state_06 = puma["geo_id"].str.startswith("06")
puma_06 = (
    puma[in_state_06]
    .reset_index(drop=True)
    .rename(columns={"geo_id": "puma_id"})
)
puma_06
48+
```
49+
50+
```{python}
51+
wq = weights.contiguity.Queen.from_dataframe(puma_06, geom_col="geometry", ids="puma_id")
52+
wq.transform = 'r'
53+
```
54+
55+
```{python}
56+
master_df["avg_time"].values
57+
```
58+
59+
```{python}
60+
# Stack the dependent variable (avg_time) and the regressor (leangth) as
# single-column 2-D arrays, one row per (year, puma_id) observation.
y_reshaped = master_df["avg_time"].values.reshape(-1, 1)
x_reshaped = master_df["leangth"].values.reshape(-1, 1)

# Report the shapes of the arrays that are actually handed to the model.
# The previous version printed `x.shape` / `y.shape`, names that are never
# defined in this notebook, so the cell stopped with a NameError.
print(f"x shape: {x_reshaped.shape}, y shape: {y_reshaped.shape}")  # Check the shape of x and y

# Fixed-effects spatial-lag panel model using the Queen contiguity weights.
# NOTE(review): master_df comes from a left merge, so `leangth` may contain
# NaN for unmatched rows; confirm the estimator input is complete.
# NOTE(review): presumably spreg expects long-format rows ordered by period
# and then by unit, matching the (year, puma_id) sort above -- confirm.
fe_lag = spreg.Panel_FE_Lag(
    y=y_reshaped,
    x=x_reshaped,
    w=wq,
)

print(fe_lag.summary)
71+
```
72+
73+
```{python}
74+

notebooks/roads.qmd

+111
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
---
2+
title: "Graph development"
3+
format:
4+
html:
5+
code-fold: true
6+
jupyter: python3
7+
---
8+
9+
```{python}
10+
import os
11+
os.chdir('..')
12+
```
13+
14+
```{python}
15+
import polars as pl
16+
import pandas as pd
17+
import geopandas as gpd
18+
```
19+
20+
```{python}
21+
roads_df = gpd.read_file("data/interim/roads_2012_06.gpkg", engine="pyogrio")
22+
puma = gpd.read_file("data/interim/puma_06.gpkg", engine="pyogrio")
23+
tmp_puma = puma[(puma["geo_id"] == "0600101")].copy().reset_index(drop=True)
24+
clipped = roads_df.clip(tmp_puma['geometry'])
25+
```
26+
27+
```{python}
28+
base = tmp_puma.plot(color='white', edgecolor='black')
29+
clipped.plot(ax=base, marker='o', color='red', markersize=5)
30+
```
31+
32+
```{python}
33+
clipped.length.sum()
34+
```
35+
36+
```{python}
37+
# Prototype of the per-PUMA road-length table for 2012.
# Start from an empty, explicitly typed frame so that every row appended
# below matches its schema.
empty_df = [
    pl.Series("year", [], dtype=pl.Int64),
    pl.Series("puma_id", [], dtype=pl.String),
    pl.Series("leangth", [], dtype=pl.Float64),
]
df = pl.DataFrame(empty_df)

# Only PUMAs whose geo_id starts with "06" are processed.
puma2 = puma[puma["geo_id"].str.startswith("06")]
for geo_id in puma2["geo_id"]:
    # Clip the road network to this PUMA's polygon and total what is left.
    boundary = puma.loc[puma["geo_id"] == geo_id]
    road_length = roads_df.clip(boundary["geometry"]).length.sum()
    row = pl.DataFrame(
        {
            "year": 2012,
            "puma_id": geo_id,
            "leangth": road_length,
        }
    )
    df = pl.concat([df, row], how="vertical")
print(df)
56+
```
57+
58+
```{python}
59+
def process(roads, state_id, pumas_df, year):
    """Total the clipped road length inside every PUMA of one state.

    Parameters
    ----------
    roads : geopandas.GeoDataFrame
        Road geometries; clipped against each PUMA polygon in turn.
    state_id : str
        Prefix matched against ``geo_id`` to select the PUMAs to process
        (for example ``"06"``).
    pumas_df : geopandas.GeoDataFrame
        PUMA polygons; must provide ``geo_id`` and ``geometry`` columns.
    year : int
        Value written to the ``year`` column of every output row.

    Returns
    -------
    polars.DataFrame
        One row per selected PUMA with columns ``year`` (Int64),
        ``puma_id`` (String) and ``leangth`` (Float64). The frame is empty,
        but still typed, when no ``geo_id`` matches ``state_id``.

    Notes
    -----
    Lengths are expressed in the units of the CRS carried by ``roads``.
    The misspelled column name ``leangth`` is kept because other code reads
    the output by that name.
    """
    schema = {"year": pl.Int64, "puma_id": pl.String, "leangth": pl.Float64}
    pumas = pumas_df[pumas_df["geo_id"].str.startswith(state_id)]

    # Collect plain Python values and build the frame once at the end;
    # concatenating inside the loop re-copies the whole frame each time.
    puma_ids = []
    lengths = []
    for puma_id in pumas["geo_id"]:
        boundary = pumas.loc[pumas["geo_id"] == puma_id]
        clipped = roads.clip(boundary["geometry"])
        puma_ids.append(puma_id)
        lengths.append(float(clipped.length.sum()))
        print("\033[0;35mINFO: \033[0m" + f"Finished processing roads for {puma_id}")

    return pl.DataFrame(
        {
            "year": [year] * len(puma_ids),
            "puma_id": puma_ids,
            "leangth": lengths,
        },
        schema=schema,
    )
80+
```
81+
82+
```{python}
83+
temp = process(roads_df, "06", puma, 2012)
84+
temp
85+
```
86+
87+
88+
```{python}
89+
# Build the per-PUMA road-length table for every year from 2012 to 2018.
# Start from an empty, explicitly typed frame so each yearly result can be
# stacked onto it.
empty_df = [
    pl.Series("year", [], dtype=pl.Int64),
    pl.Series("puma_id", [], dtype=pl.String),
    pl.Series("leangth", [], dtype=pl.Float64),
]
data = pl.DataFrame(empty_df)
for year in range(2012, 2019):
    # Reset per year so roads from a previous year never leak into this one.
    roads_df = gpd.GeoDataFrame(columns=['linear_id', 'year', 'geometry'])
    for file in os.listdir("data/shape_files/"):
        if file.startswith(f"roads_{year}_06"):
            gdf = gpd.read_file(f"data/shape_files/{file}", engine="pyogrio")
            gdf.rename(columns={"LINEARID": "linear_id"}, inplace=True)
            # Tag the rows with the year being processed. This was
            # hard-coded to 2012, mislabelling the 2013-2018 files.
            # NOTE(review): county_id is still the constant "01063" for
            # every file -- confirm whether it should come from the file.
            gdf[["county_id", "year"]] = "01063", year
            # NOTE(review): set_crs(..., allow_override=True) relabels the
            # CRS without transforming coordinates; if the shapefiles are
            # not already in EPSG:3857, to_crs is needed -- confirm.
            gdf = gdf[["year", "linear_id", "county_id", "geometry"]].set_crs(3857, allow_override=True)
            roads_df = pd.concat([roads_df, gdf], ignore_index=True)
            print("\033[0;36mINFO: \033[0m" + f"Finished processing roads for {file}")

    # Append this year's per-PUMA totals to the running table.
    data = pl.concat([data, process(roads_df, "06", puma, year)], how="vertical")
107+
```
108+
109+
```{python}
110+
data.write_parquet("data/processed/roads_final_06.parquet")
111+
```

src/data/data_process.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@
22
import geopandas as gpd
33
import pandas as pd
44
import polars as pl
5-
import numpy as np
65
import os
76

8-
97
class DataProcess(DataPull):
108
def __init__(self, debug=False):
119
super().__init__()
@@ -83,13 +81,12 @@ def process_acs(self):
8381
print("\033[0;36mINFO: \033[0m" + "Finished processing acs")
8482

8583
def process_roads(self):
86-
roads = gpd.GeoDataFrame(columns=['linear_id', 'year', 'geometry'])
8784

8885
for year in range(2012, 2019):
8986
roads_df = gpd.GeoDataFrame(columns=['linear_id', 'year', 'geometry'])
9087
if os.path.exists(f"data/interim/roads_{year}.gpkg"):
9188
continue
92-
for file in os.listdir(f"data/shape_files/"):
89+
for file in os.listdir("data/shape_files/"):
9390
if file.startswith(f"roads_{year}"):
9491
gdf = gpd.read_file(f"data/shape_files/{file}", engine="pyogrio")
9592
gdf.rename(columns={"LINEARID": "linear_id"}, inplace=True)

0 commit comments

Comments
 (0)