finished test regression

ouslan · ouslan · commit 8b109f1a2cdf · 2024-07-10T19:22:41.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -164,7 +164,11 @@ data/raw
 data/shape_files
 data/interim
 data/external
-notebooks/
+*.ipynb
+*.css
+*.woff
+*.js
+*.html
 #force to keep the pnb.csv file in data/external
 
 .DS_Store
diff --git a/notebooks/regresion.qmd b/notebooks/regresion.qmd
@@ -0,0 +1,74 @@
+---
+title: "Graph development"
+format:
+  html:
+    code-fold: true
+jupyter: python3
+---
+
+```{python}
+import os
+os.chdir("..")
+```
+
+```{python}
+import pandas as pd
+import polars as pl
+import numpy as np
+import geopandas as gpd
+import spreg
+import numpy
+import libpysal
+import spreg
+from pysal.lib import weights
+```
+
+```{python}
+puma = gpd.read_file("data/interim/puma_06.gpkg", engine="pyogrio")
+df_roads = pd.read_parquet("data/processed/roads_final_06.parquet")
+df_acs = pd.read_parquet("data/processed/acs.parquet")
+```
+
+```{python}
+# Build puma_id by concatenating the state code (zero-padded to 2 characters) with the PUMA code (zero-padded to 5).
+df_acs["puma_id"] = df_acs["state"].astype(str).str.zfill(2) + df_acs["PUMA"].astype(str).str.zfill(5)
+# Keep only state 6, years 2012 through 2018 inclusive, and rows whose race is "ALL".
+df_acs = df_acs[(df_acs["state"] == 6) & (df_acs["year"] >= 2012) & (df_acs["race"] == "ALL") &  (df_acs["year"] <= 2018)].reset_index(drop=True)
+df_acs
+```
+
+```{python}
+# Left-join the road data onto the ACS rows by (puma_id, year); ACS rows without a road match are kept.
+master_df = df_acs.merge(df_roads, on=["puma_id", "year"], how="left")
+# Keep the four columns used below and order rows by year, then puma_id.
+# NOTE(review): "leangth" (sic) must match the column name stored in the roads parquet.
+master_df = master_df[["year", "puma_id", "avg_time", "leangth"]].sort_values(by=["year", "puma_id"], ascending=True).reset_index(drop=True)
+master_df
+```
+
+```{python}
+# Restrict the PUMA geometries to geo_ids beginning with "06".
+puma_06 = puma[puma["geo_id"].str.startswith("06")].reset_index(drop=True)
+# Rename geo_id to puma_id so the key matches the name used in master_df.
+puma_06.rename(columns={"geo_id": "puma_id"}, inplace=True)
+puma_06
+```
+
+```{python}
+# Queen contiguity weights built from the PUMA geometries, with puma_id as the observation ids.
+wq = weights.contiguity.Queen.from_dataframe(puma_06, geom_col="geometry", ids="puma_id")
+# 'r' selects row-standardized weights.
+wq.transform = 'r'
+```
+
+```{python}
+master_df["avg_time"].values
+```
+
+```{python}
+y_reshaped = master_df["avg_time"].values.reshape(-1, 1)
+x_reshaped = master_df["leangth"].values.reshape(-1, 1)
+
+print(f"x shape: {x.shape}, y shape: {y.shape}")  # Check the shape of x and y
+
+fe_lag = spreg.Panel_FE_Lag(
+    y=y_reshaped, 
+    x=x_reshaped, 
+    w=wq)
+
+print(fe_lag.summary)
+```
+
+```{python}
+
diff --git a/notebooks/roads.qmd b/notebooks/roads.qmd
@@ -0,0 +1,111 @@
+---
+title: "Graph development"
+format:
+  html:
+    code-fold: true
+jupyter: python3
+---
+
+```{python}
+import os 
+os.chdir('..')
+```
+
+```{python}
+import polars as pl
+import pandas as pd
+import geopandas as gpd
+```
+
+```{python}
+# Load the 2012 roads layer and the PUMA layer from the interim GeoPackages.
+roads_df = gpd.read_file("data/interim/roads_2012_06.gpkg", engine="pyogrio")
+puma = gpd.read_file("data/interim/puma_06.gpkg", engine="pyogrio")
+# Select the single PUMA with geo_id "0600101" and clip the roads to its geometry.
+tmp_puma = puma[(puma["geo_id"] == "0600101")].copy().reset_index(drop=True)
+clipped = roads_df.clip(tmp_puma['geometry'])
+```
+
+```{python}
+base = tmp_puma.plot(color='white', edgecolor='black')
+clipped.plot(ax=base, marker='o', color='red', markersize=5)
+```
+
+```{python}
+clipped.length.sum()
+```
+
+```{python}
+# Empty frame with the target schema; it seeds the vertical concat in the loop below.
+empty_df = [
+  pl.Series("year", [], dtype=pl.Int64),
+  pl.Series("puma_id", [], dtype=pl.String),
+  pl.Series("leangth", [], dtype=pl.Float64)
+]
+df = pl.DataFrame(empty_df)
+# Only PUMAs whose geo_id begins with "06".
+puma2 = puma[puma["geo_id"].str.startswith("06")]
+for pum in puma2["geo_id"]:
+    # Clip the roads to this PUMA's geometry and record the summed length of the clipped geometries.
+    tmp = puma.loc[puma["geo_id"] == pum]
+    clipped = roads_df.clip(tmp['geometry'])
+    # One-row frame for this PUMA; the year is fixed at 2012 in this exploratory cell.
+    leng = pl.DataFrame(
+      {
+        "year": 2012,
+        "puma_id": pum,
+        "leangth": clipped.length.sum()
+      }
+    )
+    df = pl.concat([df, leng], how="vertical")
+print(df)
+```
+
+```{python}
+def process(roads, state_id, pumas_df, year):
+  empty_df = [
+              pl.Series("year", [], dtype=pl.Int64),
+              pl.Series("puma_id", [], dtype=pl.String),
+              pl.Series("leangth", [], dtype=pl.Float64)
+             ]
+  df = pl.DataFrame(empty_df)
+  pumas = pumas_df[pumas_df["geo_id"].str.startswith(state_id)]
+  for puma in pumas["geo_id"]:
+      tmp = pumas.loc[pumas["geo_id"] == puma]
+      clipped = roads.clip(tmp['geometry'])
+      leng = pl.DataFrame(
+        {
+          "year": year,
+          "puma_id": puma,
+          "leangth": clipped.length.sum()
+        }
+      )
+      df = pl.concat([df, leng], how="vertical")
+      print("\033[0;35mINFO: \033[0m" + f"Finished processing roads for {puma}")
+  return df
+```
+
+```{python}
+temp = process(roads_df, "06", puma, 2012)
+temp
+```
+
+
+```{python}
+empty_df = [
+            pl.Series("year", [], dtype=pl.Int64),
+            pl.Series("puma_id", [], dtype=pl.String),
+            pl.Series("leangth", [], dtype=pl.Float64)
+           ]
+data = pl.DataFrame(empty_df)
+for year in range(2012, 2019):
+  roads_df = gpd.GeoDataFrame(columns=['linear_id', 'year', 'geometry'])
+  for file in os.listdir("data/shape_files/"):
+    if file.startswith(f"roads_{year}_06"):
+      gdf = gpd.read_file(f"data/shape_files/{file}", engine="pyogrio")
+      gdf.rename(columns={"LINEARID": "linear_id"}, inplace=True)
+      gdf[["county_id", "year"]] = "01063", 2012
+      gdf = gdf[["year", "linear_id", "county_id", "geometry"]].set_crs(3857, allow_override=True)
+      roads_df = pd.concat([roads_df, gdf], ignore_index=True)
+      print("\033[0;36mINFO: \033[0m" + f"Finished processing roads for {file}")
+    
+  data = pl.concat([data, process(roads_df, "06", puma, year)], how="vertical")
+```
+
+```{python}
+data.write_parquet("data/processed/roads_final_06.parquet")
+```
diff --git a/src/data/data_process.py b/src/data/data_process.py
@@ -2,10 +2,8 @@
 import geopandas as gpd
 import pandas as pd
 import polars as pl
-import numpy as np
 import os
 
-
 class DataProcess(DataPull):
     def __init__(self, debug=False):
         super().__init__()
@@ -83,13 +81,12 @@ def process_acs(self):
                 print("\033[0;36mINFO: \033[0m" + "Finished processing acs")
 
     def process_roads(self):
-        roads = gpd.GeoDataFrame(columns=['linear_id', 'year', 'geometry'])
         
         for year in range(2012, 2019):
             roads_df = gpd.GeoDataFrame(columns=['linear_id', 'year', 'geometry'])
             if os.path.exists(f"data/interim/roads_{year}.gpkg"):
                 continue
-            for file in os.listdir(f"data/shape_files/"):
+            for file in os.listdir("data/shape_files/"):
                 if file.startswith(f"roads_{year}"):
                     gdf = gpd.read_file(f"data/shape_files/{file}", engine="pyogrio")
                     gdf.rename(columns={"LINEARID": "linear_id"}, inplace=True)