class DataProcess(DataPull):

    def __init__(self):
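+        # Assumes DataPull.__init__ provides self.codes and self.debug, which the
+        # processing methods below rely on (they were previously loaded here).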
+        super().__init__()
+        self.blocks = self.process_shps()
+        self.process_lodes()
-        self.mov_file_url = "https://www2.census.gov/ces/movs/movs_st_main2005.csv"
-        self.mov_file_path = "data/raw/movs_st_main2005.csv"
-        self.shape_file_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip"
-        self.shape_file_path = "data/shape_files/states.zip"
-        self.state_code_file_path = "data/external/state_code.parquet"
-        self.blocks_file_path = "data/processed/blocks.parquet"
-        self.lodes_file_path = "data/processed/lodes.parquet"
-
-        self.mov = self.load_mov_data()
-        self.shp = self.load_shape_data()
-        self.codes = self.load_state_codes()
-        self.blocks = self.load_blocks_data()
-        self.lodes = self.load_lodes_data()
-        self.df = self.create_graph_dataset()

-    def retrieve_shps(self, blocks):
+    def process_shps(self) -> pl.DataFrame:
-        for state, name in self.codes.select(pl.col("fips", "state_name")).rows():
-            print(f"Processing {name}, {state}")
-            url = f"https://www2.census.gov/geo/tiger/TIGER2023/TABBLOCK20/tl_2023_{str(state).zfill(2)}_tabblock20.zip"
-            file_name = f"data/shape_files/{name}_{str(state).zfill(2)}.zip"
-            self.retrieve_file(url, file_name)
-            tmp = gpd.read_file(file_name, engine="pyogrio")
-            tmp = tmp.set_crs(3857, allow_override=True)
-            tmp_shp = tmp[["STATEFP20", "GEOID20", "geometry"]].copy()
-            tmp_shp["centroid"] = tmp_shp.centroid
-            tmp_shp["lon"] = tmp_shp.centroid.x
-            tmp_shp["lat"] = tmp_shp.centroid.y
-            tmp_block = pl.from_pandas(tmp_shp[["STATEFP20", "GEOID20", "lon", "lat"]])
-            blocks = pl.concat([blocks, tmp_block], how="vertical")
-            print(f"Finished processing {state}")
-        blocks.write_parquet(self.blocks_file_path)
+        empty_df = [
+            pl.Series("STATEFP20", [], dtype=pl.String),
+            pl.Series("GEOID20", [], dtype=pl.String),
+            pl.Series("lon", [], dtype=pl.Float64),
+            pl.Series("lat", [], dtype=pl.Float64),
+        ]
+        blocks = pl.DataFrame(empty_df).clear()
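+        # Rebuild the block-centroid table only when no cached parquet exists.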
+        if not os.path.exists("data/processed/blocks.parquet"):
+            for state, name in self.codes.select(pl.col("fips", "state_name")).rows():
+                file_name = f"data/shape_files/block_{name}_{str(state).zfill(2)}.zip"
+                tmp = gpd.read_file(file_name, engine="pyogrio")
+                tmp = tmp.set_crs(3857, allow_override=True)
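+                # NOTE: set_crs only relabels the CRS without reprojecting, so the
+                # TIGER/Line coordinates (and the centroids below) stay in degree lon/lat.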
+                tmp_shp = tmp[["STATEFP20", "GEOID20", "geometry"]].copy()
+                tmp_shp["centroid"] = tmp_shp.centroid
+                tmp_shp["lon"] = tmp_shp.centroid.x
+                tmp_shp["lat"] = tmp_shp.centroid.y
+                tmp_block = pl.from_pandas(tmp_shp[["STATEFP20", "GEOID20", "lon", "lat"]])
+                blocks = pl.concat([blocks, tmp_block], how="vertical")
+                print("\033[0;36mPROCESS: \033[0m" + f"Finished processing {name} Shapes")
+            blocks.sort(by=["STATEFP20", "GEOID20"]).write_parquet("data/processed/blocks.parquet")
+            return blocks
+        else:
+            return pl.read_parquet("data/processed/blocks.parquet")

-    def process_lodes(self, lodes):
-
-        for state, name, fips in self.codes.select(pl.col("state_abbr", "state_name", "fips")).rows():
-            for year in range(2005, 2020):
-                url = f"https://lehd.ces.census.gov/data/lodes/LODES8/{state}/od/{state}_od_main_JT00_{year}.csv.gz"
-                file_name = f"data/raw/lodes_{state}_{year}.csv.gz"
-                try:
-                    self.retrieve_file(url, file_name)
-                except:
-                    print(f"Failed to download {name}, {state}, {year}")
-                    continue
-                value = self.process_lodes(file_name)
-                tmp_df = pl.DataFrame([
-                    pl.Series("state", [state], dtype=pl.String),
-                    pl.Series("fips", [fips], dtype=pl.String),
-                    pl.Series("state_abbr", [name], dtype=pl.String),
-                    pl.Series("year", [year], dtype=pl.Int64),
-                    pl.Series("avg_distance", [value], dtype=pl.Float64),
-                ])
-                lodes = pl.concat([lodes, tmp_df], how="vertical")
-                print(f"Finished processing {name}, {state}, {year}")
-        lodes.write_parquet(self.lodes_file_path)
-
-    def process_lodes(self, path):
+    def process_lodes(self):
+        empty_df = [
+            pl.Series("state", [], dtype=pl.String),
+            pl.Series("fips", [], dtype=pl.String),
+            pl.Series("state_abbr", [], dtype=pl.String),
+            pl.Series("year", [], dtype=pl.Int64),
+            pl.Series("avg_distance", [], dtype=pl.Float64),
+        ]
+        lodes = pl.DataFrame(empty_df).clear()
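+        # Same caching pattern as process_shps: skip the rebuild when the parquet exists.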
+        if not os.path.exists("data/processed/lodes.parquet"):
+            for state, name, fips in self.codes.select(pl.col("state_abbr", "state_name", "fips")).rows():
+                for year in range(2005, 2020):
+                    file_name = f"data/raw/lodes_{state}_{year}.csv.gz"
+                    try:
+                        value = self.process_block(file_name)
+                    except Exception:
+                        print("\033[1;33mWARNING: \033[0m" + f"Failed to process lodes_{name}_{state}_{year}")
+                        continue
+                    tmp_df = pl.DataFrame([
+                        pl.Series("state", [state], dtype=pl.String),
+                        pl.Series("fips", [fips], dtype=pl.String),
+                        pl.Series("state_abbr", [name], dtype=pl.String),
+                        pl.Series("year", [year], dtype=pl.Int64),
+                        pl.Series("avg_distance", [value], dtype=pl.Float64),
+                    ])
+                    lodes = pl.concat([lodes, tmp_df], how="vertical")
+                    if self.debug:
+                        print("\033[0;36mINFO: \033[0m" + f"Finished processing lodes {name} for {year}")
+            lodes.sort(by=["state", "year"]).write_parquet("data/processed/lodes.parquet")

+    def process_block(self, path) -> float:
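        # Join each home/work block pair to its centroid coordinates, compute a
        # great-circle distance per pair, and return total distance over total jobs.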
        df = pl.read_csv(path, ignore_errors=True)
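        # S000 is the total job count for each home-to-work block pair in LODES OD files.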
        df = df.rename({"S000": "total_jobs"}).select(pl.col("w_geocode", "h_geocode", "total_jobs"))
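        # Attach work-side (w_) and home-side (h_) block centroids to each pair.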
-
        dest = self.blocks.rename({"GEOID20": "w_geocode", "lon": "w_lon", "lat": "w_lat"})
        dest = dest.with_columns((pl.col("w_geocode").cast(pl.Int64)).alias("w_geocode"))
-
        origin = self.blocks.rename({"GEOID20": "h_geocode", "lon": "h_lon", "lat": "h_lat"})
        origin = origin.with_columns((pl.col("h_geocode").cast(pl.Int64)).alias("h_geocode"))
-
        df = df.join(origin, on="h_geocode", how="left")
        df = df.join(dest, on="w_geocode", how="left")
        df = df.with_columns(
-            (6371.01 * np.arccos(
-                np.sin(pl.col("h_lat")) * np.sin(pl.col("w_lat")) +
-                np.cos(pl.col("h_lat")) * np.cos(pl.col("w_lat")) *
-                np.cos(pl.col("h_lon") - pl.col("w_lon"))
-            )).alias("distance")
-        )
-
+            # Spherical law of cosines with Earth radius 6371.01 km; the trig terms
+            # need radians, so the degree coordinates are converted first.
+            (6371.01 * np.arccos(
+                np.sin(np.radians(pl.col("h_lat"))) * np.sin(np.radians(pl.col("w_lat"))) +
+                np.cos(np.radians(pl.col("h_lat"))) * np.cos(np.radians(pl.col("w_lat"))) *
+                np.cos(np.radians(pl.col("h_lon")) - np.radians(pl.col("w_lon")))
+            )
+            ).alias("distance")
+        )
        df = df.filter(pl.col("distance").is_not_nan())  # arccos yields NaN when rounding pushes its argument past ±1
        df = df.select(pl.col("distance").sum().alias("total_distance"),
                       pl.col("total_jobs").sum().alias("total_jobs"))
        value = df.select((pl.col("total_distance") / pl.col("total_jobs")).alias("avg_distance")).item()
        return value
+
+if __name__ == "__main__":
+    DataProcess()