Skip to content

Commit 973ace3

Browse files
committed
move methods to their own files #2
Signed-off-by: Alejandro Ouslan <[email protected]>
1 parent 5550e63 commit 973ace3

File tree

4 files changed

+94
-81
lines changed

4 files changed

+94
-81
lines changed

app.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
from dash import Dash, html, dcc, Input, Output
2-
from src.data.data_process import DataClean
2+
from src.visualization.data_graph import DataGraph
33
import plotly.express as px
44

55
# Initialize the Dash app
66
app = Dash(__name__, external_stylesheets=["https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css"], suppress_callback_exceptions=True)
77

88
# Data processing
9-
data = DataClean()
9+
data = DataGraph()
1010
df = data.graph(2016)
1111

1212
# Define the layout of the app

src/data/data_process.py

+68-65
Original file line numberDiff line numberDiff line change
@@ -8,86 +8,89 @@
88
class DataProcess(DataPull):
99

1010
def __init__(self):
11+
super().__init__(self)
12+
self.blocks = self.process_shps()
13+
self.process_lodes()
1114

12-
self.mov_file_url = "https://www2.census.gov/ces/movs/movs_st_main2005.csv"
13-
self.mov_file_path = "data/raw/movs_st_main2005.csv"
14-
self.shape_file_url = "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip"
15-
self.shape_file_path = "data/shape_files/states.zip"
16-
self.state_code_file_path = "data/external/state_code.parquet"
17-
self.blocks_file_path = "data/processed/blocks.parquet"
18-
self.lodes_file_path = "data/processed/lodes.parquet"
19-
20-
self.mov = self.load_mov_data()
21-
self.shp = self.load_shape_data()
22-
self.codes = self.load_state_codes()
23-
self.blocks = self.load_blocks_data()
24-
self.lodes = self.load_lodes_data()
25-
self.df = self.create_graph_dataset()
2615

27-
def retrieve_shps(self, blocks):
16+
def process_shps(self) -> pl.DataFrame:
2817

29-
for state, name in self.codes.select(pl.col("fips", "state_name")).rows():
30-
print(f"Processing {name}, {state}")
31-
url = f"https://www2.census.gov/geo/tiger/TIGER2023/TABBLOCK20/tl_2023_{str(state).zfill(2)}_tabblock20.zip"
32-
file_name = f"data/shape_files/{name}_{str(state).zfill(2)}.zip"
33-
self.retrieve_file(url, file_name)
34-
tmp = gpd.read_file(file_name, engine="pyogrio")
35-
tmp = tmp.set_crs(3857, allow_override=True)
36-
tmp_shp = tmp[["STATEFP20", "GEOID20", "geometry"]].copy()
37-
tmp_shp["centroid"] = tmp_shp.centroid
38-
tmp_shp['lon'] = tmp_shp.centroid.x
39-
tmp_shp['lat'] = tmp_shp.centroid.y
40-
tmp_block = pl.from_pandas(tmp_shp[["STATEFP20", "GEOID20", "lon", "lat"]])
41-
blocks = pl.concat([blocks, tmp_block], how="vertical")
42-
print(f"Finished processing {state}")
43-
blocks.write_parquet(self.blocks_file_path)
18+
empty_df = [
19+
pl.Series("STATEFP20", [], dtype=pl.String),
20+
pl.Series("GEOID20", [], dtype=pl.String),
21+
pl.Series("lon", [], dtype=pl.Float64),
22+
pl.Series("lat", [], dtype=pl.Float64),
23+
]
24+
blocks = pl.DataFrame(empty_df).clear()
25+
if not os.path.exists("data/processed/blocks.parquet"):
26+
for state, name in self.codes.select(pl.col("fips", "state_name")).rows():
27+
file_name = f"data/shape_files/block_{name}_{str(state).zfill(2)}.zip"
28+
tmp = gpd.read_file(file_name, engine="pyogrio")
29+
tmp = tmp.set_crs(3857, allow_override=True)
30+
tmp_shp = tmp[["STATEFP20", "GEOID20", "geometry"]].copy()
31+
tmp_shp["centroid"] = tmp_shp.centroid
32+
tmp_shp['lon'] = tmp_shp.centroid.x
33+
tmp_shp['lat'] = tmp_shp.centroid.y
34+
tmp_block = pl.from_pandas(tmp_shp[["STATEFP20", "GEOID20", "lon", "lat"]])
35+
blocks = pl.concat([blocks, tmp_block], how="vertical")
36+
print("\033[0;36mPROCESS: \033[0m" + f"Finished processing {name} Shapes")
37+
blocks.sort(by=["STATEFP20", "GEOID20"]).write_parquet("data/processed/blocks.parquet")
38+
return blocks
39+
else:
40+
return pl.read_parquet("data/processed/blocks.parquet")
4441

45-
def process_lodes(self, lodes):
46-
47-
for state, name, fips in self.codes.select(pl.col("state_abbr", "state_name", "fips")).rows():
48-
for year in range(2005, 2020):
49-
url = f"https://lehd.ces.census.gov/data/lodes/LODES8/{state}/od/{state}_od_main_JT00_{year}.csv.gz"
50-
file_name = f"data/raw/lodes_{state}_{year}.csv.gz"
51-
try:
52-
self.retrieve_file(url, file_name)
53-
except:
54-
print(f"Failed to download {name}, {state}, {year}")
55-
continue
56-
value = self.process_lodes(file_name)
57-
tmp_df = pl.DataFrame([
58-
pl.Series("state", [state], dtype=pl.String),
59-
pl.Series("fips", [fips], dtype=pl.String),
60-
pl.Series("state_abbr", [name], dtype=pl.String),
61-
pl.Series("year", [year], dtype=pl.Int64),
62-
pl.Series("avg_distance", [value], dtype=pl.Float64),
63-
])
64-
lodes = pl.concat([lodes, tmp_df], how="vertical")
65-
print(f"Finished processing {name}, {state}, {year}")
66-
lodes.write_parquet(self.lodes_file_path)
67-
68-
def process_lodes(self, path):
42+
def process_lodes(self):
43+
empty_df = [
44+
pl.Series("state", [], dtype=pl.String),
45+
pl.Series("fips", [], dtype=pl.String),
46+
pl.Series("state_abbr", [], dtype=pl.String),
47+
pl.Series("year", [], dtype=pl.Int64),
48+
pl.Series("avg_distance", [], dtype=pl.Float64),
49+
]
50+
lodes = pl.DataFrame(empty_df).clear()
51+
if not os.path.exists("data/processed/lodes.parquet"):
52+
for state, name, fips in self.codes.select(pl.col("state_abbr", "state_name", "fips")).rows():
53+
for year in range(2005, 2020):
54+
file_name = f"data/raw/lodes_{state}_{year}.csv.gz"
55+
try:
56+
value = self.process_block(file_name)
57+
except:
58+
print("\033[1;33mWARNING: \033[0m" + f"Failed to process lodes_{name}_{state}_{year}")
59+
continue
60+
tmp_df = pl.DataFrame([
61+
pl.Series("state", [state], dtype=pl.String),
62+
pl.Series("fips", [fips], dtype=pl.String),
63+
pl.Series("state_abbr", [name], dtype=pl.String),
64+
pl.Series("year", [year], dtype=pl.Int64),
65+
pl.Series("avg_distance", [value], dtype=pl.Float64),
66+
])
67+
lodes = pl.concat([lodes, tmp_df], how="vertical")
68+
if self.debug:
69+
print("\033[0;36mINFO: \033[0m" + f"Finished processing lodes {name} for {year}")
70+
lodes.sort(by=["state", "year"]).write_parquet("data/processed/lodes.parquet")
6971

72+
def process_block(self, path) -> float:
7073
df = pl.read_csv(path, ignore_errors=True)
7174
df = df.rename({"S000": "total_jobs"}).select(pl.col("w_geocode", "h_geocode", "total_jobs"))
72-
7375
dest = self.blocks.rename({"GEOID20": "w_geocode", "lon": "w_lon", "lat": "w_lat"})
7476
dest = dest.with_columns((pl.col("w_geocode").cast(pl.Int64)).alias("w_geocode"))
75-
7677
origin = self.blocks.rename({"GEOID20": "h_geocode", "lon": "h_lon", "lat": "h_lat"})
7778
origin = origin.with_columns((pl.col("h_geocode").cast(pl.Int64)).alias("h_geocode"))
78-
7979
df = df.join(origin, on="h_geocode", how="left")
8080
df = df.join(dest, on="w_geocode", how="left")
8181
df = df.with_columns(
82-
(6371.01 * np.arccos(
83-
np.sin(pl.col("h_lat")) * np.sin(pl.col("w_lat")) +
84-
np.cos(pl.col("h_lat")) * np.cos(pl.col("w_lat")) *
85-
np.cos(pl.col("h_lon") - pl.col("w_lon"))
86-
)).alias("distance")
87-
)
88-
82+
(6371.01 * np.arccos(
83+
np.sin(pl.col("h_lat")) * np.sin(pl.col("w_lat")) +
84+
np.cos(pl.col("h_lat")) * np.cos(pl.col("w_lat")) *
85+
np.cos(pl.col("h_lon") - pl.col("w_lon"))
86+
)
87+
).alias("distance")
88+
)
8989
df = df.filter(pl.col("distance") != np.nan)
9090
df = df.select(pl.col("distance").sum().alias("total_distance"),
9191
pl.col("total_jobs").sum().alias("total_jobs"))
9292
value = df.select((pl.col("total_distance") / pl.col("total_jobs")).alias("avg_distance")).item()
93-
return value
93+
return value
94+
95+
if __name__ == "__main__":
96+
DataProcess()

src/data/data_pull.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44

55
class DataPull:
66
def __init__(self, debug=False):
7-
self.debug = debug
7+
self.debug = False
88
self.mov = self.pull_movs()
99
self.codes = self.pull_codes()
10+
self.pull_states()
1011
self.pull_blocks()
1112
self.pull_pumas()
12-
self.pull_lodes(2005)
13+
self.pull_lodes(2006)
1314

1415
def pull_movs(self) -> pl.DataFrame:
1516
self.pull_file("https://www2.census.gov/ces/movs/movs_st_main2005.csv","data/raw/movs.csv")
@@ -24,7 +25,7 @@ def pull_codes(self) -> pl.DataFrame:
2425
codes = codes.join(self.mov.with_columns(pl.col("state_abbr").str.to_lowercase()), on="state_abbr", how="inner")
2526
codes = codes.select(pl.col("state_abbr", "fips", "state_name")).unique()
2627
codes.write_parquet("data/external/state_code.parquet")
27-
if debug:
28+
if self.debug:
2829
print("\033[0;36mPROCESS: \033[0m" + f"Finished processing state_code.parquet")
2930
return pl.read_parquet("data/external/state_code.parquet")
3031

@@ -43,12 +44,11 @@ def pull_blocks(self) -> None:
4344

4445
def pull_pumas(self) -> None:
4546
for state, name in self.codes.select(pl.col("fips", "state_name")).rows():
46-
url = f"https://www2.census.gov/geo/tiger/TIGER2023/TABBLOCK20/tl_2023_{str(state).zfill(2)}_tabblock20.zip"
47-
url = f"https://www2.census.gov/geo/tiger/GENZ2020/TABBLOCK20/cb_2018_{str(state).zfill(2)}_puma10_500k.zip"
48-
file_name = f"data/shape_files/block_{name}_{str(state).zfill(2)}.zip"
47+
url = f"https://www2.census.gov/geo/tiger/TIGER2023/PUMA/tl_2023_{str(state).zfill(2)}_puma20.zip"
48+
file_name = f"data/shape_files/puma_{name}_{str(state).zfill(2)}.zip"
4949
self.pull_file(url, file_name)
5050
if self.debug:
51-
print("\033[0;32mINFO: \033[0m" + f"Finished downloading block_{name}.zip")
51+
print("\033[0;32mINFO: \033[0m" + f"Finished downloading puma_{name}.zip")
5252

5353
def pull_lodes(self, start_years:int) -> None:
5454
for state, name, fips in self.codes.select(pl.col("state_abbr", "state_name", "fips")).rows():

src/visualization/data_graph.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,28 @@
1-
1+
import geopandas as gpd
2+
import pandas as pd
3+
import polars as pl
24
class DataGraph:
3-
def __init__(self, data):
4-
self.data = data
5+
def __init__(self, data_path="data/processed/lodes.parquet"):
6+
self.data = pl.read_parquet(data_path)
7+
self.shp = self.load_shape_data()
8+
self.df = self.create_graph_dataset()
9+
self.lodes = pl.read_parquet("data/processed/lodes.parquet")
510

6-
def create_graph_dataset(self):
11+
def load_shape_data(self):
12+
shp = gpd.read_file("data/shape_files/states.zip", engine="pyogrio")
13+
shp.rename({"GEOID": "state", "NAME": "state_name"}, axis=1, inplace=True)
14+
shp["state"] = shp["state"].astype(int)
15+
return shp
16+
17+
def create_graph_dataset(self) -> gpd.GeoDataFrame:
718

8-
df = self.lodes.rename({"state": "STUSPS"})
19+
df = self.data.rename({"state": "STUSPS"})
920
df = df.with_columns(pl.col("STUSPS").str.to_uppercase())
1021
df = df.to_pandas()
1122
df = pd.merge(df, self.shp, on="STUSPS", how="inner")
1223
return gpd.GeoDataFrame(df, geometry="geometry")
1324

14-
def graph(self, year):
15-
25+
def graph(self, year) -> gpd.GeoDataFrame:
1626
gdf = self.df.copy()
1727
gdf = gdf[gdf["year"] == year].reset_index(drop=True)
1828
return gdf

0 commit comments

Comments
 (0)