Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added new census distribution over buildings UDF #97

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import geopandas as gpd
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In UDFs, import statement should go inside the UDF. You already have this import in line 11.


@fused.udf
def udf(
    bbox: fused.types.TileGDF = None,
    release: str = "2024-03-12-alpha-0",
    polygon: gpd.GeoDataFrame = None,
    resolution: int = 10,
):
    """Attach census population counts to Overture building hexagons.

    Buildings and census geometries are both converted to H3 hexagons by the
    helpers in ``utils`` and then spatially joined, so that every building
    hexagon carries the ``cnt`` value of the census hexagon(s) it intersects.

    Args:
        bbox: Tile to process; forwarded to both data loaders.
        release: Overture release tag forwarded to ``get_buildings_h3``.
        polygon: Not used by this function; kept so the UDF signature stays
            unchanged for existing callers.
        resolution: H3 resolution forwarded to ``get_buildings_h3``.

    Returns:
        The left spatial join of building hexagons with census hexagons, with
        missing ``cnt`` values replaced by 0. ``None`` when no building data
        could be loaded for the tile.
    """
    import geopandas as gpd
    from utils import get_buildings_h3, get_census

    # Overture buildings, one H3 hexagon per building.
    building_data = get_buildings_h3(bbox, release, resolution)
    if building_data is None:
        # The loader already reported the failure; there is nothing to join.
        return None

    # Census block groups, one H3 hexagon per block group.
    census_df = get_census(
        bbox,
        census_variable='Total Pop',
        scale_factor=200,
        is_density=True,
        year=2022,
    )
    if census_df is None:
        # No census geometry intersects the tile: keep the buildings and
        # report a zero count instead of failing inside the join.
        building_data = building_data.copy()
        building_data['cnt'] = 0
        return building_data

    # `predicate` replaces the deprecated `op` keyword of `gpd.sjoin`.
    joined_gdf = gpd.sjoin(
        building_data, census_df, how="left", predicate="intersects"
    )

    # Buildings without a matching census hexagon get a population of 0.
    joined_gdf['cnt'] = joined_gdf['cnt'].fillna(0)

    return joined_gdf
15 changes: 15 additions & 0 deletions public/Census_Distribution_across_Buildings/README.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!--fused:preview-->
<p align="center"><img src="https://raw.githubusercontent.com/fusedio/udfs/main/public/Census_Distribution_across_Buildings/fused-screenshot-overture-ACS.png" width="600" alt="UDF preview image"></p>

<!--fused:tags-->
Tags: `population` `buildings` `overture` `census` `usa` `h3`

<!--fused:readme-->
## Overview

This UDF calculates and displays the approximate population distribution from the ACS (American Community Survey) census dataset across building footprints in the United States, using the Overture Maps building footprint dataset. Using the H3 spatial index, it spatially joins census population data with building footprint geometries to provide a detailed view of population distribution at a granular level.

## External links

- [ACS official website](https://www.census.gov/programs-surveys/acs/)
- Building footprints: [Overture Maps](https://overturemaps.org/)
106 changes: 106 additions & 0 deletions public/Census_Distribution_across_Buildings/meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
{
"version": "0.0.3",
"job_config": {
"version": "0.0.3",
"name": null,
"steps": [
{
"type": "udf",
"udf": {
"type": "geopandas_v2",
"name": "Census_Distribution_across_Buildings",
"entrypoint": "udf",
"parameters": {},
"metadata": {
"fused:defaultParameters": [
{
"parameter": "release",
"value": "",
"type": "string",
"suggestedValues": [
"2024-02-15-alpha-0",
"2024-03-12-alpha-0"
]
}
],
"fused:defaultViewState": {
"enable": true,
"latitude": 40.779884303572246,
"longitude": -73.96453426313244,
"zoom": 12.08646398175605,
"pitch": 0,
"bearing": 0
},
"fused:tags": [
{
"id": "population",
"label": "population",
"isCreatable": true
},
{
"id": "buildings",
"label": "buildings"
},
{
"id": "overture",
"label": "overture",
"isCreatable": true
},
{
"id": "census",
"label": "census",
"isCreatable": true
},
{
"id": "usa",
"label": "usa",
"isCreatable": true
},
{
"id": "h3",
"label": "h3",
"isCreatable": true
}
],
"fused:description": "## Overview\n\nThis UDF calculates and displays the approximate population distribution from the ACS (American Community Survey) population census dataset across building footprints in the United States, using the Overture Maps Building Footprint Dataset. By integrating H3 index, it spatially joins census population data with building footprint geometries to provide a detailed analysis of population distribution at a granular level.\n\n## External links\n\n- [ACS official Website](https://www.census.gov/programs-surveys/acs/)\n- Buildings footprints [Overture Maps](https://overturemaps.org/) \n",
"fused:assetUrl": "https://raw.githubusercontent.com/fusedio/udfs/main/public/Census_Distribution_across_Buildings/fused-screenshot-overture-ACS.png",
"fused:vizConfig": {
"tileLayer": {
"@@type": "TileLayer",
"minZoom": 0,
"maxZoom": 19,
"tileSize": 256,
"pickable": true
},
"rasterLayer": {
"@@type": "BitmapLayer",
"pickable": true
},
"vectorLayer": {
"opacity": 5,
"@@type": "GeoJsonLayer",
"stroked": false,
"filled": true,
"pickable": true,
"getRadius": 10,
"getFillColor": "@@=[properties.cnt/3+200, properties.cnt/5, properties.cnt/20]"
}
},
"fused:udfType": "auto",
"fused:slug": "Census_Distribution_across_Buildings",
"fused:name": "Census_Distribution_across_Buildings",
"fused:id": null
},
"source": "Census_Distribution_across_Buildings.py",
"headers": [
{
"module_name": "utils",
"source_file": "utils.py"
}
]
}
}
],
"metadata": null
}
}
188 changes: 188 additions & 0 deletions public/Census_Distribution_across_Buildings/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
import geopandas as gpd
import concurrent.futures
import fused
import geopandas as gpd
import shapely
from shapely import Point, Polygon
import h3

# Convert a geometry to the H3 hexagon polygon that contains its centroid
def geometry_to_hexagon(geom, resolution=10):
    """Return the H3 cell containing the centroid of ``geom`` as a polygon.

    Args:
        geom: Geometry exposing a ``centroid`` with ``x`` and ``y`` attributes.
        resolution: H3 resolution of the cell to look up.

    Returns:
        A ``Polygon`` built from the boundary of the H3 cell.
    """
    center = geom.centroid
    cell = h3.latlng_to_cell(center.y, center.x, resolution)
    # The boundary vertices are swapped so the polygon is built as (x, y).
    ring = [(vertex[1], vertex[0]) for vertex in h3.cell_to_boundary(cell)]
    return Polygon(ring)

# Converting Overture Maps Buildings to Hexagons
@fused.cache
def get_buildings_h3(
    bbox: fused.types.TileGDF = None,
    release: str = "2024-03-12-alpha-0",
    resolution: int = 10,
):
    """Load Overture buildings for a tile and map each one to an H3 hexagon.

    The building table is stored in several ``part=N`` partitions which are
    read concurrently and concatenated. Each building geometry is then
    replaced, as the active geometry, by the H3 hexagon containing its
    centroid.

    Args:
        bbox: Tile to load; forwarded to ``table_to_tile``.
        release: Overture release tag used to build the table path.
        resolution: H3 resolution of the hexagon assigned to each building.

    Returns:
        A GeoDataFrame whose active geometry is the ``hexagon`` column (the
        original ``geometry`` column is kept as data), or ``None`` when no
        partition returned any rows.
    """
    import pandas as pd

    utils = fused.load(
        "https://github.com/fusedio/udfs/tree/f8f0c0f/public/common/"
    ).utils

    theme = "buildings"
    overture_type = "building"
    min_zoom = 12
    num_parts = 5

    table_path = f"s3://us-west-2.opendata.source.coop/fused/overture/{release}/theme={theme}/type={overture_type}"
    table_path = table_path.rstrip("/")

    def get_part(part):
        part_path = f"{table_path}/part={part}/" if num_parts != 1 else table_path
        try:
            return utils.table_to_tile(
                bbox, table=part_path, min_zoom=min_zoom
            )
        except ValueError:
            # A partition that raises ValueError is treated as having no data.
            return None

    if num_parts > 1:
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_parts) as pool:
            dfs = list(pool.map(get_part, range(num_parts)))
    else:
        dfs = [get_part(0)]

    dfs = [df for df in dfs if df is not None]

    if not dfs:
        print("Failed to get any data")
        return None

    gdf = pd.concat(dfs)

    if 'geometry' in gdf.columns:
        # Forward the requested resolution; previously the helper's default
        # was always used and the `resolution` argument had no effect.
        gdf['hexagon'] = gdf['geometry'].apply(
            lambda geom: geometry_to_hexagon(geom, resolution)
        )
        hex_gdf = gpd.GeoDataFrame(gdf, geometry='hexagon', crs='epsg:4326')
    else:
        hex_gdf = gpd.GeoDataFrame(gdf)

    return hex_gdf


# Census Data UDF Functions
@fused.cache
def acs_5yr_bbox(bounds, census_variable='population', suffix='simplify_01', year=2022):
    """Load census block-group geometries in ``bounds`` joined with ACS data.

    Args:
        bounds: Four values unpacked into ``shapely.box`` to build the query
            area (CRS 4326).
        census_variable: Text searched for in the ACS table titles to pick
            the table to download.
        suffix: Optional suffix appended to the block-group table path to
            select a variant of the geometry table; falsy means no suffix.
        year: ACS 5-year vintage; only 2021 and 2022 are accepted.

    Returns:
        The block-group GeoDataFrame (``GEOID``, ``geometry``, ``h3_index``)
        merged with the ACS estimate columns, the first of which is named
        ``cnt``. An empty GeoDataFrame when no geometry falls in ``bounds``.

    Raises:
        ValueError: If ``year`` is not 2021 or 2022.
    """
    if int(year) not in (2021, 2022):
        raise ValueError('The only available years are 2021 and 2022')

    import shapely
    import geopandas as gpd

    bbox = gpd.GeoDataFrame(geometry=[shapely.box(*bounds)], crs=4326)
    table_to_tile = fused.utils.common.table_to_tile
    fused.utils.common.import_env()

    # Download the ACS table whose title matches the requested variable.
    tid = search_title(census_variable)
    df = acs_5yr_table(tid, year=year)
    df['GEOID'] = df.GEO_ID.map(lambda x: x.split('US')[-1])

    # Keep only the estimate ("_E") columns and give them readable titles.
    df = df[['GEOID'] + [i for i in df.columns if '_E' in i]]
    name_dict = acs_5yr_meta(short=False).set_index('Unique ID').to_dict()['Full Title']
    df.columns = ['GEOID'] + [name_dict[i.replace('_E', "_")] for i in df.columns[1:]]
    # The first estimate column is exposed as 'cnt' for downstream use.
    df = df.rename(columns={df.columns[1]: 'cnt'})

    table_path = 's3://fused-asset/infra/census_bg_us'
    if suffix:
        table_path += f'_{suffix}'
    gdf = table_to_tile(bbox, table_path, use_columns=['GEOID', 'geometry'], min_zoom=12)

    if len(gdf) == 0:
        print('No geometry is intersecting with the given bbox.')
        return gpd.GeoDataFrame({})

    # Tag every block group with the resolution-11 H3 cell of its centroid.
    gdf['h3_index'] = gdf['geometry'].apply(
        lambda x: h3.latlng_to_cell(x.centroid.y, x.centroid.x, 11)
    )
    return gdf.merge(df)

@fused.cache
def acs_5yr_meta(short=True):
    """Return ACS table-shell metadata for tables without geography limits.

    Two documents are downloaded from census.gov: the 2021 summary-file
    appendices (used to find tables with no geography restriction) and the
    2022 table shells (one row per table line).

    Args:
        short: When true, keep only the first row of each ``Table ID``;
            otherwise keep every row of every table.

    Returns:
        A DataFrame of table-shell rows with an added ``Full Title`` column
        (``Label | Title | Unique ID``), limited to the unrestricted tables.
    """
    import pandas as pd

    # Keep only tables with no geography restriction (per the original
    # author, the tables that have census block group data).
    tmp = pd.read_excel('https://www2.census.gov/programs-surveys/acs/summary_file/2021/sequence-based-SF/documentation/tech_docs/ACS_2021_SF_5YR_Appendices.xlsx')
    table_ids_cbgs = tmp[tmp['Geography Restrictions'].isna()]['Table Number']

    # Get the list of tables; in short mode keep only the first row of each.
    df_tables = pd.read_csv('https://www2.census.gov/programs-surveys/acs/summary_file/2022/table-based-SF/documentation/ACS20225YR_Table_Shells.txt', delimiter='|')
    if short:
        df_tables2 = df_tables.drop_duplicates('Table ID')
    else:
        df_tables2 = df_tables

    df_tables2['Full Title'] = df_tables2['Label'] + ' | ' + df_tables2['Title'] + ' | ' + df_tables2['Unique ID']
    df_tables2 = df_tables2[df_tables2['Table ID'].isin(table_ids_cbgs)]
    return df_tables2


@fused.cache
def acs_5yr_table(tid, year=2022):
    """Download one ACS 5-year table-based summary file as a DataFrame.

    Args:
        tid: ACS table identifier; it is lower-cased to form the file name.
        year: ACS vintage used in the download URL.

    Returns:
        The pipe-delimited ``.dat`` file parsed into a DataFrame.
    """
    import pandas as pd

    url = f'https://www2.census.gov/programs-surveys/acs/summary_file/{year}/table-based-SF/data/5YRData/acsdt5y{year}-{tid.lower()}.dat'
    table = pd.read_csv(url, sep='|')
    return table

def search_title(title):
    """Return the ID of the first ACS table whose title contains ``title``.

    The match is a case-insensitive substring search over the ``Title``
    column of ``acs_5yr_meta()``. The chosen table and the first 20
    candidates are printed so the caller can see which table was picked.

    Args:
        title: Text to look for in the table titles.

    Returns:
        The ``Table ID`` of the first matching title.

    Raises:
        IndexError: If no table title contains ``title``.
    """
    df_meta = acs_5yr_meta()
    # Search for the title in the list of tables.
    search_column = 'Title'
    meta_dict = df_meta[['Table ID', search_column]].set_index(search_column).to_dict()['Table ID']

    needle = title.lower()
    matches = [
        [table_id, name]
        for name, table_id in meta_dict.items()
        if needle in name.lower()
    ]
    if not matches:
        # Same exception type as the former bare `List[0]`, but with context.
        raise IndexError(f'No ACS table title contains {title!r}')

    print(f'Chosen: {matches[0]}\nfrom: {matches[:20]}')
    return matches[0][0]


import geopandas as gpd
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please run isort on every new .py file. This import statement should be at top of file.


@fused.cache
def get_census(bbox, census_variable='Total Pop', scale_factor=200, is_density=True, year=2022):
    """Load ACS census data for a tile and map each geometry to an H3 hexagon.

    Args:
        bbox: Tile GeoDataFrame; its ``z`` column selects the geometry detail
            level and its ``total_bounds`` define the query area.
        census_variable: Text used to pick the ACS table (see
            ``search_title``).
        scale_factor: Currently unused; kept for interface compatibility.
        is_density: Currently unused; kept for interface compatibility.
        year: ACS 5-year vintage forwarded to ``acs_5yr_bbox``.

    Returns:
        A GeoDataFrame whose active geometry is the ``hexagon`` column (the
        H3 cell containing each geometry's centroid), or ``None`` when no
        census geometry intersects the tile.
    """
    # Pick the geometry detail level that matches the tile zoom.
    zoom = bbox.z[0]
    if zoom > 12:
        suffix = None
    elif zoom > 10:
        suffix = 'simplify_0001'
    elif zoom > 8:
        suffix = 'simplify_001'
    elif zoom > 5:
        suffix = 'simplify_01'
    else:
        suffix = 'centroid'

    # Read the variables. The suffix is forwarded so the zoom-dependent
    # choice above takes effect; it used to be computed and then ignored.
    gdf = acs_5yr_bbox(
        bbox.total_bounds,
        census_variable=census_variable,
        suffix=suffix,
        year=year,
    )
    if len(gdf) == 0:
        return None

    # Shorten "Label | Title | Unique ID" column names to first + last part.
    def _shorten(name):
        if '|' not in name:
            return name
        parts = name.split('|')
        return str(parts[0]) + str(parts[-1])

    gdf.columns = gdf.columns.map(_shorten)

    if 'geometry' in gdf.columns:
        # The module-level helper defaults to resolution 10, the value that
        # was hard-coded in the nested copy this call replaces.
        gdf['hexagon'] = gdf['geometry'].apply(geometry_to_hexagon)
        hex_gdf = gpd.GeoDataFrame(gdf, geometry='hexagon', crs='epsg:4326')
    else:
        hex_gdf = gpd.GeoDataFrame(gdf)

    return hex_gdf