Skip to content

Commit 2f9a672

Browse files
committed
Add script and documentation for california chip creation
Closes #3
1 parent b17d5a5 commit 2f9a672

File tree

2 files changed

+188
-0
lines changed

2 files changed

+188
-0
lines changed

Diff for: scripts/california_tiles.md

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# California chips
2+
This file documents how the Worldcover chips were created for the earth-text
3+
model.
4+
5+
This script creates 111'920 chips with a size of `256x256` pixels. Each
6+
chip has 6 bands: `[red, green, blue, nir, vv, vh]`, all stored as 16-bit
7+
unsigned integer.
8+
9+
The steps for creating the tiles were as follows:
10+
11+
1. Create file with worldcover grid cells over california
12+
2. Download the worldcover data for those cells for rgbnir and vvvh
13+
3. Merge the cells into single bigtiff files
14+
4. Use a python script to create chips with the rgb, nir, and vvvh bands in one file
15+
5. Create index file with bounding boxes for all these chips
16+
17+
18+
## Normalization statistics for the Sentinel-1 bands
19+
20+
The `vv` and `vh` bands are in uint16, which is different from the data that the
21+
Clay model has been trained with. The normalization parameters have to be adapted.
22+
The statistics of the data are as follows.
23+
24+
### vv band statistics
25+
26+
```
27+
Band 1 Block=256x256 Type=UInt16, ColorInterp=Gray
28+
Min=0.000 Max=56073.000
29+
Minimum=4659.000, Maximum=58757.000, Mean=34369.390, StdDev=3572.343
30+
NoData Value=0
31+
Metadata:
32+
STATISTICS_MAXIMUM=58757
33+
STATISTICS_MEAN=34369.390015449
34+
STATISTICS_MINIMUM=4659
35+
STATISTICS_STDDEV=3572.3427052005
36+
STATISTICS_VALID_PERCENT=46.72
37+
```
38+
39+
### vh band statistics
40+
41+
```
42+
Band 2 Block=256x256 Type=UInt16, ColorInterp=Undefined
43+
Min=0.000 Max=47888.000
44+
Minimum=4948.000, Maximum=58353.000, Mean=26265.214, StdDev=5056.598
45+
NoData Value=0
46+
Metadata:
47+
STATISTICS_MAXIMUM=58353
48+
STATISTICS_MEAN=26265.213704928
49+
STATISTICS_MINIMUM=4948
50+
STATISTICS_STDDEV=5056.5981159613
51+
STATISTICS_VALID_PERCENT=46.72
52+
```
53+
54+
## Data location
55+
56+
### Worldcover grid cells layer
57+
58+
```
59+
s3://clay-california-worldcover-rgbnir-vvvh-chips/esa_wordlcover_grid_california.fgb
60+
```
61+
62+
### Index file for all the chips
63+
64+
```
65+
s3://clay-california-worldcover-rgbnir-vvvh-chips/california-worldcover-chips.fgb
66+
```
67+
68+
### Chips
69+
70+
Chips are stored under the `chips` common prefix on S3
71+
72+
```
73+
s3://clay-california-worldcover-rgbnir-vvvh-chips/chips/
74+
```

Diff for: scripts/california_tiles.py

+114
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
from pathlib import Path
2+
3+
import boto3
4+
import geopandas as gpd
5+
import numpy
6+
import pandas as pd
7+
import rasterio
8+
from rasterio.windows import Window, bounds, transform
9+
from shapely import box
10+
11+
wd = Path("~/Desktop/california").expanduser()
12+
13+
# Download worldcover layers over california
14+
# Grid layer with one row per worldcover cell over california. Each row holds
# the s3:// URIs of the source tiles in the columns referenced below.
grid = gpd.read_file(
    "s3://clay-california-worldcover-rgbnir-vvvh-chips/esa_wordlcover_grid_california.fgb"
)
s3 = boto3.client("s3")

# One entry per product to download:
# (source bucket, grid column holding the tile URI, local sub-directory of wd).
downloads = [
    ("esa-worldcover-s2", "s2_rgbnir_2021", "rgbnir"),
    ("esa-worldcover-s1", "s1_vvvhratio_2021", "vvvhratio"),
]

for bucket, column, subdir in downloads:
    # The target directory has to exist before files are downloaded into it.
    (wd / subdir).mkdir(parents=True, exist_ok=True)
    for _, tile in grid.iterrows():
        uri = tile[column]
        # The object key is the URI without its "s3://<bucket>/" prefix.
        key = uri.split(f"s3://{bucket}/")[1]
        # Keep the original file name of the tile for the local copy.
        target = f"{wd}/{subdir}/{uri.split('/')[-1]}"
        print(bucket, key, target)
        s3.download_file(bucket, key, target)
43+
44+
# Merge all into very large files
45+
"""
46+
gdal_merge.py -co BIGTIFF=YES -co BLOCKXSIZE=256 -co BLOCKYSIZE=256 -co TILED=YES -co COMPRESS=DEFLATE -o ${wd}/california_rgbnir_worldcover.tif ${wd}/rgbnir/*.tif
47+
gdal_merge.py -co BIGTIFF=YES -co BLOCKXSIZE=256 -co BLOCKYSIZE=256 -co TILED=YES -co COMPRESS=DEFLATE -o ${wd}/california_vvvhratio_worldcover.tif ${wd}/vvvhratio/*.tif
48+
"""
49+
50+
# Create chips from merged files
51+
RGB = wd / "california_rgbnir_worldcover.tif"
VVVH = wd / "california_vvvhratio_worldcover.tif"
TILE_SIZE = 256
NO_DATA = 0
# Chips with more nodata pixels than this (10% of the chip) are skipped.
MAX_NO_DATA_PIXELS = (TILE_SIZE * TILE_SIZE) / 10
count = 0
# Bounding box and pixel offsets of every chip written, used for the index file.
boxes = []
cols = []
rows = []

# The output directory has to exist before chips can be written into it.
(wd / "chips").mkdir(parents=True, exist_ok=True)

with rasterio.open(RGB) as rgbnir, rasterio.open(VVVH) as vvvh:
    # Chips reuse the metadata of the merged rgbnir file, except for the band
    # count (4 rgbnir bands + 2 vvvh bands), the size and the compression.
    meta = rgbnir.meta.copy()
    meta["count"] = 6
    meta["width"] = TILE_SIZE
    meta["height"] = TILE_SIZE
    meta["compress"] = "deflate"
    for i in range(0, rgbnir.width, TILE_SIZE):
        for j in range(0, rgbnir.height, TILE_SIZE):
            dst_path = wd / f"chips/worldcover_california_chip_{i}_{j}.tif"
            # NOTE: existing chips are deliberately not skipped here. Skipping
            # them would also skip the bounding box tracking further down and
            # leave them out of the index file.
            win = Window(i, j, TILE_SIZE, TILE_SIZE)
            rgbnir_chip = rgbnir.read([1, 2, 3, 4], window=win)
            # Skip empty or partial chips, i.e. windows for which the read
            # returned fewer than TILE_SIZE rows or columns.
            if rgbnir_chip.shape[1:] != (TILE_SIZE, TILE_SIZE):
                continue
            # Filter at 10% nodata, evaluated on the third band only.
            if numpy.count_nonzero(rgbnir_chip[2] == NO_DATA) > MAX_NO_DATA_PIXELS:
                continue
            # The same pixel window is used for both files.
            # NOTE(review): this assumes both merged files share the same
            # grid and extent - confirm.
            vvvh_chip = vvvh.read([1, 2], window=win)
            count += 1
            # Stack along the band axis: 4 rgbnir bands followed by 2 vvvh bands.
            data = numpy.vstack([rgbnir_chip, vvvh_chip])
            meta["transform"] = transform(win, rgbnir.transform)
            with rasterio.open(dst_path, "w", **meta) as dst:
                dst.write(data)
            # Track bounding boxes
            boxes.append(box(*bounds(win, rgbnir.transform)))
            cols.append(i)
            rows.append(j)
        print(f"Done with column {i} / {rgbnir.width}")
93+
94+
95+
# Index of all chips: the pixel offsets of each chip in the merged file as
# attributes and its bounding box as geometry.
chip_offsets = pd.DataFrame({"col": cols, "row": rows})
chips = gpd.GeoDataFrame(chip_offsets, geometry=boxes, crs="EPSG:4326")

# Store the index file next to the chips on S3.
chips.to_file(
    "s3://clay-california-worldcover-rgbnir-vvvh-chips/california-worldcover-chips.fgb"
)
109+
110+
111+
# Upload chips to s3
112+
"""
113+
s5cmd sync ${wd}/chips/ s3://clay-california-worldcover-rgbnir-vvvh-chips/chips/
114+
"""

0 commit comments

Comments
 (0)