ingest #7011
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: ingest | |
on: | |
# push: {} | |
schedule: | |
- cron: "45 */3 * * *" | |
workflow_dispatch: {} | |
concurrency: ingest | |
env: | |
INFURA_IPFS_PROJECT_ID: ${{ secrets.INFURA_IPFS_PROJECT_ID }} | |
INFURA_IPFS_PROJECT_SECRET: ${{ secrets.INFURA_IPFS_PROJECT_SECRET }} | |
PINATA_API_URL: https://api.pinata.cloud/psa | |
PINATA_ACCESS_TOKEN: ${{ secrets.PINATA_ACCESS_TOKEN }} | |
# TODO: We use public gateway as local fetches always timeout :( | |
# IPFS_HTTP_GATEWAY: http://localhost:8080 | |
IPFS_HTTP_GATEWAY: https://kamu.infura-ipfs.io | |
# IPFS_HTTP_GATEWAY: https://gateway.pinata.cloud | |
AWS_ROUTE53_HOSTED_ZONE_ID: Z25CPWZR4S9IHS | |
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} | |
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} | |
AWS_DEFAULT_REGION: us-west-2 | |
AWS_ROUTE53_TTL: 60 | |
AWS_S3_BASE_URL: s3://datasets.kamu.dev/odf/v2/contrib | |
# Required by dataset fetch steps | |
ETH_NODE_PROVIDER_URL: ${{ secrets.INFURA_NODE_URL }} | |
ALPHA_VANTAGE_API_KEY: ${{ secrets.ALPHA_VANTAGE_API_KEY }} | |
jobs: | |
update: | |
name: Update Dataset | |
# TODO: Run all matrix jobs to the end even if one fails | |
# continue-on-error: true | |
strategy: | |
fail-fast: false | |
# TODO: To decrease the chance of public gateway rate limiting | |
# max-parallel: 1 | |
matrix: | |
include: | |
# - dataset_name: com.coinpaprika.tickers.eth-usd | |
# ipns_id: k51qzi5uqu5dgxqmxnxblkbh0z56k4o7rpxani1u6s3z5u2oew1zerzbxhcs3j | |
# ipns_key_secret: COINPAPRIKA_TICKERS_ETH_USD_KEY | |
# dnslink_subdomain: com.coinpaprika.tickers.eth-usd.ipns.kamu.dev | |
- dataset_name: com.cryptocompare.ohlcv.eth-usd | |
ipns_id: k51qzi5uqu5di44ukms656i7lfpx3kzy1eotzaa514pterzr6d73ee3ndmbs4t | |
ipns_key_secret: CRYPTOCOMPARE_OHLCV_ETH_USD_KEY | |
dnslink_subdomain: com.cryptocompare.ohlcv.eth-usd.ipns.kamu.dev | |
- dataset_name: net.rocketpool.reth.mint-burn | |
ipns_id: k51qzi5uqu5dj6e8rprw175y8tqt55ipji4c1maqqtpxl11gpnpd3qb7luta6r | |
ipns_key_secret: NET_ROCKETPOOL_RETH_MINT_BURN_KEY | |
dnslink_subdomain: net.rocketpool.reth.mint-burn.ipns.kamu.dev | |
- dataset_name: co.alphavantage.tickers.daily.spy | |
ipns_id: k51qzi5uqu5djln0j1uan84f9btedeqfi9inj1exw8xj4dp9tpc2znbedkcazy | |
ipns_key_secret: CO_ALPHAVANTAGE_TICKERS_DAILY_SPY_KEY | |
dnslink_subdomain: co.alphavantage.tickers.daily.spy.ipns.kamu.dev | |
runs-on: ubuntu-latest | |
steps: | |
- uses: ibnesayeed/setup-ipfs@master | |
with: | |
ipfs_version: ^0.19 | |
run_daemon: true | |
- name: Install kamu | |
shell: bash | |
run: | | |
curl -s "https://get.kamu.dev" | sh | |
- name: Print info | |
shell: bash | |
run: | | |
echo "Work dir: $PWD" | |
echo "IPFS version:" | |
ipfs version | |
ipfs id | |
echo "Podman version:" | |
podman version | |
echo "AWS CLI version:" | |
aws --version | |
echo "Kamu version:" | |
kamu --version | |
- uses: actions/cache@v3 | |
with: | |
path: | | |
.kamu | |
# TODO: Currently cache action will not update the cache upon hit and there is no option to do so | |
# Below uses a workaround suggested here: https://github.com/actions/cache/issues/342#issuecomment-673371329 | |
key: cachegen-9-kamu-dataset-${{ matrix.dataset_name }}-${{ github.run_id }} | |
restore-keys: | | |
cachegen-9-kamu-dataset-${{ matrix.dataset_name }}- | |
# TODO: IPFS daemon often times out when searching for a CID :( | |
# We point kamu to Pinata's own IPFS gateway to increase chances of resolving it fast. | |
# This however puts us at risk of blowing past the rate limits on the public gateway. | |
- name: Configure | |
shell: bash | |
env: | |
IPNS_KEY: ${{ secrets[matrix.ipns_key_secret] }} | |
run: | | |
# Add IPNS key | |
echo $IPNS_KEY | base64 -d > ipns_key | |
ipfs key import ${{ matrix.dataset_name }} ipns_key | |
rm ipns_key | |
# Add pinning service | |
# ipfs pin remote service add svc "$PINATA_API_URL" "$PINATA_ACCESS_TOKEN" | |
# Create workspace and point kamu to IPFS gateway | |
kamu init || true | |
kamu list --output-format table -w | |
kamu config set protocol.ipfs.httpGateway "$IPFS_HTTP_GATEWAY" | |
# TODO: We use DNSLink instead of IPNS due to IPNS's current usability issues like: | |
# - frequent loss of writes | |
# - lack of read-after-write consistency | |
# - abysmal resolution times and frequent timeouts | |
# | |
# - name: Resolve dataset | |
# shell: bash | |
# run: | | |
# ipfs_cid=$(ipfs name resolve "/ipns/$IPNS_ID" | grep -oE "[^/]+$") | |
# echo "Current IPFS CID: $ipfs_cid" | |
- name: Resolve dataset | |
id: resolve-dataset | |
shell: bash | |
run: | | |
ipfs_cid=$( \ | |
aws route53 list-resource-record-sets \ | |
--hosted-zone-id $AWS_ROUTE53_HOSTED_ZONE_ID \ | |
| jq -r ".ResourceRecordSets[] | select(.Name == \"_dnslink.${{ matrix.dnslink_subdomain }}.\") | .ResourceRecords[0].Value" \ | |
| grep -oE '[^/]+"$' \ | |
| grep -oE '^[^"]+' | |
) | |
echo "Current IPFS CID (from DNSLink): $ipfs_cid" | |
echo "CID=$ipfs_cid" >> $GITHUB_OUTPUT | |
# # TODO: This always times out! | |
# - name: Pin dataset in local IPFS node | |
# env: | |
# CURRENT_CID: ${{ steps.resolve-dataset.outputs.CID }} | |
# shell: bash | |
# run: | | |
# echo "Waiting for peers" | |
# sleep 60 | |
# | |
# # echo "Searching providers" | |
# # timeout 120 ipfs dht findprovs "/ipfs/$CURRENT_CID" | |
# | |
# ipfs_peers=$(ipfs swarm peers | wc -l) | |
# echo "Pinning CID $CURRENT_CID (currently connected to $ipfs_peers peers)" | |
# timeout 300 ipfs pin add "/ipfs/$CURRENT_CID" | |
# # TODO: Repeatedly downloading datasets from IPFS gateways causes a lot of strain once the number of blocks gets high | |
# # so currently we are basing all update on S3 and only pin to IPFS as the last step | |
# - name: Materialize dataset from IPFS | |
# env: | |
# CURRENT_CID: ${{ steps.resolve-dataset.outputs.CID }} | |
# shell: bash | |
# run: | | |
# kamu -v pull "ipfs://$CURRENT_CID" --as ${{ matrix.dataset_name }} --no-alias | |
# kamu list --output-format table -w | |
- name: Sync dataset from S3 | |
shell: bash | |
run: | | |
kamu -v pull "$AWS_S3_BASE_URL/${{ matrix.dataset_name }}" --as ${{ matrix.dataset_name }} --no-alias | |
kamu list --output-format table -w | |
- name: Ingest new data | |
shell: bash | |
run: | | |
kamu -v pull --fetch-uncacheable ${{ matrix.dataset_name }} | |
kamu list --output-format table -w | |
- name: Publish dataset to S3 | |
shell: bash | |
run: | | |
kamu push ${{ matrix.dataset_name }} --to "$AWS_S3_BASE_URL/${{ matrix.dataset_name }}" --no-alias | |
# TODO: We are not using `kamu push` here to avoid resolving and publishing into IPNS. | |
# IPNS has proven to be unreliable and cause frequent timeouts, so we limit interactions to IPFS and then do | |
# best-effort attempt to publish to IPNS. | |
- name: Publish dataset to IPFS | |
shell: bash | |
env: | |
OLD_CID: ${{ steps.resolve-dataset.outputs.CID }} | |
PIN_TRIES: "10" | |
run: | | |
# Add dataset to IPFS | |
ipfs_cid=$(kamu -v system ipfs add ${{ matrix.dataset_name }}) | |
echo "New IPFS CID: $ipfs_cid" | |
if [ "$OLD_CID" == "$ipfs_cid" ]; then | |
echo "CID didn't change - skipping publishing" | |
else | |
# Pin new CID to replicate data to the pinning service | |
for i in $(seq 1 $PIN_TRIES); do | |
echo "Pinning CID (attempt $i)" | |
if curl --fail -sS -X POST -u "$INFURA_IPFS_PROJECT_ID:$INFURA_IPFS_PROJECT_SECRET" "https://ipfs.infura.io:5001/api/v0/pin/add?arg=$ipfs_cid"; then | |
echo "Successfully pinned" | |
break | |
else | |
echo "Pinning service returned an error: $response" | |
# TODO: Remove this debug stuff when pinning is reliable | |
peers_num=$(ipfs swarm peers | wc -l) | |
echo "IPFS is connected to $peers_num peers" | |
echo "IPFS addresses:" | |
ipfs id | |
if curl -s --fail "https://dweb.link/ipfs/$ipfs_cid" > /dev/null; then | |
echo "CID is RESOLVABLE via public gateway" | |
else | |
echo "CID is NOT resolvable via public gateway" | |
fi | |
if [ $i == $PIN_TRIES ]; then | |
echo "Failed to pin CID after $PIN_TRIES attempts" | |
# TODO: Pinning fails so often that we have to ignore this for now | |
# not to obscure ingest errors. | |
exit 0 | |
fi | |
fi | |
done | |
# timeout 60 ipfs pin remote add --service svc --name ${{ matrix.dataset_name }} "/ipfs/$ipfs_cid" | |
# Update DNSLink entry | |
echo "{ | |
\"Comment\": \"GHAction update of DNSLink record\", | |
\"Changes\": [ | |
{ | |
\"Action\": \"UPSERT\", | |
\"ResourceRecordSet\": { | |
\"Name\": \"_dnslink.${{ matrix.dnslink_subdomain }}.\", | |
\"Type\": \"TXT\", | |
\"TTL\": $AWS_ROUTE53_TTL, | |
\"ResourceRecords\": [{ | |
\"Value\": \"\\\"dnslink=/ipfs/$ipfs_cid\\\"\" | |
}] | |
} | |
} | |
] | |
}" > dns-upsert.json | |
echo "Updating DNSLink entry with: $(cat dns-upsert.json)" | |
aws route53 change-resource-record-sets --hosted-zone-id $AWS_ROUTE53_HOSTED_ZONE_ID --change-batch file://dns-upsert.json | |
# TODO: IPNS is not ready for use | |
# # Try publishing to IPNS (best effort) | |
# echo "Attempting to publish to IPNS" | |
# timeout 60 ipfs name publish --resolve=false --key=${{ matrix.dataset_name }} /ipfs/$ipfs_cid || true | |
fi | |
# - name: Ingest & push loop | |
# shell: bash | |
# run: | | |
# stop_time=$((SECONDS + LOOP_TOTAL_SEC)) | |
# while [ $SECONDS -lt $stop_time ]; do | |
# # Chill | |
# echo "Sleeping for $LOOP_SLEEP_SEC seconds..." | |
# sleep $LOOP_SLEEP_SEC | |
# done |