Skip to content

Commit

Permalink
feat(airflow): add astro cli bulk load
Browse files Browse the repository at this point in the history
  • Loading branch information
Lee-W committed Dec 5, 2023
1 parent 1aeb78a commit 236162a
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 3 deletions.
16 changes: 14 additions & 2 deletions airflow/dags/ingestion/ask-astro-load.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import pandas as pd
from include.tasks import split
from include.tasks.extract import airflow_docs, blogs, github, registry, stack_overflow
from include.tasks.extract import airflow_docs, astro_cli_docs, blogs, github, registry, stack_overflow
from include.tasks.extract.utils.weaviate.ask_astro_weaviate_hook import AskAstroWeaviateHook

from airflow.decorators import dag, task
Expand Down Expand Up @@ -177,6 +177,17 @@ def extract_airflow_docs():

return [df]

@task(trigger_rule="none_failed")
def extract_astro_cli_docs():
astro_cli_parquet_path = "include/data/astronomer/docs/astro-cli.parquet"
try:
df = pd.read_parquet(astro_cli_parquet_path)
except Exception:
df = astro_cli_docs.extract_astro_cli_docs()[0]
df.to_parquet(astro_cli_parquet_path)

return [df]

@task(trigger_rule="none_failed")
def extract_stack_overflow(tag: str, stackoverflow_cutoff_date: str = stackoverflow_cutoff_date):
try:
Expand Down Expand Up @@ -237,6 +248,7 @@ def extract_astro_blogs():
registry_dags_docs = extract_astro_registry_dags()
code_samples = extract_github_python.expand(source=code_samples_sources)
_airflow_docs = extract_airflow_docs()
_astro_cli_docs = extract_astro_cli_docs()

_get_schema = get_schema_and_process(schema_file="include/data/schema.json")
_check_schema = check_schema(class_objects=_get_schema)
Expand All @@ -251,7 +263,7 @@ def extract_astro_blogs():
registry_cells_docs,
]

html_tasks = [_airflow_docs]
html_tasks = [_airflow_docs, _astro_cli_docs]

python_code_tasks = [registry_dags_docs, code_samples]

Expand Down
2 changes: 1 addition & 1 deletion airflow/include/tasks/extract/astro_cli_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def extract_astro_cli_docs() -> list[pd.DataFrame]:
df["content"] = df["content"].apply(lambda x: re.sub("¶", "", x))

df["sha"] = df["content"].apply(generate_uuid5)
df["docSource"] = "apache/airflow/docs"
df["docSource"] = "astronomer/docs/astro-cli"
df.reset_index(drop=True, inplace=True)

# column order matters for uuid generation
Expand Down

0 comments on commit 236162a

Please sign in to comment.