diff --git a/airflow/include/tasks/extract/astro_forum_docs.py b/airflow/include/tasks/extract/astro_forum_docs.py index 81044e04..26a08d49 100644 --- a/airflow/include/tasks/extract/astro_forum_docs.py +++ b/airflow/include/tasks/extract/astro_forum_docs.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +import time from datetime import datetime import pandas as pd @@ -49,6 +50,7 @@ def filter_cutoff_questions(questions_urls: list[str]) -> list[str]: for question_url in questions_urls: try: html_content = requests.get(question_url).content + time.sleep(1) except requests.RequestException as e: logger.error(f"Error fetching content for {question_url}: {e}") continue # Move on to the next iteration @@ -75,6 +77,7 @@ def get_cutoff_questions(forum_url: str) -> set[str]: base_url = f"{forum_url}?page=" all_valid_url = [] while True: + time.sleep(1) page_url = f"{base_url}{page_number}" logger.info(page_url) page_number = page_number + 1 diff --git a/airflow/include/tasks/extract/utils/html_utils.py b/airflow/include/tasks/extract/utils/html_utils.py index 2f30db8d..1caac139 100644 --- a/airflow/include/tasks/extract/utils/html_utils.py +++ b/airflow/include/tasks/extract/utils/html_utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +import time from urllib.parse import urljoin, urlparse import pandas as pd @@ -31,7 +32,7 @@ def fetch_page_content(url: str) -> str: param url: The url of a page """ try: - response = requests.get(url) + response = requests.get(url, headers={"User-agent": "Ask Astro"}) response.raise_for_status() # Raise an HTTPError for bad responses return response.content except requests.RequestException: @@ -179,6 +180,7 @@ def urls_to_dataframe( """ content_list = [] for url in urls: + time.sleep(1) data = process_url(url, doc_source, clean_tag, truncate_text) if data: content_list.append(data)