[Quick Fix] Add wait time between web scraping requests #269

Merged 1 commit on Jan 18, 2024
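The change itself is small: it inserts a fixed pause between consecutive HTTP requests so scraped pages are not fetched in a tight loop, and it sends an explicit User-agent header. A minimal sketch of that pattern, assuming a one-second delay; the helper name polite_get is illustrative and not part of the PR:

import time

import requests


def polite_get(url: str, delay: float = 1.0) -> bytes:
    # Fetch a page, then pause so back-to-back requests are spaced out.
    # The PR itself simply places time.sleep(1) next to each requests.get
    # call rather than introducing a helper like this one.
    response = requests.get(url, headers={"User-agent": "Ask Astro"})
    response.raise_for_status()
    time.sleep(delay)
    return response.content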
airflow/include/tasks/extract/astro_forum_docs.py (3 changes: 3 additions, 0 deletions)
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+import time
 from datetime import datetime

 import pandas as pd
@@ -49,6 +50,7 @@ def filter_cutoff_questions(questions_urls: list[str]) -> list[str]:
     for question_url in questions_urls:
         try:
             html_content = requests.get(question_url).content
+            time.sleep(1)
         except requests.RequestException as e:
             logger.error(f"Error fetching content for {question_url}: {e}")
             continue  # Move on to the next iteration
@@ -75,6 +77,7 @@ def get_cutoff_questions(forum_url: str) -> set[str]:
     base_url = f"{forum_url}?page="
     all_valid_url = []
     while True:
+        time.sleep(1)
         page_url = f"{base_url}{page_number}"
         logger.info(page_url)
         page_number = page_number + 1
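In get_cutoff_questions the pause sits at the top of the pagination loop, so every forum page request is spaced out. The diff is truncated here and does not show the loop's exit condition; the sketch below assumes the crawl stops on an empty or failing page, which is not copied from the file:

import time

import requests


def iterate_forum_pages(forum_url: str, max_pages: int = 100) -> list[str]:
    # Simplified stand-in for the paginated crawl: walk ?page=1, ?page=2, ...,
    # sleeping one second before each request, as in the merged change.
    # max_pages and the stop condition are assumptions for this sketch.
    base_url = f"{forum_url}?page="
    pages: list[str] = []
    for page_number in range(1, max_pages + 1):
        time.sleep(1)
        page_url = f"{base_url}{page_number}"
        response = requests.get(page_url, headers={"User-agent": "Ask Astro"})
        if response.status_code != 200 or not response.text:
            break
        pages.append(response.text)
    return pages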
airflow/include/tasks/extract/utils/html_utils.py (4 changes: 3 additions, 1 deletion)
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import logging
+import time
 from urllib.parse import urljoin, urlparse

 import pandas as pd
@@ -31,7 +32,7 @@ def fetch_page_content(url: str) -> str:
     param url: The url of a page
     """
     try:
-        response = requests.get(url)
+        response = requests.get(url, headers={"User-agent": "Ask Astro"})
         response.raise_for_status()  # Raise an HTTPError for bad responses
         return response.content
     except requests.RequestException:
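Passing the header on every call is the smallest possible change. An equivalent option, shown only as a design note and not part of this PR, is a requests.Session that attaches the same User-agent to every request made through it:

import requests

# The session carries the header for all requests made through it, so
# individual call sites do not have to repeat it.
session = requests.Session()
session.headers.update({"User-agent": "Ask Astro"})


def fetch_page_content(url: str) -> bytes:
    # Variant of the function above that relies on the shared session;
    # error handling is omitted here for brevity.
    response = session.get(url)
    response.raise_for_status()  # Raise an HTTPError for bad responses
    return response.content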
@@ -179,6 +180,7 @@ def urls_to_dataframe(
     """
     content_list = []
     for url in urls:
+        time.sleep(1)
         data = process_url(url, doc_source, clean_tag, truncate_text)
         if data:
             content_list.append(data)
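With the added sleep, urls_to_dataframe spends at least one second per URL before handing it to process_url. A condensed sketch of that loop; the fetch callable stands in for process_url(url, doc_source, clean_tag, truncate_text), and the record shape is assumed:

from __future__ import annotations

import time
from typing import Callable

import pandas as pd


def throttled_rows(urls: list[str], fetch: Callable[[str], dict | None]) -> pd.DataFrame:
    # Mirror of the loop in the diff: pause, process the URL, keep non-empty
    # records, then assemble everything into a single DataFrame.
    content_list = []
    for url in urls:
        time.sleep(1)
        data = fetch(url)
        if data:
            content_list.append(data)
    return pd.DataFrame(content_list)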