19 commits
5062c30  replace scraper logic with github .md files parsing  (rudransh-shrivastava, Sep 14, 2025)
ca2e0e1  Update code(add tests and coderabbit suggestions)  (rudransh-shrivastava, Sep 15, 2025)
044c95a  Update code  (rudransh-shrivastava, Sep 15, 2025)
c0804c1  reduce cognitive complexity  (rudransh-shrivastava, Sep 15, 2025)
277c16e  Update text  (rudransh-shrivastava, Sep 15, 2025)
158c76c  Merge branch 'main' into feature/migrate-scraper-completely  (rudransh-shrivastava, Sep 15, 2025)
e5559d1  Merge branch 'main' into feature/migrate-scraper-completely  (kasya, Sep 16, 2025)
0f55b4b  remove *scrape* files and move logic to update-owasp-organization  (rudransh-shrivastava, Sep 16, 2025)
dd3dff2  update code to fix tests  (rudransh-shrivastava, Sep 16, 2025)
6b4e5ed  add tests for github_update_owasp_organization  (rudransh-shrivastava, Sep 17, 2025)
e51acd7  update code  (rudransh-shrivastava, Sep 17, 2025)
de99125  Merge branch 'main' into feature/migrate-scraper-completely  (rudransh-shrivastava, Sep 17, 2025)
2059679  Merge branch 'main' into feature/migrate-scraper-completely  (rudransh-shrivastava, Sep 19, 2025)
77bceb0  Merge branch 'main' into feature/migrate-scraper-completely  (rudransh-shrivastava, Sep 20, 2025)
1596833  refactor markdown sync logic  (rudransh-shrivastava, Sep 20, 2025)
71ee9b5  update tests  (rudransh-shrivastava, Sep 20, 2025)
f4f8677  add tests for _verify_url method  (rudransh-shrivastava, Sep 20, 2025)
183866d  add tests for _verify_url Project method  (rudransh-shrivastava, Sep 20, 2025)
e5d6f54  Merge branch 'main' into feature/migrate-scraper-completely  (rudransh-shrivastava, Sep 24, 2025)
backend/apps/owasp/management/commands/owasp_scrape_chapters.py (12 additions, 4 deletions)

@@ -4,10 +4,11 @@
 import time
 
 from django.core.management.base import BaseCommand
+from github.GithubException import GithubException, UnknownObjectException
 
+from apps.github.auth import get_github_client
 from apps.github.utils import normalize_url
 from apps.owasp.models.chapter import Chapter
-from apps.owasp.scraper import OwaspScraper
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -26,6 +27,8 @@ def add_arguments(self, parser) -> None:
 
     def handle(self, *args, **options) -> None:
         """Handle the command execution."""
+        gh = get_github_client()
+
         active_chapters = Chapter.active_chapters.order_by("-created_at")
         active_chapters_count = active_chapters.count()
         offset = options["offset"]
@@ -34,10 +37,15 @@ def handle(self, *args, **options) -> None:
             prefix = f"{idx + offset + 1} of {active_chapters_count}"
             print(f"{prefix:<10} {chapter.owasp_url}")
 
-            scraper = OwaspScraper(chapter.owasp_url)
-            if scraper.page_tree is None:
+            try:
+                gh.get_repo(f"owasp/{chapter.key}")
+            except UnknownObjectException:
                 chapter.deactivate()
                 continue
+            except GithubException as e:
+                logger.warning("GitHub API error for %s: %s", chapter.key, e)
+                time.sleep(1)
+                continue
 
             chapter.leaders_raw = chapter.get_leaders()
             if leaders_emails := chapter.get_leaders_emails():
@@ -62,7 +70,7 @@ def handle(self, *args, **options) -> None:
             invalid_urls = set()
             related_urls = set()
             for scraped_url in scraped_urls:
-                verified_url = scraper.verify_url(scraped_url)
+                verified_url = chapter.verify_url(scraped_url)
                 if not verified_url:
                     invalid_urls.add(scraped_url)
                     continue
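
The same repository-existence check replaces the page scrape in the committees and projects commands below. As a standalone sketch of the pattern (assuming a PyGithub client; repo_exists is a hypothetical helper, not part of this PR):

    import logging
    import time

    from github import Github
    from github.GithubException import GithubException, UnknownObjectException

    logger = logging.getLogger(__name__)

    def repo_exists(gh: Github, key: str) -> bool | None:
        """True if owasp/<key> exists, False if it is gone, None on an API error."""
        try:
            gh.get_repo(f"owasp/{key}")
        except UnknownObjectException:
            # 404: the repository no longer exists, so the entity gets deactivated.
            return False
        except GithubException as e:
            # Rate limiting or a transient API failure: log, back off briefly, move on.
            logger.warning("GitHub API error for %s: %s", key, e)
            time.sleep(1)
            return None
        return True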
backend/apps/owasp/management/commands/owasp_scrape_committees.py

@@ -4,10 +4,11 @@
 import time
 
 from django.core.management.base import BaseCommand
+from github.GithubException import GithubException, UnknownObjectException
 
+from apps.github.auth import get_github_client
 from apps.github.utils import normalize_url
 from apps.owasp.models.committee import Committee
-from apps.owasp.scraper import OwaspScraper
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -26,6 +27,8 @@ def add_arguments(self, parser) -> None:
 
     def handle(self, *args, **options) -> None:
         """Handle the command execution."""
+        gh = get_github_client()
+
         active_committees = Committee.active_committees.order_by("-created_at")
         active_committees_count = active_committees.count()
         offset = options["offset"]
@@ -34,10 +37,15 @@ def handle(self, *args, **options) -> None:
             prefix = f"{idx + offset + 1} of {active_committees_count}"
             print(f"{prefix:<10} {committee.owasp_url}")
 
-            scraper = OwaspScraper(committee.owasp_url)
-            if scraper.page_tree is None:
+            try:
+                gh.get_repo(f"owasp/{committee.key}")
+            except UnknownObjectException:
                 committee.deactivate()
                 continue
+            except GithubException as e:
+                logger.warning("GitHub API error for %s: %s", committee.key, e)
+                time.sleep(1)
+                continue
 
             committee.leaders_raw = committee.get_leaders()
             if leaders_emails := committee.get_leaders_emails():
@@ -62,7 +70,7 @@ def handle(self, *args, **options) -> None:
             invalid_urls = set()
             related_urls = set()
             for scraped_url in scraped_urls:
-                verified_url = scraper.verify_url(scraped_url)
+                verified_url = committee.verify_url(scraped_url)
                 if not verified_url:
                     invalid_urls.add(scraped_url)
                     continue
backend/apps/owasp/management/commands/owasp_scrape_projects.py

@@ -4,13 +4,12 @@
 import time
 
 from django.core.management.base import BaseCommand
-from github.GithubException import UnknownObjectException
+from github.GithubException import GithubException, UnknownObjectException
 
 from apps.github.auth import get_github_client
 from apps.github.constants import GITHUB_USER_RE
 from apps.github.utils import normalize_url
 from apps.owasp.models.project import Project
-from apps.owasp.scraper import OwaspScraper
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -46,10 +45,15 @@ def handle(self, *args, **options) -> None:
             prefix = f"{idx + offset + 1} of {active_projects_count}"
             print(f"{prefix:<10} {project.owasp_url}")
 
-            scraper = OwaspScraper(project.owasp_url)
-            if scraper.page_tree is None:
+            try:
+                gh.get_repo(f"owasp/{project.key}")
+            except UnknownObjectException:
                 project.deactivate()
                 continue
+            except GithubException as e:
+                logger.warning("GitHub API error for %s: %s", project.key, e)
+                time.sleep(1)
+                continue
 
             project.audience = project.get_audience()
             project.leaders_raw = project.get_leaders()
@@ -69,7 +73,7 @@ def handle(self, *args, **options) -> None:
             invalid_urls: set[str] = set()
             related_urls: set[str] = set()
             for scraped_url in scraped_urls:
-                verified_url = scraper.verify_url(scraped_url)
+                verified_url = project.verify_url(scraped_url)
                 if not verified_url:
                     invalid_urls.add(scraped_url)
                     continue
backend/apps/owasp/models/common.py (34 additions, 0 deletions)

@@ -5,8 +5,10 @@
 import itertools
 import logging
 import re
+from http import HTTPStatus
 from urllib.parse import urlparse
 
+import requests
 import yaml
 from django.contrib.contenttypes.models import ContentType
 from django.db import models
@@ -345,3 +347,35 @@ def sync_leaders(self, leaders_emails):
 
         if leaders:
             BulkSaveModel.bulk_save(EntityMember, leaders)
+
+    def verify_url(self, url):
+        """Verify URL."""
+        location = urlparse(url).netloc.lower()
+        if not location:
+            return None
+
+        if location.endswith(("linkedin.com", "slack.com", "youtube.com")):
+            return url
+
+        try:
+            # Check for redirects.
+            response = requests.get(url, allow_redirects=False, timeout=(5, 10))
+        except requests.exceptions.RequestException:
+            logger.exception("Request failed", extra={"url": url})
+            return None
+
+        if response.status_code == HTTPStatus.OK:
+            return url
+
+        if response.status_code in {
+            HTTPStatus.MOVED_PERMANENTLY,  # 301
+            HTTPStatus.FOUND,  # 302
+            HTTPStatus.SEE_OTHER,  # 303
+            HTTPStatus.TEMPORARY_REDIRECT,  # 307
+            HTTPStatus.PERMANENT_REDIRECT,  # 308
+        }:
+            return self.verify_url(response.headers["Location"])
+
+        logger.warning("Couldn't verify URL %s", url)
+
+        return None
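
Because verify_url issues requests with allow_redirects=False, each hop of a redirect chain is validated explicitly, and the verified redirect target (rather than the original URL) is what gets returned. A hypothetical usage sketch (the Chapter import path matches this PR; the URLs are illustrative only):

    from apps.owasp.models.chapter import Chapter

    chapter = Chapter.active_chapters.first()

    chapter.verify_url("https://youtube.com/owasp")   # allowlisted domain: returned without any request
    chapter.verify_url("mailto:info@owasp.org")       # no netloc: returns None
    chapter.verify_url("https://example.com/page")    # GET without redirects; 200 returns the URL,
                                                      # 301/302/303/307/308 recurses on the Location header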
backend/apps/owasp/scraper.py (0 additions, 127 deletions)

This file was deleted.
