Skip to content

Commit 5062c30

Browse files
replace scraper logic with github .md files parsing
1 parent 3069dc0 commit 5062c30

File tree

10 files changed

+162
-388
lines changed

10 files changed

+162
-388
lines changed

backend/apps/owasp/management/commands/owasp_scrape_chapters.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
import time
55

66
from django.core.management.base import BaseCommand
7+
from github.GithubException import UnknownObjectException
78

9+
from apps.github.auth import get_github_client
810
from apps.github.utils import normalize_url
911
from apps.owasp.models.chapter import Chapter
10-
from apps.owasp.scraper import OwaspScraper
1112

1213
logger: logging.Logger = logging.getLogger(__name__)
1314

@@ -26,6 +27,8 @@ def add_arguments(self, parser) -> None:
2627

2728
def handle(self, *args, **options) -> None:
2829
"""Handle the command execution."""
30+
gh = get_github_client()
31+
2932
active_chapters = Chapter.active_chapters.order_by("-created_at")
3033
active_chapters_count = active_chapters.count()
3134
offset = options["offset"]
@@ -34,8 +37,9 @@ def handle(self, *args, **options) -> None:
3437
prefix = f"{idx + offset + 1} of {active_chapters_count}"
3538
print(f"{prefix:<10} {chapter.owasp_url}")
3639

37-
scraper = OwaspScraper(chapter.owasp_url)
38-
if scraper.page_tree is None:
40+
try:
41+
gh.get_repo(f"owasp/{chapter.key}")
42+
except UnknownObjectException:
3943
chapter.deactivate()
4044
continue
4145

@@ -62,7 +66,7 @@ def handle(self, *args, **options) -> None:
6266
invalid_urls = set()
6367
related_urls = set()
6468
for scraped_url in scraped_urls:
65-
verified_url = scraper.verify_url(scraped_url)
69+
verified_url = chapter.verify_url(scraped_url)
6670
if not verified_url:
6771
invalid_urls.add(scraped_url)
6872
continue

backend/apps/owasp/management/commands/owasp_scrape_committees.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
import time
55

66
from django.core.management.base import BaseCommand
7+
from github.GithubException import UnknownObjectException
78

9+
from apps.github.auth import get_github_client
810
from apps.github.utils import normalize_url
911
from apps.owasp.models.committee import Committee
10-
from apps.owasp.scraper import OwaspScraper
1112

1213
logger: logging.Logger = logging.getLogger(__name__)
1314

@@ -26,6 +27,8 @@ def add_arguments(self, parser) -> None:
2627

2728
def handle(self, *args, **options) -> None:
2829
"""Handle the command execution."""
30+
gh = get_github_client()
31+
2932
active_committees = Committee.active_committees.order_by("-created_at")
3033
active_committees_count = active_committees.count()
3134
offset = options["offset"]
@@ -34,8 +37,9 @@ def handle(self, *args, **options) -> None:
3437
prefix = f"{idx + offset + 1} of {active_committees_count}"
3538
print(f"{prefix:<10} {committee.owasp_url}")
3639

37-
scraper = OwaspScraper(committee.owasp_url)
38-
if scraper.page_tree is None:
40+
try:
41+
gh.get_repo(f"owasp/{committee.key}")
42+
except UnknownObjectException:
3943
committee.deactivate()
4044
continue
4145

@@ -62,7 +66,7 @@ def handle(self, *args, **options) -> None:
6266
invalid_urls = set()
6367
related_urls = set()
6468
for scraped_url in scraped_urls:
65-
verified_url = scraper.verify_url(scraped_url)
69+
verified_url = committee.verify_url(scraped_url)
6670
if not verified_url:
6771
invalid_urls.add(scraped_url)
6872
continue

backend/apps/owasp/management/commands/owasp_scrape_projects.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from apps.github.constants import GITHUB_USER_RE
1111
from apps.github.utils import normalize_url
1212
from apps.owasp.models.project import Project
13-
from apps.owasp.scraper import OwaspScraper
1413

1514
logger: logging.Logger = logging.getLogger(__name__)
1615

@@ -46,8 +45,9 @@ def handle(self, *args, **options) -> None:
4645
prefix = f"{idx + offset + 1} of {active_projects_count}"
4746
print(f"{prefix:<10} {project.owasp_url}")
4847

49-
scraper = OwaspScraper(project.owasp_url)
50-
if scraper.page_tree is None:
48+
try:
49+
gh.get_repo(f"owasp/{project.key}")
50+
except UnknownObjectException:
5151
project.deactivate()
5252
continue
5353

@@ -69,7 +69,7 @@ def handle(self, *args, **options) -> None:
6969
invalid_urls: set[str] = set()
7070
related_urls: set[str] = set()
7171
for scraped_url in scraped_urls:
72-
verified_url = scraper.verify_url(scraped_url)
72+
verified_url = project.verify_url(scraped_url)
7373
if not verified_url:
7474
invalid_urls.add(scraped_url)
7575
continue

backend/apps/owasp/models/common.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
import itertools
66
import logging
77
import re
8+
from http import HTTPStatus
89
from urllib.parse import urlparse
910

11+
import requests
1012
import yaml
1113
from django.contrib.contenttypes.models import ContentType
1214
from django.db import models
@@ -25,6 +27,8 @@
2527

2628
logger = logging.getLogger(__name__)
2729

30+
TIMEOUT = 5, 10
31+
2832

2933
class RepositoryBasedEntityModel(models.Model):
3034
"""Repository based entity model."""
@@ -345,3 +349,35 @@ def sync_leaders(self, leaders_emails):
345349

346350
if leaders:
347351
BulkSaveModel.bulk_save(EntityMember, leaders)
352+
353+
def verify_url(self, url, max_redirects=5):
    """Verify a URL, following redirects manually up to a fixed limit.

    Args:
        url: The URL to verify.
        max_redirects: Maximum number of redirect hops to follow before
            giving up. Guards against redirect loops.

    Returns:
        The URL that finally answered with HTTP 200 (or that belongs to one
        of the domains accepted without a network check), or None when the
        URL is malformed, unreachable, or could not be verified.

    """
    # Local import: needed to resolve relative ``Location`` headers.
    from urllib.parse import urljoin

    # Hosts accepted without issuing a request.
    unchecked_domains = ("linkedin.com", "slack.com", "youtube.com")
    redirect_statuses = {
        HTTPStatus.MOVED_PERMANENTLY,  # 301
        HTTPStatus.FOUND,  # 302
        HTTPStatus.SEE_OTHER,  # 303
        HTTPStatus.TEMPORARY_REDIRECT,  # 307
        HTTPStatus.PERMANENT_REDIRECT,  # 308
    }

    current_url = url
    # One initial request plus at most ``max_redirects`` follow-ups.
    for _ in range(max_redirects + 1):
        try:
            location = urlparse(current_url).netloc.lower()
        except ValueError:
            # Malformed URL (e.g. a broken IPv6 literal in a redirect target).
            return None
        if not location:
            return None

        # Match the domain itself or a real subdomain only, so that hosts
        # such as "notlinkedin.com" are not accepted by accident.
        if any(
            location == domain or location.endswith(f".{domain}")
            for domain in unchecked_domains
        ):
            return current_url

        try:
            # Redirects are followed by hand so every hop gets re-checked.
            response = requests.get(current_url, allow_redirects=False, timeout=TIMEOUT)
        except requests.exceptions.RequestException:
            logger.exception("Request failed", extra={"url": current_url})
            return None

        if response.status_code == HTTPStatus.OK:
            return current_url

        if response.status_code not in redirect_statuses:
            break

        target = response.headers.get("Location")
        if not target:
            # A redirect without a Location header cannot be followed.
            break

        # Location may be relative; resolve it against the current URL.
        current_url = urljoin(current_url, target)

    logger.warning("Couldn't verify URL %s", url)

    return None

backend/apps/owasp/scraper.py

Lines changed: 0 additions & 127 deletions
This file was deleted.

backend/tests/apps/owasp/management/commands/owasp_scrape_chapters_test.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1+
import os
12
from unittest import mock
23

34
import pytest
45

56
from apps.owasp.management.commands.owasp_scrape_chapters import (
67
Chapter,
78
Command,
8-
OwaspScraper,
99
normalize_url,
1010
)
1111

@@ -34,17 +34,16 @@ def mock_chapter(self):
3434
(1, 8),
3535
],
3636
)
37-
@mock.patch.dict("os.environ", {"SCRAPER_API_KEY": "test-token"})
37+
@mock.patch.dict(os.environ, {"GITHUB_TOKEN": "test-token"})
3838
@mock.patch.object(Chapter, "bulk_save", autospec=True)
39-
def test_handle(self, mock_bulk_save, command, mock_chapter, offset, chapters):
40-
mock_scraper = mock.Mock(spec=OwaspScraper)
39+
@mock.patch("apps.owasp.management.commands.owasp_scrape_chapters.get_github_client")
40+
def test_handle(self, mock_github, mock_bulk_save, command, mock_chapter, offset, chapters):
4141
mock_chapter.get_urls.return_value = [
4242
"https://example.com/repo1",
4343
"https://example.com/repo2",
4444
"https://invalid.com/repo3",
4545
]
46-
mock_scraper.verify_url.side_effect = lambda url: None if "invalid" in url else url
47-
mock_scraper.page_tree = True
46+
mock_chapter.verify_url.side_effect = lambda url: None if "invalid" in url else url
4847

4948
mock_chapter.get_related_url.side_effect = lambda url, **_: url
5049

@@ -60,14 +59,14 @@ def test_handle(self, mock_bulk_save, command, mock_chapter, offset, chapters):
6059
)
6160
mock_active_chapters.order_by.return_value = mock_active_chapters
6261

62+
mock_github_instance = mock.Mock()
63+
mock_github.return_value = mock_github_instance
64+
mock_github_instance.get_repo.return_value = mock.Mock()
65+
6366
with (
6467
mock.patch.object(Chapter, "active_chapters", mock_active_chapters),
6568
mock.patch("builtins.print") as mock_print,
6669
mock.patch("time.sleep", return_value=None),
67-
mock.patch(
68-
"apps.owasp.management.commands.owasp_scrape_chapters.OwaspScraper",
69-
return_value=mock_scraper,
70-
),
7170
mock.patch(
7271
"apps.owasp.management.commands.owasp_scrape_chapters.normalize_url",
7372
side_effect=normalize_url,

0 commit comments

Comments
 (0)