 """
 import datetime
 import uuid
-
+import os
 import requests
+
 from bs4 import BeautifulSoup
 from progress.bar import Bar
 from threadsafe.safe_csv import SafeDictWriter
 
-from .utils import join_local_path
+from .config import get_data_directory
 from .validators import validate_link
 from .log import debug
 
 
-def parse_links(html: str):
-    """Parses HTML page to extract links.
-
-    Returns:
-        (list): List of all valid links found.
+def parse_links(html: str) -> list[str]:
+    """
+    Finds all anchor tags and parses the href attribute.
     """
     soup = BeautifulSoup(html, 'html.parser')
     tags = soup.find_all('a')
     return [tag['href'] for tag in tags if validate_link(tag['href'])]
 
 
-def parse_meta_tags(soup: BeautifulSoup):
-    """Retrieve all meta elements from HTML object.
-
-    Returns:
-        list: List containing content from meta tags
+def parse_meta_tags(soup: BeautifulSoup) -> list[object]:
+    """
+    Parses all meta tags.
     """
     meta_tags = soup.find_all('meta')
     content_list = list()
@@ -38,23 +35,23 @@ def parse_meta_tags(soup: BeautifulSoup):
     return content_list
 
 
-def get_links(url: str):
+def get_links(url: str) -> list[str]:
+    """
+    Returns all valid links found on the URL.
+    """
     resp = requests.get(url)
     links = parse_links(resp.text)
     return links
 
 
-default_url = 'https://thehiddenwiki.org'
-
-
-def collect_data(user_url: str):
-    url = user_url if user_url is not None else default_url
+def collect_data(url: str = 'https://thehiddenwiki.org'):
     print(f"Gathering data for {url}")
     links = get_links(url)
     current_time = datetime.datetime.now().isoformat()
     file_name = f'torbot_{current_time}.csv'
-    file_path = join_local_path(file_name)
-    with open(file_path, 'w+') as outcsv:
+    data_directory = get_data_directory()
+    local_file_path = os.path.join(data_directory, file_name)
+    with open(local_file_path, 'w+') as outcsv:
         fieldnames = ['ID', 'Title', 'Metadata', 'Content']
         writer = SafeDictWriter(outcsv, fieldnames=fieldnames)
         bar = Bar('Processing...', max=len(links))
@@ -71,8 +68,9 @@ def collect_data(user_url: str):
             }
             writer.writerow(entry)
         except requests.exceptions.RequestException as e:
+            print(f"Failed to connect to [{link}].")
             debug(e)
-            debug(f"Failed to connect to [{link}].")
            bar.next()
        bar.finish()
-    print(f'Data has been saved to {file_path}.')
+
+    print(f'Data has been saved to {local_file_path}.')
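
For reference, a minimal usage sketch of the refactored entry point after this change. The bare module name `collect_data` in the import is an assumption (the package path is not shown in this diff), and the second URL is a hypothetical example.

    # Usage sketch, assuming this file is importable as `collect_data`
    # (the real package path is not shown in the diff above).
    from collect_data import collect_data

    # No argument: the new keyword default ('https://thehiddenwiki.org')
    # is used, replacing the old module-level `default_url` fallback.
    collect_data()

    # An explicit target can still be supplied (hypothetical URL).
    collect_data('http://exampleonionsite.onion')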