# parse_companies.py
import csv
import re
import requests
from bs4 import BeautifulSoup


def get_nonce(html_source: str) -> str | None:
    """Extract the AJAX nonce embedded in the page's inline JSON config."""
    match = re.search(r'"nonce":\s*"([^"]+)"', html_source)
    return match.group(1) if match else None
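
# Usage sketch for get_nonce (hedged: assumes the listing page embeds its AJAX
# nonce in an inline JSON blob such as '..."nonce": "a1b2c3"...'):
#
#   >>> get_nonce('{"ajax": {"nonce": "a1b2c3"}}')
#   'a1b2c3'
#   >>> get_nonce('<html>no nonce here</html>') is None
#   True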


def get_company_data(company_id: str, nonce: str) -> str:
    """Fetch the 'More info' content for a company from the site's AJAX endpoint."""
    url = 'https://www.sequoiacap.com/wp-admin/admin-ajax.php'
    # Headers for the request (mimic the browser request observed on the site)
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Accept': '*/*',
'Sec-Fetch-Site': 'same-origin',
'Accept-Language': 'en-GB,en;q=0.9',
'Sec-Fetch-Mode': 'cors',
'Host': 'www.sequoiacap.com',
'Origin': 'https://www.sequoiacap.com',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15',
'Referer': 'https://www.sequoiacap.com/our-companies/?_stage_current=ipo',
'Connection': 'keep-alive',
'Sec-Fetch-Dest': 'empty',
'X-Requested-With': 'XMLHttpRequest',
}
# Data to be sent in the POST request
data = {
'action': 'load_company_content',
'post_id': str(company_id),
'nonce': nonce,
}
    # Send the POST request and return the raw response body
    response = requests.post(url, headers=headers, data=data, timeout=30)
    return response.text
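
# Usage sketch for get_company_data (live network call; the post_id and nonce
# values below are placeholders, not real IDs from the site):
#
#   >>> fragment = get_company_data('12345', 'a1b2c3')
#   >>> 'Milestones' in fragment  # expected True for companies whose panel lists milestones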


def parse_milestones(milestones_html: str) -> str | None:
    """Return the first 'Milestones' entry mentioning an IPO, if any."""
    milestones_soup = BeautifulSoup(milestones_html, 'html.parser')
    # Find the list that follows the 'Milestones' heading
    milestones_section = None
    for h2 in milestones_soup.find_all('h2'):
        if h2.get_text(strip=True) == 'Milestones':
            milestones_section = h2.find_next('ul')
            break
    if milestones_section:
        for li in milestones_section.find_all('li'):
            milestone = li.get_text(strip=True)
            if 'IPO' in milestone:
                return milestone
    return None
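
# parse_milestones on an illustrative fragment (simplified markup, assumed to
# mirror the structure of the real 'More info' panel):
#
#   >>> parse_milestones('<h2>Milestones</h2><ul><li>Founded 2010</li><li>IPO in 2021</li></ul>')
#   'IPO in 2021'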


def parse_categories(company_html: str) -> list[str]:
    """
    Parse the categories from the 'More info' section of HTML for a company.
    The categories are buttons that appear at the bottom of the element:
    'a' tags with a 'data-bs-target' attribute equal to 'categories'.

    :param company_html: HTML for the 'More info' section of a company
    :return: list of category names
    """
    company_soup = BeautifulSoup(company_html, 'html.parser')
    elements = company_soup.find_all('a', attrs={'data-bs-target': 'categories'})
    return [element.get_text(strip=True) for element in elements]
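
# parse_categories on an illustrative fragment (the attribute name comes from
# the function above; the category text is a made-up example):
#
#   >>> parse_categories('<a data-bs-target="categories">Fintech</a>')
#   ['Fintech']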


url = 'https://www.sequoiacap.com/our-companies/?_stage_current=ipo'
response = requests.get(url, timeout=30)
# Check if the request was successful
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')  # You may need to specify attributes to find the correct table
    nonce = get_nonce(response.text)
    if nonce is None:
        raise SystemExit('Could not find the AJAX nonce in the page source')
    # Extract the rows from the companies table
    data = []
    for row in table.find_all('tr'):
        print(f'Processing row {row}')
        company_milestones = None
        categories = None
        if 'data-target' in row.attrs:
            # The row's 'data-target' attribute ends with the company's post id
            company_id = row.attrs['data-target'].split('-')[-1]
            more_info = get_company_data(company_id, nonce)
            company_milestones = parse_milestones(more_info)
            categories = ','.join(parse_categories(more_info))
        cols = row.find_all(['td', 'th'])  # Get both header and data cells
        is_header = row.find('th') is not None
        cols = [col.text.strip() for col in cols]  # Strip whitespace
        if not cols or cols[0] == 'Loading':
            continue  # Skip the placeholder row shown while the table loads
        cols.pop()  # Drop the trailing 'More info' toggle cell
        if is_header:
            cols.extend(['Milestones', 'Categories'])  # Label the two added columns
        else:
            cols.append(company_milestones or '')
            cols.append(categories or '')
        print(f'Processed row {cols}')
        data.append(cols)
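
    # Illustrative shape of a processed data row (values are made up):
    #   ['Acme Corp', ..., 'IPO in 2021', 'Fintech,Enterprise']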
    # Write the extracted rows to a CSV file
    csv_file_path = 'table_data.csv'  # Desired CSV file name
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
writer.writerows(data)
print(f"Data has been written to {csv_file_path}")
else:
print(f"Failed to retrieve the webpage. Status code: {response.status_code}")