lambda_function.py
import csv
import json
import os
import uuid

import boto3
import requests
from bs4 import BeautifulSoup

# Initialize the DynamoDB resource and the table used to store scraped jobs.
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('JobScraperTable')
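# Expected invocation payload (a sketch inferred from how "html" is read in
# lambda_handler; the exact URL is an assumption, only the base domain
# climatejobs.shortlist.net appears in the scraper below):
#
#   {"html": ["https://climatejobs.shortlist.net/jobs"]}
#
# "html" may be either a single URL string or a list of URLs; if a list is
# given, only the first entry is scraped.
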
def lambda_handler(event, context):
    url = event["html"]
    if isinstance(url, list):
        url = url[0]  # Ensure url is a string if it is passed as a list

    bucket = "greenhub-bucket"
    file_name = "/tmp/job_details.csv"  # Static filename for the CSV in Lambda's /tmp

    download_and_upload_csv(url, bucket, file_name)

    return {
        'statusCode': 200,
        'body': json.dumps('Job details have been scraped, saved in CSV, uploaded to S3, and stored in DynamoDB!')
    }
def download_and_upload_csv(url, bucket, file_name):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 200:
        html_content = response.text
    else:
        raise Exception(f"Failed to download HTML content. Status code: {response.status_code}")

    soup = BeautifulSoup(html_content, "html.parser")

    # Selector based on the example HTML structure: each job is a job-card div.
    jobs = soup.find_all("div", class_="sc-beqWaB gupdsY job-card")
    # Open the CSV file in append mode so new rows are added to the existing
    # file instead of recreating a new CSV every time the schedule runs.
    with open(file_name, mode='a', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)

        # Write the header row only if the file is empty.
        if file.tell() == 0:
            writer.writerow(["Job Title", "Company Name", "Location", "Date Posted", "Job Link"])

        for job in jobs:
            job_title_tag = job.find("div", itemprop="title")
            job_title = job_title_tag.text.strip() if job_title_tag else "N/A"

            company_name_tag = job.find("a", {"data-testid": "link"})
            company_name = company_name_tag.text.strip() if company_name_tag else "N/A"

            # Location is stored in a meta tag's "content" attribute.
            location_tag = job.find("meta", itemprop="address")
            location = location_tag['content'].strip() if location_tag and location_tag.has_attr('content') else "N/A"

            date_posted_tag = job.find("div", class_="sc-beqWaB enQFes")
            date_posted = date_posted_tag.text.strip() if date_posted_tag else "N/A"

            job_link_tag = job.find("a", {"data-testid": "job-title-link"}, href=True)
            job_link = job_link_tag['href'] if job_link_tag else "N/A"
            # Resolve relative links against the site's base URL.
            if not job_link.startswith(('http://', 'https://')):
                job_link = f"https://climatejobs.shortlist.net{job_link}"

            # Write the row to the CSV.
            writer.writerow([job_title, company_name, location, date_posted, job_link])

            # Store each job in DynamoDB.
            job_id = str(uuid.uuid4())  # Generate a unique ID for each job
            try:
                table.put_item(
                    Item={
                        'job_id': job_id,
                        'job_title': job_title,
                        'company_name': company_name,
                        'location': location,
                        'date_posted': date_posted,
                        'job_link': job_link
                    }
                )
                print(f"Job {job_title} saved to DynamoDB with ID {job_id}.")
            except Exception as e:
                print(f"Failed to save job to DynamoDB: {e}")

    # Upload the single CSV file to S3.
    s3_client = boto3.client('s3')
    s3_file_path = "Jobs/job_details.csv"  # Keep the S3 key static as well
    try:
        s3_client.upload_file(file_name, bucket, s3_file_path)
        print(f"CSV file '{file_name}' uploaded successfully to S3 bucket '{bucket}' in the 'Jobs' folder.")
    except Exception as e:
        print(f"Failed to upload CSV file to S3: {e}")
    finally:
        os.remove(file_name)
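
# Minimal local smoke test (a sketch, not part of the deployed Lambda): the URL
# below is an assumption based on the base domain used for relative job links
# above, and running it requires AWS credentials with access to the
# 'greenhub-bucket' S3 bucket and the 'JobScraperTable' DynamoDB table.
if __name__ == "__main__":
    test_event = {"html": ["https://climatejobs.shortlist.net/jobs"]}
    print(lambda_handler(test_event, None))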