Skip to content

Commit 3821f92

Browse files
committed
feat: add playwright rendering
1 parent 7b52ac7 commit 3821f92

File tree

4 files changed

+222
-104
lines changed

4 files changed

+222
-104
lines changed

lambda_function.py

+110-39
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
11
import base64
2+
import enum
23
import json
34
import logging
45
import os
56
import subprocess
67
import sys
78
import tempfile
9+
import uuid
810
import xml.etree.ElementTree as ET
911
from dataclasses import dataclass, field
10-
from typing import Literal
1112
from urllib.parse import urlparse
1213

1314
import boto3
1415
import pydantic
1516
import requests
1617
import sentry_sdk
18+
from playwright.sync_api import sync_playwright
1719
from sentry_sdk.integrations.aws_lambda import AwsLambdaIntegration
1820

1921
logger = logging.getLogger("splat")
@@ -28,15 +30,25 @@
2830
)
2931

3032

33+
class Renderers(str, enum.Enum):
    """Rendering engines that a request can select.

    Members are also plain strings, so they compare equal to their value
    (``Renderers.prince == "prince"``) and can be built from it
    (``Renderers("prince")``).
    """

    playwright = "playwright"
    prince = "prince"

    def __str__(self) -> str:
        # format()/f-strings on a mixed-in enum follow Enum.__str__ from
        # Python 3.11 on ("Renderers.prince") but produced the raw value
        # before that. Returning the value keeps formatted output identical
        # on every interpreter version.
        return self.value
36+
37+
3138
class Payload(pydantic.BaseModel):
3239
# General Parameters
3340
javascript: bool = False
3441
check_license: bool = False
3542

3643
# Input parameters
44+
## Embed the document content as a string
3745
document_content: str | None = None
46+
## Fetch the document from a URL, store it and render it
3847
document_url: str | None = None
39-
renderer: Literal["prince", "playwright", "playwright+prince"] = "prince"
48+
## Browse the document in a browser before rendering
49+
browser_url: str | None = None
50+
browser_headers: dict = pydantic.Field(default_factory=dict)
51+
renderer: Renderers = Renderers.prince
4052

4153
# Output parameters
4254
bucket_name: str | None = None
@@ -77,26 +89,77 @@ def init() -> None:
7789
os.environ["FONTCONFIG_PATH"] = "/var/task/fonts"
7890

7991

80-
def pdf_from_string(document_content: str, output_filepath: str, javascript: bool = False) -> str:
81-
print("splat|pdf_from_string")
82-
# Save document_content to file
83-
with tempfile.NamedTemporaryFile(mode="w", suffix=".html") as f:
84-
f.write(document_content)
85-
return prince_handler(f.name, output_filepath, javascript)
92+
def playwright_page_to_pdf(browser_url: str, headers: dict, output_filepath: str) -> None:
    """Open ``browser_url`` in Chromium and save the page as an A4 PDF.

    Args:
        browser_url: URL to navigate to.
        headers: Extra HTTP headers set on the browser context before the
            page is opened.
        output_filepath: Path the PDF is written to.
    """
    print("splat|playwright_handler|url=", browser_url)
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            context = browser.new_context()
            context.set_extra_http_headers(headers)
            page = context.new_page()
            page.goto(
                browser_url,
                wait_until="domcontentloaded",
            )
            # Switch to print media before exporting so print stylesheets apply.
            page.emulate_media(media="print")
            page.pdf(path=output_filepath, format="A4")
        finally:
            # Close the browser (and the contexts it owns) even when
            # navigation or PDF export raises.
            browser.close()
105+
106+
107+
def playwright_page_to_html_string(browser_url: str, headers: dict) -> str:
    """Open ``browser_url`` in Chromium and return the page's HTML.

    Args:
        browser_url: URL to navigate to.
        headers: Extra HTTP headers set on the browser context before the
            page is opened.

    Returns:
        The page content as reported by Playwright's ``page.content()``.
    """
    print("splat|playwright_handler|url=", browser_url)
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            context = browser.new_context()
            context.set_extra_http_headers(headers)
            page = context.new_page()
            page.goto(
                browser_url,
                wait_until="domcontentloaded",
            )
            page.emulate_media(media="print")
            return page.content()
        finally:
            # Runs after the return value has been computed; closes the
            # browser even when navigation raises.
            browser.close()
120+
121+
122+
def pdf_from_document_content(payload: Payload, output_filepath: str) -> None:
    """Generate a PDF from the HTML string in ``payload.document_content``.

    The content is written to a temporary ``.html`` file, which is rendered by
    Prince when ``payload.renderer`` is ``Renderers.prince`` and by Playwright
    (through a ``file://`` URL) otherwise.

    Args:
        payload: Request payload; ``document_content``, ``renderer``,
            ``javascript`` and ``browser_headers`` are read.
        output_filepath: Path the PDF is written to.

    Raises:
        SplatPDFGenerationFailure: If ``document_content`` is missing or empty.
    """
    print("splat|pdf_from_document_content")
    # Explicit check instead of ``assert``: asserts are removed under ``-O``.
    if not payload.document_content:
        raise SplatPDFGenerationFailure(
            "document_content is required to generate a PDF from document content.",
            status_code=400,
        )
    # Pin the encoding so non-ASCII HTML does not depend on the process locale.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".html", encoding="utf-8") as temporary_html_file:
        temporary_html_file.write(payload.document_content)
        # Flush so the renderer, which opens the file by name, sees all content.
        temporary_html_file.flush()
        if payload.renderer == Renderers.prince:
            prince_handler(temporary_html_file.name, output_filepath, payload.javascript)
        else:
            playwright_page_to_pdf(f"file://{temporary_html_file.name}", payload.browser_headers, output_filepath)
86133

87134

88-
def pdf_from_url(document_url: str, output_filepath: str, javascript: bool = False) -> str:
89-
print("splat|pdf_from_url")
90-
# Fetch document_url and save to file
91-
response = requests.get(document_url, timeout=120)
135+
def pdf_from_document_url(payload: Payload, output_filepath: str) -> None:
    """Generate a PDF from the HTML document served at ``payload.document_url``.

    The document is downloaded and stored in a temporary ``.html`` file, which
    is rendered by Prince when ``payload.renderer`` is ``Renderers.prince`` and
    by Playwright (through a ``file://`` URL) otherwise.

    Args:
        payload: Request payload; ``document_url``, ``renderer``,
            ``javascript`` and ``browser_headers`` are read.
        output_filepath: Path the PDF is written to.

    Raises:
        SplatPDFGenerationFailure: If the server does not answer with HTTP 200.
    """
    print("splat|pdf_from_document_url")
    response = requests.get(payload.document_url, timeout=120)
    if response.status_code != 200:
        raise SplatPDFGenerationFailure(
            f"Document was unable to be fetched from document_url provided. Server response: {response.content}",
            status_code=500,
        )
    # Store the downloaded bytes untouched. Decoding as UTF-8 and re-encoding
    # with the locale's encoding would fail on documents in any other charset
    # and gains nothing for UTF-8 ones.
    with tempfile.NamedTemporaryFile(mode="wb", suffix=".html") as temporary_html_file:
        temporary_html_file.write(response.content)
        # Flush so the renderer, which opens the file by name, sees all content.
        temporary_html_file.flush()
        if payload.renderer == Renderers.prince:
            prince_handler(temporary_html_file.name, output_filepath, payload.javascript)
        else:
            playwright_page_to_pdf(f"file://{temporary_html_file.name}", payload.browser_headers, output_filepath)
151+
152+
153+
def pdf_from_browser_url(payload: Payload, output_filepath: str) -> None:
    """Generate a PDF by visiting ``payload.browser_url`` with Playwright.

    With the Prince renderer, Playwright only collects the page's HTML and
    Prince produces the PDF from it; with any other renderer Playwright prints
    the page to PDF directly.

    Args:
        payload: Request payload; ``browser_url``, ``browser_headers`` and
            ``renderer`` are read.
        output_filepath: Path the PDF is written to.

    Raises:
        SplatPDFGenerationFailure: If ``browser_url`` is missing or empty.
    """
    print("splat|pdf_from_browser_url")
    # Explicit check instead of ``assert``: asserts are removed under ``-O``.
    if not payload.browser_url:
        raise SplatPDFGenerationFailure(
            "browser_url is required to generate a PDF from a browser url.",
            status_code=400,
        )
    if payload.renderer == Renderers.prince:
        # Visit the page with Playwright first and keep the resulting HTML.
        html = playwright_page_to_html_string(payload.browser_url, payload.browser_headers)
        # NOTE(review): the fresh Payload carries only the HTML and renderer,
        # so ``payload.javascript`` is not forwarded and Prince runs with the
        # default (False) - confirm this is intended.
        pdf_from_document_content(Payload(document_content=html, renderer=Renderers.prince), output_filepath)
    else:
        playwright_page_to_pdf(payload.browser_url, payload.browser_headers, output_filepath)
100163

101164

102165
def execute(cmd: list[str]) -> None:
@@ -105,7 +168,7 @@ def execute(cmd: list[str]) -> None:
105168
raise subprocess.CalledProcessError(result.returncode, cmd)
106169

107170

108-
def prince_handler(input_filepath: str, output_filepath: str, javascript: bool = False) -> str:
171+
def prince_handler(input_filepath: str, output_filepath: str, javascript: bool = False) -> None:
109172
print("splat|prince_command_run")
110173
# Prepare command
111174
command = [
@@ -121,42 +184,49 @@ def prince_handler(input_filepath: str, output_filepath: str, javascript: bool =
121184
# Run command and capture output
122185
print(f"splat|invoke_prince {' '.join(command)}")
123186
execute(command)
124-
# Log prince output
125-
return output_filepath
126187

127188

128189
def create_pdf(payload: Payload, output_filepath: str) -> str:
    """Render the PDF described by ``payload`` and return ``output_filepath``.

    The first populated input wins, checked in this order:
    ``document_content``, ``document_url``, ``browser_url``.

    Raises:
        SplatPDFGenerationFailure: With status 400 when none of the three
            inputs is set.
    """
    if payload.document_content:
        render = pdf_from_document_content
    elif payload.document_url:
        render = pdf_from_document_url
    elif payload.browser_url:
        render = pdf_from_browser_url
    else:
        raise SplatPDFGenerationFailure(
            "Please specify either document_content or document_url or browser_url in the payload.",
            status_code=400,
        )
    render(payload, output_filepath)
    return output_filepath
140203

141204

142-
def deliver_pdf_to_s3_bucket(body: dict, output_filepath: str) -> Response:
205+
def deliver_pdf_to_s3_bucket(payload: Payload, output_filepath: str) -> Response:
    """Upload the PDF to ``payload.bucket_name`` and describe where it landed.

    The object key is a random UUID with a ``.pdf`` suffix. The returned
    Response carries a JSON body with the ``bucket``, the ``key`` and a
    ``presigned_url`` for a ``get_object`` request on that object.
    """
    print("splat|bucket_save")
    bucket_name = payload.bucket_name
    object_key = f"{uuid.uuid4()}.pdf"
    boto3.resource("s3").Bucket(bucket_name).upload_file(output_filepath, object_key)

    download_url = boto3.client("s3").generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket_name, "Key": object_key},
    )
    details = {
        "bucket": bucket_name,
        "key": object_key,
        "presigned_url": download_url,
    }
    return Response(body=json.dumps(details))
155225

156226

157-
def deliver_pdf_to_presigned_url(body: dict, output_filepath: str) -> Response:
227+
def deliver_pdf_to_presigned_url(payload: Payload, output_filepath: str) -> Response:
158228
print("splat|presigned_url_save")
159-
presigned_url = body.get("presigned_url")
229+
presigned_url = payload.presigned_url
160230
try:
161231
urlparse(presigned_url["url"])
162232
assert presigned_url["fields"]
@@ -222,11 +292,11 @@ def deliver_pdf_via_streaming_base64(output_filepath: str) -> Response:
222292
)
223293

224294

225-
def deliver_pdf(body: dict, output_filepath: str) -> Response:
226-
if body.get("bucket_name"):
227-
return deliver_pdf_to_s3_bucket(body, output_filepath)
228-
elif body.get("presigned_url"):
229-
return deliver_pdf_to_presigned_url(body, output_filepath)
295+
def deliver_pdf(payload: Payload, output_filepath: str) -> Response:
    """Deliver the rendered PDF through the channel requested in ``payload``.

    ``bucket_name`` takes precedence over ``presigned_url``; when neither is
    set the PDF is returned via the streaming base64 response.
    """
    if payload.bucket_name:
        return deliver_pdf_to_s3_bucket(payload, output_filepath)
    if payload.presigned_url:
        return deliver_pdf_to_presigned_url(payload, output_filepath)
    return deliver_pdf_via_streaming_base64(output_filepath)
232302

@@ -257,22 +327,23 @@ def handle_event(event: dict) -> Response: # noqa
257327
except pydantic.ValidationError as e:
258328
raise SplatPDFGenerationFailure(
259329
status_code=400,
260-
message="Invalid payload",
330+
message=f"Invalid payload: {e}",
261331
) from e
262332

263333
# 3) Check licence if user is requesting that
264334
if payload.check_license:
265335
return check_license()
266336

267337
print(f"splat|javascript={payload.javascript}")
338+
print(f"splat|renderer={payload.renderer}")
268339

269340
# 4) Generate PDF
270341
with tempfile.NamedTemporaryFile(suffix=".pdf") as output_pdf:
271342
output_filepath = output_pdf.name
272343
create_pdf(payload, output_filepath)
273344

274345
# 5) Deliver the PDF
275-
resp = deliver_pdf(body, output_filepath)
346+
resp = deliver_pdf(payload, output_filepath)
276347
return resp
277348

278349

lambda_requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ requests==2.31.0
22
boto3==1.34.0
33
sentry-sdk==1.39.0
44
awslambdaric
5-
pydantic
5+
pydantic
6+
playwright==1.43.0

scripts/local.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import base64
2+
import json
3+
import pathlib
4+
5+
import requests
6+
7+
LAMBDA_URL = "http://localhost:8080/2015-03-31/functions/function/invocations"
8+
9+
10+
def call_lamdba(body: dict, raise_exception: bool = True) -> tuple[int, dict, bytes]:
    """Invoke the lambda at ``LAMBDA_URL`` and unpack its response.

    Args:
        body: Request payload; it is JSON-encoded into the event's ``body``.
        raise_exception: When true, raise on an HTTP error from the endpoint
            and on a JSON lambda response whose status code is not 200/201.

    Returns:
        ``(status_code, json_body, raw_bytes)``. A base64 response yields an
        empty dict plus the decoded bytes; a JSON response yields the parsed
        body plus ``b""``.

    Raises:
        requests.HTTPError: If the HTTP call fails and ``raise_exception`` is set.
        RuntimeError: If the lambda reports a status other than 200/201 in a
            JSON response and ``raise_exception`` is set.
    """
    response = requests.post(LAMBDA_URL, json={"body": json.dumps(body)}, timeout=60)
    if raise_exception:
        response.raise_for_status()
    data = response.json()
    status_code = data["statusCode"]
    if data["isBase64Encoded"]:
        return status_code, {}, base64.b64decode(data["body"])

    # Keep the response body under its own name rather than rebinding the
    # ``body`` parameter, which holds the request payload.
    raw_body = data.get("body")
    response_body = json.loads(raw_body) if raw_body else {}
    if raise_exception and status_code not in {200, 201}:
        raise RuntimeError(response_body)

    return status_code, response_body, b""
25+
26+
27+
# Only hit the local lambda when run as a script, so importing this module
# has no network or filesystem side effects.
if __name__ == "__main__":
    _, _, pdf_bytes = call_lamdba({"renderer": "playwright", "document_content": "<h1>Hello world</h1>"})
    pathlib.Path("/tmp/output.pdf").write_bytes(pdf_bytes)  # noqa

0 commit comments

Comments
 (0)