1
1
import base64
2
+ import enum
2
3
import json
3
4
import logging
4
5
import os
5
6
import subprocess
6
7
import sys
7
8
import tempfile
9
+ import uuid
8
10
import xml .etree .ElementTree as ET
9
11
from dataclasses import dataclass , field
10
- from typing import Literal
11
12
from urllib .parse import urlparse
12
13
13
14
import boto3
14
15
import pydantic
15
16
import requests
16
17
import sentry_sdk
18
+ from playwright .sync_api import sync_playwright
17
19
from sentry_sdk .integrations .aws_lambda import AwsLambdaIntegration
18
20
19
21
logger = logging .getLogger ("splat" )
28
30
)
29
31
30
32
33
class Renderers(str, enum.Enum):
    """Rendering engines that can be requested in a payload.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value (for example ``Renderers.prince == "prince"``).
    """

    playwright = "playwright"
    prince = "prince"

    def __str__(self) -> str:
        # Formatting of enums with a mixed-in type changed in Python 3.11:
        # without this override an f-string yields "prince" on older
        # interpreters but "Renderers.prince" on newer ones. Always emit the
        # bare value so log lines look the same on every runtime.
        return str(self.value)
36
+
37
+
31
38
class Payload (pydantic .BaseModel ):
32
39
# General Parameters
33
40
javascript : bool = False
34
41
check_license : bool = False
35
42
36
43
# Input parameters
44
+ ## Embed the document content as a string
37
45
document_content : str | None = None
46
+ ## Fetch the document from a URL, store it and render it
38
47
document_url : str | None = None
39
- renderer : Literal ["prince" , "playwright" , "playwright+prince" ] = "prince"
48
+ ## Browse the document in a browser before rendering
49
+ browser_url : str | None = None
50
+ browser_headers : dict = pydantic .Field (default_factory = dict )
51
+ renderer : Renderers = Renderers .prince
40
52
41
53
# Output parameters
42
54
bucket_name : str | None = None
@@ -77,26 +89,77 @@ def init() -> None:
77
89
os .environ ["FONTCONFIG_PATH" ] = "/var/task/fonts"
78
90
79
91
80
- def pdf_from_string (document_content : str , output_filepath : str , javascript : bool = False ) -> str :
81
- print ("splat|pdf_from_string" )
82
- # Save document_content to file
83
- with tempfile .NamedTemporaryFile (mode = "w" , suffix = ".html" ) as f :
84
- f .write (document_content )
85
- return prince_handler (f .name , output_filepath , javascript )
92
def playwright_page_to_pdf(browser_url: str, headers: dict, output_filepath: str) -> None:
    """Load ``browser_url`` in Chromium via Playwright and save it as an A4 PDF.

    ``headers`` are installed as extra HTTP headers on the browser context
    before navigating. The resulting PDF is written to ``output_filepath``.
    """
    print("splat|playwright_handler|url=", browser_url)
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        browser_context = chromium.new_context()
        browser_context.set_extra_http_headers(headers)
        tab = browser_context.new_page()
        # Navigation is only awaited up to the "domcontentloaded" event.
        tab.goto(browser_url, wait_until="domcontentloaded")
        # Switch to the print media type before exporting.
        tab.emulate_media(media="print")
        tab.pdf(path=output_filepath, format="A4")
105
+
106
+
107
def playwright_page_to_html_string(browser_url: str, headers: dict) -> str:
    """Load ``browser_url`` in Chromium via Playwright and return the page's HTML.

    ``headers`` are installed as extra HTTP headers on the browser context
    before navigating. The value returned is whatever ``page.content()``
    reports once the page has been switched to print media.
    """
    print("splat|playwright_handler|url=", browser_url)
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        browser_context = chromium.new_context()
        browser_context.set_extra_http_headers(headers)
        tab = browser_context.new_page()
        # Navigation is only awaited up to the "domcontentloaded" event.
        tab.goto(browser_url, wait_until="domcontentloaded")
        tab.emulate_media(media="print")
        rendered_html = tab.content()
        return rendered_html
120
+
121
+
122
def pdf_from_document_content(payload: Payload, output_filepath: str) -> None:
    """Generate a PDF from the HTML string in ``payload.document_content``.

    The content is written to a temporary ``.html`` file. That file is then
    handed to Prince when ``payload.renderer`` is ``Renderers.prince``, or
    opened through a ``file://`` URL with Playwright for any other renderer.
    The PDF is written to ``output_filepath``.

    Raises:
        SplatPDFGenerationFailure: if ``payload.document_content`` is empty
            or missing (status code 400).
    """
    print("splat|pdf_from_document_content")
    # Validate explicitly: an ``assert`` would vanish under ``python -O``.
    if not payload.document_content:
        raise SplatPDFGenerationFailure(
            "document_content must not be empty",
            status_code=400,
        )
    # Pin the encoding so non-ASCII documents do not depend on the locale of
    # the machine the function happens to run on.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".html", encoding="utf-8") as temporary_html_file:
        temporary_html_file.write(payload.document_content)
        # The renderer reads the file by name, so the buffered data has to be
        # on disk before it is invoked.
        temporary_html_file.flush()
        if payload.renderer == Renderers.prince:
            prince_handler(temporary_html_file.name, output_filepath, payload.javascript)
        else:
            playwright_page_to_pdf(f"file://{temporary_html_file.name}", payload.browser_headers, output_filepath)
86
133
87
134
88
- def pdf_from_url ( document_url : str , output_filepath : str , javascript : bool = False ) -> str :
89
- print ( "splat|pdf_from_url" )
90
- # Fetch document_url and save to file
91
- response = requests .get (document_url , timeout = 120 )
135
def pdf_from_document_url(payload: Payload, output_filepath: str) -> None:
    """Generate a PDF from the HTML document served at ``payload.document_url``.

    The document is fetched with a 120 second timeout and stored in a
    temporary ``.html`` file. That file is then handed to Prince when
    ``payload.renderer`` is ``Renderers.prince``, or opened through a
    ``file://`` URL with Playwright for any other renderer. The PDF is
    written to ``output_filepath``.

    Raises:
        SplatPDFGenerationFailure: if the server answers with anything other
            than HTTP 200 (status code 500).
    """
    print("splat|pdf_from_document_url")
    response = requests.get(payload.document_url, timeout=120)
    if response.status_code != 200:
        raise SplatPDFGenerationFailure(
            f"Document was unable to be fetched from document_url provided. Server response: {response.content}",
            status_code=500,
        )
    # Store the body exactly as received. Decoding it as UTF-8 and re-encoding
    # it with the locale default would crash on documents in another charset
    # and could corrupt text on a non-UTF-8 locale; for UTF-8 documents the
    # bytes on disk are the same as before.
    with tempfile.NamedTemporaryFile(mode="wb", suffix=".html") as temporary_html_file:
        temporary_html_file.write(response.content)
        # The renderer reads the file by name, so the buffered data has to be
        # on disk before it is invoked.
        temporary_html_file.flush()
        if payload.renderer == Renderers.prince:
            prince_handler(temporary_html_file.name, output_filepath, payload.javascript)
        else:
            playwright_page_to_pdf(f"file://{temporary_html_file.name}", payload.browser_headers, output_filepath)
151
+
152
+
153
def pdf_from_browser_url(payload: Payload, output_filepath: str) -> None:
    """Generate a PDF by visiting ``payload.browser_url`` in a browser.

    With the Prince renderer the page is first loaded through Playwright, its
    HTML is captured, and that HTML is rendered by Prince. With any other
    renderer Playwright prints the page to PDF directly. The PDF is written
    to ``output_filepath``.

    Raises:
        SplatPDFGenerationFailure: if ``payload.browser_url`` is empty or
            missing (status code 400).
    """
    print("splat|pdf_from_browser_url")
    # Validate explicitly: an ``assert`` would vanish under ``python -O``.
    if not payload.browser_url:
        raise SplatPDFGenerationFailure(
            "browser_url must not be empty",
            status_code=400,
        )

    if payload.renderer != Renderers.prince:
        playwright_page_to_pdf(payload.browser_url, payload.browser_headers, output_filepath)
        return

    # Visit the page with Playwright first and keep the resulting HTML, then
    # let Prince render that snapshot.
    html = playwright_page_to_html_string(payload.browser_url, payload.browser_headers)
    # NOTE(review): the intermediate payload does not carry over
    # ``payload.javascript``, so Prince runs with the model default (False)
    # here - confirm this is intentional.
    pdf_from_document_content(Payload(document_content=html, renderer=Renderers.prince), output_filepath)
100
163
101
164
102
165
def execute (cmd : list [str ]) -> None :
@@ -105,7 +168,7 @@ def execute(cmd: list[str]) -> None:
105
168
raise subprocess .CalledProcessError (result .returncode , cmd )
106
169
107
170
108
- def prince_handler (input_filepath : str , output_filepath : str , javascript : bool = False ) -> str :
171
+ def prince_handler (input_filepath : str , output_filepath : str , javascript : bool = False ) -> None :
109
172
print ("splat|prince_command_run" )
110
173
# Prepare command
111
174
command = [
@@ -121,42 +184,49 @@ def prince_handler(input_filepath: str, output_filepath: str, javascript: bool =
121
184
# Run command and capture output
122
185
print (f"splat|invoke_prince { ' ' .join (command )} " )
123
186
execute (command )
124
- # Log prince output
125
- return output_filepath
126
187
127
188
128
189
def create_pdf(payload: Payload, output_filepath: str) -> str:
    """Render the payload's document to ``output_filepath`` and return that path.

    The input source is chosen in priority order: ``document_content``, then
    ``document_url``, then ``browser_url``. A payload that sets none of them
    raises ``SplatPDFGenerationFailure`` with status code 400.
    """
    # (payload value, generator) pairs, listed in priority order.
    candidates = (
        (payload.document_content, pdf_from_document_content),
        (payload.document_url, pdf_from_document_url),
        (payload.browser_url, pdf_from_browser_url),
    )
    for source_value, generate in candidates:
        if source_value:
            generate(payload, output_filepath)
            return output_filepath

    raise SplatPDFGenerationFailure(
        "Please specify either document_content or document_url or browser_url in the payload.",
        status_code=400,
    )
140
203
141
204
142
- def deliver_pdf_to_s3_bucket (body : dict , output_filepath : str ) -> Response :
205
def deliver_pdf_to_s3_bucket(payload: Payload, output_filepath: str) -> Response:
    """Upload the PDF to ``payload.bucket_name`` and report where it was stored.

    The file is stored under a freshly generated ``<uuid4>.pdf`` key. The
    response body is a JSON object holding the bucket name, the key, and a
    presigned ``get_object`` URL for the uploaded object.
    """
    print("splat|bucket_save")
    bucket_name = payload.bucket_name
    # A random key per upload, so separate uploads get separate object names.
    object_key = f"{uuid.uuid4()}.pdf"

    boto3.resource("s3").Bucket(bucket_name).upload_file(output_filepath, object_key)

    download_url = boto3.client("s3").generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket_name, "Key": object_key},
    )
    response_body = {
        "bucket": bucket_name,
        "key": object_key,
        "presigned_url": download_url,
    }
    return Response(body=json.dumps(response_body))
155
225
156
226
157
- def deliver_pdf_to_presigned_url (body : dict , output_filepath : str ) -> Response :
227
+ def deliver_pdf_to_presigned_url (payload : Payload , output_filepath : str ) -> Response :
158
228
print ("splat|presigned_url_save" )
159
- presigned_url = body . get ( " presigned_url" )
229
+ presigned_url = payload . presigned_url
160
230
try :
161
231
urlparse (presigned_url ["url" ])
162
232
assert presigned_url ["fields" ]
@@ -222,11 +292,11 @@ def deliver_pdf_via_streaming_base64(output_filepath: str) -> Response:
222
292
)
223
293
224
294
225
- def deliver_pdf (body : dict , output_filepath : str ) -> Response :
226
- if body . get ( " bucket_name" ) :
227
- return deliver_pdf_to_s3_bucket (body , output_filepath )
228
- elif body . get ( " presigned_url" ) :
229
- return deliver_pdf_to_presigned_url (body , output_filepath )
295
def deliver_pdf(payload: Payload, output_filepath: str) -> Response:
    """Hand the generated PDF over using the delivery method the payload asks for.

    Delivery targets are checked in order: an S3 bucket when ``bucket_name``
    is set, a presigned URL when ``presigned_url`` is set, and otherwise the
    PDF is returned through ``deliver_pdf_via_streaming_base64``.
    """
    if payload.bucket_name:
        return deliver_pdf_to_s3_bucket(payload, output_filepath)
    if payload.presigned_url:
        return deliver_pdf_to_presigned_url(payload, output_filepath)
    # No explicit destination requested.
    return deliver_pdf_via_streaming_base64(output_filepath)
232
302
@@ -257,22 +327,23 @@ def handle_event(event: dict) -> Response: # noqa
257
327
except pydantic .ValidationError as e :
258
328
raise SplatPDFGenerationFailure (
259
329
status_code = 400 ,
260
- message = "Invalid payload" ,
330
+ message = f "Invalid payload: { e } " ,
261
331
) from e
262
332
263
333
# 3) Check licence if user is requesting that
264
334
if payload .check_license :
265
335
return check_license ()
266
336
267
337
print (f"splat|javascript={ payload .javascript } " )
338
+ print (f"splat|renderer={ payload .renderer } " )
268
339
269
340
# 4) Generate PDF
270
341
with tempfile .NamedTemporaryFile (suffix = ".pdf" ) as output_pdf :
271
342
output_filepath = output_pdf .name
272
343
create_pdf (payload , output_filepath )
273
344
274
345
# 5) Deliver the PDF
275
- resp = deliver_pdf (body , output_filepath )
346
+ resp = deliver_pdf (payload , output_filepath )
276
347
return resp
277
348
278
349
0 commit comments