From a9fd32c14cd1924cb83e5cdab81102a795d90104 Mon Sep 17 00:00:00 2001 From: Colin Leitner Date: Fri, 18 Apr 2014 16:40:47 +0200 Subject: [PATCH] Change filename logic for PDF attachments This patch honors the filename key of a fetched resource, which can be set by the `Content-Disposition` or `Content-Type` headers and uses `mimetypes.guess_extension` for resources that lack any indication of a filename. --- weasyprint/pdf.py | 48 +++++++++++++++++++++++++++--------- weasyprint/tests/test_pdf.py | 2 +- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/weasyprint/pdf.py b/weasyprint/pdf.py index 41eb80d03..1fc64065c 100644 --- a/weasyprint/pdf.py +++ b/weasyprint/pdf.py @@ -34,6 +34,7 @@ import binascii import hashlib import io +import mimetypes import os import re import string @@ -435,15 +436,40 @@ def _write_compressed_file_object(pdf, file): return object_number -def _get_filename_from_url(url): +def _get_filename_from_result(url, result): """ - Derives a filename from an URL or returns a synthetic name if the URL has - no path component + Derives a filename from a fetched resource. This is either the filename + returned by the URL fetcher, the last URL path component or a synthetic + name if the URL has no path """ + + # A given filename will always take precedence + filename = result.get('filename') + if filename: + return filename + + # The URL path likely contains a filename, which is a good second guess split = urlsplit(url) filename = split.path.split("/")[-1] if split.scheme == 'data' or filename == '': - filename = 'attachment.bin' + # The URL lacks a path altogether. Use a synthetic name. + + # Using guess_extension is a great idea, but sadly the extension is + # probably random, depending on the alignment of the stars, which car + # you're driving and which software has been installed on your machine. 
+ # + # Unfortunately this isn't even idempotent on one machine, because the + # extension can depend on PYTHONHASHSEED if mimetypes has multiple + # extensions to offer + extension = None + mime_type = result.get('mime_type') + if mime_type == 'text/plain': + # text/plain has a plethora of extensions - all garbage + extension = '.txt' + else: + extension = mimetypes.guess_extension(mime_type) or '.bin' + + filename = 'attachment' + extension else: filename = unquote(filename) @@ -460,9 +486,8 @@ def _write_pdf_embedded_files(pdf, attachments, url_fetcher): file_spec_ids = [] for url, description in attachments: - filename = _get_filename_from_url(url) - file_spec_id = _write_pdf_attachment(pdf, filename, url, description, - url_fetcher) + file_spec_id = _write_pdf_attachment(pdf, url, description, + url_fetcher) if not file_spec_id is None: file_spec_ids.append(file_spec_id) @@ -478,7 +503,7 @@ def _write_pdf_embedded_files(pdf, attachments, url_fetcher): return pdf.write_new_object(b''.join(content)) -def _write_pdf_attachment(pdf, filename, url, description, url_fetcher): +def _write_pdf_attachment(pdf, url, description, url_fetcher): """ Writes an attachment to the PDF stream @@ -493,6 +518,8 @@ def _write_pdf_attachment(pdf, filename, url, description, url_fetcher): io.BytesIO(result.get('string')) file_stream_id = _write_compressed_file_object(pdf, stream) + filename = _get_filename_from_result(url, result) + return pdf.write_new_object(pdf_format( '<< /Type /Filespec /F () /UF {0!P} /EF << /F {1} 0 R >> ' '/Desc {2!P}\n>>', @@ -518,10 +545,9 @@ def _write_pdf_annotation_files(pdf, links, url_fetcher): for is_internal, target, rectangle in page_links: if is_internal == 'attachment' and not target in annot_files: annot_files[target] = None - filename = _get_filename_from_url(target) # TODO: use the title attribute as description - annot_files[target] = _write_pdf_attachment(pdf, filename, - target, None, url_fetcher) + annot_files[target] = 
_write_pdf_attachment(pdf, target, None, + url_fetcher) return annot_files diff --git a/weasyprint/tests/test_pdf.py b/weasyprint/tests/test_pdf.py index d8747c879..12e1ff827 100644 --- a/weasyprint/tests/test_pdf.py +++ b/weasyprint/tests/test_pdf.py @@ -376,7 +376,7 @@ def test_embedded_files(): pdf_bytes) assert (b'/F ()' in pdf_bytes) assert (b'/UF (\xfe\xff\x00a\x00t\x00t\x00a\x00c\x00h\x00m\x00e\x00n' - b'\x00t\x00.\x00b\x00i\x00n)' in pdf_bytes) + b'\x00t\x00.\x00t\x00x\x00t)' in pdf_bytes) assert (b'/Desc (\xfe\xff\x00s\x00o\x00m\x00e\x00 \x00f\x00i\x00l\x00e' b'\x00 \x00a\x00t\x00t\x00a\x00c\x00h\x00m\x00e\x00n\x00t\x00 ' b'\x00\xe4\x00\xf6\x00\xfc)' in pdf_bytes)