Skip to content

Commit

Permalink
Change filename logic for PDF attachments
Browse files Browse the repository at this point in the history
This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers, and falls back to
`mimetypes.guess_extension` for resources that lack any indication of a
filename.
  • Loading branch information
cleitner committed Apr 18, 2014
1 parent 05ec8df commit a9fd32c
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 12 deletions.
48 changes: 37 additions & 11 deletions weasyprint/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import binascii
import hashlib
import io
import mimetypes
import os
import re
import string
Expand Down Expand Up @@ -435,15 +436,40 @@ def _write_compressed_file_object(pdf, file):
return object_number


def _get_filename_from_url(url):
def _get_filename_from_result(url, result):
"""
Derives a filename from an URL or returns a synthetic name if the URL has
no path component
Derives a filename from a fetched resource. This is either the filename
returned by the URL fetcher, the last URL path component, or a synthetic
name if the URL has no path.
"""

# A given filename will always take precedence
filename = result.get('filename')
if filename:
return filename

# The URL path likely contains a filename, which is a good second guess
split = urlsplit(url)
filename = split.path.split("/")[-1]
if split.scheme == 'data' or filename == '':
filename = 'attachment.bin'
# The URL lacks a path altogether. Use a synthetic name.

# Using guess_extension is a great idea, but sadly the extension is
# probably random, depending on the alignment of the stars, which car
# you're driving and which software has been installed on your machine.
#
# Unfortunately this isn't even deterministic on one machine, because the
# extension can depend on PYTHONHASHSEED if mimetypes has multiple
# extensions to offer.
extension = None
mime_type = result.get('mime_type')
if mime_type == 'text/plain':
# text/plain has a plethora of extensions - all garbage
extension = '.txt'
else:
extension = mimetypes.guess_extension(mime_type) or '.bin'

filename = 'attachment' + extension
else:
filename = unquote(filename)

Expand All @@ -460,9 +486,8 @@ def _write_pdf_embedded_files(pdf, attachments, url_fetcher):

file_spec_ids = []
for url, description in attachments:
filename = _get_filename_from_url(url)
file_spec_id = _write_pdf_attachment(pdf, filename, url, description,
url_fetcher)
file_spec_id = _write_pdf_attachment(pdf, url, description,
url_fetcher)
if not file_spec_id is None:
file_spec_ids.append(file_spec_id)

Expand All @@ -478,7 +503,7 @@ def _write_pdf_embedded_files(pdf, attachments, url_fetcher):
return pdf.write_new_object(b''.join(content))


def _write_pdf_attachment(pdf, filename, url, description, url_fetcher):
def _write_pdf_attachment(pdf, url, description, url_fetcher):
"""
Writes an attachment to the PDF stream
Expand All @@ -493,6 +518,8 @@ def _write_pdf_attachment(pdf, filename, url, description, url_fetcher):
io.BytesIO(result.get('string'))
file_stream_id = _write_compressed_file_object(pdf, stream)

filename = _get_filename_from_result(url, result)

return pdf.write_new_object(pdf_format(
'<< /Type /Filespec /F () /UF {0!P} /EF << /F {1} 0 R >> '
'/Desc {2!P}\n>>',
Expand All @@ -518,10 +545,9 @@ def _write_pdf_annotation_files(pdf, links, url_fetcher):
for is_internal, target, rectangle in page_links:
if is_internal == 'attachment' and not target in annot_files:
annot_files[target] = None
filename = _get_filename_from_url(target)
# TODO: use the title attribute as description
annot_files[target] = _write_pdf_attachment(pdf, filename,
target, None, url_fetcher)
annot_files[target] = _write_pdf_attachment(pdf, target, None,
url_fetcher)
return annot_files


Expand Down
2 changes: 1 addition & 1 deletion weasyprint/tests/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def test_embedded_files():
pdf_bytes)
assert (b'/F ()' in pdf_bytes)
assert (b'/UF (\xfe\xff\x00a\x00t\x00t\x00a\x00c\x00h\x00m\x00e\x00n'
b'\x00t\x00.\x00b\x00i\x00n)' in pdf_bytes)
b'\x00t\x00.\x00t\x00x\x00t)' in pdf_bytes)
assert (b'/Desc (\xfe\xff\x00s\x00o\x00m\x00e\x00 \x00f\x00i\x00l\x00e'
b'\x00 \x00a\x00t\x00t\x00a\x00c\x00h\x00m\x00e\x00n\x00t\x00 '
b'\x00\xe4\x00\xf6\x00\xfc)' in pdf_bytes)
Expand Down

0 comments on commit a9fd32c

Please sign in to comment.