Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Image styles - alignments + dimension(pixel/pt only) #29

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 82 additions & 42 deletions htmldocx/h2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,29 @@

How to deal with block level style applied over table elements? e.g. text align
"""
import re, argparse
import io, os
import argparse
import enum
import io
import os
import re
import urllib.request
from urllib.parse import urlparse
from html.parser import HTMLParser
from urllib.parse import urlparse

import docx, docx.table
import docx
import docx.table
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import RGBColor, Pt, Inches
from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn

from bs4 import BeautifulSoup
from docx.shared import Pt
from docx.shared import RGBColor, Inches

# values in inches
INDENT = 0.25
LIST_INDENT = 0.5
MAX_INDENT = 5.5 # To stop indents going off the page
MAX_INDENT = 5.5 # To stop indents going off the page

# Style to use with tables. By default no style is used.
DEFAULT_TABLE_STYLE = None
Expand All @@ -42,6 +46,7 @@
def get_filename_from_url(url):
    """Return the last path component of ``url``.

    Only the path portion of the parsed URL is considered, so a query
    string or fragment never ends up in the returned name. A path that
    ends in ``/`` yields an empty string.
    """
    url_path = urlparse(url).path
    _, filename = os.path.split(url_path)
    return filename


def is_url(url):
"""
Not to be used for actually validating a url, but in our use case we only
Expand All @@ -50,6 +55,7 @@ def is_url(url):
parts = urlparse(url)
return all([parts.scheme, parts.netloc, parts.path])


def fetch_image(url):
"""
Attempts to fetch an image from a url.
Expand All @@ -64,9 +70,11 @@ def fetch_image(url):
except urllib.error.URLError:
return None


def remove_last_occurence(ls, x):
    """Delete, in place, the last element of ``ls`` that equals ``x``.

    Raises ValueError when ``x`` is not present in ``ls``.
    """
    # Position of the match counted from the end of the list.
    offset_from_end = list(reversed(ls)).index(x)
    del ls[len(ls) - 1 - offset_from_end]


def remove_whitespace(string, leading=False, trailing=False):
"""Remove white space from a string.

Expand Down Expand Up @@ -132,12 +140,28 @@ def remove_whitespace(string, leading=False, trailing=False):
# TODO need some way to get rid of extra spaces in e.g. text <span> </span> text
return re.sub(r'\s+', ' ', string)


def delete_paragraph(paragraph):
    """Remove ``paragraph`` from its parent and invalidate the proxy object.

    The paragraph's underlying XML element is detached from its parent
    (if it still has one), then the ``_p`` / ``_element`` references held
    by ``paragraph`` are cleared so the proxy no longer points at the
    detached element.
    """
    # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907
    element = paragraph._element
    parent = element.getparent()
    # An element that is already detached has no parent; nothing to remove.
    if parent is not None:
        parent.remove(element)
    # Clear the references on the paragraph proxy itself, not on the
    # XML element that was just removed.
    paragraph._p = paragraph._element = None


class ImageAlignment(enum.Enum):
    """Horizontal placement of an image, as derived from its inline style."""
    LEFT = 1
    CENTER = 2
    RIGHT = 3


def get_image_alignment(image_style):
    """Map an ``<img>`` inline ``style`` string to an ImageAlignment.

    Recognised patterns:
      * ``float: right``                                    -> RIGHT
      * ``display: block`` with both ``margin-left`` and
        ``margin-right`` set to ``auto``                    -> CENTER
    Anything else, including an empty or missing style, is LEFT.

    The style is split into individual declarations before matching, so
    whitespace, letter case, declaration order, a missing trailing
    semicolon and unrelated extra declarations do not affect the result.
    When a property is repeated, the last declaration wins.
    """
    declarations = {}
    for declaration in (image_style or '').split(';'):
        name, separator, value = declaration.partition(':')
        if separator:
            declarations[name.strip().lower()] = value.strip().lower()

    # Right-float takes precedence over centering, as in the exact-match form.
    if declarations.get('float') == 'right':
        return ImageAlignment.RIGHT

    is_centered = (
        declarations.get('display') == 'block'
        and declarations.get('margin-left') == 'auto'
        and declarations.get('margin-right') == 'auto'
    )
    if is_centered:
        return ImageAlignment.CENTER
    return ImageAlignment.LEFT


font_styles = {
'b': 'bold',
'strong': 'bold',
Expand All @@ -160,6 +184,7 @@ def delete_paragraph(paragraph):
'LIST_NUMBER': 'List Number',
}


class HtmlToDocx(HTMLParser):

def __init__(self):
Expand Down Expand Up @@ -188,9 +213,9 @@ def set_initial_attrs(self, document=None):
self.doc = document
else:
self.doc = Document()
self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
self.document = self.doc
self.include_tables = True #TODO add this option back in?
self.include_tables = True # TODO add this option back in?
self.include_images = self.options['images']
self.include_styles = self.options['styles']
self.paragraph = None
Expand Down Expand Up @@ -233,25 +258,25 @@ def add_styles_to_run(self, style):
colors = [int(x) for x in color.split(',')]
elif '#' in style['color']:
color = style['color'].lstrip('#')
colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
else:
colors = [0, 0, 0]
# TODO map colors to named colors (and extended colors...)
# For now set color to black to prevent crashing
self.run.font.color.rgb = RGBColor(*colors)

if 'background-color' in style:
if 'rgb' in style['background-color']:
color = color = re.sub(r'[a-z()]+', '', style['background-color'])
colors = [int(x) for x in color.split(',')]
elif '#' in style['background-color']:
color = style['background-color'].lstrip('#')
colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
else:
colors = [0, 0, 0]
# TODO map colors to named colors (and extended colors...)
# For now set color to black to prevent crashing
self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors
self.run.font.highlight_color = WD_COLOR.GRAY_25 # TODO: map colors

def apply_paragraph_style(self, style=None):
try:
Expand All @@ -273,29 +298,32 @@ def handle_li(self):
if list_depth:
list_type = self.tags['list'][-1]
else:
list_type = 'ul' # assign unordered if no tag
list_type = 'ul' # assign unordered if no tag

if list_type == 'ol':
list_style = styles['LIST_NUMBER']
else:
list_style = styles['LIST_BULLET']

self.paragraph = self.doc.add_paragraph(style=list_style)
self.paragraph = self.doc.add_paragraph(style=list_style)
self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT))
self.paragraph.paragraph_format.line_spacing = 1

def add_image_to_cell(self, cell, image, width=None, height=None):
    """Insert ``image`` into a table cell as a picture.

    python-docx doesn't have a method yet for adding images to table
    cells, so a new paragraph is appended to ``cell`` and the picture is
    added to a run inside that paragraph.

    ``width`` and ``height`` are forwarded unchanged to
    ``run.add_picture``. Both default to ``None`` so callers that pass
    only ``cell`` and ``image`` keep working.
    """
    paragraph = cell.add_paragraph()
    run = paragraph.add_run()
    run.add_picture(image, width, height)

def handle_img(self, current_attrs):
if not self.include_images:
self.skip = True
self.skip_tag = 'img'
return
src = current_attrs['src']
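# image dimensions: the last two characters of each value (a unit suffix such as 'px' or 'pt') are dropped and the remaining number is passed to Pt(), i.e. sized in points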
height = Pt(int(current_attrs['height'][:-2])) if 'height' in current_attrs else None
width = Pt(int(current_attrs['width'][:-2])) if 'width' in current_attrs else None
# fetch image
src_is_url = is_url(src)
if src_is_url:
Expand All @@ -309,9 +337,9 @@ def handle_img(self, current_attrs):
if image:
try:
if isinstance(self.doc, docx.document.Document):
self.doc.add_picture(image)
self.doc.add_picture(image, width, height)
else:
self.add_image_to_cell(self.doc, image)
self.add_image_to_cell(self.doc, image, width, height)
except FileNotFoundError:
image = None
if not image:
Expand All @@ -320,7 +348,20 @@ def handle_img(self, current_attrs):
else:
# avoid exposing filepaths in document
self.doc.add_paragraph("<image: %s>" % get_filename_from_url(src))
# add styles?
'''
#adding style
For right-alignment: `'float: right;'`
For center-alignment: `'display: block; margin-left: auto; margin-right: auto;'`
Everything else would be Left aligned
'''
if 'style' in current_attrs:
style = current_attrs['style']
image_alignment = get_image_alignment(style)
last_paragraph = self.doc.paragraphs[-1]
if image_alignment == ImageAlignment.RIGHT:
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
if image_alignment == ImageAlignment.CENTER:
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

def handle_table(self):
"""
Expand Down Expand Up @@ -355,7 +396,7 @@ def handle_table(self):
child_parser.add_html_to_cell(cell_html, docx_cell)
cell_col += 1
cell_row += 1

# skip all tags until corresponding closing tag
self.instances_to_skip = len(table_soup.find_all('table'))
self.skip_tag = 'table'
Expand All @@ -375,7 +416,6 @@ def handle_link(self, href, text):
hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id)


# Create sub-run
subrun = self.paragraph.add_run()
rPr = docx.oxml.shared.OxmlElement('w:rPr')
Expand Down Expand Up @@ -417,7 +457,7 @@ def handle_starttag(self, tag, attrs):
return
elif tag == 'ol' or tag == 'ul':
self.tags['list'].append(tag)
return # don't apply styles for now
return # don't apply styles for now
elif tag == 'br':
self.run.add_break()
return
Expand All @@ -439,14 +479,14 @@ def handle_starttag(self, tag, attrs):
pPr = self.paragraph._p.get_or_add_pPr()
pBdr = OxmlElement('w:pBdr')
pPr.insert_element_before(pBdr,
'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
'w:pPrChange'
)
'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
'w:pPrChange'
)
bottom = OxmlElement('w:bottom')
bottom.set(qn('w:val'), 'single')
bottom.set(qn('w:sz'), '6')
Expand Down Expand Up @@ -588,7 +628,7 @@ def get_tables(self):
self.include_tables = False
return
# find other way to do it, or require this dependency?
self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
self.table_no = 0

def run_process(self, html):
Expand Down Expand Up @@ -618,7 +658,7 @@ def add_html_to_cell(self, html, cell):
# cells must end with a paragraph or will get message about corrupt file
# https://stackoverflow.com/a/29287121
if not self.doc.paragraphs:
self.doc.add_paragraph('')
self.doc.add_paragraph('')

def parse_html_file(self, filename_html, filename_docx=None):
with open(filename_html, 'r') as infile:
Expand All @@ -629,24 +669,24 @@ def parse_html_file(self, filename_html, filename_docx=None):
path, filename = os.path.split(filename_html)
filename_docx = '%s/new_docx_file_%s' % (path, filename)
self.doc.save('%s.docx' % filename_docx)

def parse_html_string(self, html):
    """Process ``html`` and return the document it was written into.

    Parser state is reset through ``set_initial_attrs()`` (called with no
    arguments) before ``html`` is handed to ``run_process``.
    """
    self.set_initial_attrs()
    self.run_process(html)
    document = self.doc
    return document

if __name__=='__main__':

if __name__ == '__main__':
arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting')
arg_parser.add_argument('filename_html', help='The .html file to be parsed')
arg_parser.add_argument(
'filename_docx',
nargs='?',
help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]',
'filename_docx',
nargs='?',
help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]',
default=None
)
arg_parser.add_argument('--bs', action='store_true',
help='Attempt to fix html before parsing. Requires bs4. Default True')
arg_parser.add_argument('--bs', action='store_true',
help='Attempt to fix html before parsing. Requires bs4. Default True')

args = vars(arg_parser.parse_args())
file_html = args.pop('filename_html')
Expand Down
4 changes: 4 additions & 0 deletions tests/text1.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
<pre>2 + 3 = 5↵this is code</pre>
<p>A picture from file: <img src='testimg.png' /></p>
<p>A picture from url: <img src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
<p>A centered picture from url: <img style="display: block; margin-left: auto; margin-right: auto;" src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
<p>A right aligned picture from url: <img style="float: right;" src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
<p>A picture from url that's broken: <img src='https://raw.githubusercontent.com/pqzx/h2d/master/fakeimg.png' /></p>
<p>A picture with height and width: <img height="100px" width="200px" src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
<p>A picture with height and width and centered: <img style="display: block; margin-left: auto; margin-right: auto;" height="100px" width="200px" src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
<h1>heading 1</h1>
<ol>
<li>Ordered list first item</li>
Expand Down