pqzx · maifeeulasad · Dec 23, 2021 · Dec 23, 2021 · Dec 23, 2021 · Dec 23, 2021
diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py
@@ -12,25 +12,29 @@
 
 How to deal with block level style applied over table elements? e.g. text align
 """
-import re, argparse
-import io, os
+import argparse
+import enum
+import io
+import os
+import re
 import urllib.request
-from urllib.parse import urlparse
 from html.parser import HTMLParser
+from urllib.parse import urlparse
 
-import docx, docx.table
+import docx
+import docx.table
+from bs4 import BeautifulSoup
 from docx import Document
-from docx.shared import RGBColor, Pt, Inches
 from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
-
-from bs4 import BeautifulSoup
+from docx.shared import Pt
+from docx.shared import RGBColor, Inches
 
 # values in inches
 INDENT = 0.25
 LIST_INDENT = 0.5
-MAX_INDENT = 5.5 # To stop indents going off the page
+MAX_INDENT = 5.5  # To stop indents going off the page
 
 # Style to use with tables. By default no style is used.
 DEFAULT_TABLE_STYLE = None
@@ -42,6 +46,7 @@
 def get_filename_from_url(url):
     return os.path.basename(urlparse(url).path)
 
+
 def is_url(url):
     """
     Not to be used for actually validating a url, but in our use case we only 
@@ -50,6 +55,7 @@ def is_url(url):
     parts = urlparse(url)
     return all([parts.scheme, parts.netloc, parts.path])
 
+
 def fetch_image(url):
     """
     Attempts to fetch an image from a url. 
@@ -64,9 +70,11 @@ def fetch_image(url):
     except urllib.error.URLError:
         return None
 
+
 def remove_last_occurence(ls, x):
     ls.pop(len(ls) - ls[::-1].index(x) - 1)
 
+
 def remove_whitespace(string, leading=False, trailing=False):
     """Remove white space from a string.
 
@@ -132,12 +140,28 @@ def remove_whitespace(string, leading=False, trailing=False):
     # TODO need some way to get rid of extra spaces in e.g. text <span>   </span>  text
     return re.sub(r'\s+', ' ', string)
 
+
 def delete_paragraph(paragraph):
     # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907
     p = paragraph._element
     p.getparent().remove(p)
     p._p = p._element = None
 
+
+@enum.unique
+class ImageAlignment(enum.Enum):
+    """Horizontal alignment choices for an image placed in the document."""
+
+    LEFT = enum.auto()
+    CENTER = enum.auto()
+    RIGHT = enum.auto()
+
+
+def get_image_alignment(image_style):
+    """Return the ImageAlignment implied by an image's inline ``style`` value.
+
+    The style string is split into ``property: value`` declarations so the
+    result does not depend on spacing, letter case, declaration order or a
+    trailing semicolon.
+
+    * ``float: right`` -> ``ImageAlignment.RIGHT``
+    * ``display: block`` with ``margin-left: auto`` and ``margin-right: auto``
+      -> ``ImageAlignment.CENTER``
+    * anything else (including an empty or missing style) -> ``ImageAlignment.LEFT``
+    """
+    declarations = {}
+    for declaration in (image_style or '').split(';'):
+        name, separator, value = declaration.partition(':')
+        # fragments without a colon (e.g. the empty tail after the last ';') are ignored
+        if separator:
+            declarations[name.strip().lower()] = value.strip().lower()
+
+    if declarations.get('float') == 'right':
+        return ImageAlignment.RIGHT
+    centered = {'display': 'block', 'margin-left': 'auto', 'margin-right': 'auto'}
+    if all(declarations.get(name) == value for name, value in centered.items()):
+        return ImageAlignment.CENTER
+    return ImageAlignment.LEFT
+
+
 font_styles = {
     'b': 'bold',
     'strong': 'bold',
@@ -160,6 +184,7 @@ def delete_paragraph(paragraph):
     'LIST_NUMBER': 'List Number',
 }
 
+
 class HtmlToDocx(HTMLParser):
 
     def __init__(self):
@@ -188,9 +213,9 @@ def set_initial_attrs(self, document=None):
             self.doc = document
         else:
             self.doc = Document()
-        self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
+        self.bs = self.options['fix-html']  # whether or not to clean with BeautifulSoup
         self.document = self.doc
-        self.include_tables = True #TODO add this option back in?
+        self.include_tables = True  # TODO add this option back in?
         self.include_images = self.options['images']
         self.include_styles = self.options['styles']
         self.paragraph = None
@@ -233,25 +258,25 @@ def add_styles_to_run(self, style):
                 colors = [int(x) for x in color.split(',')]
             elif '#' in style['color']:
                 color = style['color'].lstrip('#')
-                colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
+                colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
             else:
                 colors = [0, 0, 0]
                 # TODO map colors to named colors (and extended colors...)
                 # For now set color to black to prevent crashing
             self.run.font.color.rgb = RGBColor(*colors)
-            
+
         if 'background-color' in style:
             if 'rgb' in style['background-color']:
                 color = color = re.sub(r'[a-z()]+', '', style['background-color'])
                 colors = [int(x) for x in color.split(',')]
             elif '#' in style['background-color']:
                 color = style['background-color'].lstrip('#')
-                colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
+                colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
             else:
                 colors = [0, 0, 0]
                 # TODO map colors to named colors (and extended colors...)
                 # For now set color to black to prevent crashing
-            self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors
+            self.run.font.highlight_color = WD_COLOR.GRAY_25  # TODO: map colors
 
     def apply_paragraph_style(self, style=None):
         try:
@@ -273,29 +298,32 @@ def handle_li(self):
         if list_depth:
             list_type = self.tags['list'][-1]
         else:
-            list_type = 'ul' # assign unordered if no tag
+            list_type = 'ul'  # assign unordered if no tag
 
         if list_type == 'ol':
             list_style = styles['LIST_NUMBER']
         else:
             list_style = styles['LIST_BULLET']
 
-        self.paragraph = self.doc.add_paragraph(style=list_style)            
+        self.paragraph = self.doc.add_paragraph(style=list_style)
         self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT))
         self.paragraph.paragraph_format.line_spacing = 1
 
-    def add_image_to_cell(self, cell, image):
+    def add_image_to_cell(self, cell, image, width=None, height=None):
+        """Append a picture to a table cell inside a newly added paragraph.
+
+        ``width`` and ``height`` are optional so that existing callers using
+        ``add_image_to_cell(cell, image)`` keep working; when left as ``None``
+        they are passed through unchanged and sizing is left to ``add_picture``.
+
+        :param cell: object providing ``add_paragraph()`` (a table cell here)
+        :param image: image path or file-like object handed to ``add_picture``
+        :param width: optional picture width forwarded to ``add_picture``
+        :param height: optional picture height forwarded to ``add_picture``
+        """
         # python-docx doesn't have method yet for adding images to table cells. For now we use this
         paragraph = cell.add_paragraph()
         run = paragraph.add_run()
-        run.add_picture(image)
+        run.add_picture(image, width=width, height=height)
 
     def handle_img(self, current_attrs):
         if not self.include_images:
             self.skip = True
             self.skip_tag = 'img'
             return
         src = current_attrs['src']
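+        # image dimensions from the height/width attributes: the last two characters
+        # (assumed to be a "px" suffix) are dropped and the number is passed to Pt,
+        # so "100px" is sized as 100 points, not pixels.
+        # NOTE(review): values without a two-character unit (e.g. "100", "50%") are mis-parsed.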
+        height = Pt(int(current_attrs['height'][:-2])) if 'height' in current_attrs else None
+        width = Pt(int(current_attrs['width'][:-2])) if 'width' in current_attrs else None
         # fetch image
         src_is_url = is_url(src)
         if src_is_url:
@@ -309,9 +337,9 @@ def handle_img(self, current_attrs):
         if image:
             try:
                 if isinstance(self.doc, docx.document.Document):
-                    self.doc.add_picture(image)
+                    self.doc.add_picture(image, width, height)
                 else:
-                    self.add_image_to_cell(self.doc, image)
+                    self.add_image_to_cell(self.doc, image, width, height)
             except FileNotFoundError:
                 image = None
         if not image:
@@ -320,7 +348,20 @@ def handle_img(self, current_attrs):
             else:
                 # avoid exposing filepaths in document
                 self.doc.add_paragraph("<image: %s>" % get_filename_from_url(src))
-        # add styles?
+        '''
+        #adding style
+        For right-alignment: `'float: right;'`
+        For center-alignment: `'display: block; margin-left: auto; margin-right: auto;'`
+        Everything else would be Left aligned
+        '''
+        if 'style' in current_attrs:
+            style = current_attrs['style']
+            image_alignment = get_image_alignment(style)
+            last_paragraph = self.doc.paragraphs[-1]
+            if image_alignment == ImageAlignment.RIGHT:
+                last_paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
+            if image_alignment == ImageAlignment.CENTER:
+                last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
 
     def handle_table(self):
         """
@@ -355,7 +396,7 @@ def handle_table(self):
                 child_parser.add_html_to_cell(cell_html, docx_cell)
                 cell_col += 1
             cell_row += 1
-        
+
         # skip all tags until corresponding closing tag
         self.instances_to_skip = len(table_soup.find_all('table'))
         self.skip_tag = 'table'
@@ -375,7 +416,6 @@ def handle_link(self, href, text):
         hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
         hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id)
 
-
         # Create sub-run
         subrun = self.paragraph.add_run()
         rPr = docx.oxml.shared.OxmlElement('w:rPr')
@@ -417,7 +457,7 @@ def handle_starttag(self, tag, attrs):
             return
         elif tag == 'ol' or tag == 'ul':
             self.tags['list'].append(tag)
-            return # don't apply styles for now
+            return  # don't apply styles for now
         elif tag == 'br':
             self.run.add_break()
             return
@@ -439,14 +479,14 @@ def handle_starttag(self, tag, attrs):
             pPr = self.paragraph._p.get_or_add_pPr()
             pBdr = OxmlElement('w:pBdr')
             pPr.insert_element_before(pBdr,
-                'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
-                'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
-                'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
-                'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
-                'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
-                'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
-                'w:pPrChange'
-            )
+                                      'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
+                                      'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
+                                      'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
+                                      'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
+                                      'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
+                                      'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
+                                      'w:pPrChange'
+                                      )
             bottom = OxmlElement('w:bottom')
             bottom.set(qn('w:val'), 'single')
             bottom.set(qn('w:sz'), '6')
@@ -588,7 +628,7 @@ def get_tables(self):
             self.include_tables = False
             return
             # find other way to do it, or require this dependency?
-        self.tables = self.ignore_nested_tables(self.soup.find_all('table'))  
+        self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
         self.table_no = 0
 
     def run_process(self, html):
@@ -618,7 +658,7 @@ def add_html_to_cell(self, html, cell):
         # cells must end with a paragraph or will get message about corrupt file
         # https://stackoverflow.com/a/29287121
         if not self.doc.paragraphs:
-            self.doc.add_paragraph('')  
+            self.doc.add_paragraph('')
 
     def parse_html_file(self, filename_html, filename_docx=None):
         with open(filename_html, 'r') as infile:
@@ -629,24 +669,24 @@ def parse_html_file(self, filename_html, filename_docx=None):
             path, filename = os.path.split(filename_html)
             filename_docx = '%s/new_docx_file_%s' % (path, filename)
         self.doc.save('%s.docx' % filename_docx)
-    
+
     def parse_html_string(self, html):
         self.set_initial_attrs()
         self.run_process(html)
         return self.doc
 
-if __name__=='__main__':
-    
+
+if __name__ == '__main__':
     arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting')
     arg_parser.add_argument('filename_html', help='The .html file to be parsed')
     arg_parser.add_argument(
-        'filename_docx', 
-        nargs='?', 
-        help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]', 
+        'filename_docx',
+        nargs='?',
+        help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]',
         default=None
     )
-    arg_parser.add_argument('--bs', action='store_true', 
-        help='Attempt to fix html before parsing. Requires bs4. Default True')
+    arg_parser.add_argument('--bs', action='store_true',
+                            help='Attempt to fix html before parsing. Requires bs4. Default True')
 
     args = vars(arg_parser.parse_args())
     file_html = args.pop('filename_html')

diff --git a/tests/text1.html b/tests/text1.html
@@ -14,7 +14,11 @@
 <pre>2 + 3 = 5↵this is code</pre>
 <p>A picture from file: <img src='testimg.png' /></p>
 <p>A picture from url: <img src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
+<p>A centered picture from url: <img style="display: block; margin-left: auto; margin-right: auto;" src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
+<p>A right aligned picture from url: <img style="float: right;" src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
 <p>A picture from url that's broken: <img src='https://raw.githubusercontent.com/pqzx/h2d/master/fakeimg.png' /></p>
+<p>A picture with height and width: <img height="100px" width="200px" src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
+<p>A picture with height and width and centered: <img style="display: block; margin-left: auto; margin-right: auto;" height="100px" width="200px" src='https://raw.githubusercontent.com/pqzx/h2d/master/testimg.png' /></p>
 <h1>heading 1</h1>
 <ol>
     <li>Ordered list first item</li>