diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
new file mode 100644
index 0000000..30324e2
--- /dev/null
+++ b/.github/workflows/pypi.yml
@@ -0,0 +1,57 @@
+# Source: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#the-whole-ci-cd-workflow
+name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
+
+on:
+  push:
+    branches: [ main ]
+    tags: [ '*' ]  # tag refs are what the publish job's `if` checks for
+  workflow_run:
+    workflows: [ tests ]
+    types:
+      - completed
+
+jobs:
+  build:
+    name: Build distribution 📦
+    # `completed` also fires for failed runs of `tests`; only build on success.
+    if: github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success'
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      - name: Install pypa/build
+        run: python3 -m pip install build --user
+      - name: Build a binary wheel and a source tarball
+        run: python3 -m build
+      - name: Store the distribution packages
+        # v3 of the artifact actions is deprecated; upload and download must both be v4.
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+
+  publish-to-pypi:
+    name: >-
+      Publish Python 🐍 distribution 📦 to PyPI
+    if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
+    needs:
+      - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/html-for-docx
+    permissions:
+      id-token: write  # IMPORTANT: mandatory for trusted publishing
+
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+      - name: Publish distribution 📦 to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/HISTORY.rst b/HISTORY.rst
new file mode 100644
index 0000000..28c0dad
--- /dev/null
+++ b/HISTORY.rst
@@ -0,0 +1,57 @@
+.. :changelog:
+
+Release History
+---------------
+
+1.0.4 (2024-08-11)
+++++++++++++++++++
+
+**Updates**
+- Create Changelog HISTORY.
+- Update README.
+- Add GitHub Actions workflow to publish to PyPI.
+- Change default VERSION tag, removing the "v" from new releases.
+
+**New Features**
+- Support for internal links (anchors) | [Dfop02](https://github.com/dfop02)
+
+
+1.0.3 (2024-02-27)
+++++++++++++++++++
+
+- Adapt font_size when text, ex.: small, medium, etc. | [Dfop02](https://github.com/dfop02)
+- Fix error for image width and height when no digits | [Dfop02](https://github.com/dfop02)
+
+
+1.0.2 (2024-02-20)
+++++++++++++++++++
+
+- Support px, cm, pt and % for style margin-left to paragraphs | [Dfop02](https://github.com/dfop02)
+- Fix 'style lookup by style_id is deprecated.' | [Dfop02](https://github.com/dfop02)
+- Fix bug when any style has `!important` | [Dfop02](https://github.com/dfop02)
+- Refactor tests to be more consistent and rely less on 'human validation' | [Dfop02](https://github.com/dfop02)
+- Support to color by name | [Dfop02](https://github.com/dfop02)
+
+
+1.0.1 (2024-02-05)
+++++++++++++++++++
+
+- Fix README.
+
+
+1.0.0 (2024-02-05)
++++++++++++++++++++
+
+- Initial Release!
+
+**Fixes**
+- Handle missing run for leading br tag | [dashingdove](https://github.com/dashingdove) from [PR](https://github.com/pqzx/html2docx/pull/53)
+- Fix base64 images | [djplaner](https://github.com/djplaner) from [Issue](https://github.com/pqzx/html2docx/issues/28#issuecomment-1052736896)
+- Handle img tag without src attribute | [johnjor](https://github.com/johnjor) from [PR](https://github.com/pqzx/html2docx/pull/63)
+
+**New Features**
+- Add Width/Height style to images | [maifeeulasad](https://github.com/maifeeulasad) from [PR](https://github.com/pqzx/html2docx/pull/29)
+- Improve performance on large tables | [dashingdove](https://github.com/dashingdove) from [PR](https://github.com/pqzx/html2docx/pull/58)
+- Support for HTML Pagination | [Evilran](https://github.com/Evilran) from [PR](https://github.com/pqzx/html2docx/pull/39)
+- Support Table style | [Evilran](https://github.com/Evilran) from [PR](https://github.com/pqzx/html2docx/pull/39)
+- Support alternative encoding | [HebaElwazzan](https://github.com/HebaElwazzan) from [PR](https://github.com/pqzx/html2docx/pull/59)
diff --git a/README.md b/README.md
index e1b85aa..5a129ca 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,9 @@ My goal to fork and fix/update this package was to complete my current task at w
- Support for HTML Pagination | [Evilran](https://github.com/Evilran) from [PR](https://github.com/pqzx/html2docx/pull/39)
- Support Table style | [Evilran](https://github.com/Evilran) from [PR](https://github.com/pqzx/html2docx/pull/39)
- Support alternative encoding | [HebaElwazzan](https://github.com/HebaElwazzan) from [PR](https://github.com/pqzx/html2docx/pull/59)
+- Support colors by name | [Dfop02](https://github.com/dfop02)
+- Support font_size when text, ex.: small, medium, etc. | [Dfop02](https://github.com/dfop02)
+- Support for internal links (anchors) | [Dfop02](https://github.com/dfop02)
- Refactory Tests to be more consistent and less 'human validation' | [Dfop02](https://github.com/dfop02)
## License
diff --git a/html4docx/h4d.py b/html4docx/h4d.py
index 255947a..3b8a757 100644
--- a/html4docx/h4d.py
+++ b/html4docx/h4d.py
@@ -64,8 +64,8 @@ def set_initial_attrs(self, document=None):
'list': [],
}
self.doc = document if document else Document()
- self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
self.document = self.doc
+ self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
self.include_tables = True # TODO add this option back in?
self.include_images = self.options['images']
self.include_styles = self.options['styles']
@@ -73,20 +73,50 @@ def set_initial_attrs(self, document=None):
self.skip = False
self.skip_tag = None
self.instances_to_skip = 0
+ self.bookmark_id = 0
def copy_settings_from(self, other):
"""Copy settings from another instance of HtmlToDocx"""
self.table_style = other.table_style
def get_cell_html(self, soup):
- # Returns string of td element with opening and closing
tags removed
- # Cannot use find_all as it only finds element tags and does not find text which
- # is not inside an element
+ """
+ Returns string of td element with opening and closing <td> tags removed
+ Cannot use find_all as it only finds element tags and does not find text which
+ is not inside an element
+ """
return ' '.join([str(i) for i in soup.contents])
+    def unit_converter(self, unit: str, value: int):
+        """
+        Convert a CSS length into the value assigned to a docx ``left_indent``.
+
+        Args:
+            unit: CSS unit suffix; 'px', 'cm', 'pt' and '%' are handled.
+            value: Numeric part of the CSS length.
+
+        Returns:
+            A length built with Inches/Cm/Pt, or None when the unit is not supported.
+        """
+        if unit == '%':
+            # A percentage is a fraction of MAX_INDENT. The px branch passes
+            # MAX_INDENT to Inches(), so do the same here; the previous bare
+            # int() truncated the product (to 0 for small percentages) and
+            # returned a plain number instead of a length object.
+            return Inches(MAX_INDENT * (value / 100))
+
+        # px, cm and pt share one capped amount: every 10 units add one INDENT,
+        # never exceeding MAX_INDENT.
+        capped = min(value // 10 * INDENT, MAX_INDENT)
+        if unit == 'px':
+            return Inches(capped)
+        if unit == 'cm':
+            return Cm(capped * 2.54)
+        if unit == 'pt':
+            return Pt(capped * 72)
+
+        # When unit is not supported returns None
+        return None
+
+    def add_bookmark(self, bookmark_name):
+        """
+        Attach a Word bookmark to the current paragraph.
+
+        A ``w:bookmarkStart`` element is inserted as the first child of the
+        paragraph element and a ``w:bookmarkEnd`` element is appended as the
+        last; both carry the same ``w:id``. The instance counter
+        ``bookmark_id`` is advanced afterwards.
+
+        Args:
+            bookmark_name: Value written to the ``w:name`` attribute.
+        """
+        marker_id = str(self.bookmark_id)
+        p_element = self.paragraph._element
+
+        start_marker = OxmlElement('w:bookmarkStart')
+        start_marker.set(qn('w:id'), marker_id)
+        start_marker.set(qn('w:name'), bookmark_name)
+        p_element.insert(0, start_marker)
+
+        end_marker = OxmlElement('w:bookmarkEnd')
+        end_marker.set(qn('w:id'), marker_id)
+        p_element.append(end_marker)
+
+        self.bookmark_id += 1
+
def add_styles_to_paragraph(self, style):
if 'text-align' in style:
- align = re.sub('!important', '', style['text-align'], flags=re.IGNORECASE)
+ align = utils.remove_important_from_style(style['text-align'])
if 'center' in align:
self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
@@ -101,25 +131,15 @@ def add_styles_to_paragraph(self, style):
if 'auto' in margin_left and 'auto' in margin_right:
self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
elif 'margin-left' in style:
- margin = re.sub('!important', '', style['margin-left'], flags=re.IGNORECASE)
+ margin = utils.remove_important_from_style(style['margin-left'])
units = re.sub(r'[0-9]+', '', margin)
- margin = int(float(re.sub(r'[a-zA-Z\!]+', '', margin)))
+ margin = int(float(re.sub(r'[a-zA-Z\!\%]+', '', margin)))
- if units == 'px':
- self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT))
- elif units == 'cm':
- self.paragraph.paragraph_format.left_indent = Cm(min(margin // 10 * INDENT, MAX_INDENT) * 2.54)
- elif units == 'pt':
- self.paragraph.paragraph_format.left_indent = Pt(min(margin // 10 * INDENT, MAX_INDENT) * 72)
- elif units == '%':
- self.paragraph.paragraph_format.left_indent = MAX_INDENT * (units / 100)
- else:
- # When unit is not supported
- self.paragraph.paragraph_format.left_indent = None
+ self.paragraph.paragraph_format.left_indent = self.unit_converter(units, margin)
def add_styles_to_table(self, style):
if 'text-align' in style:
- align = re.sub('!important', '', style['text-align'], flags=re.IGNORECASE)
+ align = utils.remove_important_from_style(style['text-align'])
if 'center' in align:
self.table.alignment = WD_ALIGN_PARAGRAPH.CENTER
@@ -134,30 +154,20 @@ def add_styles_to_table(self, style):
if 'auto' in margin_left and 'auto' in margin_right:
self.table.alignment = WD_ALIGN_PARAGRAPH.CENTER
elif 'margin-left' in style:
- margin = re.sub('!important', '', style['margin-left'], flags=re.IGNORECASE)
+ margin = utils.remove_important_from_style(style['margin-left'])
units = re.sub(r'[0-9]+', '', margin)
- margin = int(float(re.sub(r'[a-zA-Z\!]+', '', margin)))
+ margin = int(float(re.sub(r'[a-zA-Z\!\%]+', '', margin)))
- if units == 'px':
- self.table.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT))
- elif units == 'cm':
- self.table.left_indent = Cm(min(margin // 10 * INDENT, MAX_INDENT) * 2.54)
- elif units == 'pt':
- self.table.left_indent = Pt(min(margin // 10 * INDENT, MAX_INDENT) * 72)
- elif units == '%':
- self.table.left_indent = MAX_INDENT * (units / 100)
- else:
- # When unit is not supported
- self.table.left_indent = None
+ self.table.left_indent = self.unit_converter(units, margin)
def add_styles_to_run(self, style):
if 'font-size' in style:
- font_size = re.sub('!important', '', style['font-size'], flags=re.IGNORECASE)
+ font_size = utils.remove_important_from_style(style['font-size'])
# Adapt font_size when text, ex.: small, medium, etc.
font_size = utils.adapt_font_size(font_size)
units = re.sub(r'[0-9]+', '', font_size)
- font_size = int(float(re.sub(r'[a-zA-Z\!]+', '', font_size)))
+ font_size = int(float(re.sub(r'[a-zA-Z\!\%]+', '', font_size)))
if units == 'px':
font_size_unit = Inches(utils.px_to_inches(font_size))
@@ -174,7 +184,7 @@ def add_styles_to_run(self, style):
run.font.size = font_size_unit
if 'color' in style:
- font_color = re.sub('!important', '', style['color'].lower(), flags=re.IGNORECASE)
+ font_color = utils.remove_important_from_style(style['color'].lower())
if 'rgb' in font_color:
color = re.sub(r'[a-z()]+', '', font_color)
@@ -192,7 +202,7 @@ def add_styles_to_run(self, style):
self.run.font.color.rgb = RGBColor(*colors)
if 'background-color' in style:
- background_color = re.sub('!important', '', style['background-color'].lower(), flags=re.IGNORECASE)
+ background_color = utils.remove_important_from_style(style['background-color'].lower())
if 'rgb' in background_color:
color = re.sub(r'[a-z()]+', '', background_color)
@@ -369,31 +379,48 @@ def handle_div(self, current_attrs):
if 'style' in current_attrs and 'page-break-after: always' in current_attrs['style']:
self.doc.add_page_break()
- def handle_link(self, href, text):
- # Link requires a relationship
- # is_external = href.startswith('http')
- rel_id = self.paragraph.part.relate_to(
- href,
- docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
- is_external=True # TO-DO support anchor links for this library yet
- )
+ def handle_link(self, href, text, tooltip=None):
+ """
+ A function that places a hyperlink within a paragraph object.
- # Create the w:hyperlink tag and add needed values
- hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
- hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id)
+ Args:
+ href: A string containing the required url.
+ text: The text displayed for the url.
+ tooltip: The text displayed when hovering over the link.
+ """
+ is_external = href.startswith('http')
+ hyperlink = OxmlElement('w:hyperlink')
+
+ if is_external:
+ # Create external hyperlink
+ rel_id = self.paragraph.part.relate_to(
+ href,
+ docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
+ is_external=True
+ )
+
+ # Create the w:hyperlink tag and add needed values
+ hyperlink.set(qn('r:id'), rel_id)
+ else:
+ # Create internal hyperlink (anchor)
+ hyperlink.set(qn('w:anchor'), href.replace('#', ''))
+
+ if tooltip is not None:
+ # set tooltip to hyperlink
+ hyperlink.set(qn('w:tooltip'), tooltip)
# Create sub-run
subrun = self.paragraph.add_run()
- rPr = docx.oxml.shared.OxmlElement('w:rPr')
+ rPr = OxmlElement('w:rPr')
# add default color
- c = docx.oxml.shared.OxmlElement('w:color')
- c.set(docx.oxml.shared.qn('w:val'), "0000EE")
+ c = OxmlElement('w:color')
+ c.set(qn('w:val'), "0000EE")
rPr.append(c)
# add underline
- u = docx.oxml.shared.OxmlElement('w:u')
- u.set(docx.oxml.shared.qn('w:val'), 'single')
+ u = OxmlElement('w:u')
+ u.set(qn('w:val'), 'single')
rPr.append(u)
subrun._r.append(rPr)
@@ -485,6 +512,9 @@ def handle_starttag(self, tag, attrs):
if tag in ['p', 'li', 'pre']:
self.run = self.paragraph.add_run()
+ if 'id' in current_attrs:
+ self.add_bookmark(current_attrs['id'])
+
# add style
if not self.include_styles:
return
@@ -540,7 +570,7 @@ def handle_data(self, data):
# https://html.spec.whatwg.org/#interactive-content
link = self.tags.get('a')
if link:
- self.handle_link(link['href'], data)
+ self.handle_link(link.get('href', None), data, link.get('title', None))
else:
# If there's a link, dont put the data directly in the run
self.run = self.paragraph.add_run(data)
@@ -617,15 +647,15 @@ def run_process(self, html):
def add_html_to_document(self, html, document):
if not isinstance(html, str):
- raise ValueError('First argument needs to be a %s' % str)
+ raise ValueError(f'First argument needs to be a {str}')
elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell):
- raise ValueError('Second argument needs to be a %s' % docx.document.Document)
+ raise ValueError(f'Second argument needs to be a {docx.document.Document}')
self.set_initial_attrs(document)
self.run_process(html)
def add_html_to_cell(self, html, cell):
if not isinstance(cell, docx.table._Cell):
- raise ValueError('Second argument needs to be a %s' % docx.table._Cell)
+ raise ValueError(f'Second argument needs to be a {docx.table._Cell}')
unwanted_paragraph = cell.paragraphs[0]
utils.delete_paragraph(unwanted_paragraph)
self.set_initial_attrs(cell)
@@ -642,8 +672,8 @@ def parse_html_file(self, filename_html, filename_docx=None, encoding='utf-8'):
self.run_process(html)
if not filename_docx:
path, filename = os.path.split(filename_html)
- filename_docx = '%s/new_docx_file_%s' % (path, filename)
- self.doc.save('%s.docx' % filename_docx)
+ filename_docx = f'{path}/new_docx_file_{filename}'
+ self.doc.save(f'{filename_docx}.docx')
def parse_html_string(self, html):
self.set_initial_attrs()
diff --git a/html4docx/utils.py b/html4docx/utils.py
index 12d2cc4..d333ae1 100644
--- a/html4docx/utils.py
+++ b/html4docx/utils.py
@@ -66,6 +66,9 @@ def adapt_font_size(size):
return size
+def remove_important_from_style(text):
+    """Return ``text`` with every case-insensitive occurrence of '!important' removed."""
+    important_flag = re.compile('!important', flags=re.IGNORECASE)
+    return important_flag.sub('', text)
+
def fetch_image(url):
"""
Attempts to fetch an image from a url.
diff --git a/requirements.txt b/requirements.txt
index 5187bc7..402b1b8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-beautifulsoup4==4.12.2
-python-docx==1.1.0
\ No newline at end of file
+beautifulsoup4>=4.12.2
+python-docx>=1.1.0
diff --git a/setup.cfg b/setup.cfg
index 833b934..61e3a70 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,8 +1,9 @@
[metadata]
name = html-for-docx
-version = 1.0.3
+version = 1.0.4
url = https://github.com/dfop02/html4docx
project_urls =
+ Changelog = https://github.com/dfop02/html4docx/blob/master/HISTORY.rst
Bug Tracker = https://github.com/dfop02/html4docx/issues
Repository = https://github.com/dfop02/html4docx
author = Diogo Fernandes
@@ -26,6 +27,7 @@ classifiers =
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
Topic :: Software Development :: Libraries :: Python Modules
+ Topic :: Software Development :: Libraries
Topic :: Utilities
[options]
diff --git a/setup.py b/setup.py
index 2153237..1488eed 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
here = os.path.abspath(os.path.dirname(__file__))
README = open(os.path.join(here, 'README.md')).read()
-VERSION = '1.0.3'
+VERSION = '1.0.4'
setup(
name = 'html-for-docx',
@@ -14,20 +14,22 @@
packages = find_packages(),
python_requires = '>=3.7',
author = 'Diogo Fernandes',
- author_email = 'dfop02@hotmail.com',
+ author_email = 'diogofernandesop@gmail.com',
platforms = ['any'],
include_package_data = True,
- keywords = ['html', 'docx', 'convert'],
+ keywords = ['html', 'docx', 'office', 'word', 'convert', 'transform'],
zip_safe = False,
url = 'https://github.com/dfop02/html4docx',
project_urls = {
+ "Changelog": "https://github.com/dfop02/html4docx/blob/master/HISTORY.rst",
"Bug Tracker": "https://github.com/dfop02/html4docx/issues",
"Repository": "https://github.com/dfop02/html4docx"
},
- download_url = f'https://github.com/dfop02/html4docx/archive/v{VERSION}.tar.gz',
+ download_url = f'https://github.com/dfop02/html4docx/archive/{VERSION}.tar.gz',
classifiers = [
'Intended Audience :: Developers',
'Topic :: Software Development :: Build Tools',
+ 'Topic :: Software Development :: Libraries',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.7',
diff --git a/tests/assets/htmls/text1.html b/tests/assets/htmls/text1.html
index a43d301..93cd7a2 100644
--- a/tests/assets/htmls/text1.html
+++ b/tests/assets/htmls/text1.html
@@ -44,6 +44,9 @@ heading 1
Indent 4
Indent max?
Indent 2.5em
+Indent 20cm
+Indent 15pt
+Indent 10%
asdfsa
link
diff --git a/tests/test.py b/tests/test.py
index 4ba79df..b53b940 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -221,7 +221,7 @@ def test_handling_hr(self):
assert '' in document._body._body.xml
def test_external_hyperlink(self):
- hyperlink_html_example = "Anchor Link"
+ hyperlink_html_example = "Google External Link"
self.document.add_heading(
'Test: Handling external hyperlink',
@@ -231,8 +231,35 @@ def test_external_hyperlink(self):
self.parser.add_html_to_document(hyperlink_html_example, self.document)
document = self.parser.parse_html_string(hyperlink_html_example)
+ # Extract external hyperlinks
+ external_hyperlinks = []
+
+ for rel in document.part.rels.values():
+ if "hyperlink" in rel.reltype:
+ external_hyperlinks.append(rel.target_ref)
+
+ assert 'https://www.google.com' in external_hyperlinks
assert 'Introduction Header"
+ "Click here: Link to intro "
+ )
+
+ self.document.add_heading(
+ 'Test: Handling internal hyperlink',
+ level=1
+ )
+ # Add on document for human validation
+ self.parser.add_html_to_document(hyperlink_html_example, self.document)
+
+ document = self.parser.parse_html_string(hyperlink_html_example)
+ document_body = document._body._body.xml
+ assert '' in document_body
+ assert '' in document_body
+ assert '' in document_body
+
def test_image_no_src(self):
self.document.add_heading(
'Test: Handling img without src',
|