diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..30324e2 --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,58 @@ +# Source: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/#the-whole-ci-cd-workflow +name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI + +on: + push: + branches: [ main ] + tags: [ '*' ] # tag pushes must trigger the workflow, otherwise the publish-to-pypi `if:` below is never true + workflow_run: + workflows: [ tests ] + types: + - completed + +jobs: + build: + name: Build distribution 📦 + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ + + publish-to-pypi: + name: >- + Publish Python 🐍 distribution 📦 to PyPI + if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + needs: + - build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/html-for-docx + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/HISTORY.rst b/HISTORY.rst new file mode 100644 index 0000000..28c0dad --- /dev/null +++ b/HISTORY.rst @@ -0,0 +1,57 @@ +.. :changelog: + +Release History +--------------- + +1.0.4 (2024-08-11) +++++++++++++++++++ + +**Updates** +- Create Changelog HISTORY. +- Update README. +- Add Github Action Workflow to publish in pypi. +- Change default VERSION tag, removing the "v" from new releases. 
+ +**New Features** +- Support internal links (anchors) | [Dfop02](https://github.com/dfop02) + + +1.0.3 (2024-02-27) +++++++++++++++++++ + +- Adapt font_size when given as a keyword, e.g. small, medium, etc. | [Dfop02](https://github.com/dfop02) +- Fix error for image width and height when no digits | [Dfop02](https://github.com/dfop02) + + +1.0.2 (2024-02-20) +++++++++++++++++++ + +- Support px, cm, pt and % for style margin-left to paragraphs | [Dfop02](https://github.com/dfop02) +- Fix 'style lookup by style_id is deprecated.' | [Dfop02](https://github.com/dfop02) +- Fix bug when any style has `!important` | [Dfop02](https://github.com/dfop02) +- Refactor tests to be more consistent and less 'human validation' | [Dfop02](https://github.com/dfop02) +- Support colors by name | [Dfop02](https://github.com/dfop02) + + +1.0.1 (2024-02-05) +++++++++++++++++++ + +- Fix README. + + +1.0.0 (2024-02-05) ++++++++++++++++++++ + +- Initial Release! + +**Fixes** +- Handle missing run for leading br tag | [dashingdove](https://github.com/dashingdove) from [PR](https://github.com/pqzx/html2docx/pull/53) +- Fix base64 images | [djplaner](https://github.com/djplaner) from [Issue](https://github.com/pqzx/html2docx/issues/28#issuecomment-1052736896) +- Handle img tag without src attribute | [johnjor](https://github.com/johnjor) from [PR](https://github.com/pqzx/html2docx/pull/63) + +**New Features** +- Add Width/Height style to images | [maifeeulasad](https://github.com/maifeeulasad) from [PR](https://github.com/pqzx/html2docx/pull/29) +- Improve performance on large tables | [dashingdove](https://github.com/dashingdove) from [PR](https://github.com/pqzx/html2docx/pull/58) +- Support for HTML Pagination | [Evilran](https://github.com/Evilran) from [PR](https://github.com/pqzx/html2docx/pull/39) +- Support Table style | [Evilran](https://github.com/Evilran) from [PR](https://github.com/pqzx/html2docx/pull/39) +- Support alternative encoding | [HebaElwazzan](https://github.com/HebaElwazzan) 
from [PR](https://github.com/pqzx/html2docx/pull/59) diff --git a/README.md b/README.md index e1b85aa..5a129ca 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,9 @@ My goal to fork and fix/update this package was to complete my current task at w - Support for HTML Pagination | [Evilran](https://github.com/Evilran) from [PR](https://github.com/pqzx/html2docx/pull/39) - Support Table style | [Evilran](https://github.com/Evilran) from [PR](https://github.com/pqzx/html2docx/pull/39) - Support alternative encoding | [HebaElwazzan](https://github.com/HebaElwazzan) from [PR](https://github.com/pqzx/html2docx/pull/59) +- Support colors by name | [Dfop02](https://github.com/dfop02) +- Support font_size when text, ex.: small, medium, etc. | [Dfop02](https://github.com/dfop02) +- Support to internal links (Anchor) | [Dfop02](https://github.com/dfop02) - Refactory Tests to be more consistent and less 'human validation' | [Dfop02](https://github.com/dfop02) ## License diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 255947a..3b8a757 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -64,8 +64,8 @@ def set_initial_attrs(self, document=None): 'list': [], } self.doc = document if document else Document() - self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup self.document = self.doc + self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup self.include_tables = True # TODO add this option back in? 
self.include_images = self.options['images'] self.include_styles = self.options['styles'] @@ -73,20 +73,50 @@ def set_initial_attrs(self, document=None): self.skip = False self.skip_tag = None self.instances_to_skip = 0 + self.bookmark_id = 0 def copy_settings_from(self, other): """Copy settings from another instance of HtmlToDocx""" self.table_style = other.table_style def get_cell_html(self, soup): - # Returns string of td element with opening and closing tags removed - # Cannot use find_all as it only finds element tags and does not find text which - # is not inside an element + """ + Returns string of td element with opening and closing tags removed + Cannot use find_all as it only finds element tags and does not find text which + is not inside an element + """ return ' '.join([str(i) for i in soup.contents]) + def unit_converter(self, unit: str, value: int): + result = None + if unit == 'px': + result = Inches(min(value // 10 * INDENT, MAX_INDENT)) + elif unit == 'cm': + result = Cm(min(value // 10 * INDENT, MAX_INDENT) * 2.54) + elif unit == 'pt': + result = Pt(min(value // 10 * INDENT, MAX_INDENT) * 72) + elif unit == '%': + result = Inches(min(MAX_INDENT * (value / 100), MAX_INDENT)) # share of MAX_INDENT (inches); a bare int would be read as a raw length of ~0 + + # When unit is not supported returns None + return result + + def add_bookmark(self, bookmark_name): + """Adds a word bookmark to an existing paragraph""" + bookmark_start = OxmlElement('w:bookmarkStart') + bookmark_start.set(qn('w:id'), str(self.bookmark_id)) + bookmark_start.set(qn('w:name'), bookmark_name) + self.paragraph._element.insert(0, bookmark_start) + + bookmark_end = OxmlElement('w:bookmarkEnd') + bookmark_end.set(qn('w:id'), str(self.bookmark_id)) + self.paragraph._element.append(bookmark_end) + + self.bookmark_id += 1 + def add_styles_to_paragraph(self, style): if 'text-align' in style: - align = re.sub('!important', '', style['text-align'], flags=re.IGNORECASE) + align = utils.remove_important_from_style(style['text-align']) if 'center' in align: 
self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER @@ -101,25 +131,15 @@ def add_styles_to_paragraph(self, style): if 'auto' in margin_left and 'auto' in margin_right: self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER elif 'margin-left' in style: - margin = re.sub('!important', '', style['margin-left'], flags=re.IGNORECASE) + margin = utils.remove_important_from_style(style['margin-left']) units = re.sub(r'[0-9]+', '', margin) - margin = int(float(re.sub(r'[a-zA-Z\!]+', '', margin))) + margin = int(float(re.sub(r'[a-zA-Z\!\%]+', '', margin))) - if units == 'px': - self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT)) - elif units == 'cm': - self.paragraph.paragraph_format.left_indent = Cm(min(margin // 10 * INDENT, MAX_INDENT) * 2.54) - elif units == 'pt': - self.paragraph.paragraph_format.left_indent = Pt(min(margin // 10 * INDENT, MAX_INDENT) * 72) - elif units == '%': - self.paragraph.paragraph_format.left_indent = MAX_INDENT * (units / 100) - else: - # When unit is not supported - self.paragraph.paragraph_format.left_indent = None + self.paragraph.paragraph_format.left_indent = self.unit_converter(units, margin) def add_styles_to_table(self, style): if 'text-align' in style: - align = re.sub('!important', '', style['text-align'], flags=re.IGNORECASE) + align = utils.remove_important_from_style(style['text-align']) if 'center' in align: self.table.alignment = WD_ALIGN_PARAGRAPH.CENTER @@ -134,30 +154,20 @@ def add_styles_to_table(self, style): if 'auto' in margin_left and 'auto' in margin_right: self.table.alignment = WD_ALIGN_PARAGRAPH.CENTER elif 'margin-left' in style: - margin = re.sub('!important', '', style['margin-left'], flags=re.IGNORECASE) + margin = utils.remove_important_from_style(style['margin-left']) units = re.sub(r'[0-9]+', '', margin) - margin = int(float(re.sub(r'[a-zA-Z\!]+', '', margin))) + margin = int(float(re.sub(r'[a-zA-Z\!\%]+', '', margin))) - if units == 
'px': - self.table.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT)) - elif units == 'cm': - self.table.left_indent = Cm(min(margin // 10 * INDENT, MAX_INDENT) * 2.54) - elif units == 'pt': - self.table.left_indent = Pt(min(margin // 10 * INDENT, MAX_INDENT) * 72) - elif units == '%': - self.table.left_indent = MAX_INDENT * (units / 100) - else: - # When unit is not supported - self.table.left_indent = None + self.table.left_indent = self.unit_converter(units, margin) def add_styles_to_run(self, style): if 'font-size' in style: - font_size = re.sub('!important', '', style['font-size'], flags=re.IGNORECASE) + font_size = utils.remove_important_from_style(style['font-size']) # Adapt font_size when text, ex.: small, medium, etc. font_size = utils.adapt_font_size(font_size) units = re.sub(r'[0-9]+', '', font_size) - font_size = int(float(re.sub(r'[a-zA-Z\!]+', '', font_size))) + font_size = int(float(re.sub(r'[a-zA-Z\!\%]+', '', font_size))) if units == 'px': font_size_unit = Inches(utils.px_to_inches(font_size)) @@ -174,7 +184,7 @@ def add_styles_to_run(self, style): run.font.size = font_size_unit if 'color' in style: - font_color = re.sub('!important', '', style['color'].lower(), flags=re.IGNORECASE) + font_color = utils.remove_important_from_style(style['color'].lower()) if 'rgb' in font_color: color = re.sub(r'[a-z()]+', '', font_color) @@ -192,7 +202,7 @@ def add_styles_to_run(self, style): self.run.font.color.rgb = RGBColor(*colors) if 'background-color' in style: - background_color = re.sub('!important', '', style['background-color'].lower(), flags=re.IGNORECASE) + background_color = utils.remove_important_from_style(style['background-color'].lower()) if 'rgb' in background_color: color = re.sub(r'[a-z()]+', '', background_color) @@ -369,31 +379,48 @@ def handle_div(self, current_attrs): if 'style' in current_attrs and 'page-break-after: always' in current_attrs['style']: self.doc.add_page_break() - def handle_link(self, href, text): - # Link requires 
a relationship - # is_external = href.startswith('http') - rel_id = self.paragraph.part.relate_to( - href, - docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, - is_external=True # TO-DO support anchor links for this library yet - ) + def handle_link(self, href, text, tooltip=None): + """ + A function that places a hyperlink within a paragraph object. - # Create the w:hyperlink tag and add needed values - hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink') - hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id) + Args: + href: Target URL, or "#name" for an internal anchor; may be None/empty when the tag has no href. + text: The text displayed for the url. + tooltip: The text displayed when hovering over the link. + """ + is_external = bool(href) and not href.startswith('#') + hyperlink = OxmlElement('w:hyperlink') + + if is_external: + # Create external hyperlink + rel_id = self.paragraph.part.relate_to( + href, + docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, + is_external=True + ) + + # Create the w:hyperlink tag and add needed values + hyperlink.set(qn('r:id'), rel_id) + else: + # Create internal hyperlink (anchor) + hyperlink.set(qn('w:anchor'), (href or '').replace('#', '')) + + if tooltip is not None: + # set tooltip to hyperlink + hyperlink.set(qn('w:tooltip'), tooltip) # Create sub-run subrun = self.paragraph.add_run() - rPr = docx.oxml.shared.OxmlElement('w:rPr') + rPr = OxmlElement('w:rPr') # add default color - c = docx.oxml.shared.OxmlElement('w:color') - c.set(docx.oxml.shared.qn('w:val'), "0000EE") + c = OxmlElement('w:color') + c.set(qn('w:val'), "0000EE") rPr.append(c) # add underline - u = docx.oxml.shared.OxmlElement('w:u') - u.set(docx.oxml.shared.qn('w:val'), 'single') + u = OxmlElement('w:u') + u.set(qn('w:val'), 'single') rPr.append(u) subrun._r.append(rPr) @@ -485,6 +512,9 @@ def handle_starttag(self, tag, attrs): if tag in ['p', 'li', 'pre']: self.run = self.paragraph.add_run() + if 'id' in current_attrs: + self.add_bookmark(current_attrs['id']) + # add style if not self.include_styles: return @@ -540,7 +570,7 @@ def 
handle_data(self, data): # https://html.spec.whatwg.org/#interactive-content link = self.tags.get('a') if link: - self.handle_link(link['href'], data) + self.handle_link(link.get('href', None), data, link.get('title', None)) else: # If there's a link, dont put the data directly in the run self.run = self.paragraph.add_run(data) @@ -617,15 +647,15 @@ def run_process(self, html): def add_html_to_document(self, html, document): if not isinstance(html, str): - raise ValueError('First argument needs to be a %s' % str) + raise ValueError(f'First argument needs to be a {str}') elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell): - raise ValueError('Second argument needs to be a %s' % docx.document.Document) + raise ValueError(f'Second argument needs to be a {docx.document.Document}') self.set_initial_attrs(document) self.run_process(html) def add_html_to_cell(self, html, cell): if not isinstance(cell, docx.table._Cell): - raise ValueError('Second argument needs to be a %s' % docx.table._Cell) + raise ValueError(f'Second argument needs to be a {docx.table._Cell}') unwanted_paragraph = cell.paragraphs[0] utils.delete_paragraph(unwanted_paragraph) self.set_initial_attrs(cell) @@ -642,8 +672,8 @@ def parse_html_file(self, filename_html, filename_docx=None, encoding='utf-8'): self.run_process(html) if not filename_docx: path, filename = os.path.split(filename_html) - filename_docx = '%s/new_docx_file_%s' % (path, filename) - self.doc.save('%s.docx' % filename_docx) + filename_docx = f'{path}/new_docx_file_{filename}' + self.doc.save(f'{filename_docx}.docx') def parse_html_string(self, html): self.set_initial_attrs() diff --git a/html4docx/utils.py b/html4docx/utils.py index 12d2cc4..d333ae1 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -66,6 +66,9 @@ def adapt_font_size(size): return size +def remove_important_from_style(text): + return re.sub('!important', '', text, flags=re.IGNORECASE) + def fetch_image(url): """ 
Attempts to fetch an image from a url. diff --git a/requirements.txt b/requirements.txt index 5187bc7..402b1b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -beautifulsoup4==4.12.2 -python-docx==1.1.0 \ No newline at end of file +beautifulsoup4>=4.12.2 +python-docx>=1.1.0 diff --git a/setup.cfg b/setup.cfg index 833b934..61e3a70 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,8 +1,9 @@ [metadata] name = html-for-docx -version = 1.0.3 +version = 1.0.4 url = https://github.com/dfop02/html4docx project_urls = + Changelog = https://github.com/dfop02/html4docx/blob/master/HISTORY.rst Bug Tracker = https://github.com/dfop02/html4docx/issues Repository = https://github.com/dfop02/html4docx author = Diogo Fernandes @@ -26,6 +27,7 @@ classifiers = Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Topic :: Software Development :: Libraries :: Python Modules + Topic :: Software Development :: Libraries Topic :: Utilities [options] diff --git a/setup.py b/setup.py index 2153237..1488eed 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ here = os.path.abspath(os.path.dirname(__file__)) README = open(os.path.join(here, 'README.md')).read() -VERSION = '1.0.3' +VERSION = '1.0.4' setup( name = 'html-for-docx', @@ -14,20 +14,22 @@ packages = find_packages(), python_requires = '>=3.7', author = 'Diogo Fernandes', - author_email = 'dfop02@hotmail.com', + author_email = 'diogofernandesop@gmail.com', platforms = ['any'], include_package_data = True, - keywords = ['html', 'docx', 'convert'], + keywords = ['html', 'docx', 'office', 'word', 'convert', 'transform'], zip_safe = False, url = 'https://github.com/dfop02/html4docx', project_urls = { + "Changelog": "https://github.com/dfop02/html4docx/blob/master/HISTORY.rst", "Bug Tracker": "https://github.com/dfop02/html4docx/issues", "Repository": "https://github.com/dfop02/html4docx" }, - download_url = f'https://github.com/dfop02/html4docx/archive/v{VERSION}.tar.gz', + download_url = 
f'https://github.com/dfop02/html4docx/archive/{VERSION}.tar.gz', classifiers = [ 'Intended Audience :: Developers', 'Topic :: Software Development :: Build Tools', + 'Topic :: Software Development :: Libraries', 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.7', diff --git a/tests/assets/htmls/text1.html b/tests/assets/htmls/text1.html index a43d301..93cd7a2 100644 --- a/tests/assets/htmls/text1.html +++ b/tests/assets/htmls/text1.html @@ -44,6 +44,9 @@

heading 1

Indent 4

Indent max?

Indent 2.5em

+

Indent 20cm

+

Indent 15pt

+

Indent 10%

asdfsa

link

diff --git a/tests/test.py b/tests/test.py index 4ba79df..b53b940 100644 --- a/tests/test.py +++ b/tests/test.py @@ -221,7 +221,7 @@ def test_handling_hr(self): assert '' in document._body._body.xml def test_external_hyperlink(self): - hyperlink_html_example = "Anchor Link" + hyperlink_html_example = "Google External Link" self.document.add_heading( 'Test: Handling external hyperlink', @@ -231,8 +231,35 @@ def test_external_hyperlink(self): self.parser.add_html_to_document(hyperlink_html_example, self.document) document = self.parser.parse_html_string(hyperlink_html_example) + # Extract external hyperlinks + external_hyperlinks = [] + + for rel in document.part.rels.values(): + if "hyperlink" in rel.reltype: + external_hyperlinks.append(rel.target_ref) + + assert 'https://www.google.com' in external_hyperlinks assert '

Introduction Header

" + "

Click here: Link to intro

" + ) + + self.document.add_heading( + 'Test: Handling internal hyperlink', + level=1 + ) + # Add on document for human validation + self.parser.add_html_to_document(hyperlink_html_example, self.document) + + document = self.parser.parse_html_string(hyperlink_html_example) + document_body = document._body._body.xml + assert '' in document_body + assert '' in document_body + assert '' in document_body + def test_image_no_src(self): self.document.add_heading( 'Test: Handling img without src',