Org: Fix lxml usage in html annotation.

TYPE: Bugfix
OneGov · Apr 14, 2024 · 7daa3c5 · 7daa3c5
1 parent f1ea705
commit 7daa3c5
Showing 1 changed file with 5 additions and 6 deletions.
diff --git a/src/onegov/org/utils.py b/src/onegov/org/utils.py
@@ -9,6 +9,7 @@
 from isodate import parse_date, parse_datetime
 from itertools import groupby
 from libres.modules import errors as libres_errors
+from lxml.etree import ParserError
 from lxml.html import fragments_fromstring, tostring
 from onegov.core.cache import lru_cache
 from onegov.core.layout import Layout
@@ -161,19 +162,17 @@ def annotate_html(
     if not html:
         return html
 
-    fragments = fragments_fromstring(html)
+    try:
+        fragments = fragments_fromstring(html, no_leading_text=True)
+    except ParserError:
+        return html
     images = []
 
     # we perform a root xpath lookup, which will result in all paragraphs
     # being looked at - so we don't need to loop over all elements (yah, it's
     # a bit weird)
     for element in fragments[:1]:
 
-        # instead of failing, lxml will return strings instead of elements if
-        # they can't be parsed.. so we have to inspect the objects
-        if not hasattr(element, 'xpath'):
-            return html
-
         for paragraph in element.xpath('//p[img]'):
             add_class_to_node(paragraph, 'has-img')