Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions changedetectionio/html_tools.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import List
from loguru import logger
from lxml import etree
from typing import List
import json
import re

Expand Down Expand Up @@ -298,8 +299,10 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
# https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
try:
stripped_text_from_html = _parse_json(json.loads(content), json_filter)
except json.JSONDecodeError:
# .lstrip("\ufeff") strips a leading UTF-8 byte order mark (BOM) before json.loads() is called; the rest of the content is left unchanged
stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff") ), json_filter)
except json.JSONDecodeError as e:
logger.warning(str(e))

# Foreach <script json></script> blob.. just return the first that matches json_filter
# As a last resort, try to parse the whole <body>
Expand Down
12 changes: 12 additions & 0 deletions changedetectionio/tests/test_jsonpath_jq_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,3 +514,15 @@ def test_check_jq_ext_filter(client, live_server, measure_memory_usage):
def test_check_jqraw_ext_filter(client, live_server, measure_memory_usage):
    """Run the shared JSON extension-filter check with a jq selector.

    The check is only executed when ``jq_support`` is truthy; otherwise the
    test is a no-op.
    """
    # NOTE(review): the selector below uses the 'jq:' prefix although this test
    # is named 'jqraw' -- confirm whether 'jqraw:' was intended here.
    if not jq_support:
        return

    sold_items_filter = 'jq:.[] | select(.status | contains("Sold"))'
    check_json_ext_filter(sold_items_filter, client, live_server)

def test_jsonpath_BOM_utf8(client, live_server, measure_memory_usage):
    """Check that a JSONPath filter works on JSON text that starts with a BOM."""
    from .. import html_tools

    # Valid JSON (double-quoted keys, non-ASCII values) prefixed with U+FEFF
    bom_prefixed_json = '\ufeff{"name": "José", "emoji": "😊", "language": "中文", "greeting": "Привет"}'

    # The leading BOM must not stop the "name" value from being extracted
    assert html_tools.extract_json_as_string(bom_prefixed_json, "json:$.name") == '"José"'