diff --git a/changedetectionio/blueprint/rss/__init__.py b/changedetectionio/blueprint/rss/__init__.py index 113e35ac5a9..d4e091948d7 100644 --- a/changedetectionio/blueprint/rss/__init__.py +++ b/changedetectionio/blueprint/rss/__init__.py @@ -1,102 +1 @@ -import time -import datetime -import pytz -from flask import Blueprint, make_response, request, url_for -from loguru import logger -from feedgen.feed import FeedGenerator - -from changedetectionio.store import ChangeDetectionStore -from changedetectionio.safe_jinja import render as jinja_render - -def construct_blueprint(datastore: ChangeDetectionStore): - rss_blueprint = Blueprint('rss', __name__) - - # Import the login decorator if needed - # from changedetectionio.auth_decorator import login_optionally_required - @rss_blueprint.route("", methods=['GET']) - def feed(): - now = time.time() - # Always requires token set - app_rss_token = datastore.data['settings']['application'].get('rss_access_token') - rss_url_token = request.args.get('token') - if rss_url_token != app_rss_token: - return "Access denied, bad token", 403 - - from changedetectionio import diff - limit_tag = request.args.get('tag', '').lower().strip() - # Be sure limit_tag is a uuid - for uuid, tag in datastore.data['settings']['application'].get('tags', {}).items(): - if limit_tag == tag.get('title', '').lower().strip(): - limit_tag = uuid - - # Sort by last_changed and add the uuid which is usually the key.. - sorted_watches = [] - - # @todo needs a .itemsWithTag() or something - then we can use that in Jinaj2 and throw this away - for uuid, watch in datastore.data['watching'].items(): - # @todo tag notification_muted skip also (improve Watch model) - if datastore.data['settings']['application'].get('rss_hide_muted_watches') and watch.get('notification_muted'): - continue - if limit_tag and not limit_tag in watch['tags']: - continue - watch['uuid'] = uuid - sorted_watches.append(watch) - - sorted_watches.sort(key=lambda x: x.last_changed, reverse=False) - - fg = FeedGenerator() - fg.title('changedetection.io') - fg.description('Feed description') - fg.link(href='https://changedetection.io') - - for watch in sorted_watches: - - dates = list(watch.history.keys()) - # Re #521 - Don't bother processing this one if theres less than 2 snapshots, means we never had a change detected. - if len(dates) < 2: - continue - - if not watch.viewed: - # Re #239 - GUID needs to be individual for each event - # @todo In the future make this a configurable link back (see work on BASE_URL https://github.com/dgtlmoon/changedetection.io/pull/228) - guid = "{}/{}".format(watch['uuid'], watch.last_changed) - fe = fg.add_entry() - - # Include a link to the diff page, they will have to login here to see if password protection is enabled. - # Description is the page you watch, link takes you to the diff JS UI page - # Dict val base_url will get overriden with the env var if it is set. - ext_base_url = datastore.data['settings']['application'].get('active_base_url') - - # Because we are called via whatever web server, flask should figure out the right path ( - diff_link = {'href': url_for('ui.ui_views.diff_history_page', uuid=watch['uuid'], _external=True)} - - fe.link(link=diff_link) - - # @todo watch should be a getter - watch.get('title') (internally if URL else..) - - watch_title = watch.get('title') if watch.get('title') else watch.get('url') - fe.title(title=watch_title) - - html_diff = diff.render_diff(previous_version_file_contents=watch.get_history_snapshot(dates[-2]), - newest_version_file_contents=watch.get_history_snapshot(dates[-1]), - include_equal=False, - line_feed_sep="
") - - # @todo Make this configurable and also consider html-colored markup - # @todo User could decide if goes to the diff page, or to the watch link - rss_template = "\n

{{watch_title}}

\n

{{html_diff}}

\n\n" - content = jinja_render(template_str=rss_template, watch_title=watch_title, html_diff=html_diff, watch_url=watch.link) - - fe.content(content=content, type='CDATA') - - fe.guid(guid, permalink=False) - dt = datetime.datetime.fromtimestamp(int(watch.newest_history_key)) - dt = dt.replace(tzinfo=pytz.UTC) - fe.pubDate(dt) - - response = make_response(fg.rss_str()) - response.headers.set('Content-Type', 'application/rss+xml;charset=utf-8') - logger.trace(f"RSS generated in {time.time() - now:.3f}s") - return response - - return rss_blueprint \ No newline at end of file +RSS_FORMAT_TYPES = [('plaintext', 'Plain text'), ('html', 'HTML Color')] diff --git a/changedetectionio/blueprint/rss/blueprint.py b/changedetectionio/blueprint/rss/blueprint.py new file mode 100644 index 00000000000..edaa5b1ec8a --- /dev/null +++ b/changedetectionio/blueprint/rss/blueprint.py @@ -0,0 +1,147 @@ + +from changedetectionio.safe_jinja import render as jinja_render +from changedetectionio.store import ChangeDetectionStore +from feedgen.feed import FeedGenerator +from flask import Blueprint, make_response, request, url_for, redirect +from loguru import logger +import datetime +import pytz +import re +import time + + +BAD_CHARS_REGEX=r'[\x00-\x08\x0B\x0C\x0E-\x1F]' + +# Anything that is not text/UTF-8 should be stripped before it breaks feedgen (such as binary data etc) +def scan_invalid_chars_in_rss(content): + for match in re.finditer(BAD_CHARS_REGEX, content): + i = match.start() + bad_char = content[i] + hex_value = f"0x{ord(bad_char):02x}" + # Grab context + start = max(0, i - 20) + end = min(len(content), i + 21) + context = content[start:end].replace('\n', '\\n').replace('\r', '\\r') + logger.warning(f"Invalid char {hex_value} at pos {i}: ...{context}...") + # First match is enough + return True + + return False + + +def clean_entry_content(content): + cleaned = re.sub(BAD_CHARS_REGEX, '', content) + return cleaned + +def construct_blueprint(datastore: ChangeDetectionStore): + rss_blueprint = Blueprint('rss', __name__) + + # Some RSS reader situations ended up with rss/ (forward slash after RSS) due + # to some earlier blueprint rerouting work, it should goto feed. + @rss_blueprint.route("/", methods=['GET']) + def extraslash(): + return redirect(url_for('rss.feed')) + + # Import the login decorator if needed + # from changedetectionio.auth_decorator import login_optionally_required + @rss_blueprint.route("", methods=['GET']) + def feed(): + now = time.time() + # Always requires token set + app_rss_token = datastore.data['settings']['application'].get('rss_access_token') + rss_url_token = request.args.get('token') + if rss_url_token != app_rss_token: + return "Access denied, bad token", 403 + + from changedetectionio import diff + limit_tag = request.args.get('tag', '').lower().strip() + # Be sure limit_tag is a uuid + for uuid, tag in datastore.data['settings']['application'].get('tags', {}).items(): + if limit_tag == tag.get('title', '').lower().strip(): + limit_tag = uuid + + # Sort by last_changed and add the uuid which is usually the key.. + sorted_watches = [] + + # @todo needs a .itemsWithTag() or something - then we can use that in Jinaj2 and throw this away + for uuid, watch in datastore.data['watching'].items(): + # @todo tag notification_muted skip also (improve Watch model) + if datastore.data['settings']['application'].get('rss_hide_muted_watches') and watch.get('notification_muted'): + continue + if limit_tag and not limit_tag in watch['tags']: + continue + watch['uuid'] = uuid + sorted_watches.append(watch) + + sorted_watches.sort(key=lambda x: x.last_changed, reverse=False) + + fg = FeedGenerator() + fg.title('changedetection.io') + fg.description('Feed description') + fg.link(href='https://changedetection.io') + + html_colour_enable = False + if datastore.data['settings']['application'].get('rss_content_format') == 'html': + html_colour_enable = True + + for watch in sorted_watches: + + dates = list(watch.history.keys()) + # Re #521 - Don't bother processing this one if theres less than 2 snapshots, means we never had a change detected. + if len(dates) < 2: + continue + + if not watch.viewed: + # Re #239 - GUID needs to be individual for each event + # @todo In the future make this a configurable link back (see work on BASE_URL https://github.com/dgtlmoon/changedetection.io/pull/228) + guid = "{}/{}".format(watch['uuid'], watch.last_changed) + fe = fg.add_entry() + + # Include a link to the diff page, they will have to login here to see if password protection is enabled. + # Description is the page you watch, link takes you to the diff JS UI page + # Dict val base_url will get overriden with the env var if it is set. + ext_base_url = datastore.data['settings']['application'].get('active_base_url') + # @todo fix + + # Because we are called via whatever web server, flask should figure out the right path ( + diff_link = {'href': url_for('ui.ui_views.diff_history_page', uuid=watch['uuid'], _external=True)} + + fe.link(link=diff_link) + + # @todo watch should be a getter - watch.get('title') (internally if URL else..) + + watch_title = watch.get('title') if watch.get('title') else watch.get('url') + fe.title(title=watch_title) + try: + + html_diff = diff.render_diff(previous_version_file_contents=watch.get_history_snapshot(dates[-2]), + newest_version_file_contents=watch.get_history_snapshot(dates[-1]), + include_equal=False, + line_feed_sep="
", + html_colour=html_colour_enable + ) + except FileNotFoundError as e: + html_diff = f"History snapshot file for watch {watch.get('uuid')}@{watch.last_changed} - '{watch.get('title')} not found." + + # @todo Make this configurable and also consider html-colored markup + # @todo User could decide if goes to the diff page, or to the watch link + rss_template = "\n

{{watch_title}}

\n

{{html_diff}}

\n\n" + + content = jinja_render(template_str=rss_template, watch_title=watch_title, html_diff=html_diff, watch_url=watch.link) + + # Out of range chars could also break feedgen + if scan_invalid_chars_in_rss(content): + content = clean_entry_content(content) + + fe.content(content=content, type='CDATA') + fe.guid(guid, permalink=False) + dt = datetime.datetime.fromtimestamp(int(watch.newest_history_key)) + dt = dt.replace(tzinfo=pytz.UTC) + fe.pubDate(dt) + + response = make_response(fg.rss_str()) + response.headers.set('Content-Type', 'application/rss+xml;charset=utf-8') + logger.trace(f"RSS generated in {time.time() - now:.3f}s") + return response + + return rss_blueprint \ No newline at end of file diff --git a/changedetectionio/blueprint/settings/templates/settings.html b/changedetectionio/blueprint/settings/templates/settings.html index 65ce6c66026..ef58f4abe5b 100644 --- a/changedetectionio/blueprint/settings/templates/settings.html +++ b/changedetectionio/blueprint/settings/templates/settings.html @@ -78,7 +78,10 @@ {{ render_field(form.application.form.pager_size) }} Number of items per page in the watch overview list, 0 to disable. - +
+ {{ render_field(form.application.form.rss_content_format) }} + Love RSS? Does your reader support HTML? Set it here +
{{ render_checkbox_field(form.application.form.extract_title_as_title) }} Note: This will automatically apply to all existing watches. diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index b914393730e..e4da5c960c7 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -429,7 +429,7 @@ def static_content(group, filename): import changedetectionio.conditions.blueprint as conditions app.register_blueprint(conditions.construct_blueprint(datastore), url_prefix='/conditions') - import changedetectionio.blueprint.rss as rss + import changedetectionio.blueprint.rss.blueprint as rss app.register_blueprint(rss.construct_blueprint(datastore), url_prefix='/rss') # watchlist UI buttons etc diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 3fd199bb5c9..87c4421cfb3 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -3,6 +3,7 @@ from loguru import logger from wtforms.widgets.core import TimeInput +from changedetectionio.blueprint.rss import RSS_FORMAT_TYPES from changedetectionio.conditions.form import ConditionFormRow from changedetectionio.strtobool import strtobool @@ -739,6 +740,9 @@ class globalSettingsApplicationForm(commonSettingsForm): render_kw={"style": "width: 5em;"}, validators=[validators.NumberRange(min=0, message="Should be atleast zero (disabled)")]) + + rss_content_format = SelectField('RSS Content format', choices=RSS_FORMAT_TYPES) + removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"}) render_anchor_tag_content = BooleanField('Render anchor tag content', default=False) shared_diff_access = BooleanField('Allow access to view diff page when password is enabled', default=False, validators=[validators.Optional()]) diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index 4c9c34fec05..6e564630e12 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -1,4 +1,7 @@ from os import getenv + +from changedetectionio.blueprint.rss import RSS_FORMAT_TYPES + from changedetectionio.notification import ( default_notification_body, default_notification_format, @@ -9,6 +12,8 @@ _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6 DEFAULT_SETTINGS_HEADERS_USERAGENT='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36' + + class model(dict): base_config = { 'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!", @@ -48,6 +53,7 @@ class model(dict): 'password': False, 'render_anchor_tag_content': False, 'rss_access_token': None, + 'rss_content_format': RSS_FORMAT_TYPES[0][0], 'rss_hide_muted_watches': True, 'schema_version' : 0, 'shared_diff_access': False, diff --git a/changedetectionio/tests/test_rss.py b/changedetectionio/tests/test_rss.py index 0b9654345c5..e70fb027e49 100644 --- a/changedetectionio/tests/test_rss.py +++ b/changedetectionio/tests/test_rss.py @@ -49,6 +49,22 @@ def set_original_cdata_xml(): f.write(test_return_data) + +def set_html_content(content): + test_return_data = f""" + + Some initial text
+

{content}

+
+ So let's see what happens.
+ + + """ + + # Write as UTF-8 encoded bytes + with open("test-datastore/endpoint-content.txt", "wb") as f: + f.write(test_return_data.encode('utf-8')) + def test_setup(client, live_server, measure_memory_usage): live_server_setup(live_server) @@ -164,3 +180,58 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage): assert b'Some other description' not in res.data # Should NOT be selected by the xpath res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True) + + +def test_rss_bad_chars_breaking(client, live_server): + """This should absolutely trigger the RSS builder to go into worst state mode + + - source: prefix means no html conversion (which kinda filters out the bad stuff) + - Binary data + - Very long so that the saving is performed by Brotli (and decoded back to bytes) + + Otherwise feedgen should support regular unicode + """ + #live_server_setup(live_server) + + with open("test-datastore/endpoint-content.txt", "w") as f: + ten_kb_string = "A" * 10_000 + f.write(ten_kb_string) + + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("imports.import_page"), + data={"urls": "source:"+test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + wait_for_all_checks(client) + + # Set the bad content + with open("test-datastore/endpoint-content.txt", "w") as f: + jpeg_bytes = "\xff\xd8\xff\xe0\x00\x10XXXXXXXX\x00\x01\x02\x00\x00\x01\x00\x01\x00\x00" # JPEG header + jpeg_bytes += "A" * 10_000 + + f.write(jpeg_bytes) + + res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + assert b'Queued 1 watch for rechecking.' in res.data + wait_for_all_checks(client) + rss_token = extract_rss_token_from_UI(client) + + uuid = next(iter(live_server.app.config['DATASTORE'].data['watching'])) + assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n == 2 + + # Check RSS feed is still working + res = client.get( + url_for("rss.feed", uuid=uuid, token=rss_token), + follow_redirects=False # Important! leave this off! it should not redirect + ) + assert res.status_code == 200 + + #assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n == 2 + #assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n == 2 + + + + + diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py index 00fc16a44ea..1112db22e26 100644 --- a/changedetectionio/tests/util.py +++ b/changedetectionio/tests/util.py @@ -173,7 +173,7 @@ def test_endpoint(): return resp # Tried using a global var here but didn't seem to work, so reading from a file instead. - with open("test-datastore/endpoint-content.txt", "r") as f: + with open("test-datastore/endpoint-content.txt", "rb") as f: resp = make_response(f.read(), status_code) if uppercase_headers: resp.headers['CONTENT-TYPE'] = ctype if ctype else 'text/html'