Skip to content

Commit

Permalink
fix: add more HTML validation & fixes (#3891)
Browse files Browse the repository at this point in the history
* Update vnu.jar

* Fix py2 -> py3 issue

* Run pyupgrade

* test: Add default-jdk to images

* test: Add option to also validate HTML with vnu.jar

Since it's already installed in bin. Don't do this by default, since it
increases the time needed for tests by ~50%.

* fix: Stop the urlizer from urlizing in linkified mailto: text

* More HTML fixes

* More HTML validation fixes

* And more HTML fixes

* Fix floating badge

* Ignore unicode errors

* Only URLize docs that exist

* Final fixes

* Don't URLize everything during test-crawl

* Feed HTML into vnu using python rather than Java to speed things up

* Allow test-crawl to start vnu on a different port

* Increase retry count to vnu. Restore batch size to 30.

* More HTML validation fixes

* Use urllib3 to make requests to vnu, since overriding requests_mock is tricky

* Undo commit of unmodified file

* Also urlize ftp links

* Fix matching of file name

* More HTML fixes

* Add `is_valid_url` filter

* weekday -> data-weekday

* urlencode URLs

* Add and use vnu_fmt_message. Bump vnu max buffer.

* Simplify doc_exists

* Don't add tab link to mail archive if the URL is invalid

* Run urlize_ietf_docs before linkify

Reduces the possibility of generating incorrect HTML

* Undo superfluous change

* Runner fixes

* Consolidate vnu message filtering into vnu_filter_message

* Correctly handle multiple persons with same name

* Minimize diff

* Fix HTML nits

* Print source snippet in vnu_fmt_message

* Only escape if there is something to escape

* Fix snippet

* Skip crufty old IPR declarations

* Only include modal when needed. Add handles.

* Fix wordwrap+linkification

* Update ietf/doc/templatetags/ietf_filters.py

* Update ietf/doc/templatetags/tests_ietf_filters.py

* Don't right-align second column
  • Loading branch information
larseggert authored May 3, 2022
1 parent f778058 commit 5598762
Show file tree
Hide file tree
Showing 55 changed files with 479 additions and 609 deletions.
493 changes: 52 additions & 441 deletions bin/test-crawl

Large diffs are not rendered by default.

Binary file modified bin/vnu.jar
Binary file not shown.
1 change: 1 addition & 0 deletions dev/tests/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ RUN apt-get install -qy \
bash \
build-essential \
curl \
default-jdk \
docker-ce-cli \
enscript \
gawk \
Expand Down
1 change: 1 addition & 0 deletions docker/app.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ RUN apt-get install -qy \
bash \
build-essential \
curl \
default-jdk \
docker-ce-cli \
enscript \
fish \
Expand Down
107 changes: 94 additions & 13 deletions ietf/doc/templatetags/ietf_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import datetime
import re
import os
from urllib.parse import urljoin

from email.utils import parseaddr
Expand All @@ -17,10 +18,13 @@
from django.utils.encoding import force_text
from django.utils.encoding import force_str # pyflakes:ignore force_str is used in the doctests
from django.urls import reverse as urlreverse
from django.core.cache import cache
from django.core.validators import URLValidator
from django.core.exceptions import ValidationError

import debug # pyflakes:ignore

from ietf.doc.models import BallotDocEvent
from ietf.doc.models import BallotDocEvent, DocAlias
from ietf.doc.models import ConsensusDocEvent
from ietf.utils.html import sanitize_fragment
from ietf.utils import log
Expand Down Expand Up @@ -184,49 +188,113 @@ def rfceditor_info_url(rfcnum : str):
"""Link to the RFC editor info page for an RFC"""
return urljoin(settings.RFC_EDITOR_INFO_BASE_URL, f'rfc{rfcnum}')


def doc_exists(name):
    """Check whether a given document exists.

    `name` may carry a trailing ``.txt``/``.ps``/``.pdf`` extension and/or an
    embedded revision (``-NN-NN`` for charters, ``-NN`` otherwise). The
    extension is always removed; a revision is only removed when the name
    including it is not itself a known DocAlias.

    Returns True when a DocAlias with a matching name is found (or
    unconditionally while running tests, see below), False otherwise.
    """
    import hashlib

    def find_unique(n):
        """Return the DocAlias name equal to `n`, or None if there is none."""
        # Do not use hash(n) as the key: str hashes are salted per process
        # (PYTHONHASHSEED), so a cache shared between processes would never
        # agree on the key, and two names hashing to the same int would
        # return each other's result. A digest is stable and is also a safe
        # key for any cache backend regardless of what characters n contains.
        digest = hashlib.sha256(n.encode("utf-8", "backslashreplace")).hexdigest()
        key = f"doc_exists:{digest}"
        found = cache.get(key)
        if not found:
            exact = DocAlias.objects.filter(name=n).first()
            # "_" is the cached marker for "looked up, does not exist"
            found = exact.name if exact else "_"
            cache.set(key, found)
        return None if found == "_" else found

    # all documents exist when tests are running
    if settings.SERVER_MODE == 'test':
        # unless we are running test-crawl, which would otherwise 404
        if "DJANGO_URLIZE_IETF_DOCS_PRODUCTION" not in os.environ:
            return True

    # chop away extension
    extension_split = re.search(r"^(.+)\.(txt|ps|pdf)$", name)
    if extension_split:
        name = extension_split.group(1)

    if find_unique(name):
        return True

    # check for embedded rev - this may be ambiguous, so only accept the
    # shortened name if it is found. Charter revisions have the form NN-NN
    # and must be tried first; stripping a single -NN from them leaves a
    # name that still ends in a revision component and never matches.
    for pattern in (r"^(charter-.+)-[0-9]{2}-[0-9]{2}$", r"^(.+)-[0-9]{2,}$"):
        rev_split = re.search(pattern, name)
        if rev_split and find_unique(rev_split.group(1)):
            return True

    return False


def link_charter_doc_match1(match):
    """Turn a charter reference with an NN-NN revision into a /doc/ link.

    `match` is a regex match whose group 1 is the charter name followed by
    a trailing "-" and whose group 2 is the revision. The matched text is
    returned unchanged when doc_exists() does not know the document.
    """
    text = match[0]
    if doc_exists(text):
        # drop the trailing "-" that group 1 carries
        return f'<a href="/doc/{match[1][:-1]}/{match[2]}/">{text}</a>'
    return text


def link_charter_doc_match2(match):
    """Turn a charter reference with an NN revision into a /doc/ link.

    `match` is a regex match whose group 1 is the charter name followed by
    a trailing "-" and whose group 2 is the revision. The matched text is
    returned unchanged when doc_exists() does not know the document.
    """
    matched = match[0]
    known = doc_exists(matched)
    # group 1 ends in "-", which is not part of the document name
    return f'<a href="/doc/{match[1][:-1]}/{match[2]}/">{matched}</a>' if known else matched


def link_non_charter_doc_match(match):
if len(match[3])==2 and match[3].isdigit():
if not doc_exists(match[0]):
return match[0]
if len(match[3]) == 2 and match[3].isdigit():
return f'<a href="/doc/{match[2][:-1]}/{match[3]}/">{match[0]}</a>'
else:
return f'<a href="/doc/{match[2]}{match[3]}/">{match[0]}</a>'

@register.filter(name='urlize_ietf_docs', is_safe=True, needs_autoescape=True)

def link_other_doc_match(match):
    """Turn an RFC/BCP/STD/FYI reference into a /doc/ link.

    `match` is a regex match where group 1 is the reference as written
    (e.g. "RFC 0791"), group 2 is the series label and group 3 is the
    number without leading zeros. The matched text is returned unchanged
    when doc_exists() does not know the document.
    """
    # Check for the same normalized name that the link points at. Checking
    # the text as written (whitespace removed) would look up e.g. "RFC0791",
    # which keeps the leading zeros and the original case and so can miss a
    # document that the normalized link ("rfc791") would reach.
    doc = f"{match[2].strip().lower()}{match[3]}"
    if not doc_exists(doc):
        return match[0]
    return f'<a href="/doc/{doc}/">{match[1]}</a>'


@register.filter(name="urlize_ietf_docs", is_safe=True, needs_autoescape=True)
def urlize_ietf_docs(string, autoescape=None):
"""
Make occurrences of RFC NNNN and draft-foo-bar links to /doc/.
"""
if autoescape and not isinstance(string, SafeData):
string = escape(string)
exp1 = r"\b(charter-(?:[\d\w\.+]+-)*)(\d\d-\d\d)(\.txt)?\b"
exp2 = r"\b(charter-(?:[\d\w\.+]+-)*)(\d\d)(\.txt)?\b"
if "<" in string:
string = escape(string)
else:
string = mark_safe(string)
exp1 = r"\b(?<![/\-:=#])(charter-(?:[\d\w\.+]+-)*)(\d\d-\d\d)(\.txt)?\b"
exp2 = r"\b(?<![/\-:=#])(charter-(?:[\d\w\.+]+-)*)(\d\d)(\.txt)?\b"
if re.search(exp1, string):
string = re.sub(
exp1,
lambda x: f'<a href="/doc/{x[1][:-1]}/{x[2]}/">{x[0]}</a>',
link_charter_doc_match1,
string,
flags=re.IGNORECASE | re.ASCII,
)
elif re.search(exp2, string):
elif re.search(exp2, string):
string = re.sub(
exp2,
lambda x: f'<a href="/doc/{x[1][:-1]}/{x[2]}/">{x[0]}</a>',
link_charter_doc_match2,
string,
flags=re.IGNORECASE | re.ASCII,
)
string = re.sub(
r"\b(?<![/-])(((?:draft-|bofreq-|conflict-review-|status-change-)(?:[\d\w\.+]+-)*)([\d\w\.+]+?)(\.txt)?)\b",
r"\b(?<![/\-:=#])(((?:draft-|bofreq-|conflict-review-|status-change-)(?:[\d\w\.+]+-)*)([\d\w\.+]+?)(\.txt)?)\b(?![-@])",
link_non_charter_doc_match,
string,
flags=re.IGNORECASE | re.ASCII,
)
string = re.sub(
# r"\b((RFC|BCP|STD|FYI|(?:draft-|bofreq-|conflict-review-|status-change-|charter-)[-\d\w.+]+)\s*0*(\d+))\b",
r"\b(?<!-)((RFC|BCP|STD|FYI)\s*0*(\d+))\b",
lambda x: f'<a href="/doc/{x[2].strip().lower()}{x[3]}/">{x[1]}</a>',
r"\b(?<![/\-:=#])((RFC|BCP|STD|FYI)\s*0*(\d+))\b",
link_other_doc_match,
string,
flags=re.IGNORECASE | re.ASCII,
)
return mark_safe(string)

urlize_ietf_docs = stringfilter(urlize_ietf_docs)

@register.filter(name='urlize_related_source_list', is_safe=True, needs_autoescape=True)
Expand Down Expand Up @@ -444,7 +512,7 @@ def format_snippet(text, trunc_words=25):
@register.simple_tag
def doc_edit_button(url_name, *args, **kwargs):
"""Given URL name/args/kwargs, looks up the URL just like "url" tag and returns a properly formatted button for the document material tables."""
return mark_safe('<a class="btn btn-primary btn-sm" type="button" href="%s">Edit</a>' % (urlreverse(url_name, args=args, kwargs=kwargs)))
return mark_safe('<a class="btn btn-primary btn-sm" href="%s">Edit</a>' % (urlreverse(url_name, args=args, kwargs=kwargs)))

@register.filter
def textify(text):
Expand Down Expand Up @@ -765,3 +833,16 @@ def absurl(viewname, **kwargs):
Uses settings.IDTRACKER_BASE_URL as the base.
"""
return urljoin(settings.IDTRACKER_BASE_URL, urlreverse(viewname, kwargs=kwargs))


@register.filter
def is_valid_url(url):
    """
    Template filter: True if `url` passes Django's URLValidator, else False.
    """
    try:
        URLValidator()(url)
    except ValidationError:
        # the validator signals a syntactically invalid URL by raising
        return False
    else:
        return True
14 changes: 11 additions & 3 deletions ietf/doc/templatetags/tests_ietf_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,22 @@ def test_urlize_ietf_docs(self):
),
(
"draft-madanapalli-nd-over-802.16-problems",
'<a href="/doc/draft-madanapalli-nd-over-802.16-problems/">draft-madanapalli-nd-over-802.16-problems</a>'
'<a href="/doc/draft-madanapalli-nd-over-802.16-problems/">draft-madanapalli-nd-over-802.16-problems</a>'
),
(
"draft-madanapalli-nd-over-802.16-problems-02.txt",
'<a href="/doc/draft-madanapalli-nd-over-802.16-problems/02/">draft-madanapalli-nd-over-802.16-problems-02.txt</a>'
'<a href="/doc/draft-madanapalli-nd-over-802.16-problems/02/">draft-madanapalli-nd-over-802.16-problems-02.txt</a>'
),
(
'<a href="mailto:[email protected]">[email protected]</a>',
'<a href="mailto:[email protected]">[email protected]</a>',
),
(
"http://ieee802.org/1/files/public/docs2015/cn-thaler-Qcn-draft-PAR.pdf",
"http://ieee802.org/1/files/public/docs2015/cn-thaler-Qcn-draft-PAR.pdf"
)
]

# Some edge cases scraped from existing old draft names
for name in [
# "draft-odell-8+8", # This fails since + matches the right side of \b
Expand Down
4 changes: 3 additions & 1 deletion ietf/group/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ietf.review.utils import can_manage_review_requests_for_team
from ietf.utils import log
from ietf.utils.history import get_history_object_for, copy_many_to_many_for_history
from ietf.doc.templatetags.ietf_filters import is_valid_url
from functools import reduce

def save_group_in_history(group):
Expand Down Expand Up @@ -208,7 +209,8 @@ def construct_group_menu_context(request, group, selected, group_type, others):
entries.append(("Photos", urlreverse("ietf.group.views.group_photos", kwargs=kwargs)))
entries.append(("Email expansions", urlreverse("ietf.group.views.email", kwargs=kwargs)))
if group.list_archive.startswith("http:") or group.list_archive.startswith("https:") or group.list_archive.startswith("ftp:"):
entries.append((mark_safe("List archive &raquo;"), group.list_archive))
if is_valid_url(group.list_archive):
entries.append((mark_safe("List archive &raquo;"), group.list_archive))


# actions
Expand Down
4 changes: 2 additions & 2 deletions ietf/meeting/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

# need to insert empty option for use in ChoiceField
# countries.insert(0, ('', '-'*9 ))
countries.insert(0, ('', ''))
countries.insert(0, ('', '-' * 9))
timezones.insert(0, ('', '-' * 9))

# -------------------------------------------------
Expand Down Expand Up @@ -827,4 +827,4 @@ def sessiondetailsformset_factory(min_num=1, max_num=3):
min_num=min_num,
max_num=max_num,
extra=max_num, # only creates up to max_num total
)
)
4 changes: 3 additions & 1 deletion ietf/secr/proceedings/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@

class RecordingForm(forms.Form):
external_url = forms.URLField(label='Url')
session = forms.ModelChoiceField(queryset=Session.objects,empty_label='')
session = forms.ModelChoiceField(queryset=Session.objects)
session.widget.attrs['class'] = "select2-field"
session.widget.attrs['data-minimum-input-length'] = 0

def __init__(self, *args, **kwargs):
self.meeting = kwargs.pop('meeting')
Expand Down
6 changes: 0 additions & 6 deletions ietf/secr/static/js/proceedings-recording.js

This file was deleted.

3 changes: 1 addition & 2 deletions ietf/secr/templates/proceedings/recording.html
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
{% block extrahead %}{{ block.super }}
<script src="{% static 'ietf/js/jquery-ui.js' %}"></script>
<script src="{% static 'ietf/js/select2.js' %}"></script>
<script src="{% static 'secr/js/proceedings-recording.js' %}"></script>
{% endblock %}

{% block breadcrumbs %}{{ block.super }}
Expand Down Expand Up @@ -119,4 +118,4 @@ <h2>Unmatched Recording Files</h2>

{% block footer-extras %}
{% include "includes/upload_footer.html" %}
{% endblock %}
{% endblock %}
2 changes: 1 addition & 1 deletion ietf/static/js/agenda_timezone.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
.utc();
item.end_ts = moment.unix(this.getAttribute("data-end-time"))
.utc();
if (this.hasAttribute("weekday")) {
if (this.hasAttribute("data-weekday")) {
item.format = 2;
} else {
item.format = 1;
Expand Down
6 changes: 3 additions & 3 deletions ietf/templates/doc/ballot/send_ballot_comment.html
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@ <h1>
<input id="subject" class="form-control" type="text" placeholder="{{ subject }}" disabled>
</div>
<div class="mb-3">
<label class="form-label" for="body">Body</label>
<pre id="body" class="border p-3">{{ body|maybewordwrap }}</pre>
<p class="form-label">Body</p>
<pre class="border p-3">{{ body|maybewordwrap }}</pre>
</div>
<button type="submit" class="btn btn-danger">Send</button>
<a class="btn btn-secondary float-end"
href="{% url "ietf.doc.views_doc.document_main" name=doc.canonical_name %}">
Back
</a>
</form>
{% endblock %}
{% endblock %}
8 changes: 4 additions & 4 deletions ietf/templates/doc/document_ballot_content.html
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
</div>
</div>
<div class="card-body">
<pre class="ballot pasted">{{ p.discuss|linkify|urlize_ietf_docs }}</pre>
<pre class="ballot pasted">{{ p.discuss|urlize_ietf_docs|linkify }}</pre>
</div>
</div>
{% endif %}
Expand All @@ -148,7 +148,7 @@
</div>
</div>
<div class="card-body">
<pre class="ballot pasted">{{ p.comment|linkify|urlize_ietf_docs }}</pre>
<pre class="ballot pasted">{{ p.comment|urlize_ietf_docs|linkify }}</pre>
</div>
</div>
{% endif %}
Expand Down Expand Up @@ -199,11 +199,11 @@
</div>
{% if p.pos.blocking and p.discuss %}
<div class="card-body">
<pre class="ballot pasted">{{ p.discuss|linkify|urlize_ietf_docs }}</pre>
<pre class="ballot pasted">{{ p.discuss|urlize_ietf_docs|linkify }}</pre>
</div>
{% else %}
<div class="card-body">
<pre class="ballot pasted">{{ p.comment|linkify|urlize_ietf_docs }}</pre>
<pre class="ballot pasted">{{ p.comment|urlize_ietf_docs|linkify }}</pre>
</div>
{% endif %}
</div>
Expand Down
6 changes: 3 additions & 3 deletions ietf/templates/doc/document_bofreq.html
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,13 @@
{% if resources %}
{% for resource in resources|dictsort:"display_name" %}
{% if resource.name.type.slug == 'url' or resource.name.type.slug == 'email' %}
<a href="{{ resource.value }}" title="{{ resource.name.name }}">
<a href="{{ resource.value|urlencode }}" title="{{ resource.name.name }}">
{% firstof resource.display_name resource.name.name %}
</a>
<br>
{# Maybe make how a resource displays itself a method on the class so templates aren't doing this switching #}
{% else %}
<span title="{{ resource.name.name }}">{% firstof resource.display_name resource.name.name %}: {{ resource.value }}</span>
<span title="{{ resource.name.name }}">{% firstof resource.display_name resource.name.name %}: {{ resource.value|escape }}</span>
<br>
{% endif %}
{% endfor %}
Expand Down Expand Up @@ -164,4 +164,4 @@
</script>
<script src="{% static 'ietf/js/document_timeline.js' %}">
</script>
{% endblock %}
{% endblock %}
2 changes: 1 addition & 1 deletion ietf/templates/doc/document_charter.html
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@
{{ doc.canonical_name }}-{{ doc.rev }}
</div>
<div class="card-body">
<pre>{{ content|maybewordwrap|linkify|urlize_ietf_docs }}</pre>
<pre>{{ content|maybewordwrap|urlize_ietf_docs|linkify }}</pre>
</div>
</div>
{% endif %}
Expand Down
2 changes: 1 addition & 1 deletion ietf/templates/doc/document_conflict_review.html
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@
{{ doc.name }}-{{ doc.rev }}
</div>
<div class="card-body">
<pre>{{ content|maybewordwrap|linkify|urlize_ietf_docs }}</pre>
<pre>{{ content|maybewordwrap|urlize_ietf_docs|linkify }}</pre>
</div>
</div>
{% endif %}
Expand Down
Loading

0 comments on commit 5598762

Please sign in to comment.