Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add check for characters that cannot be ascii encoded #269

Merged
merged 1 commit into from
Nov 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions bin/emailpager
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

# stdlib imports
import argparse
from copy import copy
import configparser
import os.path
import sys
import smtplib
import unicodedata
from xml.dom import minidom
from datetime import datetime, timedelta
import logging
Expand All @@ -30,6 +32,37 @@ ALERT_DICT = {'green': 0,
'red': 3}


def _convert_ascii(text):
"""Temporary patch to encode text in ascii.

Args:
text (str): Text to convert to ascii encoding.
any characters cannot be converted to ascii,
then they are removed.

Returns:
str: text converted to ascii.
"""
text = str(text)
# Normalize characters and exclude leftovers with accents
text = ''.join(c for c in unicodedata.normalize(
'NFKD', text) if unicodedata.category(c) != 'Mn')
# Remove any leftovers that cannot be normalized
text = bytes(text, 'ascii', errors='ignore')
return text.decode("utf-8")


def _is_ascii(text):
"""
Args:
text (str): Text to check.

Returns:
bool: whether or not the text can be encoded in ascii.
"""
return all(ord(char) < 128 for char in text)


def get_version(session, pdata, release=False, renotify=False):
eventid = pdata.id
event = session.query(es.Event).filter(
Expand Down Expand Up @@ -311,6 +344,34 @@ def main(args):
version = send_emails(
version, long_addresses_nonupdate, all_props, long_msg, subject, DEBUG)

# Temporary fix for messages that cannot be ascii encoded
# TODO: Move this to earthquake-impact-utils when it is clear that it
# will not affect other realtime products
try:
# Copy original text
temp_message = copy(long_msg)
temp_subject = copy(subject)
temp_subject_update = copy(subject_update)

# Check if the characters in the message and subject line are ascii
if not _is_ascii(temp_subject):
temp_subject = _convert_ascii(temp_subject)

if not _is_ascii(temp_subject_update):
temp_subject_update = _convert_ascii(temp_subject_update)

if not _is_ascii(temp_message):
temp_message = _convert_ascii(temp_message)

# No errors in the check/convert so update
long_msg = temp_message
subject_update = temp_subject_update
subject = temp_subject
except:
# If this code checking/encoding the text does not work
# it should not change the original text
pass

# send emails to all pdf format addresses
logging.debug('Sending pdf addresses...')
onepager_file = os.path.join(args.directory, 'onepager.pdf')
Expand Down
88 changes: 88 additions & 0 deletions test/mail/ascii_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from copy import copy
import unicodedata


def _convert_ascii(text):
"""Temporary patch to encode text in ascii.

Args:
text (str): Text to convert to ascii encoding.
any characters cannot be converted to ascii,
then they are removed.

Returns:
str: text converted to ascii.
"""
text = str(text)
# Normalize characters and exclude leftovers with accents
text = ''.join(c for c in unicodedata.normalize(
'NFKD', text) if unicodedata.category(c) != 'Mn')
# Remove any leftovers that cannot be normalized
text = bytes(text, 'ascii', errors='ignore')
return text.decode("utf-8")


def _is_ascii(text):
"""
Args:
text (str): Text to check.

Returns:
bool: whether or not the text can be encoded in ascii.
"""
return all(ord(char) < 128 for char in text)


def convert_if_necessary(long_msg, subject, subject_update):
    """Convert the message and both subject lines to ascii where needed.

    Stand-in for the code implemented in emailpager. Each text that is
    not already ascii is passed through ``_convert_ascii``; texts that are
    already ascii are returned untouched. The conversion is all or
    nothing: if checking or converting any of the three texts raises, all
    three original values are returned unchanged.

    Args:
        long_msg (str): Body of the long-format message.
        subject (str): Subject line.
        subject_update (str): Subject line used for updates.

    Returns:
        tuple: ``(long_msg, subject, subject_update)`` after conversion,
            or the original values if the conversion failed.
    """
    originals = (long_msg, subject, subject_update)
    try:
        # Build every result before handing anything back, so a failure
        # part-way through cannot produce a mix of converted and
        # unconverted text.
        converted = tuple(
            text if _is_ascii(text) else _convert_ascii(text)
            for text in originals)
    except Exception:
        # Best effort only: if the check/convert code does not work it
        # should not change the original text. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt and SystemExit
        # propagate.
        return originals
    return converted


def test_ascii_check():
    """Check ascii conversion of plain and accented message/subject text.

    Plain ascii input must come back unchanged; place names containing
    non-ascii characters must come back as their ascii renderings, in the
    same positions in which they were passed to convert_if_necessary.
    """
    most_let_symb = ('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefg'
                     'hijklmnopqrstuvwxyz,./<>?;:[]}{}1234567890-=_+')
    iceland = 'Hafnarfjörður, Iceland'
    ice_ascii = 'Hafnarfjorur, Iceland'
    mexico = 'María Xadani, Mexico'
    mex_ascii = 'Maria Xadani, Mexico'

    # Each case pairs the three inputs with the three expected outputs.
    cases = [
        ((most_let_symb, most_let_symb, most_let_symb),
         (most_let_symb, most_let_symb, most_let_symb)),
        ((iceland, mexico, iceland),
         (ice_ascii, mex_ascii, ice_ascii)),
        ((mexico, iceland, mexico),
         (mex_ascii, ice_ascii, mex_ascii)),
    ]
    for inputs, expected in cases:
        assert tuple(convert_if_necessary(*inputs)) == expected


# Allow running this test module directly as a script, without a test runner.
if __name__ == '__main__':
    test_ascii_check()