Skip to content

Commit

Permalink
Add check for characters that cannot be ascii encoded
Browse files Browse the repository at this point in the history
This will check if there are characters that cannot be
ASCII-encoded. The check is wrapped in a try/except to
prevent interfering with regular events or causing any
other encoding problems. This should be moved to
earthquake-impact-utils. (Issue created for that repo.)
  • Loading branch information
hschovanec-usgs committed Oct 20, 2020
1 parent 599e5f8 commit d66d785
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 0 deletions.
61 changes: 61 additions & 0 deletions bin/emailpager
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

# stdlib imports
import argparse
from copy import copy
import configparser
import os.path
import sys
import smtplib
import unicodedata
from xml.dom import minidom
from datetime import datetime, timedelta
import logging
Expand All @@ -30,6 +32,37 @@ ALERT_DICT = {'green': 0,
'red': 3}


def _convert_ascii(text):
"""Temporary patch to encode text in ascii.
Args:
text (str): Text to convert to ascii encoding.
any characters cannot be converted to ascii,
then they are removed.
Returns:
str: text converted to ascii.
"""
text = str(text)
# Normalize characters and exclude leftovers with accents
text = ''.join(c for c in unicodedata.normalize(
'NFKD', text) if unicodedata.category(c) != 'Mn')
# Remove any leftovers that cannot be normalized
text = bytes(text, 'ascii', errors='ignore')
return text.decode("utf-8")


def _is_ascii(text):
"""
Args:
text (str): Text to check.
Returns:
bool: whether or not the text can be encoded in ascii.
"""
return all(ord(char) < 128 for char in text)


def get_version(session, pdata, release=False, renotify=False):
eventid = pdata.id
event = session.query(es.Event).filter(
Expand Down Expand Up @@ -311,6 +344,34 @@ def main(args):
version = send_emails(
version, long_addresses_nonupdate, all_props, long_msg, subject, DEBUG)

# Temporary fix for messages that cannot be ascii encoded
# TODO: Move this to earthquake-impact-utils when it is clear that it
# will not affect other realtime products
try:
# Copy original text
temp_message = copy(long_msg)
temp_subject = copy(subject)
temp_subject_update = copy(subject_update)

# Check if the characters in the message and subject line are ascii
if not _is_ascii(temp_subject):
temp_subject = _convert_ascii(temp_subject)

if not _is_ascii(temp_subject_update):
temp_subject_update = _convert_ascii(temp_subject_update)

if not _is_ascii(temp_message):
temp_message = _convert_ascii(temp_message)

# No errors in the check/convert so update
long_msg = temp_message
subject_update = temp_subject_update
subject = temp_subject
except:
# If this code checking/encoding the text does not work
# it should not change the original text
pass

# send emails to all pdf format addresses
logging.debug('Sending pdf addresses...')
onepager_file = os.path.join(args.directory, 'onepager.pdf')
Expand Down
88 changes: 88 additions & 0 deletions test/mail/ascii_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from copy import copy
import unicodedata


def _convert_ascii(text):
"""Temporary patch to encode text in ascii.
Args:
text (str): Text to convert to ascii encoding.
any characters cannot be converted to ascii,
then they are removed.
Returns:
str: text converted to ascii.
"""
text = str(text)
# Normalize characters and exclude leftovers with accents
text = ''.join(c for c in unicodedata.normalize(
'NFKD', text) if unicodedata.category(c) != 'Mn')
# Remove any leftovers that cannot be normalized
text = bytes(text, 'ascii', errors='ignore')
return text.decode("utf-8")


def _is_ascii(text):
"""
Args:
text (str): Text to check.
Returns:
bool: whether or not the text can be encoded in ascii.
"""
return all(ord(char) < 128 for char in text)


def convert_if_necessary(long_msg, subject, subject_update):
    """Convert the message and both subject lines to ASCII where needed.

    Mirrors the check implemented in emailpager. Each text is left as-is
    when ``_is_ascii`` accepts it and is otherwise passed through
    ``_convert_ascii``. The operation is all-or-nothing: if checking or
    converting any one of the texts fails, all three originals are
    returned unchanged.

    Args:
        long_msg (str): Body of the long-format message.
        subject (str): Subject line.
        subject_update (str): Subject line used for updates.
    Returns:
        tuple: ``(long_msg, subject, subject_update)``, converted to ASCII
            where necessary.
    """
    try:
        # Same evaluation order as the emailpager code: subject, update
        # subject, then the message body.
        new_subject, new_subject_update, new_message = [
            text if _is_ascii(text) else _convert_ascii(text)
            for text in (subject, subject_update, long_msg)
        ]
    except Exception:
        # Best effort only: a failure in the check/convert code must not
        # change the original text. Catching Exception rather than using a
        # bare ``except`` lets KeyboardInterrupt and SystemExit propagate.
        return long_msg, subject, subject_update
    # No errors in the check/convert, so hand back the updated texts.
    return new_message, new_subject, new_subject_update


def test_ascii_check():
    """Exercise convert_if_necessary with ASCII and non-ASCII inputs.

    Pure-ASCII text must come back untouched. Accented letters must be
    reduced to their base letter, and a character with no ASCII
    counterpart (the eth in the Icelandic name) must be dropped.
    """
    most_let_symb = ('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefg'
                     'hijklmnopqrstuvwxyz,./<>?;:[]}{}1234567890-=_+')
    # ASCII-only input passes through unchanged in every position.
    unchanged = convert_if_necessary(
        most_let_symb, most_let_symb, most_let_symb)
    assert tuple(unchanged) == (most_let_symb, most_let_symb, most_let_symb)

    iceland = 'Hafnarfjörður, Iceland'
    ice_ascii = 'Hafnarfjorur, Iceland'
    mexico = 'María Xadani, Mexico'
    mex_ascii = 'Maria Xadani, Mexico'

    # Each argument position is converted independently of the others.
    first_mix = convert_if_necessary(iceland, mexico, iceland)
    assert tuple(first_mix) == (ice_ascii, mex_ascii, ice_ascii)

    # Swap the inputs so every position sees both sample strings.
    second_mix = convert_if_necessary(mexico, iceland, mexico)
    assert tuple(second_mix) == (mex_ascii, ice_ascii, mex_ascii)


# Allow the check to be run by executing this file directly as a script.
if __name__ == '__main__':
    test_ascii_check()

0 comments on commit d66d785

Please sign in to comment.