Skip to content

Commit

Permalink
Fixes for non-ascii character handling.
Browse files Browse the repository at this point in the history
 - Legacy-Id: 410
  • Loading branch information
rpcross committed Oct 9, 2015
1 parent a4fda4d commit d1dd88d
Show file tree
Hide file tree
Showing 13 changed files with 231 additions and 135 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@

mailarch (1.2.6) ietf; urgency=normal

* Fixes ticket #1521. Resolve unicode error with export
when From line contains non-ascii characters.

* Improve handling of non-ascii search strings.

-- Ryan Cross <[email protected]> 09 Oct 2015 10:00:00 +0800

mailarch (1.2.5) ietf; urgency=normal

* Fixes ticket #1754. Search field doesn't handle
Expand Down
2 changes: 1 addition & 1 deletion mlarchive/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from .celeryapp import app

__version__ = "1.2.5"
__version__ = "1.2.6"

__date__ = "$Date$"

Expand Down
2 changes: 1 addition & 1 deletion mlarchive/archive/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def clean_email_list(self):
email_list = self.cleaned_data['email_list']
if email_list:
return [email_list.pk]

class AdvancedSearchForm(FacetedSearchForm):
start_date = forms.DateField(required=False,
widget=forms.TextInput(attrs={'class':'defaultText','title':'YYYY-MM-DD'}))
Expand Down
34 changes: 23 additions & 11 deletions mlarchive/archive/management/commands/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import tempfile
import uuid
from collections import deque
from email.header import decode_header
from email.utils import parsedate_tz, getaddresses, make_msgid

from django.conf import settings
Expand All @@ -23,11 +22,31 @@
from mlarchive.archive.models import Attachment, EmailList, Legacy, Message, Thread
from mlarchive.archive.management.commands._mimetypes import CONTENT_TYPES, UNKNOWN_CONTENT_TYPE
from mlarchive.utils.decorators import check_datetime
from mlarchive.utils.encoding import decode_safely
from mlarchive.utils.encoding import decode_safely, decode_rfc2047_header

from django.utils.log import getLogger
logger = getLogger('mlarchive.custom')

'''
Notes on character encoding.
In general we want to work with unicode strings. To do this it's important to do encoding
and decoding of Unicode at the furthest boundary of the interface.
The email standards (RFC 2822, formerly RFC 822) do not allow non-ascii data in email
headers. RFC 2047 defines extensions to allow non-ascii text data in headers through the
use of encoded-words. Nevertheless, we do find raw non-ascii data in email headers and
need to handle it consistently. See scan ##
When parsing an email message, the Python 2 email module returns a byte-string for header
values, for example:
In [23]: x.get('subject')
Out[23]: 'Voc\xea recebeu um Vivo Torpedo SMS'
'''


# --------------------------------------------------
# Globals
# --------------------------------------------------
Expand Down Expand Up @@ -115,13 +134,6 @@ def clean_spaces(s):
s = re.sub(r'\s+',' ',s)
return s

def decode_rfc2047_header(text):
try:
return ' '.join(decode_safely(s, charset) for s, charset in decode_header(text))
except email.header.HeaderParseError as error:
logger.error('Decode header failed [{0},{1}]'.format(error.args,text))
return ''

def flatten_message(msg):
"""Returns the message flattened to a string, for use in writing to a file. NOTE:
use this instead of message.as_string() to avoid mangling message.
Expand Down Expand Up @@ -731,7 +743,7 @@ def normalize(self, header_text):
normal = decode_safely(header_text)

# encode as UTF8 and compress whitespace
normal = normal.encode('utf8')
# normal = normal.encode('utf8') # this is unnecessary
normal = clean_spaces(normal)
return normal.rstrip()

Expand Down Expand Up @@ -822,7 +834,7 @@ def save(self, test=False):

# ensure message has been processed
x = self.archive_message

# write message to disk and then save, post_save signal calls indexer
# which requires file to be present
if not test:
Expand Down
23 changes: 11 additions & 12 deletions mlarchive/archive/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,25 +188,24 @@ def get_file_path(self):
def get_from_line(self):
"""Returns the "From " envelope header from the original mbox file if it
exists or constructs one. Useful when exporting in mbox format.
NOTE: returns unicode, call to_str() before writing to file.
"""
if self.from_line:
return 'From {0}'.format(self.from_line)
return u'From {}'.format(self.from_line)
elif self.frm_email:
return u'From {} {}'.format(self.frm_email,self.date.strftime('%a %b %d %H:%M:%S %Y'))
else:
try:
output = 'From {0} {1}'.format(self.frm_email,self.date.strftime('%a %b %d %H:%M:%S %Y'))
except UnicodeEncodeError:
output = 'From {0} {1}'.format(self.frm_email.encode("ascii","ignore"),self.date.strftime('%a %b %d %H:%M:%S %Y'))
return output
return u'From (none) {}'.format(self.date.strftime('%a %b %d %H:%M:%S %Y'))

def get_removed_dir(self):
return self.email_list.removed_dir

def list_by_date_url(self):
return reverse('archive_search') + '?email_list={}&index={}'.format(self.email_list.name,self.hashcode.rstrip('='))

def list_by_thread_url(self):
return reverse('archive_search') + '?email_list={}&gbt=1&index={}'.format(self.email_list.name,self.hashcode.rstrip('='))

def mark(self,bit):
"""Mark this message using the bit provided, using field spam_score
"""
Expand Down Expand Up @@ -285,7 +284,7 @@ def _get_lists_as_xml():
lines.append(" </shared_root>")
lines.append("</ms_config>")
return "\n".join(lines)

def _export_lists():
"""Produce XML dump of list memberships and call external program"""
# Dump XML
Expand All @@ -299,16 +298,16 @@ def _export_lists():
os.chmod(path,0666)
except Exception as error:
logger.error('Error creating export file: {}'.format(error))
return
return

# Call external script
if hasattr(settings,'NOTIFY_LIST_CHANGE_COMMAND'):
command = settings.NOTIFY_LIST_CHANGE_COMMAND
try:
subprocess.check_call([command,path])
except (OSError,subprocess.CalledProcessError) as error:
logger.error('Error calling external command: {} ({})'.format(command,error))

@receiver(pre_delete, sender=Message)
def _message_remove(sender, instance, **kwargs):
"""When messages are removed, via the admin page, we need to move the message
Expand Down
16 changes: 3 additions & 13 deletions mlarchive/archive/templatetags/archive_extras.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from django import template
from django.utils.safestring import mark_safe

import urllib
from django.utils.http import urlquote_plus

register = template.Library()

Expand Down Expand Up @@ -186,17 +185,8 @@ def get_query_string(p, new_params, remove, context):
except:
p[k]=v

#return mark_safe('?' + '&amp;'.join([u'%s=%s' % (k, v) for k, v in p.items()]).replace(' ', '%20'))
return mark_safe('?' + '&amp;'.join([u'%s=%s' % (urllib.quote_plus(convert_utf8(k)), urllib.quote_plus(convert_utf8(v))) for k, v in p.items()]))

def convert_utf8(v):
'''Returns a string given various inputs: unicode, string, int'''
if isinstance(v, unicode):
return v.encode('utf8')
if isinstance(v, str):
return v
if isinstance(v, int):
return str(v)
return mark_safe('?' + '&amp;'.join([u'%s=%s' % (urlquote_plus(k), urlquote_plus(v)) for k, v in p.items()]))


# Taken from lib/utils.py
def string_to_dict(string):
Expand Down
29 changes: 15 additions & 14 deletions mlarchive/archive/view_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from mlarchive.archive.forms import RulesForm
from mlarchive.archive.models import EmailList

from mlarchive.utils.encoding import to_str

contain_pattern = re.compile(r'(?P<neg>[-]?)(?P<field>[a-z]+):\((?P<value>[^\)]+)\)')
exact_pattern = re.compile(r'(?P<neg>[-]?)(?P<field>[a-z]+):\"(?P<value>[^\"]+)\"')
Expand Down Expand Up @@ -49,7 +49,7 @@ def find_message_date(sqs, msg):
hi = sqs.count() - 1
if hi == -1: # abort if queryset is empty
return -1

while lo <= hi:
mid = (lo+hi)/2
midval = sqs[mid]
Expand All @@ -59,12 +59,12 @@ def find_message_date(sqs, msg):
hi = mid
else:
break

if midval.object == msg:
return mid
if midval.date != msg.date:
return -1

# we get here if there are messages with the exact same date
# find the first message with this date
count = sqs.count()
Expand All @@ -89,22 +89,22 @@ def find_message_date_reverse(sqs, msg):
hi = sqs.count() - 1
if hi == -1: # abort if queryset is empty
return -1

while lo <= hi:
mid = (lo+hi)/2
midval = sqs[mid]
if midval.date > msg.date:
lo = mid+1
elif midval.date < msg.date:
elif midval.date < msg.date:
hi = mid
else:
break

if midval.object == msg:
return mid
if midval.date != msg.date:
return -1

# we get here if there are messages with the exact same date
# find the first message with this date
count = sqs.count()
Expand All @@ -117,14 +117,14 @@ def find_message_date_reverse(sqs, msg):
if sqs[mid].object == msg:
return mid
mid = mid + 1

return -1

def find_message_gbt(sqs,msg):
"""Returns the position of message (mag) in queryset (sqs)
for queries grouped by thread. Uses binary search to locate the thread,
then traverses the thread"""

lo = 0
hi = sqs.count() - 1
if hi == -1: # abort if queryset is empty
Expand All @@ -134,7 +134,7 @@ def find_message_gbt(sqs,msg):
return 0
else:
return -1

cdate = msg.thread.date
# first locate the thread
while lo < hi:
Expand All @@ -148,7 +148,7 @@ def find_message_gbt(sqs,msg):
break
if midval.object == msg:
return mid

# traverse thread
thread = midval.object.thread
if midval.object.date < msg.date:
Expand All @@ -164,7 +164,7 @@ def find_message_gbt(sqs,msg):
step = 1
mid = mid + 1
midval = sqs[mid]

# next step through thread
while midval.object.thread == thread:
mid = mid + step
Expand Down Expand Up @@ -303,7 +303,8 @@ def get_export(sqs, type, request):

with open(result.object.get_file_path()) as input:
# add envelope header
mbox_file.write(result.object.get_from_line() + '\n')
from_line = to_str(result.object.get_from_line()) + '\n'
mbox_file.write(from_line)
mbox_file.write(input.read())
mbox_file.write('\n')

Expand Down
Loading

0 comments on commit d1dd88d

Please sign in to comment.