Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix PEP 0 name parsing #1386

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions AUTHORS.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Full Name, Surname First, Name Reference
Ernest W. Durbin III, "Durbin, Ernest W., III", Durbin
Inada Naoki, "Inada, Naoki", Inada
Guido van Rossum, "van Rossum, Guido (GvR)", GvR
Just van Rossum, "van Rossum, Just (JvR)", JvR
The Python core team and community, The Python core team and community, python-dev
P.J. Eby, "Eby, Phillip J.", Eby
Greg Ewing, "Ewing, Gregory", Ewing
Jim Jewett, "Jewett, Jim J.", Jewett
Nathaniel Smith, "Smith, Nathaniel J.", Smith
Martin v. Löwis, "von Löwis, Martin", von Löwis
15 changes: 13 additions & 2 deletions genpepindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import sys
import os
import csv
import codecs

from operator import attrgetter
Expand All @@ -33,6 +34,15 @@ def main(argv):
else:
path = argv[1]

# AUTHORS.csv is an exception file for PEP0 name parsing
with open("AUTHORS.csv", "r", encoding="UTF8") as f:
read = csv.DictReader(f, quotechar='"', skipinitialspace=True)
author_exception_data = {}
for line in read:
full_name = line.pop("Full Name").strip()
details = {k.strip(): v.strip() for k, v in line.items()}
author_exception_data[full_name] = details

peps = []
if os.path.isdir(path):
for file_path in os.listdir(path):
Expand All @@ -44,7 +54,7 @@ def main(argv):
if file_path.startswith("pep-") and file_path.endswith((".txt", "rst")):
with codecs.open(abs_file_path, 'r', encoding='UTF-8') as pep_file:
try:
pep = PEP(pep_file)
pep = PEP(pep_file, author_exception_data)
if pep.number != int(file_path[4:-4]):
raise PEPError('PEP number does not match file name',
file_path, pep.number)
Expand All @@ -57,12 +67,13 @@ def main(argv):
peps.sort(key=attrgetter('number'))
elif os.path.isfile(path):
with open(path, 'r') as pep_file:
peps.append(PEP(pep_file))
peps.append(PEP(pep_file, author_exception_data))
else:
raise ValueError("argument must be a directory or file path")

with codecs.open('pep-0000.rst', 'w', encoding='UTF-8') as pep0_file:
write_pep0(peps, pep0_file)


if __name__ == "__main__":
main(sys.argv)
7 changes: 4 additions & 3 deletions pep0/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys
import unicodedata

from itertools import groupby
from operator import attrgetter

from . import constants
Expand Down Expand Up @@ -124,9 +125,9 @@ def verify_email_addresses(peps):


def sort_authors(authors_dict):
authors_list = list(authors_dict.keys())
authors_list.sort(key=attrgetter('sort_by'))
return authors_list
authors_list = sorted(authors_dict.keys(), key=attrgetter("sort_by"))
unique_authors = [next(a) for k, a in groupby(authors_list, key=attrgetter("last_first"))]
return unique_authors

def normalized_last_first(name):
return len(unicodedata.normalize('NFC', name.last_first))
Expand Down
118 changes: 76 additions & 42 deletions pep0/pep.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,37 +60,39 @@ class Author(object):
The author's email address.
"""

def __init__(self, author_and_email_tuple):
def __init__(self, author_and_email_tuple, authors_exceptions):
"""Parse the name and email address of an author."""
self.first = self.last = ''

name, email = author_and_email_tuple
self.first_last = name.strip()
self.email = email.lower()
last_name_fragment, suffix = self._last_name(name)
name_sep = name.index(last_name_fragment)
self.first = name[:name_sep].rstrip()
self.last = last_name_fragment
if self.last[1] == u'.':
# Add an escape to avoid docutils turning `v.` into `22.`.
self.last = u'\\' + self.last
self.suffix = suffix
if not self.first:
self.last_first = self.last

name_dict = authors_exceptions.get(self.first_last)
if name_dict:
self.last_first = name_dict["Surname First"]
self.nick = self.last = name_dict["Name Reference"]
else:
self.last_first = u', '.join([self.last, self.first])
if self.suffix:
self.last_first += u', ' + self.suffix
if self.last == "van Rossum":
# Special case for our beloved BDFL. :)
if self.first == "Guido":
self.nick = "GvR"
elif self.first == "Just":
self.nick = "JvR"
else:
raise ValueError("unknown van Rossum %r!" % self)
self.last_first += " (%s)" % (self.nick,)
self.set_name_parts()

def set_name_parts(self):
name_dict = self._parse_name(self.first_last)
suffix = name_dict.get("suffix")
if "name" in name_dict:
self.last_first = name_dict["name"]
self.nick = name_dict["name"]
else:
self.first = name_dict["forename"].rstrip()
self.last = name_dict["surname"]
if self.last[1] == ".":
# Add an escape to avoid docutils turning `v.` into `22.`.
self.last = "\\" + self.last
self.last_first = ", ".join([self.last, self.first])
self.nick = self.last

if suffix:
self.last_first += f", {suffix}"

def __hash__(self):
return hash(self.first_last)

Expand All @@ -109,28 +111,60 @@ def sort_by(self):
base = self.last.lower()
return unicodedata.normalize('NFKD', base).encode('ASCII', 'ignore')

def _last_name(self, full_name):
"""Find the last name (or nickname) of a full name.
@staticmethod
def _parse_name(full_name):
"""Decompose a full name into parts.

If no last name (e.g, 'Aahz') then return the full name. If there is
a leading, lowercase portion to the last name (e.g., 'van' or 'von')
then include it. If there is a suffix (e.g., 'Jr.') that is appended
through a comma, then drop the suffix.
If a mononym (e.g, 'Aahz') then return the full name. If there are
suffixes in the name (e.g. ', Jr.' or 'III'), then find and extract
them. If there is a middle initial followed by a full stop, then
combine the following words into a surname (e.g. N. Vander Weele). If
there is a leading, lowercase portion to the last name (e.g. 'van' or
'von') then include it in the surname.

"""
name_partition = full_name.partition(u',')
no_suffix = name_partition[0].strip()
suffix = name_partition[2].strip()
name_parts = no_suffix.split()
part_count = len(name_parts)
if part_count == 1 or part_count == 2:
return name_parts[-1], suffix
else:
assert part_count > 2
possible_suffixes = ["Jr", "Jr.", "II", "III"]

suffix_partition = full_name.partition(",")
pre_suffix = suffix_partition[0].strip()
suffix = suffix_partition[2].strip()

name_parts = pre_suffix.split(" ")
num_parts = len(name_parts)
name = {"suffix": suffix}

if num_parts == 0:
raise ValueError("Name is empty!")
elif num_parts == 1:
name.update(name=name_parts[0])
elif num_parts == 2:
name.update(forename=name_parts[0], surname=name_parts[1])
elif num_parts > 2:
# handles III etc.
if name_parts[-1] in possible_suffixes:
new_suffix = " ".join([*name_parts[-1:], suffix]).strip()
name_parts.pop(-1)
name.update(suffix=new_suffix)

# handles von, van, v. etc.
if name_parts[-2].islower():
return u' '.join(name_parts[-2:]), suffix
forename = " ".join(name_parts[:-2])
surname = " ".join(name_parts[-2:])
name.update(forename=forename, surname=surname)

# handles double surnames after a middle initial (e.g. N. Vander Weele)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is tough, a name like R. David Murray should give Murray as a surname for the PEP index.

That said, a name written in Japanese English convention like INADA Naoki should return INADA as surname.
In short, it’s not generally possible to parse world names into US categories of «first», «middle», «last»

I guess we have to accept imperfect for now, add special cases when we notice problems (what PEP will add the first Name MacName Sr 🙂), and maybe someday rework the system to have the proper way: metadata (or some author index dict) should include full name and short name for PEP 0.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Luckily (?!) in PEP 458 R. David Murray is listed without the full stop in his first initial, so the name is still parsed correctly. I don't think there's a good solution for INADA Naoki beyond adding a special-case exception

In short, it’s not generally possible to parse world names into US categories of «first», «middle», «last»

Or UK, Australia, Canada etc 😜. But I get your point. I'm reminded of this post about names - hopefully #​40 doesn't apply to us...

I think that your last suggestion having some sort of lookup table is probably the best solution, as in all the PEPs there are still only a relativley small number of authors (248) - it's quite late here so will add that feature tommorow. It also keeps special cases etc. out of the code to keep it from becoming knobbly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be OK if this PEP fixed the most egregious cases (III) without covering 100% of possibilities 🙂

Copy link
Member Author

@AA-Turner AA-Turner Apr 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gives me something to do! Latest commit adds such a metadata lookup and therefore simplifies the name parsing code.

This should make it so that names can be correctly entered into AUTHORS.csv and PEP 0 will reflect this. I've also identified some duplicate entries (e.g. P.J. Eby & Phillip J. Eby, Greg and Gregory Ewing, Jim J. Jewett & Jim Jewett, Martin v. Löwis & Martin von Löwis). Is it acceptable to modify PEP headers to canonicalise these names?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I would add multiple entries to the data file rather than editing historical documents.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The latest data file (exceptions rather than full mapping) doesn’t de-duplicate these entries, should it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The data file is checked first (in init), so unsure where duplicates would propogate from?

Always good to be preventative but not sure I understand this one, sorry!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe my comment doesn’t make sense!
I don’t have a clear picture of the current behaviour of the code, so I wondered if the change from full data file to exceptions data file did preserve the feature you added of normalizing the duplicate names.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aha! I forgot to add that back in, you're right - have done so now (only adding the less used variant and mapping it to the 'cannonical' variant, to keep the file smaller)

elif any(s.endswith(".") for s in name_parts):
split_position = [i for i, x in enumerate(name_parts) if x.endswith(".")][-1] + 1
forename = " ".join(name_parts[:split_position])
surname = " ".join(name_parts[split_position:])
name.update(forename=forename, surname=surname)

else:
return name_parts[-1], suffix
forename = " ".join(name_parts[:-1])
surname = " ".join(name_parts[-1:])
name.update(forename=forename, surname=surname)

return name


class PEP(object):
Expand Down Expand Up @@ -176,7 +210,7 @@ class PEP(object):
u"Rejected", u"Withdrawn", u"Deferred",
u"Final", u"Active", u"Draft", u"Superseded")

def __init__(self, pep_file):
def __init__(self, pep_file, author_exceptions: dict):
"""Init object from an open PEP file object."""
# Parse the headers.
self.filename = pep_file
Expand Down Expand Up @@ -244,7 +278,7 @@ def __init__(self, pep_file):
if len(authors_and_emails) < 1:
raise PEPError("no authors found", pep_file.name,
self.number)
self.authors = list(map(Author, authors_and_emails))
self.authors = [Author(author_email, author_exceptions) for author_email in authors_and_emails]

def _parse_author(self, data):
"""Return a list of author names and emails."""
Expand Down