python · AA-Turner · Apr 27, 2020 · Apr 28, 2020 · Apr 28, 2020 · Apr 29, 2020
diff --git a/AUTHORS.csv b/AUTHORS.csv
@@ -0,0 +1,11 @@
+Full Name, Surname First, Name Reference
+Ernest W. Durbin III, "Durbin, Ernest W., III", Durbin
+Inada Naoki, "Inada, Naoki", Inada
+Guido van Rossum, "van Rossum, Guido (GvR)", GvR
+Just van Rossum, "van Rossum, Just (JvR)", JvR
+The Python core team and community, The Python core team and community, python-dev
+P.J. Eby, "Eby, Phillip J.", Eby
+Greg Ewing,  "Ewing, Gregory", Ewing
+Jim Jewett, "Jewett, Jim J.", Jewett
+Nathaniel Smith, "Smith, Nathaniel J.", Smith
+Martin v. Löwis, "von Löwis, Martin", von Löwis
diff --git a/genpepindex.py b/genpepindex.py
@@ -19,6 +19,7 @@
 
 import sys
 import os
+import csv
 import codecs
 
 from operator import attrgetter
@@ -33,6 +34,15 @@ def main(argv):
     else:
         path = argv[1]
 
+    # AUTHORS.csv is an exception file for PEP0 name parsing
+    with open("AUTHORS.csv", "r", encoding="UTF8") as f:
+        read = csv.DictReader(f, quotechar='"', skipinitialspace=True)
+        author_exception_data = {}
+        for line in read:
+            full_name = line.pop("Full Name").strip()
+            details = {k.strip(): v.strip() for k, v in line.items()}
+            author_exception_data[full_name] = details
+
     peps = []
     if os.path.isdir(path):
         for file_path in os.listdir(path):
@@ -44,7 +54,7 @@ def main(argv):
             if file_path.startswith("pep-") and file_path.endswith((".txt", "rst")):
                 with codecs.open(abs_file_path, 'r', encoding='UTF-8') as pep_file:
                     try:
-                        pep = PEP(pep_file)
+                        pep = PEP(pep_file, author_exception_data)
                         if pep.number != int(file_path[4:-4]):
                             raise PEPError('PEP number does not match file name',
                                            file_path, pep.number)
@@ -57,12 +67,13 @@ def main(argv):
         peps.sort(key=attrgetter('number'))
     elif os.path.isfile(path):
         with open(path, 'r') as pep_file:
-            peps.append(PEP(pep_file))
+            peps.append(PEP(pep_file, author_exception_data))
     else:
         raise ValueError("argument must be a directory or file path")
 
     with codecs.open('pep-0000.rst', 'w', encoding='UTF-8') as pep0_file:
         write_pep0(peps, pep0_file)
 
+
 if __name__ == "__main__":
     main(sys.argv)
diff --git a/pep0/output.py b/pep0/output.py
@@ -5,6 +5,7 @@
 import sys
 import unicodedata
 
+from itertools import groupby
 from operator import attrgetter
 
 from . import constants
@@ -124,9 +125,9 @@ def verify_email_addresses(peps):
 
 
 def sort_authors(authors_dict):
-    authors_list = list(authors_dict.keys())
-    authors_list.sort(key=attrgetter('sort_by'))
-    return authors_list
+    authors_list = sorted(authors_dict.keys(), key=attrgetter("sort_by"))
+    unique_authors = [next(a) for k, a in groupby(authors_list, key=attrgetter("last_first"))]
+    return unique_authors
 
 def normalized_last_first(name):
     return len(unicodedata.normalize('NFC', name.last_first))

diff --git a/pep0/pep.py b/pep0/pep.py
@@ -60,37 +60,39 @@ class Author(object):
             The author's email address.
     """
 
-    def __init__(self, author_and_email_tuple):
+    def __init__(self, author_and_email_tuple, authors_exceptions):
         """Parse the name and email address of an author."""
+        self.first = self.last = ''
+
         name, email = author_and_email_tuple
         self.first_last = name.strip()
         self.email = email.lower()
-        last_name_fragment, suffix = self._last_name(name)
-        name_sep = name.index(last_name_fragment)
-        self.first = name[:name_sep].rstrip()
-        self.last = last_name_fragment
-        if self.last[1] == u'.':
-            # Add an escape to avoid docutils turning `v.` into `22.`.
-            self.last = u'\\' + self.last
-        self.suffix = suffix
-        if not self.first:
-            self.last_first = self.last
+
+        name_dict = authors_exceptions.get(self.first_last)
+        if name_dict:
+            self.last_first = name_dict["Surname First"]
+            self.nick = self.last = name_dict["Name Reference"]
         else:
-            self.last_first = u', '.join([self.last, self.first])
-            if self.suffix:
-                self.last_first += u', ' + self.suffix
-        if self.last == "van Rossum":
-            # Special case for our beloved BDFL. :)
-            if self.first == "Guido":
-                self.nick = "GvR"
-            elif self.first == "Just":
-                self.nick = "JvR"
-            else:
-                raise ValueError("unknown van Rossum %r!" % self)
-            self.last_first += " (%s)" % (self.nick,)
+            self.set_name_parts()
+
+    def set_name_parts(self):
+        name_dict = self._parse_name(self.first_last)
+        suffix = name_dict.get("suffix")
+        if "name" in name_dict:
+            self.last_first = name_dict["name"]
+            self.nick = name_dict["name"]
         else:
+            self.first = name_dict["forename"].rstrip()
+            self.last = name_dict["surname"]
+            if self.last[1] == ".":
+                # Add an escape to avoid docutils turning `v.` into `22.`.
+                self.last = "\\" + self.last
+            self.last_first = ", ".join([self.last, self.first])
             self.nick = self.last
 
+        if suffix:
+            self.last_first += f", {suffix}"
+
     def __hash__(self):
         return hash(self.first_last)
 
@@ -109,28 +111,60 @@ def sort_by(self):
             base = self.last.lower()
         return unicodedata.normalize('NFKD', base).encode('ASCII', 'ignore')
 
-    def _last_name(self, full_name):
-        """Find the last name (or nickname) of a full name.
+    @staticmethod
+    def _parse_name(full_name):
+        """Decompose a full name into parts.
 
-        If no last name (e.g, 'Aahz') then return the full name.  If there is
-        a leading, lowercase portion to the last name (e.g., 'van' or 'von')
-        then include it.  If there is a suffix (e.g., 'Jr.') that is appended
-        through a comma, then drop the suffix.
+        If a mononym (e.g, 'Aahz') then return the full name. If there are
+        suffixes in the name (e.g. ', Jr.' or 'III'), then find and extract
+        them. If there is a middle initial followed by a full stop, then
+        combine the following words into a surname (e.g. N. Vander Weele). If
+        there is a leading, lowercase portion to the last name (e.g. 'van' or
+        'von') then include it in the surname.
 
         """
-        name_partition = full_name.partition(u',')
-        no_suffix = name_partition[0].strip()
-        suffix = name_partition[2].strip()
-        name_parts = no_suffix.split()
-        part_count = len(name_parts)
-        if part_count == 1 or part_count == 2:
-            return name_parts[-1], suffix
-        else:
-            assert part_count > 2
+        possible_suffixes = ["Jr", "Jr.", "II", "III"]
+
+        suffix_partition = full_name.partition(",")
+        pre_suffix = suffix_partition[0].strip()
+        suffix = suffix_partition[2].strip()
+
+        name_parts = pre_suffix.split(" ")
+        num_parts = len(name_parts)
+        name = {"suffix": suffix}
+
+        if num_parts == 0:
+            raise ValueError("Name is empty!")
+        elif num_parts == 1:
+            name.update(name=name_parts[0])
+        elif num_parts == 2:
+            name.update(forename=name_parts[0], surname=name_parts[1])
+        elif num_parts > 2:
+            # handles III etc.
+            if name_parts[-1] in possible_suffixes:
+                new_suffix = " ".join([*name_parts[-1:], suffix]).strip()
+                name_parts.pop(-1)
+                name.update(suffix=new_suffix)
+
+            # handles von, van, v. etc.
             if name_parts[-2].islower():
-                return u' '.join(name_parts[-2:]), suffix
+                forename = " ".join(name_parts[:-2])
+                surname = " ".join(name_parts[-2:])
+                name.update(forename=forename, surname=surname)
+
+            # handles double surnames after a middle initial (e.g. N. Vander Weele)
+            elif any(s.endswith(".") for s in name_parts):
+                split_position = [i for i, x in enumerate(name_parts) if x.endswith(".")][-1] + 1
+                forename = " ".join(name_parts[:split_position])
+                surname = " ".join(name_parts[split_position:])
+                name.update(forename=forename, surname=surname)
+
             else:
-                return name_parts[-1], suffix
+                forename = " ".join(name_parts[:-1])
+                surname = " ".join(name_parts[-1:])
+                name.update(forename=forename, surname=surname)
+
+        return name
 
 
 class PEP(object):
@@ -176,7 +210,7 @@ class PEP(object):
                      u"Rejected", u"Withdrawn", u"Deferred",
                      u"Final", u"Active", u"Draft", u"Superseded")
 
-    def __init__(self, pep_file):
+    def __init__(self, pep_file, author_exceptions: dict):
         """Init object from an open PEP file object."""
         # Parse the headers.
         self.filename = pep_file
@@ -244,7 +278,7 @@ def __init__(self, pep_file):
         if len(authors_and_emails) < 1:
             raise PEPError("no authors found", pep_file.name,
                            self.number)
-        self.authors = list(map(Author, authors_and_emails))
+        self.authors = [Author(author_email, author_exceptions) for author_email in authors_and_emails]
 
     def _parse_author(self, data):
         """Return a list of author names and emails."""