Skip to content

Commit f9e1f89

Browse files
author
Andrew Or
committed
[Release] Correctly translate contributors name in release notes
This commit involves three main changes: (1) It separates the translation of contributor names from the generation of the contributors list. This is largely motivated by the Github API limit; even if we exceed this limit, we should at least be able to proceed manually as before. This is why the translation logic is abstracted into its own script translate-contributors.py. (2) When we look for candidate replacements for invalid author names, we should look for the assignees of the associated JIRAs too. As a result, the intermediate file must keep track of these. (3) This provides an interactive mode with which the user can sit at the terminal and manually pick the candidate replacement that he/she thinks makes the most sense. As before, there is a non-interactive mode that picks the first candidate that the script considers "valid." TODO: We should have a known_contributors file that stores known mappings so we don't have to go through all of this translation every time. This is also valuable because some contributors simply cannot be automatically translated. Conflicts: .gitignore
1 parent 9880bb4 commit f9e1f89

File tree

4 files changed

+230
-56
lines changed

4 files changed

+230
-56
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*.ipr
66
*.iml
77
*.iws
8+
*.pyc
89
.idea/
910
.idea_modules/
1011
sbt/*.jar
@@ -49,7 +50,9 @@ dependency-reduced-pom.xml
4950
checkpoint
5051
derby.log
5152
dist/
52-
spark-*-bin.tar.gz
53+
dev/create-release/*txt
54+
dev/create-release/*new
55+
spark-*-bin-*.tgz
5356
unit-tests.log
5457
/lib/
5558
rat-results.txt

dev/create-release/generate-contributors.py

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@
2626

2727
# You must set the following before use!
2828
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
29-
JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
30-
JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
3129
START_COMMIT = os.environ.get("START_COMMIT", "37b100")
3230
END_COMMIT = os.environ.get("END_COMMIT", "3693ae")
3331

@@ -40,8 +38,6 @@
4038
END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ")
4139

4240
# Verify provided arguments
43-
if not JIRA_USERNAME: sys.exit("JIRA_USERNAME must be provided")
44-
if not JIRA_PASSWORD: sys.exit("JIRA_PASSWORD must be provided")
4541
start_commit_line = get_one_line(START_COMMIT)
4642
end_commit_line = get_one_line(END_COMMIT)
4743
num_commits = num_commits_in_range(START_COMMIT, END_COMMIT)
@@ -60,14 +56,6 @@
6056
sys.exit("Ok, exiting")
6157
print "==================================================================================\n"
6258

63-
# Setup JIRA and github clients. We use two JIRA clients, one with authentication
64-
# and one without, because authentication is slow and required only when we query
65-
# JIRA user details but not Spark issues
66-
jira_options = { "server": JIRA_API_BASE }
67-
jira_client = JIRA(options = jira_options)
68-
jira_client_auth = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
69-
github_client = Github()
70-
7159
# Find all commits within this range
7260
print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT)
7361
commits = get_one_line_commits(START_COMMIT, END_COMMIT)
@@ -105,13 +93,17 @@ def print_indented(_list):
10593
if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts)
10694
if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
10795
print "==================== Warning: the above commits will be ignored ==================\n"
108-
response = raw_input("%d commits left to process. Ok to proceed? [y/N] " % len(filtered_commits))
109-
if response.lower() != "y":
96+
response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits))
97+
if response.lower() != "y" and response:
11098
sys.exit("Ok, exiting.")
11199

112100
# Keep track of warnings to tell the user at the end
113101
warnings = []
114102

103+
# Mapping from the invalid author name to its associated JIRA issues
104+
# E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471")
105+
invalid_authors = {}
106+
115107
# Populate a map that groups issues and components by author
116108
# It takes the form: Author name -> { Contribution type -> Spark components }
117109
# For instance,
@@ -127,16 +119,23 @@ def print_indented(_list):
127119
# }
128120
#
129121
author_info = {}
122+
jira_options = { "server": JIRA_API_BASE }
123+
jira_client = JIRA(options = jira_options)
130124
print "\n=========================== Compiling contributor list ==========================="
131125
for commit in filtered_commits:
132126
commit_hash = re.findall("^[a-z0-9]+", commit)[0]
133127
issues = re.findall("SPARK-[0-9]+", commit.upper())
134-
# Translate the author in case the github username is not an actual name
135-
# Also guard against any special characters used in the name
136-
# Note the JIRA client we use here must have authentication enabled
137128
author = get_author(commit_hash)
138-
author = unidecode.unidecode(unicode(author, "UTF-8"))
139-
author = translate_author(author, github_client, jira_client_auth, warnings)
129+
author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters
130+
# If the author name is invalid, keep track of it along
131+
# with all associated issues so we can translate it later
132+
if is_valid_author(author):
133+
author = capitalize_author(author)
134+
else:
135+
if author not in invalid_authors:
136+
invalid_authors[author] = set()
137+
for issue in issues:
138+
invalid_authors[author].add(issue)
140139
date = get_date(commit_hash)
141140
# Parse components from the commit message, if any
142141
commit_components = find_components(commit, commit_hash)
@@ -147,7 +146,7 @@ def populate(issue_type, components):
147146
author_info[author] = {}
148147
if issue_type not in author_info[author]:
149148
author_info[author][issue_type] = set()
150-
for component in all_components:
149+
for component in components:
151150
author_info[author][issue_type].add(component)
152151
# Find issues and components associated with this commit
153152
for issue in issues:
@@ -168,7 +167,6 @@ def populate(issue_type, components):
168167
# Each line takes the format "Author name - semi-colon delimited contributions"
169168
# e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core
170169
# e.g. Tathagata Das - Bug fixes and new features in Streaming
171-
contributors_file_name = "contributors.txt"
172170
contributors_file = open(contributors_file_name, "w")
173171
authors = author_info.keys()
174172
authors.sort()
@@ -192,11 +190,23 @@ def populate(issue_type, components):
192190
# Do not use python's capitalize() on the whole string to preserve case
193191
assert contribution
194192
contribution = contribution[0].capitalize() + contribution[1:]
193+
# If the author name is invalid, use an intermediate format that
194+
# can be translated through translate-contributors.py later
195+
# E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
196+
if author in invalid_authors and invalid_authors[author]:
197+
author = author + "/" + "/".join(invalid_authors[author])
195198
line = "%s - %s" % (author, contribution)
196199
contributors_file.write(line + "\n")
197200
contributors_file.close()
198201
print "Contributors list is successfully written to %s!" % contributors_file_name
199202

203+
# Prompt the user to translate author names if necessary
204+
if invalid_authors:
205+
warnings.append("Found the following invalid authors:")
206+
for a in invalid_authors:
207+
warnings.append("\t%s" % a)
208+
warnings.append("Please run './translate-contributors.py' to translate them.")
209+
200210
# Log any warnings encountered in the process
201211
if warnings:
202212
print "\n============ Warnings encountered while creating the contributor list ============"

dev/create-release/releaseutils.py

Lines changed: 5 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@
4444
print "Install using 'sudo pip install unidecode'"
4545
sys.exit(-1)
4646

47+
# Contributors list file name
48+
contributors_file_name = "contributors.txt"
49+
4750
# Utility functions run git commands (written with Git 1.8.5)
4851
def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0]
4952
def get_author(commit_hash):
@@ -69,7 +72,8 @@ def num_commits_in_range(start_hash, end_hash):
6972
"build": "build fixes",
7073
"improvement": "improvements",
7174
"new feature": "new features",
72-
"documentation": "documentation"
75+
"documentation": "documentation",
76+
"test": "test"
7377
}
7478

7579
# Maintain a mapping for translating component names when creating the release notes
@@ -182,36 +186,3 @@ def capitalize_author(author):
182186
words = [w[0].capitalize() + w[1:] for w in words if w]
183187
return " ".join(words)
184188

185-
# Maintain a mapping of translated author names as a cache
186-
translated_authors = {}
187-
188-
# Format the given author in a format appropriate for the contributors list.
189-
# If the author is not an actual name, search github and JIRA for potential
190-
# replacements and log all candidates as a warning.
191-
def translate_author(github_author, github_client, jira_client, warnings):
192-
if is_valid_author(github_author):
193-
return capitalize_author(github_author)
194-
# If the translated author is already cached, just return it
195-
if github_author in translated_authors:
196-
return translated_authors[github_author]
197-
# Otherwise, author name is not found, so we need to search for an alternative name
198-
candidates = set()
199-
github_name = get_github_name(github_author, github_client)
200-
jira_name = get_jira_name(github_author, jira_client)
201-
if is_valid_author(github_name): github_name = capitalize_author(github_name)
202-
if is_valid_author(jira_name): jira_name = capitalize_author(jira_name)
203-
if github_name: candidates.add(github_name)
204-
if jira_name: candidates.add(jira_name)
205-
# Only use the github name as a replacement automatically
206-
# The JIRA name may not make sense because it can belong to someone else
207-
if is_valid_author(github_name):
208-
candidates_message = " (another candidate is %s)" % jira_name if jira_name else ""
209-
warnings.append("Replacing github user %s with %s%s" % (github_author, github_name, candidates_message))
210-
translated_authors[github_name] = github_name
211-
return translated_authors[github_name]
212-
# No direct replacement, so return the original author and list any candidates found
213-
candidates_message = " (candidates: %s)" % nice_join(candidates) if candidates else ""
214-
warnings.append("Unable to find a replacement for github user %s%s" % (github_author, candidates_message))
215-
translated_authors[github_author] = github_author
216-
return translated_authors[github_author]
217-
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
#!/usr/bin/env python
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one or more
4+
# contributor license agreements. See the NOTICE file distributed with
5+
# this work for additional information regarding copyright ownership.
6+
# The ASF licenses this file to You under the Apache License, Version 2.0
7+
# (the "License"); you may not use this file except in compliance with
8+
# the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
# This script translates invalid authors in the contributors list generated
19+
# by generate-contributors.py. When the script encounters an author name that
20+
# is considered invalid, it queries Github and JIRA in an attempt to look
21+
# for replacements. This tool runs in two modes:
22+
#
23+
# (1) Interactive mode: For each invalid author name, this script presents
24+
# all candidate replacements to the user and awaits user response. In this
25+
# mode, the user may also input a custom name. This is the default.
26+
#
27+
# (2) Non-interactive mode: For each invalid author name, this script replaces
28+
# the name with the first valid candidate it can find. If there is none, it
29+
# uses the original name. This can be enabled through the --non-interactive flag.
30+
31+
import os
32+
import sys
33+
34+
from releaseutils import *
35+
36+
# You must set the following before use!
37+
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
38+
JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
39+
JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
40+
if not JIRA_USERNAME or not JIRA_PASSWORD:
41+
sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set")
42+
43+
# Write new contributors list to <old_file_name>.new
44+
if not os.path.isfile(contributors_file_name):
45+
print "Contributors file %s does not exist!" % contributors_file_name
46+
print "Have you run ./generate-contributors.py yet?"
47+
sys.exit(1)
48+
contributors_file = open(contributors_file_name, "r")
49+
new_contributors_file_name = contributors_file_name + ".new"
50+
new_contributors_file = open(new_contributors_file_name, "w")
51+
warnings = []
52+
53+
# In non-interactive mode, this script will choose the first replacement that is valid
54+
INTERACTIVE_MODE = True
55+
if len(sys.argv) > 1:
56+
options = set(sys.argv[1:])
57+
if "--non-interactive" in options:
58+
INTERACTIVE_MODE = False
59+
if INTERACTIVE_MODE:
60+
print "Running in interactive mode. To disable this, provide the --non-interactive flag."
61+
62+
# Setup Github and JIRA clients
63+
jira_options = { "server": JIRA_API_BASE }
64+
jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
65+
github_client = Github()
66+
67+
# Generate candidates for the given author. This should only be called if the given author
68+
# name does not represent a full name as this operation is somewhat expensive. Under the
69+
# hood, it makes several calls to the Github and JIRA API servers to find the candidates.
70+
#
71+
# This returns a list of (candidate name, source) 2-tuples. E.g.
72+
# [
73+
# (NOT_FOUND, "No full name found for Github user andrewor14"),
74+
# ("Andrew Or", "Full name of JIRA user andrewor14"),
75+
# ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"),
76+
# ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"),
77+
# (NOT_FOUND, "No assignee found for SPARK-1763")
78+
# ]
79+
# Sentinel used in place of a candidate name when a lookup yields nothing
NOT_FOUND = "Not found"


def generate_candidates(author, issues):
    """Collect possible full-name replacements for an invalid author name.

    Candidates are gathered, in order, from the Github user named `author`,
    the JIRA user named `author`, and the assignee of each issue in `issues`.

    :param author: the user name to look up (not a full name)
    :param issues: iterable of JIRA issue keys associated with the author
    :return: a list of (candidate name, source description) 2-tuples; the
             candidate name is NOT_FOUND when the lookup produced no name
    """
    candidates = []
    # First check for full name of Github user
    github_name = get_github_name(author, github_client)
    if github_name:
        candidates.append((github_name, "Full name of Github user %s" % author))
    else:
        candidates.append((NOT_FOUND, "No full name found for Github user %s" % author))
    # Then do the same for JIRA user
    jira_name = get_jira_name(author, jira_client)
    if jira_name:
        candidates.append((jira_name, "Full name of JIRA user %s" % author))
    else:
        candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author))
    # Then do the same for the assignee of each of the associated JIRAs
    # Note that a given issue may not have an assignee, or the assignee may not have a full name
    for issue in issues:
        jira_assignee = jira_client.issue(issue).fields.assignee
        if not jira_assignee:
            candidates.append((NOT_FOUND, "No assignee found for %s" % issue))
            continue
        user_name = jira_assignee.name
        display_name = jira_assignee.displayName
        if display_name:
            candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name)))
        else:
            candidates.append(
                (NOT_FOUND, "No full name found for %s assignee %s" % (issue, user_name)))
    # Guard against special characters in candidate names
    # Note that the candidate name may already be in unicode (JIRA returns this)
    for i, (candidate, source) in enumerate(candidates):
        try:
            candidate = unicode(candidate, "UTF-8")
        except TypeError:
            # already in unicode
            pass
        candidates[i] = (unidecode.unidecode(candidate).strip(), source)
    return candidates
119+
120+
# Translate each invalid author by searching for possible candidates from Github and JIRA
121+
# In interactive mode, this script presents the user with a list of choices and has the user
122+
# select from this list. Additionally, the user may also choose to enter a custom name.
123+
# In non-interactive mode, this script picks the first valid author name from the candidates
124+
# If no such name exists, the original name is used (without the JIRA numbers).
125+
print "\n========================== Translating contributor list =========================="
126+
for line in contributors_file:
127+
author = line.split(" - ")[0]
128+
print "Processing author %s" % author
129+
if not author:
130+
print " ERROR: Expected the following format <author> - <contributions>"
131+
print " ERROR: Actual = %s" % line
132+
if not is_valid_author(author):
133+
new_author = author.split("/")[0]
134+
issues = author.split("/")[1:]
135+
candidates = generate_candidates(new_author, issues)
136+
# Print out potential replacement candidates along with the sources, e.g.
137+
# [X] No full name found for Github user andrewor14
138+
# [0] Andrew Or - Full name of JIRA user andrewor14
139+
# [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14
140+
# [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14
141+
# [X] No assignee found for SPARK-1763
142+
# [3] Custom
143+
candidate_names = []
144+
for candidate, source in candidates:
145+
if candidate == NOT_FOUND:
146+
print " [X] %s" % source
147+
else:
148+
index = len(candidate_names)
149+
candidate_names.append(candidate)
150+
print " [%d] %s - %s" % (index, candidate, source)
151+
custom_index = len(candidate_names)
152+
# In interactive mode, additionally provide "custom" option and await user response
153+
if INTERACTIVE_MODE:
154+
print " [%d] Custom" % custom_index
155+
response = raw_input(" Your choice: ")
156+
while not response.isdigit() or int(response) > custom_index:
157+
response = raw_input(" Please enter an integer between 0 and %d: " % custom_index)
158+
response = int(response)
159+
if response == custom_index:
160+
new_author = raw_input(" Please type a custom name for this author: ")
161+
else:
162+
new_author = candidate_names[response]
163+
# In non-interactive mode, just pick the first candidate
164+
else:
165+
valid_candidate_names = [name for name, _ in candidates\
166+
if is_valid_author(name) and name != NOT_FOUND]
167+
if valid_candidate_names:
168+
new_author = valid_candidate_names[0]
169+
# Finally, capitalize the author and replace the original one with it
170+
# If the final replacement is still invalid, log a warning
171+
if is_valid_author(new_author):
172+
new_author = capitalize_author(new_author)
173+
else:
174+
warnings.append("Unable to find a valid name %s for author %s" % (new_author, author))
175+
print " * Replacing %s with %s" % (author, new_author)
176+
line = line.replace(author, new_author)
177+
new_contributors_file.write(line)
178+
print "==================================================================================\n"
179+
contributors_file.close()
180+
new_contributors_file.close()
181+
182+
print "Translated contributors list successfully written to %s!" % new_contributors_file_name
183+
184+
# Log any warnings encountered in the process
185+
if warnings:
186+
print "\n========== Warnings encountered while translating the contributor list ==========="
187+
for w in warnings: print w
188+
print "Please manually correct these in the final contributors list at %s." % new_contributors_file_name
189+
print "==================================================================================\n"
190+

0 commit comments

Comments
 (0)