Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 116 additions & 34 deletions documentation/scripts/community_stars.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,20 @@
- Top 5 Team Stars (Block employees, non-goose team)
- Monthly Leaderboard (all eligible contributors)

The script automatically:
- Fetches contributor data from GitHub API (with retry logic)
- Checks public org memberships to detect Block employees
- Categorizes contributors as Block or External
- Caches data locally for faster subsequent runs

Usage:
python3 community_stars.py "November 2025"
python3 community_stars.py "November 1, 2025 - November 17, 2025"
python3 community_stars.py "2025-11-01 - 2025-11-17"

Requirements:
- GitHub contributor data at /tmp/github_contributors.json
- Team list file (local or from GitHub)
- Internet connection (to fetch GitHub data)
- Team list file at documentation/scripts/community_stars_teams.txt
"""

import json
Expand All @@ -24,11 +30,52 @@
from datetime import datetime
import calendar
from pathlib import Path
import time

# GitHub URL for team list file
TEAMS_FILE_URL = "https://raw.githubusercontent.com/block/goose/main/documentation/scripts/community_stars_teams.txt"
LOCAL_TEAMS_FILE = Path(__file__).parent / "community_stars_teams.txt"

# Block-related organizations to check
BLOCK_ORGS = {'square', 'block', 'squareup', 'block-ghc', 'cashapp'}

def is_block_employee(username):
    """Check whether a GitHub user appears to be a Block employee.

    Makes a single API call to fetch the user's profile (which includes the
    ``company`` field); only calls the orgs endpoint when the company field
    does not match a Block-related keyword, keeping API usage minimal.

    Args:
        username: GitHub login of the contributor to check.

    Returns:
        True if the profile company field or a public org membership matches
        a Block-related organization; False otherwise. Also returns False on
        any error (rate limit, network failure, etc.), so unverifiable users
        default to being treated as external.
    """
    # Local import: only this helper needs percent-encoding.
    from urllib.parse import quote

    try:
        # Encode the username so unusual characters cannot break the URL
        # or inject extra path segments.
        safe_username = quote(username)

        # First check the user's profile (single API call).
        url = f"https://api.github.com/users/{safe_username}"
        # timeout prevents hanging indefinitely if the GitHub API stalls.
        with urllib.request.urlopen(url, timeout=30) as response:
            user_data = json.loads(response.read().decode('utf-8'))

        # Check company field first (no additional API call needed).
        company = user_data.get('company', '').lower() if user_data.get('company') else ''
        if company:
            # Check for Block-related keywords in company field
            block_keywords = ['block', 'square', 'cash app', 'cashapp', 'tidal']
            if any(keyword in company for keyword in block_keywords):
                return True

        # Only check orgs if company field didn't match (second API call only when needed)
        url = f"https://api.github.com/users/{safe_username}/orgs"
        with urllib.request.urlopen(url, timeout=30) as response:
            orgs = json.loads(response.read().decode('utf-8'))

        # Check if any org matches Block orgs (case-insensitive)
        user_orgs = {org['login'].lower() for org in orgs}
        if user_orgs & BLOCK_ORGS:
            return True

        return False

    except Exception:
        # If we can't check (rate limit, network error, etc.), return False.
        # This means we'll default to treating them as external.
        return False
Comment on lines 42 to 77
Copy link

Copilot AI Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The is_block_employee function makes two sequential API calls for every user check (orgs then user profile), which could be slow and hit rate limits when processing many contributors. Consider fetching both in parallel or combining the user profile check (which includes org data if public) into a single call.

Copilot uses AI. Check for mistakes.
Comment on lines +42 to +77
Copy link

Copilot AI Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The is_block_employee function makes unauthenticated GitHub API calls without checking rate limits. For unauthenticated requests, GitHub's rate limit is only 60 requests per hour. With the 0.1s delay on line 291, processing 60+ contributors will exceed this limit and cause failures. Consider adding a GitHub token for authentication (5000 req/hr) or implementing exponential backoff when rate limit errors occur.

Copilot uses AI. Check for mistakes.

def load_team_lists():
"""Load and parse team lists from file (local or GitHub)."""
content = None
Expand All @@ -51,7 +98,6 @@ def load_team_lists():
goose_maintainers = set()
block_non_goose = set()
external_goose = set()
external = set()
bots = set()

current_section = None
Expand All @@ -67,28 +113,22 @@ def load_team_lists():
current_section = 'block_non_goose'
elif '# External, goose' in line:
current_section = 'external_goose'
elif line.startswith('# External') and 'goose' not in line.lower():
current_section = 'external'
elif '# Bots' in line:
current_section = 'bots'
continue

# Add username to appropriate set (lowercase for case-insensitive matching)
# Apply .lower() to entire username including brackets (e.g., "dependabot[bot]")
# This matches the pattern used above: 'goose' not in line.lower()
username = line.lower()
if current_section == 'goose_maintainers':
goose_maintainers.add(username)
elif current_section == 'block_non_goose':
block_non_goose.add(username)
elif current_section == 'external_goose':
external_goose.add(username)
elif current_section == 'external':
external.add(username)
elif current_section == 'bots':
bots.add(username)

return goose_maintainers, block_non_goose, external_goose, external, bots
return goose_maintainers, block_non_goose, external_goose, bots

def parse_date_range(date_input):
"""Parse various date input formats and return start/end timestamps."""
Expand Down Expand Up @@ -148,25 +188,69 @@ def main():
sys.exit(1)

# Load team lists
goose_maintainers, block_non_goose, external_goose, external, bots = load_team_lists()
goose_maintainers, block_non_goose, external_goose, bots = load_team_lists()

# Load GitHub data
github_data_file = '/tmp/github_contributors.json'
contributors_data = None

# Try to load existing file first
try:
with open(github_data_file, 'r') as f:
contributors_data = json.load(f)
except FileNotFoundError:
print(f"Error: GitHub contributor data not found at {github_data_file}")
print("Please run: curl -s -H 'Accept: application/vnd.github.v3+json' 'https://api.github.com/repos/block/goose/stats/contributors' > /tmp/github_contributors.json")
sys.exit(1)
except json.JSONDecodeError as e:
print(f"Error: Invalid JSON in {github_data_file}")
print(f"Details: {e}")
print("The GitHub API may have returned an error. Try fetching the data again.")
sys.exit(1)

# Validate the data is not empty or invalid
if not contributors_data or not isinstance(contributors_data, list) or len(contributors_data) == 0:
print(f"Warning: GitHub data file exists but is empty or invalid. Fetching fresh data...", file=sys.stderr)
contributors_data = None
except (FileNotFoundError, json.JSONDecodeError):
print(f"GitHub data file not found or invalid. Fetching fresh data...", file=sys.stderr)
contributors_data = None

# Fetch from GitHub API if needed
if contributors_data is None:
print("Fetching contributor data from GitHub API...", file=sys.stderr)
max_retries = 3
retry_delay = 2

for attempt in range(max_retries):
try:
url = "https://api.github.com/repos/block/goose/stats/contributors"
with urllib.request.urlopen(url, timeout=30) as response:
contributors_data = json.loads(response.read().decode('utf-8'))

# Validate the response
if contributors_data and isinstance(contributors_data, list) and len(contributors_data) > 0:
# Save to file for future use
with open(github_data_file, 'w') as f:
json.dump(contributors_data, f)
print(f"✓ Successfully fetched data for {len(contributors_data)} contributors", file=sys.stderr)
break
else:
print(f"Attempt {attempt + 1}/{max_retries}: GitHub API returned empty data. Retrying...", file=sys.stderr)
contributors_data = None
if attempt < max_retries - 1:
time.sleep(retry_delay)
except Exception as e:
print(f"Attempt {attempt + 1}/{max_retries}: Error fetching from GitHub API: {e}", file=sys.stderr)
if attempt < max_retries - 1:
time.sleep(retry_delay)
else:
print("\nError: Could not fetch GitHub contributor data after multiple attempts.")
print("The GitHub stats API may be temporarily unavailable or still computing statistics.")
print("Please try again in a few minutes.")
sys.exit(1)

if contributors_data is None:
print("\nError: GitHub API returned empty data after multiple attempts.")
print("The repository statistics may still be computing. Please try again in a few minutes.")
sys.exit(1)

# Process contributors
contributor_stats = []
checked_orgs = {} # Cache org checks to avoid redundant API calls

print("Checking contributor organizations...", file=sys.stderr)

for contributor in contributors_data:
# Skip if author is None (deleted users)
Expand Down Expand Up @@ -199,10 +283,18 @@ def main():
# Categorize (only Block non-goose and External now)
if username_lower in block_non_goose:
category = 'block_non_goose'
elif username_lower in external:
category = 'external'
else:
category = 'unknown'
# Check if user is in a Block org (with caching)
if username not in checked_orgs:
checked_orgs[username] = is_block_employee(username)
# Add a small delay to avoid rate limiting
time.sleep(0.1)
Copy link

Copilot AI Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 0.1 second delay applies to every uncached contributor check, even if they're already in the block_non_goose list (line 286). Move this delay inside the is_block_employee function or only apply it when actually making API calls to avoid unnecessary delays.

Copilot uses AI. Check for mistakes.

if checked_orgs[username]:
Comment on lines +288 to +293
Copy link

Copilot AI Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cache check uses username but the categorization check at line 286 uses username_lower. If a username differs only in case, it could bypass the cache and make redundant API calls. Use username_lower consistently for cache keys.

Suggested change
if username not in checked_orgs:
checked_orgs[username] = is_block_employee(username)
# Add a small delay to avoid rate limiting
time.sleep(0.1)
if checked_orgs[username]:
if username_lower not in checked_orgs:
checked_orgs[username_lower] = is_block_employee(username)
# Add a small delay to avoid rate limiting
time.sleep(0.1)
if checked_orgs[username_lower]:

Copilot uses AI. Check for mistakes.
category = 'block_non_goose'
print(f" ✓ Detected Block employee: @{username}", file=sys.stderr)
else:
category = 'external'

contributor_stats.append({
'username': username,
Expand All @@ -220,7 +312,6 @@ def main():
# Separate by category
block_list = [c for c in contributor_stats if c['category'] == 'block_non_goose']
external_list = [c for c in contributor_stats if c['category'] == 'external']
unknown_list = [c for c in contributor_stats if c['category'] == 'unknown']

# Get top 5 from each
top_external = external_list[:5]
Expand Down Expand Up @@ -255,25 +346,16 @@ def main():
print("-" * 70)
if contributor_stats:
for i, contrib in enumerate(contributor_stats, 1):
cat_label = "External" if contrib['category'] == 'external' else "Block" if contrib['category'] == 'block_non_goose' else "Unknown"
cat_label = "External" if contrib['category'] == 'external' else "Block"
print(f"{i:2d}. @{contrib['username']:20s} - {contrib['commits']:3d} commits, {contrib['total_lines']:6,d} lines [{cat_label}]")
else:
print("No contributors found for this period.")

if unknown_list:
print()
print("⚠️ UNKNOWN CONTRIBUTORS (not in team lists):")
print("-" * 70)
for contrib in unknown_list:
print(f" @{contrib['username']:20s} - {contrib['commits']:3d} commits, {contrib['total_lines']:6,d} lines")

print()
print("=" * 70)
print(f"Total contributors (excluding bots, goose maintainers, external goose): {len(contributor_stats)}")
print(f" External: {len(external_list)}")
print(f" Block (non-goose): {len(block_list)}")
if unknown_list:
print(f" Unknown: {len(unknown_list)}")
print("=" * 70)

if __name__ == "__main__":
Expand Down
44 changes: 4 additions & 40 deletions documentation/scripts/community_stars_teams.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Community Stars Team Lists
# This file categorizes contributors for the block/goose Community Stars program
# Format: One username per line under each category header
#
# Note: External contributors (eligible for Community All-Stars) are automatically
# detected - they don't need to be listed here. Anyone not in the lists below
# defaults to "external" unless they have public Block org membership.

# Goose Maintainers (excluded from rankings)
angiejones
Expand Down Expand Up @@ -65,46 +69,6 @@ chaitanyarahalkar
The-Best-Codes
Abhijay007

# External (eligible for Community All-Stars)
ARYPROGRAMMER
dbraduan
codefromthecrypt
Better-Boy
GaryZhous
iandouglas
lamchau
laanak08
Lymah123
the-matrixneo
arielherself
Developerayo
SalvatoreT
sheikhlimon
cgwalters
Anudhyan
johnlanda
alexyao2015
aegntic
bwalding
ajgray-stripe
sfc-gh-twhite
adhintz
sana-db
toyamagu-2021
Shreyanshsingh23
Jay4242
jalateras
sings-to-bees-on-wednesdays
myaple
necaris
par5ul1
rockwotj
ki3ani
vlascik
eyelight
nick-w-nick
ayax79

# Bots (excluded from rankings)
dependabot[bot]
SquareGist
Expand Down
Loading