committers.py
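"""Suspect-commit detection: match an event's stack frame paths against the
file changes of release commits to surface the commits (and authors) most
likely responsible for the event."""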
from __future__ import annotations

import operator
from collections import defaultdict
from collections.abc import Iterator, Mapping, MutableMapping, Sequence
from enum import Enum
from functools import reduce
from typing import Any, TypedDict

from django.core.cache import cache
from django.db.models import Q

from sentry.api.serializers import serialize
from sentry.api.serializers.models.commit import CommitSerializer, get_users_for_commits
from sentry.api.serializers.models.release import Author
from sentry.eventstore.models import BaseEvent, Event, GroupEvent
from sentry.models.commit import Commit
from sentry.models.commitfilechange import CommitFileChange
from sentry.models.group import Group
from sentry.models.groupowner import GroupOwner, GroupOwnerType
from sentry.models.project import Project
from sentry.models.release import Release
from sentry.models.releasecommit import ReleaseCommit
from sentry.users.services.user.service import user_service
from sentry.utils import metrics
from sentry.utils.event_frames import find_stack_frames, get_sdk_name, munged_filename_and_frames
from sentry.utils.hashlib import hash_values

PATH_SEPARATORS = frozenset(["/", "\\"])


def tokenize_path(path: str) -> Iterator[str]:
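    """Yield the segments of ``path`` from last to first, splitting on the first
    separator found and dropping empty segments."""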
for sep in PATH_SEPARATORS:
if sep in path:
# Exclude empty path segments as some repository integrations
# start their paths with `/` which we want to ignore.
return reversed([x for x in path.split(sep) if x != ""])
else:
        return iter([path])


def score_path_match_length(path_a: str, path_b: str) -> int:
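    """Count how many trailing path segments of ``path_a`` and ``path_b`` match,
    case-insensitively."""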
score = 0
for a, b in zip(tokenize_path(path_a), tokenize_path(path_b)):
if a.lower() != b.lower():
break
score += 1
    return score


def get_frame_paths(event: Event | GroupEvent) -> Any | Sequence[Any]:
    return find_stack_frames(event.data)


def release_cache_key(release: Release) -> str:
return f"release_commits:{release.id}"
def _get_commits(releases: Sequence[Release]) -> Sequence[Commit]:
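    """Return the distinct commits for the given releases, reading each release's
    commit list from cache and falling back to the database for misses."""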
commits = []
fetched = cache.get_many([release_cache_key(release) for release in releases])
if fetched:
missed = []
for release in releases:
cached_commits = fetched.get(release_cache_key(release))
if cached_commits is None:
missed.append(release)
else:
commits += [c for c in cached_commits if c not in commits]
else:
missed = list(releases)
if missed:
release_commits = ReleaseCommit.objects.filter(release__in=missed).select_related(
"commit", "release", "commit__author"
)
to_cache = defaultdict(list)
for rc in release_commits:
to_cache[release_cache_key(rc.release)].append(rc.commit)
if rc.commit not in commits:
commits.append(rc.commit)
cache.set_many(to_cache)
    return commits


def _get_commit_file_changes(
commits: Sequence[Commit], path_name_set: set[str]
) -> Sequence[CommitFileChange]:
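    """Return file changes on ``commits`` whose filenames end with one of the
    file names found in ``path_name_set``."""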
# Get distinct file names and bail if there are no files.
filenames = {next(tokenize_path(path), None) for path in path_name_set}
filenames = {path for path in filenames if path is not None}
    if not filenames:
return []
    # Build a single query to get all of the commit file changes that might match the first n frames.
path_query = reduce(operator.or_, (Q(filename__iendswith=path) for path in filenames))
commit_file_change_matches = CommitFileChange.objects.filter(path_query, commit__in=commits)
    return list(commit_file_change_matches)


def _match_commits_path(
commit_file_changes: Sequence[CommitFileChange], path: str
) -> Sequence[tuple[Commit, int]]:
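    """Return the unique commits whose file changes best match ``path``, paired
    with their match scores."""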
    # Find the commits that best match the runtime path.
matching_commits: MutableMapping[int, tuple[Commit, int]] = {}
best_score = 1
for file_change in commit_file_changes:
score = score_path_match_length(file_change.filename, path)
if score > best_score:
            # Reset matches when a better match is found.
best_score = score
matching_commits = {}
if score == best_score:
# skip 1-score matches when file change is longer than 1 token
if score == 1 and len(list(tokenize_path(file_change.filename))) > 1:
continue
# we want a list of unique commits that tie for longest match
matching_commits[file_change.commit.id] = (file_change.commit, score)
    return list(matching_commits.values())


class AuthorCommits(TypedDict):
author: Author | None
    commits: Sequence[tuple[Commit, int]]


class AuthorCommitsSerialized(TypedDict):
author: Author | None
    commits: Sequence[MutableMapping[str, Any]]


class AnnotatedFrame(TypedDict):
frame: str
    commits: Sequence[tuple[Commit, int]]


class SuspectCommitType(Enum):
    """Used to distinguish old suspect commits from the newer commits obtained via the commit_context."""

RELEASE_COMMIT = "via commit in release"
INTEGRATION_COMMIT = "via SCM integration"
def _get_committers(
annotated_frames: Sequence[AnnotatedFrame],
commits: Sequence[tuple[Commit, int]],
) -> Sequence[AuthorCommits]:
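    """Group the matched commits by author, weighting authors whose commits
    appear in earlier frames more heavily, and return each author's commits."""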
    # Extract the unique committers and return their serialized Sentry accounts.
committers: MutableMapping[int, int] = defaultdict(int)
# organize them by this heuristic (first frame is worth 5 points, second is worth 4, etc.)
limit = 5
for annotated_frame in annotated_frames:
if limit == 0:
break
for commit, score in annotated_frame["commits"]:
if not commit.author_id:
continue
committers[commit.author_id] += limit
limit -= 1
if limit == 0:
break
author_users: Mapping[str, Author] = get_users_for_commits([c for c, _ in commits])
return [
{
"author": author_users.get(str(author_id)),
"commits": [
(commit, score) for (commit, score) in commits if commit.author_id == author_id
],
}
for author_id, _ in sorted(committers.items(), key=operator.itemgetter(1))
    ]


def get_previous_releases(
project: Project, start_version: str, limit: int = 5
) -> Any | Sequence[Release]:
    # Given a release version + project, return the previous
    # `limit` releases (including the release specified by `start_version`).
key = "get_previous_releases:1:%s" % hash_values([project.id, start_version, limit])
rv = cache.get(key)
if rv is None:
try:
first_release = Release.objects.filter(
organization_id=project.organization_id, version=start_version, projects=project
).get()
except Release.DoesNotExist:
rv = []
else:
start_date = first_release.date_released or first_release.date_added
# XXX: This query could be very inefficient for projects with a large
# number of releases. To work around this, we only check 100 releases
# ordered by highest release id, which is generally correlated with
# most recent releases for a project. This isn't guaranteed to be correct,
# since `date_released` could end up out of order, but should be close
# enough for what we need this for with suspect commits.
# To make this better, we should denormalize the coalesce of date_released
# and date_added onto `ReleaseProject`, which would have benefits for other
# similar queries.
rv = list(
Release.objects.raw(
"""
SELECT sr.*
FROM sentry_release as sr
INNER JOIN (
SELECT release_id
FROM sentry_release_project
WHERE project_id = %s
AND sentry_release_project.release_id <= %s
ORDER BY release_id desc
LIMIT 100
) AS srp ON (sr.id = srp.release_id)
WHERE sr.organization_id = %s
AND coalesce(sr.date_released, sr.date_added) <= %s
ORDER BY coalesce(sr.date_released, sr.date_added) DESC
LIMIT %s;
""",
[project.id, first_release.id, project.organization_id, start_date, limit],
)
)
cache.set(key, rv, 60)
    return rv


def get_event_file_committers(
project: Project,
group_id: int,
event_frames: Sequence[Mapping[str, Any]],
event_platform: str,
frame_limit: int = 25,
sdk_name: str | None = None,
) -> Sequence[AuthorCommits]:
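    """Find suspect committers for an event by matching its stack frame paths
    against file changes from commits in the group's first release and the
    releases preceding it."""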
group = Group.objects.get_from_cache(id=group_id)
first_release_version = group.get_first_release()
if not first_release_version:
raise Release.DoesNotExist
releases = get_previous_releases(project, first_release_version)
if not releases:
raise Release.DoesNotExist
commits = _get_commits(releases)
if not commits:
raise Commit.DoesNotExist
frames = event_frames or []
munged = munged_filename_and_frames(event_platform, frames, "munged_filename", sdk_name)
if munged:
frames = munged[1]
app_frames = [frame for frame in frames if frame and frame.get("in_app")][-frame_limit:]
if not app_frames:
app_frames = [frame for frame in frames][-frame_limit:]
# TODO(maxbittker) return this set instead of annotated frames
    # XXX(dcramer): frames may not define a filepath. For example, in Java it's common
# to only have a module name/path
path_set = {
str(f) for f in (get_stacktrace_path_from_event_frame(frame) for frame in app_frames) if f
}
file_changes: Sequence[CommitFileChange] = (
_get_commit_file_changes(commits, path_set) if path_set else []
)
commit_path_matches: Mapping[str, Sequence[tuple[Commit, int]]] = {
path: _match_commits_path(file_changes, path) for path in path_set
}
annotated_frames: Sequence[AnnotatedFrame] = [
{
"frame": str(frame),
"commits": commit_path_matches.get(
str(get_stacktrace_path_from_event_frame(frame)),
[],
),
}
for frame in app_frames
]
relevant_commits: Sequence[tuple[Commit, int]] = [
match for matches in commit_path_matches.values() for match in matches
]
    return _get_committers(annotated_frames, relevant_commits)


def get_serialized_event_file_committers(
project: Project, event: BaseEvent, frame_limit: int = 25
) -> Sequence[AuthorCommitsSerialized]:
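    """Return serialized suspect committers for an event. Prefer suspect commits
    recorded via the SCM integration (``GroupOwner`` rows with commit context);
    otherwise fall back to matching against release commits."""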
group_owners = GroupOwner.objects.filter(
group_id=event.group_id,
project=project,
organization_id=project.organization_id,
type=GroupOwnerType.SUSPECT_COMMIT.value,
context__isnull=False,
).order_by("-date_added")
if len(group_owners) > 0:
owner = next(filter(lambda go: go.context.get("commitId"), group_owners), None)
if not owner:
return []
commit = Commit.objects.get(id=owner.context.get("commitId"))
commit_author = commit.author
if not commit_author:
return []
author = {"email": commit_author.email, "name": commit_author.name}
if owner.user_id is not None:
serialized_owners = user_service.serialize_many(filter={"user_ids": [owner.user_id]})
# No guarantee that just because the user_id is set that the value exists, so we still have to check
if serialized_owners:
author = serialized_owners[0]
return [
{
"author": author,
"commits": [
serialize(
commit,
serializer=CommitSerializer(
exclude=["author"],
type=SuspectCommitType.INTEGRATION_COMMIT.value,
),
)
],
}
]
# TODO(nisanthan): We create GroupOwner records for
# legacy Suspect Commits in process_suspect_commits task.
# We should refactor to query GroupOwner rather than recalculate.
# But we need to store the commitId and a way to differentiate
# if the Suspect Commit came from ReleaseCommits or CommitContext.
else:
event_frames = get_frame_paths(event)
sdk_name = get_sdk_name(event.data)
committers = get_event_file_committers(
project,
event.group_id,
event_frames,
event.platform,
frame_limit=frame_limit,
sdk_name=sdk_name,
)
commits = [commit for committer in committers for commit in committer["commits"]]
serialized_commits: Sequence[MutableMapping[str, Any]] = serialize(
[c for (c, score) in commits],
serializer=CommitSerializer(
exclude=["author"],
type=SuspectCommitType.RELEASE_COMMIT.value,
),
)
serialized_commits_by_id = {}
for (commit, score), serialized_commit in zip(commits, serialized_commits):
serialized_commit["score"] = score
serialized_commits_by_id[commit.id] = serialized_commit
serialized_committers: list[AuthorCommitsSerialized] = []
for committer in committers:
commit_ids = [commit.id for (commit, _) in committer["commits"]]
commits_result = [serialized_commits_by_id[commit_id] for commit_id in commit_ids]
serialized_committers.append(
{"author": committer["author"], "commits": dedupe_commits(commits_result)}
)
metrics.incr(
"feature.owners.has-committers",
instance="hit" if committers else "miss",
skip_internal=False,
)
        return serialized_committers


def dedupe_commits(
commits: Sequence[MutableMapping[str, Any]]
) -> Sequence[MutableMapping[str, Any]]:
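    """Drop duplicate serialized commits by id, preserving order."""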
return list({c["id"]: c for c in commits}.values())
def get_stacktrace_path_from_event_frame(frame: Mapping[str, Any]) -> str | None:
"""
Returns the filepath from a stacktrace's frame.
frame: Event frame
"""
return frame.get("munged_filename") or frame.get("filename") or frame.get("abs_path")