Skip to content

Commit

Permalink
crawler - Add support for crawling GitHub PRs created by users
Browse files Browse the repository at this point in the history
  • Loading branch information
morucci committed Sep 18, 2023
1 parent fe7e089 commit 3a96382
Show file tree
Hide file tree
Showing 12 changed files with 122 additions and 31 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file.

### Added

- [crawler] Enable usage of the GitHub user PRs crawler via the Monocle config.

### Changed

### Removed
Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,20 @@ Regarding the Github fine grained tokens (new):
- To crawl private repositories, select "All repositories" or "Only select repositories", then in "Repository permissions"
select "Pull Requests" and "Contents" as "Read-only".

The GitHub provider can also be configured to crawl Pull-Requests created by specific GitHub users.
For instance, the following crawler provider fetches Pull-Requests and related events for
the users john and jane:

```YAML
provider:
github_users:
- john
- jane
# Optional settings
github_url: https://api.github.com/graphql
github_token: GITHUB_TOKEN
```

Gerrit provider settings:

```YAML
Expand Down
44 changes: 39 additions & 5 deletions codegen/Monocle/Protob/Crawler.hs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions doc/openapi.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions schemas/monocle/protob/crawler.proto
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,25 @@ message Entity {
string project_name = 2;
string project_issue_name = 4;
string td_name = 3;
string user_name = 5;
}
}

// Kind of entity a crawler document stream is attached to.
enum EntityType {
ENTITY_TYPE_ORGANIZATION = 0;
ENTITY_TYPE_PROJECT = 1;
ENTITY_TYPE_TASK_DATA = 2;
// Entity type reported by streams that fetch the changes of a user.
ENTITY_TYPE_USER = 3;
}

message AddDocRequest {
string index = 1;
string crawler = 2;
string apikey = 3;
Entity entity = 4;
// changes are added when Entity is project_name
// changes are added when Entity is project_name or user_name
repeated monocle_change.Change changes = 5;
// events are added when Entity is project_name
// events are added when Entity is project_name or user_name
repeated monocle_change.ChangeEvent events = 6;
// projects are added when Entity is organization_name
repeated Project projects = 7;
Expand Down
21 changes: 15 additions & 6 deletions src/Macroscope/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ import Effectful.Concurrent.MVar qualified as E
import Effectful.Env
import Effectful.Prometheus
import Effectful.Reader.Static qualified as E
import Lentille.GitHub.UserPullRequests (streamUserPullRequests)
import Monocle.Effects

-- | A structure to carry a single crawler information.
Expand Down Expand Up @@ -362,22 +363,27 @@ getCrawler inf@(InfoCrawler _ _ crawler idents) = getCompose $ fmap addInfos (Co
pure $ Just (k, [bzCrawler bzClient])
Config.GithubProvider ghCrawler -> do
let Config.Github {..} = ghCrawler
ghToken <- lift $ Config.getSecret "GITHUB_TOKEN" github_token
(k, ghClient) <-
getClientGraphQL
(fromMaybe "https://api.github.com/graphql" github_url)
ghToken
(k, ghClient) <- getGHClient github_token github_url
let crawlers =
[ghOrgCrawler ghClient | isNothing github_repositories]
<> [ghIssuesCrawler ghClient]
<> [ghPRCrawler ghClient getIdentByAliasCB]
pure $ Just (k, crawlers)
Config.GithubUserProvider _ -> pure Nothing -- Not yet implemented
Config.GithubUserProvider ghUserCrawler -> do
let Config.GithubUser {..} = ghUserCrawler
(k, ghClient) <- getGHClient github_token github_url
pure $ Just (k, [ghUserPRCrawler ghClient getIdentByAliasCB])
Config.GithubApplicationProvider _ -> pure Nothing -- "Not (yet) implemented"
Config.TaskDataProvider -> pure Nothing -- This is a generic crawler, not managed by the macroscope
getIdentByAliasCB :: Text -> Maybe Text
getIdentByAliasCB = flip Config.getIdentByAliasFromIdents idents

-- Create a GraphQL client for a GitHub provider.
-- The token is resolved through 'Config.getSecret' (given "GITHUB_TOKEN" and
-- the optional configured value); the API URL falls back to the public
-- GitHub GraphQL endpoint when none is configured.
getGHClient mToken mAPIUrl =
  lift (Config.getSecret "GITHUB_TOKEN" mToken) >>= getClientGraphQL apiUrl
  where
    apiUrl = fromMaybe "https://api.github.com/graphql" mAPIUrl

-- Wrap 'streamMergeRequests' into a 'Changes' document stream.
glMRCrawler :: GraphClient -> (Text -> Maybe Text) -> DocumentStream es
glMRCrawler client identCB = Changes (streamMergeRequests client identCB)

Expand All @@ -396,6 +402,9 @@ getCrawler inf@(InfoCrawler _ _ crawler idents) = getCompose $ fmap addInfos (Co
-- Wrap 'streamPullRequests' into a 'Changes' document stream.
ghPRCrawler :: GraphClient -> (Text -> Maybe Text) -> DocumentStream es
ghPRCrawler ghClient identCB = Changes (streamPullRequests ghClient identCB)

-- Wrap 'streamUserPullRequests' into a 'UserChanges' document stream.
ghUserPRCrawler :: GraphClient -> (Text -> Maybe Text) -> DocumentStream es
ghUserPRCrawler ghClient identCB = UserChanges (streamUserPullRequests ghClient identCB)

-- Keep only the entries that start with "^".
gerritRegexProjects :: [Text] -> [Text]
gerritRegexProjects names = [name | name <- names, "^" `T.isPrefixOf` name]

Expand Down
7 changes: 7 additions & 0 deletions src/Macroscope/Worker.hs
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,24 @@ data DocumentStream es
Changes (UTCTime -> Text -> LentilleStream es (Change, [ChangeEvent]))
| -- | Fetch recent task data
TaskDatas (UTCTime -> Text -> LentilleStream es TaskData)
| -- | Fetch recent changes from a user
UserChanges (UTCTime -> Text -> LentilleStream es (Change, [ChangeEvent]))

-- | Map a document stream to the crawler entity type it works on.
streamEntity :: DocumentStream es -> CrawlerPB.EntityType
streamEntity (Projects _) = EntityTypeENTITY_TYPE_ORGANIZATION
streamEntity (Changes _) = EntityTypeENTITY_TYPE_PROJECT
streamEntity (TaskDatas _) = EntityTypeENTITY_TYPE_TASK_DATA
streamEntity (UserChanges _) = EntityTypeENTITY_TYPE_USER

-- | Human readable label of a document stream constructor.
streamName :: DocumentStream m -> Text
streamName (Projects _) = "Projects"
streamName (Changes _) = "Changes"
streamName (TaskDatas _) = "TaskDatas"
streamName (UserChanges _) = "UserChanges"

isTDStream :: DocumentStream m -> Bool
isTDStream = \case
Expand Down Expand Up @@ -204,6 +208,9 @@ runStreamError startTime apiKey indexName (CrawlerName crawlerName) documentStre
TaskDatas s ->
let td = extractEntityValue _TaskDataEntity
in S.map (fmap DTTaskData) (s oldestAge td)
UserChanges s ->
let user = extractEntityValue _User
in S.map (fmap DTChanges) (s oldestAge user)
where
extractEntityValue prism =
fromMaybe (error $ "Entity is not the right shape: " <> show entity)
Expand Down
6 changes: 6 additions & 0 deletions src/Monocle/Api/Server.hs
Original file line number Diff line number Diff line change
Expand Up @@ -266,11 +266,16 @@ pattern TDEntity :: LText -> Maybe CrawlerPB.Entity
pattern TDEntity td =
Just (CrawlerPB.Entity (Just (CrawlerPB.EntityEntityTdName td)))

-- | Pattern synonym for a protobuf entity whose payload is a user name
-- ('CrawlerPB.EntityEntityUserName'), wrapped in the outer 'Maybe'.
pattern UserEntity :: LText -> Maybe CrawlerPB.Entity
pattern UserEntity user =
Just (CrawlerPB.Entity (Just (CrawlerPB.EntityEntityUserName user)))

-- | Convert the optional protobuf entity of a request into an 'Entity'.
--
-- Calls 'error' when the value matches none of the known entity patterns
-- (including 'Nothing').
toEntity :: Maybe CrawlerPB.Entity -> Entity
toEntity (ProjectEntity projectName) = Project (from projectName)
toEntity (OrganizationEntity organizationName) = Organization (from organizationName)
toEntity (TDEntity tdName) = TaskDataEntity (from tdName)
toEntity (UserEntity userName) = User (from userName)
toEntity otherEntity = error $ "Unknown Entity type: " <> show otherEntity

-- | /crawler/add endpoint
Expand Down Expand Up @@ -311,6 +316,7 @@ crawlerAddDoc _auth request = do
ProjectIssue _ -> addIssues crawlerName issues issuesEvents
Organization organizationName -> addProjects crawler organizationName projects
TaskDataEntity _ -> addTDs crawlerName taskDatas
User _ -> addChanges crawlerName changes events
Left err -> pure $ toErrorResponse err
where
addTDs crawlerName taskDatas = do
Expand Down
2 changes: 2 additions & 0 deletions src/Monocle/Backend/Documents.hs
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,7 @@ instance ToJSON ECrawlerMetadataObject where
Project n -> n
ProjectIssue n -> n
TaskDataEntity n -> n
User n -> n

instance FromJSON ECrawlerMetadataObject where
parseJSON = withObject "CrawlerMetadataObject" $ \v -> do
Expand All @@ -635,6 +636,7 @@ instance FromJSON ECrawlerMetadataObject where
"organization" -> pure $ Organization evalue
"project" -> pure $ Project evalue
"taskdata" -> pure $ TaskDataEntity evalue
"user" -> pure $ User evalue
_ -> fail $ "Unknown crawler entity type name: " <> from etype
pure ECrawlerMetadataObject {..}

Expand Down
27 changes: 11 additions & 16 deletions src/Monocle/Backend/Index.hs
Original file line number Diff line number Diff line change
Expand Up @@ -930,27 +930,22 @@ initCrawlerEntities entities worker = traverse_ run entities
ensureCrawlerMetadata (CrawlerName $ getWorkerName worker) updated_since entity
defaultUpdatedSince = getWorkerUpdatedSince worker

-- | Wrap every value returned by 'Config.getCrawlerProject' in 'Project'.
getProjectEntityFromCrawler :: Config.Crawler -> [Entity]
getProjectEntityFromCrawler worker = Project <$> Config.getCrawlerProject worker

-- | Wrap every value returned by 'Config.getCrawlerProjectIssue' in 'ProjectIssue'.
getProjectIssueFromCrawler :: Config.Crawler -> [Entity]
getProjectIssueFromCrawler worker = ProjectIssue <$> Config.getCrawlerProjectIssue worker

-- | Wrap every value returned by 'Config.getCrawlerOrganization' in 'Organization'.
getOrganizationEntityFromCrawler :: Config.Crawler -> [Entity]
getOrganizationEntityFromCrawler worker = Organization <$> Config.getCrawlerOrganization worker

-- | Wrap every value returned by 'Config.getCrawlerTaskData' in 'TaskDataEntity'.
getTaskDataEntityFromCrawler :: Config.Crawler -> [Entity]
getTaskDataEntityFromCrawler worker = TaskDataEntity <$> Config.getCrawlerTaskData worker

-- | Pass to 'initCrawlerEntities' the concatenation of the project,
-- organization, task data, project issue and user entities derived from the
-- crawler configuration, together with the crawler itself.
--
-- NOTE(review): the operand list appears twice below (with and without the
-- @crawler@ argument); this looks like both sides of the diff rendered
-- together -- confirm against the repository before relying on this text.
initCrawlerMetadata :: MonoQuery :> es => IndexEffects es => Config.Crawler -> Eff es ()
initCrawlerMetadata crawler =
initCrawlerEntities
( getProjectEntityFromCrawler crawler
<> getOrganizationEntityFromCrawler crawler
<> getTaskDataEntityFromCrawler crawler
<> getProjectIssueFromCrawler crawler
( getProjectEntityFromCrawler
<> getOrganizationEntityFromCrawler
<> getTaskDataEntityFromCrawler
<> getProjectIssueFromCrawler
<> getUserEntityFromCrawler
)
crawler
where
-- Each helper wraps the values of one Config accessor in its Entity constructor.
getProjectEntityFromCrawler = Project <$> Config.getCrawlerProject crawler
getProjectIssueFromCrawler = ProjectIssue <$> Config.getCrawlerProjectIssue crawler
getOrganizationEntityFromCrawler = Organization <$> Config.getCrawlerOrganization crawler
getTaskDataEntityFromCrawler = TaskDataEntity <$> Config.getCrawlerTaskData crawler
getUserEntityFromCrawler = User <$> Config.getCrawlerUser crawler

-- Author cache functions
-------------------------
Expand Down
Loading

0 comments on commit 3a96382

Please sign in to comment.