Skip to content

Commit

Permalink
cli - Add the Janitor set-crawler-commit-date command
Browse files Browse the repository at this point in the history
  • Loading branch information
morucci committed Sep 20, 2023
1 parent 818f553 commit 9b283e3
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ All notable changes to this project will be documented in this file.

### Added

- [cli] Add a Janitor command to reset the commit date of a crawler

### Changed

### Removed
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,15 @@ docker-compose run --rm --no-deps api monocle janitor wipe-crawler-data --elasti
docker-compose start crawler
```

## Reset the crawler commit date

Monocle crawlers keep track of the last date (the commit date) on which documents were successfully fetched. The command
below overwrites that date, forcing a crawler to fetch documents again starting from the given date.

```bash
docker-compose run --rm --no-deps api monocle janitor set-crawler-commit-date --elastic elastic:9200 --config /etc/monocle/config.yaml --workspace <workspace> --crawler-name <crawler-name> --commit-date 2023-01-01
```

## Components

![architecture](./doc/architecture.png)
Expand Down
18 changes: 17 additions & 1 deletion src/CLI.hs
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,13 @@ usageJanitor =
subparser
( mkSubCommand "update-idents" "Update author identities" janitorUpdateIdent
<> mkSubCommand "wipe-crawler-data" "Remove changes/task-data and events related to a crawler name" janitorRemoveCrawlerData
<> mkSubCommand "set-crawler-commit-date" "Overwrite the crawler commit date" janitorSetCrawlerCommitDate
)
where
configOption = strOption (long "config" <> O.help "Path to configuration file" <> metavar "MONOCLE_CONFIG")
elasticOption = strOption (long "elastic" <> O.help "The Elastic endpoint url" <> metavar "MONOCLE_ELASTIC_URL")
workspaceOption = strOption (long "workspace" <> O.help "Workspace name" <> metavar "WORKSPACE")
crawlerNameOption = strOption (long "crawler-name" <> O.help "The crawler name" <> metavar "CRAWLER_NAME")
runOnWorkspace env action' workspace = runEff $ runLoggerEffect $ runElasticEffect env $ runEmptyQueryM workspace action'
noWorkspace workspaceName = "Unable to find the workspace " <> workspaceName <> " in the Monocle config"
janitorUpdateIdent = io <$> parser
Expand All @@ -163,7 +165,6 @@ usageJanitor =
Nothing -> traverse_ (runOnWorkspace env J.updateIdentsOnWorkspace) $ Config.getWorkspaces config
janitorRemoveCrawlerData = io <$> parser
where
crawlerNameOption = strOption (long "crawler-name" <> O.help "The crawler name" <> metavar "CRAWLER_NAME")
parser = (,,,) <$> configOption <*> elasticOption <*> workspaceOption <*> crawlerNameOption
io (configPath, elasticUrl, workspaceName, crawlerName) = do
config <- Config.loadConfigWithoutEnv configPath
Expand All @@ -173,6 +174,21 @@ usageJanitor =
Just workspace -> do
runOnWorkspace env (J.wipeCrawlerData crawlerName) workspace
runOnWorkspace env (J.removeTDCrawlerData crawlerName) workspace
-- Sub-command "set-crawler-commit-date": parse the command line options and
-- turn them into the IO action that updates the crawler commit date.
janitorSetCrawlerCommitDate = io <$> parser
where
-- The new date is read as raw text; it is handed over unparsed to
-- 'J.updateCrawlerMDLastUpdatedDate'.
newDateOption = strOption (long "commit-date" <> O.help "The new crawler commit-date" <> metavar "COMMIT_DATE")
-- Collect the five options, in order: config path, elastic URL, workspace
-- name, crawler name and new commit date.
parser = (,,,,) <$> configOption <*> elasticOption <*> workspaceOption <*> crawlerNameOption <*> newDateOption
io (configPath, elasticUrl, workspaceName, crawlerName, newDate) = do
config <- Config.loadConfigWithoutEnv configPath
env <- mkEnv $ getURL elasticUrl
-- Look the workspace up in the loaded config by name.
void $ case Config.lookupTenant (Config.getWorkspaces config) workspaceName of
-- Unknown workspace: only print the 'noWorkspace' message.
Nothing -> print $ noWorkspace workspaceName
-- Known workspace: run the date update against it.
Just workspace ->
runOnWorkspace
env
( J.updateCrawlerMDLastUpdatedDate workspace crawlerName newDate
)
workspace

---------------------------------------------------------------
-- Lentille cli
Expand Down
21 changes: 13 additions & 8 deletions src/Monocle/Backend/Index.hs
Original file line number Diff line number Diff line change
Expand Up @@ -943,14 +943,19 @@ getTaskDataEntityFromCrawler :: Config.Crawler -> [Entity]
getTaskDataEntityFromCrawler worker = TaskDataEntity <$> Config.getCrawlerTaskData worker

initCrawlerMetadata :: MonoQuery :> es => IndexEffects es => Config.Crawler -> Eff es ()
initCrawlerMetadata crawler =
initCrawlerEntities
( getProjectEntityFromCrawler crawler
<> getOrganizationEntityFromCrawler crawler
<> getTaskDataEntityFromCrawler crawler
<> getProjectIssueFromCrawler crawler
)
crawler
initCrawlerMetadata crawler = initCrawlerEntities (getCrawlerEntities crawler) crawler

-- | Overwrite the last-updated date of a crawler's entities.
--
-- Calls 'setLastUpdated' with the crawler's name and @newDate@ for each
-- entity returned by 'getCrawlerEntities'.
resetCrawlerMetadataLastUpdatedDate :: MonoQuery :> es => IndexEffects es => Config.Crawler -> UTCTime -> Eff es ()
resetCrawlerMetadataLastUpdatedDate crawler newDate =
  traverse_ resetEntity (getCrawlerEntities crawler)
  where
    name = CrawlerName (Config.getCrawlerName crawler)
    resetEntity entity = setLastUpdated name newDate entity

-- | Collect the entities of a crawler: the project, organization, task-data
-- and project-issue entities, concatenated in that order.
getCrawlerEntities :: Config.Crawler -> [Entity]
getCrawlerEntities crawler =
  mconcat
    [ getProjectEntityFromCrawler crawler
    , getOrganizationEntityFromCrawler crawler
    , getTaskDataEntityFromCrawler crawler
    , getProjectIssueFromCrawler crawler
    ]

-- Author cache functions
-------------------------
Expand Down
18 changes: 18 additions & 0 deletions src/Monocle/Backend/Janitor.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ module Monocle.Backend.Janitor (
updateIdentsOnEvents,
updateIdentsOnChanges,
updateIdentsOnWorkspace,
updateCrawlerMDLastUpdatedDate,
removeProjectMD,
) where

Expand Down Expand Up @@ -300,3 +301,20 @@ removeMD entity crawlerName = do
>>> I.bulkStream
)
logInfo "Deleted metadata" ["crawler" .= crawlerName, "count" .= deletedCount]

-- | Reset the last-updated date of a crawler's metadata from textual inputs.
--
-- The date text is parsed with 'parseDateValue' and the crawler is looked up
-- by name in the given index. When either step fails, the corresponding
-- message is logged (the date is checked first) and nothing is updated;
-- otherwise 'I.resetCrawlerMetadataLastUpdatedDate' is run.
updateCrawlerMDLastUpdatedDate :: QEffects es => Config.Index -> Text -> Text -> Eff es ()
updateCrawlerMDLastUpdatedDate index crawlerNameText newDateText =
  case checkedParams of
    Right (newDate, crawler) -> I.resetCrawlerMetadataLastUpdatedDate crawler newDate
    Left err -> logInfo err ["crawler" .= crawlerNameText, "newDate" .= newDateText]
  where
    -- 'Either' short-circuits on the first 'Left', so a date error takes
    -- precedence over an unknown crawler.
    checkedParams =
      (,)
        <$> orFail
          "Unable to parse the date: Expected format YYYY-mm-dd or YYYY-mm-dd hh:mm:ss UTC"
          (parseDateValue (from newDateText))
        <*> orFail "Unable to find the crawler" (Config.lookupCrawler index crawlerNameText)

    -- Turn a missing value into a 'Left' carrying the given message.
    orFail :: Text -> Maybe a -> Either Text a
    orFail msg = maybe (Left msg) Right
4 changes: 2 additions & 2 deletions src/Monocle/Config.hs
Original file line number Diff line number Diff line change
Expand Up @@ -373,9 +373,9 @@ lookupProject index projectName = find isProject (fromMaybe [] (projects index))

-- | Find a 'Crawler' in an 'Index'
lookupCrawler :: Index -> Text -> Maybe Crawler
lookupCrawler index crawlerName = find isProject index.crawlers
lookupCrawler index crawlerName = find isCrawler index.crawlers
where
isProject Crawler {..} = name == crawlerName
isCrawler Crawler {..} = name == crawlerName

-- | Find an 'Ident' in an 'Index'
lookupIdent :: Index -> Text -> Maybe Ident
Expand Down

0 comments on commit 9b283e3

Please sign in to comment.