Skip to content

Commit d821899

Browse files
azure-sdk and weshaggard
authored and committed
Add caching support to verify-links (#20902)
- Update link checking pipeline to enable caching
- Add loading cache file from an HTTP endpoint
- Add retry logic when hitting 429

Fix verify link renaming scenarios
- Correctly error if link format is incorrect
- Handle case where the replace regex is not passed
- Improve the error logging so it is easier to identify the broken links

Co-authored-by: Wes Haggard <[email protected]>
1 parent 61c1bd2 commit d821899

File tree

2 files changed

+134
-53
lines changed

2 files changed

+134
-53
lines changed

eng/common/pipelines/templates/steps/verify-links.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ steps:
2222
arguments: >
2323
-urls ${{ parameters.Urls }}
2424
-rootUrl "file://${{ parameters.WorkingDirectory }}/${{ parameters.Directory }}"
25-
-recursive: ${{ parameters.Recursive }}
25+
-recursive: ${{ parameters.Recursive }}
2626
-ignoreLinksFile ${{ parameters.IgnoreLinksFile }}
2727
-branchReplaceRegex "${{ parameters.BranchReplaceRegex }}"
2828
-branchReplacementName ${{ parameters.BranchReplacementName }}
2929
-devOpsLogging: $true
30-
-checkLinkGuidance: ${{ parameters.CheckLinkGuidance }}
30+
-checkLinkGuidance: ${{ parameters.CheckLinkGuidance }}
31+
-inputCacheFile "https://azuresdkartifacts.blob.core.windows.net/verify-links-cache/verify-links-cache.txt"

eng/common/scripts/Verify-Links.ps1

Lines changed: 131 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -12,47 +12,48 @@
1212
Specifies the file that contains a set of links to ignore when verifying.
1313
1414
.PARAMETER devOpsLogging
15-
Switch that will enable devops specific logging for warnings
15+
Switch that will enable devops specific logging for warnings
1616
1717
.PARAMETER recursive
18-
Check the links recursively based on recursivePattern.
19-
18+
Check the links recursively based on recursivePattern.
19+
2020
.PARAMETER baseUrl
2121
Recursively check links for all links verified that begin with this baseUrl, defaults to the folder the url is contained in.
22-
22+
2323
.PARAMETER rootUrl
24-
Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
25-
24+
Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
25+
2626
.PARAMETER errorStatusCodes
27-
List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
28-
27+
List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
28+
2929
.PARAMETER branchReplaceRegex
3030
Regex to check if the link needs to be replaced. E.g. ^(https://github.com/.*/(?:blob|tree)/)master(/.*)$
31-
31+
3232
.PARAMETER branchReplacementName
33-
The substitute branch name or SHA commit.
34-
33+
The substitute branch name or SHA commit.
34+
3535
.PARAMETER checkLinkGuidance
36-
Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
37-
36+
Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
37+
3838
.PARAMETER userAgent
3939
UserAgent to be configured for web requests. Defaults to current Chrome version.
4040
41-
.INPUTS
42-
None. No required inputs.
41+
.PARAMETER inputCacheFile
42+
Path or http(s) URL of a file that contains a list of links that are known to be valid, so we can skip checking them.
4343
44-
.OUTPUTS
45-
None. Verify-Links.ps1 does not generate any output.
44+
.PARAMETER outputCacheFile
45+
Path to a file to which the script will write all the validated links after running all checks.
4646
4747
.EXAMPLE
48-
PS> .\Verify-Links.ps1
48+
PS> .\Verify-Links.ps1 C:\README.md
4949
5050
.EXAMPLE
51-
PS> .\Verify-Links.ps1 -urls C:\README.md
51+
PS> .\Verify-Links.ps1 https://azure.github.io/azure-sdk/index.html
5252
5353
.EXAMPLE
54-
PS> .\Verify-Links -urls C:\README.md -checkLinkGuidance $true
54+
PS> .\Verify-Links C:\README.md -checkLinkGuidance $true
5555
#>
56+
[CmdletBinding()]
5657
param (
5758
[string[]] $urls,
5859
[string] $ignoreLinksFile = "$PSScriptRoot/ignore-links.txt",
@@ -64,7 +65,9 @@ param (
6465
[string] $branchReplaceRegex = "",
6566
[string] $branchReplacementName = "",
6667
[bool] $checkLinkGuidance = $false,
67-
[string] $userAgent
68+
[string] $userAgent,
69+
[string] $inputCacheFile,
70+
[string] $outputCacheFile
6871
)
6972

7073
$ProgressPreference = "SilentlyContinue"; # Disable invoke-webrequest progress dialog
@@ -88,7 +91,7 @@ function NormalizeUrl([string]$url){
8891
}
8992

9093
if ($script:rootUrl -eq "") {
91-
if ($uri.IsFile) {
94+
if ($uri.IsFile) {
9295
# for files default to the containing directory
9396
$script:rootUrl = $script:baseUrl;
9497
}
@@ -129,7 +132,7 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
129132
# If the link is mailto, skip it.
130133
if ($link.StartsWith("mailto:")) {
131134
Write-Verbose "Skipping $link because it is a mailto link."
132-
return $null
135+
return
133136
}
134137

135138
$linkUri = [System.Uri]$link;
@@ -156,12 +159,12 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
156159
# If the link is not a web request, like mailto, skip it.
157160
if (!$linkUri.Scheme.StartsWith("http") -and !$linkUri.IsFile) {
158161
Write-Verbose "Skipping $linkUri because it is not http or file based."
159-
return $null
162+
return
160163
}
161164

162165
if ($null -ne $ignoreLinks -and ($ignoreLinks.Contains($link) -or $ignoreLinks.Contains($linkUri.ToString()))) {
163166
Write-Verbose "Ignoring invalid link $linkUri because it is in the ignore file."
164-
return $null
167+
return
165168
}
166169

167170
return $linkUri;
@@ -177,28 +180,34 @@ function ParseLinks([string]$baseUri, [string]$htmlContent)
177180
#$hrefs | Foreach-Object { Write-Host $_ }
178181

179182
Write-Verbose "Found $($hrefs.Count) raw href's in page $baseUri";
180-
$links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value } | Sort-Object -Unique
183+
$links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value }
181184

182185
#$links | Foreach-Object { Write-Host $_ }
183186

184187
return $links
185188
}
186189

187-
function CheckLink ([System.Uri]$linkUri)
190+
function CheckLink ([System.Uri]$linkUri, $allowRetry=$true)
188191
{
189192
if(!$linkUri.ToString().Trim()) {
190193
LogWarning "Found Empty link. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
191194
return $false
192195
}
193-
if ($checkedLinks.ContainsKey($linkUri)) {
194-
if (!$checkedLinks[$linkUri]) {
195-
LogWarning "broken link $linkUri"
196+
197+
$originalLinkUri = $linkUri
198+
$linkUri = ReplaceGithubLink $linkUri
199+
200+
$link = $linkUri.ToString()
201+
202+
if ($checkedLinks.ContainsKey($link)) {
203+
if (!$checkedLinks[$link]) {
204+
LogWarning "broken link $link"
196205
}
197-
return $checkedLinks[$linkUri]
206+
return $checkedLinks[$link]
198207
}
199208

200209
$linkValid = $true
201-
Write-Verbose "Checking link $linkUri..."
210+
Write-Verbose "Checking link $linkUri..."
202211

203212
if ($linkUri.IsFile) {
204213
if (!(Test-Path $linkUri.LocalPath)) {
@@ -234,27 +243,58 @@ function CheckLink ([System.Uri]$linkUri)
234243
}
235244

236245
if ($statusCode -in $errorStatusCodes) {
237-
LogWarning "[$statusCode] broken link $linkUri"
246+
if ($originalLinkUri -ne $linkUri) {
247+
LogWarning "[$statusCode] broken link $originalLinkUri (resolved to $linkUri)"
248+
}
249+
else {
250+
LogWarning "[$statusCode] broken link $linkUri"
251+
}
252+
238253
$linkValid = $false
239254
}
240255
else {
241256
if ($null -ne $statusCode) {
242-
Write-Host "[$statusCode] while requesting $linkUri"
257+
# For 429 rate-limiting try to pause if possible
258+
if ($allowRetry -and $_.Exception.Response -and $statusCode -eq 429) {
259+
$retryAfter = $_.Exception.Response.Headers.RetryAfter.Delta.TotalSeconds
260+
261+
# Wait 60 (arbitrary) seconds if no Retry-After header is given, and never wait longer than 60 seconds
262+
if (!$retryAfter -or $retryAfter -gt 60) { $retryAfter = 60 }
263+
Write-Host "Rate-Limited for $retryAfter seconds while requesting $linkUri"
264+
265+
Start-Sleep -Seconds $retryAfter
266+
$linkValid = CheckLink $originalLinkUri -allowRetry $false
267+
}
268+
else {
269+
Write-Host "[$statusCode] handled while requesting $linkUri"
270+
# Override and set status code in the cache so it is truthy
271+
# so we don't keep checking but we don't think it is valid either
272+
$linkValid = $statusCode
273+
}
243274
}
244275
else {
245276
Write-Host "Exception while requesting $linkUri"
246277
Write-Host $_.Exception.ToString()
278+
# Override and set exception in the cache so it is truthy
279+
# so we don't keep checking but we don't think it is valid either
280+
$linkValid = "Exception"
247281
}
248282
}
249283
}
250284
}
251-
285+
elseif ($link.StartsWith("#")) {
286+
# Ignore anchor links as we don't have a great way to check them.
287+
}
288+
else {
289+
LogWarning "Link has invalid format $linkUri"
290+
$linkValid = $false
291+
}
292+
252293
if ($checkLinkGuidance) {
253294
if ($linkUri.Scheme -eq 'http') {
254295
LogWarning "DO NOT use 'http' in $linkUri. Please use secure link with https instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
255296
$linkValid = $false
256297
}
257-
$link = $linkUri.ToString()
258298
# Check whether the URL is a relative link; anchor links (starting with "#") are exempt from this validation.
259299
if (!$linkUri.IsAbsoluteUri -and !$link.StartsWith("#")) {
260300
LogWarning "DO NOT use relative link $linkUri. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
@@ -272,16 +312,16 @@ function CheckLink ([System.Uri]$linkUri)
272312
}
273313
}
274314

275-
$checkedLinks[$linkUri] = $linkValid
315+
$checkedLinks[$link] = $linkValid
276316
return $linkValid
277317
}
278318

279319
function ReplaceGithubLink([string]$originLink) {
280-
if (!$branchReplacementName) {
320+
if (!$branchReplacementName -or !$branchReplaceRegex) {
281321
return $originLink
282322
}
283323
$ReplacementPattern = "`${1}$branchReplacementName`$2"
284-
return $originLink -replace $branchReplaceRegex, $ReplacementPattern
324+
return $originLink -replace $branchReplaceRegex, $ReplacementPattern
285325
}
286326

287327
function GetLinks([System.Uri]$pageUri)
@@ -327,25 +367,55 @@ if ($urls) {
327367
if ($urls.Count -eq 0) {
328368
Write-Host "Usage $($MyInvocation.MyCommand.Name) <urls>";
329369
exit 1;
330-
}
370+
}
331371
}
332372

333373
if ($PSVersionTable.PSVersion.Major -lt 6)
334374
{
335375
LogWarning "Some web requests will not work in versions of PS earlier then 6. You are running version $($PSVersionTable.PSVersion)."
336376
}
337377
$ignoreLinks = @();
338-
if (Test-Path $ignoreLinksFile)
378+
if (Test-Path $ignoreLinksFile) {
379+
$ignoreLinks = (Get-Content $ignoreLinksFile).Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })
380+
}
381+
382+
# Use default hashtable constructor instead of @{} because we need them to be case sensitive
383+
$checkedPages = New-Object Hashtable
384+
$checkedLinks = New-Object Hashtable
385+
386+
if ($inputCacheFile)
339387
{
340-
$ignoreLinks = [Array](Get-Content $ignoreLinksFile | ForEach-Object { ($_ -replace "#.*", "").Trim() } | Where-Object { $_ -ne "" })
388+
$cacheContent = ""
389+
if ($inputCacheFile.StartsWith("http")) {
390+
try {
391+
$response = Invoke-WebRequest -Uri $inputCacheFile
392+
$cacheContent = $response.Content
393+
}
394+
catch {
395+
$statusCode = $_.Exception.Response.StatusCode.value__
396+
Write-Error "Failed to read cache file from page [$statusCode] $inputCacheFile"
397+
}
398+
}
399+
elseif (Test-Path $inputCacheFile) {
400+
$cacheContent = Get-Content $inputCacheFile -Raw
401+
}
402+
$goodLinks = $cacheContent.Split("`n").Where({ $_.Trim() -ne "" -and !$_.StartsWith("#") })
403+
404+
foreach ($goodLink in $goodLinks) {
405+
$checkedLinks[$goodLink] = $true
406+
}
341407
}
342408

343-
$checkedPages = @{};
344-
$checkedLinks = @{};
345-
$badLinks = @{};
409+
$cachedLinksCount = $checkedLinks.Count
410+
411+
if ($cachedLinksCount) {
412+
Write-Host "Skipping checks on $cachedLinksCount links found in the given cache of known good links."
413+
}
414+
415+
$badLinks = New-Object Hashtable
346416
$pageUrisToCheck = new-object System.Collections.Queue
347417
foreach ($url in $urls) {
348-
$uri = NormalizeUrl $url
418+
$uri = NormalizeUrl $url
349419
$pageUrisToCheck.Enqueue($uri);
350420
}
351421

@@ -359,8 +429,7 @@ while ($pageUrisToCheck.Count -ne 0)
359429
Write-Host "Found $($linkUris.Count) links on page $pageUri";
360430
$badLinksPerPage = @();
361431
foreach ($linkUri in $linkUris) {
362-
$replacedLink = ReplaceGithubLink $linkUri
363-
$isLinkValid = CheckLink $replacedLink
432+
$isLinkValid = CheckLink $linkUri
364433
if (!$isLinkValid -and !$badLinksPerPage.Contains($linkUri)) {
365434
if (!$linkUri.ToString().Trim()) {
366435
$linkUri = $emptyLinkMessage
@@ -388,10 +457,21 @@ foreach ($pageLink in $badLinks.Keys) {
388457
}
389458
}
390459

460+
$linksChecked = $checkedLinks.Count - $cachedLinksCount
461+
391462
if ($badLinks.Count -gt 0) {
392-
LogError "Found $($checkedLinks.Count) links with $($badLinks.Count) page(s) broken."
393-
}
463+
LogError "Checked $linksChecked links with $($badLinks.Count) page(s) broken."
464+
}
394465
else {
395-
Write-Host "Found $($checkedLinks.Count) links. No broken links found."
466+
Write-Host "Checked $linksChecked links. No broken links found."
396467
}
468+
469+
if ($outputCacheFile)
470+
{
471+
$goodLinks = $checkedLinks.Keys.Where({ "True" -eq $checkedLinks[$_].ToString() }) | Sort-Object
472+
473+
Write-Host "Writing the list of validated links to $outputCacheFile"
474+
$goodLinks | Set-Content $outputCacheFile
475+
}
476+
397477
exit $badLinks.Count

0 commit comments

Comments
 (0)