1212 Specifies the file that contains a set of links to ignore when verifying.
1313
1414 . PARAMETER devOpsLogging
15- Switch that will enable devops specific logging for warnings
15+ Switch that will enable devops specific logging for warnings
1616
1717 . PARAMETER recursive
18- Check the links recurisvely based on recursivePattern.
19-
18+ Check the links recursively based on recursivePattern.
19+
2020 . PARAMETER baseUrl
2121 Recursively check links for all links verified that begin with this baseUrl, defaults to the folder the url is contained in.
22-
22+
2323 . PARAMETER rootUrl
24- Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
25-
24+ Path to the root of the site for resolving rooted relative links, defaults to host root for http and file directory for local files.
25+
2626 . PARAMETER errorStatusCodes
27- List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
28-
27+ List of http status codes that count as broken links. Defaults to 400, 401, 404, SocketError.HostNotFound = 11001, SocketError.NoData = 11004.
28+
2929 . PARAMETER branchReplaceRegex
3030 Regex to check if the link needs to be replaced. E.g. ^(https://github.com/.*/(?:blob|tree)/)master(/.*)$
31-
31+
3232 . PARAMETER branchReplacementName
33- The substitute branch name or SHA commit.
34-
33+ The substitute branch name or SHA commit.
34+
3535 . PARAMETER checkLinkGuidance
36- Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
37-
36+ Flag to allow checking against azure sdk link guidance. Check link guidance here: https://aka.ms/azsdk/guideline/links.
37+
3838 . PARAMETER userAgent
3939 UserAgent to be configured for web requests. Defaults to current Chrome version.
4040
41- . INPUTS
42- None. No required inputs .
41+ . PARAMETER inputCacheFile
42+ Path to a file that contains a list of links that are known to be valid so we can skip checking them.
4343
44- . OUTPUTS
45- None. Verify-Links.ps1 does not generate any output.
44+ . PARAMETER outputCacheFile
45+ Path to a file to which the script will write all the validated links after running all checks.
4646
4747 . EXAMPLE
48- PS> .\Verify-Links.ps1
48+ PS> .\Verify-Links.ps1 C:\README.md
4949
5050 . EXAMPLE
51- PS> .\Verify-Links.ps1 -urls C:\README.md
51+ PS> .\Verify-Links.ps1 https://azure.github.io/azure-sdk/index.html
5252
5353 . EXAMPLE
54- PS> .\Verify-Links -urls C:\README.md -checkLinkGuidance $true
54+ PS> .\Verify-Links C:\README.md -checkLinkGuidance $true
5555#>
56+ [CmdletBinding ()]
5657param (
5758 [string []] $urls ,
5859 [string ] $ignoreLinksFile = " $PSScriptRoot /ignore-links.txt" ,
@@ -64,7 +65,9 @@ param (
6465 [string ] $branchReplaceRegex = " " ,
6566 [string ] $branchReplacementName = " " ,
6667 [bool ] $checkLinkGuidance = $false ,
67- [string ] $userAgent
68+ [string ] $userAgent ,
69+ [string ] $inputCacheFile ,
70+ [string ] $outputCacheFile
6871)
6972
7073$ProgressPreference = " SilentlyContinue" ; # Disable invoke-webrequest progress dialog
@@ -88,7 +91,7 @@ function NormalizeUrl([string]$url){
8891 }
8992
9093 if ($script :rootUrl -eq " " ) {
91- if ($uri.IsFile ) {
94+ if ($uri.IsFile ) {
9295 # for files default to the containing directory
9396 $script :rootUrl = $script :baseUrl ;
9497 }
@@ -129,7 +132,7 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
129132 # If the link is mailto, skip it.
130133 if ($link.StartsWith (" mailto:" )) {
131134 Write-Verbose " Skipping $link because it is a mailto link."
132- return $null
135+ return
133136 }
134137
135138 $linkUri = [System.Uri ]$link ;
@@ -156,12 +159,12 @@ function ResolveUri ([System.Uri]$referralUri, [string]$link)
156159 # If the link is not a web request, like mailto, skip it.
157160 if (! $linkUri.Scheme.StartsWith (" http" ) -and ! $linkUri.IsFile ) {
158161 Write-Verbose " Skipping $linkUri because it is not http or file based."
159- return $null
162+ return
160163 }
161164
162165 if ($null -ne $ignoreLinks -and ($ignoreLinks.Contains ($link ) -or $ignoreLinks.Contains ($linkUri.ToString ()))) {
163166 Write-Verbose " Ignoring invalid link $linkUri because it is in the ignore file."
164- return $null
167+ return
165168 }
166169
167170 return $linkUri ;
@@ -177,28 +180,34 @@ function ParseLinks([string]$baseUri, [string]$htmlContent)
177180 # $hrefs | Foreach-Object { Write-Host $_ }
178181
179182 Write-Verbose " Found $ ( $hrefs.Count ) raw href's in page $baseUri " ;
180- $links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups [" href" ].Value } | Sort-Object - Unique
183+ $links = $hrefs | ForEach-Object { ResolveUri $baseUri $_.Groups [" href" ].Value }
181184
182185 # $links | Foreach-Object { Write-Host $_ }
183186
184187 return $links
185188}
186189
187- function CheckLink ([System.Uri ]$linkUri )
190+ function CheckLink ([System.Uri ]$linkUri , $allowRetry = $true )
188191{
189192 if (! $linkUri.ToString ().Trim()) {
190193 LogWarning " Found Empty link. Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
191194 return $false
192195 }
193- if ($checkedLinks.ContainsKey ($linkUri )) {
194- if (! $checkedLinks [$linkUri ]) {
195- LogWarning " broken link $linkUri "
196+
197+ $originalLinkUri = $linkUri
198+ $linkUri = ReplaceGithubLink $linkUri
199+
200+ $link = $linkUri.ToString ()
201+
202+ if ($checkedLinks.ContainsKey ($link )) {
203+ if (! $checkedLinks [$link ]) {
204+ LogWarning " broken link $link "
196205 }
197- return $checkedLinks [$linkUri ]
206+ return $checkedLinks [$link ]
198207 }
199208
200209 $linkValid = $true
201- Write-Verbose " Checking link $linkUri ..."
210+ Write-Verbose " Checking link $linkUri ..."
202211
203212 if ($linkUri.IsFile ) {
204213 if (! (Test-Path $linkUri.LocalPath )) {
@@ -234,27 +243,58 @@ function CheckLink ([System.Uri]$linkUri)
234243 }
235244
236245 if ($statusCode -in $errorStatusCodes ) {
237- LogWarning " [$statusCode ] broken link $linkUri "
246+ if ($originalLinkUri -ne $linkUri ) {
247+ LogWarning " [$statusCode ] broken link $originalLinkUri (resolved to $linkUri )"
248+ }
249+ else {
250+ LogWarning " [$statusCode ] broken link $linkUri "
251+ }
252+
238253 $linkValid = $false
239254 }
240255 else {
241256 if ($null -ne $statusCode ) {
242- Write-Host " [$statusCode ] while requesting $linkUri "
257+ # For 429 rate-limiting try to pause if possible
258+ if ($allowRetry -and $_.Exception.Response -and $statusCode -eq 429 ) {
259+ $retryAfter = $_.Exception.Response.Headers.RetryAfter.Delta.TotalSeconds
260+
261+ # Wait 60 (arbitrary) seconds if no retry-after value is given, and cap longer waits at 60 seconds
262+ if (! $retryAfter -or $retryAfter -gt 60 ) { $retryAfter = 60 }
263+ Write-Host " Rate-Limited for $retryAfter seconds while requesting $linkUri "
264+
265+ Start-Sleep - Seconds $retryAfter
266+ $linkValid = CheckLink $originalLinkUri - allowRetry $false
267+ }
268+ else {
269+ Write-Host " [$statusCode ] handled while requesting $linkUri "
270+ # Override and set status code in the cache so it is truthy
271+ # so we don't keep checking but we don't think it is valid either
272+ $linkValid = $statusCode
273+ }
243274 }
244275 else {
245276 Write-Host " Exception while requesting $linkUri "
246277 Write-Host $_.Exception.ToString ()
278+ # Override and set exception in the cache so it is truthy
279+ # so we don't keep checking but we don't think it is valid either
280+ $linkValid = " Exception"
247281 }
248282 }
249283 }
250284 }
251-
285+ elseif ($link.StartsWith (" #" )) {
286+ # Ignore anchor links as we don't have a great way to check them.
287+ }
288+ else {
289+ LogWarning " Link has invalid format $linkUri "
290+ $linkValid = $false
291+ }
292+
252293 if ($checkLinkGuidance ) {
253294 if ($linkUri.Scheme -eq ' http' ) {
254295 LogWarning " DO NOT use 'http' in $linkUri . Please use secure link with https instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
255296 $linkValid = $false
256297 }
257- $link = $linkUri.ToString ()
258298 # Check whether the url is a relative link; anchor links (starting with '#') are exempt from this check.
259299 if (! $linkUri.IsAbsoluteUri -and ! $link.StartsWith (" #" )) {
260300 LogWarning " DO NOT use relative link $linkUri . Please use absolute link instead. Check here for more information: https://aka.ms/azsdk/guideline/links"
@@ -272,16 +312,16 @@ function CheckLink ([System.Uri]$linkUri)
272312 }
273313 }
274314
275- $checkedLinks [$linkUri ] = $linkValid
315+ $checkedLinks [$link ] = $linkValid
276316 return $linkValid
277317}
278318
279319function ReplaceGithubLink ([string ]$originLink ) {
280- if (! $branchReplacementName ) {
320+ if (! $branchReplacementName -or ! $branchReplaceRegex ) {
281321 return $originLink
282322 }
283323 $ReplacementPattern = " `$ {1}$branchReplacementName `$ 2"
284- return $originLink -replace $branchReplaceRegex , $ReplacementPattern
324+ return $originLink -replace $branchReplaceRegex , $ReplacementPattern
285325}
286326
287327function GetLinks ([System.Uri ]$pageUri )
@@ -327,25 +367,55 @@ if ($urls) {
327367 if ($urls.Count -eq 0 ) {
328368 Write-Host " Usage $ ( $MyInvocation.MyCommand.Name ) <urls>" ;
329369 exit 1 ;
330- }
370+ }
331371}
332372
333373if ($PSVersionTable.PSVersion.Major -lt 6 )
334374{
335375 LogWarning " Some web requests will not work in versions of PS earlier then 6. You are running version $ ( $PSVersionTable.PSVersion ) ."
336376}
337377$ignoreLinks = @ ();
338- if (Test-Path $ignoreLinksFile )
378+ if (Test-Path $ignoreLinksFile ) {
379+ $ignoreLinks = (Get-Content $ignoreLinksFile ).Where ({ $_.Trim () -ne " " -and ! $_.StartsWith (" #" ) })
380+ }
381+
382+ # Use default hashtable constructor instead of @{} because we need them to be case sensitive
383+ $checkedPages = New-Object Hashtable
384+ $checkedLinks = New-Object Hashtable
385+
386+ if ($inputCacheFile )
339387{
340- $ignoreLinks = [Array ](Get-Content $ignoreLinksFile | ForEach-Object { ($_ -replace " #.*" , " " ).Trim() } | Where-Object { $_ -ne " " })
388+ $cacheContent = " "
389+ if ($inputCacheFile.StartsWith (" http" )) {
390+ try {
391+ $response = Invoke-WebRequest - Uri $inputCacheFile
392+ $cacheContent = $response.Content
393+ }
394+ catch {
395+ $statusCode = $_.Exception.Response.StatusCode.value__
396+ Write-Error " Failed to read cache file from page [$statusCode ] $inputCacheFile "
397+ }
398+ }
399+ elseif (Test-Path $inputCacheFile ) {
400+ $cacheContent = Get-Content $inputCacheFile - Raw
401+ }
402+ $goodLinks = $cacheContent.Split (" `n " ).Where ({ $_.Trim () -ne " " -and ! $_.StartsWith (" #" ) })
403+
404+ foreach ($goodLink in $goodLinks ) {
405+ $checkedLinks [$goodLink ] = $true
406+ }
341407}
342408
343- $checkedPages = @ {};
344- $checkedLinks = @ {};
345- $badLinks = @ {};
409+ $cachedLinksCount = $checkedLinks.Count
410+
411+ if ($cachedLinksCount ) {
412+ Write-Host " Skipping checks on $cachedLinksCount links found in the given cache of known good links."
413+ }
414+
415+ $badLinks = New-Object Hashtable
346416$pageUrisToCheck = new-object System.Collections.Queue
347417foreach ($url in $urls ) {
348- $uri = NormalizeUrl $url
418+ $uri = NormalizeUrl $url
349419 $pageUrisToCheck.Enqueue ($uri );
350420}
351421
@@ -359,8 +429,7 @@ while ($pageUrisToCheck.Count -ne 0)
359429 Write-Host " Found $ ( $linkUris.Count ) links on page $pageUri " ;
360430 $badLinksPerPage = @ ();
361431 foreach ($linkUri in $linkUris ) {
362- $replacedLink = ReplaceGithubLink $linkUri
363- $isLinkValid = CheckLink $replacedLink
432+ $isLinkValid = CheckLink $linkUri
364433 if (! $isLinkValid -and ! $badLinksPerPage.Contains ($linkUri )) {
365434 if (! $linkUri.ToString ().Trim()) {
366435 $linkUri = $emptyLinkMessage
@@ -388,10 +457,21 @@ foreach ($pageLink in $badLinks.Keys) {
388457 }
389458}
390459
460+ $linksChecked = $checkedLinks.Count - $cachedLinksCount
461+
391462if ($badLinks.Count -gt 0 ) {
392- LogError " Found $ ( $checkedLinks .Count ) links with $ ( $badLinks.Count ) page(s) broken."
393- }
463+ LogError " Checked $linksChecked links with $ ( $badLinks.Count ) page(s) broken."
464+ }
394465else {
395- Write-Host " Found $ ( $checkedLinks .Count ) links. No broken links found."
466+ Write-Host " Checked $linksChecked links. No broken links found."
396467}
468+
469+ if ($outputCacheFile )
470+ {
471+ $goodLinks = $checkedLinks.Keys.Where ({ " True" -eq $checkedLinks [$_ ].ToString() }) | Sort-Object
472+
473+ Write-Host " Writing the list of validated links to $outputCacheFile "
474+ $goodLinks | Set-Content $outputCacheFile
475+ }
476+
397477exit $badLinks.Count
0 commit comments