|
<#
.SYNOPSIS
Verifies the links found in a set of pages and reports the broken ones.

.PARAMETER urls
List of urls whose links are verified. Each entry can be an http address or
a local file path. Local file paths support md and html files.

.PARAMETER ignoreLinksFile
File that contains a set of links to ignore when verifying. Defaults to
ignore-links.txt next to this script.

.PARAMETER devOpsLogging
Emit warnings using the DevOps "##vso[task.LogIssue ...]" logging format
instead of Write-Warning.

.PARAMETER recursive
Also verify the pages behind every checked link whose address starts with
baseUrl. Enabled by default.

.PARAMETER baseUrl
Prefix a link must start with to be followed recursively. When empty it
defaults to the folder containing the first url.

.PARAMETER rootUrl
Root of the site, used to resolve rooted relative links (links starting
with "/"). When empty it defaults to the host root for http urls and to
the containing directory for local files.

.PARAMETER errorStatusCodes
Http status codes that count as broken links. Defaults to 404.

.PARAMETER resolveRelativeLinks
Whether relative links are resolved against the referring page / rootUrl.
#>
param (
    [string[]] $urls,
    [string] $ignoreLinksFile = "$PSScriptRoot/ignore-links.txt",
    [switch] $devOpsLogging = $false,
    [switch] $recursive = $true,
    [string] $baseUrl = "",
    [string] $rootUrl = "",
    [array] $errorStatusCodes = @(404),
    [bool] $resolveRelativeLinks = $true
)

# Suppress the Invoke-WebRequest progress dialog.
$ProgressPreference = "SilentlyContinue";
| 21 | + |
# Converts a url or an existing local path into a System.Uri.
# As a side effect, fills in the script-level baseUrl and rootUrl defaults
# the first time they are found empty.
function NormalizeUrl([string]$url)
{
    # Existing local paths are turned into file:// urls.
    if (Test-Path $url) {
        $resolvedPath = (Resolve-Path $url).ToString()
        $url = "file://" + $resolvedPath
    }

    $parsedUri = [System.Uri]$url

    # baseUrl defaults to the directory that contains the url.
    if ($script:baseUrl -eq "") {
        $containingDir = New-Object -TypeName System.Uri -ArgumentList $parsedUri, "."
        $script:baseUrl = $containingDir.ToString()
    }

    # rootUrl defaults to the containing directory for files and to the
    # host root path for http links.
    if ($script:rootUrl -eq "") {
        $script:rootUrl = if ($parsedUri.IsFile) {
            $script:baseUrl
        }
        else {
            New-Object -TypeName System.Uri -ArgumentList $parsedUri, "/"
        }
    }

    return $parsedUri
}
| 46 | + |
# Writes a warning built from all arguments (joined into one string).
# Uses the DevOps logging command format when $devOpsLogging is set,
# otherwise the regular PowerShell warning stream.
function LogWarning
{
    $message = "$args"

    if (-not $devOpsLogging)
    {
        Write-Warning $message
        return
    }

    Write-Host "##vso[task.LogIssue type=warning;]$message"
}
| 58 | + |
# Turns a raw href value into an absolute System.Uri that should be checked.
#
# Parameters:
#   referralUri - uri of the page the link was found on; non-rooted relative
#                 links are resolved against it.
#   link        - raw href text.
#
# Returns the resolved uri, or $null when the link is skipped: mailto links,
# links whose scheme is neither http(s) nor file, and links listed in the
# ignore file.
#
# Only the uri (or $null) may be written to the output stream here: anything
# else would be returned to the caller as if it were another link.
function ResolveUri ([System.Uri]$referralUri, [string]$link)
{
    # If the link is mailto, skip it.
    if ($link.StartsWith("mailto:")) {
        Write-Verbose "Skipping $link because it is a mailto link."
        return $null
    }

    $linkUri = [System.Uri]$link;
    if ($resolveRelativeLinks -and !$linkUri.IsAbsoluteUri) {
        if ($link.StartsWith("/")) {
            # Rooted paths resolve from rootUrl. The link is prefixed with "."
            # so it is appended below rootUrl instead of replacing its path.
            # Diagnostic goes to the verbose stream, not the output stream.
            Write-Verbose "rooturl = $rootUrl"
            $linkUri = new-object System.Uri([System.Uri]$rootUrl, ".$link");
        }
        else {
            # Everything else resolves relative to the referring page.
            $linkUri = new-object System.Uri($referralUri, $link);
        }
    }

    # Keep only the request part of the uri (this drops any #fragment).
    $linkUri = [System.Uri]$linkUri.GetComponents([System.UriComponents]::HttpRequestUrl, [System.UriFormat]::SafeUnescaped)
    Write-Verbose "ResolvedUri $link to $linkUri"

    # If the link is not a web request, like mailto, skip it.
    if (!$linkUri.Scheme.StartsWith("http") -and !$linkUri.IsFile) {
        Write-Verbose "Skipping $linkUri because it is not http or file based."
        return $null
    }

    # The ignore list is matched against the raw link text, not the resolved uri.
    if ($null -ne $ignoreLinks -and $ignoreLinks.Contains($link)) {
        Write-Verbose "Ignoring invalid link $linkUri because it is in the ignore file."
        return $null
    }

    return $linkUri;
}
| 97 | + |
# Extracts the href values of all anchor tags in $htmlContent, resolves each
# one against $baseUri with ResolveUri and returns the sorted, de-duplicated
# results.
function ParseLinks([string]$baseUri, [string]$htmlContent)
{
    # Captures the href value of every <a ...> tag; quotes around it are optional.
    $anchorPattern = "<a[^>]+href\s*=\s*[""']?(?<href>[^""']*)[""']?"
    $matchOptions = [System.Text.RegularExpressions.RegexOptions](
        [System.Text.RegularExpressions.RegexOptions]::Singleline -bor
        [System.Text.RegularExpressions.RegexOptions]::IgnoreCase)

    $anchorMatches = [RegEx]::Matches($htmlContent, $anchorPattern, $matchOptions)
    Write-Verbose "Found $($anchorMatches.Count) raw href's in page $baseUri";

    return $anchorMatches |
        ForEach-Object { ResolveUri $baseUri $_.Groups["href"].Value } |
        Sort-Object -Unique
}
| 114 | + |
# Verifies a single link and records it in $checkedLinks so it is only
# checked once. Broken links are logged through LogWarning and appended to
# the script-level $badLinks list.
#   - file links are broken when the local path does not exist
#   - web links are broken when the request fails with a status code listed
#     in $errorStatusCodes; other failures are only printed
function CheckLink ([System.Uri]$linkUri)
{
    if ($checkedLinks.ContainsKey($linkUri)) { return }

    Write-Verbose "Checking link $linkUri..."

    if (-not $linkUri.IsFile) {
        try {
            $webResponse = Invoke-WebRequest -Uri $linkUri
            $code = $webResponse.StatusCode
            if ($code -ne 200) {
                Write-Host "[$code] while requesting $linkUri"
            }
        }
        catch {
            # Status code of the failed response; $null when the failure
            # carries no response.
            $code = $_.Exception.Response.StatusCode.value__

            if ($code -in $errorStatusCodes) {
                LogWarning "[$code] broken link $linkUri"
                $script:badLinks += $linkUri
            }
            elseif ($null -ne $code) {
                Write-Host "[$code] while requesting $linkUri"
            }
            else {
                Write-Host "Exception while requesting $linkUri"
                Write-Host $_.Exception.ToString()
            }
        }
    }
    elseif (-not (Test-Path $linkUri.LocalPath)) {
        LogWarning "Link to file does not exist $($linkUri.LocalPath)"
        $script:badLinks += $linkUri
    }

    $checkedLinks[$linkUri] = $true;
}
| 154 | + |
# Loads the page at $pageUri and returns the links found in it (via ParseLinks).
#
#   - http(s) uris are downloaded with Invoke-WebRequest
#   - local .md files are converted to html with ConvertFrom-Markdown
#   - local .html files are read as-is
#   - any other existing local path is treated as a directory containing
#     index.html when that file exists, otherwise it is read directly
#
# When the page cannot be loaded an error is written and the page is parsed
# as empty content.
function GetLinks([System.Uri]$pageUri)
{
    # Start from empty content so the error paths below hand ParseLinks a
    # defined value.
    $content = ""

    if ($pageUri.Scheme.StartsWith("http")) {
        try {
            $response = Invoke-WebRequest -Uri $pageUri
            $content = $response.Content
        }
        catch {
            $statusCode = $_.Exception.Response.StatusCode.value__
            Write-Error "Invalid page [$statusCode] $pageUri"
        }
    }
    elseif ($pageUri.IsFile -and (Test-Path $pageUri.LocalPath)) {
        $file = $pageUri.LocalPath
        if ($file.EndsWith(".md")) {
            $content = (ConvertFrom-MarkDown $file).html
        }
        elseif ($file.EndsWith(".html")) {
            # -Raw reads the file as one string; without it the line array
            # is joined with spaces when converted to [string].
            $content = Get-Content $file -Raw
        }
        else {
            # Join-Path inserts the separator when the directory path has no
            # trailing slash (plain concatenation produced "dirindex.html").
            $indexFile = Join-Path $file "index.html"
            if (Test-Path $indexFile) {
                $content = Get-Content $indexFile -Raw
            }
            else {
                # Fallback to just reading the content directly
                $content = Get-Content $file -Raw
            }
        }
    }
    else {
        Write-Error "Don't know how to process uri $pageUri"
    }

    $links = ParseLinks $pageUri $content

    return $links;
}
| 193 | + |
# ---- Script body -----------------------------------------------------------
# Crawls the given urls breadth-first, checks every link found on each page
# and exits with the number of broken links as the exit code.

# Without any url there is nothing to verify: print usage and fail.
# (An empty or missing list evaluates to $false.)
if (-not $urls) {
    Write-Host "Usage $($MyInvocation.MyCommand.Name) <urls>";
    exit 1;
}

if ($PSVersionTable.PSVersion.Major -lt 6)
{
    LogWarning "Some web requests will not work in versions of PS earlier then 6. You are running version $($PSVersionTable.PSVersion)."
}

$badLinks = @();
$ignoreLinks = @();
if (Test-Path $ignoreLinksFile)
{
    # One link per line; "#" starts a comment, blank lines are dropped.
    $ignoreLinks = [Array](Get-Content $ignoreLinksFile | ForEach-Object { ($_ -replace "#.*", "").Trim() } | Where-Object { $_ -ne "" })
}

# Pages already crawled / links already verified, keyed by uri.
$checkedPages = @{};
$checkedLinks = @{};
$pageUrisToCheck = new-object System.Collections.Queue

foreach ($url in $urls) {
    $uri = NormalizeUrl $url
    $pageUrisToCheck.Enqueue($uri);
}

while ($pageUrisToCheck.Count -ne 0)
{
    $pageUri = $pageUrisToCheck.Dequeue();
    if ($checkedPages.ContainsKey($pageUri)) { continue }
    $checkedPages[$pageUri] = $true;

    $linkUris = GetLinks $pageUri
    Write-Host "Found $($linkUris.Count) links on page $pageUri";

    foreach ($linkUri in $linkUris) {
        CheckLink $linkUri
        if ($recursive) {
            # Only pages under baseUrl that were not crawled yet are followed.
            if ($linkUri.ToString().StartsWith($baseUrl) -and !$checkedPages.ContainsKey($linkUri)) {
                $pageUrisToCheck.Enqueue($linkUri);
            }
        }
    }
}

Write-Host "Found $($checkedLinks.Count) links with $($badLinks.Count) broken"
$badLinks | ForEach-Object { Write-Host " $_" }

exit $badLinks.Count
0 commit comments