From b5bc9164d1109efa1bb2b016ce51a837c209e106 Mon Sep 17 00:00:00 2001
From: Chris Battarbee
Date: Wed, 10 Apr 2024 15:58:30 +0100
Subject: [PATCH] Add rss provider

---
 go.mod                                        |   2 +
 go.sum                                        |   4 +
 scraper/internal/scraper/providers/rss/rss.go | 196 ++++++++++++++++++
 scraper/main.go                               |   2 +
 4 files changed, 204 insertions(+)
 create mode 100644 scraper/internal/scraper/providers/rss/rss.go

diff --git a/go.mod b/go.mod
index 20006dd..e739abd 100644
--- a/go.mod
+++ b/go.mod
@@ -38,6 +38,8 @@ require (
 	github.com/leodido/go-urn v1.4.0 // indirect
 	github.com/lithammer/fuzzysearch v1.1.8 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
+	github.com/mmcdole/gofeed v1.3.0 // indirect
+	github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/patrickmn/go-cache v2.1.0+incompatible // indirect
diff --git a/go.sum b/go.sum
index 32d7380..f68b990 100644
--- a/go.sum
+++ b/go.sum
@@ -99,6 +99,10 @@ github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APP
 github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4=
+github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE=
+github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk=
+github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
diff --git a/scraper/internal/scraper/providers/rss/rss.go b/scraper/internal/scraper/providers/rss/rss.go
new file mode 100644
index 0000000..4fbcc06
--- /dev/null
+++ b/scraper/internal/scraper/providers/rss/rss.go
@@ -0,0 +1,196 @@
+package rss
+
+import (
+	"context"
+	"encoding/xml"
+	"fmt"
+	"io"
+	"net/http"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/metoro-io/statusphere/common/api"
+	"github.com/mmcdole/gofeed"
+	"github.com/pkg/errors"
+	"go.uber.org/zap"
+)
+
+type RssProvider struct {
+	logger     *zap.Logger
+	httpClient *http.Client
+}
+
+func NewRssProvider(logger *zap.Logger, httpClient *http.Client) *RssProvider {
+	return &RssProvider{
+		logger:     logger,
+		httpClient: httpClient,
+	}
+}
+
+func (s *RssProvider) Name() string {
+	return "RSS"
+}
+
+// RSS feeds expose no separate historical view, so the historical scrape only
+// verifies that the page is an rss page and returns no incidents.
+func (s *RssProvider) ScrapeStatusPageHistorical(ctx context.Context, url string) ([]api.Incident, error) {
+	_, isRssPage, err := s.isRssPage(url)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to determine if the page is an rss page")
+	}
+	if !isRssPage {
+		return nil, errors.New("page is not an rss page")
+	}
+	return []api.Incident{}, nil
+}
+
+func (s *RssProvider) ScrapeStatusPageCurrent(ctx context.Context, url string) ([]api.Incident, error) {
+	return s.scrapeRssPage(ctx, url)
+}
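+
+// For illustration only: a typical entry in a Statuspage-style history.atom
+// feed looks roughly like the snippet below (the exact shape is an assumption
+// and varies between providers):
+//
+//	<entry>
+//	  <title>Elevated API error rates</title>
+//	  <published>2024-04-10T12:00:00Z</published>
+//	  <link href="https://status.example.com/incidents/abc123"/>
+//	  <content type="html">&lt;p&gt;Investigating elevated errors.&lt;/p&gt;</content>
+//	</entry>
+//
+// The provider parses entries like this with gofeed and maps each item onto
+// an api.Incident (see getIncidentsFromRssPage below).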
+
+// scrapeRssPage is a helper function that attempts to scrape the status
+// page using the rss method.
+// If the rss method fails, it returns an error.
+func (s *RssProvider) scrapeRssPage(ctx context.Context, url string) ([]api.Incident, error) {
+	rssPage, isRssPage, err := s.isRssPage(url)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to determine if the page is an rss page")
+	}
+	if !isRssPage {
+		return nil, errors.New("page is not an rss page")
+	}
+
+	// Get the incidents from the rss page
+	return s.getIncidentsFromRssPage(rssPage, url)
+}
+
+// We determine whether a page is an rss page by probing a set of well-known
+// feed paths and checking whether any of them responds with well-formed XML.
+func (s *RssProvider) isRssPage(url string) (string, bool, error) {
+	feedUrls := getUrls(url)
+	for _, feedUrl := range feedUrls {
+		resp, err := s.httpClient.Get(feedUrl)
+		if err != nil {
+			return "", false, errors.Wrap(err, "failed to make the get request to the candidate feed url")
+		}
+
+		// Is the body well-formed xml? Close the body before moving on so we
+		// do not leak connections while probing several candidate urls.
+		isXML := resp.StatusCode == http.StatusOK && isXMLContent(resp.Body)
+		resp.Body.Close()
+		if isXML {
+			return feedUrl, true, nil
+		}
+	}
+	return "", false, nil
+}
+
+func (s *RssProvider) getIncidentsFromRssPage(url string, statusPageUrl string) ([]api.Incident, error) {
+	var incidents []api.Incident
+
+	// Fetch the RSS or Atom feed with the provider's http client
+	resp, err := s.httpClient.Get(url)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch the feed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read the feed: %w", err)
+	}
+
+	fp := gofeed.NewParser()
+	feed, err := fp.ParseString(string(body))
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse the feed: %w", err)
+	}
+
+	for _, item := range feed.Items {
+		var parsedTime time.Time
+
+		// Use the item's published time, falling back to its updated time
+		if item.PublishedParsed != nil {
+			parsedTime = *item.PublishedParsed
+		} else if item.UpdatedParsed != nil {
+			parsedTime = *item.UpdatedParsed
+		}
+
+		// Strip HTML tags from the title and description
+		title := stripHTML(item.Title)
+		description := stripHTML(item.Content)
+		if description == "" {
+			description = stripHTML(item.Description)
+		}
+		deepLink := item.Link
+
+		incidents = append(incidents, api.Incident{
+			Title:       title,
+			Description: &description,
+			StartTime:   parsedTime,
+			EndTime:     &parsedTime,
+			DeepLink:    deepLink,
+			// Not all RSS feeds have an impact field, so we default to none
+			Impact:        api.ImpactNone,
+			StatusPageUrl: statusPageUrl,
+		})
+	}
+
+	if len(incidents) == 0 {
+		return nil, errors.New("no incidents found")
+	}
+
+	return incidents, nil
+}
+
+// getUrls returns the well-known feed paths to probe for a given status page
+func getUrls(url string) []string {
+	return []string{
+		url + "/history.atom",
+		url + "/feed.atom",
+		url + "/de.atom",
+		url + "/_rss",
+		url + "/rss/all.rss",
+		url + "/en-us/status/feed/",
+	}
+}
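+
+// For example, a live feed usually opens with an XML prolog such as
+//
+//	<?xml version="1.0" encoding="UTF-8"?>
+//	<feed xmlns="http://www.w3.org/2005/Atom">
+//
+// while a missing feed path often serves an HTML error page that begins with
+// <!DOCTYPE html>; isXMLContent below tells these two cases apart.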
+
+// isXMLContent decodes the first token of the response body to check whether
+// it looks like well-formed XML rather than an HTML page.
+func isXMLContent(body io.Reader) bool {
+	decoder := xml.NewDecoder(body)
+	token, err := decoder.Token()
+	if err != nil {
+		return false
+	}
+	// An HTML page typically announces itself with a directive such as
+	// <!DOCTYPE html>, which we treat as a non-feed response.
+	if directive, ok := token.(xml.Directive); ok {
+		if strings.Contains(strings.ToLower(string(directive)), "html") {
+			return false
+		}
+	}
+	// If we can decode one token, it's likely XML.
+	// This is a simplistic check and might need to be more sophisticated
+	// depending on the context.
+	return true
+}
+
+// htmlTagPattern matches anything that starts with "<" and ends with ">",
+// containing any characters except ">" in between.
+var htmlTagPattern = regexp.MustCompile("<[^>]*>")
+
+// stripHTML uses a regular expression to remove HTML tags from a string.
+func stripHTML(input string) string {
+	return htmlTagPattern.ReplaceAllString(input, "")
+}
diff --git a/scraper/main.go b/scraper/main.go
index fe64500..948fbde 100644
--- a/scraper/main.go
+++ b/scraper/main.go
@@ -9,6 +9,7 @@ import (
 	"github.com/metoro-io/statusphere/scraper/internal/scraper/poller"
 	"github.com/metoro-io/statusphere/scraper/internal/scraper/providers"
 	"github.com/metoro-io/statusphere/scraper/internal/scraper/providers/atlassian"
+	"github.com/metoro-io/statusphere/scraper/internal/scraper/providers/rss"
 	"github.com/metoro-io/statusphere/scraper/internal/scraper/urlgetter/dburlgetter"
 	"go.uber.org/zap"
 	"net/http"
@@ -22,6 +23,7 @@ func main() {
 
 	scraper := scraper.NewScraper(logger, http.DefaultClient, []providers.Provider{
 		atlassian.NewAtlassianProvider(logger, http.DefaultClient),
+		rss.NewRssProvider(logger, http.DefaultClient),
 	})
 
 	dbClient, err := db.NewDbClientFromEnvironment(logger)
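
A minimal sketch of how the new provider can be exercised in isolation, e.g.
as a throwaway test next to rss.go. It is illustrative rather than part of
the patch: the status page URL is a placeholder, and the test assumes the
host exposes one of the feed paths probed by getUrls and that network access
is available.

	package rss

	import (
		"context"
		"net/http"
		"testing"

		"go.uber.org/zap"
	)

	func TestScrapeStatusPageCurrent(t *testing.T) {
		// A no-op logger keeps the sketch self-contained.
		provider := NewRssProvider(zap.NewNop(), http.DefaultClient)

		// Placeholder URL; substitute a real status page to try it out.
		incidents, err := provider.ScrapeStatusPageCurrent(context.Background(), "https://status.example.com")
		if err != nil {
			t.Fatalf("scrape failed: %v", err)
		}
		for _, incident := range incidents {
			t.Logf("%s (%s) started at %s", incident.Title, incident.DeepLink, incident.StartTime)
		}
	}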