Skip to content

Commit

Permalink
Add rss provider
Browse files Browse the repository at this point in the history
  • Loading branch information
Chrisbattarbee committed Apr 10, 2024
1 parent cf5356e commit b5bc916
Show file tree
Hide file tree
Showing 4 changed files with 204 additions and 0 deletions.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ require (
github.com/leodido/go-urn v1.4.0 // indirect
github.com/lithammer/fuzzysearch v1.1.8 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mmcdole/gofeed v1.3.0 // indirect
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/patrickmn/go-cache v2.1.0+incompatible // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APP
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4=
github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE=
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk=
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
Expand Down
196 changes: 196 additions & 0 deletions scraper/internal/scraper/providers/rss/rss.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
package rss

import (
"context"
"encoding/xml"
"fmt"
"github.com/metoro-io/statusphere/common/api"
"github.com/mmcdole/gofeed"
"github.com/pkg/errors"
"go.uber.org/zap"
"io"
"io/ioutil"
"net/http"
"regexp"
"strings"
"time"
)

// Name returns the human-readable identifier of this provider.
func (s *RssProvider) Name() string {
	const providerName = "RSS"
	return providerName
}

// RssProvider scrapes status-page incidents from RSS/Atom feeds. It is
// registered alongside other providers (see scraper/main.go) and exposes
// Name, ScrapeStatusPageHistorical and ScrapeStatusPageCurrent.
type RssProvider struct {
	logger     *zap.Logger  // logger for scrape diagnostics (injected, currently unused in this file)
	httpClient *http.Client // client used for feed probing in isRssPage; inject timeouts here
}

// NewRssProvider constructs an RssProvider that logs through the supplied
// logger and performs its HTTP requests with the supplied client.
func NewRssProvider(logger *zap.Logger, httpClient *http.Client) *RssProvider {
	provider := new(RssProvider)
	provider.logger = logger
	provider.httpClient = httpClient
	return provider
}

// ScrapeStatusPageHistorical is effectively a no-op for RSS feeds: there is
// no separate historical view of a feed, so after confirming the url serves
// an RSS page it returns an empty incident slice.
func (s *RssProvider) ScrapeStatusPageHistorical(ctx context.Context, url string) ([]api.Incident, error) {
	_, ok, err := s.isRssPage(url)
	if err != nil {
		return nil, errors.Wrap(err, "failed to determine if the page is an rss page")
	}
	if !ok {
		return nil, errors.New("page is not a rss page")
	}
	return []api.Incident{}, nil
}

// ScrapeStatusPageCurrent returns the incidents currently published on the
// status page's RSS/Atom feed.
func (s *RssProvider) ScrapeStatusPageCurrent(ctx context.Context, url string) ([]api.Incident, error) {
	incidents, err := s.scrapeRssPage(ctx, url)
	return incidents, err
}

// scrapeRssPage attempts to scrape the status page using the rss method:
// it locates a feed URL for the page and converts its items to incidents.
// If the rss method fails, it returns an error.
func (s *RssProvider) scrapeRssPage(ctx context.Context, url string) ([]api.Incident, error) {
	feedUrl, ok, err := s.isRssPage(url)
	switch {
	case err != nil:
		return nil, errors.Wrap(err, "failed to determine if the page is an rss page")
	case !ok:
		return nil, errors.New("page is not a rss page")
	}
	return s.getIncidentsFromRssPage(feedUrl, url)
}

// isRssPage probes the candidate feed locations derived from url (see
// getUrls) and reports the first one that responds 200 with well-formed
// XML. It returns (feedUrl, true, nil) on success, ("", false, nil) when no
// candidate looks like a feed, and a non-nil error if a request fails.
func (s *RssProvider) isRssPage(url string) (string, bool, error) {
	for _, historyUrl := range getUrls(url) {
		history, err := s.httpClient.Get(historyUrl)
		if err != nil {
			return "", false, errors.Wrap(err, "failed to make the get request to the history page")
		}

		// Is the body well formed xml?
		isXML := history.StatusCode == http.StatusOK && isXMLContent(history.Body)

		// Close eagerly inside the loop: the original leaked every response
		// body, exhausting connections/file descriptors over many probes.
		// A defer here would only fire at function return.
		history.Body.Close()

		if isXML {
			return historyUrl, true, nil
		}
	}
	return "", false, nil
}

// getIncidentsFromRssPage fetches the RSS/Atom feed at url, parses it, and
// converts each feed item into an api.Incident attributed to statusPageUrl.
// It returns an error when the feed cannot be fetched, read, or parsed, or
// when it yields no items.
func (s *RssProvider) getIncidentsFromRssPage(url string, statusPageUrl string) ([]api.Incident, error) {
	var incidents []api.Incident

	// Fetch the RSS or Atom feed with the injected client (the original used
	// http.Get, silently bypassing any timeouts configured on s.httpClient
	// and diverging from isRssPage).
	resp, err := s.httpClient.Get(url)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch the feed: %w", err)
	}
	defer resp.Body.Close()

	// Don't hand error pages (404s, maintenance splashes) to the feed parser.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to fetch the feed: unexpected status code %d", resp.StatusCode)
	}

	// NOTE(review): ioutil.ReadAll is deprecated in favor of io.ReadAll
	// (Go 1.16+); kept here so the file's io/ioutil import stays in use.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read the feed: %w", err)
	}

	fp := gofeed.NewParser()
	feed, err := fp.ParseString(string(body))
	if err != nil {
		return nil, fmt.Errorf("failed to parse the feed: %w", err)
	}

	for _, item := range feed.Items {
		// Parse the item's published time, falling back to the updated time
		// (the original comment promised this fallback but never did it);
		// stays at the zero time when the feed provides neither.
		var parsedTime time.Time
		if item.PublishedParsed != nil {
			parsedTime = *item.PublishedParsed
		} else if item.UpdatedParsed != nil {
			parsedTime = *item.UpdatedParsed
		}

		// Strip HTML tags from title and description; prefer the full
		// content body, falling back to the summary description.
		title := stripHTML(item.Title)
		description := stripHTML(item.Content)
		if description == "" {
			description = stripHTML(item.Description)
		}
		deepLink := item.Link

		incidents = append(incidents, api.Incident{
			Title:       title,
			Description: &description,
			StartTime:   parsedTime,
			// Feeds carry one timestamp per item, so start and end are
			// reported as the same instant.
			EndTime:  &parsedTime,
			DeepLink: deepLink,
			// Not all RSS feeds have an impact field, so we default to none
			Impact:        api.ImpactNone,
			StatusPageUrl: statusPageUrl,
		})
	}

	if len(incidents) == 0 {
		return nil, errors.New("no incidents found")
	}

	return incidents, nil
}

// getUrls returns the candidate feed locations commonly exposed by status
// pages, each derived by appending a well-known path to the base url.
func getUrls(url string) []string {
	suffixes := []string{
		"/history.atom",
		"/feed.atom",
		"/de.atom",
		"/_rss",
		"/rss/all.rss",
		"/en-us/status/feed/",
	}
	candidates := make([]string, 0, len(suffixes))
	for _, suffix := range suffixes {
		candidates = append(candidates, url+suffix)
	}
	return candidates
}

// isXMLContent tries to parse the response body as XML to check if it is valid XML.
func isXMLContent(body io.Reader) bool {
decoder := xml.NewDecoder(body)
for {
token, err := decoder.Token()
if err != nil {
break
}
directive, ok := token.(xml.Directive)
if ok {
if strings.Contains(strings.ToLower(string(directive)), "html") {
break
}
}

if token == nil {
break
}
// If we can decode one token, it's likely an XML.
// This is a simplistic check and might need to be more sophisticated
// depending on the context.

return true
}
return false
}

// htmlTagRe matches anything that starts with "<" and ends with ">",
// containing any characters except ">" in between. Compiled once at package
// init (MustCompile) instead of on every call — the original recompiled the
// constant pattern per invocation and panicked through a dead error branch.
var htmlTagRe = regexp.MustCompile("<[^>]*>")

// stripHTML removes HTML tags from a string by deleting every "<...>" span.
// Unmatched "<" characters and HTML entities are left untouched.
func stripHTML(input string) string {
	return htmlTagRe.ReplaceAllString(input, "")
}
2 changes: 2 additions & 0 deletions scraper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/metoro-io/statusphere/scraper/internal/scraper/poller"
"github.com/metoro-io/statusphere/scraper/internal/scraper/providers"
"github.com/metoro-io/statusphere/scraper/internal/scraper/providers/atlassian"
"github.com/metoro-io/statusphere/scraper/internal/scraper/providers/rss"
"github.com/metoro-io/statusphere/scraper/internal/scraper/urlgetter/dburlgetter"
"go.uber.org/zap"
"net/http"
Expand All @@ -22,6 +23,7 @@ func main() {

scraper := scraper.NewScraper(logger, http.DefaultClient, []providers.Provider{
atlassian.NewAtlassianProvider(logger, http.DefaultClient),
rss.NewRssProvider(logger, http.DefaultClient),
})

dbClient, err := db.NewDbClientFromEnvironment(logger)
Expand Down

0 comments on commit b5bc916

Please sign in to comment.