Skip to content

Commit

Permalink
improved referrer parsing. match hostname against blacklist using com…
Browse files Browse the repository at this point in the history
…munity-maintained blacklist file graciously provided by Matomo (https://github.com/matomo-org/referrer-spam-blacklist). closes #170 relates to #154
  • Loading branch information
dannyvankooten committed Nov 9, 2018
1 parent 9589072 commit bca066b
Show file tree
Hide file tree
Showing 8 changed files with 1,360 additions and 48 deletions.
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,7 @@ lint:
test:
for PKG in $(PACKAGES); do go test -cover -coverprofile $$GOPATH/src/$$PKG/coverage.out $$PKG || exit 1; done;

# referrer-spam-blacklist: fetch the latest community-maintained list of
# referrer spammers (graciously provided by Matomo) and embed it into the
# aggregator package as Go source via go-bindata.
.PHONY: referrer-spam-blacklist
referrer-spam-blacklist:
	wget https://raw.githubusercontent.com/matomo-org/referrer-spam-blacklist/master/spammers.txt -O pkg/aggregator/data/blacklist.txt
	go-bindata -prefix "pkg/aggregator/data/" -o pkg/aggregator/bindata.go -pkg aggregator pkg/aggregator/data/
83 changes: 60 additions & 23 deletions pkg/aggregator/aggregator.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package aggregator

import (
"errors"
"net/url"
"strings"

"github.com/usefathom/fathom/pkg/datastore"
"github.com/usefathom/fathom/pkg/models"
Expand Down Expand Up @@ -64,16 +66,46 @@ func (agg *Aggregator) Run() int {
// if no explicit site ID was given in the tracking request, default to site with ID 1
trackingIDMap[""] = 1

// setup referrer spam blacklist
blacklist, err := newBlacklist()
if err != nil {
log.Error(err)
return 0
}

// add each pageview to the various statistics we gather
for _, p := range pageviews {

// discard pageview if site tracking ID is unknown
siteID, ok := trackingIDMap[p.SiteTrackingID]
if !ok {
log.Debugf("discarding pageview because of unrecognized site tracking ID %s", p.SiteTrackingID)
log.Debugf("Skipping pageview because of unrecognized site tracking ID %s", p.SiteTrackingID)
continue
}

// start with referrer because we may want to skip this pageview altogether if it is referrer spam
if p.Referrer != "" {
ref, err := parseReferrer(p.Referrer)
if err != nil {
log.Debugf("Skipping pageview from referrer %s because of malformed referrer URL", p.Referrer)
continue
}

// ignore pageviews from blacklisted referrers
// we use Hostname() here to discard port numbers
if blacklist.Has(ref.Hostname()) {
log.Debugf("Skipping pageview from referrer %s because of blacklist", p.Referrer)
continue
}

hostname := ref.Scheme + "://" + ref.Host
referrerStats, err := agg.getReferrerStats(results, siteID, p.Timestamp, hostname, ref.Path)
if err != nil {
log.Error(err)
continue
}
referrerStats.HandlePageview(p)
}

// get existing site stats so we can add this pageview to it
site, err := agg.getSiteStats(results, siteID, p.Timestamp)
if err != nil {
Expand All @@ -88,23 +120,6 @@ func (agg *Aggregator) Run() int {
continue
}
pageStats.HandlePageview(p)

// referrer stats
if p.Referrer != "" {
hostname, pathname, err := parseUrlParts(p.Referrer)
if err != nil {
log.Error(err)
continue
}

referrerStats, err := agg.getReferrerStats(results, siteID, p.Timestamp, hostname, pathname)
if err != nil {
log.Error(err)
continue
}
referrerStats.HandlePageview(p)
}

}

// update stats
Expand Down Expand Up @@ -134,11 +149,33 @@ func (agg *Aggregator) Run() int {
return n
}

// parseReferrer parses the referrer string & normalizes it: AMP & UTM
// tracking parameters are stripped from the query string and the "amp/"
// path suffix is removed, so that equivalent referrers aggregate under a
// single URL. It returns an error for unparseable URLs and for URLs
// without a hostname.
func parseReferrer(r string) (*url.URL, error) {
	u, err := url.Parse(r)
	if err != nil {
		return nil, err
	}

	// always require a hostname; schemeless strings like "foobar" or
	// "mysite.com" parse as a bare path with an empty host
	if u.Host == "" {
		return nil, errors.New("malformed URL, empty host")
	}

	// remove AMP & UTM vars (all five standard utm_* parameters)
	if u.RawQuery != "" {
		q := u.Query()
		keys := []string{"amp", "utm_campaign", "utm_content", "utm_medium", "utm_source", "utm_term"}
		for _, k := range keys {
			q.Del(k)
		}
		u.RawQuery = q.Encode()
	}

	// remove amp/ suffix (but keep trailing slash)
	if strings.HasSuffix(u.Path, "/amp/") {
		u.Path = strings.TrimSuffix(u.Path, "amp/")
	}

	// re-parse our normalized string into a new URL struct
	return url.Parse(u.String())
}
47 changes: 46 additions & 1 deletion pkg/aggregator/aggregator_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,54 @@
package aggregator

import (
"net/url"
"testing"
)

func TestProcess(t *testing.T) {
// TestParseReferrer covers both halves of parseReferrer's contract:
// URLs with a hostname are normalized (utm_source stripped from the
// query, "amp/" stripped from the path), while host-less or empty
// strings yield an error.
func TestParseReferrer(t *testing.T) {
	// input referrer string -> expected normalized URL parts
	testsValid := map[string]*url.URL{
		"https://www.usefathom.com/?utm_source=github": &url.URL{
			Scheme: "https",
			Host:   "www.usefathom.com",
			Path:   "/",
		},
		"https://www.usefathom.com/privacy/amp/?utm_source=github": &url.URL{
			Scheme: "https",
			Host:   "www.usefathom.com",
			Path:   "/privacy/",
		},
	}
	// inputs that must be rejected: no scheme means no hostname after parsing
	testsErr := []string{
		"mysite.com",
		"foobar",
		"",
	}

	for r, e := range testsValid {
		v, err := parseReferrer(r)
		if err != nil {
			t.Error(err)
		}

		if v.Host != e.Host {
			t.Errorf("Invalid Host: expected %s, got %s", e.Host, v.Host)
		}

		if v.Scheme != e.Scheme {
			t.Errorf("Invalid Scheme: expected %s, got %s", e.Scheme, v.Scheme)
		}

		if v.Path != e.Path {
			t.Errorf("Invalid Path: expected %s, got %s", e.Path, v.Path)
		}

	}

	for _, r := range testsErr {
		v, err := parseReferrer(r)
		if err == nil {
			t.Errorf("Expected err, got %#v", v)
		}
	}

}
235 changes: 235 additions & 0 deletions pkg/aggregator/bindata.go

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions pkg/aggregator/blacklist.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package aggregator

import (
"bufio"
"bytes"
"strings"
)

// blacklist wraps the referrer spam blacklist so hostnames can be
// checked against it via Has.
type blacklist struct {
	data []byte // raw contents of blacklist.txt (one domain per line), embedded via go-bindata
}

// newBlacklist loads the embedded blacklist.txt asset and returns a
// blacklist ready for lookups, or an error if the asset is missing.
func newBlacklist() (*blacklist, error) {
	raw, err := Asset("blacklist.txt")
	if err != nil {
		return nil, err
	}

	return &blacklist{data: raw}, nil
}

// Has reports whether the given hostname appears on the blacklist.
// A hostname matches when it equals a blacklisted domain exactly or is a
// subdomain of one, so if usefathom.com is blacklisted then this function
// also returns true for danny.usefathom.com — but NOT for an unrelated
// domain that merely ends in the same characters.
func (b *blacklist) Has(r string) bool {
	if r == "" {
		return false
	}

	scanner := bufio.NewScanner(bytes.NewReader(b.data))
	for scanner.Scan() {
		domain := scanner.Text()

		// skip blank lines: strings.HasSuffix(r, "") is true for every r,
		// so a single empty line in blacklist.txt would blacklist all referrers
		if domain == "" {
			continue
		}

		// require an exact match or a dot boundary so that a blacklisted
		// 03e.info does not also flag an unrelated evil03e.info
		if r == domain || strings.HasSuffix(r, "."+domain) {
			return true
		}
	}

	return false
}
25 changes: 25 additions & 0 deletions pkg/aggregator/blacklist_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package aggregator

import (
"testing"
)

// TestBlacklistHas verifies lookups against the embedded blacklist
// asset: known spam domains (and their subdomains) match, while a
// legitimate domain does not.
func TestBlacklistHas(t *testing.T) {
	b, err := newBlacklist()
	if err != nil {
		t.Error(err)
	}

	// hostname -> whether it should be flagged as referrer spam
	table := map[string]bool{
		"03e.info":      true,
		"zvetki.ru":     true,
		"usefathom.com": false,
		"foo.03e.info":  true, // sub-string match
	}

	for r, e := range table {
		if v := b.Has(r); v != e {
			t.Errorf("Expected %v, got %v", e, v)
		}
	}
}
Loading

1 comment on commit bca066b

@Spone
Copy link

@Spone Spone commented on bca066b Nov 9, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome!! Thanks @dannyvankooten

Please sign in to comment.