Skip to content

Commit

Permalink
set proxy server and update colly version
Browse files Browse the repository at this point in the history
  • Loading branch information
mesaglio committed Nov 16, 2021
1 parent ee76035 commit fd69177
Show file tree
Hide file tree
Showing 7 changed files with 614 additions and 26 deletions.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
# test and test_without_cache are commands, not files: declare them phony so a
# file or directory with the same name can never make them look up to date.
.PHONY: test test_without_cache

# Default no-op target.
target: ;

# Run the parser package tests.
test:
	@go test ./pkg/parser

# Drop cached test results, then run the tests again from scratch.
# $(MAKE) is used for the recursive call so the same make binary and its
# command-line flags are carried into the sub-make.
test_without_cache:
	@go clean -testcache && $(MAKE) test
5 changes: 2 additions & 3 deletions cmd/goclone/clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import (
)

// Clone the given site :)
func cloneSite(ctx context.Context, args, cookies []string) error {
func cloneSite(ctx context.Context, args, cookies []string, proxyString string) error {
jar, err := cookiejar.New(&cookiejar.Options{})
if err != nil {
return err
Expand Down Expand Up @@ -63,8 +63,7 @@ func cloneSite(ctx context.Context, args, cookies []string) error {
if firstProject == "" {
firstProject = projectPath
}

if err := crawler.Crawl(ctx, u, projectPath, crawler.SetCookieJar(jar)); err != nil {
if err := crawler.Crawl(ctx, u, projectPath, jar, proxyString); err != nil {
return fmt.Errorf("%q: %w", u, err)
}
// Restructure html
Expand Down
10 changes: 6 additions & 4 deletions cmd/goclone/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ import (
var (
// Flags
// Login bool // remove login flag for now
Serve bool
Open bool
cookies []string
Serve bool
Open bool
ProxyString string
cookies []string

// Root cmd
rootCmd = &cobra.Command{
Expand All @@ -35,7 +36,7 @@ var (
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
defer stop()
// Otherwise.. clone ahead!
if err := cloneSite(ctx, args, cookies); err != nil {
if err := cloneSite(ctx, args, cookies, ProxyString); err != nil {
log.Fatalf("%+v", err)
}
},
Expand All @@ -49,6 +50,7 @@ func Execute() {
pf.BoolVarP(&Open, "open", "o", false, "Automatically open project in deafult browser")
// rootCmd.PersistentFlags().BoolVarP(&Login, "login", "l", false, "Wether to use a username or password")
pf.BoolVarP(&Serve, "serve", "s", false, "Serve the generated files using Echo.")
pf.StringVarP(&ProxyString, "proxy_string", "p", "", "Proxy connection string. Support http and socks5 https://pkg.go.dev/github.com/gocolly/colly#Collector.SetProxy")
rootCmd.Flags().StringSliceVarP(&cookies, "cookie", "C", nil, "Pre-set these cookies")

// Execute the command :)
Expand Down
11 changes: 2 additions & 9 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,10 @@ go 1.13

require (
github.com/PuerkitoBio/goquery v1.5.1
github.com/antchfx/htmlquery v1.2.2 // indirect
github.com/antchfx/xmlquery v1.2.3 // indirect
github.com/antchfx/xpath v1.1.4 // indirect
github.com/fatih/color v1.9.0
github.com/fatih/color v1.13.0
github.com/gobwas/glob v0.2.3 // indirect
github.com/gocolly/colly v1.2.0
github.com/gocolly/colly/v2 v2.1.0
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
github.com/golang/protobuf v1.3.3 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/labstack/echo v3.3.10+incompatible
github.com/labstack/gommon v0.3.0 // indirect
Expand All @@ -23,7 +19,4 @@ require (
github.com/valyala/fasttemplate v1.1.0 // indirect
github.com/yosssi/gohtml v0.0.0-20190915184251-7ff6f235ecaf
golang.org/x/crypto v0.0.0-20191227163750-53104e6ec876 // indirect
golang.org/x/net v0.0.0-20200222125558-5a598a2470a0 // indirect
golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8 // indirect
google.golang.org/appengine v1.6.5 // indirect
)
571 changes: 571 additions & 0 deletions go.sum

Large diffs are not rendered by default.

30 changes: 24 additions & 6 deletions pkg/crawler/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,24 @@ import (
"net/http/cookiejar"
"strings"

"github.com/gocolly/colly"
"github.com/gocolly/colly/v2"
)

// Collector searches for css, js, and images within a given link
// TODO improve for better performance
func Collector(ctx context.Context, url string, projectPath string, collyOpts ...func(*colly.Collector)) error {
func Collector(ctx context.Context, url string, projectPath string, cookieJar *cookiejar.Jar, proxyString string) error {
// create a new collector
c := colly.NewCollector(append(collyOpts, colly.Async(true))...)
c.WithTransport(cancelableTransport{ctx: ctx, transport: http.DefaultTransport})
//c := colly.NewCollector()

c := colly.NewCollector(colly.Async(true))
if cookieJar != nil {
c.SetCookieJar(cookieJar)
}
if proxyString != "" {
c.SetProxy(proxyString)
} else {
c.WithTransport(cancelableTransport{ctx: ctx, transport: http.DefaultTransport})
}

// search for all link tags that have a rel attribute that is equal to stylesheet - CSS
c.OnHTML("link[rel='stylesheet']", func(e *colly.HTMLElement) {
Expand Down Expand Up @@ -67,8 +76,17 @@ func Collector(ctx context.Context, url string, projectPath string, collyOpts ..
}

// SetCookieJar returns a colly.Collector option that sets the cookie jar to the specified.
func SetCookieJar(jar *cookiejar.Jar) func(*colly.Collector) {
return func(c *colly.Collector) { c.SetCookieJar(jar) }
// SetCookieJar returns a function that installs jar as the cookie jar of the
// colly.Collector it is given and hands that same collector back.
//
// The jar is applied to the collector passed in rather than to a fresh
// zero-value Collector: a zero-value Collector has not been set up by
// colly.NewCollector, and configuring it has no effect on the collector that
// actually performs the crawl.
//
// A nil collector or a nil jar leaves the collector untouched, mirroring the
// nil check Collector performs before calling c.SetCookieJar.
func SetCookieJar(jar *cookiejar.Jar) func(*colly.Collector) *colly.Collector {
	return func(c *colly.Collector) *colly.Collector {
		// A nil *cookiejar.Jar stored in the http.CookieJar interface would be
		// a non-nil interface wrapping a nil pointer, so guard it here.
		if c == nil || jar == nil {
			return c
		}
		c.SetCookieJar(jar)
		return c
	}
}

// SetProxy returns a function that routes the requests of the colly.Collector
// it is given through the specified proxy and hands that same collector back.
//
// The proxy is applied to the collector passed in rather than to a fresh
// zero-value Collector: a zero-value Collector has not been set up by
// colly.NewCollector, and configuring it has no effect on the collector that
// actually performs the crawl.
//
// A nil collector or an empty proxy string leaves the collector untouched,
// mirroring the empty-string check Collector performs before calling
// c.SetProxy.
//
// NOTE(review): the returned function has no error result, so a failure
// reported by Collector.SetProxy (for example an unparsable proxy URL) cannot
// be surfaced here. Callers that need the error should call c.SetProxy
// directly.
func SetProxy(proxy string) func(*colly.Collector) *colly.Collector {
	return func(c *colly.Collector) *colly.Collector {
		if c == nil || proxy == "" {
			return c
		}
		// Best effort: the function signature offers no way to report the error.
		_ = c.SetProxy(proxy)
		return c
	}
}

type cancelableTransport struct {
Expand Down
7 changes: 3 additions & 4 deletions pkg/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@ package crawler

import (
"context"

"github.com/gocolly/colly"
"net/http/cookiejar"
)

// Crawl asks the necessary crawlers for collecting links for building the web page
func Crawl(ctx context.Context, site string, projectPath string, collyOpts ...func(*colly.Collector)) error {
func Crawl(ctx context.Context, site string, projectPath string, cookieJar *cookiejar.Jar, proxyString string) error {
// searches for css, js, and images within a given link
return Collector(ctx, site, projectPath, collyOpts...)
return Collector(ctx, site, projectPath, cookieJar, proxyString)
}

0 comments on commit fd69177

Please sign in to comment.