diff --git a/README.md b/README.md index 2154cd7..0a64462 100644 --- a/README.md +++ b/README.md @@ -76,26 +76,44 @@ Simply, galer can be run with: ### Flags -```bash -▶ galer -h -``` - ![galer](https://user-images.githubusercontent.com/25837540/100824601-0ee53b80-3489-11eb-878d-a58d1ec3489d.jpg) -This will display help for the tool. Here are all the switches it supports. - -| **Flag** | **Description** | -|------------------- |----------------------------------------------------------------- | -| -u, --url | Target to fetches _(single target URL or list)_ | -| -e, --extension | Show only certain extensions _(comma-separated, e.g. js,php)_ | -| -c, --concurrency | Concurrency level _(default: 50)_ | -| --same-host | Same host only | -| --same-root | Same root (eTLD+1) only (takes precedence over --same-host) | -| -o, --output | Save fetched URLs output into file | -| -t, --timeout | Maximum time _(seconds)_ allowed for connection _(default: 60)_ | -| -s, --silent | Silent mode _(suppress an errors)_ | -| -v, --verbose | Verbose mode show error details unless you weren't use silent | -| -h, --help | Display its helps | +This will display help for the tool. Here are all the options it supports. + +```console +$ galer -h + + __ v0.2.0 + __ _ _(_ ) __ _ __ + /'_ '\/'_' )| | /'__'( '__) +( (_) ( (_| || |( ___| | +'\__ '\__,_(___'\____(_) +( )_) | + \___/' @dwisiswant0 + +A fast tool to fetch URLs from HTML attributes by crawl-in + +Usage: + galer -u [URL|URLs.txt] -o [output.txt] + +Options: + -u, --url Target to fetches (single target URL or list) + -e, --extension Show only certain extensions (comma-separated, e.g. js,php) + -c, --concurrency Concurrency level (default: 50) + -w, --wait Wait N seconds before evaluate (default: 1) + -d, --depth Max. depth for crawling (levels of links to follow) + --same-host Same host only + --same-root Same root (eTLD+1) only (takes precedence over --same-host) + -o, --output Save fetched URLs output into file + -T, --template Format for output template (e.g., "{{scheme}}://{{host}}{{path}}") + Valid variables are: "raw_url", "scheme", "user", "username", + "password", "host", "hostname", "port", "path", "raw_path", + "escaped_path", "raw_query", "fragment", "raw_fragment". + -t, --timeout Max. 
time (seconds) allowed for connection (default: 60) + -s, --silent Silent mode (suppress an errors) + -v, --verbose Verbose mode show error details unless you weren't use silent + -h, --help Display its helps +``` ### Examples diff --git a/go.mod b/go.mod index 4b32fa4..0d5fba2 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,7 @@ module github.com/dwisiswant0/galer -go 1.22.0 +go 1.23 + toolchain go1.23.1 require ( @@ -29,5 +30,7 @@ require ( github.com/muesli/reflow v0.3.0 // indirect github.com/muesli/termenv v0.15.2 // indirect github.com/rivo/uniseg v0.4.7 // indirect + github.com/valyala/bytebufferpool v1.0.0 // indirect + github.com/valyala/fasttemplate v1.2.2 // indirect golang.org/x/sys v0.26.0 // indirect ) diff --git a/go.sum b/go.sum index 3fa7b92..de90336 100644 --- a/go.sum +++ b/go.sum @@ -51,6 +51,10 @@ github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= +github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= +github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo= +github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 h1:1wqE9dj9NpSm04INVsJhhEUzhuDVjbcyKH91sVyPATw= golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8= golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= diff --git a/internal/runner/consts.go b/internal/runner/consts.go index 46b95bd..de44624 100644 --- a/internal/runner/consts.go +++ b/internal/runner/consts.go @@ -2,7 +2,7 @@ package runner const ( author = "dwisiswant0" - version = "0.1.0" + version = "0.2.0" banner = ` __ v` + version + ` __ _ _(_ ) __ _ __ @@ -21,11 +21,17 @@ Usage: Options: -u, --url Target to fetches (single target URL or list) -e, --extension Show only certain extensions (comma-separated, e.g. js,php) - -c, --concurrency Concurrency level (default: 50) + -c, --concurrency Concurrency level (default: 50) + -w, --wait Wait N seconds before evaluate (default: 1) + -d, --depth Max. depth for crawling (levels of links to follow) --same-host Same host only --same-root Same root (eTLD+1) only (takes precedence over --same-host) -o, --output Save fetched URLs output into file - -t, --timeout Maximum time (seconds) allowed for connection (default: 60) + -T, --template Format for output template (e.g., "{{scheme}}://{{host}}{{path}}") + Valid variables are: "raw_url", "scheme", "user", "username", + "password", "host", "hostname", "port", "path", "raw_path", + "escaped_path", "raw_query", "fragment", "raw_fragment". + -t, --timeout Max. 
time (seconds) allowed for connection (default: 60) -s, --silent Silent mode (suppress an errors) -v, --verbose Verbose mode show error details unless you weren't use silent -h, --help Display its helps diff --git a/internal/runner/parser.go b/internal/runner/parser.go index 411a2ec..59b54ec 100644 --- a/internal/runner/parser.go +++ b/internal/runner/parser.go @@ -12,17 +12,19 @@ import ( // Options will defines its options type Options struct { Concurrency int - Timeout int Depth int - URL string Ext string + File *os.File + List *bufio.Scanner Output string SameHost bool SameRoot bool Silent bool + Template string + Timeout int + URL string Verbose bool - List *bufio.Scanner - File *os.File + Wait int } // Parse user given arguments @@ -35,6 +37,12 @@ func Parse() *Options { flag.IntVar(&opt.Concurrency, "concurrency", 50, "") flag.IntVar(&opt.Concurrency, "c", 50, "") + flag.IntVar(&opt.Wait, "wait", 1, "") + flag.IntVar(&opt.Wait, "w", 1, "") + + flag.IntVar(&opt.Depth, "depth", 1, "") + flag.IntVar(&opt.Depth, "d", 1, "") + flag.IntVar(&opt.Timeout, "timeout", 60, "") flag.IntVar(&opt.Timeout, "t", 60, "") @@ -47,6 +55,9 @@ func Parse() *Options { flag.StringVar(&opt.Output, "output", "", "") flag.StringVar(&opt.Output, "o", "", "") + flag.StringVar(&opt.Template, "template", "", "") + flag.StringVar(&opt.Template, "T", "", "") + flag.BoolVar(&opt.Silent, "silent", false, "") flag.BoolVar(&opt.Silent, "s", false, "") diff --git a/internal/runner/runner.go b/internal/runner/runner.go index f87ed84..df37d53 100644 --- a/internal/runner/runner.go +++ b/internal/runner/runner.go @@ -2,63 +2,107 @@ package runner import ( "fmt" + "io" + "os" "github.com/dwisiswant0/galer/pkg/galer" "github.com/remeh/sizedwaitgroup" ) -// New to executes galer -func New(opt *Options) { - job := make(chan string) - con := opt.Concurrency - swg := sizedwaitgroup.New(con) - cfg = &galer.Config{ - Timeout: opt.Timeout, - SameHost: opt.SameHost, - SameRoot: opt.SameRoot, +type Runner struct { + opt *Options + swg sizedwaitgroup.SizedWaitGroup + urls map[string]bool + galer *galer.Config +} + +// New initialize [Runner] +func New(opt *Options) *Runner { + return &Runner{ + opt: opt, + swg: sizedwaitgroup.New(opt.Concurrency), + urls: make(map[string]bool), + galer: &galer.Config{ + Logger: clog, + SameHost: opt.SameHost, + SameRoot: opt.SameRoot, + Template: opt.Template, + Timeout: opt.Timeout, + Wait: opt.Wait, + }, } - cfg = galer.New(cfg) +} - for i := 0; i < con; i++ { - swg.Add() +// Do runs crawling +func (r *Runner) Do() { + jobs := make(chan string) + + for i := 0; i < r.opt.Concurrency; i++ { + r.swg.Add() go func() { - defer swg.Done() - for URL := range job { - run := opt.run(URL, cfg) - for _, u := range run { - if opt.Ext != "" { - if !opt.isOnExt(u) { - continue - } - } - - fmt.Println(u) - - if opt.File != nil { - fmt.Fprintf(opt.File, "%s\n", out) - } - } + defer r.swg.Done() + for job := range jobs { + r.galer.SetScope(job) + r.run(job, 1) } }() } - for opt.List.Scan() { - u := opt.List.Text() - job <- u + for r.opt.List.Scan() { + u := r.opt.List.Text() + jobs <- u + } + + close(jobs) + r.swg.Wait() + r.galer.Close() + + if r.opt.File != nil { + r.opt.File.Close() + } +} + +func (r *Runner) run(URL string, counter int) { + cfg := galer.New(r.galer) + + var writer io.Writer = os.Stdout + if r.opt.File != nil { + writer = io.MultiWriter(os.Stdout, r.opt.File) } - close(job) - swg.Wait() - _ = cfg.Close() + for counter <= r.opt.Depth { + crawl := r.crawl(URL, cfg) + if len(crawl) == 0 { + break + 
} + counter++ + + var batches []string + for _, u := range crawl { + if !r.urls[u] { + fmt.Fprintf(writer, "%s\n", u) + batches = append(batches, u) + r.urls[u] = true + } + } - if opt.File != nil { - opt.File.Close() + for _, u := range batches { + if r.opt.Ext != "" { + if !r.opt.isOnExt(u) { + continue + } + } + + if counter <= r.opt.Depth { + r.run(u, counter+1) + } + } } } -func (opt *Options) run(URL string, cfg *galer.Config) []string { +func (r *Runner) crawl(URL string, cfg *galer.Config) []string { res, err := cfg.Crawl(URL) - if err != nil && !opt.Silent { + if err != nil && opt.Verbose { clog.Error(err, "url", URL) return []string{} diff --git a/internal/runner/validator.go b/internal/runner/validator.go index 8233a9b..650d4dc 100644 --- a/internal/runner/validator.go +++ b/internal/runner/validator.go @@ -24,7 +24,7 @@ func (opt *Options) validate() error { opt.List = bufio.NewScanner(f) } } else { - return errors.New("No target inputs provided") + return errors.New("no target inputs provided") } if opt.Output != "" { diff --git a/internal/runner/vars.go b/internal/runner/vars.go index 13fb622..65737a7 100644 --- a/internal/runner/vars.go +++ b/internal/runner/vars.go @@ -2,12 +2,9 @@ package runner import ( "github.com/charmbracelet/log" - "github.com/dwisiswant0/galer/pkg/galer" ) var ( - out string opt *Options - cfg *galer.Config clog *log.Logger ) diff --git a/main.go b/main.go index d01fa35..24da340 100644 --- a/main.go +++ b/main.go @@ -4,5 +4,6 @@ import "github.com/dwisiswant0/galer/internal/runner" func main() { options := runner.Parse() - runner.New(options) + r := runner.New(options) + r.Do() } diff --git a/pkg/galer/galer.go b/pkg/galer/galer.go index c70eeeb..30a551b 100644 --- a/pkg/galer/galer.go +++ b/pkg/galer/galer.go @@ -4,23 +4,34 @@ import ( "context" "errors" "net/url" + "strings" "time" + "github.com/charmbracelet/log" "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" + "github.com/valyala/fasttemplate" "golang.org/x/exp/slices" "golang.org/x/net/publicsuffix" ) // Config declare its configurations type Config struct { - Timeout int + Logger *log.Logger SameHost bool SameRoot bool + Template string + Timeout int + Wait int // Headers network.Headers - ctx context.Context - cancel context.CancelFunc + ctx context.Context + cancel context.CancelFunc + template *fasttemplate.Template + + scope struct { + hostname, root string + } } // New defines context for the configurations @@ -39,11 +50,35 @@ func (cfg *Config) Crawl(URL string) ([]string, error) { if !IsURI(URL) { return nil, errors.New("cannot parse URL") } - u, _ := url.Parse(URL) - ctx, cancel := chromedp.NewContext(cfg.ctx) + // defaulting sleep + if cfg.Wait <= 0 { + cfg.Wait = 1 + } + + // defaulting scope (hostname & root) + if cfg.scope.hostname == "" && cfg.scope.root == "" { + u, _ := url.Parse(URL) + cfg.scope.hostname = u.Hostname() + cfg.scope.root, _ = publicsuffix.EffectiveTLDPlusOne(cfg.scope.hostname) + } + + var ctxOpts []chromedp.ContextOption + if cfg.Logger != nil { + ctxOpts = []chromedp.ContextOption{ + chromedp.WithLogf(cfg.Logger.Printf), + chromedp.WithDebugf(cfg.Logger.Debugf), + chromedp.WithErrorf(cfg.Logger.Errorf), + } + } + + ctx, cancel := chromedp.NewContext(cfg.ctx, ctxOpts...) 
defer cancel() + if cfg.Template != "" { + cfg.template = fasttemplate.New(cfg.Template, "{{", "}}") + } + chromedp.ListenTarget(ctx, func(ev interface{}) { switch ev := ev.(type) { case *network.EventRequestWillBeSent: // Outgoing requests @@ -64,12 +99,22 @@ func (cfg *Config) Crawl(URL string) ([]string, error) { err := chromedp.Run(ctx, chromedp.Navigate(URL), + chromedp.Sleep(1*time.Second), chromedp.Evaluate(script, &res), ) if err != nil { return nil, err } + // template eval + for i, _ := range res { + res[i] = cfg.eval(res[i]) + } + + for i, _ := range reqs { + reqs[i] = cfg.eval(reqs[i]) + } + res = MergeSlices(res, reqs) // filters @@ -77,8 +122,12 @@ func (cfg *Config) Crawl(URL string) ([]string, error) { case cfg.SameRoot: for i := 0; i < len(res); i++ { r, _ := url.Parse(res[i]) - base, _ := publicsuffix.EffectiveTLDPlusOne(r.Host) - if base != u.Host { + base, err := publicsuffix.EffectiveTLDPlusOne(r.Hostname()) + if err != nil && cfg.Logger != nil { + cfg.Logger.Error("could not get eTLD+1", "parsed", r.String()) + } + + if !strings.HasSuffix(cfg.scope.root, base) { res = append(res[:i], res[i+1:]...) i-- } @@ -86,7 +135,7 @@ func (cfg *Config) Crawl(URL string) ([]string, error) { case cfg.SameHost: for i := 0; i < len(res); i++ { r, _ := url.Parse(res[i]) - if r.Host != u.Host { + if r.Hostname() != cfg.scope.hostname { res = append(res[:i], res[i+1:]...) i-- } diff --git a/pkg/galer/util.go b/pkg/galer/util.go index 95018cd..acca723 100644 --- a/pkg/galer/util.go +++ b/pkg/galer/util.go @@ -1,6 +1,11 @@ package galer -import "net/url" +import ( + "errors" + "net/url" + + "golang.org/x/net/publicsuffix" +) const script = "[...new Set(Array.from(document.querySelectorAll('[src],[href],[url],[action]')).map(i => i.src || i.href || i.url || i.action))]" @@ -33,3 +38,43 @@ func MergeSlices[T1 comparable, T2 []T1](v1, v2 T2) T2 { return v2 } + +// SetScope sets the host and root (eTLD+1) for config. +func (cfg *Config) SetScope(s string) { + if u, err := url.Parse(s); err == nil { + cfg.scope.hostname = u.Hostname() + cfg.scope.root, _ = publicsuffix.EffectiveTLDPlusOne(u.Hostname()) + } +} + +func (cfg *Config) eval(s string) string { + u, err := url.Parse(s) + if err != nil && cfg.Logger != nil { + cfg.Logger.Errorf("cannot eval %q URL with %q as template: %+v", s, cfg.Template, errors.Unwrap(err)) + return s + } + + if cfg.template == nil { + return s + } + + password, _ := u.User.Password() + tags := map[string]interface{}{ + "raw_url": u.String(), + "scheme": u.Scheme, + "user": u.User.String(), + "username": u.User.Username(), + "password": password, + "host": u.Host, + "hostname": u.Hostname(), + "port": u.Port(), + "path": u.Path, + "raw_path": u.RawPath, + "escaped_path": u.EscapedPath(), + "raw_query": u.RawQuery, + "fragment": u.Fragment, + "raw_fragment": u.RawFragment, + } + + return cfg.template.ExecuteString(tags) +} diff --git a/pkg/galer/vars.go b/pkg/galer/vars.go index cb56593..6f0a1ac 100644 --- a/pkg/galer/vars.go +++ b/pkg/galer/vars.go @@ -6,4 +6,5 @@ var execAllocOpts = append( chromedp.DefaultExecAllocatorOptions[:], chromedp.DisableGPU, chromedp.IgnoreCertErrors, + // chromedp.Flag("headless", false), )
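
---

For anyone reviewing this change, the new flags can be exercised together in a single invocation; the target URL and output path below are placeholders:

```console
$ galer -u https://example.com -d 2 -w 2 --same-root \
    -T "{{scheme}}://{{host}}{{path}}" -o urls.txt
```

Per the updated help text: `-d 2` follows links one level beyond the seed page, `-w 2` waits two seconds before the page is evaluated, `--same-root` keeps results on the seed's eTLD+1, and `-T` reformats each printed URL with the given template.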
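The same behavior is reachable through `pkg/galer` directly. A minimal sketch, assuming the exported API is as shown in this diff (`New`, `SetScope`, `Crawl`, `Close`, and the `Wait`/`Template`/`SameRoot`/`Timeout` config fields) and that a local Chrome/Chromium is available for chromedp; the target URL is a placeholder:

```go
package main

import (
	"fmt"
	"log"

	"github.com/dwisiswant0/galer/pkg/galer"
)

func main() {
	// Config fields used here are the ones introduced or kept by this change.
	cfg := galer.New(&galer.Config{
		Timeout:  60,
		Wait:     2,
		SameRoot: true,
		Template: "{{scheme}}://{{host}}{{path}}",
	})
	defer cfg.Close()

	target := "https://example.com" // placeholder target
	cfg.SetScope(target)            // pin the hostname/eTLD+1 scope up front

	urls, err := cfg.Crawl(target)
	if err != nil {
		log.Fatal(err)
	}

	for _, u := range urls {
		fmt.Println(u)
	}
}
```

`SetScope` is optional here — `Crawl` falls back to deriving the scope from its first URL — but calling it mirrors what `Runner.Do` now does for each job.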
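Since the `-T` expansion delegates to `valyala/fasttemplate`, this is roughly what `eval()` in `pkg/galer/util.go` does for each fetched URL — the URL below is made up, and only the tags used by this particular template are shown, not the full variable map from the help text:

```go
package main

import (
	"fmt"
	"net/url"

	"github.com/valyala/fasttemplate"
)

func main() {
	// The template string is split on "{{" / "}}" tags, as in eval().
	t := fasttemplate.New("{{scheme}}://{{host}}{{path}}", "{{", "}}")

	u, _ := url.Parse("https://app.example.com:8443/assets/main.js?v=1#top")

	// eval() populates every documented variable; this map is trimmed
	// down to the tags this template actually references.
	out := t.ExecuteString(map[string]interface{}{
		"scheme": u.Scheme,
		"host":   u.Host, // hostname:port
		"path":   u.Path,
	})

	fmt.Println(out) // https://app.example.com:8443/assets/main.js
}
```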