Skip to content

Commit c18c369

Browse files
committed
add Mux, tests, better example
1 parent c05bb70 commit c18c369

12 files changed

+771
-186
lines changed

Diff for: .gitignore

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@
44
*.swo
55
#*.*#
66
tags
7-
fetch.test
8-
7+
fetchbot.test
8+
example/example

Diff for: LICENSE

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Copyright (c) 2014, Martin Angers & Contributors
2+
All rights reserved.
3+
4+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5+
6+
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7+
8+
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9+
10+
* Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11+
12+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Diff for: README.md

Whitespace-only changes.

Diff for: cmd.go

+20-9
Original file line numberDiff line numberDiff line change
@@ -13,28 +13,39 @@ type Command interface {
1313
Method() string
1414
}
1515

16-
// TODO : Naming, and take into account those additional interfaces.
17-
type BasicAuth interface {
18-
Credentials() (string, string)
16+
// The BasicAuthProvider interface supplies the credentials to use to perform the request
17+
// with Basic Authentication.
18+
type BasicAuthProvider interface {
19+
BasicAuth() (user string, pwd string)
1920
}
2021

21-
type BodyReader interface {
22-
Body() io.Reader
22+
// The ReaderProvider interface supplies the Reader to use as the Body of the request. It has
23+
// higher priority than the ValuesProvider interface, so that if both interfaces are implemented,
24+
// the ReaderProvider is used.
25+
type ReaderProvider interface {
26+
Reader() io.Reader
2327
}
2428

25-
type BodyKeyValuer interface {
29+
// The ValuesProvider interface supplies the values to send as the Body of the request. It has
30+
// lower priority than the ReaderProvider interface, so that if both interfaces are implemented,
31+
// the ReaderProvider is used. If the request has no explicit Content-Type set, it will be automatically
32+
// set to "application/x-www-form-urlencoded".
33+
type ValuesProvider interface {
2634
Values() url.Values
2735
}
2836

29-
type Cookier interface {
37+
// The CookiesProvider interface supplies the cookies to send with the request.
38+
type CookiesProvider interface {
3039
Cookies() []*http.Cookie
3140
}
3241

33-
type Headerer interface {
42+
// The HeaderProvider interface supplies the headers to set on the request. Any Authorization
43+
// header set this way is overridden by the BasicAuthProvider when that interface is also implemented.
44+
type HeaderProvider interface {
3445
Header() http.Header
3546
}
3647

37-
// The Cmd struct defines a basic command implementation.
48+
// The Cmd struct defines a basic Command implementation.
3849
type Cmd struct {
3950
U *url.URL
4051
M string

Diff for: cmd_test.go

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
package fetchbot
2+
3+
// TODO : Test additional handlers.

Diff for: example/main.go

+151-73
Original file line numberDiff line numberDiff line change
@@ -1,100 +1,178 @@
11
package main
22

33
import (
4+
"bytes"
5+
"flag"
46
"fmt"
57
"net/http"
8+
"runtime"
69
"strings"
710
"sync"
11+
"time"
812

913
"github.com/PuerkitoBio/fetchbot"
1014
"github.com/PuerkitoBio/goquery"
1115
)
1216

1317
var (
14-
dup = make(map[string]bool)
15-
mu sync.Mutex
18+
// Starting URL to crawl
19+
seed = "http://golang.org"
20+
// Duplicates table
21+
dup = map[string]bool{seed: true}
22+
// Protect access to dup
23+
mu sync.Mutex
24+
25+
// Command-line flags
26+
stopAfter = flag.Duration("stopafter", 0, "automatically stop the fetchbot after a given time")
27+
stopAtUrl = flag.String("stopat", "", "automatically stop the fetchbot at a given URL")
28+
memStats = flag.Duration("memstats", 0, "display memory statistics at a given interval")
1629
)
1730

18-
func ErrHandler(h fetchbot.Handler) fetchbot.Handler {
31+
func main() {
32+
flag.Parse()
33+
34+
// Create the muxer
35+
mux := fetchbot.NewMux()
36+
37+
// Handle all errors the same
38+
mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
39+
fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
40+
}))
41+
42+
// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
43+
// requests.
44+
mux.Response().Method("GET").ContentType("text/html").HandleFunc(
45+
func(ctx *fetchbot.Context, res *http.Response, err error) {
46+
// Process the body to find the links
47+
doc, err := goquery.NewDocumentFromResponse(res)
48+
if err != nil {
49+
fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
50+
return
51+
}
52+
// Enqueue all links as HEAD requests
53+
enqueueLinks(ctx, doc)
54+
})
55+
56+
// Handle HEAD requests for html responses coming from the source host - we don't want
57+
// to crawl links from other hosts.
58+
mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").HandleFunc(
59+
func(ctx *fetchbot.Context, res *http.Response, err error) {
60+
if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
61+
fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
62+
}
63+
})
64+
65+
// Create the Fetcher, handle the logging first, then dispatch to the Muxer
66+
h := logHandler(mux)
67+
if *stopAtUrl != "" {
68+
h = stopHandler(*stopAtUrl, logHandler(mux))
69+
}
70+
f := fetchbot.New(h)
71+
// First mem stat print must be right after creating the fetchbot
72+
if *memStats > 0 {
73+
// Print starting stats
74+
printMemStats(nil)
75+
// Run at regular intervals
76+
runMemStats(f, *memStats)
77+
// On exit, print ending stats after a GC
78+
defer func() {
79+
runtime.GC()
80+
printMemStats(nil)
81+
}()
82+
}
83+
// Start processing
84+
q := f.Start()
85+
if *stopAfter > 0 {
86+
go func() {
87+
c := time.After(*stopAfter)
88+
<-c
89+
q.Close()
90+
}()
91+
}
92+
// Enqueue the seed, which is the first entry in the dup map
93+
_, err := q.SendStringGet(seed)
94+
if err != nil {
95+
fmt.Printf("[ERR] GET %s - %s\n", seed, err)
96+
}
97+
q.Block()
98+
}
99+
100+
func runMemStats(f *fetchbot.Fetcher, tick time.Duration) {
101+
var mu sync.Mutex
102+
var di *fetchbot.DebugInfo
103+
104+
// Start goroutine to collect fetchbot debug info
105+
go func() {
106+
for v := range f.Debug() {
107+
mu.Lock()
108+
di = v
109+
mu.Unlock()
110+
}
111+
}()
112+
// Start ticker goroutine to print mem stats at regular intervals
113+
go func() {
114+
c := time.Tick(tick)
115+
for _ = range c {
116+
mu.Lock()
117+
printMemStats(di)
118+
mu.Unlock()
119+
}
120+
}()
121+
}
122+
123+
func printMemStats(di *fetchbot.DebugInfo) {
124+
var mem runtime.MemStats
125+
runtime.ReadMemStats(&mem)
126+
buf := bytes.NewBuffer(nil)
127+
buf.WriteString(strings.Repeat("=", 72) + "\n")
128+
buf.WriteString("Memory Profile:\n")
129+
buf.WriteString(fmt.Sprintf("\tAlloc: %d Kb\n", mem.Alloc/1024))
130+
buf.WriteString(fmt.Sprintf("\tTotalAlloc: %d Kb\n", mem.TotalAlloc/1024))
131+
buf.WriteString(fmt.Sprintf("\tNumGC: %d\n", mem.NumGC))
132+
buf.WriteString(fmt.Sprintf("\tGoroutines: %d\n", runtime.NumGoroutine()))
133+
if di != nil {
134+
buf.WriteString(fmt.Sprintf("\tNumHosts: %d\n", di.NumHosts))
135+
}
136+
buf.WriteString(strings.Repeat("=", 72))
137+
fmt.Println(buf.String())
138+
}
139+
140+
func stopHandler(stopurl string, wrapped fetchbot.Handler) fetchbot.Handler {
19141
return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
20-
if err != nil {
21-
fmt.Printf("error: %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
142+
if ctx.Cmd.URL().String() == stopurl {
143+
ctx.Q.Close()
22144
return
23145
}
24-
h.Handle(ctx, res, err)
146+
wrapped.Handle(ctx, res, err)
25147
})
26148
}
27149

28-
func LinksHandler(h fetchbot.Handler, host string) fetchbot.Handler {
150+
func logHandler(wrapped fetchbot.Handler) fetchbot.Handler {
29151
return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
30-
// Save as fetched once
31-
mu.Lock()
32-
dup[ctx.Cmd.URL().String()] = true
33-
mu.Unlock()
34-
35-
// Handle if text/html, otherwise continue. Limit fetched pages to the specified host only
36-
// (linked pages to other hosts will produce a HEAD request and a log entry, but no further
37-
// crawling).
38-
if ctx.Cmd.URL().Host == host && strings.HasPrefix(res.Header.Get("Content-Type"), "text/html") {
39-
switch ctx.Cmd.Method() {
40-
case "GET":
41-
// Process the body to find the links
42-
doc, err := goquery.NewDocumentFromResponse(res)
43-
if err != nil {
44-
fmt.Printf("error: parse goquery %s - %s\n", ctx.Cmd.URL(), err)
45-
}
46-
// Enqueue all links as HEAD requests, unless it is a duplicate
47-
mu.Lock()
48-
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
49-
val, _ := s.Attr("href")
50-
// Resolve address
51-
u, err := ctx.Cmd.URL().Parse(val)
52-
if err != nil {
53-
fmt.Printf("error: resolve URL %s - %s\n", val, err)
54-
return
55-
}
56-
if !dup[u.String()] {
57-
if _, err := ctx.Chan.EnqueueHead(u.String()); err != nil {
58-
fmt.Printf("error: enqueue head %s - %s\n", u, err)
59-
} else {
60-
dup[u.String()] = true
61-
}
62-
}
63-
})
64-
mu.Unlock()
65-
// Exit, since logging is done on HEAD
66-
return
67-
68-
case "HEAD":
69-
// Enqueue as a GET, we want the body. Don't check for duplicate, since it is one
70-
// by definition.
71-
if _, err := ctx.Chan.EnqueueGet(ctx.Cmd.URL().String()); err != nil {
72-
fmt.Printf("error: enqueue get %s - %s\n", ctx.Cmd.URL(), err)
73-
}
74-
}
152+
if err == nil {
153+
fmt.Printf("[%d] %s %s - %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL(), res.Header.Get("Content-Type"))
75154
}
76-
// Continue with wrapped handler
77-
h.Handle(ctx, res, err)
155+
wrapped.Handle(ctx, res, err)
78156
})
79157
}
80158

81-
func LogHandler(ctx *fetchbot.Context, res *http.Response, err error) {
82-
fmt.Printf("%s %s [%d]\n", res.Header.Get("Content-Type"), ctx.Cmd.URL(), res.StatusCode)
83-
}
84-
85-
// TODO : Print mem and goro stats once in a while
86-
func main() {
87-
const home = "http://golang.org"
88-
89-
// Create the Fetcher
90-
f := fetchbot.New(ErrHandler(LinksHandler(fetchbot.HandlerFunc(LogHandler), "golang.org")))
91-
// Start
92-
q := f.Start()
93-
// Enqueue the Go home page
94-
_, err := q.EnqueueHead(home)
95-
if err != nil {
96-
fmt.Printf("error: enqueue head %s - %s\n", home, err)
97-
}
98-
// Must be manually stopped (Ctrl-C)
99-
select {}
159+
func enqueueLinks(ctx *fetchbot.Context, doc *goquery.Document) {
160+
mu.Lock()
161+
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
162+
val, _ := s.Attr("href")
163+
// Resolve address
164+
u, err := ctx.Cmd.URL().Parse(val)
165+
if err != nil {
166+
fmt.Printf("error: resolve URL %s - %s\n", val, err)
167+
return
168+
}
169+
if !dup[u.String()] {
170+
if _, err := ctx.Q.SendStringHead(u.String()); err != nil {
171+
fmt.Printf("error: enqueue head %s - %s\n", u, err)
172+
} else {
173+
dup[u.String()] = true
174+
}
175+
}
176+
})
177+
mu.Unlock()
100178
}

0 commit comments

Comments
 (0)