@@ -1,100 +1,178 @@
 package main
 
 import (
+	"bytes"
+	"flag"
 	"fmt"
 	"net/http"
+	"runtime"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/PuerkitoBio/fetchbot"
 	"github.com/PuerkitoBio/goquery"
 )
 
 var (
-	dup = make(map[string]bool)
-	mu  sync.Mutex
+	// Starting URL to crawl
+	seed = "http://golang.org"
+	// Duplicates table
+	dup = map[string]bool{seed: true}
+	// Protect access to dup
+	mu sync.Mutex
+
+	// Command-line flags
+	stopAfter = flag.Duration("stopafter", 0, "automatically stop the fetchbot after a given time")
+	stopAtUrl = flag.String("stopat", "", "automatically stop the fetchbot at a given URL")
+	memStats  = flag.Duration("memstats", 0, "display memory statistics at a given interval")
 )
 
-func ErrHandler(h fetchbot.Handler) fetchbot.Handler {
+func main() {
+	flag.Parse()
+
+	// Create the muxer
+	mux := fetchbot.NewMux()
+
+	// Handle all errors the same
+	mux.HandleErrors(fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
+		fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
+	}))
+
+	// Handle GET requests for html responses, to parse the body and enqueue all links as HEAD
+	// requests.
+	mux.Response().Method("GET").ContentType("text/html").HandleFunc(
+		func(ctx *fetchbot.Context, res *http.Response, err error) {
+			// Process the body to find the links
+			doc, err := goquery.NewDocumentFromResponse(res)
+			if err != nil {
+				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
+				return
+			}
+			// Enqueue all links as HEAD requests
+			enqueueLinks(ctx, doc)
+		})
+
+	// Handle HEAD requests for html responses coming from the source host - we don't want
+	// to crawl links from other hosts.
+	mux.Response().Method("HEAD").Host("golang.org").ContentType("text/html").HandleFunc(
+		func(ctx *fetchbot.Context, res *http.Response, err error) {
+			if _, err := ctx.Q.SendStringGet(ctx.Cmd.URL().String()); err != nil {
+				fmt.Printf("[ERR] %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
+			}
+		})
+
+	// Create the Fetcher, handle the logging first, then dispatch to the Muxer
+	h := logHandler(mux)
+	if *stopAtUrl != "" {
+		h = stopHandler(*stopAtUrl, logHandler(mux))
+	}
+	f := fetchbot.New(h)
+	// First mem stat print must be right after creating the fetchbot
+	if *memStats > 0 {
+		// Print starting stats
+		printMemStats(nil)
+		// Run at regular intervals
+		runMemStats(f, *memStats)
+		// On exit, print ending stats after a GC
+		defer func() {
+			runtime.GC()
+			printMemStats(nil)
+		}()
+	}
+	// Start processing
+	q := f.Start()
+	if *stopAfter > 0 {
+		go func() {
+			c := time.After(*stopAfter)
+			<-c
+			q.Close()
+		}()
+	}
+	// Enqueue the seed, which is the first entry in the dup map
+	_, err := q.SendStringGet(seed)
+	if err != nil {
+		fmt.Printf("[ERR] GET %s - %s\n", seed, err)
+	}
+	q.Block()
+}
+
+func runMemStats(f *fetchbot.Fetcher, tick time.Duration) {
+	var mu sync.Mutex
+	var di *fetchbot.DebugInfo
+
+	// Start goroutine to collect fetchbot debug info
+	go func() {
+		for v := range f.Debug() {
+			mu.Lock()
+			di = v
+			mu.Unlock()
+		}
+	}()
+	// Start ticker goroutine to print mem stats at regular intervals
+	go func() {
+		c := time.Tick(tick)
+		for range c {
+			mu.Lock()
+			printMemStats(di)
+			mu.Unlock()
+		}
+	}()
+}
+
+func printMemStats(di *fetchbot.DebugInfo) {
+	var mem runtime.MemStats
+	runtime.ReadMemStats(&mem)
+	buf := bytes.NewBuffer(nil)
+	buf.WriteString(strings.Repeat("=", 72) + "\n")
+	buf.WriteString("Memory Profile:\n")
+	buf.WriteString(fmt.Sprintf("\tAlloc: %d Kb\n", mem.Alloc/1024))
+	buf.WriteString(fmt.Sprintf("\tTotalAlloc: %d Kb\n", mem.TotalAlloc/1024))
+	buf.WriteString(fmt.Sprintf("\tNumGC: %d\n", mem.NumGC))
+	buf.WriteString(fmt.Sprintf("\tGoroutines: %d\n", runtime.NumGoroutine()))
+	if di != nil {
+		buf.WriteString(fmt.Sprintf("\tNumHosts: %d\n", di.NumHosts))
+	}
+	buf.WriteString(strings.Repeat("=", 72))
+	fmt.Println(buf.String())
+}
+
+func stopHandler(stopurl string, wrapped fetchbot.Handler) fetchbot.Handler {
 	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
-		if err != nil {
-			fmt.Printf("error: %s %s - %s\n", ctx.Cmd.Method(), ctx.Cmd.URL(), err)
+		if ctx.Cmd.URL().String() == stopurl {
+			ctx.Q.Close()
 			return
 		}
-		h.Handle(ctx, res, err)
+		wrapped.Handle(ctx, res, err)
 	})
 }
 
-func LinksHandler(h fetchbot.Handler, host string) fetchbot.Handler {
+func logHandler(wrapped fetchbot.Handler) fetchbot.Handler {
 	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
-		// Save as fetched once
-		mu.Lock()
-		dup[ctx.Cmd.URL().String()] = true
-		mu.Unlock()
-
-		// Handle if text/html, otherwise continue. Limit fetched pages to the specified host only
-		// (linked pages to other hosts will produce a HEAD request and a log entry, but no further
-		// crawling).
-		if ctx.Cmd.URL().Host == host && strings.HasPrefix(res.Header.Get("Content-Type"), "text/html") {
-			switch ctx.Cmd.Method() {
-			case "GET":
-				// Process the body to find the links
-				doc, err := goquery.NewDocumentFromResponse(res)
-				if err != nil {
-					fmt.Printf("error: parse goquery %s - %s\n", ctx.Cmd.URL(), err)
-				}
-				// Enqueue all links as HEAD requests, unless it is a duplicate
-				mu.Lock()
-				doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
-					val, _ := s.Attr("href")
-					// Resolve address
-					u, err := ctx.Cmd.URL().Parse(val)
-					if err != nil {
-						fmt.Printf("error: resolve URL %s - %s\n", val, err)
-						return
-					}
-					if !dup[u.String()] {
-						if _, err := ctx.Chan.EnqueueHead(u.String()); err != nil {
-							fmt.Printf("error: enqueue head %s - %s\n", u, err)
-						} else {
-							dup[u.String()] = true
-						}
-					}
-				})
-				mu.Unlock()
-				// Exit, since logging is done on HEAD
-				return
-
-			case "HEAD":
-				// Enqueue as a GET, we want the body. Don't check for duplicate, since it is one
-				// by definition.
-				if _, err := ctx.Chan.EnqueueGet(ctx.Cmd.URL().String()); err != nil {
-					fmt.Printf("error: enqueue get %s - %s\n", ctx.Cmd.URL(), err)
-				}
-			}
+		if err == nil {
+			fmt.Printf("[%d] %s %s - %s\n", res.StatusCode, ctx.Cmd.Method(), ctx.Cmd.URL(), res.Header.Get("Content-Type"))
 		}
-		// Continue with wrapped handler
-		h.Handle(ctx, res, err)
+		wrapped.Handle(ctx, res, err)
 	})
 }
 
-func LogHandler(ctx *fetchbot.Context, res *http.Response, err error) {
-	fmt.Printf("%s %s [%d]\n", res.Header.Get("Content-Type"), ctx.Cmd.URL(), res.StatusCode)
-}
-
-// TODO : Print mem and goro stats once in a while
-func main() {
-	const home = "http://golang.org"
-
-	// Create the Fetcher
-	f := fetchbot.New(ErrHandler(LinksHandler(fetchbot.HandlerFunc(LogHandler), "golang.org")))
-	// Start
-	q := f.Start()
-	// Enqueue the Go home page
-	_, err := q.EnqueueHead(home)
-	if err != nil {
-		fmt.Printf("error: enqueue head %s - %s\n", home, err)
-	}
-	// Must be manually stopped (Ctrl-C)
-	select {}
+func enqueueLinks(ctx *fetchbot.Context, doc *goquery.Document) {
+	mu.Lock()
+	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
+		val, _ := s.Attr("href")
+		// Resolve address
+		u, err := ctx.Cmd.URL().Parse(val)
+		if err != nil {
+			fmt.Printf("error: resolve URL %s - %s\n", val, err)
+			return
+		}
+		if !dup[u.String()] {
+			if _, err := ctx.Q.SendStringHead(u.String()); err != nil {
+				fmt.Printf("error: enqueue head %s - %s\n", u, err)
+			} else {
+				dup[u.String()] = true
+			}
+		}
+	})
+	mu.Unlock()
 }
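
Note: stopHandler and logHandler above both take a fetchbot.Handler and return a new one, so cross-cutting behavior can be layered around the muxer. As a minimal sketch of how one more layer could be added in the same style (hypothetical code, not part of this commit; it assumes the same imports and the ctx.Q API used above), a visit-count limiter might look like:

// maxVisitsHandler is a hypothetical wrapper in the style of stopHandler:
// it closes the queue once n responses have been handled, then delegates
// every response to the wrapped handler as usual.
func maxVisitsHandler(n int, wrapped fetchbot.Handler) fetchbot.Handler {
	var mu sync.Mutex
	count := 0
	return fetchbot.HandlerFunc(func(ctx *fetchbot.Context, res *http.Response, err error) {
		wrapped.Handle(ctx, res, err)
		mu.Lock()
		count++
		done := count >= n
		mu.Unlock()
		if done {
			// Close the queue to stop the crawl, as stopHandler does.
			ctx.Q.Close()
		}
	})
}

It would slot into main as h = maxVisitsHandler(100, h). The flags defined in this commit can likewise bound a run from the command line, e.g. go run main.go -stopafter=30s -memstats=5s (assuming the file is saved as main.go).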