2 files changed: +7 -17

@@ -29,28 +29,20 @@ A concurrent web scraper that extracts links from a list of URLs.
 1. Make sure you have Go installed on your system.
 2. Save the code as a `.go` file (e.g., `scraper.go`).
-3. edit the list of URLs to fetch
-4. Run the program using `go run scraper.go`.
+3. Run the program using `go run scraper.go --urls="https://example.com,https://example2.com, ...etc"`.
 
 **Dependencies:**
 
 * `golang.org/x/net/html`: For HTML parsing.
 
 **Example Usage:**
 
-The program currently scrapes the following URLs:
-
-* "https://google.com"
-* "https://old.reddit.com/"
-* "https://timevko.website"
-
-You can modify the `urls` slice in the `main` function to scrape different websites.
+`go run fue.go --urls="https://timevko.website,https://old.reddit.com"`
 
 **Potential Improvements:**
 
 * **Error Handling:** More robust error handling for network requests and HTML parsing.
 * **Politeness:** Implement delays between requests to avoid overloading the target servers.
 * **Data Storage:** Store the extracted links in a file or database.
-* **Command-line Arguments:** Allow users to specify the URLs and other options through command-line arguments.
 * **Deduplication:** Remove duplicate links from the output.
 * **Advanced Extraction:** Use CSS selectors (e.g., with the `goquery` library) for more specific link extraction.
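Of the remaining improvement ideas, deduplication maps neatly onto the existing channel pipeline. Below is a minimal sketch, not part of this change: it assumes the consumer drains the workers' `results` channel of `map[string]string` link maps, and the `collect` helper and `seen` set are hypothetical names.

```go
package main

import "fmt"

// collect drains link maps from results and prints each link once.
// Hypothetical consumer: in the scraper, results would be the channel
// the workers send extracted links on.
func collect(results <-chan map[string]string) {
	seen := make(map[string]bool) // links already printed
	for links := range results {
		for href, text := range links {
			if seen[href] {
				continue // skip duplicates seen on earlier pages
			}
			seen[href] = true
			fmt.Printf("%s\t%s\n", href, text)
		}
	}
}

func main() {
	// Two pages that both link to example.com; it is printed once.
	results := make(chan map[string]string, 2)
	results <- map[string]string{"https://example.com": "Example"}
	results <- map[string]string{"https://example.com": "Example (again)"}
	close(results)
	collect(results)
}
```

A plain map works here because a single goroutine drains `results`; concurrent consumers would need a mutex or `sync.Map`.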
@@ -1,9 +1,11 @@
 package main
 
 import (
+	"flag"
 	"fmt"
 	"net/http"
 	"net/url"
+	"strings"
 	"sync"
 
 	"golang.org/x/net/html"
@@ -61,20 +63,16 @@ func extractLinks(doc *html.Node, baseURL *url.URL) map[string]string {
 
 func main() {
 	var wg sync.WaitGroup
-	// URLs to scrape
-	urls := []string{
-		"https://google.com",
-		"https://old.reddit.com/",
-		"https://timevko.website",
-	}
+	urlList := flag.String("urls", "https://google.com", "Comma separated list of URL's to crawl")
+	flag.Parse()
 	urlChan := make(chan string)
 	results := make(chan map[string]string)
 
 	for i := 1; i <= 3; i++ {
 		wg.Add(1)
 		go worker(i, urlChan, results, &wg)
 	}
-	for _, url := range urls {
+	for _, url := range strings.Split(*urlList, ",") {
 		urlChan <- url
 	}
 	close(urlChan)
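One caveat with the new flag handling: `strings.Split(*urlList, ",")` keeps any whitespace around the commas, so `--urls="https://a.com, https://b.com"` hands a URL with a leading space to a worker, and a trailing comma yields an empty entry. A minimal sketch of a more tolerant split; `splitURLs` is a hypothetical helper, not part of this diff:

```go
package main

import (
	"fmt"
	"strings"
)

// splitURLs is a hypothetical helper (not in this diff): it splits the
// comma-separated --urls value, trims stray whitespace, and drops empty
// entries left by trailing commas.
func splitURLs(list string) []string {
	var urls []string
	for _, u := range strings.Split(list, ",") {
		if u = strings.TrimSpace(u); u != "" {
			urls = append(urls, u)
		}
	}
	return urls
}

func main() {
	fmt.Println(splitURLs("https://example.com, https://old.reddit.com,"))
	// Prints: [https://example.com https://old.reddit.com]
}
```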