-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
856 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,3 +19,6 @@ | |
|
||
# Go workspace file | ||
go.work | ||
|
||
dump/ | ||
.cache/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
package commands | ||
|
||
import ( | ||
"crypto/tls" | ||
"fmt" | ||
"net/http" | ||
"net/url" | ||
"regexp" | ||
"strings" | ||
"wp-go-static/pkg/file" | ||
|
||
"github.com/gocolly/colly" | ||
"github.com/spf13/cobra" | ||
"github.com/spf13/viper" | ||
) | ||
|
||
// Run ... | ||
func Run(args []string) error { | ||
RootCmd.SetArgs(args) | ||
return RootCmd.Execute() | ||
} | ||
|
||
// RootCmd .. | ||
var RootCmd = &cobra.Command{ | ||
Use: "wp-go-static", | ||
Short: "Wordpress Go Static", | ||
Long: `Wordpress Go Static is a tool to download a Wordpress website and make it static`, | ||
RunE: rootCmdF, | ||
} | ||
|
||
func init() { | ||
// Define command-line flags | ||
RootCmd.PersistentFlags().String("dir", "dump", "directory to save downloaded files") | ||
RootCmd.PersistentFlags().String("url", "https://wp-just-expertise.mstudio.work", "URL to scrape") | ||
|
||
// Bind command-line flags to Viper | ||
viper.BindPFlag("dir", RootCmd.PersistentFlags().Lookup("dir")) | ||
viper.BindPFlag("url", RootCmd.PersistentFlags().Lookup("url")) | ||
|
||
// Execute root command | ||
if err := RootCmd.Execute(); err != nil { | ||
fmt.Println(err) | ||
} | ||
} | ||
|
||
func rootCmdF(command *cobra.Command, args []string) error { | ||
commandDir, _ := command.Flags().GetString("dir") | ||
commandURL, _ := command.Flags().GetString("url") | ||
|
||
c := colly.NewCollector( | ||
colly.CacheDir(".cache"), | ||
) | ||
|
||
// Ignore SSL errors | ||
c.WithTransport(&http.Transport{ | ||
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, | ||
}) | ||
|
||
parsedURL, err := url.Parse(commandURL) | ||
if err != nil { | ||
return err | ||
} | ||
domain := parsedURL.Hostname() | ||
|
||
// Visit only pages that are part of the website | ||
c.AllowedDomains = []string{domain} | ||
|
||
// On every a element which has href attribute call callback | ||
c.OnHTML("a[href]", func(e *colly.HTMLElement) { | ||
link := e.Attr("href") | ||
// Visit link found on page | ||
c.Visit(e.Request.AbsoluteURL(link)) | ||
}) | ||
|
||
// On every link element call callback | ||
c.OnHTML("link[href]", func(e *colly.HTMLElement) { | ||
link := e.Attr("href") | ||
// Download file found on page if it has a supported extension | ||
c.Visit(e.Request.AbsoluteURL(link)) | ||
}) | ||
|
||
// On every script element call callback | ||
c.OnHTML("script[src]", func(e *colly.HTMLElement) { | ||
link := e.Attr("src") | ||
// Download file found on page if it has a supported extension | ||
c.Visit(e.Request.AbsoluteURL(link)) | ||
}) | ||
|
||
// On every img element call callback | ||
c.OnHTML("img", func(e *colly.HTMLElement) { | ||
link := e.Attr("src") | ||
// Download image found on page | ||
c.Visit(e.Request.AbsoluteURL(link)) | ||
}) | ||
|
||
// Before making a request print "Visiting ..." | ||
c.OnRequest(func(r *colly.Request) { | ||
fmt.Println("Visiting", r.URL.String()) | ||
}) | ||
|
||
// On response | ||
c.OnResponse(func(r *colly.Response) { | ||
dir, fileName := file.HandleFile(r, commandDir) | ||
|
||
// Find all URLs in the CSS file | ||
cssUrls := regexp.MustCompile(`url\((https?://[^\s]+)\)`).FindAllStringSubmatch(string(r.Body), -1) | ||
|
||
// Download each referenced file | ||
for _, cssUrl := range cssUrls { | ||
url := strings.Trim(cssUrl[1], "'\"") | ||
if url == "" { | ||
continue | ||
} | ||
|
||
fmt.Printf("Visiting from CSS: '%s'\n", url) | ||
c.Visit(url) | ||
} | ||
|
||
optionList := []string{ | ||
fmt.Sprintf(`http://%s`, domain), | ||
fmt.Sprintf(`http:\/\/%s`, domain), | ||
fmt.Sprintf(`https://%s`, domain), | ||
fmt.Sprintf(`https:\/\/%s`, domain), | ||
domain, | ||
} | ||
|
||
for _, v := range optionList { | ||
// Replace all occurrences of the base URL with a relative URL | ||
replaceBody := strings.ReplaceAll(string(r.Body), v, "") | ||
r.Body = []byte(replaceBody) | ||
} | ||
|
||
err := file.SaveFile(r, dir, fileName) | ||
if err != nil { | ||
fmt.Println(err) | ||
return | ||
} | ||
}) | ||
|
||
// Start scraping | ||
return c.Visit(commandURL) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package main | ||
|
||
import ( | ||
"os" | ||
"wp-go-static/cmd/wp-go-static/commands" | ||
) | ||
|
||
func main() { | ||
if err := commands.Run(os.Args[1:]); err != nil { | ||
os.Exit(1) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
module wp-go-static | ||
|
||
go 1.20 | ||
|
||
require ( | ||
github.com/gocolly/colly v1.2.0 | ||
github.com/spf13/cobra v1.7.0 | ||
github.com/spf13/viper v1.16.0 | ||
) | ||
|
||
require ( | ||
github.com/PuerkitoBio/goquery v1.8.1 // indirect | ||
github.com/andybalholm/cascadia v1.3.1 // indirect | ||
github.com/antchfx/htmlquery v1.3.0 // indirect | ||
github.com/antchfx/xmlquery v1.3.17 // indirect | ||
github.com/antchfx/xpath v1.2.4 // indirect | ||
github.com/fsnotify/fsnotify v1.6.0 // indirect | ||
github.com/gobwas/glob v0.2.3 // indirect | ||
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect | ||
github.com/golang/protobuf v1.5.3 // indirect | ||
github.com/hashicorp/hcl v1.0.0 // indirect | ||
github.com/inconshreveable/mousetrap v1.1.0 // indirect | ||
github.com/kennygrant/sanitize v1.2.4 // indirect | ||
github.com/magiconair/properties v1.8.7 // indirect | ||
github.com/mitchellh/mapstructure v1.5.0 // indirect | ||
github.com/pelletier/go-toml/v2 v2.0.8 // indirect | ||
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect | ||
github.com/spf13/afero v1.9.5 // indirect | ||
github.com/spf13/cast v1.5.1 // indirect | ||
github.com/spf13/jwalterweatherman v1.1.0 // indirect | ||
github.com/spf13/pflag v1.0.5 // indirect | ||
github.com/subosito/gotenv v1.4.2 // indirect | ||
github.com/temoto/robotstxt v1.1.2 // indirect | ||
golang.org/x/net v0.12.0 // indirect | ||
golang.org/x/sys v0.10.0 // indirect | ||
golang.org/x/text v0.11.0 // indirect | ||
google.golang.org/appengine v1.6.7 // indirect | ||
google.golang.org/protobuf v1.30.0 // indirect | ||
gopkg.in/ini.v1 v1.67.0 // indirect | ||
gopkg.in/yaml.v3 v3.0.1 // indirect | ||
) |
Oops, something went wrong.