Skip to content

Commit

Permalink
feat: add new feature
Browse files Browse the repository at this point in the history
  • Loading branch information
marcotuna committed Jul 20, 2023
1 parent cba80b3 commit 159f62d
Show file tree
Hide file tree
Showing 7 changed files with 856 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@

# Go workspace file
go.work

dump/
.cache/
142 changes: 142 additions & 0 deletions cmd/wp-go-static/commands/root.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
package commands

import (
"crypto/tls"
"fmt"
"net/http"
"net/url"
"regexp"
"strings"
"wp-go-static/pkg/file"

"github.com/gocolly/colly"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)

// Run executes the root command with args as its command-line arguments and
// returns whatever error the command execution reports.
func Run(args []string) error {
	root := RootCmd
	root.SetArgs(args)

	return root.Execute()
}

// RootCmd is the top-level cobra command of the wp-go-static CLI. Running it
// invokes rootCmdF.
var RootCmd = &cobra.Command{
	RunE:  rootCmdF,
	Use:   "wp-go-static",
	Short: "Wordpress Go Static",
	Long:  "Wordpress Go Static is a tool to download a Wordpress website and make it static",
}

// init registers the persistent command-line flags on RootCmd and binds them
// to viper keys of the same name.
//
// It deliberately does NOT execute the command: execution is triggered by Run
// (called from main). Executing here as well would run the whole crawl once
// during package initialisation and a second time from main.
func init() {
	flags := RootCmd.PersistentFlags()

	// Define command-line flags.
	flags.String("dir", "dump", "directory to save downloaded files")
	flags.String("url", "https://wp-just-expertise.mstudio.work", "URL to scrape")

	// Bind command-line flags to Viper. BindPFlag is handed the flags that
	// were registered just above, so a failure here is a programming error
	// rather than a runtime condition.
	for _, name := range []string{"dir", "url"} {
		if err := viper.BindPFlag(name, flags.Lookup(name)); err != nil {
			panic(fmt.Sprintf("binding flag %q to viper: %v", name, err))
		}
	}
}

// rootCmdF is the RunE handler of RootCmd. It crawls the site named by the
// "url" flag, restricted to that URL's hostname, and hands every response to
// the file package together with the directory named by the "dir" flag.
//
// For each response it:
//  1. asks file.HandleFile for the target directory and file name,
//  2. queues every absolute http(s) URL found inside a CSS url(...) expression,
//  3. strips the site's own origin (plain and JSON-escaped forms, plus the bare
//     hostname) from the body,
//  4. passes the rewritten response to file.SaveFile.
//
// It returns an error when a flag cannot be read, when the URL cannot be
// parsed or has no host, or when the initial visit fails. Errors from
// follow-up visits and from saving individual files do not abort the crawl.
func rootCmdF(command *cobra.Command, args []string) error {
	commandDir, err := command.Flags().GetString("dir")
	if err != nil {
		return fmt.Errorf("reading dir flag: %w", err)
	}
	commandURL, err := command.Flags().GetString("url")
	if err != nil {
		return fmt.Errorf("reading url flag: %w", err)
	}

	parsedURL, err := url.Parse(commandURL)
	if err != nil {
		return fmt.Errorf("parsing url %q: %w", commandURL, err)
	}
	domain := parsedURL.Hostname()
	if domain == "" {
		// Without a host there is nothing to restrict the crawl to.
		return fmt.Errorf("url %q has no host", commandURL)
	}

	// Compiled once for the whole crawl instead of once per response. The
	// pattern accepts optionally quoted URLs and stops at the first quote,
	// closing parenthesis or whitespace, so several url(...) expressions on
	// one line of minified CSS are matched separately.
	cssURLPattern := regexp.MustCompile(`url\(\s*['"]?(https?://[^'")\s]+)['"]?\s*\)`)

	// Every spelling of the site's own origin that is removed from saved
	// bodies. Order matters: the bare hostname must come last so the scheme
	// prefixes are removed together with it.
	originForms := []string{
		"http://" + domain,
		`http:\/\/` + domain,
		"https://" + domain,
		`https:\/\/` + domain,
		domain,
	}

	c := colly.NewCollector(
		colly.CacheDir(".cache"),
	)

	// Ignore SSL errors.
	// NOTE(review): InsecureSkipVerify disables certificate verification for
	// every request made by the collector; confirm this is intended outside
	// of development setups.
	c.WithTransport(&http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	})

	// Visit only pages that are part of the website.
	c.AllowedDomains = []string{domain}

	// Elements whose attribute points at something worth fetching: page
	// links, stylesheets and other <link> targets, scripts and images.
	linkSources := []struct {
		selector string
		attr     string
	}{
		{selector: "a[href]", attr: "href"},
		{selector: "link[href]", attr: "href"},
		{selector: "script[src]", attr: "src"},
		{selector: "img", attr: "src"},
	}
	for _, src := range linkSources {
		attr := src.attr // per-iteration copy for the closure (go.mod targets Go 1.20)
		c.OnHTML(src.selector, func(e *colly.HTMLElement) {
			link := e.Attr(attr)
			if link == "" {
				// e.g. an <img> without a src attribute.
				return
			}
			// Best effort: Visit's error is ignored so one bad link
			// does not stop the crawl.
			_ = c.Visit(e.Request.AbsoluteURL(link))
		})
	}

	// Before making a request print "Visiting ...".
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	c.OnResponse(func(r *colly.Response) {
		dir, fileName := file.HandleFile(r, commandDir)

		// Queue assets referenced from CSS. This must happen before the
		// origin is stripped below, because only absolute URLs are matched.
		for _, match := range cssURLPattern.FindAllSubmatch(r.Body, -1) {
			assetURL := string(match[1])
			fmt.Printf("Visiting from CSS: '%s'\n", assetURL)
			_ = c.Visit(assetURL) // best effort, see above
		}

		// Replace all occurrences of the base URL with a relative URL,
		// converting the body to a string only once.
		body := string(r.Body)
		for _, form := range originForms {
			body = strings.ReplaceAll(body, form, "")
		}
		r.Body = []byte(body)

		if err := file.SaveFile(r, dir, fileName); err != nil {
			fmt.Println(err)
		}
	})

	// Start scraping.
	return c.Visit(commandURL)
}
12 changes: 12 additions & 0 deletions cmd/wp-go-static/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package main

import (
"os"
"wp-go-static/cmd/wp-go-static/commands"
)

// main forwards the process arguments (without the program name) to
// commands.Run and exits with status 1 when it reports an error.
func main() {
	err := commands.Run(os.Args[1:])
	if err == nil {
		return
	}
	os.Exit(1)
}
41 changes: 41 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
module wp-go-static

go 1.20

require (
github.com/gocolly/colly v1.2.0
github.com/spf13/cobra v1.7.0
github.com/spf13/viper v1.16.0
)

require (
github.com/PuerkitoBio/goquery v1.8.1 // indirect
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/antchfx/htmlquery v1.3.0 // indirect
github.com/antchfx/xmlquery v1.3.17 // indirect
github.com/antchfx/xpath v1.2.4 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/spf13/afero v1.9.5 // indirect
github.com/spf13/cast v1.5.1 // indirect
github.com/spf13/jwalterweatherman v1.1.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/subosito/gotenv v1.4.2 // indirect
github.com/temoto/robotstxt v1.1.2 // indirect
golang.org/x/net v0.12.0 // indirect
golang.org/x/sys v0.10.0 // indirect
golang.org/x/text v0.11.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.30.0 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
Loading

0 comments on commit 159f62d

Please sign in to comment.