feat: variables are scoped to the respective command
marcotuna committed Sep 1, 2023
1 parent 42819c9 commit 773604d
Showing 2 changed files with 47 additions and 77 deletions.
81 changes: 27 additions & 54 deletions cmd/wp-go-static/commands/scrape.go
@@ -8,57 +8,28 @@ import (
"net/url"
"regexp"
"strings"
"sync"
"wp-go-static/pkg/file"

"github.com/gocolly/colly"
"github.com/spf13/cobra"
"github.com/spf13/pflag"
"github.com/spf13/viper"
)

// URLCache is a struct to hold the visited URLs
type URLCache struct {
mu sync.Mutex
urls map[string]bool
}

// Add adds a URL to the cache
func (c *URLCache) Add(url string) {
c.mu.Lock()
defer c.mu.Unlock()
c.urls[url] = true
}

// Get checks if a URL is in the cache
func (c *URLCache) Get(url string) bool {
c.mu.Lock()
defer c.mu.Unlock()
_, ok := c.urls[url]
return ok
}

type ScrapeConfig struct {
Dir string `mapstructure:"dir"`
URL string `mapstructure:"url"`
Cache string `mapstructure:"cache"`
ReplaceURL string `mapstructure:"replace-url"`
Replace bool `mapstructure:"replace"`
Parallel bool `mapstructure:"parallel"`
Images bool `mapstructure:"images"`
CheckHead bool `mapstructure:"check-head"`
}
"wp-go-static/internal/cache"
"wp-go-static/internal/config"
)

type Scrape struct {
urlCache *URLCache
urlCache *cache.URLCache
c *colly.Collector
domain string
hostname string
config ScrapeConfig
config config.Config
}

func NewScrape() *Scrape {
return &Scrape{
urlCache: &URLCache{urls: make(map[string]bool)},
urlCache: &cache.URLCache{URLs: make(map[string]bool)},
c: colly.NewCollector(),
}
}
@@ -70,6 +41,10 @@ var ScrapeCmd = &cobra.Command{
RunE: scrapeCmdF,
}

const (
bindFlagScrapePrefix = "scrape"
)

func init() {
// Define command-line flags
ScrapeCmd.PersistentFlags().String("dir", "dump", "directory to save downloaded files")
@@ -80,13 +55,12 @@ func init() {
ScrapeCmd.PersistentFlags().Bool("parallel", false, "Fetch in parallel")
ScrapeCmd.PersistentFlags().Bool("images", true, "Download images")
ScrapeCmd.PersistentFlags().Bool("check-head", true, "Checks head")
ScrapeCmd.MarkFlagRequired("url")
// ScrapeCmd.MarkPersistentFlagRequired("url")

// Bind command-line flags to Viper
err := viper.BindPFlags(ScrapeCmd.PersistentFlags())
if err != nil {
log.Fatal(err)
}
ScrapeCmd.PersistentFlags().VisitAll(func(flag *pflag.Flag) {
bindFlag := fmt.Sprintf("%s.%s", bindFlagScrapePrefix, flag.Name)
viper.BindPFlag(bindFlag, ScrapeCmd.PersistentFlags().Lookup(flag.Name))
})

RootCmd.AddCommand(ScrapeCmd)
}
@@ -95,25 +69,25 @@ func scrapeCmdF(command *cobra.Command, args []string) error {
scrape := NewScrape()
viper.Unmarshal(&scrape.config)

scrape.domain = scrape.config.URL
scrape.domain = scrape.config.Scrape.URL

if scrape.config.CheckHead {
if scrape.config.Scrape.CheckHead {
scrape.c.CheckHead = true
}

if scrape.config.Cache != "" {
log.Println("Using cache directory", scrape.config.Cache)
scrape.c.CacheDir = scrape.config.Cache
if scrape.config.Scrape.Cache != "" {
log.Println("Using cache directory", scrape.config.Scrape.Cache)
scrape.c.CacheDir = scrape.config.Scrape.Cache
}

scrape.c.Async = scrape.config.Parallel
scrape.c.Async = scrape.config.Scrape.Parallel

// Use a custom TLS config to verify server certificates
scrape.c.WithTransport(&http.Transport{
TLSClientConfig: &tls.Config{},
})

parsedURL, err := url.Parse(scrape.config.URL)
parsedURL, err := url.Parse(scrape.config.Scrape.URL)
if err != nil {
return err
}
@@ -175,7 +149,7 @@ func scrapeCmdF(command *cobra.Command, args []string) error {
// On response
scrape.c.OnResponse(func(r *colly.Response) {
rCopy := *r
dir, fileName := file.HandleFile(r, scrape.config.Dir)
dir, fileName := file.HandleFile(r, scrape.config.Scrape.Dir)
rCopy.Body = scrape.parseBody(r.Body)

err := file.SaveFile(&rCopy, dir, fileName)
@@ -187,7 +161,6 @@ func scrapeCmdF(command *cobra.Command, args []string) error {

urlsToVisit := []string{
"robots.txt",
// "sitemap.xml",
"favicon.ico",
}

@@ -224,7 +197,7 @@ func (s *Scrape) visitURL(link string) {
}

if u.Scheme == "" || u.Host == "" {
log.Printf("Invalid URL %s", link)
log.Printf("Invalid URL: %s", link)
return
}

@@ -254,7 +227,7 @@ func (s *Scrape) parseBody(body []byte) []byte {
s.visitURL(link)
}

if s.config.Replace {
if s.config.Scrape.Replace {
optionList := []string{
fmt.Sprintf(`http://%s`, s.hostname),
fmt.Sprintf(`http:\/\/%s`, s.hostname),
@@ -264,7 +237,7 @@ func (s *Scrape) parseBody(body []byte) []byte {

for _, option := range optionList {
// Replace all occurrences of the base URL with a relative URL
replaceBody := strings.ReplaceAll(string(body), option, s.config.ReplaceURL)
replaceBody := strings.ReplaceAll(string(body), option, s.config.Scrape.ReplaceURL)
body = []byte(replaceBody)
}
}
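Note: the new wp-go-static/internal/cache package that scrape.go now imports is not included in this commit. Based on the URLCache code removed above and the cache.URLCache{URLs: make(map[string]bool)} literal in NewScrape, the moved type plausibly looks like the sketch below; everything beyond the URLs field and the Add/Get methods seen in the old code is an assumption.

// Package cache holds the visited-URL cache that previously lived in scrape.go.
package cache

import "sync"

// URLCache records which URLs have already been visited. URLs is exported
// so callers can construct the cache directly, as NewScrape does.
type URLCache struct {
	mu   sync.Mutex
	URLs map[string]bool
}

// Add marks a URL as visited.
func (c *URLCache) Add(url string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.URLs[url] = true
}

// Get reports whether a URL has already been visited.
func (c *URLCache) Get(url string) bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	_, ok := c.URLs[url]
	return ok
}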
43 changes: 20 additions & 23 deletions cmd/wp-go-static/commands/sitemap.go
@@ -2,61 +2,58 @@ package commands

import (
"fmt"
"log"
"net/url"
"strings"

"wp-go-static/internal/config"
goSitemap "wp-go-static/pkg/sitemap"

"github.com/spf13/cobra"
"github.com/spf13/pflag"
"github.com/spf13/viper"
)

type SitemapConfig struct {
Dir string `mapstructure:"dir"`
URL string `mapstructure:"url"`
ReplaceURL string `mapstructure:"replace-url"`
SitemapFile string `mapstructure:"sitemap-file"`
}

// SitemapCmd ...
var SitemapCmd = &cobra.Command{
Use: "sitemap",
Short: "Create sitemap from the Wordpress website",
RunE: sitemapCmdF,
}

const (
bindFlagSitemapPrefix = "sitemap"
)

func init() {
// Define command-line flags
SitemapCmd.PersistentFlags().String("dir", "dump", "directory to save downloaded files")
SitemapCmd.PersistentFlags().String("url", "", "URL to scrape")
SitemapCmd.PersistentFlags().String("replace-url", "", "Replace with a specific url")
SitemapCmd.PersistentFlags().String("sitemap-file", "sitemap.xml", "Output sitemap file name")
SitemapCmd.MarkFlagRequired("url")
SitemapCmd.PersistentFlags().String("file", "sitemap.xml", "Output sitemap file name")

// Bind command-line flags to Viper
err := viper.BindPFlags(SitemapCmd.PersistentFlags())
if err != nil {
log.Fatal(err)
}
SitemapCmd.PersistentFlags().VisitAll(func(flag *pflag.Flag) {
bindFlag := fmt.Sprintf("%s.%s", bindFlagSitemapPrefix, flag.Name)
viper.BindPFlag(bindFlag, SitemapCmd.PersistentFlags().Lookup(flag.Name))
})

RootCmd.AddCommand(SitemapCmd)
}

func sitemapCmdF(command *cobra.Command, args []string) error {
sitemapConfig := SitemapConfig{}
viper.Unmarshal(&sitemapConfig)
config := config.Config{}
viper.Unmarshal(&config)

smap, err := goSitemap.Get(sitemapConfig.URL, nil)
smap, err := goSitemap.Get(config.Sitemap.URL, nil)
if err != nil {
fmt.Println(err)
}

for i := range smap.URL {
// Replace the URL with the url from the replace-url argument
// Only with the URL part, persist the URL path and query
if sitemapConfig.ReplaceURL != "" {
currentURL, _ := url.Parse(sitemapConfig.URL)
if config.Sitemap.ReplaceURL != "" {
currentURL, _ := url.Parse(config.Sitemap.URL)

optionList := []string{
fmt.Sprintf(`http://%s`, currentURL.Host),
@@ -70,7 +67,7 @@ func sitemapCmdF(command *cobra.Command, args []string) error {
fmt.Println("Index out of range for smap.URL")
break
}
smap.URL[i].Loc = strings.ReplaceAll(string(smap.URL[i].Loc), option, sitemapConfig.ReplaceURL)
smap.URL[i].Loc = strings.ReplaceAll(string(smap.URL[i].Loc), option, config.Sitemap.ReplaceURL)

// for j := range smap.Image {
// if i >= len(smap.URL) {
@@ -96,9 +93,9 @@ func sitemapCmdF(command *cobra.Command, args []string) error {
fmt.Printf("%s\n", printSmap)

// Write the Sitemap to a file
if sitemapConfig.SitemapFile != "" {
fmt.Printf("Writing sitemap to %s/%s\n", sitemapConfig.Dir, sitemapConfig.SitemapFile)
return smap.Save(sitemapConfig.Dir, sitemapConfig.SitemapFile)
if config.Sitemap.File != "" {
fmt.Printf("Writing sitemap to %s/%s\n", config.Sitemap.Dir, config.Sitemap.File)
return smap.Save(config.Sitemap.Dir, config.Sitemap.File)
}

return nil
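Note: both commands now unmarshal into a shared config.Config from wp-go-static/internal/config, which is also not part of this commit. Given the scrape.* and sitemap.* Viper keys bound in init() and the fields accessed in the diff, a minimal sketch of that package could look like the following; the exact names and tags are inferred, not confirmed by the diff.

// Package config groups per-command settings so that flags with the same
// name (for example "url" and "dir") no longer collide between commands.
package config

// ScrapeConfig mirrors the flags registered on the scrape command.
type ScrapeConfig struct {
	Dir        string `mapstructure:"dir"`
	URL        string `mapstructure:"url"`
	Cache      string `mapstructure:"cache"`
	ReplaceURL string `mapstructure:"replace-url"`
	Replace    bool   `mapstructure:"replace"`
	Parallel   bool   `mapstructure:"parallel"`
	Images     bool   `mapstructure:"images"`
	CheckHead  bool   `mapstructure:"check-head"`
}

// SitemapConfig mirrors the flags registered on the sitemap command.
type SitemapConfig struct {
	Dir        string `mapstructure:"dir"`
	URL        string `mapstructure:"url"`
	ReplaceURL string `mapstructure:"replace-url"`
	File       string `mapstructure:"file"`
}

// Config is the root structure that viper.Unmarshal fills; the mapstructure
// tags match the "scrape." and "sitemap." prefixes used when binding flags.
type Config struct {
	Scrape  ScrapeConfig  `mapstructure:"scrape"`
	Sitemap SitemapConfig `mapstructure:"sitemap"`
}

With this layout, the same flag name resolves independently per command (for example viper.GetString("scrape.url") versus viper.GetString("sitemap.url")), which is what the commit title refers to.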
