From 7c1e11332631e286a48ad7720cdab158deb3ad24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Thu, 9 May 2024 17:48:11 +0300 Subject: [PATCH 1/3] deduplicate lines in files in store-field-dir --- cmd/katana/main.go | 52 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/cmd/katana/main.go b/cmd/katana/main.go index 0d4ac4e7..ff415265 100644 --- a/cmd/katana/main.go +++ b/cmd/katana/main.go @@ -1,6 +1,7 @@ package main import ( + "bufio" "fmt" "math" "os" @@ -68,13 +69,62 @@ func main() { gologger.Fatal().Msgf("could not execute crawling: %s", err) } - // on successful execution remove the resume file in case it exists + // on successful execution: + + // deduplicate the lines in each file in the store-field-dir + //use options.StoreFieldDir once https://github.com/projectdiscovery/katana/pull/877 is merged + storeFieldDir := "katana_field" + _ = deduplicateLinesInFilesInDir(storeFieldDir) + + // remove the resume file in case it exists if fileutil.FileExists(resumeFilename) { os.Remove(resumeFilename) } } +func deduplicateLinesInFilesInDir(dir string) error { + err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !info.IsDir() { + return deduplicateLinesInFile(path) + } + return nil + }) + if err != nil { + return errorutil.NewWithErr(err).Msgf("error processing directory %s", dir) + } + return nil +} + +func deduplicateLinesInFile(filename string) error { + file, err := os.Open(filename) + if err != nil { + return errorutil.NewWithErr(err).Msgf("could not open file: %s", filename) + } + defer file.Close() + + seenLines := make(map[string]struct{}) + var deduplicatedLines []string + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + if _, exists := seenLines[line]; !exists { + seenLines[line] = struct{}{} + deduplicatedLines = append(deduplicatedLines, line) + } + } + + if err := scanner.Err(); err != nil { + return errorutil.NewWithErr(err).Msgf("could not read file: %s", filename) + } + + return os.WriteFile(filename, []byte(strings.Join(deduplicatedLines, "\n")+"\n"), 0644) +} + func readFlags() (*goflags.FlagSet, error) { flagSet := goflags.NewFlagSet() flagSet.SetDescription(`Katana is a fast crawler focused on execution in automation From 77946c00a8fd8f9035486c1741d077baa576fbc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Fri, 10 May 2024 11:52:50 +0300 Subject: [PATCH 2/3] update utils --- cmd/katana/main.go | 46 ++-------------------------------------------- go.mod | 4 ++-- go.sum | 3 +++ 3 files changed, 7 insertions(+), 46 deletions(-) diff --git a/cmd/katana/main.go b/cmd/katana/main.go index ff415265..09334f44 100644 --- a/cmd/katana/main.go +++ b/cmd/katana/main.go @@ -1,7 +1,6 @@ package main import ( - "bufio" "fmt" "math" "os" @@ -18,6 +17,7 @@ import ( "github.com/projectdiscovery/katana/pkg/types" errorutil "github.com/projectdiscovery/utils/errors" fileutil "github.com/projectdiscovery/utils/file" + folderutil "github.com/projectdiscovery/utils/folder" "github.com/rs/xid" ) @@ -74,7 +74,7 @@ func main() { // deduplicate the lines in each file in the store-field-dir //use options.StoreFieldDir once https://github.com/projectdiscovery/katana/pull/877 is merged storeFieldDir := "katana_field" - _ = deduplicateLinesInFilesInDir(storeFieldDir) + _ = folderutil.DedupeLinesInFiles(storeFieldDir) // remove the resume file in case it exists if fileutil.FileExists(resumeFilename) { @@ -83,48 +83,6 @@ func main() { } -func deduplicateLinesInFilesInDir(dir string) error { - err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if !info.IsDir() { - return deduplicateLinesInFile(path) - } - return nil - }) - if err != nil { - return errorutil.NewWithErr(err).Msgf("error processing directory %s", dir) - } - return nil -} - -func deduplicateLinesInFile(filename string) error { - file, err := os.Open(filename) - if err != nil { - return errorutil.NewWithErr(err).Msgf("could not open file: %s", filename) - } - defer file.Close() - - seenLines := make(map[string]struct{}) - var deduplicatedLines []string - - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - if _, exists := seenLines[line]; !exists { - seenLines[line] = struct{}{} - deduplicatedLines = append(deduplicatedLines, line) - } - } - - if err := scanner.Err(); err != nil { - return errorutil.NewWithErr(err).Msgf("could not read file: %s", filename) - } - - return os.WriteFile(filename, []byte(strings.Join(deduplicatedLines, "\n")+"\n"), 0644) -} - func readFlags() (*goflags.FlagSet, error) { flagSet := goflags.NewFlagSet() flagSet.SetDescription(`Katana is a fast crawler focused on execution in automation diff --git a/go.mod b/go.mod index 754d8b46..fd980ef3 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/mitchellh/mapstructure v1.5.0 github.com/pkg/errors v0.9.1 github.com/projectdiscovery/dsl v0.0.52 - github.com/projectdiscovery/fastdialer v0.0.69 + github.com/projectdiscovery/fastdialer v0.0.70 github.com/projectdiscovery/goflags v0.1.51 github.com/projectdiscovery/gologger v1.1.12 github.com/projectdiscovery/hmap v0.0.41 @@ -20,7 +20,7 @@ require ( github.com/projectdiscovery/ratelimit v0.0.39 github.com/projectdiscovery/retryablehttp-go v1.0.59 github.com/projectdiscovery/useragent v0.0.47 - github.com/projectdiscovery/utils v0.0.92 + github.com/projectdiscovery/utils v0.0.93 github.com/projectdiscovery/wappalyzergo v0.0.121 github.com/remeh/sizedwaitgroup v1.0.0 github.com/rs/xid v1.5.0 diff --git a/go.sum b/go.sum index d0190144..367b0e12 100644 --- a/go.sum +++ b/go.sum @@ -210,6 +210,7 @@ github.com/projectdiscovery/dsl v0.0.52 h1:jvIvF+qN8+MbI1MHtWJJKfWqAZQlCExL3ob7S github.com/projectdiscovery/dsl v0.0.52/go.mod h1:xfcHwhy2HSaeGgh+1wqzOoCGm2XTdh5JzjBRBVHEMvI= github.com/projectdiscovery/fastdialer v0.0.69 h1:BfFQTyTB1hrw9sWCw4CjQfbmlpvnJCPZEmKtxcwJGbU= github.com/projectdiscovery/fastdialer v0.0.69/go.mod h1:RXrx7M2T3V3rMZ2h0X2/SsY93+RhgF/LmFa1E0MI3L8= +github.com/projectdiscovery/fastdialer v0.0.70/go.mod h1:HQ0ZpvOPOTZFSQxGyYJgNdek93hi4eIC1avZgiQ7+a4= github.com/projectdiscovery/goflags v0.1.51 h1:PhMekTX727L1YqBfP0of0clSygrq20RnWWp0+khAxqo= github.com/projectdiscovery/goflags v0.1.51/go.mod h1:muJxbcOFi4kzg0G30S526X6dY/OyuDInchYIkoZOudU= github.com/projectdiscovery/gologger v1.1.12 h1:uX/QkQdip4PubJjjG0+uk5DtyAi1ANPJUvpmimXqv4A= @@ -236,6 +237,8 @@ github.com/projectdiscovery/useragent v0.0.47 h1:VEOU7uG7TutZNIE0DZNP7hGAGi4bwLP github.com/projectdiscovery/useragent v0.0.47/go.mod h1:Cfk9X9SISYSCmqpej0r9+paJbDHzNHic2YdWQtpdz2M= github.com/projectdiscovery/utils v0.0.92 h1:lGCmjUJhzoNX4FQZWpp80058pRlD0/dYxLJOSs07EqY= github.com/projectdiscovery/utils v0.0.92/go.mod h1:d5uvD5qcRiK3qxZbBy9eatCqrCSuj9SObL04w/WgXSg= +github.com/projectdiscovery/utils v0.0.93 h1:IMZFsmQFYZUf7rxpBoZj+53FsNDC/vHsXA+4B4GuGeg= +github.com/projectdiscovery/utils v0.0.93/go.mod h1:2+mWzk5FeYdK9imo5eLk6oVeih0G0wsTff1pzBAh9tk= github.com/projectdiscovery/wappalyzergo v0.0.121 h1:Xae4Yw3/pzjh1TJQdoavoV2OTo4Tn5g8J8DQQqcieDA= github.com/projectdiscovery/wappalyzergo v0.0.121/go.mod h1:qW0PP+UBMcdQBBnwk+X6YYFs6huKNvn2BOVs4vQPru0= github.com/quic-go/quic-go v0.42.0 h1:uSfdap0eveIl8KXnipv9K7nlwZ5IqLlYOpJ58u5utpM= From ca9465c0eec84222fd975c003d37ba7af22e4c9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Fri, 10 May 2024 11:53:47 +0300 Subject: [PATCH 3/3] go mod tidy --- go.sum | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/go.sum b/go.sum index 367b0e12..4a159fe5 100644 --- a/go.sum +++ b/go.sum @@ -208,8 +208,7 @@ github.com/projectdiscovery/blackrock v0.0.1 h1:lHQqhaaEFjgf5WkuItbpeCZv2DUIE45k github.com/projectdiscovery/blackrock v0.0.1/go.mod h1:ANUtjDfaVrqB453bzToU+YB4cUbvBRpLvEwoWIwlTss= github.com/projectdiscovery/dsl v0.0.52 h1:jvIvF+qN8+MbI1MHtWJJKfWqAZQlCExL3ob7SddQbZE= github.com/projectdiscovery/dsl v0.0.52/go.mod h1:xfcHwhy2HSaeGgh+1wqzOoCGm2XTdh5JzjBRBVHEMvI= -github.com/projectdiscovery/fastdialer v0.0.69 h1:BfFQTyTB1hrw9sWCw4CjQfbmlpvnJCPZEmKtxcwJGbU= -github.com/projectdiscovery/fastdialer v0.0.69/go.mod h1:RXrx7M2T3V3rMZ2h0X2/SsY93+RhgF/LmFa1E0MI3L8= +github.com/projectdiscovery/fastdialer v0.0.70 h1:1rnUKc8NRj6dzG8aTmqW+RF8m0dFbfTs9CiOo6Aig8U= github.com/projectdiscovery/fastdialer v0.0.70/go.mod h1:HQ0ZpvOPOTZFSQxGyYJgNdek93hi4eIC1avZgiQ7+a4= github.com/projectdiscovery/goflags v0.1.51 h1:PhMekTX727L1YqBfP0of0clSygrq20RnWWp0+khAxqo= github.com/projectdiscovery/goflags v0.1.51/go.mod h1:muJxbcOFi4kzg0G30S526X6dY/OyuDInchYIkoZOudU= @@ -235,8 +234,6 @@ github.com/projectdiscovery/stringsutil v0.0.2 h1:uzmw3IVLJSMW1kEg8eCStG/cGbYYZA github.com/projectdiscovery/stringsutil v0.0.2/go.mod h1:EJ3w6bC5fBYjVou6ryzodQq37D5c6qbAYQpGmAy+DC0= github.com/projectdiscovery/useragent v0.0.47 h1:VEOU7uG7TutZNIE0DZNP7hGAGi4bwLPGM1X7Rny52s0= github.com/projectdiscovery/useragent v0.0.47/go.mod h1:Cfk9X9SISYSCmqpej0r9+paJbDHzNHic2YdWQtpdz2M= -github.com/projectdiscovery/utils v0.0.92 h1:lGCmjUJhzoNX4FQZWpp80058pRlD0/dYxLJOSs07EqY= -github.com/projectdiscovery/utils v0.0.92/go.mod h1:d5uvD5qcRiK3qxZbBy9eatCqrCSuj9SObL04w/WgXSg= github.com/projectdiscovery/utils v0.0.93 h1:IMZFsmQFYZUf7rxpBoZj+53FsNDC/vHsXA+4B4GuGeg= github.com/projectdiscovery/utils v0.0.93/go.mod h1:2+mWzk5FeYdK9imo5eLk6oVeih0G0wsTff1pzBAh9tk= github.com/projectdiscovery/wappalyzergo v0.0.121 h1:Xae4Yw3/pzjh1TJQdoavoV2OTo4Tn5g8J8DQQqcieDA=