From c1425a9b8b1e516e5ca305d32c0adcce43c9b446 Mon Sep 17 00:00:00 2001 From: johnerikhalse Date: Thu, 27 Apr 2023 12:37:54 +0200 Subject: [PATCH 1/4] Cleaned up reading of config varables --- cmd/cat/cat.go | 19 ++++--- cmd/completion.go | 2 +- cmd/convert/nedlib/nedlib.go | 6 --- cmd/ls/ls.go | 19 ++++--- cmd/root.go | 17 ------- cmd/validate/validate.go | 14 ++---- internal/flag/names.go | 98 +++++++++++++++++++++--------------- 7 files changed, 90 insertions(+), 85 deletions(-) diff --git a/cmd/cat/cat.go b/cmd/cat/cat.go index 3974d940..8c18001a 100644 --- a/cmd/cat/cat.go +++ b/cmd/cat/cat.go @@ -57,6 +57,13 @@ warc cat -n4 -P file1.warc.gz | feh -`, return errors.New("missing file name") } c.fileName = args[0] + c.offset = viper.GetInt64(flag.Offset) + c.recordCount = viper.GetInt(flag.RecordCount) + c.recordNum = viper.GetInt(flag.RecordNum) + c.showWarcHeader = viper.GetBool(flag.ShowWarcHeader) + c.showProtocolHeader = viper.GetBool(flag.ShowProtocolHeader) + c.showPayload = viper.GetBool(flag.ShowPayload) + if (c.offset >= 0 || c.recordNum >= 0) && c.recordCount == 0 { c.recordCount = 1 } @@ -75,12 +82,12 @@ warc cat -n4 -P file1.warc.gz | feh -`, }, } - cmd.Flags().Int64VarP(&c.offset, "offset", "o", -1, "print record at offset bytes") - cmd.Flags().IntVarP(&c.recordNum, "num", "n", -1, "print the n'th record. This is applied after records are filtered out by other options") - cmd.Flags().IntVarP(&c.recordCount, "record-count", "c", 0, "The maximum number of records to show. 
Defaults to show all records except if -o or -n option is set, then default is one.") - cmd.Flags().BoolVarP(&c.showWarcHeader, "header", "w", false, "show WARC header") - cmd.Flags().BoolVarP(&c.showProtocolHeader, "protocol-header", "p", false, "show protocol header") - cmd.Flags().BoolVarP(&c.showPayload, "payload", "P", false, "show payload") + cmd.Flags().Int64P(flag.Offset, "o", -1, flag.OffsetHelp) + cmd.Flags().IntP(flag.RecordNum, "n", -1, flag.RecordNumHelp) + cmd.Flags().IntP(flag.RecordCount, "c", 0, flag.RecordCountHelp+" Defaults to show all records except if -o or -n option is set, then default is one.") + cmd.Flags().BoolP(flag.ShowWarcHeader, "w", false, flag.ShowWarcHeaderHelp) + cmd.Flags().BoolP(flag.ShowProtocolHeader, "p", false, flag.ShowProtocolHeaderHelp) + cmd.Flags().BoolP(flag.ShowPayload, "P", false, flag.ShowPayloadHelp) cmd.Flags().StringArray(flag.RecordId, []string{}, flag.RecordIdHelp) cmd.Flags().StringSliceP(flag.RecordType, "t", []string{}, flag.RecordTypeHelp) cmd.Flags().StringP(flag.ResponseCode, "S", "", flag.ResponseCodeHelp) diff --git a/cmd/completion.go b/cmd/completion.go index ac2d30ab..b9991581 100644 --- a/cmd/completion.go +++ b/cmd/completion.go @@ -49,7 +49,7 @@ PowerShell: `, DisableFlagsInUseLine: true, ValidArgs: []string{"bash", "zsh", "fish", "powershell"}, - Args: cobra.ExactValidArgs(1), + Args: cobra.MatchAll(cobra.ExactArgs(1), cobra.OnlyValidArgs), Run: func(cmd *cobra.Command, args []string) { switch args[0] { case "bash": diff --git a/cmd/convert/nedlib/nedlib.go b/cmd/convert/nedlib/nedlib.go index 80f4c8c2..87566a01 100644 --- a/cmd/convert/nedlib/nedlib.go +++ b/cmd/convert/nedlib/nedlib.go @@ -58,12 +58,6 @@ func NewCommand() *cobra.Command { payload := &gowarc.WarcFields{} payload.Set("software", cmdversion.SoftwareVersion()) payload.Set("format", fmt.Sprintf("WARC File Format %d.%d", wc.WarcVersion.Minor(), wc.WarcVersion.Minor())) - //payload.Set("collection", 
ww.collectionConfig.GetMeta().GetName()) - //payload.Set("description", ww.collectionConfig.GetMeta().GetDescription()) - //if ww.subCollection != config.Collection_UNDEFINED { - // payload.Set("subCollection", ww.subCollection.String()) - //} - //payload.Set("isPartOf", ww.CollectionName()) h, e := os.Hostname() if e != nil { return e diff --git a/cmd/ls/ls.go b/cmd/ls/ls.go index 2a22379a..dfe05316 100644 --- a/cmd/ls/ls.go +++ b/cmd/ls/ls.go @@ -79,6 +79,13 @@ Output options: return errors.New("missing file or directory") } c.files = args + c.delimiter = viper.GetString(flag.Delimiter) + c.concurrency = viper.GetInt(flag.Concurrency) + c.offset = viper.GetInt64(flag.Offset) + c.recordCount = viper.GetInt(flag.RecordCount) + c.strict = viper.GetBool(flag.Strict) + c.fields = viper.GetString(flag.Fields) + if c.offset >= 0 && c.recordCount == 0 { c.recordCount = 1 // TODO: check that input is exactly one file when using offset @@ -101,12 +108,12 @@ Output options: cmd.Flags().BoolP(flag.Recursive, "r", false, flag.RecursiveHelp) cmd.Flags().BoolP(flag.FollowSymlinks, "s", false, flag.FollowSymlinksHelp) cmd.Flags().StringSlice(flag.Suffixes, []string{".warc", ".warc.gz"}, flag.SuffixesHelp) - cmd.Flags().IntVarP(&c.concurrency, flag.Concurrency, "c", 1, flag.ConcurrencyHelp) - cmd.Flags().Int64VarP(&c.offset, "offset", "o", -1, "record offset") - cmd.Flags().IntVarP(&c.recordCount, "record-count", "n", 0, "The maximum number of records to show") - cmd.Flags().BoolVar(&c.strict, "strict", false, "strict parsing") - cmd.Flags().StringVarP(&c.delimiter, "delimiter", "d", " ", "use string instead of SPACE for field delimiter") - cmd.Flags().StringVarP(&c.fields, "fields", "f", "", "which fields to include. 
See 'warc help ls' for a description") + cmd.Flags().IntP(flag.Concurrency, "c", 1, flag.ConcurrencyHelp) + cmd.Flags().Int64P(flag.Offset, "o", -1, flag.OffsetHelp) + cmd.Flags().IntP(flag.RecordCount, "n", 0, flag.RecordCountHelp) + cmd.Flags().Bool(flag.Strict, false, flag.StrictHelp) + cmd.Flags().StringP(flag.Delimiter, "d", " ", flag.DelimiterHelp) + cmd.Flags().StringP(flag.Fields, "f", "", flag.FieldsHelp) cmd.Flags().StringArray(flag.RecordId, []string{}, flag.RecordIdHelp) cmd.Flags().StringSliceP(flag.RecordType, "t", []string{}, flag.RecordTypeHelp) cmd.Flags().StringP(flag.ResponseCode, "S", "", flag.ResponseCodeHelp) diff --git a/cmd/root.go b/cmd/root.go index 98cbd2ea..a8ef5b15 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -26,7 +26,6 @@ import ( "github.com/nlnwa/warchaeology/internal/config" "github.com/nlnwa/warchaeology/internal/flag" "github.com/spf13/cobra" - "github.com/spf13/viper" "os" ) @@ -36,22 +35,6 @@ func NewCommand() *cobra.Command { Use: "warc", Short: "A tool for handling warc files", Long: ``, - - PersistentPreRunE: func(cmd *cobra.Command, args []string) error { - // Overwrite config values if set in command specific key - cv := viper.Sub(cmd.Name()) - if cv != nil { - for _, k := range cv.AllKeys() { - viper.Set(k, cv.Get(k)) - } - } - - if err := viper.BindPFlags(cmd.Flags()); err != nil { - panic(err) - } - - return nil - }, } // Flags diff --git a/cmd/validate/validate.go b/cmd/validate/validate.go index 0a339828..1a738b58 100644 --- a/cmd/validate/validate.go +++ b/cmd/validate/validate.go @@ -33,11 +33,7 @@ import ( ) type conf struct { - files []string - recursive bool - followSymlinks bool - suffixes []string - concurrency int + files []string } func NewCommand() *cobra.Command { @@ -56,10 +52,10 @@ func NewCommand() *cobra.Command { ValidArgsFunction: flag.SuffixCompletionFn, } - cmd.Flags().BoolVarP(&c.recursive, flag.Recursive, "r", false, flag.RecursiveHelp) - cmd.Flags().BoolVarP(&c.followSymlinks, 
flag.FollowSymlinks, "s", false, flag.FollowSymlinksHelp) - cmd.Flags().StringSliceVar(&c.suffixes, flag.Suffixes, []string{".warc", ".warc.gz"}, flag.SuffixesHelp) - cmd.Flags().IntVarP(&c.concurrency, flag.Concurrency, "c", int(float32(runtime.NumCPU())*float32(1.5)), flag.ConcurrencyHelp) + cmd.Flags().BoolP(flag.Recursive, "r", false, flag.RecursiveHelp) + cmd.Flags().BoolP(flag.FollowSymlinks, "s", false, flag.FollowSymlinksHelp) + cmd.Flags().StringSlice(flag.Suffixes, []string{".warc", ".warc.gz"}, flag.SuffixesHelp) + cmd.Flags().IntP(flag.Concurrency, "c", int(float32(runtime.NumCPU())*float32(1.5)), flag.ConcurrencyHelp) return cmd } diff --git a/internal/flag/names.go b/internal/flag/names.go index cacdfeb0..6bd1b561 100644 --- a/internal/flag/names.go +++ b/internal/flag/names.go @@ -1,36 +1,45 @@ package flag const ( - LogFileName = "log-file-name" - LogFile = "log-file" - LogConsole = "log-console" - RecordId = "id" - RecordType = "record-type" - ResponseCode = "response-code" - MimeType = "mime-type" - WarcDir = "warc-dir" - NewIndex = "new-index" - KeepIndex = "keep-index" - IndexDir = "index-dir" - Recursive = "recursive" - FollowSymlinks = "symlinks" - Suffixes = "suffixes" - Concurrency = "concurrency" - ConcurrentWriters = "concurrent-writers" - FileSize = "file-size" - Compress = "compress" - CompressionLevel = "compression-level" - FilePrefix = "prefix" - SubdirPattern = "subdir-pattern" - NameGenerator = "name-generator" - Flush = "flush" - WarcVersion = "warc-version" - DefaultDate = "default-date" - TmpDir = "tmpdir" - BufferMaxMem = "max-buffer-mem" - DedupSizeGain = "min-size-gain" - MinFreeDisk = "min-free-disk" - Repair = "repair" + LogFileName = "log-file-name" + LogFile = "log-file" + LogConsole = "log-console" + RecordId = "id" + RecordType = "record-type" + ResponseCode = "response-code" + MimeType = "mime-type" + Offset = "offset" + RecordNum = "num" + RecordCount = "record-count" + Strict = "strict" + Delimiter = "delimiter" + 
Fields = "fields" + ShowWarcHeader = "header" + ShowProtocolHeader = "protocol-header" + ShowPayload = "payload" + WarcDir = "warc-dir" + NewIndex = "new-index" + KeepIndex = "keep-index" + IndexDir = "index-dir" + Recursive = "recursive" + FollowSymlinks = "symlinks" + Suffixes = "suffixes" + Concurrency = "concurrency" + ConcurrentWriters = "concurrent-writers" + FileSize = "file-size" + Compress = "compress" + CompressionLevel = "compression-level" + FilePrefix = "prefix" + SubdirPattern = "subdir-pattern" + NameGenerator = "name-generator" + Flush = "flush" + WarcVersion = "warc-version" + DefaultDate = "default-date" + TmpDir = "tmpdir" + BufferMaxMem = "max-buffer-mem" + DedupSizeGain = "min-size-gain" + MinFreeDisk = "min-free-disk" + Repair = "repair" LogFileNameHelp = `a file to write log output. Empty for no log file` LogFileHelp = `the kind of log output to write to file. Valid values: info, error, summary` @@ -44,16 +53,25 @@ Examples: '200-300': all records with response code between 200(inclusive) and 300(exclusive) '-400': all response codes below 400 '500-': all response codes from 500 and above` - MimeTypeHelp = "filter records with given mime-types. For more than one, repeat flag or comma separated list." - WarcDirHelp = `output directory for generated warc files. Directory must exist.` - NewIndexHelp = `true to start from a fresh index, deleting eventual index from last run` - KeepIndexHelp = `true to keep index on disk so that the next run will continue where the previous run left off` - IndexDirHelp = `directory to store indexes` - RecursiveHelp = `walk directories recursively` - FollowSymlinksHelp = `follow symlinks` - SuffixesHelp = `filter files by suffixes` - ConcurrencyHelp = `number of input files to process simultaneously. The default value is 1.5 x ` - ConcurrentWritersHelp = `maximum concurrent WARC writers. This is the number of WARC-files simultaneously written to. + MimeTypeHelp = `filter records with given mime-types. 
For more than one, repeat flag or comma separated list.` + OffsetHelp = `record offset` + RecordNumHelp = `print the n'th record (zero based). This is applied after records are filtered out by other options` + RecordCountHelp = `The maximum number of records to show` + StrictHelp = `strict parsing` + DelimiterHelp = `use string instead of SPACE for field delimiter` + FieldsHelp = `which fields to include. See 'warc help ls' for a description` + ShowWarcHeaderHelp = `show WARC header` + ShowProtocolHeaderHelp = `show protocol header` + ShowPayloadHelp = `show payload` + WarcDirHelp = `output directory for generated warc files. Directory must exist.` + NewIndexHelp = `true to start from a fresh index, deleting eventual index from last run` + KeepIndexHelp = `true to keep index on disk so that the next run will continue where the previous run left off` + IndexDirHelp = `directory to store indexes` + RecursiveHelp = `walk directories recursively` + FollowSymlinksHelp = `follow symlinks` + SuffixesHelp = `filter files by suffixes` + ConcurrencyHelp = `number of input files to process simultaneously. The default value is 1.5 x ` + ConcurrentWritersHelp = `maximum concurrent WARC writers. This is the number of WARC-files simultaneously written to. 
A consequence is that at least this many WARC files are created even if there is only one input file.` FileSizeHelp = `The maximum size for WARC files` CompressHelp = `use gzip compression for WARC files` From 7a3ac32d7b90e1e5fa2e759cb8f1993306a33748 Mon Sep 17 00:00:00 2001 From: johnerikhalse Date: Fri, 28 Apr 2023 10:21:00 +0200 Subject: [PATCH 2/4] Environment variable naming --- internal/config/config.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/config/config.go b/internal/config/config.go index dd260286..161c4b76 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -56,6 +56,8 @@ func loadConfig(cmd *cobra.Command) { viper.SetDefault(flag.CompressionLevel, gzip.DefaultCompression) viper.SetDefault(flag.DefaultDate, time.Now().Format(warcwriterconfig.DefaultDateFormat)) + viper.SetEnvPrefix("WARC") + viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_")) viper.AutomaticEnv() // read in environment variables that match if viper.IsSet("config") { From 85e7d6237749ec60a055836078483dbeaeab5adf Mon Sep 17 00:00:00 2001 From: johnerikhalse Date: Fri, 28 Apr 2023 10:32:15 +0200 Subject: [PATCH 3/4] Config documentation --- docs/content/_index.md | 4 +- docs/content/cmd/_index.md | 21 ++++- docs/content/config/_index.md | 92 +++++++++++++++++++ docs/content/contributing/_index.md | 2 +- .../layouts/shortcodes/childpages.html | 4 +- 5 files changed, 116 insertions(+), 7 deletions(-) create mode 100644 docs/content/config/_index.md diff --git a/docs/content/_index.md b/docs/content/_index.md index 0a18f65a..29d2ce20 100644 --- a/docs/content/_index.md +++ b/docs/content/_index.md @@ -1,6 +1,6 @@ --- title: Warchaeology -weight: 1 +weight: 10 --- {{< lead >}} @@ -13,7 +13,7 @@ Warchaeology is a collection of tools for inspecting, manipulating and validatin Validate that WARC-files conforms to specification. {{< /featureitem >}} {{< featureitem icon="fas fa-search" title="Inspection" >}} - Navigate WARC-files with terminal console. 
+ Navigate WARC-files with terminal console or extract content from WARC-files. {{< /featureitem >}} {{< featureitem icon="fas fa-magic" title="Conversion" >}} Convert webarchives into WARC-format. diff --git a/docs/content/cmd/_index.md b/docs/content/cmd/_index.md index 867d887b..287233e0 100644 --- a/docs/content/cmd/_index.md +++ b/docs/content/cmd/_index.md @@ -1,6 +1,23 @@ --- title: Usage -weight: 2 +weight: 20 --- -The command line tool is named `warc`. The different functions are executed trough sub-commands. \ No newline at end of file +The Warchaeology tool is named `warc`. + +## Syntax +``` +warc [command [subcommand]] [flags] +``` +Where: +* **command** and optionally **subcommand** specify the operation that you want to perform. +* **flags** specifies optional flags. For example, you can use the `--config` flag to specify + the location of a configuration file. + + + +## Commands +The following pages are generated from the built-in documentation and can also be viewed on the +command line with `warc -h` + +{{< childpages >}} diff --git a/docs/content/config/_index.md b/docs/content/config/_index.md new file mode 100644 index 00000000..20fe68f3 --- /dev/null +++ b/docs/content/config/_index.md @@ -0,0 +1,92 @@ +--- +title: Configuration +weight: 30 +--- + +## Configuration parameters + +Warchaeology commands can be configured by specifying parameters. +There are several options for specifying parameters where using command line flags +is the easiest. But if you find yourself always setting a specific flag it might be better +to add a configuration file or environment variable. + +Flags set on the command line take precedence over configuration files and environment variables. + +Parameter documentation can be found in the *options* section for each [command](/cmd). +The parameter name is the long flag name with the leading dashes removed. + +## Environment variables + +Environment variables can be used to set parameters. 
Use the following steps to convert +a parameter name to an environment variable name: +* convert the parameter name to upper case +* replace '-' with '_' +* prefix with `WARC_` + +> Setting the environment variable **WARC_RECORD_COUNT=2** is equivalent to specifying the flag `--record-count=2`. + +Environment variables take precedence over parameters in config files. + +## Configuration File + +Parameters can also be set in configuration files. The configuration file format is YAML. + +#### File structure + +To set a configuration parameter use the parameter name as key and then the value: + +```yaml +delimiter: "\t" +record-count: 2 +``` + +If you want to have a global default, but override the parameter for a specific command +you can do so by adding a section with the command as key. + +```yaml +delimiter: "\t" +record-count: 2 +ls: + record-count: 5 +convert: + tmpdir: mydir + arc: + tmpdir: anotherdir +``` +This config file gives the following values: + +{{< table style="table-striped" >}} +| Command | parameter name | parameter value | +|-------------------|----------------|-----------------| +| warc cat | record-count | 2 | +| warc ls | record-count | 5 | +| warc ls | tmpdir | /tmp (default) | +| warc convert warc | tmpdir | mydir | +| warc convert arc | tmpdir | anotherdir | +{{< /table >}} + +#### Config file location + +The standard configuration files are named `config.yaml` and are searched for in +system default directories. + +The directories are looked up in the following order: + +1. Standard Global Configuration Paths + * _Linux_: $XDG_CONFIG_DIRS or "/etc/xdg/warc" + * _Windows_: %PROGRAMDATA% or "C:\\ProgramData/warc" + * _macOS_: /Library/Application Support/warc + +2. Standard User-Specific Configuration Paths + * _Linux_: $XDG_CONFIG_HOME or "$HOME/.config/warc" + * _Windows_: %APPDATA% or "C:\\Users\\%USER%\\AppData\\Roaming\\warc" + * _macOS_: $HOME/Library/Application Support/warc + +3. 
Working directory + * The directory warc was started from + +All steps are searched for a file named `config.yaml` and if found, +values in a later file will override values in the files before it. + +By setting the command line flag `--config` to a file name, the user can override the default +config with a user specified config file. diff --git a/docs/content/contributing/_index.md b/docs/content/contributing/_index.md index 87b0f26d..006e2eec 100644 --- a/docs/content/contributing/_index.md +++ b/docs/content/contributing/_index.md @@ -1,6 +1,6 @@ --- title: Contributing -weight: 3 +weight: 50 --- ## Getting Started diff --git a/docs/themes/ace-documentation/layouts/shortcodes/childpages.html b/docs/themes/ace-documentation/layouts/shortcodes/childpages.html index 94ff26f1..3d5da407 100644 --- a/docs/themes/ace-documentation/layouts/shortcodes/childpages.html +++ b/docs/themes/ace-documentation/layouts/shortcodes/childpages.html @@ -11,7 +11,7 @@ {{- $cpage := (.Scratch.Get "current") }} -
    +
      {{- .Scratch.Set "pages" $cpage.Pages }} {{- if $cpage.Sections}} {{- .Scratch.Set "pages" ($cpage.Pages | union $cpage.Sections) }} @@ -27,7 +27,7 @@ {{ define "childs" }} {{- range .menu }} -
    • +
    • {{ .Title }}
    • {{ end }} From 64839a20f1c059df6fb70b349856e932f12a3c51 Mon Sep 17 00:00:00 2001 From: johnerikhalse Date: Fri, 28 Apr 2023 10:34:12 +0200 Subject: [PATCH 4/4] Avoid printing status line when output is not a terminal --- internal/filewalker/filewalker.go | 27 +++++++++++++++------------ internal/utils/utils.go | 10 ++++++++++ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/internal/filewalker/filewalker.go b/internal/filewalker/filewalker.go index 794a6ae2..6dc156ed 100644 --- a/internal/filewalker/filewalker.go +++ b/internal/filewalker/filewalker.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "github.com/nlnwa/warchaeology/internal/flag" + "github.com/nlnwa/warchaeology/internal/utils" "github.com/nlnwa/warchaeology/internal/workerpool" "github.com/spf13/viper" "io/fs" @@ -51,18 +52,20 @@ func New(paths []string, recursive, followSymlinks bool, suffixes []string, conc func NewFromViper(cmd string, paths []string, fn func(path string) Result) FileWalker { var consoleType logType var fileType logType - for _, t := range viper.GetStringSlice(flag.LogConsole) { - switch strings.ToLower(t) { - case "info": - consoleType = consoleType | info - case "error": - consoleType = consoleType | err - case "summary": - consoleType = consoleType | summary - case "progress": - consoleType = consoleType | progress - default: - panic("Illegal config value '" + t + "' for " + flag.LogConsole) + if utils.StdoutIsTerminal() { + for _, t := range viper.GetStringSlice(flag.LogConsole) { + switch strings.ToLower(t) { + case "info": + consoleType = consoleType | info + case "error": + consoleType = consoleType | err + case "summary": + consoleType = consoleType | summary + case "progress": + consoleType = consoleType | progress + default: + panic("Illegal config value '" + t + "' for " + flag.LogConsole) + } } } for _, t := range viper.GetStringSlice(flag.LogFile) { diff --git a/internal/utils/utils.go b/internal/utils/utils.go index 64436bfe..7811e8c8 100644 --- 
a/internal/utils/utils.go
+++ b/internal/utils/utils.go
@@ -21,6 +21,7 @@ import (
 	"fmt"
 	"github.com/shirou/gopsutil/disk"
 	"github.com/spf13/cast"
+	"os"
 	"strings"
 	"unicode"
 )
@@ -121,3 +122,12 @@ func NewOutOfSpaceError(format string, a ...any) OutOfSpaceError {
 func (o OutOfSpaceError) Error() string {
 	return string(o)
 }
+
+// StdoutIsTerminal reports whether stdout is a character device, such as a terminal; false if stdout cannot be inspected.
+func StdoutIsTerminal() bool {
+	o, err := os.Stdout.Stat()
+	if err != nil {
+		return false
+	}
+	return o.Mode()&os.ModeCharDevice != 0
+}