From 2caf355971fb2272feda75bb34d12708964803c4 Mon Sep 17 00:00:00 2001 From: zhijian Date: Fri, 14 Oct 2022 21:03:29 +0800 Subject: [PATCH] cmd/debug: refactor debug command (#2857) * refactor debug command --- cmd/debug.go | 290 +++++++++--------- cmd/main.go | 2 +- .../fault_diagnosis_and_analysis.md | 2 +- docs/en/reference/command_reference.md | 13 +- .../fault_diagnosis_and_analysis.md | 2 +- docs/zh_cn/reference/command_reference.md | 13 +- 6 files changed, 157 insertions(+), 165 deletions(-) diff --git a/cmd/debug.go b/cmd/debug.go index 0038d45771e8..d60086360db8 100644 --- a/cmd/debug.go +++ b/cmd/debug.go @@ -20,6 +20,7 @@ import ( "archive/zip" "bufio" "bytes" + "encoding/json" "fmt" "io" "net/http" @@ -35,13 +36,13 @@ import ( "github.com/juicedata/juicefs/pkg/meta" "github.com/juicedata/juicefs/pkg/utils" - + "github.com/juicedata/juicefs/pkg/vfs" "github.com/urfave/cli/v2" ) var defaultOutDir = filepath.Join(".", "debug") -func cmdDoctor() *cli.Command { +func cmdDebug() *cli.Command { return &cli.Command{ Name: "debug", Action: debug, @@ -58,10 +59,7 @@ $ juicefs debug /mnt/jfs $ juicefs debug --out-dir=/var/log /mnt/jfs # Get the last up to 1000 log entries -$ juicefs debug --out-dir=/var/log --collect-log --limit=1000 /mnt/jfs - -# Get pprof information -$ juicefs debug --out-dir=/var/log --collect-log --limit=1000 --collect-pprof /mnt/jfs +$ juicefs debug --out-dir=/var/log --limit=1000 /mnt/jfs `, Flags: []cli.Flag{ &cli.StringFlag{ @@ -69,18 +67,10 @@ $ juicefs debug --out-dir=/var/log --collect-log --limit=1000 --collect-pprof /m Value: defaultOutDir, Usage: "the output directory of the result file", }, - &cli.BoolFlag{ - Name: "collect-log", - Usage: "enable log collection", - }, &cli.Uint64Flag{ Name: "limit", Usage: "the number of last entries to be collected", }, - &cli.BoolFlag{ - Name: "collect-pprof", - Usage: "enable pprof collection", - }, &cli.Uint64Flag{ Name: "stats-sec", Value: 5, @@ -100,7 +90,7 @@ $ juicefs debug --out-dir=/var/log --collect-log --limit=1000 --collect-pprof /m } } -func copyVolumeConfWindows(srcPath, destPath string) error { +func copyFileOnWindows(srcPath, destPath string) error { srcFile, err := os.Open(srcPath) if err != nil { return err @@ -117,9 +107,9 @@ func copyVolumeConfWindows(srcPath, destPath string) error { return nil } -func copyConfigFile(srcPath, destPath string, rootPrivileges bool) error { +func copyFile(srcPath, destPath string, rootPrivileges bool) error { if runtime.GOOS == "windows" { - return copyVolumeConfWindows(srcPath, destPath) + return copyFileOnWindows(srcPath, destPath) } var copyArgs []string @@ -219,7 +209,25 @@ func copyLogFile(logPath, retLogPath string, limit uint64, rootPrivileges bool) return exec.Command(copyArgs[0], copyArgs[1:]...).Run() } -func getPprofPort(pid, mp string, rootPrivileges bool) (int, error) { +func getPprofPort(pid, amp string, rootPrivileges bool) (int, error) { + content, err := os.ReadFile(filepath.Join(amp, ".config")) + if err != nil { + logger.Warnf("failed to read config file: %v", err) + } + cfg := vfs.Config{} + if err := json.Unmarshal(content, &cfg); err != nil { + logger.Warnf("failed to unmarshal config file: %v", err) + } + if cfg.Port.DebugAgent != "" { + if len(strings.Split(cfg.Port.DebugAgent, ":")) >= 2 { + if port, err := strconv.Atoi(strings.Split(cfg.Port.DebugAgent, ":")[1]); err != nil { + logger.Warnf("failed to parse debug agent port: %v", err) + } else { + return port, nil + } + } + } + var lsofArgs []string if rootPrivileges { lsofArgs = append(lsofArgs, "sudo") @@ -243,7 +251,7 @@ func getPprofPort(pid, mp string, rootPrivileges bool) (int, error) { logger.Errorf("failed to parse port %v: %v", port, err) } if port >= 6060 && port <= 6099 && port > listenPort { - if err := checkPort(port, mp); err == nil { + if err := checkPort(port, amp); err == nil { listenPort = port } continue @@ -280,7 +288,7 @@ func getRequest(url string) ([]byte, error) { } // check pprof service status -func checkPort(port int, mp string) error { +func checkPort(port int, amp string) error { url := fmt.Sprintf("http://localhost:%d/debug/pprof/cmdline?debug=1", port) resp, err := getRequest(url) if err != nil { @@ -290,14 +298,13 @@ func checkPort(port int, mp string) error { fields := strings.Fields(string(resp)) flag := false for _, field := range fields { - if mp == field { + if amp == field { flag = true } } if !flag { - return fmt.Errorf("mount point mismatch: \n%s\n%s", resp, mp) + return fmt.Errorf("mount point mismatch: \n%s\n%s", resp, amp) } - return nil } @@ -325,11 +332,7 @@ func reqAndSaveMetric(name string, metric metricItem, outDir string) error { if _, err := writer.Write(resp); err != nil { return fmt.Errorf("failed to write metric %s: %v", name, err) } - if err := writer.Flush(); err != nil { - return fmt.Errorf("failed to flush writer: %v", err) - } - - return nil + return writer.Flush() } func isUnix() bool { @@ -337,8 +340,7 @@ func isUnix() bool { } func checkAgent(cmd string) bool { - fields := strings.Fields(cmd) - for _, field := range fields { + for _, field := range strings.Fields(cmd) { if field == "--no-agent" { return false } @@ -359,7 +361,7 @@ func geneZipFile(srcPath, destPath string) error { } }() - if err = filepath.Walk(srcPath, func(path string, info os.FileInfo, _ error) error { + return filepath.Walk(srcPath, func(path string, info os.FileInfo, _ error) error { if path == srcPath { return nil } @@ -390,45 +392,84 @@ func geneZipFile(srcPath, destPath string) error { } } return nil - }); err != nil { - return err - } - - return nil + }) } -func debug(ctx *cli.Context) error { - setup(ctx, 1) - mp := ctx.Args().First() - inode, err := utils.GetFileInode(mp) - if err != nil { - return fmt.Errorf("failed to lookup inode for %s: %s", mp, err) +func collectPprof(ctx *cli.Context, cmd string, pid string, amp string, rootPrivileges bool, currDir string, wg *sync.WaitGroup) error { + if !checkAgent(cmd) { + logger.Warnf("No agent found, the pprof metrics will not be collected") + return nil } - if inode != uint64(meta.RootInode) { - return fmt.Errorf("path %s is not a mount point", mp) + + if !isUnix() { + logger.Warnf("Collecting pprof currently only support Linux/macOS") + return nil } - outDir := ctx.String("out-dir") - // special treatment for non-existing out dir - if outDirInfo, err := os.Stat(outDir); os.IsNotExist(err) { - if err := os.MkdirAll(outDir, os.ModePerm); err != nil { - return fmt.Errorf("failed to create out dir %s: %v", outDir, err) - } - } else if err == nil && !outDirInfo.IsDir() { - return fmt.Errorf("argument --out-dir is not directory: %s", outDir) + port, err := getPprofPort(pid, amp, rootPrivileges) + if err != nil { + return fmt.Errorf("failed to get pprof port: %v", err) + } + baseUrl := fmt.Sprintf("http://localhost:%d/debug/pprof/", port) + trace := ctx.Uint64("trace-sec") + profile := ctx.Uint64("profile-sec") + metrics := map[string]metricItem{ + "allocs": {name: "allocs.pb.gz", url: baseUrl + "allocs"}, + "blocks": {name: "block.pb.gz", url: baseUrl + "block"}, + "cmdline": {name: "cmdline.txt", url: baseUrl + "cmdline"}, + "goroutine": {name: "goroutine.pb.gz", url: baseUrl + "goroutine"}, + "stack": {name: "goroutine.stack.txt", url: baseUrl + "goroutine?debug=1"}, + "heap": {name: "heap.pb.gz", url: baseUrl + "heap"}, + "mutex": {name: "mutex.pb.gz", url: baseUrl + "mutex"}, + "threadcreate": {name: "threadcreate.pb.gz", url: baseUrl + "threadcreate"}, + "trace": {name: fmt.Sprintf("trace.%ds.pb.gz", trace), url: fmt.Sprintf("%strace?seconds=%d", baseUrl, trace)}, + "profile": {name: fmt.Sprintf("profile.%ds.pb.gz", profile), url: fmt.Sprintf("%sprofile?seconds=%d", baseUrl, profile)}, + } + + pprofOutDir := filepath.Join(currDir, "pprof") + if err := os.Mkdir(pprofOutDir, os.ModePerm); err != nil { + return fmt.Errorf("failed to create out directory: %v", err) + } + + for name, metric := range metrics { + wg.Add(1) + go func(name string, metric metricItem) { + defer wg.Done() + + if name == "profile" { + logger.Infof("Profile metrics are being sampled, sampling duration: %ds", profile) + } + if name == "trace" { + logger.Infof("Trace metrics are being sampled, sampling duration: %ds", trace) + } + if err := reqAndSaveMetric(name, metric, pprofOutDir); err != nil { + logger.Errorf("Failed to get and save metric %s: %v", name, err) + } + }(name, metric) } + return nil +} - mp, _ = filepath.Abs(mp) - timestamp := time.Now().Format("20060102150405") - prefix := strings.Trim(strings.Join(strings.Split(mp, "/"), "-"), "-") - currDir := filepath.Join(outDir, fmt.Sprintf("%s-%s", prefix, timestamp)) - if err := os.Mkdir(currDir, os.ModePerm); err != nil { - return fmt.Errorf("failed to create current out dir %s: %v", currDir, err) +func collectLog(ctx *cli.Context, cmd string, rootPrivileges bool, currDir string) error { + if !isUnix() { + logger.Warnf("Collecting log currently only support Linux/macOS") + return nil + } + logPath, err := getLogPath(cmd, rootPrivileges) + if err != nil { + return fmt.Errorf("failed to get log path: %v", err) } + limit := ctx.Uint64("limit") + retLogPath := filepath.Join(currDir, "juicefs.log") + + logger.Infof("Log %s is being collected", logPath) + return copyLogFile(logPath, retLogPath, limit, rootPrivileges) +} +func collectSysInfo(ctx *cli.Context, currDir string) error { sysInfo, err := utils.GetSysInfo() if err != nil { - return fmt.Errorf("failed to get system info: %v", err) + return err } result := fmt.Sprintf(`Platform: @@ -448,119 +489,88 @@ JuiceFS Version: } fmt.Printf("\n%s\n", result) + return nil +} - uid, pid, cmd, err := getCmdMount(mp) - if err != nil { - return fmt.Errorf("failed to get mount command: %v", err) - } - fmt.Printf("\nMount Command:\n%s\n\n", cmd) - - rootPrivileges := false - if (uid == "0" || uid == "root") && os.Getuid() != 0 { - fmt.Println("Mount point is mounted by the root user, may ask for root privilege...") - rootPrivileges = true - } - +func collectSpecialFile(ctx *cli.Context, amp string, currDir string, rootPrivileges bool, wg *sync.WaitGroup) error { configName := ".config" - if err := copyConfigFile(filepath.Join(mp, configName), filepath.Join(currDir, "config.txt"), rootPrivileges); err != nil { + if err := copyFile(filepath.Join(amp, configName), filepath.Join(currDir, "config.txt"), rootPrivileges); err != nil { return fmt.Errorf("failed to get volume config %s: %v", configName, err) } statsName := ".stats" stats := ctx.Uint64("stats-sec") - var wg sync.WaitGroup wg.Add(1) go func() { defer wg.Done() - srcPath := filepath.Join(mp, statsName) + srcPath := filepath.Join(amp, statsName) destPath := filepath.Join(currDir, "stats.txt") - if err := copyConfigFile(srcPath, destPath, rootPrivileges); err != nil { + if err := copyFile(srcPath, destPath, rootPrivileges); err != nil { logger.Errorf("Failed to get volume config %s: %v", statsName, err) } logger.Infof("Stats metrics are being sampled, sampling duration: %ds", stats) time.Sleep(time.Second * time.Duration(stats)) - destPath = filepath.Join(currDir, fmt.Sprintf("stats.%ds.txt", stats)) - if err := copyConfigFile(srcPath, destPath, rootPrivileges); err != nil { + if err := copyFile(srcPath, destPath, rootPrivileges); err != nil { logger.Errorf("Failed to get volume config %s: %v", statsName, err) } }() + return nil +} - if !isUnix() { - logger.Warnf("Collecting log currently only support Linux/macOS") +func debug(ctx *cli.Context) error { + setup(ctx, 1) + mp := ctx.Args().First() + inode, err := utils.GetFileInode(mp) + if err != nil { + return fmt.Errorf("failed to lookup inode for %s: %s", mp, err) } - - if isUnix() && ctx.Bool("collect-log") { - logPath, err := getLogPath(cmd, rootPrivileges) - if err != nil { - return fmt.Errorf("failed to get log path: %v", err) - } - limit := ctx.Uint64("limit") - retLogPath := filepath.Join(currDir, "juicefs.log") - - logger.Infof("Log %s is being collected", logPath) - if err := copyLogFile(logPath, retLogPath, limit, rootPrivileges); err != nil { - return fmt.Errorf("failed to get log file: %v", err) - } + if inode != uint64(meta.RootInode) { + return fmt.Errorf("path %s is not a mount point", mp) } - enableAgent := checkAgent(cmd) - if !enableAgent { - logger.Warnf("No agent found, the pprof metrics will not be collected") + amp, err := filepath.Abs(mp) + if err != nil { + return fmt.Errorf("failed to get absolute path: %v", err) + } + timestamp := time.Now().Format("20060102150405") + prefix := strings.Trim(strings.Join(strings.Split(amp, "/"), "-"), "-") + outDir := ctx.String("out-dir") + currDir := filepath.Join(outDir, fmt.Sprintf("%s-%s", prefix, timestamp)) + if err := os.MkdirAll(currDir, os.ModePerm); err != nil { + return fmt.Errorf("failed to create current out dir %s: %v", currDir, err) } - if !isUnix() { - logger.Warnf("Collecting pprof currently only support Linux/macOS") + if err := collectSysInfo(ctx, currDir); err != nil { + return err } - if isUnix() && enableAgent && ctx.Bool("collect-pprof") { - port, err := getPprofPort(pid, mp, rootPrivileges) - if err != nil { - return fmt.Errorf("failed to get pprof port: %v", err) - } - baseUrl := fmt.Sprintf("http://localhost:%d/debug/pprof/", port) - trace := ctx.Uint64("trace-sec") - profile := ctx.Uint64("profile-sec") - metrics := map[string]metricItem{ - "allocs": {name: "allocs.pb.gz", url: baseUrl + "allocs"}, - "blocks": {name: "block.pb.gz", url: baseUrl + "block"}, - "cmdline": {name: "cmdline.txt", url: baseUrl + "cmdline"}, - "goroutine": {name: "goroutine.pb.gz", url: baseUrl + "goroutine"}, - "stack": {name: "goroutine.stack.txt", url: baseUrl + "goroutine?debug=1"}, - "heap": {name: "heap.pb.gz", url: baseUrl + "heap"}, - "mutex": {name: "mutex.pb.gz", url: baseUrl + "mutex"}, - "threadcreate": {name: "threadcreate.pb.gz", url: baseUrl + "threadcreate"}, - "trace": {name: fmt.Sprintf("trace.%ds.pb.gz", trace), url: fmt.Sprintf("%strace?seconds=%d", baseUrl, trace)}, - "profile": {name: fmt.Sprintf("profile.%ds.pb.gz", profile), url: fmt.Sprintf("%sprofile?seconds=%d", baseUrl, profile)}, - } + uid, pid, cmd, err := getCmdMount(amp) + if err != nil { + return fmt.Errorf("failed to get mount command: %v", err) + } + fmt.Printf("\nMount Command:\n%s\n\n", cmd) - pprofOutDir := filepath.Join(currDir, "pprof") - if err := os.Mkdir(pprofOutDir, os.ModePerm); err != nil { - return fmt.Errorf("failed to create out directory: %v", err) - } + rootPrivileges := false + if (uid == "0" || uid == "root") && os.Getuid() != 0 { + fmt.Println("Mount point is mounted by the root user, may ask for root privilege...") + rootPrivileges = true + } - for name, metric := range metrics { - wg.Add(1) - go func(name string, metric metricItem) { - defer wg.Done() + var wg sync.WaitGroup + if err := collectSpecialFile(ctx, amp, currDir, rootPrivileges, &wg); err != nil { + return err + } - if name == "profile" { - logger.Infof("Profile metrics are being sampled, sampling duration: %ds", profile) - } - if name == "trace" { - logger.Infof("Trace metrics are being sampled, sampling duration: %ds", trace) - } - if err := reqAndSaveMetric(name, metric, pprofOutDir); err != nil { - logger.Errorf("Failed to get and save metric %s: %v", name, err) - } - }(name, metric) - } + if err := collectLog(ctx, cmd, rootPrivileges, currDir); err != nil { + return err } - wg.Wait() - if err := geneZipFile(currDir, filepath.Join(outDir, fmt.Sprintf("%s-%s.zip", prefix, timestamp))); err != nil { - return fmt.Errorf("failed to zip result %s: %v", currDir, err) + if err := collectPprof(ctx, cmd, pid, amp, rootPrivileges, currDir, &wg); err != nil { + return err } - return nil + + wg.Wait() + return geneZipFile(currDir, filepath.Join(outDir, fmt.Sprintf("%s-%s.zip", prefix, timestamp))) } diff --git a/cmd/main.go b/cmd/main.go index e3fc256e6094..939ceb99eced 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -75,7 +75,7 @@ func Main(args []string) error { cmdWarmup(), cmdRmr(), cmdSync(), - cmdDoctor(), + cmdDebug(), }, } diff --git a/docs/en/administration/fault_diagnosis_and_analysis.md b/docs/en/administration/fault_diagnosis_and_analysis.md index 388d59c83e98..1407612ca251 100644 --- a/docs/en/administration/fault_diagnosis_and_analysis.md +++ b/docs/en/administration/fault_diagnosis_and_analysis.md @@ -153,7 +153,7 @@ By default, pprof listens on port numbers ranging from 6060 to 6099. That's why :::tip You can also use the debug command to automatically collect these runtime information and save it locally. By default, it is saved to the debug directory under the current directory, for example: ```bash -juicefs debug --collect-pprof /mnt/jfs +juicefs debug /mnt/jfs ``` For more information about the debug command, see [command reference](https://juicefs.com/docs/community/command_reference#juicefs-debug) ::: diff --git a/docs/en/reference/command_reference.md b/docs/en/reference/command_reference.md index 2ecec8ce5321..483cbb28e90e 100644 --- a/docs/en/reference/command_reference.md +++ b/docs/en/reference/command_reference.md @@ -1217,15 +1217,9 @@ The output directory of the results, automatically created if the directory does `--stats-sec value`
The number of seconds to sample .stats file (default: 5) -`--collect-log`
-enable log collection (default: false) - `--limit value`
The number of log entries collected, from newest to oldest, if not specified, all entries will be collected -`--collect-pprof`
-enable pprof metrics collection (default: false) - `--trace-sec value`
The number of seconds to sample trace metrics (default: 5) @@ -1241,9 +1235,6 @@ $ juicefs debug /mnt/jfs # Specify the output directory as /var/log $ juicefs debug --out-dir=/var/log /mnt/jfs -# Enable log collection and get the last up to 1000 log entries -$ juicefs debug --out-dir=/var/log --collect-log --limit=1000 /mnt/jfs - -# Enable pprof metrics collection -$ juicefs debug --out-dir=/var/log --collect-log --limit=1000 --collect-pprof /mnt/jfs +# Get the last up to 1000 log entries +$ juicefs debug --out-dir=/var/log --limit=1000 /mnt/jfs ``` diff --git a/docs/zh_cn/administration/fault_diagnosis_and_analysis.md b/docs/zh_cn/administration/fault_diagnosis_and_analysis.md index 459523653394..ab17dd206d3d 100644 --- a/docs/zh_cn/administration/fault_diagnosis_and_analysis.md +++ b/docs/zh_cn/administration/fault_diagnosis_and_analysis.md @@ -166,7 +166,7 @@ curl 'http://localhost:/debug/pprof/heap' > juicefs.heap.pb.gz :::tip 建议 你也可以使用 juicefs debug 命令自动收集这些运行时信息并保存到本地,默认保存到当前目录下的 debug 目录中,例如: ```bash -juicefs debug --collect-pprof /mnt/jfs +juicefs debug /mnt/jfs ``` 关于 juicefs debug 命令的更多信息,请查看[命令参考](https://juicefs.com/docs/zh/community/command_reference#juicefs-debug) ::: diff --git a/docs/zh_cn/reference/command_reference.md b/docs/zh_cn/reference/command_reference.md index da3bc5d61f05..16cb06cd7561 100644 --- a/docs/zh_cn/reference/command_reference.md +++ b/docs/zh_cn/reference/command_reference.md @@ -1220,15 +1220,9 @@ juicefs debug [command options] MOUNTPOINT `--stats-sec value`
.stats文件采样秒数 (默认:5) -`--collect-log`
-启用日志收集 (默认:false) - `--limit value`
收集的日志条目数,从新到旧,若不指定则收集全部条目 -`--collect-pprof`
-启用pprof指标收集 (默认:false) - `--trace-sec value`
trace指标采样秒数 (默认:5) @@ -1244,9 +1238,6 @@ $ juicefs debug /mnt/jfs # 指定输出目录为 /var/log $ juicefs debug --out-dir=/var/log /mnt/jfs -# 启用日志收集,并收集最后1000条日志条目 -$ juicefs debug --out-dir=/var/log --collect-log --limit=1000 /mnt/jfs - -# 启用pprof指标收集 -$ juicefs debug --out-dir=/var/log --collect-log --limit=1000 --collect-pprof /mnt/jfs +# 收集最后 1000 条日志条目 +$ juicefs debug --out-dir=/var/log --limit=1000 /mnt/jfs ```