Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Monitor NGINX logs for errors & NGINX worker processes after a NGINX reload #255

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
39af9c3
Add monitoring of NGINX error logs after a NGINX reload
dhurley Mar 14, 2023
d3fd094
Make NGINX reload monitoring period configurable
dhurley Mar 15, 2023
74362bf
Clean up
dhurley Mar 15, 2023
e799166
Add more logging to tailer. Fix unit test.
dhurley Mar 15, 2023
f413e34
Debug failing test
dhurley Mar 15, 2023
8369ebc
Clean up
dhurley Mar 15, 2023
a6e2854
Pid drain check (#258)
oliveromahony Mar 16, 2023
68fbd63
Add rollback logic for if the NGINX reload fails
dhurley Mar 16, 2023
bc327c5
Add another error log regex. Update oss-image make target
dhurley Mar 16, 2023
b65f013
Create sec.yml
oliveromahony Mar 22, 2023
4e6f309
Update sec.yml
oliveromahony Mar 22, 2023
385732b
Delete sec.yml
oliveromahony Mar 22, 2023
a421834
wip: added test and framework for monitoring pids
oliveromahony Mar 22, 2023
43f732a
Merge branch 'main' into monitor-nginx-logs-for-errors-after-a-nginx-…
oliveromahony Mar 22, 2023
9c5e3fb
Merge branch 'main' into monitor-nginx-logs-for-errors-after-a-nginx-…
oliveromahony Mar 23, 2023
2223fc5
changed logging level
oliveromahony Mar 23, 2023
d8d5094
the code is in place, some failing tests I need to validate
oliveromahony Mar 23, 2023
9a9861a
wip: issue with deadlock
oliveromahony Mar 23, 2023
bd0f8b1
fixed tests
oliveromahony Mar 23, 2023
cdb3937
tests failing
oliveromahony Mar 24, 2023
3be20ea
Merge branch 'main' into monitor-nginx-logs-for-errors-after-a-nginx-…
oliveromahony Mar 27, 2023
a5a4675
commenting out tests for now
oliveromahony Mar 27, 2023
f59916f
wip: commented out failing tests for a build
oliveromahony Mar 27, 2023
072aac6
fixed tests
oliveromahony Mar 27, 2023
6894944
treat errors as warnings, default to false
oliveromahony Mar 31, 2023
8429a5f
added more conditions to the config check test
oliveromahony Mar 31, 2023
350483e
Merge branch 'main' into monitor-nginx-logs-for-errors-after-a-nginx-…
oliveromahony Mar 31, 2023
a8fd768
code review comments
oliveromahony Apr 3, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions src/core/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ func SetDefaults() {

// NGINX DEFAULTS
Viper.SetDefault(NginxClientVersion, Defaults.Nginx.NginxClientVersion)
Viper.SetDefault(NginxConfigReloadMonitoringPeriod, Defaults.Nginx.ConfigReloadMonitoringPeriod)
}

func setFlagDeprecated(name string, usageMessage string) {
Expand Down Expand Up @@ -284,10 +285,11 @@ func getDataplane() Dataplane {

func getNginx() Nginx {
return Nginx{
ExcludeLogs: Viper.GetString(NginxExcludeLogs),
Debug: Viper.GetBool(NginxDebug),
NginxCountingSocket: Viper.GetString(NginxCountingSocket),
NginxClientVersion: Viper.GetInt(NginxClientVersion),
ExcludeLogs: Viper.GetString(NginxExcludeLogs),
Debug: Viper.GetBool(NginxDebug),
NginxCountingSocket: Viper.GetString(NginxCountingSocket),
NginxClientVersion: Viper.GetInt(NginxClientVersion),
ConfigReloadMonitoringPeriod: Viper.GetDuration(NginxConfigReloadMonitoringPeriod),
}
}

Expand Down
1 change: 1 addition & 0 deletions src/core/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ func TestGetConfig(t *testing.T) {
assert.Equal(t, Defaults.Server.Metrics, config.Server.Metrics)

assert.Equal(t, Defaults.AgentAPI.Port, config.AgentAPI.Port)
assert.Equal(t, Defaults.Nginx.ConfigReloadMonitoringPeriod, config.Nginx.ConfigReloadMonitoringPeriod)

assert.True(t, len(config.AllowedDirectoriesMap) > 0)
assert.Equal(t, Defaults.ConfigDirs, config.ConfigDirs)
Expand Down
21 changes: 14 additions & 7 deletions src/core/config/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,10 @@ var (
Token: uuid.New().String(),
},
Nginx: Nginx{
Debug: false,
NginxCountingSocket: "unix:/var/run/nginx-agent/nginx.sock",
NginxClientVersion: 6,
Debug: false,
NginxCountingSocket: "unix:/var/run/nginx-agent/nginx.sock",
NginxClientVersion: 6,
ConfigReloadMonitoringPeriod: 10 * time.Second,
},
ConfigDirs: "/etc/nginx:/usr/local/etc/nginx:/usr/share/nginx/modules:/etc/nms",
AllowedDirectoriesMap: map[string]struct{}{},
Expand Down Expand Up @@ -128,10 +129,11 @@ const (
// viper keys used in config
NginxKey = "nginx"

NginxExcludeLogs = NginxKey + agent_config.KeyDelimiter + "exclude_logs"
NginxDebug = NginxKey + agent_config.KeyDelimiter + "debug"
NginxCountingSocket = NginxKey + agent_config.KeyDelimiter + "socket"
NginxClientVersion = NginxKey + agent_config.KeyDelimiter + "client_version"
NginxExcludeLogs = NginxKey + agent_config.KeyDelimiter + "exclude_logs"
NginxDebug = NginxKey + agent_config.KeyDelimiter + "debug"
NginxCountingSocket = NginxKey + agent_config.KeyDelimiter + "socket"
NginxClientVersion = NginxKey + agent_config.KeyDelimiter + "client_version"
NginxConfigReloadMonitoringPeriod = NginxKey + agent_config.KeyDelimiter + "config_reload_monitoring_period"

// viper keys used in config
DataplaneKey = "dataplane"
Expand Down Expand Up @@ -247,6 +249,11 @@ var (
Usage: "The NGINX Plus counting unix socket location.",
DefaultValue: Defaults.Nginx.NginxCountingSocket,
},
&DurationFlag{
Name: NginxConfigReloadMonitoringPeriod,
Usage: "The period at which the NGINX Agent will monitor error logs after a NGINX reload.",
DefaultValue: Defaults.Nginx.ConfigReloadMonitoringPeriod,
},
// Metrics
&DurationFlag{
Name: MetricsCollectionInterval,
Expand Down
9 changes: 5 additions & 4 deletions src/core/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,11 @@ type TLSConfig struct {

// Nginx settings
type Nginx struct {
ExcludeLogs string `mapstructure:"exclude_logs" yaml:"-"`
Debug bool `mapstructure:"debug" yaml:"-"`
NginxCountingSocket string `mapstructure:"socket" yaml:"-"`
NginxClientVersion int `mapstructure:"client_version" yaml:"-"`
ExcludeLogs string `mapstructure:"exclude_logs" yaml:"-"`
Debug bool `mapstructure:"debug" yaml:"-"`
NginxCountingSocket string `mapstructure:"socket" yaml:"-"`
NginxClientVersion int `mapstructure:"client_version" yaml:"-"`
ConfigReloadMonitoringPeriod time.Duration `mapstructure:"config_reload_monitoring_period" yaml:"-"`
}

type Dataplane struct {
Expand Down
2 changes: 1 addition & 1 deletion src/core/metrics/sources/nginx_access_log.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import (
"github.com/nginx/agent/sdk/v2/proto"
"github.com/nginx/agent/v2/src/core"
"github.com/nginx/agent/v2/src/core/metrics"
"github.com/nginx/agent/v2/src/core/metrics/sources/tailer"
"github.com/nginx/agent/v2/src/core/tailer"

log "github.com/sirupsen/logrus"
)
Expand Down
2 changes: 1 addition & 1 deletion src/core/metrics/sources/nginx_error_log.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (
"github.com/nginx/agent/sdk/v2/proto"
"github.com/nginx/agent/v2/src/core"
"github.com/nginx/agent/v2/src/core/metrics"
"github.com/nginx/agent/v2/src/core/metrics/sources/tailer"
"github.com/nginx/agent/v2/src/core/tailer"

log "github.com/sirupsen/logrus"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ import (

var (
tailConfig = tail.Config{
Follow: true,
ReOpen: true,
Poll: true,
Follow: true,
ReOpen: true,
MustExist: true,
Poll: true,
Location: &tail.SeekInfo{
Whence: io.SeekEnd,
},
Expand Down Expand Up @@ -107,11 +108,14 @@ func (t *Tailer) Tail(ctx context.Context, data chan<- string) {
data <- line.Text

case <-ctx.Done():
err := ctx.Err()
if err != nil {
log.Errorf("error in done context Tail %v", err)
ctxErr := ctx.Err()
switch ctxErr {
case context.DeadlineExceeded:
log.Tracef("Tailer cancelled. Deadline exceeded, %v", ctxErr)
case context.Canceled:
log.Tracef("Tailer forcibly cancelled, %v", ctxErr)
}
log.Info("tailer is done")
log.Trace("Tailer is done")
return
}
}
Expand All @@ -133,11 +137,14 @@ func (t *PatternTailer) Tail(ctx context.Context, data chan<- map[string]string)
data <- l
}
case <-ctx.Done():
err := ctx.Err()
if err != nil {
log.Errorf("error in done context Tail %v", err)
ctxErr := ctx.Err()
switch ctxErr {
case context.DeadlineExceeded:
log.Tracef("Tailer cancelled because deadline was exceeded, %v", ctxErr)
case context.Canceled:
log.Tracef("Tailer forcibly cancelled, %v", ctxErr)
}
log.Info("tailer is done")
log.Error("Tailer is done")
return
}
}
Expand Down
81 changes: 79 additions & 2 deletions src/plugins/nginx.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (
"errors"
"fmt"
"path/filepath"
re "regexp"
"sync"
"time"

"github.com/gogo/protobuf/types"
Expand All @@ -29,6 +31,7 @@ import (
"github.com/nginx/agent/v2/src/core"
"github.com/nginx/agent/v2/src/core/config"
"github.com/nginx/agent/v2/src/core/payloads"
"github.com/nginx/agent/v2/src/core/tailer"
"github.com/nginx/agent/v2/src/extensions/nginx-app-protect/nap"
)

Expand All @@ -39,6 +42,9 @@ const (

var (
validationTimeout = 15 * time.Second
reloadErrorList = []*re.Regexp{
re.MustCompile(`.*Address already in use.*`),
}
)

// Nginx is the metadata of our nginx binary
Expand Down Expand Up @@ -383,7 +389,7 @@ func (n *Nginx) writeConfigAndReloadNginx(correlationId string, config *proto.Ng
case result := <-n.configApplyStatusChannel:
return result
case <-time.After(validationTimeout):
log.Errorf("Validation of the NGINX config in taking longer than the validationTimeout %s", validationTimeout)
log.Warnf("Validation of the NGINX config in taking longer than the validationTimeout %s", validationTimeout)
return status
}
}
Expand Down Expand Up @@ -466,10 +472,18 @@ func (n *Nginx) completeConfigApply(response *NginxConfigValidationResponse) *pr
log.Debug("Enabling file watcher")
n.messagePipeline.Process(core.NewMessage(core.FileWatcherEnabled, true))

var monitoringErrors []string
reloadErr := n.nginxBinary.Reload(response.nginxDetails.ProcessId, response.nginxDetails.ProcessPath)
if reloadErr != nil {
nginxConfigStatusMessage = fmt.Sprintf("Config apply failed (write): %v", reloadErr)
log.Errorf(nginxConfigStatusMessage)
} else {
monitoringErrors = n.monitorErrorLogs()
if len(monitoringErrors) > 0 {
nginxConfigStatusMessage = fmt.Sprintf("Config apply failed. The following errors were found in the NGINX error logs: %v", monitoringErrors)
} else {
log.Info("No errors found in NGINX errors logs after NGINX reload")
}
}

nginxReloadEventMeta := NginxReloadResponse{
Expand All @@ -481,11 +495,16 @@ func (n *Nginx) completeConfigApply(response *NginxConfigValidationResponse) *pr

n.messagePipeline.Process(core.NewMessage(core.NginxReloadComplete, nginxReloadEventMeta))

nginxConfigStatus := proto.NginxConfigStatus_OK
if reloadErr != nil || len(monitoringErrors) > 0 {
nginxConfigStatus = proto.NginxConfigStatus_ERROR
}

agentActivityStatus := &proto.AgentActivityStatus{
Status: &proto.AgentActivityStatus_NginxConfigStatus{
NginxConfigStatus: &proto.NginxConfigStatus{
CorrelationId: response.correlationId,
Status: proto.NginxConfigStatus_OK,
Status: nginxConfigStatus,
Message: nginxConfigStatusMessage,
NginxId: response.config.GetConfigData().GetNginxId(),
},
Expand Down Expand Up @@ -513,6 +532,64 @@ func (n *Nginx) completeConfigApply(response *NginxConfigValidationResponse) *pr
return status
}

func (n *Nginx) monitorErrorLogs() []string {
errorLogs := n.nginxBinary.GetErrorLogs()
if len(errorLogs) == 0 {
log.Warn("Skipping error log validation for NGINX reload because no error logs have been configured in the NGINX configuration")
}

errorChannel := make(chan string, len(errorLogs))
wg := &sync.WaitGroup{}

for logFile := range errorLogs {
wg.Add(1)
go n.tailLog(errorChannel, wg, logFile)
}

wg.Wait()
close(errorChannel)

errorsFound := []string{}
for errorLog := range errorChannel {
errorsFound = append(errorsFound, errorLog)
}

return errorsFound
}

func (n *Nginx) tailLog(errorChannel chan string, wg *sync.WaitGroup, logFile string) {
defer wg.Done()

t, err := tailer.NewTailer(logFile)
if err != nil {
log.Errorf("Unable to tail %q: %v", logFile, err)
return
}

ctx, cancel := context.WithCancel(context.Background())
defer cancel()

data := make(chan string, 1024)
go t.Tail(ctx, data)

tick := time.NewTicker(n.config.Nginx.ConfigReloadMonitoringPeriod)
defer tick.Stop()

for {
select {
case d := <-data:
for _, errorRegex := range reloadErrorList {
if errorRegex.MatchString(d) {
errorChannel <- d
return
}
}
case <-tick.C:
return
}
}
}

func (n *Nginx) rollbackConfigApply(response *NginxConfigValidationResponse) {
nginxConfigStatusMessage := fmt.Sprintf("Config apply failed (write): %v", response.err.Error())
log.Error(nginxConfigStatusMessage)
Expand Down
56 changes: 56 additions & 0 deletions src/plugins/nginx_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ import (
"errors"
"fmt"
"os"
"path"
"testing"
"time"

"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"

"github.com/nginx/agent/sdk/v2"
agent_config "github.com/nginx/agent/sdk/v2/agent/config"
Expand Down Expand Up @@ -527,6 +529,7 @@ func TestNginxConfigApply(t *testing.T) {
binary.On("UpdateNginxDetailsFromProcesses", env.Processes())
binary.On("GetNginxDetailsMapFromProcesses", env.Processes()).Return((tutils.GetDetailsMap()))
binary.On("Reload", mock.Anything, mock.Anything).Return(nil)
binary.On("GetErrorLogs").Return(make(map[string]string))

commandClient := tutils.GetMockCommandClient(test.config)
conf := &loadedConfig.Config{
Expand Down Expand Up @@ -831,6 +834,7 @@ func TestNginx_completeConfigApply(t *testing.T) {
binary.On("UpdateNginxDetailsFromProcesses", env.Processes())
binary.On("GetNginxDetailsMapFromProcesses", env.Processes()).Return((tutils.GetDetailsMap()))
binary.On("Reload", mock.Anything, mock.Anything)
binary.On("GetErrorLogs").Return(make(map[string]string))

commandClient := tutils.GetMockCommandClient(
&proto.NginxConfig{
Expand Down Expand Up @@ -1039,3 +1043,55 @@ func TestBlock_ConfigApply(t *testing.T) {

assert.Equal(t, testNAPDetailsActive.AppProtectWafDetails.WafVersion, pluginUnderTest.nginxAppProtectSoftwareDetails.WafVersion)
}

func TestNginx_monitorErrorLogs(t *testing.T) {
tmpDir := t.TempDir()
errorLogFileName := path.Join(tmpDir, "/error.log")
errorLogFile, err := os.Create(errorLogFileName)

defer func() {
err := errorLogFile.Close()
require.NoError(t, err, "Error closing error log file")
os.Remove(errorLogFile.Name())
}()

require.NoError(t, err, "Error creating error log")
commandClient := tutils.GetMockCommandClient(&proto.NginxConfig{})

env := tutils.GetMockEnvWithProcess()
binary := tutils.NewMockNginxBinary()
binary.On("GetErrorLogs").Return(make(map[string]string)).Once()
binary.On("GetErrorLogs").Return(map[string]string{errorLogFileName: ""}).Once()

config := tutils.GetMockAgentConfig()
config.Nginx.ConfigReloadMonitoringPeriod = 500 * time.Millisecond
pluginUnderTest := NewNginx(commandClient, binary, env, config)

// Validate that the monitoring is skipped if no error logs are configured
errorsFound := pluginUnderTest.monitorErrorLogs()
assert.Equal(t, 0, len(errorsFound))

errorsChannel := make(chan []string, 1)

// Validate that errors in the logs returned
go func() {
errorsFound := pluginUnderTest.monitorErrorLogs()
errorsChannel <- errorsFound
}()

time.Sleep(config.Nginx.ConfigReloadMonitoringPeriod / 2)

_, err = errorLogFile.WriteString("2023/03/14 14:16:23 [emerg] 3871#3871: bind() to 0.0.0.0:8081 failed (98: Address already in use)")
require.NoError(t, err, "Error writing data to error log file")

for {
select {
case x := <-errorsChannel:
assert.Equal(t, 1, len(x))
return
case <-time.After(config.Nginx.ConfigReloadMonitoringPeriod * 2):
assert.Fail(t, "Expected error to be reported")
return
}
}
}
Loading