Skip to content

Commit

Permalink
Monitor NGINX logs for errors & NGINX worker processes after a NGINX …
Browse files Browse the repository at this point in the history
…reload (#255)

* Add monitoring of NGINX error logs after a NGINX reload

* Make NGINX reload monitoring period configurable

* Add pid drain check to NGINX monitoring post apply

* Change treat warnings as errors to be configurable

* Add locks to process information

---------

Co-authored-by: Oliver O'Mahony <[email protected]>
  • Loading branch information
dhurley and oliveromahony authored Apr 3, 2023
1 parent 078ddd3 commit 983fa21
Show file tree
Hide file tree
Showing 24 changed files with 1,020 additions and 254 deletions.
7 changes: 3 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ build/nginx-agent:
build: build/nginx-agent ## Build agent executable

deps: ## Update dependencies in vendor folders
git diff --quiet || { echo "Local changes found. Please commit or stash your changes." >&2; exit 1; }
cd sdk && make generate
for dir in ${VENDOR_LOCATIONS}; do \
(cd "$$dir" && echo "Running vendor commands on $$dir" && go mod tidy && go mod vendor && cd "$$OLDPWD" || exit) \
Expand Down Expand Up @@ -264,14 +263,14 @@ image: ## Build agent container image for NGINX Plus, need nginx-repo.crt and ng
oss-image: ## Build agent container image for NGINX OSS
@echo Building image with $(CONTAINER_CLITOOL); \
$(CONTAINER_BUILDENV) $(CONTAINER_CLITOOL) build -t ${IMAGE_TAG} . \
--no-cache -f ./scripts/docker/nginx-oss/${OS_RELEASE}/Dockerfile \
--no-cache -f ./scripts/docker/nginx-oss/${CONTAINER_OS_TYPE}/Dockerfile \
--build-arg PACKAGE_NAME=${PACKAGE_NAME} \
--build-arg BASE_IMAGE=${BASE_IMAGE} \
--build-arg ENTRY_POINT=./scripts/docker/nginx-oss/entrypoint.sh
--build-arg ENTRY_POINT=./scripts/docker/entrypoint.sh

run-container: ## Run container from specified IMAGE_TAG
@echo Running ${IMAGE_TAG} with $(CONTAINER_CLITOOL); \
$(CONTAINER_CLITOOL) run --mount type=bind,source=${PWD}/nginx-agent.conf,target=/etc/nginx-agent/nginx-agent.conf ${IMAGE_TAG}
$(CONTAINER_CLITOOL) run -p 127.0.0.1:8081:8081/tcp --mount type=bind,source=${PWD}/nginx-agent.conf,target=/etc/nginx-agent/nginx-agent.conf ${IMAGE_TAG}

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Grafana Example Dashboard Targets #
Expand Down
11 changes: 7 additions & 4 deletions src/core/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ func SetDefaults() {

// NGINX DEFAULTS
Viper.SetDefault(NginxClientVersion, Defaults.Nginx.NginxClientVersion)
Viper.SetDefault(NginxConfigReloadMonitoringPeriod, Defaults.Nginx.ConfigReloadMonitoringPeriod)
}

func setFlagDeprecated(name string, usageMessage string) {
Expand Down Expand Up @@ -284,10 +285,12 @@ func getDataplane() Dataplane {

func getNginx() Nginx {
return Nginx{
ExcludeLogs: Viper.GetString(NginxExcludeLogs),
Debug: Viper.GetBool(NginxDebug),
NginxCountingSocket: Viper.GetString(NginxCountingSocket),
NginxClientVersion: Viper.GetInt(NginxClientVersion),
ExcludeLogs: Viper.GetString(NginxExcludeLogs),
Debug: Viper.GetBool(NginxDebug),
NginxCountingSocket: Viper.GetString(NginxCountingSocket),
NginxClientVersion: Viper.GetInt(NginxClientVersion),
ConfigReloadMonitoringPeriod: Viper.GetDuration(NginxConfigReloadMonitoringPeriod),
TreatWarningsAsErrors: Viper.GetBool(NginxTreatWarningsAsErrors),
}
}

Expand Down
1 change: 1 addition & 0 deletions src/core/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ func TestGetConfig(t *testing.T) {
assert.Equal(t, Defaults.Server.Metrics, config.Server.Metrics)

assert.Equal(t, Defaults.AgentAPI.Port, config.AgentAPI.Port)
assert.Equal(t, Defaults.Nginx.ConfigReloadMonitoringPeriod, config.Nginx.ConfigReloadMonitoringPeriod)

assert.True(t, len(config.AllowedDirectoriesMap) > 0)
assert.Equal(t, Defaults.ConfigDirs, config.ConfigDirs)
Expand Down
28 changes: 21 additions & 7 deletions src/core/config/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,11 @@ var (
Token: uuid.New().String(),
},
Nginx: Nginx{
Debug: false,
NginxCountingSocket: "unix:/var/run/nginx-agent/nginx.sock",
NginxClientVersion: 6,
Debug: false,
NginxCountingSocket: "unix:/var/run/nginx-agent/nginx.sock",
NginxClientVersion: 6,
ConfigReloadMonitoringPeriod: 10 * time.Second,
TreatWarningsAsErrors: false,
},
ConfigDirs: "/etc/nginx:/usr/local/etc/nginx:/usr/share/nginx/modules:/etc/nms",
AllowedDirectoriesMap: map[string]struct{}{},
Expand Down Expand Up @@ -128,10 +130,12 @@ const (
// viper keys used in config
NginxKey = "nginx"

NginxExcludeLogs = NginxKey + agent_config.KeyDelimiter + "exclude_logs"
NginxDebug = NginxKey + agent_config.KeyDelimiter + "debug"
NginxCountingSocket = NginxKey + agent_config.KeyDelimiter + "socket"
NginxClientVersion = NginxKey + agent_config.KeyDelimiter + "client_version"
NginxExcludeLogs = NginxKey + agent_config.KeyDelimiter + "exclude_logs"
NginxDebug = NginxKey + agent_config.KeyDelimiter + "debug"
NginxCountingSocket = NginxKey + agent_config.KeyDelimiter + "socket"
NginxClientVersion = NginxKey + agent_config.KeyDelimiter + "client_version"
NginxConfigReloadMonitoringPeriod = NginxKey + agent_config.KeyDelimiter + "config_reload_monitoring_period"
NginxTreatWarningsAsErrors = NginxKey + agent_config.KeyDelimiter + "treat_warnings_as_errors"

// viper keys used in config
DataplaneKey = "dataplane"
Expand Down Expand Up @@ -247,6 +251,16 @@ var (
Usage: "The NGINX Plus counting unix socket location.",
DefaultValue: Defaults.Nginx.NginxCountingSocket,
},
&DurationFlag{
Name: NginxConfigReloadMonitoringPeriod,
Usage: "The duration the NGINX Agent will monitor error logs after a NGINX reload",
DefaultValue: Defaults.Nginx.ConfigReloadMonitoringPeriod,
},
&BoolFlag{
Name: NginxTreatWarningsAsErrors,
Usage: "On nginx -t, treat warnings as failures on configuration application.",
DefaultValue: Defaults.Nginx.TreatWarningsAsErrors,
},
// Metrics
&DurationFlag{
Name: MetricsCollectionInterval,
Expand Down
10 changes: 6 additions & 4 deletions src/core/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,12 @@ type TLSConfig struct {

// Nginx settings
type Nginx struct {
ExcludeLogs string `mapstructure:"exclude_logs" yaml:"-"`
Debug bool `mapstructure:"debug" yaml:"-"`
NginxCountingSocket string `mapstructure:"socket" yaml:"-"`
NginxClientVersion int `mapstructure:"client_version" yaml:"-"`
ExcludeLogs string `mapstructure:"exclude_logs" yaml:"-"`
Debug bool `mapstructure:"debug" yaml:"-"`
NginxCountingSocket string `mapstructure:"socket" yaml:"-"`
NginxClientVersion int `mapstructure:"client_version" yaml:"-"`
ConfigReloadMonitoringPeriod time.Duration `mapstructure:"config_reload_monitoring_period" yaml:"-"`
TreatWarningsAsErrors bool `mapstructure:"treat_warnings_as_errors" yaml:"-"`
}

type Dataplane struct {
Expand Down
2 changes: 1 addition & 1 deletion src/core/metrics/sources/nginx_access_log.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import (
"github.com/nginx/agent/sdk/v2/proto"
"github.com/nginx/agent/v2/src/core"
"github.com/nginx/agent/v2/src/core/metrics"
"github.com/nginx/agent/v2/src/core/metrics/sources/tailer"
"github.com/nginx/agent/v2/src/core/tailer"

log "github.com/sirupsen/logrus"
)
Expand Down
2 changes: 1 addition & 1 deletion src/core/metrics/sources/nginx_error_log.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (

"github.com/nginx/agent/v2/src/core"
"github.com/nginx/agent/v2/src/core/metrics"
"github.com/nginx/agent/v2/src/core/metrics/sources/tailer"
"github.com/nginx/agent/v2/src/core/tailer"

log "github.com/sirupsen/logrus"
)
Expand Down
20 changes: 16 additions & 4 deletions src/core/nginx.go
Original file line number Diff line number Diff line change
Expand Up @@ -269,15 +269,27 @@ func (n *NginxBinaryType) ValidateConfig(processId, bin, configLocation string,
return fmt.Errorf("error running nginx -t -c %v:\n%s", configLocation, response)
}

if strings.Contains(response.String(), "nginx: [warn]") {
return fmt.Errorf("error running nginx -t -c %v:\n%s", configLocation, response)
err = n.validateConfigCheckResponse(response, configLocation)
if err != nil {
return err
}

log.Infof("Config validated:\n%s", response)

return nil
}

func (n *NginxBinaryType) validateConfigCheckResponse(response *bytes.Buffer, configLocation string) (error) {
if bytes.Contains(response.Bytes(), []byte("[emerg]")) {
return fmt.Errorf("error running nginx -t -c %v:\n%s", configLocation, response)
}

if (n.config.Nginx.TreatWarningsAsErrors && bytes.Contains(response.Bytes(), []byte("[warn]"))) {
return fmt.Errorf("error running nginx -t -c %v:\n%s", configLocation, response)
}
return nil
}

// Stop stops an instance of NGINX.
func (n *NginxBinaryType) Stop(processId, bin string) error {
log.Info("Stopping NGINX")
Expand Down Expand Up @@ -330,8 +342,8 @@ func (n *NginxBinaryType) WriteConfig(config *proto.NginxConfig) (*sdk.ConfigApp
}
}

details, ok := n.nginxDetailsMap[config.ConfigData.NginxId]
if !ok || details == nil {
details := n.GetNginxDetailsByID(config.ConfigData.NginxId)
if details == nil {
return nil, fmt.Errorf("NGINX instance %s not found", config.ConfigData.NginxId)
}

Expand Down
40 changes: 38 additions & 2 deletions src/core/nginx_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ func TestWriteBackup(t *testing.T) {
}{
{
name: "enabled test",
config: config.Config{Nginx: config.Nginx{Debug: true}},
config: config.Config{Nginx: config.Nginx{Debug: true, TreatWarningsAsErrors: true}},
nginxConfig: &proto.NginxConfig{Zconfig: zippedFile, Zaux: zippedFile},
confFiles: []*proto.File{
{
Expand All @@ -402,7 +402,7 @@ func TestWriteBackup(t *testing.T) {
},
{
name: "not enabled test",
config: config.Config{Nginx: config.Nginx{Debug: false}},
config: config.Config{Nginx: config.Nginx{Debug: false, TreatWarningsAsErrors: false}},
nginxConfig: &proto.NginxConfig{},
confFiles: []*proto.File{},
auxFiles: []*proto.File{},
Expand Down Expand Up @@ -945,3 +945,39 @@ func TestNginxBinaryType_sanitizeProcessPath(t *testing.T) {
})
}
}

func TestNginxBinaryType_validateConfigCheckResponse(t *testing.T) {
type testDef struct {
name string
response string
expected interface{}
treatWarningsAsErrors bool
}

// no test case for process lookup, that would require running nginx or proc somewhere
for _, test := range []testDef{
{name: "validation fails, emerg respected", response: "nginx [emerg]", treatWarningsAsErrors: false, expected: errors.New("error running nginx -t -c :\nnginx [emerg]")},
{name: "validation fails, emerg respected, config irrelevant", response: "nginx [emerg]", treatWarningsAsErrors: true, expected: errors.New("error running nginx -t -c :\nnginx [emerg]")},
{name: "validation passes, warn ignored", response: "nginx [warn]", treatWarningsAsErrors: false, expected: nil},
{name: "validation fails, warn respected", response: "nginx [warn]", treatWarningsAsErrors: true, expected: errors.New("error running nginx -t -c :\nnginx [warn]")},
{name: "validation passes, info irrelevant", response: "nginx [info]", treatWarningsAsErrors: false, expected: nil},
{name: "validation passes, info irrelevant, config irrelevant", response: "nginx [info]", treatWarningsAsErrors: true, expected: nil},
{name: "validation fails unknown directive", response: "nginx: [emerg] unknown directive \"location/\" in /etc/nginx/sites-enabled/someapp:5", treatWarningsAsErrors: false, expected: errors.New("error running nginx -t -c :\nnginx: [emerg] unknown directive \"location/\" in /etc/nginx/sites-enabled/someapp:5")},
{name: "validation fails conflicting server name", response: "nginx: [warn] conflicting server name \"example.com\" on 0.0.0.0:80, ignored", treatWarningsAsErrors: true, expected: errors.New("error running nginx -t -c :\nnginx: [warn] conflicting server name \"example.com\" on 0.0.0.0:80, ignored")},
{name: "validation fails limit_req", response: "nginx: [emerg] 96300#96300: limit_req \"default\" uses the \"$binary_remote_addr\" key while previously it used the \"$http_apiKey\" key", treatWarningsAsErrors: true, expected: errors.New("error running nginx -t -c :\nnginx: [emerg] 96300#96300: limit_req \"default\" uses the \"$binary_remote_addr\" key while previously it used the \"$http_apiKey\" key")},
{name: "validation fails host not found in upstream", response: "nginx: [emerg] 5191#5191: host not found in upstream \"example.com:80\" in /etc/nginx/nginx.conf:111", treatWarningsAsErrors: false, expected: errors.New("error running nginx -t -c :\nnginx: [emerg] 5191#5191: host not found in upstream \"example.com:80\" in /etc/nginx/nginx.conf:111")},
{name: "validation fails worker_connections", response: "nginx: [warn] 2048 worker_connections exceed open file resource limit: 1024", treatWarningsAsErrors: true, expected: errors.New("error running nginx -t -c :\nnginx: [warn] 2048 worker_connections exceed open file resource limit: 1024")},
{name: "validation passes worker_connections", response: "nginx: [warn] 2048 worker_connections exceed open file resource limit: 1024", treatWarningsAsErrors: false, expected: nil},
} {
t.Run(test.name, func(tt *testing.T) {
binary := NginxBinaryType{
env: &EnvironmentType{},
config: &config.Config{Nginx: config.Nginx{Debug: true, TreatWarningsAsErrors: test.treatWarningsAsErrors}},

}
buffer := bytes.NewBuffer([]byte(test.response))
err := binary.validateConfigCheckResponse(buffer, "")
assert.Equal(tt, test.expected, err)
})
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ import (

var (
tailConfig = tail.Config{
Follow: true,
ReOpen: true,
Poll: true,
Follow: true,
ReOpen: true,
MustExist: true,
Poll: true,
Location: &tail.SeekInfo{
Whence: io.SeekEnd,
},
Expand Down Expand Up @@ -108,11 +109,14 @@ func (t *Tailer) Tail(ctx context.Context, data chan<- string) {
data <- line.Text

case <-ctx.Done():
err := ctx.Err()
if err != nil {
log.Errorf("error in done context Tail %v", err)
ctxErr := ctx.Err()
switch ctxErr {
case context.DeadlineExceeded:
log.Tracef("Tailer cancelled. Deadline exceeded, %v", ctxErr)
case context.Canceled:
log.Tracef("Tailer forcibly cancelled, %v", ctxErr)
}
log.Info("tailer is done")
log.Trace("Tailer is done")
return
}
}
Expand All @@ -134,11 +138,14 @@ func (t *PatternTailer) Tail(ctx context.Context, data chan<- map[string]string)
data <- l
}
case <-ctx.Done():
err := ctx.Err()
if err != nil {
log.Errorf("error in done context Tail %v", err)
ctxErr := ctx.Err()
switch ctxErr {
case context.DeadlineExceeded:
log.Tracef("Tailer cancelled because deadline was exceeded, %v", ctxErr)
case context.Canceled:
log.Tracef("Tailer forcibly cancelled, %v", ctxErr)
}
log.Info("tailer is done")
log.Tracef("Tailer is done")
return
}
}
Expand Down
File renamed without changes.
Loading

0 comments on commit 983fa21

Please sign in to comment.