@@ -51,6 +51,8 @@ const (
5151
5252 lostEventsUpdateInterval = time .Second * 15
5353 maxDefaultStreamBufferConsumers = 4
54+
55+ setPIDMaxRetries = 5
5456)
5557
5658type backpressureStrategy uint8
@@ -137,10 +139,32 @@ func newAuditClient(c *Config, log *logp.Logger) (*libaudit.AuditClient, error)
137139 return libaudit .NewAuditClient (nil )
138140}
139141
142+ func closeAuditClient (client * libaudit.AuditClient ) error {
143+ discard := func (bytes []byte ) ([]syscall.NetlinkMessage , error ) {
144+ return nil , nil
145+ }
146+ // Drain the netlink channel in parallel to Close() to prevent a deadlock.
147+ // This goroutine will terminate once receive from netlink errors (EBADF,
148+ // EBADFD, or any other error). This happens because the fd is closed.
149+ go func () {
150+ for {
151+ _ , err := client .Netlink .Receive (true , discard )
152+ switch err {
153+ case nil , syscall .EINTR :
154+ case syscall .EAGAIN :
155+ time .Sleep (50 * time .Millisecond )
156+ default :
157+ return
158+ }
159+ }
160+ }()
161+ return client .Close ()
162+ }
163+
140164// Run initializes the audit client and receives audit messages from the
141165// kernel until the reporter's done channel is closed.
142166func (ms * MetricSet ) Run (reporter mb.PushReporterV2 ) {
143- defer ms .client . Close ( )
167+ defer closeAuditClient ( ms .client )
144168
145169 if err := ms .addRules (reporter ); err != nil {
146170 reporter .Error (err )
@@ -164,7 +188,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
164188 go func () {
165189 defer func () { // Close the most recently allocated "client" instance.
166190 if client != nil {
167- client . Close ( )
191+ closeAuditClient ( client )
168192 }
169193 }()
170194 timer := time .NewTicker (lostEventsUpdateInterval )
@@ -178,7 +202,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
178202 ms .updateKernelLostMetric (status .Lost )
179203 } else {
180204 ms .log .Error ("get status request failed:" , err )
181- if err = client . Close ( ); err != nil {
205+ if err = closeAuditClient ( client ); err != nil {
182206 ms .log .Errorw ("Error closing audit monitoring client" , "error" , err )
183207 }
184208 client , err = libaudit .NewAuditClient (nil )
@@ -233,7 +257,7 @@ func (ms *MetricSet) addRules(reporter mb.PushReporterV2) error {
233257 if err != nil {
234258 return errors .Wrap (err , "failed to create audit client for adding rules" )
235259 }
236- defer client . Close ( )
260+ defer closeAuditClient ( client )
237261
238262 // Don't attempt to change configuration if audit rules are locked (enabled == 2).
239263 // Will result in EPERM.
@@ -350,10 +374,12 @@ func (ms *MetricSet) initClient() error {
350374 return errors .Wrap (err , "failed to enable auditing in the kernel" )
351375 }
352376 }
377+
353378 if err := ms .client .WaitForPendingACKs (); err != nil {
354379 return errors .Wrap (err , "failed to wait for ACKs" )
355380 }
356- if err := ms .client .SetPID (libaudit .WaitForReply ); err != nil {
381+
382+ if err := ms .setPID (setPIDMaxRetries ); err != nil {
357383 if errno , ok := err .(syscall.Errno ); ok && errno == syscall .EEXIST && status .PID != 0 {
358384 return fmt .Errorf ("failed to set audit PID. An audit process is already running (PID %d)" , status .PID )
359385 }
@@ -362,6 +388,20 @@ func (ms *MetricSet) initClient() error {
362388 return nil
363389}
364390
391+ func (ms * MetricSet ) setPID (retries int ) (err error ) {
392+ if err = ms .client .SetPID (libaudit .WaitForReply ); err == nil || errors .Cause (err ) != syscall .ENOBUFS || retries == 0 {
393+ return err
394+ }
395+ // At this point the netlink channel is congested (ENOBUFS).
396+ // Drain and close the client, then retry with a new client.
397+ closeAuditClient (ms .client )
398+ if ms .client , err = newAuditClient (& ms .config , ms .log ); err != nil {
399+ return errors .Wrapf (err , "failed to recover from ENOBUFS" )
400+ }
401+ ms .log .Info ("Recovering from ENOBUFS ..." )
402+ return ms .setPID (retries - 1 )
403+ }
404+
365405func (ms * MetricSet ) updateKernelLostMetric (lost uint32 ) {
366406 if ! ms .kernelLost .enabled {
367407 return
0 commit comments