Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions workflow/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ const (
// the strategy whose artifacts are being deleted
AnnotationKeyArtifactGCStrategy = workflow.WorkflowFullName + "/artifact-gc-strategy"

// AnnotationKeyLastSeenVersion is the last seen version for the workflow
AnnotationKeyLastSeenVersion = workflow.WorkflowFullName + "/last-seen-version"

// LabelParallelismLimit is a label applied on namespace objects to control the per namespace parallelism.
LabelParallelismLimit = workflow.WorkflowFullName + "/parallelism-limit"

Expand Down
41 changes: 41 additions & 0 deletions workflow/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ type recentCompletions struct {
mutex gosync.RWMutex
}

// lastSeenVersions is a mutex-protected table of resource versions, one entry
// per workflow. Every accessor visible in this change takes mutex before
// reading or writing versions.
type lastSeenVersions struct {
	// mutex guards versions.
	mutex gosync.RWMutex
	// versions maps a workflow UID to the resource version recorded for it.
	versions map[string]string
}

// WorkflowController is the controller for workflow resources
type WorkflowController struct {
// namespace of the workflow controller
Expand Down Expand Up @@ -153,6 +158,8 @@ type WorkflowController struct {
recentCompletions recentCompletions
// lastUnreconciledWorkflows is a map of workflows that have been recently unreconciled
lastUnreconciledWorkflows map[string]*wfv1.Workflow

lastSeenVersions lastSeenVersions // key: workflow UID, value: resource version
}

const (
Expand Down Expand Up @@ -205,6 +212,10 @@ func NewWorkflowController(ctx context.Context, restConfig *rest.Config, kubecli
eventRecorderManager: events.NewEventRecorderManager(kubeclientset),
progressPatchTickDuration: env.LookupEnvDurationOr(ctx, common.EnvVarProgressPatchTickDuration, 1*time.Minute),
progressFileTickDuration: env.LookupEnvDurationOr(ctx, common.EnvVarProgressFileTickDuration, 3*time.Second),
lastSeenVersions: lastSeenVersions{
versions: make(map[string]string),
mutex: gosync.RWMutex{},
},
}

if executorPlugins {
Expand Down Expand Up @@ -724,6 +735,12 @@ func (wfc *WorkflowController) processNextItem(ctx context.Context) bool {
return true
}

if wfc.isOutdated(un) {
logger.WithField("key", key).Debug(ctx, "Skipping outdated workflow event")
wfc.wfQueue.AddRateLimited(key)
return true
}
Comment on lines +738 to +742
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Outdated-event handling may spin indefinitely if invariants are broken

The early‑return in processNextItem:

if wfc.isOutdated(un) {
    logger.WithField("key", key).Debug(ctx, "Skipping outdated workflow event")
    wfc.wfQueue.AddRateLimited(key)
    return true
}

is fine as long as isOutdated only returns true temporarily (for genuinely stale states). However, if the lastSeenVersions map and the last-seen-version annotation ever diverge permanently for a non‑completed workflow (e.g. annotation removed or overwritten by a user/tool), this branch will:

  • Always classify the workflow as outdated,
  • Continuously re‑queue it with backoff,
  • Never reach reconciliationNeeded or operate, effectively wedging that workflow.

The robustness of isOutdated is therefore critical; see comment on its implementation below.

🤖 Prompt for AI Agents
In workflow/controller/controller.go around lines 738-742, the early-return on
isOutdated(...) can cause a workflow to be perpetually re-queued if the
controller's lastSeenVersions map and the workflow's last-seen-version
annotation diverge permanently; change the handling so we don't indefinitely
AddRateLimited+return: detect permanent divergence (e.g. track retry count or
timestamp per key, or make isOutdated return a tri-state that indicates
permanent mismatch), and when that threshold is exceeded log a warning and
either clear/normalize the annotation or treat the item as not-outdated so
reconciliationNeeded/operate run (or drop the key without requeue if
appropriate). Ensure you record/inspect the retry count from the workqueue or
add a small per-key counter, avoid tight infinite requeues, and make the code
fall through to normal reconciliation or a terminal drop once the threshold is
reached.


if !reconciliationNeeded(un) {
logger.WithField("key", key).Debug(ctx, "Won't process Workflow since it's completed")
return true
Expand Down Expand Up @@ -946,6 +963,7 @@ func (wfc *WorkflowController) addWorkflowInformerHandlers(ctx context.Context)
if !needed {
key, _ := cache.MetaNamespaceKeyFunc(un)
wfc.recordCompletedWorkflow(key)
wfc.deleteLastSeenVersionKey(wfc.getLastSeenVersionKey(un))
}
return needed
},
Expand Down Expand Up @@ -1003,6 +1021,7 @@ func (wfc *WorkflowController) addWorkflowInformerHandlers(ctx context.Context)
// no need to add to the queue - this workflow is done
wfc.throttler.Remove(key)
}
wfc.deleteLastSeenVersionKey(wfc.getLastSeenVersionKey(obj.(*unstructured.Unstructured)))
},
},
},
Expand Down Expand Up @@ -1346,3 +1365,25 @@ func (wfc *WorkflowController) IsLeader() bool {
// the wfc.wfInformer is nil if it is not the leader
return wfc.wfInformer != nil
}

// isOutdated reports whether wf looks older than what the controller last
// recorded for it.
//
// It returns false when no resource version (or an empty one) is recorded for
// the workflow's UID, so a workflow that has not been seen before is always
// processed. Otherwise it returns true when the workflow's
// last-seen-version annotation differs from the recorded value.
//
// NOTE(review): a missing annotation compares as "" and therefore counts as a
// mismatch whenever a non-empty version is recorded; the result stays true
// until the recorded entry is removed or the annotation matches again.
func (wfc *WorkflowController) isOutdated(wf metav1.Object) bool {
	key := wfc.getLastSeenVersionKey(wf)

	wfc.lastSeenVersions.mutex.RLock()
	// A missing key yields "", which is handled the same as an empty entry.
	recorded := wfc.lastSeenVersions.versions[key]
	wfc.lastSeenVersions.mutex.RUnlock()

	if recorded == "" {
		return false
	}
	return wf.GetAnnotations()[common.AnnotationKeyLastSeenVersion] != recorded
}

// getLastSeenVersionKey returns the key under which wf is tracked in
// lastSeenVersions: the object's UID converted to a string.
func (wfc *WorkflowController) getLastSeenVersionKey(wf metav1.Object) string {
	uid := wf.GetUID()
	return string(uid)
}

// deleteLastSeenVersionKey removes the entry for key from lastSeenVersions
// while holding the write lock. Deleting a key that is not present is a no-op.
func (wfc *WorkflowController) deleteLastSeenVersionKey(key string) {
	seen := &wfc.lastSeenVersions

	seen.mutex.Lock()
	delete(seen.versions, key)
	seen.mutex.Unlock()
}
Comment on lines +1369 to +1389
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Make isOutdated resilient to missing or externally modified annotations

Current implementation:

func (wfc *WorkflowController) isOutdated(wf metav1.Object) bool {
	wfc.lastSeenVersions.mutex.RLock()
	defer wfc.lastSeenVersions.mutex.RUnlock()
	lastSeenRV, ok := wfc.lastSeenVersions.versions[wfc.getLastSeenVersionKey(wf)]
	// always process if not seen before
	if !ok || lastSeenRV == "" {
		return false
	}
	annotations := wf.GetAnnotations()[common.AnnotationKeyLastSeenVersion]
	return annotations != lastSeenRV
}

Concerns:

  1. Annotation removed or not propagated
    If a workflow was processed once (so lastSeenVersions has an entry) and later some actor removes or overwrites workflows.argoproj.io/last-seen-version (e.g. via kubectl apply or kubectl annotate), then:

    • lastSeenRV remains non‑empty,
    • annotations becomes empty or different,
    • isOutdated will always return true for that workflow.

    Combined with the processNextItem early‑return, this can permanently wedge that workflow — it is perpetually classified as outdated and never reconciled again, with no self‑healing path.

  2. Inconsistent map/annotation due to rare paths
    Any future changes that update the map but not the annotation (or vice versa) will have the same effect: a workflow stuck behind isOutdated and continually re‑queued.

To make this more robust while preserving stale‑event protection, consider treating a missing or empty annotation as “we can’t safely apply staleness filtering” and either:

  • Skip staleness filtering and process normally, or
  • First drop the map entry so subsequent calls see !ok and also process normally.

For example:

func (wfc *WorkflowController) isOutdated(wf metav1.Object) bool {
	wfc.lastSeenVersions.mutex.RLock()
-	lastSeenRV, ok := wfc.lastSeenVersions.versions[wfc.getLastSeenVersionKey(wf)]
+	key := wfc.getLastSeenVersionKey(wf)
+	lastSeenRV, ok := wfc.lastSeenVersions.versions[key]
	wfc.lastSeenVersions.mutex.RUnlock()

	// always process if not seen before
	if !ok || lastSeenRV == "" {
		return false
	}
-	annotations := wf.GetAnnotations()[common.AnnotationKeyLastSeenVersion]
-	return annotations != lastSeenRV
+	ann := wf.GetAnnotations()[common.AnnotationKeyLastSeenVersion]
+	if ann == "" {
+		// Annotation missing or cleared; drop stale map entry to avoid wedging this workflow.
+		wfc.deleteLastSeenVersionKey(key)
+		return false
+	}
+	return ann != lastSeenRV
}

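This keeps the optimisation for stale events that still carry an older, non-empty annotation, while avoiding permanent starvation if the annotation is lost or touched by external tooling. Caveat: a stale cached object that predates the first persist has no annotation at all, so with this change it would be processed rather than skipped; confirm that trade-off is acceptable before adopting it.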

22 changes: 22 additions & 0 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,8 @@ func (woc *wfOperationCtx) persistUpdates(ctx context.Context) {
woc.log.WithError(err).Warn(ctx, "error updating taskset")
}

oldRV := woc.wf.ResourceVersion
woc.updateLastSeenVersionAnnotation(oldRV)
wf, err := wfClient.Update(ctx, woc.wf, metav1.UpdateOptions{})
Comment on lines +764 to 766
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Keep annotation and in-memory last-seen map in sync on all persist paths

The normal persistUpdates path correctly:

  • Captures oldRV from woc.wf.ResourceVersion,
  • Sets AnnotationKeyLastSeenVersion on the object before Update,
  • Updates the controller’s lastSeenVersions map with the same oldRV.

However, the size-limit error path only updates the map:

woc.wf = woc.orig.DeepCopy()
woc.markWorkflowError(ctx, err)
oldRV := woc.wf.ResourceVersion
_, err = wfClient.Update(ctx, woc.wf, metav1.UpdateOptions{})
if err != nil {
    ...
} else {
    woc.updateLastSeenVersion(oldRV)
}

If the previous persisted object already had a last-seen-version annotation, this makes the map hold oldRV while the persisted annotation still holds the previous value. WorkflowController.isOutdated will then see annotations != lastSeenRV and treat subsequent events for this workflow as stale, repeatedly re‑queuing and never operating on them, until the workflow is completed/deleted and the key is cleaned up.

To preserve the invariant that the map and annotation always match after any successful write, the size‑limit path should also set the annotation before the Update, mirroring the main path.

Suggested diff:

 func (woc *wfOperationCtx) persistWorkflowSizeLimitErr(ctx context.Context, wfClient v1alpha1.WorkflowInterface, err error) {
 	woc.wf = woc.orig.DeepCopy()
 	woc.markWorkflowError(ctx, err)
-	oldRV := woc.wf.ResourceVersion
-	_, err = wfClient.Update(ctx, woc.wf, metav1.UpdateOptions{})
+	oldRV := woc.wf.ResourceVersion
+	woc.updateLastSeenVersionAnnotation(oldRV)
+	_, err = wfClient.Update(ctx, woc.wf, metav1.UpdateOptions{})
 	if err != nil {
 		woc.log.WithError(err).Warn(ctx, "Error updating workflow with size error")
 	} else {
 		woc.updateLastSeenVersion(oldRV)
 	}
 }

This keeps the annotation and lastSeenVersions map coherent across both success paths.

Also applies to: 789-789, 865-871

🤖 Prompt for AI Agents
In workflow/controller/operator.go around lines 764-766 (and similarly at 789
and 865-871), the size-limit error path updates only the in-memory
lastSeenVersions map but does not set the AnnotationKeyLastSeenVersion on the
workflow before persisting, causing the annotation and map to diverge; fix by
capturing oldRV := woc.wf.ResourceVersion, call
woc.updateLastSeenVersionAnnotation(oldRV) on the workflow object before calling
wfClient.Update(...), then on successful Update call
woc.updateLastSeenVersion(oldRV) to keep the persisted annotation and in-memory
map in sync (apply same change to the other mentioned blocks).

if err != nil {
woc.log.WithField("error", err).WithField("reason", apierr.ReasonForError(err)).Warn(ctx, "Error updating workflow")
Expand All @@ -784,6 +786,7 @@ func (woc *wfOperationCtx) persistUpdates(ctx context.Context) {
woc.controller.hydrator.HydrateWithNodes(woc.wf, nodes)
}

woc.updateLastSeenVersion(oldRV)
// The workflow returned from wfClient.Update doesn't have a TypeMeta associated
// with it, so copy from the original workflow.
woc.wf.TypeMeta = woc.orig.TypeMeta
Expand Down Expand Up @@ -859,9 +862,12 @@ func (woc *wfOperationCtx) writeBackToInformer() error {
// persistWorkflowSizeLimitErr discards the in-progress changes by resetting
// woc.wf to a deep copy of woc.orig, marks that copy as errored with err, and
// attempts to persist it through wfClient.
//
// Before the Update call the last-seen-version annotation is stamped with the
// pre-update resource version, and the controller's in-memory record is set to
// that same value only if the Update succeeds. This mirrors what
// persistUpdates does, so the persisted annotation and the in-memory record
// cannot disagree after a successful write; a disagreement would make
// isOutdated classify later events for this workflow as stale.
//
// A failed Update is logged as a warning and otherwise ignored.
func (woc *wfOperationCtx) persistWorkflowSizeLimitErr(ctx context.Context, wfClient v1alpha1.WorkflowInterface, err error) {
	woc.wf = woc.orig.DeepCopy()
	woc.markWorkflowError(ctx, err)

	// Capture the version we are updating from and write it onto the object so
	// the persisted annotation matches what is recorded in memory below.
	oldRV := woc.wf.ResourceVersion
	woc.updateLastSeenVersionAnnotation(oldRV)

	if _, err = wfClient.Update(ctx, woc.wf, metav1.UpdateOptions{}); err != nil {
		woc.log.WithError(err).Warn(ctx, "Error updating workflow with size error")
		return
	}
	woc.updateLastSeenVersion(oldRV)
}

Expand Down Expand Up @@ -4393,3 +4399,19 @@ func (woc *wfOperationCtx) setNodeDisplayName(ctx context.Context, node *wfv1.No
newNode.DisplayName = displayName
woc.wf.Status.Nodes.Set(ctx, nodeID, *newNode)
}

// updateLastSeenVersionAnnotation sets the last-seen-version annotation on
// woc.wf to value, creating the annotations map first if the workflow has
// none. It only mutates the in-memory object; nothing is persisted here.
func (woc *wfOperationCtx) updateLastSeenVersionAnnotation(value string) {
	annotations := woc.wf.GetAnnotations()
	if annotations == nil {
		annotations = make(map[string]string)
		woc.wf.SetAnnotations(annotations)
	}
	annotations[common.AnnotationKeyLastSeenVersion] = value
}

// updateLastSeenVersion records value as the last seen version for woc.wf in
// the controller's lastSeenVersions table, keyed by the workflow's UID. The
// write happens under the table's write lock, and the map is created on first
// use if it is still nil.
func (woc *wfOperationCtx) updateLastSeenVersion(value string) {
	key := woc.controller.getLastSeenVersionKey(woc.wf)
	seen := &woc.controller.lastSeenVersions

	seen.mutex.Lock()
	defer seen.mutex.Unlock()

	if seen.versions == nil {
		seen.versions = map[string]string{key: value}
		return
	}
	seen.versions[key] = value
}
Loading