From 51b2b64ff806b032d1777a483b9a39a9360a9d62 Mon Sep 17 00:00:00 2001 From: chronark Date: Sun, 18 Jan 2026 19:21:15 +0100 Subject: [PATCH 01/32] wip: make krane initiates smaller watches --- gen/proto/ctrl/v1/cluster.pb.go | 267 +++++++++++------- .../ctrl/v1/ctrlv1connect/cluster.connect.go | 28 ++ pkg/assert/greater_or_equal.go | 2 +- pkg/db/BUILD.bazel | 3 + .../bulk_state_change_insert.sql_generated.go | 41 +++ pkg/db/models_generated.go | 50 ++++ pkg/db/querier_bulk_generated.go | 1 + pkg/db/querier_generated.go | 22 ++ ..._change_find_by_cluster_after_sequence.sql | 6 + pkg/db/queries/state_change_insert.sql | 12 + pkg/db/schema.sql | 10 + ...by_cluster_after_sequence.sql_generated.go | 59 ++++ pkg/db/state_change_insert.sql_generated.go | 57 ++++ svc/ctrl/proto/ctrl/v1/cluster.proto | 24 +- svc/ctrl/services/cluster/BUILD.bazel | 2 + svc/ctrl/services/cluster/rpc_sync.go | 164 +++++++++++ svc/ctrl/services/cluster/rpc_watch.go | 196 ++++--------- svc/ctrl/workflows/deploy/deploy_handler.go | 116 +++++--- svc/krane/internal/reconciler/BUILD.bazel | 46 +-- ...ment_test.go => apply_deployment_test.go_} | 0 ...ntinel_test.go => apply_sentinel_test.go_} | 0 ...ent_test.go => delete_deployment_test.go_} | 0 ...tinel_test.go => delete_sentinel_test.go_} | 0 ...le_state_test.go => handle_state_test.go_} | 0 ...t_test.go => mock_cluster_client_test.go_} | 0 .../{namespace_test.go => namespace_test.go_} | 0 svc/krane/internal/reconciler/reconciler.go | 27 ++ ...reconciler_test.go => reconciler_test.go_} | 0 ...o => refresh_current_deployments_test.go_} | 0 ....go => refresh_current_sentinels_test.go_} | 0 ..._helpers_test.go => test_helpers_test.go_} | 0 ...te_state_test.go => update_state_test.go_} | 0 ....go => watch_current_deployments_test.go_} | 0 ...st.go => watch_current_sentinels_test.go_} | 0 svc/krane/internal/reconciler/watcher.go | 53 ++++ svc/krane/pkg/controlplane/BUILD.bazel | 4 - svc/krane/pkg/controlplane/watcher.go | 135 --------- svc/krane/run.go | 8 - web/internal/db/src/schema/index.ts | 2 + web/internal/db/src/schema/state_changes.ts | 22 ++ 40 files changed, 854 insertions(+), 503 deletions(-) create mode 100644 pkg/db/bulk_state_change_insert.sql_generated.go create mode 100644 pkg/db/queries/state_change_find_by_cluster_after_sequence.sql create mode 100644 pkg/db/queries/state_change_insert.sql create mode 100644 pkg/db/state_change_find_by_cluster_after_sequence.sql_generated.go create mode 100644 pkg/db/state_change_insert.sql_generated.go create mode 100644 svc/ctrl/services/cluster/rpc_sync.go rename svc/krane/internal/reconciler/{apply_deployment_test.go => apply_deployment_test.go_} (100%) rename svc/krane/internal/reconciler/{apply_sentinel_test.go => apply_sentinel_test.go_} (100%) rename svc/krane/internal/reconciler/{delete_deployment_test.go => delete_deployment_test.go_} (100%) rename svc/krane/internal/reconciler/{delete_sentinel_test.go => delete_sentinel_test.go_} (100%) rename svc/krane/internal/reconciler/{handle_state_test.go => handle_state_test.go_} (100%) rename svc/krane/internal/reconciler/{mock_cluster_client_test.go => mock_cluster_client_test.go_} (100%) rename svc/krane/internal/reconciler/{namespace_test.go => namespace_test.go_} (100%) rename svc/krane/internal/reconciler/{reconciler_test.go => reconciler_test.go_} (100%) rename svc/krane/internal/reconciler/{refresh_current_deployments_test.go => refresh_current_deployments_test.go_} (100%) rename svc/krane/internal/reconciler/{refresh_current_sentinels_test.go => refresh_current_sentinels_test.go_} (100%) rename svc/krane/internal/reconciler/{test_helpers_test.go => test_helpers_test.go_} (100%) rename svc/krane/internal/reconciler/{update_state_test.go => update_state_test.go_} (100%) rename svc/krane/internal/reconciler/{watch_current_deployments_test.go => watch_current_deployments_test.go_} (100%) rename svc/krane/internal/reconciler/{watch_current_sentinels_test.go => watch_current_sentinels_test.go_} (100%) create mode 100644 svc/krane/internal/reconciler/watcher.go delete mode 100644 svc/krane/pkg/controlplane/watcher.go create mode 100644 web/internal/db/src/schema/state_changes.ts diff --git a/gen/proto/ctrl/v1/cluster.pb.go b/gen/proto/ctrl/v1/cluster.pb.go index 9244ec6099..77cf212472 100644 --- a/gen/proto/ctrl/v1/cluster.pb.go +++ b/gen/proto/ctrl/v1/cluster.pb.go @@ -413,25 +413,75 @@ func (*UpdateSentinelStateResponse) Descriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{6} } +type SyncRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + ClusterId string `protobuf:"bytes,1,opt,name=cluster_id,json=clusterId,proto3" json:"cluster_id,omitempty"` + Region string `protobuf:"bytes,2,opt,name=region,proto3" json:"region,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SyncRequest) Reset() { + *x = SyncRequest{} + mi := &file_ctrl_v1_cluster_proto_msgTypes[7] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SyncRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SyncRequest) ProtoMessage() {} + +func (x *SyncRequest) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_cluster_proto_msgTypes[7] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SyncRequest.ProtoReflect.Descriptor instead. +func (*SyncRequest) Descriptor() ([]byte, []int) { + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{7} +} + +func (x *SyncRequest) GetClusterId() string { + if x != nil { + return x.ClusterId + } + return "" +} + +func (x *SyncRequest) GetRegion() string { + if x != nil { + return x.Region + } + return "" +} + // WatchRequest identifies the cluster requesting a watch stream. type WatchRequest struct { state protoimpl.MessageState `protogen:"open.v1"` // cluster_id uniquely identifies the client requesting the watch stream. ClusterId string `protobuf:"bytes,1,opt,name=cluster_id,json=clusterId,proto3" json:"cluster_id,omitempty"` Region string `protobuf:"bytes,2,opt,name=region,proto3" json:"region,omitempty"` - // live indicates whether the client wants live updates of changes - // if true, the stream never ends - Live bool `protobuf:"varint,3,opt,name=live,proto3" json:"live,omitempty"` - // synthetic indicates whether the client wants synthetic events - // if true, the server generates synthetic events to replay the full current desired state. - Synthetic bool `protobuf:"varint,4,opt,name=synthetic,proto3" json:"synthetic,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + // sequence_last_seen indicates the last sequence number the client has processed. + // This allows the server to send only new events since that sequence number, + // enabling efficient reconnection and resumption of the watch stream. + SequenceLastSeen uint64 `protobuf:"varint,3,opt,name=sequence_last_seen,json=sequenceLastSeen,proto3" json:"sequence_last_seen,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *WatchRequest) Reset() { *x = WatchRequest{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[7] + mi := &file_ctrl_v1_cluster_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -443,7 +493,7 @@ func (x *WatchRequest) String() string { func (*WatchRequest) ProtoMessage() {} func (x *WatchRequest) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[7] + mi := &file_ctrl_v1_cluster_proto_msgTypes[8] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -456,7 +506,7 @@ func (x *WatchRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use WatchRequest.ProtoReflect.Descriptor instead. func (*WatchRequest) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{7} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{8} } func (x *WatchRequest) GetClusterId() string { @@ -473,35 +523,28 @@ func (x *WatchRequest) GetRegion() string { return "" } -func (x *WatchRequest) GetLive() bool { - if x != nil { - return x.Live - } - return false -} - -func (x *WatchRequest) GetSynthetic() bool { +func (x *WatchRequest) GetSequenceLastSeen() uint64 { if x != nil { - return x.Synthetic + return x.SequenceLastSeen } - return false + return 0 } type State struct { - state protoimpl.MessageState `protogen:"open.v1"` + state protoimpl.MessageState `protogen:"open.v1"` + Sequence uint64 `protobuf:"varint,1,opt,name=sequence,proto3" json:"sequence,omitempty"` // Types that are valid to be assigned to Kind: // // *State_Deployment // *State_Sentinel Kind isState_Kind `protobuf_oneof:"kind"` - AcknowledgeId *string `protobuf:"bytes,3,opt,name=acknowledge_id,json=acknowledgeId,proto3,oneof" json:"acknowledge_id,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } func (x *State) Reset() { *x = State{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[8] + mi := &file_ctrl_v1_cluster_proto_msgTypes[9] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -513,7 +556,7 @@ func (x *State) String() string { func (*State) ProtoMessage() {} func (x *State) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[8] + mi := &file_ctrl_v1_cluster_proto_msgTypes[9] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -526,7 +569,14 @@ func (x *State) ProtoReflect() protoreflect.Message { // Deprecated: Use State.ProtoReflect.Descriptor instead. func (*State) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{8} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{9} +} + +func (x *State) GetSequence() uint64 { + if x != nil { + return x.Sequence + } + return 0 } func (x *State) GetKind() isState_Kind { @@ -554,23 +604,16 @@ func (x *State) GetSentinel() *SentinelState { return nil } -func (x *State) GetAcknowledgeId() string { - if x != nil && x.AcknowledgeId != nil { - return *x.AcknowledgeId - } - return "" -} - type isState_Kind interface { isState_Kind() } type State_Deployment struct { - Deployment *DeploymentState `protobuf:"bytes,1,opt,name=deployment,proto3,oneof"` + Deployment *DeploymentState `protobuf:"bytes,2,opt,name=deployment,proto3,oneof"` } type State_Sentinel struct { - Sentinel *SentinelState `protobuf:"bytes,2,opt,name=sentinel,proto3,oneof"` + Sentinel *SentinelState `protobuf:"bytes,3,opt,name=sentinel,proto3,oneof"` } func (*State_Deployment) isState_Kind() {} @@ -598,7 +641,7 @@ type SentinelState struct { func (x *SentinelState) Reset() { *x = SentinelState{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[9] + mi := &file_ctrl_v1_cluster_proto_msgTypes[10] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -610,7 +653,7 @@ func (x *SentinelState) String() string { func (*SentinelState) ProtoMessage() {} func (x *SentinelState) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[9] + mi := &file_ctrl_v1_cluster_proto_msgTypes[10] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -623,7 +666,7 @@ func (x *SentinelState) ProtoReflect() protoreflect.Message { // Deprecated: Use SentinelState.ProtoReflect.Descriptor instead. func (*SentinelState) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{9} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{10} } func (x *SentinelState) GetState() isSentinelState_State { @@ -693,7 +736,7 @@ type DeploymentState struct { func (x *DeploymentState) Reset() { *x = DeploymentState{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[10] + mi := &file_ctrl_v1_cluster_proto_msgTypes[11] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -705,7 +748,7 @@ func (x *DeploymentState) String() string { func (*DeploymentState) ProtoMessage() {} func (x *DeploymentState) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[10] + mi := &file_ctrl_v1_cluster_proto_msgTypes[11] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -718,7 +761,7 @@ func (x *DeploymentState) ProtoReflect() protoreflect.Message { // Deprecated: Use DeploymentState.ProtoReflect.Descriptor instead. func (*DeploymentState) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{10} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{11} } func (x *DeploymentState) GetState() isDeploymentState_State { @@ -793,7 +836,7 @@ type ApplySentinel struct { func (x *ApplySentinel) Reset() { *x = ApplySentinel{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[11] + mi := &file_ctrl_v1_cluster_proto_msgTypes[12] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -805,7 +848,7 @@ func (x *ApplySentinel) String() string { func (*ApplySentinel) ProtoMessage() {} func (x *ApplySentinel) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[11] + mi := &file_ctrl_v1_cluster_proto_msgTypes[12] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -818,7 +861,7 @@ func (x *ApplySentinel) ProtoReflect() protoreflect.Message { // Deprecated: Use ApplySentinel.ProtoReflect.Descriptor instead. func (*ApplySentinel) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{11} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{12} } func (x *ApplySentinel) GetK8SName() string { @@ -897,7 +940,7 @@ type DeleteSentinel struct { func (x *DeleteSentinel) Reset() { *x = DeleteSentinel{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[12] + mi := &file_ctrl_v1_cluster_proto_msgTypes[13] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -909,7 +952,7 @@ func (x *DeleteSentinel) String() string { func (*DeleteSentinel) ProtoMessage() {} func (x *DeleteSentinel) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[12] + mi := &file_ctrl_v1_cluster_proto_msgTypes[13] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -922,7 +965,7 @@ func (x *DeleteSentinel) ProtoReflect() protoreflect.Message { // Deprecated: Use DeleteSentinel.ProtoReflect.Descriptor instead. func (*DeleteSentinel) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{12} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{13} } func (x *DeleteSentinel) GetK8SName() string { @@ -977,7 +1020,7 @@ type ApplyDeployment struct { func (x *ApplyDeployment) Reset() { *x = ApplyDeployment{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[13] + mi := &file_ctrl_v1_cluster_proto_msgTypes[14] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -989,7 +1032,7 @@ func (x *ApplyDeployment) String() string { func (*ApplyDeployment) ProtoMessage() {} func (x *ApplyDeployment) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[13] + mi := &file_ctrl_v1_cluster_proto_msgTypes[14] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1002,7 +1045,7 @@ func (x *ApplyDeployment) ProtoReflect() protoreflect.Message { // Deprecated: Use ApplyDeployment.ProtoReflect.Descriptor instead. func (*ApplyDeployment) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{13} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{14} } func (x *ApplyDeployment) GetK8SNamespace() string { @@ -1111,7 +1154,7 @@ type DeleteDeployment struct { func (x *DeleteDeployment) Reset() { *x = DeleteDeployment{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[14] + mi := &file_ctrl_v1_cluster_proto_msgTypes[15] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1123,7 +1166,7 @@ func (x *DeleteDeployment) String() string { func (*DeleteDeployment) ProtoMessage() {} func (x *DeleteDeployment) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[14] + mi := &file_ctrl_v1_cluster_proto_msgTypes[15] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1136,7 +1179,7 @@ func (x *DeleteDeployment) ProtoReflect() protoreflect.Message { // Deprecated: Use DeleteDeployment.ProtoReflect.Descriptor instead. func (*DeleteDeployment) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{14} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{15} } func (x *DeleteDeployment) GetK8SNamespace() string { @@ -1163,7 +1206,7 @@ type UpdateDeploymentStateRequest_Update struct { func (x *UpdateDeploymentStateRequest_Update) Reset() { *x = UpdateDeploymentStateRequest_Update{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[15] + mi := &file_ctrl_v1_cluster_proto_msgTypes[16] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1175,7 +1218,7 @@ func (x *UpdateDeploymentStateRequest_Update) String() string { func (*UpdateDeploymentStateRequest_Update) ProtoMessage() {} func (x *UpdateDeploymentStateRequest_Update) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[15] + mi := &file_ctrl_v1_cluster_proto_msgTypes[16] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1214,7 +1257,7 @@ type UpdateDeploymentStateRequest_Delete struct { func (x *UpdateDeploymentStateRequest_Delete) Reset() { *x = UpdateDeploymentStateRequest_Delete{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[16] + mi := &file_ctrl_v1_cluster_proto_msgTypes[17] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1226,7 +1269,7 @@ func (x *UpdateDeploymentStateRequest_Delete) String() string { func (*UpdateDeploymentStateRequest_Delete) ProtoMessage() {} func (x *UpdateDeploymentStateRequest_Delete) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[16] + mi := &file_ctrl_v1_cluster_proto_msgTypes[17] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1262,7 +1305,7 @@ type UpdateDeploymentStateRequest_Update_Instance struct { func (x *UpdateDeploymentStateRequest_Update_Instance) Reset() { *x = UpdateDeploymentStateRequest_Update_Instance{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[17] + mi := &file_ctrl_v1_cluster_proto_msgTypes[18] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1274,7 +1317,7 @@ func (x *UpdateDeploymentStateRequest_Update_Instance) String() string { func (*UpdateDeploymentStateRequest_Update_Instance) ProtoMessage() {} func (x *UpdateDeploymentStateRequest_Update_Instance) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[17] + mi := &file_ctrl_v1_cluster_proto_msgTypes[18] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1361,21 +1404,23 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "\x1aUpdateSentinelStateRequest\x12\x19\n" + "\bk8s_name\x18\x01 \x01(\tR\ak8sName\x12-\n" + "\x12available_replicas\x18\x02 \x01(\x05R\x11availableReplicas\"\x1d\n" + - "\x1bUpdateSentinelStateResponse\"w\n" + + "\x1bUpdateSentinelStateResponse\"D\n" + + "\vSyncRequest\x12\x1d\n" + + "\n" + + "cluster_id\x18\x01 \x01(\tR\tclusterId\x12\x16\n" + + "\x06region\x18\x02 \x01(\tR\x06region\"s\n" + "\fWatchRequest\x12\x1d\n" + "\n" + "cluster_id\x18\x01 \x01(\tR\tclusterId\x12\x16\n" + - "\x06region\x18\x02 \x01(\tR\x06region\x12\x12\n" + - "\x04live\x18\x03 \x01(\bR\x04live\x12\x1c\n" + - "\tsynthetic\x18\x04 \x01(\bR\tsynthetic\"\xc0\x01\n" + - "\x05State\x12:\n" + + "\x06region\x18\x02 \x01(\tR\x06region\x12,\n" + + "\x12sequence_last_seen\x18\x03 \x01(\x04R\x10sequenceLastSeen\"\x9d\x01\n" + + "\x05State\x12\x1a\n" + + "\bsequence\x18\x01 \x01(\x04R\bsequence\x12:\n" + "\n" + - "deployment\x18\x01 \x01(\v2\x18.ctrl.v1.DeploymentStateH\x00R\n" + + "deployment\x18\x02 \x01(\v2\x18.ctrl.v1.DeploymentStateH\x00R\n" + "deployment\x124\n" + - "\bsentinel\x18\x02 \x01(\v2\x16.ctrl.v1.SentinelStateH\x00R\bsentinel\x12*\n" + - "\x0eacknowledge_id\x18\x03 \x01(\tH\x01R\racknowledgeId\x88\x01\x01B\x06\n" + - "\x04kindB\x11\n" + - "\x0f_acknowledge_id\"{\n" + + "\bsentinel\x18\x03 \x01(\v2\x16.ctrl.v1.SentinelStateH\x00R\bsentinelB\x06\n" + + "\x04kind\"{\n" + "\rSentinelState\x12.\n" + "\x05apply\x18\x01 \x01(\v2\x16.ctrl.v1.ApplySentinelH\x00R\x05apply\x121\n" + "\x06delete\x18\x02 \x01(\v2\x17.ctrl.v1.DeleteSentinelH\x00R\x06deleteB\a\n" + @@ -1420,9 +1465,10 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "\r_readiness_id\"R\n" + "\x10DeleteDeployment\x12#\n" + "\rk8s_namespace\x18\x01 \x01(\tR\fk8sNamespace\x12\x19\n" + - "\bk8s_name\x18\x02 \x01(\tR\ak8sName2\xca\x03\n" + + "\bk8s_name\x18\x02 \x01(\tR\ak8sName2\xfa\x03\n" + "\x0eClusterService\x120\n" + - "\x05Watch\x12\x15.ctrl.v1.WatchRequest\x1a\x0e.ctrl.v1.State0\x01\x12Z\n" + + "\x05Watch\x12\x15.ctrl.v1.WatchRequest\x1a\x0e.ctrl.v1.State0\x01\x12.\n" + + "\x04Sync\x12\x14.ctrl.v1.SyncRequest\x1a\x0e.ctrl.v1.State0\x01\x12Z\n" + "\x17GetDesiredSentinelState\x12'.ctrl.v1.GetDesiredSentinelStateRequest\x1a\x16.ctrl.v1.SentinelState\x12`\n" + "\x13UpdateSentinelState\x12#.ctrl.v1.UpdateSentinelStateRequest\x1a$.ctrl.v1.UpdateSentinelStateResponse\x12`\n" + "\x19GetDesiredDeploymentState\x12).ctrl.v1.GetDesiredDeploymentStateRequest\x1a\x18.ctrl.v1.DeploymentState\x12f\n" + @@ -1442,7 +1488,7 @@ func file_ctrl_v1_cluster_proto_rawDescGZIP() []byte { } var file_ctrl_v1_cluster_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_ctrl_v1_cluster_proto_msgTypes = make([]protoimpl.MessageInfo, 18) +var file_ctrl_v1_cluster_proto_msgTypes = make([]protoimpl.MessageInfo, 19) var file_ctrl_v1_cluster_proto_goTypes = []any{ (UpdateDeploymentStateRequest_Update_Instance_Status)(0), // 0: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.Status (*GetDesiredSentinelStateRequest)(nil), // 1: ctrl.v1.GetDesiredSentinelStateRequest @@ -1452,41 +1498,44 @@ var file_ctrl_v1_cluster_proto_goTypes = []any{ (*UpdateInstanceStateResponse)(nil), // 5: ctrl.v1.UpdateInstanceStateResponse (*UpdateSentinelStateRequest)(nil), // 6: ctrl.v1.UpdateSentinelStateRequest (*UpdateSentinelStateResponse)(nil), // 7: ctrl.v1.UpdateSentinelStateResponse - (*WatchRequest)(nil), // 8: ctrl.v1.WatchRequest - (*State)(nil), // 9: ctrl.v1.State - (*SentinelState)(nil), // 10: ctrl.v1.SentinelState - (*DeploymentState)(nil), // 11: ctrl.v1.DeploymentState - (*ApplySentinel)(nil), // 12: ctrl.v1.ApplySentinel - (*DeleteSentinel)(nil), // 13: ctrl.v1.DeleteSentinel - (*ApplyDeployment)(nil), // 14: ctrl.v1.ApplyDeployment - (*DeleteDeployment)(nil), // 15: ctrl.v1.DeleteDeployment - (*UpdateDeploymentStateRequest_Update)(nil), // 16: ctrl.v1.UpdateDeploymentStateRequest.Update - (*UpdateDeploymentStateRequest_Delete)(nil), // 17: ctrl.v1.UpdateDeploymentStateRequest.Delete - (*UpdateDeploymentStateRequest_Update_Instance)(nil), // 18: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance + (*SyncRequest)(nil), // 8: ctrl.v1.SyncRequest + (*WatchRequest)(nil), // 9: ctrl.v1.WatchRequest + (*State)(nil), // 10: ctrl.v1.State + (*SentinelState)(nil), // 11: ctrl.v1.SentinelState + (*DeploymentState)(nil), // 12: ctrl.v1.DeploymentState + (*ApplySentinel)(nil), // 13: ctrl.v1.ApplySentinel + (*DeleteSentinel)(nil), // 14: ctrl.v1.DeleteSentinel + (*ApplyDeployment)(nil), // 15: ctrl.v1.ApplyDeployment + (*DeleteDeployment)(nil), // 16: ctrl.v1.DeleteDeployment + (*UpdateDeploymentStateRequest_Update)(nil), // 17: ctrl.v1.UpdateDeploymentStateRequest.Update + (*UpdateDeploymentStateRequest_Delete)(nil), // 18: ctrl.v1.UpdateDeploymentStateRequest.Delete + (*UpdateDeploymentStateRequest_Update_Instance)(nil), // 19: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance } var file_ctrl_v1_cluster_proto_depIdxs = []int32{ - 16, // 0: ctrl.v1.UpdateDeploymentStateRequest.update:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update - 17, // 1: ctrl.v1.UpdateDeploymentStateRequest.delete:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Delete - 11, // 2: ctrl.v1.State.deployment:type_name -> ctrl.v1.DeploymentState - 10, // 3: ctrl.v1.State.sentinel:type_name -> ctrl.v1.SentinelState - 12, // 4: ctrl.v1.SentinelState.apply:type_name -> ctrl.v1.ApplySentinel - 13, // 5: ctrl.v1.SentinelState.delete:type_name -> ctrl.v1.DeleteSentinel - 14, // 6: ctrl.v1.DeploymentState.apply:type_name -> ctrl.v1.ApplyDeployment - 15, // 7: ctrl.v1.DeploymentState.delete:type_name -> ctrl.v1.DeleteDeployment - 18, // 8: ctrl.v1.UpdateDeploymentStateRequest.Update.instances:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance + 17, // 0: ctrl.v1.UpdateDeploymentStateRequest.update:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update + 18, // 1: ctrl.v1.UpdateDeploymentStateRequest.delete:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Delete + 12, // 2: ctrl.v1.State.deployment:type_name -> ctrl.v1.DeploymentState + 11, // 3: ctrl.v1.State.sentinel:type_name -> ctrl.v1.SentinelState + 13, // 4: ctrl.v1.SentinelState.apply:type_name -> ctrl.v1.ApplySentinel + 14, // 5: ctrl.v1.SentinelState.delete:type_name -> ctrl.v1.DeleteSentinel + 15, // 6: ctrl.v1.DeploymentState.apply:type_name -> ctrl.v1.ApplyDeployment + 16, // 7: ctrl.v1.DeploymentState.delete:type_name -> ctrl.v1.DeleteDeployment + 19, // 8: ctrl.v1.UpdateDeploymentStateRequest.Update.instances:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance 0, // 9: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.status:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.Status - 8, // 10: ctrl.v1.ClusterService.Watch:input_type -> ctrl.v1.WatchRequest - 1, // 11: ctrl.v1.ClusterService.GetDesiredSentinelState:input_type -> ctrl.v1.GetDesiredSentinelStateRequest - 6, // 12: ctrl.v1.ClusterService.UpdateSentinelState:input_type -> ctrl.v1.UpdateSentinelStateRequest - 2, // 13: ctrl.v1.ClusterService.GetDesiredDeploymentState:input_type -> ctrl.v1.GetDesiredDeploymentStateRequest - 3, // 14: ctrl.v1.ClusterService.UpdateDeploymentState:input_type -> ctrl.v1.UpdateDeploymentStateRequest - 9, // 15: ctrl.v1.ClusterService.Watch:output_type -> ctrl.v1.State - 10, // 16: ctrl.v1.ClusterService.GetDesiredSentinelState:output_type -> ctrl.v1.SentinelState - 7, // 17: ctrl.v1.ClusterService.UpdateSentinelState:output_type -> ctrl.v1.UpdateSentinelStateResponse - 11, // 18: ctrl.v1.ClusterService.GetDesiredDeploymentState:output_type -> ctrl.v1.DeploymentState - 4, // 19: ctrl.v1.ClusterService.UpdateDeploymentState:output_type -> ctrl.v1.UpdateDeploymentStateResponse - 15, // [15:20] is the sub-list for method output_type - 10, // [10:15] is the sub-list for method input_type + 9, // 10: ctrl.v1.ClusterService.Watch:input_type -> ctrl.v1.WatchRequest + 8, // 11: ctrl.v1.ClusterService.Sync:input_type -> ctrl.v1.SyncRequest + 1, // 12: ctrl.v1.ClusterService.GetDesiredSentinelState:input_type -> ctrl.v1.GetDesiredSentinelStateRequest + 6, // 13: ctrl.v1.ClusterService.UpdateSentinelState:input_type -> ctrl.v1.UpdateSentinelStateRequest + 2, // 14: ctrl.v1.ClusterService.GetDesiredDeploymentState:input_type -> ctrl.v1.GetDesiredDeploymentStateRequest + 3, // 15: ctrl.v1.ClusterService.UpdateDeploymentState:input_type -> ctrl.v1.UpdateDeploymentStateRequest + 10, // 16: ctrl.v1.ClusterService.Watch:output_type -> ctrl.v1.State + 10, // 17: ctrl.v1.ClusterService.Sync:output_type -> ctrl.v1.State + 11, // 18: ctrl.v1.ClusterService.GetDesiredSentinelState:output_type -> ctrl.v1.SentinelState + 7, // 19: ctrl.v1.ClusterService.UpdateSentinelState:output_type -> ctrl.v1.UpdateSentinelStateResponse + 12, // 20: ctrl.v1.ClusterService.GetDesiredDeploymentState:output_type -> ctrl.v1.DeploymentState + 4, // 21: ctrl.v1.ClusterService.UpdateDeploymentState:output_type -> ctrl.v1.UpdateDeploymentStateResponse + 16, // [16:22] is the sub-list for method output_type + 10, // [10:16] is the sub-list for method input_type 10, // [10:10] is the sub-list for extension type_name 10, // [10:10] is the sub-list for extension extendee 0, // [0:10] is the sub-list for field type_name @@ -1501,26 +1550,26 @@ func file_ctrl_v1_cluster_proto_init() { (*UpdateDeploymentStateRequest_Update_)(nil), (*UpdateDeploymentStateRequest_Delete_)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[8].OneofWrappers = []any{ + file_ctrl_v1_cluster_proto_msgTypes[9].OneofWrappers = []any{ (*State_Deployment)(nil), (*State_Sentinel)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[9].OneofWrappers = []any{ + file_ctrl_v1_cluster_proto_msgTypes[10].OneofWrappers = []any{ (*SentinelState_Apply)(nil), (*SentinelState_Delete)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[10].OneofWrappers = []any{ + file_ctrl_v1_cluster_proto_msgTypes[11].OneofWrappers = []any{ (*DeploymentState_Apply)(nil), (*DeploymentState_Delete)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[13].OneofWrappers = []any{} + file_ctrl_v1_cluster_proto_msgTypes[14].OneofWrappers = []any{} type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_ctrl_v1_cluster_proto_rawDesc), len(file_ctrl_v1_cluster_proto_rawDesc)), NumEnums: 1, - NumMessages: 18, + NumMessages: 19, NumExtensions: 0, NumServices: 1, }, diff --git a/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go b/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go index 6bd06d88ca..a287c3c0f9 100644 --- a/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go +++ b/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go @@ -44,6 +44,8 @@ const ( const ( // ClusterServiceWatchProcedure is the fully-qualified name of the ClusterService's Watch RPC. ClusterServiceWatchProcedure = "/ctrl.v1.ClusterService/Watch" + // ClusterServiceSyncProcedure is the fully-qualified name of the ClusterService's Sync RPC. + ClusterServiceSyncProcedure = "/ctrl.v1.ClusterService/Sync" // ClusterServiceGetDesiredSentinelStateProcedure is the fully-qualified name of the // ClusterService's GetDesiredSentinelState RPC. ClusterServiceGetDesiredSentinelStateProcedure = "/ctrl.v1.ClusterService/GetDesiredSentinelState" @@ -61,6 +63,7 @@ const ( // ClusterServiceClient is a client for the ctrl.v1.ClusterService service. type ClusterServiceClient interface { Watch(context.Context, *connect.Request[v1.WatchRequest]) (*connect.ServerStreamForClient[v1.State], error) + Sync(context.Context, *connect.Request[v1.SyncRequest]) (*connect.ServerStreamForClient[v1.State], error) GetDesiredSentinelState(context.Context, *connect.Request[v1.GetDesiredSentinelStateRequest]) (*connect.Response[v1.SentinelState], error) UpdateSentinelState(context.Context, *connect.Request[v1.UpdateSentinelStateRequest]) (*connect.Response[v1.UpdateSentinelStateResponse], error) GetDesiredDeploymentState(context.Context, *connect.Request[v1.GetDesiredDeploymentStateRequest]) (*connect.Response[v1.DeploymentState], error) @@ -84,6 +87,12 @@ func NewClusterServiceClient(httpClient connect.HTTPClient, baseURL string, opts connect.WithSchema(clusterServiceMethods.ByName("Watch")), connect.WithClientOptions(opts...), ), + sync: connect.NewClient[v1.SyncRequest, v1.State]( + httpClient, + baseURL+ClusterServiceSyncProcedure, + connect.WithSchema(clusterServiceMethods.ByName("Sync")), + connect.WithClientOptions(opts...), + ), getDesiredSentinelState: connect.NewClient[v1.GetDesiredSentinelStateRequest, v1.SentinelState]( httpClient, baseURL+ClusterServiceGetDesiredSentinelStateProcedure, @@ -114,6 +123,7 @@ func NewClusterServiceClient(httpClient connect.HTTPClient, baseURL string, opts // clusterServiceClient implements ClusterServiceClient. type clusterServiceClient struct { watch *connect.Client[v1.WatchRequest, v1.State] + sync *connect.Client[v1.SyncRequest, v1.State] getDesiredSentinelState *connect.Client[v1.GetDesiredSentinelStateRequest, v1.SentinelState] updateSentinelState *connect.Client[v1.UpdateSentinelStateRequest, v1.UpdateSentinelStateResponse] getDesiredDeploymentState *connect.Client[v1.GetDesiredDeploymentStateRequest, v1.DeploymentState] @@ -125,6 +135,11 @@ func (c *clusterServiceClient) Watch(ctx context.Context, req *connect.Request[v return c.watch.CallServerStream(ctx, req) } +// Sync calls ctrl.v1.ClusterService.Sync. +func (c *clusterServiceClient) Sync(ctx context.Context, req *connect.Request[v1.SyncRequest]) (*connect.ServerStreamForClient[v1.State], error) { + return c.sync.CallServerStream(ctx, req) +} + // GetDesiredSentinelState calls ctrl.v1.ClusterService.GetDesiredSentinelState. func (c *clusterServiceClient) GetDesiredSentinelState(ctx context.Context, req *connect.Request[v1.GetDesiredSentinelStateRequest]) (*connect.Response[v1.SentinelState], error) { return c.getDesiredSentinelState.CallUnary(ctx, req) @@ -148,6 +163,7 @@ func (c *clusterServiceClient) UpdateDeploymentState(ctx context.Context, req *c // ClusterServiceHandler is an implementation of the ctrl.v1.ClusterService service. type ClusterServiceHandler interface { Watch(context.Context, *connect.Request[v1.WatchRequest], *connect.ServerStream[v1.State]) error + Sync(context.Context, *connect.Request[v1.SyncRequest], *connect.ServerStream[v1.State]) error GetDesiredSentinelState(context.Context, *connect.Request[v1.GetDesiredSentinelStateRequest]) (*connect.Response[v1.SentinelState], error) UpdateSentinelState(context.Context, *connect.Request[v1.UpdateSentinelStateRequest]) (*connect.Response[v1.UpdateSentinelStateResponse], error) GetDesiredDeploymentState(context.Context, *connect.Request[v1.GetDesiredDeploymentStateRequest]) (*connect.Response[v1.DeploymentState], error) @@ -167,6 +183,12 @@ func NewClusterServiceHandler(svc ClusterServiceHandler, opts ...connect.Handler connect.WithSchema(clusterServiceMethods.ByName("Watch")), connect.WithHandlerOptions(opts...), ) + clusterServiceSyncHandler := connect.NewServerStreamHandler( + ClusterServiceSyncProcedure, + svc.Sync, + connect.WithSchema(clusterServiceMethods.ByName("Sync")), + connect.WithHandlerOptions(opts...), + ) clusterServiceGetDesiredSentinelStateHandler := connect.NewUnaryHandler( ClusterServiceGetDesiredSentinelStateProcedure, svc.GetDesiredSentinelState, @@ -195,6 +217,8 @@ func NewClusterServiceHandler(svc ClusterServiceHandler, opts ...connect.Handler switch r.URL.Path { case ClusterServiceWatchProcedure: clusterServiceWatchHandler.ServeHTTP(w, r) + case ClusterServiceSyncProcedure: + clusterServiceSyncHandler.ServeHTTP(w, r) case ClusterServiceGetDesiredSentinelStateProcedure: clusterServiceGetDesiredSentinelStateHandler.ServeHTTP(w, r) case ClusterServiceUpdateSentinelStateProcedure: @@ -216,6 +240,10 @@ func (UnimplementedClusterServiceHandler) Watch(context.Context, *connect.Reques return connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.Watch is not implemented")) } +func (UnimplementedClusterServiceHandler) Sync(context.Context, *connect.Request[v1.SyncRequest], *connect.ServerStream[v1.State]) error { + return connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.Sync is not implemented")) +} + func (UnimplementedClusterServiceHandler) GetDesiredSentinelState(context.Context, *connect.Request[v1.GetDesiredSentinelStateRequest]) (*connect.Response[v1.SentinelState], error) { return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.GetDesiredSentinelState is not implemented")) } diff --git a/pkg/assert/greater_or_equal.go b/pkg/assert/greater_or_equal.go index 59f1c7f463..d9301812a6 100644 --- a/pkg/assert/greater_or_equal.go +++ b/pkg/assert/greater_or_equal.go @@ -16,7 +16,7 @@ import ( // "Insufficient account balance", // )) // } -func GreaterOrEqual[T ~int | ~int32 | ~int64 | ~float32 | ~float64](a, b T, message ...string) error { +func GreaterOrEqual[T ~int | ~int32 | ~int64 | ~float32 | ~float64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64](a, b T, message ...string) error { if a >= b { return nil } diff --git a/pkg/db/BUILD.bazel b/pkg/db/BUILD.bazel index fd3f41fffe..3df59b3197 100644 --- a/pkg/db/BUILD.bazel +++ b/pkg/db/BUILD.bazel @@ -61,6 +61,7 @@ go_library( "bulk_role_insert.sql_generated.go", "bulk_role_permission_insert.sql_generated.go", "bulk_sentinel_insert.sql_generated.go", + "bulk_state_change_insert.sql_generated.go", "bulk_workspace_insert.sql_generated.go", "bulk_workspace_upsert.sql_generated.go", "cached_key_data.go", @@ -234,6 +235,8 @@ go_library( "sentinel_insert.sql_generated.go", "sentinel_list_desired.sql_generated.go", "sentinel_update_available_replicas_and_health.sql_generated.go", + "state_change_find_by_cluster_after_sequence.sql_generated.go", + "state_change_insert.sql_generated.go", "traced_tx.go", "tx.go", "workspace_find_by_id.sql_generated.go", diff --git a/pkg/db/bulk_state_change_insert.sql_generated.go b/pkg/db/bulk_state_change_insert.sql_generated.go new file mode 100644 index 0000000000..421e29a1f4 --- /dev/null +++ b/pkg/db/bulk_state_change_insert.sql_generated.go @@ -0,0 +1,41 @@ +// Code generated by sqlc bulk insert plugin. DO NOT EDIT. + +package db + +import ( + "context" + "fmt" + "strings" +) + +// bulkInsertStateChange is the base query for bulk insert +const bulkInsertStateChange = `INSERT INTO ` + "`" + `state_changes` + "`" + ` ( resource_type, state, cluster_id, created_at ) VALUES %s` + +// InsertStateChanges performs bulk insert in a single query +func (q *BulkQueries) InsertStateChanges(ctx context.Context, db DBTX, args []InsertStateChangeParams) error { + + if len(args) == 0 { + return nil + } + + // Build the bulk insert query + valueClauses := make([]string, len(args)) + for i := range args { + valueClauses[i] = "( ?, ?, ?, ? )" + } + + bulkQuery := fmt.Sprintf(bulkInsertStateChange, strings.Join(valueClauses, ", ")) + + // Collect all arguments + var allArgs []any + for _, arg := range args { + allArgs = append(allArgs, arg.ResourceType) + allArgs = append(allArgs, arg.State) + allArgs = append(allArgs, arg.ClusterID) + allArgs = append(allArgs, arg.CreatedAt) + } + + // Execute the bulk insert + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err +} diff --git a/pkg/db/models_generated.go b/pkg/db/models_generated.go index 3712601cce..b8030fe096 100644 --- a/pkg/db/models_generated.go +++ b/pkg/db/models_generated.go @@ -616,6 +616,48 @@ func (ns NullSentinelsHealth) Value() (driver.Value, error) { return string(ns.SentinelsHealth), nil } +type StateChangesResourceType string + +const ( + StateChangesResourceTypeSentinel StateChangesResourceType = "sentinel" + StateChangesResourceTypeDeployment StateChangesResourceType = "deployment" +) + +func (e *StateChangesResourceType) Scan(src interface{}) error { + switch s := src.(type) { + case []byte: + *e = StateChangesResourceType(s) + case string: + *e = StateChangesResourceType(s) + default: + return fmt.Errorf("unsupported scan type for StateChangesResourceType: %T", src) + } + return nil +} + +type NullStateChangesResourceType struct { + StateChangesResourceType StateChangesResourceType + Valid bool // Valid is true if StateChangesResourceType is not NULL +} + +// Scan implements the Scanner interface. +func (ns *NullStateChangesResourceType) Scan(value interface{}) error { + if value == nil { + ns.StateChangesResourceType, ns.Valid = "", false + return nil + } + ns.Valid = true + return ns.StateChangesResourceType.Scan(value) +} + +// Value implements the driver Valuer interface. +func (ns NullStateChangesResourceType) Value() (driver.Value, error) { + if !ns.Valid { + return nil, nil + } + return string(ns.StateChangesResourceType), nil +} + type VercelBindingsEnvironment string const ( @@ -1165,6 +1207,14 @@ type Sentinel struct { UpdatedAt sql.NullInt64 `db:"updated_at"` } +type StateChange struct { + Sequence uint64 `db:"sequence"` + ResourceType StateChangesResourceType `db:"resource_type"` + State []byte `db:"state"` + ClusterID string `db:"cluster_id"` + CreatedAt uint64 `db:"created_at"` +} + type VercelBinding struct { Pk uint64 `db:"pk"` ID string `db:"id"` diff --git a/pkg/db/querier_bulk_generated.go b/pkg/db/querier_bulk_generated.go index e1a3fef11f..153c3cea5b 100644 --- a/pkg/db/querier_bulk_generated.go +++ b/pkg/db/querier_bulk_generated.go @@ -40,6 +40,7 @@ type BulkQuerier interface { InsertRoles(ctx context.Context, db DBTX, args []InsertRoleParams) error InsertRolePermissions(ctx context.Context, db DBTX, args []InsertRolePermissionParams) error InsertSentinels(ctx context.Context, db DBTX, args []InsertSentinelParams) error + InsertStateChanges(ctx context.Context, db DBTX, args []InsertStateChangeParams) error InsertWorkspaces(ctx context.Context, db DBTX, args []InsertWorkspaceParams) error UpsertWorkspace(ctx context.Context, db DBTX, args []UpsertWorkspaceParams) error } diff --git a/pkg/db/querier_generated.go b/pkg/db/querier_generated.go index f36ea8a05f..4fb17fb4c2 100644 --- a/pkg/db/querier_generated.go +++ b/pkg/db/querier_generated.go @@ -944,6 +944,14 @@ type Querier interface { // // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at FROM sentinels WHERE environment_id = ? FindSentinelsByEnvironmentID(ctx context.Context, db DBTX, environmentID string) ([]Sentinel, error) + //FindStateChangesByClusterAfterSequence + // + // SELECT sequence, resource_type, state, cluster_id, created_at + // FROM `state_changes` + // WHERE cluster_id = ? + // AND sequence > ? + // ORDER BY sequence ASC + FindStateChangesByClusterAfterSequence(ctx context.Context, db DBTX, arg FindStateChangesByClusterAfterSequenceParams) ([]StateChange, error) //FindWorkspaceByID // // SELECT id, org_id, name, slug, k8s_namespace, partition_id, plan, tier, stripe_customer_id, stripe_subscription_id, beta_features, features, subscriptions, enabled, delete_protection, created_at_m, updated_at_m, deleted_at_m FROM `workspaces` @@ -1568,6 +1576,20 @@ type Querier interface { // ? // ) InsertSentinel(ctx context.Context, db DBTX, arg InsertSentinelParams) error + //InsertStateChange + // + // INSERT INTO `state_changes` ( + // resource_type, + // state, + // cluster_id, + // created_at + // ) VALUES ( + // ?, + // ?, + // ?, + // ? + // ) + InsertStateChange(ctx context.Context, db DBTX, arg InsertStateChangeParams) (int64, error) //InsertWorkspace // // INSERT INTO `workspaces` ( diff --git a/pkg/db/queries/state_change_find_by_cluster_after_sequence.sql b/pkg/db/queries/state_change_find_by_cluster_after_sequence.sql new file mode 100644 index 0000000000..b45418e1d3 --- /dev/null +++ b/pkg/db/queries/state_change_find_by_cluster_after_sequence.sql @@ -0,0 +1,6 @@ +-- name: FindStateChangesByClusterAfterSequence :many +SELECT * +FROM `state_changes` +WHERE cluster_id = sqlc.arg(cluster_id) + AND sequence > sqlc.arg(after_sequence) +ORDER BY sequence ASC; diff --git a/pkg/db/queries/state_change_insert.sql b/pkg/db/queries/state_change_insert.sql new file mode 100644 index 0000000000..e91264b599 --- /dev/null +++ b/pkg/db/queries/state_change_insert.sql @@ -0,0 +1,12 @@ +-- name: InsertStateChange :execlastid +INSERT INTO `state_changes` ( + resource_type, + state, + cluster_id, + created_at +) VALUES ( + sqlc.arg(resource_type), + sqlc.arg(state), + sqlc.arg(cluster_id), + sqlc.arg(created_at) +); diff --git a/pkg/db/schema.sql b/pkg/db/schema.sql index 845aa1ba22..a7466834ab 100644 --- a/pkg/db/schema.sql +++ b/pkg/db/schema.sql @@ -571,6 +571,15 @@ CREATE TABLE `frontline_routes` ( CONSTRAINT `frontline_routes_fully_qualified_domain_name_unique` UNIQUE(`fully_qualified_domain_name`) ); +CREATE TABLE `state_changes` ( + `sequence` bigint unsigned AUTO_INCREMENT NOT NULL, + `resource_type` enum('sentinel','deployment') NOT NULL, + `state` longblob NOT NULL, + `cluster_id` varchar(256) NOT NULL, + `created_at` bigint unsigned NOT NULL, + CONSTRAINT `state_changes_sequence` PRIMARY KEY(`sequence`) +); + CREATE INDEX `workspace_id_idx` ON `apis` (`workspace_id`); CREATE INDEX `workspace_id_idx` ON `roles` (`workspace_id`); CREATE INDEX `key_auth_id_deleted_at_idx` ON `keys` (`key_auth_id`,`deleted_at_m`); @@ -606,4 +615,5 @@ CREATE INDEX `idx_deployment_id` ON `instances` (`deployment_id`); CREATE INDEX `idx_region` ON `instances` (`region`); CREATE INDEX `environment_id_idx` ON `frontline_routes` (`environment_id`); CREATE INDEX `deployment_id_idx` ON `frontline_routes` (`deployment_id`); +CREATE INDEX `cluster_id_sequence` ON `state_changes` (`cluster_id`,`sequence`); diff --git a/pkg/db/state_change_find_by_cluster_after_sequence.sql_generated.go b/pkg/db/state_change_find_by_cluster_after_sequence.sql_generated.go new file mode 100644 index 0000000000..e0ab270c84 --- /dev/null +++ b/pkg/db/state_change_find_by_cluster_after_sequence.sql_generated.go @@ -0,0 +1,59 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: state_change_find_by_cluster_after_sequence.sql + +package db + +import ( + "context" +) + +const findStateChangesByClusterAfterSequence = `-- name: FindStateChangesByClusterAfterSequence :many +SELECT sequence, resource_type, state, cluster_id, created_at +FROM ` + "`" + `state_changes` + "`" + ` +WHERE cluster_id = ? + AND sequence > ? +ORDER BY sequence ASC +` + +type FindStateChangesByClusterAfterSequenceParams struct { + ClusterID string `db:"cluster_id"` + AfterSequence uint64 `db:"after_sequence"` +} + +// FindStateChangesByClusterAfterSequence +// +// SELECT sequence, resource_type, state, cluster_id, created_at +// FROM `state_changes` +// WHERE cluster_id = ? +// AND sequence > ? +// ORDER BY sequence ASC +func (q *Queries) FindStateChangesByClusterAfterSequence(ctx context.Context, db DBTX, arg FindStateChangesByClusterAfterSequenceParams) ([]StateChange, error) { + rows, err := db.QueryContext(ctx, findStateChangesByClusterAfterSequence, arg.ClusterID, arg.AfterSequence) + if err != nil { + return nil, err + } + defer rows.Close() + var items []StateChange + for rows.Next() { + var i StateChange + if err := rows.Scan( + &i.Sequence, + &i.ResourceType, + &i.State, + &i.ClusterID, + &i.CreatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} diff --git a/pkg/db/state_change_insert.sql_generated.go b/pkg/db/state_change_insert.sql_generated.go new file mode 100644 index 0000000000..fb07ce0e2c --- /dev/null +++ b/pkg/db/state_change_insert.sql_generated.go @@ -0,0 +1,57 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: state_change_insert.sql + +package db + +import ( + "context" +) + +const insertStateChange = `-- name: InsertStateChange :execlastid +INSERT INTO ` + "`" + `state_changes` + "`" + ` ( + resource_type, + state, + cluster_id, + created_at +) VALUES ( + ?, + ?, + ?, + ? +) +` + +type InsertStateChangeParams struct { + ResourceType StateChangesResourceType `db:"resource_type"` + State []byte `db:"state"` + ClusterID string `db:"cluster_id"` + CreatedAt uint64 `db:"created_at"` +} + +// InsertStateChange +// +// INSERT INTO `state_changes` ( +// resource_type, +// state, +// cluster_id, +// created_at +// ) VALUES ( +// ?, +// ?, +// ?, +// ? +// ) +func (q *Queries) InsertStateChange(ctx context.Context, db DBTX, arg InsertStateChangeParams) (int64, error) { + result, err := db.ExecContext(ctx, insertStateChange, + arg.ResourceType, + arg.State, + arg.ClusterID, + arg.CreatedAt, + ) + if err != nil { + return 0, err + } + return result.LastInsertId() +} diff --git a/svc/ctrl/proto/ctrl/v1/cluster.proto b/svc/ctrl/proto/ctrl/v1/cluster.proto index 096958346d..2a0825fc6f 100644 --- a/svc/ctrl/proto/ctrl/v1/cluster.proto +++ b/svc/ctrl/proto/ctrl/v1/cluster.proto @@ -23,6 +23,7 @@ option go_package = "github.com/unkeyed/unkey/gen/proto/ctrl/v1;ctrlv1"; // When an agent reconnects, it should initiate reconciliation to ensure consistency. service ClusterService { rpc Watch(WatchRequest) returns (stream State); + rpc Sync(SyncRequest) returns (stream State); rpc GetDesiredSentinelState(GetDesiredSentinelStateRequest) returns (SentinelState); rpc UpdateSentinelState(UpdateSentinelStateRequest) returns (UpdateSentinelStateResponse); @@ -78,6 +79,11 @@ message UpdateSentinelStateRequest { message UpdateSentinelStateResponse {} +message SyncRequest { + string cluster_id = 1; + string region = 2; +} + // WatchRequest identifies the cluster requesting a watch stream. message WatchRequest { // cluster_id uniquely identifies the client requesting the watch stream. @@ -85,21 +91,19 @@ message WatchRequest { string region = 2; - // live indicates whether the client wants live updates of changes - // if true, the stream never ends - bool live = 3; - - // synthetic indicates whether the client wants synthetic events - // if true, the server generates synthetic events to replay the full current desired state. - bool synthetic = 4; + // sequence_last_seen indicates the last sequence number the client has processed. + // This allows the server to send only new events since that sequence number, + // enabling efficient reconnection and resumption of the watch stream. + uint64 sequence_last_seen = 3; } message State { + uint64 sequence = 1; + oneof kind { - DeploymentState deployment = 1; - SentinelState sentinel = 2; + DeploymentState deployment = 2; + SentinelState sentinel = 3; } - optional string acknowledge_id = 3; } // SentinelState represents a lifecycle event for an API sentinel configuration. diff --git a/svc/ctrl/services/cluster/BUILD.bazel b/svc/ctrl/services/cluster/BUILD.bazel index 3cd796284d..ee2fafe840 100644 --- a/svc/ctrl/services/cluster/BUILD.bazel +++ b/svc/ctrl/services/cluster/BUILD.bazel @@ -7,6 +7,7 @@ go_library( "emit.go", "rpc_get_desired_deployment_state.go", "rpc_get_desired_sentinel_state.go", + "rpc_sync.go", "rpc_update_deployment_state.go", "rpc_update_sentinel_state.go", "rpc_watch.go", @@ -20,6 +21,7 @@ go_library( "//pkg/assert", "//pkg/db", "//pkg/otel/logging", + "//pkg/proto", "//pkg/uid", "@com_connectrpc_connect//:connect", ], diff --git a/svc/ctrl/services/cluster/rpc_sync.go b/svc/ctrl/services/cluster/rpc_sync.go new file mode 100644 index 0000000000..18dbc9f815 --- /dev/null +++ b/svc/ctrl/services/cluster/rpc_sync.go @@ -0,0 +1,164 @@ +package cluster + +import ( + "context" + "sync" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/db" +) + +func (s *Service) Sync(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest], stream *connect.ServerStream[ctrlv1.State]) error { + + region := req.Msg.GetRegion() + clusterID := req.Msg.GetClusterId() + + s.logger.Info("sync request received", + "region", region, + "clusterID", clusterID, + ) + + wg := sync.WaitGroup{} + + wg.Go(func() { + if err := s.getSyntheticDeployments(ctx, req, stream); err != nil { + s.logger.Error("failed to get synthetic deployments", "error", err) + } + }) + wg.Go(func() { + if err := s.getSyntheticSentinels(ctx, req, stream); err != nil { + s.logger.Error("failed to get synthetic sentinels", "error", err) + } + }) + + wg.Wait() + <-ctx.Done() + return ctx.Err() + +} + +func (s *Service) getSyntheticSentinels(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest], stream *connect.ServerStream[ctrlv1.State]) error { + + clusterID := req.Msg.GetClusterId() + region := req.Msg.GetRegion() + + s.logger.Debug("get all sentinels request received", + "cluster_id", clusterID, + "region", region, + ) + + cursor := "" + for { + sentinels, err := db.Query.ListDesiredSentinels(ctx, s.db.RO(), db.ListDesiredSentinelsParams{ + Region: region, + DesiredState: db.SentinelsDesiredStateRunning, + PaginationCursor: cursor, + Limit: 100, + }) + + if err != nil { + s.logger.Error("failed to get sentinels", "error", err.Error()) + return err + } + + if len(sentinels) == 0 { + break + } + cursor = sentinels[len(sentinels)-1].ID + + for _, s := range sentinels { + err = stream.Send(&ctrlv1.State{ + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + K8SName: s.K8sName, + WorkspaceId: s.WorkspaceID, + EnvironmentId: s.EnvironmentID, + ProjectId: s.ProjectID, + SentinelId: s.ID, + Image: s.Image, + Replicas: s.DesiredReplicas, + CpuMillicores: int64(s.CpuMillicores), + MemoryMib: int64(s.MemoryMib), + }, + }, + }, + }, + }) + if err != nil { + return err + } + + } + } + return nil + +} + +func (s *Service) getSyntheticDeployments(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest], stream *connect.ServerStream[ctrlv1.State]) error { + + clusterID := req.Msg.GetClusterId() + region := req.Msg.GetRegion() + + s.logger.Debug("get all sentinels request received", + "cluster_id", clusterID, + "region", region, + ) + + cursor := "" + for { + topologies, err := db.Query.ListDesiredDeploymentTopology(ctx, s.db.RO(), db.ListDesiredDeploymentTopologyParams{ + Region: region, + DesiredState: db.DeploymentsDesiredStateRunning, + PaginationCursor: cursor, + Limit: 1000, + }) + if err != nil { + s.logger.Error("failed to get topologies", "error", err.Error()) + return err + } + + if len(topologies) == 0 { + break + } + cursor = topologies[len(topologies)-1].DeploymentID + + for _, t := range topologies { + var buildID *string + if t.BuildID.Valid { + buildID = &t.BuildID.String + } + err = stream.Send(&ctrlv1.State{ + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + K8SNamespace: t.K8sNamespace.String, + K8SName: t.K8sName, + WorkspaceId: t.WorkspaceID, + EnvironmentId: t.EnvironmentID, + ProjectId: t.ProjectID, + DeploymentId: t.DeploymentID, + Image: t.Image.String, + Replicas: t.DesiredReplicas, + CpuMillicores: int64(t.CpuMillicores), + MemoryMib: int64(t.MemoryMib), + EncryptedEnvironmentVariables: t.EncryptedEnvironmentVariables, + ReadinessId: nil, + BuildId: buildID, + }, + }, + }, + }, + }) + if err != nil { + return err + } + + } + } + return nil + +} diff --git a/svc/ctrl/services/cluster/rpc_watch.go b/svc/ctrl/services/cluster/rpc_watch.go index 17de59df23..715d671429 100644 --- a/svc/ctrl/services/cluster/rpc_watch.go +++ b/svc/ctrl/services/cluster/rpc_watch.go @@ -2,182 +2,86 @@ package cluster import ( "context" - "sync" + "fmt" "connectrpc.com/connect" ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/assert" "github.com/unkeyed/unkey/pkg/db" + "github.com/unkeyed/unkey/pkg/proto" ) func (s *Service) Watch(ctx context.Context, req *connect.Request[ctrlv1.WatchRequest], stream *connect.ServerStream[ctrlv1.State]) error { region := req.Msg.GetRegion() clusterID := req.Msg.GetClusterId() + sequence := req.Msg.GetSequenceLastSeen() + + err := assert.All( + assert.NotEmpty(region, "region must not be empty"), + assert.NotEmpty(clusterID, "clusterID must not be empty"), + assert.Greater(sequence, 0, "sequence must be greater than 0"), + ) + if err != nil { + return connect.NewError(connect.CodeInvalidArgument, err) + } s.logger.Info("watch request received", "region", region, "clusterID", clusterID, + "sequence", sequence, ) - wg := sync.WaitGroup{} - - if req.Msg.GetSynthetic() { - - wg.Go(func() { - if err := s.getSyntheticDeployments(ctx, req, stream); err != nil { - s.logger.Error("failed to get synthetic deployments", "error", err) - } - }) - wg.Go(func() { - if err := s.getSyntheticSentinels(ctx, req, stream); err != nil { - s.logger.Error("failed to get synthetic sentinels", "error", err) - } - }) - - } - if req.Msg.GetLive() { - - s.clientsMu.Lock() - s.clients[clusterID] = newClient(clusterID, region, stream) - s.logger.Info("creating new client", "len", len(s.clients), "clients", s.clients) - s.clientsMu.Unlock() - defer func() { - s.clientsMu.Lock() - delete(s.clients, clusterID) - s.logger.Info("deleted client", "len", len(s.clients), "clients", s.clients) - s.clientsMu.Unlock() - }() - + changes, err := db.Query.FindStateChangesByClusterAfterSequence(ctx, s.db.RW(), db.FindStateChangesByClusterAfterSequenceParams{ + ClusterID: clusterID, + AfterSequence: sequence, + }) + if err != nil { + return connect.NewError(connect.CodeInternal, err) } - wg.Wait() - <-ctx.Done() - return ctx.Err() -} - -func (s *Service) getSyntheticSentinels(ctx context.Context, req *connect.Request[ctrlv1.WatchRequest], stream *connect.ServerStream[ctrlv1.State]) error { - - clusterID := req.Msg.GetClusterId() - region := req.Msg.GetRegion() - - s.logger.Debug("get all sentinels request received", - "cluster_id", clusterID, - "region", region, - ) - - cursor := "" - for { - sentinels, err := db.Query.ListDesiredSentinels(ctx, s.db.RO(), db.ListDesiredSentinelsParams{ - Region: region, - DesiredState: db.SentinelsDesiredStateRunning, - PaginationCursor: cursor, - Limit: 100, - }) + for _, change := range changes { - if err != nil { - s.logger.Error("failed to get sentinels", "error", err.Error()) - return err + msg := &ctrlv1.State{ + Sequence: change.Sequence, + Kind: nil, } - if len(sentinels) == 0 { - break - } - cursor = sentinels[len(sentinels)-1].ID - - for _, s := range sentinels { - err = stream.Send(&ctrlv1.State{ - AcknowledgeId: nil, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - K8SName: s.K8sName, - WorkspaceId: s.WorkspaceID, - EnvironmentId: s.EnvironmentID, - ProjectId: s.ProjectID, - SentinelId: s.ID, - Image: s.Image, - Replicas: s.DesiredReplicas, - CpuMillicores: int64(s.CpuMillicores), - MemoryMib: int64(s.MemoryMib), - }, - }, - }, - }, - }) + switch change.ResourceType { + case db.StateChangesResourceTypeSentinel: + sentinel := &ctrlv1.SentinelState{} + err = proto.Unmarshal(change.State, sentinel) if err != nil { - return err - } + return connect.NewError(connect.CodeInternal, err) - } - } - return nil - -} - -func (s *Service) getSyntheticDeployments(ctx context.Context, req *connect.Request[ctrlv1.WatchRequest], stream *connect.ServerStream[ctrlv1.State]) error { + } - clusterID := req.Msg.GetClusterId() - region := req.Msg.GetRegion() + msg.Kind = &ctrlv1.State_Sentinel{ + Sentinel: sentinel, + } + case db.StateChangesResourceTypeDeployment: + deployment := &ctrlv1.DeploymentState{} + err = proto.Unmarshal(change.State, deployment) + if err != nil { + return connect.NewError(connect.CodeInternal, err) - s.logger.Debug("get all sentinels request received", - "cluster_id", clusterID, - "region", region, - ) + } - cursor := "" - for { - topologies, err := db.Query.ListDesiredDeploymentTopology(ctx, s.db.RO(), db.ListDesiredDeploymentTopologyParams{ - Region: region, - DesiredState: db.DeploymentsDesiredStateRunning, - PaginationCursor: cursor, - Limit: 1000, - }) - if err != nil { - s.logger.Error("failed to get topologies", "error", err.Error()) - return err + msg.Kind = &ctrlv1.State_Deployment{ + Deployment: deployment, + } + default: + return connect.NewError(connect.CodeInternal, fmt.Errorf("unexpected resource type %T", change.ResourceType)) } - if len(topologies) == 0 { - break + err = stream.Send(msg) + if err != nil { + return connect.NewError(connect.CodeInternal, err) } - cursor = topologies[len(topologies)-1].DeploymentID - - for _, t := range topologies { - var buildID *string - if t.BuildID.Valid { - buildID = &t.BuildID.String - } - err = stream.Send(&ctrlv1.State{ - AcknowledgeId: nil, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - K8SNamespace: t.K8sNamespace.String, - K8SName: t.K8sName, - WorkspaceId: t.WorkspaceID, - EnvironmentId: t.EnvironmentID, - ProjectId: t.ProjectID, - DeploymentId: t.DeploymentID, - Image: t.Image.String, - Replicas: t.DesiredReplicas, - CpuMillicores: int64(t.CpuMillicores), - MemoryMib: int64(t.MemoryMib), - EncryptedEnvironmentVariables: t.EncryptedEnvironmentVariables, - ReadinessId: nil, - BuildId: buildID, - }, - }, - }, - }, - }) - if err != nil { - return err - } - } } - return nil + + <-ctx.Done() + return ctx.Err() } diff --git a/svc/ctrl/workflows/deploy/deploy_handler.go b/svc/ctrl/workflows/deploy/deploy_handler.go index 4170193939..e3f5d43eb5 100644 --- a/svc/ctrl/workflows/deploy/deploy_handler.go +++ b/svc/ctrl/workflows/deploy/deploy_handler.go @@ -232,29 +232,40 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy for _, sentinel := range sentinels { err = restate.RunVoid(ctx, func(runCtx restate.RunContext) error { - return w.cluster.EmitState(runCtx, sentinel.Region, - &ctrlv1.State{ - AcknowledgeId: nil, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - K8SName: sentinel.K8sName, - WorkspaceId: sentinel.WorkspaceID, - ProjectId: sentinel.ProjectID, - EnvironmentId: sentinel.EnvironmentID, - SentinelId: sentinel.ID, - Image: w.sentinelImage, - Replicas: sentinel.DesiredReplicas, - CpuMillicores: int64(sentinel.CpuMillicores), - MemoryMib: int64(sentinel.MemoryMib), - }, - }, - }, + + s := &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + K8SName: sentinel.K8sName, + WorkspaceId: sentinel.WorkspaceID, + ProjectId: sentinel.ProjectID, + EnvironmentId: sentinel.EnvironmentID, + SentinelId: sentinel.ID, + Image: w.sentinelImage, + Replicas: sentinel.DesiredReplicas, + CpuMillicores: int64(sentinel.CpuMillicores), + MemoryMib: int64(sentinel.MemoryMib), }, - }) + }, + } + state, err := proto.Marshal(s) + if err != nil { + return restate.TerminalError(err) + } - }, restate.WithName(fmt.Sprintf("emit sentinel apply for %s in %s", sentinel.ID, sentinel.Region))) + sequence, err := db.Query.InsertStateChange(ctx, w.db.RW(), db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + State: state, + ClusterID: sentinel.Region, + CreatedAt: uint64(time.Now().UnixMilli()), + }) + if err != nil { + return err + } + + _ = sequence + return nil + }, restate.WithName(fmt.Sprintf("schedule sentinel for %s in %s", sentinel.ID, sentinel.Region))) if err != nil { return nil, err } @@ -263,34 +274,47 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy for _, region := range topologies { err = restate.RunVoid(ctx, func(runCtx restate.RunContext) error { - return w.cluster.EmitState(runCtx, region.Region, - &ctrlv1.State{ - AcknowledgeId: nil, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - K8SNamespace: workspace.K8sNamespace.String, - K8SName: deployment.K8sName, - WorkspaceId: workspace.ID, - ProjectId: deployment.ProjectID, - EnvironmentId: deployment.EnvironmentID, - DeploymentId: deployment.ID, - Image: dockerImage, - Replicas: region.DesiredReplicas, - CpuMillicores: int64(deployment.CpuMillicores), - MemoryMib: int64(deployment.MemoryMib), - BuildId: buildID, - EncryptedEnvironmentVariables: deployment.EncryptedEnvironmentVariables, - ReadinessId: ptr.P(deployment.ID), - }, - }, - }, + + s := &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + K8SNamespace: workspace.K8sNamespace.String, + K8SName: deployment.K8sName, + WorkspaceId: workspace.ID, + ProjectId: deployment.ProjectID, + EnvironmentId: deployment.EnvironmentID, + DeploymentId: deployment.ID, + Image: dockerImage, + Replicas: region.DesiredReplicas, + CpuMillicores: int64(deployment.CpuMillicores), + MemoryMib: int64(deployment.MemoryMib), + BuildId: buildID, + EncryptedEnvironmentVariables: deployment.EncryptedEnvironmentVariables, + ReadinessId: ptr.P(deployment.ID), }, }, - ) + } + + state, err := proto.Marshal(s) + if err != nil { + return restate.TerminalError(err) + } + + sequence, err := db.Query.InsertStateChange(ctx, w.db.RW(), db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + State: state, + ClusterID: region.Region, + CreatedAt: uint64(time.Now().UnixMilli()), + }) + if err != nil { + return err + } + + _ = sequence + + return nil - }, restate.WithName(fmt.Sprintf("emit deployment apply %s in %s", deployment.ID, region.Region))) + }, restate.WithName(fmt.Sprintf("schedule deployment %s in %s", deployment.ID, region.Region))) if err != nil { return nil, err } diff --git a/svc/krane/internal/reconciler/BUILD.bazel b/svc/krane/internal/reconciler/BUILD.bazel index 8a9fc598f0..eb65ef4f1c 100644 --- a/svc/krane/internal/reconciler/BUILD.bazel +++ b/svc/krane/internal/reconciler/BUILD.bazel @@ -1,4 +1,4 @@ -load("@rules_go//go:def.bzl", "go_library", "go_test") +load("@rules_go//go:def.bzl", "go_library") go_library( name = "reconciler", @@ -17,6 +17,7 @@ go_library( "update_state.go", "watch_current_deployments.go", "watch_current_sentinels.go", + "watcher.go", ], importpath = "github.com/unkeyed/unkey/svc/krane/internal/reconciler", visibility = ["//svc/krane:__subpackages__"], @@ -41,46 +42,3 @@ go_library( "@io_k8s_sigs_controller_runtime//pkg/client", ], ) - -go_test( - name = "reconciler_test", - size = "small", - srcs = [ - "apply_deployment_test.go", - "apply_sentinel_test.go", - "delete_deployment_test.go", - "delete_sentinel_test.go", - "handle_state_test.go", - "mock_cluster_client_test.go", - "namespace_test.go", - "reconciler_test.go", - "refresh_current_deployments_test.go", - "refresh_current_sentinels_test.go", - "test_helpers_test.go", - "update_state_test.go", - "watch_current_deployments_test.go", - "watch_current_sentinels_test.go", - ], - embed = [":reconciler"], - deps = [ - "//gen/proto/ctrl/v1:ctrl", - "//gen/proto/ctrl/v1/ctrlv1connect", - "//pkg/circuitbreaker", - "//pkg/otel/logging", - "//pkg/ptr", - "//svc/krane/pkg/labels", - "@com_connectrpc_connect//:connect", - "@com_github_stretchr_testify//require", - "@io_k8s_api//apps/v1:apps", - "@io_k8s_api//core/v1:core", - "@io_k8s_apimachinery//pkg/api/errors", - "@io_k8s_apimachinery//pkg/api/resource", - "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", - "@io_k8s_apimachinery//pkg/runtime", - "@io_k8s_apimachinery//pkg/runtime/schema", - "@io_k8s_apimachinery//pkg/types", - "@io_k8s_apimachinery//pkg/watch", - "@io_k8s_client_go//kubernetes/fake", - "@io_k8s_client_go//testing", - ], -) diff --git a/svc/krane/internal/reconciler/apply_deployment_test.go b/svc/krane/internal/reconciler/apply_deployment_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/apply_deployment_test.go rename to svc/krane/internal/reconciler/apply_deployment_test.go_ diff --git a/svc/krane/internal/reconciler/apply_sentinel_test.go b/svc/krane/internal/reconciler/apply_sentinel_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/apply_sentinel_test.go rename to svc/krane/internal/reconciler/apply_sentinel_test.go_ diff --git a/svc/krane/internal/reconciler/delete_deployment_test.go b/svc/krane/internal/reconciler/delete_deployment_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/delete_deployment_test.go rename to svc/krane/internal/reconciler/delete_deployment_test.go_ diff --git a/svc/krane/internal/reconciler/delete_sentinel_test.go b/svc/krane/internal/reconciler/delete_sentinel_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/delete_sentinel_test.go rename to svc/krane/internal/reconciler/delete_sentinel_test.go_ diff --git a/svc/krane/internal/reconciler/handle_state_test.go b/svc/krane/internal/reconciler/handle_state_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/handle_state_test.go rename to svc/krane/internal/reconciler/handle_state_test.go_ diff --git a/svc/krane/internal/reconciler/mock_cluster_client_test.go b/svc/krane/internal/reconciler/mock_cluster_client_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/mock_cluster_client_test.go rename to svc/krane/internal/reconciler/mock_cluster_client_test.go_ diff --git a/svc/krane/internal/reconciler/namespace_test.go b/svc/krane/internal/reconciler/namespace_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/namespace_test.go rename to svc/krane/internal/reconciler/namespace_test.go_ diff --git a/svc/krane/internal/reconciler/reconciler.go b/svc/krane/internal/reconciler/reconciler.go index 57d4ba6df6..3693284840 100644 --- a/svc/krane/internal/reconciler/reconciler.go +++ b/svc/krane/internal/reconciler/reconciler.go @@ -4,6 +4,7 @@ import ( "context" "fmt" + "connectrpc.com/connect" ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" "github.com/unkeyed/unkey/pkg/circuitbreaker" @@ -27,7 +28,10 @@ type Reconciler struct { cluster ctrlv1connect.ClusterServiceClient cb circuitbreaker.CircuitBreaker[any] done chan struct{} + clusterID string region string + // last seen sequence + sequence uint64 } // Config holds the configuration required to create a new [Reconciler]. @@ -48,7 +52,9 @@ func New(cfg Config) *Reconciler { cluster: cfg.Cluster, cb: circuitbreaker.New[any]("reconciler_state_update"), done: make(chan struct{}), + clusterID: cfg.ClusterID, region: cfg.Region, + sequence: 0, } } @@ -68,6 +74,26 @@ func (r *Reconciler) Start(ctx context.Context) error { return err } + stream, err := r.cluster.Sync(ctx, connect.NewRequest(&ctrlv1.SyncRequest{ + ClusterId: r.clusterID, + Region: r.region, + })) + if err != nil { + return err + } + + for stream.Receive() { + if err := r.HandleState(ctx, stream.Msg()); err != nil { + r.logger.Error("error handling state", "error", err) + } + } + err = stream.Close() + if err != nil { + r.logger.Error("unable to close stream", "error", err) + } + + go r.Watch(ctx) + return nil } @@ -117,6 +143,7 @@ func (r *Reconciler) HandleState(ctx context.Context, state *ctrlv1.State) error return fmt.Errorf("unknown state type: %T", kind) } + r.sequence = state.GetSequence() return nil } diff --git a/svc/krane/internal/reconciler/reconciler_test.go b/svc/krane/internal/reconciler/reconciler_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/reconciler_test.go rename to svc/krane/internal/reconciler/reconciler_test.go_ diff --git a/svc/krane/internal/reconciler/refresh_current_deployments_test.go b/svc/krane/internal/reconciler/refresh_current_deployments_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/refresh_current_deployments_test.go rename to svc/krane/internal/reconciler/refresh_current_deployments_test.go_ diff --git a/svc/krane/internal/reconciler/refresh_current_sentinels_test.go b/svc/krane/internal/reconciler/refresh_current_sentinels_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/refresh_current_sentinels_test.go rename to svc/krane/internal/reconciler/refresh_current_sentinels_test.go_ diff --git a/svc/krane/internal/reconciler/test_helpers_test.go b/svc/krane/internal/reconciler/test_helpers_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/test_helpers_test.go rename to svc/krane/internal/reconciler/test_helpers_test.go_ diff --git a/svc/krane/internal/reconciler/update_state_test.go b/svc/krane/internal/reconciler/update_state_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/update_state_test.go rename to svc/krane/internal/reconciler/update_state_test.go_ diff --git a/svc/krane/internal/reconciler/watch_current_deployments_test.go b/svc/krane/internal/reconciler/watch_current_deployments_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/watch_current_deployments_test.go rename to svc/krane/internal/reconciler/watch_current_deployments_test.go_ diff --git a/svc/krane/internal/reconciler/watch_current_sentinels_test.go b/svc/krane/internal/reconciler/watch_current_sentinels_test.go_ similarity index 100% rename from svc/krane/internal/reconciler/watch_current_sentinels_test.go rename to svc/krane/internal/reconciler/watch_current_sentinels_test.go_ diff --git a/svc/krane/internal/reconciler/watcher.go b/svc/krane/internal/reconciler/watcher.go new file mode 100644 index 0000000000..a89a48b848 --- /dev/null +++ b/svc/krane/internal/reconciler/watcher.go @@ -0,0 +1,53 @@ +package reconciler + +import ( + "context" + "math/rand/v2" + "time" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" +) + +func (r *Reconciler) Watch(ctx context.Context) { + + intervalMin := time.Second + intervalMax := 5 * time.Second + + for { + + interval := intervalMin + time.Duration(rand.Float64()*float64(intervalMax.Milliseconds()-intervalMin.Milliseconds())) + time.Sleep(interval) + + err := r.watch(ctx) + if err != nil { + r.logger.Error("error while watching for state changes", "error", err) + } + + } + +} + +func (r *Reconciler) watch(ctx context.Context) error { + + stream, err := r.cluster.Watch(ctx, connect.NewRequest(&ctrlv1.WatchRequest{ + ClusterId: r.clusterID, + Region: r.region, + SequenceLastSeen: r.sequence, + })) + if err != nil { + return err + } + + for stream.Receive() { + if err := r.HandleState(ctx, stream.Msg()); err != nil { + r.logger.Error("error handling state", "error", err) + } + } + err = stream.Close() + if err != nil { + r.logger.Error("unable to close stream", "error", err) + } + return nil + +} diff --git a/svc/krane/pkg/controlplane/BUILD.bazel b/svc/krane/pkg/controlplane/BUILD.bazel index a908a5a10c..5a89de4750 100644 --- a/svc/krane/pkg/controlplane/BUILD.bazel +++ b/svc/krane/pkg/controlplane/BUILD.bazel @@ -5,15 +5,11 @@ go_library( srcs = [ "client.go", "interceptor.go", - "watcher.go", ], importpath = "github.com/unkeyed/unkey/svc/krane/pkg/controlplane", visibility = ["//visibility:public"], deps = [ - "//gen/proto/ctrl/v1:ctrl", "//gen/proto/ctrl/v1/ctrlv1connect", - "//pkg/otel/logging", - "//pkg/repeat", "@com_connectrpc_connect//:connect", ], ) diff --git a/svc/krane/pkg/controlplane/watcher.go b/svc/krane/pkg/controlplane/watcher.go deleted file mode 100644 index 51d455e4f2..0000000000 --- a/svc/krane/pkg/controlplane/watcher.go +++ /dev/null @@ -1,135 +0,0 @@ -package controlplane - -import ( - "context" - "time" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" - "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/pkg/repeat" -) - -// Watcher provides event streaming capabilities from the control plane service. -// -// The watcher handles both live streaming for real-time events and periodic -// synchronization for full state reconciliation. It implements automatic -// reconnection with exponential backoff and provides a buffered channel -// for consuming events. -// -// The type parameter T represents the specific event type being watched. -type Watcher struct { - logger logging.Logger - clusterID string - region string - cluster ctrlv1connect.ClusterServiceClient -} - -// WatcherConfig holds the configuration for creating a new Watcher. -// -// All fields are required for proper watcher operation. The CreateStream -// function should typically be a method on a control plane client that -// establishes the streaming connection. -type WatcherConfig struct { - // Logger is used for logging watcher events and errors. - Logger logging.Logger - - // Cluster ID uniquely identifies this watcher instance to the control plane. - ClusterID string - - // Region identifies the geographical region for filtering events. - Region string - - // Cluster is the control plane client used to establish the streaming connection. - Cluster ctrlv1connect.ClusterServiceClient -} - -// NewWatcher creates a new event watcher with the specified configuration. -// -// The watcher is initialized with a 1000-event buffer that does not drop -// events when full. Events are buffered internally and made available through -// the Consume() method. -// -// The returned watcher is safe for concurrent use, but Watch() and Sync() -// should typically be run in separate goroutines. -func NewWatcher(cfg WatcherConfig) *Watcher { - w := &Watcher{ - logger: cfg.Logger, - clusterID: cfg.ClusterID, - region: cfg.Region, - cluster: cfg.Cluster, - } - - return w -} - -func (w *Watcher) Start(ctx context.Context, handle func(context.Context, *ctrlv1.State) error) { - - repeat.Every(time.Minute, func() { - w.logger.Info("Pulling synthetic state from control plane") - stream, err := w.cluster.Watch(ctx, connect.NewRequest(&ctrlv1.WatchRequest{ - ClusterId: w.clusterID, - Region: w.region, - Synthetic: true, - Live: false, - })) - if err != nil { - w.logger.Error("unable to connect to control plane", "error", err) - return - } - for stream.Receive() { - if err := handle(ctx, stream.Msg()); err != nil { - w.logger.Error("error handling state", "error", err) - } - } - err = stream.Close() - if err != nil { - w.logger.Error("unable to close stream", "error", err) - } - - }) - - go func() { - w.logger.Info("Starting control plane watcher") - - consecutiveFailures := 0 - var stream *connect.ServerStreamForClient[ctrlv1.State] - var err error - for { - if stream == nil { - stream, err = w.cluster.Watch(ctx, connect.NewRequest(&ctrlv1.WatchRequest{ - ClusterId: w.clusterID, - Region: w.region, - Synthetic: false, - Live: true, - })) - if err != nil { - consecutiveFailures++ - w.logger.Error("unable to connect to control plane", "consecutive_failures", consecutiveFailures) - time.Sleep(time.Duration(min(60, consecutiveFailures)) * time.Second) - continue - } else { - consecutiveFailures = 0 - } - } - w.logger.Info("control plane watch stream started") - - hasMsg := stream.Receive() - if !hasMsg { - w.logger.Info("Stream ended, reconnecting...", - "error", stream.Err(), - ) - stream = nil - time.Sleep(time.Second) - continue - } - msg := stream.Msg() - w.logger.Info("control plane watch stream received message", "message", msg) - if err := handle(ctx, msg); err != nil { - w.logger.Error("error handling state", "error", err) - } - } - }() - -} diff --git a/svc/krane/run.go b/svc/krane/run.go index d0b2effef3..90bff4a7ea 100644 --- a/svc/krane/run.go +++ b/svc/krane/run.go @@ -70,13 +70,6 @@ func Run(ctx context.Context, cfg Config) error { ClusterID: cfg.ClusterID, }) - w := controlplane.NewWatcher(controlplane.WatcherConfig{ - Logger: logger, - ClusterID: cfg.ClusterID, - Region: cfg.Region, - Cluster: cluster, - }) - inClusterConfig, err := rest.InClusterConfig() if err != nil { return fmt.Errorf("failed to create in-cluster config: %w", err) @@ -99,7 +92,6 @@ func Run(ctx context.Context, cfg Config) error { } shutdowns.Register(r.Stop) - w.Start(ctx, r.HandleState) // Create vault service for secrets decryption diff --git a/web/internal/db/src/schema/index.ts b/web/internal/db/src/schema/index.ts index be39a48f1c..a1cbfb2a19 100644 --- a/web/internal/db/src/schema/index.ts +++ b/web/internal/db/src/schema/index.ts @@ -26,3 +26,5 @@ export * from "./sentinels"; export * from "./instances"; export * from "./certificates"; export * from "./frontline_routes"; + +export * from "./state_changes"; diff --git a/web/internal/db/src/schema/state_changes.ts b/web/internal/db/src/schema/state_changes.ts new file mode 100644 index 0000000000..aee89c8cbe --- /dev/null +++ b/web/internal/db/src/schema/state_changes.ts @@ -0,0 +1,22 @@ +import { bigint, index, mysqlEnum, mysqlTable, varchar } from "drizzle-orm/mysql-core"; +import { longblob } from "./util/longblob"; + +export const stateChanges = mysqlTable( + "state_changes", + { + sequence: bigint("sequence", { mode: "number", unsigned: true }).autoincrement().primaryKey(), + + // The apply or delete protobuf blob + + resourceType: mysqlEnum("resource_type", ["sentinel", "deployment"]).notNull(), + state: longblob("state").notNull(), + + clusterId: varchar("cluster_id", { length: 256 }).notNull(), + + createdAt: bigint("created_at", { + mode: "number", + unsigned: true, + }).notNull(), + }, + (table) => [index("cluster_id_sequence").on(table.clusterId, table.sequence)], +); From b849e507aa539e6a0fe958e03fc9419eb3338cfe Mon Sep 17 00:00:00 2001 From: chronark Date: Mon, 19 Jan 2026 14:07:18 +0100 Subject: [PATCH 02/32] wip: make krane initiates smaller watches --- cmd/krane/main.go | 1 - docs/rfcs/list-watch-sync.md | 0 gen/proto/ctrl/v1/cluster.pb.go | 172 ++- .../ctrl/v1/ctrlv1connect/cluster.connect.go | 28 - internal/services/keys/service.go | 14 - pkg/db/BUILD.bazel | 6 +- ...eployment_topology_insert.sql_generated.go | 4 +- .../bulk_environment_insert.sql_generated.go | 4 +- .../bulk_environment_upsert.sql_generated.go | 4 +- pkg/db/bulk_identity_insert.sql_generated.go | 4 +- ...identity_insert_ratelimit.sql_generated.go | 4 +- pkg/db/bulk_identity_upsert.sql_generated.go | 4 +- ...bulk_ingress_route_insert.sql_generated.go | 4 +- pkg/db/bulk_instance_upsert.sql_generated.go | 9 +- pkg/db/bulk_key_auth_insert.sql_generated.go | 4 +- ...ulk_key_encryption_insert.sql_generated.go | 4 +- pkg/db/bulk_key_insert.sql_generated.go | 4 +- ...bulk_key_insert_ratelimit.sql_generated.go | 4 +- ...bulk_key_migration_insert.sql_generated.go | 4 +- ...ulk_key_permission_insert.sql_generated.go | 4 +- pkg/db/bulk_key_role_insert.sql_generated.go | 4 +- pkg/db/bulk_key_space_insert.sql_generated.go | 4 +- pkg/db/bulk_key_space_upsert.sql_generated.go | 4 +- .../bulk_permission_insert.sql_generated.go | 4 +- pkg/db/bulk_project_insert.sql_generated.go | 4 +- pkg/db/bulk_quota_upsert.sql_generated.go | 4 +- ...atelimit_namespace_insert.sql_generated.go | 4 +- ...ratelimit_override_insert.sql_generated.go | 4 +- pkg/db/bulk_role_insert.sql_generated.go | 4 +- ...lk_role_permission_insert.sql_generated.go | 4 +- pkg/db/bulk_sentinel_insert.sql_generated.go | 4 +- .../bulk_state_change_insert.sql_generated.go | 13 +- pkg/db/bulk_workspace_insert.sql_generated.go | 4 +- pkg/db/bulk_workspace_upsert.sql_generated.go | 4 +- ...ployment_delete_instances.sql_generated.go | 8 +- ...ent_topology_find_regions.sql_generated.go | 45 + ...ent_topology_list_desired.sql_generated.go | 95 +- pkg/db/instance_delete.sql_generated.go | 11 +- pkg/db/instance_upsert.sql_generated.go | 6 - ...ces_find_by_deployment_id.sql_generated.go | 5 +- ..._deployment_id_and_region.sql_generated.go | 5 +- ...nstances_find_by_pod_name.sql_generated.go | 16 +- pkg/db/models_generated.go | 48 +- pkg/db/querier_generated.go | 92 +- .../queries/deployment_delete_instances.sql | 2 +- .../deployment_topology_find_regions.sql | 6 + .../deployment_topology_list_desired.sql | 19 +- pkg/db/queries/instance_delete.sql | 2 +- pkg/db/queries/instance_upsert.sql | 2 - pkg/db/queries/instances_find_by_pod_name.sql | 2 +- pkg/db/queries/sentinel_list_desired.sql | 3 + ..._change_find_by_cluster_after_sequence.sql | 6 - ...e_change_find_by_region_after_sequence.sql | 11 + .../queries/state_change_get_max_sequence.sql | 6 + .../queries/state_change_get_min_sequence.sql | 6 + pkg/db/queries/state_change_insert.sql | 10 +- pkg/db/schema.sql | 13 +- pkg/db/sentinel_list_desired.sql_generated.go | 4 +- .../state_change_delete_old.sql_generated.go | 30 + ...by_cluster_after_sequence.sql_generated.go | 59 - ..._by_region_after_sequence.sql_generated.go | 72 + ...e_change_get_max_sequence.sql_generated.go | 29 + ...e_change_get_min_sequence.sql_generated.go | 29 + pkg/db/state_change_insert.sql_generated.go | 22 +- svc/ctrl/integration/BUILD.bazel | 16 + svc/ctrl/integration/harness.go | 251 ++++ svc/ctrl/integration/sync_test.go | 1199 +++++++++++++++++ svc/ctrl/proto/ctrl/v1/cluster.proto | 25 +- svc/ctrl/services/cluster/BUILD.bazel | 3 - svc/ctrl/services/cluster/emit.go | 25 - .../rpc_get_desired_deployment_state.go | 1 - svc/ctrl/services/cluster/rpc_sync.go | 380 ++++-- .../cluster/rpc_update_deployment_state.go | 10 +- svc/ctrl/services/cluster/rpc_watch.go | 87 -- svc/ctrl/services/cluster/service.go | 25 - svc/ctrl/workflows/deploy/BUILD.bazel | 1 - svc/ctrl/workflows/deploy/deploy_handler.go | 175 +-- svc/krane/config.go | 2 - svc/krane/doc.go | 4 +- svc/krane/internal/reconciler/BUILD.bazel | 46 +- ...ment_test.go_ => apply_deployment_test.go} | 0 ...ntinel_test.go_ => apply_sentinel_test.go} | 0 ...ent_test.go_ => delete_deployment_test.go} | 0 ...tinel_test.go_ => delete_sentinel_test.go} | 0 svc/krane/internal/reconciler/doc.go | 1 - ...le_state_test.go_ => handle_state_test.go} | 0 ...t_test.go_ => mock_cluster_client_test.go} | 8 +- .../{namespace_test.go_ => namespace_test.go} | 0 svc/krane/internal/reconciler/reconciler.go | 99 +- ...reconciler_test.go_ => reconciler_test.go} | 26 - ...o_ => refresh_current_deployments_test.go} | 0 ....go_ => refresh_current_sentinels_test.go} | 0 .../reconciler/sequence_tracking_test.go | 347 +++++ ..._helpers_test.go_ => test_helpers_test.go} | 14 +- ...te_state_test.go_ => update_state_test.go} | 0 ....go_ => watch_current_deployments_test.go} | 0 ...st.go_ => watch_current_sentinels_test.go} | 0 svc/krane/internal/reconciler/watcher.go | 10 +- svc/krane/internal/reconciler/watcher_test.go | 552 ++++++++ svc/krane/pkg/controlplane/client.go | 6 +- svc/krane/pkg/controlplane/interceptor.go | 14 +- svc/krane/run.go | 2 - web/internal/db/src/schema/instances.ts | 6 +- web/internal/db/src/schema/state_changes.ts | 30 +- 104 files changed, 3438 insertions(+), 938 deletions(-) create mode 100644 docs/rfcs/list-watch-sync.md create mode 100644 pkg/db/deployment_topology_find_regions.sql_generated.go create mode 100644 pkg/db/queries/deployment_topology_find_regions.sql delete mode 100644 pkg/db/queries/state_change_find_by_cluster_after_sequence.sql create mode 100644 pkg/db/queries/state_change_find_by_region_after_sequence.sql create mode 100644 pkg/db/queries/state_change_get_max_sequence.sql create mode 100644 pkg/db/queries/state_change_get_min_sequence.sql create mode 100644 pkg/db/state_change_delete_old.sql_generated.go delete mode 100644 pkg/db/state_change_find_by_cluster_after_sequence.sql_generated.go create mode 100644 pkg/db/state_change_find_by_region_after_sequence.sql_generated.go create mode 100644 pkg/db/state_change_get_max_sequence.sql_generated.go create mode 100644 pkg/db/state_change_get_min_sequence.sql_generated.go create mode 100644 svc/ctrl/integration/BUILD.bazel create mode 100644 svc/ctrl/integration/harness.go create mode 100644 svc/ctrl/integration/sync_test.go delete mode 100644 svc/ctrl/services/cluster/emit.go delete mode 100644 svc/ctrl/services/cluster/rpc_watch.go rename svc/krane/internal/reconciler/{apply_deployment_test.go_ => apply_deployment_test.go} (100%) rename svc/krane/internal/reconciler/{apply_sentinel_test.go_ => apply_sentinel_test.go} (100%) rename svc/krane/internal/reconciler/{delete_deployment_test.go_ => delete_deployment_test.go} (100%) rename svc/krane/internal/reconciler/{delete_sentinel_test.go_ => delete_sentinel_test.go} (100%) rename svc/krane/internal/reconciler/{handle_state_test.go_ => handle_state_test.go} (100%) rename svc/krane/internal/reconciler/{mock_cluster_client_test.go_ => mock_cluster_client_test.go} (89%) rename svc/krane/internal/reconciler/{namespace_test.go_ => namespace_test.go} (100%) rename svc/krane/internal/reconciler/{reconciler_test.go_ => reconciler_test.go} (80%) rename svc/krane/internal/reconciler/{refresh_current_deployments_test.go_ => refresh_current_deployments_test.go} (100%) rename svc/krane/internal/reconciler/{refresh_current_sentinels_test.go_ => refresh_current_sentinels_test.go} (100%) create mode 100644 svc/krane/internal/reconciler/sequence_tracking_test.go rename svc/krane/internal/reconciler/{test_helpers_test.go_ => test_helpers_test.go} (97%) rename svc/krane/internal/reconciler/{update_state_test.go_ => update_state_test.go} (100%) rename svc/krane/internal/reconciler/{watch_current_deployments_test.go_ => watch_current_deployments_test.go} (100%) rename svc/krane/internal/reconciler/{watch_current_sentinels_test.go_ => watch_current_sentinels_test.go} (100%) create mode 100644 svc/krane/internal/reconciler/watcher_test.go diff --git a/cmd/krane/main.go b/cmd/krane/main.go index 62ea2871b0..8256104a70 100644 --- a/cmd/krane/main.go +++ b/cmd/krane/main.go @@ -110,7 +110,6 @@ func action(ctx context.Context, cmd *cli.Command) error { PrometheusPort: cmd.RequireInt("prometheus-port"), ControlPlaneURL: cmd.RequireString("control-plane-url"), ControlPlaneBearer: cmd.RequireString("control-plane-bearer"), - ClusterID: cmd.RequireString("cluster-id"), } // Validate configuration diff --git a/docs/rfcs/list-watch-sync.md b/docs/rfcs/list-watch-sync.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/gen/proto/ctrl/v1/cluster.pb.go b/gen/proto/ctrl/v1/cluster.pb.go index 77cf212472..397556c871 100644 --- a/gen/proto/ctrl/v1/cluster.pb.go +++ b/gen/proto/ctrl/v1/cluster.pb.go @@ -414,11 +414,11 @@ func (*UpdateSentinelStateResponse) Descriptor() ([]byte, []int) { } type SyncRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - ClusterId string `protobuf:"bytes,1,opt,name=cluster_id,json=clusterId,proto3" json:"cluster_id,omitempty"` - Region string `protobuf:"bytes,2,opt,name=region,proto3" json:"region,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + Region string `protobuf:"bytes,1,opt,name=region,proto3" json:"region,omitempty"` + SequenceLastSeen uint64 `protobuf:"varint,2,opt,name=sequence_last_seen,json=sequenceLastSeen,proto3" json:"sequence_last_seen,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *SyncRequest) Reset() { @@ -451,48 +451,43 @@ func (*SyncRequest) Descriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{7} } -func (x *SyncRequest) GetClusterId() string { +func (x *SyncRequest) GetRegion() string { if x != nil { - return x.ClusterId + return x.Region } return "" } -func (x *SyncRequest) GetRegion() string { +func (x *SyncRequest) GetSequenceLastSeen() uint64 { if x != nil { - return x.Region + return x.SequenceLastSeen } - return "" + return 0 } -// WatchRequest identifies the cluster requesting a watch stream. -type WatchRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - // cluster_id uniquely identifies the client requesting the watch stream. - ClusterId string `protobuf:"bytes,1,opt,name=cluster_id,json=clusterId,proto3" json:"cluster_id,omitempty"` - Region string `protobuf:"bytes,2,opt,name=region,proto3" json:"region,omitempty"` - // sequence_last_seen indicates the last sequence number the client has processed. - // This allows the server to send only new events since that sequence number, - // enabling efficient reconnection and resumption of the watch stream. - SequenceLastSeen uint64 `protobuf:"varint,3,opt,name=sequence_last_seen,json=sequenceLastSeen,proto3" json:"sequence_last_seen,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache +// Bookmark is sent after bootstrap completes to signal the client is caught up. +// The client should persist this sequence to resume watch on reconnect. +type Bookmark struct { + state protoimpl.MessageState `protogen:"open.v1"` + Sequence uint64 `protobuf:"varint,1,opt,name=sequence,proto3" json:"sequence,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } -func (x *WatchRequest) Reset() { - *x = WatchRequest{} +func (x *Bookmark) Reset() { + *x = Bookmark{} mi := &file_ctrl_v1_cluster_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *WatchRequest) String() string { +func (x *Bookmark) String() string { return protoimpl.X.MessageStringOf(x) } -func (*WatchRequest) ProtoMessage() {} +func (*Bookmark) ProtoMessage() {} -func (x *WatchRequest) ProtoReflect() protoreflect.Message { +func (x *Bookmark) ProtoReflect() protoreflect.Message { mi := &file_ctrl_v1_cluster_proto_msgTypes[8] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) @@ -504,39 +499,29 @@ func (x *WatchRequest) ProtoReflect() protoreflect.Message { return mi.MessageOf(x) } -// Deprecated: Use WatchRequest.ProtoReflect.Descriptor instead. -func (*WatchRequest) Descriptor() ([]byte, []int) { +// Deprecated: Use Bookmark.ProtoReflect.Descriptor instead. +func (*Bookmark) Descriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{8} } -func (x *WatchRequest) GetClusterId() string { +func (x *Bookmark) GetSequence() uint64 { if x != nil { - return x.ClusterId - } - return "" -} - -func (x *WatchRequest) GetRegion() string { - if x != nil { - return x.Region - } - return "" -} - -func (x *WatchRequest) GetSequenceLastSeen() uint64 { - if x != nil { - return x.SequenceLastSeen + return x.Sequence } return 0 } type State struct { - state protoimpl.MessageState `protogen:"open.v1"` - Sequence uint64 `protobuf:"varint,1,opt,name=sequence,proto3" json:"sequence,omitempty"` + state protoimpl.MessageState `protogen:"open.v1"` + // sequence is the state_changes sequence number for this event. + // Clients should persist this after successfully processing each event + // to resume from the correct position on reconnect. + Sequence uint64 `protobuf:"varint,1,opt,name=sequence,proto3" json:"sequence,omitempty"` // Types that are valid to be assigned to Kind: // // *State_Deployment // *State_Sentinel + // *State_Bookmark Kind isState_Kind `protobuf_oneof:"kind"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache @@ -604,6 +589,15 @@ func (x *State) GetSentinel() *SentinelState { return nil } +func (x *State) GetBookmark() *Bookmark { + if x != nil { + if x, ok := x.Kind.(*State_Bookmark); ok { + return x.Bookmark + } + } + return nil +} + type isState_Kind interface { isState_Kind() } @@ -616,10 +610,16 @@ type State_Sentinel struct { Sentinel *SentinelState `protobuf:"bytes,3,opt,name=sentinel,proto3,oneof"` } +type State_Bookmark struct { + Bookmark *Bookmark `protobuf:"bytes,4,opt,name=bookmark,proto3,oneof"` +} + func (*State_Deployment) isState_Kind() {} func (*State_Sentinel) isState_Kind() {} +func (*State_Bookmark) isState_Kind() {} + // SentinelState represents a lifecycle event for an API sentinel configuration. // // Sentinels are frontline points for services, typically handling routing, load balancing, @@ -1013,7 +1013,6 @@ type ApplyDeployment struct { MemoryMib int64 `protobuf:"varint,10,opt,name=memory_mib,json=memoryMib,proto3" json:"memory_mib,omitempty"` BuildId *string `protobuf:"bytes,11,opt,name=build_id,json=buildId,proto3,oneof" json:"build_id,omitempty"` EncryptedEnvironmentVariables []byte `protobuf:"bytes,12,opt,name=encrypted_environment_variables,json=encryptedEnvironmentVariables,proto3" json:"encrypted_environment_variables,omitempty"` - ReadinessId *string `protobuf:"bytes,13,opt,name=readiness_id,json=readinessId,proto3,oneof" json:"readiness_id,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -1132,13 +1131,6 @@ func (x *ApplyDeployment) GetEncryptedEnvironmentVariables() []byte { return nil } -func (x *ApplyDeployment) GetReadinessId() string { - if x != nil && x.ReadinessId != nil { - return *x.ReadinessId - } - return "" -} - // DeleteDeployment identifies a deployment to remove from the cluster. // // The deployment and all its pods will be terminated gracefully according to @@ -1404,22 +1396,19 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "\x1aUpdateSentinelStateRequest\x12\x19\n" + "\bk8s_name\x18\x01 \x01(\tR\ak8sName\x12-\n" + "\x12available_replicas\x18\x02 \x01(\x05R\x11availableReplicas\"\x1d\n" + - "\x1bUpdateSentinelStateResponse\"D\n" + - "\vSyncRequest\x12\x1d\n" + - "\n" + - "cluster_id\x18\x01 \x01(\tR\tclusterId\x12\x16\n" + - "\x06region\x18\x02 \x01(\tR\x06region\"s\n" + - "\fWatchRequest\x12\x1d\n" + - "\n" + - "cluster_id\x18\x01 \x01(\tR\tclusterId\x12\x16\n" + - "\x06region\x18\x02 \x01(\tR\x06region\x12,\n" + - "\x12sequence_last_seen\x18\x03 \x01(\x04R\x10sequenceLastSeen\"\x9d\x01\n" + + "\x1bUpdateSentinelStateResponse\"S\n" + + "\vSyncRequest\x12\x16\n" + + "\x06region\x18\x01 \x01(\tR\x06region\x12,\n" + + "\x12sequence_last_seen\x18\x02 \x01(\x04R\x10sequenceLastSeen\"&\n" + + "\bBookmark\x12\x1a\n" + + "\bsequence\x18\x01 \x01(\x04R\bsequence\"\xce\x01\n" + "\x05State\x12\x1a\n" + "\bsequence\x18\x01 \x01(\x04R\bsequence\x12:\n" + "\n" + "deployment\x18\x02 \x01(\v2\x18.ctrl.v1.DeploymentStateH\x00R\n" + "deployment\x124\n" + - "\bsentinel\x18\x03 \x01(\v2\x16.ctrl.v1.SentinelStateH\x00R\bsentinelB\x06\n" + + "\bsentinel\x18\x03 \x01(\v2\x16.ctrl.v1.SentinelStateH\x00R\bsentinel\x12/\n" + + "\bbookmark\x18\x04 \x01(\v2\x11.ctrl.v1.BookmarkH\x00R\bbookmarkB\x06\n" + "\x04kind\"{\n" + "\rSentinelState\x12.\n" + "\x05apply\x18\x01 \x01(\v2\x16.ctrl.v1.ApplySentinelH\x00R\x05apply\x121\n" + @@ -1443,7 +1432,7 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "\n" + "memory_mib\x18\t \x01(\x03R\tmemoryMib\"+\n" + "\x0eDeleteSentinel\x12\x19\n" + - "\bk8s_name\x18\x01 \x01(\tR\ak8sName\"\x85\x04\n" + + "\bk8s_name\x18\x01 \x01(\tR\ak8sName\"\xcc\x03\n" + "\x0fApplyDeployment\x12#\n" + "\rk8s_namespace\x18\x01 \x01(\tR\fk8sNamespace\x12\x19\n" + "\bk8s_name\x18\x02 \x01(\tR\ak8sName\x12!\n" + @@ -1459,15 +1448,12 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "memory_mib\x18\n" + " \x01(\x03R\tmemoryMib\x12\x1e\n" + "\bbuild_id\x18\v \x01(\tH\x00R\abuildId\x88\x01\x01\x12F\n" + - "\x1fencrypted_environment_variables\x18\f \x01(\fR\x1dencryptedEnvironmentVariables\x12&\n" + - "\freadiness_id\x18\r \x01(\tH\x01R\vreadinessId\x88\x01\x01B\v\n" + - "\t_build_idB\x0f\n" + - "\r_readiness_id\"R\n" + + "\x1fencrypted_environment_variables\x18\f \x01(\fR\x1dencryptedEnvironmentVariablesB\v\n" + + "\t_build_id\"R\n" + "\x10DeleteDeployment\x12#\n" + "\rk8s_namespace\x18\x01 \x01(\tR\fk8sNamespace\x12\x19\n" + - "\bk8s_name\x18\x02 \x01(\tR\ak8sName2\xfa\x03\n" + - "\x0eClusterService\x120\n" + - "\x05Watch\x12\x15.ctrl.v1.WatchRequest\x1a\x0e.ctrl.v1.State0\x01\x12.\n" + + "\bk8s_name\x18\x02 \x01(\tR\ak8sName2\xc8\x03\n" + + "\x0eClusterService\x12.\n" + "\x04Sync\x12\x14.ctrl.v1.SyncRequest\x1a\x0e.ctrl.v1.State0\x01\x12Z\n" + "\x17GetDesiredSentinelState\x12'.ctrl.v1.GetDesiredSentinelStateRequest\x1a\x16.ctrl.v1.SentinelState\x12`\n" + "\x13UpdateSentinelState\x12#.ctrl.v1.UpdateSentinelStateRequest\x1a$.ctrl.v1.UpdateSentinelStateResponse\x12`\n" + @@ -1499,7 +1485,7 @@ var file_ctrl_v1_cluster_proto_goTypes = []any{ (*UpdateSentinelStateRequest)(nil), // 6: ctrl.v1.UpdateSentinelStateRequest (*UpdateSentinelStateResponse)(nil), // 7: ctrl.v1.UpdateSentinelStateResponse (*SyncRequest)(nil), // 8: ctrl.v1.SyncRequest - (*WatchRequest)(nil), // 9: ctrl.v1.WatchRequest + (*Bookmark)(nil), // 9: ctrl.v1.Bookmark (*State)(nil), // 10: ctrl.v1.State (*SentinelState)(nil), // 11: ctrl.v1.SentinelState (*DeploymentState)(nil), // 12: ctrl.v1.DeploymentState @@ -1516,29 +1502,28 @@ var file_ctrl_v1_cluster_proto_depIdxs = []int32{ 18, // 1: ctrl.v1.UpdateDeploymentStateRequest.delete:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Delete 12, // 2: ctrl.v1.State.deployment:type_name -> ctrl.v1.DeploymentState 11, // 3: ctrl.v1.State.sentinel:type_name -> ctrl.v1.SentinelState - 13, // 4: ctrl.v1.SentinelState.apply:type_name -> ctrl.v1.ApplySentinel - 14, // 5: ctrl.v1.SentinelState.delete:type_name -> ctrl.v1.DeleteSentinel - 15, // 6: ctrl.v1.DeploymentState.apply:type_name -> ctrl.v1.ApplyDeployment - 16, // 7: ctrl.v1.DeploymentState.delete:type_name -> ctrl.v1.DeleteDeployment - 19, // 8: ctrl.v1.UpdateDeploymentStateRequest.Update.instances:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance - 0, // 9: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.status:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.Status - 9, // 10: ctrl.v1.ClusterService.Watch:input_type -> ctrl.v1.WatchRequest + 9, // 4: ctrl.v1.State.bookmark:type_name -> ctrl.v1.Bookmark + 13, // 5: ctrl.v1.SentinelState.apply:type_name -> ctrl.v1.ApplySentinel + 14, // 6: ctrl.v1.SentinelState.delete:type_name -> ctrl.v1.DeleteSentinel + 15, // 7: ctrl.v1.DeploymentState.apply:type_name -> ctrl.v1.ApplyDeployment + 16, // 8: ctrl.v1.DeploymentState.delete:type_name -> ctrl.v1.DeleteDeployment + 19, // 9: ctrl.v1.UpdateDeploymentStateRequest.Update.instances:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance + 0, // 10: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.status:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.Status 8, // 11: ctrl.v1.ClusterService.Sync:input_type -> ctrl.v1.SyncRequest 1, // 12: ctrl.v1.ClusterService.GetDesiredSentinelState:input_type -> ctrl.v1.GetDesiredSentinelStateRequest 6, // 13: ctrl.v1.ClusterService.UpdateSentinelState:input_type -> ctrl.v1.UpdateSentinelStateRequest 2, // 14: ctrl.v1.ClusterService.GetDesiredDeploymentState:input_type -> ctrl.v1.GetDesiredDeploymentStateRequest 3, // 15: ctrl.v1.ClusterService.UpdateDeploymentState:input_type -> ctrl.v1.UpdateDeploymentStateRequest - 10, // 16: ctrl.v1.ClusterService.Watch:output_type -> ctrl.v1.State - 10, // 17: ctrl.v1.ClusterService.Sync:output_type -> ctrl.v1.State - 11, // 18: ctrl.v1.ClusterService.GetDesiredSentinelState:output_type -> ctrl.v1.SentinelState - 7, // 19: ctrl.v1.ClusterService.UpdateSentinelState:output_type -> ctrl.v1.UpdateSentinelStateResponse - 12, // 20: ctrl.v1.ClusterService.GetDesiredDeploymentState:output_type -> ctrl.v1.DeploymentState - 4, // 21: ctrl.v1.ClusterService.UpdateDeploymentState:output_type -> ctrl.v1.UpdateDeploymentStateResponse - 16, // [16:22] is the sub-list for method output_type - 10, // [10:16] is the sub-list for method input_type - 10, // [10:10] is the sub-list for extension type_name - 10, // [10:10] is the sub-list for extension extendee - 0, // [0:10] is the sub-list for field type_name + 10, // 16: ctrl.v1.ClusterService.Sync:output_type -> ctrl.v1.State + 11, // 17: ctrl.v1.ClusterService.GetDesiredSentinelState:output_type -> ctrl.v1.SentinelState + 7, // 18: ctrl.v1.ClusterService.UpdateSentinelState:output_type -> ctrl.v1.UpdateSentinelStateResponse + 12, // 19: ctrl.v1.ClusterService.GetDesiredDeploymentState:output_type -> ctrl.v1.DeploymentState + 4, // 20: ctrl.v1.ClusterService.UpdateDeploymentState:output_type -> ctrl.v1.UpdateDeploymentStateResponse + 16, // [16:21] is the sub-list for method output_type + 11, // [11:16] is the sub-list for method input_type + 11, // [11:11] is the sub-list for extension type_name + 11, // [11:11] is the sub-list for extension extendee + 0, // [0:11] is the sub-list for field type_name } func init() { file_ctrl_v1_cluster_proto_init() } @@ -1553,6 +1538,7 @@ func file_ctrl_v1_cluster_proto_init() { file_ctrl_v1_cluster_proto_msgTypes[9].OneofWrappers = []any{ (*State_Deployment)(nil), (*State_Sentinel)(nil), + (*State_Bookmark)(nil), } file_ctrl_v1_cluster_proto_msgTypes[10].OneofWrappers = []any{ (*SentinelState_Apply)(nil), diff --git a/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go b/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go index a287c3c0f9..86169fff7e 100644 --- a/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go +++ b/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go @@ -42,8 +42,6 @@ const ( // reflection-formatted method names, remove the leading slash and convert the remaining slash to a // period. const ( - // ClusterServiceWatchProcedure is the fully-qualified name of the ClusterService's Watch RPC. - ClusterServiceWatchProcedure = "/ctrl.v1.ClusterService/Watch" // ClusterServiceSyncProcedure is the fully-qualified name of the ClusterService's Sync RPC. ClusterServiceSyncProcedure = "/ctrl.v1.ClusterService/Sync" // ClusterServiceGetDesiredSentinelStateProcedure is the fully-qualified name of the @@ -62,7 +60,6 @@ const ( // ClusterServiceClient is a client for the ctrl.v1.ClusterService service. type ClusterServiceClient interface { - Watch(context.Context, *connect.Request[v1.WatchRequest]) (*connect.ServerStreamForClient[v1.State], error) Sync(context.Context, *connect.Request[v1.SyncRequest]) (*connect.ServerStreamForClient[v1.State], error) GetDesiredSentinelState(context.Context, *connect.Request[v1.GetDesiredSentinelStateRequest]) (*connect.Response[v1.SentinelState], error) UpdateSentinelState(context.Context, *connect.Request[v1.UpdateSentinelStateRequest]) (*connect.Response[v1.UpdateSentinelStateResponse], error) @@ -81,12 +78,6 @@ func NewClusterServiceClient(httpClient connect.HTTPClient, baseURL string, opts baseURL = strings.TrimRight(baseURL, "/") clusterServiceMethods := v1.File_ctrl_v1_cluster_proto.Services().ByName("ClusterService").Methods() return &clusterServiceClient{ - watch: connect.NewClient[v1.WatchRequest, v1.State]( - httpClient, - baseURL+ClusterServiceWatchProcedure, - connect.WithSchema(clusterServiceMethods.ByName("Watch")), - connect.WithClientOptions(opts...), - ), sync: connect.NewClient[v1.SyncRequest, v1.State]( httpClient, baseURL+ClusterServiceSyncProcedure, @@ -122,7 +113,6 @@ func NewClusterServiceClient(httpClient connect.HTTPClient, baseURL string, opts // clusterServiceClient implements ClusterServiceClient. type clusterServiceClient struct { - watch *connect.Client[v1.WatchRequest, v1.State] sync *connect.Client[v1.SyncRequest, v1.State] getDesiredSentinelState *connect.Client[v1.GetDesiredSentinelStateRequest, v1.SentinelState] updateSentinelState *connect.Client[v1.UpdateSentinelStateRequest, v1.UpdateSentinelStateResponse] @@ -130,11 +120,6 @@ type clusterServiceClient struct { updateDeploymentState *connect.Client[v1.UpdateDeploymentStateRequest, v1.UpdateDeploymentStateResponse] } -// Watch calls ctrl.v1.ClusterService.Watch. -func (c *clusterServiceClient) Watch(ctx context.Context, req *connect.Request[v1.WatchRequest]) (*connect.ServerStreamForClient[v1.State], error) { - return c.watch.CallServerStream(ctx, req) -} - // Sync calls ctrl.v1.ClusterService.Sync. func (c *clusterServiceClient) Sync(ctx context.Context, req *connect.Request[v1.SyncRequest]) (*connect.ServerStreamForClient[v1.State], error) { return c.sync.CallServerStream(ctx, req) @@ -162,7 +147,6 @@ func (c *clusterServiceClient) UpdateDeploymentState(ctx context.Context, req *c // ClusterServiceHandler is an implementation of the ctrl.v1.ClusterService service. type ClusterServiceHandler interface { - Watch(context.Context, *connect.Request[v1.WatchRequest], *connect.ServerStream[v1.State]) error Sync(context.Context, *connect.Request[v1.SyncRequest], *connect.ServerStream[v1.State]) error GetDesiredSentinelState(context.Context, *connect.Request[v1.GetDesiredSentinelStateRequest]) (*connect.Response[v1.SentinelState], error) UpdateSentinelState(context.Context, *connect.Request[v1.UpdateSentinelStateRequest]) (*connect.Response[v1.UpdateSentinelStateResponse], error) @@ -177,12 +161,6 @@ type ClusterServiceHandler interface { // and JSON codecs. They also support gzip compression. func NewClusterServiceHandler(svc ClusterServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { clusterServiceMethods := v1.File_ctrl_v1_cluster_proto.Services().ByName("ClusterService").Methods() - clusterServiceWatchHandler := connect.NewServerStreamHandler( - ClusterServiceWatchProcedure, - svc.Watch, - connect.WithSchema(clusterServiceMethods.ByName("Watch")), - connect.WithHandlerOptions(opts...), - ) clusterServiceSyncHandler := connect.NewServerStreamHandler( ClusterServiceSyncProcedure, svc.Sync, @@ -215,8 +193,6 @@ func NewClusterServiceHandler(svc ClusterServiceHandler, opts ...connect.Handler ) return "/ctrl.v1.ClusterService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { - case ClusterServiceWatchProcedure: - clusterServiceWatchHandler.ServeHTTP(w, r) case ClusterServiceSyncProcedure: clusterServiceSyncHandler.ServeHTTP(w, r) case ClusterServiceGetDesiredSentinelStateProcedure: @@ -236,10 +212,6 @@ func NewClusterServiceHandler(svc ClusterServiceHandler, opts ...connect.Handler // UnimplementedClusterServiceHandler returns CodeUnimplemented from all methods. type UnimplementedClusterServiceHandler struct{} -func (UnimplementedClusterServiceHandler) Watch(context.Context, *connect.Request[v1.WatchRequest], *connect.ServerStream[v1.State]) error { - return connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.Watch is not implemented")) -} - func (UnimplementedClusterServiceHandler) Sync(context.Context, *connect.Request[v1.SyncRequest], *connect.ServerStream[v1.State]) error { return connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.Sync is not implemented")) } diff --git a/internal/services/keys/service.go b/internal/services/keys/service.go index 8a48a9dbc1..ffed531e9f 100644 --- a/internal/services/keys/service.go +++ b/internal/services/keys/service.go @@ -1,11 +1,8 @@ package keys import ( - "fmt" - "github.com/unkeyed/unkey/internal/services/ratelimit" "github.com/unkeyed/unkey/internal/services/usagelimiter" - "github.com/unkeyed/unkey/pkg/assert" "github.com/unkeyed/unkey/pkg/cache" "github.com/unkeyed/unkey/pkg/clickhouse" "github.com/unkeyed/unkey/pkg/db" @@ -41,17 +38,6 @@ type service struct { // New creates a new keys service instance with the provided configuration. func New(config Config) (*service, error) { - if err := assert.All( - assert.NotNil(config.Logger, "logger is required"), - assert.NotNil(config.DB, "db is required"), - assert.NotNil(config.RateLimiter, "rate limiter is required"), - assert.NotNil(config.RBAC, "rbac is required"), - assert.NotNil(config.Clickhouse, "clickhouse is required"), - assert.NotNil(config.UsageLimiter, "usage limiter is required"), - assert.NotNil(config.KeyCache, "key cache is required"), - ); err != nil { - return nil, fmt.Errorf("invalid keys service config: %w", err) - } return &service{ logger: config.Logger, diff --git a/pkg/db/BUILD.bazel b/pkg/db/BUILD.bazel index 3df59b3197..597eb8c58a 100644 --- a/pkg/db/BUILD.bazel +++ b/pkg/db/BUILD.bazel @@ -82,6 +82,7 @@ go_library( "deployment_find_by_k8s_name.sql_generated.go", "deployment_insert.sql_generated.go", "deployment_topology_by_id_and_region.sql_generated.go", + "deployment_topology_find_regions.sql_generated.go", "deployment_topology_insert.sql_generated.go", "deployment_topology_list_desired.sql_generated.go", "deployment_update_build_id.sql_generated.go", @@ -235,7 +236,10 @@ go_library( "sentinel_insert.sql_generated.go", "sentinel_list_desired.sql_generated.go", "sentinel_update_available_replicas_and_health.sql_generated.go", - "state_change_find_by_cluster_after_sequence.sql_generated.go", + "state_change_delete_old.sql_generated.go", + "state_change_find_by_region_after_sequence.sql_generated.go", + "state_change_get_max_sequence.sql_generated.go", + "state_change_get_min_sequence.sql_generated.go", "state_change_insert.sql_generated.go", "traced_tx.go", "tx.go", diff --git a/pkg/db/bulk_deployment_topology_insert.sql_generated.go b/pkg/db/bulk_deployment_topology_insert.sql_generated.go index 14d182337e..f68a38dbb0 100644 --- a/pkg/db/bulk_deployment_topology_insert.sql_generated.go +++ b/pkg/db/bulk_deployment_topology_insert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) InsertDeploymentTopologies(ctx context.Context, db DBTX, a } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_environment_insert.sql_generated.go b/pkg/db/bulk_environment_insert.sql_generated.go index 5b2688f8da..309b3f09cc 100644 --- a/pkg/db/bulk_environment_insert.sql_generated.go +++ b/pkg/db/bulk_environment_insert.sql_generated.go @@ -40,6 +40,6 @@ func (q *BulkQueries) InsertEnvironments(ctx context.Context, db DBTX, args []In } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_environment_upsert.sql_generated.go b/pkg/db/bulk_environment_upsert.sql_generated.go index 3438fdc668..e21caeb704 100644 --- a/pkg/db/bulk_environment_upsert.sql_generated.go +++ b/pkg/db/bulk_environment_upsert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) UpsertEnvironment(ctx context.Context, db DBTX, args []Ups } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_identity_insert.sql_generated.go b/pkg/db/bulk_identity_insert.sql_generated.go index 80bd90e82f..c3b4583a8e 100644 --- a/pkg/db/bulk_identity_insert.sql_generated.go +++ b/pkg/db/bulk_identity_insert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) InsertIdentities(ctx context.Context, db DBTX, args []Inse } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_identity_insert_ratelimit.sql_generated.go b/pkg/db/bulk_identity_insert_ratelimit.sql_generated.go index 18d07f1836..d4ba807b99 100644 --- a/pkg/db/bulk_identity_insert_ratelimit.sql_generated.go +++ b/pkg/db/bulk_identity_insert_ratelimit.sql_generated.go @@ -45,6 +45,6 @@ func (q *BulkQueries) InsertIdentityRatelimits(ctx context.Context, db DBTX, arg } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_identity_upsert.sql_generated.go b/pkg/db/bulk_identity_upsert.sql_generated.go index 1624fdacf0..a7188608c2 100644 --- a/pkg/db/bulk_identity_upsert.sql_generated.go +++ b/pkg/db/bulk_identity_upsert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) UpsertIdentity(ctx context.Context, db DBTX, args []Upsert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_ingress_route_insert.sql_generated.go b/pkg/db/bulk_ingress_route_insert.sql_generated.go index 2dfd438ed5..aed50b3a4d 100644 --- a/pkg/db/bulk_ingress_route_insert.sql_generated.go +++ b/pkg/db/bulk_ingress_route_insert.sql_generated.go @@ -40,6 +40,6 @@ func (q *BulkQueries) InsertFrontlineRoutes(ctx context.Context, db DBTX, args [ } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_instance_upsert.sql_generated.go b/pkg/db/bulk_instance_upsert.sql_generated.go index 33b6c5e391..07b168f965 100644 --- a/pkg/db/bulk_instance_upsert.sql_generated.go +++ b/pkg/db/bulk_instance_upsert.sql_generated.go @@ -9,7 +9,7 @@ import ( ) // bulkUpsertInstance is the base query for bulk insert -const bulkUpsertInstance = `INSERT INTO instances ( id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status ) VALUES %s ON DUPLICATE KEY UPDATE +const bulkUpsertInstance = `INSERT INTO instances ( id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status ) VALUES %s ON DUPLICATE KEY UPDATE address = ?, cpu_millicores = ?, memory_mib = ?, @@ -25,7 +25,7 @@ func (q *BulkQueries) UpsertInstance(ctx context.Context, db DBTX, args []Upsert // Build the bulk insert query valueClauses := make([]string, len(args)) for i := range args { - valueClauses[i] = "( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" + valueClauses[i] = "( ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" } bulkQuery := fmt.Sprintf(bulkUpsertInstance, strings.Join(valueClauses, ", ")) @@ -38,7 +38,6 @@ func (q *BulkQueries) UpsertInstance(ctx context.Context, db DBTX, args []Upsert allArgs = append(allArgs, arg.WorkspaceID) allArgs = append(allArgs, arg.ProjectID) allArgs = append(allArgs, arg.Region) - allArgs = append(allArgs, arg.ClusterID) allArgs = append(allArgs, arg.K8sName) allArgs = append(allArgs, arg.Address) allArgs = append(allArgs, arg.CpuMillicores) @@ -55,6 +54,6 @@ func (q *BulkQueries) UpsertInstance(ctx context.Context, db DBTX, args []Upsert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_auth_insert.sql_generated.go b/pkg/db/bulk_key_auth_insert.sql_generated.go index fc4a861c17..575011ec59 100644 --- a/pkg/db/bulk_key_auth_insert.sql_generated.go +++ b/pkg/db/bulk_key_auth_insert.sql_generated.go @@ -37,6 +37,6 @@ func (q *BulkQueries) InsertKeyAuths(ctx context.Context, db DBTX, args []Insert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_encryption_insert.sql_generated.go b/pkg/db/bulk_key_encryption_insert.sql_generated.go index 9a3a16ad9a..372ecc9d4f 100644 --- a/pkg/db/bulk_key_encryption_insert.sql_generated.go +++ b/pkg/db/bulk_key_encryption_insert.sql_generated.go @@ -37,6 +37,6 @@ func (q *BulkQueries) InsertKeyEncryptions(ctx context.Context, db DBTX, args [] } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_insert.sql_generated.go b/pkg/db/bulk_key_insert.sql_generated.go index 20634b7368..da4afc05f5 100644 --- a/pkg/db/bulk_key_insert.sql_generated.go +++ b/pkg/db/bulk_key_insert.sql_generated.go @@ -48,6 +48,6 @@ func (q *BulkQueries) InsertKeys(ctx context.Context, db DBTX, args []InsertKeyP } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_insert_ratelimit.sql_generated.go b/pkg/db/bulk_key_insert_ratelimit.sql_generated.go index 2b1a2f00f7..e6b3fce603 100644 --- a/pkg/db/bulk_key_insert_ratelimit.sql_generated.go +++ b/pkg/db/bulk_key_insert_ratelimit.sql_generated.go @@ -49,6 +49,6 @@ func (q *BulkQueries) InsertKeyRatelimits(ctx context.Context, db DBTX, args []I } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_migration_insert.sql_generated.go b/pkg/db/bulk_key_migration_insert.sql_generated.go index 4784f699fa..3485d924d7 100644 --- a/pkg/db/bulk_key_migration_insert.sql_generated.go +++ b/pkg/db/bulk_key_migration_insert.sql_generated.go @@ -35,6 +35,6 @@ func (q *BulkQueries) InsertKeyMigrations(ctx context.Context, db DBTX, args []I } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_permission_insert.sql_generated.go b/pkg/db/bulk_key_permission_insert.sql_generated.go index f12060f1ea..a3a83d8298 100644 --- a/pkg/db/bulk_key_permission_insert.sql_generated.go +++ b/pkg/db/bulk_key_permission_insert.sql_generated.go @@ -41,6 +41,6 @@ func (q *BulkQueries) InsertKeyPermissions(ctx context.Context, db DBTX, args [] } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_role_insert.sql_generated.go b/pkg/db/bulk_key_role_insert.sql_generated.go index 75654742d0..81da1a0f12 100644 --- a/pkg/db/bulk_key_role_insert.sql_generated.go +++ b/pkg/db/bulk_key_role_insert.sql_generated.go @@ -36,6 +36,6 @@ func (q *BulkQueries) InsertKeyRoles(ctx context.Context, db DBTX, args []Insert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_space_insert.sql_generated.go b/pkg/db/bulk_key_space_insert.sql_generated.go index f19d1c8ee9..4753082f73 100644 --- a/pkg/db/bulk_key_space_insert.sql_generated.go +++ b/pkg/db/bulk_key_space_insert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) InsertKeySpaces(ctx context.Context, db DBTX, args []Inser } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_space_upsert.sql_generated.go b/pkg/db/bulk_key_space_upsert.sql_generated.go index 0b4ad1cfc2..f447dad56f 100644 --- a/pkg/db/bulk_key_space_upsert.sql_generated.go +++ b/pkg/db/bulk_key_space_upsert.sql_generated.go @@ -40,6 +40,6 @@ func (q *BulkQueries) UpsertKeySpace(ctx context.Context, db DBTX, args []Upsert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_permission_insert.sql_generated.go b/pkg/db/bulk_permission_insert.sql_generated.go index 05f6d971e7..a5ed0f59cd 100644 --- a/pkg/db/bulk_permission_insert.sql_generated.go +++ b/pkg/db/bulk_permission_insert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) InsertPermissions(ctx context.Context, db DBTX, args []Ins } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_project_insert.sql_generated.go b/pkg/db/bulk_project_insert.sql_generated.go index 01d8a3935c..60b5486231 100644 --- a/pkg/db/bulk_project_insert.sql_generated.go +++ b/pkg/db/bulk_project_insert.sql_generated.go @@ -41,6 +41,6 @@ func (q *BulkQueries) InsertProjects(ctx context.Context, db DBTX, args []Insert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_quota_upsert.sql_generated.go b/pkg/db/bulk_quota_upsert.sql_generated.go index 643cb56db0..553418fa6d 100644 --- a/pkg/db/bulk_quota_upsert.sql_generated.go +++ b/pkg/db/bulk_quota_upsert.sql_generated.go @@ -40,6 +40,6 @@ func (q *BulkQueries) UpsertQuota(ctx context.Context, db DBTX, args []UpsertQuo } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_ratelimit_namespace_insert.sql_generated.go b/pkg/db/bulk_ratelimit_namespace_insert.sql_generated.go index 932e54826c..c8be775169 100644 --- a/pkg/db/bulk_ratelimit_namespace_insert.sql_generated.go +++ b/pkg/db/bulk_ratelimit_namespace_insert.sql_generated.go @@ -36,6 +36,6 @@ func (q *BulkQueries) InsertRatelimitNamespaces(ctx context.Context, db DBTX, ar } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_ratelimit_override_insert.sql_generated.go b/pkg/db/bulk_ratelimit_override_insert.sql_generated.go index f6172160ca..9d114c4909 100644 --- a/pkg/db/bulk_ratelimit_override_insert.sql_generated.go +++ b/pkg/db/bulk_ratelimit_override_insert.sql_generated.go @@ -48,6 +48,6 @@ func (q *BulkQueries) InsertRatelimitOverrides(ctx context.Context, db DBTX, arg } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_role_insert.sql_generated.go b/pkg/db/bulk_role_insert.sql_generated.go index 5d0fd376fb..ebdb688647 100644 --- a/pkg/db/bulk_role_insert.sql_generated.go +++ b/pkg/db/bulk_role_insert.sql_generated.go @@ -37,6 +37,6 @@ func (q *BulkQueries) InsertRoles(ctx context.Context, db DBTX, args []InsertRol } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_role_permission_insert.sql_generated.go b/pkg/db/bulk_role_permission_insert.sql_generated.go index c00635ce68..fda1b0d623 100644 --- a/pkg/db/bulk_role_permission_insert.sql_generated.go +++ b/pkg/db/bulk_role_permission_insert.sql_generated.go @@ -36,6 +36,6 @@ func (q *BulkQueries) InsertRolePermissions(ctx context.Context, db DBTX, args [ } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_sentinel_insert.sql_generated.go b/pkg/db/bulk_sentinel_insert.sql_generated.go index 69f4741d23..55f5ca9298 100644 --- a/pkg/db/bulk_sentinel_insert.sql_generated.go +++ b/pkg/db/bulk_sentinel_insert.sql_generated.go @@ -46,6 +46,6 @@ func (q *BulkQueries) InsertSentinels(ctx context.Context, db DBTX, args []Inser } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_state_change_insert.sql_generated.go b/pkg/db/bulk_state_change_insert.sql_generated.go index 421e29a1f4..093d317533 100644 --- a/pkg/db/bulk_state_change_insert.sql_generated.go +++ b/pkg/db/bulk_state_change_insert.sql_generated.go @@ -9,7 +9,7 @@ import ( ) // bulkInsertStateChange is the base query for bulk insert -const bulkInsertStateChange = `INSERT INTO ` + "`" + `state_changes` + "`" + ` ( resource_type, state, cluster_id, created_at ) VALUES %s` +const bulkInsertStateChange = `INSERT INTO ` + "`" + `state_changes` + "`" + ` ( resource_type, resource_id, op, region, created_at ) VALUES %s` // InsertStateChanges performs bulk insert in a single query func (q *BulkQueries) InsertStateChanges(ctx context.Context, db DBTX, args []InsertStateChangeParams) error { @@ -21,7 +21,7 @@ func (q *BulkQueries) InsertStateChanges(ctx context.Context, db DBTX, args []In // Build the bulk insert query valueClauses := make([]string, len(args)) for i := range args { - valueClauses[i] = "( ?, ?, ?, ? )" + valueClauses[i] = "( ?, ?, ?, ?, ? )" } bulkQuery := fmt.Sprintf(bulkInsertStateChange, strings.Join(valueClauses, ", ")) @@ -30,12 +30,13 @@ func (q *BulkQueries) InsertStateChanges(ctx context.Context, db DBTX, args []In var allArgs []any for _, arg := range args { allArgs = append(allArgs, arg.ResourceType) - allArgs = append(allArgs, arg.State) - allArgs = append(allArgs, arg.ClusterID) + allArgs = append(allArgs, arg.ResourceID) + allArgs = append(allArgs, arg.Op) + allArgs = append(allArgs, arg.Region) allArgs = append(allArgs, arg.CreatedAt) } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_workspace_insert.sql_generated.go b/pkg/db/bulk_workspace_insert.sql_generated.go index 2aa27b0eaa..eb5de58d43 100644 --- a/pkg/db/bulk_workspace_insert.sql_generated.go +++ b/pkg/db/bulk_workspace_insert.sql_generated.go @@ -37,6 +37,6 @@ func (q *BulkQueries) InsertWorkspaces(ctx context.Context, db DBTX, args []Inse } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_workspace_upsert.sql_generated.go b/pkg/db/bulk_workspace_upsert.sql_generated.go index 3ca46422ae..bc0476b3e0 100644 --- a/pkg/db/bulk_workspace_upsert.sql_generated.go +++ b/pkg/db/bulk_workspace_upsert.sql_generated.go @@ -41,6 +41,6 @@ func (q *BulkQueries) UpsertWorkspace(ctx context.Context, db DBTX, args []Upser } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/deployment_delete_instances.sql_generated.go b/pkg/db/deployment_delete_instances.sql_generated.go index 7e15b5ae5e..c67f85dede 100644 --- a/pkg/db/deployment_delete_instances.sql_generated.go +++ b/pkg/db/deployment_delete_instances.sql_generated.go @@ -11,19 +11,19 @@ import ( const deleteDeploymentInstances = `-- name: DeleteDeploymentInstances :exec DELETE FROM instances -WHERE deployment_id = ? and cluster_id = ? +WHERE deployment_id = ? and region = ? ` type DeleteDeploymentInstancesParams struct { DeploymentID string `db:"deployment_id"` - ClusterID string `db:"cluster_id"` + Region string `db:"region"` } // DeleteDeploymentInstances // // DELETE FROM instances -// WHERE deployment_id = ? and cluster_id = ? +// WHERE deployment_id = ? and region = ? func (q *Queries) DeleteDeploymentInstances(ctx context.Context, db DBTX, arg DeleteDeploymentInstancesParams) error { - _, err := db.ExecContext(ctx, deleteDeploymentInstances, arg.DeploymentID, arg.ClusterID) + _, err := db.ExecContext(ctx, deleteDeploymentInstances, arg.DeploymentID, arg.Region) return err } diff --git a/pkg/db/deployment_topology_find_regions.sql_generated.go b/pkg/db/deployment_topology_find_regions.sql_generated.go new file mode 100644 index 0000000000..279832d684 --- /dev/null +++ b/pkg/db/deployment_topology_find_regions.sql_generated.go @@ -0,0 +1,45 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: deployment_topology_find_regions.sql + +package db + +import ( + "context" +) + +const findDeploymentRegions = `-- name: FindDeploymentRegions :many +SELECT region +FROM ` + "`" + `deployment_topology` + "`" + ` +WHERE deployment_id = ? +` + +// Returns all regions where a deployment is configured. +// Used for fan-out: when a deployment changes, emit state_change to each region. +// +// SELECT region +// FROM `deployment_topology` +// WHERE deployment_id = ? +func (q *Queries) FindDeploymentRegions(ctx context.Context, db DBTX, deploymentID string) ([]string, error) { + rows, err := db.QueryContext(ctx, findDeploymentRegions, deploymentID) + if err != nil { + return nil, err + } + defer rows.Close() + var items []string + for rows.Next() { + var region string + if err := rows.Scan(®ion); err != nil { + return nil, err + } + items = append(items, region) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} diff --git a/pkg/db/deployment_topology_list_desired.sql_generated.go b/pkg/db/deployment_topology_list_desired.sql_generated.go index 4aaefb4189..59c5862eec 100644 --- a/pkg/db/deployment_topology_list_desired.sql_generated.go +++ b/pkg/db/deployment_topology_list_desired.sql_generated.go @@ -12,19 +12,9 @@ import ( const listDesiredDeploymentTopology = `-- name: ListDesiredDeploymentTopology :many SELECT - d.id as deployment_id, - d.k8s_name as k8s_name, - d.workspace_id, - d.project_id, - d.environment_id, - d.image, - dt.region, - d.cpu_millicores, - d.memory_mib, - dt.desired_replicas, - w.k8s_namespace as k8s_namespace, - d.build_id, - d.encrypted_environment_variables + dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.desired_status, dt.created_at, dt.updated_at, + d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.status, d.created_at, d.updated_at, + w.k8s_namespace FROM ` + "`" + `deployment_topology` + "`" + ` dt INNER JOIN ` + "`" + `deployments` + "`" + ` d ON dt.deployment_id = d.id INNER JOIN ` + "`" + `workspaces` + "`" + ` w ON d.workspace_id = w.id @@ -43,37 +33,19 @@ type ListDesiredDeploymentTopologyParams struct { } type ListDesiredDeploymentTopologyRow struct { - DeploymentID string `db:"deployment_id"` - K8sName string `db:"k8s_name"` - WorkspaceID string `db:"workspace_id"` - ProjectID string `db:"project_id"` - EnvironmentID string `db:"environment_id"` - Image sql.NullString `db:"image"` - Region string `db:"region"` - CpuMillicores int32 `db:"cpu_millicores"` - MemoryMib int32 `db:"memory_mib"` - DesiredReplicas int32 `db:"desired_replicas"` - K8sNamespace sql.NullString `db:"k8s_namespace"` - BuildID sql.NullString `db:"build_id"` - EncryptedEnvironmentVariables []byte `db:"encrypted_environment_variables"` + DeploymentTopology DeploymentTopology `db:"deployment_topology"` + Deployment Deployment `db:"deployment"` + K8sNamespace sql.NullString `db:"k8s_namespace"` } -// ListDesiredDeploymentTopology +// ListDesiredDeploymentTopology returns all deployment topologies matching the desired state for a region. +// Used during bootstrap to stream all running deployments to krane. +// The version parameter is deprecated and ignored (kept for backwards compatibility). // // SELECT -// d.id as deployment_id, -// d.k8s_name as k8s_name, -// d.workspace_id, -// d.project_id, -// d.environment_id, -// d.image, -// dt.region, -// d.cpu_millicores, -// d.memory_mib, -// dt.desired_replicas, -// w.k8s_namespace as k8s_namespace, -// d.build_id, -// d.encrypted_environment_variables +// dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.desired_status, dt.created_at, dt.updated_at, +// d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.status, d.created_at, d.updated_at, +// w.k8s_namespace // FROM `deployment_topology` dt // INNER JOIN `deployments` d ON dt.deployment_id = d.id // INNER JOIN `workspaces` w ON d.workspace_id = w.id @@ -98,19 +70,38 @@ func (q *Queries) ListDesiredDeploymentTopology(ctx context.Context, db DBTX, ar for rows.Next() { var i ListDesiredDeploymentTopologyRow if err := rows.Scan( - &i.DeploymentID, - &i.K8sName, - &i.WorkspaceID, - &i.ProjectID, - &i.EnvironmentID, - &i.Image, - &i.Region, - &i.CpuMillicores, - &i.MemoryMib, - &i.DesiredReplicas, + &i.DeploymentTopology.Pk, + &i.DeploymentTopology.WorkspaceID, + &i.DeploymentTopology.DeploymentID, + &i.DeploymentTopology.Region, + &i.DeploymentTopology.DesiredReplicas, + &i.DeploymentTopology.DesiredStatus, + &i.DeploymentTopology.CreatedAt, + &i.DeploymentTopology.UpdatedAt, + &i.Deployment.Pk, + &i.Deployment.ID, + &i.Deployment.K8sName, + &i.Deployment.WorkspaceID, + &i.Deployment.ProjectID, + &i.Deployment.EnvironmentID, + &i.Deployment.Image, + &i.Deployment.BuildID, + &i.Deployment.GitCommitSha, + &i.Deployment.GitBranch, + &i.Deployment.GitCommitMessage, + &i.Deployment.GitCommitAuthorHandle, + &i.Deployment.GitCommitAuthorAvatarUrl, + &i.Deployment.GitCommitTimestamp, + &i.Deployment.SentinelConfig, + &i.Deployment.OpenapiSpec, + &i.Deployment.CpuMillicores, + &i.Deployment.MemoryMib, + &i.Deployment.DesiredState, + &i.Deployment.EncryptedEnvironmentVariables, + &i.Deployment.Status, + &i.Deployment.CreatedAt, + &i.Deployment.UpdatedAt, &i.K8sNamespace, - &i.BuildID, - &i.EncryptedEnvironmentVariables, ); err != nil { return nil, err } diff --git a/pkg/db/instance_delete.sql_generated.go b/pkg/db/instance_delete.sql_generated.go index 478d74f6c3..783a48f4c3 100644 --- a/pkg/db/instance_delete.sql_generated.go +++ b/pkg/db/instance_delete.sql_generated.go @@ -10,19 +10,18 @@ import ( ) const deleteInstance = `-- name: DeleteInstance :exec -DELETE FROM instances WHERE k8s_name = ? AND cluster_id = ? AND region = ? +DELETE FROM instances WHERE k8s_name = ? AND region = ? ` type DeleteInstanceParams struct { - K8sName string `db:"k8s_name"` - ClusterID string `db:"cluster_id"` - Region string `db:"region"` + K8sName string `db:"k8s_name"` + Region string `db:"region"` } // DeleteInstance // -// DELETE FROM instances WHERE k8s_name = ? AND cluster_id = ? AND region = ? +// DELETE FROM instances WHERE k8s_name = ? AND region = ? func (q *Queries) DeleteInstance(ctx context.Context, db DBTX, arg DeleteInstanceParams) error { - _, err := db.ExecContext(ctx, deleteInstance, arg.K8sName, arg.ClusterID, arg.Region) + _, err := db.ExecContext(ctx, deleteInstance, arg.K8sName, arg.Region) return err } diff --git a/pkg/db/instance_upsert.sql_generated.go b/pkg/db/instance_upsert.sql_generated.go index c173fbee24..83e64c5b97 100644 --- a/pkg/db/instance_upsert.sql_generated.go +++ b/pkg/db/instance_upsert.sql_generated.go @@ -16,7 +16,6 @@ INSERT INTO instances ( workspace_id, project_id, region, - cluster_id, k8s_name, address, cpu_millicores, @@ -33,7 +32,6 @@ VALUES ( ?, ?, ?, - ?, ? ) ON DUPLICATE KEY UPDATE @@ -49,7 +47,6 @@ type UpsertInstanceParams struct { WorkspaceID string `db:"workspace_id"` ProjectID string `db:"project_id"` Region string `db:"region"` - ClusterID string `db:"cluster_id"` K8sName string `db:"k8s_name"` Address string `db:"address"` CpuMillicores int32 `db:"cpu_millicores"` @@ -65,7 +62,6 @@ type UpsertInstanceParams struct { // workspace_id, // project_id, // region, -// cluster_id, // k8s_name, // address, // cpu_millicores, @@ -82,7 +78,6 @@ type UpsertInstanceParams struct { // ?, // ?, // ?, -// ?, // ? // ) // ON DUPLICATE KEY UPDATE @@ -97,7 +92,6 @@ func (q *Queries) UpsertInstance(ctx context.Context, db DBTX, arg UpsertInstanc arg.WorkspaceID, arg.ProjectID, arg.Region, - arg.ClusterID, arg.K8sName, arg.Address, arg.CpuMillicores, diff --git a/pkg/db/instances_find_by_deployment_id.sql_generated.go b/pkg/db/instances_find_by_deployment_id.sql_generated.go index 91e019a847..524732e702 100644 --- a/pkg/db/instances_find_by_deployment_id.sql_generated.go +++ b/pkg/db/instances_find_by_deployment_id.sql_generated.go @@ -11,7 +11,7 @@ import ( const findInstancesByDeploymentId = `-- name: FindInstancesByDeploymentId :many SELECT - pk, id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status + pk, id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status FROM instances WHERE deployment_id = ? ` @@ -19,7 +19,7 @@ WHERE deployment_id = ? // FindInstancesByDeploymentId // // SELECT -// pk, id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status +// pk, id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status // FROM instances // WHERE deployment_id = ? func (q *Queries) FindInstancesByDeploymentId(ctx context.Context, db DBTX, deploymentid string) ([]Instance, error) { @@ -38,7 +38,6 @@ func (q *Queries) FindInstancesByDeploymentId(ctx context.Context, db DBTX, depl &i.WorkspaceID, &i.ProjectID, &i.Region, - &i.ClusterID, &i.K8sName, &i.Address, &i.CpuMillicores, diff --git a/pkg/db/instances_find_by_deployment_id_and_region.sql_generated.go b/pkg/db/instances_find_by_deployment_id_and_region.sql_generated.go index e2fdd4af31..dee7878101 100644 --- a/pkg/db/instances_find_by_deployment_id_and_region.sql_generated.go +++ b/pkg/db/instances_find_by_deployment_id_and_region.sql_generated.go @@ -11,7 +11,7 @@ import ( const findInstancesByDeploymentIdAndRegion = `-- name: FindInstancesByDeploymentIdAndRegion :many SELECT - pk, id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status + pk, id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status FROM instances WHERE deployment_id = ? AND region = ? ` @@ -24,7 +24,7 @@ type FindInstancesByDeploymentIdAndRegionParams struct { // FindInstancesByDeploymentIdAndRegion // // SELECT -// pk, id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status +// pk, id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status // FROM instances // WHERE deployment_id = ? AND region = ? func (q *Queries) FindInstancesByDeploymentIdAndRegion(ctx context.Context, db DBTX, arg FindInstancesByDeploymentIdAndRegionParams) ([]Instance, error) { @@ -43,7 +43,6 @@ func (q *Queries) FindInstancesByDeploymentIdAndRegion(ctx context.Context, db D &i.WorkspaceID, &i.ProjectID, &i.Region, - &i.ClusterID, &i.K8sName, &i.Address, &i.CpuMillicores, diff --git a/pkg/db/instances_find_by_pod_name.sql_generated.go b/pkg/db/instances_find_by_pod_name.sql_generated.go index 440478f211..fd5405e797 100644 --- a/pkg/db/instances_find_by_pod_name.sql_generated.go +++ b/pkg/db/instances_find_by_pod_name.sql_generated.go @@ -11,25 +11,24 @@ import ( const findInstanceByPodName = `-- name: FindInstanceByPodName :one SELECT - pk, id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status + pk, id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status FROM instances - WHERE k8s_name = ? AND cluster_id = ? AND region = ? + WHERE k8s_name = ? AND region = ? ` type FindInstanceByPodNameParams struct { - K8sName string `db:"k8s_name"` - ClusterID string `db:"cluster_id"` - Region string `db:"region"` + K8sName string `db:"k8s_name"` + Region string `db:"region"` } // FindInstanceByPodName // // SELECT -// pk, id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status +// pk, id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status // FROM instances -// WHERE k8s_name = ? AND cluster_id = ? AND region = ? +// WHERE k8s_name = ? AND region = ? func (q *Queries) FindInstanceByPodName(ctx context.Context, db DBTX, arg FindInstanceByPodNameParams) (Instance, error) { - row := db.QueryRowContext(ctx, findInstanceByPodName, arg.K8sName, arg.ClusterID, arg.Region) + row := db.QueryRowContext(ctx, findInstanceByPodName, arg.K8sName, arg.Region) var i Instance err := row.Scan( &i.Pk, @@ -38,7 +37,6 @@ func (q *Queries) FindInstanceByPodName(ctx context.Context, db DBTX, arg FindIn &i.WorkspaceID, &i.ProjectID, &i.Region, - &i.ClusterID, &i.K8sName, &i.Address, &i.CpuMillicores, diff --git a/pkg/db/models_generated.go b/pkg/db/models_generated.go index b8030fe096..71f956f9a2 100644 --- a/pkg/db/models_generated.go +++ b/pkg/db/models_generated.go @@ -616,6 +616,48 @@ func (ns NullSentinelsHealth) Value() (driver.Value, error) { return string(ns.SentinelsHealth), nil } +type StateChangesOp string + +const ( + StateChangesOpUpsert StateChangesOp = "upsert" + StateChangesOpDelete StateChangesOp = "delete" +) + +func (e *StateChangesOp) Scan(src interface{}) error { + switch s := src.(type) { + case []byte: + *e = StateChangesOp(s) + case string: + *e = StateChangesOp(s) + default: + return fmt.Errorf("unsupported scan type for StateChangesOp: %T", src) + } + return nil +} + +type NullStateChangesOp struct { + StateChangesOp StateChangesOp + Valid bool // Valid is true if StateChangesOp is not NULL +} + +// Scan implements the Scanner interface. +func (ns *NullStateChangesOp) Scan(value interface{}) error { + if value == nil { + ns.StateChangesOp, ns.Valid = "", false + return nil + } + ns.Valid = true + return ns.StateChangesOp.Scan(value) +} + +// Value implements the driver Valuer interface. +func (ns NullStateChangesOp) Value() (driver.Value, error) { + if !ns.Valid { + return nil, nil + } + return string(ns.StateChangesOp), nil +} + type StateChangesResourceType string const ( @@ -1009,7 +1051,6 @@ type Instance struct { WorkspaceID string `db:"workspace_id"` ProjectID string `db:"project_id"` Region string `db:"region"` - ClusterID string `db:"cluster_id"` K8sName string `db:"k8s_name"` Address string `db:"address"` CpuMillicores int32 `db:"cpu_millicores"` @@ -1210,8 +1251,9 @@ type Sentinel struct { type StateChange struct { Sequence uint64 `db:"sequence"` ResourceType StateChangesResourceType `db:"resource_type"` - State []byte `db:"state"` - ClusterID string `db:"cluster_id"` + ResourceID string `db:"resource_id"` + Op StateChangesOp `db:"op"` + Region string `db:"region"` CreatedAt uint64 `db:"created_at"` } diff --git a/pkg/db/querier_generated.go b/pkg/db/querier_generated.go index 4fb17fb4c2..88fc778497 100644 --- a/pkg/db/querier_generated.go +++ b/pkg/db/querier_generated.go @@ -29,7 +29,7 @@ type Querier interface { //DeleteDeploymentInstances // // DELETE FROM instances - // WHERE deployment_id = ? and cluster_id = ? + // WHERE deployment_id = ? and region = ? DeleteDeploymentInstances(ctx context.Context, db DBTX, arg DeleteDeploymentInstancesParams) error //DeleteIdentity // @@ -39,7 +39,7 @@ type Querier interface { DeleteIdentity(ctx context.Context, db DBTX, arg DeleteIdentityParams) error //DeleteInstance // - // DELETE FROM instances WHERE k8s_name = ? AND cluster_id = ? AND region = ? + // DELETE FROM instances WHERE k8s_name = ? AND region = ? DeleteInstance(ctx context.Context, db DBTX, arg DeleteInstanceParams) error //DeleteKeyByID // @@ -118,6 +118,13 @@ type Querier interface { // AND (i.id = ? OR i.external_id = ?) // AND i.deleted = true DeleteOldIdentityWithRatelimits(ctx context.Context, db DBTX, arg DeleteOldIdentityWithRatelimitsParams) error + // Retention cleanup: deletes state changes older than the cutoff timestamp. + // Uses LIMIT to avoid long-running transactions; call repeatedly until 0 rows affected. + // + // DELETE FROM `state_changes` + // WHERE created_at < ? + // LIMIT 10000 + DeleteOldStateChanges(ctx context.Context, db DBTX, cutoffMs uint64) (int64, error) //DeletePermission // // DELETE FROM permissions @@ -202,6 +209,13 @@ type Querier interface { // // SELECT pk, id, k8s_name, workspace_id, project_id, environment_id, image, build_id, git_commit_sha, git_branch, git_commit_message, git_commit_author_handle, git_commit_author_avatar_url, git_commit_timestamp, sentinel_config, openapi_spec, cpu_millicores, memory_mib, desired_state, encrypted_environment_variables, status, created_at, updated_at FROM `deployments` WHERE k8s_name = ? FindDeploymentByK8sName(ctx context.Context, db DBTX, k8sName string) (Deployment, error) + // Returns all regions where a deployment is configured. + // Used for fan-out: when a deployment changes, emit state_change to each region. + // + // SELECT region + // FROM `deployment_topology` + // WHERE deployment_id = ? + FindDeploymentRegions(ctx context.Context, db DBTX, deploymentID string) ([]string, error) //FindDeploymentTopologyByIDAndRegion // // SELECT @@ -354,21 +368,21 @@ type Querier interface { //FindInstanceByPodName // // SELECT - // pk, id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status + // pk, id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status // FROM instances - // WHERE k8s_name = ? AND cluster_id = ? AND region = ? + // WHERE k8s_name = ? AND region = ? FindInstanceByPodName(ctx context.Context, db DBTX, arg FindInstanceByPodNameParams) (Instance, error) //FindInstancesByDeploymentId // // SELECT - // pk, id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status + // pk, id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status // FROM instances // WHERE deployment_id = ? FindInstancesByDeploymentId(ctx context.Context, db DBTX, deploymentid string) ([]Instance, error) //FindInstancesByDeploymentIdAndRegion // // SELECT - // pk, id, deployment_id, workspace_id, project_id, region, cluster_id, k8s_name, address, cpu_millicores, memory_mib, status + // pk, id, deployment_id, workspace_id, project_id, region, k8s_name, address, cpu_millicores, memory_mib, status // FROM instances // WHERE deployment_id = ? AND region = ? FindInstancesByDeploymentIdAndRegion(ctx context.Context, db DBTX, arg FindInstancesByDeploymentIdAndRegionParams) ([]Instance, error) @@ -944,14 +958,6 @@ type Querier interface { // // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at FROM sentinels WHERE environment_id = ? FindSentinelsByEnvironmentID(ctx context.Context, db DBTX, environmentID string) ([]Sentinel, error) - //FindStateChangesByClusterAfterSequence - // - // SELECT sequence, resource_type, state, cluster_id, created_at - // FROM `state_changes` - // WHERE cluster_id = ? - // AND sequence > ? - // ORDER BY sequence ASC - FindStateChangesByClusterAfterSequence(ctx context.Context, db DBTX, arg FindStateChangesByClusterAfterSequenceParams) ([]StateChange, error) //FindWorkspaceByID // // SELECT id, org_id, name, slug, k8s_namespace, partition_id, plan, tier, stripe_customer_id, stripe_subscription_id, beta_features, features, subscriptions, enabled, delete_protection, created_at_m, updated_at_m, deleted_at_m FROM `workspaces` @@ -970,6 +976,20 @@ type Querier interface { // WHERE id = ? // AND deleted_at_m IS NULL GetKeyAuthByID(ctx context.Context, db DBTX, id string) (GetKeyAuthByIDRow, error) + // Returns the highest sequence for a region. + // Used during bootstrap to get the watermark before streaming current state. + // + // SELECT CAST(COALESCE(MAX(sequence), 0) AS UNSIGNED) AS max_sequence + // FROM `state_changes` + // WHERE region = ? + GetMaxStateChangeSequence(ctx context.Context, db DBTX, region string) (int64, error) + // Returns the lowest retained sequence for a region. + // Used to detect if a client's watermark is too old (requires full resync). + // + // SELECT CAST(COALESCE(MIN(sequence), 0) AS UNSIGNED) AS min_sequence + // FROM `state_changes` + // WHERE region = ? + GetMinStateChangeSequence(ctx context.Context, db DBTX, region string) (int64, error) //HardDeleteWorkspace // // DELETE FROM `workspaces` @@ -1580,13 +1600,15 @@ type Querier interface { // // INSERT INTO `state_changes` ( // resource_type, - // state, - // cluster_id, + // resource_id, + // op, + // region, // created_at // ) VALUES ( // ?, // ?, // ?, + // ?, // ? // ) InsertStateChange(ctx context.Context, db DBTX, arg InsertStateChangeParams) (int64, error) @@ -1617,22 +1639,14 @@ type Querier interface { // true // ) InsertWorkspace(ctx context.Context, db DBTX, arg InsertWorkspaceParams) error - //ListDesiredDeploymentTopology + // ListDesiredDeploymentTopology returns all deployment topologies matching the desired state for a region. + // Used during bootstrap to stream all running deployments to krane. + // The version parameter is deprecated and ignored (kept for backwards compatibility). // // SELECT - // d.id as deployment_id, - // d.k8s_name as k8s_name, - // d.workspace_id, - // d.project_id, - // d.environment_id, - // d.image, - // dt.region, - // d.cpu_millicores, - // d.memory_mib, - // dt.desired_replicas, - // w.k8s_namespace as k8s_namespace, - // d.build_id, - // d.encrypted_environment_variables + // dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.desired_status, dt.created_at, dt.updated_at, + // d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.status, d.created_at, d.updated_at, + // w.k8s_namespace // FROM `deployment_topology` dt // INNER JOIN `deployments` d ON dt.deployment_id = d.id // INNER JOIN `workspaces` w ON d.workspace_id = w.id @@ -1642,7 +1656,9 @@ type Querier interface { // ORDER BY dt.deployment_id ASC // LIMIT ? ListDesiredDeploymentTopology(ctx context.Context, db DBTX, arg ListDesiredDeploymentTopologyParams) ([]ListDesiredDeploymentTopologyRow, error) - //ListDesiredSentinels + // ListDesiredSentinels returns all sentinels matching the desired state for a region. + // Used during bootstrap to stream all running sentinels to krane. + // The version parameter is deprecated and ignored (kept for backwards compatibility). // // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at // FROM `sentinels` @@ -1946,6 +1962,18 @@ type Querier interface { // WHERE kr.key_id = ? // ORDER BY r.name ListRolesByKeyID(ctx context.Context, db DBTX, keyID string) ([]ListRolesByKeyIDRow, error) + // Returns state changes for watch loop. Includes 1-second visibility delay + // to handle AUTO_INCREMENT gaps where sequence N+1 commits before N. + // Clients filter by their region when fetching the actual resource. + // + // SELECT sequence, resource_type, resource_id, op + // FROM `state_changes` + // WHERE region = ? + // AND sequence > ? + // AND created_at < (UNIX_TIMESTAMP() * 1000) - 1000 + // ORDER BY sequence ASC + // LIMIT ? + ListStateChanges(ctx context.Context, db DBTX, arg ListStateChangesParams) ([]ListStateChangesRow, error) //ListWorkspaces // // SELECT @@ -2301,7 +2329,6 @@ type Querier interface { // workspace_id, // project_id, // region, - // cluster_id, // k8s_name, // address, // cpu_millicores, @@ -2318,7 +2345,6 @@ type Querier interface { // ?, // ?, // ?, - // ?, // ? // ) // ON DUPLICATE KEY UPDATE diff --git a/pkg/db/queries/deployment_delete_instances.sql b/pkg/db/queries/deployment_delete_instances.sql index 260003e650..d38ca0f6ce 100644 --- a/pkg/db/queries/deployment_delete_instances.sql +++ b/pkg/db/queries/deployment_delete_instances.sql @@ -1,3 +1,3 @@ -- name: DeleteDeploymentInstances :exec DELETE FROM instances -WHERE deployment_id = ? and cluster_id = ?; +WHERE deployment_id = ? and region = ?; diff --git a/pkg/db/queries/deployment_topology_find_regions.sql b/pkg/db/queries/deployment_topology_find_regions.sql new file mode 100644 index 0000000000..fbe6eb82d3 --- /dev/null +++ b/pkg/db/queries/deployment_topology_find_regions.sql @@ -0,0 +1,6 @@ +-- name: FindDeploymentRegions :many +-- Returns all regions where a deployment is configured. +-- Used for fan-out: when a deployment changes, emit state_change to each region. +SELECT region +FROM `deployment_topology` +WHERE deployment_id = sqlc.arg(deployment_id); diff --git a/pkg/db/queries/deployment_topology_list_desired.sql b/pkg/db/queries/deployment_topology_list_desired.sql index 9d00fcbdfb..b8051e4628 100644 --- a/pkg/db/queries/deployment_topology_list_desired.sql +++ b/pkg/db/queries/deployment_topology_list_desired.sql @@ -1,18 +1,11 @@ -- name: ListDesiredDeploymentTopology :many +-- ListDesiredDeploymentTopology returns all deployment topologies matching the desired state for a region. +-- Used during bootstrap to stream all running deployments to krane. +-- The version parameter is deprecated and ignored (kept for backwards compatibility). SELECT - d.id as deployment_id, - d.k8s_name as k8s_name, - d.workspace_id, - d.project_id, - d.environment_id, - d.image, - dt.region, - d.cpu_millicores, - d.memory_mib, - dt.desired_replicas, - w.k8s_namespace as k8s_namespace, - d.build_id, - d.encrypted_environment_variables + sqlc.embed(dt), + sqlc.embed(d), + w.k8s_namespace FROM `deployment_topology` dt INNER JOIN `deployments` d ON dt.deployment_id = d.id INNER JOIN `workspaces` w ON d.workspace_id = w.id diff --git a/pkg/db/queries/instance_delete.sql b/pkg/db/queries/instance_delete.sql index 95410850ee..21c6b6d87d 100644 --- a/pkg/db/queries/instance_delete.sql +++ b/pkg/db/queries/instance_delete.sql @@ -1,4 +1,4 @@ -- name: DeleteInstance :exec -DELETE FROM instances WHERE k8s_name = sqlc.arg(k8s_name) AND cluster_id = sqlc.arg(cluster_id) AND region = sqlc.arg(region); +DELETE FROM instances WHERE k8s_name = sqlc.arg(k8s_name) AND region = sqlc.arg(region); diff --git a/pkg/db/queries/instance_upsert.sql b/pkg/db/queries/instance_upsert.sql index 902764f96e..156fffd105 100644 --- a/pkg/db/queries/instance_upsert.sql +++ b/pkg/db/queries/instance_upsert.sql @@ -6,7 +6,6 @@ INSERT INTO instances ( workspace_id, project_id, region, - cluster_id, k8s_name, address, cpu_millicores, @@ -19,7 +18,6 @@ VALUES ( sqlc.arg(workspace_id), sqlc.arg(project_id), sqlc.arg(region), - sqlc.arg(cluster_id), sqlc.arg(k8s_name), sqlc.arg(address), sqlc.arg(cpu_millicores), diff --git a/pkg/db/queries/instances_find_by_pod_name.sql b/pkg/db/queries/instances_find_by_pod_name.sql index b275001741..c28988819c 100644 --- a/pkg/db/queries/instances_find_by_pod_name.sql +++ b/pkg/db/queries/instances_find_by_pod_name.sql @@ -2,4 +2,4 @@ SELECT * FROM instances - WHERE k8s_name = sqlc.arg(k8s_name) AND cluster_id = sqlc.arg(cluster_id) AND region = sqlc.arg(region); + WHERE k8s_name = sqlc.arg(k8s_name) AND region = sqlc.arg(region); diff --git a/pkg/db/queries/sentinel_list_desired.sql b/pkg/db/queries/sentinel_list_desired.sql index a5c74da2b1..f31f90def8 100644 --- a/pkg/db/queries/sentinel_list_desired.sql +++ b/pkg/db/queries/sentinel_list_desired.sql @@ -1,4 +1,7 @@ -- name: ListDesiredSentinels :many +-- ListDesiredSentinels returns all sentinels matching the desired state for a region. +-- Used during bootstrap to stream all running sentinels to krane. +-- The version parameter is deprecated and ignored (kept for backwards compatibility). SELECT * FROM `sentinels` WHERE (sqlc.arg(region) = '' OR region = sqlc.arg(region)) diff --git a/pkg/db/queries/state_change_find_by_cluster_after_sequence.sql b/pkg/db/queries/state_change_find_by_cluster_after_sequence.sql deleted file mode 100644 index b45418e1d3..0000000000 --- a/pkg/db/queries/state_change_find_by_cluster_after_sequence.sql +++ /dev/null @@ -1,6 +0,0 @@ --- name: FindStateChangesByClusterAfterSequence :many -SELECT * -FROM `state_changes` -WHERE cluster_id = sqlc.arg(cluster_id) - AND sequence > sqlc.arg(after_sequence) -ORDER BY sequence ASC; diff --git a/pkg/db/queries/state_change_find_by_region_after_sequence.sql b/pkg/db/queries/state_change_find_by_region_after_sequence.sql new file mode 100644 index 0000000000..3051d63ee8 --- /dev/null +++ b/pkg/db/queries/state_change_find_by_region_after_sequence.sql @@ -0,0 +1,11 @@ +-- name: ListStateChanges :many +-- Returns state changes for watch loop. Includes 1-second visibility delay +-- to handle AUTO_INCREMENT gaps where sequence N+1 commits before N. +-- Clients filter by their region when fetching the actual resource. +SELECT sequence, resource_type, resource_id, op +FROM `state_changes` +WHERE region = sqlc.arg(region) + AND sequence > sqlc.arg(after_sequence) + AND created_at < (UNIX_TIMESTAMP() * 1000) - 1000 +ORDER BY sequence ASC +LIMIT ?; diff --git a/pkg/db/queries/state_change_get_max_sequence.sql b/pkg/db/queries/state_change_get_max_sequence.sql new file mode 100644 index 0000000000..430a7ee7ad --- /dev/null +++ b/pkg/db/queries/state_change_get_max_sequence.sql @@ -0,0 +1,6 @@ +-- name: GetMaxStateChangeSequence :one +-- Returns the highest sequence for a region. +-- Used during bootstrap to get the watermark before streaming current state. +SELECT CAST(COALESCE(MAX(sequence), 0) AS UNSIGNED) AS max_sequence +FROM `state_changes` +WHERE region = sqlc.arg(region); diff --git a/pkg/db/queries/state_change_get_min_sequence.sql b/pkg/db/queries/state_change_get_min_sequence.sql new file mode 100644 index 0000000000..ea66919cd3 --- /dev/null +++ b/pkg/db/queries/state_change_get_min_sequence.sql @@ -0,0 +1,6 @@ +-- name: GetMinStateChangeSequence :one +-- Returns the lowest retained sequence for a region. +-- Used to detect if a client's watermark is too old (requires full resync). +SELECT CAST(COALESCE(MIN(sequence), 0) AS UNSIGNED) AS min_sequence +FROM `state_changes` +WHERE region = sqlc.arg(region); diff --git a/pkg/db/queries/state_change_insert.sql b/pkg/db/queries/state_change_insert.sql index e91264b599..98e5c1ceb2 100644 --- a/pkg/db/queries/state_change_insert.sql +++ b/pkg/db/queries/state_change_insert.sql @@ -1,12 +1,14 @@ -- name: InsertStateChange :execlastid INSERT INTO `state_changes` ( resource_type, - state, - cluster_id, + resource_id, + op, + region, created_at ) VALUES ( sqlc.arg(resource_type), - sqlc.arg(state), - sqlc.arg(cluster_id), + sqlc.arg(resource_id), + sqlc.arg(op), + sqlc.arg(region), sqlc.arg(created_at) ); diff --git a/pkg/db/schema.sql b/pkg/db/schema.sql index a7466834ab..f1eaf3f6d5 100644 --- a/pkg/db/schema.sql +++ b/pkg/db/schema.sql @@ -530,7 +530,6 @@ CREATE TABLE `instances` ( `workspace_id` varchar(255) NOT NULL, `project_id` varchar(255) NOT NULL, `region` varchar(64) NOT NULL, - `cluster_id` varchar(64) NOT NULL, `k8s_name` varchar(255) NOT NULL, `address` varchar(255) NOT NULL, `cpu_millicores` int NOT NULL, @@ -538,8 +537,8 @@ CREATE TABLE `instances` ( `status` enum('inactive','pending','running','failed') NOT NULL, CONSTRAINT `instances_pk` PRIMARY KEY(`pk`), CONSTRAINT `instances_id_unique` UNIQUE(`id`), - CONSTRAINT `unique_address_per_cluster` UNIQUE(`address`,`cluster_id`), - CONSTRAINT `unique_k8s_name_per_cluster` UNIQUE(`k8s_name`,`cluster_id`) + CONSTRAINT `unique_address_per_region` UNIQUE(`address`,`region`), + CONSTRAINT `unique_k8s_name_per_region` UNIQUE(`k8s_name`,`region`) ); CREATE TABLE `certificates` ( @@ -574,8 +573,9 @@ CREATE TABLE `frontline_routes` ( CREATE TABLE `state_changes` ( `sequence` bigint unsigned AUTO_INCREMENT NOT NULL, `resource_type` enum('sentinel','deployment') NOT NULL, - `state` longblob NOT NULL, - `cluster_id` varchar(256) NOT NULL, + `resource_id` varchar(256) NOT NULL, + `op` enum('upsert','delete') NOT NULL, + `region` varchar(64) NOT NULL, `created_at` bigint unsigned NOT NULL, CONSTRAINT `state_changes_sequence` PRIMARY KEY(`sequence`) ); @@ -615,5 +615,6 @@ CREATE INDEX `idx_deployment_id` ON `instances` (`deployment_id`); CREATE INDEX `idx_region` ON `instances` (`region`); CREATE INDEX `environment_id_idx` ON `frontline_routes` (`environment_id`); CREATE INDEX `deployment_id_idx` ON `frontline_routes` (`deployment_id`); -CREATE INDEX `cluster_id_sequence` ON `state_changes` (`cluster_id`,`sequence`); +CREATE INDEX `region_sequence` ON `state_changes` (`region`,`sequence`); +CREATE INDEX `created_at` ON `state_changes` (`created_at`); diff --git a/pkg/db/sentinel_list_desired.sql_generated.go b/pkg/db/sentinel_list_desired.sql_generated.go index d4f8563fb1..70c07d6cf1 100644 --- a/pkg/db/sentinel_list_desired.sql_generated.go +++ b/pkg/db/sentinel_list_desired.sql_generated.go @@ -26,7 +26,9 @@ type ListDesiredSentinelsParams struct { Limit int32 `db:"limit"` } -// ListDesiredSentinels +// ListDesiredSentinels returns all sentinels matching the desired state for a region. +// Used during bootstrap to stream all running sentinels to krane. +// The version parameter is deprecated and ignored (kept for backwards compatibility). // // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at // FROM `sentinels` diff --git a/pkg/db/state_change_delete_old.sql_generated.go b/pkg/db/state_change_delete_old.sql_generated.go new file mode 100644 index 0000000000..87fc4201c9 --- /dev/null +++ b/pkg/db/state_change_delete_old.sql_generated.go @@ -0,0 +1,30 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: state_change_delete_old.sql + +package db + +import ( + "context" +) + +const deleteOldStateChanges = `-- name: DeleteOldStateChanges :execrows +DELETE FROM ` + "`" + `state_changes` + "`" + ` +WHERE created_at < ? +LIMIT 10000 +` + +// Retention cleanup: deletes state changes older than the cutoff timestamp. +// Uses LIMIT to avoid long-running transactions; call repeatedly until 0 rows affected. +// +// DELETE FROM `state_changes` +// WHERE created_at < ? +// LIMIT 10000 +func (q *Queries) DeleteOldStateChanges(ctx context.Context, db DBTX, cutoffMs uint64) (int64, error) { + result, err := db.ExecContext(ctx, deleteOldStateChanges, cutoffMs) + if err != nil { + return 0, err + } + return result.RowsAffected() +} diff --git a/pkg/db/state_change_find_by_cluster_after_sequence.sql_generated.go b/pkg/db/state_change_find_by_cluster_after_sequence.sql_generated.go deleted file mode 100644 index e0ab270c84..0000000000 --- a/pkg/db/state_change_find_by_cluster_after_sequence.sql_generated.go +++ /dev/null @@ -1,59 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.30.0 -// source: state_change_find_by_cluster_after_sequence.sql - -package db - -import ( - "context" -) - -const findStateChangesByClusterAfterSequence = `-- name: FindStateChangesByClusterAfterSequence :many -SELECT sequence, resource_type, state, cluster_id, created_at -FROM ` + "`" + `state_changes` + "`" + ` -WHERE cluster_id = ? - AND sequence > ? -ORDER BY sequence ASC -` - -type FindStateChangesByClusterAfterSequenceParams struct { - ClusterID string `db:"cluster_id"` - AfterSequence uint64 `db:"after_sequence"` -} - -// FindStateChangesByClusterAfterSequence -// -// SELECT sequence, resource_type, state, cluster_id, created_at -// FROM `state_changes` -// WHERE cluster_id = ? -// AND sequence > ? -// ORDER BY sequence ASC -func (q *Queries) FindStateChangesByClusterAfterSequence(ctx context.Context, db DBTX, arg FindStateChangesByClusterAfterSequenceParams) ([]StateChange, error) { - rows, err := db.QueryContext(ctx, findStateChangesByClusterAfterSequence, arg.ClusterID, arg.AfterSequence) - if err != nil { - return nil, err - } - defer rows.Close() - var items []StateChange - for rows.Next() { - var i StateChange - if err := rows.Scan( - &i.Sequence, - &i.ResourceType, - &i.State, - &i.ClusterID, - &i.CreatedAt, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} diff --git a/pkg/db/state_change_find_by_region_after_sequence.sql_generated.go b/pkg/db/state_change_find_by_region_after_sequence.sql_generated.go new file mode 100644 index 0000000000..9a98bb4d56 --- /dev/null +++ b/pkg/db/state_change_find_by_region_after_sequence.sql_generated.go @@ -0,0 +1,72 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: state_change_find_by_region_after_sequence.sql + +package db + +import ( + "context" +) + +const listStateChanges = `-- name: ListStateChanges :many +SELECT sequence, resource_type, resource_id, op +FROM ` + "`" + `state_changes` + "`" + ` +WHERE region = ? + AND sequence > ? + AND created_at < (UNIX_TIMESTAMP() * 1000) - 1000 +ORDER BY sequence ASC +LIMIT ? +` + +type ListStateChangesParams struct { + Region string `db:"region"` + AfterSequence uint64 `db:"after_sequence"` + Limit int32 `db:"limit"` +} + +type ListStateChangesRow struct { + Sequence uint64 `db:"sequence"` + ResourceType StateChangesResourceType `db:"resource_type"` + ResourceID string `db:"resource_id"` + Op StateChangesOp `db:"op"` +} + +// Returns state changes for watch loop. Includes 1-second visibility delay +// to handle AUTO_INCREMENT gaps where sequence N+1 commits before N. +// Clients filter by their region when fetching the actual resource. +// +// SELECT sequence, resource_type, resource_id, op +// FROM `state_changes` +// WHERE region = ? +// AND sequence > ? +// AND created_at < (UNIX_TIMESTAMP() * 1000) - 1000 +// ORDER BY sequence ASC +// LIMIT ? +func (q *Queries) ListStateChanges(ctx context.Context, db DBTX, arg ListStateChangesParams) ([]ListStateChangesRow, error) { + rows, err := db.QueryContext(ctx, listStateChanges, arg.Region, arg.AfterSequence, arg.Limit) + if err != nil { + return nil, err + } + defer rows.Close() + var items []ListStateChangesRow + for rows.Next() { + var i ListStateChangesRow + if err := rows.Scan( + &i.Sequence, + &i.ResourceType, + &i.ResourceID, + &i.Op, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} diff --git a/pkg/db/state_change_get_max_sequence.sql_generated.go b/pkg/db/state_change_get_max_sequence.sql_generated.go new file mode 100644 index 0000000000..e3032e9f5d --- /dev/null +++ b/pkg/db/state_change_get_max_sequence.sql_generated.go @@ -0,0 +1,29 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: state_change_get_max_sequence.sql + +package db + +import ( + "context" +) + +const getMaxStateChangeSequence = `-- name: GetMaxStateChangeSequence :one +SELECT CAST(COALESCE(MAX(sequence), 0) AS UNSIGNED) AS max_sequence +FROM ` + "`" + `state_changes` + "`" + ` +WHERE region = ? +` + +// Returns the highest sequence for a region. +// Used during bootstrap to get the watermark before streaming current state. +// +// SELECT CAST(COALESCE(MAX(sequence), 0) AS UNSIGNED) AS max_sequence +// FROM `state_changes` +// WHERE region = ? +func (q *Queries) GetMaxStateChangeSequence(ctx context.Context, db DBTX, region string) (int64, error) { + row := db.QueryRowContext(ctx, getMaxStateChangeSequence, region) + var max_sequence int64 + err := row.Scan(&max_sequence) + return max_sequence, err +} diff --git a/pkg/db/state_change_get_min_sequence.sql_generated.go b/pkg/db/state_change_get_min_sequence.sql_generated.go new file mode 100644 index 0000000000..1ef5dfdbec --- /dev/null +++ b/pkg/db/state_change_get_min_sequence.sql_generated.go @@ -0,0 +1,29 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: state_change_get_min_sequence.sql + +package db + +import ( + "context" +) + +const getMinStateChangeSequence = `-- name: GetMinStateChangeSequence :one +SELECT CAST(COALESCE(MIN(sequence), 0) AS UNSIGNED) AS min_sequence +FROM ` + "`" + `state_changes` + "`" + ` +WHERE region = ? +` + +// Returns the lowest retained sequence for a region. +// Used to detect if a client's watermark is too old (requires full resync). +// +// SELECT CAST(COALESCE(MIN(sequence), 0) AS UNSIGNED) AS min_sequence +// FROM `state_changes` +// WHERE region = ? +func (q *Queries) GetMinStateChangeSequence(ctx context.Context, db DBTX, region string) (int64, error) { + row := db.QueryRowContext(ctx, getMinStateChangeSequence, region) + var min_sequence int64 + err := row.Scan(&min_sequence) + return min_sequence, err +} diff --git a/pkg/db/state_change_insert.sql_generated.go b/pkg/db/state_change_insert.sql_generated.go index fb07ce0e2c..32e03756e3 100644 --- a/pkg/db/state_change_insert.sql_generated.go +++ b/pkg/db/state_change_insert.sql_generated.go @@ -12,21 +12,24 @@ import ( const insertStateChange = `-- name: InsertStateChange :execlastid INSERT INTO ` + "`" + `state_changes` + "`" + ` ( resource_type, - state, - cluster_id, + resource_id, + op, + region, created_at ) VALUES ( ?, ?, ?, + ?, ? ) ` type InsertStateChangeParams struct { ResourceType StateChangesResourceType `db:"resource_type"` - State []byte `db:"state"` - ClusterID string `db:"cluster_id"` + ResourceID string `db:"resource_id"` + Op StateChangesOp `db:"op"` + Region string `db:"region"` CreatedAt uint64 `db:"created_at"` } @@ -34,20 +37,23 @@ type InsertStateChangeParams struct { // // INSERT INTO `state_changes` ( // resource_type, -// state, -// cluster_id, +// resource_id, +// op, +// region, // created_at // ) VALUES ( // ?, // ?, // ?, +// ?, // ? // ) func (q *Queries) InsertStateChange(ctx context.Context, db DBTX, arg InsertStateChangeParams) (int64, error) { result, err := db.ExecContext(ctx, insertStateChange, arg.ResourceType, - arg.State, - arg.ClusterID, + arg.ResourceID, + arg.Op, + arg.Region, arg.CreatedAt, ) if err != nil { diff --git a/svc/ctrl/integration/BUILD.bazel b/svc/ctrl/integration/BUILD.bazel new file mode 100644 index 0000000000..1998b8b8a5 --- /dev/null +++ b/svc/ctrl/integration/BUILD.bazel @@ -0,0 +1,16 @@ +load("@rules_go//go:def.bzl", "go_library") + +go_library( + name = "integration", + srcs = ["harness.go"], + importpath = "github.com/unkeyed/unkey/svc/ctrl/integration", + visibility = ["//visibility:public"], + deps = [ + "//pkg/db", + "//pkg/otel/logging", + "//pkg/testutil/containers", + "//pkg/testutil/seed", + "//pkg/uid", + "@com_github_stretchr_testify//require", + ], +) diff --git a/svc/ctrl/integration/harness.go b/svc/ctrl/integration/harness.go new file mode 100644 index 0000000000..b6ec250c9f --- /dev/null +++ b/svc/ctrl/integration/harness.go @@ -0,0 +1,251 @@ +package integration + +import ( + "context" + "database/sql" + "testing" + "time" + + "github.com/stretchr/testify/require" + "github.com/unkeyed/unkey/pkg/db" + "github.com/unkeyed/unkey/pkg/otel/logging" + "github.com/unkeyed/unkey/pkg/testutil/containers" + "github.com/unkeyed/unkey/pkg/testutil/seed" + "github.com/unkeyed/unkey/pkg/uid" +) + +// Harness provides a test environment for ctrl service integration tests. +// It sets up MySQL connection and seeded data for testing the sync functionality. +type Harness struct { + t *testing.T + ctx context.Context + cancel context.CancelFunc + Seed *seed.Seeder + DB db.Database +} + +// New creates a new integration test harness. +func New(t *testing.T) *Harness { + t.Helper() + + ctx, cancel := context.WithCancel(context.Background()) + + mysqlHostCfg := containers.MySQL(t) + mysqlHostCfg.DBName = "unkey" + mysqlHostDSN := mysqlHostCfg.FormatDSN() + + database, err := db.New(db.Config{ + Logger: logging.NewNoop(), + PrimaryDSN: mysqlHostDSN, + ReadOnlyDSN: "", + }) + require.NoError(t, err) + + h := &Harness{ + t: t, + ctx: ctx, + cancel: cancel, + Seed: seed.New(t, database, nil), + DB: database, + } + + h.Seed.Seed(ctx) + + t.Cleanup(func() { + cancel() + database.Close() + }) + + return h +} + +// Context returns the test context. +func (h *Harness) Context() context.Context { + return h.ctx +} + +// Resources returns the seeded resources. +func (h *Harness) Resources() seed.Resources { + return h.Seed.Resources +} + +// InsertStateChange inserts a state change record for testing. +// Returns the auto-generated sequence number. +func (h *Harness) InsertStateChange(ctx context.Context, params db.InsertStateChangeParams) int64 { + seq, err := db.Query.InsertStateChange(ctx, h.DB.RW(), params) + require.NoError(h.t, err) + return seq +} + +// Now returns current time in milliseconds. +func (h *Harness) Now() int64 { + return time.Now().UnixMilli() +} + +// CreateDeploymentRequest contains parameters for creating a test deployment. +type CreateDeploymentRequest struct { + Region string + DesiredState db.DeploymentsDesiredState +} + +// CreateDeploymentResult contains the created deployment and topology. +type CreateDeploymentResult struct { + Deployment db.Deployment + Topology db.DeploymentTopology +} + +// CreateDeployment creates a deployment with topology for testing. +func (h *Harness) CreateDeployment(ctx context.Context, req CreateDeploymentRequest) CreateDeploymentResult { + workspaceID := h.Seed.Resources.UserWorkspace.ID + + project := h.Seed.CreateProject(ctx, seed.CreateProjectRequest{ + ID: uid.New("prj"), + WorkspaceID: workspaceID, + Name: "test-project", + Slug: uid.New("slug"), + GitRepositoryURL: "", + DefaultBranch: "", + DeleteProtection: false, + }) + + env := h.Seed.CreateEnvironment(ctx, seed.CreateEnvironmentRequest{ + ID: uid.New("env"), + WorkspaceID: workspaceID, + ProjectID: project.ID, + Slug: "production", + Description: "", + SentinelConfig: []byte("{}"), + DeleteProtection: false, + }) + + deploymentID := uid.New("dep") + k8sName := uid.New("k8s") + + err := db.Query.InsertDeployment(ctx, h.DB.RW(), db.InsertDeploymentParams{ + ID: deploymentID, + K8sName: k8sName, + WorkspaceID: workspaceID, + ProjectID: project.ID, + EnvironmentID: env.ID, + GitCommitSha: sql.NullString{Valid: false}, + GitBranch: sql.NullString{Valid: false}, + SentinelConfig: []byte("{}"), + GitCommitMessage: sql.NullString{Valid: false}, + GitCommitAuthorHandle: sql.NullString{Valid: false}, + GitCommitAuthorAvatarUrl: sql.NullString{Valid: false}, + GitCommitTimestamp: sql.NullInt64{Valid: false}, + OpenapiSpec: sql.NullString{Valid: false}, + EncryptedEnvironmentVariables: []byte(""), + Status: db.DeploymentsStatusReady, + CpuMillicores: 100, + MemoryMib: 128, + CreatedAt: h.Now(), + UpdatedAt: sql.NullInt64{Valid: false}, + }) + require.NoError(h.t, err) + + // Update desired_state (insert doesn't set it, but it defaults to running) + if req.DesiredState != "" && req.DesiredState != db.DeploymentsDesiredStateRunning { + _, err = h.DB.RW().ExecContext(ctx, "UPDATE deployments SET desired_state = ? WHERE id = ?", req.DesiredState, deploymentID) + require.NoError(h.t, err) + } + + // Set image (required for streaming) + _, err = h.DB.RW().ExecContext(ctx, "UPDATE deployments SET image = ? WHERE id = ?", "nginx:1.19", deploymentID) + require.NoError(h.t, err) + + err = db.Query.InsertDeploymentTopology(ctx, h.DB.RW(), db.InsertDeploymentTopologyParams{ + WorkspaceID: workspaceID, + DeploymentID: deploymentID, + Region: req.Region, + DesiredReplicas: 1, + DesiredStatus: db.DeploymentTopologyDesiredStatusStarted, + CreatedAt: h.Now(), + }) + require.NoError(h.t, err) + + deployment, err := db.Query.FindDeploymentById(ctx, h.DB.RO(), deploymentID) + require.NoError(h.t, err) + + return CreateDeploymentResult{ + Deployment: deployment, + Topology: db.DeploymentTopology{ + Pk: 0, + WorkspaceID: workspaceID, + DeploymentID: deploymentID, + Region: req.Region, + DesiredReplicas: 1, + DesiredStatus: db.DeploymentTopologyDesiredStatusStarted, + CreatedAt: h.Now(), + UpdatedAt: sql.NullInt64{Valid: false}, + }, + } +} + +// CreateSentinelRequest contains parameters for creating a test sentinel. +type CreateSentinelRequest struct { + Region string + DesiredState db.SentinelsDesiredState +} + +// CreateSentinel creates a sentinel for testing. +func (h *Harness) CreateSentinel(ctx context.Context, req CreateSentinelRequest) db.Sentinel { + workspaceID := h.Seed.Resources.UserWorkspace.ID + + project := h.Seed.CreateProject(ctx, seed.CreateProjectRequest{ + ID: uid.New("prj"), + WorkspaceID: workspaceID, + Name: "test-project-sentinel", + Slug: uid.New("slug"), + GitRepositoryURL: "", + DefaultBranch: "", + DeleteProtection: false, + }) + + env := h.Seed.CreateEnvironment(ctx, seed.CreateEnvironmentRequest{ + ID: uid.New("env"), + WorkspaceID: workspaceID, + ProjectID: project.ID, + Slug: "production", + Description: "", + SentinelConfig: []byte("{}"), + DeleteProtection: false, + }) + + sentinelID := uid.New("sen") + k8sName := uid.New("k8s") + + desiredState := req.DesiredState + if desiredState == "" { + desiredState = db.SentinelsDesiredStateRunning + } + + err := db.Query.InsertSentinel(ctx, h.DB.RW(), db.InsertSentinelParams{ + ID: sentinelID, + WorkspaceID: workspaceID, + EnvironmentID: env.ID, + ProjectID: project.ID, + K8sAddress: "http://localhost:8080", + K8sName: k8sName, + Region: req.Region, + Image: "sentinel:1.0", + Health: db.SentinelsHealthHealthy, + DesiredReplicas: 1, + AvailableReplicas: 1, + CpuMillicores: 100, + MemoryMib: 128, + CreatedAt: h.Now(), + }) + require.NoError(h.t, err) + + // Update desired_state if needed + if desiredState != db.SentinelsDesiredStateRunning { + _, err = h.DB.RW().ExecContext(ctx, "UPDATE sentinels SET desired_state = ? WHERE id = ?", desiredState, sentinelID) + require.NoError(h.t, err) + } + + sentinel, err := db.Query.FindSentinelByID(ctx, h.DB.RO(), sentinelID) + require.NoError(h.t, err) + + return sentinel +} diff --git a/svc/ctrl/integration/sync_test.go b/svc/ctrl/integration/sync_test.go new file mode 100644 index 0000000000..1134df135e --- /dev/null +++ b/svc/ctrl/integration/sync_test.go @@ -0,0 +1,1199 @@ +//go:build integration + +// Package integration provides integration tests for the ctrl service's Sync RPC. +// +// These tests validate the Kubernetes-style List+Watch sync pattern that synchronizes +// deployment and sentinel state from the control plane (ctrl) to Kubernetes agents (krane). +// +// # Architecture Overview +// +// The sync protocol follows a two-phase approach: +// 1. Bootstrap: When a client connects with sequence=0, the server streams all current +// running deployments and sentinels for the requested region, then sends a Bookmark +// message containing the current max sequence number. +// 2. Watch: After bootstrap (or when reconnecting with sequence>0), the server polls +// the state_changes table and streams incremental updates to the client. +// +// # Test Categories +// +// - Bootstrap Tests: Verify initial full state sync behavior +// - FailedPrecondition Tests: Verify sequence validation and resync triggers +// - Delete Scenario Tests: Verify correct delete message generation +// - State Change Query Tests: Verify underlying database queries +// - Reconnect Tests: Verify incremental sync on reconnection +// - Sequence Tests: Verify sequence numbers in streamed messages +// +// # Test Isolation +// +// Each test uses a unique region name to ensure test isolation. The state_changes +// table uses an auto-incrementing sequence that is global, but queries filter by +// region, so tests don't interfere with each other. +package integration + +import ( + "context" + "net/http" + "sync" + "testing" + "time" + + "connectrpc.com/connect" + "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/db" + "github.com/unkeyed/unkey/pkg/otel/logging" + "github.com/unkeyed/unkey/svc/ctrl/services/cluster" +) + +// mockStream implements connect.ServerStream for testing and captures sent messages. +type mockStream struct { + mu sync.Mutex + messages []*ctrlv1.State +} + +func newMockStream() *mockStream { + return &mockStream{ + messages: make([]*ctrlv1.State, 0), + } +} + +func (m *mockStream) Send(msg *ctrlv1.State) error { + m.mu.Lock() + defer m.mu.Unlock() + m.messages = append(m.messages, msg) + return nil +} + +func (m *mockStream) Messages() []*ctrlv1.State { + m.mu.Lock() + defer m.mu.Unlock() + result := make([]*ctrlv1.State, len(m.messages)) + copy(result, m.messages) + return result +} + +func (m *mockStream) ResponseHeader() http.Header { + return make(http.Header) +} + +func (m *mockStream) ResponseTrailer() http.Header { + return make(http.Header) +} + +// newService creates a cluster service for testing. +func newService(t *testing.T, database db.Database) *cluster.Service { + return cluster.New(cluster.Config{ + Database: database, + Logger: logging.NewNoop(), + Bearer: "test-bearer", + }) +} + +// findDeploymentApply finds the first deployment apply message in the stream. +func findDeploymentApply(messages []*ctrlv1.State, deploymentID string) *ctrlv1.ApplyDeployment { + for _, msg := range messages { + if dep := msg.GetDeployment(); dep != nil { + if apply := dep.GetApply(); apply != nil && apply.GetDeploymentId() == deploymentID { + return apply + } + } + } + return nil +} + +// findDeploymentDelete finds the first deployment delete message in the stream. +func findDeploymentDelete(messages []*ctrlv1.State, k8sName string) *ctrlv1.DeleteDeployment { + for _, msg := range messages { + if dep := msg.GetDeployment(); dep != nil { + if del := dep.GetDelete(); del != nil && del.GetK8SName() == k8sName { + return del + } + } + } + return nil +} + +// findSentinelApply finds the first sentinel apply message in the stream. +func findSentinelApply(messages []*ctrlv1.State, sentinelID string) *ctrlv1.ApplySentinel { + for _, msg := range messages { + if sen := msg.GetSentinel(); sen != nil { + if apply := sen.GetApply(); apply != nil && apply.GetSentinelId() == sentinelID { + return apply + } + } + } + return nil +} + +// findSentinelDelete finds the first sentinel delete message in the stream. +func findSentinelDelete(messages []*ctrlv1.State, k8sName string) *ctrlv1.DeleteSentinel { + for _, msg := range messages { + if sen := msg.GetSentinel(); sen != nil { + if del := sen.GetDelete(); del != nil && del.GetK8SName() == k8sName { + return del + } + } + } + return nil +} + +// findBookmark finds the bookmark message in the stream. +func findBookmark(messages []*ctrlv1.State) *ctrlv1.Bookmark { + for _, msg := range messages { + if bookmark := msg.GetBookmark(); bookmark != nil { + return bookmark + } + } + return nil +} + +// countDeploymentApplies counts deployment apply messages. +func countDeploymentApplies(messages []*ctrlv1.State) int { + count := 0 + for _, msg := range messages { + if dep := msg.GetDeployment(); dep != nil { + if dep.GetApply() != nil { + count++ + } + } + } + return count +} + +// countSentinelApplies counts sentinel apply messages. +func countSentinelApplies(messages []*ctrlv1.State) int { + count := 0 + for _, msg := range messages { + if sen := msg.GetSentinel(); sen != nil { + if sen.GetApply() != nil { + count++ + } + } + } + return count +} + +// ============================================================================= +// Bootstrap Tests +// ============================================================================= +// +// Bootstrap tests verify the initial full state synchronization that occurs when +// a krane agent first connects (with sequence=0). During bootstrap, the server +// must stream ALL currently running resources for the requested region, then +// send a Bookmark message with the current max sequence number. +// +// Guarantees tested: +// - All running deployments in the region are streamed +// - All running sentinels in the region are streamed +// - Archived/stopped resources are NOT streamed +// - A Bookmark is always sent after streaming all resources +// - Empty regions receive only a Bookmark (no resources) +// ============================================================================= + +// TestSync_BootstrapStreamsDeploymentsAndVerifiesContent verifies that bootstrap +// correctly streams deployment resources with all required fields populated. +// +// Scenario: A new krane agent connects to sync a region containing one deployment. +// +// Guarantees: +// - The deployment is included in the bootstrap stream +// - The K8sName and Image fields are correctly populated +// - A Bookmark with a non-zero sequence is sent after the deployment +// +// This test validates the core bootstrap contract: all running resources must be +// streamed to new clients so they can reconcile their local state. +func TestSync_BootstrapStreamsDeploymentsAndVerifiesContent(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "us-west-2-bootstrap" + + dep := h.CreateDeployment(ctx, CreateDeploymentRequest{ + Region: region, + DesiredState: db.DeploymentsDesiredStateRunning, + }) + + // Insert a state change so bootstrap has a watermark + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: dep.Deployment.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + // Run bootstrap (Sync with sequence=0 triggers bootstrap then watch) + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: 0, + }) + + // The sync will timeout in watch loop, but bootstrap should complete + _ = svc.Sync(ctx, req, stream) + + // Verify the deployment was streamed + messages := stream.Messages() + apply := findDeploymentApply(messages, dep.Deployment.ID) + require.NotNil(t, apply, "bootstrap should stream deployment apply") + require.Equal(t, dep.Deployment.K8sName, apply.GetK8SName()) + require.Equal(t, "nginx:1.19", apply.GetImage()) + + // Verify bookmark was sent + bookmark := findBookmark(messages) + require.NotNil(t, bookmark, "bootstrap should send bookmark") + require.Greater(t, bookmark.GetSequence(), uint64(0), "bookmark should have non-zero sequence") +} + +// TestSync_BootstrapStreamsSentinelsAndVerifiesContent verifies that bootstrap +// correctly streams sentinel resources with all required fields populated. +// +// Scenario: A new krane agent connects to sync a region containing one sentinel. +// +// Guarantees: +// - The sentinel is included in the bootstrap stream +// - The K8sName and Image fields are correctly populated +// - A Bookmark is sent after the sentinel +// +// This test mirrors the deployment test but for sentinels, ensuring both resource +// types are handled correctly during bootstrap. +func TestSync_BootstrapStreamsSentinelsAndVerifiesContent(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "eu-central-1-bootstrap" + + sentinel := h.CreateSentinel(ctx, CreateSentinelRequest{ + Region: region, + DesiredState: db.SentinelsDesiredStateRunning, + }) + + // Insert a state change + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + ResourceID: sentinel.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: 0, + }) + + _ = svc.Sync(ctx, req, stream) + + // Verify the sentinel was streamed + messages := stream.Messages() + apply := findSentinelApply(messages, sentinel.ID) + require.NotNil(t, apply, "bootstrap should stream sentinel apply") + require.Equal(t, sentinel.K8sName, apply.GetK8SName()) + require.Equal(t, "sentinel:1.0", apply.GetImage()) + + // Verify bookmark was sent + bookmark := findBookmark(messages) + require.NotNil(t, bookmark, "bootstrap should send bookmark") +} + +// TestSync_BootstrapWithEmptyRegionSendsOnlyBookmark verifies that bootstrap +// handles empty regions gracefully by sending only a Bookmark. +// +// Scenario: A krane agent connects to sync a region with no deployments or sentinels. +// +// Guarantees: +// - Exactly one message is sent (the Bookmark) +// - The Bookmark sequence is 0 (no state changes exist for this region) +// - No deployment or sentinel apply messages are sent +// +// This edge case is critical for new regions or regions where all resources have +// been deleted. The client must still receive a Bookmark to know bootstrap completed. +func TestSync_BootstrapWithEmptyRegionSendsOnlyBookmark(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "empty-region-bootstrap" + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: 0, + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + + // Empty region should have exactly one message: the bookmark + require.Len(t, messages, 1, "empty region bootstrap should send exactly one message (bookmark)") + + // The single message should be a bookmark + bookmark := findBookmark(messages) + require.NotNil(t, bookmark, "the only message should be a bookmark") + + // Sequence is 0 since no state changes exist for this region + require.Equal(t, uint64(0), bookmark.GetSequence(), "empty region bookmark should have sequence 0") +} + +// TestSync_BootstrapOnlyStreamsRunningResources verifies that bootstrap filters +// out non-running resources (archived, stopped, etc.). +// +// Scenario: A region contains both a running and an archived deployment. +// +// Guarantees: +// - Running deployments ARE included in bootstrap +// - Archived deployments are NOT included in bootstrap +// +// This test ensures the bootstrap phase only syncs resources that should actually +// exist in Kubernetes. Archived resources should not be created in the cluster. +func TestSync_BootstrapOnlyStreamsRunningResources(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "running-only-region" + + runningDep := h.CreateDeployment(ctx, CreateDeploymentRequest{ + Region: region, + DesiredState: db.DeploymentsDesiredStateRunning, + }) + + archivedDep := h.CreateDeployment(ctx, CreateDeploymentRequest{ + Region: region, + DesiredState: db.DeploymentsDesiredStateArchived, + }) + + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: runningDep.Deployment.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: 0, + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + + // Running deployment should be streamed + runningApply := findDeploymentApply(messages, runningDep.Deployment.ID) + require.NotNil(t, runningApply, "running deployment should be streamed") + + // Archived deployment should NOT be streamed + archivedApply := findDeploymentApply(messages, archivedDep.Deployment.ID) + require.Nil(t, archivedApply, "archived deployment should not be streamed during bootstrap") +} + +// ============================================================================= +// FailedPrecondition Tests +// ============================================================================= +// +// These tests verify the sequence validation that prevents clients from resuming +// from a stale position. When state_changes rows are pruned (retention policy), +// clients with old sequence numbers must perform a full resync. +// +// Guarantees tested: +// - Clients with sequence behind min retained sequence get FailedPrecondition error +// - Clients with sequence=0 trigger bootstrap (never get FailedPrecondition) +// - Clients with valid sequence resume normally +// ============================================================================= + +// TestSync_FailedPreconditionWhenSequenceBehindMin verifies that clients attempting +// to resume from a pruned sequence position receive a FailedPrecondition error. +// +// Scenario: State changes have been pruned and the client's last-seen sequence +// is now behind the minimum retained sequence for the region. +// +// Guarantees: +// - Server returns connect.CodeFailedPrecondition error +// - Client knows it must perform a full resync (sequence=0) +// +// This prevents clients from missing events that were pruned from state_changes. +// The client-side behavior should be: discard local sequence, reconnect with 0. +func TestSync_FailedPreconditionWhenSequenceBehindMin(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "failedprecondition-region" + dummyRegion := "dummy-region-for-sequence-bump" + + // Insert a dummy state change in a different region to bump the auto-increment. + // This ensures the next insert will have sequence > 1, making our test meaningful. + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: "dep_dummy", + Op: db.StateChangesOpUpsert, + Region: dummyRegion, + CreatedAt: uint64(h.Now() - 3000), + }) + + // Insert the actual state change for our test region + minSeq := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: "dep_test", + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + // Verify minSeq > 1 so our test is meaningful + require.Greater(t, minSeq, int64(1), "minSeq should be > 1 after dummy insert") + + svc := newService(t, h.DB) + stream := newMockStream() + + // Request with sequence 1 which is behind minSeq should return FailedPrecondition + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: 1, + }) + + err := svc.Sync(ctx, req, stream) + + require.Error(t, err) + connectErr, ok := err.(*connect.Error) + require.True(t, ok, "error should be a connect error") + require.Equal(t, connect.CodeFailedPrecondition, connectErr.Code()) +} + +// TestSync_NoErrorWhenSequenceIsZero verifies that sequence=0 always triggers +// bootstrap instead of sequence validation. +// +// Scenario: A client connects with sequence=0 to a region that has state changes. +// +// Guarantees: +// - No FailedPrecondition error is returned +// - Bootstrap runs normally (timeout occurs in watch loop, not sync) +// +// sequence=0 is the "fresh start" signal. It means "I have no state, give me +// everything." This must always work regardless of what sequences exist. +func TestSync_NoErrorWhenSequenceIsZero(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "zero-seq-region" + + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: "dep_test", + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + // Sequence 0 should trigger bootstrap, not FailedPrecondition + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: 0, + }) + + err := svc.Sync(ctx, req, stream) + + // Should timeout in watch loop, not return FailedPrecondition + require.ErrorIs(t, err, context.DeadlineExceeded) +} + +// ============================================================================= +// Delete Scenario Tests +// ============================================================================= +// +// These tests verify that the watch phase correctly generates Delete messages +// in various scenarios. Delete messages tell krane to remove resources from +// Kubernetes. +// +// Delete messages are sent when: +// - The state_changes op is explicitly "delete" +// - The deployment topology doesn't exist for the sync region +// - The resource's desired_state is not "running" +// - The sentinel's region doesn't match the sync region +// +// Guarantees tested: +// - All delete scenarios produce correct DeleteDeployment/DeleteSentinel messages +// - The K8sName in delete messages matches the resource to be deleted +// ============================================================================= + +// TestSync_DeploymentDeleteWhenTopologyNotFound verifies that a Delete message +// is sent when a deployment exists but has no topology in the requesting region. +// +// Scenario: Deployment has topology in region A, but krane is syncing region B. +// A state change references the deployment in region B. +// +// Guarantees: +// - A DeleteDeployment message is sent for the deployment +// - This handles the case where a deployment was removed from a region +// +// This ensures krane removes deployments that no longer belong in its region, +// even if the deployment still exists elsewhere. +func TestSync_DeploymentDeleteWhenTopologyNotFound(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "topology-not-found-region" + otherRegion := "other-region" + + dep := h.CreateDeployment(ctx, CreateDeploymentRequest{ + Region: otherRegion, + DesiredState: db.DeploymentsDesiredStateRunning, + }) + + // Insert state change for our sync region (but topology doesn't exist there) + seq := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: dep.Deployment.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + // Start watch from before the state change + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: uint64(seq - 1), + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + + del := findDeploymentDelete(messages, dep.Deployment.K8sName) + require.NotNil(t, del, "should send delete when topology not found for region") +} + +// TestSync_DeploymentDeleteWhenDesiredStateNotRunning verifies that a Delete +// message is sent when a deployment's desired_state is not "running". +// +// Scenario: A deployment exists with desired_state="archived". A state change +// is emitted for this deployment. +// +// Guarantees: +// - A DeleteDeployment message is sent +// - Archived/stopped deployments are removed from Kubernetes +// +// This handles graceful shutdown: when a user archives a deployment, krane +// must remove it from the cluster. +func TestSync_DeploymentDeleteWhenDesiredStateNotRunning(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "not-running-region" + + dep := h.CreateDeployment(ctx, CreateDeploymentRequest{ + Region: region, + DesiredState: db.DeploymentsDesiredStateArchived, + }) + + // Insert state change + seq := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: dep.Deployment.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: uint64(seq - 1), + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + + del := findDeploymentDelete(messages, dep.Deployment.K8sName) + require.NotNil(t, del, "should send delete when desired_state is not running") +} + +// TestSync_DeploymentDeleteOnExplicitDeleteOp verifies that an explicit "delete" +// operation in state_changes produces a Delete message. +// +// Scenario: A running deployment has a state change with op="delete". +// +// Guarantees: +// - A DeleteDeployment message is sent immediately +// - This is the primary delete path for permanent resource removal +// +// Explicit delete operations are emitted when a deployment is permanently deleted +// (not just archived). The deployment row may still exist (soft delete) but +// krane must remove the Kubernetes resources. +func TestSync_DeploymentDeleteOnExplicitDeleteOp(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "explicit-delete-region" + + dep := h.CreateDeployment(ctx, CreateDeploymentRequest{ + Region: region, + DesiredState: db.DeploymentsDesiredStateRunning, + }) + + // Insert delete state change + seq := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: dep.Deployment.ID, + Op: db.StateChangesOpDelete, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: uint64(seq - 1), + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + + del := findDeploymentDelete(messages, dep.Deployment.K8sName) + require.NotNil(t, del, "should send delete on explicit delete operation") +} + +// TestSync_SentinelDeleteWhenRegionMismatch verifies that a Delete message is +// sent when a sentinel exists in a different region than the sync request. +// +// Scenario: A sentinel is created in region A. A state change references it +// in region B (e.g., due to migration or misconfiguration). +// +// Guarantees: +// - A DeleteSentinel message is sent +// - Sentinels are region-bound; they must not exist in wrong regions +// +// Unlike deployments (which have separate topology per region), sentinels have +// a single region field. This test ensures region filtering works for sentinels. +func TestSync_SentinelDeleteWhenRegionMismatch(t *testing.T) { + h := New(t) + ctx := h.Context() + + sentinelRegion := "sentinel-actual-region" + syncRegion := "sync-region" + + sentinel := h.CreateSentinel(ctx, CreateSentinelRequest{ + Region: sentinelRegion, + DesiredState: db.SentinelsDesiredStateRunning, + }) + + // Insert state change for sync region + seq := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + ResourceID: sentinel.ID, + Op: db.StateChangesOpUpsert, + Region: syncRegion, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: syncRegion, + SequenceLastSeen: uint64(seq - 1), + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + + del := findSentinelDelete(messages, sentinel.K8sName) + require.NotNil(t, del, "should send delete when sentinel region doesn't match") +} + +// TestSync_SentinelDeleteWhenDesiredStateNotRunning verifies that a Delete +// message is sent when a sentinel's desired_state is not "running". +// +// Scenario: A sentinel has desired_state="archived". +// +// Guarantees: +// - A DeleteSentinel message is sent +// - Archived sentinels are removed from Kubernetes +// +// This mirrors the deployment test for sentinels. +func TestSync_SentinelDeleteWhenDesiredStateNotRunning(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "sentinel-archived-region" + + sentinel := h.CreateSentinel(ctx, CreateSentinelRequest{ + Region: region, + DesiredState: db.SentinelsDesiredStateArchived, + }) + + // Insert state change + seq := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + ResourceID: sentinel.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: uint64(seq - 1), + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + + del := findSentinelDelete(messages, sentinel.K8sName) + require.NotNil(t, del, "should send delete when sentinel desired_state is not running") +} + +// TestSync_SentinelDeleteOnExplicitDeleteOp verifies that an explicit "delete" +// operation produces a Delete message for sentinels. +// +// Scenario: A running sentinel has a state change with op="delete". +// +// Guarantees: +// - A DeleteSentinel message is sent +// - This is the primary delete path for permanent sentinel removal +func TestSync_SentinelDeleteOnExplicitDeleteOp(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "sentinel-explicit-delete-region" + + sentinel := h.CreateSentinel(ctx, CreateSentinelRequest{ + Region: region, + DesiredState: db.SentinelsDesiredStateRunning, + }) + + // Insert delete state change + seq := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + ResourceID: sentinel.ID, + Op: db.StateChangesOpDelete, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: uint64(seq - 1), + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + + del := findSentinelDelete(messages, sentinel.K8sName) + require.NotNil(t, del, "should send delete on explicit delete operation") +} + +// ============================================================================= +// State Change Query Tests +// ============================================================================= +// +// These tests verify the underlying database queries that power the sync +// mechanism. They test the SQLC-generated query functions directly. +// +// Guarantees tested: +// - ListStateChanges returns changes after a given sequence +// - GetMaxStateChangeSequence returns the highest sequence for a region +// - GetMinStateChangeSequence returns the lowest sequence for a region +// - State changes are correctly filtered by region +// ============================================================================= + +// TestSync_StateChangeQueries verifies that ListStateChanges correctly returns +// state changes after a given sequence number. +// +// Scenario: Three state changes are inserted with increasing sequences. +// We query for changes after the first sequence. +// +// Guarantees: +// - Only changes AFTER the specified sequence are returned +// - Changes are returned in sequence order +// - The correct number of changes is returned +func TestSync_StateChangeQueries(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "query-test-region" + + seq1 := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: "dep_1", + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 5000), + }) + + seq2 := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + ResourceID: "sen_1", + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 4000), + }) + + seq3 := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: "dep_1", + Op: db.StateChangesOpDelete, + Region: region, + CreatedAt: uint64(h.Now() - 3000), + }) + + require.Less(t, seq1, seq2) + require.Less(t, seq2, seq3) + + // Query state changes after seq1 + changes, err := db.Query.ListStateChanges(ctx, h.DB.RO(), db.ListStateChangesParams{ + Region: region, + AfterSequence: uint64(seq1), + Limit: 100, + }) + require.NoError(t, err) + require.Len(t, changes, 2) + require.Equal(t, uint64(seq2), changes[0].Sequence) + require.Equal(t, uint64(seq3), changes[1].Sequence) +} + +// TestSync_MaxSequenceQuery verifies that GetMaxStateChangeSequence returns +// the highest sequence number for a region. +// +// Scenario: Two state changes are inserted. We query for the max sequence. +// +// Guarantees: +// - Returns the sequence of the most recent state change +// - Used during bootstrap to set the Bookmark sequence +func TestSync_MaxSequenceQuery(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "max-seq-query-region" + + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: "dep_1", + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 3000), + }) + + seq2 := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + ResourceID: "sen_1", + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + maxSeq, err := db.Query.GetMaxStateChangeSequence(ctx, h.DB.RO(), region) + require.NoError(t, err) + require.Equal(t, seq2, maxSeq) +} + +// TestSync_MinSequenceQuery verifies that GetMinStateChangeSequence returns +// the lowest sequence number for a region. +// +// Scenario: Two state changes are inserted. We query for the min sequence. +// +// Guarantees: +// - Returns the sequence of the oldest state change +// - Used during sequence validation to detect stale clients +func TestSync_MinSequenceQuery(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "min-seq-query-region" + + seq1 := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: "dep_1", + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 3000), + }) + + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + ResourceID: "sen_1", + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + minSeq, err := db.Query.GetMinStateChangeSequence(ctx, h.DB.RO(), region) + require.NoError(t, err) + require.Equal(t, seq1, minSeq) +} + +// TestSync_StateChangeRegionFiltering verifies that ListStateChanges only +// returns state changes for the specified region. +// +// Scenario: State changes are inserted in two different regions. +// We query for changes in region1 only. +// +// Guarantees: +// - Only changes for the requested region are returned +// - Changes in other regions are not included +// +// This is essential for multi-region deployments where each krane instance +// only cares about its own region's state changes. +func TestSync_StateChangeRegionFiltering(t *testing.T) { + h := New(t) + ctx := h.Context() + + region1 := "filter-region-1" + region2 := "filter-region-2" + + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: "dep_region1", + Op: db.StateChangesOpUpsert, + Region: region1, + CreatedAt: uint64(h.Now() - 3000), + }) + + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: "dep_region2", + Op: db.StateChangesOpUpsert, + Region: region2, + CreatedAt: uint64(h.Now() - 2000), + }) + + changes, err := db.Query.ListStateChanges(ctx, h.DB.RO(), db.ListStateChangesParams{ + Region: region1, + AfterSequence: 0, + Limit: 100, + }) + require.NoError(t, err) + require.Len(t, changes, 1) + require.Equal(t, "dep_region1", changes[0].ResourceID) +} + +// ============================================================================= +// Reconnect Tests +// ============================================================================= +// +// These tests verify the incremental sync behavior when a krane agent +// reconnects with a non-zero sequence number. +// +// When sequence > 0: +// - Bootstrap is skipped (no full state dump) +// - Watch begins immediately from the given sequence +// - No Bookmark is sent (client already has one) +// +// Guarantees tested: +// - Reconnecting clients receive only new state changes +// - Clients don't receive duplicate events they already processed +// ============================================================================= + +// TestSync_ReconnectResumesFromSequence verifies that a reconnecting client +// skips bootstrap and receives only new state changes. +// +// Scenario: Two state changes exist. Client reconnects with the first sequence. +// +// Guarantees: +// - No Bookmark is sent (bootstrap was skipped) +// - Only state changes AFTER the provided sequence are streamed +// - The deployment update from the second state change is received +// +// This is the core reconnection contract: clients resume where they left off. +func TestSync_ReconnectResumesFromSequence(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "reconnect-region" + + dep := h.CreateDeployment(ctx, CreateDeploymentRequest{ + Region: region, + DesiredState: db.DeploymentsDesiredStateRunning, + }) + + // First state change (simulates previous session) + seq1 := h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: dep.Deployment.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 5000), + }) + + // Second state change (new since last session) + _ = h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: dep.Deployment.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + // Reconnect with sequence from first state change (should NOT trigger full bootstrap) + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: uint64(seq1), + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + + // Should NOT have a bookmark (bootstrap sends bookmark, watch doesn't) + bookmark := findBookmark(messages) + require.Nil(t, bookmark, "reconnect should skip bootstrap and not send bookmark") + + apply := findDeploymentApply(messages, dep.Deployment.ID) + require.NotNil(t, apply, "should receive deployment update from watch") +} + +// ============================================================================= +// Sequence in Messages Tests +// ============================================================================= +// +// These tests verify that all streamed messages contain valid sequence numbers +// that clients can use for resumption. +// +// Guarantees tested: +// - All messages have a sequence field > 0 +// - Clients can use any message's sequence for reconnection +// ============================================================================= + +// TestSync_AllMessagesContainSequence verifies that every message streamed +// during bootstrap contains a valid sequence number. +// +// Scenario: Bootstrap streams multiple resources. +// +// Guarantees: +// - All apply messages have sequence > 0 +// - During bootstrap, all messages have the same sequence (the max at bootstrap time) +// +// The sequence in messages allows clients to track their position even if they +// disconnect mid-stream. +func TestSync_AllMessagesContainSequence(t *testing.T) { + h := New(t) + ctx := h.Context() + + region := "sequence-in-messages-region" + + dep := h.CreateDeployment(ctx, CreateDeploymentRequest{ + Region: region, + DesiredState: db.DeploymentsDesiredStateRunning, + }) + + sentinel := h.CreateSentinel(ctx, CreateSentinelRequest{ + Region: region, + DesiredState: db.SentinelsDesiredStateRunning, + }) + + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: dep.Deployment.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + h.InsertStateChange(ctx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + ResourceID: sentinel.ID, + Op: db.StateChangesOpUpsert, + Region: region, + CreatedAt: uint64(h.Now() - 2000), + }) + + svc := newService(t, h.DB) + stream := newMockStream() + + ctx, cancel := context.WithTimeout(ctx, 500*time.Millisecond) + defer cancel() + + req := connect.NewRequest(&ctrlv1.SyncRequest{ + Region: region, + SequenceLastSeen: 0, + }) + + _ = svc.Sync(ctx, req, stream) + + messages := stream.Messages() + require.NotEmpty(t, messages) + + // All messages during bootstrap should have the same sequence (the max sequence at bootstrap time) + var bootstrapSequence uint64 + for _, msg := range messages { + seq := msg.GetSequence() + if bootstrapSequence == 0 { + bootstrapSequence = seq + } + // Bootstrap messages all have the same sequence + if msg.GetBookmark() == nil { + // Non-bookmark messages should have a valid sequence + require.Greater(t, seq, uint64(0), "all messages should have sequence > 0") + } + } +} diff --git a/svc/ctrl/proto/ctrl/v1/cluster.proto b/svc/ctrl/proto/ctrl/v1/cluster.proto index 2a0825fc6f..395e6f300f 100644 --- a/svc/ctrl/proto/ctrl/v1/cluster.proto +++ b/svc/ctrl/proto/ctrl/v1/cluster.proto @@ -22,7 +22,6 @@ option go_package = "github.com/unkeyed/unkey/gen/proto/ctrl/v1;ctrlv1"; // The watch connection is designed to be long-lived with automatic reconnection on failure. // When an agent reconnects, it should initiate reconciliation to ensure consistency. service ClusterService { - rpc Watch(WatchRequest) returns (stream State); rpc Sync(SyncRequest) returns (stream State); rpc GetDesiredSentinelState(GetDesiredSentinelStateRequest) returns (SentinelState); rpc UpdateSentinelState(UpdateSentinelStateRequest) returns (UpdateSentinelStateResponse); @@ -80,29 +79,26 @@ message UpdateSentinelStateRequest { message UpdateSentinelStateResponse {} message SyncRequest { - string cluster_id = 1; - string region = 2; + string region = 1; + uint64 sequence_last_seen = 2; } -// WatchRequest identifies the cluster requesting a watch stream. -message WatchRequest { - // cluster_id uniquely identifies the client requesting the watch stream. - string cluster_id = 1; - - string region = 2; - - // sequence_last_seen indicates the last sequence number the client has processed. - // This allows the server to send only new events since that sequence number, - // enabling efficient reconnection and resumption of the watch stream. - uint64 sequence_last_seen = 3; +// Bookmark is sent after bootstrap completes to signal the client is caught up. +// The client should persist this sequence to resume watch on reconnect. +message Bookmark { + uint64 sequence = 1; } message State { + // sequence is the state_changes sequence number for this event. + // Clients should persist this after successfully processing each event + // to resume from the correct position on reconnect. uint64 sequence = 1; oneof kind { DeploymentState deployment = 2; SentinelState sentinel = 3; + Bookmark bookmark = 4; } } @@ -225,7 +221,6 @@ message ApplyDeployment { optional string build_id = 11; bytes encrypted_environment_variables = 12; - optional string readiness_id = 13; } // DeleteDeployment identifies a deployment to remove from the cluster. diff --git a/svc/ctrl/services/cluster/BUILD.bazel b/svc/ctrl/services/cluster/BUILD.bazel index ee2fafe840..486e9a5117 100644 --- a/svc/ctrl/services/cluster/BUILD.bazel +++ b/svc/ctrl/services/cluster/BUILD.bazel @@ -4,13 +4,11 @@ go_library( name = "cluster", srcs = [ "auth.go", - "emit.go", "rpc_get_desired_deployment_state.go", "rpc_get_desired_sentinel_state.go", "rpc_sync.go", "rpc_update_deployment_state.go", "rpc_update_sentinel_state.go", - "rpc_watch.go", "service.go", ], importpath = "github.com/unkeyed/unkey/svc/ctrl/services/cluster", @@ -21,7 +19,6 @@ go_library( "//pkg/assert", "//pkg/db", "//pkg/otel/logging", - "//pkg/proto", "//pkg/uid", "@com_connectrpc_connect//:connect", ], diff --git a/svc/ctrl/services/cluster/emit.go b/svc/ctrl/services/cluster/emit.go deleted file mode 100644 index c3a26ad2d4..0000000000 --- a/svc/ctrl/services/cluster/emit.go +++ /dev/null @@ -1,25 +0,0 @@ -package cluster - -import ( - "context" - "fmt" - - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" -) - -func (s *Service) EmitState(ctx context.Context, region string, event *ctrlv1.State) error { - - s.clientsMu.RLock() - defer s.clientsMu.RUnlock() - - s.logger.Info("clients", "count", len(s.clients)) - - for _, krane := range s.clients { - s.logger.Info("found krane", "krane", krane) - if krane.region == region { - return krane.stream.Send(event) - } - } - return fmt.Errorf("no cluster is listening for events in region %s", region) - -} diff --git a/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go b/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go index 606457c167..63016df314 100644 --- a/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go +++ b/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go @@ -64,7 +64,6 @@ func (s *Service) GetDesiredDeploymentState(ctx context.Context, req *connect.Re CpuMillicores: int64(deployment.CpuMillicores), MemoryMib: int64(deployment.MemoryMib), EncryptedEnvironmentVariables: deployment.EncryptedEnvironmentVariables, - ReadinessId: nil, BuildId: buildID, }, }, diff --git a/svc/ctrl/services/cluster/rpc_sync.go b/svc/ctrl/services/cluster/rpc_sync.go index 18dbc9f815..5868681a02 100644 --- a/svc/ctrl/services/cluster/rpc_sync.go +++ b/svc/ctrl/services/cluster/rpc_sync.go @@ -2,7 +2,8 @@ package cluster import ( "context" - "sync" + "fmt" + "time" "connectrpc.com/connect" ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" @@ -10,45 +11,67 @@ import ( ) func (s *Service) Sync(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest], stream *connect.ServerStream[ctrlv1.State]) error { - region := req.Msg.GetRegion() - clusterID := req.Msg.GetClusterId() + sequence := req.Msg.GetSequenceLastSeen() s.logger.Info("sync request received", "region", region, - "clusterID", clusterID, + "sequence", sequence, ) - wg := sync.WaitGroup{} - - wg.Go(func() { - if err := s.getSyntheticDeployments(ctx, req, stream); err != nil { - s.logger.Error("failed to get synthetic deployments", "error", err) + if sequence > 0 { + minSeq, err := db.Query.GetMinStateChangeSequence(ctx, s.db.RO(), region) + if err != nil { + return err } - }) - wg.Go(func() { - if err := s.getSyntheticSentinels(ctx, req, stream); err != nil { - s.logger.Error("failed to get synthetic sentinels", "error", err) + if sequence < uint64(minSeq) { + return connect.NewError(connect.CodeFailedPrecondition, + fmt.Errorf("sequence %d is behind minimum retained sequence %d, full resync required", sequence, minSeq)) } - }) + } - wg.Wait() - <-ctx.Done() - return ctx.Err() + if sequence == 0 { + var err error + sequence, err = s.bootstrap(ctx, region, stream) + if err != nil { + return err + } + } + return s.watch(ctx, region, sequence, stream) } -func (s *Service) getSyntheticSentinels(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest], stream *connect.ServerStream[ctrlv1.State]) error { +func (s *Service) bootstrap(ctx context.Context, region string, stream *connect.ServerStream[ctrlv1.State]) (uint64, error) { + maxSeq, err := db.Query.GetMaxStateChangeSequence(ctx, s.db.RO(), region) + if err != nil { + return 0, err + } + sequence := uint64(maxSeq) - clusterID := req.Msg.GetClusterId() - region := req.Msg.GetRegion() + cursor := "" + for { + topologies, err := db.Query.ListDesiredDeploymentTopology(ctx, s.db.RO(), db.ListDesiredDeploymentTopologyParams{ + Region: region, + DesiredState: db.DeploymentsDesiredStateRunning, + PaginationCursor: cursor, + Limit: 1000, + }) + if err != nil { + return 0, err + } + if len(topologies) == 0 { + break + } + cursor = topologies[len(topologies)-1].Deployment.ID - s.logger.Debug("get all sentinels request received", - "cluster_id", clusterID, - "region", region, - ) + for _, t := range topologies { + if err := s.streamDeployment(stream, sequence, t); err != nil { + return 0, err + } + } + } - cursor := "" + cursor = "" for { sentinels, err := db.Query.ListDesiredSentinels(ctx, s.db.RO(), db.ListDesiredSentinelsParams{ Region: region, @@ -56,109 +79,272 @@ func (s *Service) getSyntheticSentinels(ctx context.Context, req *connect.Reques PaginationCursor: cursor, Limit: 100, }) - if err != nil { - s.logger.Error("failed to get sentinels", "error", err.Error()) - return err + return 0, err } - if len(sentinels) == 0 { break } cursor = sentinels[len(sentinels)-1].ID - for _, s := range sentinels { - err = stream.Send(&ctrlv1.State{ - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - K8SName: s.K8sName, - WorkspaceId: s.WorkspaceID, - EnvironmentId: s.EnvironmentID, - ProjectId: s.ProjectID, - SentinelId: s.ID, - Image: s.Image, - Replicas: s.DesiredReplicas, - CpuMillicores: int64(s.CpuMillicores), - MemoryMib: int64(s.MemoryMib), - }, - }, - }, - }, - }) - if err != nil { - return err + for _, sentinel := range sentinels { + if err := s.streamSentinel(stream, sequence, sentinel); err != nil { + return 0, err } - } } - return nil - -} - -func (s *Service) getSyntheticDeployments(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest], stream *connect.ServerStream[ctrlv1.State]) error { - clusterID := req.Msg.GetClusterId() - region := req.Msg.GetRegion() + // Send BOOKMARK with sequence + if err := stream.Send(&ctrlv1.State{ + Sequence: sequence, + Kind: &ctrlv1.State_Bookmark{ + Bookmark: &ctrlv1.Bookmark{Sequence: sequence}, + }, + }); err != nil { + return 0, err + } - s.logger.Debug("get all sentinels request received", - "cluster_id", clusterID, - "region", region, - ) + s.logger.Info("bootstrap complete", "sequence", sequence) + return sequence, nil +} - cursor := "" +func (s *Service) watch(ctx context.Context, region string, sequence uint64, stream *connect.ServerStream[ctrlv1.State]) error { for { - topologies, err := db.Query.ListDesiredDeploymentTopology(ctx, s.db.RO(), db.ListDesiredDeploymentTopologyParams{ - Region: region, - DesiredState: db.DeploymentsDesiredStateRunning, - PaginationCursor: cursor, - Limit: 1000, + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + changes, err := db.Query.ListStateChanges(ctx, s.db.RO(), db.ListStateChangesParams{ + Region: region, + AfterSequence: sequence, + Limit: 100, }) if err != nil { - s.logger.Error("failed to get topologies", "error", err.Error()) return err } - if len(topologies) == 0 { - break + if len(changes) == 0 { + time.Sleep(250 * time.Millisecond) + continue } - cursor = topologies[len(topologies)-1].DeploymentID - for _, t := range topologies { - var buildID *string - if t.BuildID.Valid { - buildID = &t.BuildID.String + for _, c := range changes { + if err := s.processStateChange(ctx, region, c, stream); err != nil { + // Stop on error - client will reconnect from last known sequence + return fmt.Errorf("failed to process state change at sequence %d: %w", c.Sequence, err) } - err = stream.Send(&ctrlv1.State{ + sequence = c.Sequence + } + } +} + +// processStateChange fetches the resource and streams it if it applies to this region. +func (s *Service) processStateChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { + switch change.ResourceType { + case db.StateChangesResourceTypeDeployment: + return s.processDeploymentChange(ctx, region, change, stream) + case db.StateChangesResourceTypeSentinel: + return s.processSentinelChange(ctx, region, change, stream) + default: + s.logger.Warn("unknown resource type", "type", change.ResourceType) + return nil + } +} + +func (s *Service) processDeploymentChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { + d, err := db.Query.FindDeploymentById(ctx, s.db.RO(), change.ResourceID) + if err != nil { + if db.IsNotFound(err) { + return nil + } + return err + } + ws, err := db.Query.FindWorkspaceByID(ctx, s.db.RO(), d.WorkspaceID) + if err != nil { + return err + } + + if change.Op == db.StateChangesOpDelete { + return stream.Send(&ctrlv1.State{ + Sequence: change.Sequence, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: ws.K8sNamespace.String, + K8SName: d.K8sName, + }, + }, + }, + }, + }) + } + + t, err := db.Query.FindDeploymentTopologyByIDAndRegion(ctx, s.db.RO(), db.FindDeploymentTopologyByIDAndRegionParams{ + DeploymentID: change.ResourceID, + Region: region, + }) + if err != nil { + if db.IsNotFound(err) { + return stream.Send(&ctrlv1.State{ + Sequence: change.Sequence, Kind: &ctrlv1.State_Deployment{ Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - K8SNamespace: t.K8sNamespace.String, - K8SName: t.K8sName, - WorkspaceId: t.WorkspaceID, - EnvironmentId: t.EnvironmentID, - ProjectId: t.ProjectID, - DeploymentId: t.DeploymentID, - Image: t.Image.String, - Replicas: t.DesiredReplicas, - CpuMillicores: int64(t.CpuMillicores), - MemoryMib: int64(t.MemoryMib), - EncryptedEnvironmentVariables: t.EncryptedEnvironmentVariables, - ReadinessId: nil, - BuildId: buildID, + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: ws.K8sNamespace.String, + K8SName: d.K8sName, }, }, }, }, }) - if err != nil { - return err - } + } + return err + } + if t.DesiredState != db.DeploymentsDesiredStateRunning { + return stream.Send(&ctrlv1.State{ + Sequence: change.Sequence, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: ws.K8sNamespace.String, + K8SName: d.K8sName, + }, + }, + }, + }, + }) + } + + var buildID *string + if t.BuildID.Valid { + buildID = &t.BuildID.String + } + return stream.Send(&ctrlv1.State{ + Sequence: change.Sequence, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + K8SNamespace: t.K8sNamespace.String, + K8SName: t.K8sName, + WorkspaceId: t.WorkspaceID, + EnvironmentId: t.EnvironmentID, + ProjectId: t.ProjectID, + DeploymentId: t.ID, + Image: t.Image.String, + Replicas: t.DesiredReplicas, + CpuMillicores: int64(t.CpuMillicores), + MemoryMib: int64(t.MemoryMib), + EncryptedEnvironmentVariables: t.EncryptedEnvironmentVariables, + BuildId: buildID, + }, + }, + }, + }, + }) +} + +func (s *Service) processSentinelChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { + sentinel, err := db.Query.FindSentinelByID(ctx, s.db.RO(), change.ResourceID) + if err != nil { + if db.IsNotFound(err) { + return nil } + return err + } + + if change.Op == db.StateChangesOpDelete || sentinel.Region != region || sentinel.DesiredState != db.SentinelsDesiredStateRunning { + return stream.Send(&ctrlv1.State{ + Sequence: change.Sequence, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Delete{ + Delete: &ctrlv1.DeleteSentinel{ + K8SName: sentinel.K8sName, + }, + }, + }, + }, + }) } - return nil + return stream.Send(&ctrlv1.State{ + Sequence: change.Sequence, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + K8SName: sentinel.K8sName, + WorkspaceId: sentinel.WorkspaceID, + EnvironmentId: sentinel.EnvironmentID, + ProjectId: sentinel.ProjectID, + SentinelId: sentinel.ID, + Image: sentinel.Image, + Replicas: sentinel.DesiredReplicas, + CpuMillicores: int64(sentinel.CpuMillicores), + MemoryMib: int64(sentinel.MemoryMib), + }, + }, + }, + }, + }) +} + +func (s *Service) streamDeployment(stream *connect.ServerStream[ctrlv1.State], sequence uint64, t db.ListDesiredDeploymentTopologyRow) error { + var buildID *string + if t.Deployment.BuildID.Valid { + buildID = &t.Deployment.BuildID.String + } + return stream.Send(&ctrlv1.State{ + Sequence: sequence, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + K8SNamespace: t.K8sNamespace.String, + K8SName: t.Deployment.K8sName, + WorkspaceId: t.Deployment.WorkspaceID, + EnvironmentId: t.Deployment.EnvironmentID, + ProjectId: t.Deployment.ProjectID, + DeploymentId: t.Deployment.ID, + Image: t.Deployment.Image.String, + Replicas: t.DeploymentTopology.DesiredReplicas, + CpuMillicores: int64(t.Deployment.CpuMillicores), + MemoryMib: int64(t.Deployment.MemoryMib), + EncryptedEnvironmentVariables: t.Deployment.EncryptedEnvironmentVariables, + BuildId: buildID, + }, + }, + }, + }, + }) +} + +func (s *Service) streamSentinel(stream *connect.ServerStream[ctrlv1.State], sequence uint64, sentinel db.Sentinel) error { + return stream.Send(&ctrlv1.State{ + Sequence: sequence, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + K8SName: sentinel.K8sName, + WorkspaceId: sentinel.WorkspaceID, + EnvironmentId: sentinel.EnvironmentID, + ProjectId: sentinel.ProjectID, + SentinelId: sentinel.ID, + Image: sentinel.Image, + Replicas: sentinel.DesiredReplicas, + CpuMillicores: int64(sentinel.CpuMillicores), + MemoryMib: int64(sentinel.MemoryMib), + }, + }, + }, + }, + }) } diff --git a/svc/ctrl/services/cluster/rpc_update_deployment_state.go b/svc/ctrl/services/cluster/rpc_update_deployment_state.go index f5335242ee..396d7885e8 100644 --- a/svc/ctrl/services/cluster/rpc_update_deployment_state.go +++ b/svc/ctrl/services/cluster/rpc_update_deployment_state.go @@ -18,11 +18,9 @@ func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Reques return nil, err } region := req.Header().Get("X-Krane-Region") - clusterID := req.Header().Get("X-Krane-Cluster-Id") err := assert.All( assert.NotEmpty(region, "region is required"), - assert.NotEmpty(clusterID, "clusterID is required"), ) if err != nil { return nil, err @@ -54,9 +52,8 @@ func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Reques for _, staleInstance := range staleInstances { if _, ok := wantInstanceNames[staleInstance.K8sName]; !ok { err = db.Query.DeleteInstance(ctx, tx, db.DeleteInstanceParams{ - K8sName: staleInstance.K8sName, - Region: region, - ClusterID: clusterID, + K8sName: staleInstance.K8sName, + Region: region, }) if err != nil { return err @@ -71,7 +68,6 @@ func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Reques WorkspaceID: deployment.WorkspaceID, ProjectID: deployment.ProjectID, Region: region, - ClusterID: clusterID, K8sName: instance.GetK8SName(), Address: instance.GetAddress(), CpuMillicores: int32(instance.GetCpuMillicores()), @@ -94,7 +90,7 @@ func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Reques err = db.Query.DeleteDeploymentInstances(ctx, tx, db.DeleteDeploymentInstancesParams{ DeploymentID: deployment.ID, - ClusterID: clusterID, + Region: region, }) if err != nil { return err diff --git a/svc/ctrl/services/cluster/rpc_watch.go b/svc/ctrl/services/cluster/rpc_watch.go deleted file mode 100644 index 715d671429..0000000000 --- a/svc/ctrl/services/cluster/rpc_watch.go +++ /dev/null @@ -1,87 +0,0 @@ -package cluster - -import ( - "context" - "fmt" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/assert" - "github.com/unkeyed/unkey/pkg/db" - "github.com/unkeyed/unkey/pkg/proto" -) - -func (s *Service) Watch(ctx context.Context, req *connect.Request[ctrlv1.WatchRequest], stream *connect.ServerStream[ctrlv1.State]) error { - - region := req.Msg.GetRegion() - clusterID := req.Msg.GetClusterId() - sequence := req.Msg.GetSequenceLastSeen() - - err := assert.All( - assert.NotEmpty(region, "region must not be empty"), - assert.NotEmpty(clusterID, "clusterID must not be empty"), - assert.Greater(sequence, 0, "sequence must be greater than 0"), - ) - if err != nil { - return connect.NewError(connect.CodeInvalidArgument, err) - } - - s.logger.Info("watch request received", - "region", region, - "clusterID", clusterID, - "sequence", sequence, - ) - - changes, err := db.Query.FindStateChangesByClusterAfterSequence(ctx, s.db.RW(), db.FindStateChangesByClusterAfterSequenceParams{ - ClusterID: clusterID, - AfterSequence: sequence, - }) - if err != nil { - return connect.NewError(connect.CodeInternal, err) - } - - for _, change := range changes { - - msg := &ctrlv1.State{ - Sequence: change.Sequence, - Kind: nil, - } - - switch change.ResourceType { - case db.StateChangesResourceTypeSentinel: - sentinel := &ctrlv1.SentinelState{} - err = proto.Unmarshal(change.State, sentinel) - if err != nil { - return connect.NewError(connect.CodeInternal, err) - - } - - msg.Kind = &ctrlv1.State_Sentinel{ - Sentinel: sentinel, - } - case db.StateChangesResourceTypeDeployment: - deployment := &ctrlv1.DeploymentState{} - err = proto.Unmarshal(change.State, deployment) - if err != nil { - return connect.NewError(connect.CodeInternal, err) - - } - - msg.Kind = &ctrlv1.State_Deployment{ - Deployment: deployment, - } - default: - return connect.NewError(connect.CodeInternal, fmt.Errorf("unexpected resource type %T", change.ResourceType)) - } - - err = stream.Send(msg) - if err != nil { - return connect.NewError(connect.CodeInternal, err) - } - - } - - <-ctx.Done() - return ctx.Err() - -} diff --git a/svc/ctrl/services/cluster/service.go b/svc/ctrl/services/cluster/service.go index ddd601028c..04f08f3670 100644 --- a/svc/ctrl/services/cluster/service.go +++ b/svc/ctrl/services/cluster/service.go @@ -1,40 +1,17 @@ package cluster import ( - "sync" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/otel/logging" ) -type client struct { - clientID string - region string - stream *connect.ServerStream[ctrlv1.State] -} - -func newClient(clientID string, region string, stream *connect.ServerStream[ctrlv1.State]) *client { - return &client{ - clientID: clientID, - region: region, - stream: stream, - } -} - type Service struct { ctrlv1connect.UnimplementedClusterServiceHandler db db.Database logger logging.Logger - // Maps regions to open clients - clientsMu sync.RWMutex - // clientID -> stream - clients map[string]*client - // static bearer token for authentication bearer string } @@ -50,8 +27,6 @@ func New(cfg Config) *Service { UnimplementedClusterServiceHandler: ctrlv1connect.UnimplementedClusterServiceHandler{}, db: cfg.Database, logger: cfg.Logger, - clientsMu: sync.RWMutex{}, - clients: make(map[string]*client), bearer: cfg.Bearer, } diff --git a/svc/ctrl/workflows/deploy/BUILD.bazel b/svc/ctrl/workflows/deploy/BUILD.bazel index fcead07545..5d2c771721 100644 --- a/svc/ctrl/workflows/deploy/BUILD.bazel +++ b/svc/ctrl/workflows/deploy/BUILD.bazel @@ -19,7 +19,6 @@ go_library( "//gen/proto/hydra/v1:hydra", "//pkg/db", "//pkg/otel/logging", - "//pkg/ptr", "//pkg/uid", "//pkg/vault", "//svc/ctrl/services/cluster", diff --git a/svc/ctrl/workflows/deploy/deploy_handler.go b/svc/ctrl/workflows/deploy/deploy_handler.go index e3f5d43eb5..6d079745c6 100644 --- a/svc/ctrl/workflows/deploy/deploy_handler.go +++ b/svc/ctrl/workflows/deploy/deploy_handler.go @@ -12,7 +12,6 @@ import ( ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" "github.com/unkeyed/unkey/pkg/db" - "github.com/unkeyed/unkey/pkg/ptr" "github.com/unkeyed/unkey/pkg/uid" "google.golang.org/protobuf/proto" ) @@ -82,7 +81,6 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy } var dockerImage string - var buildID *string if req.GetBuildContextPath() != "" { if err = w.updateDeploymentStatus(ctx, deployment.ID, db.DeploymentsStatusBuilding); err != nil { @@ -116,7 +114,6 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy return nil, fmt.Errorf("failed to build docker image: %w", err) } dockerImage = result.GetImageName() - buildID = ptr.P(result.GetBuildId()) err = restate.RunVoid(ctx, func(stepCtx restate.RunContext) error { return db.Query.UpdateDeploymentBuildID(stepCtx, w.db.RW(), db.UpdateDeploymentBuildIDParams{ @@ -166,7 +163,25 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy } err = restate.RunVoid(ctx, func(runCtx restate.RunContext) error { - return db.BulkQuery.InsertDeploymentTopologies(runCtx, w.db.RW(), topologies) + return db.Tx(runCtx, w.db.RW(), func(txCtx context.Context, tx db.DBTX) error { + if err := db.BulkQuery.InsertDeploymentTopologies(txCtx, tx, topologies); err != nil { + return err + } + stateChanges := make([]db.InsertStateChangeParams, len(topologies)) + for i, t := range topologies { + stateChanges[i] = db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeDeployment, + ResourceID: deployment.ID, + Op: db.StateChangesOpUpsert, + Region: t.Region, + CreatedAt: uint64(time.Now().UnixMilli()), + } + } + if err := db.BulkQuery.InsertStateChanges(txCtx, tx, stateChanges); err != nil { + return err + } + return nil + }) }, restate.WithName("insert deployment topologies")) // Ensure sentinels exist in each region for this deployment @@ -190,32 +205,44 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy } err = restate.RunVoid(ctx, func(runCtx restate.RunContext) error { - sentinelID := uid.New(uid.SentinelPrefix) sentinelK8sName := uid.DNS1035() - // we rely on the unique indess of environmendID + region here to create or noop - err = db.Query.InsertSentinel(runCtx, w.db.RW(), db.InsertSentinelParams{ - ID: sentinelID, - WorkspaceID: workspace.ID, - EnvironmentID: environment.ID, - ProjectID: project.ID, - K8sAddress: fmt.Sprintf("%s.%s.svc.cluster.local", sentinelK8sName, workspace.K8sNamespace.String), - K8sName: sentinelK8sName, - Region: topology.Region, - Image: w.sentinelImage, - Health: db.SentinelsHealthUnknown, - DesiredReplicas: desiredReplicas, - AvailableReplicas: 0, - CpuMillicores: 256, - MemoryMib: 256, - CreatedAt: time.Now().UnixMilli(), + return db.Tx(runCtx, w.db.RW(), func(txCtx context.Context, tx db.DBTX) error { + // we rely on the unique index of environmentID + region here to create or noop + err := db.Query.InsertSentinel(txCtx, tx, db.InsertSentinelParams{ + ID: sentinelID, + WorkspaceID: workspace.ID, + EnvironmentID: environment.ID, + ProjectID: project.ID, + K8sAddress: fmt.Sprintf("%s.%s.svc.cluster.local", sentinelK8sName, workspace.K8sNamespace.String), + K8sName: sentinelK8sName, + Region: topology.Region, + Image: w.sentinelImage, + Health: db.SentinelsHealthUnknown, + DesiredReplicas: desiredReplicas, + AvailableReplicas: 0, + CpuMillicores: 256, + MemoryMib: 256, + CreatedAt: time.Now().UnixMilli(), + }) + if err != nil { + if db.IsDuplicateKeyError(err) { + return nil + } + return err + } + if _, err := db.Query.InsertStateChange(txCtx, tx, db.InsertStateChangeParams{ + ResourceType: db.StateChangesResourceTypeSentinel, + ResourceID: sentinelID, + Op: db.StateChangesOpUpsert, + Region: topology.Region, + CreatedAt: uint64(time.Now().UnixMilli()), + }); err != nil { + return err + } + return nil }) - if err != nil && !db.IsDuplicateKeyError(err) { - return err - } - return nil - }, restate.WithName("ensure sentinel exists in db")) if err != nil { return nil, err @@ -223,102 +250,6 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy } - sentinels, err := restate.Run(ctx, func(runCtx restate.RunContext) ([]db.Sentinel, error) { - return db.Query.FindSentinelsByEnvironmentID(runCtx, w.db.RO(), environment.ID) - }, restate.WithName("find all sentinels")) - if err != nil { - return nil, err - } - - for _, sentinel := range sentinels { - err = restate.RunVoid(ctx, func(runCtx restate.RunContext) error { - - s := &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - K8SName: sentinel.K8sName, - WorkspaceId: sentinel.WorkspaceID, - ProjectId: sentinel.ProjectID, - EnvironmentId: sentinel.EnvironmentID, - SentinelId: sentinel.ID, - Image: w.sentinelImage, - Replicas: sentinel.DesiredReplicas, - CpuMillicores: int64(sentinel.CpuMillicores), - MemoryMib: int64(sentinel.MemoryMib), - }, - }, - } - state, err := proto.Marshal(s) - if err != nil { - return restate.TerminalError(err) - } - - sequence, err := db.Query.InsertStateChange(ctx, w.db.RW(), db.InsertStateChangeParams{ - ResourceType: db.StateChangesResourceTypeSentinel, - State: state, - ClusterID: sentinel.Region, - CreatedAt: uint64(time.Now().UnixMilli()), - }) - if err != nil { - return err - } - - _ = sequence - return nil - }, restate.WithName(fmt.Sprintf("schedule sentinel for %s in %s", sentinel.ID, sentinel.Region))) - if err != nil { - return nil, err - } - } - } - - for _, region := range topologies { - err = restate.RunVoid(ctx, func(runCtx restate.RunContext) error { - - s := &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - K8SNamespace: workspace.K8sNamespace.String, - K8SName: deployment.K8sName, - WorkspaceId: workspace.ID, - ProjectId: deployment.ProjectID, - EnvironmentId: deployment.EnvironmentID, - DeploymentId: deployment.ID, - Image: dockerImage, - Replicas: region.DesiredReplicas, - CpuMillicores: int64(deployment.CpuMillicores), - MemoryMib: int64(deployment.MemoryMib), - BuildId: buildID, - EncryptedEnvironmentVariables: deployment.EncryptedEnvironmentVariables, - ReadinessId: ptr.P(deployment.ID), - }, - }, - } - - state, err := proto.Marshal(s) - if err != nil { - return restate.TerminalError(err) - } - - sequence, err := db.Query.InsertStateChange(ctx, w.db.RW(), db.InsertStateChangeParams{ - ResourceType: db.StateChangesResourceTypeDeployment, - State: state, - ClusterID: region.Region, - CreatedAt: uint64(time.Now().UnixMilli()), - }) - if err != nil { - return err - } - - _ = sequence - - return nil - - }, restate.WithName(fmt.Sprintf("schedule deployment %s in %s", deployment.ID, region.Region))) - if err != nil { - return nil, err - } - } w.logger.Info("waiting for deployments to be ready", "deployment_id", deployment.ID) diff --git a/svc/krane/config.go b/svc/krane/config.go index 662b38d76b..1439694b3d 100644 --- a/svc/krane/config.go +++ b/svc/krane/config.go @@ -83,8 +83,6 @@ type Config struct { ControlPlaneURL string ControlPlaneBearer string - - ClusterID string } // Validate checks the configuration for required fields and logical consistency. diff --git a/svc/krane/doc.go b/svc/krane/doc.go index 1b741565ba..f5f913fe1c 100644 --- a/svc/krane/doc.go +++ b/svc/krane/doc.go @@ -14,8 +14,7 @@ // - Krane Agents: Node-level agents that expose gRPC APIs for orchestration // - Kubernetes Cluster: Target infrastructure where containers are deployed // -// Each krane instance is identified by a unique InstanceID and operates within -// a specific ClusterID for distributed coordination. The agent uses in-cluster +// Each krane instance is identified by a unique InstanceID. The agent uses in-cluster // Kubernetes configuration for direct cluster access. // // # Key Services @@ -44,7 +43,6 @@ // // cfg := krane.Config{ // InstanceID: "krane-node-001", -// ClusterID: "production-cluster", // Region: "us-west-2", // RegistryURL: "registry.depot.dev", // RegistryUsername: "x-token", diff --git a/svc/krane/internal/reconciler/BUILD.bazel b/svc/krane/internal/reconciler/BUILD.bazel index eb65ef4f1c..00b24d8f42 100644 --- a/svc/krane/internal/reconciler/BUILD.bazel +++ b/svc/krane/internal/reconciler/BUILD.bazel @@ -1,4 +1,4 @@ -load("@rules_go//go:def.bzl", "go_library") +load("@rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "reconciler", @@ -42,3 +42,47 @@ go_library( "@io_k8s_sigs_controller_runtime//pkg/client", ], ) + +go_test( + name = "reconciler_test", + srcs = [ + "apply_deployment_test.go", + "apply_sentinel_test.go", + "delete_deployment_test.go", + "delete_sentinel_test.go", + "handle_state_test.go", + "mock_cluster_client_test.go", + "namespace_test.go", + "reconciler_test.go", + "refresh_current_deployments_test.go", + "refresh_current_sentinels_test.go", + "sequence_tracking_test.go", + "test_helpers_test.go", + "update_state_test.go", + "watch_current_deployments_test.go", + "watch_current_sentinels_test.go", + "watcher_test.go", + ], + embed = [":reconciler"], + deps = [ + "//gen/proto/ctrl/v1:ctrl", + "//gen/proto/ctrl/v1/ctrlv1connect", + "//pkg/circuitbreaker", + "//pkg/otel/logging", + "//pkg/ptr", + "//svc/krane/pkg/labels", + "@com_connectrpc_connect//:connect", + "@com_github_stretchr_testify//require", + "@io_k8s_api//apps/v1:apps", + "@io_k8s_api//core/v1:core", + "@io_k8s_apimachinery//pkg/api/errors", + "@io_k8s_apimachinery//pkg/api/resource", + "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", + "@io_k8s_apimachinery//pkg/runtime", + "@io_k8s_apimachinery//pkg/runtime/schema", + "@io_k8s_apimachinery//pkg/types", + "@io_k8s_apimachinery//pkg/watch", + "@io_k8s_client_go//kubernetes/fake", + "@io_k8s_client_go//testing", + ], +) diff --git a/svc/krane/internal/reconciler/apply_deployment_test.go_ b/svc/krane/internal/reconciler/apply_deployment_test.go similarity index 100% rename from svc/krane/internal/reconciler/apply_deployment_test.go_ rename to svc/krane/internal/reconciler/apply_deployment_test.go diff --git a/svc/krane/internal/reconciler/apply_sentinel_test.go_ b/svc/krane/internal/reconciler/apply_sentinel_test.go similarity index 100% rename from svc/krane/internal/reconciler/apply_sentinel_test.go_ rename to svc/krane/internal/reconciler/apply_sentinel_test.go diff --git a/svc/krane/internal/reconciler/delete_deployment_test.go_ b/svc/krane/internal/reconciler/delete_deployment_test.go similarity index 100% rename from svc/krane/internal/reconciler/delete_deployment_test.go_ rename to svc/krane/internal/reconciler/delete_deployment_test.go diff --git a/svc/krane/internal/reconciler/delete_sentinel_test.go_ b/svc/krane/internal/reconciler/delete_sentinel_test.go similarity index 100% rename from svc/krane/internal/reconciler/delete_sentinel_test.go_ rename to svc/krane/internal/reconciler/delete_sentinel_test.go diff --git a/svc/krane/internal/reconciler/doc.go b/svc/krane/internal/reconciler/doc.go index 37d350665a..e4d0858f38 100644 --- a/svc/krane/internal/reconciler/doc.go +++ b/svc/krane/internal/reconciler/doc.go @@ -54,7 +54,6 @@ // ClientSet: kubeClient, // Logger: logger, // Cluster: clusterClient, -// ClusterID: "cluster-123", // Region: "us-east-1", // } // r := reconciler.New(cfg) diff --git a/svc/krane/internal/reconciler/handle_state_test.go_ b/svc/krane/internal/reconciler/handle_state_test.go similarity index 100% rename from svc/krane/internal/reconciler/handle_state_test.go_ rename to svc/krane/internal/reconciler/handle_state_test.go diff --git a/svc/krane/internal/reconciler/mock_cluster_client_test.go_ b/svc/krane/internal/reconciler/mock_cluster_client_test.go similarity index 89% rename from svc/krane/internal/reconciler/mock_cluster_client_test.go_ rename to svc/krane/internal/reconciler/mock_cluster_client_test.go index 8d1256a31d..6ceabb0b9a 100644 --- a/svc/krane/internal/reconciler/mock_cluster_client_test.go_ +++ b/svc/krane/internal/reconciler/mock_cluster_client_test.go @@ -17,7 +17,7 @@ var _ ctrlv1connect.ClusterServiceClient = (*MockClusterClient)(nil) // default. The mock also records all UpdateDeploymentState and UpdateSentinelState // calls so tests can verify the reconciler reported the correct state. type MockClusterClient struct { - WatchFunc func(context.Context, *connect.Request[ctrlv1.WatchRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) + SyncFunc func(context.Context, *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) GetDesiredSentinelStateFunc func(context.Context, *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) UpdateSentinelStateFunc func(context.Context, *connect.Request[ctrlv1.UpdateSentinelStateRequest]) (*connect.Response[ctrlv1.UpdateSentinelStateResponse], error) GetDesiredDeploymentStateFunc func(context.Context, *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) @@ -26,9 +26,9 @@ type MockClusterClient struct { UpdateSentinelStateCalls []*ctrlv1.UpdateSentinelStateRequest } -func (m *MockClusterClient) Watch(ctx context.Context, req *connect.Request[ctrlv1.WatchRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { - if m.WatchFunc != nil { - return m.WatchFunc(ctx, req) +func (m *MockClusterClient) Sync(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { + if m.SyncFunc != nil { + return m.SyncFunc(ctx, req) } return nil, nil } diff --git a/svc/krane/internal/reconciler/namespace_test.go_ b/svc/krane/internal/reconciler/namespace_test.go similarity index 100% rename from svc/krane/internal/reconciler/namespace_test.go_ rename to svc/krane/internal/reconciler/namespace_test.go diff --git a/svc/krane/internal/reconciler/reconciler.go b/svc/krane/internal/reconciler/reconciler.go index 3693284840..c030bf5383 100644 --- a/svc/krane/internal/reconciler/reconciler.go +++ b/svc/krane/internal/reconciler/reconciler.go @@ -4,7 +4,6 @@ import ( "context" "fmt" - "connectrpc.com/connect" ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" "github.com/unkeyed/unkey/pkg/circuitbreaker" @@ -23,15 +22,13 @@ import ( // background goroutines for watching and refreshing, so callers must call [Start] // before processing state and [Stop] during shutdown. type Reconciler struct { - clientSet kubernetes.Interface - logger logging.Logger - cluster ctrlv1connect.ClusterServiceClient - cb circuitbreaker.CircuitBreaker[any] - done chan struct{} - clusterID string - region string - // last seen sequence - sequence uint64 + clientSet kubernetes.Interface + logger logging.Logger + cluster ctrlv1connect.ClusterServiceClient + cb circuitbreaker.CircuitBreaker[any] + done chan struct{} + region string + sequenceLastSeen uint64 } // Config holds the configuration required to create a new [Reconciler]. @@ -40,21 +37,19 @@ type Config struct { ClientSet kubernetes.Interface Logger logging.Logger Cluster ctrlv1connect.ClusterServiceClient - ClusterID string Region string } // New creates a [Reconciler] ready to be started with [Reconciler.Start]. func New(cfg Config) *Reconciler { return &Reconciler{ - clientSet: cfg.ClientSet, - logger: cfg.Logger, - cluster: cfg.Cluster, - cb: circuitbreaker.New[any]("reconciler_state_update"), - done: make(chan struct{}), - clusterID: cfg.ClusterID, - region: cfg.Region, - sequence: 0, + clientSet: cfg.ClientSet, + logger: cfg.Logger, + cluster: cfg.Cluster, + cb: circuitbreaker.New[any]("reconciler_state_update"), + done: make(chan struct{}), + region: cfg.Region, + sequenceLastSeen: 0, } } @@ -74,24 +69,6 @@ func (r *Reconciler) Start(ctx context.Context) error { return err } - stream, err := r.cluster.Sync(ctx, connect.NewRequest(&ctrlv1.SyncRequest{ - ClusterId: r.clusterID, - Region: r.region, - })) - if err != nil { - return err - } - - for stream.Receive() { - if err := r.HandleState(ctx, stream.Msg()); err != nil { - r.logger.Error("error handling state", "error", err) - } - } - err = stream.Close() - if err != nil { - r.logger.Error("unable to close stream", "error", err) - } - go r.Watch(ctx) return nil @@ -111,39 +88,43 @@ func (r *Reconciler) HandleState(ctx context.Context, state *ctrlv1.State) error if state == nil { return fmt.Errorf("state is nil") } + + sequence := state.GetSequence() + switch kind := state.GetKind().(type) { case *ctrlv1.State_Deployment: - { - switch op := kind.Deployment.GetState().(type) { - case *ctrlv1.DeploymentState_Apply: - if err := r.ApplyDeployment(ctx, op.Apply); err != nil { - return err - } - case *ctrlv1.DeploymentState_Delete: - if err := r.DeleteDeployment(ctx, op.Delete); err != nil { - return err - } + switch op := kind.Deployment.GetState().(type) { + case *ctrlv1.DeploymentState_Apply: + if err := r.ApplyDeployment(ctx, op.Apply); err != nil { + return err + } + case *ctrlv1.DeploymentState_Delete: + if err := r.DeleteDeployment(ctx, op.Delete); err != nil { + return err } } case *ctrlv1.State_Sentinel: - { - switch op := kind.Sentinel.GetState().(type) { - case *ctrlv1.SentinelState_Apply: - if err := r.ApplySentinel(ctx, op.Apply); err != nil { - return err - } - case *ctrlv1.SentinelState_Delete: - if err := r.DeleteSentinel(ctx, op.Delete); err != nil { - return err - } + switch op := kind.Sentinel.GetState().(type) { + case *ctrlv1.SentinelState_Apply: + if err := r.ApplySentinel(ctx, op.Apply); err != nil { + return err + } + case *ctrlv1.SentinelState_Delete: + if err := r.DeleteSentinel(ctx, op.Delete); err != nil { + return err } } - + case *ctrlv1.State_Bookmark: + sequence = kind.Bookmark.GetSequence() + r.logger.Info("received bookmark", "sequence", sequence) default: return fmt.Errorf("unknown state type: %T", kind) } - r.sequence = state.GetSequence() + if sequence > r.sequenceLastSeen { + r.sequenceLastSeen = sequence + } + return nil } diff --git a/svc/krane/internal/reconciler/reconciler_test.go_ b/svc/krane/internal/reconciler/reconciler_test.go similarity index 80% rename from svc/krane/internal/reconciler/reconciler_test.go_ rename to svc/krane/internal/reconciler/reconciler_test.go index cf13d0979a..07280378a5 100644 --- a/svc/krane/internal/reconciler/reconciler_test.go_ +++ b/svc/krane/internal/reconciler/reconciler_test.go @@ -1,7 +1,6 @@ package reconciler import ( - "context" "testing" "github.com/stretchr/testify/require" @@ -25,7 +24,6 @@ func TestNew_CreatesReconcilerWithCorrectFields(t *testing.T) { ClientSet: client, Logger: logger, Cluster: mockCluster, - ClusterID: "cluster-123", Region: "us-east-1", } @@ -44,7 +42,6 @@ func TestNew_CreatesCircuitBreaker(t *testing.T) { ClientSet: client, Logger: logging.NewNoop(), Cluster: &MockClusterClient{}, - ClusterID: "cluster-123", Region: "us-east-1", } @@ -59,7 +56,6 @@ func TestNew_CreatesDoneChannel(t *testing.T) { ClientSet: client, Logger: logging.NewNoop(), Cluster: &MockClusterClient{}, - ClusterID: "cluster-123", Region: "us-east-1", } @@ -80,7 +76,6 @@ func TestStop_ClosesDoneChannel(t *testing.T) { ClientSet: client, Logger: logging.NewNoop(), Cluster: &MockClusterClient{}, - ClusterID: "cluster-123", Region: "us-east-1", } @@ -102,7 +97,6 @@ func TestStop_IsIdempotent(t *testing.T) { ClientSet: client, Logger: logging.NewNoop(), Cluster: &MockClusterClient{}, - ClusterID: "cluster-123", Region: "us-east-1", } @@ -115,23 +109,3 @@ func TestStop_IsIdempotent(t *testing.T) { _ = r.Stop() }, "calling Stop twice should panic when closing already closed channel") } - -func TestStart_InitiatesGoroutines(t *testing.T) { - client := fake.NewSimpleClientset() - cfg := Config{ - ClientSet: client, - Logger: logging.NewNoop(), - Cluster: &MockClusterClient{}, - ClusterID: "cluster-123", - Region: "us-east-1", - } - - r := New(cfg) - ctx := context.Background() - - err := r.Start(ctx) - require.NoError(t, err, "Start should return without error") - - err = r.Stop() - require.NoError(t, err) -} diff --git a/svc/krane/internal/reconciler/refresh_current_deployments_test.go_ b/svc/krane/internal/reconciler/refresh_current_deployments_test.go similarity index 100% rename from svc/krane/internal/reconciler/refresh_current_deployments_test.go_ rename to svc/krane/internal/reconciler/refresh_current_deployments_test.go diff --git a/svc/krane/internal/reconciler/refresh_current_sentinels_test.go_ b/svc/krane/internal/reconciler/refresh_current_sentinels_test.go similarity index 100% rename from svc/krane/internal/reconciler/refresh_current_sentinels_test.go_ rename to svc/krane/internal/reconciler/refresh_current_sentinels_test.go diff --git a/svc/krane/internal/reconciler/sequence_tracking_test.go b/svc/krane/internal/reconciler/sequence_tracking_test.go new file mode 100644 index 0000000000..0a1ffbbb9b --- /dev/null +++ b/svc/krane/internal/reconciler/sequence_tracking_test.go @@ -0,0 +1,347 @@ +package reconciler + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/ptr" +) + +// Tests for sequence tracking behavior in HandleState. +// The reconciler tracks sequenceLastSeen to resume from the correct position on reconnect. + +func TestHandleState_UpdatesSequenceAfterDeploymentApply(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + require.Equal(t, uint64(0), r.sequenceLastSeen, "initial sequence should be 0") + + state := &ctrlv1.State{ + Sequence: 42, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + WorkspaceId: "ws_123", + ProjectId: "prj_123", + EnvironmentId: "env_123", + DeploymentId: "dep_123", + K8SNamespace: "test-namespace", + K8SName: "test-deployment", + Image: "nginx:1.19", + Replicas: 1, + CpuMillicores: 100, + MemoryMib: 128, + BuildId: ptr.P("build_123"), + }, + }, + }, + }, + } + + err := r.HandleState(ctx, state) + require.NoError(t, err) + require.Equal(t, uint64(42), r.sequenceLastSeen, "sequence should be updated after apply") +} + +func TestHandleState_UpdatesSequenceAfterDeploymentDelete(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + state := &ctrlv1.State{ + Sequence: 100, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: "test-namespace", + K8SName: "test-deployment", + }, + }, + }, + }, + } + + err := r.HandleState(ctx, state) + require.NoError(t, err) + require.Equal(t, uint64(100), r.sequenceLastSeen) +} + +func TestHandleState_UpdatesSequenceAfterSentinelApply(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + state := &ctrlv1.State{ + Sequence: 200, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + WorkspaceId: "ws_123", + ProjectId: "prj_123", + EnvironmentId: "env_123", + SentinelId: "sentinel_123", + K8SName: "test-sentinel", + Image: "sentinel:1.0", + Replicas: 2, + CpuMillicores: 100, + MemoryMib: 128, + }, + }, + }, + }, + } + + err := r.HandleState(ctx, state) + require.NoError(t, err) + require.Equal(t, uint64(200), r.sequenceLastSeen) +} + +func TestHandleState_UpdatesSequenceAfterSentinelDelete(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + state := &ctrlv1.State{ + Sequence: 300, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Delete{ + Delete: &ctrlv1.DeleteSentinel{ + K8SName: "test-sentinel", + }, + }, + }, + }, + } + + err := r.HandleState(ctx, state) + require.NoError(t, err) + require.Equal(t, uint64(300), r.sequenceLastSeen) +} + +func TestHandleState_UpdatesSequenceFromBookmark(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + state := &ctrlv1.State{ + Sequence: 500, // State-level sequence + Kind: &ctrlv1.State_Bookmark{ + Bookmark: &ctrlv1.Bookmark{ + Sequence: 999, // Bookmark-specific sequence takes precedence + }, + }, + } + + err := r.HandleState(ctx, state) + require.NoError(t, err) + require.Equal(t, uint64(999), r.sequenceLastSeen, "bookmark sequence should override state sequence") +} + +func TestHandleState_SequenceOnlyIncreases(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + // First event with sequence 100 + state1 := &ctrlv1.State{ + Sequence: 100, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: "test-namespace", + K8SName: "test-deployment-1", + }, + }, + }, + }, + } + + err := r.HandleState(ctx, state1) + require.NoError(t, err) + require.Equal(t, uint64(100), r.sequenceLastSeen) + + // Second event with lower sequence (should not decrease) + state2 := &ctrlv1.State{ + Sequence: 50, // Lower sequence + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: "test-namespace", + K8SName: "test-deployment-2", + }, + }, + }, + }, + } + + err = r.HandleState(ctx, state2) + require.NoError(t, err) + require.Equal(t, uint64(100), r.sequenceLastSeen, "sequence should not decrease") + + // Third event with higher sequence (should increase) + state3 := &ctrlv1.State{ + Sequence: 150, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: "test-namespace", + K8SName: "test-deployment-3", + }, + }, + }, + }, + } + + err = r.HandleState(ctx, state3) + require.NoError(t, err) + require.Equal(t, uint64(150), r.sequenceLastSeen, "sequence should increase") +} + +func TestHandleState_SequenceZeroDoesNotReset(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + // Set initial sequence + state1 := &ctrlv1.State{ + Sequence: 100, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: "test-namespace", + K8SName: "test-deployment", + }, + }, + }, + }, + } + + err := r.HandleState(ctx, state1) + require.NoError(t, err) + require.Equal(t, uint64(100), r.sequenceLastSeen) + + // Event with sequence 0 should not reset + state2 := &ctrlv1.State{ + Sequence: 0, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: "test-namespace", + K8SName: "another-deployment", + }, + }, + }, + }, + } + + err = r.HandleState(ctx, state2) + require.NoError(t, err) + require.Equal(t, uint64(100), r.sequenceLastSeen, "sequence should not reset to 0") +} + +func TestHandleState_BootstrapSequence(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + // Simulate bootstrap: all state items have the same sequence from GetMaxStateChangeSequence + bootstrapSequence := uint64(500) + + states := []*ctrlv1.State{ + { + Sequence: bootstrapSequence, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + WorkspaceId: "ws_1", + ProjectId: "prj_1", + EnvironmentId: "env_1", + DeploymentId: "dep_1", + K8SNamespace: "test-namespace", + K8SName: "deployment-1", + Image: "nginx:1.19", + Replicas: 1, + CpuMillicores: 100, + MemoryMib: 128, + }, + }, + }, + }, + }, + { + Sequence: bootstrapSequence, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + WorkspaceId: "ws_2", + ProjectId: "prj_2", + EnvironmentId: "env_2", + DeploymentId: "dep_2", + K8SNamespace: "test-namespace", + K8SName: "deployment-2", + Image: "nginx:1.20", + Replicas: 2, + CpuMillicores: 200, + MemoryMib: 256, + }, + }, + }, + }, + }, + { + Sequence: bootstrapSequence, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + WorkspaceId: "ws_3", + ProjectId: "prj_3", + EnvironmentId: "env_3", + SentinelId: "sentinel_1", + K8SName: "sentinel-1", + Image: "sentinel:1.0", + Replicas: 1, + CpuMillicores: 50, + MemoryMib: 64, + }, + }, + }, + }, + }, + // Bookmark signals end of bootstrap + { + Sequence: bootstrapSequence, + Kind: &ctrlv1.State_Bookmark{ + Bookmark: &ctrlv1.Bookmark{ + Sequence: bootstrapSequence, + }, + }, + }, + } + + for _, state := range states { + err := r.HandleState(ctx, state) + require.NoError(t, err) + } + + require.Equal(t, bootstrapSequence, r.sequenceLastSeen, "sequence should be set to bootstrap watermark") +} + +func TestReconciler_InitialSequenceIsZero(t *testing.T) { + h := NewTestHarness(t) + require.Equal(t, uint64(0), h.Reconciler.sequenceLastSeen, "new reconciler should start with sequence 0") +} diff --git a/svc/krane/internal/reconciler/test_helpers_test.go_ b/svc/krane/internal/reconciler/test_helpers_test.go similarity index 97% rename from svc/krane/internal/reconciler/test_helpers_test.go_ rename to svc/krane/internal/reconciler/test_helpers_test.go index 17d43f7a4f..4a43de99c0 100644 --- a/svc/krane/internal/reconciler/test_helpers_test.go_ +++ b/svc/krane/internal/reconciler/test_helpers_test.go @@ -4,7 +4,6 @@ import ( "encoding/json" "testing" - "github.com/unkeyed/unkey/pkg/circuitbreaker" "github.com/unkeyed/unkey/pkg/otel/logging" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -52,13 +51,12 @@ func NewTestReconciler(client *fake.Clientset, controlPlane *MockClusterClient) if controlPlane == nil { controlPlane = &MockClusterClient{} } - return &Reconciler{ - clientSet: client, - cluster: controlPlane, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } + return New(Config{ + ClientSet: client, + Logger: logging.NewNoop(), + Cluster: controlPlane, + Region: "test-region", + }) } // ----------------------------------------------------------------------------- diff --git a/svc/krane/internal/reconciler/update_state_test.go_ b/svc/krane/internal/reconciler/update_state_test.go similarity index 100% rename from svc/krane/internal/reconciler/update_state_test.go_ rename to svc/krane/internal/reconciler/update_state_test.go diff --git a/svc/krane/internal/reconciler/watch_current_deployments_test.go_ b/svc/krane/internal/reconciler/watch_current_deployments_test.go similarity index 100% rename from svc/krane/internal/reconciler/watch_current_deployments_test.go_ rename to svc/krane/internal/reconciler/watch_current_deployments_test.go diff --git a/svc/krane/internal/reconciler/watch_current_sentinels_test.go_ b/svc/krane/internal/reconciler/watch_current_sentinels_test.go similarity index 100% rename from svc/krane/internal/reconciler/watch_current_sentinels_test.go_ rename to svc/krane/internal/reconciler/watch_current_sentinels_test.go diff --git a/svc/krane/internal/reconciler/watcher.go b/svc/krane/internal/reconciler/watcher.go index a89a48b848..d747e57b63 100644 --- a/svc/krane/internal/reconciler/watcher.go +++ b/svc/krane/internal/reconciler/watcher.go @@ -16,7 +16,7 @@ func (r *Reconciler) Watch(ctx context.Context) { for { - interval := intervalMin + time.Duration(rand.Float64()*float64(intervalMax.Milliseconds()-intervalMin.Milliseconds())) + interval := intervalMin + time.Millisecond*time.Duration(rand.Float64()*float64(intervalMax.Milliseconds()-intervalMin.Milliseconds())) time.Sleep(interval) err := r.watch(ctx) @@ -30,16 +30,18 @@ func (r *Reconciler) Watch(ctx context.Context) { func (r *Reconciler) watch(ctx context.Context) error { - stream, err := r.cluster.Watch(ctx, connect.NewRequest(&ctrlv1.WatchRequest{ - ClusterId: r.clusterID, + r.logger.Info("starting watch") + + stream, err := r.cluster.Sync(ctx, connect.NewRequest(&ctrlv1.SyncRequest{ Region: r.region, - SequenceLastSeen: r.sequence, + SequenceLastSeen: r.sequenceLastSeen, })) if err != nil { return err } for stream.Receive() { + r.logger.Info("received message") if err := r.HandleState(ctx, stream.Msg()); err != nil { r.logger.Error("error handling state", "error", err) } diff --git a/svc/krane/internal/reconciler/watcher_test.go b/svc/krane/internal/reconciler/watcher_test.go new file mode 100644 index 0000000000..d9de5453c0 --- /dev/null +++ b/svc/krane/internal/reconciler/watcher_test.go @@ -0,0 +1,552 @@ +// Package reconciler provides the krane reconciler that synchronizes control +// plane state with Kubernetes resources. +// +// These tests verify the watch/sync behavior of the reconciler, specifically: +// - How it forms Sync requests to the control plane +// - How it processes State messages via HandleState +// - How it tracks sequence numbers for reconnection +// +// # Test Approach +// +// Due to connect.ServerStreamForClient being a struct (not an interface), we +// cannot mock the actual stream returned by Sync(). Instead, we test: +// - Request formation (capture the SyncRequest sent to the mock) +// - HandleState processing (call HandleState directly with test messages) +// - Error handling (return errors from the mock Sync function) +// +// # Key Invariants +// +// - sequenceLastSeen is updated to the highest sequence seen +// - Bookmark messages update sequenceLastSeen to their sequence value +// - Apply messages create/update Kubernetes resources +// - Delete messages remove Kubernetes resources +package reconciler + +import ( + "context" + "errors" + "net/http" + "sync" + "testing" + + "connectrpc.com/connect" + "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/otel/logging" + "github.com/unkeyed/unkey/pkg/ptr" + "k8s.io/client-go/kubernetes/fake" +) + +// mockServerStream implements connect.ServerStreamForClient for testing. +type mockServerStream struct { + messages []*ctrlv1.State + index int + err error + closed bool + mu sync.Mutex +} + +func newMockServerStream(messages []*ctrlv1.State) *mockServerStream { + return &mockServerStream{ + messages: messages, + index: 0, + } +} + +func (m *mockServerStream) Receive() bool { + m.mu.Lock() + defer m.mu.Unlock() + + if m.closed || m.index >= len(m.messages) { + return false + } + m.index++ + return true +} + +func (m *mockServerStream) Msg() *ctrlv1.State { + m.mu.Lock() + defer m.mu.Unlock() + + if m.index == 0 || m.index > len(m.messages) { + return nil + } + return m.messages[m.index-1] +} + +func (m *mockServerStream) Err() error { + return m.err +} + +func (m *mockServerStream) Close() error { + m.mu.Lock() + defer m.mu.Unlock() + m.closed = true + return nil +} + +func (m *mockServerStream) ResponseHeader() http.Header { + return make(http.Header) +} + +func (m *mockServerStream) ResponseTrailer() http.Header { + return make(http.Header) +} + +// ============================================================================= +// Sync Request Formation Tests +// ============================================================================= +// +// These tests verify that the reconciler sends correctly formed Sync requests +// to the control plane. The request must include the region and the last-seen +// sequence number. +// ============================================================================= + +// TestWatch_SendsCorrectSyncRequest verifies that watch() sends a Sync request +// with the correct region and sequence number. +// +// Scenario: Reconciler has previously processed messages up to sequence 500. +// It calls watch() which should send a Sync request with that sequence. +// +// Guarantees: +// - SyncRequest.Region matches the reconciler's configured region +// - SyncRequest.SequenceLastSeen matches sequenceLastSeen from previous session +// +// This is critical for reconnection: the sequence tells the server where to +// resume streaming from. +func TestWatch_SendsCorrectSyncRequest(t *testing.T) { + client := fake.NewSimpleClientset() + AddReplicaSetPatchReactor(client) + AddDeploymentPatchReactor(client) + AddServicePatchReactor(client) + AddDeleteTracker(client) + + var capturedRequest *ctrlv1.SyncRequest + + mockCluster := &MockClusterClient{ + SyncFunc: func(_ context.Context, req *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { + capturedRequest = req.Msg + return nil, errors.New("end test") + }, + } + + r := New(Config{ + ClientSet: client, + Logger: logging.NewNoop(), + Cluster: mockCluster, + Region: "us-west-2", + }) + + r.sequenceLastSeen = 500 + + ctx := context.Background() + _ = r.watch(ctx) + + require.NotNil(t, capturedRequest) + require.Equal(t, "us-west-2", capturedRequest.GetRegion()) + require.Equal(t, uint64(500), capturedRequest.GetSequenceLastSeen()) +} + +// TestWatch_InitialSyncWithZeroSequence verifies that a fresh reconciler sends +// sequence=0 to trigger a full bootstrap from the server. +// +// Scenario: A newly created reconciler (never received any messages) calls watch(). +// +// Guarantees: +// - SyncRequest.SequenceLastSeen is 0 +// - This triggers the server to perform full bootstrap +// +// sequence=0 is the "I have nothing" signal that tells the server to send +// all current state before entering the watch loop. +func TestWatch_InitialSyncWithZeroSequence(t *testing.T) { + client := fake.NewSimpleClientset() + AddReplicaSetPatchReactor(client) + AddDeploymentPatchReactor(client) + AddServicePatchReactor(client) + AddDeleteTracker(client) + + var capturedRequest *ctrlv1.SyncRequest + + mockCluster := &MockClusterClient{ + SyncFunc: func(_ context.Context, req *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { + capturedRequest = req.Msg + return nil, errors.New("end test") + }, + } + + r := New(Config{ + ClientSet: client, + Logger: logging.NewNoop(), + Cluster: mockCluster, + Region: "eu-central-1", + }) + + ctx := context.Background() + _ = r.watch(ctx) + + require.NotNil(t, capturedRequest) + require.Equal(t, "eu-central-1", capturedRequest.GetRegion()) + require.Equal(t, uint64(0), capturedRequest.GetSequenceLastSeen(), "initial sync should have sequence 0") +} + +// ============================================================================= +// HandleState Processing Tests +// ============================================================================= +// +// These tests verify that HandleState correctly processes different message +// types and updates both Kubernetes resources and the sequence tracker. +// ============================================================================= + +// TestWatch_ProcessesStreamMessages verifies that HandleState correctly +// processes a deployment apply message and a bookmark. +// +// Scenario: A stream contains a deployment apply (seq=10) followed by a +// bookmark (seq=20). +// +// Guarantees: +// - The deployment is applied to Kubernetes (ReplicaSet is created) +// - sequenceLastSeen is updated to the bookmark's sequence (20) +// +// This tests the basic happy path: apply a resource, then receive a bookmark +// that marks the end of bootstrap. +func TestWatch_ProcessesStreamMessages(t *testing.T) { + client := fake.NewSimpleClientset() + rsCapture := AddReplicaSetPatchReactor(client) + AddDeploymentPatchReactor(client) + AddServicePatchReactor(client) + AddDeleteTracker(client) + + messages := []*ctrlv1.State{ + { + Sequence: 10, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + WorkspaceId: "ws_1", + ProjectId: "prj_1", + EnvironmentId: "env_1", + DeploymentId: "dep_1", + K8SNamespace: "test-ns", + K8SName: "dep-1", + Image: "nginx:1.19", + Replicas: 1, + CpuMillicores: 100, + MemoryMib: 128, + BuildId: ptr.P("build_1"), + }, + }, + }, + }, + }, + { + Sequence: 20, + Kind: &ctrlv1.State_Bookmark{ + Bookmark: &ctrlv1.Bookmark{ + Sequence: 20, + }, + }, + }, + } + + stream := newMockServerStream(messages) + + mockCluster := &MockClusterClient{ + SyncFunc: func(_ context.Context, _ *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { + // Return our mock stream wrapped as the expected interface + return (*connect.ServerStreamForClient[ctrlv1.State])(nil), nil + }, + } + + r := New(Config{ + ClientSet: client, + Logger: logging.NewNoop(), + Cluster: mockCluster, + Region: "test-region", + }) + + // Process messages directly to test HandleState integration + ctx := context.Background() + for stream.Receive() { + err := r.HandleState(ctx, stream.Msg()) + require.NoError(t, err) + } + + require.NotNil(t, rsCapture.Applied, "deployment should have been applied") + require.Equal(t, uint64(20), r.sequenceLastSeen, "sequence should be updated to bookmark value") +} + +// TestWatch_IncrementalUpdates verifies that HandleState correctly processes +// a sequence of incremental updates including applies and deletes. +// +// Scenario: Starting from sequence 100 (simulating reconnect after bootstrap), +// the reconciler receives: apply deployment (101), delete deployment (102), +// delete sentinel (103). +// +// Guarantees: +// - sequenceLastSeen is updated to 103 (the highest sequence) +// - Deployment delete triggers ReplicaSet deletion +// - Sentinel delete triggers Deployment deletion (sentinels run as k8s Deployments) +// +// This tests the watch loop after bootstrap: processing incremental changes +// as they happen in the control plane. +func TestWatch_IncrementalUpdates(t *testing.T) { + client := fake.NewSimpleClientset() + AddReplicaSetPatchReactor(client) + AddDeploymentPatchReactor(client) + AddServicePatchReactor(client) + deletes := AddDeleteTracker(client) + + messages := []*ctrlv1.State{ + { + Sequence: 101, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + WorkspaceId: "ws_1", + ProjectId: "prj_1", + EnvironmentId: "env_1", + DeploymentId: "dep_new", + K8SNamespace: "test-ns", + K8SName: "new-deployment", + Image: "myapp:v2", + Replicas: 3, + CpuMillicores: 500, + MemoryMib: 512, + }, + }, + }, + }, + }, + { + Sequence: 102, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: "test-ns", + K8SName: "old-deployment", + }, + }, + }, + }, + }, + { + Sequence: 103, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Delete{ + Delete: &ctrlv1.DeleteSentinel{ + K8SName: "old-sentinel", + }, + }, + }, + }, + }, + } + + mockCluster := &MockClusterClient{} + + r := New(Config{ + ClientSet: client, + Logger: logging.NewNoop(), + Cluster: mockCluster, + Region: "test-region", + }) + + // Start with sequence 100 (simulating reconnect after bootstrap) + r.sequenceLastSeen = 100 + + ctx := context.Background() + stream := newMockServerStream(messages) + for stream.Receive() { + err := r.HandleState(ctx, stream.Msg()) + require.NoError(t, err) + } + + require.Equal(t, uint64(103), r.sequenceLastSeen) + require.Contains(t, deletes.Actions, "replicasets", "deployment delete should be processed (deletes ReplicaSet)") + require.Contains(t, deletes.Actions, "deployments", "sentinel delete should be processed (deletes Deployment)") +} + +// ============================================================================= +// Configuration Tests +// ============================================================================= + +// TestWatch_RegionIsPersisted verifies that the region from Config is correctly +// stored in the reconciler. +// +// Guarantees: +// - New() correctly sets the region field from Config +// - The region is available for use in Sync requests +func TestWatch_RegionIsPersisted(t *testing.T) { + cfg := Config{ + ClientSet: fake.NewSimpleClientset(), + Logger: logging.NewNoop(), + Cluster: &MockClusterClient{}, + Region: "ap-southeast-1", + } + + r := New(cfg) + require.Equal(t, "ap-southeast-1", r.region) +} + +// ============================================================================= +// Error Handling Tests +// ============================================================================= + +// TestWatch_SyncConnectionError verifies that connection errors from Sync() +// are properly propagated back to the caller. +// +// Scenario: The control plane is unreachable (connection refused). +// +// Guarantees: +// - The error from Sync() is returned by watch() +// - The caller (Watch loop) can handle reconnection logic +// +// This tests the error path: what happens when the control plane is down. +// The Watch() outer loop will retry with exponential backoff. +func TestWatch_SyncConnectionError(t *testing.T) { + client := fake.NewSimpleClientset() + AddReplicaSetPatchReactor(client) + AddDeploymentPatchReactor(client) + AddServicePatchReactor(client) + AddDeleteTracker(client) + + expectedErr := errors.New("connection refused") + + mockCluster := &MockClusterClient{ + SyncFunc: func(_ context.Context, req *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { + return nil, expectedErr + }, + } + + r := New(Config{ + ClientSet: client, + Logger: logging.NewNoop(), + Cluster: mockCluster, + Region: "error-test-region", + }) + + ctx := context.Background() + err := r.watch(ctx) + + require.Error(t, err) + require.Equal(t, expectedErr, err) +} + +// ============================================================================= +// End-to-End Message Flow Tests +// ============================================================================= + +// TestWatch_FullMessageProcessingFlow verifies the complete message processing +// flow including multiple resource types and operations. +// +// Scenario: A full sync stream containing: +// - Apply deployment (seq=10) +// - Apply sentinel (seq=20) +// - Delete deployment (seq=30) +// - Bookmark (seq=40) +// +// Guarantees: +// - Deployment is applied to Kubernetes (ReplicaSet created with correct name) +// - Sentinel is applied (as a k8s Deployment - captured separately) +// - Deployment delete is processed (ReplicaSet deleted) +// - sequenceLastSeen ends at 40 (the bookmark value) +// +// This is a comprehensive integration test of HandleState covering all major +// message types in a realistic sequence. +func TestWatch_FullMessageProcessingFlow(t *testing.T) { + client := fake.NewSimpleClientset() + rsCapture := AddReplicaSetPatchReactor(client) + AddDeploymentPatchReactor(client) + AddServicePatchReactor(client) + deletes := AddDeleteTracker(client) + + r := New(Config{ + ClientSet: client, + Logger: logging.NewNoop(), + Cluster: &MockClusterClient{}, + Region: "full-flow-region", + }) + + ctx := context.Background() + + messages := []*ctrlv1.State{ + { + Sequence: 10, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + WorkspaceId: "ws_flow", + ProjectId: "prj_flow", + EnvironmentId: "env_flow", + DeploymentId: "dep_flow", + K8SNamespace: "flow-ns", + K8SName: "flow-deployment", + Image: "myapp:v1", + Replicas: 2, + CpuMillicores: 200, + MemoryMib: 256, + }, + }, + }, + }, + }, + { + Sequence: 20, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + K8SName: "flow-sentinel", + WorkspaceId: "ws_flow", + EnvironmentId: "env_flow", + ProjectId: "prj_flow", + SentinelId: "sen_flow", + Image: "sentinel:v1", + Replicas: 1, + CpuMillicores: 100, + MemoryMib: 128, + }, + }, + }, + }, + }, + { + Sequence: 30, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: "flow-ns", + K8SName: "old-deployment", + }, + }, + }, + }, + }, + { + Sequence: 40, + Kind: &ctrlv1.State_Bookmark{ + Bookmark: &ctrlv1.Bookmark{ + Sequence: 40, + }, + }, + }, + } + + for _, msg := range messages { + err := r.HandleState(ctx, msg) + require.NoError(t, err) + } + + require.NotNil(t, rsCapture.Applied, "deployment should have been applied") + require.Equal(t, "flow-deployment", rsCapture.Applied.Name) + + require.Contains(t, deletes.Actions, "replicasets", "deployment delete should have been processed") + + require.Equal(t, uint64(40), r.sequenceLastSeen, "sequence should be updated to bookmark value") +} diff --git a/svc/krane/pkg/controlplane/client.go b/svc/krane/pkg/controlplane/client.go index 9e8586f365..153bb7305f 100644 --- a/svc/krane/pkg/controlplane/client.go +++ b/svc/krane/pkg/controlplane/client.go @@ -22,10 +22,6 @@ type ClientConfig struct { // Region identifies the geographical region of this client instance. // This value is added as the X-Krane-Region header for proper request routing. Region string - - // ClusterID is the identifier of the cluster this client is associated with. - // This value is added as the X-Krane-Cluster-Id header for proper request routing. - ClusterID string } // NewClient creates a new control plane client with the specified configuration. @@ -46,6 +42,6 @@ func NewClient(cfg ClientConfig) ctrlv1connect.ClusterServiceClient { }, }, cfg.URL, - connect.WithInterceptors(connectInterceptor(cfg.Region, cfg.ClusterID, cfg.BearerToken)), + connect.WithInterceptors(connectInterceptor(cfg.Region, cfg.BearerToken)), ) } diff --git a/svc/krane/pkg/controlplane/interceptor.go b/svc/krane/pkg/controlplane/interceptor.go index e6cc004ce4..9a0738a471 100644 --- a/svc/krane/pkg/controlplane/interceptor.go +++ b/svc/krane/pkg/controlplane/interceptor.go @@ -14,14 +14,12 @@ import ( // The interceptor automatically injects: // - Authorization header with Bearer token // - X-Krane-Region header for routing -// - X-Krane-Cluster-Id header for routing // // This is the recommended way to create interceptors for control plane clients. -func connectInterceptor(region, clusterID, bearer string) connect.Interceptor { +func connectInterceptor(region, bearer string) connect.Interceptor { return &authInterceptor{ - region: region, - clusterID: clusterID, - bearer: bearer, + region: region, + bearer: bearer, } } @@ -30,9 +28,8 @@ func connectInterceptor(region, clusterID, bearer string) connect.Interceptor { // It automatically adds authentication and routing metadata to all outgoing requests. // The interceptor is stateless and safe for concurrent use. type authInterceptor struct { - region string - clusterID string - bearer string + region string + bearer string } // WrapUnary intercepts unary RPC calls by adding required headers before forwarding @@ -130,6 +127,5 @@ func (s *streamingClientInterceptor) RequestHeader() http.Header { // All headers use Set() to overwrite any existing values. func (i *authInterceptor) setHeaders(header http.Header) { header.Set("X-Krane-Region", i.region) - header.Set("X-Krane-Cluster-Id", i.clusterID) header.Set("Authorization", fmt.Sprintf("Bearer %s", i.bearer)) } diff --git a/svc/krane/run.go b/svc/krane/run.go index 90bff4a7ea..a1e1badf6f 100644 --- a/svc/krane/run.go +++ b/svc/krane/run.go @@ -67,7 +67,6 @@ func Run(ctx context.Context, cfg Config) error { URL: cfg.ControlPlaneURL, BearerToken: cfg.ControlPlaneBearer, Region: cfg.Region, - ClusterID: cfg.ClusterID, }) inClusterConfig, err := rest.InClusterConfig() @@ -84,7 +83,6 @@ func Run(ctx context.Context, cfg Config) error { ClientSet: clientset, Logger: logger, Cluster: cluster, - ClusterID: cfg.ClusterID, Region: cfg.Region, }) if err := r.Start(ctx); err != nil { diff --git a/web/internal/db/src/schema/instances.ts b/web/internal/db/src/schema/instances.ts index bc0bd1f6a2..4673c27c07 100644 --- a/web/internal/db/src/schema/instances.ts +++ b/web/internal/db/src/schema/instances.ts @@ -23,8 +23,6 @@ export const instances = mysqlTable( projectId: varchar("project_id", { length: 255 }).notNull(), region: varchar("region", { length: 64 }).notNull(), - // allows multiple clusters per region later - clusterId: varchar("cluster_id", { length: 64 }).notNull(), // used to apply updates from the kubernetes watch events k8sName: varchar("k8s_name", { length: 255 }).notNull(), @@ -35,8 +33,8 @@ export const instances = mysqlTable( status: mysqlEnum("status", ["inactive", "pending", "running", "failed"]).notNull(), }, (table) => [ - uniqueIndex("unique_address_per_cluster").on(table.address, table.clusterId), - uniqueIndex("unique_k8s_name_per_cluster").on(table.k8sName, table.clusterId), + uniqueIndex("unique_address_per_region").on(table.address, table.region), + uniqueIndex("unique_k8s_name_per_region").on(table.k8sName, table.region), index("idx_deployment_id").on(table.deploymentId), index("idx_region").on(table.region), ], diff --git a/web/internal/db/src/schema/state_changes.ts b/web/internal/db/src/schema/state_changes.ts index aee89c8cbe..04dae8d2d0 100644 --- a/web/internal/db/src/schema/state_changes.ts +++ b/web/internal/db/src/schema/state_changes.ts @@ -1,22 +1,40 @@ import { bigint, index, mysqlEnum, mysqlTable, varchar } from "drizzle-orm/mysql-core"; -import { longblob } from "./util/longblob"; +/** + * stateChanges is a lightweight changelog for Kubernetes-style List+Watch sync. + * + * When cluster resources (deployments, sentinels) are created, updated, or deleted, + * a row is inserted here. Cluster agents (krane) poll this table to receive incremental + * updates instead of re-fetching all resources on every sync. + * + * The sequence column provides a monotonically increasing watermark. Agents persist their + * last-seen sequence and resume from there on reconnect, avoiding full resyncs. + * + * For upserts, the actual resource config is fetched via existing queries. + * For deletes, the resource row (deployment/sentinel) is soft-deleted so the k8s + * identity can still be looked up. + * + * Retention: Rows older than 7 days are periodically deleted. Agents with a watermark + * behind the minimum retained sequence must perform a full resync. + */ export const stateChanges = mysqlTable( "state_changes", { sequence: bigint("sequence", { mode: "number", unsigned: true }).autoincrement().primaryKey(), - // The apply or delete protobuf blob - resourceType: mysqlEnum("resource_type", ["sentinel", "deployment"]).notNull(), - state: longblob("state").notNull(), + resourceId: varchar("resource_id", { length: 256 }).notNull(), + op: mysqlEnum("op", ["upsert", "delete"]).notNull(), - clusterId: varchar("cluster_id", { length: 256 }).notNull(), + region: varchar("region", { length: 64 }).notNull(), createdAt: bigint("created_at", { mode: "number", unsigned: true, }).notNull(), }, - (table) => [index("cluster_id_sequence").on(table.clusterId, table.sequence)], + (table) => [ + index("region_sequence").on(table.region, table.sequence), + index("created_at").on(table.createdAt), + ], ); From 12fd34ee636d7fb2caf35c13b84823914c3671ee Mon Sep 17 00:00:00 2001 From: chronark Date: Mon, 19 Jan 2026 14:10:04 +0100 Subject: [PATCH 03/32] chore: remove tmp files --- docs/rfcs/list-watch-sync.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 docs/rfcs/list-watch-sync.md diff --git a/docs/rfcs/list-watch-sync.md b/docs/rfcs/list-watch-sync.md deleted file mode 100644 index e69de29bb2..0000000000 From 02c1f7ad8ffe9823f5120fdd369fc505e0d82870 Mon Sep 17 00:00:00 2001 From: chronark Date: Mon, 19 Jan 2026 16:02:55 +0100 Subject: [PATCH 04/32] refactor: make it simpler --- AGENTS.md | 15 + gen/proto/ctrl/v1/cluster.pb.go | 208 ++++------- gen/proto/hydra/v1/hydrav1connect/BUILD.bazel | 16 + .../v1/hydrav1connect/certificate.connect.go | 148 ++++++++ .../v1/hydrav1connect/deployment.connect.go | 167 +++++++++ .../v1/hydrav1connect/routing.connect.go | 111 ++++++ pkg/db/BUILD.bazel | 1 - ...eployment_topology_insert.sql_generated.go | 4 +- .../bulk_environment_insert.sql_generated.go | 4 +- .../bulk_environment_upsert.sql_generated.go | 4 +- pkg/db/bulk_identity_insert.sql_generated.go | 4 +- ...identity_insert_ratelimit.sql_generated.go | 4 +- pkg/db/bulk_identity_upsert.sql_generated.go | 4 +- ...bulk_ingress_route_insert.sql_generated.go | 4 +- pkg/db/bulk_instance_upsert.sql_generated.go | 4 +- pkg/db/bulk_key_auth_insert.sql_generated.go | 4 +- ...ulk_key_encryption_insert.sql_generated.go | 4 +- pkg/db/bulk_key_insert.sql_generated.go | 4 +- ...bulk_key_insert_ratelimit.sql_generated.go | 4 +- ...bulk_key_migration_insert.sql_generated.go | 4 +- ...ulk_key_permission_insert.sql_generated.go | 4 +- pkg/db/bulk_key_role_insert.sql_generated.go | 4 +- pkg/db/bulk_key_space_insert.sql_generated.go | 4 +- pkg/db/bulk_key_space_upsert.sql_generated.go | 4 +- .../bulk_permission_insert.sql_generated.go | 4 +- pkg/db/bulk_project_insert.sql_generated.go | 4 +- pkg/db/bulk_quota_upsert.sql_generated.go | 4 +- ...atelimit_namespace_insert.sql_generated.go | 4 +- ...ratelimit_override_insert.sql_generated.go | 4 +- pkg/db/bulk_role_insert.sql_generated.go | 4 +- ...lk_role_permission_insert.sql_generated.go | 4 +- pkg/db/bulk_sentinel_insert.sql_generated.go | 4 +- .../bulk_state_change_insert.sql_generated.go | 4 +- pkg/db/bulk_workspace_insert.sql_generated.go | 4 +- pkg/db/bulk_workspace_upsert.sql_generated.go | 4 +- pkg/db/querier_generated.go | 7 - .../state_change_delete_old.sql_generated.go | 30 -- svc/ctrl/integration/sync_test.go | 56 +-- svc/ctrl/proto/ctrl/v1/cluster.proto | 7 - svc/ctrl/services/cluster/BUILD.bazel | 4 + svc/ctrl/services/cluster/doc.go | 34 ++ svc/ctrl/services/cluster/rpc_sync.go | 344 ++---------------- svc/ctrl/services/cluster/service.go | 10 +- svc/ctrl/services/cluster/sync_bootstrap.go | 91 +++++ svc/ctrl/services/cluster/sync_changes.go | 90 +++++ svc/ctrl/services/cluster/sync_messages.go | 118 ++++++ svc/krane/internal/reconciler/reconciler.go | 3 - .../reconciler/sequence_tracking_test.go | 28 -- svc/krane/internal/reconciler/watcher_test.go | 46 +-- .../docs/architecture/services/ctrl/index.mdx | 16 +- .../services/ctrl/pull-based-infra.mdx | 133 ++++--- .../architecture/services/krane/index.mdx | 67 ++-- .../services/krane/sync-engine.mdx | 98 ++++- 53 files changed, 1166 insertions(+), 794 deletions(-) create mode 100644 gen/proto/hydra/v1/hydrav1connect/BUILD.bazel create mode 100644 gen/proto/hydra/v1/hydrav1connect/certificate.connect.go create mode 100644 gen/proto/hydra/v1/hydrav1connect/deployment.connect.go create mode 100644 gen/proto/hydra/v1/hydrav1connect/routing.connect.go delete mode 100644 pkg/db/state_change_delete_old.sql_generated.go create mode 100644 svc/ctrl/services/cluster/doc.go create mode 100644 svc/ctrl/services/cluster/sync_bootstrap.go create mode 100644 svc/ctrl/services/cluster/sync_changes.go create mode 100644 svc/ctrl/services/cluster/sync_messages.go diff --git a/AGENTS.md b/AGENTS.md index db1351e9cf..f8eb43ae53 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -199,6 +199,21 @@ expect(res.status).toBe(200); **TypeScript** (`web/biome.json`): Enforces no unused variables/imports, strict equality, proper React hooks usage, and consistent code style. +## Detailed Guidelines + +For comprehensive guidance, read these internal docs in `web/apps/engineering/content/docs/contributing/`: + +- **Code Style** (`code-style.mdx`): Design philosophy (safety > performance > DX), zero technical debt policy, assertions, error handling with `fault`, scope minimization, failure handling (circuit breakers, retry with backoff, idempotency) +- **Documentation** (`documentation.mdx`): Document the "why" not the "what", use prose over bullets, match depth to complexity, verify behavior before documenting +- **Testing** (`testing/`): + - `index.mdx` - What to test, test organization, resource cleanup + - `unit-tests.mdx` - Table-driven tests, naming, parallel execution, test clocks + - `integration-tests.mdx` - Docker containers, test harness, real dependencies + - `http-handler-tests.mdx` - API endpoint testing patterns + - `fuzz-tests.mdx` - Randomized input testing for parsers/validators + - `simulation-tests.mdx` - Property-based testing for stateful systems + - `anti-patterns.mdx` - Common mistakes (sleeping, over-mocking, shared state) + ## Important Notes - Always run `make bazel` after adding new Go files diff --git a/gen/proto/ctrl/v1/cluster.pb.go b/gen/proto/ctrl/v1/cluster.pb.go index 397556c871..65cfb526cf 100644 --- a/gen/proto/ctrl/v1/cluster.pb.go +++ b/gen/proto/ctrl/v1/cluster.pb.go @@ -465,52 +465,6 @@ func (x *SyncRequest) GetSequenceLastSeen() uint64 { return 0 } -// Bookmark is sent after bootstrap completes to signal the client is caught up. -// The client should persist this sequence to resume watch on reconnect. -type Bookmark struct { - state protoimpl.MessageState `protogen:"open.v1"` - Sequence uint64 `protobuf:"varint,1,opt,name=sequence,proto3" json:"sequence,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *Bookmark) Reset() { - *x = Bookmark{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[8] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *Bookmark) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*Bookmark) ProtoMessage() {} - -func (x *Bookmark) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[8] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use Bookmark.ProtoReflect.Descriptor instead. -func (*Bookmark) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{8} -} - -func (x *Bookmark) GetSequence() uint64 { - if x != nil { - return x.Sequence - } - return 0 -} - type State struct { state protoimpl.MessageState `protogen:"open.v1"` // sequence is the state_changes sequence number for this event. @@ -521,7 +475,6 @@ type State struct { // // *State_Deployment // *State_Sentinel - // *State_Bookmark Kind isState_Kind `protobuf_oneof:"kind"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache @@ -529,7 +482,7 @@ type State struct { func (x *State) Reset() { *x = State{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[9] + mi := &file_ctrl_v1_cluster_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -541,7 +494,7 @@ func (x *State) String() string { func (*State) ProtoMessage() {} func (x *State) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[9] + mi := &file_ctrl_v1_cluster_proto_msgTypes[8] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -554,7 +507,7 @@ func (x *State) ProtoReflect() protoreflect.Message { // Deprecated: Use State.ProtoReflect.Descriptor instead. func (*State) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{9} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{8} } func (x *State) GetSequence() uint64 { @@ -589,15 +542,6 @@ func (x *State) GetSentinel() *SentinelState { return nil } -func (x *State) GetBookmark() *Bookmark { - if x != nil { - if x, ok := x.Kind.(*State_Bookmark); ok { - return x.Bookmark - } - } - return nil -} - type isState_Kind interface { isState_Kind() } @@ -610,16 +554,10 @@ type State_Sentinel struct { Sentinel *SentinelState `protobuf:"bytes,3,opt,name=sentinel,proto3,oneof"` } -type State_Bookmark struct { - Bookmark *Bookmark `protobuf:"bytes,4,opt,name=bookmark,proto3,oneof"` -} - func (*State_Deployment) isState_Kind() {} func (*State_Sentinel) isState_Kind() {} -func (*State_Bookmark) isState_Kind() {} - // SentinelState represents a lifecycle event for an API sentinel configuration. // // Sentinels are frontline points for services, typically handling routing, load balancing, @@ -641,7 +579,7 @@ type SentinelState struct { func (x *SentinelState) Reset() { *x = SentinelState{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[10] + mi := &file_ctrl_v1_cluster_proto_msgTypes[9] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -653,7 +591,7 @@ func (x *SentinelState) String() string { func (*SentinelState) ProtoMessage() {} func (x *SentinelState) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[10] + mi := &file_ctrl_v1_cluster_proto_msgTypes[9] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -666,7 +604,7 @@ func (x *SentinelState) ProtoReflect() protoreflect.Message { // Deprecated: Use SentinelState.ProtoReflect.Descriptor instead. func (*SentinelState) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{10} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{9} } func (x *SentinelState) GetState() isSentinelState_State { @@ -736,7 +674,7 @@ type DeploymentState struct { func (x *DeploymentState) Reset() { *x = DeploymentState{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[11] + mi := &file_ctrl_v1_cluster_proto_msgTypes[10] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -748,7 +686,7 @@ func (x *DeploymentState) String() string { func (*DeploymentState) ProtoMessage() {} func (x *DeploymentState) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[11] + mi := &file_ctrl_v1_cluster_proto_msgTypes[10] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -761,7 +699,7 @@ func (x *DeploymentState) ProtoReflect() protoreflect.Message { // Deprecated: Use DeploymentState.ProtoReflect.Descriptor instead. func (*DeploymentState) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{11} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{10} } func (x *DeploymentState) GetState() isDeploymentState_State { @@ -836,7 +774,7 @@ type ApplySentinel struct { func (x *ApplySentinel) Reset() { *x = ApplySentinel{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[12] + mi := &file_ctrl_v1_cluster_proto_msgTypes[11] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -848,7 +786,7 @@ func (x *ApplySentinel) String() string { func (*ApplySentinel) ProtoMessage() {} func (x *ApplySentinel) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[12] + mi := &file_ctrl_v1_cluster_proto_msgTypes[11] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -861,7 +799,7 @@ func (x *ApplySentinel) ProtoReflect() protoreflect.Message { // Deprecated: Use ApplySentinel.ProtoReflect.Descriptor instead. func (*ApplySentinel) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{12} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{11} } func (x *ApplySentinel) GetK8SName() string { @@ -940,7 +878,7 @@ type DeleteSentinel struct { func (x *DeleteSentinel) Reset() { *x = DeleteSentinel{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[13] + mi := &file_ctrl_v1_cluster_proto_msgTypes[12] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -952,7 +890,7 @@ func (x *DeleteSentinel) String() string { func (*DeleteSentinel) ProtoMessage() {} func (x *DeleteSentinel) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[13] + mi := &file_ctrl_v1_cluster_proto_msgTypes[12] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -965,7 +903,7 @@ func (x *DeleteSentinel) ProtoReflect() protoreflect.Message { // Deprecated: Use DeleteSentinel.ProtoReflect.Descriptor instead. func (*DeleteSentinel) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{13} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{12} } func (x *DeleteSentinel) GetK8SName() string { @@ -1019,7 +957,7 @@ type ApplyDeployment struct { func (x *ApplyDeployment) Reset() { *x = ApplyDeployment{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[14] + mi := &file_ctrl_v1_cluster_proto_msgTypes[13] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1031,7 +969,7 @@ func (x *ApplyDeployment) String() string { func (*ApplyDeployment) ProtoMessage() {} func (x *ApplyDeployment) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[14] + mi := &file_ctrl_v1_cluster_proto_msgTypes[13] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1044,7 +982,7 @@ func (x *ApplyDeployment) ProtoReflect() protoreflect.Message { // Deprecated: Use ApplyDeployment.ProtoReflect.Descriptor instead. func (*ApplyDeployment) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{14} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{13} } func (x *ApplyDeployment) GetK8SNamespace() string { @@ -1146,7 +1084,7 @@ type DeleteDeployment struct { func (x *DeleteDeployment) Reset() { *x = DeleteDeployment{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[15] + mi := &file_ctrl_v1_cluster_proto_msgTypes[14] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1158,7 +1096,7 @@ func (x *DeleteDeployment) String() string { func (*DeleteDeployment) ProtoMessage() {} func (x *DeleteDeployment) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[15] + mi := &file_ctrl_v1_cluster_proto_msgTypes[14] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1171,7 +1109,7 @@ func (x *DeleteDeployment) ProtoReflect() protoreflect.Message { // Deprecated: Use DeleteDeployment.ProtoReflect.Descriptor instead. func (*DeleteDeployment) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{15} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{14} } func (x *DeleteDeployment) GetK8SNamespace() string { @@ -1198,7 +1136,7 @@ type UpdateDeploymentStateRequest_Update struct { func (x *UpdateDeploymentStateRequest_Update) Reset() { *x = UpdateDeploymentStateRequest_Update{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[16] + mi := &file_ctrl_v1_cluster_proto_msgTypes[15] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1210,7 +1148,7 @@ func (x *UpdateDeploymentStateRequest_Update) String() string { func (*UpdateDeploymentStateRequest_Update) ProtoMessage() {} func (x *UpdateDeploymentStateRequest_Update) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[16] + mi := &file_ctrl_v1_cluster_proto_msgTypes[15] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1249,7 +1187,7 @@ type UpdateDeploymentStateRequest_Delete struct { func (x *UpdateDeploymentStateRequest_Delete) Reset() { *x = UpdateDeploymentStateRequest_Delete{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[17] + mi := &file_ctrl_v1_cluster_proto_msgTypes[16] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1261,7 +1199,7 @@ func (x *UpdateDeploymentStateRequest_Delete) String() string { func (*UpdateDeploymentStateRequest_Delete) ProtoMessage() {} func (x *UpdateDeploymentStateRequest_Delete) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[17] + mi := &file_ctrl_v1_cluster_proto_msgTypes[16] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1297,7 +1235,7 @@ type UpdateDeploymentStateRequest_Update_Instance struct { func (x *UpdateDeploymentStateRequest_Update_Instance) Reset() { *x = UpdateDeploymentStateRequest_Update_Instance{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[18] + mi := &file_ctrl_v1_cluster_proto_msgTypes[17] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1309,7 +1247,7 @@ func (x *UpdateDeploymentStateRequest_Update_Instance) String() string { func (*UpdateDeploymentStateRequest_Update_Instance) ProtoMessage() {} func (x *UpdateDeploymentStateRequest_Update_Instance) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[18] + mi := &file_ctrl_v1_cluster_proto_msgTypes[17] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1399,16 +1337,13 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "\x1bUpdateSentinelStateResponse\"S\n" + "\vSyncRequest\x12\x16\n" + "\x06region\x18\x01 \x01(\tR\x06region\x12,\n" + - "\x12sequence_last_seen\x18\x02 \x01(\x04R\x10sequenceLastSeen\"&\n" + - "\bBookmark\x12\x1a\n" + - "\bsequence\x18\x01 \x01(\x04R\bsequence\"\xce\x01\n" + + "\x12sequence_last_seen\x18\x02 \x01(\x04R\x10sequenceLastSeen\"\x9d\x01\n" + "\x05State\x12\x1a\n" + "\bsequence\x18\x01 \x01(\x04R\bsequence\x12:\n" + "\n" + "deployment\x18\x02 \x01(\v2\x18.ctrl.v1.DeploymentStateH\x00R\n" + "deployment\x124\n" + - "\bsentinel\x18\x03 \x01(\v2\x16.ctrl.v1.SentinelStateH\x00R\bsentinel\x12/\n" + - "\bbookmark\x18\x04 \x01(\v2\x11.ctrl.v1.BookmarkH\x00R\bbookmarkB\x06\n" + + "\bsentinel\x18\x03 \x01(\v2\x16.ctrl.v1.SentinelStateH\x00R\bsentinelB\x06\n" + "\x04kind\"{\n" + "\rSentinelState\x12.\n" + "\x05apply\x18\x01 \x01(\v2\x16.ctrl.v1.ApplySentinelH\x00R\x05apply\x121\n" + @@ -1474,7 +1409,7 @@ func file_ctrl_v1_cluster_proto_rawDescGZIP() []byte { } var file_ctrl_v1_cluster_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_ctrl_v1_cluster_proto_msgTypes = make([]protoimpl.MessageInfo, 19) +var file_ctrl_v1_cluster_proto_msgTypes = make([]protoimpl.MessageInfo, 18) var file_ctrl_v1_cluster_proto_goTypes = []any{ (UpdateDeploymentStateRequest_Update_Instance_Status)(0), // 0: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.Status (*GetDesiredSentinelStateRequest)(nil), // 1: ctrl.v1.GetDesiredSentinelStateRequest @@ -1485,45 +1420,43 @@ var file_ctrl_v1_cluster_proto_goTypes = []any{ (*UpdateSentinelStateRequest)(nil), // 6: ctrl.v1.UpdateSentinelStateRequest (*UpdateSentinelStateResponse)(nil), // 7: ctrl.v1.UpdateSentinelStateResponse (*SyncRequest)(nil), // 8: ctrl.v1.SyncRequest - (*Bookmark)(nil), // 9: ctrl.v1.Bookmark - (*State)(nil), // 10: ctrl.v1.State - (*SentinelState)(nil), // 11: ctrl.v1.SentinelState - (*DeploymentState)(nil), // 12: ctrl.v1.DeploymentState - (*ApplySentinel)(nil), // 13: ctrl.v1.ApplySentinel - (*DeleteSentinel)(nil), // 14: ctrl.v1.DeleteSentinel - (*ApplyDeployment)(nil), // 15: ctrl.v1.ApplyDeployment - (*DeleteDeployment)(nil), // 16: ctrl.v1.DeleteDeployment - (*UpdateDeploymentStateRequest_Update)(nil), // 17: ctrl.v1.UpdateDeploymentStateRequest.Update - (*UpdateDeploymentStateRequest_Delete)(nil), // 18: ctrl.v1.UpdateDeploymentStateRequest.Delete - (*UpdateDeploymentStateRequest_Update_Instance)(nil), // 19: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance + (*State)(nil), // 9: ctrl.v1.State + (*SentinelState)(nil), // 10: ctrl.v1.SentinelState + (*DeploymentState)(nil), // 11: ctrl.v1.DeploymentState + (*ApplySentinel)(nil), // 12: ctrl.v1.ApplySentinel + (*DeleteSentinel)(nil), // 13: ctrl.v1.DeleteSentinel + (*ApplyDeployment)(nil), // 14: ctrl.v1.ApplyDeployment + (*DeleteDeployment)(nil), // 15: ctrl.v1.DeleteDeployment + (*UpdateDeploymentStateRequest_Update)(nil), // 16: ctrl.v1.UpdateDeploymentStateRequest.Update + (*UpdateDeploymentStateRequest_Delete)(nil), // 17: ctrl.v1.UpdateDeploymentStateRequest.Delete + (*UpdateDeploymentStateRequest_Update_Instance)(nil), // 18: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance } var file_ctrl_v1_cluster_proto_depIdxs = []int32{ - 17, // 0: ctrl.v1.UpdateDeploymentStateRequest.update:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update - 18, // 1: ctrl.v1.UpdateDeploymentStateRequest.delete:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Delete - 12, // 2: ctrl.v1.State.deployment:type_name -> ctrl.v1.DeploymentState - 11, // 3: ctrl.v1.State.sentinel:type_name -> ctrl.v1.SentinelState - 9, // 4: ctrl.v1.State.bookmark:type_name -> ctrl.v1.Bookmark - 13, // 5: ctrl.v1.SentinelState.apply:type_name -> ctrl.v1.ApplySentinel - 14, // 6: ctrl.v1.SentinelState.delete:type_name -> ctrl.v1.DeleteSentinel - 15, // 7: ctrl.v1.DeploymentState.apply:type_name -> ctrl.v1.ApplyDeployment - 16, // 8: ctrl.v1.DeploymentState.delete:type_name -> ctrl.v1.DeleteDeployment - 19, // 9: ctrl.v1.UpdateDeploymentStateRequest.Update.instances:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance - 0, // 10: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.status:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.Status - 8, // 11: ctrl.v1.ClusterService.Sync:input_type -> ctrl.v1.SyncRequest - 1, // 12: ctrl.v1.ClusterService.GetDesiredSentinelState:input_type -> ctrl.v1.GetDesiredSentinelStateRequest - 6, // 13: ctrl.v1.ClusterService.UpdateSentinelState:input_type -> ctrl.v1.UpdateSentinelStateRequest - 2, // 14: ctrl.v1.ClusterService.GetDesiredDeploymentState:input_type -> ctrl.v1.GetDesiredDeploymentStateRequest - 3, // 15: ctrl.v1.ClusterService.UpdateDeploymentState:input_type -> ctrl.v1.UpdateDeploymentStateRequest - 10, // 16: ctrl.v1.ClusterService.Sync:output_type -> ctrl.v1.State - 11, // 17: ctrl.v1.ClusterService.GetDesiredSentinelState:output_type -> ctrl.v1.SentinelState - 7, // 18: ctrl.v1.ClusterService.UpdateSentinelState:output_type -> ctrl.v1.UpdateSentinelStateResponse - 12, // 19: ctrl.v1.ClusterService.GetDesiredDeploymentState:output_type -> ctrl.v1.DeploymentState - 4, // 20: ctrl.v1.ClusterService.UpdateDeploymentState:output_type -> ctrl.v1.UpdateDeploymentStateResponse - 16, // [16:21] is the sub-list for method output_type - 11, // [11:16] is the sub-list for method input_type - 11, // [11:11] is the sub-list for extension type_name - 11, // [11:11] is the sub-list for extension extendee - 0, // [0:11] is the sub-list for field type_name + 16, // 0: ctrl.v1.UpdateDeploymentStateRequest.update:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update + 17, // 1: ctrl.v1.UpdateDeploymentStateRequest.delete:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Delete + 11, // 2: ctrl.v1.State.deployment:type_name -> ctrl.v1.DeploymentState + 10, // 3: ctrl.v1.State.sentinel:type_name -> ctrl.v1.SentinelState + 12, // 4: ctrl.v1.SentinelState.apply:type_name -> ctrl.v1.ApplySentinel + 13, // 5: ctrl.v1.SentinelState.delete:type_name -> ctrl.v1.DeleteSentinel + 14, // 6: ctrl.v1.DeploymentState.apply:type_name -> ctrl.v1.ApplyDeployment + 15, // 7: ctrl.v1.DeploymentState.delete:type_name -> ctrl.v1.DeleteDeployment + 18, // 8: ctrl.v1.UpdateDeploymentStateRequest.Update.instances:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance + 0, // 9: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.status:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.Status + 8, // 10: ctrl.v1.ClusterService.Sync:input_type -> ctrl.v1.SyncRequest + 1, // 11: ctrl.v1.ClusterService.GetDesiredSentinelState:input_type -> ctrl.v1.GetDesiredSentinelStateRequest + 6, // 12: ctrl.v1.ClusterService.UpdateSentinelState:input_type -> ctrl.v1.UpdateSentinelStateRequest + 2, // 13: ctrl.v1.ClusterService.GetDesiredDeploymentState:input_type -> ctrl.v1.GetDesiredDeploymentStateRequest + 3, // 14: ctrl.v1.ClusterService.UpdateDeploymentState:input_type -> ctrl.v1.UpdateDeploymentStateRequest + 9, // 15: ctrl.v1.ClusterService.Sync:output_type -> ctrl.v1.State + 10, // 16: ctrl.v1.ClusterService.GetDesiredSentinelState:output_type -> ctrl.v1.SentinelState + 7, // 17: ctrl.v1.ClusterService.UpdateSentinelState:output_type -> ctrl.v1.UpdateSentinelStateResponse + 11, // 18: ctrl.v1.ClusterService.GetDesiredDeploymentState:output_type -> ctrl.v1.DeploymentState + 4, // 19: ctrl.v1.ClusterService.UpdateDeploymentState:output_type -> ctrl.v1.UpdateDeploymentStateResponse + 15, // [15:20] is the sub-list for method output_type + 10, // [10:15] is the sub-list for method input_type + 10, // [10:10] is the sub-list for extension type_name + 10, // [10:10] is the sub-list for extension extendee + 0, // [0:10] is the sub-list for field type_name } func init() { file_ctrl_v1_cluster_proto_init() } @@ -1535,27 +1468,26 @@ func file_ctrl_v1_cluster_proto_init() { (*UpdateDeploymentStateRequest_Update_)(nil), (*UpdateDeploymentStateRequest_Delete_)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[9].OneofWrappers = []any{ + file_ctrl_v1_cluster_proto_msgTypes[8].OneofWrappers = []any{ (*State_Deployment)(nil), (*State_Sentinel)(nil), - (*State_Bookmark)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[10].OneofWrappers = []any{ + file_ctrl_v1_cluster_proto_msgTypes[9].OneofWrappers = []any{ (*SentinelState_Apply)(nil), (*SentinelState_Delete)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[11].OneofWrappers = []any{ + file_ctrl_v1_cluster_proto_msgTypes[10].OneofWrappers = []any{ (*DeploymentState_Apply)(nil), (*DeploymentState_Delete)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[14].OneofWrappers = []any{} + file_ctrl_v1_cluster_proto_msgTypes[13].OneofWrappers = []any{} type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_ctrl_v1_cluster_proto_rawDesc), len(file_ctrl_v1_cluster_proto_rawDesc)), NumEnums: 1, - NumMessages: 19, + NumMessages: 18, NumExtensions: 0, NumServices: 1, }, diff --git a/gen/proto/hydra/v1/hydrav1connect/BUILD.bazel b/gen/proto/hydra/v1/hydrav1connect/BUILD.bazel new file mode 100644 index 0000000000..5dc3dd1c97 --- /dev/null +++ b/gen/proto/hydra/v1/hydrav1connect/BUILD.bazel @@ -0,0 +1,16 @@ +load("@rules_go//go:def.bzl", "go_library") + +go_library( + name = "hydrav1connect", + srcs = [ + "certificate.connect.go", + "deployment.connect.go", + "routing.connect.go", + ], + importpath = "github.com/unkeyed/unkey/gen/proto/hydra/v1/hydrav1connect", + visibility = ["//visibility:public"], + deps = [ + "//gen/proto/hydra/v1:hydra", + "@com_connectrpc_connect//:connect", + ], +) diff --git a/gen/proto/hydra/v1/hydrav1connect/certificate.connect.go b/gen/proto/hydra/v1/hydrav1connect/certificate.connect.go new file mode 100644 index 0000000000..c20810312f --- /dev/null +++ b/gen/proto/hydra/v1/hydrav1connect/certificate.connect.go @@ -0,0 +1,148 @@ +// Code generated by protoc-gen-connect-go. DO NOT EDIT. +// +// Source: hydra/v1/certificate.proto + +package hydrav1connect + +import ( + connect "connectrpc.com/connect" + context "context" + errors "errors" + v1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" + http "net/http" + strings "strings" +) + +// This is a compile-time assertion to ensure that this generated file and the connect package are +// compatible. If you get a compiler error that this constant is not defined, this code was +// generated with a version of connect newer than the one compiled into your binary. You can fix the +// problem by either regenerating this code with an older version of connect or updating the connect +// version compiled into your binary. +const _ = connect.IsAtLeastVersion1_13_0 + +const ( + // CertificateServiceName is the fully-qualified name of the CertificateService service. + CertificateServiceName = "hydra.v1.CertificateService" +) + +// These constants are the fully-qualified names of the RPCs defined in this package. They're +// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. +// +// Note that these are different from the fully-qualified method names used by +// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to +// reflection-formatted method names, remove the leading slash and convert the remaining slash to a +// period. +const ( + // CertificateServiceProcessChallengeProcedure is the fully-qualified name of the + // CertificateService's ProcessChallenge RPC. + CertificateServiceProcessChallengeProcedure = "/hydra.v1.CertificateService/ProcessChallenge" + // CertificateServiceRenewExpiringCertificatesProcedure is the fully-qualified name of the + // CertificateService's RenewExpiringCertificates RPC. + CertificateServiceRenewExpiringCertificatesProcedure = "/hydra.v1.CertificateService/RenewExpiringCertificates" +) + +// CertificateServiceClient is a client for the hydra.v1.CertificateService service. +type CertificateServiceClient interface { + // ProcessChallenge handles the complete ACME certificate challenge flow + // Key: domain name (ensures only one challenge per domain at a time) + ProcessChallenge(context.Context, *connect.Request[v1.ProcessChallengeRequest]) (*connect.Response[v1.ProcessChallengeResponse], error) + // RenewExpiringCertificates checks for certificates expiring soon and renews them. + // This should be called periodically (e.g., daily via cron). + // Key: "global" (single instance ensures no duplicate renewal runs) + RenewExpiringCertificates(context.Context, *connect.Request[v1.RenewExpiringCertificatesRequest]) (*connect.Response[v1.RenewExpiringCertificatesResponse], error) +} + +// NewCertificateServiceClient constructs a client for the hydra.v1.CertificateService service. By +// default, it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, +// and sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the +// connect.WithGRPC() or connect.WithGRPCWeb() options. +// +// The URL supplied here should be the base URL for the Connect or gRPC server (for example, +// http://api.acme.com or https://acme.com/grpc). +func NewCertificateServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) CertificateServiceClient { + baseURL = strings.TrimRight(baseURL, "/") + certificateServiceMethods := v1.File_hydra_v1_certificate_proto.Services().ByName("CertificateService").Methods() + return &certificateServiceClient{ + processChallenge: connect.NewClient[v1.ProcessChallengeRequest, v1.ProcessChallengeResponse]( + httpClient, + baseURL+CertificateServiceProcessChallengeProcedure, + connect.WithSchema(certificateServiceMethods.ByName("ProcessChallenge")), + connect.WithClientOptions(opts...), + ), + renewExpiringCertificates: connect.NewClient[v1.RenewExpiringCertificatesRequest, v1.RenewExpiringCertificatesResponse]( + httpClient, + baseURL+CertificateServiceRenewExpiringCertificatesProcedure, + connect.WithSchema(certificateServiceMethods.ByName("RenewExpiringCertificates")), + connect.WithClientOptions(opts...), + ), + } +} + +// certificateServiceClient implements CertificateServiceClient. +type certificateServiceClient struct { + processChallenge *connect.Client[v1.ProcessChallengeRequest, v1.ProcessChallengeResponse] + renewExpiringCertificates *connect.Client[v1.RenewExpiringCertificatesRequest, v1.RenewExpiringCertificatesResponse] +} + +// ProcessChallenge calls hydra.v1.CertificateService.ProcessChallenge. +func (c *certificateServiceClient) ProcessChallenge(ctx context.Context, req *connect.Request[v1.ProcessChallengeRequest]) (*connect.Response[v1.ProcessChallengeResponse], error) { + return c.processChallenge.CallUnary(ctx, req) +} + +// RenewExpiringCertificates calls hydra.v1.CertificateService.RenewExpiringCertificates. +func (c *certificateServiceClient) RenewExpiringCertificates(ctx context.Context, req *connect.Request[v1.RenewExpiringCertificatesRequest]) (*connect.Response[v1.RenewExpiringCertificatesResponse], error) { + return c.renewExpiringCertificates.CallUnary(ctx, req) +} + +// CertificateServiceHandler is an implementation of the hydra.v1.CertificateService service. +type CertificateServiceHandler interface { + // ProcessChallenge handles the complete ACME certificate challenge flow + // Key: domain name (ensures only one challenge per domain at a time) + ProcessChallenge(context.Context, *connect.Request[v1.ProcessChallengeRequest]) (*connect.Response[v1.ProcessChallengeResponse], error) + // RenewExpiringCertificates checks for certificates expiring soon and renews them. + // This should be called periodically (e.g., daily via cron). + // Key: "global" (single instance ensures no duplicate renewal runs) + RenewExpiringCertificates(context.Context, *connect.Request[v1.RenewExpiringCertificatesRequest]) (*connect.Response[v1.RenewExpiringCertificatesResponse], error) +} + +// NewCertificateServiceHandler builds an HTTP handler from the service implementation. It returns +// the path on which to mount the handler and the handler itself. +// +// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf +// and JSON codecs. They also support gzip compression. +func NewCertificateServiceHandler(svc CertificateServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { + certificateServiceMethods := v1.File_hydra_v1_certificate_proto.Services().ByName("CertificateService").Methods() + certificateServiceProcessChallengeHandler := connect.NewUnaryHandler( + CertificateServiceProcessChallengeProcedure, + svc.ProcessChallenge, + connect.WithSchema(certificateServiceMethods.ByName("ProcessChallenge")), + connect.WithHandlerOptions(opts...), + ) + certificateServiceRenewExpiringCertificatesHandler := connect.NewUnaryHandler( + CertificateServiceRenewExpiringCertificatesProcedure, + svc.RenewExpiringCertificates, + connect.WithSchema(certificateServiceMethods.ByName("RenewExpiringCertificates")), + connect.WithHandlerOptions(opts...), + ) + return "/hydra.v1.CertificateService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case CertificateServiceProcessChallengeProcedure: + certificateServiceProcessChallengeHandler.ServeHTTP(w, r) + case CertificateServiceRenewExpiringCertificatesProcedure: + certificateServiceRenewExpiringCertificatesHandler.ServeHTTP(w, r) + default: + http.NotFound(w, r) + } + }) +} + +// UnimplementedCertificateServiceHandler returns CodeUnimplemented from all methods. +type UnimplementedCertificateServiceHandler struct{} + +func (UnimplementedCertificateServiceHandler) ProcessChallenge(context.Context, *connect.Request[v1.ProcessChallengeRequest]) (*connect.Response[v1.ProcessChallengeResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.CertificateService.ProcessChallenge is not implemented")) +} + +func (UnimplementedCertificateServiceHandler) RenewExpiringCertificates(context.Context, *connect.Request[v1.RenewExpiringCertificatesRequest]) (*connect.Response[v1.RenewExpiringCertificatesResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.CertificateService.RenewExpiringCertificates is not implemented")) +} diff --git a/gen/proto/hydra/v1/hydrav1connect/deployment.connect.go b/gen/proto/hydra/v1/hydrav1connect/deployment.connect.go new file mode 100644 index 0000000000..c5c358d5e7 --- /dev/null +++ b/gen/proto/hydra/v1/hydrav1connect/deployment.connect.go @@ -0,0 +1,167 @@ +// Code generated by protoc-gen-connect-go. DO NOT EDIT. +// +// Source: hydra/v1/deployment.proto + +package hydrav1connect + +import ( + connect "connectrpc.com/connect" + context "context" + errors "errors" + v1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" + http "net/http" + strings "strings" +) + +// This is a compile-time assertion to ensure that this generated file and the connect package are +// compatible. If you get a compiler error that this constant is not defined, this code was +// generated with a version of connect newer than the one compiled into your binary. You can fix the +// problem by either regenerating this code with an older version of connect or updating the connect +// version compiled into your binary. +const _ = connect.IsAtLeastVersion1_13_0 + +const ( + // DeploymentServiceName is the fully-qualified name of the DeploymentService service. + DeploymentServiceName = "hydra.v1.DeploymentService" +) + +// These constants are the fully-qualified names of the RPCs defined in this package. They're +// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. +// +// Note that these are different from the fully-qualified method names used by +// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to +// reflection-formatted method names, remove the leading slash and convert the remaining slash to a +// period. +const ( + // DeploymentServiceDeployProcedure is the fully-qualified name of the DeploymentService's Deploy + // RPC. + DeploymentServiceDeployProcedure = "/hydra.v1.DeploymentService/Deploy" + // DeploymentServiceRollbackProcedure is the fully-qualified name of the DeploymentService's + // Rollback RPC. + DeploymentServiceRollbackProcedure = "/hydra.v1.DeploymentService/Rollback" + // DeploymentServicePromoteProcedure is the fully-qualified name of the DeploymentService's Promote + // RPC. + DeploymentServicePromoteProcedure = "/hydra.v1.DeploymentService/Promote" +) + +// DeploymentServiceClient is a client for the hydra.v1.DeploymentService service. +type DeploymentServiceClient interface { + Deploy(context.Context, *connect.Request[v1.DeployRequest]) (*connect.Response[v1.DeployResponse], error) + Rollback(context.Context, *connect.Request[v1.RollbackRequest]) (*connect.Response[v1.RollbackResponse], error) + Promote(context.Context, *connect.Request[v1.PromoteRequest]) (*connect.Response[v1.PromoteResponse], error) +} + +// NewDeploymentServiceClient constructs a client for the hydra.v1.DeploymentService service. By +// default, it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, +// and sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the +// connect.WithGRPC() or connect.WithGRPCWeb() options. +// +// The URL supplied here should be the base URL for the Connect or gRPC server (for example, +// http://api.acme.com or https://acme.com/grpc). +func NewDeploymentServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) DeploymentServiceClient { + baseURL = strings.TrimRight(baseURL, "/") + deploymentServiceMethods := v1.File_hydra_v1_deployment_proto.Services().ByName("DeploymentService").Methods() + return &deploymentServiceClient{ + deploy: connect.NewClient[v1.DeployRequest, v1.DeployResponse]( + httpClient, + baseURL+DeploymentServiceDeployProcedure, + connect.WithSchema(deploymentServiceMethods.ByName("Deploy")), + connect.WithClientOptions(opts...), + ), + rollback: connect.NewClient[v1.RollbackRequest, v1.RollbackResponse]( + httpClient, + baseURL+DeploymentServiceRollbackProcedure, + connect.WithSchema(deploymentServiceMethods.ByName("Rollback")), + connect.WithClientOptions(opts...), + ), + promote: connect.NewClient[v1.PromoteRequest, v1.PromoteResponse]( + httpClient, + baseURL+DeploymentServicePromoteProcedure, + connect.WithSchema(deploymentServiceMethods.ByName("Promote")), + connect.WithClientOptions(opts...), + ), + } +} + +// deploymentServiceClient implements DeploymentServiceClient. +type deploymentServiceClient struct { + deploy *connect.Client[v1.DeployRequest, v1.DeployResponse] + rollback *connect.Client[v1.RollbackRequest, v1.RollbackResponse] + promote *connect.Client[v1.PromoteRequest, v1.PromoteResponse] +} + +// Deploy calls hydra.v1.DeploymentService.Deploy. +func (c *deploymentServiceClient) Deploy(ctx context.Context, req *connect.Request[v1.DeployRequest]) (*connect.Response[v1.DeployResponse], error) { + return c.deploy.CallUnary(ctx, req) +} + +// Rollback calls hydra.v1.DeploymentService.Rollback. +func (c *deploymentServiceClient) Rollback(ctx context.Context, req *connect.Request[v1.RollbackRequest]) (*connect.Response[v1.RollbackResponse], error) { + return c.rollback.CallUnary(ctx, req) +} + +// Promote calls hydra.v1.DeploymentService.Promote. +func (c *deploymentServiceClient) Promote(ctx context.Context, req *connect.Request[v1.PromoteRequest]) (*connect.Response[v1.PromoteResponse], error) { + return c.promote.CallUnary(ctx, req) +} + +// DeploymentServiceHandler is an implementation of the hydra.v1.DeploymentService service. +type DeploymentServiceHandler interface { + Deploy(context.Context, *connect.Request[v1.DeployRequest]) (*connect.Response[v1.DeployResponse], error) + Rollback(context.Context, *connect.Request[v1.RollbackRequest]) (*connect.Response[v1.RollbackResponse], error) + Promote(context.Context, *connect.Request[v1.PromoteRequest]) (*connect.Response[v1.PromoteResponse], error) +} + +// NewDeploymentServiceHandler builds an HTTP handler from the service implementation. It returns +// the path on which to mount the handler and the handler itself. +// +// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf +// and JSON codecs. They also support gzip compression. +func NewDeploymentServiceHandler(svc DeploymentServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { + deploymentServiceMethods := v1.File_hydra_v1_deployment_proto.Services().ByName("DeploymentService").Methods() + deploymentServiceDeployHandler := connect.NewUnaryHandler( + DeploymentServiceDeployProcedure, + svc.Deploy, + connect.WithSchema(deploymentServiceMethods.ByName("Deploy")), + connect.WithHandlerOptions(opts...), + ) + deploymentServiceRollbackHandler := connect.NewUnaryHandler( + DeploymentServiceRollbackProcedure, + svc.Rollback, + connect.WithSchema(deploymentServiceMethods.ByName("Rollback")), + connect.WithHandlerOptions(opts...), + ) + deploymentServicePromoteHandler := connect.NewUnaryHandler( + DeploymentServicePromoteProcedure, + svc.Promote, + connect.WithSchema(deploymentServiceMethods.ByName("Promote")), + connect.WithHandlerOptions(opts...), + ) + return "/hydra.v1.DeploymentService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case DeploymentServiceDeployProcedure: + deploymentServiceDeployHandler.ServeHTTP(w, r) + case DeploymentServiceRollbackProcedure: + deploymentServiceRollbackHandler.ServeHTTP(w, r) + case DeploymentServicePromoteProcedure: + deploymentServicePromoteHandler.ServeHTTP(w, r) + default: + http.NotFound(w, r) + } + }) +} + +// UnimplementedDeploymentServiceHandler returns CodeUnimplemented from all methods. +type UnimplementedDeploymentServiceHandler struct{} + +func (UnimplementedDeploymentServiceHandler) Deploy(context.Context, *connect.Request[v1.DeployRequest]) (*connect.Response[v1.DeployResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.DeploymentService.Deploy is not implemented")) +} + +func (UnimplementedDeploymentServiceHandler) Rollback(context.Context, *connect.Request[v1.RollbackRequest]) (*connect.Response[v1.RollbackResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.DeploymentService.Rollback is not implemented")) +} + +func (UnimplementedDeploymentServiceHandler) Promote(context.Context, *connect.Request[v1.PromoteRequest]) (*connect.Response[v1.PromoteResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.DeploymentService.Promote is not implemented")) +} diff --git a/gen/proto/hydra/v1/hydrav1connect/routing.connect.go b/gen/proto/hydra/v1/hydrav1connect/routing.connect.go new file mode 100644 index 0000000000..0667b6ec92 --- /dev/null +++ b/gen/proto/hydra/v1/hydrav1connect/routing.connect.go @@ -0,0 +1,111 @@ +// Code generated by protoc-gen-connect-go. DO NOT EDIT. +// +// Source: hydra/v1/routing.proto + +package hydrav1connect + +import ( + connect "connectrpc.com/connect" + context "context" + errors "errors" + v1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" + http "net/http" + strings "strings" +) + +// This is a compile-time assertion to ensure that this generated file and the connect package are +// compatible. If you get a compiler error that this constant is not defined, this code was +// generated with a version of connect newer than the one compiled into your binary. You can fix the +// problem by either regenerating this code with an older version of connect or updating the connect +// version compiled into your binary. +const _ = connect.IsAtLeastVersion1_13_0 + +const ( + // RoutingServiceName is the fully-qualified name of the RoutingService service. + RoutingServiceName = "hydra.v1.RoutingService" +) + +// These constants are the fully-qualified names of the RPCs defined in this package. They're +// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. +// +// Note that these are different from the fully-qualified method names used by +// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to +// reflection-formatted method names, remove the leading slash and convert the remaining slash to a +// period. +const ( + // RoutingServiceAssignFrontlineRoutesProcedure is the fully-qualified name of the RoutingService's + // AssignFrontlineRoutes RPC. + RoutingServiceAssignFrontlineRoutesProcedure = "/hydra.v1.RoutingService/AssignFrontlineRoutes" +) + +// RoutingServiceClient is a client for the hydra.v1.RoutingService service. +type RoutingServiceClient interface { + // AssignFrontlineRoutes creates or reassigns frontline routes to a deployment + AssignFrontlineRoutes(context.Context, *connect.Request[v1.AssignFrontlineRoutesRequest]) (*connect.Response[v1.AssignFrontlineRoutesResponse], error) +} + +// NewRoutingServiceClient constructs a client for the hydra.v1.RoutingService service. By default, +// it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, and +// sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the connect.WithGRPC() +// or connect.WithGRPCWeb() options. +// +// The URL supplied here should be the base URL for the Connect or gRPC server (for example, +// http://api.acme.com or https://acme.com/grpc). +func NewRoutingServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) RoutingServiceClient { + baseURL = strings.TrimRight(baseURL, "/") + routingServiceMethods := v1.File_hydra_v1_routing_proto.Services().ByName("RoutingService").Methods() + return &routingServiceClient{ + assignFrontlineRoutes: connect.NewClient[v1.AssignFrontlineRoutesRequest, v1.AssignFrontlineRoutesResponse]( + httpClient, + baseURL+RoutingServiceAssignFrontlineRoutesProcedure, + connect.WithSchema(routingServiceMethods.ByName("AssignFrontlineRoutes")), + connect.WithClientOptions(opts...), + ), + } +} + +// routingServiceClient implements RoutingServiceClient. +type routingServiceClient struct { + assignFrontlineRoutes *connect.Client[v1.AssignFrontlineRoutesRequest, v1.AssignFrontlineRoutesResponse] +} + +// AssignFrontlineRoutes calls hydra.v1.RoutingService.AssignFrontlineRoutes. +func (c *routingServiceClient) AssignFrontlineRoutes(ctx context.Context, req *connect.Request[v1.AssignFrontlineRoutesRequest]) (*connect.Response[v1.AssignFrontlineRoutesResponse], error) { + return c.assignFrontlineRoutes.CallUnary(ctx, req) +} + +// RoutingServiceHandler is an implementation of the hydra.v1.RoutingService service. +type RoutingServiceHandler interface { + // AssignFrontlineRoutes creates or reassigns frontline routes to a deployment + AssignFrontlineRoutes(context.Context, *connect.Request[v1.AssignFrontlineRoutesRequest]) (*connect.Response[v1.AssignFrontlineRoutesResponse], error) +} + +// NewRoutingServiceHandler builds an HTTP handler from the service implementation. It returns the +// path on which to mount the handler and the handler itself. +// +// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf +// and JSON codecs. They also support gzip compression. +func NewRoutingServiceHandler(svc RoutingServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { + routingServiceMethods := v1.File_hydra_v1_routing_proto.Services().ByName("RoutingService").Methods() + routingServiceAssignFrontlineRoutesHandler := connect.NewUnaryHandler( + RoutingServiceAssignFrontlineRoutesProcedure, + svc.AssignFrontlineRoutes, + connect.WithSchema(routingServiceMethods.ByName("AssignFrontlineRoutes")), + connect.WithHandlerOptions(opts...), + ) + return "/hydra.v1.RoutingService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case RoutingServiceAssignFrontlineRoutesProcedure: + routingServiceAssignFrontlineRoutesHandler.ServeHTTP(w, r) + default: + http.NotFound(w, r) + } + }) +} + +// UnimplementedRoutingServiceHandler returns CodeUnimplemented from all methods. +type UnimplementedRoutingServiceHandler struct{} + +func (UnimplementedRoutingServiceHandler) AssignFrontlineRoutes(context.Context, *connect.Request[v1.AssignFrontlineRoutesRequest]) (*connect.Response[v1.AssignFrontlineRoutesResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.RoutingService.AssignFrontlineRoutes is not implemented")) +} diff --git a/pkg/db/BUILD.bazel b/pkg/db/BUILD.bazel index 597eb8c58a..b35f2b8fa2 100644 --- a/pkg/db/BUILD.bazel +++ b/pkg/db/BUILD.bazel @@ -236,7 +236,6 @@ go_library( "sentinel_insert.sql_generated.go", "sentinel_list_desired.sql_generated.go", "sentinel_update_available_replicas_and_health.sql_generated.go", - "state_change_delete_old.sql_generated.go", "state_change_find_by_region_after_sequence.sql_generated.go", "state_change_get_max_sequence.sql_generated.go", "state_change_get_min_sequence.sql_generated.go", diff --git a/pkg/db/bulk_deployment_topology_insert.sql_generated.go b/pkg/db/bulk_deployment_topology_insert.sql_generated.go index f68a38dbb0..14d182337e 100644 --- a/pkg/db/bulk_deployment_topology_insert.sql_generated.go +++ b/pkg/db/bulk_deployment_topology_insert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) InsertDeploymentTopologies(ctx context.Context, db DBTX, a } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_environment_insert.sql_generated.go b/pkg/db/bulk_environment_insert.sql_generated.go index 309b3f09cc..5b2688f8da 100644 --- a/pkg/db/bulk_environment_insert.sql_generated.go +++ b/pkg/db/bulk_environment_insert.sql_generated.go @@ -40,6 +40,6 @@ func (q *BulkQueries) InsertEnvironments(ctx context.Context, db DBTX, args []In } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_environment_upsert.sql_generated.go b/pkg/db/bulk_environment_upsert.sql_generated.go index e21caeb704..3438fdc668 100644 --- a/pkg/db/bulk_environment_upsert.sql_generated.go +++ b/pkg/db/bulk_environment_upsert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) UpsertEnvironment(ctx context.Context, db DBTX, args []Ups } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_identity_insert.sql_generated.go b/pkg/db/bulk_identity_insert.sql_generated.go index c3b4583a8e..80bd90e82f 100644 --- a/pkg/db/bulk_identity_insert.sql_generated.go +++ b/pkg/db/bulk_identity_insert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) InsertIdentities(ctx context.Context, db DBTX, args []Inse } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_identity_insert_ratelimit.sql_generated.go b/pkg/db/bulk_identity_insert_ratelimit.sql_generated.go index d4ba807b99..18d07f1836 100644 --- a/pkg/db/bulk_identity_insert_ratelimit.sql_generated.go +++ b/pkg/db/bulk_identity_insert_ratelimit.sql_generated.go @@ -45,6 +45,6 @@ func (q *BulkQueries) InsertIdentityRatelimits(ctx context.Context, db DBTX, arg } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_identity_upsert.sql_generated.go b/pkg/db/bulk_identity_upsert.sql_generated.go index a7188608c2..1624fdacf0 100644 --- a/pkg/db/bulk_identity_upsert.sql_generated.go +++ b/pkg/db/bulk_identity_upsert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) UpsertIdentity(ctx context.Context, db DBTX, args []Upsert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_ingress_route_insert.sql_generated.go b/pkg/db/bulk_ingress_route_insert.sql_generated.go index aed50b3a4d..2dfd438ed5 100644 --- a/pkg/db/bulk_ingress_route_insert.sql_generated.go +++ b/pkg/db/bulk_ingress_route_insert.sql_generated.go @@ -40,6 +40,6 @@ func (q *BulkQueries) InsertFrontlineRoutes(ctx context.Context, db DBTX, args [ } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_instance_upsert.sql_generated.go b/pkg/db/bulk_instance_upsert.sql_generated.go index 07b168f965..7300e66ed4 100644 --- a/pkg/db/bulk_instance_upsert.sql_generated.go +++ b/pkg/db/bulk_instance_upsert.sql_generated.go @@ -54,6 +54,6 @@ func (q *BulkQueries) UpsertInstance(ctx context.Context, db DBTX, args []Upsert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_auth_insert.sql_generated.go b/pkg/db/bulk_key_auth_insert.sql_generated.go index 575011ec59..fc4a861c17 100644 --- a/pkg/db/bulk_key_auth_insert.sql_generated.go +++ b/pkg/db/bulk_key_auth_insert.sql_generated.go @@ -37,6 +37,6 @@ func (q *BulkQueries) InsertKeyAuths(ctx context.Context, db DBTX, args []Insert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_encryption_insert.sql_generated.go b/pkg/db/bulk_key_encryption_insert.sql_generated.go index 372ecc9d4f..9a3a16ad9a 100644 --- a/pkg/db/bulk_key_encryption_insert.sql_generated.go +++ b/pkg/db/bulk_key_encryption_insert.sql_generated.go @@ -37,6 +37,6 @@ func (q *BulkQueries) InsertKeyEncryptions(ctx context.Context, db DBTX, args [] } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_insert.sql_generated.go b/pkg/db/bulk_key_insert.sql_generated.go index da4afc05f5..20634b7368 100644 --- a/pkg/db/bulk_key_insert.sql_generated.go +++ b/pkg/db/bulk_key_insert.sql_generated.go @@ -48,6 +48,6 @@ func (q *BulkQueries) InsertKeys(ctx context.Context, db DBTX, args []InsertKeyP } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_insert_ratelimit.sql_generated.go b/pkg/db/bulk_key_insert_ratelimit.sql_generated.go index e6b3fce603..2b1a2f00f7 100644 --- a/pkg/db/bulk_key_insert_ratelimit.sql_generated.go +++ b/pkg/db/bulk_key_insert_ratelimit.sql_generated.go @@ -49,6 +49,6 @@ func (q *BulkQueries) InsertKeyRatelimits(ctx context.Context, db DBTX, args []I } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_migration_insert.sql_generated.go b/pkg/db/bulk_key_migration_insert.sql_generated.go index 3485d924d7..4784f699fa 100644 --- a/pkg/db/bulk_key_migration_insert.sql_generated.go +++ b/pkg/db/bulk_key_migration_insert.sql_generated.go @@ -35,6 +35,6 @@ func (q *BulkQueries) InsertKeyMigrations(ctx context.Context, db DBTX, args []I } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_permission_insert.sql_generated.go b/pkg/db/bulk_key_permission_insert.sql_generated.go index a3a83d8298..f12060f1ea 100644 --- a/pkg/db/bulk_key_permission_insert.sql_generated.go +++ b/pkg/db/bulk_key_permission_insert.sql_generated.go @@ -41,6 +41,6 @@ func (q *BulkQueries) InsertKeyPermissions(ctx context.Context, db DBTX, args [] } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_role_insert.sql_generated.go b/pkg/db/bulk_key_role_insert.sql_generated.go index 81da1a0f12..75654742d0 100644 --- a/pkg/db/bulk_key_role_insert.sql_generated.go +++ b/pkg/db/bulk_key_role_insert.sql_generated.go @@ -36,6 +36,6 @@ func (q *BulkQueries) InsertKeyRoles(ctx context.Context, db DBTX, args []Insert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_space_insert.sql_generated.go b/pkg/db/bulk_key_space_insert.sql_generated.go index 4753082f73..f19d1c8ee9 100644 --- a/pkg/db/bulk_key_space_insert.sql_generated.go +++ b/pkg/db/bulk_key_space_insert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) InsertKeySpaces(ctx context.Context, db DBTX, args []Inser } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_key_space_upsert.sql_generated.go b/pkg/db/bulk_key_space_upsert.sql_generated.go index f447dad56f..0b4ad1cfc2 100644 --- a/pkg/db/bulk_key_space_upsert.sql_generated.go +++ b/pkg/db/bulk_key_space_upsert.sql_generated.go @@ -40,6 +40,6 @@ func (q *BulkQueries) UpsertKeySpace(ctx context.Context, db DBTX, args []Upsert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_permission_insert.sql_generated.go b/pkg/db/bulk_permission_insert.sql_generated.go index a5ed0f59cd..05f6d971e7 100644 --- a/pkg/db/bulk_permission_insert.sql_generated.go +++ b/pkg/db/bulk_permission_insert.sql_generated.go @@ -38,6 +38,6 @@ func (q *BulkQueries) InsertPermissions(ctx context.Context, db DBTX, args []Ins } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_project_insert.sql_generated.go b/pkg/db/bulk_project_insert.sql_generated.go index 60b5486231..01d8a3935c 100644 --- a/pkg/db/bulk_project_insert.sql_generated.go +++ b/pkg/db/bulk_project_insert.sql_generated.go @@ -41,6 +41,6 @@ func (q *BulkQueries) InsertProjects(ctx context.Context, db DBTX, args []Insert } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_quota_upsert.sql_generated.go b/pkg/db/bulk_quota_upsert.sql_generated.go index 553418fa6d..643cb56db0 100644 --- a/pkg/db/bulk_quota_upsert.sql_generated.go +++ b/pkg/db/bulk_quota_upsert.sql_generated.go @@ -40,6 +40,6 @@ func (q *BulkQueries) UpsertQuota(ctx context.Context, db DBTX, args []UpsertQuo } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_ratelimit_namespace_insert.sql_generated.go b/pkg/db/bulk_ratelimit_namespace_insert.sql_generated.go index c8be775169..932e54826c 100644 --- a/pkg/db/bulk_ratelimit_namespace_insert.sql_generated.go +++ b/pkg/db/bulk_ratelimit_namespace_insert.sql_generated.go @@ -36,6 +36,6 @@ func (q *BulkQueries) InsertRatelimitNamespaces(ctx context.Context, db DBTX, ar } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_ratelimit_override_insert.sql_generated.go b/pkg/db/bulk_ratelimit_override_insert.sql_generated.go index 9d114c4909..f6172160ca 100644 --- a/pkg/db/bulk_ratelimit_override_insert.sql_generated.go +++ b/pkg/db/bulk_ratelimit_override_insert.sql_generated.go @@ -48,6 +48,6 @@ func (q *BulkQueries) InsertRatelimitOverrides(ctx context.Context, db DBTX, arg } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_role_insert.sql_generated.go b/pkg/db/bulk_role_insert.sql_generated.go index ebdb688647..5d0fd376fb 100644 --- a/pkg/db/bulk_role_insert.sql_generated.go +++ b/pkg/db/bulk_role_insert.sql_generated.go @@ -37,6 +37,6 @@ func (q *BulkQueries) InsertRoles(ctx context.Context, db DBTX, args []InsertRol } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_role_permission_insert.sql_generated.go b/pkg/db/bulk_role_permission_insert.sql_generated.go index fda1b0d623..c00635ce68 100644 --- a/pkg/db/bulk_role_permission_insert.sql_generated.go +++ b/pkg/db/bulk_role_permission_insert.sql_generated.go @@ -36,6 +36,6 @@ func (q *BulkQueries) InsertRolePermissions(ctx context.Context, db DBTX, args [ } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_sentinel_insert.sql_generated.go b/pkg/db/bulk_sentinel_insert.sql_generated.go index 55f5ca9298..69f4741d23 100644 --- a/pkg/db/bulk_sentinel_insert.sql_generated.go +++ b/pkg/db/bulk_sentinel_insert.sql_generated.go @@ -46,6 +46,6 @@ func (q *BulkQueries) InsertSentinels(ctx context.Context, db DBTX, args []Inser } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_state_change_insert.sql_generated.go b/pkg/db/bulk_state_change_insert.sql_generated.go index 093d317533..755effbdcf 100644 --- a/pkg/db/bulk_state_change_insert.sql_generated.go +++ b/pkg/db/bulk_state_change_insert.sql_generated.go @@ -37,6 +37,6 @@ func (q *BulkQueries) InsertStateChanges(ctx context.Context, db DBTX, args []In } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_workspace_insert.sql_generated.go b/pkg/db/bulk_workspace_insert.sql_generated.go index eb5de58d43..2aa27b0eaa 100644 --- a/pkg/db/bulk_workspace_insert.sql_generated.go +++ b/pkg/db/bulk_workspace_insert.sql_generated.go @@ -37,6 +37,6 @@ func (q *BulkQueries) InsertWorkspaces(ctx context.Context, db DBTX, args []Inse } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/bulk_workspace_upsert.sql_generated.go b/pkg/db/bulk_workspace_upsert.sql_generated.go index bc0476b3e0..3ca46422ae 100644 --- a/pkg/db/bulk_workspace_upsert.sql_generated.go +++ b/pkg/db/bulk_workspace_upsert.sql_generated.go @@ -41,6 +41,6 @@ func (q *BulkQueries) UpsertWorkspace(ctx context.Context, db DBTX, args []Upser } // Execute the bulk insert - _, err := db.ExecContext(ctx, bulkQuery, allArgs...) - return err + _, err := db.ExecContext(ctx, bulkQuery, allArgs...) + return err } diff --git a/pkg/db/querier_generated.go b/pkg/db/querier_generated.go index 88fc778497..30d183cb60 100644 --- a/pkg/db/querier_generated.go +++ b/pkg/db/querier_generated.go @@ -118,13 +118,6 @@ type Querier interface { // AND (i.id = ? OR i.external_id = ?) // AND i.deleted = true DeleteOldIdentityWithRatelimits(ctx context.Context, db DBTX, arg DeleteOldIdentityWithRatelimitsParams) error - // Retention cleanup: deletes state changes older than the cutoff timestamp. - // Uses LIMIT to avoid long-running transactions; call repeatedly until 0 rows affected. - // - // DELETE FROM `state_changes` - // WHERE created_at < ? - // LIMIT 10000 - DeleteOldStateChanges(ctx context.Context, db DBTX, cutoffMs uint64) (int64, error) //DeletePermission // // DELETE FROM permissions diff --git a/pkg/db/state_change_delete_old.sql_generated.go b/pkg/db/state_change_delete_old.sql_generated.go deleted file mode 100644 index 87fc4201c9..0000000000 --- a/pkg/db/state_change_delete_old.sql_generated.go +++ /dev/null @@ -1,30 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.30.0 -// source: state_change_delete_old.sql - -package db - -import ( - "context" -) - -const deleteOldStateChanges = `-- name: DeleteOldStateChanges :execrows -DELETE FROM ` + "`" + `state_changes` + "`" + ` -WHERE created_at < ? -LIMIT 10000 -` - -// Retention cleanup: deletes state changes older than the cutoff timestamp. -// Uses LIMIT to avoid long-running transactions; call repeatedly until 0 rows affected. -// -// DELETE FROM `state_changes` -// WHERE created_at < ? -// LIMIT 10000 -func (q *Queries) DeleteOldStateChanges(ctx context.Context, db DBTX, cutoffMs uint64) (int64, error) { - result, err := db.ExecContext(ctx, deleteOldStateChanges, cutoffMs) - if err != nil { - return 0, err - } - return result.RowsAffected() -} diff --git a/svc/ctrl/integration/sync_test.go b/svc/ctrl/integration/sync_test.go index 1134df135e..3e18d02fa9 100644 --- a/svc/ctrl/integration/sync_test.go +++ b/svc/ctrl/integration/sync_test.go @@ -9,8 +9,8 @@ // // The sync protocol follows a two-phase approach: // 1. Bootstrap: When a client connects with sequence=0, the server streams all current -// running deployments and sentinels for the requested region, then sends a Bookmark -// message containing the current max sequence number. +// running deployments and sentinels for the requested region. Stream close signals +// bootstrap completion. // 2. Watch: After bootstrap (or when reconnecting with sequence>0), the server polls // the state_changes table and streams incremental updates to the client. // @@ -137,16 +137,6 @@ func findSentinelDelete(messages []*ctrlv1.State, k8sName string) *ctrlv1.Delete return nil } -// findBookmark finds the bookmark message in the stream. -func findBookmark(messages []*ctrlv1.State) *ctrlv1.Bookmark { - for _, msg := range messages { - if bookmark := msg.GetBookmark(); bookmark != nil { - return bookmark - } - } - return nil -} - // countDeploymentApplies counts deployment apply messages. func countDeploymentApplies(messages []*ctrlv1.State) int { count := 0 @@ -179,15 +169,14 @@ func countSentinelApplies(messages []*ctrlv1.State) int { // // Bootstrap tests verify the initial full state synchronization that occurs when // a krane agent first connects (with sequence=0). During bootstrap, the server -// must stream ALL currently running resources for the requested region, then -// send a Bookmark message with the current max sequence number. +// streams ALL currently running resources for the requested region. Stream close +// signals bootstrap completion. // // Guarantees tested: // - All running deployments in the region are streamed // - All running sentinels in the region are streamed // - Archived/stopped resources are NOT streamed -// - A Bookmark is always sent after streaming all resources -// - Empty regions receive only a Bookmark (no resources) +// - Stream close signals completion // ============================================================================= // TestSync_BootstrapStreamsDeploymentsAndVerifiesContent verifies that bootstrap @@ -198,7 +187,6 @@ func countSentinelApplies(messages []*ctrlv1.State) int { // Guarantees: // - The deployment is included in the bootstrap stream // - The K8sName and Image fields are correctly populated -// - A Bookmark with a non-zero sequence is sent after the deployment // // This test validates the core bootstrap contract: all running resources must be // streamed to new clients so they can reconcile their local state. @@ -243,11 +231,6 @@ func TestSync_BootstrapStreamsDeploymentsAndVerifiesContent(t *testing.T) { require.NotNil(t, apply, "bootstrap should stream deployment apply") require.Equal(t, dep.Deployment.K8sName, apply.GetK8SName()) require.Equal(t, "nginx:1.19", apply.GetImage()) - - // Verify bookmark was sent - bookmark := findBookmark(messages) - require.NotNil(t, bookmark, "bootstrap should send bookmark") - require.Greater(t, bookmark.GetSequence(), uint64(0), "bookmark should have non-zero sequence") } // TestSync_BootstrapStreamsSentinelsAndVerifiesContent verifies that bootstrap @@ -258,7 +241,6 @@ func TestSync_BootstrapStreamsDeploymentsAndVerifiesContent(t *testing.T) { // Guarantees: // - The sentinel is included in the bootstrap stream // - The K8sName and Image fields are correctly populated -// - A Bookmark is sent after the sentinel // // This test mirrors the deployment test but for sentinels, ensuring both resource // types are handled correctly during bootstrap. @@ -301,25 +283,20 @@ func TestSync_BootstrapStreamsSentinelsAndVerifiesContent(t *testing.T) { require.NotNil(t, apply, "bootstrap should stream sentinel apply") require.Equal(t, sentinel.K8sName, apply.GetK8SName()) require.Equal(t, "sentinel:1.0", apply.GetImage()) - - // Verify bookmark was sent - bookmark := findBookmark(messages) - require.NotNil(t, bookmark, "bootstrap should send bookmark") } -// TestSync_BootstrapWithEmptyRegionSendsOnlyBookmark verifies that bootstrap -// handles empty regions gracefully by sending only a Bookmark. +// TestSync_BootstrapWithEmptyRegionSendsNothing verifies that bootstrap +// handles empty regions gracefully by sending no messages. // // Scenario: A krane agent connects to sync a region with no deployments or sentinels. // // Guarantees: -// - Exactly one message is sent (the Bookmark) -// - The Bookmark sequence is 0 (no state changes exist for this region) +// - No messages are sent (stream closes immediately) // - No deployment or sentinel apply messages are sent // -// This edge case is critical for new regions or regions where all resources have -// been deleted. The client must still receive a Bookmark to know bootstrap completed. -func TestSync_BootstrapWithEmptyRegionSendsOnlyBookmark(t *testing.T) { +// Empty regions result in an empty stream. The client stays at sequence=0 and +// will poll again. In production, regions are never empty. +func TestSync_BootstrapWithEmptyRegionSendsNothing(t *testing.T) { h := New(t) ctx := h.Context() @@ -340,15 +317,8 @@ func TestSync_BootstrapWithEmptyRegionSendsOnlyBookmark(t *testing.T) { messages := stream.Messages() - // Empty region should have exactly one message: the bookmark - require.Len(t, messages, 1, "empty region bootstrap should send exactly one message (bookmark)") - - // The single message should be a bookmark - bookmark := findBookmark(messages) - require.NotNil(t, bookmark, "the only message should be a bookmark") - - // Sequence is 0 since no state changes exist for this region - require.Equal(t, uint64(0), bookmark.GetSequence(), "empty region bookmark should have sequence 0") + // Empty region should have no messages + require.Len(t, messages, 0, "empty region bootstrap should send no messages") } // TestSync_BootstrapOnlyStreamsRunningResources verifies that bootstrap filters diff --git a/svc/ctrl/proto/ctrl/v1/cluster.proto b/svc/ctrl/proto/ctrl/v1/cluster.proto index 395e6f300f..cb3138153a 100644 --- a/svc/ctrl/proto/ctrl/v1/cluster.proto +++ b/svc/ctrl/proto/ctrl/v1/cluster.proto @@ -83,12 +83,6 @@ message SyncRequest { uint64 sequence_last_seen = 2; } -// Bookmark is sent after bootstrap completes to signal the client is caught up. -// The client should persist this sequence to resume watch on reconnect. -message Bookmark { - uint64 sequence = 1; -} - message State { // sequence is the state_changes sequence number for this event. // Clients should persist this after successfully processing each event @@ -98,7 +92,6 @@ message State { oneof kind { DeploymentState deployment = 2; SentinelState sentinel = 3; - Bookmark bookmark = 4; } } diff --git a/svc/ctrl/services/cluster/BUILD.bazel b/svc/ctrl/services/cluster/BUILD.bazel index 486e9a5117..6aa85e4593 100644 --- a/svc/ctrl/services/cluster/BUILD.bazel +++ b/svc/ctrl/services/cluster/BUILD.bazel @@ -4,12 +4,16 @@ go_library( name = "cluster", srcs = [ "auth.go", + "doc.go", "rpc_get_desired_deployment_state.go", "rpc_get_desired_sentinel_state.go", "rpc_sync.go", "rpc_update_deployment_state.go", "rpc_update_sentinel_state.go", "service.go", + "sync_bootstrap.go", + "sync_changes.go", + "sync_messages.go", ], importpath = "github.com/unkeyed/unkey/svc/ctrl/services/cluster", visibility = ["//visibility:public"], diff --git a/svc/ctrl/services/cluster/doc.go b/svc/ctrl/services/cluster/doc.go new file mode 100644 index 0000000000..d50f138d66 --- /dev/null +++ b/svc/ctrl/services/cluster/doc.go @@ -0,0 +1,34 @@ +// Package cluster implements the gRPC ClusterService for synchronizing desired state to edge nodes. +// +// # Overview +// +// Edge nodes (running in different regions) connect to the control plane and request +// state synchronization via the Sync RPC. The control plane streams the desired state +// for deployments and sentinels that should run in that region. +// +// # State Synchronization Model +// +// The synchronization uses a sequence-based approach: +// +// 1. Each state change (create, update, delete) is recorded in the state_changes table +// with an auto-incrementing sequence number per region. +// +// 2. Edge nodes track the last sequence they've seen. On reconnect, they request changes +// after that sequence. +// +// 3. If sequence is 0 (new node or reset), a full bootstrap is performed: all running +// deployments and sentinels are streamed. Stream close signals completion. +// +// # Convergence Guarantees +// +// The system guarantees eventual consistency through: +// - Idempotent apply/delete operations: applying the same state multiple times is safe +// - Delete-if-uncertain semantics: if we cannot prove a resource should run in a region, +// we instruct deletion to prevent stale resources +// - Reconnection with last-seen sequence: clients catch up on missed changes +// +// # Key Types +// +// The main service type is [Service], which implements [ctrlv1connect.ClusterServiceHandler]. +// The primary RPC is [Service.Sync] which handles state synchronization. +package cluster diff --git a/svc/ctrl/services/cluster/rpc_sync.go b/svc/ctrl/services/cluster/rpc_sync.go index 5868681a02..a9fabb58ad 100644 --- a/svc/ctrl/services/cluster/rpc_sync.go +++ b/svc/ctrl/services/cluster/rpc_sync.go @@ -3,348 +3,52 @@ package cluster import ( "context" "fmt" - "time" "connectrpc.com/connect" ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/pkg/db" ) +// Sync streams cluster state to an edge node for the given region. +// +// If sequence_last_seen is 0, Sync bootstraps the full desired state for the region. +// Stream close signals bootstrap completion. The client tracks the highest sequence +// from received messages and uses it for the next sync request. +// +// Sync is a bounded catch-up stream. The server stops after sending a batch of +// changes; clients reconnect to continue from their last-seen sequence. func (s *Service) Sync(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest], stream *connect.ServerStream[ctrlv1.State]) error { region := req.Msg.GetRegion() - sequence := req.Msg.GetSequenceLastSeen() + sequenceLastSeen := req.Msg.GetSequenceLastSeen() s.logger.Info("sync request received", "region", region, - "sequence", sequence, + "sequenceLastSeen", sequenceLastSeen, ) - if sequence > 0 { - minSeq, err := db.Query.GetMinStateChangeSequence(ctx, s.db.RO(), region) + sequenceAfter := sequenceLastSeen + if sequenceLastSeen == 0 { + boundary, err := s.bootstrap(ctx, region, stream) if err != nil { - return err - } - if sequence < uint64(minSeq) { - return connect.NewError(connect.CodeFailedPrecondition, - fmt.Errorf("sequence %d is behind minimum retained sequence %d, full resync required", sequence, minSeq)) - } - } - - if sequence == 0 { - var err error - sequence, err = s.bootstrap(ctx, region, stream) - if err != nil { - return err - } - } - - return s.watch(ctx, region, sequence, stream) -} - -func (s *Service) bootstrap(ctx context.Context, region string, stream *connect.ServerStream[ctrlv1.State]) (uint64, error) { - maxSeq, err := db.Query.GetMaxStateChangeSequence(ctx, s.db.RO(), region) - if err != nil { - return 0, err - } - sequence := uint64(maxSeq) - - cursor := "" - for { - topologies, err := db.Query.ListDesiredDeploymentTopology(ctx, s.db.RO(), db.ListDesiredDeploymentTopologyParams{ - Region: region, - DesiredState: db.DeploymentsDesiredStateRunning, - PaginationCursor: cursor, - Limit: 1000, - }) - if err != nil { - return 0, err - } - if len(topologies) == 0 { - break - } - cursor = topologies[len(topologies)-1].Deployment.ID - - for _, t := range topologies { - if err := s.streamDeployment(stream, sequence, t); err != nil { - return 0, err - } - } - } - - cursor = "" - for { - sentinels, err := db.Query.ListDesiredSentinels(ctx, s.db.RO(), db.ListDesiredSentinelsParams{ - Region: region, - DesiredState: db.SentinelsDesiredStateRunning, - PaginationCursor: cursor, - Limit: 100, - }) - if err != nil { - return 0, err - } - if len(sentinels) == 0 { - break - } - cursor = sentinels[len(sentinels)-1].ID - - for _, sentinel := range sentinels { - if err := s.streamSentinel(stream, sequence, sentinel); err != nil { - return 0, err - } - } - } - - // Send BOOKMARK with sequence - if err := stream.Send(&ctrlv1.State{ - Sequence: sequence, - Kind: &ctrlv1.State_Bookmark{ - Bookmark: &ctrlv1.Bookmark{Sequence: sequence}, - }, - }); err != nil { - return 0, err - } - - s.logger.Info("bootstrap complete", "sequence", sequence) - return sequence, nil -} - -func (s *Service) watch(ctx context.Context, region string, sequence uint64, stream *connect.ServerStream[ctrlv1.State]) error { - for { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - changes, err := db.Query.ListStateChanges(ctx, s.db.RO(), db.ListStateChangesParams{ - Region: region, - AfterSequence: sequence, - Limit: 100, - }) - if err != nil { - return err - } - - if len(changes) == 0 { - time.Sleep(250 * time.Millisecond) - continue - } - - for _, c := range changes { - if err := s.processStateChange(ctx, region, c, stream); err != nil { - // Stop on error - client will reconnect from last known sequence - return fmt.Errorf("failed to process state change at sequence %d: %w", c.Sequence, err) - } - sequence = c.Sequence + return fmt.Errorf("bootstrap region=%q: %w", region, err) } + sequenceAfter = boundary } -} -// processStateChange fetches the resource and streams it if it applies to this region. -func (s *Service) processStateChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { - switch change.ResourceType { - case db.StateChangesResourceTypeDeployment: - return s.processDeploymentChange(ctx, region, change, stream) - case db.StateChangesResourceTypeSentinel: - return s.processSentinelChange(ctx, region, change, stream) - default: - s.logger.Warn("unknown resource type", "type", change.ResourceType) - return nil - } -} - -func (s *Service) processDeploymentChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { - d, err := db.Query.FindDeploymentById(ctx, s.db.RO(), change.ResourceID) - if err != nil { - if db.IsNotFound(err) { - return nil - } - return err - } - ws, err := db.Query.FindWorkspaceByID(ctx, s.db.RO(), d.WorkspaceID) - if err != nil { - return err - } - - if change.Op == db.StateChangesOpDelete { - return stream.Send(&ctrlv1.State{ - Sequence: change.Sequence, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: ws.K8sNamespace.String, - K8SName: d.K8sName, - }, - }, - }, - }, - }) - } - - t, err := db.Query.FindDeploymentTopologyByIDAndRegion(ctx, s.db.RO(), db.FindDeploymentTopologyByIDAndRegionParams{ - DeploymentID: change.ResourceID, - Region: region, + changes, err := db.Query.ListStateChanges(ctx, s.db.RW(), db.ListStateChangesParams{ + Region: region, + AfterSequence: sequenceAfter, + Limit: 100, }) if err != nil { - if db.IsNotFound(err) { - return stream.Send(&ctrlv1.State{ - Sequence: change.Sequence, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: ws.K8sNamespace.String, - K8SName: d.K8sName, - }, - }, - }, - }, - }) - } - return err + return fmt.Errorf("list state changes region=%q after=%d: %w", region, sequenceAfter, err) } - if t.DesiredState != db.DeploymentsDesiredStateRunning { - return stream.Send(&ctrlv1.State{ - Sequence: change.Sequence, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: ws.K8sNamespace.String, - K8SName: d.K8sName, - }, - }, - }, - }, - }) - } - - var buildID *string - if t.BuildID.Valid { - buildID = &t.BuildID.String - } - return stream.Send(&ctrlv1.State{ - Sequence: change.Sequence, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - K8SNamespace: t.K8sNamespace.String, - K8SName: t.K8sName, - WorkspaceId: t.WorkspaceID, - EnvironmentId: t.EnvironmentID, - ProjectId: t.ProjectID, - DeploymentId: t.ID, - Image: t.Image.String, - Replicas: t.DesiredReplicas, - CpuMillicores: int64(t.CpuMillicores), - MemoryMib: int64(t.MemoryMib), - EncryptedEnvironmentVariables: t.EncryptedEnvironmentVariables, - BuildId: buildID, - }, - }, - }, - }, - }) -} - -func (s *Service) processSentinelChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { - sentinel, err := db.Query.FindSentinelByID(ctx, s.db.RO(), change.ResourceID) - if err != nil { - if db.IsNotFound(err) { - return nil + for _, change := range changes { + if err := s.processStateChange(ctx, region, change, stream); err != nil { + return fmt.Errorf("process state change sequence=%d: %w", change.Sequence, err) } - return err - } - - if change.Op == db.StateChangesOpDelete || sentinel.Region != region || sentinel.DesiredState != db.SentinelsDesiredStateRunning { - return stream.Send(&ctrlv1.State{ - Sequence: change.Sequence, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Delete{ - Delete: &ctrlv1.DeleteSentinel{ - K8SName: sentinel.K8sName, - }, - }, - }, - }, - }) } - return stream.Send(&ctrlv1.State{ - Sequence: change.Sequence, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - K8SName: sentinel.K8sName, - WorkspaceId: sentinel.WorkspaceID, - EnvironmentId: sentinel.EnvironmentID, - ProjectId: sentinel.ProjectID, - SentinelId: sentinel.ID, - Image: sentinel.Image, - Replicas: sentinel.DesiredReplicas, - CpuMillicores: int64(sentinel.CpuMillicores), - MemoryMib: int64(sentinel.MemoryMib), - }, - }, - }, - }, - }) -} - -func (s *Service) streamDeployment(stream *connect.ServerStream[ctrlv1.State], sequence uint64, t db.ListDesiredDeploymentTopologyRow) error { - var buildID *string - if t.Deployment.BuildID.Valid { - buildID = &t.Deployment.BuildID.String - } - return stream.Send(&ctrlv1.State{ - Sequence: sequence, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - K8SNamespace: t.K8sNamespace.String, - K8SName: t.Deployment.K8sName, - WorkspaceId: t.Deployment.WorkspaceID, - EnvironmentId: t.Deployment.EnvironmentID, - ProjectId: t.Deployment.ProjectID, - DeploymentId: t.Deployment.ID, - Image: t.Deployment.Image.String, - Replicas: t.DeploymentTopology.DesiredReplicas, - CpuMillicores: int64(t.Deployment.CpuMillicores), - MemoryMib: int64(t.Deployment.MemoryMib), - EncryptedEnvironmentVariables: t.Deployment.EncryptedEnvironmentVariables, - BuildId: buildID, - }, - }, - }, - }, - }) -} - -func (s *Service) streamSentinel(stream *connect.ServerStream[ctrlv1.State], sequence uint64, sentinel db.Sentinel) error { - return stream.Send(&ctrlv1.State{ - Sequence: sequence, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - K8SName: sentinel.K8sName, - WorkspaceId: sentinel.WorkspaceID, - EnvironmentId: sentinel.EnvironmentID, - ProjectId: sentinel.ProjectID, - SentinelId: sentinel.ID, - Image: sentinel.Image, - Replicas: sentinel.DesiredReplicas, - CpuMillicores: int64(sentinel.CpuMillicores), - MemoryMib: int64(sentinel.MemoryMib), - }, - }, - }, - }, - }) + return nil } diff --git a/svc/ctrl/services/cluster/service.go b/svc/ctrl/services/cluster/service.go index 04f08f3670..4756eedf7e 100644 --- a/svc/ctrl/services/cluster/service.go +++ b/svc/ctrl/services/cluster/service.go @@ -3,34 +3,32 @@ package cluster import ( "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" "github.com/unkeyed/unkey/pkg/db" - "github.com/unkeyed/unkey/pkg/otel/logging" ) +// Service implements the ClusterService gRPC interface for state synchronization. type Service struct { ctrlv1connect.UnimplementedClusterServiceHandler db db.Database logger logging.Logger - - // static bearer token for authentication bearer string } +// Config holds the configuration for creating a new cluster Service. type Config struct { Database db.Database Logger logging.Logger Bearer string } +// New creates a new cluster Service with the given configuration. func New(cfg Config) *Service { - s := &Service{ + return &Service{ UnimplementedClusterServiceHandler: ctrlv1connect.UnimplementedClusterServiceHandler{}, db: cfg.Database, logger: cfg.Logger, bearer: cfg.Bearer, } - - return s } var _ ctrlv1connect.ClusterServiceHandler = (*Service)(nil) diff --git a/svc/ctrl/services/cluster/sync_bootstrap.go b/svc/ctrl/services/cluster/sync_bootstrap.go new file mode 100644 index 0000000000..c0632b64df --- /dev/null +++ b/svc/ctrl/services/cluster/sync_bootstrap.go @@ -0,0 +1,91 @@ +package cluster + +import ( + "context" + "fmt" + + "connectrpc.com/connect" + "github.com/unkeyed/unkey/pkg/db" + + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" +) + +// bootstrap streams the full desired state for a region. +// +// This captures the current max sequence as a snapshot boundary, then streams all +// running deployments and sentinels. The sequence returned is NOT a true snapshot +// (state may have changed during streaming), but convergence is guaranteed because: +// 1. All apply/delete operations are idempotent +// 2. Any changes during bootstrap will be picked up on the next sync +// +// Stream closing without error signals bootstrap completion. +func (s *Service) bootstrap(ctx context.Context, region string, stream *connect.ServerStream[ctrlv1.State]) (uint64, error) { + maxSequence, err := db.Query.GetMaxStateChangeSequence(ctx, s.db.RW(), region) + if err != nil { + return 0, fmt.Errorf("get max sequence region=%q: %w", region, err) + } + sequenceBoundary := uint64(maxSequence) + + if err := s.bootstrapDeployments(ctx, region, sequenceBoundary, stream); err != nil { + return 0, err + } + + if err := s.bootstrapSentinels(ctx, region, sequenceBoundary, stream); err != nil { + return 0, err + } + + s.logger.Info("bootstrap complete", "sequenceBoundary", sequenceBoundary) + return sequenceBoundary, nil +} + +func (s *Service) bootstrapDeployments(ctx context.Context, region string, sequence uint64, stream *connect.ServerStream[ctrlv1.State]) error { + cursor := "" + for { + topologies, err := db.Query.ListDesiredDeploymentTopology(ctx, s.db.RW(), db.ListDesiredDeploymentTopologyParams{ + Region: region, + DesiredState: db.DeploymentsDesiredStateRunning, + PaginationCursor: cursor, + Limit: 1000, + }) + if err != nil { + return fmt.Errorf("list deployment topologies cursor=%q: %w", cursor, err) + } + if len(topologies) == 0 { + break + } + cursor = topologies[len(topologies)-1].Deployment.ID + + for _, topology := range topologies { + if err := s.sendDeploymentApplyFromTopology(stream, sequence, topology); err != nil { + return fmt.Errorf("send deployment id=%q: %w", topology.Deployment.ID, err) + } + } + } + return nil +} + +func (s *Service) bootstrapSentinels(ctx context.Context, region string, sequence uint64, stream *connect.ServerStream[ctrlv1.State]) error { + cursor := "" + for { + sentinels, err := db.Query.ListDesiredSentinels(ctx, s.db.RW(), db.ListDesiredSentinelsParams{ + Region: region, + DesiredState: db.SentinelsDesiredStateRunning, + PaginationCursor: cursor, + Limit: 100, + }) + if err != nil { + return fmt.Errorf("list sentinels cursor=%q: %w", cursor, err) + } + if len(sentinels) == 0 { + break + } + cursor = sentinels[len(sentinels)-1].ID + + for _, sentinel := range sentinels { + if err := s.sendSentinelApply(stream, sequence, sentinel); err != nil { + return fmt.Errorf("send sentinel id=%q: %w", sentinel.ID, err) + } + } + } + return nil +} diff --git a/svc/ctrl/services/cluster/sync_changes.go b/svc/ctrl/services/cluster/sync_changes.go new file mode 100644 index 0000000000..3f86b98aa3 --- /dev/null +++ b/svc/ctrl/services/cluster/sync_changes.go @@ -0,0 +1,90 @@ +package cluster + +import ( + "context" + "fmt" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/db" +) + +// processStateChange routes a state change to the appropriate handler. +// +// Invariant: if we cannot prove a resource should be running in this region, +// we instruct the edge to delete it. This ensures stale resources are cleaned up. +func (s *Service) processStateChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { + switch change.ResourceType { + case db.StateChangesResourceTypeDeployment: + return s.processDeploymentChange(ctx, region, change, stream) + case db.StateChangesResourceTypeSentinel: + return s.processSentinelChange(ctx, region, change, stream) + default: + return fmt.Errorf("unknown resource type: %q", change.ResourceType) + } +} + +func (s *Service) processDeploymentChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { + deployment, err := db.Query.FindDeploymentById(ctx, s.db.RW(), change.ResourceID) + if err != nil { + if db.IsNotFound(err) { + // Resource already deleted, nothing to sync. + return nil + } + return fmt.Errorf("find deployment id=%q: %w", change.ResourceID, err) + } + + workspace, err := db.Query.FindWorkspaceByID(ctx, s.db.RW(), deployment.WorkspaceID) + if err != nil { + return fmt.Errorf("find workspace id=%q: %w", deployment.WorkspaceID, err) + } + + if change.Op == db.StateChangesOpDelete { + return s.sendDeploymentDelete(stream, change.Sequence, workspace.K8sNamespace.String, deployment.K8sName) + } + + topology, err := db.Query.FindDeploymentTopologyByIDAndRegion(ctx, s.db.RW(), db.FindDeploymentTopologyByIDAndRegionParams{ + DeploymentID: change.ResourceID, + Region: region, + }) + if err != nil { + if db.IsNotFound(err) { + // No topology for this region means delete. + return s.sendDeploymentDelete(stream, change.Sequence, workspace.K8sNamespace.String, deployment.K8sName) + } + return fmt.Errorf("find topology deployment=%q region=%q: %w", change.ResourceID, region, err) + } + + if shouldDeleteDeployment(topology.DesiredState) { + return s.sendDeploymentDelete(stream, change.Sequence, workspace.K8sNamespace.String, deployment.K8sName) + } + + return s.sendDeploymentApply(stream, change.Sequence, newApplyDeploymentFromTopology(topology)) +} + +func shouldDeleteDeployment(desiredState db.DeploymentsDesiredState) bool { + return desiredState != db.DeploymentsDesiredStateRunning +} + +func (s *Service) processSentinelChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { + sentinel, err := db.Query.FindSentinelByID(ctx, s.db.RW(), change.ResourceID) + if err != nil { + if db.IsNotFound(err) { + // Resource already deleted, nothing to sync. + return nil + } + return fmt.Errorf("find sentinel id=%q: %w", change.ResourceID, err) + } + + if shouldDeleteSentinel(change.Op, sentinel.Region, region, sentinel.DesiredState) { + return s.sendSentinelDelete(stream, change.Sequence, sentinel.K8sName) + } + + return s.sendSentinelApply(stream, change.Sequence, sentinel) +} + +func shouldDeleteSentinel(op db.StateChangesOp, sentinelRegion, requestRegion string, desiredState db.SentinelsDesiredState) bool { + return op == db.StateChangesOpDelete || + sentinelRegion != requestRegion || + desiredState != db.SentinelsDesiredStateRunning +} diff --git a/svc/ctrl/services/cluster/sync_messages.go b/svc/ctrl/services/cluster/sync_messages.go new file mode 100644 index 0000000000..bd937a26e9 --- /dev/null +++ b/svc/ctrl/services/cluster/sync_messages.go @@ -0,0 +1,118 @@ +package cluster + +import ( + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/db" +) + +// Message sending helpers - centralized protobuf construction. + +func (s *Service) sendDeploymentDelete(stream *connect.ServerStream[ctrlv1.State], sequence uint64, namespace, name string) error { + return stream.Send(&ctrlv1.State{ + Sequence: sequence, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: namespace, + K8SName: name, + }, + }, + }, + }, + }) +} + +func (s *Service) sendDeploymentApply(stream *connect.ServerStream[ctrlv1.State], sequence uint64, apply *ctrlv1.ApplyDeployment) error { + return stream.Send(&ctrlv1.State{ + Sequence: sequence, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: apply, + }, + }, + }, + }) +} + +func (s *Service) sendDeploymentApplyFromTopology(stream *connect.ServerStream[ctrlv1.State], sequence uint64, topology db.ListDesiredDeploymentTopologyRow) error { + var buildID *string + if topology.Deployment.BuildID.Valid { + buildID = &topology.Deployment.BuildID.String + } + return s.sendDeploymentApply(stream, sequence, &ctrlv1.ApplyDeployment{ + K8SNamespace: topology.K8sNamespace.String, + K8SName: topology.Deployment.K8sName, + WorkspaceId: topology.Deployment.WorkspaceID, + EnvironmentId: topology.Deployment.EnvironmentID, + ProjectId: topology.Deployment.ProjectID, + DeploymentId: topology.Deployment.ID, + Image: topology.Deployment.Image.String, + Replicas: topology.DeploymentTopology.DesiredReplicas, + CpuMillicores: int64(topology.Deployment.CpuMillicores), + MemoryMib: int64(topology.Deployment.MemoryMib), + EncryptedEnvironmentVariables: topology.Deployment.EncryptedEnvironmentVariables, + BuildId: buildID, + }) +} + +func newApplyDeploymentFromTopology(topology db.FindDeploymentTopologyByIDAndRegionRow) *ctrlv1.ApplyDeployment { + var buildID *string + if topology.BuildID.Valid { + buildID = &topology.BuildID.String + } + return &ctrlv1.ApplyDeployment{ + K8SNamespace: topology.K8sNamespace.String, + K8SName: topology.K8sName, + WorkspaceId: topology.WorkspaceID, + EnvironmentId: topology.EnvironmentID, + ProjectId: topology.ProjectID, + DeploymentId: topology.ID, + Image: topology.Image.String, + Replicas: topology.DesiredReplicas, + CpuMillicores: int64(topology.CpuMillicores), + MemoryMib: int64(topology.MemoryMib), + EncryptedEnvironmentVariables: topology.EncryptedEnvironmentVariables, + BuildId: buildID, + } +} + +func (s *Service) sendSentinelDelete(stream *connect.ServerStream[ctrlv1.State], sequence uint64, name string) error { + return stream.Send(&ctrlv1.State{ + Sequence: sequence, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Delete{ + Delete: &ctrlv1.DeleteSentinel{ + K8SName: name, + }, + }, + }, + }, + }) +} + +func (s *Service) sendSentinelApply(stream *connect.ServerStream[ctrlv1.State], sequence uint64, sentinel db.Sentinel) error { + return stream.Send(&ctrlv1.State{ + Sequence: sequence, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + K8SName: sentinel.K8sName, + WorkspaceId: sentinel.WorkspaceID, + EnvironmentId: sentinel.EnvironmentID, + ProjectId: sentinel.ProjectID, + SentinelId: sentinel.ID, + Image: sentinel.Image, + Replicas: sentinel.DesiredReplicas, + CpuMillicores: int64(sentinel.CpuMillicores), + MemoryMib: int64(sentinel.MemoryMib), + }, + }, + }, + }, + }) +} diff --git a/svc/krane/internal/reconciler/reconciler.go b/svc/krane/internal/reconciler/reconciler.go index c030bf5383..0f315112f9 100644 --- a/svc/krane/internal/reconciler/reconciler.go +++ b/svc/krane/internal/reconciler/reconciler.go @@ -114,9 +114,6 @@ func (r *Reconciler) HandleState(ctx context.Context, state *ctrlv1.State) error return err } } - case *ctrlv1.State_Bookmark: - sequence = kind.Bookmark.GetSequence() - r.logger.Info("received bookmark", "sequence", sequence) default: return fmt.Errorf("unknown state type: %T", kind) } diff --git a/svc/krane/internal/reconciler/sequence_tracking_test.go b/svc/krane/internal/reconciler/sequence_tracking_test.go index 0a1ffbbb9b..11ffd1080e 100644 --- a/svc/krane/internal/reconciler/sequence_tracking_test.go +++ b/svc/krane/internal/reconciler/sequence_tracking_test.go @@ -125,25 +125,6 @@ func TestHandleState_UpdatesSequenceAfterSentinelDelete(t *testing.T) { require.Equal(t, uint64(300), r.sequenceLastSeen) } -func TestHandleState_UpdatesSequenceFromBookmark(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - state := &ctrlv1.State{ - Sequence: 500, // State-level sequence - Kind: &ctrlv1.State_Bookmark{ - Bookmark: &ctrlv1.Bookmark{ - Sequence: 999, // Bookmark-specific sequence takes precedence - }, - }, - } - - err := r.HandleState(ctx, state) - require.NoError(t, err) - require.Equal(t, uint64(999), r.sequenceLastSeen, "bookmark sequence should override state sequence") -} - func TestHandleState_SequenceOnlyIncreases(t *testing.T) { ctx := context.Background() h := NewTestHarness(t) @@ -322,15 +303,6 @@ func TestHandleState_BootstrapSequence(t *testing.T) { }, }, }, - // Bookmark signals end of bootstrap - { - Sequence: bootstrapSequence, - Kind: &ctrlv1.State_Bookmark{ - Bookmark: &ctrlv1.Bookmark{ - Sequence: bootstrapSequence, - }, - }, - }, } for _, state := range states { diff --git a/svc/krane/internal/reconciler/watcher_test.go b/svc/krane/internal/reconciler/watcher_test.go index d9de5453c0..e65530b76a 100644 --- a/svc/krane/internal/reconciler/watcher_test.go +++ b/svc/krane/internal/reconciler/watcher_test.go @@ -17,7 +17,6 @@ // # Key Invariants // // - sequenceLastSeen is updated to the highest sequence seen -// - Bookmark messages update sequenceLastSeen to their sequence value // - Apply messages create/update Kubernetes resources // - Delete messages remove Kubernetes resources package reconciler @@ -198,17 +197,15 @@ func TestWatch_InitialSyncWithZeroSequence(t *testing.T) { // ============================================================================= // TestWatch_ProcessesStreamMessages verifies that HandleState correctly -// processes a deployment apply message and a bookmark. +// processes deployment apply messages. // -// Scenario: A stream contains a deployment apply (seq=10) followed by a -// bookmark (seq=20). +// Scenario: A stream contains two deployment apply messages (seq=10, seq=20). // // Guarantees: // - The deployment is applied to Kubernetes (ReplicaSet is created) -// - sequenceLastSeen is updated to the bookmark's sequence (20) +// - sequenceLastSeen is updated to the highest sequence (20) // -// This tests the basic happy path: apply a resource, then receive a bookmark -// that marks the end of bootstrap. +// This tests the basic happy path: apply resources and track sequence. func TestWatch_ProcessesStreamMessages(t *testing.T) { client := fake.NewSimpleClientset() rsCapture := AddReplicaSetPatchReactor(client) @@ -241,9 +238,23 @@ func TestWatch_ProcessesStreamMessages(t *testing.T) { }, { Sequence: 20, - Kind: &ctrlv1.State_Bookmark{ - Bookmark: &ctrlv1.Bookmark{ - Sequence: 20, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + WorkspaceId: "ws_1", + ProjectId: "prj_1", + EnvironmentId: "env_1", + DeploymentId: "dep_2", + K8SNamespace: "test-ns", + K8SName: "dep-2", + Image: "nginx:1.20", + Replicas: 1, + CpuMillicores: 100, + MemoryMib: 128, + BuildId: ptr.P("build_2"), + }, + }, }, }, }, @@ -273,7 +284,7 @@ func TestWatch_ProcessesStreamMessages(t *testing.T) { } require.NotNil(t, rsCapture.Applied, "deployment should have been applied") - require.Equal(t, uint64(20), r.sequenceLastSeen, "sequence should be updated to bookmark value") + require.Equal(t, uint64(20), r.sequenceLastSeen, "sequence should be updated to highest value") } // TestWatch_IncrementalUpdates verifies that HandleState correctly processes @@ -447,13 +458,12 @@ func TestWatch_SyncConnectionError(t *testing.T) { // - Apply deployment (seq=10) // - Apply sentinel (seq=20) // - Delete deployment (seq=30) -// - Bookmark (seq=40) // // Guarantees: // - Deployment is applied to Kubernetes (ReplicaSet created with correct name) // - Sentinel is applied (as a k8s Deployment - captured separately) // - Deployment delete is processed (ReplicaSet deleted) -// - sequenceLastSeen ends at 40 (the bookmark value) +// - sequenceLastSeen ends at 30 (the highest sequence) // // This is a comprehensive integration test of HandleState covering all major // message types in a realistic sequence. @@ -528,14 +538,6 @@ func TestWatch_FullMessageProcessingFlow(t *testing.T) { }, }, }, - { - Sequence: 40, - Kind: &ctrlv1.State_Bookmark{ - Bookmark: &ctrlv1.Bookmark{ - Sequence: 40, - }, - }, - }, } for _, msg := range messages { @@ -548,5 +550,5 @@ func TestWatch_FullMessageProcessingFlow(t *testing.T) { require.Contains(t, deletes.Actions, "replicasets", "deployment delete should have been processed") - require.Equal(t, uint64(40), r.sequenceLastSeen, "sequence should be updated to bookmark value") + require.Equal(t, uint64(30), r.sequenceLastSeen, "sequence should be updated to highest value") } diff --git a/web/apps/engineering/content/docs/architecture/services/ctrl/index.mdx b/web/apps/engineering/content/docs/architecture/services/ctrl/index.mdx index 6b5ff8b1d5..ad7da514f4 100644 --- a/web/apps/engineering/content/docs/architecture/services/ctrl/index.mdx +++ b/web/apps/engineering/content/docs/architecture/services/ctrl/index.mdx @@ -33,13 +33,13 @@ The ctrl service is built on Connect RPC for service-to-service communication us ### Cluster Service -The cluster service implements the pull-based infrastructure orchestration model, enabling the control plane to coordinate deployments across multiple regions through event streaming. It maintains persistent connections with Krane agents deployed in each region, distributing deployment and sentinel events while collecting status updates. +The cluster service implements sequence-based synchronization for coordinating deployments across multiple regions. Rather than pushing events to connected agents, it exposes a `Sync` RPC that Krane instances poll for state changes. This design makes the control plane stateless with respect to connected clients. -The service provides four key RPCs. `Watch` establishes long-lived streaming connections with agents for real-time event delivery. `GetDesiredState` streams the complete desired infrastructure state for reconciliation. `UpdateInstance` receives pod status updates from agents. `UpdateSentinel` receives sentinel health and status updates. +The service provides these key RPCs. `Sync` establishes a server-streaming connection where the control plane polls the `state_changes` table and streams new entries to Krane. For fresh connections (sequence=0), it first streams the complete desired state as a bootstrap, then switches to incremental mode. `GetDesiredDeploymentState` and `GetDesiredSentinelState` return current desired state for individual resources. `UpdateDeploymentState` and `UpdateSentinelState` receive pod status updates from agents. -This pull-based architecture ensures resilience during network partitions, as agents continue operating autonomously and reconcile when connectivity returns. The service uses label-based routing to distribute events to appropriate regions and shards without requiring direct addressing. +When resources are created, updated, or deleted, the deploy workflow inserts a row into `state_changes` with a monotonically increasing sequence number. Krane instances polling for that region receive the change and apply it locally. This decouples the control plane from connection management and enables reliable at-least-once delivery through sequence-based resumption. -[Read detailed ClusterService docs →](./cluster-service) +[Read detailed Pull-Based Provisioning docs →](./pull-based-infra) ### Build Service @@ -83,12 +83,14 @@ Workflows are implemented as Restate services for durable execution. The Deploym ## Database Schema -The ctrl service uses a single MySQL database (`unkey`) that stores all data: projects, environments, and workspaces, along with deployments and deployment history, deployment topology for regional distribution, instances tracking individual pods, domains and SSL certificates, ACME users and challenges, sentinel configurations as JSON blobs, and certificate storage in PEM format. The new pull-based architecture adds the `deployment_topology` table for multi-region deployments and enhanced `instances` table for pod-level tracking. +The ctrl service uses a single MySQL database (`unkey`) that stores all data: projects, environments, and workspaces, along with deployments and deployment history, deployment topology for regional distribution, instances tracking individual pods, domains and SSL certificates, ACME users and challenges, sentinel configurations, and certificate storage in PEM format. + +The `state_changes` table is the changelog that drives Krane synchronization. Each row represents a create, update, or delete operation on a deployment or sentinel, with a monotonically increasing sequence number per region. Krane instances poll this table via the `Sync` RPC to receive incremental updates. Rows are indexed by `(region, sequence)` for efficient polling and retained for 7 days before cleanup. ## Monitoring -The ctrl service exposes metrics and logs through OpenTelemetry. Key metrics include deployment duration broken down by phase, build success and failure rates, the number of Krane poll iterations required for deployments to become ready, domain assignment latency, ACME challenge success rates, connected Krane agents by region, event distribution latency, and instance status update processing time. +The ctrl service exposes metrics and logs through OpenTelemetry. Key metrics include deployment duration broken down by phase, build success and failure rates, the number of Krane poll iterations required for deployments to become ready, domain assignment latency, ACME challenge success rates, state change processing latency, and instance status update processing time. -All operations include structured logging fields for correlation and debugging. Common fields include `deployment_id`, `project_id`, and `workspace_id` across all operations. Build operations add `build_id` and `depot_project_id`. ClusterService operations add `agent_id`, `region`, and `shard` for tracking agent connections. System-level logs include `instance_id`, `region`, and `platform` to identify which ctrl instance handled the operation. +All operations include structured logging fields for correlation and debugging. Common fields include `deployment_id`, `project_id`, and `workspace_id` across all operations. Build operations add `build_id` and `depot_project_id`. ClusterService operations add `region` and `sequence` for tracking sync progress. System-level logs include `instance_id`, `region`, and `platform` to identify which ctrl instance handled the operation. Logs are shipped to Grafana Loki in production for centralized log aggregation and querying. diff --git a/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx b/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx index de17c77989..6ab3258e06 100644 --- a/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx +++ b/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx @@ -1,15 +1,13 @@ --- title: Pull-Based Provisioning -description: Event-driven infrastructure orchestration with autonomous agents, persistent streaming connections, and eventual consistency +description: Sequence-based infrastructure synchronization with autonomous agents, polling-based updates, and eventual consistency --- import { Mermaid } from "@/app/components/mermaid"; +Unkey's infrastructure orchestration implements a pull-based model where autonomous Krane instances poll the control plane for state changes and continuously reconcile desired state with actual state. This architecture follows the Kubernetes List+Watch pattern, using a monotonically increasing sequence number to track changes and enable efficient incremental synchronization. - -Unkey's infrastructure orchestration implements a pull-based model where autonomous *krane* instances maintain persistent connections to the control plane and continuously reconcile desired state with actual state. This architecture follows cloud-native patterns similar to Kubernetes' kubelet-apiserver relationship, enabling resilient, scalable, and observable infrastructure management across multiple regions. - -The architecture's core principle is to enable autonomous reconciliation and self-healing. +The architecture's core principle is to enable autonomous reconciliation and self-healing without requiring the control plane to track connected clients or push events. ## Architecture @@ -23,86 +21,83 @@ sequenceDiagram participant K1 as Krane Agent (Region A) participant K8s as Kubernetes API - K1->>CP: Watch(region=a, shard=1) - Note over K1,CP: Persistent streaming connection + K1->>CP: Sync(region=a, sequence=0) + Note over K1,CP: Bootstrap: stream full state + CP->>K1: Stream all deployments/sentinels + Note over K1: Stream closes, last seq=42 U->>CP: Create deployment - CP->>CP: Build artifacts CP->>DB: Store deployment topology - - CP-->>K1: Stream InfraEvent (deploy) + CP->>DB: Insert state_change(seq=43) - K1->>K8s: Apply deployment + Note over K1: Polling every 1-5s + K1->>CP: (stream polls for changes) + CP->>K1: Stream State(seq=43, deploy) + K1->>K8s: Apply deployment K8s-->>K1: Pod status change K1->>CP: UpdateInstance(status=running) - CP->>DB: Upsert Instance table - - - CP->>U: Notify deployment status - - Note over K1: Asynchronous reconciliation" + CP->>DB: Upsert Instance table" /> +## Sync Protocol -## Krane Agent +The synchronization protocol uses a single `Sync` RPC that handles both initial bootstrap and incremental updates. This design eliminates the complexity of managing separate "synthetic" and "live" modes, and removes the need for the control plane to track connected clients in memory. + +### Sequence Numbers + +Every state change (deployment created, sentinel updated, resource deleted) is recorded in the `state_changes` table with a monotonically increasing sequence number per region. Krane tracks its last-seen sequence and resumes from that point on reconnect, enabling efficient incremental sync without missing events. + +### Bootstrap Phase -Krane agents act as autonomous controllers that reconcile desired state with actual Kubernetes resources in their respective regions. -Each kubernetes runs a single Krane agent. +When Krane connects with `sequence=0` (fresh start or after data loss), the control plane streams the complete desired state for the region. Each message contains a sequence number. Krane tracks the highest sequence seen and uses it for subsequent polls. Stream close signals bootstrap completion. -This diagram shows the control loop for deployments only. Sentinels are handled in the same way, but have their own buffers and controllers. +### Watch Phase + +After bootstrap, the stream enters watch mode. The control plane polls the `state_changes` table every 250ms for new entries after Krane's last-seen sequence. When changes are found, they're streamed to Krane with their sequence numbers. Krane processes each change and updates its sequence watermark. + +### Reconnection + +If the connection drops, Krane reconnects with its last-seen sequence. If this sequence is still within the retention window (changes are kept for 7 days), sync resumes incrementally. If the sequence is too old, the control plane returns `FailedPrecondition` and Krane must perform a full bootstrap by reconnecting with `sequence=0`. + +## Krane Agent + +Krane agents act as autonomous controllers that reconcile desired state with actual Kubernetes resources in their respective regions. Each Kubernetes cluster runs a single Krane agent. |UpdateInstance| CP + GC -->|UpdateSentinel| CP" /> -**Agent Components:** - -**Sync Engine**: Maintains persistent streaming connection using HTTP/2 with gRPC for efficient multiplexing and automatic reconnection. - -**Dual Controllers**: Separate deployment and sentinel controllers for each infra type. - -**Buffered Updates**: Status updates are buffered in memory to smooth over spikes in traffic and reduce the load on the control plane. - -## Communication Protocol - -### Watch Stream - -Krane agents connect to control.unkey.cloud to create a stream to listen for changes and periodic full state syncs. -If a stream is lost, the control plane buffers updates for a while to allow krane to reconnect. -To ensure all events are sent, we do a full sync on first connect and then move to streaming mode. - -### Push Updates - -Changes in kubernetes need to be known to our database to display them in our UI. Krane watches the kubernetes API for changes and pushes updates to the control plane using regular unary RPCs using buffers, retries and circuitbreakers to ensure reliability and availability. - +The Watcher maintains the Sync stream, reconnecting with jittered backoff (1-5 seconds) on failure. It passes received `State` messages to the Reconciler, which dispatches to the appropriate controller based on resource type. The Reconciler tracks `sequenceLastSeen` and updates it after successfully processing each state change. +Status updates flow back to the control plane through unary RPCs (`UpdateDeploymentState`, `UpdateSentinelState`) with buffering, retries, and circuit breakers for reliability. ## Deployment Workflow @@ -113,7 +108,6 @@ sequenceDiagram participant API participant Workflow as Deploy Workflow participant DB - participant Ctrl as Control Plane participant Krane participant K8s as Kubernetes @@ -122,15 +116,15 @@ sequenceDiagram Workflow->>DB: Create deployment Workflow->>DB: Create topology entries - Workflow->>Ctrl: Emit deployment event + Workflow->>DB: Insert state_changes - Ctrl-->>Krane: Stream deployment event + Note over Krane: Polls state_changes via Sync + Krane->>Krane: Receive State(Apply) Krane->>K8s: Apply deployment K8s-->>Krane: Pod created - Krane->>Ctrl: UpdateInstance(pending) - Ctrl->>DB: Update instance status + Krane->>DB: UpdateInstance(pending) loop Poll for completion Workflow->>DB: Check instance status @@ -144,26 +138,29 @@ sequenceDiagram end K8s-->>Krane: Pod running - Krane->>Ctrl: UpdateInstance(running) - Ctrl->>DB: Update instance status" + Krane->>DB: UpdateInstance(running)" /> -The deployment workflow operates asynchronously, with polling for completion rather than callbacks. This design ensures resilience to control plane restarts and simplifies error handling. +The deploy workflow writes desired state to the database and inserts corresponding `state_change` records. It does not push events directly to Krane. The workflow then polls the `instances` table waiting for Krane to report that pods are running, with a timeout for failure handling. + +## Why Polling Over Push -## Reconciliation +We chose polling-based synchronization over push-based event streaming for several reasons. -To prevent drift from missed events, we use two reconciliation mechanisms. +The control plane becomes stateless with respect to connected clients. It doesn't need to track which Krane instances are connected, buffer events during disconnections, or handle the complexity of fan-out to multiple subscribers. This simplifies horizontal scaling and eliminates a class of bugs around connection state management. -To ensure everything that should be running is running, krane periodically initiates a full sync of the desired state by calling `ctrl.GetDesiredState` and ensuring the current state matches the desired state. +Polling naturally handles backpressure. If Krane falls behind processing, it simply polls less frequently. With push-based streaming, the control plane would need to implement flow control or risk overwhelming slow clients. -In addition, krane periodically goes through all existing state and for each resource, it checks for the desired state in the control plane. If the desired state is not met, it initiates a reconciliation process to bring the resource back to the desired state. +The sequence-based approach provides exactly-once delivery semantics. Each change has a unique sequence number, and Krane's watermark ensures no changes are missed or processed twice, even across restarts. - +The tradeoff is latency. With 1-5 second polling intervals, there's a delay between a state change and Krane receiving it. For our use case (infrastructure provisioning measured in seconds to minutes), this latency is acceptable. ## Database Schema -The `deployment_topology` table enables multi-region deployments with independent scaling and lifecycle management. It represents the **desired state** of each deployment. +The `state_changes` table is the changelog that drives synchronization. Each row represents a create, update, or delete operation on a deployment or sentinel. The `sequence` column is an auto-incrementing primary key that provides ordering. Rows are indexed by `(region, sequence)` for efficient polling and retained for 7 days before cleanup. + +The `deployment_topology` table defines desired state for multi-region deployments. Each row specifies the desired replica count for a deployment in a specific region. When this table is modified, a corresponding `state_change` row is inserted. -The `instances` table is a representation of the current state. We only write to it in response to events from kubernetes. For example, if kubernetes deletes a pod, we reflect that in this table. +The `instances` table tracks actual state reported by Krane. We only write to it in response to Kubernetes events. The workflow polls this table to determine when deployments are ready. -The `sentinels` table is a mix of desired and actual state. Because we do not care about individual pods, we only need to configure the desired cpu, memory and replicas. We update the actual state columns from kubernetes events. +The `sentinels` table combines desired and actual state. Desired fields (cpu, memory, replicas) are set by the control plane; actual fields (available_replicas, health) are updated by Krane. diff --git a/web/apps/engineering/content/docs/architecture/services/krane/index.mdx b/web/apps/engineering/content/docs/architecture/services/krane/index.mdx index 414ec7611b..e754465a6d 100644 --- a/web/apps/engineering/content/docs/architecture/services/krane/index.mdx +++ b/web/apps/engineering/content/docs/architecture/services/krane/index.mdx @@ -7,38 +7,25 @@ import { Mermaid } from "@/app/components/mermaid"; -Krane is a Kubernetes cluster agent that follows a pull-based model similar to the Kubernetes kubelet. It connects to the control plane (ctrl) and maintains a long-lived streaming connection to receive deployment and sentinel configuration events. This architecture enables multi-cluster orchestration without the control plane needing direct access to individual clusters. +Krane is a Kubernetes cluster agent that follows a pull-based model similar to the Kubernetes kubelet. It polls the control plane for state changes using a sequence-based synchronization protocol, applying changes to local Kubernetes resources. This architecture enables multi-cluster orchestration without requiring the control plane to track connected clients or push events. -Krane pulls desired state from ctrl and ensures the actual cluster state matches. It handles deployment and sentinel lifecycle operations (create, update, delete) by translating high-level events into Kubernetes resources. +Krane pulls desired state from ctrl and ensures the actual cluster state matches. It handles deployment and sentinel lifecycle operations (create, update, delete) by translating high-level state messages into Kubernetes resources. ## Architecture ### Pull-Based Model -Krane implements a pull-based architecture where agents in each cluster: +Krane implements a polling-based architecture where agents in each cluster connect to ctrl's ClusterService via the `Sync` RPC, which establishes a server-streaming connection. The control plane polls its `state_changes` table and streams new entries to connected agents. Krane processes each state change, applies it to Kubernetes, and updates its sequence watermark. On reconnection, Krane sends its last-seen sequence to resume incrementally without missing events. -1. **Establish a Watch Stream**: Connect to ctrl's ClusterService and maintain a long-lived streaming connection -2. **Pull Desired State**: Periodically sync the full desired state to ensure consistency -3. **Receive Real-Time Events**: Get deployment and sentinel events pushed through the watch stream -4. **Apply Changes Locally**: Translate events into Kubernetes resources +This model eliminates the need for the control plane to track connected clients in memory, simplifying horizontal scaling and removing a class of connection state bugs. -This model eliminates the need for the control plane to know about individual clusters, improving security and scalability. +### Sequence-Based Synchronization +The sync engine uses sequence numbers to track state changes. Every modification to deployments or sentinels is recorded in the `state_changes` table with a monotonically increasing sequence number per region. Krane maintains a `sequenceLastSeen` watermark and polls for changes after that sequence. -### Event-Driven Synchronization +On fresh start (sequence=0), Krane receives the complete desired state followed by a `Bookmark` message containing the current maximum sequence. After bootstrap, the stream switches to incremental mode, receiving only new changes as they occur. -The sync engine (`sync/`) manages the connection to the control plane: - -1. **Watch Stream**: Maintains a persistent connection to receive real-time events -2. **Pull Sync**: Periodically, pulls the complete desired state as a safety net -3. **Event Buffer**: Queues events for processing with configurable capacity -4. **Automatic Reconnection**: Handles connection failures with exponential backoff - -Events flow through the system: -- Control plane emits `InfraEvent` messages containing `ApplyDeployment` or `DeleteDeployment` -- Krane's sync engine receives and buffers these events -- The deployment manager processes events and calls the appropriate backend -- The backend translates events into platform-specific resources +State changes are retained for 7 days. If Krane's last-seen sequence falls behind the retention window, it must perform a full bootstrap. This handles long disconnections gracefully while keeping storage bounded. ### Why StatefulSets Instead of Deployments? @@ -63,12 +50,13 @@ This is a known design compromise. Future versions might move instance addressin User->>Ctrl: Create deployment request Ctrl->>DB: Store desired state - DB->>Ctrl: Acknowledged + Ctrl->>DB: Insert state_change(seq=N) - Note over Krane: Watch Stream Active + Note over Krane: Sync stream polling - Ctrl->>Krane: InfraEvent: ApplyDeployment - Note right of Krane: Event pushed through watch stream + Krane->>Ctrl: (poll for changes > seq) + Ctrl->>DB: Query state_changes + Ctrl->>Krane: State(seq=N, ApplyDeployment) Krane->>K8s: Create/Update Service K8s->>Krane: Service ready @@ -79,13 +67,7 @@ This is a known design compromise. Future versions might move instance addressin K8s->>Pod: Pull image & start container Pod->>Pod: Container running - Note over Krane: Periodic sync (15 min) - - Krane->>Ctrl: GetDesiredState() - Ctrl->>DB: Query desired deployments - DB->>Ctrl: Return configurations - Ctrl->>Krane: Stream all ApplyDeployment events - Krane->>Krane: Reconcile actual vs desired + Note over Krane: sequenceLastSeen = N `} /> @@ -155,21 +137,18 @@ When querying deployments, Krane verifies the `unkey.managed.by` label matches ` ### ClusterService API -The control plane exposes a `ClusterService` with two key RPCs: +The control plane exposes a `ClusterService` with these key RPCs: + +**Sync** establishes a server-streaming connection for receiving state changes. Krane sends its region and last-seen sequence number; the control plane streams bootstrap state (if sequence=0) followed by incremental changes. The control plane polls its `state_changes` table and streams new entries as they appear. + +**GetDesiredDeploymentState** and **GetDesiredSentinelState** return the current desired state for a specific resource. Used for on-demand reconciliation when Kubernetes reports unexpected state. -- **Watch**: Long-lived stream for real-time event delivery -- **GetDesiredState**: Returns all current desired infrastructure configurations +**UpdateDeploymentState** and **UpdateSentinelState** receive status updates from Krane about actual Kubernetes state (pod running, pod failed, etc.). -### Event Distribution +### State Change Distribution -When deployment changes occur: -1. Control plane stores the desired state in the database -2. Emits events to all connected Krane agents matching the region selector -3. Each Krane agent applies changes to its local cluster +When deployment changes occur, the control plane stores the desired state in the database and inserts a row into `state_changes` with the resource ID, operation type (upsert/delete), and region. Krane instances polling for that region receive the change on their next poll cycle. Each Krane instance independently applies changes to its local cluster. ### Multi-Region Support -Krane agents connect with selectors (typically `region: aws-us-east-1`) to receive only relevant events. This enables: -- Region-specific deployments -- Cross-region failover capabilities -- Isolated development environments +The `state_changes` table is partitioned by region, so each Krane instance only receives changes relevant to its cluster. The control plane doesn't need to know which Krane instances exist or are connected; it simply writes changes to the database, and any Krane polling that region will receive them. diff --git a/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx b/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx index 55a5f54c4c..0e06130945 100644 --- a/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx +++ b/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx @@ -1,12 +1,11 @@ --- title: Krane Sync Engine Architecture -description: Deep dive into Krane's event processing engine that maintains persistent connections, handles reconciliation, and ensures eventual consistency +description: Deep dive into Krane's sequence-based synchronization that polls for state changes and ensures eventual consistency --- import { Mermaid } from "@/app/components/mermaid"; - -The Krane Sync Engine is the core component that implements the pull-based infrastructure model. It maintains a persistent connection to the control plane, processes infrastructure events, and ensures eventual consistency through periodic reconciliation. +The Krane Sync Engine implements a Kubernetes-style List+Watch pattern for synchronizing desired infrastructure state from the control plane. It uses sequence numbers to track state changes, enabling efficient incremental synchronization and reliable recovery after disconnections. ## Architecture @@ -14,24 +13,24 @@ The Krane Sync Engine is the core component that implements the pull-based infra chart=" graph TB subgraph 'Krane Agent' - SE[Sync Engine] + W[Watcher] + R[Reconciler] DC[Deployment Controller] GC[Sentinel Controller] - EB[Event Buffer] IUB[Instance Update Buffer] GUB[Sentinel Update Buffer] - SE --> EB - EB --> DC - EB --> GC + W -->|HandleState| R + R --> DC + R --> GC DC --> IUB GC --> GUB - IUB --> SE - GUB --> SE end subgraph 'Control Plane' CS[ClusterService] + SC[(state_changes)] + CS -.->|poll| SC end subgraph Kubernetes @@ -40,13 +39,84 @@ graph TB W2[StatefulSet Watcher] end - SE -.->|Watch Stream| CS - SE -->|GetDesiredState| CS - SE -->|UpdateInstance| CS - SE -->|UpdateSentinel| CS + W -.->|Sync stream| CS + IUB -->|UpdateDeploymentState| CS + GUB -->|UpdateSentinelState| CS DC -->|Apply/Delete| API GC -->|Apply/Delete| API W1 -->|Events| DC W2 -->|Events| DC" /> + +## Sync Protocol + +The sync engine uses a single `Sync` RPC to receive state changes from the control plane. This RPC establishes a server-streaming connection where the control plane sends `State` messages containing deployment or sentinel operations. + +### Sequence Tracking + +The reconciler maintains a `sequenceLastSeen` field that tracks the highest sequence number successfully processed. On startup, this is zero. After processing each `State` message, the reconciler updates this watermark. When reconnecting after a failure, Krane sends its last-seen sequence in the `SyncRequest`, allowing the control plane to resume from the correct position. + +### Message Types + +The `State` message contains a sequence number and one of two payloads: + +**DeploymentState** contains either an `ApplyDeployment` (create or update a StatefulSet with the specified image, replicas, and resource limits) or `DeleteDeployment` (remove the StatefulSet and its associated Service). + +**SentinelState** contains either an `ApplySentinel` (create or update a sentinel deployment) or `DeleteSentinel` (remove the sentinel). + +Stream close signals that the current batch (or bootstrap) is complete. The client tracks the highest sequence from received messages and uses it for the next sync request. + +## Watcher Loop + +The Watcher runs a continuous loop with jittered reconnection timing (1-5 seconds between attempts). Each iteration establishes a Sync stream and processes messages until the stream closes or an error occurs. + +``` +for { + sleep(random(1s, 5s)) + stream = cluster.Sync(region, sequenceLastSeen) + for message in stream { + reconciler.HandleState(message) + } +} +``` + +This design prioritizes simplicity and reliability over latency. The jittered timing prevents thundering herd problems when multiple Krane instances reconnect simultaneously after a control plane restart. + +## State Handling + +The `HandleState` method on the Reconciler dispatches each state message to the appropriate controller: + +``` +HandleState(state): + switch state.Kind: + case Deployment: + if Apply: ApplyDeployment(state.Apply) + if Delete: DeleteDeployment(state.Delete) + case Sentinel: + if Apply: ApplySentinel(state.Apply) + if Delete: DeleteSentinel(state.Delete) + + if state.Sequence > sequenceLastSeen: + sequenceLastSeen = state.Sequence +``` + +The sequence watermark is updated after processing, ensuring at-least-once delivery. If Krane crashes mid-processing, it will reprocess the same message on restart, which is safe because apply operations are idempotent. + +## Kubernetes Watchers + +In addition to receiving desired state from the control plane, Krane watches Kubernetes for actual state changes. Pod and StatefulSet watchers notify the controllers when resources change (pod becomes ready, pod fails, etc.). The controllers then report these changes back to the control plane through `UpdateDeploymentState` and `UpdateSentinelState` RPCs. + +This bidirectional flow ensures the control plane always knows the actual state of resources, enabling the UI to show accurate deployment status and the workflow to detect when deployments are ready. + +## Buffered Updates + +Status updates to the control plane are buffered in memory before sending. This smooths over traffic spikes and reduces load on the control plane during high-churn scenarios (like rolling updates affecting many pods). The buffers use retries with exponential backoff and circuit breakers to handle transient failures without overwhelming a recovering control plane. + +## Failure Modes + +**Stream disconnection**: The watcher reconnects with jittered backoff. If the last-seen sequence is within the 7-day retention window, sync resumes incrementally. Otherwise, Krane performs a full bootstrap. + +**Control plane unavailable**: The circuit breaker opens after repeated failures, preventing Krane from overwhelming a struggling control plane. Local Kubernetes state continues to function; only sync with the control plane is paused. + +**Sequence too old**: If Krane has been offline longer than the 7-day retention period, the control plane returns `FailedPrecondition`. Krane resets its sequence to zero and performs a full bootstrap, which may result in reprocessing resources that already exist (handled gracefully by idempotent apply operations). From ffd3735e13f322e490a2cdc05041aa71b6eccfe9 Mon Sep 17 00:00:00 2001 From: chronark Date: Mon, 19 Jan 2026 22:49:32 +0100 Subject: [PATCH 05/32] feat: use total ordering --- cmd/dev/seed/ingress.go | 1 + gen/proto/ctrl/v1/cluster.pb.go | 36 +- gen/proto/hydra/v1/BUILD.bazel | 2 + gen/proto/hydra/v1/hydrav1connect/BUILD.bazel | 16 - .../v1/hydrav1connect/certificate.connect.go | 148 -------- .../v1/hydrav1connect/deployment.connect.go | 167 --------- .../v1/hydrav1connect/routing.connect.go | 111 ------ gen/proto/hydra/v1/versioning.pb.go | 255 ++++++++++++++ gen/proto/hydra/v1/versioning_restate.pb.go | 177 ++++++++++ pkg/db/BUILD.bazel | 3 + ...eployment_topology_insert.sql_generated.go | 5 +- pkg/db/bulk_sentinel_insert.sql_generated.go | 5 +- .../cluster_state_versions.sql_generated.go | 83 +++++ ...topology_find_by_versions.sql_generated.go | 107 ++++++ ...eployment_topology_insert.sql_generated.go | 6 + ...ent_topology_list_desired.sql_generated.go | 6 +- pkg/db/models_generated.go | 2 + pkg/db/querier_generated.go | 53 ++- pkg/db/queries/cluster_state_versions.sql | 17 + .../deployment_topology_find_by_versions.sql | 11 + pkg/db/queries/deployment_topology_insert.sql | 2 + .../deployment_topology_list_desired.sql | 1 - pkg/db/queries/sentinel_find_by_versions.sql | 4 + pkg/db/queries/sentinel_insert.sql | 2 + pkg/db/queries/sentinel_list_desired.sql | 1 - .../queries/state_change_get_max_sequence.sql | 4 +- pkg/db/schema.sql | 6 + ...el_find_by_environment_id.sql_generated.go | 5 +- pkg/db/sentinel_find_by_id.sql_generated.go | 5 +- ...sentinel_find_by_versions.sql_generated.go | 71 ++++ pkg/db/sentinel_insert.sql_generated.go | 6 + pkg/db/sentinel_list_desired.sql_generated.go | 6 +- ...e_change_get_max_sequence.sql_generated.go | 4 +- svc/ctrl/BUILD.bazel | 1 + svc/ctrl/doc.go | 4 +- svc/ctrl/integration/harness.go | 28 +- svc/ctrl/integration/sync_test.go | 2 +- svc/ctrl/middleware/auth.go | 2 +- svc/ctrl/middleware/doc.go | 4 +- svc/ctrl/proto/ctrl/v1/cluster.proto | 10 +- svc/ctrl/proto/hydra/v1/versioning.proto | 47 +++ svc/ctrl/run.go | 7 +- svc/ctrl/services/build/doc.go | 2 +- svc/ctrl/services/cluster/BUILD.bazel | 3 - svc/ctrl/services/cluster/auth.go | 2 + svc/ctrl/services/cluster/doc.go | 25 +- .../rpc_get_desired_deployment_state.go | 8 + .../cluster/rpc_get_desired_sentinel_state.go | 7 + svc/ctrl/services/cluster/rpc_sync.go | 215 ++++++++++-- .../cluster/rpc_update_deployment_state.go | 12 + .../cluster/rpc_update_sentinel_state.go | 6 + svc/ctrl/services/cluster/service.go | 2 +- svc/ctrl/services/cluster/sync_bootstrap.go | 91 ----- svc/ctrl/services/cluster/sync_changes.go | 90 ----- svc/ctrl/services/cluster/sync_messages.go | 118 ------- svc/ctrl/services/ctrl/doc.go | 4 +- svc/ctrl/services/doc.go | 10 +- svc/ctrl/services/openapi/doc.go | 2 +- svc/ctrl/workflows/deploy/deploy_handler.go | 68 ++-- svc/ctrl/workflows/versioning/BUILD.bazel | 16 + svc/ctrl/workflows/versioning/doc.go | 25 ++ .../versioning/next_version_handler.go | 40 +++ svc/ctrl/workflows/versioning/service.go | 22 ++ svc/krane/internal/reconciler/BUILD.bazel | 2 +- .../internal/reconciler/handle_state_test.go | 12 +- svc/krane/internal/reconciler/reconciler.go | 55 +-- .../reconciler/sequence_tracking_test.go | 319 ------------------ .../reconciler/version_tracking_test.go | 116 +++++++ svc/krane/internal/reconciler/watcher.go | 24 +- svc/krane/internal/reconciler/watcher_test.go | 104 ++++-- svc/krane/secrets/token/k8s_validator.go | 5 + .../services/ctrl/pull-based-infra.mdx | 87 +++-- .../services/krane/sync-engine.mdx | 45 ++- .../db/src/schema/deployment_topology.ts | 7 + web/internal/db/src/schema/sentinels.ts | 8 + 75 files changed, 1657 insertions(+), 1328 deletions(-) delete mode 100644 gen/proto/hydra/v1/hydrav1connect/BUILD.bazel delete mode 100644 gen/proto/hydra/v1/hydrav1connect/certificate.connect.go delete mode 100644 gen/proto/hydra/v1/hydrav1connect/deployment.connect.go delete mode 100644 gen/proto/hydra/v1/hydrav1connect/routing.connect.go create mode 100644 gen/proto/hydra/v1/versioning.pb.go create mode 100644 gen/proto/hydra/v1/versioning_restate.pb.go create mode 100644 pkg/db/cluster_state_versions.sql_generated.go create mode 100644 pkg/db/deployment_topology_find_by_versions.sql_generated.go create mode 100644 pkg/db/queries/cluster_state_versions.sql create mode 100644 pkg/db/queries/deployment_topology_find_by_versions.sql create mode 100644 pkg/db/queries/sentinel_find_by_versions.sql create mode 100644 pkg/db/sentinel_find_by_versions.sql_generated.go create mode 100644 svc/ctrl/proto/hydra/v1/versioning.proto delete mode 100644 svc/ctrl/services/cluster/sync_bootstrap.go delete mode 100644 svc/ctrl/services/cluster/sync_changes.go delete mode 100644 svc/ctrl/services/cluster/sync_messages.go create mode 100644 svc/ctrl/workflows/versioning/BUILD.bazel create mode 100644 svc/ctrl/workflows/versioning/doc.go create mode 100644 svc/ctrl/workflows/versioning/next_version_handler.go create mode 100644 svc/ctrl/workflows/versioning/service.go delete mode 100644 svc/krane/internal/reconciler/sequence_tracking_test.go create mode 100644 svc/krane/internal/reconciler/version_tracking_test.go diff --git a/cmd/dev/seed/ingress.go b/cmd/dev/seed/ingress.go index e2c229b4a8..4a5941b587 100644 --- a/cmd/dev/seed/ingress.go +++ b/cmd/dev/seed/ingress.go @@ -137,6 +137,7 @@ func seedFrontline(ctx context.Context, cmd *cli.Command) error { ProjectID: projectID, CpuMillicores: 512, MemoryMib: 512, + Version: uint64(now), CreatedAt: now, }) if err != nil && !db.IsDuplicateKeyError(err) { diff --git a/gen/proto/ctrl/v1/cluster.pb.go b/gen/proto/ctrl/v1/cluster.pb.go index 65cfb526cf..191329681b 100644 --- a/gen/proto/ctrl/v1/cluster.pb.go +++ b/gen/proto/ctrl/v1/cluster.pb.go @@ -414,11 +414,11 @@ func (*UpdateSentinelStateResponse) Descriptor() ([]byte, []int) { } type SyncRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - Region string `protobuf:"bytes,1,opt,name=region,proto3" json:"region,omitempty"` - SequenceLastSeen uint64 `protobuf:"varint,2,opt,name=sequence_last_seen,json=sequenceLastSeen,proto3" json:"sequence_last_seen,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + Region string `protobuf:"bytes,1,opt,name=region,proto3" json:"region,omitempty"` + VersionLastSeen uint64 `protobuf:"varint,2,opt,name=version_last_seen,json=versionLastSeen,proto3" json:"version_last_seen,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *SyncRequest) Reset() { @@ -458,19 +458,19 @@ func (x *SyncRequest) GetRegion() string { return "" } -func (x *SyncRequest) GetSequenceLastSeen() uint64 { +func (x *SyncRequest) GetVersionLastSeen() uint64 { if x != nil { - return x.SequenceLastSeen + return x.VersionLastSeen } return 0 } type State struct { state protoimpl.MessageState `protogen:"open.v1"` - // sequence is the state_changes sequence number for this event. - // Clients should persist this after successfully processing each event - // to resume from the correct position on reconnect. - Sequence uint64 `protobuf:"varint,1,opt,name=sequence,proto3" json:"sequence,omitempty"` + // version is the resource version for this state update. + // Clients should track the max version seen and persist it after + // the stream closes cleanly to resume from the correct position on reconnect. + Version uint64 `protobuf:"varint,1,opt,name=version,proto3" json:"version,omitempty"` // Types that are valid to be assigned to Kind: // // *State_Deployment @@ -510,9 +510,9 @@ func (*State) Descriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{8} } -func (x *State) GetSequence() uint64 { +func (x *State) GetVersion() uint64 { if x != nil { - return x.Sequence + return x.Version } return 0 } @@ -1334,12 +1334,12 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "\x1aUpdateSentinelStateRequest\x12\x19\n" + "\bk8s_name\x18\x01 \x01(\tR\ak8sName\x12-\n" + "\x12available_replicas\x18\x02 \x01(\x05R\x11availableReplicas\"\x1d\n" + - "\x1bUpdateSentinelStateResponse\"S\n" + + "\x1bUpdateSentinelStateResponse\"Q\n" + "\vSyncRequest\x12\x16\n" + - "\x06region\x18\x01 \x01(\tR\x06region\x12,\n" + - "\x12sequence_last_seen\x18\x02 \x01(\x04R\x10sequenceLastSeen\"\x9d\x01\n" + - "\x05State\x12\x1a\n" + - "\bsequence\x18\x01 \x01(\x04R\bsequence\x12:\n" + + "\x06region\x18\x01 \x01(\tR\x06region\x12*\n" + + "\x11version_last_seen\x18\x02 \x01(\x04R\x0fversionLastSeen\"\x9b\x01\n" + + "\x05State\x12\x18\n" + + "\aversion\x18\x01 \x01(\x04R\aversion\x12:\n" + "\n" + "deployment\x18\x02 \x01(\v2\x18.ctrl.v1.DeploymentStateH\x00R\n" + "deployment\x124\n" + diff --git a/gen/proto/hydra/v1/BUILD.bazel b/gen/proto/hydra/v1/BUILD.bazel index b9afc5ef6b..8496802a6b 100644 --- a/gen/proto/hydra/v1/BUILD.bazel +++ b/gen/proto/hydra/v1/BUILD.bazel @@ -9,6 +9,8 @@ go_library( "deployment_restate.pb.go", "routing.pb.go", "routing_restate.pb.go", + "versioning.pb.go", + "versioning_restate.pb.go", ], importpath = "github.com/unkeyed/unkey/gen/proto/hydra/v1", visibility = ["//visibility:public"], diff --git a/gen/proto/hydra/v1/hydrav1connect/BUILD.bazel b/gen/proto/hydra/v1/hydrav1connect/BUILD.bazel deleted file mode 100644 index 5dc3dd1c97..0000000000 --- a/gen/proto/hydra/v1/hydrav1connect/BUILD.bazel +++ /dev/null @@ -1,16 +0,0 @@ -load("@rules_go//go:def.bzl", "go_library") - -go_library( - name = "hydrav1connect", - srcs = [ - "certificate.connect.go", - "deployment.connect.go", - "routing.connect.go", - ], - importpath = "github.com/unkeyed/unkey/gen/proto/hydra/v1/hydrav1connect", - visibility = ["//visibility:public"], - deps = [ - "//gen/proto/hydra/v1:hydra", - "@com_connectrpc_connect//:connect", - ], -) diff --git a/gen/proto/hydra/v1/hydrav1connect/certificate.connect.go b/gen/proto/hydra/v1/hydrav1connect/certificate.connect.go deleted file mode 100644 index c20810312f..0000000000 --- a/gen/proto/hydra/v1/hydrav1connect/certificate.connect.go +++ /dev/null @@ -1,148 +0,0 @@ -// Code generated by protoc-gen-connect-go. DO NOT EDIT. -// -// Source: hydra/v1/certificate.proto - -package hydrav1connect - -import ( - connect "connectrpc.com/connect" - context "context" - errors "errors" - v1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" - http "net/http" - strings "strings" -) - -// This is a compile-time assertion to ensure that this generated file and the connect package are -// compatible. If you get a compiler error that this constant is not defined, this code was -// generated with a version of connect newer than the one compiled into your binary. You can fix the -// problem by either regenerating this code with an older version of connect or updating the connect -// version compiled into your binary. -const _ = connect.IsAtLeastVersion1_13_0 - -const ( - // CertificateServiceName is the fully-qualified name of the CertificateService service. - CertificateServiceName = "hydra.v1.CertificateService" -) - -// These constants are the fully-qualified names of the RPCs defined in this package. They're -// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. -// -// Note that these are different from the fully-qualified method names used by -// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to -// reflection-formatted method names, remove the leading slash and convert the remaining slash to a -// period. -const ( - // CertificateServiceProcessChallengeProcedure is the fully-qualified name of the - // CertificateService's ProcessChallenge RPC. - CertificateServiceProcessChallengeProcedure = "/hydra.v1.CertificateService/ProcessChallenge" - // CertificateServiceRenewExpiringCertificatesProcedure is the fully-qualified name of the - // CertificateService's RenewExpiringCertificates RPC. - CertificateServiceRenewExpiringCertificatesProcedure = "/hydra.v1.CertificateService/RenewExpiringCertificates" -) - -// CertificateServiceClient is a client for the hydra.v1.CertificateService service. -type CertificateServiceClient interface { - // ProcessChallenge handles the complete ACME certificate challenge flow - // Key: domain name (ensures only one challenge per domain at a time) - ProcessChallenge(context.Context, *connect.Request[v1.ProcessChallengeRequest]) (*connect.Response[v1.ProcessChallengeResponse], error) - // RenewExpiringCertificates checks for certificates expiring soon and renews them. - // This should be called periodically (e.g., daily via cron). - // Key: "global" (single instance ensures no duplicate renewal runs) - RenewExpiringCertificates(context.Context, *connect.Request[v1.RenewExpiringCertificatesRequest]) (*connect.Response[v1.RenewExpiringCertificatesResponse], error) -} - -// NewCertificateServiceClient constructs a client for the hydra.v1.CertificateService service. By -// default, it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, -// and sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the -// connect.WithGRPC() or connect.WithGRPCWeb() options. -// -// The URL supplied here should be the base URL for the Connect or gRPC server (for example, -// http://api.acme.com or https://acme.com/grpc). -func NewCertificateServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) CertificateServiceClient { - baseURL = strings.TrimRight(baseURL, "/") - certificateServiceMethods := v1.File_hydra_v1_certificate_proto.Services().ByName("CertificateService").Methods() - return &certificateServiceClient{ - processChallenge: connect.NewClient[v1.ProcessChallengeRequest, v1.ProcessChallengeResponse]( - httpClient, - baseURL+CertificateServiceProcessChallengeProcedure, - connect.WithSchema(certificateServiceMethods.ByName("ProcessChallenge")), - connect.WithClientOptions(opts...), - ), - renewExpiringCertificates: connect.NewClient[v1.RenewExpiringCertificatesRequest, v1.RenewExpiringCertificatesResponse]( - httpClient, - baseURL+CertificateServiceRenewExpiringCertificatesProcedure, - connect.WithSchema(certificateServiceMethods.ByName("RenewExpiringCertificates")), - connect.WithClientOptions(opts...), - ), - } -} - -// certificateServiceClient implements CertificateServiceClient. -type certificateServiceClient struct { - processChallenge *connect.Client[v1.ProcessChallengeRequest, v1.ProcessChallengeResponse] - renewExpiringCertificates *connect.Client[v1.RenewExpiringCertificatesRequest, v1.RenewExpiringCertificatesResponse] -} - -// ProcessChallenge calls hydra.v1.CertificateService.ProcessChallenge. -func (c *certificateServiceClient) ProcessChallenge(ctx context.Context, req *connect.Request[v1.ProcessChallengeRequest]) (*connect.Response[v1.ProcessChallengeResponse], error) { - return c.processChallenge.CallUnary(ctx, req) -} - -// RenewExpiringCertificates calls hydra.v1.CertificateService.RenewExpiringCertificates. -func (c *certificateServiceClient) RenewExpiringCertificates(ctx context.Context, req *connect.Request[v1.RenewExpiringCertificatesRequest]) (*connect.Response[v1.RenewExpiringCertificatesResponse], error) { - return c.renewExpiringCertificates.CallUnary(ctx, req) -} - -// CertificateServiceHandler is an implementation of the hydra.v1.CertificateService service. -type CertificateServiceHandler interface { - // ProcessChallenge handles the complete ACME certificate challenge flow - // Key: domain name (ensures only one challenge per domain at a time) - ProcessChallenge(context.Context, *connect.Request[v1.ProcessChallengeRequest]) (*connect.Response[v1.ProcessChallengeResponse], error) - // RenewExpiringCertificates checks for certificates expiring soon and renews them. - // This should be called periodically (e.g., daily via cron). - // Key: "global" (single instance ensures no duplicate renewal runs) - RenewExpiringCertificates(context.Context, *connect.Request[v1.RenewExpiringCertificatesRequest]) (*connect.Response[v1.RenewExpiringCertificatesResponse], error) -} - -// NewCertificateServiceHandler builds an HTTP handler from the service implementation. It returns -// the path on which to mount the handler and the handler itself. -// -// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf -// and JSON codecs. They also support gzip compression. -func NewCertificateServiceHandler(svc CertificateServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { - certificateServiceMethods := v1.File_hydra_v1_certificate_proto.Services().ByName("CertificateService").Methods() - certificateServiceProcessChallengeHandler := connect.NewUnaryHandler( - CertificateServiceProcessChallengeProcedure, - svc.ProcessChallenge, - connect.WithSchema(certificateServiceMethods.ByName("ProcessChallenge")), - connect.WithHandlerOptions(opts...), - ) - certificateServiceRenewExpiringCertificatesHandler := connect.NewUnaryHandler( - CertificateServiceRenewExpiringCertificatesProcedure, - svc.RenewExpiringCertificates, - connect.WithSchema(certificateServiceMethods.ByName("RenewExpiringCertificates")), - connect.WithHandlerOptions(opts...), - ) - return "/hydra.v1.CertificateService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - switch r.URL.Path { - case CertificateServiceProcessChallengeProcedure: - certificateServiceProcessChallengeHandler.ServeHTTP(w, r) - case CertificateServiceRenewExpiringCertificatesProcedure: - certificateServiceRenewExpiringCertificatesHandler.ServeHTTP(w, r) - default: - http.NotFound(w, r) - } - }) -} - -// UnimplementedCertificateServiceHandler returns CodeUnimplemented from all methods. -type UnimplementedCertificateServiceHandler struct{} - -func (UnimplementedCertificateServiceHandler) ProcessChallenge(context.Context, *connect.Request[v1.ProcessChallengeRequest]) (*connect.Response[v1.ProcessChallengeResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.CertificateService.ProcessChallenge is not implemented")) -} - -func (UnimplementedCertificateServiceHandler) RenewExpiringCertificates(context.Context, *connect.Request[v1.RenewExpiringCertificatesRequest]) (*connect.Response[v1.RenewExpiringCertificatesResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.CertificateService.RenewExpiringCertificates is not implemented")) -} diff --git a/gen/proto/hydra/v1/hydrav1connect/deployment.connect.go b/gen/proto/hydra/v1/hydrav1connect/deployment.connect.go deleted file mode 100644 index c5c358d5e7..0000000000 --- a/gen/proto/hydra/v1/hydrav1connect/deployment.connect.go +++ /dev/null @@ -1,167 +0,0 @@ -// Code generated by protoc-gen-connect-go. DO NOT EDIT. -// -// Source: hydra/v1/deployment.proto - -package hydrav1connect - -import ( - connect "connectrpc.com/connect" - context "context" - errors "errors" - v1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" - http "net/http" - strings "strings" -) - -// This is a compile-time assertion to ensure that this generated file and the connect package are -// compatible. If you get a compiler error that this constant is not defined, this code was -// generated with a version of connect newer than the one compiled into your binary. You can fix the -// problem by either regenerating this code with an older version of connect or updating the connect -// version compiled into your binary. -const _ = connect.IsAtLeastVersion1_13_0 - -const ( - // DeploymentServiceName is the fully-qualified name of the DeploymentService service. - DeploymentServiceName = "hydra.v1.DeploymentService" -) - -// These constants are the fully-qualified names of the RPCs defined in this package. They're -// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. -// -// Note that these are different from the fully-qualified method names used by -// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to -// reflection-formatted method names, remove the leading slash and convert the remaining slash to a -// period. -const ( - // DeploymentServiceDeployProcedure is the fully-qualified name of the DeploymentService's Deploy - // RPC. - DeploymentServiceDeployProcedure = "/hydra.v1.DeploymentService/Deploy" - // DeploymentServiceRollbackProcedure is the fully-qualified name of the DeploymentService's - // Rollback RPC. - DeploymentServiceRollbackProcedure = "/hydra.v1.DeploymentService/Rollback" - // DeploymentServicePromoteProcedure is the fully-qualified name of the DeploymentService's Promote - // RPC. - DeploymentServicePromoteProcedure = "/hydra.v1.DeploymentService/Promote" -) - -// DeploymentServiceClient is a client for the hydra.v1.DeploymentService service. -type DeploymentServiceClient interface { - Deploy(context.Context, *connect.Request[v1.DeployRequest]) (*connect.Response[v1.DeployResponse], error) - Rollback(context.Context, *connect.Request[v1.RollbackRequest]) (*connect.Response[v1.RollbackResponse], error) - Promote(context.Context, *connect.Request[v1.PromoteRequest]) (*connect.Response[v1.PromoteResponse], error) -} - -// NewDeploymentServiceClient constructs a client for the hydra.v1.DeploymentService service. By -// default, it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, -// and sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the -// connect.WithGRPC() or connect.WithGRPCWeb() options. -// -// The URL supplied here should be the base URL for the Connect or gRPC server (for example, -// http://api.acme.com or https://acme.com/grpc). -func NewDeploymentServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) DeploymentServiceClient { - baseURL = strings.TrimRight(baseURL, "/") - deploymentServiceMethods := v1.File_hydra_v1_deployment_proto.Services().ByName("DeploymentService").Methods() - return &deploymentServiceClient{ - deploy: connect.NewClient[v1.DeployRequest, v1.DeployResponse]( - httpClient, - baseURL+DeploymentServiceDeployProcedure, - connect.WithSchema(deploymentServiceMethods.ByName("Deploy")), - connect.WithClientOptions(opts...), - ), - rollback: connect.NewClient[v1.RollbackRequest, v1.RollbackResponse]( - httpClient, - baseURL+DeploymentServiceRollbackProcedure, - connect.WithSchema(deploymentServiceMethods.ByName("Rollback")), - connect.WithClientOptions(opts...), - ), - promote: connect.NewClient[v1.PromoteRequest, v1.PromoteResponse]( - httpClient, - baseURL+DeploymentServicePromoteProcedure, - connect.WithSchema(deploymentServiceMethods.ByName("Promote")), - connect.WithClientOptions(opts...), - ), - } -} - -// deploymentServiceClient implements DeploymentServiceClient. -type deploymentServiceClient struct { - deploy *connect.Client[v1.DeployRequest, v1.DeployResponse] - rollback *connect.Client[v1.RollbackRequest, v1.RollbackResponse] - promote *connect.Client[v1.PromoteRequest, v1.PromoteResponse] -} - -// Deploy calls hydra.v1.DeploymentService.Deploy. -func (c *deploymentServiceClient) Deploy(ctx context.Context, req *connect.Request[v1.DeployRequest]) (*connect.Response[v1.DeployResponse], error) { - return c.deploy.CallUnary(ctx, req) -} - -// Rollback calls hydra.v1.DeploymentService.Rollback. -func (c *deploymentServiceClient) Rollback(ctx context.Context, req *connect.Request[v1.RollbackRequest]) (*connect.Response[v1.RollbackResponse], error) { - return c.rollback.CallUnary(ctx, req) -} - -// Promote calls hydra.v1.DeploymentService.Promote. -func (c *deploymentServiceClient) Promote(ctx context.Context, req *connect.Request[v1.PromoteRequest]) (*connect.Response[v1.PromoteResponse], error) { - return c.promote.CallUnary(ctx, req) -} - -// DeploymentServiceHandler is an implementation of the hydra.v1.DeploymentService service. -type DeploymentServiceHandler interface { - Deploy(context.Context, *connect.Request[v1.DeployRequest]) (*connect.Response[v1.DeployResponse], error) - Rollback(context.Context, *connect.Request[v1.RollbackRequest]) (*connect.Response[v1.RollbackResponse], error) - Promote(context.Context, *connect.Request[v1.PromoteRequest]) (*connect.Response[v1.PromoteResponse], error) -} - -// NewDeploymentServiceHandler builds an HTTP handler from the service implementation. It returns -// the path on which to mount the handler and the handler itself. -// -// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf -// and JSON codecs. They also support gzip compression. -func NewDeploymentServiceHandler(svc DeploymentServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { - deploymentServiceMethods := v1.File_hydra_v1_deployment_proto.Services().ByName("DeploymentService").Methods() - deploymentServiceDeployHandler := connect.NewUnaryHandler( - DeploymentServiceDeployProcedure, - svc.Deploy, - connect.WithSchema(deploymentServiceMethods.ByName("Deploy")), - connect.WithHandlerOptions(opts...), - ) - deploymentServiceRollbackHandler := connect.NewUnaryHandler( - DeploymentServiceRollbackProcedure, - svc.Rollback, - connect.WithSchema(deploymentServiceMethods.ByName("Rollback")), - connect.WithHandlerOptions(opts...), - ) - deploymentServicePromoteHandler := connect.NewUnaryHandler( - DeploymentServicePromoteProcedure, - svc.Promote, - connect.WithSchema(deploymentServiceMethods.ByName("Promote")), - connect.WithHandlerOptions(opts...), - ) - return "/hydra.v1.DeploymentService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - switch r.URL.Path { - case DeploymentServiceDeployProcedure: - deploymentServiceDeployHandler.ServeHTTP(w, r) - case DeploymentServiceRollbackProcedure: - deploymentServiceRollbackHandler.ServeHTTP(w, r) - case DeploymentServicePromoteProcedure: - deploymentServicePromoteHandler.ServeHTTP(w, r) - default: - http.NotFound(w, r) - } - }) -} - -// UnimplementedDeploymentServiceHandler returns CodeUnimplemented from all methods. -type UnimplementedDeploymentServiceHandler struct{} - -func (UnimplementedDeploymentServiceHandler) Deploy(context.Context, *connect.Request[v1.DeployRequest]) (*connect.Response[v1.DeployResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.DeploymentService.Deploy is not implemented")) -} - -func (UnimplementedDeploymentServiceHandler) Rollback(context.Context, *connect.Request[v1.RollbackRequest]) (*connect.Response[v1.RollbackResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.DeploymentService.Rollback is not implemented")) -} - -func (UnimplementedDeploymentServiceHandler) Promote(context.Context, *connect.Request[v1.PromoteRequest]) (*connect.Response[v1.PromoteResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.DeploymentService.Promote is not implemented")) -} diff --git a/gen/proto/hydra/v1/hydrav1connect/routing.connect.go b/gen/proto/hydra/v1/hydrav1connect/routing.connect.go deleted file mode 100644 index 0667b6ec92..0000000000 --- a/gen/proto/hydra/v1/hydrav1connect/routing.connect.go +++ /dev/null @@ -1,111 +0,0 @@ -// Code generated by protoc-gen-connect-go. DO NOT EDIT. -// -// Source: hydra/v1/routing.proto - -package hydrav1connect - -import ( - connect "connectrpc.com/connect" - context "context" - errors "errors" - v1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" - http "net/http" - strings "strings" -) - -// This is a compile-time assertion to ensure that this generated file and the connect package are -// compatible. If you get a compiler error that this constant is not defined, this code was -// generated with a version of connect newer than the one compiled into your binary. You can fix the -// problem by either regenerating this code with an older version of connect or updating the connect -// version compiled into your binary. -const _ = connect.IsAtLeastVersion1_13_0 - -const ( - // RoutingServiceName is the fully-qualified name of the RoutingService service. - RoutingServiceName = "hydra.v1.RoutingService" -) - -// These constants are the fully-qualified names of the RPCs defined in this package. They're -// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. -// -// Note that these are different from the fully-qualified method names used by -// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to -// reflection-formatted method names, remove the leading slash and convert the remaining slash to a -// period. -const ( - // RoutingServiceAssignFrontlineRoutesProcedure is the fully-qualified name of the RoutingService's - // AssignFrontlineRoutes RPC. - RoutingServiceAssignFrontlineRoutesProcedure = "/hydra.v1.RoutingService/AssignFrontlineRoutes" -) - -// RoutingServiceClient is a client for the hydra.v1.RoutingService service. -type RoutingServiceClient interface { - // AssignFrontlineRoutes creates or reassigns frontline routes to a deployment - AssignFrontlineRoutes(context.Context, *connect.Request[v1.AssignFrontlineRoutesRequest]) (*connect.Response[v1.AssignFrontlineRoutesResponse], error) -} - -// NewRoutingServiceClient constructs a client for the hydra.v1.RoutingService service. By default, -// it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, and -// sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the connect.WithGRPC() -// or connect.WithGRPCWeb() options. -// -// The URL supplied here should be the base URL for the Connect or gRPC server (for example, -// http://api.acme.com or https://acme.com/grpc). -func NewRoutingServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) RoutingServiceClient { - baseURL = strings.TrimRight(baseURL, "/") - routingServiceMethods := v1.File_hydra_v1_routing_proto.Services().ByName("RoutingService").Methods() - return &routingServiceClient{ - assignFrontlineRoutes: connect.NewClient[v1.AssignFrontlineRoutesRequest, v1.AssignFrontlineRoutesResponse]( - httpClient, - baseURL+RoutingServiceAssignFrontlineRoutesProcedure, - connect.WithSchema(routingServiceMethods.ByName("AssignFrontlineRoutes")), - connect.WithClientOptions(opts...), - ), - } -} - -// routingServiceClient implements RoutingServiceClient. -type routingServiceClient struct { - assignFrontlineRoutes *connect.Client[v1.AssignFrontlineRoutesRequest, v1.AssignFrontlineRoutesResponse] -} - -// AssignFrontlineRoutes calls hydra.v1.RoutingService.AssignFrontlineRoutes. -func (c *routingServiceClient) AssignFrontlineRoutes(ctx context.Context, req *connect.Request[v1.AssignFrontlineRoutesRequest]) (*connect.Response[v1.AssignFrontlineRoutesResponse], error) { - return c.assignFrontlineRoutes.CallUnary(ctx, req) -} - -// RoutingServiceHandler is an implementation of the hydra.v1.RoutingService service. -type RoutingServiceHandler interface { - // AssignFrontlineRoutes creates or reassigns frontline routes to a deployment - AssignFrontlineRoutes(context.Context, *connect.Request[v1.AssignFrontlineRoutesRequest]) (*connect.Response[v1.AssignFrontlineRoutesResponse], error) -} - -// NewRoutingServiceHandler builds an HTTP handler from the service implementation. It returns the -// path on which to mount the handler and the handler itself. -// -// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf -// and JSON codecs. They also support gzip compression. -func NewRoutingServiceHandler(svc RoutingServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { - routingServiceMethods := v1.File_hydra_v1_routing_proto.Services().ByName("RoutingService").Methods() - routingServiceAssignFrontlineRoutesHandler := connect.NewUnaryHandler( - RoutingServiceAssignFrontlineRoutesProcedure, - svc.AssignFrontlineRoutes, - connect.WithSchema(routingServiceMethods.ByName("AssignFrontlineRoutes")), - connect.WithHandlerOptions(opts...), - ) - return "/hydra.v1.RoutingService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - switch r.URL.Path { - case RoutingServiceAssignFrontlineRoutesProcedure: - routingServiceAssignFrontlineRoutesHandler.ServeHTTP(w, r) - default: - http.NotFound(w, r) - } - }) -} - -// UnimplementedRoutingServiceHandler returns CodeUnimplemented from all methods. -type UnimplementedRoutingServiceHandler struct{} - -func (UnimplementedRoutingServiceHandler) AssignFrontlineRoutes(context.Context, *connect.Request[v1.AssignFrontlineRoutesRequest]) (*connect.Response[v1.AssignFrontlineRoutesResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hydra.v1.RoutingService.AssignFrontlineRoutes is not implemented")) -} diff --git a/gen/proto/hydra/v1/versioning.pb.go b/gen/proto/hydra/v1/versioning.pb.go new file mode 100644 index 0000000000..1cce65eed5 --- /dev/null +++ b/gen/proto/hydra/v1/versioning.pb.go @@ -0,0 +1,255 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.8 +// protoc (unknown) +// source: hydra/v1/versioning.proto + +package hydrav1 + +import ( + _ "github.com/restatedev/sdk-go/generated/dev/restate/sdk" + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type NextVersionRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *NextVersionRequest) Reset() { + *x = NextVersionRequest{} + mi := &file_hydra_v1_versioning_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *NextVersionRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*NextVersionRequest) ProtoMessage() {} + +func (x *NextVersionRequest) ProtoReflect() protoreflect.Message { + mi := &file_hydra_v1_versioning_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use NextVersionRequest.ProtoReflect.Descriptor instead. +func (*NextVersionRequest) Descriptor() ([]byte, []int) { + return file_hydra_v1_versioning_proto_rawDescGZIP(), []int{0} +} + +type NextVersionResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Version uint64 `protobuf:"varint,1,opt,name=version,proto3" json:"version,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *NextVersionResponse) Reset() { + *x = NextVersionResponse{} + mi := &file_hydra_v1_versioning_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *NextVersionResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*NextVersionResponse) ProtoMessage() {} + +func (x *NextVersionResponse) ProtoReflect() protoreflect.Message { + mi := &file_hydra_v1_versioning_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use NextVersionResponse.ProtoReflect.Descriptor instead. +func (*NextVersionResponse) Descriptor() ([]byte, []int) { + return file_hydra_v1_versioning_proto_rawDescGZIP(), []int{1} +} + +func (x *NextVersionResponse) GetVersion() uint64 { + if x != nil { + return x.Version + } + return 0 +} + +type GetVersionRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetVersionRequest) Reset() { + *x = GetVersionRequest{} + mi := &file_hydra_v1_versioning_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetVersionRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetVersionRequest) ProtoMessage() {} + +func (x *GetVersionRequest) ProtoReflect() protoreflect.Message { + mi := &file_hydra_v1_versioning_proto_msgTypes[2] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetVersionRequest.ProtoReflect.Descriptor instead. +func (*GetVersionRequest) Descriptor() ([]byte, []int) { + return file_hydra_v1_versioning_proto_rawDescGZIP(), []int{2} +} + +type GetVersionResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Version uint64 `protobuf:"varint,1,opt,name=version,proto3" json:"version,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetVersionResponse) Reset() { + *x = GetVersionResponse{} + mi := &file_hydra_v1_versioning_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetVersionResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetVersionResponse) ProtoMessage() {} + +func (x *GetVersionResponse) ProtoReflect() protoreflect.Message { + mi := &file_hydra_v1_versioning_proto_msgTypes[3] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetVersionResponse.ProtoReflect.Descriptor instead. +func (*GetVersionResponse) Descriptor() ([]byte, []int) { + return file_hydra_v1_versioning_proto_rawDescGZIP(), []int{3} +} + +func (x *GetVersionResponse) GetVersion() uint64 { + if x != nil { + return x.Version + } + return 0 +} + +var File_hydra_v1_versioning_proto protoreflect.FileDescriptor + +const file_hydra_v1_versioning_proto_rawDesc = "" + + "\n" + + "\x19hydra/v1/versioning.proto\x12\bhydra.v1\x1a\x18dev/restate/sdk/go.proto\"\x14\n" + + "\x12NextVersionRequest\"/\n" + + "\x13NextVersionResponse\x12\x18\n" + + "\aversion\x18\x01 \x01(\x04R\aversion\"\x13\n" + + "\x11GetVersionRequest\".\n" + + "\x12GetVersionResponse\x12\x18\n" + + "\aversion\x18\x01 \x01(\x04R\aversion2\xb2\x01\n" + + "\x11VersioningService\x12L\n" + + "\vNextVersion\x12\x1c.hydra.v1.NextVersionRequest\x1a\x1d.hydra.v1.NextVersionResponse\"\x00\x12I\n" + + "\n" + + "GetVersion\x12\x1b.hydra.v1.GetVersionRequest\x1a\x1c.hydra.v1.GetVersionResponse\"\x00\x1a\x04\x98\x80\x01\x01B\x95\x01\n" + + "\fcom.hydra.v1B\x0fVersioningProtoP\x01Z3github.com/unkeyed/unkey/gen/proto/hydra/v1;hydrav1\xa2\x02\x03HXX\xaa\x02\bHydra.V1\xca\x02\bHydra\\V1\xe2\x02\x14Hydra\\V1\\GPBMetadata\xea\x02\tHydra::V1b\x06proto3" + +var ( + file_hydra_v1_versioning_proto_rawDescOnce sync.Once + file_hydra_v1_versioning_proto_rawDescData []byte +) + +func file_hydra_v1_versioning_proto_rawDescGZIP() []byte { + file_hydra_v1_versioning_proto_rawDescOnce.Do(func() { + file_hydra_v1_versioning_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_hydra_v1_versioning_proto_rawDesc), len(file_hydra_v1_versioning_proto_rawDesc))) + }) + return file_hydra_v1_versioning_proto_rawDescData +} + +var file_hydra_v1_versioning_proto_msgTypes = make([]protoimpl.MessageInfo, 4) +var file_hydra_v1_versioning_proto_goTypes = []any{ + (*NextVersionRequest)(nil), // 0: hydra.v1.NextVersionRequest + (*NextVersionResponse)(nil), // 1: hydra.v1.NextVersionResponse + (*GetVersionRequest)(nil), // 2: hydra.v1.GetVersionRequest + (*GetVersionResponse)(nil), // 3: hydra.v1.GetVersionResponse +} +var file_hydra_v1_versioning_proto_depIdxs = []int32{ + 0, // 0: hydra.v1.VersioningService.NextVersion:input_type -> hydra.v1.NextVersionRequest + 2, // 1: hydra.v1.VersioningService.GetVersion:input_type -> hydra.v1.GetVersionRequest + 1, // 2: hydra.v1.VersioningService.NextVersion:output_type -> hydra.v1.NextVersionResponse + 3, // 3: hydra.v1.VersioningService.GetVersion:output_type -> hydra.v1.GetVersionResponse + 2, // [2:4] is the sub-list for method output_type + 0, // [0:2] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_hydra_v1_versioning_proto_init() } +func file_hydra_v1_versioning_proto_init() { + if File_hydra_v1_versioning_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_hydra_v1_versioning_proto_rawDesc), len(file_hydra_v1_versioning_proto_rawDesc)), + NumEnums: 0, + NumMessages: 4, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_hydra_v1_versioning_proto_goTypes, + DependencyIndexes: file_hydra_v1_versioning_proto_depIdxs, + MessageInfos: file_hydra_v1_versioning_proto_msgTypes, + }.Build() + File_hydra_v1_versioning_proto = out.File + file_hydra_v1_versioning_proto_goTypes = nil + file_hydra_v1_versioning_proto_depIdxs = nil +} diff --git a/gen/proto/hydra/v1/versioning_restate.pb.go b/gen/proto/hydra/v1/versioning_restate.pb.go new file mode 100644 index 0000000000..0af63cc47d --- /dev/null +++ b/gen/proto/hydra/v1/versioning_restate.pb.go @@ -0,0 +1,177 @@ +// Code generated by protoc-gen-go-restate. DO NOT EDIT. +// versions: +// - protoc-gen-go-restate v0.1 +// - protoc (unknown) +// source: hydra/v1/versioning.proto + +package hydrav1 + +import ( + fmt "fmt" + sdk_go "github.com/restatedev/sdk-go" + encoding "github.com/restatedev/sdk-go/encoding" + ingress "github.com/restatedev/sdk-go/ingress" +) + +// VersioningServiceClient is the client API for hydra.v1.VersioningService service. +// +// VersioningService provides globally unique, monotonically increasing versions +// for state synchronization between the control plane and edge agents. +// +// This is a singleton virtual object (use empty string as key). The version is +// used to track state changes in deployments and sentinels tables, enabling +// efficient incremental synchronization. +// +// Usage: +// 1. Before mutating a deployment or sentinel, call NextVersion to get a new version +// 2. Update the resource row with this version +// 3. Edge agents track their last-seen version and request changes after it +// 4. Sync queries filter by region: WHERE region = ? AND version > ? +type VersioningServiceClient interface { + // NextVersion atomically increments and returns the next version number. + // + // The version is durably stored in Restate's virtual object state, guaranteeing: + // - Monotonically increasing values (no gaps under normal operation) + // - Exactly-once semantics (retries return the same version) + // - Single-writer (singleton virtual object) + NextVersion(opts ...sdk_go.ClientOption) sdk_go.Client[*NextVersionRequest, *NextVersionResponse] + // GetVersion returns the current version without incrementing. + // Useful for stale cursor detection: if client's version < min retained, force bootstrap. + GetVersion(opts ...sdk_go.ClientOption) sdk_go.Client[*GetVersionRequest, *GetVersionResponse] +} + +type versioningServiceClient struct { + ctx sdk_go.Context + key string + options []sdk_go.ClientOption +} + +func NewVersioningServiceClient(ctx sdk_go.Context, key string, opts ...sdk_go.ClientOption) VersioningServiceClient { + cOpts := append([]sdk_go.ClientOption{sdk_go.WithProtoJSON}, opts...) + return &versioningServiceClient{ + ctx, + key, + cOpts, + } +} +func (c *versioningServiceClient) NextVersion(opts ...sdk_go.ClientOption) sdk_go.Client[*NextVersionRequest, *NextVersionResponse] { + cOpts := c.options + if len(opts) > 0 { + cOpts = append(append([]sdk_go.ClientOption{}, cOpts...), opts...) + } + return sdk_go.WithRequestType[*NextVersionRequest](sdk_go.Object[*NextVersionResponse](c.ctx, "hydra.v1.VersioningService", c.key, "NextVersion", cOpts...)) +} + +func (c *versioningServiceClient) GetVersion(opts ...sdk_go.ClientOption) sdk_go.Client[*GetVersionRequest, *GetVersionResponse] { + cOpts := c.options + if len(opts) > 0 { + cOpts = append(append([]sdk_go.ClientOption{}, cOpts...), opts...) + } + return sdk_go.WithRequestType[*GetVersionRequest](sdk_go.Object[*GetVersionResponse](c.ctx, "hydra.v1.VersioningService", c.key, "GetVersion", cOpts...)) +} + +// VersioningServiceIngressClient is the ingress client API for hydra.v1.VersioningService service. +// +// This client is used to call the service from outside of a Restate context. +type VersioningServiceIngressClient interface { + // NextVersion atomically increments and returns the next version number. + // + // The version is durably stored in Restate's virtual object state, guaranteeing: + // - Monotonically increasing values (no gaps under normal operation) + // - Exactly-once semantics (retries return the same version) + // - Single-writer (singleton virtual object) + NextVersion() ingress.Requester[*NextVersionRequest, *NextVersionResponse] + // GetVersion returns the current version without incrementing. + // Useful for stale cursor detection: if client's version < min retained, force bootstrap. + GetVersion() ingress.Requester[*GetVersionRequest, *GetVersionResponse] +} + +type versioningServiceIngressClient struct { + client *ingress.Client + serviceName string + key string +} + +func NewVersioningServiceIngressClient(client *ingress.Client, key string) VersioningServiceIngressClient { + return &versioningServiceIngressClient{ + client, + "hydra.v1.VersioningService", + key, + } +} + +func (c *versioningServiceIngressClient) NextVersion() ingress.Requester[*NextVersionRequest, *NextVersionResponse] { + codec := encoding.ProtoJSONCodec + return ingress.NewRequester[*NextVersionRequest, *NextVersionResponse](c.client, c.serviceName, "NextVersion", &c.key, &codec) +} + +func (c *versioningServiceIngressClient) GetVersion() ingress.Requester[*GetVersionRequest, *GetVersionResponse] { + codec := encoding.ProtoJSONCodec + return ingress.NewRequester[*GetVersionRequest, *GetVersionResponse](c.client, c.serviceName, "GetVersion", &c.key, &codec) +} + +// VersioningServiceServer is the server API for hydra.v1.VersioningService service. +// All implementations should embed UnimplementedVersioningServiceServer +// for forward compatibility. +// +// VersioningService provides globally unique, monotonically increasing versions +// for state synchronization between the control plane and edge agents. +// +// This is a singleton virtual object (use empty string as key). The version is +// used to track state changes in deployments and sentinels tables, enabling +// efficient incremental synchronization. +// +// Usage: +// 1. Before mutating a deployment or sentinel, call NextVersion to get a new version +// 2. Update the resource row with this version +// 3. Edge agents track their last-seen version and request changes after it +// 4. Sync queries filter by region: WHERE region = ? AND version > ? +type VersioningServiceServer interface { + // NextVersion atomically increments and returns the next version number. + // + // The version is durably stored in Restate's virtual object state, guaranteeing: + // - Monotonically increasing values (no gaps under normal operation) + // - Exactly-once semantics (retries return the same version) + // - Single-writer (singleton virtual object) + NextVersion(ctx sdk_go.ObjectContext, req *NextVersionRequest) (*NextVersionResponse, error) + // GetVersion returns the current version without incrementing. + // Useful for stale cursor detection: if client's version < min retained, force bootstrap. + GetVersion(ctx sdk_go.ObjectContext, req *GetVersionRequest) (*GetVersionResponse, error) +} + +// UnimplementedVersioningServiceServer should be embedded to have +// forward compatible implementations. +// +// NOTE: this should be embedded by value instead of pointer to avoid a nil +// pointer dereference when methods are called. +type UnimplementedVersioningServiceServer struct{} + +func (UnimplementedVersioningServiceServer) NextVersion(ctx sdk_go.ObjectContext, req *NextVersionRequest) (*NextVersionResponse, error) { + return nil, sdk_go.TerminalError(fmt.Errorf("method NextVersion not implemented"), 501) +} +func (UnimplementedVersioningServiceServer) GetVersion(ctx sdk_go.ObjectContext, req *GetVersionRequest) (*GetVersionResponse, error) { + return nil, sdk_go.TerminalError(fmt.Errorf("method GetVersion not implemented"), 501) +} +func (UnimplementedVersioningServiceServer) testEmbeddedByValue() {} + +// UnsafeVersioningServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to VersioningServiceServer will +// result in compilation errors. +type UnsafeVersioningServiceServer interface { + mustEmbedUnimplementedVersioningServiceServer() +} + +func NewVersioningServiceServer(srv VersioningServiceServer, opts ...sdk_go.ServiceDefinitionOption) sdk_go.ServiceDefinition { + // If the following call panics, it indicates UnimplementedVersioningServiceServer was + // embedded by pointer and is nil. This will cause panics if an + // unimplemented method is ever invoked, so we test this at initialization + // time to prevent it from happening at runtime later due to I/O. + if t, ok := srv.(interface{ testEmbeddedByValue() }); ok { + t.testEmbeddedByValue() + } + sOpts := append([]sdk_go.ServiceDefinitionOption{sdk_go.WithProtoJSON}, opts...) + router := sdk_go.NewObject("hydra.v1.VersioningService", sOpts...) + router = router.Handler("NextVersion", sdk_go.NewObjectHandler(srv.NextVersion)) + router = router.Handler("GetVersion", sdk_go.NewObjectHandler(srv.GetVersion)) + return router +} diff --git a/pkg/db/BUILD.bazel b/pkg/db/BUILD.bazel index b35f2b8fa2..d104da1247 100644 --- a/pkg/db/BUILD.bazel +++ b/pkg/db/BUILD.bazel @@ -71,6 +71,7 @@ go_library( "clickhouse_workspace_settings_find_by_workspace_id.sql_generated.go", "clickhouse_workspace_settings_insert.sql_generated.go", "clickhouse_workspace_settings_update_limits.sql_generated.go", + "cluster_state_versions.sql_generated.go", "custom_domain_find_by_domain.sql_generated.go", "custom_domain_find_by_domain_or_wildcard.sql_generated.go", "custom_domain_find_by_id.sql_generated.go", @@ -82,6 +83,7 @@ go_library( "deployment_find_by_k8s_name.sql_generated.go", "deployment_insert.sql_generated.go", "deployment_topology_by_id_and_region.sql_generated.go", + "deployment_topology_find_by_versions.sql_generated.go", "deployment_topology_find_regions.sql_generated.go", "deployment_topology_insert.sql_generated.go", "deployment_topology_list_desired.sql_generated.go", @@ -233,6 +235,7 @@ go_library( "role_permission_insert.sql_generated.go", "sentinel_find_by_environment_id.sql_generated.go", "sentinel_find_by_id.sql_generated.go", + "sentinel_find_by_versions.sql_generated.go", "sentinel_insert.sql_generated.go", "sentinel_list_desired.sql_generated.go", "sentinel_update_available_replicas_and_health.sql_generated.go", diff --git a/pkg/db/bulk_deployment_topology_insert.sql_generated.go b/pkg/db/bulk_deployment_topology_insert.sql_generated.go index 14d182337e..47d2f8d66f 100644 --- a/pkg/db/bulk_deployment_topology_insert.sql_generated.go +++ b/pkg/db/bulk_deployment_topology_insert.sql_generated.go @@ -9,7 +9,7 @@ import ( ) // bulkInsertDeploymentTopology is the base query for bulk insert -const bulkInsertDeploymentTopology = `INSERT INTO ` + "`" + `deployment_topology` + "`" + ` ( workspace_id, deployment_id, region, desired_replicas, desired_status, created_at ) VALUES %s` +const bulkInsertDeploymentTopology = `INSERT INTO ` + "`" + `deployment_topology` + "`" + ` ( workspace_id, deployment_id, region, desired_replicas, desired_status, version, created_at ) VALUES %s` // InsertDeploymentTopologies performs bulk insert in a single query func (q *BulkQueries) InsertDeploymentTopologies(ctx context.Context, db DBTX, args []InsertDeploymentTopologyParams) error { @@ -21,7 +21,7 @@ func (q *BulkQueries) InsertDeploymentTopologies(ctx context.Context, db DBTX, a // Build the bulk insert query valueClauses := make([]string, len(args)) for i := range args { - valueClauses[i] = "( ?, ?, ?, ?, ?, ? )" + valueClauses[i] = "( ?, ?, ?, ?, ?, ?, ? )" } bulkQuery := fmt.Sprintf(bulkInsertDeploymentTopology, strings.Join(valueClauses, ", ")) @@ -34,6 +34,7 @@ func (q *BulkQueries) InsertDeploymentTopologies(ctx context.Context, db DBTX, a allArgs = append(allArgs, arg.Region) allArgs = append(allArgs, arg.DesiredReplicas) allArgs = append(allArgs, arg.DesiredStatus) + allArgs = append(allArgs, arg.Version) allArgs = append(allArgs, arg.CreatedAt) } diff --git a/pkg/db/bulk_sentinel_insert.sql_generated.go b/pkg/db/bulk_sentinel_insert.sql_generated.go index 69f4741d23..d605dcb6ba 100644 --- a/pkg/db/bulk_sentinel_insert.sql_generated.go +++ b/pkg/db/bulk_sentinel_insert.sql_generated.go @@ -9,7 +9,7 @@ import ( ) // bulkInsertSentinel is the base query for bulk insert -const bulkInsertSentinel = `INSERT INTO sentinels ( id, workspace_id, environment_id, project_id, k8s_address, k8s_name, region, image, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at ) VALUES %s` +const bulkInsertSentinel = `INSERT INTO sentinels ( id, workspace_id, environment_id, project_id, k8s_address, k8s_name, region, image, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at ) VALUES %s` // InsertSentinels performs bulk insert in a single query func (q *BulkQueries) InsertSentinels(ctx context.Context, db DBTX, args []InsertSentinelParams) error { @@ -21,7 +21,7 @@ func (q *BulkQueries) InsertSentinels(ctx context.Context, db DBTX, args []Inser // Build the bulk insert query valueClauses := make([]string, len(args)) for i := range args { - valueClauses[i] = "( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" + valueClauses[i] = "( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" } bulkQuery := fmt.Sprintf(bulkInsertSentinel, strings.Join(valueClauses, ", ")) @@ -42,6 +42,7 @@ func (q *BulkQueries) InsertSentinels(ctx context.Context, db DBTX, args []Inser allArgs = append(allArgs, arg.AvailableReplicas) allArgs = append(allArgs, arg.CpuMillicores) allArgs = append(allArgs, arg.MemoryMib) + allArgs = append(allArgs, arg.Version) allArgs = append(allArgs, arg.CreatedAt) } diff --git a/pkg/db/cluster_state_versions.sql_generated.go b/pkg/db/cluster_state_versions.sql_generated.go new file mode 100644 index 0000000000..c4e5e233db --- /dev/null +++ b/pkg/db/cluster_state_versions.sql_generated.go @@ -0,0 +1,83 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: cluster_state_versions.sql + +package db + +import ( + "context" +) + +const listClusterStateVersions = `-- name: ListClusterStateVersions :many +SELECT combined.version, combined.kind FROM ( + SELECT dt.version, 'deployment' AS kind + FROM ` + "`" + `deployment_topology` + "`" + ` dt + WHERE dt.region = ? + AND dt.version > ? + UNION ALL + SELECT s.version, 'sentinel' AS kind + FROM ` + "`" + `sentinels` + "`" + ` s + WHERE s.region = ? + AND s.version > ? +) AS combined +ORDER BY combined.version ASC +LIMIT ? +` + +type ListClusterStateVersionsParams struct { + Region string `db:"region"` + AfterVersion uint64 `db:"after_version"` + Limit int32 `db:"limit"` +} + +type ListClusterStateVersionsRow struct { + Version uint64 `db:"version"` + Kind string `db:"kind"` +} + +// ListClusterStateVersions returns the next N (version, kind) pairs in global version order. +// Used to determine which resources to fetch for sync, without loading full row data. +// The 'kind' discriminator is 'deployment' or 'sentinel'. +// +// SELECT combined.version, combined.kind FROM ( +// SELECT dt.version, 'deployment' AS kind +// FROM `deployment_topology` dt +// WHERE dt.region = ? +// AND dt.version > ? +// UNION ALL +// SELECT s.version, 'sentinel' AS kind +// FROM `sentinels` s +// WHERE s.region = ? +// AND s.version > ? +// ) AS combined +// ORDER BY combined.version ASC +// LIMIT ? +func (q *Queries) ListClusterStateVersions(ctx context.Context, db DBTX, arg ListClusterStateVersionsParams) ([]ListClusterStateVersionsRow, error) { + rows, err := db.QueryContext(ctx, listClusterStateVersions, + arg.Region, + arg.AfterVersion, + arg.Region, + arg.AfterVersion, + arg.Limit, + ) + if err != nil { + return nil, err + } + defer rows.Close() + var items []ListClusterStateVersionsRow + for rows.Next() { + var i ListClusterStateVersionsRow + if err := rows.Scan(&i.Version, &i.Kind); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} diff --git a/pkg/db/deployment_topology_find_by_versions.sql_generated.go b/pkg/db/deployment_topology_find_by_versions.sql_generated.go new file mode 100644 index 0000000000..1181fcdb44 --- /dev/null +++ b/pkg/db/deployment_topology_find_by_versions.sql_generated.go @@ -0,0 +1,107 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: deployment_topology_find_by_versions.sql + +package db + +import ( + "context" + "database/sql" + "strings" +) + +const findDeploymentTopologyByVersions = `-- name: FindDeploymentTopologyByVersions :many +SELECT + dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, + d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.status, d.created_at, d.updated_at, + w.k8s_namespace +FROM ` + "`" + `deployment_topology` + "`" + ` dt +INNER JOIN ` + "`" + `deployments` + "`" + ` d ON dt.deployment_id = d.id +INNER JOIN ` + "`" + `workspaces` + "`" + ` w ON d.workspace_id = w.id +WHERE dt.version IN (/*SLICE:versions*/?) +` + +type FindDeploymentTopologyByVersionsRow struct { + DeploymentTopology DeploymentTopology `db:"deployment_topology"` + Deployment Deployment `db:"deployment"` + K8sNamespace sql.NullString `db:"k8s_namespace"` +} + +// FindDeploymentTopologyByVersions returns deployment topologies for specific versions. +// Used after ListClusterStateVersions to hydrate the full deployment data. +// +// SELECT +// dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, +// d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.status, d.created_at, d.updated_at, +// w.k8s_namespace +// FROM `deployment_topology` dt +// INNER JOIN `deployments` d ON dt.deployment_id = d.id +// INNER JOIN `workspaces` w ON d.workspace_id = w.id +// WHERE dt.version IN (/*SLICE:versions*/?) +func (q *Queries) FindDeploymentTopologyByVersions(ctx context.Context, db DBTX, versions []uint64) ([]FindDeploymentTopologyByVersionsRow, error) { + query := findDeploymentTopologyByVersions + var queryParams []interface{} + if len(versions) > 0 { + for _, v := range versions { + queryParams = append(queryParams, v) + } + query = strings.Replace(query, "/*SLICE:versions*/?", strings.Repeat(",?", len(versions))[1:], 1) + } else { + query = strings.Replace(query, "/*SLICE:versions*/?", "NULL", 1) + } + rows, err := db.QueryContext(ctx, query, queryParams...) + if err != nil { + return nil, err + } + defer rows.Close() + var items []FindDeploymentTopologyByVersionsRow + for rows.Next() { + var i FindDeploymentTopologyByVersionsRow + if err := rows.Scan( + &i.DeploymentTopology.Pk, + &i.DeploymentTopology.WorkspaceID, + &i.DeploymentTopology.DeploymentID, + &i.DeploymentTopology.Region, + &i.DeploymentTopology.DesiredReplicas, + &i.DeploymentTopology.Version, + &i.DeploymentTopology.DesiredStatus, + &i.DeploymentTopology.CreatedAt, + &i.DeploymentTopology.UpdatedAt, + &i.Deployment.Pk, + &i.Deployment.ID, + &i.Deployment.K8sName, + &i.Deployment.WorkspaceID, + &i.Deployment.ProjectID, + &i.Deployment.EnvironmentID, + &i.Deployment.Image, + &i.Deployment.BuildID, + &i.Deployment.GitCommitSha, + &i.Deployment.GitBranch, + &i.Deployment.GitCommitMessage, + &i.Deployment.GitCommitAuthorHandle, + &i.Deployment.GitCommitAuthorAvatarUrl, + &i.Deployment.GitCommitTimestamp, + &i.Deployment.SentinelConfig, + &i.Deployment.OpenapiSpec, + &i.Deployment.CpuMillicores, + &i.Deployment.MemoryMib, + &i.Deployment.DesiredState, + &i.Deployment.EncryptedEnvironmentVariables, + &i.Deployment.Status, + &i.Deployment.CreatedAt, + &i.Deployment.UpdatedAt, + &i.K8sNamespace, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} diff --git a/pkg/db/deployment_topology_insert.sql_generated.go b/pkg/db/deployment_topology_insert.sql_generated.go index ae6655f0b8..1ab6dc8a1f 100644 --- a/pkg/db/deployment_topology_insert.sql_generated.go +++ b/pkg/db/deployment_topology_insert.sql_generated.go @@ -16,6 +16,7 @@ INSERT INTO ` + "`" + `deployment_topology` + "`" + ` ( region, desired_replicas, desired_status, + version, created_at ) VALUES ( ?, @@ -23,6 +24,7 @@ INSERT INTO ` + "`" + `deployment_topology` + "`" + ` ( ?, ?, ?, + ?, ? ) ` @@ -33,6 +35,7 @@ type InsertDeploymentTopologyParams struct { Region string `db:"region"` DesiredReplicas int32 `db:"desired_replicas"` DesiredStatus DeploymentTopologyDesiredStatus `db:"desired_status"` + Version uint64 `db:"version"` CreatedAt int64 `db:"created_at"` } @@ -44,6 +47,7 @@ type InsertDeploymentTopologyParams struct { // region, // desired_replicas, // desired_status, +// version, // created_at // ) VALUES ( // ?, @@ -51,6 +55,7 @@ type InsertDeploymentTopologyParams struct { // ?, // ?, // ?, +// ?, // ? // ) func (q *Queries) InsertDeploymentTopology(ctx context.Context, db DBTX, arg InsertDeploymentTopologyParams) error { @@ -60,6 +65,7 @@ func (q *Queries) InsertDeploymentTopology(ctx context.Context, db DBTX, arg Ins arg.Region, arg.DesiredReplicas, arg.DesiredStatus, + arg.Version, arg.CreatedAt, ) return err diff --git a/pkg/db/deployment_topology_list_desired.sql_generated.go b/pkg/db/deployment_topology_list_desired.sql_generated.go index 59c5862eec..273ba96d78 100644 --- a/pkg/db/deployment_topology_list_desired.sql_generated.go +++ b/pkg/db/deployment_topology_list_desired.sql_generated.go @@ -12,7 +12,7 @@ import ( const listDesiredDeploymentTopology = `-- name: ListDesiredDeploymentTopology :many SELECT - dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.desired_status, dt.created_at, dt.updated_at, + dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.status, d.created_at, d.updated_at, w.k8s_namespace FROM ` + "`" + `deployment_topology` + "`" + ` dt @@ -40,10 +40,9 @@ type ListDesiredDeploymentTopologyRow struct { // ListDesiredDeploymentTopology returns all deployment topologies matching the desired state for a region. // Used during bootstrap to stream all running deployments to krane. -// The version parameter is deprecated and ignored (kept for backwards compatibility). // // SELECT -// dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.desired_status, dt.created_at, dt.updated_at, +// dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, // d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.status, d.created_at, d.updated_at, // w.k8s_namespace // FROM `deployment_topology` dt @@ -75,6 +74,7 @@ func (q *Queries) ListDesiredDeploymentTopology(ctx context.Context, db DBTX, ar &i.DeploymentTopology.DeploymentID, &i.DeploymentTopology.Region, &i.DeploymentTopology.DesiredReplicas, + &i.DeploymentTopology.Version, &i.DeploymentTopology.DesiredStatus, &i.DeploymentTopology.CreatedAt, &i.DeploymentTopology.UpdatedAt, diff --git a/pkg/db/models_generated.go b/pkg/db/models_generated.go index 71f956f9a2..406d4ee01c 100644 --- a/pkg/db/models_generated.go +++ b/pkg/db/models_generated.go @@ -978,6 +978,7 @@ type DeploymentTopology struct { DeploymentID string `db:"deployment_id"` Region string `db:"region"` DesiredReplicas int32 `db:"desired_replicas"` + Version uint64 `db:"version"` DesiredStatus DeploymentTopologyDesiredStatus `db:"desired_status"` CreatedAt int64 `db:"created_at"` UpdatedAt sql.NullInt64 `db:"updated_at"` @@ -1244,6 +1245,7 @@ type Sentinel struct { AvailableReplicas int32 `db:"available_replicas"` CpuMillicores int32 `db:"cpu_millicores"` MemoryMib int32 `db:"memory_mib"` + Version uint64 `db:"version"` CreatedAt int64 `db:"created_at"` UpdatedAt sql.NullInt64 `db:"updated_at"` } diff --git a/pkg/db/querier_generated.go b/pkg/db/querier_generated.go index 30d183cb60..b63b1cd721 100644 --- a/pkg/db/querier_generated.go +++ b/pkg/db/querier_generated.go @@ -233,6 +233,18 @@ type Querier interface { // AND dt.deployment_id = ? // LIMIT 1 FindDeploymentTopologyByIDAndRegion(ctx context.Context, db DBTX, arg FindDeploymentTopologyByIDAndRegionParams) (FindDeploymentTopologyByIDAndRegionRow, error) + // FindDeploymentTopologyByVersions returns deployment topologies for specific versions. + // Used after ListClusterStateVersions to hydrate the full deployment data. + // + // SELECT + // dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, + // d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.status, d.created_at, d.updated_at, + // w.k8s_namespace + // FROM `deployment_topology` dt + // INNER JOIN `deployments` d ON dt.deployment_id = d.id + // INNER JOIN `workspaces` w ON d.workspace_id = w.id + // WHERE dt.version IN (/*SLICE:versions*/?) + FindDeploymentTopologyByVersions(ctx context.Context, db DBTX, versions []uint64) ([]FindDeploymentTopologyByVersionsRow, error) //FindEnvironmentById // // SELECT id, workspace_id, project_id, slug, description @@ -944,13 +956,18 @@ type Querier interface { FindRolesByNames(ctx context.Context, db DBTX, arg FindRolesByNamesParams) ([]FindRolesByNamesRow, error) //FindSentinelByID // - // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at FROM sentinels s + // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM sentinels s // WHERE id = ? LIMIT 1 FindSentinelByID(ctx context.Context, db DBTX, id string) (Sentinel, error) //FindSentinelsByEnvironmentID // - // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at FROM sentinels WHERE environment_id = ? + // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM sentinels WHERE environment_id = ? FindSentinelsByEnvironmentID(ctx context.Context, db DBTX, environmentID string) ([]Sentinel, error) + // FindSentinelsByVersions returns sentinels for specific versions. + // Used after ListClusterStateVersions to hydrate the full sentinel data. + // + // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM `sentinels` WHERE version IN (/*SLICE:versions*/?) + FindSentinelsByVersions(ctx context.Context, db DBTX, versions []uint64) ([]Sentinel, error) //FindWorkspaceByID // // SELECT id, org_id, name, slug, k8s_namespace, partition_id, plan, tier, stripe_customer_id, stripe_subscription_id, beta_features, features, subscriptions, enabled, delete_protection, created_at_m, updated_at_m, deleted_at_m FROM `workspaces` @@ -969,8 +986,8 @@ type Querier interface { // WHERE id = ? // AND deleted_at_m IS NULL GetKeyAuthByID(ctx context.Context, db DBTX, id string) (GetKeyAuthByIDRow, error) - // Returns the highest sequence for a region. - // Used during bootstrap to get the watermark before streaming current state. + // GetMaxStateChangeSequence returns the highest sequence number for a region. + // Used during bootstrap to set the sequence boundary. // // SELECT CAST(COALESCE(MAX(sequence), 0) AS UNSIGNED) AS max_sequence // FROM `state_changes` @@ -1193,6 +1210,7 @@ type Querier interface { // region, // desired_replicas, // desired_status, + // version, // created_at // ) VALUES ( // ?, @@ -1200,6 +1218,7 @@ type Querier interface { // ?, // ?, // ?, + // ?, // ? // ) InsertDeploymentTopology(ctx context.Context, db DBTX, arg InsertDeploymentTopologyParams) error @@ -1571,6 +1590,7 @@ type Querier interface { // available_replicas, // cpu_millicores, // memory_mib, + // version, // created_at // ) VALUES ( // ?, @@ -1586,6 +1606,7 @@ type Querier interface { // ?, // ?, // ?, + // ?, // ? // ) InsertSentinel(ctx context.Context, db DBTX, arg InsertSentinelParams) error @@ -1632,12 +1653,29 @@ type Querier interface { // true // ) InsertWorkspace(ctx context.Context, db DBTX, arg InsertWorkspaceParams) error + // ListClusterStateVersions returns the next N (version, kind) pairs in global version order. + // Used to determine which resources to fetch for sync, without loading full row data. + // The 'kind' discriminator is 'deployment' or 'sentinel'. + // + // SELECT combined.version, combined.kind FROM ( + // SELECT dt.version, 'deployment' AS kind + // FROM `deployment_topology` dt + // WHERE dt.region = ? + // AND dt.version > ? + // UNION ALL + // SELECT s.version, 'sentinel' AS kind + // FROM `sentinels` s + // WHERE s.region = ? + // AND s.version > ? + // ) AS combined + // ORDER BY combined.version ASC + // LIMIT ? + ListClusterStateVersions(ctx context.Context, db DBTX, arg ListClusterStateVersionsParams) ([]ListClusterStateVersionsRow, error) // ListDesiredDeploymentTopology returns all deployment topologies matching the desired state for a region. // Used during bootstrap to stream all running deployments to krane. - // The version parameter is deprecated and ignored (kept for backwards compatibility). // // SELECT - // dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.desired_status, dt.created_at, dt.updated_at, + // dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, // d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.status, d.created_at, d.updated_at, // w.k8s_namespace // FROM `deployment_topology` dt @@ -1651,9 +1689,8 @@ type Querier interface { ListDesiredDeploymentTopology(ctx context.Context, db DBTX, arg ListDesiredDeploymentTopologyParams) ([]ListDesiredDeploymentTopologyRow, error) // ListDesiredSentinels returns all sentinels matching the desired state for a region. // Used during bootstrap to stream all running sentinels to krane. - // The version parameter is deprecated and ignored (kept for backwards compatibility). // - // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at + // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at // FROM `sentinels` // WHERE (? = '' OR region = ?) // AND desired_state = ? diff --git a/pkg/db/queries/cluster_state_versions.sql b/pkg/db/queries/cluster_state_versions.sql new file mode 100644 index 0000000000..06c9068055 --- /dev/null +++ b/pkg/db/queries/cluster_state_versions.sql @@ -0,0 +1,17 @@ +-- name: ListClusterStateVersions :many +-- ListClusterStateVersions returns the next N (version, kind) pairs in global version order. +-- Used to determine which resources to fetch for sync, without loading full row data. +-- The 'kind' discriminator is 'deployment' or 'sentinel'. +SELECT combined.version, combined.kind FROM ( + SELECT dt.version, 'deployment' AS kind + FROM `deployment_topology` dt + WHERE dt.region = sqlc.arg(region) + AND dt.version > sqlc.arg(after_version) + UNION ALL + SELECT s.version, 'sentinel' AS kind + FROM `sentinels` s + WHERE s.region = sqlc.arg(region) + AND s.version > sqlc.arg(after_version) +) AS combined +ORDER BY combined.version ASC +LIMIT ?; diff --git a/pkg/db/queries/deployment_topology_find_by_versions.sql b/pkg/db/queries/deployment_topology_find_by_versions.sql new file mode 100644 index 0000000000..01b1e50883 --- /dev/null +++ b/pkg/db/queries/deployment_topology_find_by_versions.sql @@ -0,0 +1,11 @@ +-- name: FindDeploymentTopologyByVersions :many +-- FindDeploymentTopologyByVersions returns deployment topologies for specific versions. +-- Used after ListClusterStateVersions to hydrate the full deployment data. +SELECT + sqlc.embed(dt), + sqlc.embed(d), + w.k8s_namespace +FROM `deployment_topology` dt +INNER JOIN `deployments` d ON dt.deployment_id = d.id +INNER JOIN `workspaces` w ON d.workspace_id = w.id +WHERE dt.version IN (sqlc.slice(versions)); diff --git a/pkg/db/queries/deployment_topology_insert.sql b/pkg/db/queries/deployment_topology_insert.sql index b76ea61296..37158bb00b 100644 --- a/pkg/db/queries/deployment_topology_insert.sql +++ b/pkg/db/queries/deployment_topology_insert.sql @@ -5,6 +5,7 @@ INSERT INTO `deployment_topology` ( region, desired_replicas, desired_status, + version, created_at ) VALUES ( sqlc.arg(workspace_id), @@ -12,5 +13,6 @@ INSERT INTO `deployment_topology` ( sqlc.arg(region), sqlc.arg(desired_replicas), sqlc.arg(desired_status), + sqlc.arg(version), sqlc.arg(created_at) ); diff --git a/pkg/db/queries/deployment_topology_list_desired.sql b/pkg/db/queries/deployment_topology_list_desired.sql index b8051e4628..f3b5b79dc1 100644 --- a/pkg/db/queries/deployment_topology_list_desired.sql +++ b/pkg/db/queries/deployment_topology_list_desired.sql @@ -1,7 +1,6 @@ -- name: ListDesiredDeploymentTopology :many -- ListDesiredDeploymentTopology returns all deployment topologies matching the desired state for a region. -- Used during bootstrap to stream all running deployments to krane. --- The version parameter is deprecated and ignored (kept for backwards compatibility). SELECT sqlc.embed(dt), sqlc.embed(d), diff --git a/pkg/db/queries/sentinel_find_by_versions.sql b/pkg/db/queries/sentinel_find_by_versions.sql new file mode 100644 index 0000000000..182f47be3c --- /dev/null +++ b/pkg/db/queries/sentinel_find_by_versions.sql @@ -0,0 +1,4 @@ +-- name: FindSentinelsByVersions :many +-- FindSentinelsByVersions returns sentinels for specific versions. +-- Used after ListClusterStateVersions to hydrate the full sentinel data. +SELECT * FROM `sentinels` WHERE version IN (sqlc.slice(versions)); diff --git a/pkg/db/queries/sentinel_insert.sql b/pkg/db/queries/sentinel_insert.sql index f6152c8206..940a142552 100644 --- a/pkg/db/queries/sentinel_insert.sql +++ b/pkg/db/queries/sentinel_insert.sql @@ -13,6 +13,7 @@ INSERT INTO sentinels ( available_replicas, cpu_millicores, memory_mib, + version, created_at ) VALUES ( sqlc.arg(id), @@ -28,5 +29,6 @@ INSERT INTO sentinels ( sqlc.arg(available_replicas), sqlc.arg(cpu_millicores), sqlc.arg(memory_mib), + sqlc.arg(version), sqlc.arg(created_at) ); diff --git a/pkg/db/queries/sentinel_list_desired.sql b/pkg/db/queries/sentinel_list_desired.sql index f31f90def8..b499a13fa7 100644 --- a/pkg/db/queries/sentinel_list_desired.sql +++ b/pkg/db/queries/sentinel_list_desired.sql @@ -1,7 +1,6 @@ -- name: ListDesiredSentinels :many -- ListDesiredSentinels returns all sentinels matching the desired state for a region. -- Used during bootstrap to stream all running sentinels to krane. --- The version parameter is deprecated and ignored (kept for backwards compatibility). SELECT * FROM `sentinels` WHERE (sqlc.arg(region) = '' OR region = sqlc.arg(region)) diff --git a/pkg/db/queries/state_change_get_max_sequence.sql b/pkg/db/queries/state_change_get_max_sequence.sql index 430a7ee7ad..ed2a6650a9 100644 --- a/pkg/db/queries/state_change_get_max_sequence.sql +++ b/pkg/db/queries/state_change_get_max_sequence.sql @@ -1,6 +1,6 @@ -- name: GetMaxStateChangeSequence :one --- Returns the highest sequence for a region. --- Used during bootstrap to get the watermark before streaming current state. +-- GetMaxStateChangeSequence returns the highest sequence number for a region. +-- Used during bootstrap to set the sequence boundary. SELECT CAST(COALESCE(MAX(sequence), 0) AS UNSIGNED) AS max_sequence FROM `state_changes` WHERE region = sqlc.arg(region); diff --git a/pkg/db/schema.sql b/pkg/db/schema.sql index f1eaf3f6d5..9f76f08095 100644 --- a/pkg/db/schema.sql +++ b/pkg/db/schema.sql @@ -451,10 +451,12 @@ CREATE TABLE `deployment_topology` ( `deployment_id` varchar(64) NOT NULL, `region` varchar(64) NOT NULL, `desired_replicas` int NOT NULL, + `version` bigint unsigned NOT NULL, `desired_status` enum('starting','started','stopping','stopped') NOT NULL, `created_at` bigint NOT NULL, `updated_at` bigint, CONSTRAINT `deployment_topology_pk` PRIMARY KEY(`pk`), + CONSTRAINT `deployment_topology_version_unique` UNIQUE(`version`), CONSTRAINT `unique_region_per_deployment` UNIQUE(`deployment_id`,`region`) ); @@ -514,12 +516,14 @@ CREATE TABLE `sentinels` ( `available_replicas` int NOT NULL, `cpu_millicores` int NOT NULL, `memory_mib` int NOT NULL, + `version` bigint unsigned NOT NULL, `created_at` bigint NOT NULL, `updated_at` bigint, CONSTRAINT `sentinels_pk` PRIMARY KEY(`pk`), CONSTRAINT `sentinels_id_unique` UNIQUE(`id`), CONSTRAINT `sentinels_k8s_name_unique` UNIQUE(`k8s_name`), CONSTRAINT `sentinels_k8s_address_unique` UNIQUE(`k8s_address`), + CONSTRAINT `sentinels_version_unique` UNIQUE(`version`), CONSTRAINT `one_env_per_region` UNIQUE(`environment_id`,`region`) ); @@ -606,11 +610,13 @@ CREATE INDEX `workspace_idx` ON `deployment_topology` (`workspace_id`); CREATE INDEX `deployment_idx` ON `deployment_topology` (`deployment_id`); CREATE INDEX `region_idx` ON `deployment_topology` (`region`); CREATE INDEX `status_idx` ON `deployment_topology` (`desired_status`); +CREATE INDEX `region_version_idx` ON `deployment_topology` (`region`,`version`); CREATE INDEX `domain_idx` ON `acme_users` (`workspace_id`); CREATE INDEX `workspace_idx` ON `custom_domains` (`workspace_id`); CREATE INDEX `workspace_idx` ON `acme_challenges` (`workspace_id`); CREATE INDEX `status_idx` ON `acme_challenges` (`status`); CREATE INDEX `idx_environment_id` ON `sentinels` (`environment_id`); +CREATE INDEX `region_version_idx` ON `sentinels` (`region`,`version`); CREATE INDEX `idx_deployment_id` ON `instances` (`deployment_id`); CREATE INDEX `idx_region` ON `instances` (`region`); CREATE INDEX `environment_id_idx` ON `frontline_routes` (`environment_id`); diff --git a/pkg/db/sentinel_find_by_environment_id.sql_generated.go b/pkg/db/sentinel_find_by_environment_id.sql_generated.go index a54bd2887c..14d6e1e5dc 100644 --- a/pkg/db/sentinel_find_by_environment_id.sql_generated.go +++ b/pkg/db/sentinel_find_by_environment_id.sql_generated.go @@ -10,12 +10,12 @@ import ( ) const findSentinelsByEnvironmentID = `-- name: FindSentinelsByEnvironmentID :many -SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at FROM sentinels WHERE environment_id = ? +SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM sentinels WHERE environment_id = ? ` // FindSentinelsByEnvironmentID // -// SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at FROM sentinels WHERE environment_id = ? +// SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM sentinels WHERE environment_id = ? func (q *Queries) FindSentinelsByEnvironmentID(ctx context.Context, db DBTX, environmentID string) ([]Sentinel, error) { rows, err := db.QueryContext(ctx, findSentinelsByEnvironmentID, environmentID) if err != nil { @@ -41,6 +41,7 @@ func (q *Queries) FindSentinelsByEnvironmentID(ctx context.Context, db DBTX, env &i.AvailableReplicas, &i.CpuMillicores, &i.MemoryMib, + &i.Version, &i.CreatedAt, &i.UpdatedAt, ); err != nil { diff --git a/pkg/db/sentinel_find_by_id.sql_generated.go b/pkg/db/sentinel_find_by_id.sql_generated.go index a72608665f..486adfbf92 100644 --- a/pkg/db/sentinel_find_by_id.sql_generated.go +++ b/pkg/db/sentinel_find_by_id.sql_generated.go @@ -10,13 +10,13 @@ import ( ) const findSentinelByID = `-- name: FindSentinelByID :one -SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at FROM sentinels s +SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM sentinels s WHERE id = ? LIMIT 1 ` // FindSentinelByID // -// SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at FROM sentinels s +// SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM sentinels s // WHERE id = ? LIMIT 1 func (q *Queries) FindSentinelByID(ctx context.Context, db DBTX, id string) (Sentinel, error) { row := db.QueryRowContext(ctx, findSentinelByID, id) @@ -37,6 +37,7 @@ func (q *Queries) FindSentinelByID(ctx context.Context, db DBTX, id string) (Sen &i.AvailableReplicas, &i.CpuMillicores, &i.MemoryMib, + &i.Version, &i.CreatedAt, &i.UpdatedAt, ) diff --git a/pkg/db/sentinel_find_by_versions.sql_generated.go b/pkg/db/sentinel_find_by_versions.sql_generated.go new file mode 100644 index 0000000000..cfdb36a2aa --- /dev/null +++ b/pkg/db/sentinel_find_by_versions.sql_generated.go @@ -0,0 +1,71 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: sentinel_find_by_versions.sql + +package db + +import ( + "context" + "strings" +) + +const findSentinelsByVersions = `-- name: FindSentinelsByVersions :many +SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM ` + "`" + `sentinels` + "`" + ` WHERE version IN (/*SLICE:versions*/?) +` + +// FindSentinelsByVersions returns sentinels for specific versions. +// Used after ListClusterStateVersions to hydrate the full sentinel data. +// +// SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM `sentinels` WHERE version IN (/*SLICE:versions*/?) +func (q *Queries) FindSentinelsByVersions(ctx context.Context, db DBTX, versions []uint64) ([]Sentinel, error) { + query := findSentinelsByVersions + var queryParams []interface{} + if len(versions) > 0 { + for _, v := range versions { + queryParams = append(queryParams, v) + } + query = strings.Replace(query, "/*SLICE:versions*/?", strings.Repeat(",?", len(versions))[1:], 1) + } else { + query = strings.Replace(query, "/*SLICE:versions*/?", "NULL", 1) + } + rows, err := db.QueryContext(ctx, query, queryParams...) + if err != nil { + return nil, err + } + defer rows.Close() + var items []Sentinel + for rows.Next() { + var i Sentinel + if err := rows.Scan( + &i.Pk, + &i.ID, + &i.WorkspaceID, + &i.ProjectID, + &i.EnvironmentID, + &i.K8sName, + &i.K8sAddress, + &i.Region, + &i.Image, + &i.DesiredState, + &i.Health, + &i.DesiredReplicas, + &i.AvailableReplicas, + &i.CpuMillicores, + &i.MemoryMib, + &i.Version, + &i.CreatedAt, + &i.UpdatedAt, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} diff --git a/pkg/db/sentinel_insert.sql_generated.go b/pkg/db/sentinel_insert.sql_generated.go index 642f194f8f..bb76da06e5 100644 --- a/pkg/db/sentinel_insert.sql_generated.go +++ b/pkg/db/sentinel_insert.sql_generated.go @@ -24,6 +24,7 @@ INSERT INTO sentinels ( available_replicas, cpu_millicores, memory_mib, + version, created_at ) VALUES ( ?, @@ -39,6 +40,7 @@ INSERT INTO sentinels ( ?, ?, ?, + ?, ? ) ` @@ -57,6 +59,7 @@ type InsertSentinelParams struct { AvailableReplicas int32 `db:"available_replicas"` CpuMillicores int32 `db:"cpu_millicores"` MemoryMib int32 `db:"memory_mib"` + Version uint64 `db:"version"` CreatedAt int64 `db:"created_at"` } @@ -76,6 +79,7 @@ type InsertSentinelParams struct { // available_replicas, // cpu_millicores, // memory_mib, +// version, // created_at // ) VALUES ( // ?, @@ -91,6 +95,7 @@ type InsertSentinelParams struct { // ?, // ?, // ?, +// ?, // ? // ) func (q *Queries) InsertSentinel(ctx context.Context, db DBTX, arg InsertSentinelParams) error { @@ -108,6 +113,7 @@ func (q *Queries) InsertSentinel(ctx context.Context, db DBTX, arg InsertSentine arg.AvailableReplicas, arg.CpuMillicores, arg.MemoryMib, + arg.Version, arg.CreatedAt, ) return err diff --git a/pkg/db/sentinel_list_desired.sql_generated.go b/pkg/db/sentinel_list_desired.sql_generated.go index 70c07d6cf1..2299032280 100644 --- a/pkg/db/sentinel_list_desired.sql_generated.go +++ b/pkg/db/sentinel_list_desired.sql_generated.go @@ -10,7 +10,7 @@ import ( ) const listDesiredSentinels = `-- name: ListDesiredSentinels :many -SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at +SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM ` + "`" + `sentinels` + "`" + ` WHERE (? = '' OR region = ?) AND desired_state = ? @@ -28,9 +28,8 @@ type ListDesiredSentinelsParams struct { // ListDesiredSentinels returns all sentinels matching the desired state for a region. // Used during bootstrap to stream all running sentinels to krane. -// The version parameter is deprecated and ignored (kept for backwards compatibility). // -// SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, created_at, updated_at +// SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at // FROM `sentinels` // WHERE (? = '' OR region = ?) // AND desired_state = ? @@ -68,6 +67,7 @@ func (q *Queries) ListDesiredSentinels(ctx context.Context, db DBTX, arg ListDes &i.AvailableReplicas, &i.CpuMillicores, &i.MemoryMib, + &i.Version, &i.CreatedAt, &i.UpdatedAt, ); err != nil { diff --git a/pkg/db/state_change_get_max_sequence.sql_generated.go b/pkg/db/state_change_get_max_sequence.sql_generated.go index e3032e9f5d..801acc7607 100644 --- a/pkg/db/state_change_get_max_sequence.sql_generated.go +++ b/pkg/db/state_change_get_max_sequence.sql_generated.go @@ -15,8 +15,8 @@ FROM ` + "`" + `state_changes` + "`" + ` WHERE region = ? ` -// Returns the highest sequence for a region. -// Used during bootstrap to get the watermark before streaming current state. +// GetMaxStateChangeSequence returns the highest sequence number for a region. +// Used during bootstrap to set the sequence boundary. // // SELECT CAST(COALESCE(MAX(sequence), 0) AS UNSIGNED) AS max_sequence // FROM `state_changes` diff --git a/svc/ctrl/BUILD.bazel b/svc/ctrl/BUILD.bazel index 7f3f8b77b1..60f2645d29 100644 --- a/svc/ctrl/BUILD.bazel +++ b/svc/ctrl/BUILD.bazel @@ -39,6 +39,7 @@ go_library( "//svc/ctrl/workflows/certificate", "//svc/ctrl/workflows/deploy", "//svc/ctrl/workflows/routing", + "//svc/ctrl/workflows/versioning", "@com_github_go_acme_lego_v4//challenge", "@com_github_restatedev_sdk_go//:sdk-go", "@com_github_restatedev_sdk_go//ingress", diff --git a/svc/ctrl/doc.go b/svc/ctrl/doc.go index 4104f8022f..f82d4f366a 100644 --- a/svc/ctrl/doc.go +++ b/svc/ctrl/doc.go @@ -9,7 +9,7 @@ // # Architecture // // The control plane consists of several integrated components: -// - HTTP/2 gRPC server for API endpoints +// - HTTP/2 Connect server for API endpoints // - Restate workflow engine for asynchronous operations // - Vault services for secrets and certificate encryption // - Database layer for persistent storage @@ -79,7 +79,7 @@ // 1. Initialize all services (database, vault, build backend, etc.) // 2. Start Restate workflow engine with all service bindings // 3. Register with Restate admin API for service discovery -// 4. Start HTTP/2 gRPC server on configured port +// 4. Start HTTP/2 Connect server on configured port // 5. Handle graceful shutdown on context cancellation // // # Observability diff --git a/svc/ctrl/integration/harness.go b/svc/ctrl/integration/harness.go index b6ec250c9f..5dca835dc8 100644 --- a/svc/ctrl/integration/harness.go +++ b/svc/ctrl/integration/harness.go @@ -17,18 +17,18 @@ import ( // Harness provides a test environment for ctrl service integration tests. // It sets up MySQL connection and seeded data for testing the sync functionality. type Harness struct { - t *testing.T - ctx context.Context - cancel context.CancelFunc - Seed *seed.Seeder - DB db.Database + t *testing.T + ctx context.Context + Seed *seed.Seeder + DB db.Database + versionCounter uint64 } // New creates a new integration test harness. func New(t *testing.T) *Harness { t.Helper() - ctx, cancel := context.WithCancel(context.Background()) + ctx := context.Background() mysqlHostCfg := containers.MySQL(t) mysqlHostCfg.DBName = "unkey" @@ -42,17 +42,16 @@ func New(t *testing.T) *Harness { require.NoError(t, err) h := &Harness{ - t: t, - ctx: ctx, - cancel: cancel, - Seed: seed.New(t, database, nil), - DB: database, + t: t, + ctx: ctx, + Seed: seed.New(t, database, nil), + DB: database, + versionCounter: 0, } h.Seed.Seed(ctx) t.Cleanup(func() { - cancel() database.Close() }) @@ -154,12 +153,14 @@ func (h *Harness) CreateDeployment(ctx context.Context, req CreateDeploymentRequ _, err = h.DB.RW().ExecContext(ctx, "UPDATE deployments SET image = ? WHERE id = ?", "nginx:1.19", deploymentID) require.NoError(h.t, err) + h.versionCounter++ err = db.Query.InsertDeploymentTopology(ctx, h.DB.RW(), db.InsertDeploymentTopologyParams{ WorkspaceID: workspaceID, DeploymentID: deploymentID, Region: req.Region, DesiredReplicas: 1, DesiredStatus: db.DeploymentTopologyDesiredStatusStarted, + Version: h.versionCounter, CreatedAt: h.Now(), }) require.NoError(h.t, err) @@ -176,6 +177,7 @@ func (h *Harness) CreateDeployment(ctx context.Context, req CreateDeploymentRequ Region: req.Region, DesiredReplicas: 1, DesiredStatus: db.DeploymentTopologyDesiredStatusStarted, + Version: h.versionCounter, CreatedAt: h.Now(), UpdatedAt: sql.NullInt64{Valid: false}, }, @@ -220,6 +222,7 @@ func (h *Harness) CreateSentinel(ctx context.Context, req CreateSentinelRequest) desiredState = db.SentinelsDesiredStateRunning } + h.versionCounter++ err := db.Query.InsertSentinel(ctx, h.DB.RW(), db.InsertSentinelParams{ ID: sentinelID, WorkspaceID: workspaceID, @@ -234,6 +237,7 @@ func (h *Harness) CreateSentinel(ctx context.Context, req CreateSentinelRequest) AvailableReplicas: 1, CpuMillicores: 100, MemoryMib: 128, + Version: h.versionCounter, CreatedAt: h.Now(), }) require.NoError(h.t, err) diff --git a/svc/ctrl/integration/sync_test.go b/svc/ctrl/integration/sync_test.go index 3e18d02fa9..e12a7feaca 100644 --- a/svc/ctrl/integration/sync_test.go +++ b/svc/ctrl/integration/sync_test.go @@ -893,7 +893,7 @@ func TestSync_StateChangeQueries(t *testing.T) { // // Guarantees: // - Returns the sequence of the most recent state change -// - Used during bootstrap to set the Bookmark sequence +// - Used during bootstrap to set the sequence boundary func TestSync_MaxSequenceQuery(t *testing.T) { h := New(t) ctx := h.Context() diff --git a/svc/ctrl/middleware/auth.go b/svc/ctrl/middleware/auth.go index 6dfe7cdbda..8f0bec9995 100644 --- a/svc/ctrl/middleware/auth.go +++ b/svc/ctrl/middleware/auth.go @@ -53,7 +53,7 @@ func NewAuthMiddleware(config AuthConfig) *AuthMiddleware { } } -// ConnectInterceptor returns a Connect interceptor for gRPC-like services. +// ConnectInterceptor returns a Connect interceptor for Connect services. // // This method returns an interceptor function that validates Bearer tokens // on incoming requests. It skips authentication for health check endpoints diff --git a/svc/ctrl/middleware/doc.go b/svc/ctrl/middleware/doc.go index 7149a0ee85..7a41225357 100644 --- a/svc/ctrl/middleware/doc.go +++ b/svc/ctrl/middleware/doc.go @@ -1,7 +1,7 @@ // Package middleware provides authentication and authorization for control plane APIs. // // This package implements simple API key authentication middleware for -// protecting gRPC endpoints. It validates Bearer tokens against +// protecting Connect endpoints. It validates Bearer tokens against // expected API key and provides error responses for failed authentication. // // # Authentication Flow @@ -39,5 +39,5 @@ // - ErrInvalidAPIKey: API key doesn't match expected value // // These errors are wrapped with appropriate Connect error codes -// for gRPC transmission. +// for Connect transmission. package middleware diff --git a/svc/ctrl/proto/ctrl/v1/cluster.proto b/svc/ctrl/proto/ctrl/v1/cluster.proto index cb3138153a..63fde634b9 100644 --- a/svc/ctrl/proto/ctrl/v1/cluster.proto +++ b/svc/ctrl/proto/ctrl/v1/cluster.proto @@ -80,14 +80,14 @@ message UpdateSentinelStateResponse {} message SyncRequest { string region = 1; - uint64 sequence_last_seen = 2; + uint64 version_last_seen = 2; } message State { - // sequence is the state_changes sequence number for this event. - // Clients should persist this after successfully processing each event - // to resume from the correct position on reconnect. - uint64 sequence = 1; + // version is the resource version for this state update. + // Clients should track the max version seen and persist it after + // the stream closes cleanly to resume from the correct position on reconnect. + uint64 version = 1; oneof kind { DeploymentState deployment = 2; diff --git a/svc/ctrl/proto/hydra/v1/versioning.proto b/svc/ctrl/proto/hydra/v1/versioning.proto new file mode 100644 index 0000000000..a71547f4f6 --- /dev/null +++ b/svc/ctrl/proto/hydra/v1/versioning.proto @@ -0,0 +1,47 @@ +syntax = "proto3"; + +package hydra.v1; + +import "dev/restate/sdk/go.proto"; + +option go_package = "github.com/unkeyed/unkey/gen/proto/hydra/v1;hydrav1"; + +// VersioningService provides globally unique, monotonically increasing versions +// for state synchronization between the control plane and edge agents. +// +// This is a singleton virtual object (use empty string as key). The version is +// used to track state changes in deployments and sentinels tables, enabling +// efficient incremental synchronization. +// +// Usage: +// 1. Before mutating a deployment or sentinel, call NextVersion to get a new version +// 2. Update the resource row with this version +// 3. Edge agents track their last-seen version and request changes after it +// 4. Sync queries filter by region: WHERE region = ? AND version > ? +service VersioningService { + option (dev.restate.sdk.go.service_type) = VIRTUAL_OBJECT; + + // NextVersion atomically increments and returns the next version number. + // + // The version is durably stored in Restate's virtual object state, guaranteeing: + // - Monotonically increasing values (no gaps under normal operation) + // - Exactly-once semantics (retries return the same version) + // - Single-writer (singleton virtual object) + rpc NextVersion(NextVersionRequest) returns (NextVersionResponse) {} + + // GetVersion returns the current version without incrementing. + // Useful for stale cursor detection: if client's version < min retained, force bootstrap. + rpc GetVersion(GetVersionRequest) returns (GetVersionResponse) {} +} + +message NextVersionRequest {} + +message NextVersionResponse { + uint64 version = 1; +} + +message GetVersionRequest {} + +message GetVersionResponse { + uint64 version = 1; +} diff --git a/svc/ctrl/run.go b/svc/ctrl/run.go index 414ea48420..7ec2b73207 100644 --- a/svc/ctrl/run.go +++ b/svc/ctrl/run.go @@ -42,13 +42,14 @@ import ( "github.com/unkeyed/unkey/svc/ctrl/workflows/certificate" "github.com/unkeyed/unkey/svc/ctrl/workflows/deploy" "github.com/unkeyed/unkey/svc/ctrl/workflows/routing" + "github.com/unkeyed/unkey/svc/ctrl/workflows/versioning" "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" ) // Run starts the control plane server with the provided configuration. // -// This function initializes all required services and starts the HTTP/2 gRPC server. +// This function initializes all required services and starts the HTTP/2 Connect server. // It performs these major initialization steps: // 1. Validates configuration and initializes structured logging // 2. Sets up OpenTelemetry if enabled @@ -57,7 +58,7 @@ import ( // 5. Starts Restate workflow engine with service bindings // 6. Configures ACME challenge providers (HTTP-01, DNS-01) // 7. Registers with Restate admin API for service discovery -// 8. Starts HTTP/2 server with all gRPC handlers +// 8. Starts HTTP/2 server with all Connect handlers // 9. Boots up cluster management and starts certificate renewal // // The server handles graceful shutdown when context is cancelled, properly @@ -253,6 +254,8 @@ func Run(ctx context.Context, cfg Config) error { DefaultDomain: cfg.DefaultDomain, }), restate.WithIngressPrivate(true))) + restateSrv.Bind(hydrav1.NewVersioningServiceServer(versioning.New(), restate.WithIngressPrivate(true))) + // Initialize shared caches for ACME (needed for verification endpoint regardless of provider config) caches, cacheErr := ctrlCaches.New(ctrlCaches.Config{ Logger: logger, diff --git a/svc/ctrl/services/build/doc.go b/svc/ctrl/services/build/doc.go index f7f81f7bdc..75e057672b 100644 --- a/svc/ctrl/services/build/doc.go +++ b/svc/ctrl/services/build/doc.go @@ -54,6 +54,6 @@ // # Error Handling // // All backends provide comprehensive error handling with proper -// gRPC error codes for client communication and detailed +// Connect error codes for client communication and detailed // logging of build failures and progress. package build diff --git a/svc/ctrl/services/cluster/BUILD.bazel b/svc/ctrl/services/cluster/BUILD.bazel index 6aa85e4593..2cbf831eb3 100644 --- a/svc/ctrl/services/cluster/BUILD.bazel +++ b/svc/ctrl/services/cluster/BUILD.bazel @@ -11,9 +11,6 @@ go_library( "rpc_update_deployment_state.go", "rpc_update_sentinel_state.go", "service.go", - "sync_bootstrap.go", - "sync_changes.go", - "sync_messages.go", ], importpath = "github.com/unkeyed/unkey/svc/ctrl/services/cluster", visibility = ["//visibility:public"], diff --git a/svc/ctrl/services/cluster/auth.go b/svc/ctrl/services/cluster/auth.go index 3f92244cc8..f3d855b7f5 100644 --- a/svc/ctrl/services/cluster/auth.go +++ b/svc/ctrl/services/cluster/auth.go @@ -13,6 +13,8 @@ type request interface { Header() http.Header } +// authenticate validates the bearer token from the request's Authorization header. +// Returns a connect.CodeUnauthenticated error if the token is missing, malformed, or invalid. func (s *Service) authenticate(req request) error { header := req.Header().Get("Authorization") diff --git a/svc/ctrl/services/cluster/doc.go b/svc/ctrl/services/cluster/doc.go index d50f138d66..e730b98213 100644 --- a/svc/ctrl/services/cluster/doc.go +++ b/svc/ctrl/services/cluster/doc.go @@ -1,31 +1,32 @@ -// Package cluster implements the gRPC ClusterService for synchronizing desired state to edge nodes. +// Package cluster implements the Connect ClusterService for synchronizing desired state to krane agents. // // # Overview // -// Edge nodes (running in different regions) connect to the control plane and request +// Krane agents (similar to kubelets in Kubernetes) run in each region and act as managers +// for their respective Kubernetes clusters. They connect to the control plane and request // state synchronization via the Sync RPC. The control plane streams the desired state // for deployments and sentinels that should run in that region. // // # State Synchronization Model // -// The synchronization uses a sequence-based approach: +// The synchronization uses a version-based approach: // -// 1. Each state change (create, update, delete) is recorded in the state_changes table -// with an auto-incrementing sequence number per region. +// 1. Each resource (deployment_topology, sentinel) has a version column that is updated +// on every mutation via the Restate VersioningService singleton. // -// 2. Edge nodes track the last sequence they've seen. On reconnect, they request changes -// after that sequence. +// 2. Krane agents track the last version they've seen. On reconnect, they request changes +// after that version. // -// 3. If sequence is 0 (new node or reset), a full bootstrap is performed: all running -// deployments and sentinels are streamed. Stream close signals completion. +// 3. If version is 0 (new agent or reset), a full bootstrap is performed: all resources +// are streamed ordered by version. Stream close signals completion. // // # Convergence Guarantees // // The system guarantees eventual consistency through: // - Idempotent apply/delete operations: applying the same state multiple times is safe -// - Delete-if-uncertain semantics: if we cannot prove a resource should run in a region, -// we instruct deletion to prevent stale resources -// - Reconnection with last-seen sequence: clients catch up on missed changes +// - Soft-delete semantics: "deletes" set desired_replicas=0, keeping the row with its version +// - Bootstrap + GC: after bootstrap, agents delete any k8s resources not in the stream +// - Reconnection with last-seen version: agents catch up on missed changes // // # Key Types // diff --git a/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go b/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go index 63016df314..5823b880cb 100644 --- a/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go +++ b/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go @@ -10,6 +10,14 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) +// GetDesiredDeploymentState returns the target state for a deployment in the caller's region. +// Krane agents use this to determine whether to apply or delete a deployment. The response +// contains either an ApplyDeployment (for running state) or DeleteDeployment (for archived +// or standby states) based on the deployment's desired_state in the database. +// +// Requires bearer token authentication and the X-Krane-Region header to identify the +// requesting agent's region. Returns CodeNotFound if the deployment doesn't exist in +// the specified region, or CodeInvalidArgument if the region header is missing. func (s *Service) GetDesiredDeploymentState(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) { if err := s.authenticate(req); err != nil { diff --git a/svc/ctrl/services/cluster/rpc_get_desired_sentinel_state.go b/svc/ctrl/services/cluster/rpc_get_desired_sentinel_state.go index fc0fc55810..380eab265d 100644 --- a/svc/ctrl/services/cluster/rpc_get_desired_sentinel_state.go +++ b/svc/ctrl/services/cluster/rpc_get_desired_sentinel_state.go @@ -10,6 +10,13 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) +// GetDesiredSentinelState returns the target state for a sentinel resource. Krane agents +// use this to determine whether to apply or delete a sentinel. The response contains either +// an ApplySentinel (for running state) or DeleteSentinel (for archived or standby states) +// based on the sentinel's desired_state in the database. +// +// Requires bearer token authentication and the X-Krane-Region header. Returns CodeNotFound +// if the sentinel doesn't exist, or CodeInvalidArgument if the region header is missing. func (s *Service) GetDesiredSentinelState(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) { if err := s.authenticate(req); err != nil { diff --git a/svc/ctrl/services/cluster/rpc_sync.go b/svc/ctrl/services/cluster/rpc_sync.go index a9fabb58ad..9dda742af9 100644 --- a/svc/ctrl/services/cluster/rpc_sync.go +++ b/svc/ctrl/services/cluster/rpc_sync.go @@ -9,46 +9,215 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) -// Sync streams cluster state to an edge node for the given region. +const syncBatchSize = 100 + +// Sync streams cluster state to a krane agent for the given region. +// +// Each resource carries its actual version, so clients track max(seen versions). +// +// IMPORTANT: Clients must only commit their version tracking after a clean stream +// close. This ensures atomic bootstrap: if a stream breaks mid-bootstrap, the client +// retries from version 0 rather than skipping resources that were never received. // -// If sequence_last_seen is 0, Sync bootstraps the full desired state for the region. -// Stream close signals bootstrap completion. The client tracks the highest sequence -// from received messages and uses it for the next sync request. +// After bootstrap (versionLastSeen=0), clients should garbage-collect any k8s +// resources not mentioned in the bootstrap stream. // // Sync is a bounded catch-up stream. The server stops after sending a batch of -// changes; clients reconnect to continue from their last-seen sequence. +// changes; clients reconnect to continue from their last-seen version. func (s *Service) Sync(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest], stream *connect.ServerStream[ctrlv1.State]) error { region := req.Msg.GetRegion() - sequenceLastSeen := req.Msg.GetSequenceLastSeen() + versionLastSeen := req.Msg.GetVersionLastSeen() s.logger.Info("sync request received", "region", region, - "sequenceLastSeen", sequenceLastSeen, + "versionLastSeen", versionLastSeen, ) - sequenceAfter := sequenceLastSeen - if sequenceLastSeen == 0 { - boundary, err := s.bootstrap(ctx, region, stream) + if err := s.streamStateAfterVersion(ctx, region, versionLastSeen, stream); err != nil { + return connect.NewError(connect.CodeInternal, fmt.Errorf("stream state region=%q after_version=%d: %w", region, versionLastSeen, err)) + } + + return nil +} + +// streamStateAfterVersion streams all resources with version > afterVersion in global version order. +// It uses a three-step approach: +// 1. Query the next batch of (version, kind) pairs in global order (lightweight UNION ALL) +// 2. Partition versions by kind and hydrate full data with targeted queries +// 3. Merge results by version and stream to the client +func (s *Service) streamStateAfterVersion(ctx context.Context, region string, afterVersion uint64, stream *connect.ServerStream[ctrlv1.State]) error { + for { + // Step 1: Get next batch of versions in global order + versionRows, err := db.Query.ListClusterStateVersions(ctx, s.db.RO(), db.ListClusterStateVersionsParams{ + Region: region, + AfterVersion: afterVersion, + Limit: int32(syncBatchSize), + }) if err != nil { - return fmt.Errorf("bootstrap region=%q: %w", region, err) + return fmt.Errorf("list cluster state versions after_version=%d: %w", afterVersion, err) } - sequenceAfter = boundary + + if len(versionRows) == 0 { + return nil + } + + // Step 2: Partition versions by kind + var deploymentVersions, sentinelVersions []uint64 + for _, row := range versionRows { + switch row.Kind { + case "deployment": + deploymentVersions = append(deploymentVersions, row.Version) + case "sentinel": + sentinelVersions = append(sentinelVersions, row.Version) + } + } + + // Step 3: Hydrate full data + deploymentsByVersion := make(map[uint64]db.FindDeploymentTopologyByVersionsRow) + if len(deploymentVersions) > 0 { + topologies, err := db.Query.FindDeploymentTopologyByVersions(ctx, s.db.RO(), deploymentVersions) + if err != nil { + return fmt.Errorf("find deployment topologies by versions: %w", err) + } + for _, t := range topologies { + deploymentsByVersion[t.DeploymentTopology.Version] = t + } + } + + sentinelsByVersion := make(map[uint64]db.Sentinel) + if len(sentinelVersions) > 0 { + sentinels, err := db.Query.FindSentinelsByVersions(ctx, s.db.RO(), sentinelVersions) + if err != nil { + return fmt.Errorf("find sentinels by versions: %w", err) + } + for _, sentinel := range sentinels { + sentinelsByVersion[sentinel.Version] = sentinel + } + } + + // Step 4: Stream in global version order + for _, row := range versionRows { + var state *ctrlv1.State + + switch row.Kind { + case "deployment": + topology, ok := deploymentsByVersion[row.Version] + if !ok { + return fmt.Errorf("deployment topology version=%d not found after hydration", row.Version) + } + state = s.deploymentTopologyToState(topology) + + case "sentinel": + sentinel, ok := sentinelsByVersion[row.Version] + if !ok { + return fmt.Errorf("sentinel version=%d not found after hydration", row.Version) + } + state = s.sentinelToState(sentinel) + } + + if err := stream.Send(state); err != nil { + return fmt.Errorf("send state version=%d kind=%s: %w", row.Version, row.Kind, err) + } + } + + // Update afterVersion for next iteration + afterVersion = versionRows[len(versionRows)-1].Version + + // If we got fewer than batch size, we've reached the end + if len(versionRows) < syncBatchSize { + return nil + } + } +} + +// deploymentTopologyToState converts a deployment topology row to a State message. +// If the deployment should not be running (replicas=0 or stopped), it returns a Delete. +func (s *Service) deploymentTopologyToState(topology db.FindDeploymentTopologyByVersionsRow) *ctrlv1.State { + if topology.DeploymentTopology.DesiredReplicas == 0 || + topology.DeploymentTopology.DesiredStatus == db.DeploymentTopologyDesiredStatusStopped || + topology.DeploymentTopology.DesiredStatus == db.DeploymentTopologyDesiredStatusStopping { + return &ctrlv1.State{ + Version: topology.DeploymentTopology.Version, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: topology.K8sNamespace.String, + K8SName: topology.Deployment.K8sName, + }, + }, + }, + }, + } + } + + var buildID *string + if topology.Deployment.BuildID.Valid { + buildID = &topology.Deployment.BuildID.String } - changes, err := db.Query.ListStateChanges(ctx, s.db.RW(), db.ListStateChangesParams{ - Region: region, - AfterSequence: sequenceAfter, - Limit: 100, - }) - if err != nil { - return fmt.Errorf("list state changes region=%q after=%d: %w", region, sequenceAfter, err) + return &ctrlv1.State{ + Version: topology.DeploymentTopology.Version, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + K8SNamespace: topology.K8sNamespace.String, + K8SName: topology.Deployment.K8sName, + WorkspaceId: topology.Deployment.WorkspaceID, + EnvironmentId: topology.Deployment.EnvironmentID, + ProjectId: topology.Deployment.ProjectID, + DeploymentId: topology.Deployment.ID, + Image: topology.Deployment.Image.String, + Replicas: topology.DeploymentTopology.DesiredReplicas, + CpuMillicores: int64(topology.Deployment.CpuMillicores), + MemoryMib: int64(topology.Deployment.MemoryMib), + EncryptedEnvironmentVariables: topology.Deployment.EncryptedEnvironmentVariables, + BuildId: buildID, + }, + }, + }, + }, } +} - for _, change := range changes { - if err := s.processStateChange(ctx, region, change, stream); err != nil { - return fmt.Errorf("process state change sequence=%d: %w", change.Sequence, err) +// sentinelToState converts a sentinel row to a State message. +// If the sentinel should not be running (replicas=0 or not running state), +// it returns a Delete instruction. Otherwise, it returns an Apply instruction. +func (s *Service) sentinelToState(sentinel db.Sentinel) *ctrlv1.State { + if sentinel.DesiredReplicas == 0 || sentinel.DesiredState != db.SentinelsDesiredStateRunning { + return &ctrlv1.State{ + Version: sentinel.Version, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Delete{ + Delete: &ctrlv1.DeleteSentinel{ + K8SName: sentinel.K8sName, + }, + }, + }, + }, } } - return nil + return &ctrlv1.State{ + Version: sentinel.Version, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + K8SName: sentinel.K8sName, + WorkspaceId: sentinel.WorkspaceID, + EnvironmentId: sentinel.EnvironmentID, + ProjectId: sentinel.ProjectID, + SentinelId: sentinel.ID, + Image: sentinel.Image, + Replicas: sentinel.DesiredReplicas, + CpuMillicores: int64(sentinel.CpuMillicores), + MemoryMib: int64(sentinel.MemoryMib), + }, + }, + }, + }, + } } diff --git a/svc/ctrl/services/cluster/rpc_update_deployment_state.go b/svc/ctrl/services/cluster/rpc_update_deployment_state.go index 396d7885e8..307a7b9532 100644 --- a/svc/ctrl/services/cluster/rpc_update_deployment_state.go +++ b/svc/ctrl/services/cluster/rpc_update_deployment_state.go @@ -10,6 +10,16 @@ import ( "github.com/unkeyed/unkey/pkg/uid" ) +// UpdateDeploymentState reconciles the observed deployment state reported by a krane agent. +// This is the feedback loop for convergence: agents report what's actually running so the +// control plane can track instance health and detect drift. +// +// For update requests, instances are upserted and any instances no longer reported by the +// agent are deleted (garbage collection). For delete requests, all instances for the +// deployment in that region are removed. The operation runs within a retryable transaction +// to handle transient database errors. +// +// Requires bearer token authentication and the X-Krane-Region header. func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Request[ctrlv1.UpdateDeploymentStateRequest]) (*connect.Response[ctrlv1.UpdateDeploymentStateResponse], error) { s.logger.Info("updating deployment state", "req", req.Msg) //"update:{k8s_name:\"pgeywtmuengq\" instances:{k8s_name:\"pgeywtmuengq-kdfvj\" address:\"192-168-194-33.uzapavou.pod.cluster.local\" status:STATUS_RUNNING}}" @@ -105,6 +115,8 @@ func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Reques } +// ctrlDeploymentStatusToDbStatus maps proto instance status to database enum values. +// Unspecified or unknown statuses are treated as inactive. func ctrlDeploymentStatusToDbStatus(status ctrlv1.UpdateDeploymentStateRequest_Update_Instance_Status) db.InstancesStatus { switch status { case ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_UNSPECIFIED: diff --git a/svc/ctrl/services/cluster/rpc_update_sentinel_state.go b/svc/ctrl/services/cluster/rpc_update_sentinel_state.go index eab0b5f25d..1af1f39b8d 100644 --- a/svc/ctrl/services/cluster/rpc_update_sentinel_state.go +++ b/svc/ctrl/services/cluster/rpc_update_sentinel_state.go @@ -11,6 +11,12 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) +// UpdateSentinelState records the observed replica count for a sentinel as reported by a +// krane agent. This updates the available_replicas and health fields in the database, +// allowing the control plane to track which sentinels are actually running and healthy. +// A sentinel is considered healthy if it has at least one available replica. +// +// Requires bearer token authentication and the X-Krane-Region header. func (s *Service) UpdateSentinelState(ctx context.Context, req *connect.Request[ctrlv1.UpdateSentinelStateRequest]) (*connect.Response[ctrlv1.UpdateSentinelStateResponse], error) { if err := s.authenticate(req); err != nil { diff --git a/svc/ctrl/services/cluster/service.go b/svc/ctrl/services/cluster/service.go index 4756eedf7e..0261b14132 100644 --- a/svc/ctrl/services/cluster/service.go +++ b/svc/ctrl/services/cluster/service.go @@ -6,7 +6,7 @@ import ( "github.com/unkeyed/unkey/pkg/otel/logging" ) -// Service implements the ClusterService gRPC interface for state synchronization. +// Service implements the ClusterService Connect interface for state synchronization. type Service struct { ctrlv1connect.UnimplementedClusterServiceHandler db db.Database diff --git a/svc/ctrl/services/cluster/sync_bootstrap.go b/svc/ctrl/services/cluster/sync_bootstrap.go deleted file mode 100644 index c0632b64df..0000000000 --- a/svc/ctrl/services/cluster/sync_bootstrap.go +++ /dev/null @@ -1,91 +0,0 @@ -package cluster - -import ( - "context" - "fmt" - - "connectrpc.com/connect" - "github.com/unkeyed/unkey/pkg/db" - - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" -) - -// bootstrap streams the full desired state for a region. -// -// This captures the current max sequence as a snapshot boundary, then streams all -// running deployments and sentinels. The sequence returned is NOT a true snapshot -// (state may have changed during streaming), but convergence is guaranteed because: -// 1. All apply/delete operations are idempotent -// 2. Any changes during bootstrap will be picked up on the next sync -// -// Stream closing without error signals bootstrap completion. -func (s *Service) bootstrap(ctx context.Context, region string, stream *connect.ServerStream[ctrlv1.State]) (uint64, error) { - maxSequence, err := db.Query.GetMaxStateChangeSequence(ctx, s.db.RW(), region) - if err != nil { - return 0, fmt.Errorf("get max sequence region=%q: %w", region, err) - } - sequenceBoundary := uint64(maxSequence) - - if err := s.bootstrapDeployments(ctx, region, sequenceBoundary, stream); err != nil { - return 0, err - } - - if err := s.bootstrapSentinels(ctx, region, sequenceBoundary, stream); err != nil { - return 0, err - } - - s.logger.Info("bootstrap complete", "sequenceBoundary", sequenceBoundary) - return sequenceBoundary, nil -} - -func (s *Service) bootstrapDeployments(ctx context.Context, region string, sequence uint64, stream *connect.ServerStream[ctrlv1.State]) error { - cursor := "" - for { - topologies, err := db.Query.ListDesiredDeploymentTopology(ctx, s.db.RW(), db.ListDesiredDeploymentTopologyParams{ - Region: region, - DesiredState: db.DeploymentsDesiredStateRunning, - PaginationCursor: cursor, - Limit: 1000, - }) - if err != nil { - return fmt.Errorf("list deployment topologies cursor=%q: %w", cursor, err) - } - if len(topologies) == 0 { - break - } - cursor = topologies[len(topologies)-1].Deployment.ID - - for _, topology := range topologies { - if err := s.sendDeploymentApplyFromTopology(stream, sequence, topology); err != nil { - return fmt.Errorf("send deployment id=%q: %w", topology.Deployment.ID, err) - } - } - } - return nil -} - -func (s *Service) bootstrapSentinels(ctx context.Context, region string, sequence uint64, stream *connect.ServerStream[ctrlv1.State]) error { - cursor := "" - for { - sentinels, err := db.Query.ListDesiredSentinels(ctx, s.db.RW(), db.ListDesiredSentinelsParams{ - Region: region, - DesiredState: db.SentinelsDesiredStateRunning, - PaginationCursor: cursor, - Limit: 100, - }) - if err != nil { - return fmt.Errorf("list sentinels cursor=%q: %w", cursor, err) - } - if len(sentinels) == 0 { - break - } - cursor = sentinels[len(sentinels)-1].ID - - for _, sentinel := range sentinels { - if err := s.sendSentinelApply(stream, sequence, sentinel); err != nil { - return fmt.Errorf("send sentinel id=%q: %w", sentinel.ID, err) - } - } - } - return nil -} diff --git a/svc/ctrl/services/cluster/sync_changes.go b/svc/ctrl/services/cluster/sync_changes.go deleted file mode 100644 index 3f86b98aa3..0000000000 --- a/svc/ctrl/services/cluster/sync_changes.go +++ /dev/null @@ -1,90 +0,0 @@ -package cluster - -import ( - "context" - "fmt" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/db" -) - -// processStateChange routes a state change to the appropriate handler. -// -// Invariant: if we cannot prove a resource should be running in this region, -// we instruct the edge to delete it. This ensures stale resources are cleaned up. -func (s *Service) processStateChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { - switch change.ResourceType { - case db.StateChangesResourceTypeDeployment: - return s.processDeploymentChange(ctx, region, change, stream) - case db.StateChangesResourceTypeSentinel: - return s.processSentinelChange(ctx, region, change, stream) - default: - return fmt.Errorf("unknown resource type: %q", change.ResourceType) - } -} - -func (s *Service) processDeploymentChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { - deployment, err := db.Query.FindDeploymentById(ctx, s.db.RW(), change.ResourceID) - if err != nil { - if db.IsNotFound(err) { - // Resource already deleted, nothing to sync. - return nil - } - return fmt.Errorf("find deployment id=%q: %w", change.ResourceID, err) - } - - workspace, err := db.Query.FindWorkspaceByID(ctx, s.db.RW(), deployment.WorkspaceID) - if err != nil { - return fmt.Errorf("find workspace id=%q: %w", deployment.WorkspaceID, err) - } - - if change.Op == db.StateChangesOpDelete { - return s.sendDeploymentDelete(stream, change.Sequence, workspace.K8sNamespace.String, deployment.K8sName) - } - - topology, err := db.Query.FindDeploymentTopologyByIDAndRegion(ctx, s.db.RW(), db.FindDeploymentTopologyByIDAndRegionParams{ - DeploymentID: change.ResourceID, - Region: region, - }) - if err != nil { - if db.IsNotFound(err) { - // No topology for this region means delete. - return s.sendDeploymentDelete(stream, change.Sequence, workspace.K8sNamespace.String, deployment.K8sName) - } - return fmt.Errorf("find topology deployment=%q region=%q: %w", change.ResourceID, region, err) - } - - if shouldDeleteDeployment(topology.DesiredState) { - return s.sendDeploymentDelete(stream, change.Sequence, workspace.K8sNamespace.String, deployment.K8sName) - } - - return s.sendDeploymentApply(stream, change.Sequence, newApplyDeploymentFromTopology(topology)) -} - -func shouldDeleteDeployment(desiredState db.DeploymentsDesiredState) bool { - return desiredState != db.DeploymentsDesiredStateRunning -} - -func (s *Service) processSentinelChange(ctx context.Context, region string, change db.ListStateChangesRow, stream *connect.ServerStream[ctrlv1.State]) error { - sentinel, err := db.Query.FindSentinelByID(ctx, s.db.RW(), change.ResourceID) - if err != nil { - if db.IsNotFound(err) { - // Resource already deleted, nothing to sync. - return nil - } - return fmt.Errorf("find sentinel id=%q: %w", change.ResourceID, err) - } - - if shouldDeleteSentinel(change.Op, sentinel.Region, region, sentinel.DesiredState) { - return s.sendSentinelDelete(stream, change.Sequence, sentinel.K8sName) - } - - return s.sendSentinelApply(stream, change.Sequence, sentinel) -} - -func shouldDeleteSentinel(op db.StateChangesOp, sentinelRegion, requestRegion string, desiredState db.SentinelsDesiredState) bool { - return op == db.StateChangesOpDelete || - sentinelRegion != requestRegion || - desiredState != db.SentinelsDesiredStateRunning -} diff --git a/svc/ctrl/services/cluster/sync_messages.go b/svc/ctrl/services/cluster/sync_messages.go deleted file mode 100644 index bd937a26e9..0000000000 --- a/svc/ctrl/services/cluster/sync_messages.go +++ /dev/null @@ -1,118 +0,0 @@ -package cluster - -import ( - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/db" -) - -// Message sending helpers - centralized protobuf construction. - -func (s *Service) sendDeploymentDelete(stream *connect.ServerStream[ctrlv1.State], sequence uint64, namespace, name string) error { - return stream.Send(&ctrlv1.State{ - Sequence: sequence, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: namespace, - K8SName: name, - }, - }, - }, - }, - }) -} - -func (s *Service) sendDeploymentApply(stream *connect.ServerStream[ctrlv1.State], sequence uint64, apply *ctrlv1.ApplyDeployment) error { - return stream.Send(&ctrlv1.State{ - Sequence: sequence, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: apply, - }, - }, - }, - }) -} - -func (s *Service) sendDeploymentApplyFromTopology(stream *connect.ServerStream[ctrlv1.State], sequence uint64, topology db.ListDesiredDeploymentTopologyRow) error { - var buildID *string - if topology.Deployment.BuildID.Valid { - buildID = &topology.Deployment.BuildID.String - } - return s.sendDeploymentApply(stream, sequence, &ctrlv1.ApplyDeployment{ - K8SNamespace: topology.K8sNamespace.String, - K8SName: topology.Deployment.K8sName, - WorkspaceId: topology.Deployment.WorkspaceID, - EnvironmentId: topology.Deployment.EnvironmentID, - ProjectId: topology.Deployment.ProjectID, - DeploymentId: topology.Deployment.ID, - Image: topology.Deployment.Image.String, - Replicas: topology.DeploymentTopology.DesiredReplicas, - CpuMillicores: int64(topology.Deployment.CpuMillicores), - MemoryMib: int64(topology.Deployment.MemoryMib), - EncryptedEnvironmentVariables: topology.Deployment.EncryptedEnvironmentVariables, - BuildId: buildID, - }) -} - -func newApplyDeploymentFromTopology(topology db.FindDeploymentTopologyByIDAndRegionRow) *ctrlv1.ApplyDeployment { - var buildID *string - if topology.BuildID.Valid { - buildID = &topology.BuildID.String - } - return &ctrlv1.ApplyDeployment{ - K8SNamespace: topology.K8sNamespace.String, - K8SName: topology.K8sName, - WorkspaceId: topology.WorkspaceID, - EnvironmentId: topology.EnvironmentID, - ProjectId: topology.ProjectID, - DeploymentId: topology.ID, - Image: topology.Image.String, - Replicas: topology.DesiredReplicas, - CpuMillicores: int64(topology.CpuMillicores), - MemoryMib: int64(topology.MemoryMib), - EncryptedEnvironmentVariables: topology.EncryptedEnvironmentVariables, - BuildId: buildID, - } -} - -func (s *Service) sendSentinelDelete(stream *connect.ServerStream[ctrlv1.State], sequence uint64, name string) error { - return stream.Send(&ctrlv1.State{ - Sequence: sequence, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Delete{ - Delete: &ctrlv1.DeleteSentinel{ - K8SName: name, - }, - }, - }, - }, - }) -} - -func (s *Service) sendSentinelApply(stream *connect.ServerStream[ctrlv1.State], sequence uint64, sentinel db.Sentinel) error { - return stream.Send(&ctrlv1.State{ - Sequence: sequence, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - K8SName: sentinel.K8sName, - WorkspaceId: sentinel.WorkspaceID, - EnvironmentId: sentinel.EnvironmentID, - ProjectId: sentinel.ProjectID, - SentinelId: sentinel.ID, - Image: sentinel.Image, - Replicas: sentinel.DesiredReplicas, - CpuMillicores: int64(sentinel.CpuMillicores), - MemoryMib: int64(sentinel.MemoryMib), - }, - }, - }, - }, - }) -} diff --git a/svc/ctrl/services/ctrl/doc.go b/svc/ctrl/services/ctrl/doc.go index 81286bdfee..d52e00f411 100644 --- a/svc/ctrl/services/ctrl/doc.go +++ b/svc/ctrl/services/ctrl/doc.go @@ -28,11 +28,11 @@ // // ctrlSvc := ctrl.New("ctrl-instance-001", database) // -// // Register with gRPC server +// // Register with Connect server // mux.Handle(ctrlv1connect.NewCtrlServiceHandler(ctrlSvc)) // // # Error Handling // -// Uses standard Connect error codes for proper gRPC +// Uses standard Connect error codes for proper // error transmission and client handling. package ctrl diff --git a/svc/ctrl/services/doc.go b/svc/ctrl/services/doc.go index 0b00e89b28..6e70ba71d6 100644 --- a/svc/ctrl/services/doc.go +++ b/svc/ctrl/services/doc.go @@ -1,6 +1,6 @@ -// Package services provides gRPC service implementations for the control plane. +// Package services provides Connect service implementations for the control plane. // -// This package contains all gRPC service handlers that implement +// This package contains all Connect service handlers that implement // the public API surface of the unkey control plane. Each service // is responsible for a specific domain of functionality including deployment // orchestration, build operations, certificate management, routing, @@ -23,7 +23,7 @@ // # Architecture // // All services follow consistent patterns: -// - Implement gRPC handlers with proper error handling +// - Implement Connect handlers with proper error handling // - Use database transactions for data consistency // - Integrate with Restate workflows for complex operations // - Include comprehensive logging and observability @@ -49,13 +49,13 @@ // Logger: logger, // }) // -// // Register with gRPC server +// // Register with Connect server // mux.Handle(ctrlv1connect.NewDeploymentServiceHandler(deploymentSvc)) // // # Error Handling // // All services use standardized error responses with appropriate -// Connect error codes for gRPC transmission: +// Connect error codes: // - InvalidArgument: Bad request parameters // - Unauthenticated: Missing or invalid authentication // - Internal: Unexpected system failures diff --git a/svc/ctrl/services/openapi/doc.go b/svc/ctrl/services/openapi/doc.go index 0fe8ceb3dd..1e358650ad 100644 --- a/svc/ctrl/services/openapi/doc.go +++ b/svc/ctrl/services/openapi/doc.go @@ -38,7 +38,7 @@ // // openapiSvc := openapi.New(database, logger) // -// // Register with gRPC server +// // Register with Connect server // mux.Handle(ctrlv1connect.NewOpenApiServiceHandler(openapiSvc)) // // # Integration diff --git a/svc/ctrl/workflows/deploy/deploy_handler.go b/svc/ctrl/workflows/deploy/deploy_handler.go index 6d079745c6..b45023d3ed 100644 --- a/svc/ctrl/workflows/deploy/deploy_handler.go +++ b/svc/ctrl/workflows/deploy/deploy_handler.go @@ -16,6 +16,33 @@ import ( "google.golang.org/protobuf/proto" ) +// Deploy executes a full deployment workflow for a new application version. +// +// This durable workflow orchestrates the complete deployment lifecycle: building +// Docker images (if source is provided), provisioning containers across regions, +// waiting for instances to become healthy, and configuring domain routing. The +// workflow is idempotent and can safely resume from any step after a crash. +// +// The deployment request must specify either a build context path (to build from +// source) or a pre-built Docker image. If BuildContextPath is set, the workflow +// triggers a Docker build through the build service before deployment. Otherwise, +// the provided DockerImage is deployed directly. +// +// The workflow creates deployment topologies for all configured regions, each with +// its own version number for independent scaling and rollback. Sentinel containers +// are automatically provisioned for environments that don't already have them, +// with production environments getting 3 replicas and others getting 1. +// +// Domain routing is configured through frontline routes, with sticky domains +// (branch and environment) automatically updating to point to the new deployment. +// For production deployments, the project's live deployment pointer is updated +// unless the project is in a rolled-back state. +// +// If any step fails, the deployment status is automatically set to failed via a +// deferred cleanup handler, ensuring the database reflects the true deployment state. +// +// Returns terminal errors for validation failures (missing image/context) and +// retryable errors for transient system failures. func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.DeployRequest) (*hydrav1.DeployResponse, error) { finishedSuccessfully := false @@ -151,36 +178,28 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy } topologies := make([]db.InsertDeploymentTopologyParams, len(w.availableRegions)) + versioningClient := hydrav1.NewVersioningServiceClient(ctx, "") + for i, region := range w.availableRegions { + versionResp, versionErr := versioningClient.NextVersion().Request(&hydrav1.NextVersionRequest{}) + if versionErr != nil { + return nil, fmt.Errorf("failed to get next version: %w", versionErr) + } + topologies[i] = db.InsertDeploymentTopologyParams{ WorkspaceID: workspace.ID, DeploymentID: deployment.ID, Region: region, DesiredReplicas: 1, DesiredStatus: db.DeploymentTopologyDesiredStatusStarting, + Version: versionResp.GetVersion(), CreatedAt: time.Now().UnixMilli(), } } err = restate.RunVoid(ctx, func(runCtx restate.RunContext) error { return db.Tx(runCtx, w.db.RW(), func(txCtx context.Context, tx db.DBTX) error { - if err := db.BulkQuery.InsertDeploymentTopologies(txCtx, tx, topologies); err != nil { - return err - } - stateChanges := make([]db.InsertStateChangeParams, len(topologies)) - for i, t := range topologies { - stateChanges[i] = db.InsertStateChangeParams{ - ResourceType: db.StateChangesResourceTypeDeployment, - ResourceID: deployment.ID, - Op: db.StateChangesOpUpsert, - Region: t.Region, - CreatedAt: uint64(time.Now().UnixMilli()), - } - } - if err := db.BulkQuery.InsertStateChanges(txCtx, tx, stateChanges); err != nil { - return err - } - return nil + return db.BulkQuery.InsertDeploymentTopologies(txCtx, tx, topologies) }) }, restate.WithName("insert deployment topologies")) @@ -204,6 +223,11 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy desiredReplicas = 3 } + sentinelVersionResp, sentinelVersionErr := versioningClient.NextVersion().Request(&hydrav1.NextVersionRequest{}) + if sentinelVersionErr != nil { + return nil, fmt.Errorf("failed to get next version for sentinel: %w", sentinelVersionErr) + } + err = restate.RunVoid(ctx, func(runCtx restate.RunContext) error { sentinelID := uid.New(uid.SentinelPrefix) sentinelK8sName := uid.DNS1035() @@ -224,6 +248,7 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy AvailableReplicas: 0, CpuMillicores: 256, MemoryMib: 256, + Version: sentinelVersionResp.GetVersion(), CreatedAt: time.Now().UnixMilli(), }) if err != nil { @@ -232,15 +257,6 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy } return err } - if _, err := db.Query.InsertStateChange(txCtx, tx, db.InsertStateChangeParams{ - ResourceType: db.StateChangesResourceTypeSentinel, - ResourceID: sentinelID, - Op: db.StateChangesOpUpsert, - Region: topology.Region, - CreatedAt: uint64(time.Now().UnixMilli()), - }); err != nil { - return err - } return nil }) }, restate.WithName("ensure sentinel exists in db")) diff --git a/svc/ctrl/workflows/versioning/BUILD.bazel b/svc/ctrl/workflows/versioning/BUILD.bazel new file mode 100644 index 0000000000..d5c14e82dc --- /dev/null +++ b/svc/ctrl/workflows/versioning/BUILD.bazel @@ -0,0 +1,16 @@ +load("@rules_go//go:def.bzl", "go_library") + +go_library( + name = "versioning", + srcs = [ + "doc.go", + "next_version_handler.go", + "service.go", + ], + importpath = "github.com/unkeyed/unkey/svc/ctrl/workflows/versioning", + visibility = ["//visibility:public"], + deps = [ + "//gen/proto/hydra/v1:hydra", + "@com_github_restatedev_sdk_go//:sdk-go", + ], +) diff --git a/svc/ctrl/workflows/versioning/doc.go b/svc/ctrl/workflows/versioning/doc.go new file mode 100644 index 0000000000..2033e6e3dc --- /dev/null +++ b/svc/ctrl/workflows/versioning/doc.go @@ -0,0 +1,25 @@ +// Package versioning provides a global version counter for state synchronization. +// +// The VersioningService is a Restate virtual object that generates monotonically +// increasing version numbers. These versions are used to track state changes in +// deployments and sentinels tables, enabling efficient incremental synchronization +// between the control plane and edge agents (krane). +// +// # Usage +// +// Before mutating a deployment or sentinel: +// +// client := hydrav1.NewVersioningServiceClient(ctx, "") +// resp, err := client.NextVersion(ctx, &hydrav1.NextVersionRequest{}) +// // Use resp.Version when updating the resource row +// +// Edge agents track their last-seen version and request changes after it: +// +// SELECT * FROM deployments WHERE region = ? AND version > ? ORDER BY version +// +// # Singleton Pattern +// +// This service uses an empty string as the virtual object key, making it a +// singleton. All version requests are serialized through a single instance, +// guaranteeing global ordering. +package versioning diff --git a/svc/ctrl/workflows/versioning/next_version_handler.go b/svc/ctrl/workflows/versioning/next_version_handler.go new file mode 100644 index 0000000000..1c9d3a709c --- /dev/null +++ b/svc/ctrl/workflows/versioning/next_version_handler.go @@ -0,0 +1,40 @@ +package versioning + +import ( + restate "github.com/restatedev/sdk-go" + hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" +) + +const versionStateKey = "version" + +// NextVersion atomically increments and returns the next version number. +// +// The version is durably stored in Restate's virtual object state, guaranteeing monotonically increasing values +func (s *Service) NextVersion(ctx restate.ObjectContext, _ *hydrav1.NextVersionRequest) (*hydrav1.NextVersionResponse, error) { + current, err := restate.Get[uint64](ctx, versionStateKey) + if err != nil { + return nil, err + } + + next := current + 1 + restate.Set(ctx, versionStateKey, next) + + return &hydrav1.NextVersionResponse{ + Version: next, + }, nil +} + +// GetVersion returns the current version without incrementing. +// +// Useful for stale cursor detection: if a client's version is older than the +// minimum retained version in the database, they must perform a full bootstrap. +func (s *Service) GetVersion(ctx restate.ObjectContext, _ *hydrav1.GetVersionRequest) (*hydrav1.GetVersionResponse, error) { + current, err := restate.Get[uint64](ctx, versionStateKey) + if err != nil { + return nil, err + } + + return &hydrav1.GetVersionResponse{ + Version: current, + }, nil +} diff --git a/svc/ctrl/workflows/versioning/service.go b/svc/ctrl/workflows/versioning/service.go new file mode 100644 index 0000000000..817374e1de --- /dev/null +++ b/svc/ctrl/workflows/versioning/service.go @@ -0,0 +1,22 @@ +package versioning + +import ( + hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" +) + +// Service provides globally unique, monotonically increasing versions for state sync. +// +// This is a Restate virtual object that maintains a durable counter. Each call to +// NextVersion atomically increments and returns the next version number, with +// exactly-once semantics guaranteed by Restate. +type Service struct { + hydrav1.UnimplementedVersioningServiceServer +} + +var _ hydrav1.VersioningServiceServer = (*Service)(nil) + +func New() *Service { + return &Service{ + UnimplementedVersioningServiceServer: hydrav1.UnimplementedVersioningServiceServer{}, + } +} diff --git a/svc/krane/internal/reconciler/BUILD.bazel b/svc/krane/internal/reconciler/BUILD.bazel index 00b24d8f42..22e6fdc6f9 100644 --- a/svc/krane/internal/reconciler/BUILD.bazel +++ b/svc/krane/internal/reconciler/BUILD.bazel @@ -56,9 +56,9 @@ go_test( "reconciler_test.go", "refresh_current_deployments_test.go", "refresh_current_sentinels_test.go", - "sequence_tracking_test.go", "test_helpers_test.go", "update_state_test.go", + "version_tracking_test.go", "watch_current_deployments_test.go", "watch_current_sentinels_test.go", "watcher_test.go", diff --git a/svc/krane/internal/reconciler/handle_state_test.go b/svc/krane/internal/reconciler/handle_state_test.go index 4be70cba2d..8b23bad9f2 100644 --- a/svc/krane/internal/reconciler/handle_state_test.go +++ b/svc/krane/internal/reconciler/handle_state_test.go @@ -39,7 +39,7 @@ func TestHandleState_DeploymentApply(t *testing.T) { }, } - err := r.HandleState(ctx, state) + _, err := r.HandleState(ctx, state) require.NoError(t, err) require.NotNil(t, h.ReplicaSets.Applied, "should route to ApplyDeployment") } @@ -62,7 +62,7 @@ func TestHandleState_DeploymentDelete(t *testing.T) { }, } - err := r.HandleState(ctx, state) + _, err := r.HandleState(ctx, state) require.NoError(t, err) require.Contains(t, h.Deletes.Actions, "replicasets", "should route to DeleteDeployment") } @@ -92,7 +92,7 @@ func TestHandleState_SentinelApply(t *testing.T) { }, } - err := r.HandleState(ctx, state) + _, err := r.HandleState(ctx, state) require.NoError(t, err) require.NotNil(t, h.Deployments.Applied, "should route to ApplySentinel (Deployment)") require.NotNil(t, h.Services.Applied, "should route to ApplySentinel (Service)") @@ -115,7 +115,7 @@ func TestHandleState_SentinelDelete(t *testing.T) { }, } - err := r.HandleState(ctx, state) + _, err := r.HandleState(ctx, state) require.NoError(t, err) require.Contains(t, h.Deletes.Actions, "services", "should route to DeleteSentinel (Service)") require.Contains(t, h.Deletes.Actions, "deployments", "should route to DeleteSentinel (Deployment)") @@ -130,7 +130,7 @@ func TestHandleState_UnknownStateType(t *testing.T) { Kind: nil, } - err := r.HandleState(ctx, state) + _, err := r.HandleState(ctx, state) require.Error(t, err) require.Contains(t, err.Error(), "unknown state type") } @@ -140,7 +140,7 @@ func TestHandleState_NilState(t *testing.T) { h := NewTestHarness(t) r := h.Reconciler - err := r.HandleState(ctx, nil) + _, err := r.HandleState(ctx, nil) require.Error(t, err) require.Contains(t, err.Error(), "state is nil") } diff --git a/svc/krane/internal/reconciler/reconciler.go b/svc/krane/internal/reconciler/reconciler.go index 0f315112f9..1287a490c4 100644 --- a/svc/krane/internal/reconciler/reconciler.go +++ b/svc/krane/internal/reconciler/reconciler.go @@ -22,13 +22,13 @@ import ( // background goroutines for watching and refreshing, so callers must call [Start] // before processing state and [Stop] during shutdown. type Reconciler struct { - clientSet kubernetes.Interface - logger logging.Logger - cluster ctrlv1connect.ClusterServiceClient - cb circuitbreaker.CircuitBreaker[any] - done chan struct{} - region string - sequenceLastSeen uint64 + clientSet kubernetes.Interface + logger logging.Logger + cluster ctrlv1connect.ClusterServiceClient + cb circuitbreaker.CircuitBreaker[any] + done chan struct{} + region string + versionLastSeen uint64 } // Config holds the configuration required to create a new [Reconciler]. @@ -43,13 +43,13 @@ type Config struct { // New creates a [Reconciler] ready to be started with [Reconciler.Start]. func New(cfg Config) *Reconciler { return &Reconciler{ - clientSet: cfg.ClientSet, - logger: cfg.Logger, - cluster: cfg.Cluster, - cb: circuitbreaker.New[any]("reconciler_state_update"), - done: make(chan struct{}), - region: cfg.Region, - sequenceLastSeen: 0, + clientSet: cfg.ClientSet, + logger: cfg.Logger, + cluster: cfg.Cluster, + cb: circuitbreaker.New[any]("reconciler_state_update"), + done: make(chan struct{}), + region: cfg.Region, + versionLastSeen: 0, } } @@ -84,45 +84,46 @@ func (r *Reconciler) Start(ctx context.Context) error { // HandleState returns immediately after processing the single state update. It does // not block waiting for additional updates; use it within a loop that reads from // the control plane's state stream. -func (r *Reconciler) HandleState(ctx context.Context, state *ctrlv1.State) error { +// +// The version from the state is returned so the caller can track progress. The caller +// is responsible for committing the version only after the stream closes cleanly. +// This ensures atomic bootstrap: if a stream breaks mid-bootstrap, the client retries +// from version 0 rather than skipping resources that were never received. +func (r *Reconciler) HandleState(ctx context.Context, state *ctrlv1.State) (uint64, error) { if state == nil { - return fmt.Errorf("state is nil") + return 0, fmt.Errorf("state is nil") } - sequence := state.GetSequence() + version := state.GetVersion() switch kind := state.GetKind().(type) { case *ctrlv1.State_Deployment: switch op := kind.Deployment.GetState().(type) { case *ctrlv1.DeploymentState_Apply: if err := r.ApplyDeployment(ctx, op.Apply); err != nil { - return err + return 0, err } case *ctrlv1.DeploymentState_Delete: if err := r.DeleteDeployment(ctx, op.Delete); err != nil { - return err + return 0, err } } case *ctrlv1.State_Sentinel: switch op := kind.Sentinel.GetState().(type) { case *ctrlv1.SentinelState_Apply: if err := r.ApplySentinel(ctx, op.Apply); err != nil { - return err + return 0, err } case *ctrlv1.SentinelState_Delete: if err := r.DeleteSentinel(ctx, op.Delete); err != nil { - return err + return 0, err } } default: - return fmt.Errorf("unknown state type: %T", kind) - } - - if sequence > r.sequenceLastSeen { - r.sequenceLastSeen = sequence + return 0, fmt.Errorf("unknown state type: %T", kind) } - return nil + return version, nil } // Stop signals all background goroutines to terminate. Safe to call multiple diff --git a/svc/krane/internal/reconciler/sequence_tracking_test.go b/svc/krane/internal/reconciler/sequence_tracking_test.go deleted file mode 100644 index 11ffd1080e..0000000000 --- a/svc/krane/internal/reconciler/sequence_tracking_test.go +++ /dev/null @@ -1,319 +0,0 @@ -package reconciler - -import ( - "context" - "testing" - - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/ptr" -) - -// Tests for sequence tracking behavior in HandleState. -// The reconciler tracks sequenceLastSeen to resume from the correct position on reconnect. - -func TestHandleState_UpdatesSequenceAfterDeploymentApply(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - require.Equal(t, uint64(0), r.sequenceLastSeen, "initial sequence should be 0") - - state := &ctrlv1.State{ - Sequence: 42, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - DeploymentId: "dep_123", - K8SNamespace: "test-namespace", - K8SName: "test-deployment", - Image: "nginx:1.19", - Replicas: 1, - CpuMillicores: 100, - MemoryMib: 128, - BuildId: ptr.P("build_123"), - }, - }, - }, - }, - } - - err := r.HandleState(ctx, state) - require.NoError(t, err) - require.Equal(t, uint64(42), r.sequenceLastSeen, "sequence should be updated after apply") -} - -func TestHandleState_UpdatesSequenceAfterDeploymentDelete(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - state := &ctrlv1.State{ - Sequence: 100, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "test-deployment", - }, - }, - }, - }, - } - - err := r.HandleState(ctx, state) - require.NoError(t, err) - require.Equal(t, uint64(100), r.sequenceLastSeen) -} - -func TestHandleState_UpdatesSequenceAfterSentinelApply(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - state := &ctrlv1.State{ - Sequence: 200, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - SentinelId: "sentinel_123", - K8SName: "test-sentinel", - Image: "sentinel:1.0", - Replicas: 2, - CpuMillicores: 100, - MemoryMib: 128, - }, - }, - }, - }, - } - - err := r.HandleState(ctx, state) - require.NoError(t, err) - require.Equal(t, uint64(200), r.sequenceLastSeen) -} - -func TestHandleState_UpdatesSequenceAfterSentinelDelete(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - state := &ctrlv1.State{ - Sequence: 300, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Delete{ - Delete: &ctrlv1.DeleteSentinel{ - K8SName: "test-sentinel", - }, - }, - }, - }, - } - - err := r.HandleState(ctx, state) - require.NoError(t, err) - require.Equal(t, uint64(300), r.sequenceLastSeen) -} - -func TestHandleState_SequenceOnlyIncreases(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - // First event with sequence 100 - state1 := &ctrlv1.State{ - Sequence: 100, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "test-deployment-1", - }, - }, - }, - }, - } - - err := r.HandleState(ctx, state1) - require.NoError(t, err) - require.Equal(t, uint64(100), r.sequenceLastSeen) - - // Second event with lower sequence (should not decrease) - state2 := &ctrlv1.State{ - Sequence: 50, // Lower sequence - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "test-deployment-2", - }, - }, - }, - }, - } - - err = r.HandleState(ctx, state2) - require.NoError(t, err) - require.Equal(t, uint64(100), r.sequenceLastSeen, "sequence should not decrease") - - // Third event with higher sequence (should increase) - state3 := &ctrlv1.State{ - Sequence: 150, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "test-deployment-3", - }, - }, - }, - }, - } - - err = r.HandleState(ctx, state3) - require.NoError(t, err) - require.Equal(t, uint64(150), r.sequenceLastSeen, "sequence should increase") -} - -func TestHandleState_SequenceZeroDoesNotReset(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - // Set initial sequence - state1 := &ctrlv1.State{ - Sequence: 100, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "test-deployment", - }, - }, - }, - }, - } - - err := r.HandleState(ctx, state1) - require.NoError(t, err) - require.Equal(t, uint64(100), r.sequenceLastSeen) - - // Event with sequence 0 should not reset - state2 := &ctrlv1.State{ - Sequence: 0, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "another-deployment", - }, - }, - }, - }, - } - - err = r.HandleState(ctx, state2) - require.NoError(t, err) - require.Equal(t, uint64(100), r.sequenceLastSeen, "sequence should not reset to 0") -} - -func TestHandleState_BootstrapSequence(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - // Simulate bootstrap: all state items have the same sequence from GetMaxStateChangeSequence - bootstrapSequence := uint64(500) - - states := []*ctrlv1.State{ - { - Sequence: bootstrapSequence, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_1", - ProjectId: "prj_1", - EnvironmentId: "env_1", - DeploymentId: "dep_1", - K8SNamespace: "test-namespace", - K8SName: "deployment-1", - Image: "nginx:1.19", - Replicas: 1, - CpuMillicores: 100, - MemoryMib: 128, - }, - }, - }, - }, - }, - { - Sequence: bootstrapSequence, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_2", - ProjectId: "prj_2", - EnvironmentId: "env_2", - DeploymentId: "dep_2", - K8SNamespace: "test-namespace", - K8SName: "deployment-2", - Image: "nginx:1.20", - Replicas: 2, - CpuMillicores: 200, - MemoryMib: 256, - }, - }, - }, - }, - }, - { - Sequence: bootstrapSequence, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - WorkspaceId: "ws_3", - ProjectId: "prj_3", - EnvironmentId: "env_3", - SentinelId: "sentinel_1", - K8SName: "sentinel-1", - Image: "sentinel:1.0", - Replicas: 1, - CpuMillicores: 50, - MemoryMib: 64, - }, - }, - }, - }, - }, - } - - for _, state := range states { - err := r.HandleState(ctx, state) - require.NoError(t, err) - } - - require.Equal(t, bootstrapSequence, r.sequenceLastSeen, "sequence should be set to bootstrap watermark") -} - -func TestReconciler_InitialSequenceIsZero(t *testing.T) { - h := NewTestHarness(t) - require.Equal(t, uint64(0), h.Reconciler.sequenceLastSeen, "new reconciler should start with sequence 0") -} diff --git a/svc/krane/internal/reconciler/version_tracking_test.go b/svc/krane/internal/reconciler/version_tracking_test.go new file mode 100644 index 0000000000..88e23c2e2f --- /dev/null +++ b/svc/krane/internal/reconciler/version_tracking_test.go @@ -0,0 +1,116 @@ +package reconciler + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/ptr" +) + +// Tests for version tracking behavior. +// +// The reconciler uses a two-phase commit for version tracking: +// 1. HandleState processes state and returns the version (but doesn't commit it) +// 2. versionLastSeen is updated only after clean stream close +// +// This ensures atomic bootstrap: if a stream breaks mid-bootstrap, the client +// retries from version 0 rather than skipping resources that were never received. + +func TestHandleState_ReturnsVersion(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + require.Equal(t, uint64(0), r.versionLastSeen, "initial version should be 0") + + state := &ctrlv1.State{ + Version: 42, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + WorkspaceId: "ws_123", + ProjectId: "prj_123", + EnvironmentId: "env_123", + DeploymentId: "dep_123", + K8SNamespace: "test-namespace", + K8SName: "test-deployment", + Image: "nginx:1.19", + Replicas: 1, + CpuMillicores: 100, + MemoryMib: 128, + BuildId: ptr.P("build_123"), + }, + }, + }, + }, + } + + ver, err := r.HandleState(ctx, state) + require.NoError(t, err) + require.Equal(t, uint64(42), ver, "should return state version") + require.Equal(t, uint64(0), r.versionLastSeen, "versionLastSeen should not change until stream closes cleanly") +} + +func TestHandleState_DoesNotCommitVersion(t *testing.T) { + ctx := context.Background() + h := NewTestHarness(t) + r := h.Reconciler + + states := []*ctrlv1.State{ + { + Version: 100, + Kind: &ctrlv1.State_Deployment{ + Deployment: &ctrlv1.DeploymentState{ + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: "test-namespace", + K8SName: "test-deployment", + }, + }, + }, + }, + }, + { + Version: 200, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + WorkspaceId: "ws_123", + ProjectId: "prj_123", + EnvironmentId: "env_123", + SentinelId: "sentinel_123", + K8SName: "test-sentinel", + Image: "sentinel:1.0", + Replicas: 2, + CpuMillicores: 100, + MemoryMib: 128, + }, + }, + }, + }, + }, + { + Version: 300, + Kind: &ctrlv1.State_Sentinel{ + Sentinel: &ctrlv1.SentinelState{ + State: &ctrlv1.SentinelState_Delete{ + Delete: &ctrlv1.DeleteSentinel{ + K8SName: "test-sentinel", + }, + }, + }, + }, + }, + } + + for _, state := range states { + _, err := r.HandleState(ctx, state) + require.NoError(t, err) + } + + require.Equal(t, uint64(0), r.versionLastSeen, "versionLastSeen should remain 0 until stream closes cleanly") +} diff --git a/svc/krane/internal/reconciler/watcher.go b/svc/krane/internal/reconciler/watcher.go index d747e57b63..f6ba10d558 100644 --- a/svc/krane/internal/reconciler/watcher.go +++ b/svc/krane/internal/reconciler/watcher.go @@ -10,12 +10,10 @@ import ( ) func (r *Reconciler) Watch(ctx context.Context) { - intervalMin := time.Second intervalMax := 5 * time.Second for { - interval := intervalMin + time.Millisecond*time.Duration(rand.Float64()*float64(intervalMax.Milliseconds()-intervalMin.Milliseconds())) time.Sleep(interval) @@ -23,18 +21,15 @@ func (r *Reconciler) Watch(ctx context.Context) { if err != nil { r.logger.Error("error while watching for state changes", "error", err) } - } - } func (r *Reconciler) watch(ctx context.Context) error { - r.logger.Info("starting watch") stream, err := r.cluster.Sync(ctx, connect.NewRequest(&ctrlv1.SyncRequest{ - Region: r.region, - SequenceLastSeen: r.sequenceLastSeen, + Region: r.region, + VersionLastSeen: r.versionLastSeen, })) if err != nil { return err @@ -42,14 +37,19 @@ func (r *Reconciler) watch(ctx context.Context) error { for stream.Receive() { r.logger.Info("received message") - if err := r.HandleState(ctx, stream.Msg()); err != nil { - r.logger.Error("error handling state", "error", err) + version, err := r.HandleState(ctx, stream.Msg()) + if err != nil { + return err + } + if version > r.versionLastSeen { + r.versionLastSeen = version } } - err = stream.Close() - if err != nil { + + if err := stream.Close(); err != nil { r.logger.Error("unable to close stream", "error", err) + return err } - return nil + return nil } diff --git a/svc/krane/internal/reconciler/watcher_test.go b/svc/krane/internal/reconciler/watcher_test.go index e65530b76a..d00e26fbe8 100644 --- a/svc/krane/internal/reconciler/watcher_test.go +++ b/svc/krane/internal/reconciler/watcher_test.go @@ -4,7 +4,7 @@ // These tests verify the watch/sync behavior of the reconciler, specifically: // - How it forms Sync requests to the control plane // - How it processes State messages via HandleState -// - How it tracks sequence numbers for reconnection +// - How it tracks version numbers for reconnection // // # Test Approach // @@ -16,7 +16,8 @@ // // # Key Invariants // -// - sequenceLastSeen is updated to the highest sequence seen +// - versionLastSeen is only updated after clean stream close (atomic bootstrap) +// - HandleState returns the version but does not commit it // - Apply messages create/update Kubernetes resources // - Delete messages remove Kubernetes resources package reconciler @@ -102,16 +103,16 @@ func (m *mockServerStream) ResponseTrailer() http.Header { // ============================================================================= // TestWatch_SendsCorrectSyncRequest verifies that watch() sends a Sync request -// with the correct region and sequence number. +// with the correct region and version number. // -// Scenario: Reconciler has previously processed messages up to sequence 500. -// It calls watch() which should send a Sync request with that sequence. +// Scenario: Reconciler has previously processed messages up to version 500. +// It calls watch() which should send a Sync request with that version. // // Guarantees: // - SyncRequest.Region matches the reconciler's configured region -// - SyncRequest.SequenceLastSeen matches sequenceLastSeen from previous session +// - SyncRequest.VersionLastSeen matches versionLastSeen from previous session // -// This is critical for reconnection: the sequence tells the server where to +// This is critical for reconnection: the version tells the server where to // resume streaming from. func TestWatch_SendsCorrectSyncRequest(t *testing.T) { client := fake.NewSimpleClientset() @@ -136,28 +137,28 @@ func TestWatch_SendsCorrectSyncRequest(t *testing.T) { Region: "us-west-2", }) - r.sequenceLastSeen = 500 + r.versionLastSeen = 500 ctx := context.Background() _ = r.watch(ctx) require.NotNil(t, capturedRequest) require.Equal(t, "us-west-2", capturedRequest.GetRegion()) - require.Equal(t, uint64(500), capturedRequest.GetSequenceLastSeen()) + require.Equal(t, uint64(500), capturedRequest.GetVersionLastSeen()) } -// TestWatch_InitialSyncWithZeroSequence verifies that a fresh reconciler sends -// sequence=0 to trigger a full bootstrap from the server. +// TestWatch_InitialSyncWithZeroVersion verifies that a fresh reconciler sends +// version=0 to trigger a full bootstrap from the server. // // Scenario: A newly created reconciler (never received any messages) calls watch(). // // Guarantees: -// - SyncRequest.SequenceLastSeen is 0 +// - SyncRequest.VersionLastSeen is 0 // - This triggers the server to perform full bootstrap // -// sequence=0 is the "I have nothing" signal that tells the server to send +// version=0 is the "I have nothing" signal that tells the server to send // all current state before entering the watch loop. -func TestWatch_InitialSyncWithZeroSequence(t *testing.T) { +func TestWatch_InitialSyncWithZeroVersion(t *testing.T) { client := fake.NewSimpleClientset() AddReplicaSetPatchReactor(client) AddDeploymentPatchReactor(client) @@ -185,7 +186,7 @@ func TestWatch_InitialSyncWithZeroSequence(t *testing.T) { require.NotNil(t, capturedRequest) require.Equal(t, "eu-central-1", capturedRequest.GetRegion()) - require.Equal(t, uint64(0), capturedRequest.GetSequenceLastSeen(), "initial sync should have sequence 0") + require.Equal(t, uint64(0), capturedRequest.GetVersionLastSeen(), "initial sync should have version 0") } // ============================================================================= @@ -193,19 +194,20 @@ func TestWatch_InitialSyncWithZeroSequence(t *testing.T) { // ============================================================================= // // These tests verify that HandleState correctly processes different message -// types and updates both Kubernetes resources and the sequence tracker. +// types and updates both Kubernetes resources and the version tracker. // ============================================================================= // TestWatch_ProcessesStreamMessages verifies that HandleState correctly -// processes deployment apply messages. +// processes deployment apply messages and returns versions. // -// Scenario: A stream contains two deployment apply messages (seq=10, seq=20). +// Scenario: A stream contains two deployment apply messages (ver=10, ver=20). // // Guarantees: // - The deployment is applied to Kubernetes (ReplicaSet is created) -// - sequenceLastSeen is updated to the highest sequence (20) +// - HandleState returns the version from each message +// - versionLastSeen is only updated after stream closes cleanly // -// This tests the basic happy path: apply resources and track sequence. +// This tests the basic happy path: apply resources and track version. func TestWatch_ProcessesStreamMessages(t *testing.T) { client := fake.NewSimpleClientset() rsCapture := AddReplicaSetPatchReactor(client) @@ -215,7 +217,7 @@ func TestWatch_ProcessesStreamMessages(t *testing.T) { messages := []*ctrlv1.State{ { - Sequence: 10, + Version: 10, Kind: &ctrlv1.State_Deployment{ Deployment: &ctrlv1.DeploymentState{ State: &ctrlv1.DeploymentState_Apply{ @@ -237,7 +239,7 @@ func TestWatch_ProcessesStreamMessages(t *testing.T) { }, }, { - Sequence: 20, + Version: 20, Kind: &ctrlv1.State_Deployment{ Deployment: &ctrlv1.DeploymentState{ State: &ctrlv1.DeploymentState_Apply{ @@ -278,13 +280,23 @@ func TestWatch_ProcessesStreamMessages(t *testing.T) { // Process messages directly to test HandleState integration ctx := context.Background() + var maxVersion uint64 for stream.Receive() { - err := r.HandleState(ctx, stream.Msg()) + seq, err := r.HandleState(ctx, stream.Msg()) require.NoError(t, err) + if seq > maxVersion { + maxVersion = seq + } } require.NotNil(t, rsCapture.Applied, "deployment should have been applied") - require.Equal(t, uint64(20), r.sequenceLastSeen, "sequence should be updated to highest value") + require.Equal(t, uint64(0), r.versionLastSeen, "sequence should not be updated until CommitSequence") + + // Simulate clean stream close + if maxVersion > r.versionLastSeen { + r.versionLastSeen = maxVersion + } + require.Equal(t, uint64(20), r.versionLastSeen, "sequence should be updated after CommitSequence") } // TestWatch_IncrementalUpdates verifies that HandleState correctly processes @@ -295,7 +307,8 @@ func TestWatch_ProcessesStreamMessages(t *testing.T) { // delete sentinel (103). // // Guarantees: -// - sequenceLastSeen is updated to 103 (the highest sequence) +// - HandleState returns the sequence from each message +// - versionLastSeen is updated to 103 after CommitSequence // - Deployment delete triggers ReplicaSet deletion // - Sentinel delete triggers Deployment deletion (sentinels run as k8s Deployments) // @@ -310,7 +323,7 @@ func TestWatch_IncrementalUpdates(t *testing.T) { messages := []*ctrlv1.State{ { - Sequence: 101, + Version: 101, Kind: &ctrlv1.State_Deployment{ Deployment: &ctrlv1.DeploymentState{ State: &ctrlv1.DeploymentState_Apply{ @@ -331,7 +344,7 @@ func TestWatch_IncrementalUpdates(t *testing.T) { }, }, { - Sequence: 102, + Version: 102, Kind: &ctrlv1.State_Deployment{ Deployment: &ctrlv1.DeploymentState{ State: &ctrlv1.DeploymentState_Delete{ @@ -344,7 +357,7 @@ func TestWatch_IncrementalUpdates(t *testing.T) { }, }, { - Sequence: 103, + Version: 103, Kind: &ctrlv1.State_Sentinel{ Sentinel: &ctrlv1.SentinelState{ State: &ctrlv1.SentinelState_Delete{ @@ -367,16 +380,24 @@ func TestWatch_IncrementalUpdates(t *testing.T) { }) // Start with sequence 100 (simulating reconnect after bootstrap) - r.sequenceLastSeen = 100 + r.versionLastSeen = 100 ctx := context.Background() stream := newMockServerStream(messages) + var maxVersion uint64 for stream.Receive() { - err := r.HandleState(ctx, stream.Msg()) + seq, err := r.HandleState(ctx, stream.Msg()) require.NoError(t, err) + if seq > maxVersion { + maxVersion = seq + } } - require.Equal(t, uint64(103), r.sequenceLastSeen) + require.Equal(t, uint64(100), r.versionLastSeen, "sequence should not change until CommitSequence") + if maxVersion > r.versionLastSeen { + r.versionLastSeen = maxVersion + } + require.Equal(t, uint64(103), r.versionLastSeen) require.Contains(t, deletes.Actions, "replicasets", "deployment delete should be processed (deletes ReplicaSet)") require.Contains(t, deletes.Actions, "deployments", "sentinel delete should be processed (deletes Deployment)") } @@ -463,7 +484,7 @@ func TestWatch_SyncConnectionError(t *testing.T) { // - Deployment is applied to Kubernetes (ReplicaSet created with correct name) // - Sentinel is applied (as a k8s Deployment - captured separately) // - Deployment delete is processed (ReplicaSet deleted) -// - sequenceLastSeen ends at 30 (the highest sequence) +// - versionLastSeen ends at 30 (the highest sequence) // // This is a comprehensive integration test of HandleState covering all major // message types in a realistic sequence. @@ -485,7 +506,7 @@ func TestWatch_FullMessageProcessingFlow(t *testing.T) { messages := []*ctrlv1.State{ { - Sequence: 10, + Version: 10, Kind: &ctrlv1.State_Deployment{ Deployment: &ctrlv1.DeploymentState{ State: &ctrlv1.DeploymentState_Apply{ @@ -506,7 +527,7 @@ func TestWatch_FullMessageProcessingFlow(t *testing.T) { }, }, { - Sequence: 20, + Version: 20, Kind: &ctrlv1.State_Sentinel{ Sentinel: &ctrlv1.SentinelState{ State: &ctrlv1.SentinelState_Apply{ @@ -526,7 +547,7 @@ func TestWatch_FullMessageProcessingFlow(t *testing.T) { }, }, { - Sequence: 30, + Version: 30, Kind: &ctrlv1.State_Deployment{ Deployment: &ctrlv1.DeploymentState{ State: &ctrlv1.DeploymentState_Delete{ @@ -540,9 +561,13 @@ func TestWatch_FullMessageProcessingFlow(t *testing.T) { }, } + var maxVersion uint64 for _, msg := range messages { - err := r.HandleState(ctx, msg) + seq, err := r.HandleState(ctx, msg) require.NoError(t, err) + if seq > maxVersion { + maxVersion = seq + } } require.NotNil(t, rsCapture.Applied, "deployment should have been applied") @@ -550,5 +575,10 @@ func TestWatch_FullMessageProcessingFlow(t *testing.T) { require.Contains(t, deletes.Actions, "replicasets", "deployment delete should have been processed") - require.Equal(t, uint64(30), r.sequenceLastSeen, "sequence should be updated to highest value") + require.Equal(t, uint64(0), r.versionLastSeen, "sequence should not be updated until CommitSequence") + + if maxVersion > r.versionLastSeen { + r.versionLastSeen = maxVersion + } + require.Equal(t, uint64(30), r.versionLastSeen, "sequence should be updated after CommitSequence") } diff --git a/svc/krane/secrets/token/k8s_validator.go b/svc/krane/secrets/token/k8s_validator.go index b7972d3ddf..7660c020af 100644 --- a/svc/krane/secrets/token/k8s_validator.go +++ b/svc/krane/secrets/token/k8s_validator.go @@ -20,6 +20,11 @@ type K8sValidatorConfig struct { Clientset kubernetes.Interface } +// K8sValidator validates service account tokens using Kubernetes TokenReview API. +// +// This validator authenticates requests by verifying that tokens belong to pods +// annotated with the expected deployment ID. It provides Kubernetes-native +// authentication without requiring external identity providers. type K8sValidator struct { clientset kubernetes.Interface } diff --git a/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx b/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx index 6ab3258e06..a98223e184 100644 --- a/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx +++ b/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx @@ -1,11 +1,11 @@ --- title: Pull-Based Provisioning -description: Sequence-based infrastructure synchronization with autonomous agents, polling-based updates, and eventual consistency +description: Version-based infrastructure synchronization with autonomous agents, polling-based updates, and eventual consistency --- import { Mermaid } from "@/app/components/mermaid"; -Unkey's infrastructure orchestration implements a pull-based model where autonomous Krane instances poll the control plane for state changes and continuously reconcile desired state with actual state. This architecture follows the Kubernetes List+Watch pattern, using a monotonically increasing sequence number to track changes and enable efficient incremental synchronization. +Unkey's infrastructure orchestration implements a pull-based model where autonomous Krane instances poll the control plane for state changes and continuously reconcile desired state with actual state. This architecture follows the Kubernetes List+Watch pattern, using a globally unique, monotonically increasing version number embedded directly in each resource row to track changes and enable efficient incremental synchronization. The architecture's core principle is to enable autonomous reconciliation and self-healing without requiring the control plane to track connected clients or push events. @@ -16,23 +16,26 @@ The architecture's core principle is to enable autonomous reconciliation and sel sequenceDiagram autonumber participant U as User + participant VS as VersioningService participant DB as Database participant CP as Control Plane (Ctrl) participant K1 as Krane Agent (Region A) participant K8s as Kubernetes API - K1->>CP: Sync(region=a, sequence=0) + K1->>CP: Sync(region=a, version=0) Note over K1,CP: Bootstrap: stream full state CP->>K1: Stream all deployments/sentinels - Note over K1: Stream closes, last seq=42 + Note over K1: Stream closes, max version=42 U->>CP: Create deployment - CP->>DB: Store deployment topology - CP->>DB: Insert state_change(seq=43) + CP->>VS: NextVersion() + VS-->>CP: version=43 + CP->>DB: Store topology (version=43) Note over K1: Polling every 1-5s - K1->>CP: (stream polls for changes) - CP->>K1: Stream State(seq=43, deploy) + K1->>CP: Sync(region=a, version=42) + CP->>DB: SELECT WHERE version > 42 + CP->>K1: Stream State(version=43, deploy) K1->>K8s: Apply deployment K8s-->>K1: Pod status change @@ -40,25 +43,55 @@ sequenceDiagram CP->>DB: Upsert Instance table" /> +## VersioningService + +The VersioningService is a Restate virtual object that generates globally unique, monotonically increasing version numbers. It is a singleton (keyed by empty string) that maintains a durable counter in Restate's state store. + +Before any mutation to a deployment topology or sentinel, the control plane calls `NextVersion()` to obtain the next version. This version is then stored directly on the resource row. The virtual object guarantees: + +- **Monotonically increasing values**: Versions always increase, with no gaps under normal operation +- **Exactly-once semantics**: Retries of the same Restate invocation return the same version +- **Single-writer**: The singleton virtual object serializes all version generation + +This eliminates the need for a separate changelog table—the version embedded in each resource row is the source of truth for synchronization. + ## Sync Protocol The synchronization protocol uses a single `Sync` RPC that handles both initial bootstrap and incremental updates. This design eliminates the complexity of managing separate "synthetic" and "live" modes, and removes the need for the control plane to track connected clients in memory. -### Sequence Numbers +### Version-Based Tracking -Every state change (deployment created, sentinel updated, resource deleted) is recorded in the `state_changes` table with a monotonically increasing sequence number per region. Krane tracks its last-seen sequence and resumes from that point on reconnect, enabling efficient incremental sync without missing events. +Every resource (deployment topology, sentinel) carries a `version` column set at insert/update time via the VersioningService. Krane tracks its last-seen version (`versionLastSeen`) and requests only resources with `version > versionLastSeen`. The version is globally unique across all resource types, enabling a single watermark for all synchronization. ### Bootstrap Phase -When Krane connects with `sequence=0` (fresh start or after data loss), the control plane streams the complete desired state for the region. Each message contains a sequence number. Krane tracks the highest sequence seen and uses it for subsequent polls. Stream close signals bootstrap completion. +When Krane connects with `version=0` (fresh start or after data loss), the control plane streams the complete desired state for the region. Resources are ordered by version. Krane tracks the maximum version seen across all received resources. Stream close without error signals bootstrap completion—only then does Krane commit its version watermark. + +After receiving the full bootstrap, Krane garbage-collects any Kubernetes resources that were not mentioned in the stream. This ensures consistency even if the control plane's view of desired state has changed significantly. + +### Incremental Sync + +After bootstrap, Krane periodically reconnects with its last-seen version. The control plane queries resource tables directly: + +```sql +SELECT * FROM deployment_topology +WHERE region = ? AND version > ? +ORDER BY version LIMIT 100; + +SELECT * FROM sentinels +WHERE region = ? AND version > ? +ORDER BY version LIMIT 100; +``` + +Changes are streamed to Krane, which processes each and tracks the maximum version seen. -### Watch Phase +### Soft Deletes -After bootstrap, the stream enters watch mode. The control plane polls the `state_changes` table every 250ms for new entries after Krane's last-seen sequence. When changes are found, they're streamed to Krane with their sequence numbers. Krane processes each change and updates its sequence watermark. +Resources are never hard-deleted during sync. Instead, "deletes" are represented as state changes: setting `desired_replicas=0` or `desired_status=stopped`. Krane sees this change (the row still exists with a new version) and deletes the corresponding Kubernetes resource. ### Reconnection -If the connection drops, Krane reconnects with its last-seen sequence. If this sequence is still within the retention window (changes are kept for 7 days), sync resumes incrementally. If the sequence is too old, the control plane returns `FailedPrecondition` and Krane must perform a full bootstrap by reconnecting with `sequence=0`. +If the connection drops, Krane reconnects with its last-seen version. Since versions are embedded in resource rows and not in a separate changelog with retention, there's no risk of the version being "too old"—the query simply returns any resources with versions newer than Krane's watermark. ## Krane Agent @@ -69,7 +102,8 @@ Krane agents act as autonomous controllers that reconcile desired state with act graph TB subgraph 'Control Plane' CP[ClusterService] - SC[(state_changes)] + DT[(deployment_topology)] + S[(sentinels)] end subgraph 'Krane Agent' W[Watcher] @@ -87,7 +121,8 @@ graph TB end W -.->|Sync stream| CP - CP -.->|poll| SC + CP -.->|query by version| DT + CP -.->|query by version| S DC -->|Apply/Delete| K8s GC -->|Apply/Delete| K8s K8s -->|Watch Events| DC @@ -95,7 +130,7 @@ graph TB GC -->|UpdateSentinel| CP" /> -The Watcher maintains the Sync stream, reconnecting with jittered backoff (1-5 seconds) on failure. It passes received `State` messages to the Reconciler, which dispatches to the appropriate controller based on resource type. The Reconciler tracks `sequenceLastSeen` and updates it after successfully processing each state change. +The Watcher maintains the Sync stream, reconnecting with jittered backoff (1-5 seconds) on failure. It passes received `State` messages to the Reconciler, which dispatches to the appropriate controller based on resource type. The Reconciler tracks `versionLastSeen` and updates it only after successfully processing state changes and receiving a clean stream close. Status updates flow back to the control plane through unary RPCs (`UpdateDeploymentState`, `UpdateSentinelState`) with buffering, retries, and circuit breakers for reliability. @@ -107,6 +142,7 @@ sequenceDiagram participant User participant API participant Workflow as Deploy Workflow + participant VS as VersioningService participant DB participant Krane participant K8s as Kubernetes @@ -115,10 +151,11 @@ sequenceDiagram API->>Workflow: Start deployment Workflow->>DB: Create deployment - Workflow->>DB: Create topology entries - Workflow->>DB: Insert state_changes + Workflow->>VS: NextVersion() + VS-->>Workflow: version=N + Workflow->>DB: Create topology (version=N) - Note over Krane: Polls state_changes via Sync + Note over Krane: Polls via Sync(version > lastSeen) Krane->>Krane: Receive State(Apply) Krane->>K8s: Apply deployment @@ -141,7 +178,7 @@ sequenceDiagram Krane->>DB: UpdateInstance(running)" /> -The deploy workflow writes desired state to the database and inserts corresponding `state_change` records. It does not push events directly to Krane. The workflow then polls the `instances` table waiting for Krane to report that pods are running, with a timeout for failure handling. +The deploy workflow obtains a version from the VersioningService before writing each topology row. It does not push events directly to Krane. The workflow then polls the `instances` table waiting for Krane to report that pods are running, with a timeout for failure handling. ## Why Polling Over Push @@ -151,16 +188,16 @@ The control plane becomes stateless with respect to connected clients. It doesn' Polling naturally handles backpressure. If Krane falls behind processing, it simply polls less frequently. With push-based streaming, the control plane would need to implement flow control or risk overwhelming slow clients. -The sequence-based approach provides exactly-once delivery semantics. Each change has a unique sequence number, and Krane's watermark ensures no changes are missed or processed twice, even across restarts. +The version-based approach provides exactly-once delivery semantics. Each resource has a unique version, and Krane's watermark ensures no changes are missed or processed twice, even across restarts. The tradeoff is latency. With 1-5 second polling intervals, there's a delay between a state change and Krane receiving it. For our use case (infrastructure provisioning measured in seconds to minutes), this latency is acceptable. ## Database Schema -The `state_changes` table is the changelog that drives synchronization. Each row represents a create, update, or delete operation on a deployment or sentinel. The `sequence` column is an auto-incrementing primary key that provides ordering. Rows are indexed by `(region, sequence)` for efficient polling and retained for 7 days before cleanup. +The `deployment_topology` table defines desired state for multi-region deployments. Each row specifies the desired replica count for a deployment in a specific region. The `version` column (unique, indexed with region) is set via VersioningService on insert/update. -The `deployment_topology` table defines desired state for multi-region deployments. Each row specifies the desired replica count for a deployment in a specific region. When this table is modified, a corresponding `state_change` row is inserted. +The `sentinels` table combines desired and actual state with an embedded version. Desired fields (cpu, memory, replicas, desired_state) are set by the control plane with a new version; actual fields (available_replicas, health) are updated by Krane without version changes. The `instances` table tracks actual state reported by Krane. We only write to it in response to Kubernetes events. The workflow polls this table to determine when deployments are ready. -The `sentinels` table combines desired and actual state. Desired fields (cpu, memory, replicas) are set by the control plane; actual fields (available_replicas, health) are updated by Krane. +Both `deployment_topology` and `sentinels` have a composite index on `(region, version)` for efficient sync queries. diff --git a/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx b/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx index 0e06130945..a8727e8d8b 100644 --- a/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx +++ b/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx @@ -1,11 +1,11 @@ --- title: Krane Sync Engine Architecture -description: Deep dive into Krane's sequence-based synchronization that polls for state changes and ensures eventual consistency +description: Deep dive into Krane's version-based synchronization that queries resource tables directly and ensures eventual consistency --- import { Mermaid } from "@/app/components/mermaid"; -The Krane Sync Engine implements a Kubernetes-style List+Watch pattern for synchronizing desired infrastructure state from the control plane. It uses sequence numbers to track state changes, enabling efficient incremental synchronization and reliable recovery after disconnections. +The Krane Sync Engine implements a Kubernetes-style List+Watch pattern for synchronizing desired infrastructure state from the control plane. It uses version numbers embedded in resource tables to track state changes, enabling efficient incremental synchronization and reliable recovery after disconnections. ## Architecture @@ -29,8 +29,12 @@ graph TB subgraph 'Control Plane' CS[ClusterService] - SC[(state_changes)] - CS -.->|poll| SC + VS[VersioningService] + DT[(deployment_topology)] + ST[(sentinels)] + CS -.->|query| DT + CS -.->|query| ST + VS -.->|nextVersion| CS end subgraph Kubernetes @@ -53,19 +57,21 @@ graph TB The sync engine uses a single `Sync` RPC to receive state changes from the control plane. This RPC establishes a server-streaming connection where the control plane sends `State` messages containing deployment or sentinel operations. -### Sequence Tracking +### Version Tracking -The reconciler maintains a `sequenceLastSeen` field that tracks the highest sequence number successfully processed. On startup, this is zero. After processing each `State` message, the reconciler updates this watermark. When reconnecting after a failure, Krane sends its last-seen sequence in the `SyncRequest`, allowing the control plane to resume from the correct position. +Each resource (deployment_topology, sentinel) has a `version` column updated on every mutation via the Restate VersioningService singleton. This provides a globally unique, monotonically increasing version across all resources. + +The reconciler maintains a `versionLastSeen` field that tracks the highest version successfully processed. On startup, this is zero. After processing each `State` message, the reconciler tracks the max version seen but only commits it after a clean stream close. When reconnecting after a failure, Krane sends its last-seen version in the `SyncRequest`, allowing the control plane to resume from the correct position. ### Message Types -The `State` message contains a sequence number and one of two payloads: +The `State` message contains a version number and one of two payloads: **DeploymentState** contains either an `ApplyDeployment` (create or update a StatefulSet with the specified image, replicas, and resource limits) or `DeleteDeployment` (remove the StatefulSet and its associated Service). **SentinelState** contains either an `ApplySentinel` (create or update a sentinel deployment) or `DeleteSentinel` (remove the sentinel). -Stream close signals that the current batch (or bootstrap) is complete. The client tracks the highest sequence from received messages and uses it for the next sync request. +Stream close signals that the current batch (or bootstrap) is complete. The client tracks the highest version from received messages and uses it for the next sync request. ## Watcher Loop @@ -74,10 +80,14 @@ The Watcher runs a continuous loop with jittered reconnection timing (1-5 second ``` for { sleep(random(1s, 5s)) - stream = cluster.Sync(region, sequenceLastSeen) + stream = cluster.Sync(region, versionLastSeen) + maxVersion = 0 for message in stream { - reconciler.HandleState(message) + ver = reconciler.HandleState(message) + maxVersion = max(maxVersion, ver) } + if stream.closed_cleanly: + versionLastSeen = maxVersion } ``` @@ -97,11 +107,16 @@ HandleState(state): if Apply: ApplySentinel(state.Apply) if Delete: DeleteSentinel(state.Delete) - if state.Sequence > sequenceLastSeen: - sequenceLastSeen = state.Sequence + return state.Version ``` -The sequence watermark is updated after processing, ensuring at-least-once delivery. If Krane crashes mid-processing, it will reprocess the same message on restart, which is safe because apply operations are idempotent. +The version is returned but NOT committed until stream closes cleanly. This ensures atomic bootstrap: if the stream breaks mid-bootstrap, the client retries from version 0 rather than skipping resources that were never received. + +## Soft Deletes + +"Deletes" are implemented as soft deletes: setting `desired_replicas=0` or `desired_state='archived'`. The row remains in the table with its version updated, so clients receive the change and can delete the corresponding Kubernetes resource. + +This eliminates the need for a separate changelog table. The resource tables themselves are the source of truth, and each row carries its version for efficient incremental sync. ## Kubernetes Watchers @@ -115,8 +130,8 @@ Status updates to the control plane are buffered in memory before sending. This ## Failure Modes -**Stream disconnection**: The watcher reconnects with jittered backoff. If the last-seen sequence is within the 7-day retention window, sync resumes incrementally. Otherwise, Krane performs a full bootstrap. +**Stream disconnection**: The watcher reconnects with jittered backoff. Since version is not committed until clean close, reconnection resumes from the last committed version. **Control plane unavailable**: The circuit breaker opens after repeated failures, preventing Krane from overwhelming a struggling control plane. Local Kubernetes state continues to function; only sync with the control plane is paused. -**Sequence too old**: If Krane has been offline longer than the 7-day retention period, the control plane returns `FailedPrecondition`. Krane resets its sequence to zero and performs a full bootstrap, which may result in reprocessing resources that already exist (handled gracefully by idempotent apply operations). +**Bootstrap + GC**: After a full bootstrap (version=0), Krane garbage-collects any Kubernetes resources not mentioned in the bootstrap stream. This ensures stale resources are cleaned up even if they were hard-deleted from the database. diff --git a/web/internal/db/src/schema/deployment_topology.ts b/web/internal/db/src/schema/deployment_topology.ts index 4835817666..cb7213192d 100644 --- a/web/internal/db/src/schema/deployment_topology.ts +++ b/web/internal/db/src/schema/deployment_topology.ts @@ -23,6 +23,12 @@ export const deploymentTopology = mysqlTable( desiredReplicas: int("desired_replicas").notNull(), + // Version for state synchronization with edge agents. + // Updated via Restate VersioningService on each mutation. + // Edge agents track their last-seen version and request changes after it. + // Unique across all resources (shared global counter). + version: bigint("version", { mode: "number", unsigned: true }).notNull().unique(), + // Deployment status desiredStatus: mysqlEnum("desired_status", [ "starting", @@ -38,6 +44,7 @@ export const deploymentTopology = mysqlTable( index("deployment_idx").on(table.deploymentId), index("region_idx").on(table.region), index("status_idx").on(table.desiredStatus), + index("region_version_idx").on(table.region, table.version), ], ); diff --git a/web/internal/db/src/schema/sentinels.ts b/web/internal/db/src/schema/sentinels.ts index e36ba8ddc3..1072df3d1a 100644 --- a/web/internal/db/src/schema/sentinels.ts +++ b/web/internal/db/src/schema/sentinels.ts @@ -42,10 +42,18 @@ export const sentinels = mysqlTable( availableReplicas: int("available_replicas").notNull(), cpuMillicores: int("cpu_millicores").notNull(), memoryMib: int("memory_mib").notNull(), + + // Version for state synchronization with edge agents. + // Updated via Restate VersioningService on each mutation. + // Edge agents track their last-seen version and request changes after it. + // Unique across all resources (shared global counter). + version: bigint("version", { mode: "number", unsigned: true }).notNull().unique(), + ...lifecycleDates, }, (table) => [ index("idx_environment_id").on(table.environmentId), + index("region_version_idx").on(table.region, table.version), uniqueIndex("one_env_per_region").on(table.environmentId, table.region), ], From 39fc19da7f3e9de655175037d229d5ea6e089944 Mon Sep 17 00:00:00 2001 From: chronark Date: Tue, 20 Jan 2026 18:56:38 +0100 Subject: [PATCH 06/32] it works again --- AGENTS.md | 153 +-- cmd/dev/seed/local.go | 3 + cmd/sentinel/main.go | 2 +- dev/k8s/manifests/cilium-policies.yaml | 43 + gen/proto/ctrl/v1/cluster.pb.go | 628 ++++----- .../ctrl/v1/ctrlv1connect/cluster.connect.go | 176 ++- gen/proto/krane/v1/BUILD.bazel | 5 +- gen/proto/krane/v1/kranev1connect/BUILD.bazel | 5 +- .../v1/kranev1connect/scheduler.connect.go | 253 ---- gen/proto/krane/v1/scheduler.pb.go | 1219 ----------------- pkg/db/BUILD.bazel | 1 - .../cluster_state_versions.sql_generated.go | 83 -- ...topology_find_by_versions.sql_generated.go | 41 +- pkg/db/querier_generated.go | 55 +- pkg/db/queries/cluster_state_versions.sql | 17 - .../deployment_topology_find_by_versions.sql | 11 - .../deployment_topology_list_by_versions.sql | 13 + pkg/db/queries/sentinel_find_by_versions.sql | 11 +- ...sentinel_find_by_versions.sql_generated.go | 37 +- svc/ctrl/proto/ctrl/v1/cluster.proto | 83 +- svc/ctrl/services/cluster/BUILD.bazel | 7 +- ...ate.go => rpc_report_deployment_status.go} | 25 +- ...state.go => rpc_report_sentinel_status.go} | 6 +- svc/ctrl/services/cluster/rpc_sync.go | 226 --- .../services/cluster/rpc_watch_deployments.go | 128 ++ .../services/cluster/rpc_watch_sentinels.go | 117 ++ svc/krane/BUILD.bazel | 3 +- svc/krane/DOCUMENTATION_SUMMARY.md | 130 -- svc/krane/doc.go | 19 +- svc/krane/internal/deployment/BUILD.bazel | 61 + .../deployment/actual_state_report.go | 75 + .../apply.go} | 38 +- svc/krane/internal/deployment/consts.go | 25 + svc/krane/internal/deployment/controller.go | 97 ++ .../internal/deployment/controller_test.go | 119 ++ .../delete.go} | 16 +- .../deployment/desired_state_apply.go | 73 + svc/krane/internal/deployment/doc.go | 44 + .../{reconciler => deployment}/namespace.go | 20 +- svc/krane/internal/deployment/resync.go | 74 + .../{reconciler => deployment}/scheduling.go | 17 +- svc/krane/internal/deployment/state.go | 83 ++ svc/krane/internal/reconciler/BUILD.bazel | 97 -- .../reconciler/apply_deployment_test.go | 295 ---- .../reconciler/apply_sentinel_test.go | 379 ----- svc/krane/internal/reconciler/consts.go | 10 - .../reconciler/delete_deployment_test.go | 110 -- .../reconciler/delete_sentinel_test.go | 199 --- svc/krane/internal/reconciler/doc.go | 73 - .../internal/reconciler/handle_state_test.go | 146 -- .../reconciler/mock_cluster_client_test.go | 64 - .../internal/reconciler/namespace_test.go | 250 ---- svc/krane/internal/reconciler/reconciler.go | 138 -- .../reconciler/refresh_current_deployments.go | 79 -- .../refresh_current_deployments_test.go | 364 ----- .../reconciler/refresh_current_sentinels.go | 79 -- .../refresh_current_sentinels_test.go | 362 ----- .../internal/reconciler/test_helpers_test.go | 263 ---- svc/krane/internal/reconciler/tolerations.go | 17 - svc/krane/internal/reconciler/update_state.go | 109 -- .../internal/reconciler/update_state_test.go | 339 ----- .../reconciler/version_tracking_test.go | 116 -- .../reconciler/watch_current_deployments.go | 77 -- .../watch_current_deployments_test.go | 299 ---- .../watch_current_sentinels_test.go | 289 ---- svc/krane/internal/reconciler/watcher.go | 55 - svc/krane/internal/reconciler/watcher_test.go | 590 -------- svc/krane/internal/sentinel/BUILD.bazel | 51 + .../actual_state_report.go} | 37 +- .../apply_sentinel.go => sentinel/apply.go} | 58 +- svc/krane/internal/sentinel/consts.go | 28 + svc/krane/internal/sentinel/controller.go | 93 ++ .../controller_test.go} | 69 +- .../delete_sentinel.go => sentinel/delete.go} | 18 +- .../internal/sentinel/desired_state_apply.go | 72 + svc/krane/internal/sentinel/doc.go | 51 + svc/krane/internal/sentinel/resync.go | 74 + svc/krane/internal/testutil/BUILD.bazel | 13 + .../internal/testutil/mock_cluster_client.go | 73 + svc/krane/proto/krane/v1/scheduler.proto | 174 --- svc/krane/run.go | 25 +- .../docs/architecture/services/ctrl/index.mdx | 8 +- .../services/ctrl/pull-based-infra.mdx | 37 +- .../architecture/services/krane/index.mdx | 31 +- .../services/krane/sync-engine.mdx | 42 +- 85 files changed, 2209 insertions(+), 7786 deletions(-) delete mode 100644 gen/proto/krane/v1/kranev1connect/scheduler.connect.go delete mode 100644 gen/proto/krane/v1/scheduler.pb.go delete mode 100644 pkg/db/cluster_state_versions.sql_generated.go delete mode 100644 pkg/db/queries/cluster_state_versions.sql delete mode 100644 pkg/db/queries/deployment_topology_find_by_versions.sql create mode 100644 pkg/db/queries/deployment_topology_list_by_versions.sql rename svc/ctrl/services/cluster/{rpc_update_deployment_state.go => rpc_report_deployment_status.go} (73%) rename svc/ctrl/services/cluster/{rpc_update_sentinel_state.go => rpc_report_sentinel_status.go} (81%) delete mode 100644 svc/ctrl/services/cluster/rpc_sync.go create mode 100644 svc/ctrl/services/cluster/rpc_watch_deployments.go create mode 100644 svc/ctrl/services/cluster/rpc_watch_sentinels.go delete mode 100644 svc/krane/DOCUMENTATION_SUMMARY.md create mode 100644 svc/krane/internal/deployment/BUILD.bazel create mode 100644 svc/krane/internal/deployment/actual_state_report.go rename svc/krane/internal/{reconciler/apply_deployment.go => deployment/apply.go} (75%) create mode 100644 svc/krane/internal/deployment/consts.go create mode 100644 svc/krane/internal/deployment/controller.go create mode 100644 svc/krane/internal/deployment/controller_test.go rename svc/krane/internal/{reconciler/delete_deployment.go => deployment/delete.go} (68%) create mode 100644 svc/krane/internal/deployment/desired_state_apply.go create mode 100644 svc/krane/internal/deployment/doc.go rename svc/krane/internal/{reconciler => deployment}/namespace.go (83%) create mode 100644 svc/krane/internal/deployment/resync.go rename svc/krane/internal/{reconciler => deployment}/scheduling.go (74%) create mode 100644 svc/krane/internal/deployment/state.go delete mode 100644 svc/krane/internal/reconciler/BUILD.bazel delete mode 100644 svc/krane/internal/reconciler/apply_deployment_test.go delete mode 100644 svc/krane/internal/reconciler/apply_sentinel_test.go delete mode 100644 svc/krane/internal/reconciler/consts.go delete mode 100644 svc/krane/internal/reconciler/delete_deployment_test.go delete mode 100644 svc/krane/internal/reconciler/delete_sentinel_test.go delete mode 100644 svc/krane/internal/reconciler/doc.go delete mode 100644 svc/krane/internal/reconciler/handle_state_test.go delete mode 100644 svc/krane/internal/reconciler/mock_cluster_client_test.go delete mode 100644 svc/krane/internal/reconciler/namespace_test.go delete mode 100644 svc/krane/internal/reconciler/reconciler.go delete mode 100644 svc/krane/internal/reconciler/refresh_current_deployments.go delete mode 100644 svc/krane/internal/reconciler/refresh_current_deployments_test.go delete mode 100644 svc/krane/internal/reconciler/refresh_current_sentinels.go delete mode 100644 svc/krane/internal/reconciler/refresh_current_sentinels_test.go delete mode 100644 svc/krane/internal/reconciler/test_helpers_test.go delete mode 100644 svc/krane/internal/reconciler/tolerations.go delete mode 100644 svc/krane/internal/reconciler/update_state.go delete mode 100644 svc/krane/internal/reconciler/update_state_test.go delete mode 100644 svc/krane/internal/reconciler/version_tracking_test.go delete mode 100644 svc/krane/internal/reconciler/watch_current_deployments.go delete mode 100644 svc/krane/internal/reconciler/watch_current_deployments_test.go delete mode 100644 svc/krane/internal/reconciler/watch_current_sentinels_test.go delete mode 100644 svc/krane/internal/reconciler/watcher.go delete mode 100644 svc/krane/internal/reconciler/watcher_test.go create mode 100644 svc/krane/internal/sentinel/BUILD.bazel rename svc/krane/internal/{reconciler/watch_current_sentinels.go => sentinel/actual_state_report.go} (58%) rename svc/krane/internal/{reconciler/apply_sentinel.go => sentinel/apply.go} (83%) create mode 100644 svc/krane/internal/sentinel/consts.go create mode 100644 svc/krane/internal/sentinel/controller.go rename svc/krane/internal/{reconciler/reconciler_test.go => sentinel/controller_test.go} (57%) rename svc/krane/internal/{reconciler/delete_sentinel.go => sentinel/delete.go} (72%) create mode 100644 svc/krane/internal/sentinel/desired_state_apply.go create mode 100644 svc/krane/internal/sentinel/doc.go create mode 100644 svc/krane/internal/sentinel/resync.go create mode 100644 svc/krane/internal/testutil/BUILD.bazel create mode 100644 svc/krane/internal/testutil/mock_cluster_client.go delete mode 100644 svc/krane/proto/krane/v1/scheduler.proto diff --git a/AGENTS.md b/AGENTS.md index f8eb43ae53..5ce32e97bc 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,9 +2,39 @@ This document provides essential information for AI coding agents working in this repository. +## Communication + +- Be extremely concise; sacrifice grammar for brevity +- At the end of each plan, list unresolved questions (if any) + +## Code Quality Standards + +- Make minimal, surgical changes +- **Never compromise type safety**: No `any`, no `!` (non-null assertion), no `as Type` +- **Make illegal states unrepresentable**: Model domain with ADTs/discriminated unions; parse inputs at boundaries into typed structures +- Leave the codebase better than you found it + +### Entropy + +This codebase will outlive you. Every shortcut you take becomes +someone else's burden. Every hack compounds into technical debt +that slows the whole team down. + +You are not just writing code. You are shaping the future of this +project. The patterns you establish will be copied. The corners +you cut will be cut again. + +**Fight entropy. Leave the codebase better than you found it.** + +## Specialized Subagents + +- **Oracle**: code review, architecture decisions, debugging, refactor planning +- **Librarian**: understanding 3rd party libs, exploring remote repos, discovering patterns + ## Project Overview Unkey is an open-source API authentication and authorization platform. This is a **polyglot monorepo** containing: + - **Go backend** (root): Services, APIs, and shared libraries built with Bazel - **TypeScript frontend** (`/web`): Dashboard and API workers built with pnpm/Turborepo @@ -67,31 +97,8 @@ make clean # Stop and remove all Docker services ### Go Conventions -**Imports** - Organize in groups separated by blank lines: -1. Standard library -2. External/third-party packages -3. Internal packages (`github.com/unkeyed/unkey/internal/...`) -4. Package-level (`github.com/unkeyed/unkey/pkg/...`) -5. Service-level (`github.com/unkeyed/unkey/svc/...`) -6. Generated code (`github.com/unkeyed/unkey/gen/...`) - -**Error Handling** - Use the `fault` package for structured errors: -```go -return fault.Wrap(err, - fault.Code(codes.App.Internal.ServiceUnavailable.URN()), - fault.Internal("debug message for logs"), - fault.Public("user-facing message"), -) -``` - -**Naming Conventions:** -- Files: `snake_case.go`, tests: `*_test.go` -- Exported functions/types: `PascalCase` -- Unexported: `camelCase` -- Receivers: short names `(s *Service)`, `(h *Handler)` -- Constants: `SCREAMING_SNAKE_CASE` - **Testing** - Use `testify/require` for assertions: + ```go func TestFeature(t *testing.T) { t.Run("scenario", func(t *testing.T) { @@ -101,104 +108,6 @@ func TestFeature(t *testing.T) { } ``` -### TypeScript Conventions - -**Formatting** (enforced by Biome): -- 2 spaces indentation -- 100 character line width -- Use `const` over `let` -- Use template literals over string concatenation -- Use `import type` for type-only imports - -**Style Rules:** -- No default exports (except Next.js pages/layouts/configs) -- No `var` declarations -- No `any` types (use `unknown` or proper types) -- No non-null assertions in production code -- Use block statements for all control flow -- Use optional chaining (`?.`) over manual checks - -**Testing** - Use Vitest with descriptive assertions: -```typescript -import { describe, expect, test } from "vitest"; - -test("creates key", async (t) => { - const h = await IntegrationHarness.init(t); - const res = await h.post({ ... }); - expect(res.status).toBe(200); -}); -``` - -## Project Structure - -``` -/ # Go root -├── cmd/ # CLI entrypoints -├── svc/ # Backend services (api, ctrl, vault, etc.) -├── pkg/ # Shared Go libraries -├── proto/ # Protocol buffer definitions -├── gen/ # Generated code (proto, sqlc) -└── web/ # TypeScript monorepo - ├── apps/ - │ ├── api/ # Cloudflare Workers API (Hono) - │ ├── dashboard/ # Next.js dashboard - │ └── docs/ # Documentation site - └── internal/ # Shared TS packages (db, ui, rbac, etc.) -``` - -## Key Technologies - -**Go Backend:** -- Bazel for builds, Gazelle for BUILD file generation -- `pkg/zen` - Custom HTTP framework -- `pkg/fault` - Structured error handling -- `pkg/codes` - Error code URNs -- golangci-lint v2 for linting - -**TypeScript Frontend:** -- pnpm workspaces + Turborepo -- Next.js 14 (dashboard) -- Hono (API workers) -- Drizzle ORM (database) -- Biome for formatting/linting -- Vitest for testing - -## Common Patterns - -### Go HTTP Handlers (zen framework) -```go -func (h *Handler) Handle(ctx context.Context, s *zen.Session) error { - auth, emit, err := h.Keys.GetRootKey(ctx, s) - defer emit() - if err != nil { return err } - - req, err := zen.BindBody[Request](s) - if err != nil { return err } - - // Business logic... - - return s.JSON(http.StatusOK, Response{...}) -} -``` - -### TypeScript API Tests -```typescript -const h = await IntegrationHarness.init(t); -const root = await h.createRootKey([`api.${h.resources.userApi.id}.create_key`]); -const res = await h.post({ - url: "/v1/keys.createKey", - headers: { Authorization: `Bearer ${root.key}` }, - body: { ... }, -}); -expect(res.status).toBe(200); -``` - -## Linting Configuration - -**Go** (`.golangci.yaml`): Strict linting including exhaustive switch/map checks, struct initialization (`exhaustruct`), and security checks (`gosec`). - -**TypeScript** (`web/biome.json`): Enforces no unused variables/imports, strict equality, proper React hooks usage, and consistent code style. - ## Detailed Guidelines For comprehensive guidance, read these internal docs in `web/apps/engineering/content/docs/contributing/`: diff --git a/cmd/dev/seed/local.go b/cmd/dev/seed/local.go index afd915cf71..33b00f204c 100644 --- a/cmd/dev/seed/local.go +++ b/cmd/dev/seed/local.go @@ -270,6 +270,9 @@ func seedLocal(ctx context.Context, cmd *cli.Command) error { "ratelimit.*.read_override", "ratelimit.*.set_override", "workspace.*.read_workspace", + "project.*.generate_upload_url", + "project.*.create_deployment", + "project.*.read_deployment", } permissionParams := make([]db.InsertPermissionParams, len(allPermissions)) diff --git a/cmd/sentinel/main.go b/cmd/sentinel/main.go index 54af432f38..8a20c7d292 100644 --- a/cmd/sentinel/main.go +++ b/cmd/sentinel/main.go @@ -24,7 +24,7 @@ var Cmd = &cli.Command{ // Instance Identification cli.String("sentinel-id", "Unique identifier for this sentinel instance. Auto-generated if not provided.", - cli.Default(uid.New("sentinel", 4)), cli.EnvVar("UNKEY_GATEWAY_ID")), + cli.Default(uid.New("sentinel", 4)), cli.EnvVar("UNKEY_SENTINEL_ID")), cli.String("workspace-id", "Workspace ID this sentinel serves. Required.", cli.Required(), cli.EnvVar("UNKEY_WORKSPACE_ID")), diff --git a/dev/k8s/manifests/cilium-policies.yaml b/dev/k8s/manifests/cilium-policies.yaml index 1997058707..bd2d08a21d 100644 --- a/dev/k8s/manifests/cilium-policies.yaml +++ b/dev/k8s/manifests/cilium-policies.yaml @@ -102,6 +102,49 @@ spec: - port: "8080" protocol: TCP --- +# 5b. Allow sentinel egress to MySQL and DNS +# Sentinels need database access for routing and state management +apiVersion: cilium.io/v2 +kind: CiliumClusterwideNetworkPolicy +metadata: + name: allow-sentinel-egress +spec: + description: "Allow sentinel pods to reach MySQL and resolve DNS" + endpointSelector: + matchLabels: + io.kubernetes.pod.namespace: sentinel + app.kubernetes.io/component: sentinel + egress: + # DNS resolution via kube-dns + - toEndpoints: + - matchLabels: + io.kubernetes.pod.namespace: kube-system + k8s-app: kube-dns + toPorts: + - ports: + - port: "53" + protocol: ANY + # MySQL in unkey namespace + - toEndpoints: + - matchLabels: + io.kubernetes.pod.namespace: unkey + app: mysql + toPorts: + - ports: + - port: "3306" + protocol: TCP + # ClickHouse in unkey namespace (HTTP and native protocols) + - toEndpoints: + - matchLabels: + io.kubernetes.pod.namespace: unkey + app: clickhouse + toPorts: + - ports: + - port: "8123" + protocol: TCP + - port: "9000" + protocol: TCP +--- # 6. Allow customer pods to reach Krane for secret decryption # Customer pods need to call Krane's DecryptSecretsBlob RPC during init (inject container) apiVersion: cilium.io/v2 diff --git a/gen/proto/ctrl/v1/cluster.pb.go b/gen/proto/ctrl/v1/cluster.pb.go index c108e6a861..d62b181ca7 100644 --- a/gen/proto/ctrl/v1/cluster.pb.go +++ b/gen/proto/ctrl/v1/cluster.pb.go @@ -84,24 +84,24 @@ func (Health) EnumDescriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{0} } -type UpdateDeploymentStateRequest_Update_Instance_Status int32 +type ReportDeploymentStatusRequest_Update_Instance_Status int32 const ( - UpdateDeploymentStateRequest_Update_Instance_STATUS_UNSPECIFIED UpdateDeploymentStateRequest_Update_Instance_Status = 0 - UpdateDeploymentStateRequest_Update_Instance_STATUS_PENDING UpdateDeploymentStateRequest_Update_Instance_Status = 1 // Deployment request accepted, container/pod creation in progress - UpdateDeploymentStateRequest_Update_Instance_STATUS_RUNNING UpdateDeploymentStateRequest_Update_Instance_Status = 2 // Container/pod is running and healthy - UpdateDeploymentStateRequest_Update_Instance_STATUS_FAILED UpdateDeploymentStateRequest_Update_Instance_Status = 3 // Container/pod failed to start + ReportDeploymentStatusRequest_Update_Instance_STATUS_UNSPECIFIED ReportDeploymentStatusRequest_Update_Instance_Status = 0 + ReportDeploymentStatusRequest_Update_Instance_STATUS_PENDING ReportDeploymentStatusRequest_Update_Instance_Status = 1 // Deployment request accepted, container/pod creation in progress + ReportDeploymentStatusRequest_Update_Instance_STATUS_RUNNING ReportDeploymentStatusRequest_Update_Instance_Status = 2 // Container/pod is running and healthy + ReportDeploymentStatusRequest_Update_Instance_STATUS_FAILED ReportDeploymentStatusRequest_Update_Instance_Status = 3 // Container/pod failed to start ) -// Enum value maps for UpdateDeploymentStateRequest_Update_Instance_Status. +// Enum value maps for ReportDeploymentStatusRequest_Update_Instance_Status. var ( - UpdateDeploymentStateRequest_Update_Instance_Status_name = map[int32]string{ + ReportDeploymentStatusRequest_Update_Instance_Status_name = map[int32]string{ 0: "STATUS_UNSPECIFIED", 1: "STATUS_PENDING", 2: "STATUS_RUNNING", 3: "STATUS_FAILED", } - UpdateDeploymentStateRequest_Update_Instance_Status_value = map[string]int32{ + ReportDeploymentStatusRequest_Update_Instance_Status_value = map[string]int32{ "STATUS_UNSPECIFIED": 0, "STATUS_PENDING": 1, "STATUS_RUNNING": 2, @@ -109,30 +109,30 @@ var ( } ) -func (x UpdateDeploymentStateRequest_Update_Instance_Status) Enum() *UpdateDeploymentStateRequest_Update_Instance_Status { - p := new(UpdateDeploymentStateRequest_Update_Instance_Status) +func (x ReportDeploymentStatusRequest_Update_Instance_Status) Enum() *ReportDeploymentStatusRequest_Update_Instance_Status { + p := new(ReportDeploymentStatusRequest_Update_Instance_Status) *p = x return p } -func (x UpdateDeploymentStateRequest_Update_Instance_Status) String() string { +func (x ReportDeploymentStatusRequest_Update_Instance_Status) String() string { return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) } -func (UpdateDeploymentStateRequest_Update_Instance_Status) Descriptor() protoreflect.EnumDescriptor { +func (ReportDeploymentStatusRequest_Update_Instance_Status) Descriptor() protoreflect.EnumDescriptor { return file_ctrl_v1_cluster_proto_enumTypes[1].Descriptor() } -func (UpdateDeploymentStateRequest_Update_Instance_Status) Type() protoreflect.EnumType { +func (ReportDeploymentStatusRequest_Update_Instance_Status) Type() protoreflect.EnumType { return &file_ctrl_v1_cluster_proto_enumTypes[1] } -func (x UpdateDeploymentStateRequest_Update_Instance_Status) Number() protoreflect.EnumNumber { +func (x ReportDeploymentStatusRequest_Update_Instance_Status) Number() protoreflect.EnumNumber { return protoreflect.EnumNumber(x) } -// Deprecated: Use UpdateDeploymentStateRequest_Update_Instance_Status.Descriptor instead. -func (UpdateDeploymentStateRequest_Update_Instance_Status) EnumDescriptor() ([]byte, []int) { +// Deprecated: Use ReportDeploymentStatusRequest_Update_Instance_Status.Descriptor instead. +func (ReportDeploymentStatusRequest_Update_Instance_Status) EnumDescriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{2, 0, 0, 0} } @@ -224,31 +224,33 @@ func (x *GetDesiredDeploymentStateRequest) GetDeploymentId() string { return "" } -type UpdateDeploymentStateRequest struct { +// ReportDeploymentStatusRequest reports the actual state of a deployment from the agent. +// Used by runActualStateReportLoop to inform the control plane of K8s cluster state. +type ReportDeploymentStatusRequest struct { state protoimpl.MessageState `protogen:"open.v1"` // Types that are valid to be assigned to Change: // - // *UpdateDeploymentStateRequest_Update_ - // *UpdateDeploymentStateRequest_Delete_ - Change isUpdateDeploymentStateRequest_Change `protobuf_oneof:"change"` + // *ReportDeploymentStatusRequest_Update_ + // *ReportDeploymentStatusRequest_Delete_ + Change isReportDeploymentStatusRequest_Change `protobuf_oneof:"change"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } -func (x *UpdateDeploymentStateRequest) Reset() { - *x = UpdateDeploymentStateRequest{} +func (x *ReportDeploymentStatusRequest) Reset() { + *x = ReportDeploymentStatusRequest{} mi := &file_ctrl_v1_cluster_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *UpdateDeploymentStateRequest) String() string { +func (x *ReportDeploymentStatusRequest) String() string { return protoimpl.X.MessageStringOf(x) } -func (*UpdateDeploymentStateRequest) ProtoMessage() {} +func (*ReportDeploymentStatusRequest) ProtoMessage() {} -func (x *UpdateDeploymentStateRequest) ProtoReflect() protoreflect.Message { +func (x *ReportDeploymentStatusRequest) ProtoReflect() protoreflect.Message { mi := &file_ctrl_v1_cluster_proto_msgTypes[2] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) @@ -260,72 +262,72 @@ func (x *UpdateDeploymentStateRequest) ProtoReflect() protoreflect.Message { return mi.MessageOf(x) } -// Deprecated: Use UpdateDeploymentStateRequest.ProtoReflect.Descriptor instead. -func (*UpdateDeploymentStateRequest) Descriptor() ([]byte, []int) { +// Deprecated: Use ReportDeploymentStatusRequest.ProtoReflect.Descriptor instead. +func (*ReportDeploymentStatusRequest) Descriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{2} } -func (x *UpdateDeploymentStateRequest) GetChange() isUpdateDeploymentStateRequest_Change { +func (x *ReportDeploymentStatusRequest) GetChange() isReportDeploymentStatusRequest_Change { if x != nil { return x.Change } return nil } -func (x *UpdateDeploymentStateRequest) GetUpdate() *UpdateDeploymentStateRequest_Update { +func (x *ReportDeploymentStatusRequest) GetUpdate() *ReportDeploymentStatusRequest_Update { if x != nil { - if x, ok := x.Change.(*UpdateDeploymentStateRequest_Update_); ok { + if x, ok := x.Change.(*ReportDeploymentStatusRequest_Update_); ok { return x.Update } } return nil } -func (x *UpdateDeploymentStateRequest) GetDelete() *UpdateDeploymentStateRequest_Delete { +func (x *ReportDeploymentStatusRequest) GetDelete() *ReportDeploymentStatusRequest_Delete { if x != nil { - if x, ok := x.Change.(*UpdateDeploymentStateRequest_Delete_); ok { + if x, ok := x.Change.(*ReportDeploymentStatusRequest_Delete_); ok { return x.Delete } } return nil } -type isUpdateDeploymentStateRequest_Change interface { - isUpdateDeploymentStateRequest_Change() +type isReportDeploymentStatusRequest_Change interface { + isReportDeploymentStatusRequest_Change() } -type UpdateDeploymentStateRequest_Update_ struct { - Update *UpdateDeploymentStateRequest_Update `protobuf:"bytes,1,opt,name=update,proto3,oneof"` +type ReportDeploymentStatusRequest_Update_ struct { + Update *ReportDeploymentStatusRequest_Update `protobuf:"bytes,1,opt,name=update,proto3,oneof"` } -type UpdateDeploymentStateRequest_Delete_ struct { - Delete *UpdateDeploymentStateRequest_Delete `protobuf:"bytes,2,opt,name=delete,proto3,oneof"` +type ReportDeploymentStatusRequest_Delete_ struct { + Delete *ReportDeploymentStatusRequest_Delete `protobuf:"bytes,2,opt,name=delete,proto3,oneof"` } -func (*UpdateDeploymentStateRequest_Update_) isUpdateDeploymentStateRequest_Change() {} +func (*ReportDeploymentStatusRequest_Update_) isReportDeploymentStatusRequest_Change() {} -func (*UpdateDeploymentStateRequest_Delete_) isUpdateDeploymentStateRequest_Change() {} +func (*ReportDeploymentStatusRequest_Delete_) isReportDeploymentStatusRequest_Change() {} -type UpdateDeploymentStateResponse struct { +type ReportDeploymentStatusResponse struct { state protoimpl.MessageState `protogen:"open.v1"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } -func (x *UpdateDeploymentStateResponse) Reset() { - *x = UpdateDeploymentStateResponse{} +func (x *ReportDeploymentStatusResponse) Reset() { + *x = ReportDeploymentStatusResponse{} mi := &file_ctrl_v1_cluster_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *UpdateDeploymentStateResponse) String() string { +func (x *ReportDeploymentStatusResponse) String() string { return protoimpl.X.MessageStringOf(x) } -func (*UpdateDeploymentStateResponse) ProtoMessage() {} +func (*ReportDeploymentStatusResponse) ProtoMessage() {} -func (x *UpdateDeploymentStateResponse) ProtoReflect() protoreflect.Message { +func (x *ReportDeploymentStatusResponse) ProtoReflect() protoreflect.Message { mi := &file_ctrl_v1_cluster_proto_msgTypes[3] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) @@ -337,48 +339,14 @@ func (x *UpdateDeploymentStateResponse) ProtoReflect() protoreflect.Message { return mi.MessageOf(x) } -// Deprecated: Use UpdateDeploymentStateResponse.ProtoReflect.Descriptor instead. -func (*UpdateDeploymentStateResponse) Descriptor() ([]byte, []int) { +// Deprecated: Use ReportDeploymentStatusResponse.ProtoReflect.Descriptor instead. +func (*ReportDeploymentStatusResponse) Descriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{3} } -type UpdateInstanceStateResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *UpdateInstanceStateResponse) Reset() { - *x = UpdateInstanceStateResponse{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[4] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *UpdateInstanceStateResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*UpdateInstanceStateResponse) ProtoMessage() {} - -func (x *UpdateInstanceStateResponse) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[4] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use UpdateInstanceStateResponse.ProtoReflect.Descriptor instead. -func (*UpdateInstanceStateResponse) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{4} -} - -type UpdateSentinelStateRequest struct { +// ReportSentinelStatusRequest reports the actual state of a sentinel from the agent. +// Used by runActualStateReportLoop to inform the control plane of K8s cluster state. +type ReportSentinelStatusRequest struct { state protoimpl.MessageState `protogen:"open.v1"` K8SName string `protobuf:"bytes,1,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` AvailableReplicas int32 `protobuf:"varint,2,opt,name=available_replicas,json=availableReplicas,proto3" json:"available_replicas,omitempty"` @@ -387,21 +355,21 @@ type UpdateSentinelStateRequest struct { sizeCache protoimpl.SizeCache } -func (x *UpdateSentinelStateRequest) Reset() { - *x = UpdateSentinelStateRequest{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[5] +func (x *ReportSentinelStatusRequest) Reset() { + *x = ReportSentinelStatusRequest{} + mi := &file_ctrl_v1_cluster_proto_msgTypes[4] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *UpdateSentinelStateRequest) String() string { +func (x *ReportSentinelStatusRequest) String() string { return protoimpl.X.MessageStringOf(x) } -func (*UpdateSentinelStateRequest) ProtoMessage() {} +func (*ReportSentinelStatusRequest) ProtoMessage() {} -func (x *UpdateSentinelStateRequest) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[5] +func (x *ReportSentinelStatusRequest) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_cluster_proto_msgTypes[4] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -412,53 +380,53 @@ func (x *UpdateSentinelStateRequest) ProtoReflect() protoreflect.Message { return mi.MessageOf(x) } -// Deprecated: Use UpdateSentinelStateRequest.ProtoReflect.Descriptor instead. -func (*UpdateSentinelStateRequest) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{5} +// Deprecated: Use ReportSentinelStatusRequest.ProtoReflect.Descriptor instead. +func (*ReportSentinelStatusRequest) Descriptor() ([]byte, []int) { + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{4} } -func (x *UpdateSentinelStateRequest) GetK8SName() string { +func (x *ReportSentinelStatusRequest) GetK8SName() string { if x != nil { return x.K8SName } return "" } -func (x *UpdateSentinelStateRequest) GetAvailableReplicas() int32 { +func (x *ReportSentinelStatusRequest) GetAvailableReplicas() int32 { if x != nil { return x.AvailableReplicas } return 0 } -func (x *UpdateSentinelStateRequest) GetHealth() Health { +func (x *ReportSentinelStatusRequest) GetHealth() Health { if x != nil { return x.Health } return Health_HEALTH_UNSPECIFIED } -type UpdateSentinelStateResponse struct { +type ReportSentinelStatusResponse struct { state protoimpl.MessageState `protogen:"open.v1"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } -func (x *UpdateSentinelStateResponse) Reset() { - *x = UpdateSentinelStateResponse{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[6] +func (x *ReportSentinelStatusResponse) Reset() { + *x = ReportSentinelStatusResponse{} + mi := &file_ctrl_v1_cluster_proto_msgTypes[5] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *UpdateSentinelStateResponse) String() string { +func (x *ReportSentinelStatusResponse) String() string { return protoimpl.X.MessageStringOf(x) } -func (*UpdateSentinelStateResponse) ProtoMessage() {} +func (*ReportSentinelStatusResponse) ProtoMessage() {} -func (x *UpdateSentinelStateResponse) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[6] +func (x *ReportSentinelStatusResponse) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_cluster_proto_msgTypes[5] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -469,12 +437,15 @@ func (x *UpdateSentinelStateResponse) ProtoReflect() protoreflect.Message { return mi.MessageOf(x) } -// Deprecated: Use UpdateSentinelStateResponse.ProtoReflect.Descriptor instead. -func (*UpdateSentinelStateResponse) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{6} +// Deprecated: Use ReportSentinelStatusResponse.ProtoReflect.Descriptor instead. +func (*ReportSentinelStatusResponse) Descriptor() ([]byte, []int) { + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{5} } -type SyncRequest struct { +// WatchDeploymentsRequest initiates a stream of deployment state changes. +// The version_last_seen enables resumable streaming - the server will only send +// events newer than this version. Independent of the sentinel stream version. +type WatchDeploymentsRequest struct { state protoimpl.MessageState `protogen:"open.v1"` Region string `protobuf:"bytes,1,opt,name=region,proto3" json:"region,omitempty"` VersionLastSeen uint64 `protobuf:"varint,2,opt,name=version_last_seen,json=versionLastSeen,proto3" json:"version_last_seen,omitempty"` @@ -482,21 +453,21 @@ type SyncRequest struct { sizeCache protoimpl.SizeCache } -func (x *SyncRequest) Reset() { - *x = SyncRequest{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[7] +func (x *WatchDeploymentsRequest) Reset() { + *x = WatchDeploymentsRequest{} + mi := &file_ctrl_v1_cluster_proto_msgTypes[6] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *SyncRequest) String() string { +func (x *WatchDeploymentsRequest) String() string { return protoimpl.X.MessageStringOf(x) } -func (*SyncRequest) ProtoMessage() {} +func (*WatchDeploymentsRequest) ProtoMessage() {} -func (x *SyncRequest) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[7] +func (x *WatchDeploymentsRequest) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_cluster_proto_msgTypes[6] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -507,55 +478,51 @@ func (x *SyncRequest) ProtoReflect() protoreflect.Message { return mi.MessageOf(x) } -// Deprecated: Use SyncRequest.ProtoReflect.Descriptor instead. -func (*SyncRequest) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{7} +// Deprecated: Use WatchDeploymentsRequest.ProtoReflect.Descriptor instead. +func (*WatchDeploymentsRequest) Descriptor() ([]byte, []int) { + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{6} } -func (x *SyncRequest) GetRegion() string { +func (x *WatchDeploymentsRequest) GetRegion() string { if x != nil { return x.Region } return "" } -func (x *SyncRequest) GetVersionLastSeen() uint64 { +func (x *WatchDeploymentsRequest) GetVersionLastSeen() uint64 { if x != nil { return x.VersionLastSeen } return 0 } -type State struct { - state protoimpl.MessageState `protogen:"open.v1"` - // version is the resource version for this state update. - // Clients should track the max version seen and persist it after - // the stream closes cleanly to resume from the correct position on reconnect. - Version uint64 `protobuf:"varint,1,opt,name=version,proto3" json:"version,omitempty"` - // Types that are valid to be assigned to Kind: - // - // *State_Deployment - // *State_Sentinel - Kind isState_Kind `protobuf_oneof:"kind"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache +// WatchSentinelsRequest initiates a stream of sentinel state changes. +// The version_last_seen enables resumable streaming - the server will only send +// events newer than this version. Independent of the deployment stream version. +type WatchSentinelsRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + Region string `protobuf:"bytes,1,opt,name=region,proto3" json:"region,omitempty"` + VersionLastSeen uint64 `protobuf:"varint,2,opt,name=version_last_seen,json=versionLastSeen,proto3" json:"version_last_seen,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } -func (x *State) Reset() { - *x = State{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[8] +func (x *WatchSentinelsRequest) Reset() { + *x = WatchSentinelsRequest{} + mi := &file_ctrl_v1_cluster_proto_msgTypes[7] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *State) String() string { +func (x *WatchSentinelsRequest) String() string { return protoimpl.X.MessageStringOf(x) } -func (*State) ProtoMessage() {} +func (*WatchSentinelsRequest) ProtoMessage() {} -func (x *State) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[8] +func (x *WatchSentinelsRequest) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_cluster_proto_msgTypes[7] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -566,59 +533,25 @@ func (x *State) ProtoReflect() protoreflect.Message { return mi.MessageOf(x) } -// Deprecated: Use State.ProtoReflect.Descriptor instead. -func (*State) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{8} -} - -func (x *State) GetVersion() uint64 { - if x != nil { - return x.Version - } - return 0 -} - -func (x *State) GetKind() isState_Kind { - if x != nil { - return x.Kind - } - return nil +// Deprecated: Use WatchSentinelsRequest.ProtoReflect.Descriptor instead. +func (*WatchSentinelsRequest) Descriptor() ([]byte, []int) { + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{7} } -func (x *State) GetDeployment() *DeploymentState { +func (x *WatchSentinelsRequest) GetRegion() string { if x != nil { - if x, ok := x.Kind.(*State_Deployment); ok { - return x.Deployment - } + return x.Region } - return nil + return "" } -func (x *State) GetSentinel() *SentinelState { +func (x *WatchSentinelsRequest) GetVersionLastSeen() uint64 { if x != nil { - if x, ok := x.Kind.(*State_Sentinel); ok { - return x.Sentinel - } + return x.VersionLastSeen } - return nil -} - -type isState_Kind interface { - isState_Kind() -} - -type State_Deployment struct { - Deployment *DeploymentState `protobuf:"bytes,2,opt,name=deployment,proto3,oneof"` -} - -type State_Sentinel struct { - Sentinel *SentinelState `protobuf:"bytes,3,opt,name=sentinel,proto3,oneof"` + return 0 } -func (*State_Deployment) isState_Kind() {} - -func (*State_Sentinel) isState_Kind() {} - // SentinelState represents a lifecycle event for an API sentinel configuration. // // Sentinels are frontline points for services, typically handling routing, load balancing, @@ -626,6 +559,11 @@ func (*State_Sentinel) isState_Kind() {} // the cluster state matches the desired configuration. type SentinelState struct { state protoimpl.MessageState `protogen:"open.v1"` + // version is the sentinel-specific resource version for this state update. + // Clients should track the max version seen and use it when reconnecting to + // the WatchSentinels stream to resume from the correct position. + // When returned from GetDesiredSentinelState, this field is not set. + Version uint64 `protobuf:"varint,3,opt,name=version,proto3" json:"version,omitempty"` // state contains the specific sentinel operation to perform. // Only one state type is set per message, determining the action the agent should take. // @@ -640,7 +578,7 @@ type SentinelState struct { func (x *SentinelState) Reset() { *x = SentinelState{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[9] + mi := &file_ctrl_v1_cluster_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -652,7 +590,7 @@ func (x *SentinelState) String() string { func (*SentinelState) ProtoMessage() {} func (x *SentinelState) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[9] + mi := &file_ctrl_v1_cluster_proto_msgTypes[8] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -665,7 +603,14 @@ func (x *SentinelState) ProtoReflect() protoreflect.Message { // Deprecated: Use SentinelState.ProtoReflect.Descriptor instead. func (*SentinelState) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{9} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{8} +} + +func (x *SentinelState) GetVersion() uint64 { + if x != nil { + return x.Version + } + return 0 } func (x *SentinelState) GetState() isSentinelState_State { @@ -721,6 +666,11 @@ func (*SentinelState_Delete) isSentinelState_State() {} // the cluster agent ensures the cluster state matches the desired configuration. type DeploymentState struct { state protoimpl.MessageState `protogen:"open.v1"` + // version is the deployment-specific resource version for this state update. + // Clients should track the max version seen and use it when reconnecting to + // the WatchDeployments stream to resume from the correct position. + // When returned from GetDesiredDeploymentState, this field is not set. + Version uint64 `protobuf:"varint,3,opt,name=version,proto3" json:"version,omitempty"` // state contains the specific deployment operation to perform. // Only one state type is set per message, determining the action the agent should take. // @@ -735,7 +685,7 @@ type DeploymentState struct { func (x *DeploymentState) Reset() { *x = DeploymentState{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[10] + mi := &file_ctrl_v1_cluster_proto_msgTypes[9] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -747,7 +697,7 @@ func (x *DeploymentState) String() string { func (*DeploymentState) ProtoMessage() {} func (x *DeploymentState) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[10] + mi := &file_ctrl_v1_cluster_proto_msgTypes[9] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -760,7 +710,14 @@ func (x *DeploymentState) ProtoReflect() protoreflect.Message { // Deprecated: Use DeploymentState.ProtoReflect.Descriptor instead. func (*DeploymentState) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{10} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{9} +} + +func (x *DeploymentState) GetVersion() uint64 { + if x != nil { + return x.Version + } + return 0 } func (x *DeploymentState) GetState() isDeploymentState_State { @@ -835,7 +792,7 @@ type ApplySentinel struct { func (x *ApplySentinel) Reset() { *x = ApplySentinel{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[11] + mi := &file_ctrl_v1_cluster_proto_msgTypes[10] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -847,7 +804,7 @@ func (x *ApplySentinel) String() string { func (*ApplySentinel) ProtoMessage() {} func (x *ApplySentinel) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[11] + mi := &file_ctrl_v1_cluster_proto_msgTypes[10] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -860,7 +817,7 @@ func (x *ApplySentinel) ProtoReflect() protoreflect.Message { // Deprecated: Use ApplySentinel.ProtoReflect.Descriptor instead. func (*ApplySentinel) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{11} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{10} } func (x *ApplySentinel) GetK8SName() string { @@ -939,7 +896,7 @@ type DeleteSentinel struct { func (x *DeleteSentinel) Reset() { *x = DeleteSentinel{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[12] + mi := &file_ctrl_v1_cluster_proto_msgTypes[11] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -951,7 +908,7 @@ func (x *DeleteSentinel) String() string { func (*DeleteSentinel) ProtoMessage() {} func (x *DeleteSentinel) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[12] + mi := &file_ctrl_v1_cluster_proto_msgTypes[11] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -964,7 +921,7 @@ func (x *DeleteSentinel) ProtoReflect() protoreflect.Message { // Deprecated: Use DeleteSentinel.ProtoReflect.Descriptor instead. func (*DeleteSentinel) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{12} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{11} } func (x *DeleteSentinel) GetK8SName() string { @@ -1021,7 +978,7 @@ type ApplyDeployment struct { func (x *ApplyDeployment) Reset() { *x = ApplyDeployment{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[13] + mi := &file_ctrl_v1_cluster_proto_msgTypes[12] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1033,7 +990,7 @@ func (x *ApplyDeployment) String() string { func (*ApplyDeployment) ProtoMessage() {} func (x *ApplyDeployment) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[13] + mi := &file_ctrl_v1_cluster_proto_msgTypes[12] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1046,7 +1003,7 @@ func (x *ApplyDeployment) ProtoReflect() protoreflect.Message { // Deprecated: Use ApplyDeployment.ProtoReflect.Descriptor instead. func (*ApplyDeployment) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{13} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{12} } func (x *ApplyDeployment) GetK8SNamespace() string { @@ -1155,7 +1112,7 @@ type DeleteDeployment struct { func (x *DeleteDeployment) Reset() { *x = DeleteDeployment{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[14] + mi := &file_ctrl_v1_cluster_proto_msgTypes[13] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -1167,7 +1124,7 @@ func (x *DeleteDeployment) String() string { func (*DeleteDeployment) ProtoMessage() {} func (x *DeleteDeployment) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[14] + mi := &file_ctrl_v1_cluster_proto_msgTypes[13] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1180,7 +1137,7 @@ func (x *DeleteDeployment) ProtoReflect() protoreflect.Message { // Deprecated: Use DeleteDeployment.ProtoReflect.Descriptor instead. func (*DeleteDeployment) Descriptor() ([]byte, []int) { - return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{14} + return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{13} } func (x *DeleteDeployment) GetK8SNamespace() string { @@ -1197,29 +1154,29 @@ func (x *DeleteDeployment) GetK8SName() string { return "" } -type UpdateDeploymentStateRequest_Update struct { - state protoimpl.MessageState `protogen:"open.v1"` - K8SName string `protobuf:"bytes,1,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` - Instances []*UpdateDeploymentStateRequest_Update_Instance `protobuf:"bytes,2,rep,name=instances,proto3" json:"instances,omitempty"` +type ReportDeploymentStatusRequest_Update struct { + state protoimpl.MessageState `protogen:"open.v1"` + K8SName string `protobuf:"bytes,1,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` + Instances []*ReportDeploymentStatusRequest_Update_Instance `protobuf:"bytes,2,rep,name=instances,proto3" json:"instances,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } -func (x *UpdateDeploymentStateRequest_Update) Reset() { - *x = UpdateDeploymentStateRequest_Update{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[15] +func (x *ReportDeploymentStatusRequest_Update) Reset() { + *x = ReportDeploymentStatusRequest_Update{} + mi := &file_ctrl_v1_cluster_proto_msgTypes[14] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *UpdateDeploymentStateRequest_Update) String() string { +func (x *ReportDeploymentStatusRequest_Update) String() string { return protoimpl.X.MessageStringOf(x) } -func (*UpdateDeploymentStateRequest_Update) ProtoMessage() {} +func (*ReportDeploymentStatusRequest_Update) ProtoMessage() {} -func (x *UpdateDeploymentStateRequest_Update) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[15] +func (x *ReportDeploymentStatusRequest_Update) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_cluster_proto_msgTypes[14] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1230,47 +1187,47 @@ func (x *UpdateDeploymentStateRequest_Update) ProtoReflect() protoreflect.Messag return mi.MessageOf(x) } -// Deprecated: Use UpdateDeploymentStateRequest_Update.ProtoReflect.Descriptor instead. -func (*UpdateDeploymentStateRequest_Update) Descriptor() ([]byte, []int) { +// Deprecated: Use ReportDeploymentStatusRequest_Update.ProtoReflect.Descriptor instead. +func (*ReportDeploymentStatusRequest_Update) Descriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{2, 0} } -func (x *UpdateDeploymentStateRequest_Update) GetK8SName() string { +func (x *ReportDeploymentStatusRequest_Update) GetK8SName() string { if x != nil { return x.K8SName } return "" } -func (x *UpdateDeploymentStateRequest_Update) GetInstances() []*UpdateDeploymentStateRequest_Update_Instance { +func (x *ReportDeploymentStatusRequest_Update) GetInstances() []*ReportDeploymentStatusRequest_Update_Instance { if x != nil { return x.Instances } return nil } -type UpdateDeploymentStateRequest_Delete struct { +type ReportDeploymentStatusRequest_Delete struct { state protoimpl.MessageState `protogen:"open.v1"` K8SName string `protobuf:"bytes,1,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } -func (x *UpdateDeploymentStateRequest_Delete) Reset() { - *x = UpdateDeploymentStateRequest_Delete{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[16] +func (x *ReportDeploymentStatusRequest_Delete) Reset() { + *x = ReportDeploymentStatusRequest_Delete{} + mi := &file_ctrl_v1_cluster_proto_msgTypes[15] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *UpdateDeploymentStateRequest_Delete) String() string { +func (x *ReportDeploymentStatusRequest_Delete) String() string { return protoimpl.X.MessageStringOf(x) } -func (*UpdateDeploymentStateRequest_Delete) ProtoMessage() {} +func (*ReportDeploymentStatusRequest_Delete) ProtoMessage() {} -func (x *UpdateDeploymentStateRequest_Delete) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[16] +func (x *ReportDeploymentStatusRequest_Delete) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_cluster_proto_msgTypes[15] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1281,44 +1238,44 @@ func (x *UpdateDeploymentStateRequest_Delete) ProtoReflect() protoreflect.Messag return mi.MessageOf(x) } -// Deprecated: Use UpdateDeploymentStateRequest_Delete.ProtoReflect.Descriptor instead. -func (*UpdateDeploymentStateRequest_Delete) Descriptor() ([]byte, []int) { +// Deprecated: Use ReportDeploymentStatusRequest_Delete.ProtoReflect.Descriptor instead. +func (*ReportDeploymentStatusRequest_Delete) Descriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{2, 1} } -func (x *UpdateDeploymentStateRequest_Delete) GetK8SName() string { +func (x *ReportDeploymentStatusRequest_Delete) GetK8SName() string { if x != nil { return x.K8SName } return "" } -type UpdateDeploymentStateRequest_Update_Instance struct { - state protoimpl.MessageState `protogen:"open.v1"` - K8SName string `protobuf:"bytes,1,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` - Address string `protobuf:"bytes,2,opt,name=address,proto3" json:"address,omitempty"` - CpuMillicores int64 `protobuf:"varint,3,opt,name=cpu_millicores,json=cpuMillicores,proto3" json:"cpu_millicores,omitempty"` - MemoryMib int64 `protobuf:"varint,4,opt,name=memory_mib,json=memoryMib,proto3" json:"memory_mib,omitempty"` - Status UpdateDeploymentStateRequest_Update_Instance_Status `protobuf:"varint,5,opt,name=status,proto3,enum=ctrl.v1.UpdateDeploymentStateRequest_Update_Instance_Status" json:"status,omitempty"` +type ReportDeploymentStatusRequest_Update_Instance struct { + state protoimpl.MessageState `protogen:"open.v1"` + K8SName string `protobuf:"bytes,1,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` + Address string `protobuf:"bytes,2,opt,name=address,proto3" json:"address,omitempty"` + CpuMillicores int64 `protobuf:"varint,3,opt,name=cpu_millicores,json=cpuMillicores,proto3" json:"cpu_millicores,omitempty"` + MemoryMib int64 `protobuf:"varint,4,opt,name=memory_mib,json=memoryMib,proto3" json:"memory_mib,omitempty"` + Status ReportDeploymentStatusRequest_Update_Instance_Status `protobuf:"varint,5,opt,name=status,proto3,enum=ctrl.v1.ReportDeploymentStatusRequest_Update_Instance_Status" json:"status,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } -func (x *UpdateDeploymentStateRequest_Update_Instance) Reset() { - *x = UpdateDeploymentStateRequest_Update_Instance{} - mi := &file_ctrl_v1_cluster_proto_msgTypes[17] +func (x *ReportDeploymentStatusRequest_Update_Instance) Reset() { + *x = ReportDeploymentStatusRequest_Update_Instance{} + mi := &file_ctrl_v1_cluster_proto_msgTypes[16] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } -func (x *UpdateDeploymentStateRequest_Update_Instance) String() string { +func (x *ReportDeploymentStatusRequest_Update_Instance) String() string { return protoimpl.X.MessageStringOf(x) } -func (*UpdateDeploymentStateRequest_Update_Instance) ProtoMessage() {} +func (*ReportDeploymentStatusRequest_Update_Instance) ProtoMessage() {} -func (x *UpdateDeploymentStateRequest_Update_Instance) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_cluster_proto_msgTypes[17] +func (x *ReportDeploymentStatusRequest_Update_Instance) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_cluster_proto_msgTypes[16] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -1329,44 +1286,44 @@ func (x *UpdateDeploymentStateRequest_Update_Instance) ProtoReflect() protorefle return mi.MessageOf(x) } -// Deprecated: Use UpdateDeploymentStateRequest_Update_Instance.ProtoReflect.Descriptor instead. -func (*UpdateDeploymentStateRequest_Update_Instance) Descriptor() ([]byte, []int) { +// Deprecated: Use ReportDeploymentStatusRequest_Update_Instance.ProtoReflect.Descriptor instead. +func (*ReportDeploymentStatusRequest_Update_Instance) Descriptor() ([]byte, []int) { return file_ctrl_v1_cluster_proto_rawDescGZIP(), []int{2, 0, 0} } -func (x *UpdateDeploymentStateRequest_Update_Instance) GetK8SName() string { +func (x *ReportDeploymentStatusRequest_Update_Instance) GetK8SName() string { if x != nil { return x.K8SName } return "" } -func (x *UpdateDeploymentStateRequest_Update_Instance) GetAddress() string { +func (x *ReportDeploymentStatusRequest_Update_Instance) GetAddress() string { if x != nil { return x.Address } return "" } -func (x *UpdateDeploymentStateRequest_Update_Instance) GetCpuMillicores() int64 { +func (x *ReportDeploymentStatusRequest_Update_Instance) GetCpuMillicores() int64 { if x != nil { return x.CpuMillicores } return 0 } -func (x *UpdateDeploymentStateRequest_Update_Instance) GetMemoryMib() int64 { +func (x *ReportDeploymentStatusRequest_Update_Instance) GetMemoryMib() int64 { if x != nil { return x.MemoryMib } return 0 } -func (x *UpdateDeploymentStateRequest_Update_Instance) GetStatus() UpdateDeploymentStateRequest_Update_Instance_Status { +func (x *ReportDeploymentStatusRequest_Update_Instance) GetStatus() ReportDeploymentStatusRequest_Update_Instance_Status { if x != nil { return x.Status } - return UpdateDeploymentStateRequest_Update_Instance_STATUS_UNSPECIFIED + return ReportDeploymentStatusRequest_Update_Instance_STATUS_UNSPECIFIED } var File_ctrl_v1_cluster_proto protoreflect.FileDescriptor @@ -1378,20 +1335,20 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "\vsentinel_id\x18\x01 \x01(\tR\n" + "sentinelId\"G\n" + " GetDesiredDeploymentStateRequest\x12#\n" + - "\rdeployment_id\x18\x01 \x01(\tR\fdeploymentId\"\x93\x05\n" + - "\x1cUpdateDeploymentStateRequest\x12F\n" + - "\x06update\x18\x01 \x01(\v2,.ctrl.v1.UpdateDeploymentStateRequest.UpdateH\x00R\x06update\x12F\n" + - "\x06delete\x18\x02 \x01(\v2,.ctrl.v1.UpdateDeploymentStateRequest.DeleteH\x00R\x06delete\x1a\xb3\x03\n" + + "\rdeployment_id\x18\x01 \x01(\tR\fdeploymentId\"\x98\x05\n" + + "\x1dReportDeploymentStatusRequest\x12G\n" + + "\x06update\x18\x01 \x01(\v2-.ctrl.v1.ReportDeploymentStatusRequest.UpdateH\x00R\x06update\x12G\n" + + "\x06delete\x18\x02 \x01(\v2-.ctrl.v1.ReportDeploymentStatusRequest.DeleteH\x00R\x06delete\x1a\xb5\x03\n" + "\x06Update\x12\x19\n" + - "\bk8s_name\x18\x01 \x01(\tR\ak8sName\x12S\n" + - "\tinstances\x18\x02 \x03(\v25.ctrl.v1.UpdateDeploymentStateRequest.Update.InstanceR\tinstances\x1a\xb8\x02\n" + + "\bk8s_name\x18\x01 \x01(\tR\ak8sName\x12T\n" + + "\tinstances\x18\x02 \x03(\v26.ctrl.v1.ReportDeploymentStatusRequest.Update.InstanceR\tinstances\x1a\xb9\x02\n" + "\bInstance\x12\x19\n" + "\bk8s_name\x18\x01 \x01(\tR\ak8sName\x12\x18\n" + "\aaddress\x18\x02 \x01(\tR\aaddress\x12%\n" + "\x0ecpu_millicores\x18\x03 \x01(\x03R\rcpuMillicores\x12\x1d\n" + "\n" + - "memory_mib\x18\x04 \x01(\x03R\tmemoryMib\x12T\n" + - "\x06status\x18\x05 \x01(\x0e2<.ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.StatusR\x06status\"[\n" + + "memory_mib\x18\x04 \x01(\x03R\tmemoryMib\x12U\n" + + "\x06status\x18\x05 \x01(\x0e2=.ctrl.v1.ReportDeploymentStatusRequest.Update.Instance.StatusR\x06status\"[\n" + "\x06Status\x12\x16\n" + "\x12STATUS_UNSPECIFIED\x10\x00\x12\x12\n" + "\x0eSTATUS_PENDING\x10\x01\x12\x12\n" + @@ -1399,29 +1356,26 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "\rSTATUS_FAILED\x10\x03\x1a#\n" + "\x06Delete\x12\x19\n" + "\bk8s_name\x18\x01 \x01(\tR\ak8sNameB\b\n" + - "\x06change\"\x1f\n" + - "\x1dUpdateDeploymentStateResponse\"\x1d\n" + - "\x1bUpdateInstanceStateResponse\"\x8f\x01\n" + - "\x1aUpdateSentinelStateRequest\x12\x19\n" + + "\x06change\" \n" + + "\x1eReportDeploymentStatusResponse\"\x90\x01\n" + + "\x1bReportSentinelStatusRequest\x12\x19\n" + "\bk8s_name\x18\x01 \x01(\tR\ak8sName\x12-\n" + "\x12available_replicas\x18\x02 \x01(\x05R\x11availableReplicas\x12'\n" + - "\x06health\x18\x03 \x01(\x0e2\x0f.ctrl.v1.HealthR\x06health\"\x1d\n" + - "\x1bUpdateSentinelStateResponse\"Q\n" + - "\vSyncRequest\x12\x16\n" + + "\x06health\x18\x03 \x01(\x0e2\x0f.ctrl.v1.HealthR\x06health\"\x1e\n" + + "\x1cReportSentinelStatusResponse\"]\n" + + "\x17WatchDeploymentsRequest\x12\x16\n" + "\x06region\x18\x01 \x01(\tR\x06region\x12*\n" + - "\x11version_last_seen\x18\x02 \x01(\x04R\x0fversionLastSeen\"\x9b\x01\n" + - "\x05State\x12\x18\n" + - "\aversion\x18\x01 \x01(\x04R\aversion\x12:\n" + - "\n" + - "deployment\x18\x02 \x01(\v2\x18.ctrl.v1.DeploymentStateH\x00R\n" + - "deployment\x124\n" + - "\bsentinel\x18\x03 \x01(\v2\x16.ctrl.v1.SentinelStateH\x00R\bsentinelB\x06\n" + - "\x04kind\"{\n" + - "\rSentinelState\x12.\n" + + "\x11version_last_seen\x18\x02 \x01(\x04R\x0fversionLastSeen\"[\n" + + "\x15WatchSentinelsRequest\x12\x16\n" + + "\x06region\x18\x01 \x01(\tR\x06region\x12*\n" + + "\x11version_last_seen\x18\x02 \x01(\x04R\x0fversionLastSeen\"\x95\x01\n" + + "\rSentinelState\x12\x18\n" + + "\aversion\x18\x03 \x01(\x04R\aversion\x12.\n" + "\x05apply\x18\x01 \x01(\v2\x16.ctrl.v1.ApplySentinelH\x00R\x05apply\x121\n" + "\x06delete\x18\x02 \x01(\v2\x17.ctrl.v1.DeleteSentinelH\x00R\x06deleteB\a\n" + - "\x05state\"\x81\x01\n" + - "\x0fDeploymentState\x120\n" + + "\x05state\"\x9b\x01\n" + + "\x0fDeploymentState\x12\x18\n" + + "\aversion\x18\x03 \x01(\x04R\aversion\x120\n" + "\x05apply\x18\x01 \x01(\v2\x18.ctrl.v1.ApplyDeploymentH\x00R\x05apply\x123\n" + "\x06delete\x18\x02 \x01(\v2\x19.ctrl.v1.DeleteDeploymentH\x00R\x06deleteB\a\n" + "\x05state\"\xac\x02\n" + @@ -1465,13 +1419,14 @@ const file_ctrl_v1_cluster_proto_rawDesc = "" + "\x12HEALTH_UNSPECIFIED\x10\x00\x12\x12\n" + "\x0eHEALTH_HEALTHY\x10\x01\x12\x14\n" + "\x10HEALTH_UNHEALTHY\x10\x02\x12\x11\n" + - "\rHEALTH_PAUSED\x10\x032\xc8\x03\n" + - "\x0eClusterService\x12.\n" + - "\x04Sync\x12\x14.ctrl.v1.SyncRequest\x1a\x0e.ctrl.v1.State0\x01\x12Z\n" + - "\x17GetDesiredSentinelState\x12'.ctrl.v1.GetDesiredSentinelStateRequest\x1a\x16.ctrl.v1.SentinelState\x12`\n" + - "\x13UpdateSentinelState\x12#.ctrl.v1.UpdateSentinelStateRequest\x1a$.ctrl.v1.UpdateSentinelStateResponse\x12`\n" + - "\x19GetDesiredDeploymentState\x12).ctrl.v1.GetDesiredDeploymentStateRequest\x1a\x18.ctrl.v1.DeploymentState\x12f\n" + - "\x15UpdateDeploymentState\x12%.ctrl.v1.UpdateDeploymentStateRequest\x1a&.ctrl.v1.UpdateDeploymentStateResponseB\x8b\x01\n" + + "\rHEALTH_PAUSED\x10\x032\xbc\x04\n" + + "\x0eClusterService\x12P\n" + + "\x10WatchDeployments\x12 .ctrl.v1.WatchDeploymentsRequest\x1a\x18.ctrl.v1.DeploymentState0\x01\x12J\n" + + "\x0eWatchSentinels\x12\x1e.ctrl.v1.WatchSentinelsRequest\x1a\x16.ctrl.v1.SentinelState0\x01\x12Z\n" + + "\x17GetDesiredSentinelState\x12'.ctrl.v1.GetDesiredSentinelStateRequest\x1a\x16.ctrl.v1.SentinelState\x12c\n" + + "\x14ReportSentinelStatus\x12$.ctrl.v1.ReportSentinelStatusRequest\x1a%.ctrl.v1.ReportSentinelStatusResponse\x12`\n" + + "\x19GetDesiredDeploymentState\x12).ctrl.v1.GetDesiredDeploymentStateRequest\x1a\x18.ctrl.v1.DeploymentState\x12i\n" + + "\x16ReportDeploymentStatus\x12&.ctrl.v1.ReportDeploymentStatusRequest\x1a'.ctrl.v1.ReportDeploymentStatusResponseB\x8b\x01\n" + "\vcom.ctrl.v1B\fClusterProtoP\x01Z1github.com/unkeyed/unkey/gen/proto/ctrl/v1;ctrlv1\xa2\x02\x03CXX\xaa\x02\aCtrl.V1\xca\x02\aCtrl\\V1\xe2\x02\x13Ctrl\\V1\\GPBMetadata\xea\x02\bCtrl::V1b\x06proto3" var ( @@ -1487,56 +1442,55 @@ func file_ctrl_v1_cluster_proto_rawDescGZIP() []byte { } var file_ctrl_v1_cluster_proto_enumTypes = make([]protoimpl.EnumInfo, 2) -var file_ctrl_v1_cluster_proto_msgTypes = make([]protoimpl.MessageInfo, 18) +var file_ctrl_v1_cluster_proto_msgTypes = make([]protoimpl.MessageInfo, 17) var file_ctrl_v1_cluster_proto_goTypes = []any{ (Health)(0), // 0: ctrl.v1.Health - (UpdateDeploymentStateRequest_Update_Instance_Status)(0), // 1: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.Status - (*GetDesiredSentinelStateRequest)(nil), // 2: ctrl.v1.GetDesiredSentinelStateRequest - (*GetDesiredDeploymentStateRequest)(nil), // 3: ctrl.v1.GetDesiredDeploymentStateRequest - (*UpdateDeploymentStateRequest)(nil), // 4: ctrl.v1.UpdateDeploymentStateRequest - (*UpdateDeploymentStateResponse)(nil), // 5: ctrl.v1.UpdateDeploymentStateResponse - (*UpdateInstanceStateResponse)(nil), // 6: ctrl.v1.UpdateInstanceStateResponse - (*UpdateSentinelStateRequest)(nil), // 7: ctrl.v1.UpdateSentinelStateRequest - (*UpdateSentinelStateResponse)(nil), // 8: ctrl.v1.UpdateSentinelStateResponse - (*SyncRequest)(nil), // 9: ctrl.v1.SyncRequest - (*State)(nil), // 10: ctrl.v1.State - (*SentinelState)(nil), // 11: ctrl.v1.SentinelState - (*DeploymentState)(nil), // 12: ctrl.v1.DeploymentState - (*ApplySentinel)(nil), // 13: ctrl.v1.ApplySentinel - (*DeleteSentinel)(nil), // 14: ctrl.v1.DeleteSentinel - (*ApplyDeployment)(nil), // 15: ctrl.v1.ApplyDeployment - (*DeleteDeployment)(nil), // 16: ctrl.v1.DeleteDeployment - (*UpdateDeploymentStateRequest_Update)(nil), // 17: ctrl.v1.UpdateDeploymentStateRequest.Update - (*UpdateDeploymentStateRequest_Delete)(nil), // 18: ctrl.v1.UpdateDeploymentStateRequest.Delete - (*UpdateDeploymentStateRequest_Update_Instance)(nil), // 19: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance + (ReportDeploymentStatusRequest_Update_Instance_Status)(0), // 1: ctrl.v1.ReportDeploymentStatusRequest.Update.Instance.Status + (*GetDesiredSentinelStateRequest)(nil), // 2: ctrl.v1.GetDesiredSentinelStateRequest + (*GetDesiredDeploymentStateRequest)(nil), // 3: ctrl.v1.GetDesiredDeploymentStateRequest + (*ReportDeploymentStatusRequest)(nil), // 4: ctrl.v1.ReportDeploymentStatusRequest + (*ReportDeploymentStatusResponse)(nil), // 5: ctrl.v1.ReportDeploymentStatusResponse + (*ReportSentinelStatusRequest)(nil), // 6: ctrl.v1.ReportSentinelStatusRequest + (*ReportSentinelStatusResponse)(nil), // 7: ctrl.v1.ReportSentinelStatusResponse + (*WatchDeploymentsRequest)(nil), // 8: ctrl.v1.WatchDeploymentsRequest + (*WatchSentinelsRequest)(nil), // 9: ctrl.v1.WatchSentinelsRequest + (*SentinelState)(nil), // 10: ctrl.v1.SentinelState + (*DeploymentState)(nil), // 11: ctrl.v1.DeploymentState + (*ApplySentinel)(nil), // 12: ctrl.v1.ApplySentinel + (*DeleteSentinel)(nil), // 13: ctrl.v1.DeleteSentinel + (*ApplyDeployment)(nil), // 14: ctrl.v1.ApplyDeployment + (*DeleteDeployment)(nil), // 15: ctrl.v1.DeleteDeployment + (*ReportDeploymentStatusRequest_Update)(nil), // 16: ctrl.v1.ReportDeploymentStatusRequest.Update + (*ReportDeploymentStatusRequest_Delete)(nil), // 17: ctrl.v1.ReportDeploymentStatusRequest.Delete + (*ReportDeploymentStatusRequest_Update_Instance)(nil), // 18: ctrl.v1.ReportDeploymentStatusRequest.Update.Instance } var file_ctrl_v1_cluster_proto_depIdxs = []int32{ - 17, // 0: ctrl.v1.UpdateDeploymentStateRequest.update:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update - 18, // 1: ctrl.v1.UpdateDeploymentStateRequest.delete:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Delete - 0, // 2: ctrl.v1.UpdateSentinelStateRequest.health:type_name -> ctrl.v1.Health - 12, // 3: ctrl.v1.State.deployment:type_name -> ctrl.v1.DeploymentState - 11, // 4: ctrl.v1.State.sentinel:type_name -> ctrl.v1.SentinelState - 13, // 5: ctrl.v1.SentinelState.apply:type_name -> ctrl.v1.ApplySentinel - 14, // 6: ctrl.v1.SentinelState.delete:type_name -> ctrl.v1.DeleteSentinel - 15, // 7: ctrl.v1.DeploymentState.apply:type_name -> ctrl.v1.ApplyDeployment - 16, // 8: ctrl.v1.DeploymentState.delete:type_name -> ctrl.v1.DeleteDeployment - 19, // 9: ctrl.v1.UpdateDeploymentStateRequest.Update.instances:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance - 1, // 10: ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.status:type_name -> ctrl.v1.UpdateDeploymentStateRequest.Update.Instance.Status - 9, // 11: ctrl.v1.ClusterService.Sync:input_type -> ctrl.v1.SyncRequest - 2, // 12: ctrl.v1.ClusterService.GetDesiredSentinelState:input_type -> ctrl.v1.GetDesiredSentinelStateRequest - 7, // 13: ctrl.v1.ClusterService.UpdateSentinelState:input_type -> ctrl.v1.UpdateSentinelStateRequest - 3, // 14: ctrl.v1.ClusterService.GetDesiredDeploymentState:input_type -> ctrl.v1.GetDesiredDeploymentStateRequest - 4, // 15: ctrl.v1.ClusterService.UpdateDeploymentState:input_type -> ctrl.v1.UpdateDeploymentStateRequest - 10, // 16: ctrl.v1.ClusterService.Sync:output_type -> ctrl.v1.State - 11, // 17: ctrl.v1.ClusterService.GetDesiredSentinelState:output_type -> ctrl.v1.SentinelState - 8, // 18: ctrl.v1.ClusterService.UpdateSentinelState:output_type -> ctrl.v1.UpdateSentinelStateResponse - 12, // 19: ctrl.v1.ClusterService.GetDesiredDeploymentState:output_type -> ctrl.v1.DeploymentState - 5, // 20: ctrl.v1.ClusterService.UpdateDeploymentState:output_type -> ctrl.v1.UpdateDeploymentStateResponse - 16, // [16:21] is the sub-list for method output_type - 11, // [11:16] is the sub-list for method input_type - 11, // [11:11] is the sub-list for extension type_name - 11, // [11:11] is the sub-list for extension extendee - 0, // [0:11] is the sub-list for field type_name + 16, // 0: ctrl.v1.ReportDeploymentStatusRequest.update:type_name -> ctrl.v1.ReportDeploymentStatusRequest.Update + 17, // 1: ctrl.v1.ReportDeploymentStatusRequest.delete:type_name -> ctrl.v1.ReportDeploymentStatusRequest.Delete + 0, // 2: ctrl.v1.ReportSentinelStatusRequest.health:type_name -> ctrl.v1.Health + 12, // 3: ctrl.v1.SentinelState.apply:type_name -> ctrl.v1.ApplySentinel + 13, // 4: ctrl.v1.SentinelState.delete:type_name -> ctrl.v1.DeleteSentinel + 14, // 5: ctrl.v1.DeploymentState.apply:type_name -> ctrl.v1.ApplyDeployment + 15, // 6: ctrl.v1.DeploymentState.delete:type_name -> ctrl.v1.DeleteDeployment + 18, // 7: ctrl.v1.ReportDeploymentStatusRequest.Update.instances:type_name -> ctrl.v1.ReportDeploymentStatusRequest.Update.Instance + 1, // 8: ctrl.v1.ReportDeploymentStatusRequest.Update.Instance.status:type_name -> ctrl.v1.ReportDeploymentStatusRequest.Update.Instance.Status + 8, // 9: ctrl.v1.ClusterService.WatchDeployments:input_type -> ctrl.v1.WatchDeploymentsRequest + 9, // 10: ctrl.v1.ClusterService.WatchSentinels:input_type -> ctrl.v1.WatchSentinelsRequest + 2, // 11: ctrl.v1.ClusterService.GetDesiredSentinelState:input_type -> ctrl.v1.GetDesiredSentinelStateRequest + 6, // 12: ctrl.v1.ClusterService.ReportSentinelStatus:input_type -> ctrl.v1.ReportSentinelStatusRequest + 3, // 13: ctrl.v1.ClusterService.GetDesiredDeploymentState:input_type -> ctrl.v1.GetDesiredDeploymentStateRequest + 4, // 14: ctrl.v1.ClusterService.ReportDeploymentStatus:input_type -> ctrl.v1.ReportDeploymentStatusRequest + 11, // 15: ctrl.v1.ClusterService.WatchDeployments:output_type -> ctrl.v1.DeploymentState + 10, // 16: ctrl.v1.ClusterService.WatchSentinels:output_type -> ctrl.v1.SentinelState + 10, // 17: ctrl.v1.ClusterService.GetDesiredSentinelState:output_type -> ctrl.v1.SentinelState + 7, // 18: ctrl.v1.ClusterService.ReportSentinelStatus:output_type -> ctrl.v1.ReportSentinelStatusResponse + 11, // 19: ctrl.v1.ClusterService.GetDesiredDeploymentState:output_type -> ctrl.v1.DeploymentState + 5, // 20: ctrl.v1.ClusterService.ReportDeploymentStatus:output_type -> ctrl.v1.ReportDeploymentStatusResponse + 15, // [15:21] is the sub-list for method output_type + 9, // [9:15] is the sub-list for method input_type + 9, // [9:9] is the sub-list for extension type_name + 9, // [9:9] is the sub-list for extension extendee + 0, // [0:9] is the sub-list for field type_name } func init() { file_ctrl_v1_cluster_proto_init() } @@ -1545,29 +1499,25 @@ func file_ctrl_v1_cluster_proto_init() { return } file_ctrl_v1_cluster_proto_msgTypes[2].OneofWrappers = []any{ - (*UpdateDeploymentStateRequest_Update_)(nil), - (*UpdateDeploymentStateRequest_Delete_)(nil), + (*ReportDeploymentStatusRequest_Update_)(nil), + (*ReportDeploymentStatusRequest_Delete_)(nil), } file_ctrl_v1_cluster_proto_msgTypes[8].OneofWrappers = []any{ - (*State_Deployment)(nil), - (*State_Sentinel)(nil), - } - file_ctrl_v1_cluster_proto_msgTypes[9].OneofWrappers = []any{ (*SentinelState_Apply)(nil), (*SentinelState_Delete)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[10].OneofWrappers = []any{ + file_ctrl_v1_cluster_proto_msgTypes[9].OneofWrappers = []any{ (*DeploymentState_Apply)(nil), (*DeploymentState_Delete)(nil), } - file_ctrl_v1_cluster_proto_msgTypes[13].OneofWrappers = []any{} + file_ctrl_v1_cluster_proto_msgTypes[12].OneofWrappers = []any{} type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_ctrl_v1_cluster_proto_rawDesc), len(file_ctrl_v1_cluster_proto_rawDesc)), NumEnums: 2, - NumMessages: 18, + NumMessages: 17, NumExtensions: 0, NumServices: 1, }, diff --git a/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go b/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go index 86169fff7e..c653181dd7 100644 --- a/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go +++ b/gen/proto/ctrl/v1/ctrlv1connect/cluster.connect.go @@ -42,29 +42,48 @@ const ( // reflection-formatted method names, remove the leading slash and convert the remaining slash to a // period. const ( - // ClusterServiceSyncProcedure is the fully-qualified name of the ClusterService's Sync RPC. - ClusterServiceSyncProcedure = "/ctrl.v1.ClusterService/Sync" + // ClusterServiceWatchDeploymentsProcedure is the fully-qualified name of the ClusterService's + // WatchDeployments RPC. + ClusterServiceWatchDeploymentsProcedure = "/ctrl.v1.ClusterService/WatchDeployments" + // ClusterServiceWatchSentinelsProcedure is the fully-qualified name of the ClusterService's + // WatchSentinels RPC. + ClusterServiceWatchSentinelsProcedure = "/ctrl.v1.ClusterService/WatchSentinels" // ClusterServiceGetDesiredSentinelStateProcedure is the fully-qualified name of the // ClusterService's GetDesiredSentinelState RPC. ClusterServiceGetDesiredSentinelStateProcedure = "/ctrl.v1.ClusterService/GetDesiredSentinelState" - // ClusterServiceUpdateSentinelStateProcedure is the fully-qualified name of the ClusterService's - // UpdateSentinelState RPC. - ClusterServiceUpdateSentinelStateProcedure = "/ctrl.v1.ClusterService/UpdateSentinelState" + // ClusterServiceReportSentinelStatusProcedure is the fully-qualified name of the ClusterService's + // ReportSentinelStatus RPC. + ClusterServiceReportSentinelStatusProcedure = "/ctrl.v1.ClusterService/ReportSentinelStatus" // ClusterServiceGetDesiredDeploymentStateProcedure is the fully-qualified name of the // ClusterService's GetDesiredDeploymentState RPC. ClusterServiceGetDesiredDeploymentStateProcedure = "/ctrl.v1.ClusterService/GetDesiredDeploymentState" - // ClusterServiceUpdateDeploymentStateProcedure is the fully-qualified name of the ClusterService's - // UpdateDeploymentState RPC. - ClusterServiceUpdateDeploymentStateProcedure = "/ctrl.v1.ClusterService/UpdateDeploymentState" + // ClusterServiceReportDeploymentStatusProcedure is the fully-qualified name of the ClusterService's + // ReportDeploymentStatus RPC. + ClusterServiceReportDeploymentStatusProcedure = "/ctrl.v1.ClusterService/ReportDeploymentStatus" ) // ClusterServiceClient is a client for the ctrl.v1.ClusterService service. type ClusterServiceClient interface { - Sync(context.Context, *connect.Request[v1.SyncRequest]) (*connect.ServerStreamForClient[v1.State], error) + // WatchDeployments streams deployment state changes from the control plane to agents. + // Each deployment controller maintains its own version cursor for resumable streaming. + // The agent applies received state to Kubernetes to converge actual state toward desired state. + WatchDeployments(context.Context, *connect.Request[v1.WatchDeploymentsRequest]) (*connect.ServerStreamForClient[v1.DeploymentState], error) + // WatchSentinels streams sentinel state changes from the control plane to agents. + // Each sentinel controller maintains its own version cursor for resumable streaming. + // The agent applies received state to Kubernetes to converge actual state toward desired state. + WatchSentinels(context.Context, *connect.Request[v1.WatchSentinelsRequest]) (*connect.ServerStreamForClient[v1.SentinelState], error) + // GetDesiredSentinelState returns the current desired state for a single sentinel. + // Used by the resync loop to verify consistency for existing resources. GetDesiredSentinelState(context.Context, *connect.Request[v1.GetDesiredSentinelStateRequest]) (*connect.Response[v1.SentinelState], error) - UpdateSentinelState(context.Context, *connect.Request[v1.UpdateSentinelStateRequest]) (*connect.Response[v1.UpdateSentinelStateResponse], error) + // ReportSentinelStatus reports actual sentinel state from the agent to the control plane. + // Called when K8s watch events indicate sentinel Deployment changes. + ReportSentinelStatus(context.Context, *connect.Request[v1.ReportSentinelStatusRequest]) (*connect.Response[v1.ReportSentinelStatusResponse], error) + // GetDesiredDeploymentState returns the current desired state for a single deployment. + // Used by the resync loop to verify consistency for existing resources. GetDesiredDeploymentState(context.Context, *connect.Request[v1.GetDesiredDeploymentStateRequest]) (*connect.Response[v1.DeploymentState], error) - UpdateDeploymentState(context.Context, *connect.Request[v1.UpdateDeploymentStateRequest]) (*connect.Response[v1.UpdateDeploymentStateResponse], error) + // ReportDeploymentStatus reports actual deployment state from the agent to the control plane. + // Called when K8s watch events indicate ReplicaSet changes. + ReportDeploymentStatus(context.Context, *connect.Request[v1.ReportDeploymentStatusRequest]) (*connect.Response[v1.ReportDeploymentStatusResponse], error) } // NewClusterServiceClient constructs a client for the ctrl.v1.ClusterService service. By default, @@ -78,10 +97,16 @@ func NewClusterServiceClient(httpClient connect.HTTPClient, baseURL string, opts baseURL = strings.TrimRight(baseURL, "/") clusterServiceMethods := v1.File_ctrl_v1_cluster_proto.Services().ByName("ClusterService").Methods() return &clusterServiceClient{ - sync: connect.NewClient[v1.SyncRequest, v1.State]( + watchDeployments: connect.NewClient[v1.WatchDeploymentsRequest, v1.DeploymentState]( httpClient, - baseURL+ClusterServiceSyncProcedure, - connect.WithSchema(clusterServiceMethods.ByName("Sync")), + baseURL+ClusterServiceWatchDeploymentsProcedure, + connect.WithSchema(clusterServiceMethods.ByName("WatchDeployments")), + connect.WithClientOptions(opts...), + ), + watchSentinels: connect.NewClient[v1.WatchSentinelsRequest, v1.SentinelState]( + httpClient, + baseURL+ClusterServiceWatchSentinelsProcedure, + connect.WithSchema(clusterServiceMethods.ByName("WatchSentinels")), connect.WithClientOptions(opts...), ), getDesiredSentinelState: connect.NewClient[v1.GetDesiredSentinelStateRequest, v1.SentinelState]( @@ -90,10 +115,10 @@ func NewClusterServiceClient(httpClient connect.HTTPClient, baseURL string, opts connect.WithSchema(clusterServiceMethods.ByName("GetDesiredSentinelState")), connect.WithClientOptions(opts...), ), - updateSentinelState: connect.NewClient[v1.UpdateSentinelStateRequest, v1.UpdateSentinelStateResponse]( + reportSentinelStatus: connect.NewClient[v1.ReportSentinelStatusRequest, v1.ReportSentinelStatusResponse]( httpClient, - baseURL+ClusterServiceUpdateSentinelStateProcedure, - connect.WithSchema(clusterServiceMethods.ByName("UpdateSentinelState")), + baseURL+ClusterServiceReportSentinelStatusProcedure, + connect.WithSchema(clusterServiceMethods.ByName("ReportSentinelStatus")), connect.WithClientOptions(opts...), ), getDesiredDeploymentState: connect.NewClient[v1.GetDesiredDeploymentStateRequest, v1.DeploymentState]( @@ -102,10 +127,10 @@ func NewClusterServiceClient(httpClient connect.HTTPClient, baseURL string, opts connect.WithSchema(clusterServiceMethods.ByName("GetDesiredDeploymentState")), connect.WithClientOptions(opts...), ), - updateDeploymentState: connect.NewClient[v1.UpdateDeploymentStateRequest, v1.UpdateDeploymentStateResponse]( + reportDeploymentStatus: connect.NewClient[v1.ReportDeploymentStatusRequest, v1.ReportDeploymentStatusResponse]( httpClient, - baseURL+ClusterServiceUpdateDeploymentStateProcedure, - connect.WithSchema(clusterServiceMethods.ByName("UpdateDeploymentState")), + baseURL+ClusterServiceReportDeploymentStatusProcedure, + connect.WithSchema(clusterServiceMethods.ByName("ReportDeploymentStatus")), connect.WithClientOptions(opts...), ), } @@ -113,16 +138,22 @@ func NewClusterServiceClient(httpClient connect.HTTPClient, baseURL string, opts // clusterServiceClient implements ClusterServiceClient. type clusterServiceClient struct { - sync *connect.Client[v1.SyncRequest, v1.State] + watchDeployments *connect.Client[v1.WatchDeploymentsRequest, v1.DeploymentState] + watchSentinels *connect.Client[v1.WatchSentinelsRequest, v1.SentinelState] getDesiredSentinelState *connect.Client[v1.GetDesiredSentinelStateRequest, v1.SentinelState] - updateSentinelState *connect.Client[v1.UpdateSentinelStateRequest, v1.UpdateSentinelStateResponse] + reportSentinelStatus *connect.Client[v1.ReportSentinelStatusRequest, v1.ReportSentinelStatusResponse] getDesiredDeploymentState *connect.Client[v1.GetDesiredDeploymentStateRequest, v1.DeploymentState] - updateDeploymentState *connect.Client[v1.UpdateDeploymentStateRequest, v1.UpdateDeploymentStateResponse] + reportDeploymentStatus *connect.Client[v1.ReportDeploymentStatusRequest, v1.ReportDeploymentStatusResponse] +} + +// WatchDeployments calls ctrl.v1.ClusterService.WatchDeployments. +func (c *clusterServiceClient) WatchDeployments(ctx context.Context, req *connect.Request[v1.WatchDeploymentsRequest]) (*connect.ServerStreamForClient[v1.DeploymentState], error) { + return c.watchDeployments.CallServerStream(ctx, req) } -// Sync calls ctrl.v1.ClusterService.Sync. -func (c *clusterServiceClient) Sync(ctx context.Context, req *connect.Request[v1.SyncRequest]) (*connect.ServerStreamForClient[v1.State], error) { - return c.sync.CallServerStream(ctx, req) +// WatchSentinels calls ctrl.v1.ClusterService.WatchSentinels. +func (c *clusterServiceClient) WatchSentinels(ctx context.Context, req *connect.Request[v1.WatchSentinelsRequest]) (*connect.ServerStreamForClient[v1.SentinelState], error) { + return c.watchSentinels.CallServerStream(ctx, req) } // GetDesiredSentinelState calls ctrl.v1.ClusterService.GetDesiredSentinelState. @@ -130,9 +161,9 @@ func (c *clusterServiceClient) GetDesiredSentinelState(ctx context.Context, req return c.getDesiredSentinelState.CallUnary(ctx, req) } -// UpdateSentinelState calls ctrl.v1.ClusterService.UpdateSentinelState. -func (c *clusterServiceClient) UpdateSentinelState(ctx context.Context, req *connect.Request[v1.UpdateSentinelStateRequest]) (*connect.Response[v1.UpdateSentinelStateResponse], error) { - return c.updateSentinelState.CallUnary(ctx, req) +// ReportSentinelStatus calls ctrl.v1.ClusterService.ReportSentinelStatus. +func (c *clusterServiceClient) ReportSentinelStatus(ctx context.Context, req *connect.Request[v1.ReportSentinelStatusRequest]) (*connect.Response[v1.ReportSentinelStatusResponse], error) { + return c.reportSentinelStatus.CallUnary(ctx, req) } // GetDesiredDeploymentState calls ctrl.v1.ClusterService.GetDesiredDeploymentState. @@ -140,18 +171,33 @@ func (c *clusterServiceClient) GetDesiredDeploymentState(ctx context.Context, re return c.getDesiredDeploymentState.CallUnary(ctx, req) } -// UpdateDeploymentState calls ctrl.v1.ClusterService.UpdateDeploymentState. -func (c *clusterServiceClient) UpdateDeploymentState(ctx context.Context, req *connect.Request[v1.UpdateDeploymentStateRequest]) (*connect.Response[v1.UpdateDeploymentStateResponse], error) { - return c.updateDeploymentState.CallUnary(ctx, req) +// ReportDeploymentStatus calls ctrl.v1.ClusterService.ReportDeploymentStatus. +func (c *clusterServiceClient) ReportDeploymentStatus(ctx context.Context, req *connect.Request[v1.ReportDeploymentStatusRequest]) (*connect.Response[v1.ReportDeploymentStatusResponse], error) { + return c.reportDeploymentStatus.CallUnary(ctx, req) } // ClusterServiceHandler is an implementation of the ctrl.v1.ClusterService service. type ClusterServiceHandler interface { - Sync(context.Context, *connect.Request[v1.SyncRequest], *connect.ServerStream[v1.State]) error + // WatchDeployments streams deployment state changes from the control plane to agents. + // Each deployment controller maintains its own version cursor for resumable streaming. + // The agent applies received state to Kubernetes to converge actual state toward desired state. + WatchDeployments(context.Context, *connect.Request[v1.WatchDeploymentsRequest], *connect.ServerStream[v1.DeploymentState]) error + // WatchSentinels streams sentinel state changes from the control plane to agents. + // Each sentinel controller maintains its own version cursor for resumable streaming. + // The agent applies received state to Kubernetes to converge actual state toward desired state. + WatchSentinels(context.Context, *connect.Request[v1.WatchSentinelsRequest], *connect.ServerStream[v1.SentinelState]) error + // GetDesiredSentinelState returns the current desired state for a single sentinel. + // Used by the resync loop to verify consistency for existing resources. GetDesiredSentinelState(context.Context, *connect.Request[v1.GetDesiredSentinelStateRequest]) (*connect.Response[v1.SentinelState], error) - UpdateSentinelState(context.Context, *connect.Request[v1.UpdateSentinelStateRequest]) (*connect.Response[v1.UpdateSentinelStateResponse], error) + // ReportSentinelStatus reports actual sentinel state from the agent to the control plane. + // Called when K8s watch events indicate sentinel Deployment changes. + ReportSentinelStatus(context.Context, *connect.Request[v1.ReportSentinelStatusRequest]) (*connect.Response[v1.ReportSentinelStatusResponse], error) + // GetDesiredDeploymentState returns the current desired state for a single deployment. + // Used by the resync loop to verify consistency for existing resources. GetDesiredDeploymentState(context.Context, *connect.Request[v1.GetDesiredDeploymentStateRequest]) (*connect.Response[v1.DeploymentState], error) - UpdateDeploymentState(context.Context, *connect.Request[v1.UpdateDeploymentStateRequest]) (*connect.Response[v1.UpdateDeploymentStateResponse], error) + // ReportDeploymentStatus reports actual deployment state from the agent to the control plane. + // Called when K8s watch events indicate ReplicaSet changes. + ReportDeploymentStatus(context.Context, *connect.Request[v1.ReportDeploymentStatusRequest]) (*connect.Response[v1.ReportDeploymentStatusResponse], error) } // NewClusterServiceHandler builds an HTTP handler from the service implementation. It returns the @@ -161,10 +207,16 @@ type ClusterServiceHandler interface { // and JSON codecs. They also support gzip compression. func NewClusterServiceHandler(svc ClusterServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { clusterServiceMethods := v1.File_ctrl_v1_cluster_proto.Services().ByName("ClusterService").Methods() - clusterServiceSyncHandler := connect.NewServerStreamHandler( - ClusterServiceSyncProcedure, - svc.Sync, - connect.WithSchema(clusterServiceMethods.ByName("Sync")), + clusterServiceWatchDeploymentsHandler := connect.NewServerStreamHandler( + ClusterServiceWatchDeploymentsProcedure, + svc.WatchDeployments, + connect.WithSchema(clusterServiceMethods.ByName("WatchDeployments")), + connect.WithHandlerOptions(opts...), + ) + clusterServiceWatchSentinelsHandler := connect.NewServerStreamHandler( + ClusterServiceWatchSentinelsProcedure, + svc.WatchSentinels, + connect.WithSchema(clusterServiceMethods.ByName("WatchSentinels")), connect.WithHandlerOptions(opts...), ) clusterServiceGetDesiredSentinelStateHandler := connect.NewUnaryHandler( @@ -173,10 +225,10 @@ func NewClusterServiceHandler(svc ClusterServiceHandler, opts ...connect.Handler connect.WithSchema(clusterServiceMethods.ByName("GetDesiredSentinelState")), connect.WithHandlerOptions(opts...), ) - clusterServiceUpdateSentinelStateHandler := connect.NewUnaryHandler( - ClusterServiceUpdateSentinelStateProcedure, - svc.UpdateSentinelState, - connect.WithSchema(clusterServiceMethods.ByName("UpdateSentinelState")), + clusterServiceReportSentinelStatusHandler := connect.NewUnaryHandler( + ClusterServiceReportSentinelStatusProcedure, + svc.ReportSentinelStatus, + connect.WithSchema(clusterServiceMethods.ByName("ReportSentinelStatus")), connect.WithHandlerOptions(opts...), ) clusterServiceGetDesiredDeploymentStateHandler := connect.NewUnaryHandler( @@ -185,24 +237,26 @@ func NewClusterServiceHandler(svc ClusterServiceHandler, opts ...connect.Handler connect.WithSchema(clusterServiceMethods.ByName("GetDesiredDeploymentState")), connect.WithHandlerOptions(opts...), ) - clusterServiceUpdateDeploymentStateHandler := connect.NewUnaryHandler( - ClusterServiceUpdateDeploymentStateProcedure, - svc.UpdateDeploymentState, - connect.WithSchema(clusterServiceMethods.ByName("UpdateDeploymentState")), + clusterServiceReportDeploymentStatusHandler := connect.NewUnaryHandler( + ClusterServiceReportDeploymentStatusProcedure, + svc.ReportDeploymentStatus, + connect.WithSchema(clusterServiceMethods.ByName("ReportDeploymentStatus")), connect.WithHandlerOptions(opts...), ) return "/ctrl.v1.ClusterService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { - case ClusterServiceSyncProcedure: - clusterServiceSyncHandler.ServeHTTP(w, r) + case ClusterServiceWatchDeploymentsProcedure: + clusterServiceWatchDeploymentsHandler.ServeHTTP(w, r) + case ClusterServiceWatchSentinelsProcedure: + clusterServiceWatchSentinelsHandler.ServeHTTP(w, r) case ClusterServiceGetDesiredSentinelStateProcedure: clusterServiceGetDesiredSentinelStateHandler.ServeHTTP(w, r) - case ClusterServiceUpdateSentinelStateProcedure: - clusterServiceUpdateSentinelStateHandler.ServeHTTP(w, r) + case ClusterServiceReportSentinelStatusProcedure: + clusterServiceReportSentinelStatusHandler.ServeHTTP(w, r) case ClusterServiceGetDesiredDeploymentStateProcedure: clusterServiceGetDesiredDeploymentStateHandler.ServeHTTP(w, r) - case ClusterServiceUpdateDeploymentStateProcedure: - clusterServiceUpdateDeploymentStateHandler.ServeHTTP(w, r) + case ClusterServiceReportDeploymentStatusProcedure: + clusterServiceReportDeploymentStatusHandler.ServeHTTP(w, r) default: http.NotFound(w, r) } @@ -212,22 +266,26 @@ func NewClusterServiceHandler(svc ClusterServiceHandler, opts ...connect.Handler // UnimplementedClusterServiceHandler returns CodeUnimplemented from all methods. type UnimplementedClusterServiceHandler struct{} -func (UnimplementedClusterServiceHandler) Sync(context.Context, *connect.Request[v1.SyncRequest], *connect.ServerStream[v1.State]) error { - return connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.Sync is not implemented")) +func (UnimplementedClusterServiceHandler) WatchDeployments(context.Context, *connect.Request[v1.WatchDeploymentsRequest], *connect.ServerStream[v1.DeploymentState]) error { + return connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.WatchDeployments is not implemented")) +} + +func (UnimplementedClusterServiceHandler) WatchSentinels(context.Context, *connect.Request[v1.WatchSentinelsRequest], *connect.ServerStream[v1.SentinelState]) error { + return connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.WatchSentinels is not implemented")) } func (UnimplementedClusterServiceHandler) GetDesiredSentinelState(context.Context, *connect.Request[v1.GetDesiredSentinelStateRequest]) (*connect.Response[v1.SentinelState], error) { return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.GetDesiredSentinelState is not implemented")) } -func (UnimplementedClusterServiceHandler) UpdateSentinelState(context.Context, *connect.Request[v1.UpdateSentinelStateRequest]) (*connect.Response[v1.UpdateSentinelStateResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.UpdateSentinelState is not implemented")) +func (UnimplementedClusterServiceHandler) ReportSentinelStatus(context.Context, *connect.Request[v1.ReportSentinelStatusRequest]) (*connect.Response[v1.ReportSentinelStatusResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.ReportSentinelStatus is not implemented")) } func (UnimplementedClusterServiceHandler) GetDesiredDeploymentState(context.Context, *connect.Request[v1.GetDesiredDeploymentStateRequest]) (*connect.Response[v1.DeploymentState], error) { return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.GetDesiredDeploymentState is not implemented")) } -func (UnimplementedClusterServiceHandler) UpdateDeploymentState(context.Context, *connect.Request[v1.UpdateDeploymentStateRequest]) (*connect.Response[v1.UpdateDeploymentStateResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.UpdateDeploymentState is not implemented")) +func (UnimplementedClusterServiceHandler) ReportDeploymentStatus(context.Context, *connect.Request[v1.ReportDeploymentStatusRequest]) (*connect.Response[v1.ReportDeploymentStatusResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.ClusterService.ReportDeploymentStatus is not implemented")) } diff --git a/gen/proto/krane/v1/BUILD.bazel b/gen/proto/krane/v1/BUILD.bazel index 1a28d74046..c58a8307bb 100644 --- a/gen/proto/krane/v1/BUILD.bazel +++ b/gen/proto/krane/v1/BUILD.bazel @@ -2,10 +2,7 @@ load("@rules_go//go:def.bzl", "go_library") go_library( name = "krane", - srcs = [ - "scheduler.pb.go", - "secrets.pb.go", - ], + srcs = ["secrets.pb.go"], importpath = "github.com/unkeyed/unkey/gen/proto/krane/v1", visibility = ["//visibility:public"], deps = [ diff --git a/gen/proto/krane/v1/kranev1connect/BUILD.bazel b/gen/proto/krane/v1/kranev1connect/BUILD.bazel index 160f17a52d..bcd15a852a 100644 --- a/gen/proto/krane/v1/kranev1connect/BUILD.bazel +++ b/gen/proto/krane/v1/kranev1connect/BUILD.bazel @@ -2,10 +2,7 @@ load("@rules_go//go:def.bzl", "go_library") go_library( name = "kranev1connect", - srcs = [ - "scheduler.connect.go", - "secrets.connect.go", - ], + srcs = ["secrets.connect.go"], importpath = "github.com/unkeyed/unkey/gen/proto/krane/v1/kranev1connect", visibility = ["//visibility:public"], deps = [ diff --git a/gen/proto/krane/v1/kranev1connect/scheduler.connect.go b/gen/proto/krane/v1/kranev1connect/scheduler.connect.go deleted file mode 100644 index 02cce8b106..0000000000 --- a/gen/proto/krane/v1/kranev1connect/scheduler.connect.go +++ /dev/null @@ -1,253 +0,0 @@ -// Code generated by protoc-gen-connect-go. DO NOT EDIT. -// -// Source: krane/v1/scheduler.proto - -package kranev1connect - -import ( - connect "connectrpc.com/connect" - context "context" - errors "errors" - v1 "github.com/unkeyed/unkey/gen/proto/krane/v1" - http "net/http" - strings "strings" -) - -// This is a compile-time assertion to ensure that this generated file and the connect package are -// compatible. If you get a compiler error that this constant is not defined, this code was -// generated with a version of connect newer than the one compiled into your binary. You can fix the -// problem by either regenerating this code with an older version of connect or updating the connect -// version compiled into your binary. -const _ = connect.IsAtLeastVersion1_13_0 - -const ( - // SchedulerServiceName is the fully-qualified name of the SchedulerService service. - SchedulerServiceName = "krane.v1.SchedulerService" -) - -// These constants are the fully-qualified names of the RPCs defined in this package. They're -// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. -// -// Note that these are different from the fully-qualified method names used by -// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to -// reflection-formatted method names, remove the leading slash and convert the remaining slash to a -// period. -const ( - // SchedulerServiceApplyDeploymentProcedure is the fully-qualified name of the SchedulerService's - // ApplyDeployment RPC. - SchedulerServiceApplyDeploymentProcedure = "/krane.v1.SchedulerService/ApplyDeployment" - // SchedulerServiceDeleteDeploymentProcedure is the fully-qualified name of the SchedulerService's - // DeleteDeployment RPC. - SchedulerServiceDeleteDeploymentProcedure = "/krane.v1.SchedulerService/DeleteDeployment" - // SchedulerServiceApplySentinelProcedure is the fully-qualified name of the SchedulerService's - // ApplySentinel RPC. - SchedulerServiceApplySentinelProcedure = "/krane.v1.SchedulerService/ApplySentinel" - // SchedulerServiceDeleteSentinelProcedure is the fully-qualified name of the SchedulerService's - // DeleteSentinel RPC. - SchedulerServiceDeleteSentinelProcedure = "/krane.v1.SchedulerService/DeleteSentinel" - // SchedulerServiceWatchProcedure is the fully-qualified name of the SchedulerService's Watch RPC. - SchedulerServiceWatchProcedure = "/krane.v1.SchedulerService/Watch" - // SchedulerServiceScrapeOpenApiSchemaProcedure is the fully-qualified name of the - // SchedulerService's ScrapeOpenApiSchema RPC. - SchedulerServiceScrapeOpenApiSchemaProcedure = "/krane.v1.SchedulerService/ScrapeOpenApiSchema" -) - -// SchedulerServiceClient is a client for the krane.v1.SchedulerService service. -type SchedulerServiceClient interface { - ApplyDeployment(context.Context, *connect.Request[v1.ApplyDeploymentRequest]) (*connect.Response[v1.ApplyDeploymentResponse], error) - DeleteDeployment(context.Context, *connect.Request[v1.DeleteDeploymentRequest]) (*connect.Response[v1.DeleteDeploymentResponse], error) - ApplySentinel(context.Context, *connect.Request[v1.ApplySentinelRequest]) (*connect.Response[v1.ApplySentinelResponse], error) - DeleteSentinel(context.Context, *connect.Request[v1.DeleteSentinelRequest]) (*connect.Response[v1.DeleteSentinelResponse], error) - Watch(context.Context, *connect.Request[v1.WatchRequest]) (*connect.ServerStreamForClient[v1.State], error) - ScrapeOpenApiSchema(context.Context, *connect.Request[v1.ScrapeOpenApiSchemaRequest]) (*connect.Response[v1.ScrapeOpenApiSchemaResponse], error) -} - -// NewSchedulerServiceClient constructs a client for the krane.v1.SchedulerService service. By -// default, it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, -// and sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the -// connect.WithGRPC() or connect.WithGRPCWeb() options. -// -// The URL supplied here should be the base URL for the Connect or gRPC server (for example, -// http://api.acme.com or https://acme.com/grpc). -func NewSchedulerServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) SchedulerServiceClient { - baseURL = strings.TrimRight(baseURL, "/") - schedulerServiceMethods := v1.File_krane_v1_scheduler_proto.Services().ByName("SchedulerService").Methods() - return &schedulerServiceClient{ - applyDeployment: connect.NewClient[v1.ApplyDeploymentRequest, v1.ApplyDeploymentResponse]( - httpClient, - baseURL+SchedulerServiceApplyDeploymentProcedure, - connect.WithSchema(schedulerServiceMethods.ByName("ApplyDeployment")), - connect.WithClientOptions(opts...), - ), - deleteDeployment: connect.NewClient[v1.DeleteDeploymentRequest, v1.DeleteDeploymentResponse]( - httpClient, - baseURL+SchedulerServiceDeleteDeploymentProcedure, - connect.WithSchema(schedulerServiceMethods.ByName("DeleteDeployment")), - connect.WithClientOptions(opts...), - ), - applySentinel: connect.NewClient[v1.ApplySentinelRequest, v1.ApplySentinelResponse]( - httpClient, - baseURL+SchedulerServiceApplySentinelProcedure, - connect.WithSchema(schedulerServiceMethods.ByName("ApplySentinel")), - connect.WithClientOptions(opts...), - ), - deleteSentinel: connect.NewClient[v1.DeleteSentinelRequest, v1.DeleteSentinelResponse]( - httpClient, - baseURL+SchedulerServiceDeleteSentinelProcedure, - connect.WithSchema(schedulerServiceMethods.ByName("DeleteSentinel")), - connect.WithClientOptions(opts...), - ), - watch: connect.NewClient[v1.WatchRequest, v1.State]( - httpClient, - baseURL+SchedulerServiceWatchProcedure, - connect.WithSchema(schedulerServiceMethods.ByName("Watch")), - connect.WithClientOptions(opts...), - ), - scrapeOpenApiSchema: connect.NewClient[v1.ScrapeOpenApiSchemaRequest, v1.ScrapeOpenApiSchemaResponse]( - httpClient, - baseURL+SchedulerServiceScrapeOpenApiSchemaProcedure, - connect.WithSchema(schedulerServiceMethods.ByName("ScrapeOpenApiSchema")), - connect.WithClientOptions(opts...), - ), - } -} - -// schedulerServiceClient implements SchedulerServiceClient. -type schedulerServiceClient struct { - applyDeployment *connect.Client[v1.ApplyDeploymentRequest, v1.ApplyDeploymentResponse] - deleteDeployment *connect.Client[v1.DeleteDeploymentRequest, v1.DeleteDeploymentResponse] - applySentinel *connect.Client[v1.ApplySentinelRequest, v1.ApplySentinelResponse] - deleteSentinel *connect.Client[v1.DeleteSentinelRequest, v1.DeleteSentinelResponse] - watch *connect.Client[v1.WatchRequest, v1.State] - scrapeOpenApiSchema *connect.Client[v1.ScrapeOpenApiSchemaRequest, v1.ScrapeOpenApiSchemaResponse] -} - -// ApplyDeployment calls krane.v1.SchedulerService.ApplyDeployment. -func (c *schedulerServiceClient) ApplyDeployment(ctx context.Context, req *connect.Request[v1.ApplyDeploymentRequest]) (*connect.Response[v1.ApplyDeploymentResponse], error) { - return c.applyDeployment.CallUnary(ctx, req) -} - -// DeleteDeployment calls krane.v1.SchedulerService.DeleteDeployment. -func (c *schedulerServiceClient) DeleteDeployment(ctx context.Context, req *connect.Request[v1.DeleteDeploymentRequest]) (*connect.Response[v1.DeleteDeploymentResponse], error) { - return c.deleteDeployment.CallUnary(ctx, req) -} - -// ApplySentinel calls krane.v1.SchedulerService.ApplySentinel. -func (c *schedulerServiceClient) ApplySentinel(ctx context.Context, req *connect.Request[v1.ApplySentinelRequest]) (*connect.Response[v1.ApplySentinelResponse], error) { - return c.applySentinel.CallUnary(ctx, req) -} - -// DeleteSentinel calls krane.v1.SchedulerService.DeleteSentinel. -func (c *schedulerServiceClient) DeleteSentinel(ctx context.Context, req *connect.Request[v1.DeleteSentinelRequest]) (*connect.Response[v1.DeleteSentinelResponse], error) { - return c.deleteSentinel.CallUnary(ctx, req) -} - -// Watch calls krane.v1.SchedulerService.Watch. -func (c *schedulerServiceClient) Watch(ctx context.Context, req *connect.Request[v1.WatchRequest]) (*connect.ServerStreamForClient[v1.State], error) { - return c.watch.CallServerStream(ctx, req) -} - -// ScrapeOpenApiSchema calls krane.v1.SchedulerService.ScrapeOpenApiSchema. -func (c *schedulerServiceClient) ScrapeOpenApiSchema(ctx context.Context, req *connect.Request[v1.ScrapeOpenApiSchemaRequest]) (*connect.Response[v1.ScrapeOpenApiSchemaResponse], error) { - return c.scrapeOpenApiSchema.CallUnary(ctx, req) -} - -// SchedulerServiceHandler is an implementation of the krane.v1.SchedulerService service. -type SchedulerServiceHandler interface { - ApplyDeployment(context.Context, *connect.Request[v1.ApplyDeploymentRequest]) (*connect.Response[v1.ApplyDeploymentResponse], error) - DeleteDeployment(context.Context, *connect.Request[v1.DeleteDeploymentRequest]) (*connect.Response[v1.DeleteDeploymentResponse], error) - ApplySentinel(context.Context, *connect.Request[v1.ApplySentinelRequest]) (*connect.Response[v1.ApplySentinelResponse], error) - DeleteSentinel(context.Context, *connect.Request[v1.DeleteSentinelRequest]) (*connect.Response[v1.DeleteSentinelResponse], error) - Watch(context.Context, *connect.Request[v1.WatchRequest], *connect.ServerStream[v1.State]) error - ScrapeOpenApiSchema(context.Context, *connect.Request[v1.ScrapeOpenApiSchemaRequest]) (*connect.Response[v1.ScrapeOpenApiSchemaResponse], error) -} - -// NewSchedulerServiceHandler builds an HTTP handler from the service implementation. It returns the -// path on which to mount the handler and the handler itself. -// -// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf -// and JSON codecs. They also support gzip compression. -func NewSchedulerServiceHandler(svc SchedulerServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { - schedulerServiceMethods := v1.File_krane_v1_scheduler_proto.Services().ByName("SchedulerService").Methods() - schedulerServiceApplyDeploymentHandler := connect.NewUnaryHandler( - SchedulerServiceApplyDeploymentProcedure, - svc.ApplyDeployment, - connect.WithSchema(schedulerServiceMethods.ByName("ApplyDeployment")), - connect.WithHandlerOptions(opts...), - ) - schedulerServiceDeleteDeploymentHandler := connect.NewUnaryHandler( - SchedulerServiceDeleteDeploymentProcedure, - svc.DeleteDeployment, - connect.WithSchema(schedulerServiceMethods.ByName("DeleteDeployment")), - connect.WithHandlerOptions(opts...), - ) - schedulerServiceApplySentinelHandler := connect.NewUnaryHandler( - SchedulerServiceApplySentinelProcedure, - svc.ApplySentinel, - connect.WithSchema(schedulerServiceMethods.ByName("ApplySentinel")), - connect.WithHandlerOptions(opts...), - ) - schedulerServiceDeleteSentinelHandler := connect.NewUnaryHandler( - SchedulerServiceDeleteSentinelProcedure, - svc.DeleteSentinel, - connect.WithSchema(schedulerServiceMethods.ByName("DeleteSentinel")), - connect.WithHandlerOptions(opts...), - ) - schedulerServiceWatchHandler := connect.NewServerStreamHandler( - SchedulerServiceWatchProcedure, - svc.Watch, - connect.WithSchema(schedulerServiceMethods.ByName("Watch")), - connect.WithHandlerOptions(opts...), - ) - schedulerServiceScrapeOpenApiSchemaHandler := connect.NewUnaryHandler( - SchedulerServiceScrapeOpenApiSchemaProcedure, - svc.ScrapeOpenApiSchema, - connect.WithSchema(schedulerServiceMethods.ByName("ScrapeOpenApiSchema")), - connect.WithHandlerOptions(opts...), - ) - return "/krane.v1.SchedulerService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - switch r.URL.Path { - case SchedulerServiceApplyDeploymentProcedure: - schedulerServiceApplyDeploymentHandler.ServeHTTP(w, r) - case SchedulerServiceDeleteDeploymentProcedure: - schedulerServiceDeleteDeploymentHandler.ServeHTTP(w, r) - case SchedulerServiceApplySentinelProcedure: - schedulerServiceApplySentinelHandler.ServeHTTP(w, r) - case SchedulerServiceDeleteSentinelProcedure: - schedulerServiceDeleteSentinelHandler.ServeHTTP(w, r) - case SchedulerServiceWatchProcedure: - schedulerServiceWatchHandler.ServeHTTP(w, r) - case SchedulerServiceScrapeOpenApiSchemaProcedure: - schedulerServiceScrapeOpenApiSchemaHandler.ServeHTTP(w, r) - default: - http.NotFound(w, r) - } - }) -} - -// UnimplementedSchedulerServiceHandler returns CodeUnimplemented from all methods. -type UnimplementedSchedulerServiceHandler struct{} - -func (UnimplementedSchedulerServiceHandler) ApplyDeployment(context.Context, *connect.Request[v1.ApplyDeploymentRequest]) (*connect.Response[v1.ApplyDeploymentResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("krane.v1.SchedulerService.ApplyDeployment is not implemented")) -} - -func (UnimplementedSchedulerServiceHandler) DeleteDeployment(context.Context, *connect.Request[v1.DeleteDeploymentRequest]) (*connect.Response[v1.DeleteDeploymentResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("krane.v1.SchedulerService.DeleteDeployment is not implemented")) -} - -func (UnimplementedSchedulerServiceHandler) ApplySentinel(context.Context, *connect.Request[v1.ApplySentinelRequest]) (*connect.Response[v1.ApplySentinelResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("krane.v1.SchedulerService.ApplySentinel is not implemented")) -} - -func (UnimplementedSchedulerServiceHandler) DeleteSentinel(context.Context, *connect.Request[v1.DeleteSentinelRequest]) (*connect.Response[v1.DeleteSentinelResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("krane.v1.SchedulerService.DeleteSentinel is not implemented")) -} - -func (UnimplementedSchedulerServiceHandler) Watch(context.Context, *connect.Request[v1.WatchRequest], *connect.ServerStream[v1.State]) error { - return connect.NewError(connect.CodeUnimplemented, errors.New("krane.v1.SchedulerService.Watch is not implemented")) -} - -func (UnimplementedSchedulerServiceHandler) ScrapeOpenApiSchema(context.Context, *connect.Request[v1.ScrapeOpenApiSchemaRequest]) (*connect.Response[v1.ScrapeOpenApiSchemaResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("krane.v1.SchedulerService.ScrapeOpenApiSchema is not implemented")) -} diff --git a/gen/proto/krane/v1/scheduler.pb.go b/gen/proto/krane/v1/scheduler.pb.go deleted file mode 100644 index 62e8fbc7e7..0000000000 --- a/gen/proto/krane/v1/scheduler.pb.go +++ /dev/null @@ -1,1219 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.36.8 -// protoc (unknown) -// source: krane/v1/scheduler.proto - -package kranev1 - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" - unsafe "unsafe" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type State_Deployment_Instance_Status int32 - -const ( - State_Deployment_Instance_STATUS_UNSPECIFIED State_Deployment_Instance_Status = 0 - State_Deployment_Instance_STATUS_PENDING State_Deployment_Instance_Status = 1 // Deployment request accepted, container/pod creation in progress - State_Deployment_Instance_STATUS_RUNNING State_Deployment_Instance_Status = 2 // Container/pod is running and healthy - State_Deployment_Instance_STATUS_FAILED State_Deployment_Instance_Status = 3 // Container/pod failed to start -) - -// Enum value maps for State_Deployment_Instance_Status. -var ( - State_Deployment_Instance_Status_name = map[int32]string{ - 0: "STATUS_UNSPECIFIED", - 1: "STATUS_PENDING", - 2: "STATUS_RUNNING", - 3: "STATUS_FAILED", - } - State_Deployment_Instance_Status_value = map[string]int32{ - "STATUS_UNSPECIFIED": 0, - "STATUS_PENDING": 1, - "STATUS_RUNNING": 2, - "STATUS_FAILED": 3, - } -) - -func (x State_Deployment_Instance_Status) Enum() *State_Deployment_Instance_Status { - p := new(State_Deployment_Instance_Status) - *p = x - return p -} - -func (x State_Deployment_Instance_Status) String() string { - return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) -} - -func (State_Deployment_Instance_Status) Descriptor() protoreflect.EnumDescriptor { - return file_krane_v1_scheduler_proto_enumTypes[0].Descriptor() -} - -func (State_Deployment_Instance_Status) Type() protoreflect.EnumType { - return &file_krane_v1_scheduler_proto_enumTypes[0] -} - -func (x State_Deployment_Instance_Status) Number() protoreflect.EnumNumber { - return protoreflect.EnumNumber(x) -} - -// Deprecated: Use State_Deployment_Instance_Status.Descriptor instead. -func (State_Deployment_Instance_Status) EnumDescriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{3, 0, 0, 0} -} - -type ScrapeOpenApiSchemaRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - Namespace string `protobuf:"bytes,1,opt,name=namespace,proto3" json:"namespace,omitempty"` - Name string `protobuf:"bytes,2,opt,name=name,proto3" json:"name,omitempty"` - Path string `protobuf:"bytes,3,opt,name=path,proto3" json:"path,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *ScrapeOpenApiSchemaRequest) Reset() { - *x = ScrapeOpenApiSchemaRequest{} - mi := &file_krane_v1_scheduler_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *ScrapeOpenApiSchemaRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ScrapeOpenApiSchemaRequest) ProtoMessage() {} - -func (x *ScrapeOpenApiSchemaRequest) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[0] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ScrapeOpenApiSchemaRequest.ProtoReflect.Descriptor instead. -func (*ScrapeOpenApiSchemaRequest) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{0} -} - -func (x *ScrapeOpenApiSchemaRequest) GetNamespace() string { - if x != nil { - return x.Namespace - } - return "" -} - -func (x *ScrapeOpenApiSchemaRequest) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *ScrapeOpenApiSchemaRequest) GetPath() string { - if x != nil { - return x.Path - } - return "" -} - -type ScrapeOpenApiSchemaResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - Spec string `protobuf:"bytes,1,opt,name=spec,proto3" json:"spec,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *ScrapeOpenApiSchemaResponse) Reset() { - *x = ScrapeOpenApiSchemaResponse{} - mi := &file_krane_v1_scheduler_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *ScrapeOpenApiSchemaResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ScrapeOpenApiSchemaResponse) ProtoMessage() {} - -func (x *ScrapeOpenApiSchemaResponse) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[1] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ScrapeOpenApiSchemaResponse.ProtoReflect.Descriptor instead. -func (*ScrapeOpenApiSchemaResponse) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{1} -} - -func (x *ScrapeOpenApiSchemaResponse) GetSpec() string { - if x != nil { - return x.Spec - } - return "" -} - -type WatchRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *WatchRequest) Reset() { - *x = WatchRequest{} - mi := &file_krane_v1_scheduler_proto_msgTypes[2] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *WatchRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*WatchRequest) ProtoMessage() {} - -func (x *WatchRequest) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[2] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use WatchRequest.ProtoReflect.Descriptor instead. -func (*WatchRequest) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{2} -} - -type State struct { - state protoimpl.MessageState `protogen:"open.v1"` - // Types that are valid to be assigned to State: - // - // *State_Deployment_ - // *State_Sentinel_ - State isState_State `protobuf_oneof:"state"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *State) Reset() { - *x = State{} - mi := &file_krane_v1_scheduler_proto_msgTypes[3] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *State) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*State) ProtoMessage() {} - -func (x *State) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[3] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use State.ProtoReflect.Descriptor instead. -func (*State) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{3} -} - -func (x *State) GetState() isState_State { - if x != nil { - return x.State - } - return nil -} - -func (x *State) GetDeployment() *State_Deployment { - if x != nil { - if x, ok := x.State.(*State_Deployment_); ok { - return x.Deployment - } - } - return nil -} - -func (x *State) GetSentinel() *State_Sentinel { - if x != nil { - if x, ok := x.State.(*State_Sentinel_); ok { - return x.Sentinel - } - } - return nil -} - -type isState_State interface { - isState_State() -} - -type State_Deployment_ struct { - Deployment *State_Deployment `protobuf:"bytes,1,opt,name=deployment,proto3,oneof"` -} - -type State_Sentinel_ struct { - Sentinel *State_Sentinel `protobuf:"bytes,2,opt,name=sentinel,proto3,oneof"` -} - -func (*State_Deployment_) isState_State() {} - -func (*State_Sentinel_) isState_State() {} - -// ApplySentinel contains the desired configuration for a sentinel. -// -// The cluster agent will ensure a sentinel exists with this exact configuration, creating it if -// it doesn't exist or updating it if it does. All fields except namespace are required. -// The control plane ensures that sentinel_id is unique within the namespace. -type ApplySentinelRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - // namespace is the Kubernetes namespace in which the sentinel should exist. - K8SNamespace string `protobuf:"bytes,1,opt,name=k8s_namespace,json=k8sNamespace,proto3" json:"k8s_namespace,omitempty"` - K8SName string `protobuf:"bytes,2,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` - // workspace_id identifies the workspace that owns this sentinel. - WorkspaceId string `protobuf:"bytes,3,opt,name=workspace_id,json=workspaceId,proto3" json:"workspace_id,omitempty"` - // project_id identifies the project within the workspace. - ProjectId string `protobuf:"bytes,4,opt,name=project_id,json=projectId,proto3" json:"project_id,omitempty"` - // environment_id in which the sentinel should exist. - EnvironmentId string `protobuf:"bytes,5,opt,name=environment_id,json=environmentId,proto3" json:"environment_id,omitempty"` - // sentinel_id is the unique identifier for this sentinel globally - SentinelId string `protobuf:"bytes,6,opt,name=sentinel_id,json=sentinelId,proto3" json:"sentinel_id,omitempty"` - Image string `protobuf:"bytes,7,opt,name=image,proto3" json:"image,omitempty"` - Replicas int32 `protobuf:"varint,8,opt,name=replicas,proto3" json:"replicas,omitempty"` - CpuMillicores int64 `protobuf:"varint,9,opt,name=cpu_millicores,json=cpuMillicores,proto3" json:"cpu_millicores,omitempty"` - MemoryMib int64 `protobuf:"varint,10,opt,name=memory_mib,json=memoryMib,proto3" json:"memory_mib,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *ApplySentinelRequest) Reset() { - *x = ApplySentinelRequest{} - mi := &file_krane_v1_scheduler_proto_msgTypes[4] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *ApplySentinelRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ApplySentinelRequest) ProtoMessage() {} - -func (x *ApplySentinelRequest) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[4] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ApplySentinelRequest.ProtoReflect.Descriptor instead. -func (*ApplySentinelRequest) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{4} -} - -func (x *ApplySentinelRequest) GetK8SNamespace() string { - if x != nil { - return x.K8SNamespace - } - return "" -} - -func (x *ApplySentinelRequest) GetK8SName() string { - if x != nil { - return x.K8SName - } - return "" -} - -func (x *ApplySentinelRequest) GetWorkspaceId() string { - if x != nil { - return x.WorkspaceId - } - return "" -} - -func (x *ApplySentinelRequest) GetProjectId() string { - if x != nil { - return x.ProjectId - } - return "" -} - -func (x *ApplySentinelRequest) GetEnvironmentId() string { - if x != nil { - return x.EnvironmentId - } - return "" -} - -func (x *ApplySentinelRequest) GetSentinelId() string { - if x != nil { - return x.SentinelId - } - return "" -} - -func (x *ApplySentinelRequest) GetImage() string { - if x != nil { - return x.Image - } - return "" -} - -func (x *ApplySentinelRequest) GetReplicas() int32 { - if x != nil { - return x.Replicas - } - return 0 -} - -func (x *ApplySentinelRequest) GetCpuMillicores() int64 { - if x != nil { - return x.CpuMillicores - } - return 0 -} - -func (x *ApplySentinelRequest) GetMemoryMib() int64 { - if x != nil { - return x.MemoryMib - } - return 0 -} - -// ApplySentinelResponse is the response to ApplySentinelRequest. -type ApplySentinelResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *ApplySentinelResponse) Reset() { - *x = ApplySentinelResponse{} - mi := &file_krane_v1_scheduler_proto_msgTypes[5] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *ApplySentinelResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ApplySentinelResponse) ProtoMessage() {} - -func (x *ApplySentinelResponse) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[5] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ApplySentinelResponse.ProtoReflect.Descriptor instead. -func (*ApplySentinelResponse) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{5} -} - -// DeleteSentinel identifies a sentinel to remove from the cluster. -// -// The sentinel and all its resources (pods, services, frontline) will be deleted. -// In-flight requests may be disrupted unless proper connection draining is configured. -type DeleteSentinelRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - K8SNamespace string `protobuf:"bytes,1,opt,name=k8s_namespace,json=k8sNamespace,proto3" json:"k8s_namespace,omitempty"` - K8SName string `protobuf:"bytes,2,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *DeleteSentinelRequest) Reset() { - *x = DeleteSentinelRequest{} - mi := &file_krane_v1_scheduler_proto_msgTypes[6] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *DeleteSentinelRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*DeleteSentinelRequest) ProtoMessage() {} - -func (x *DeleteSentinelRequest) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[6] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use DeleteSentinelRequest.ProtoReflect.Descriptor instead. -func (*DeleteSentinelRequest) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{6} -} - -func (x *DeleteSentinelRequest) GetK8SNamespace() string { - if x != nil { - return x.K8SNamespace - } - return "" -} - -func (x *DeleteSentinelRequest) GetK8SName() string { - if x != nil { - return x.K8SName - } - return "" -} - -type DeleteSentinelResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *DeleteSentinelResponse) Reset() { - *x = DeleteSentinelResponse{} - mi := &file_krane_v1_scheduler_proto_msgTypes[7] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *DeleteSentinelResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*DeleteSentinelResponse) ProtoMessage() {} - -func (x *DeleteSentinelResponse) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[7] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use DeleteSentinelResponse.ProtoReflect.Descriptor instead. -func (*DeleteSentinelResponse) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{7} -} - -// ApplyDeployment contains the desired configuration for a deployment. -// -// The cluster agent will ensure a deployment exists with this exact configuration, creating it if -// it doesn't exist or updating it if it does. All fields except namespace are required. -// The control plane ensures that deployment_id is unique within the namespace. -type ApplyDeploymentRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - // namespace is the Kubernetes namespace in which the deployment should exist. - K8SNamespace string `protobuf:"bytes,1,opt,name=k8s_namespace,json=k8sNamespace,proto3" json:"k8s_namespace,omitempty"` - K8SName string `protobuf:"bytes,2,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` - // workspace_id identifies the workspace that owns this deployment. - // Used for multi-tenancy and access control. - WorkspaceId string `protobuf:"bytes,3,opt,name=workspace_id,json=workspaceId,proto3" json:"workspace_id,omitempty"` - // project_id identifies the project within the workspace. - // Deployments are scoped to projects for organizational purposes. - ProjectId string `protobuf:"bytes,4,opt,name=project_id,json=projectId,proto3" json:"project_id,omitempty"` - // environment_id specifies the environment . - // Used for environment-specific configuration and isolation. - EnvironmentId string `protobuf:"bytes,5,opt,name=environment_id,json=environmentId,proto3" json:"environment_id,omitempty"` - // deployment_id is the unique identifier for this deployment within the namespace. - DeploymentId string `protobuf:"bytes,6,opt,name=deployment_id,json=deploymentId,proto3" json:"deployment_id,omitempty"` - // image is the container image to deploy. - // Must be a valid container registry URL accessible by the cluster. - // Example: "gcr.io/myproject/app:v2.1.0" - Image string `protobuf:"bytes,7,opt,name=image,proto3" json:"image,omitempty"` - // replicas is the desired number of pod instances. - // Must be at least 1. Set higher for increased availability and load distribution. - Replicas int32 `protobuf:"varint,8,opt,name=replicas,proto3" json:"replicas,omitempty"` - // cpu_millicores is the CPU request/limit in millicores (1000 = 1 CPU core). - // This ensures each pod has sufficient CPU resources. - // Example: 250 = 0.25 CPU cores - CpuMillicores int64 `protobuf:"varint,9,opt,name=cpu_millicores,json=cpuMillicores,proto3" json:"cpu_millicores,omitempty"` - // memory_mib is the memory request/limit in mebibytes. - // This ensures each pod has sufficient memory. - // Example: 256 = 256 MiB - MemoryMib int64 `protobuf:"varint,10,opt,name=memory_mib,json=memoryMib,proto3" json:"memory_mib,omitempty"` - // build_id is the unique identifier for this build from depot - // if we did not build this image via depot, no buildID exists and we - // assume kubernetes will pull from a public registry - BuildId *string `protobuf:"bytes,11,opt,name=build_id,json=buildId,proto3,oneof" json:"build_id,omitempty"` - // Encrypted secrets blob to be decrypted at runtime by inject. - // This is set as UNKEY_ENCRYPTED_ENV env var in the container. - // inject calls krane's DecryptSecretsBlob RPC to decrypt. - EncryptedEnvironmentVariables []byte `protobuf:"bytes,12,opt,name=encrypted_environment_variables,json=encryptedEnvironmentVariables,proto3" json:"encrypted_environment_variables,omitempty"` - // An opaque identifier used in a restate awakable. - // If set, the cluster must add this as annotation and report back during Watch checks - ReadinessId *string `protobuf:"bytes,13,opt,name=readiness_id,json=readinessId,proto3,oneof" json:"readiness_id,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *ApplyDeploymentRequest) Reset() { - *x = ApplyDeploymentRequest{} - mi := &file_krane_v1_scheduler_proto_msgTypes[8] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *ApplyDeploymentRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ApplyDeploymentRequest) ProtoMessage() {} - -func (x *ApplyDeploymentRequest) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[8] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ApplyDeploymentRequest.ProtoReflect.Descriptor instead. -func (*ApplyDeploymentRequest) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{8} -} - -func (x *ApplyDeploymentRequest) GetK8SNamespace() string { - if x != nil { - return x.K8SNamespace - } - return "" -} - -func (x *ApplyDeploymentRequest) GetK8SName() string { - if x != nil { - return x.K8SName - } - return "" -} - -func (x *ApplyDeploymentRequest) GetWorkspaceId() string { - if x != nil { - return x.WorkspaceId - } - return "" -} - -func (x *ApplyDeploymentRequest) GetProjectId() string { - if x != nil { - return x.ProjectId - } - return "" -} - -func (x *ApplyDeploymentRequest) GetEnvironmentId() string { - if x != nil { - return x.EnvironmentId - } - return "" -} - -func (x *ApplyDeploymentRequest) GetDeploymentId() string { - if x != nil { - return x.DeploymentId - } - return "" -} - -func (x *ApplyDeploymentRequest) GetImage() string { - if x != nil { - return x.Image - } - return "" -} - -func (x *ApplyDeploymentRequest) GetReplicas() int32 { - if x != nil { - return x.Replicas - } - return 0 -} - -func (x *ApplyDeploymentRequest) GetCpuMillicores() int64 { - if x != nil { - return x.CpuMillicores - } - return 0 -} - -func (x *ApplyDeploymentRequest) GetMemoryMib() int64 { - if x != nil { - return x.MemoryMib - } - return 0 -} - -func (x *ApplyDeploymentRequest) GetBuildId() string { - if x != nil && x.BuildId != nil { - return *x.BuildId - } - return "" -} - -func (x *ApplyDeploymentRequest) GetEncryptedEnvironmentVariables() []byte { - if x != nil { - return x.EncryptedEnvironmentVariables - } - return nil -} - -func (x *ApplyDeploymentRequest) GetReadinessId() string { - if x != nil && x.ReadinessId != nil { - return *x.ReadinessId - } - return "" -} - -type ApplyDeploymentResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *ApplyDeploymentResponse) Reset() { - *x = ApplyDeploymentResponse{} - mi := &file_krane_v1_scheduler_proto_msgTypes[9] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *ApplyDeploymentResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ApplyDeploymentResponse) ProtoMessage() {} - -func (x *ApplyDeploymentResponse) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[9] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ApplyDeploymentResponse.ProtoReflect.Descriptor instead. -func (*ApplyDeploymentResponse) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{9} -} - -// DeleteDeployment identifies a deployment to remove from the cluster. -// -// The deployment and all its pods will be terminated gracefully according to -// the configured termination grace period. All associated resources (services, -// configmaps specific to this deployment) will also be cleaned up. -type DeleteDeploymentRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - K8SNamespace string `protobuf:"bytes,1,opt,name=k8s_namespace,json=k8sNamespace,proto3" json:"k8s_namespace,omitempty"` - K8SName string `protobuf:"bytes,2,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *DeleteDeploymentRequest) Reset() { - *x = DeleteDeploymentRequest{} - mi := &file_krane_v1_scheduler_proto_msgTypes[10] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *DeleteDeploymentRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*DeleteDeploymentRequest) ProtoMessage() {} - -func (x *DeleteDeploymentRequest) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[10] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use DeleteDeploymentRequest.ProtoReflect.Descriptor instead. -func (*DeleteDeploymentRequest) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{10} -} - -func (x *DeleteDeploymentRequest) GetK8SNamespace() string { - if x != nil { - return x.K8SNamespace - } - return "" -} - -func (x *DeleteDeploymentRequest) GetK8SName() string { - if x != nil { - return x.K8SName - } - return "" -} - -type DeleteDeploymentResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *DeleteDeploymentResponse) Reset() { - *x = DeleteDeploymentResponse{} - mi := &file_krane_v1_scheduler_proto_msgTypes[11] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *DeleteDeploymentResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*DeleteDeploymentResponse) ProtoMessage() {} - -func (x *DeleteDeploymentResponse) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[11] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use DeleteDeploymentResponse.ProtoReflect.Descriptor instead. -func (*DeleteDeploymentResponse) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{11} -} - -type State_Deployment struct { - state protoimpl.MessageState `protogen:"open.v1"` - DeploymentK8SName string `protobuf:"bytes,2,opt,name=deployment_k8s_name,json=deploymentK8sName,proto3" json:"deployment_k8s_name,omitempty"` - Instances []*State_Deployment_Instance `protobuf:"bytes,3,rep,name=instances,proto3" json:"instances,omitempty"` - ReadinessId *string `protobuf:"bytes,11,opt,name=readiness_id,json=readinessId,proto3,oneof" json:"readiness_id,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *State_Deployment) Reset() { - *x = State_Deployment{} - mi := &file_krane_v1_scheduler_proto_msgTypes[12] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *State_Deployment) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*State_Deployment) ProtoMessage() {} - -func (x *State_Deployment) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[12] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use State_Deployment.ProtoReflect.Descriptor instead. -func (*State_Deployment) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{3, 0} -} - -func (x *State_Deployment) GetDeploymentK8SName() string { - if x != nil { - return x.DeploymentK8SName - } - return "" -} - -func (x *State_Deployment) GetInstances() []*State_Deployment_Instance { - if x != nil { - return x.Instances - } - return nil -} - -func (x *State_Deployment) GetReadinessId() string { - if x != nil && x.ReadinessId != nil { - return *x.ReadinessId - } - return "" -} - -type State_Sentinel struct { - state protoimpl.MessageState `protogen:"open.v1"` - K8SName string `protobuf:"bytes,1,opt,name=k8s_name,json=k8sName,proto3" json:"k8s_name,omitempty"` - AvailableReplicas int32 `protobuf:"varint,2,opt,name=available_replicas,json=availableReplicas,proto3" json:"available_replicas,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *State_Sentinel) Reset() { - *x = State_Sentinel{} - mi := &file_krane_v1_scheduler_proto_msgTypes[13] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *State_Sentinel) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*State_Sentinel) ProtoMessage() {} - -func (x *State_Sentinel) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[13] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use State_Sentinel.ProtoReflect.Descriptor instead. -func (*State_Sentinel) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{3, 1} -} - -func (x *State_Sentinel) GetK8SName() string { - if x != nil { - return x.K8SName - } - return "" -} - -func (x *State_Sentinel) GetAvailableReplicas() int32 { - if x != nil { - return x.AvailableReplicas - } - return 0 -} - -type State_Deployment_Instance struct { - state protoimpl.MessageState `protogen:"open.v1"` - InstanceK8SName string `protobuf:"bytes,1,opt,name=instance_k8s_name,json=instanceK8sName,proto3" json:"instance_k8s_name,omitempty"` - Address string `protobuf:"bytes,2,opt,name=address,proto3" json:"address,omitempty"` - CpuMillicores int64 `protobuf:"varint,3,opt,name=cpu_millicores,json=cpuMillicores,proto3" json:"cpu_millicores,omitempty"` - MemoryMib int64 `protobuf:"varint,4,opt,name=memory_mib,json=memoryMib,proto3" json:"memory_mib,omitempty"` - Status State_Deployment_Instance_Status `protobuf:"varint,5,opt,name=status,proto3,enum=krane.v1.State_Deployment_Instance_Status" json:"status,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *State_Deployment_Instance) Reset() { - *x = State_Deployment_Instance{} - mi := &file_krane_v1_scheduler_proto_msgTypes[14] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *State_Deployment_Instance) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*State_Deployment_Instance) ProtoMessage() {} - -func (x *State_Deployment_Instance) ProtoReflect() protoreflect.Message { - mi := &file_krane_v1_scheduler_proto_msgTypes[14] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use State_Deployment_Instance.ProtoReflect.Descriptor instead. -func (*State_Deployment_Instance) Descriptor() ([]byte, []int) { - return file_krane_v1_scheduler_proto_rawDescGZIP(), []int{3, 0, 0} -} - -func (x *State_Deployment_Instance) GetInstanceK8SName() string { - if x != nil { - return x.InstanceK8SName - } - return "" -} - -func (x *State_Deployment_Instance) GetAddress() string { - if x != nil { - return x.Address - } - return "" -} - -func (x *State_Deployment_Instance) GetCpuMillicores() int64 { - if x != nil { - return x.CpuMillicores - } - return 0 -} - -func (x *State_Deployment_Instance) GetMemoryMib() int64 { - if x != nil { - return x.MemoryMib - } - return 0 -} - -func (x *State_Deployment_Instance) GetStatus() State_Deployment_Instance_Status { - if x != nil { - return x.Status - } - return State_Deployment_Instance_STATUS_UNSPECIFIED -} - -var File_krane_v1_scheduler_proto protoreflect.FileDescriptor - -const file_krane_v1_scheduler_proto_rawDesc = "" + - "\n" + - "\x18krane/v1/scheduler.proto\x12\bkrane.v1\"b\n" + - "\x1aScrapeOpenApiSchemaRequest\x12\x1c\n" + - "\tnamespace\x18\x01 \x01(\tR\tnamespace\x12\x12\n" + - "\x04name\x18\x02 \x01(\tR\x04name\x12\x12\n" + - "\x04path\x18\x03 \x01(\tR\x04path\"1\n" + - "\x1bScrapeOpenApiSchemaResponse\x12\x12\n" + - "\x04spec\x18\x01 \x01(\tR\x04spec\"\x0e\n" + - "\fWatchRequest\"\xd1\x05\n" + - "\x05State\x12<\n" + - "\n" + - "deployment\x18\x01 \x01(\v2\x1a.krane.v1.State.DeploymentH\x00R\n" + - "deployment\x126\n" + - "\bsentinel\x18\x02 \x01(\v2\x18.krane.v1.State.SentinelH\x00R\bsentinel\x1a\xf2\x03\n" + - "\n" + - "Deployment\x12.\n" + - "\x13deployment_k8s_name\x18\x02 \x01(\tR\x11deploymentK8sName\x12A\n" + - "\tinstances\x18\x03 \x03(\v2#.krane.v1.State.Deployment.InstanceR\tinstances\x12&\n" + - "\freadiness_id\x18\v \x01(\tH\x00R\vreadinessId\x88\x01\x01\x1a\xb7\x02\n" + - "\bInstance\x12*\n" + - "\x11instance_k8s_name\x18\x01 \x01(\tR\x0finstanceK8sName\x12\x18\n" + - "\aaddress\x18\x02 \x01(\tR\aaddress\x12%\n" + - "\x0ecpu_millicores\x18\x03 \x01(\x03R\rcpuMillicores\x12\x1d\n" + - "\n" + - "memory_mib\x18\x04 \x01(\x03R\tmemoryMib\x12B\n" + - "\x06status\x18\x05 \x01(\x0e2*.krane.v1.State.Deployment.Instance.StatusR\x06status\"[\n" + - "\x06Status\x12\x16\n" + - "\x12STATUS_UNSPECIFIED\x10\x00\x12\x12\n" + - "\x0eSTATUS_PENDING\x10\x01\x12\x12\n" + - "\x0eSTATUS_RUNNING\x10\x02\x12\x11\n" + - "\rSTATUS_FAILED\x10\x03B\x0f\n" + - "\r_readiness_id\x1aT\n" + - "\bSentinel\x12\x19\n" + - "\bk8s_name\x18\x01 \x01(\tR\ak8sName\x12-\n" + - "\x12available_replicas\x18\x02 \x01(\x05R\x11availableReplicasB\a\n" + - "\x05state\"\xd8\x02\n" + - "\x14ApplySentinelRequest\x12#\n" + - "\rk8s_namespace\x18\x01 \x01(\tR\fk8sNamespace\x12\x19\n" + - "\bk8s_name\x18\x02 \x01(\tR\ak8sName\x12!\n" + - "\fworkspace_id\x18\x03 \x01(\tR\vworkspaceId\x12\x1d\n" + - "\n" + - "project_id\x18\x04 \x01(\tR\tprojectId\x12%\n" + - "\x0eenvironment_id\x18\x05 \x01(\tR\renvironmentId\x12\x1f\n" + - "\vsentinel_id\x18\x06 \x01(\tR\n" + - "sentinelId\x12\x14\n" + - "\x05image\x18\a \x01(\tR\x05image\x12\x1a\n" + - "\breplicas\x18\b \x01(\x05R\breplicas\x12%\n" + - "\x0ecpu_millicores\x18\t \x01(\x03R\rcpuMillicores\x12\x1d\n" + - "\n" + - "memory_mib\x18\n" + - " \x01(\x03R\tmemoryMib\"\x17\n" + - "\x15ApplySentinelResponse\"W\n" + - "\x15DeleteSentinelRequest\x12#\n" + - "\rk8s_namespace\x18\x01 \x01(\tR\fk8sNamespace\x12\x19\n" + - "\bk8s_name\x18\x02 \x01(\tR\ak8sName\"\x18\n" + - "\x16DeleteSentinelResponse\"\x8c\x04\n" + - "\x16ApplyDeploymentRequest\x12#\n" + - "\rk8s_namespace\x18\x01 \x01(\tR\fk8sNamespace\x12\x19\n" + - "\bk8s_name\x18\x02 \x01(\tR\ak8sName\x12!\n" + - "\fworkspace_id\x18\x03 \x01(\tR\vworkspaceId\x12\x1d\n" + - "\n" + - "project_id\x18\x04 \x01(\tR\tprojectId\x12%\n" + - "\x0eenvironment_id\x18\x05 \x01(\tR\renvironmentId\x12#\n" + - "\rdeployment_id\x18\x06 \x01(\tR\fdeploymentId\x12\x14\n" + - "\x05image\x18\a \x01(\tR\x05image\x12\x1a\n" + - "\breplicas\x18\b \x01(\x05R\breplicas\x12%\n" + - "\x0ecpu_millicores\x18\t \x01(\x03R\rcpuMillicores\x12\x1d\n" + - "\n" + - "memory_mib\x18\n" + - " \x01(\x03R\tmemoryMib\x12\x1e\n" + - "\bbuild_id\x18\v \x01(\tH\x00R\abuildId\x88\x01\x01\x12F\n" + - "\x1fencrypted_environment_variables\x18\f \x01(\fR\x1dencryptedEnvironmentVariables\x12&\n" + - "\freadiness_id\x18\r \x01(\tH\x01R\vreadinessId\x88\x01\x01B\v\n" + - "\t_build_idB\x0f\n" + - "\r_readiness_id\"\x19\n" + - "\x17ApplyDeploymentResponse\"Y\n" + - "\x17DeleteDeploymentRequest\x12#\n" + - "\rk8s_namespace\x18\x01 \x01(\tR\fk8sNamespace\x12\x19\n" + - "\bk8s_name\x18\x02 \x01(\tR\ak8sName\"\x1a\n" + - "\x18DeleteDeploymentResponse2\x84\x04\n" + - "\x10SchedulerService\x12V\n" + - "\x0fApplyDeployment\x12 .krane.v1.ApplyDeploymentRequest\x1a!.krane.v1.ApplyDeploymentResponse\x12Y\n" + - "\x10DeleteDeployment\x12!.krane.v1.DeleteDeploymentRequest\x1a\".krane.v1.DeleteDeploymentResponse\x12P\n" + - "\rApplySentinel\x12\x1e.krane.v1.ApplySentinelRequest\x1a\x1f.krane.v1.ApplySentinelResponse\x12S\n" + - "\x0eDeleteSentinel\x12\x1f.krane.v1.DeleteSentinelRequest\x1a .krane.v1.DeleteSentinelResponse\x122\n" + - "\x05Watch\x12\x16.krane.v1.WatchRequest\x1a\x0f.krane.v1.State0\x01\x12b\n" + - "\x13ScrapeOpenApiSchema\x12$.krane.v1.ScrapeOpenApiSchemaRequest\x1a%.krane.v1.ScrapeOpenApiSchemaResponseB\x94\x01\n" + - "\fcom.krane.v1B\x0eSchedulerProtoP\x01Z3github.com/unkeyed/unkey/gen/proto/krane/v1;kranev1\xa2\x02\x03KXX\xaa\x02\bKrane.V1\xca\x02\bKrane\\V1\xe2\x02\x14Krane\\V1\\GPBMetadata\xea\x02\tKrane::V1b\x06proto3" - -var ( - file_krane_v1_scheduler_proto_rawDescOnce sync.Once - file_krane_v1_scheduler_proto_rawDescData []byte -) - -func file_krane_v1_scheduler_proto_rawDescGZIP() []byte { - file_krane_v1_scheduler_proto_rawDescOnce.Do(func() { - file_krane_v1_scheduler_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_krane_v1_scheduler_proto_rawDesc), len(file_krane_v1_scheduler_proto_rawDesc))) - }) - return file_krane_v1_scheduler_proto_rawDescData -} - -var file_krane_v1_scheduler_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_krane_v1_scheduler_proto_msgTypes = make([]protoimpl.MessageInfo, 15) -var file_krane_v1_scheduler_proto_goTypes = []any{ - (State_Deployment_Instance_Status)(0), // 0: krane.v1.State.Deployment.Instance.Status - (*ScrapeOpenApiSchemaRequest)(nil), // 1: krane.v1.ScrapeOpenApiSchemaRequest - (*ScrapeOpenApiSchemaResponse)(nil), // 2: krane.v1.ScrapeOpenApiSchemaResponse - (*WatchRequest)(nil), // 3: krane.v1.WatchRequest - (*State)(nil), // 4: krane.v1.State - (*ApplySentinelRequest)(nil), // 5: krane.v1.ApplySentinelRequest - (*ApplySentinelResponse)(nil), // 6: krane.v1.ApplySentinelResponse - (*DeleteSentinelRequest)(nil), // 7: krane.v1.DeleteSentinelRequest - (*DeleteSentinelResponse)(nil), // 8: krane.v1.DeleteSentinelResponse - (*ApplyDeploymentRequest)(nil), // 9: krane.v1.ApplyDeploymentRequest - (*ApplyDeploymentResponse)(nil), // 10: krane.v1.ApplyDeploymentResponse - (*DeleteDeploymentRequest)(nil), // 11: krane.v1.DeleteDeploymentRequest - (*DeleteDeploymentResponse)(nil), // 12: krane.v1.DeleteDeploymentResponse - (*State_Deployment)(nil), // 13: krane.v1.State.Deployment - (*State_Sentinel)(nil), // 14: krane.v1.State.Sentinel - (*State_Deployment_Instance)(nil), // 15: krane.v1.State.Deployment.Instance -} -var file_krane_v1_scheduler_proto_depIdxs = []int32{ - 13, // 0: krane.v1.State.deployment:type_name -> krane.v1.State.Deployment - 14, // 1: krane.v1.State.sentinel:type_name -> krane.v1.State.Sentinel - 15, // 2: krane.v1.State.Deployment.instances:type_name -> krane.v1.State.Deployment.Instance - 0, // 3: krane.v1.State.Deployment.Instance.status:type_name -> krane.v1.State.Deployment.Instance.Status - 9, // 4: krane.v1.SchedulerService.ApplyDeployment:input_type -> krane.v1.ApplyDeploymentRequest - 11, // 5: krane.v1.SchedulerService.DeleteDeployment:input_type -> krane.v1.DeleteDeploymentRequest - 5, // 6: krane.v1.SchedulerService.ApplySentinel:input_type -> krane.v1.ApplySentinelRequest - 7, // 7: krane.v1.SchedulerService.DeleteSentinel:input_type -> krane.v1.DeleteSentinelRequest - 3, // 8: krane.v1.SchedulerService.Watch:input_type -> krane.v1.WatchRequest - 1, // 9: krane.v1.SchedulerService.ScrapeOpenApiSchema:input_type -> krane.v1.ScrapeOpenApiSchemaRequest - 10, // 10: krane.v1.SchedulerService.ApplyDeployment:output_type -> krane.v1.ApplyDeploymentResponse - 12, // 11: krane.v1.SchedulerService.DeleteDeployment:output_type -> krane.v1.DeleteDeploymentResponse - 6, // 12: krane.v1.SchedulerService.ApplySentinel:output_type -> krane.v1.ApplySentinelResponse - 8, // 13: krane.v1.SchedulerService.DeleteSentinel:output_type -> krane.v1.DeleteSentinelResponse - 4, // 14: krane.v1.SchedulerService.Watch:output_type -> krane.v1.State - 2, // 15: krane.v1.SchedulerService.ScrapeOpenApiSchema:output_type -> krane.v1.ScrapeOpenApiSchemaResponse - 10, // [10:16] is the sub-list for method output_type - 4, // [4:10] is the sub-list for method input_type - 4, // [4:4] is the sub-list for extension type_name - 4, // [4:4] is the sub-list for extension extendee - 0, // [0:4] is the sub-list for field type_name -} - -func init() { file_krane_v1_scheduler_proto_init() } -func file_krane_v1_scheduler_proto_init() { - if File_krane_v1_scheduler_proto != nil { - return - } - file_krane_v1_scheduler_proto_msgTypes[3].OneofWrappers = []any{ - (*State_Deployment_)(nil), - (*State_Sentinel_)(nil), - } - file_krane_v1_scheduler_proto_msgTypes[8].OneofWrappers = []any{} - file_krane_v1_scheduler_proto_msgTypes[12].OneofWrappers = []any{} - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: unsafe.Slice(unsafe.StringData(file_krane_v1_scheduler_proto_rawDesc), len(file_krane_v1_scheduler_proto_rawDesc)), - NumEnums: 1, - NumMessages: 15, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_krane_v1_scheduler_proto_goTypes, - DependencyIndexes: file_krane_v1_scheduler_proto_depIdxs, - EnumInfos: file_krane_v1_scheduler_proto_enumTypes, - MessageInfos: file_krane_v1_scheduler_proto_msgTypes, - }.Build() - File_krane_v1_scheduler_proto = out.File - file_krane_v1_scheduler_proto_goTypes = nil - file_krane_v1_scheduler_proto_depIdxs = nil -} diff --git a/pkg/db/BUILD.bazel b/pkg/db/BUILD.bazel index d104da1247..5788962911 100644 --- a/pkg/db/BUILD.bazel +++ b/pkg/db/BUILD.bazel @@ -71,7 +71,6 @@ go_library( "clickhouse_workspace_settings_find_by_workspace_id.sql_generated.go", "clickhouse_workspace_settings_insert.sql_generated.go", "clickhouse_workspace_settings_update_limits.sql_generated.go", - "cluster_state_versions.sql_generated.go", "custom_domain_find_by_domain.sql_generated.go", "custom_domain_find_by_domain_or_wildcard.sql_generated.go", "custom_domain_find_by_id.sql_generated.go", diff --git a/pkg/db/cluster_state_versions.sql_generated.go b/pkg/db/cluster_state_versions.sql_generated.go deleted file mode 100644 index c4e5e233db..0000000000 --- a/pkg/db/cluster_state_versions.sql_generated.go +++ /dev/null @@ -1,83 +0,0 @@ -// Code generated by sqlc. DO NOT EDIT. -// versions: -// sqlc v1.30.0 -// source: cluster_state_versions.sql - -package db - -import ( - "context" -) - -const listClusterStateVersions = `-- name: ListClusterStateVersions :many -SELECT combined.version, combined.kind FROM ( - SELECT dt.version, 'deployment' AS kind - FROM ` + "`" + `deployment_topology` + "`" + ` dt - WHERE dt.region = ? - AND dt.version > ? - UNION ALL - SELECT s.version, 'sentinel' AS kind - FROM ` + "`" + `sentinels` + "`" + ` s - WHERE s.region = ? - AND s.version > ? -) AS combined -ORDER BY combined.version ASC -LIMIT ? -` - -type ListClusterStateVersionsParams struct { - Region string `db:"region"` - AfterVersion uint64 `db:"after_version"` - Limit int32 `db:"limit"` -} - -type ListClusterStateVersionsRow struct { - Version uint64 `db:"version"` - Kind string `db:"kind"` -} - -// ListClusterStateVersions returns the next N (version, kind) pairs in global version order. -// Used to determine which resources to fetch for sync, without loading full row data. -// The 'kind' discriminator is 'deployment' or 'sentinel'. -// -// SELECT combined.version, combined.kind FROM ( -// SELECT dt.version, 'deployment' AS kind -// FROM `deployment_topology` dt -// WHERE dt.region = ? -// AND dt.version > ? -// UNION ALL -// SELECT s.version, 'sentinel' AS kind -// FROM `sentinels` s -// WHERE s.region = ? -// AND s.version > ? -// ) AS combined -// ORDER BY combined.version ASC -// LIMIT ? -func (q *Queries) ListClusterStateVersions(ctx context.Context, db DBTX, arg ListClusterStateVersionsParams) ([]ListClusterStateVersionsRow, error) { - rows, err := db.QueryContext(ctx, listClusterStateVersions, - arg.Region, - arg.AfterVersion, - arg.Region, - arg.AfterVersion, - arg.Limit, - ) - if err != nil { - return nil, err - } - defer rows.Close() - var items []ListClusterStateVersionsRow - for rows.Next() { - var i ListClusterStateVersionsRow - if err := rows.Scan(&i.Version, &i.Kind); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} diff --git a/pkg/db/deployment_topology_find_by_versions.sql_generated.go b/pkg/db/deployment_topology_find_by_versions.sql_generated.go index fadd4b3459..42ca3efbdc 100644 --- a/pkg/db/deployment_topology_find_by_versions.sql_generated.go +++ b/pkg/db/deployment_topology_find_by_versions.sql_generated.go @@ -8,10 +8,9 @@ package db import ( "context" "database/sql" - "strings" ) -const findDeploymentTopologyByVersions = `-- name: FindDeploymentTopologyByVersions :many +const listDeploymentTopologyByRegion = `-- name: ListDeploymentTopologyByRegion :many SELECT dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.command, d.status, d.created_at, d.updated_at, @@ -19,17 +18,25 @@ SELECT FROM ` + "`" + `deployment_topology` + "`" + ` dt INNER JOIN ` + "`" + `deployments` + "`" + ` d ON dt.deployment_id = d.id INNER JOIN ` + "`" + `workspaces` + "`" + ` w ON d.workspace_id = w.id -WHERE dt.version IN (/*SLICE:versions*/?) +WHERE dt.region = ? AND dt.version > ? +ORDER BY dt.version ASC +LIMIT ? ` -type FindDeploymentTopologyByVersionsRow struct { +type ListDeploymentTopologyByRegionParams struct { + Region string `db:"region"` + Afterversion uint64 `db:"afterversion"` + Limit int32 `db:"limit"` +} + +type ListDeploymentTopologyByRegionRow struct { DeploymentTopology DeploymentTopology `db:"deployment_topology"` Deployment Deployment `db:"deployment"` K8sNamespace sql.NullString `db:"k8s_namespace"` } -// FindDeploymentTopologyByVersions returns deployment topologies for specific versions. -// Used after ListClusterStateVersions to hydrate the full deployment data. +// ListDeploymentTopologyByRegion returns deployment topologies for a region with version > after_version. +// Used by WatchDeployments to stream deployment state changes to krane agents. // // SELECT // dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, @@ -38,26 +45,18 @@ type FindDeploymentTopologyByVersionsRow struct { // FROM `deployment_topology` dt // INNER JOIN `deployments` d ON dt.deployment_id = d.id // INNER JOIN `workspaces` w ON d.workspace_id = w.id -// WHERE dt.version IN (/*SLICE:versions*/?) -func (q *Queries) FindDeploymentTopologyByVersions(ctx context.Context, db DBTX, versions []uint64) ([]FindDeploymentTopologyByVersionsRow, error) { - query := findDeploymentTopologyByVersions - var queryParams []interface{} - if len(versions) > 0 { - for _, v := range versions { - queryParams = append(queryParams, v) - } - query = strings.Replace(query, "/*SLICE:versions*/?", strings.Repeat(",?", len(versions))[1:], 1) - } else { - query = strings.Replace(query, "/*SLICE:versions*/?", "NULL", 1) - } - rows, err := db.QueryContext(ctx, query, queryParams...) +// WHERE dt.region = ? AND dt.version > ? +// ORDER BY dt.version ASC +// LIMIT ? +func (q *Queries) ListDeploymentTopologyByRegion(ctx context.Context, db DBTX, arg ListDeploymentTopologyByRegionParams) ([]ListDeploymentTopologyByRegionRow, error) { + rows, err := db.QueryContext(ctx, listDeploymentTopologyByRegion, arg.Region, arg.Afterversion, arg.Limit) if err != nil { return nil, err } defer rows.Close() - var items []FindDeploymentTopologyByVersionsRow + var items []ListDeploymentTopologyByRegionRow for rows.Next() { - var i FindDeploymentTopologyByVersionsRow + var i ListDeploymentTopologyByRegionRow if err := rows.Scan( &i.DeploymentTopology.Pk, &i.DeploymentTopology.WorkspaceID, diff --git a/pkg/db/querier_generated.go b/pkg/db/querier_generated.go index 215b3fd9e2..19d6372f12 100644 --- a/pkg/db/querier_generated.go +++ b/pkg/db/querier_generated.go @@ -233,18 +233,6 @@ type Querier interface { // AND dt.deployment_id = ? // LIMIT 1 FindDeploymentTopologyByIDAndRegion(ctx context.Context, db DBTX, arg FindDeploymentTopologyByIDAndRegionParams) (FindDeploymentTopologyByIDAndRegionRow, error) - // FindDeploymentTopologyByVersions returns deployment topologies for specific versions. - // Used after ListClusterStateVersions to hydrate the full deployment data. - // - // SELECT - // dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, - // d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.command, d.status, d.created_at, d.updated_at, - // w.k8s_namespace - // FROM `deployment_topology` dt - // INNER JOIN `deployments` d ON dt.deployment_id = d.id - // INNER JOIN `workspaces` w ON d.workspace_id = w.id - // WHERE dt.version IN (/*SLICE:versions*/?) - FindDeploymentTopologyByVersions(ctx context.Context, db DBTX, versions []uint64) ([]FindDeploymentTopologyByVersionsRow, error) //FindEnvironmentById // // SELECT id, workspace_id, project_id, slug, description @@ -964,11 +952,6 @@ type Querier interface { // // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM sentinels WHERE environment_id = ? FindSentinelsByEnvironmentID(ctx context.Context, db DBTX, environmentID string) ([]Sentinel, error) - // FindSentinelsByVersions returns sentinels for specific versions. - // Used after ListClusterStateVersions to hydrate the full sentinel data. - // - // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM `sentinels` WHERE version IN (/*SLICE:versions*/?) - FindSentinelsByVersions(ctx context.Context, db DBTX, versions []uint64) ([]Sentinel, error) //FindWorkspaceByID // // SELECT id, org_id, name, slug, k8s_namespace, partition_id, plan, tier, stripe_customer_id, stripe_subscription_id, beta_features, features, subscriptions, enabled, delete_protection, created_at_m, updated_at_m, deleted_at_m FROM `workspaces` @@ -1656,24 +1639,20 @@ type Querier interface { // true // ) InsertWorkspace(ctx context.Context, db DBTX, arg InsertWorkspaceParams) error - // ListClusterStateVersions returns the next N (version, kind) pairs in global version order. - // Used to determine which resources to fetch for sync, without loading full row data. - // The 'kind' discriminator is 'deployment' or 'sentinel'. - // - // SELECT combined.version, combined.kind FROM ( - // SELECT dt.version, 'deployment' AS kind - // FROM `deployment_topology` dt - // WHERE dt.region = ? - // AND dt.version > ? - // UNION ALL - // SELECT s.version, 'sentinel' AS kind - // FROM `sentinels` s - // WHERE s.region = ? - // AND s.version > ? - // ) AS combined - // ORDER BY combined.version ASC + // ListDeploymentTopologyByRegion returns deployment topologies for a region with version > after_version. + // Used by WatchDeployments to stream deployment state changes to krane agents. + // + // SELECT + // dt.pk, dt.workspace_id, dt.deployment_id, dt.region, dt.desired_replicas, dt.version, dt.desired_status, dt.created_at, dt.updated_at, + // d.pk, d.id, d.k8s_name, d.workspace_id, d.project_id, d.environment_id, d.image, d.build_id, d.git_commit_sha, d.git_branch, d.git_commit_message, d.git_commit_author_handle, d.git_commit_author_avatar_url, d.git_commit_timestamp, d.sentinel_config, d.openapi_spec, d.cpu_millicores, d.memory_mib, d.desired_state, d.encrypted_environment_variables, d.command, d.status, d.created_at, d.updated_at, + // w.k8s_namespace + // FROM `deployment_topology` dt + // INNER JOIN `deployments` d ON dt.deployment_id = d.id + // INNER JOIN `workspaces` w ON d.workspace_id = w.id + // WHERE dt.region = ? AND dt.version > ? + // ORDER BY dt.version ASC // LIMIT ? - ListClusterStateVersions(ctx context.Context, db DBTX, arg ListClusterStateVersionsParams) ([]ListClusterStateVersionsRow, error) + ListDeploymentTopologyByRegion(ctx context.Context, db DBTX, arg ListDeploymentTopologyByRegionParams) ([]ListDeploymentTopologyByRegionRow, error) // ListDesiredDeploymentTopology returns all deployment topologies matching the desired state for a region. // Used during bootstrap to stream all running deployments to krane. // @@ -1995,6 +1974,14 @@ type Querier interface { // WHERE kr.key_id = ? // ORDER BY r.name ListRolesByKeyID(ctx context.Context, db DBTX, keyID string) ([]ListRolesByKeyIDRow, error) + // ListSentinelsByRegion returns sentinels for a region with version > after_version. + // Used by WatchSentinels to stream sentinel state changes to krane agents. + // + // SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM `sentinels` + // WHERE region = ? AND version > ? + // ORDER BY version ASC + // LIMIT ? + ListSentinelsByRegion(ctx context.Context, db DBTX, arg ListSentinelsByRegionParams) ([]Sentinel, error) // Returns state changes for watch loop. Includes 1-second visibility delay // to handle AUTO_INCREMENT gaps where sequence N+1 commits before N. // Clients filter by their region when fetching the actual resource. diff --git a/pkg/db/queries/cluster_state_versions.sql b/pkg/db/queries/cluster_state_versions.sql deleted file mode 100644 index 06c9068055..0000000000 --- a/pkg/db/queries/cluster_state_versions.sql +++ /dev/null @@ -1,17 +0,0 @@ --- name: ListClusterStateVersions :many --- ListClusterStateVersions returns the next N (version, kind) pairs in global version order. --- Used to determine which resources to fetch for sync, without loading full row data. --- The 'kind' discriminator is 'deployment' or 'sentinel'. -SELECT combined.version, combined.kind FROM ( - SELECT dt.version, 'deployment' AS kind - FROM `deployment_topology` dt - WHERE dt.region = sqlc.arg(region) - AND dt.version > sqlc.arg(after_version) - UNION ALL - SELECT s.version, 'sentinel' AS kind - FROM `sentinels` s - WHERE s.region = sqlc.arg(region) - AND s.version > sqlc.arg(after_version) -) AS combined -ORDER BY combined.version ASC -LIMIT ?; diff --git a/pkg/db/queries/deployment_topology_find_by_versions.sql b/pkg/db/queries/deployment_topology_find_by_versions.sql deleted file mode 100644 index 01b1e50883..0000000000 --- a/pkg/db/queries/deployment_topology_find_by_versions.sql +++ /dev/null @@ -1,11 +0,0 @@ --- name: FindDeploymentTopologyByVersions :many --- FindDeploymentTopologyByVersions returns deployment topologies for specific versions. --- Used after ListClusterStateVersions to hydrate the full deployment data. -SELECT - sqlc.embed(dt), - sqlc.embed(d), - w.k8s_namespace -FROM `deployment_topology` dt -INNER JOIN `deployments` d ON dt.deployment_id = d.id -INNER JOIN `workspaces` w ON d.workspace_id = w.id -WHERE dt.version IN (sqlc.slice(versions)); diff --git a/pkg/db/queries/deployment_topology_list_by_versions.sql b/pkg/db/queries/deployment_topology_list_by_versions.sql new file mode 100644 index 0000000000..4f9b819136 --- /dev/null +++ b/pkg/db/queries/deployment_topology_list_by_versions.sql @@ -0,0 +1,13 @@ +-- name: ListDeploymentTopologyByRegion :many +-- ListDeploymentTopologyByRegion returns deployment topologies for a region with version > after_version. +-- Used by WatchDeployments to stream deployment state changes to krane agents. +SELECT + sqlc.embed(dt), + sqlc.embed(d), + w.k8s_namespace +FROM `deployment_topology` dt +INNER JOIN `deployments` d ON dt.deployment_id = d.id +INNER JOIN `workspaces` w ON d.workspace_id = w.id +WHERE dt.region = sqlc.arg(region) AND dt.version > sqlc.arg(afterVersion) +ORDER BY dt.version ASC +LIMIT ?; diff --git a/pkg/db/queries/sentinel_find_by_versions.sql b/pkg/db/queries/sentinel_find_by_versions.sql index 182f47be3c..30e59e44ba 100644 --- a/pkg/db/queries/sentinel_find_by_versions.sql +++ b/pkg/db/queries/sentinel_find_by_versions.sql @@ -1,4 +1,7 @@ --- name: FindSentinelsByVersions :many --- FindSentinelsByVersions returns sentinels for specific versions. --- Used after ListClusterStateVersions to hydrate the full sentinel data. -SELECT * FROM `sentinels` WHERE version IN (sqlc.slice(versions)); +-- name: ListSentinelsByRegion :many +-- ListSentinelsByRegion returns sentinels for a region with version > after_version. +-- Used by WatchSentinels to stream sentinel state changes to krane agents. +SELECT * FROM `sentinels` +WHERE region = sqlc.arg(region) AND version > sqlc.arg(afterVersion) +ORDER BY version ASC +LIMIT ?; diff --git a/pkg/db/sentinel_find_by_versions.sql_generated.go b/pkg/db/sentinel_find_by_versions.sql_generated.go index cfdb36a2aa..315275ea5d 100644 --- a/pkg/db/sentinel_find_by_versions.sql_generated.go +++ b/pkg/db/sentinel_find_by_versions.sql_generated.go @@ -7,29 +7,30 @@ package db import ( "context" - "strings" ) -const findSentinelsByVersions = `-- name: FindSentinelsByVersions :many -SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM ` + "`" + `sentinels` + "`" + ` WHERE version IN (/*SLICE:versions*/?) +const listSentinelsByRegion = `-- name: ListSentinelsByRegion :many +SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM ` + "`" + `sentinels` + "`" + ` +WHERE region = ? AND version > ? +ORDER BY version ASC +LIMIT ? ` -// FindSentinelsByVersions returns sentinels for specific versions. -// Used after ListClusterStateVersions to hydrate the full sentinel data. +type ListSentinelsByRegionParams struct { + Region string `db:"region"` + Afterversion uint64 `db:"afterversion"` + Limit int32 `db:"limit"` +} + +// ListSentinelsByRegion returns sentinels for a region with version > after_version. +// Used by WatchSentinels to stream sentinel state changes to krane agents. // -// SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM `sentinels` WHERE version IN (/*SLICE:versions*/?) -func (q *Queries) FindSentinelsByVersions(ctx context.Context, db DBTX, versions []uint64) ([]Sentinel, error) { - query := findSentinelsByVersions - var queryParams []interface{} - if len(versions) > 0 { - for _, v := range versions { - queryParams = append(queryParams, v) - } - query = strings.Replace(query, "/*SLICE:versions*/?", strings.Repeat(",?", len(versions))[1:], 1) - } else { - query = strings.Replace(query, "/*SLICE:versions*/?", "NULL", 1) - } - rows, err := db.QueryContext(ctx, query, queryParams...) +// SELECT pk, id, workspace_id, project_id, environment_id, k8s_name, k8s_address, region, image, desired_state, health, desired_replicas, available_replicas, cpu_millicores, memory_mib, version, created_at, updated_at FROM `sentinels` +// WHERE region = ? AND version > ? +// ORDER BY version ASC +// LIMIT ? +func (q *Queries) ListSentinelsByRegion(ctx context.Context, db DBTX, arg ListSentinelsByRegionParams) ([]Sentinel, error) { + rows, err := db.QueryContext(ctx, listSentinelsByRegion, arg.Region, arg.Afterversion, arg.Limit) if err != nil { return nil, err } diff --git a/svc/ctrl/proto/ctrl/v1/cluster.proto b/svc/ctrl/proto/ctrl/v1/cluster.proto index 7400e9e773..b040c9459b 100644 --- a/svc/ctrl/proto/ctrl/v1/cluster.proto +++ b/svc/ctrl/proto/ctrl/v1/cluster.proto @@ -23,19 +23,39 @@ enum Health { // ClusterService coordinates deployment and sentinel configurations across multiple clusters. // -// Agents in each cluster establish a watch stream to receive configuration events -// from the control plane. The service streams deployment and sentinel lifecycle events -// (apply, delete) to the appropriate clusters based on their cluster_id and region. +// Agents in each cluster establish watch streams to receive configuration events +// from the control plane. The service provides separate streams for deployments and +// sentinels, each with independent version cursors. This allows the two control loops +// to operate independently with isolated failure domains. // -// The watch connection is designed to be long-lived with automatic reconnection on failure. -// When an agent reconnects, it should initiate reconciliation to ensure consistency. +// The watch connections are designed to be long-lived with automatic reconnection on failure. +// When an agent reconnects, it should resume from its last seen version for that resource type. service ClusterService { - rpc Sync(SyncRequest) returns (stream State); + // WatchDeployments streams deployment state changes from the control plane to agents. + // Each deployment controller maintains its own version cursor for resumable streaming. + // The agent applies received state to Kubernetes to converge actual state toward desired state. + rpc WatchDeployments(WatchDeploymentsRequest) returns (stream DeploymentState); + + // WatchSentinels streams sentinel state changes from the control plane to agents. + // Each sentinel controller maintains its own version cursor for resumable streaming. + // The agent applies received state to Kubernetes to converge actual state toward desired state. + rpc WatchSentinels(WatchSentinelsRequest) returns (stream SentinelState); + + // GetDesiredSentinelState returns the current desired state for a single sentinel. + // Used by the resync loop to verify consistency for existing resources. rpc GetDesiredSentinelState(GetDesiredSentinelStateRequest) returns (SentinelState); - rpc UpdateSentinelState(UpdateSentinelStateRequest) returns (UpdateSentinelStateResponse); + // ReportSentinelStatus reports actual sentinel state from the agent to the control plane. + // Called when K8s watch events indicate sentinel Deployment changes. + rpc ReportSentinelStatus(ReportSentinelStatusRequest) returns (ReportSentinelStatusResponse); + + // GetDesiredDeploymentState returns the current desired state for a single deployment. + // Used by the resync loop to verify consistency for existing resources. rpc GetDesiredDeploymentState(GetDesiredDeploymentStateRequest) returns (DeploymentState); - rpc UpdateDeploymentState(UpdateDeploymentStateRequest) returns (UpdateDeploymentStateResponse); + + // ReportDeploymentStatus reports actual deployment state from the agent to the control plane. + // Called when K8s watch events indicate ReplicaSet changes. + rpc ReportDeploymentStatus(ReportDeploymentStatusRequest) returns (ReportDeploymentStatusResponse); } message GetDesiredSentinelStateRequest { @@ -46,7 +66,9 @@ message GetDesiredDeploymentStateRequest { string deployment_id = 1; } -message UpdateDeploymentStateRequest { +// ReportDeploymentStatusRequest reports the actual state of a deployment from the agent. +// Used by runActualStateReportLoop to inform the control plane of K8s cluster state. +message ReportDeploymentStatusRequest { message Update { message Instance { enum Status { @@ -75,33 +97,32 @@ message UpdateDeploymentStateRequest { } } -message UpdateDeploymentStateResponse {} - -message UpdateInstanceStateResponse {} +message ReportDeploymentStatusResponse {} -message UpdateSentinelStateRequest { +// ReportSentinelStatusRequest reports the actual state of a sentinel from the agent. +// Used by runActualStateReportLoop to inform the control plane of K8s cluster state. +message ReportSentinelStatusRequest { string k8s_name = 1; int32 available_replicas = 2; Health health = 3; } -message UpdateSentinelStateResponse {} +message ReportSentinelStatusResponse {} -message SyncRequest { +// WatchDeploymentsRequest initiates a stream of deployment state changes. +// The version_last_seen enables resumable streaming - the server will only send +// events newer than this version. Independent of the sentinel stream version. +message WatchDeploymentsRequest { string region = 1; uint64 version_last_seen = 2; } -message State { - // version is the resource version for this state update. - // Clients should track the max version seen and persist it after - // the stream closes cleanly to resume from the correct position on reconnect. - uint64 version = 1; - - oneof kind { - DeploymentState deployment = 2; - SentinelState sentinel = 3; - } +// WatchSentinelsRequest initiates a stream of sentinel state changes. +// The version_last_seen enables resumable streaming - the server will only send +// events newer than this version. Independent of the deployment stream version. +message WatchSentinelsRequest { + string region = 1; + uint64 version_last_seen = 2; } // SentinelState represents a lifecycle event for an API sentinel configuration. @@ -110,6 +131,12 @@ message State { // and API management. The event follows a declarative model where the cluster agent ensures // the cluster state matches the desired configuration. message SentinelState { + // version is the sentinel-specific resource version for this state update. + // Clients should track the max version seen and use it when reconnecting to + // the WatchSentinels stream to resume from the correct position. + // When returned from GetDesiredSentinelState, this field is not set. + uint64 version = 3; + // state contains the specific sentinel operation to perform. // Only one state type is set per message, determining the action the agent should take. oneof state { @@ -130,6 +157,12 @@ message SentinelState { // that can be scaled horizontally. The event follows a declarative model where // the cluster agent ensures the cluster state matches the desired configuration. message DeploymentState { + // version is the deployment-specific resource version for this state update. + // Clients should track the max version seen and use it when reconnecting to + // the WatchDeployments stream to resume from the correct position. + // When returned from GetDesiredDeploymentState, this field is not set. + uint64 version = 3; + // state contains the specific deployment operation to perform. // Only one state type is set per message, determining the action the agent should take. oneof state { diff --git a/svc/ctrl/services/cluster/BUILD.bazel b/svc/ctrl/services/cluster/BUILD.bazel index 2cbf831eb3..28aff134ba 100644 --- a/svc/ctrl/services/cluster/BUILD.bazel +++ b/svc/ctrl/services/cluster/BUILD.bazel @@ -7,9 +7,10 @@ go_library( "doc.go", "rpc_get_desired_deployment_state.go", "rpc_get_desired_sentinel_state.go", - "rpc_sync.go", - "rpc_update_deployment_state.go", - "rpc_update_sentinel_state.go", + "rpc_report_deployment_status.go", + "rpc_report_sentinel_status.go", + "rpc_watch_deployments.go", + "rpc_watch_sentinels.go", "service.go", ], importpath = "github.com/unkeyed/unkey/svc/ctrl/services/cluster", diff --git a/svc/ctrl/services/cluster/rpc_update_deployment_state.go b/svc/ctrl/services/cluster/rpc_report_deployment_status.go similarity index 73% rename from svc/ctrl/services/cluster/rpc_update_deployment_state.go rename to svc/ctrl/services/cluster/rpc_report_deployment_status.go index 307a7b9532..93ba67b7aa 100644 --- a/svc/ctrl/services/cluster/rpc_update_deployment_state.go +++ b/svc/ctrl/services/cluster/rpc_report_deployment_status.go @@ -10,7 +10,7 @@ import ( "github.com/unkeyed/unkey/pkg/uid" ) -// UpdateDeploymentState reconciles the observed deployment state reported by a krane agent. +// ReportDeploymentStatus reconciles the observed deployment state reported by a krane agent. // This is the feedback loop for convergence: agents report what's actually running so the // control plane can track instance health and detect drift. // @@ -20,9 +20,8 @@ import ( // to handle transient database errors. // // Requires bearer token authentication and the X-Krane-Region header. -func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Request[ctrlv1.UpdateDeploymentStateRequest]) (*connect.Response[ctrlv1.UpdateDeploymentStateResponse], error) { - s.logger.Info("updating deployment state", "req", req.Msg) - //"update:{k8s_name:\"pgeywtmuengq\" instances:{k8s_name:\"pgeywtmuengq-kdfvj\" address:\"192-168-194-33.uzapavou.pod.cluster.local\" status:STATUS_RUNNING}}" +func (s *Service) ReportDeploymentStatus(ctx context.Context, req *connect.Request[ctrlv1.ReportDeploymentStatusRequest]) (*connect.Response[ctrlv1.ReportDeploymentStatusResponse], error) { + s.logger.Info("reporting deployment status", "req", req.Msg) if err := s.authenticate(req); err != nil { return nil, err @@ -39,7 +38,7 @@ func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Reques err = db.TxRetry(ctx, s.db.RW(), func(txCtx context.Context, tx db.DBTX) error { switch msg := req.Msg.GetChange().(type) { - case *ctrlv1.UpdateDeploymentStateRequest_Update_: + case *ctrlv1.ReportDeploymentStatusRequest_Update_: { deployment, err := db.Query.FindDeploymentByK8sName(ctx, tx, msg.Update.GetK8SName()) if err != nil { @@ -54,7 +53,7 @@ func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Reques return err } - wantInstanceNames := map[string]*ctrlv1.UpdateDeploymentStateRequest_Update_Instance{} + wantInstanceNames := map[string]*ctrlv1.ReportDeploymentStatusRequest_Update_Instance{} for _, instance := range msg.Update.GetInstances() { wantInstanceNames[instance.GetK8SName()] = instance } @@ -90,7 +89,7 @@ func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Reques } } - case *ctrlv1.UpdateDeploymentStateRequest_Delete_: + case *ctrlv1.ReportDeploymentStatusRequest_Delete_: { deployment, err := db.Query.FindDeploymentByK8sName(ctx, tx, msg.Delete.GetK8SName()) @@ -111,21 +110,21 @@ func (s *Service) UpdateDeploymentState(ctx context.Context, req *connect.Reques return nil }) - return connect.NewResponse(&ctrlv1.UpdateDeploymentStateResponse{}), err + return connect.NewResponse(&ctrlv1.ReportDeploymentStatusResponse{}), err } // ctrlDeploymentStatusToDbStatus maps proto instance status to database enum values. // Unspecified or unknown statuses are treated as inactive. -func ctrlDeploymentStatusToDbStatus(status ctrlv1.UpdateDeploymentStateRequest_Update_Instance_Status) db.InstancesStatus { +func ctrlDeploymentStatusToDbStatus(status ctrlv1.ReportDeploymentStatusRequest_Update_Instance_Status) db.InstancesStatus { switch status { - case ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_UNSPECIFIED: + case ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_UNSPECIFIED: return db.InstancesStatusInactive - case ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_PENDING: + case ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_PENDING: return db.InstancesStatusPending - case ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_RUNNING: + case ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_RUNNING: return db.InstancesStatusRunning - case ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_FAILED: + case ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_FAILED: return db.InstancesStatusFailed default: return db.InstancesStatusInactive diff --git a/svc/ctrl/services/cluster/rpc_update_sentinel_state.go b/svc/ctrl/services/cluster/rpc_report_sentinel_status.go similarity index 81% rename from svc/ctrl/services/cluster/rpc_update_sentinel_state.go rename to svc/ctrl/services/cluster/rpc_report_sentinel_status.go index 98810a2ddf..27096f4eed 100644 --- a/svc/ctrl/services/cluster/rpc_update_sentinel_state.go +++ b/svc/ctrl/services/cluster/rpc_report_sentinel_status.go @@ -11,13 +11,13 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) -// UpdateSentinelState records the observed replica count for a sentinel as reported by a +// ReportSentinelStatus records the observed replica count for a sentinel as reported by a // krane agent. This updates the available_replicas and health fields in the database, // allowing the control plane to track which sentinels are actually running and healthy. // A sentinel is considered healthy if it has at least one available replica. // // Requires bearer token authentication and the X-Krane-Region header. -func (s *Service) UpdateSentinelState(ctx context.Context, req *connect.Request[ctrlv1.UpdateSentinelStateRequest]) (*connect.Response[ctrlv1.UpdateSentinelStateResponse], error) { +func (s *Service) ReportSentinelStatus(ctx context.Context, req *connect.Request[ctrlv1.ReportSentinelStatusRequest]) (*connect.Response[ctrlv1.ReportSentinelStatusResponse], error) { if err := s.authenticate(req); err != nil { return nil, err @@ -53,6 +53,6 @@ func (s *Service) UpdateSentinelState(ctx context.Context, req *connect.Request[ return nil, connect.NewError(connect.CodeInternal, err) } - return connect.NewResponse(&ctrlv1.UpdateSentinelStateResponse{}), nil + return connect.NewResponse(&ctrlv1.ReportSentinelStatusResponse{}), nil } diff --git a/svc/ctrl/services/cluster/rpc_sync.go b/svc/ctrl/services/cluster/rpc_sync.go deleted file mode 100644 index 4c9dc53281..0000000000 --- a/svc/ctrl/services/cluster/rpc_sync.go +++ /dev/null @@ -1,226 +0,0 @@ -package cluster - -import ( - "context" - "fmt" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/db" -) - -const syncBatchSize = 100 - -// Sync streams cluster state to a krane agent for the given region. -// -// Each resource carries its actual version, so clients track max(seen versions). -// -// IMPORTANT: Clients must only commit their version tracking after a clean stream -// close. This ensures atomic bootstrap: if a stream breaks mid-bootstrap, the client -// retries from version 0 rather than skipping resources that were never received. -// -// After bootstrap (versionLastSeen=0), clients should garbage-collect any k8s -// resources not mentioned in the bootstrap stream. -// -// Sync is a bounded catch-up stream. The server stops after sending a batch of -// changes; clients reconnect to continue from their last-seen version. -func (s *Service) Sync(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest], stream *connect.ServerStream[ctrlv1.State]) error { - region := req.Msg.GetRegion() - versionLastSeen := req.Msg.GetVersionLastSeen() - - s.logger.Info("sync request received", - "region", region, - "versionLastSeen", versionLastSeen, - ) - - if err := s.streamStateAfterVersion(ctx, region, versionLastSeen, stream); err != nil { - return connect.NewError(connect.CodeInternal, fmt.Errorf("stream state region=%q after_version=%d: %w", region, versionLastSeen, err)) - } - - return nil -} - -// streamStateAfterVersion streams all resources with version > afterVersion in global version order. -// It uses a three-step approach: -// 1. Query the next batch of (version, kind) pairs in global order (lightweight UNION ALL) -// 2. Partition versions by kind and hydrate full data with targeted queries -// 3. Merge results by version and stream to the client -// -// As you can see, this is not terribly efficient, but it's easy and will do just fine for now. -// Later we can probably split it up and do 2 separete streams, one for deployments and one for sentinels -func (s *Service) streamStateAfterVersion(ctx context.Context, region string, afterVersion uint64, stream *connect.ServerStream[ctrlv1.State]) error { - for { - // Step 1: Get next batch of versions in global order - versionRows, err := db.Query.ListClusterStateVersions(ctx, s.db.RO(), db.ListClusterStateVersionsParams{ - Region: region, - AfterVersion: afterVersion, - Limit: int32(syncBatchSize), - }) - if err != nil { - return fmt.Errorf("list cluster state versions after_version=%d: %w", afterVersion, err) - } - - if len(versionRows) == 0 { - return nil - } - - // Step 2: Partition versions by kind - var deploymentVersions, sentinelVersions []uint64 - for _, row := range versionRows { - switch row.Kind { - case "deployment": - deploymentVersions = append(deploymentVersions, row.Version) - case "sentinel": - sentinelVersions = append(sentinelVersions, row.Version) - } - } - - // Step 3: Hydrate full data - deploymentsByVersion := make(map[uint64]db.FindDeploymentTopologyByVersionsRow) - if len(deploymentVersions) > 0 { - topologies, err := db.Query.FindDeploymentTopologyByVersions(ctx, s.db.RO(), deploymentVersions) - if err != nil { - return fmt.Errorf("find deployment topologies by versions: %w", err) - } - for _, t := range topologies { - deploymentsByVersion[t.DeploymentTopology.Version] = t - } - } - - sentinelsByVersion := make(map[uint64]db.Sentinel) - if len(sentinelVersions) > 0 { - sentinels, err := db.Query.FindSentinelsByVersions(ctx, s.db.RO(), sentinelVersions) - if err != nil { - return fmt.Errorf("find sentinels by versions: %w", err) - } - for _, sentinel := range sentinels { - sentinelsByVersion[sentinel.Version] = sentinel - } - } - - // Step 4: Stream in global version order - for _, row := range versionRows { - var state *ctrlv1.State - - switch row.Kind { - case "deployment": - topology, ok := deploymentsByVersion[row.Version] - if !ok { - return fmt.Errorf("deployment topology version=%d not found after hydration", row.Version) - } - state = s.deploymentTopologyToState(topology) - - case "sentinel": - sentinel, ok := sentinelsByVersion[row.Version] - if !ok { - return fmt.Errorf("sentinel version=%d not found after hydration", row.Version) - } - state = s.sentinelToState(sentinel) - } - - if err := stream.Send(state); err != nil { - return fmt.Errorf("send state version=%d kind=%s: %w", row.Version, row.Kind, err) - } - } - - // Update afterVersion for next iteration - afterVersion = versionRows[len(versionRows)-1].Version - - // If we got fewer than batch size, we've reached the end - if len(versionRows) < syncBatchSize { - return nil - } - } -} - -// deploymentTopologyToState converts a deployment topology row to a State message. -// If the deployment should not be running (replicas=0 or stopped), it returns a Delete. -func (s *Service) deploymentTopologyToState(topology db.FindDeploymentTopologyByVersionsRow) *ctrlv1.State { - if topology.DeploymentTopology.DesiredReplicas == 0 || - topology.DeploymentTopology.DesiredStatus == db.DeploymentTopologyDesiredStatusStopped || - topology.DeploymentTopology.DesiredStatus == db.DeploymentTopologyDesiredStatusStopping { - return &ctrlv1.State{ - Version: topology.DeploymentTopology.Version, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: topology.K8sNamespace.String, - K8SName: topology.Deployment.K8sName, - }, - }, - }, - }, - } - } - - var buildID *string - if topology.Deployment.BuildID.Valid { - buildID = &topology.Deployment.BuildID.String - } - - return &ctrlv1.State{ - Version: topology.DeploymentTopology.Version, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - K8SNamespace: topology.K8sNamespace.String, - K8SName: topology.Deployment.K8sName, - WorkspaceId: topology.Deployment.WorkspaceID, - EnvironmentId: topology.Deployment.EnvironmentID, - ProjectId: topology.Deployment.ProjectID, - DeploymentId: topology.Deployment.ID, - Image: topology.Deployment.Image.String, - Replicas: topology.DeploymentTopology.DesiredReplicas, - CpuMillicores: int64(topology.Deployment.CpuMillicores), - MemoryMib: int64(topology.Deployment.MemoryMib), - EncryptedEnvironmentVariables: topology.Deployment.EncryptedEnvironmentVariables, - BuildId: buildID, - }, - }, - }, - }, - } -} - -// sentinelToState converts a sentinel row to a State message. -// If the sentinel should not be running (replicas=0 or not running state), -// it returns a Delete instruction. Otherwise, it returns an Apply instruction. -func (s *Service) sentinelToState(sentinel db.Sentinel) *ctrlv1.State { - if sentinel.DesiredReplicas == 0 || sentinel.DesiredState != db.SentinelsDesiredStateRunning { - return &ctrlv1.State{ - Version: sentinel.Version, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Delete{ - Delete: &ctrlv1.DeleteSentinel{ - K8SName: sentinel.K8sName, - }, - }, - }, - }, - } - } - - return &ctrlv1.State{ - Version: sentinel.Version, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - K8SName: sentinel.K8sName, - WorkspaceId: sentinel.WorkspaceID, - EnvironmentId: sentinel.EnvironmentID, - ProjectId: sentinel.ProjectID, - SentinelId: sentinel.ID, - Image: sentinel.Image, - Replicas: sentinel.DesiredReplicas, - CpuMillicores: int64(sentinel.CpuMillicores), - MemoryMib: int64(sentinel.MemoryMib), - }, - }, - }, - }, - } -} diff --git a/svc/ctrl/services/cluster/rpc_watch_deployments.go b/svc/ctrl/services/cluster/rpc_watch_deployments.go new file mode 100644 index 0000000000..d94170bf3f --- /dev/null +++ b/svc/ctrl/services/cluster/rpc_watch_deployments.go @@ -0,0 +1,128 @@ +package cluster + +import ( + "context" + "time" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/assert" + "github.com/unkeyed/unkey/pkg/db" +) + +// WatchDeployments streams deployment state changes from the control plane to agents. +// Each deployment controller maintains its own version cursor for resumable streaming. +// The agent applies received state to Kubernetes to converge actual state toward desired state. +// +// This is a long-lived streaming RPC. The server polls the database for new deployment +// versions and streams them to the client. The client should track the max version seen +// and reconnect with that version to resume from where it left off. +func (s *Service) WatchDeployments( + ctx context.Context, + req *connect.Request[ctrlv1.WatchDeploymentsRequest], + stream *connect.ServerStream[ctrlv1.DeploymentState], +) error { + if err := s.authenticate(req); err != nil { + return err + } + + region := req.Msg.GetRegion() + if err := assert.NotEmpty(region, "region is required"); err != nil { + return connect.NewError(connect.CodeInvalidArgument, err) + } + + versionCursor := req.Msg.GetVersionLastSeen() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + states, err := s.fetchDeploymentStates(ctx, region, versionCursor) + if err != nil { + s.logger.Error("failed to fetch deployment states", "error", err) + return connect.NewError(connect.CodeInternal, err) + } + + for _, state := range states { + if err := stream.Send(state); err != nil { + return err + } + if state.GetVersion() > versionCursor { + versionCursor = state.GetVersion() + } + } + + if len(states) == 0 { + time.Sleep(time.Second) + } + } +} + +func (s *Service) fetchDeploymentStates(ctx context.Context, region string, afterVersion uint64) ([]*ctrlv1.DeploymentState, error) { + rows, err := db.Query.ListDeploymentTopologyByRegion(ctx, s.db.RO(), db.ListDeploymentTopologyByRegionParams{ + Region: region, + Afterversion: afterVersion, + Limit: 100, + }) + if err != nil { + return nil, err + } + + states := make([]*ctrlv1.DeploymentState, 0, len(rows)) + for _, row := range rows { + state, err := s.deploymentRowToState(row) + if err != nil { + s.logger.Error("failed to convert deployment row to state", "error", err, "deploymentId", row.Deployment.ID) + continue + } + states = append(states, state) + } + + return states, nil +} + +func (s *Service) deploymentRowToState(row db.ListDeploymentTopologyByRegionRow) (*ctrlv1.DeploymentState, error) { + switch row.DeploymentTopology.DesiredStatus { + case db.DeploymentTopologyDesiredStatusStopped, db.DeploymentTopologyDesiredStatusStopping: + return &ctrlv1.DeploymentState{ + Version: row.DeploymentTopology.Version, + State: &ctrlv1.DeploymentState_Delete{ + Delete: &ctrlv1.DeleteDeployment{ + K8SNamespace: row.K8sNamespace.String, + K8SName: row.Deployment.K8sName, + }, + }, + }, nil + case db.DeploymentTopologyDesiredStatusStarted, db.DeploymentTopologyDesiredStatusStarting: + var buildID *string + if row.Deployment.BuildID.Valid { + buildID = &row.Deployment.BuildID.String + } + + return &ctrlv1.DeploymentState{ + Version: row.DeploymentTopology.Version, + State: &ctrlv1.DeploymentState_Apply{ + Apply: &ctrlv1.ApplyDeployment{ + DeploymentId: row.Deployment.ID, + K8SNamespace: row.K8sNamespace.String, + K8SName: row.Deployment.K8sName, + WorkspaceId: row.Deployment.WorkspaceID, + ProjectId: row.Deployment.ProjectID, + EnvironmentId: row.Deployment.EnvironmentID, + Replicas: row.DeploymentTopology.DesiredReplicas, + Image: row.Deployment.Image.String, + CpuMillicores: int64(row.Deployment.CpuMillicores), + MemoryMib: int64(row.Deployment.MemoryMib), + EncryptedEnvironmentVariables: row.Deployment.EncryptedEnvironmentVariables, + BuildId: buildID, + }, + }, + }, nil + default: + s.logger.Error("unhandled deployment topology desired status", "status", row.DeploymentTopology.DesiredStatus) + return nil, nil + } +} diff --git a/svc/ctrl/services/cluster/rpc_watch_sentinels.go b/svc/ctrl/services/cluster/rpc_watch_sentinels.go new file mode 100644 index 0000000000..424945aff1 --- /dev/null +++ b/svc/ctrl/services/cluster/rpc_watch_sentinels.go @@ -0,0 +1,117 @@ +package cluster + +import ( + "context" + "time" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/assert" + "github.com/unkeyed/unkey/pkg/db" +) + +// WatchSentinels streams sentinel state changes from the control plane to agents. +// Each sentinel controller maintains its own version cursor for resumable streaming. +// The agent applies received state to Kubernetes to converge actual state toward desired state. +// +// This is a long-lived streaming RPC. The server polls the database for new sentinel +// versions and streams them to the client. The client should track the max version seen +// and reconnect with that version to resume from where it left off. +func (s *Service) WatchSentinels( + ctx context.Context, + req *connect.Request[ctrlv1.WatchSentinelsRequest], + stream *connect.ServerStream[ctrlv1.SentinelState], +) error { + if err := s.authenticate(req); err != nil { + return err + } + + region := req.Msg.GetRegion() + if err := assert.NotEmpty(region, "region is required"); err != nil { + return connect.NewError(connect.CodeInvalidArgument, err) + } + + versionCursor := req.Msg.GetVersionLastSeen() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + states, err := s.fetchSentinelStates(ctx, region, versionCursor) + if err != nil { + s.logger.Error("failed to fetch sentinel states", "error", err) + return connect.NewError(connect.CodeInternal, err) + } + + for _, state := range states { + if err := stream.Send(state); err != nil { + return err + } + if state.GetVersion() > versionCursor { + versionCursor = state.GetVersion() + } + } + + if len(states) == 0 { + time.Sleep(time.Second) + } + } +} + +func (s *Service) fetchSentinelStates(ctx context.Context, region string, afterVersion uint64) ([]*ctrlv1.SentinelState, error) { + rows, err := db.Query.ListSentinelsByRegion(ctx, s.db.RO(), db.ListSentinelsByRegionParams{ + Region: region, + Afterversion: afterVersion, + Limit: 100, + }) + if err != nil { + return nil, err + } + + states := make([]*ctrlv1.SentinelState, 0, len(rows)) + for _, row := range rows { + state := s.sentinelRowToState(row) + if state != nil { + states = append(states, state) + } + } + + return states, nil +} + +func (s *Service) sentinelRowToState(sentinel db.Sentinel) *ctrlv1.SentinelState { + switch sentinel.DesiredState { + case db.SentinelsDesiredStateArchived, db.SentinelsDesiredStateStandby: + return &ctrlv1.SentinelState{ + Version: sentinel.Version, + State: &ctrlv1.SentinelState_Delete{ + Delete: &ctrlv1.DeleteSentinel{ + K8SName: sentinel.K8sName, + }, + }, + } + case db.SentinelsDesiredStateRunning: + return &ctrlv1.SentinelState{ + Version: sentinel.Version, + State: &ctrlv1.SentinelState_Apply{ + Apply: &ctrlv1.ApplySentinel{ + SentinelId: sentinel.ID, + K8SName: sentinel.K8sName, + WorkspaceId: sentinel.WorkspaceID, + ProjectId: sentinel.ProjectID, + EnvironmentId: sentinel.EnvironmentID, + Replicas: sentinel.DesiredReplicas, + Image: sentinel.Image, + CpuMillicores: int64(sentinel.CpuMillicores), + MemoryMib: int64(sentinel.MemoryMib), + }, + }, + } + default: + s.logger.Error("unhandled sentinel desired state", "desiredState", sentinel.DesiredState) + return nil + } +} diff --git a/svc/krane/BUILD.bazel b/svc/krane/BUILD.bazel index 20b60bf750..c7c23eedde 100644 --- a/svc/krane/BUILD.bazel +++ b/svc/krane/BUILD.bazel @@ -18,7 +18,8 @@ go_library( "//pkg/vault", "//pkg/vault/storage", "//pkg/version", - "//svc/krane/internal/reconciler", + "//svc/krane/internal/deployment", + "//svc/krane/internal/sentinel", "//svc/krane/pkg/controlplane", "//svc/krane/secrets", "//svc/krane/secrets/token", diff --git a/svc/krane/DOCUMENTATION_SUMMARY.md b/svc/krane/DOCUMENTATION_SUMMARY.md deleted file mode 100644 index 0d2cec3919..0000000000 --- a/svc/krane/DOCUMENTATION_SUMMARY.md +++ /dev/null @@ -1,130 +0,0 @@ -# Documentation Summary for apps/krane Package - -## Overview - -The `apps/krane` folder contains the complete implementation of the Krane distributed -container orchestration system. Krane serves as a node-level agent that synchronizes -desired state from a central control plane with actual state in Kubernetes clusters. - -## Package Structure - -``` -apps/krane/ -├── config.go # Main configuration for krane agent -├── doc.go # Package documentation -├── run.go # Agent entry point and lifecycle management -├── pkg/ # Shared utilities and libraries -│ ├── controlplane/ # Control plane client and streaming -│ └── k8s/ # Kubernetes utilities and helpers -└── sentinel_controller/ # Sentinel resource management - ├── api/v1/ # Sentinel CRD types - ├── reconciler/ # Kubernetes reconciliation logic - ├── reflector/ # Database-to-Kubernetes sync - ├── status/ # Status reporting to control plane - ├── yaml/ # Kubernetes manifests - └── doc.go # Sentinel controller documentation -``` - -## Key Components - -### Main Package (`apps/krane/`) -- **Config**: Central configuration for krane agent instances -- **Run**: Main entry point that orchestrates all system components -- **Architecture**: Distributed container orchestration with control plane synchronization - -### Control Plane Integration (`pkg/controlplane/`) -- **Client**: gRPC client with automatic authentication and metadata -- **Watcher**: Event streaming with live sync and periodic reconciliation -- **Interceptor**: Request/response metadata injection for routing - -### Kubernetes Integration (`pkg/k8s/`) -- **Labels**: Fluent label builder with standardized conventions -- **Client/Manager**: In-cluster Kubernetes client initialization -- **Reconciler**: Common interface for reconciliation operations -- **Logger**: OpenTelemetry to controller-runtime logging bridge - -### Sentinel Management (`sentinel_controller/`) -- **Custom Resources**: Sentinel CRD with complete API types -- **Reconciliation**: Standard controller-runtime pattern for resource lifecycle -- **Reflector**: Database event streaming to Kubernetes resources -- **Status Reporting**: Bidirectional sync with control plane - -## Key Design Patterns - -### Hybrid Architecture -Krane uses a unique hybrid approach combining: -- **Event Push**: Control plane streams desired state changes -- **Standard Reconciliation**: Kubernetes controller-runtime for resource management -- **Periodic Sync**: Ensures consistency despite missed events -- **Status Feedback**: Operational metrics reported back to control plane - -### Multi-Tenancy Support -All resources are scoped with consistent label hierarchy: -- `unkey.com/workspace.id`: Tenant workspace -- `unkey.com/project.id`: Project within workspace -- `unkey.com/environment.id`: Deployment environment -- `app.kubernetes.io/managed-by`: Resource ownership - -### Resilience Patterns -- Circuit breakers for control plane availability -- Event buffering for network interruptions -- Exponential backoff with jitter for reconnections -- Graceful shutdown with resource cleanup - -### Observability Integration -- OpenTelemetry for structured logging and metrics -- Prometheus metrics exposure for monitoring -- Distributed tracing across control plane and cluster -- Kubernetes event integration for operational visibility - -## Documentation Coverage Status - -### ✅ Fully Documented -- **Package-level documentation**: Comprehensive doc.go files for all packages -- **Exported types**: All structs, interfaces, and constants documented -- **Public functions**: Complete documentation with parameters and behavior -- **Architecture explanations**: Design decisions and integration patterns -- **Error handling**: Failure modes and recovery strategies -- **Usage examples**: Practical implementation guidance - -### ✅ Key Features Documented -- **Label management**: Builder pattern with immutable operations -- **Control plane streaming**: Live sync and periodic reconciliation -- **Kubernetes integration**: In-cluster configuration and controllers -- **Custom resources**: Complete CRD specification and examples -- **Status reporting**: Bidirectional synchronization with metrics - -### ✅ Compliance with Guidelines -- **Package documentation**: Dedicated doc.go files for all packages -- **Function documentation**: What, when, why, watch-out patterns -- **Type documentation**: Field purposes and constraints -- **Cross-references**: Proper [Type] and [Function] linking -- **Examples**: Real-world usage patterns and configurations -- **Non-obvious behavior**: Architectural decisions and trade-offs - -## Implementation Quality - -The codebase demonstrates excellent software engineering practices: -- **Clear separation of concerns** with well-defined package boundaries -- **Consistent interfaces** enabling different implementation strategies -- **Comprehensive error handling** with graceful degradation -- **Production-ready resilience** with circuit breakers and retries -- **Observability-first design** with structured logging and metrics -- **Kubernetes best practices** with standard label conventions -- **Type safety** with proper use of generics and interfaces - -## Conclusion - -The `apps/krane` folder is comprehensively documented and represents a mature, -production-ready distributed orchestration system. The documentation provides -complete understanding of: - -- System architecture and component interactions -- Usage patterns for different deployment scenarios -- Integration points with external systems -- Operational characteristics and failure modes -- Design rationale and engineering trade-offs - -All documentation follows Go documentation guidelines with appropriate depth -matching code complexity, practical examples, and comprehensive -cross-references between related components. \ No newline at end of file diff --git a/svc/krane/doc.go b/svc/krane/doc.go index f5f913fe1c..a5d11ed912 100644 --- a/svc/krane/doc.go +++ b/svc/krane/doc.go @@ -9,9 +9,22 @@ // // # Architecture // -// The system consists of three main components: -// - Control Plane: External service that makes gRPC calls to krane agents -// - Krane Agents: Node-level agents that expose gRPC APIs for orchestration +// Krane uses a split control loop architecture with two independent controllers: +// +// - [deployment.Controller]: Manages user workload ReplicaSets via the +// SyncDeployments stream. Has its own version cursor and circuit breaker. +// +// - [sentinel.Controller]: Manages sentinel Deployments and Services via the +// SyncSentinels stream. Has its own version cursor and circuit breaker. +// +// This separation provides failure isolation: if one controller experiences errors, +// the other continues operating independently. Each controller maintains its own +// connection to the control plane with separate version cursors for resumable +// streaming. +// +// The system consists of these main components: +// - Control Plane: External service that streams state to krane agents +// - Krane Agents: Node-level agents with independent deployment/sentinel controllers // - Kubernetes Cluster: Target infrastructure where containers are deployed // // Each krane instance is identified by a unique InstanceID. The agent uses in-cluster diff --git a/svc/krane/internal/deployment/BUILD.bazel b/svc/krane/internal/deployment/BUILD.bazel new file mode 100644 index 0000000000..f148fe0f1a --- /dev/null +++ b/svc/krane/internal/deployment/BUILD.bazel @@ -0,0 +1,61 @@ +load("@rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "deployment", + srcs = [ + "actual_state_report.go", + "apply.go", + "consts.go", + "controller.go", + "delete.go", + "desired_state_apply.go", + "doc.go", + "namespace.go", + "resync.go", + "scheduling.go", + "state.go", + ], + importpath = "github.com/unkeyed/unkey/svc/krane/internal/deployment", + visibility = ["//svc/krane:__subpackages__"], + deps = [ + "//gen/proto/ctrl/v1:ctrl", + "//gen/proto/ctrl/v1/ctrlv1connect", + "//pkg/assert", + "//pkg/circuitbreaker", + "//pkg/otel/logging", + "//pkg/ptr", + "//pkg/repeat", + "//svc/krane/pkg/labels", + "@com_connectrpc_connect//:connect", + "@com_github_cilium_cilium//pkg/k8s/apis/cilium.io/v2:cilium_io", + "@com_github_cilium_cilium//pkg/k8s/slim/k8s/apis/meta/v1:meta", + "@com_github_cilium_cilium//pkg/policy/api", + "@io_k8s_api//apps/v1:apps", + "@io_k8s_api//core/v1:core", + "@io_k8s_apimachinery//pkg/api/errors", + "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", + "@io_k8s_apimachinery//pkg/apis/meta/v1/unstructured", + "@io_k8s_apimachinery//pkg/runtime/schema", + "@io_k8s_apimachinery//pkg/types", + "@io_k8s_apimachinery//pkg/watch", + "@io_k8s_client_go//dynamic", + "@io_k8s_client_go//kubernetes", + "@io_k8s_sigs_controller_runtime//pkg/client", + ], +) + +go_test( + name = "deployment_test", + srcs = ["controller_test.go"], + embed = [":deployment"], + deps = [ + "//pkg/otel/logging", + "//svc/krane/internal/testutil", + "@com_github_stretchr_testify//require", + "@io_k8s_api//core/v1:core", + "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", + "@io_k8s_apimachinery//pkg/runtime", + "@io_k8s_client_go//dynamic/fake", + "@io_k8s_client_go//kubernetes/fake", + ], +) diff --git a/svc/krane/internal/deployment/actual_state_report.go b/svc/krane/internal/deployment/actual_state_report.go new file mode 100644 index 0000000000..83ff2e07e0 --- /dev/null +++ b/svc/krane/internal/deployment/actual_state_report.go @@ -0,0 +1,75 @@ +package deployment + +import ( + "context" + + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/svc/krane/pkg/labels" + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/watch" +) + +// runActualStateReportLoop starts a Kubernetes watch for deployment ReplicaSets +// and reports actual state changes back to the control plane in real-time. +// +// The watch filters for resources with the "managed-by: krane" and "component: deployment" +// labels, ignoring resources created by other controllers. When a ReplicaSet is added, +// modified, or deleted, the method queries pod status and reports the actual state to +// the control plane so routing tables stay synchronized with what's running in the cluster. +func (c *Controller) runActualStateReportLoop(ctx context.Context) error { + w, err := c.clientSet.AppsV1().ReplicaSets("").Watch(ctx, metav1.ListOptions{ + LabelSelector: labels.New(). + ManagedByKrane(). + ComponentDeployment(). + ToString(), + }) + if err != nil { + return err + } + + go func() { + for event := range w.ResultChan() { + switch event.Type { + case watch.Error: + c.logger.Error("error watching deployment", "event", event.Object) + case watch.Bookmark: + case watch.Added, watch.Modified: + replicaset, ok := event.Object.(*appsv1.ReplicaSet) + if !ok { + c.logger.Error("unable to cast object to replicaset") + continue + } + status, err := c.buildDeploymentStatus(ctx, replicaset) + if err != nil { + c.logger.Error("unable to build status", "error", err.Error()) + continue + } + err = c.reportDeploymentStatus(ctx, status) + if err != nil { + c.logger.Error("unable to report status", "error", err.Error()) + continue + } + case watch.Deleted: + replicaset, ok := event.Object.(*appsv1.ReplicaSet) + if !ok { + c.logger.Error("unable to cast object to replicaset") + continue + } + err := c.reportDeploymentStatus(ctx, &ctrlv1.ReportDeploymentStatusRequest{ + Change: &ctrlv1.ReportDeploymentStatusRequest_Delete_{ + Delete: &ctrlv1.ReportDeploymentStatusRequest_Delete{ + K8SName: replicaset.Name, + }, + }, + }) + if err != nil { + c.logger.Error("unable to report status", "error", err.Error()) + continue + } + } + } + }() + + return nil +} diff --git a/svc/krane/internal/reconciler/apply_deployment.go b/svc/krane/internal/deployment/apply.go similarity index 75% rename from svc/krane/internal/reconciler/apply_deployment.go rename to svc/krane/internal/deployment/apply.go index faa76b3509..a31b7ee4d5 100644 --- a/svc/krane/internal/reconciler/apply_deployment.go +++ b/svc/krane/internal/deployment/apply.go @@ -1,4 +1,4 @@ -package reconciler +package deployment import ( "context" @@ -27,13 +27,13 @@ import ( // // The namespace is created automatically if it doesn't exist. Pods run with gVisor // isolation (RuntimeClass "gvisor") for security since they execute untrusted user code. -func (r *Reconciler) ApplyDeployment(ctx context.Context, req *ctrlv1.ApplyDeployment) error { - - r.logger.Info("applying deployment", +func (c *Controller) ApplyDeployment(ctx context.Context, req *ctrlv1.ApplyDeployment) error { + c.logger.Info("applying deployment", "namespace", req.GetK8SNamespace(), "name", req.GetK8SName(), "deployment_id", req.GetDeploymentId(), ) + err := assert.All( assert.NotEmpty(req.GetWorkspaceId(), "Workspace ID is required"), assert.NotEmpty(req.GetProjectId(), "Project ID is required"), @@ -50,11 +50,10 @@ func (r *Reconciler) ApplyDeployment(ctx context.Context, req *ctrlv1.ApplyDeplo return err } - if err := r.ensureNamespaceExists(ctx, req.GetK8SNamespace(), req.GetWorkspaceId(), req.GetEnvironmentId()); err != nil { + if err := c.ensureNamespaceExists(ctx, req.GetK8SNamespace(), req.GetWorkspaceId(), req.GetEnvironmentId()); err != nil { return err } - // Define labels for resource selection usedLabels := labels.New(). WorkspaceID(req.GetWorkspaceId()). ProjectID(req.GetProjectId()). @@ -83,8 +82,6 @@ func (r *Reconciler) ApplyDeployment(ctx context.Context, req *ctrlv1.ApplyDeplo MinReadySeconds: 30, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ - // We need to prefix the name with our unkey-deployment name, to ensure uniqueness - // This becomes important when sending status updates back to our database GenerateName: fmt.Sprintf("%s-", req.GetK8SName()), Labels: usedLabels, }, @@ -100,33 +97,18 @@ func (r *Reconciler) ApplyDeployment(ctx context.Context, req *ctrlv1.ApplyDeplo ImagePullPolicy: corev1.PullIfNotPresent, Command: req.GetCommand(), Env: buildDeploymentEnv(req), - Ports: []corev1.ContainerPort{{ ContainerPort: DeploymentPort, Name: "deployment", }}, - - Resources: corev1.ResourceRequirements{ - // nolint:exhaustive - //Limits: corev1.ResourceList{ - // corev1.ResourceCPU: *resource.NewMilliQuantity(req.GetCpuMillicores(), resource.BinarySI), - // corev1.ResourceMemory: *resource.NewQuantity(req.GetMemoryMib(), resource.BinarySI), - // corev1.ResourceEphemeralStorage: *resource.NewQuantity(5*1024*1024*1024, resource.BinarySI), - //}, - //// nolint:exhaustive - //Requests: corev1.ResourceList{ - // corev1.ResourceCPU: *resource.NewMilliQuantity(req.GetCpuMillicores(), resource.BinarySI), - // corev1.ResourceMemory: *resource.NewQuantity(req.GetMemoryMib(), resource.BinarySI), - // corev1.ResourceEphemeralStorage: *resource.NewQuantity(5*1024*1024*1024, resource.BinarySI), - //}, - }, + Resources: corev1.ResourceRequirements{}, }}, }, }, }, } - client := r.clientSet.AppsV1().ReplicaSets(req.GetK8SNamespace()) + client := c.clientSet.AppsV1().ReplicaSets(req.GetK8SNamespace()) patch, err := json.Marshal(desired) if err != nil { @@ -140,14 +122,14 @@ func (r *Reconciler) ApplyDeployment(ctx context.Context, req *ctrlv1.ApplyDeplo return fmt.Errorf("failed to apply replicaset: %w", err) } - state, err := r.getDeploymentState(ctx, applied) + status, err := c.buildDeploymentStatus(ctx, applied) if err != nil { return err } - err = r.updateDeploymentState(ctx, state) + err = c.reportDeploymentStatus(ctx, status) if err != nil { - r.logger.Error("failed to reconcile replicaset", "deployment_id", req.GetDeploymentId(), "error", err) + c.logger.Error("failed to report deployment status", "deployment_id", req.GetDeploymentId(), "error", err) return err } diff --git a/svc/krane/internal/deployment/consts.go b/svc/krane/internal/deployment/consts.go new file mode 100644 index 0000000000..c0c8f57210 --- /dev/null +++ b/svc/krane/internal/deployment/consts.go @@ -0,0 +1,25 @@ +package deployment + +import corev1 "k8s.io/api/core/v1" + +const ( + // DeploymentPort is the port user deployments listen on. + DeploymentPort = 8080 + + // runtimeClassGvisor specifies the gVisor sandbox for untrusted user workloads. + runtimeClassGvisor = "gvisor" + + // fieldManagerKrane identifies krane as the server-side apply field manager. + fieldManagerKrane = "krane" + + // CustomerNodeClass is the node class for untrusted customer workloads. + CustomerNodeClass = "untrusted" +) + +// untrustedToleration allows pods to be scheduled on untrusted nodes. +var untrustedToleration = corev1.Toleration{ + Key: "karpenter.sh/nodepool", + Operator: corev1.TolerationOpEqual, + Value: CustomerNodeClass, + Effect: corev1.TaintEffectNoSchedule, +} diff --git a/svc/krane/internal/deployment/controller.go b/svc/krane/internal/deployment/controller.go new file mode 100644 index 0000000000..8a7b90d0d9 --- /dev/null +++ b/svc/krane/internal/deployment/controller.go @@ -0,0 +1,97 @@ +package deployment + +import ( + "context" + "fmt" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" + "github.com/unkeyed/unkey/pkg/circuitbreaker" + "github.com/unkeyed/unkey/pkg/otel/logging" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/kubernetes" +) + +// Controller manages deployment ReplicaSets in a Kubernetes cluster. +// +// It maintains bidirectional state synchronization with the control plane: +// receiving desired state via WatchDeployments and reporting actual +// state via ReportDeploymentStatus. The controller operates independently +// from the SentinelController with its own version cursor and circuit breaker. +type Controller struct { + clientSet kubernetes.Interface + dynamicClient dynamic.Interface + logger logging.Logger + cluster ctrlv1connect.ClusterServiceClient + cb circuitbreaker.CircuitBreaker[any] + done chan struct{} + region string + versionLastSeen uint64 +} + +// Config holds the configuration required to create a new [Controller]. +type Config struct { + ClientSet kubernetes.Interface + DynamicClient dynamic.Interface + Logger logging.Logger + Cluster ctrlv1connect.ClusterServiceClient + Region string +} + +// New creates a [Controller] ready to be started with [Controller.Start]. +func New(cfg Config) *Controller { + return &Controller{ + clientSet: cfg.ClientSet, + dynamicClient: cfg.DynamicClient, + logger: cfg.Logger.With("controller", "deployments"), + cluster: cfg.Cluster, + cb: circuitbreaker.New[any]("deployment_state_update"), + done: make(chan struct{}), + region: cfg.Region, + versionLastSeen: 0, + } +} + +// Start launches the three background control loops: +// +// - [Controller.runDesiredStateApplyLoop]: Receives desired state from the +// control plane's SyncDeployments stream and applies it to Kubernetes. +// +// - [Controller.runActualStateReportLoop]: Watches Kubernetes for ReplicaSet +// changes and reports actual state back to the control plane. +// +// - [Controller.runResyncLoop]: Periodically re-queries the control plane for +// each existing ReplicaSet to ensure eventual consistency. +// +// All loops continue until the context is cancelled or [Controller.Stop] is called. +func (c *Controller) Start(ctx context.Context) error { + go c.runResyncLoop(ctx) + + if err := c.runActualStateReportLoop(ctx); err != nil { + return err + } + + go c.runDesiredStateApplyLoop(ctx) + + return nil +} + +// Stop signals all background goroutines to terminate. +func (c *Controller) Stop() error { + close(c.done) + return nil +} + +// reportDeploymentStatus reports actual deployment state to the control plane +// through the circuit breaker. The circuit breaker prevents cascading failures +// during control plane outages by failing fast after repeated errors. +func (c *Controller) reportDeploymentStatus(ctx context.Context, status *ctrlv1.ReportDeploymentStatusRequest) error { + _, err := c.cb.Do(ctx, func(innerCtx context.Context) (any, error) { + return c.cluster.ReportDeploymentStatus(innerCtx, connect.NewRequest(status)) + }) + if err != nil { + return fmt.Errorf("failed to report deployment status: %w", err) + } + return nil +} diff --git a/svc/krane/internal/deployment/controller_test.go b/svc/krane/internal/deployment/controller_test.go new file mode 100644 index 0000000000..631a2d5ab6 --- /dev/null +++ b/svc/krane/internal/deployment/controller_test.go @@ -0,0 +1,119 @@ +package deployment + +import ( + "testing" + + "github.com/stretchr/testify/require" + "github.com/unkeyed/unkey/pkg/otel/logging" + "github.com/unkeyed/unkey/svc/krane/internal/testutil" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + fakedynamic "k8s.io/client-go/dynamic/fake" + "k8s.io/client-go/kubernetes/fake" +) + +func TestNew_CreatesControllerWithCorrectFields(t *testing.T) { + namespace := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-namespace", + }, + } + client := fake.NewSimpleClientset(namespace) + dynamicClient := fakedynamic.NewSimpleDynamicClient(runtime.NewScheme()) + logger := logging.NewNoop() + mockCluster := &testutil.MockClusterClient{} + + cfg := Config{ + ClientSet: client, + DynamicClient: dynamicClient, + Logger: logger, + Cluster: mockCluster, + Region: "us-east-1", + } + + ctrl := New(cfg) + + require.NotNil(t, ctrl) + require.Equal(t, client, ctrl.clientSet) + require.Equal(t, dynamicClient, ctrl.dynamicClient) + require.Equal(t, mockCluster, ctrl.cluster) + require.Equal(t, "us-east-1", ctrl.region) +} + +func TestNew_CreatesOwnCircuitBreaker(t *testing.T) { + client := fake.NewSimpleClientset() + dynamicClient := fakedynamic.NewSimpleDynamicClient(runtime.NewScheme()) + cfg := Config{ + ClientSet: client, + DynamicClient: dynamicClient, + Logger: logging.NewNoop(), + Cluster: &testutil.MockClusterClient{}, + Region: "us-east-1", + } + + ctrl := New(cfg) + + require.NotNil(t, ctrl.cb, "circuit breaker should not be nil") +} + +func TestNew_InitializesVersionCursorToZero(t *testing.T) { + client := fake.NewSimpleClientset() + dynamicClient := fakedynamic.NewSimpleDynamicClient(runtime.NewScheme()) + cfg := Config{ + ClientSet: client, + DynamicClient: dynamicClient, + Logger: logging.NewNoop(), + Cluster: &testutil.MockClusterClient{}, + Region: "us-east-1", + } + + ctrl := New(cfg) + + require.Equal(t, uint64(0), ctrl.versionLastSeen, "version cursor should start at 0") +} + +func TestNew_CreatesDoneChannel(t *testing.T) { + client := fake.NewSimpleClientset() + dynamicClient := fakedynamic.NewSimpleDynamicClient(runtime.NewScheme()) + cfg := Config{ + ClientSet: client, + DynamicClient: dynamicClient, + Logger: logging.NewNoop(), + Cluster: &testutil.MockClusterClient{}, + Region: "us-east-1", + } + + ctrl := New(cfg) + + require.NotNil(t, ctrl.done, "done channel should not be nil") + + select { + case <-ctrl.done: + t.Fatal("done channel should not be closed initially") + default: + } +} + +func TestStop_ClosesDoneChannel(t *testing.T) { + client := fake.NewSimpleClientset() + dynamicClient := fakedynamic.NewSimpleDynamicClient(runtime.NewScheme()) + cfg := Config{ + ClientSet: client, + DynamicClient: dynamicClient, + Logger: logging.NewNoop(), + Cluster: &testutil.MockClusterClient{}, + Region: "us-east-1", + } + + ctrl := New(cfg) + + err := ctrl.Stop() + require.NoError(t, err) + + select { + case <-ctrl.done: + default: + t.Fatal("done channel should be closed after Stop") + } +} diff --git a/svc/krane/internal/reconciler/delete_deployment.go b/svc/krane/internal/deployment/delete.go similarity index 68% rename from svc/krane/internal/reconciler/delete_deployment.go rename to svc/krane/internal/deployment/delete.go index 4573402836..34f046d8d9 100644 --- a/svc/krane/internal/reconciler/delete_deployment.go +++ b/svc/krane/internal/deployment/delete.go @@ -1,4 +1,4 @@ -package reconciler +package deployment import ( "context" @@ -14,20 +14,20 @@ import ( // Not-found errors are ignored since the desired end state (resource gone) is // already achieved. After deletion, the method notifies the control plane so it // can update routing tables and stop sending traffic to this deployment. -func (r *Reconciler) DeleteDeployment(ctx context.Context, req *ctrlv1.DeleteDeployment) error { - r.logger.Info("deleting deployment", +func (c *Controller) DeleteDeployment(ctx context.Context, req *ctrlv1.DeleteDeployment) error { + c.logger.Info("deleting deployment", "namespace", req.GetK8SNamespace(), "name", req.GetK8SName(), ) - // nolint:exhaustruct - err := r.clientSet.AppsV1().ReplicaSets(req.GetK8SNamespace()).Delete(ctx, req.GetK8SName(), metav1.DeleteOptions{}) + err := c.clientSet.AppsV1().ReplicaSets(req.GetK8SNamespace()).Delete(ctx, req.GetK8SName(), metav1.DeleteOptions{}) if err != nil && !apierrors.IsNotFound(err) { return err } - err = r.updateDeploymentState(ctx, &ctrlv1.UpdateDeploymentStateRequest{ - Change: &ctrlv1.UpdateDeploymentStateRequest_Delete_{ - Delete: &ctrlv1.UpdateDeploymentStateRequest_Delete{ + + err = c.reportDeploymentStatus(ctx, &ctrlv1.ReportDeploymentStatusRequest{ + Change: &ctrlv1.ReportDeploymentStatusRequest_Delete_{ + Delete: &ctrlv1.ReportDeploymentStatusRequest_Delete{ K8SName: req.GetK8SName(), }, }, diff --git a/svc/krane/internal/deployment/desired_state_apply.go b/svc/krane/internal/deployment/desired_state_apply.go new file mode 100644 index 0000000000..64f93e38f5 --- /dev/null +++ b/svc/krane/internal/deployment/desired_state_apply.go @@ -0,0 +1,73 @@ +package deployment + +import ( + "context" + "math/rand/v2" + "time" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" +) + +// runDesiredStateApplyLoop connects to the control plane's WatchDeployments +// stream and applies desired state updates to the Kubernetes cluster. +// +// The loop automatically reconnects with jittered backoff on stream errors. +// Each received state is processed via applyDesiredState, and the version cursor +// is advanced on successful processing. +func (c *Controller) runDesiredStateApplyLoop(ctx context.Context) { + intervalMin := time.Second + intervalMax := 5 * time.Second + + for { + interval := intervalMin + time.Millisecond*time.Duration(rand.Float64()*float64(intervalMax.Milliseconds()-intervalMin.Milliseconds())) + time.Sleep(interval) + + err := c.streamDesiredStateOnce(ctx) + if err != nil { + c.logger.Error("error streaming desired state from control plane", "error", err) + } + } +} + +// streamDesiredStateOnce opens a single connection to the control plane's +// WatchDeployments stream, processes all received states until the stream +// closes or errors, then returns. The caller handles reconnection. +func (c *Controller) streamDesiredStateOnce(ctx context.Context) error { + c.logger.Info("connecting to control plane for desired state") + + stream, err := c.cluster.WatchDeployments(ctx, connect.NewRequest(&ctrlv1.WatchDeploymentsRequest{ + Region: c.region, + VersionLastSeen: c.versionLastSeen, + })) + if err != nil { + return err + } + + for stream.Receive() { + c.logger.Info("received desired state from control plane") + state := stream.Msg() + + switch op := state.GetState().(type) { + case *ctrlv1.DeploymentState_Apply: + if err := c.ApplyDeployment(ctx, op.Apply); err != nil { + return err + } + case *ctrlv1.DeploymentState_Delete: + if err := c.DeleteDeployment(ctx, op.Delete); err != nil { + return err + } + } + + if state.GetVersion() > c.versionLastSeen { + c.versionLastSeen = state.GetVersion() + } + } + + if err := stream.Close(); err != nil { + c.logger.Error("unable to close control plane stream", "error", err) + return err + } + + return nil +} diff --git a/svc/krane/internal/deployment/doc.go b/svc/krane/internal/deployment/doc.go new file mode 100644 index 0000000000..ac2389a4e1 --- /dev/null +++ b/svc/krane/internal/deployment/doc.go @@ -0,0 +1,44 @@ +// Package deployment provides the DeploymentController for managing user workload +// ReplicaSets in Kubernetes. +// +// The DeploymentController is one half of krane's split control loop architecture. +// It operates independently from the SentinelController, with its own: +// - Control plane sync stream (SyncDeployments) +// - Version cursor for resumable streaming +// - Circuit breaker for failure isolation +// - Kubernetes watch and refresh loops +// +// # Architecture +// +// The controller runs three loops for reliability: +// +// - [Controller.runDesiredStateApplyLoop]: Receives desired state from the +// control plane's SyncDeployments stream and applies it to Kubernetes. +// +// - [Controller.runActualStateReportLoop]: Watches Kubernetes for ReplicaSet +// changes and reports actual state back to the control plane. +// +// - [Controller.runResyncLoop]: Periodically re-queries the control plane +// for each existing ReplicaSet to ensure eventual consistency. +// +// # Failure Isolation +// +// By running as an independent controller, deployment reconciliation continues +// even if sentinel reconciliation is experiencing failures. Each controller +// has its own circuit breaker, so errors in one don't affect the other. +// +// # Usage +// +// ctrl := deployment.New(deployment.Config{ +// ClientSet: kubeClient, +// DynamicClient: dynamicClient, +// Logger: logger.With("controller", "deployments"), +// Cluster: clusterClient, +// Region: "us-east-1", +// }) +// +// if err := ctrl.Start(ctx); err != nil { +// return fmt.Errorf("failed to start deployment controller: %w", err) +// } +// defer ctrl.Stop() +package deployment diff --git a/svc/krane/internal/reconciler/namespace.go b/svc/krane/internal/deployment/namespace.go similarity index 83% rename from svc/krane/internal/reconciler/namespace.go rename to svc/krane/internal/deployment/namespace.go index 4faf028bbe..5256fe15fe 100644 --- a/svc/krane/internal/reconciler/namespace.go +++ b/svc/krane/internal/deployment/namespace.go @@ -1,4 +1,4 @@ -package reconciler +package deployment import ( "context" @@ -17,19 +17,15 @@ import ( ) const ( + // NamespaceSentinel is the namespace where sentinel pods run. NamespaceSentinel = "sentinel" - SentinelNodeClass = "sentinel" - CustomerNodeClass = "untrusted" - SentinelPort = 8040 - DeploymentPort = 8080 ) // ensureNamespaceExists creates the namespace if it doesn't already exist. -// AlreadyExists errors are treated as success. // For customer namespaces (non-sentinel), it also creates a CiliumNetworkPolicy // to allow ingress only from the matching sentinel. -func (r *Reconciler) ensureNamespaceExists(ctx context.Context, namespace, workspaceID, environmentID string) error { - _, err := r.clientSet.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ +func (c *Controller) ensureNamespaceExists(ctx context.Context, namespace, workspaceID, environmentID string) error { + _, err := c.clientSet.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ Name: namespace, }, @@ -38,9 +34,8 @@ func (r *Reconciler) ensureNamespaceExists(ctx context.Context, namespace, works return err } - // Create CiliumNetworkPolicy for customer namespaces (not for sentinel namespace) if namespace != NamespaceSentinel { - if err := r.applyCiliumPolicyForNamespace(ctx, namespace, workspaceID, environmentID); err != nil { + if err := c.applyCiliumPolicyForNamespace(ctx, namespace, workspaceID, environmentID); err != nil { return fmt.Errorf("failed to create cilium policy for namespace %s: %w", namespace, err) } } @@ -52,7 +47,7 @@ func (r *Reconciler) ensureNamespaceExists(ctx context.Context, namespace, works // only from sentinels with matching workspace and environment IDs. // //nolint:exhaustruct -func (r *Reconciler) applyCiliumPolicyForNamespace(ctx context.Context, namespace, workspaceID, environmentID string) error { +func (c *Controller) applyCiliumPolicyForNamespace(ctx context.Context, namespace, workspaceID, environmentID string) error { policy := &ciliumv2.CiliumNetworkPolicy{ TypeMeta: metav1.TypeMeta{ APIVersion: "cilium.io/v2", @@ -113,7 +108,7 @@ func (r *Reconciler) applyCiliumPolicyForNamespace(ctx context.Context, namespac Resource: "ciliumnetworkpolicies", } - _, err = r.dynamicClient.Resource(gvr).Namespace(namespace).Apply( + _, err = c.dynamicClient.Resource(gvr).Namespace(namespace).Apply( ctx, "allow-sentinel-ingress", unstructuredPolicy, @@ -122,7 +117,6 @@ func (r *Reconciler) applyCiliumPolicyForNamespace(ctx context.Context, namespac return err } -// toUnstructured converts a typed Kubernetes object to an unstructured object. func toUnstructured(obj any) (*unstructured.Unstructured, error) { data, err := json.Marshal(obj) if err != nil { diff --git a/svc/krane/internal/deployment/resync.go b/svc/krane/internal/deployment/resync.go new file mode 100644 index 0000000000..4a672d0c45 --- /dev/null +++ b/svc/krane/internal/deployment/resync.go @@ -0,0 +1,74 @@ +package deployment + +import ( + "context" + "time" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/repeat" + "github.com/unkeyed/unkey/svc/krane/pkg/labels" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// runResyncLoop periodically reconciles all deployment ReplicaSets with their +// desired state from the control plane. +// +// This loop runs every minute as a consistency safety net. While +// [Controller.runActualStateReportLoop] handles real-time K8s events and +// [Controller.runDesiredStateApplyLoop] handles streaming updates, both can miss +// events during network partitions, controller restarts, or buffer overflows. +// This resync loop guarantees eventual consistency by querying the control plane +// for each existing ReplicaSet and applying any needed changes. +func (c *Controller) runResyncLoop(ctx context.Context) { + repeat.Every(1*time.Minute, func() { + c.logger.Info("running periodic resync") + + cursor := "" + for { + replicaSets, err := c.clientSet.AppsV1().ReplicaSets("").List(ctx, metav1.ListOptions{ + LabelSelector: labels.New(). + ManagedByKrane(). + ComponentDeployment(). + ToString(), + Continue: cursor, + }) + if err != nil { + c.logger.Error("unable to list replicaSets", "error", err.Error()) + return + } + + for _, replicaSet := range replicaSets.Items { + deploymentID, ok := labels.GetDeploymentID(replicaSet.Labels) + if !ok { + c.logger.Error("unable to get deployment ID", "replicaSet", replicaSet.Name) + continue + } + + res, err := c.cluster.GetDesiredDeploymentState(ctx, connect.NewRequest(&ctrlv1.GetDesiredDeploymentStateRequest{ + DeploymentId: deploymentID, + })) + if err != nil { + c.logger.Error("unable to get desired deployment state", "error", err.Error(), "deployment_id", deploymentID) + continue + } + + switch res.Msg.GetState().(type) { + case *ctrlv1.DeploymentState_Apply: + if err := c.ApplyDeployment(ctx, res.Msg.GetApply()); err != nil { + c.logger.Error("unable to apply deployment", "error", err.Error(), "deployment_id", deploymentID) + } + case *ctrlv1.DeploymentState_Delete: + if err := c.DeleteDeployment(ctx, res.Msg.GetDelete()); err != nil { + c.logger.Error("unable to delete deployment", "error", err.Error(), "deployment_id", deploymentID) + } + } + } + + cursor = replicaSets.Continue + if cursor == "" { + break + } + } + }) +} diff --git a/svc/krane/internal/reconciler/scheduling.go b/svc/krane/internal/deployment/scheduling.go similarity index 74% rename from svc/krane/internal/reconciler/scheduling.go rename to svc/krane/internal/deployment/scheduling.go index 722dc16641..84f9b37dfd 100644 --- a/svc/krane/internal/reconciler/scheduling.go +++ b/svc/krane/internal/deployment/scheduling.go @@ -1,4 +1,4 @@ -package reconciler +package deployment import ( "github.com/unkeyed/unkey/svc/krane/pkg/labels" @@ -26,21 +26,6 @@ func deploymentTopologySpread(deploymentID string) []corev1.TopologySpreadConstr } } -// sentinelTopologySpread returns topology spread constraints for sentinel pods. -// Spreads pods evenly across availability zones with maxSkew of 1. -func sentinelTopologySpread(sentinelID string) []corev1.TopologySpreadConstraint { - return []corev1.TopologySpreadConstraint{ - { - MaxSkew: 1, - TopologyKey: topologyKeyZone, - WhenUnsatisfiable: corev1.ScheduleAnyway, - LabelSelector: &metav1.LabelSelector{ - MatchLabels: labels.New().SentinelID(sentinelID), - }, - }, - } -} - // deploymentAffinity returns affinity rules for customer deployment pods. // Prefers scheduling in the same AZ as sentinels for the given environment // to minimize cross-AZ latency between sentinel and customer code. diff --git a/svc/krane/internal/deployment/state.go b/svc/krane/internal/deployment/state.go new file mode 100644 index 0000000000..e18c952031 --- /dev/null +++ b/svc/krane/internal/deployment/state.go @@ -0,0 +1,83 @@ +package deployment + +import ( + "context" + "fmt" + "strings" + + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// buildDeploymentStatus queries the pods belonging to a ReplicaSet and builds a +// status report containing each pod's address, resource allocation, and phase. +// Pods without an IP address are skipped since they can't receive traffic yet. +// The address is formatted as a cluster-local DNS name for in-cluster routing. +func (c *Controller) buildDeploymentStatus(ctx context.Context, replicaset *appsv1.ReplicaSet) (*ctrlv1.ReportDeploymentStatusRequest, error) { + selector, err := metav1.LabelSelectorAsSelector(replicaset.Spec.Selector) + if err != nil { + return nil, err + } + + pods, err := c.clientSet.CoreV1().Pods(replicaset.Namespace).List(ctx, metav1.ListOptions{ + LabelSelector: selector.String(), + }) + if err != nil { + return nil, fmt.Errorf("failed to list pods: %w", err) + } + + update := &ctrlv1.ReportDeploymentStatusRequest_Update{ + K8SName: replicaset.Name, + Instances: make([]*ctrlv1.ReportDeploymentStatusRequest_Update_Instance, 0, len(pods.Items)), + } + + for _, pod := range pods.Items { + if pod.Status.PodIP == "" { + continue + } + + instance := &ctrlv1.ReportDeploymentStatusRequest_Update_Instance{ + K8SName: pod.GetName(), + Address: fmt.Sprintf("%s.%s.pod.cluster.local:%d", strings.ReplaceAll(pod.Status.PodIP, ".", "-"), pod.Namespace, DeploymentPort), + CpuMillicores: 0, + MemoryMib: 0, + Status: ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_UNSPECIFIED, + } + if pod.Spec.Resources != nil { + instance.CpuMillicores = pod.Spec.Resources.Limits.Cpu().MilliValue() + instance.MemoryMib = pod.Spec.Resources.Limits.Memory().Value() / (1024 * 1024) + } + + switch pod.Status.Phase { + case corev1.PodPending: + instance.Status = ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_PENDING + case corev1.PodRunning: + allReady := true + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.ContainersReady && cond.Status != corev1.ConditionTrue { + allReady = false + break + } + } + if allReady { + instance.Status = ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_RUNNING + } else { + instance.Status = ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_FAILED + } + case corev1.PodFailed: + instance.Status = ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_FAILED + case corev1.PodSucceeded, corev1.PodUnknown: + instance.Status = ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_UNSPECIFIED + } + + update.Instances = append(update.Instances, instance) + } + + return &ctrlv1.ReportDeploymentStatusRequest{ + Change: &ctrlv1.ReportDeploymentStatusRequest_Update_{ + Update: update, + }, + }, nil +} diff --git a/svc/krane/internal/reconciler/BUILD.bazel b/svc/krane/internal/reconciler/BUILD.bazel deleted file mode 100644 index 45e30b0eea..0000000000 --- a/svc/krane/internal/reconciler/BUILD.bazel +++ /dev/null @@ -1,97 +0,0 @@ -load("@rules_go//go:def.bzl", "go_library", "go_test") - -go_library( - name = "reconciler", - srcs = [ - "apply_deployment.go", - "apply_sentinel.go", - "consts.go", - "delete_deployment.go", - "delete_sentinel.go", - "doc.go", - "namespace.go", - "reconciler.go", - "refresh_current_deployments.go", - "refresh_current_sentinels.go", - "scheduling.go", - "tolerations.go", - "update_state.go", - "watch_current_deployments.go", - "watch_current_sentinels.go", - "watcher.go", - ], - importpath = "github.com/unkeyed/unkey/svc/krane/internal/reconciler", - visibility = ["//svc/krane:__subpackages__"], - deps = [ - "//gen/proto/ctrl/v1:ctrl", - "//gen/proto/ctrl/v1/ctrlv1connect", - "//pkg/assert", - "//pkg/circuitbreaker", - "//pkg/otel/logging", - "//pkg/ptr", - "//pkg/repeat", - "//svc/krane/pkg/labels", - "@com_connectrpc_connect//:connect", - "@com_github_cilium_cilium//pkg/k8s/apis/cilium.io/v2:cilium_io", - "@com_github_cilium_cilium//pkg/k8s/slim/k8s/apis/meta/v1:meta", - "@com_github_cilium_cilium//pkg/policy/api", - "@io_k8s_api//apps/v1:apps", - "@io_k8s_api//core/v1:core", - "@io_k8s_apimachinery//pkg/api/errors", - "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", - "@io_k8s_apimachinery//pkg/apis/meta/v1/unstructured", - "@io_k8s_apimachinery//pkg/runtime/schema", - "@io_k8s_apimachinery//pkg/types", - "@io_k8s_apimachinery//pkg/util/intstr", - "@io_k8s_apimachinery//pkg/watch", - "@io_k8s_client_go//dynamic", - "@io_k8s_client_go//kubernetes", - "@io_k8s_sigs_controller_runtime//pkg/client", - ], -) - -go_test( - name = "reconciler_test", - srcs = [ - "apply_deployment_test.go", - "apply_sentinel_test.go", - "delete_deployment_test.go", - "delete_sentinel_test.go", - "handle_state_test.go", - "mock_cluster_client_test.go", - "namespace_test.go", - "reconciler_test.go", - "refresh_current_deployments_test.go", - "refresh_current_sentinels_test.go", - "test_helpers_test.go", - "update_state_test.go", - "version_tracking_test.go", - "watch_current_deployments_test.go", - "watch_current_sentinels_test.go", - "watcher_test.go", - ], - embed = [":reconciler"], - deps = [ - "//gen/proto/ctrl/v1:ctrl", - "//gen/proto/ctrl/v1/ctrlv1connect", - "//pkg/circuitbreaker", - "//pkg/otel/logging", - "//pkg/ptr", - "//svc/krane/pkg/labels", - "@com_connectrpc_connect//:connect", - "@com_github_stretchr_testify//require", - "@io_k8s_api//apps/v1:apps", - "@io_k8s_api//core/v1:core", - "@io_k8s_apimachinery//pkg/api/errors", - "@io_k8s_apimachinery//pkg/api/resource", - "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", - "@io_k8s_apimachinery//pkg/apis/meta/v1/unstructured", - "@io_k8s_apimachinery//pkg/runtime", - "@io_k8s_apimachinery//pkg/runtime/schema", - "@io_k8s_apimachinery//pkg/types", - "@io_k8s_apimachinery//pkg/watch", - "@io_k8s_client_go//dynamic/fake", - "@io_k8s_client_go//kubernetes/fake", - "@io_k8s_client_go//testing", - ], -) diff --git a/svc/krane/internal/reconciler/apply_deployment_test.go b/svc/krane/internal/reconciler/apply_deployment_test.go deleted file mode 100644 index 05b0b89ba2..0000000000 --- a/svc/krane/internal/reconciler/apply_deployment_test.go +++ /dev/null @@ -1,295 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "testing" - - "connectrpc.com/connect" - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/ptr" - corev1 "k8s.io/api/core/v1" -) - -func newApplyDeploymentRequest() *ctrlv1.ApplyDeployment { - return &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - DeploymentId: "dep_123", - K8SNamespace: "test-namespace", - K8SName: "test-deployment", - Image: "nginx:1.19", - Replicas: 3, - CpuMillicores: 100, - MemoryMib: 128, - BuildId: ptr.P("build_123"), - } -} - -func TestApplyDeployment_UsesServerSideApply(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - capture := AddReplicaSetPatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - err := r.ApplyDeployment(ctx, req) - require.NoError(t, err) - - require.NotNil(t, capture.Applied, "should have captured applied ReplicaSet") - - var patchCount int - for _, action := range client.Actions() { - if action.GetVerb() == "patch" && action.GetResource().Resource == "replicasets" { - patchCount++ - } - } - require.Equal(t, 1, patchCount, "expected exactly one patch action") -} - -func TestApplyDeployment_SetsCorrectImage(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - capture := AddReplicaSetPatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - req.Image = "nginx:1.25" - - err := r.ApplyDeployment(ctx, req) - require.NoError(t, err) - - require.NotNil(t, capture.Applied, "should have captured applied ReplicaSet") - require.Len(t, capture.Applied.Spec.Template.Spec.Containers, 1) - require.Equal(t, "nginx:1.25", capture.Applied.Spec.Template.Spec.Containers[0].Image) -} - -func TestApplyDeployment_SetsCorrectReplicas(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - capture := AddReplicaSetPatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - req.Replicas = 5 - - err := r.ApplyDeployment(ctx, req) - require.NoError(t, err) - - require.NotNil(t, capture.Applied) - require.Equal(t, int32(5), *capture.Applied.Spec.Replicas) -} - -func TestApplyDeployment_SetsCorrectLabels(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - capture := AddReplicaSetPatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - err := r.ApplyDeployment(ctx, req) - require.NoError(t, err) - - require.NotNil(t, capture.Applied) - - labels := capture.Applied.Spec.Template.Labels - require.Equal(t, "ws_123", labels["unkey.com/workspace.id"]) - require.Equal(t, "prj_123", labels["unkey.com/project.id"]) - require.Equal(t, "env_123", labels["unkey.com/environment.id"]) - require.Equal(t, "dep_123", labels["unkey.com/deployment.id"]) - require.Equal(t, "krane", labels["app.kubernetes.io/managed-by"]) -} - -func TestApplyDeployment_SetsEnvironmentVariables(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - capture := AddReplicaSetPatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - err := r.ApplyDeployment(ctx, req) - require.NoError(t, err) - - require.NotNil(t, capture.Applied) - require.Len(t, capture.Applied.Spec.Template.Spec.Containers, 1) - - envVars := capture.Applied.Spec.Template.Spec.Containers[0].Env - envMap := make(map[string]string) - for _, env := range envVars { - envMap[env.Name] = env.Value - } - - require.Equal(t, "ws_123", envMap["UNKEY_WORKSPACE_ID"]) - require.Equal(t, "prj_123", envMap["UNKEY_PROJECT_ID"]) - require.Equal(t, "env_123", envMap["UNKEY_ENVIRONMENT_ID"]) - require.Equal(t, "dep_123", envMap["UNKEY_DEPLOYMENT_ID"]) -} - -func TestApplyDeployment_SetsTypeMeta(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - capture := AddReplicaSetPatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - err := r.ApplyDeployment(ctx, req) - require.NoError(t, err) - - require.NotNil(t, capture.Applied) - require.Equal(t, "apps/v1", capture.Applied.APIVersion) - require.Equal(t, "ReplicaSet", capture.Applied.Kind) -} - -func TestApplyDeployment_ValidationErrors(t *testing.T) { - tests := []struct { - name string - mutate func(*ctrlv1.ApplyDeployment) - }{ - { - name: "missing workspace id", - mutate: func(req *ctrlv1.ApplyDeployment) { req.WorkspaceId = "" }, - }, - { - name: "missing project id", - mutate: func(req *ctrlv1.ApplyDeployment) { req.ProjectId = "" }, - }, - { - name: "missing environment id", - mutate: func(req *ctrlv1.ApplyDeployment) { req.EnvironmentId = "" }, - }, - { - name: "missing deployment id", - mutate: func(req *ctrlv1.ApplyDeployment) { req.DeploymentId = "" }, - }, - { - name: "missing namespace", - mutate: func(req *ctrlv1.ApplyDeployment) { req.K8SNamespace = "" }, - }, - { - name: "missing k8s name", - mutate: func(req *ctrlv1.ApplyDeployment) { req.K8SName = "" }, - }, - { - name: "missing image", - mutate: func(req *ctrlv1.ApplyDeployment) { req.Image = "" }, - }, - { - name: "zero cpu millicores", - mutate: func(req *ctrlv1.ApplyDeployment) { req.CpuMillicores = 0 }, - }, - { - name: "zero memory", - mutate: func(req *ctrlv1.ApplyDeployment) { req.MemoryMib = 0 }, - }, - { - name: "negative replicas", - mutate: func(req *ctrlv1.ApplyDeployment) { req.Replicas = -1 }, - }, - { - name: "negative cpu millicores", - mutate: func(req *ctrlv1.ApplyDeployment) { req.CpuMillicores = -100 }, - }, - { - name: "negative memory", - mutate: func(req *ctrlv1.ApplyDeployment) { req.MemoryMib = -128 }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddReplicaSetPatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - tt.mutate(req) - - err := r.ApplyDeployment(ctx, req) - require.Error(t, err) - }) - } -} - -func TestApplyDeployment_CallsUpdateDeploymentState(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddReplicaSetPatchReactor(client) - mockCluster := &MockClusterClient{} - r := NewTestReconciler(client, mockCluster) - - req := newApplyDeploymentRequest() - err := r.ApplyDeployment(ctx, req) - require.NoError(t, err) - - require.Len(t, mockCluster.UpdateDeploymentStateCalls, 1) - call := mockCluster.UpdateDeploymentStateCalls[0] - update := call.GetUpdate() - require.NotNil(t, update) - require.Equal(t, req.GetK8SName(), update.GetK8SName()) -} - -func TestApplyDeployment_ControlPlaneError(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddReplicaSetPatchReactor(client) - - mockCluster := &MockClusterClient{ - UpdateDeploymentStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.UpdateDeploymentStateRequest]) (*connect.Response[ctrlv1.UpdateDeploymentStateResponse], error) { - return nil, fmt.Errorf("control plane unavailable") - }, - } - r := NewTestReconciler(client, mockCluster) - - req := newApplyDeploymentRequest() - err := r.ApplyDeployment(ctx, req) - require.Error(t, err) - require.Contains(t, err.Error(), "control plane unavailable") -} - -func TestApplyDeployment_EnsuresNamespaceExists(t *testing.T) { - ctx := context.Background() - client := NewFakeClientWithoutNamespace(t) - nsCapture := AddNamespaceCreateTracker(client) - AddReplicaSetPatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - err := r.ApplyDeployment(ctx, req) - require.NoError(t, err) - require.True(t, nsCapture.Created, "namespace should be created if missing") -} - -func TestApplyDeployment_SetsTolerations(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - capture := AddReplicaSetPatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - err := r.ApplyDeployment(ctx, req) - require.NoError(t, err) - - require.NotNil(t, capture.Applied) - require.Len(t, capture.Applied.Spec.Template.Spec.Tolerations, 1) - - toleration := capture.Applied.Spec.Template.Spec.Tolerations[0] - require.Equal(t, "node-class", toleration.Key) - require.Equal(t, corev1.TolerationOpEqual, toleration.Operator) - require.Equal(t, untrustedToleration.Value, toleration.Value) - require.Equal(t, corev1.TaintEffectNoSchedule, toleration.Effect) -} - -func TestApplyDeployment_ReplicaSetPatchError(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddPatchErrorReactor(client, "replicasets", fmt.Errorf("simulated replicaset patch failure")) - r := NewTestReconciler(client, nil) - - req := newApplyDeploymentRequest() - err := r.ApplyDeployment(ctx, req) - require.Error(t, err) - require.Contains(t, err.Error(), "simulated replicaset patch failure") -} diff --git a/svc/krane/internal/reconciler/apply_sentinel_test.go b/svc/krane/internal/reconciler/apply_sentinel_test.go deleted file mode 100644 index baaf030858..0000000000 --- a/svc/krane/internal/reconciler/apply_sentinel_test.go +++ /dev/null @@ -1,379 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "testing" - - "connectrpc.com/connect" - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - corev1 "k8s.io/api/core/v1" -) - -func newApplySentinelRequest() *ctrlv1.ApplySentinel { - return &ctrlv1.ApplySentinel{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - SentinelId: "sent_123", - K8SName: "test-sentinel", - Image: "unkey/sentinel:v1.0", - Replicas: 2, - CpuMillicores: 200, - MemoryMib: 256, - } -} - -func TestApplySentinel_CreatesDeploymentAndService(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - depCapture := AddDeploymentPatchReactor(client) - svcCapture := AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, depCapture.Applied, "Deployment should be applied") - require.NotNil(t, svcCapture.Applied, "Service should be applied") -} - -func TestApplySentinel_DeploymentHasCorrectImage(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - depCapture := AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - req.Image = "unkey/sentinel:v2.0" - - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, depCapture.Applied) - require.Len(t, depCapture.Applied.Spec.Template.Spec.Containers, 1) - require.Equal(t, "unkey/sentinel:v2.0", depCapture.Applied.Spec.Template.Spec.Containers[0].Image) -} - -func TestApplySentinel_DeploymentHasCorrectReplicas(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - depCapture := AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - req.Replicas = 5 - - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, depCapture.Applied) - require.Equal(t, int32(5), *depCapture.Applied.Spec.Replicas) -} - -func TestApplySentinel_DeploymentHasCorrectLabels(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - depCapture := AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, depCapture.Applied) - - labels := depCapture.Applied.Labels - require.Equal(t, "ws_123", labels["unkey.com/workspace.id"]) - require.Equal(t, "prj_123", labels["unkey.com/project.id"]) - require.Equal(t, "env_123", labels["unkey.com/environment.id"]) - require.Equal(t, "sent_123", labels["unkey.com/sentinel.id"]) - require.Equal(t, "krane", labels["app.kubernetes.io/managed-by"]) - require.Equal(t, "sentinel", labels["app.kubernetes.io/component"]) -} - -func TestApplySentinel_DeploymentHasEnvironmentVariables(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - depCapture := AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, depCapture.Applied) - require.Len(t, depCapture.Applied.Spec.Template.Spec.Containers, 1) - - envVars := depCapture.Applied.Spec.Template.Spec.Containers[0].Env - envMap := make(map[string]string) - for _, env := range envVars { - envMap[env.Name] = env.Value - } - - require.Equal(t, "ws_123", envMap["UNKEY_WORKSPACE_ID"]) - require.Equal(t, "prj_123", envMap["UNKEY_PROJECT_ID"]) - require.Equal(t, "env_123", envMap["UNKEY_ENVIRONMENT_ID"]) - require.Equal(t, "sent_123", envMap["UNKEY_SENTINEL_ID"]) - require.Equal(t, "test-region", envMap["UNKEY_REGION"]) -} - -func TestApplySentinel_ServiceHasCorrectPort(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddDeploymentPatchReactor(client) - svcCapture := AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, svcCapture.Applied) - require.Len(t, svcCapture.Applied.Spec.Ports, 1) - require.Equal(t, int32(8040), svcCapture.Applied.Spec.Ports[0].Port) - require.Equal(t, corev1.ProtocolTCP, svcCapture.Applied.Spec.Ports[0].Protocol) -} - -func TestApplySentinel_ServiceHasCorrectSelector(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddDeploymentPatchReactor(client) - svcCapture := AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, svcCapture.Applied) - require.Equal(t, "sent_123", svcCapture.Applied.Spec.Selector["unkey.com/sentinel.id"]) -} - -func TestApplySentinel_ServiceHasOwnerReference(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddDeploymentPatchReactor(client) - svcCapture := AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, svcCapture.Applied) - require.Len(t, svcCapture.Applied.OwnerReferences, 1) - require.Equal(t, "Deployment", svcCapture.Applied.OwnerReferences[0].Kind) - require.Equal(t, "test-sentinel", svcCapture.Applied.OwnerReferences[0].Name) -} - -func TestApplySentinel_DeploymentSetsTypeMeta(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - depCapture := AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, depCapture.Applied) - require.Equal(t, "apps/v1", depCapture.Applied.APIVersion) - require.Equal(t, "Deployment", depCapture.Applied.Kind) -} - -func TestApplySentinel_ServiceSetsTypeMeta(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddDeploymentPatchReactor(client) - svcCapture := AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, svcCapture.Applied) - require.Equal(t, "v1", svcCapture.Applied.APIVersion) - require.Equal(t, "Service", svcCapture.Applied.Kind) -} - -// TestApplySentinel_EnsuresNamespaceExists verifies that ApplySentinel creates -// the target namespace if it doesn't already exist before applying the deployment -// and service resources. -func TestApplySentinel_EnsuresNamespaceExists(t *testing.T) { - ctx := context.Background() - client := NewFakeClientWithoutNamespace(t) - nsCapture := AddNamespaceCreateTracker(client) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - require.True(t, nsCapture.Created, "namespace should be created if missing") -} - -// TestApplySentinel_DeploymentPatchError verifies that errors from the Kubernetes -// API during server-side apply of the Deployment resource are properly propagated -// back to the caller. -func TestApplySentinel_DeploymentPatchError(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddPatchErrorReactor(client, "deployments", fmt.Errorf("simulated deployment patch failure")) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.Error(t, err) - require.Contains(t, err.Error(), "simulated deployment patch failure") -} - -// TestApplySentinel_ServicePatchError verifies that errors from the Kubernetes -// API during server-side apply of the Service resource are properly propagated -// back to the caller, even when the Deployment was applied successfully. -func TestApplySentinel_ServicePatchError(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddDeploymentPatchReactor(client) - AddPatchErrorReactor(client, "services", fmt.Errorf("simulated service patch failure")) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.Error(t, err) - require.Contains(t, err.Error(), "simulated service patch failure") -} - -func TestApplySentinel_CallsUpdateSentinelState(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - mockCluster := &MockClusterClient{} - r := NewTestReconciler(client, mockCluster) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.Len(t, mockCluster.UpdateSentinelStateCalls, 1) - call := mockCluster.UpdateSentinelStateCalls[0] - require.Equal(t, req.GetK8SName(), call.GetK8SName()) -} - -func TestApplySentinel_ControlPlaneError(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - - mockCluster := &MockClusterClient{ - UpdateSentinelStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.UpdateSentinelStateRequest]) (*connect.Response[ctrlv1.UpdateSentinelStateResponse], error) { - return nil, fmt.Errorf("control plane unavailable") - }, - } - r := NewTestReconciler(client, mockCluster) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.Error(t, err) - require.Contains(t, err.Error(), "control plane unavailable") -} - -func TestApplySentinel_SetsTolerations(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - depCapture := AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - err := r.ApplySentinel(ctx, req) - require.NoError(t, err) - - require.NotNil(t, depCapture.Applied) - require.Len(t, depCapture.Applied.Spec.Template.Spec.Tolerations, 1) - - toleration := depCapture.Applied.Spec.Template.Spec.Tolerations[0] - require.Equal(t, "node-class", toleration.Key) - require.Equal(t, corev1.TolerationOpEqual, toleration.Operator) - require.Equal(t, sentinelToleration.Value, toleration.Value) - require.Equal(t, corev1.TaintEffectNoSchedule, toleration.Effect) -} - -func TestApplySentinel_ValidationErrors(t *testing.T) { - tests := []struct { - name string - mutate func(*ctrlv1.ApplySentinel) - }{ - { - name: "missing workspace id", - mutate: func(req *ctrlv1.ApplySentinel) { req.WorkspaceId = "" }, - }, - { - name: "missing project id", - mutate: func(req *ctrlv1.ApplySentinel) { req.ProjectId = "" }, - }, - { - name: "missing environment id", - mutate: func(req *ctrlv1.ApplySentinel) { req.EnvironmentId = "" }, - }, - { - name: "missing sentinel id", - mutate: func(req *ctrlv1.ApplySentinel) { req.SentinelId = "" }, - }, - { - name: "missing k8s name", - mutate: func(req *ctrlv1.ApplySentinel) { req.K8SName = "" }, - }, - { - name: "missing image", - mutate: func(req *ctrlv1.ApplySentinel) { req.Image = "" }, - }, - { - name: "zero cpu millicores", - mutate: func(req *ctrlv1.ApplySentinel) { req.CpuMillicores = 0 }, - }, - { - name: "zero memory", - mutate: func(req *ctrlv1.ApplySentinel) { req.MemoryMib = 0 }, - }, - { - name: "negative replicas", - mutate: func(req *ctrlv1.ApplySentinel) { req.Replicas = -1 }, - }, - { - name: "negative cpu millicores", - mutate: func(req *ctrlv1.ApplySentinel) { req.CpuMillicores = -100 }, - }, - { - name: "negative memory", - mutate: func(req *ctrlv1.ApplySentinel) { req.MemoryMib = -128 }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - r := NewTestReconciler(client, nil) - - req := newApplySentinelRequest() - tt.mutate(req) - - err := r.ApplySentinel(ctx, req) - require.Error(t, err) - }) - } -} diff --git a/svc/krane/internal/reconciler/consts.go b/svc/krane/internal/reconciler/consts.go deleted file mode 100644 index 79ec3b585c..0000000000 --- a/svc/krane/internal/reconciler/consts.go +++ /dev/null @@ -1,10 +0,0 @@ -package reconciler - -const ( - - // runtimeClassGvisor specifies the gVisor sandbox for untrusted user workloads. - runtimeClassGvisor = "gvisor" - - // fieldManagerKrane identifies krane as the server-side apply field manager. - fieldManagerKrane = "krane" -) diff --git a/svc/krane/internal/reconciler/delete_deployment_test.go b/svc/krane/internal/reconciler/delete_deployment_test.go deleted file mode 100644 index 11ab5fdf3e..0000000000 --- a/svc/krane/internal/reconciler/delete_deployment_test.go +++ /dev/null @@ -1,110 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "testing" - - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - appsv1 "k8s.io/api/apps/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" -) - -func newDeleteDeploymentRequest() *ctrlv1.DeleteDeployment { - return &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "test-deployment", - } -} - -func TestDeleteDeployment_SuccessfullyDeletesReplicaSet(t *testing.T) { - ctx := context.Background() - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-deployment", - Namespace: "test-namespace", - }, - } - client := NewFakeClient(t, rs) - r := NewTestReconciler(client, nil) - - req := newDeleteDeploymentRequest() - err := r.DeleteDeployment(ctx, req) - require.NoError(t, err) - - _, err = client.AppsV1().ReplicaSets("test-namespace").Get(ctx, "test-deployment", metav1.GetOptions{}) - require.True(t, apierrors.IsNotFound(err), "ReplicaSet should be deleted") -} - -func TestDeleteDeployment_IgnoresNotFoundErrors(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - r := NewTestReconciler(client, nil) - - req := newDeleteDeploymentRequest() - err := r.DeleteDeployment(ctx, req) - require.NoError(t, err, "should not error when ReplicaSet doesn't exist") -} - -func TestDeleteDeployment_PropagatesForbiddenError(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - - forbiddenErr := apierrors.NewForbidden( - schema.GroupResource{Group: "apps", Resource: "replicasets"}, - "test-deployment", - fmt.Errorf("access denied"), - ) - AddErrorReactor(client, "delete", "replicasets", forbiddenErr) - - r := NewTestReconciler(client, nil) - - req := newDeleteDeploymentRequest() - err := r.DeleteDeployment(ctx, req) - require.Error(t, err) - require.True(t, apierrors.IsForbidden(err), "should propagate forbidden error") -} - -func TestDeleteDeployment_PropagatesInternalServerError(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - - internalErr := apierrors.NewInternalError(fmt.Errorf("internal server error")) - AddErrorReactor(client, "delete", "replicasets", internalErr) - - r := NewTestReconciler(client, nil) - - req := newDeleteDeploymentRequest() - err := r.DeleteDeployment(ctx, req) - require.Error(t, err) - require.True(t, apierrors.IsInternalError(err), "should propagate internal server error") -} - -func TestDeleteDeployment_CallsUpdateDeploymentStateWithDeleteChange(t *testing.T) { - ctx := context.Background() - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-deployment", - Namespace: "test-namespace", - }, - } - client := NewFakeClient(t, rs) - mockCluster := &MockClusterClient{} - r := NewTestReconciler(client, mockCluster) - - req := newDeleteDeploymentRequest() - err := r.DeleteDeployment(ctx, req) - require.NoError(t, err) - - require.Len(t, mockCluster.UpdateDeploymentStateCalls, 1) - call := mockCluster.UpdateDeploymentStateCalls[0] - - deleteChange, ok := call.GetChange().(*ctrlv1.UpdateDeploymentStateRequest_Delete_) - require.True(t, ok, "Change should be a Delete type") - require.Equal(t, "test-deployment", deleteChange.Delete.GetK8SName()) -} diff --git a/svc/krane/internal/reconciler/delete_sentinel_test.go b/svc/krane/internal/reconciler/delete_sentinel_test.go deleted file mode 100644 index e34ef4c3ab..0000000000 --- a/svc/krane/internal/reconciler/delete_sentinel_test.go +++ /dev/null @@ -1,199 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "testing" - - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" -) - -func newDeleteSentinelRequest() *ctrlv1.DeleteSentinel { - return &ctrlv1.DeleteSentinel{ - K8SName: "test-sentinel", - } -} - -func TestDeleteSentinel_SuccessfullyDeletesServiceAndDeployment(t *testing.T) { - ctx := context.Background() - - svc := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: NamespaceSentinel, - }, - } - dep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: NamespaceSentinel, - }, - } - client := NewFakeClient(t, svc, dep) - r := NewTestReconciler(client, nil) - - req := newDeleteSentinelRequest() - err := r.DeleteSentinel(ctx, req) - require.NoError(t, err) - - _, err = client.CoreV1().Services(NamespaceSentinel).Get(ctx, "test-sentinel", metav1.GetOptions{}) - require.True(t, apierrors.IsNotFound(err), "Service should be deleted") - - _, err = client.AppsV1().Deployments(NamespaceSentinel).Get(ctx, "test-sentinel", metav1.GetOptions{}) - require.True(t, apierrors.IsNotFound(err), "Deployment should be deleted") -} - -func TestDeleteSentinel_DeletesServiceBeforeDeployment(t *testing.T) { - ctx := context.Background() - - svc := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: NamespaceSentinel, - }, - } - dep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: NamespaceSentinel, - }, - } - client := NewFakeClient(t, svc, dep) - deletes := AddDeleteTracker(client) - r := NewTestReconciler(client, nil) - - req := newDeleteSentinelRequest() - err := r.DeleteSentinel(ctx, req) - require.NoError(t, err) - - require.Len(t, deletes.Actions, 2) - require.Equal(t, "services", deletes.Actions[0], "Service should be deleted first") - require.Equal(t, "deployments", deletes.Actions[1], "Deployment should be deleted second") -} - -func TestDeleteSentinel_IgnoresNotFoundOnService(t *testing.T) { - ctx := context.Background() - - dep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: NamespaceSentinel, - }, - } - client := NewFakeClient(t, dep) - r := NewTestReconciler(client, nil) - - req := newDeleteSentinelRequest() - err := r.DeleteSentinel(ctx, req) - require.NoError(t, err, "should not error when Service doesn't exist") - - _, err = client.AppsV1().Deployments(NamespaceSentinel).Get(ctx, "test-sentinel", metav1.GetOptions{}) - require.True(t, apierrors.IsNotFound(err), "Deployment should still be deleted") -} - -func TestDeleteSentinel_IgnoresNotFoundOnDeployment(t *testing.T) { - ctx := context.Background() - - svc := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: NamespaceSentinel, - }, - } - client := NewFakeClient(t, svc) - r := NewTestReconciler(client, nil) - - req := newDeleteSentinelRequest() - err := r.DeleteSentinel(ctx, req) - require.NoError(t, err, "should not error when Deployment doesn't exist") - - _, err = client.CoreV1().Services(NamespaceSentinel).Get(ctx, "test-sentinel", metav1.GetOptions{}) - require.True(t, apierrors.IsNotFound(err), "Service should still be deleted") -} - -func TestDeleteSentinel_IgnoresNotFoundOnBoth(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - r := NewTestReconciler(client, nil) - - req := newDeleteSentinelRequest() - err := r.DeleteSentinel(ctx, req) - require.NoError(t, err, "should not error when neither Service nor Deployment exist") -} - -func TestDeleteSentinel_PropagatesServiceError(t *testing.T) { - ctx := context.Background() - client := NewFakeClient(t) - - forbiddenErr := apierrors.NewForbidden( - schema.GroupResource{Group: "", Resource: "services"}, - "test-sentinel", - fmt.Errorf("access denied"), - ) - AddErrorReactor(client, "delete", "services", forbiddenErr) - - r := NewTestReconciler(client, nil) - - req := newDeleteSentinelRequest() - err := r.DeleteSentinel(ctx, req) - require.Error(t, err) - require.True(t, apierrors.IsForbidden(err), "should propagate forbidden error from Service deletion") -} - -func TestDeleteSentinel_PropagatesDeploymentError(t *testing.T) { - ctx := context.Background() - - svc := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: NamespaceSentinel, - }, - } - client := NewFakeClient(t, svc) - - internalErr := apierrors.NewInternalError(fmt.Errorf("internal server error")) - AddErrorReactor(client, "delete", "deployments", internalErr) - - r := NewTestReconciler(client, nil) - - req := newDeleteSentinelRequest() - err := r.DeleteSentinel(ctx, req) - require.Error(t, err) - require.True(t, apierrors.IsInternalError(err), "should propagate internal server error from Deployment deletion") -} - -func TestDeleteSentinel_CallsUpdateSentinelStateWithZeroReplicas(t *testing.T) { - ctx := context.Background() - - svc := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: NamespaceSentinel, - }, - } - dep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: NamespaceSentinel, - }, - } - client := NewFakeClient(t, svc, dep) - mockCluster := &MockClusterClient{} - r := NewTestReconciler(client, mockCluster) - - req := newDeleteSentinelRequest() - err := r.DeleteSentinel(ctx, req) - require.NoError(t, err) - - require.Len(t, mockCluster.UpdateSentinelStateCalls, 1) - call := mockCluster.UpdateSentinelStateCalls[0] - - require.Equal(t, "test-sentinel", call.GetK8SName()) - require.Equal(t, int32(0), call.GetAvailableReplicas()) -} diff --git a/svc/krane/internal/reconciler/doc.go b/svc/krane/internal/reconciler/doc.go deleted file mode 100644 index e4d0858f38..0000000000 --- a/svc/krane/internal/reconciler/doc.go +++ /dev/null @@ -1,73 +0,0 @@ -// Package reconciler bridges control plane deployment state to Kubernetes resources. -// -// The reconciler exists because Unkey's deployment platform uses a central -// control plane (ctrl service) as the source of truth for what should be running, -// but actual workloads run in Kubernetes clusters. This package continuously -// synchronizes the two: it receives deployment commands from the control plane -// and applies them as Kubernetes resources, then reports the actual cluster state -// back to the control plane. -// -// # Architecture -// -// The reconciler uses a dual-loop architecture for reliability. The watch loops -// ([Reconciler.watchCurrentDeployments] and [Reconciler.watchCurrentSentinels]) -// receive real-time Kubernetes events when resources change. The refresh loops -// ([Reconciler.refreshCurrentDeployments] and [Reconciler.refreshCurrentSentinels]) -// run every minute to catch any events that might have been missed due to -// network partitions, controller restarts, or watch disconnections. -// -// State flows in both directions: the control plane pushes desired state via -// [Reconciler.HandleState], and the reconciler pushes actual state back via the -// cluster client's UpdateDeploymentState and UpdateSentinelState methods. -// -// # Resource Types -// -// The reconciler manages two types of resources: -// -// Deployments are user workloads that run as Kubernetes ReplicaSets. Each -// deployment corresponds to a specific build of a user's code. The reconciler -// creates the ReplicaSet, tracks pod status, and reports instance addresses -// back to the control plane for routing. -// -// Sentinels are infrastructure components that run as Kubernetes Deployments -// with an associated Service. Each sentinel proxies traffic to user deployments -// within an environment. The reconciler manages both the Deployment and Service -// as a unit. -// -// # Error Handling -// -// Operations that fail are logged but do not crash the reconciler. The periodic -// refresh loops will retry failed operations on the next cycle. State updates -// to the control plane use a circuit breaker to prevent cascading failures -// during control plane outages. -// -// # Concurrency -// -// The [Reconciler] is safe for concurrent use. Multiple goroutines handle -// watch events, refresh cycles, and incoming [HandleState] calls simultaneously. -// Each operation acquires only the Kubernetes resources it needs and uses -// server-side apply to handle concurrent modifications gracefully. -// -// # Usage -// -// cfg := reconciler.Config{ -// ClientSet: kubeClient, -// Logger: logger, -// Cluster: clusterClient, -// Region: "us-east-1", -// } -// r := reconciler.New(cfg) -// -// if err := r.Start(ctx); err != nil { -// return fmt.Errorf("failed to start reconciler: %w", err) -// } -// -// // Process state updates from control plane stream -// for state := range stateChannel { -// if err := r.HandleState(ctx, state); err != nil { -// logger.Error("failed to handle state", "error", err) -// } -// } -// -// r.Stop() -package reconciler diff --git a/svc/krane/internal/reconciler/handle_state_test.go b/svc/krane/internal/reconciler/handle_state_test.go deleted file mode 100644 index 8b23bad9f2..0000000000 --- a/svc/krane/internal/reconciler/handle_state_test.go +++ /dev/null @@ -1,146 +0,0 @@ -package reconciler - -import ( - "context" - "testing" - - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/ptr" -) - -// These tests verify HandleState correctly routes to the appropriate handler. -// Detailed behavior of each handler is tested in their respective test files. - -func TestHandleState_DeploymentApply(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - state := &ctrlv1.State{ - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - DeploymentId: "dep_123", - K8SNamespace: "test-namespace", - K8SName: "test-deployment", - Image: "nginx:1.19", - Replicas: 3, - CpuMillicores: 100, - MemoryMib: 128, - BuildId: ptr.P("build_123"), - }, - }, - }, - }, - } - - _, err := r.HandleState(ctx, state) - require.NoError(t, err) - require.NotNil(t, h.ReplicaSets.Applied, "should route to ApplyDeployment") -} - -func TestHandleState_DeploymentDelete(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - state := &ctrlv1.State{ - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "test-deployment", - }, - }, - }, - }, - } - - _, err := r.HandleState(ctx, state) - require.NoError(t, err) - require.Contains(t, h.Deletes.Actions, "replicasets", "should route to DeleteDeployment") -} - -func TestHandleState_SentinelApply(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - state := &ctrlv1.State{ - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - SentinelId: "sentinel_123", - K8SName: "test-sentinel", - Image: "sentinel:1.0", - Replicas: 2, - CpuMillicores: 100, - MemoryMib: 128, - }, - }, - }, - }, - } - - _, err := r.HandleState(ctx, state) - require.NoError(t, err) - require.NotNil(t, h.Deployments.Applied, "should route to ApplySentinel (Deployment)") - require.NotNil(t, h.Services.Applied, "should route to ApplySentinel (Service)") -} - -func TestHandleState_SentinelDelete(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - state := &ctrlv1.State{ - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Delete{ - Delete: &ctrlv1.DeleteSentinel{ - K8SName: "test-sentinel", - }, - }, - }, - }, - } - - _, err := r.HandleState(ctx, state) - require.NoError(t, err) - require.Contains(t, h.Deletes.Actions, "services", "should route to DeleteSentinel (Service)") - require.Contains(t, h.Deletes.Actions, "deployments", "should route to DeleteSentinel (Deployment)") -} - -func TestHandleState_UnknownStateType(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - state := &ctrlv1.State{ - Kind: nil, - } - - _, err := r.HandleState(ctx, state) - require.Error(t, err) - require.Contains(t, err.Error(), "unknown state type") -} - -func TestHandleState_NilState(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - _, err := r.HandleState(ctx, nil) - require.Error(t, err) - require.Contains(t, err.Error(), "state is nil") -} diff --git a/svc/krane/internal/reconciler/mock_cluster_client_test.go b/svc/krane/internal/reconciler/mock_cluster_client_test.go deleted file mode 100644 index 6ceabb0b9a..0000000000 --- a/svc/krane/internal/reconciler/mock_cluster_client_test.go +++ /dev/null @@ -1,64 +0,0 @@ -package reconciler - -import ( - "context" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" -) - -var _ ctrlv1connect.ClusterServiceClient = (*MockClusterClient)(nil) - -// MockClusterClient is a test double for the control plane's cluster service. -// -// Each method has an optional function field (e.g., WatchFunc) that tests can set -// to customize behavior. If the function is nil, the method returns a sensible -// default. The mock also records all UpdateDeploymentState and UpdateSentinelState -// calls so tests can verify the reconciler reported the correct state. -type MockClusterClient struct { - SyncFunc func(context.Context, *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) - GetDesiredSentinelStateFunc func(context.Context, *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) - UpdateSentinelStateFunc func(context.Context, *connect.Request[ctrlv1.UpdateSentinelStateRequest]) (*connect.Response[ctrlv1.UpdateSentinelStateResponse], error) - GetDesiredDeploymentStateFunc func(context.Context, *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) - UpdateDeploymentStateFunc func(context.Context, *connect.Request[ctrlv1.UpdateDeploymentStateRequest]) (*connect.Response[ctrlv1.UpdateDeploymentStateResponse], error) - UpdateDeploymentStateCalls []*ctrlv1.UpdateDeploymentStateRequest - UpdateSentinelStateCalls []*ctrlv1.UpdateSentinelStateRequest -} - -func (m *MockClusterClient) Sync(ctx context.Context, req *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { - if m.SyncFunc != nil { - return m.SyncFunc(ctx, req) - } - return nil, nil -} - -func (m *MockClusterClient) GetDesiredSentinelState(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) { - if m.GetDesiredSentinelStateFunc != nil { - return m.GetDesiredSentinelStateFunc(ctx, req) - } - return connect.NewResponse(&ctrlv1.SentinelState{}), nil -} - -func (m *MockClusterClient) UpdateSentinelState(ctx context.Context, req *connect.Request[ctrlv1.UpdateSentinelStateRequest]) (*connect.Response[ctrlv1.UpdateSentinelStateResponse], error) { - m.UpdateSentinelStateCalls = append(m.UpdateSentinelStateCalls, req.Msg) - if m.UpdateSentinelStateFunc != nil { - return m.UpdateSentinelStateFunc(ctx, req) - } - return connect.NewResponse(&ctrlv1.UpdateSentinelStateResponse{}), nil -} - -func (m *MockClusterClient) GetDesiredDeploymentState(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) { - if m.GetDesiredDeploymentStateFunc != nil { - return m.GetDesiredDeploymentStateFunc(ctx, req) - } - return connect.NewResponse(&ctrlv1.DeploymentState{}), nil -} - -func (m *MockClusterClient) UpdateDeploymentState(ctx context.Context, req *connect.Request[ctrlv1.UpdateDeploymentStateRequest]) (*connect.Response[ctrlv1.UpdateDeploymentStateResponse], error) { - m.UpdateDeploymentStateCalls = append(m.UpdateDeploymentStateCalls, req.Msg) - if m.UpdateDeploymentStateFunc != nil { - return m.UpdateDeploymentStateFunc(ctx, req) - } - return connect.NewResponse(&ctrlv1.UpdateDeploymentStateResponse{}), nil -} diff --git a/svc/krane/internal/reconciler/namespace_test.go b/svc/krane/internal/reconciler/namespace_test.go deleted file mode 100644 index 4a2dd0f90f..0000000000 --- a/svc/krane/internal/reconciler/namespace_test.go +++ /dev/null @@ -1,250 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "testing" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/pkg/circuitbreaker" - "github.com/unkeyed/unkey/pkg/otel/logging" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - dynamicfake "k8s.io/client-go/dynamic/fake" - "k8s.io/client-go/kubernetes/fake" - k8stesting "k8s.io/client-go/testing" -) - -var ciliumGVR = schema.GroupVersionResource{ - Group: "cilium.io", - Version: "v2", - Resource: "ciliumnetworkpolicies", -} - -// newFakeDynamicClient creates a fake dynamic client for testing CiliumNetworkPolicy operations -func newFakeDynamicClient() *dynamicfake.FakeDynamicClient { - scheme := runtime.NewScheme() - gvrToListKind := map[schema.GroupVersionResource]string{ - ciliumGVR: "CiliumNetworkPolicyList", - } - return dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme, gvrToListKind) -} - -// newFakeDynamicClientWithPatchReactor creates a dynamic client that properly handles Apply (patch) operations -func newFakeDynamicClientWithPatchReactor(onPatch func(action k8stesting.Action)) *dynamicfake.FakeDynamicClient { - client := newFakeDynamicClient() - client.PrependReactor("patch", "ciliumnetworkpolicies", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - if onPatch != nil { - onPatch(action) - } - patchAction := action.(k8stesting.PatchAction) - // Return a valid unstructured object for Apply - obj := &unstructured.Unstructured{} - obj.SetAPIVersion("cilium.io/v2") - obj.SetKind("CiliumNetworkPolicy") - obj.SetName(patchAction.GetName()) - obj.SetNamespace(patchAction.GetNamespace()) - return true, obj, nil - }) - return client -} - -func TestEnsureNamespaceExists_CreatesNamespaceIfMissing(t *testing.T) { - ctx := context.Background() - client := fake.NewSimpleClientset() - dynamicClient := newFakeDynamicClientWithPatchReactor(nil) - - var createdNamespace *corev1.Namespace - client.PrependReactor("create", "namespaces", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - createAction := action.(k8stesting.CreateAction) - createdNamespace = createAction.GetObject().(*corev1.Namespace) - return false, createdNamespace, nil - }) - - r := &Reconciler{ - clientSet: client, - dynamicClient: dynamicClient, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.ensureNamespaceExists(ctx, "new-namespace", "ws-123", "env-456") - require.NoError(t, err) - require.NotNil(t, createdNamespace, "namespace should be created") - require.Equal(t, "new-namespace", createdNamespace.Name) -} - -func TestEnsureNamespaceExists_IdempotentWhenNamespaceExists(t *testing.T) { - ctx := context.Background() - - existingNamespace := &corev1.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - Name: "existing-namespace", - }, - } - client := fake.NewSimpleClientset(existingNamespace) - dynamicClient := newFakeDynamicClientWithPatchReactor(nil) - - var createCount int - client.PrependReactor("create", "namespaces", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - createCount++ - return true, nil, errors.NewAlreadyExists(schema.GroupResource{Group: "", Resource: "namespaces"}, "existing-namespace") - }) - - r := &Reconciler{ - clientSet: client, - dynamicClient: dynamicClient, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.ensureNamespaceExists(ctx, "existing-namespace", "ws-123", "env-456") - require.NoError(t, err, "should not error when namespace already exists") - require.Equal(t, 1, createCount, "should attempt creation exactly once") - - err = r.ensureNamespaceExists(ctx, "existing-namespace", "ws-123", "env-456") - require.NoError(t, err, "should remain idempotent on repeated calls") -} - -func TestEnsureNamespaceExists_HandlesCreationError(t *testing.T) { - ctx := context.Background() - client := fake.NewSimpleClientset() - dynamicClient := newFakeDynamicClientWithPatchReactor(nil) - - expectedErr := fmt.Errorf("k8s API unavailable") - client.PrependReactor("create", "namespaces", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - return true, nil, expectedErr - }) - - r := &Reconciler{ - clientSet: client, - dynamicClient: dynamicClient, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.ensureNamespaceExists(ctx, "test-namespace", "ws-123", "env-456") - require.Error(t, err) - require.Equal(t, expectedErr, err) -} - -func TestEnsureNamespaceExists_CorrectNamespaceMetadata(t *testing.T) { - ctx := context.Background() - client := fake.NewSimpleClientset() - dynamicClient := newFakeDynamicClientWithPatchReactor(nil) - - var createdNamespace *corev1.Namespace - client.PrependReactor("create", "namespaces", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - createAction := action.(k8stesting.CreateAction) - createdNamespace = createAction.GetObject().(*corev1.Namespace) - return false, createdNamespace, nil - }) - - r := &Reconciler{ - clientSet: client, - dynamicClient: dynamicClient, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.ensureNamespaceExists(ctx, "my-namespace", "ws-123", "env-456") - require.NoError(t, err) - - require.NotNil(t, createdNamespace) - require.Equal(t, "my-namespace", createdNamespace.Name) -} - -func TestEnsureNamespaceExists_CreatesCiliumPolicyForCustomerNamespace(t *testing.T) { - ctx := context.Background() - client := fake.NewSimpleClientset() - - var createdPolicy *unstructured.Unstructured - dynamicClient := newFakeDynamicClientWithPatchReactor(func(action k8stesting.Action) { - patchAction := action.(k8stesting.PatchAction) - createdPolicy = &unstructured.Unstructured{} - createdPolicy.SetName(patchAction.GetName()) - createdPolicy.SetNamespace(patchAction.GetNamespace()) - }) - - r := &Reconciler{ - clientSet: client, - dynamicClient: dynamicClient, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.ensureNamespaceExists(ctx, "customer-namespace", "ws-123", "env-456") - require.NoError(t, err) - require.NotNil(t, createdPolicy, "CiliumNetworkPolicy should be created") - require.Equal(t, "allow-sentinel-ingress", createdPolicy.GetName()) - require.Equal(t, "customer-namespace", createdPolicy.GetNamespace()) -} - -func TestEnsureNamespaceExists_SkipsCiliumPolicyForSentinelNamespace(t *testing.T) { - ctx := context.Background() - client := fake.NewSimpleClientset() - - var policyCreated bool - dynamicClient := newFakeDynamicClientWithPatchReactor(func(action k8stesting.Action) { - policyCreated = true - }) - - r := &Reconciler{ - clientSet: client, - dynamicClient: dynamicClient, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - // Sentinel namespace should NOT create a CiliumNetworkPolicy - err := r.ensureNamespaceExists(ctx, NamespaceSentinel, "", "") - require.NoError(t, err) - require.False(t, policyCreated, "CiliumNetworkPolicy should NOT be created for sentinel namespace") -} - -func TestApplyCiliumPolicyForNamespace_CorrectPolicySpec(t *testing.T) { - ctx := context.Background() - client := fake.NewSimpleClientset() - - var appliedPatch []byte - dynamicClient := newFakeDynamicClientWithPatchReactor(func(action k8stesting.Action) { - patchAction := action.(k8stesting.PatchAction) - appliedPatch = patchAction.GetPatch() - }) - - r := &Reconciler{ - clientSet: client, - dynamicClient: dynamicClient, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.applyCiliumPolicyForNamespace(ctx, "test-namespace", "ws-test", "env-test") - require.NoError(t, err) - require.NotEmpty(t, appliedPatch, "patch should be applied") - - // Verify the patch contains the expected values - patchStr := string(appliedPatch) - require.Contains(t, patchStr, "allow-sentinel-ingress") - require.Contains(t, patchStr, "ws-test") - require.Contains(t, patchStr, "env-test") - require.Contains(t, patchStr, "8080") -} diff --git a/svc/krane/internal/reconciler/reconciler.go b/svc/krane/internal/reconciler/reconciler.go deleted file mode 100644 index 474b7e6953..0000000000 --- a/svc/krane/internal/reconciler/reconciler.go +++ /dev/null @@ -1,138 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" - "github.com/unkeyed/unkey/pkg/circuitbreaker" - "github.com/unkeyed/unkey/pkg/otel/logging" - "k8s.io/client-go/dynamic" - "k8s.io/client-go/kubernetes" -) - -// Reconciler synchronizes control plane deployment state with Kubernetes resources. -// -// The reconciler maintains bidirectional state synchronization: it receives desired -// state from the control plane via [HandleState] and applies it to Kubernetes, then -// watches Kubernetes for actual state changes and reports them back. This dual-flow -// ensures the control plane always knows what's actually running in the cluster. -// -// A single Reconciler manages all deployments and sentinels in its cluster. It uses -// background goroutines for watching and refreshing, so callers must call [Start] -// before processing state and [Stop] during shutdown. -type Reconciler struct { - clientSet kubernetes.Interface - dynamicClient dynamic.Interface - logger logging.Logger - cluster ctrlv1connect.ClusterServiceClient - cb circuitbreaker.CircuitBreaker[any] - done chan struct{} - region string - versionLastSeen uint64 -} - -// Config holds the configuration required to create a new [Reconciler]. -// All fields are required. -type Config struct { - ClientSet kubernetes.Interface - DynamicClient dynamic.Interface - Logger logging.Logger - Cluster ctrlv1connect.ClusterServiceClient - Region string -} - -// New creates a [Reconciler] ready to be started with [Reconciler.Start]. -func New(cfg Config) *Reconciler { - return &Reconciler{ - clientSet: cfg.ClientSet, - dynamicClient: cfg.DynamicClient, - logger: cfg.Logger, - cluster: cfg.Cluster, - cb: circuitbreaker.New[any]("reconciler_state_update"), - done: make(chan struct{}), - region: cfg.Region, - versionLastSeen: 0, - } -} - -// Start launches the background watch and refresh loops for Kubernetes resources. -// -// The watch loops provide real-time state updates when resources change. The refresh -// loops run every minute to catch missed events and ensure eventual consistency. -// Both loops continue until the context is cancelled or [Reconciler.Stop] is called. -func (r *Reconciler) Start(ctx context.Context) error { - go r.refreshCurrentDeployments(ctx) - go r.refreshCurrentSentinels(ctx) - - if err := r.watchCurrentSentinels(ctx); err != nil { - return err - } - if err := r.watchCurrentDeployments(ctx); err != nil { - return err - } - - go r.Watch(ctx) - - return nil -} - -// HandleState applies a single state update from the control plane to the cluster. -// -// The state contains either a deployment or sentinel operation (apply or delete). -// For apply operations, HandleState creates or updates the Kubernetes resource and -// reports the resulting state back to the control plane. For delete operations, it -// removes the resource and confirms deletion. -// -// HandleState returns immediately after processing the single state update. It does -// not block waiting for additional updates; use it within a loop that reads from -// the control plane's state stream. -// -// The version from the state is returned so the caller can track progress. The caller -// is responsible for committing the version only after the stream closes cleanly. -// This ensures atomic bootstrap: if a stream breaks mid-bootstrap, the client retries -// from version 0 rather than skipping resources that were never received. -func (r *Reconciler) HandleState(ctx context.Context, state *ctrlv1.State) (uint64, error) { - if state == nil { - return 0, fmt.Errorf("state is nil") - } - - version := state.GetVersion() - - switch kind := state.GetKind().(type) { - case *ctrlv1.State_Deployment: - switch op := kind.Deployment.GetState().(type) { - case *ctrlv1.DeploymentState_Apply: - if err := r.ApplyDeployment(ctx, op.Apply); err != nil { - return 0, err - } - case *ctrlv1.DeploymentState_Delete: - if err := r.DeleteDeployment(ctx, op.Delete); err != nil { - return 0, err - } - } - case *ctrlv1.State_Sentinel: - switch op := kind.Sentinel.GetState().(type) { - case *ctrlv1.SentinelState_Apply: - if err := r.ApplySentinel(ctx, op.Apply); err != nil { - return 0, err - } - case *ctrlv1.SentinelState_Delete: - if err := r.DeleteSentinel(ctx, op.Delete); err != nil { - return 0, err - } - } - default: - return 0, fmt.Errorf("unknown state type: %T", kind) - } - - return version, nil -} - -// Stop signals all background goroutines to terminate. Safe to call multiple -// times, but not concurrently with itself. -func (r *Reconciler) Stop() error { - close(r.done) - return nil -} diff --git a/svc/krane/internal/reconciler/refresh_current_deployments.go b/svc/krane/internal/reconciler/refresh_current_deployments.go deleted file mode 100644 index f607d871a7..0000000000 --- a/svc/krane/internal/reconciler/refresh_current_deployments.go +++ /dev/null @@ -1,79 +0,0 @@ -package reconciler - -import ( - "context" - "time" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/repeat" - "github.com/unkeyed/unkey/svc/krane/pkg/labels" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// refreshCurrentDeployments periodically reconciles all deployment ReplicaSets with -// their desired state from the control plane. -// -// This function runs every minute as a consistency safety net. While -// [Reconciler.watchCurrentDeployments] handles real-time events, watches can miss -// events during network partitions, controller restarts, or if the watch channel -// buffer overflows. This refresh loop guarantees eventual consistency by querying -// the control plane for each existing ReplicaSet and applying any needed changes. -// -// The function paginates through all krane-managed ReplicaSets across all namespaces -// to handle clusters with large numbers of deployments. -func (r *Reconciler) refreshCurrentDeployments(ctx context.Context) { - repeat.Every(1*time.Minute, func() { - r.logger.Info("refreshing current deployments") - - cursor := "" - for { - - replicaSets, err := r.clientSet.AppsV1().ReplicaSets("").List(ctx, metav1.ListOptions{ - LabelSelector: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - ToString(), - Continue: cursor, - }) - - if err != nil { - r.logger.Error("unable to list replicaSets", "error", err.Error()) - return - } - - for _, replicaSet := range replicaSets.Items { - - deploymentID, ok := labels.GetDeploymentID(replicaSet.Labels) - if !ok { - r.logger.Error("unable to get deployment ID", "error", "replicaSet", replicaSet) - continue - } - - res, err := r.cluster.GetDesiredDeploymentState(ctx, connect.NewRequest(&ctrlv1.GetDesiredDeploymentStateRequest{ - DeploymentId: deploymentID, - })) - if err != nil { - r.logger.Error("unable to get desired deployment state", "error", err.Error(), "deployment_id", deploymentID) - continue - } - - switch res.Msg.GetState().(type) { - case *ctrlv1.DeploymentState_Apply: - if err := r.ApplyDeployment(ctx, res.Msg.GetApply()); err != nil { - r.logger.Error("unable to apply deployment", "error", err.Error(), "deployment_id", deploymentID) - } - case *ctrlv1.DeploymentState_Delete: - if err := r.DeleteDeployment(ctx, res.Msg.GetDelete()); err != nil { - r.logger.Error("unable to delete deployment", "error", err.Error(), "deployment_id", deploymentID) - } - } - } - cursor = replicaSets.Continue - if cursor == "" { - break - } - } - - }) -} diff --git a/svc/krane/internal/reconciler/refresh_current_deployments_test.go b/svc/krane/internal/reconciler/refresh_current_deployments_test.go deleted file mode 100644 index d5b6cffb88..0000000000 --- a/svc/krane/internal/reconciler/refresh_current_deployments_test.go +++ /dev/null @@ -1,364 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "sync/atomic" - "testing" - "time" - - "connectrpc.com/connect" - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/circuitbreaker" - "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/svc/krane/pkg/labels" - appsv1 "k8s.io/api/apps/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/kubernetes/fake" - k8stesting "k8s.io/client-go/testing" -) - -func TestRefreshCurrentDeployments_ListsAllResources(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - rs1 := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "rs-1", - Namespace: "ns-1", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_1"), - }, - } - rs2 := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "rs-2", - Namespace: "ns-2", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_2"), - }, - } - - client := fake.NewSimpleClientset(rs1, rs2) - - var getDesiredCalls atomic.Int32 - mockCluster := &MockClusterClient{ - GetDesiredDeploymentStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) { - getDesiredCalls.Add(1) - return connect.NewResponse(&ctrlv1.DeploymentState{}), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentDeployments(ctx) - - require.Eventually(t, func() bool { - return getDesiredCalls.Load() >= 2 - }, 2*time.Second, 50*time.Millisecond, "expected GetDesiredDeploymentState to be called for each replicaset") -} - -func TestRefreshCurrentDeployments_HandlesEmptyList(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - - var listCalled atomic.Bool - client.PrependReactor("list", "replicasets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - listCalled.Store(true) - return false, nil, nil - }) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentDeployments(ctx) - - require.Eventually(t, func() bool { - return listCalled.Load() - }, 2*time.Second, 50*time.Millisecond, "expected list to be called") - - require.Len(t, mockCluster.UpdateDeploymentStateCalls, 0, "no updates should be made for empty list") -} - -func TestRefreshCurrentDeployments_HandlesListError(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - - var listErrorCalled atomic.Bool - client.PrependReactor("list", "replicasets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - listErrorCalled.Store(true) - return true, nil, fmt.Errorf("simulated list error") - }) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentDeployments(ctx) - - require.Eventually(t, func() bool { - return listErrorCalled.Load() - }, 2*time.Second, 50*time.Millisecond, "expected list to be called with error") -} - -func TestRefreshCurrentDeployments_PeriodicRefresh(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - - var listCallCount atomic.Int32 - client.PrependReactor("list", "replicasets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - listCallCount.Add(1) - return false, nil, nil - }) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentDeployments(ctx) - - require.Eventually(t, func() bool { - return listCallCount.Load() >= 1 - }, 2*time.Second, 50*time.Millisecond, "expected at least one list call (immediate execution)") -} - -func TestRefreshCurrentDeployments_AppliesDesiredState(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "rs-1", - Namespace: "test-namespace", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_1"), - }, - } - - client := fake.NewSimpleClientset(rs) - - var applyCalled atomic.Bool - mockCluster := &MockClusterClient{ - GetDesiredDeploymentStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) { - applyCalled.Store(true) - return connect.NewResponse(&ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - DeploymentId: "dep_1", - K8SNamespace: "test-namespace", - K8SName: "rs-1", - Image: "nginx:1.19", - Replicas: 1, - CpuMillicores: 100, - MemoryMib: 128, - }, - }, - }), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentDeployments(ctx) - - require.Eventually(t, func() bool { - return applyCalled.Load() - }, 2*time.Second, 50*time.Millisecond, "expected apply to be called") -} - -func TestRefreshCurrentDeployments_DeletesDesiredState(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "rs-1", - Namespace: "test-namespace", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_1"), - }, - } - - client := fake.NewSimpleClientset(rs) - - var deleteCalled atomic.Bool - mockCluster := &MockClusterClient{ - GetDesiredDeploymentStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) { - deleteCalled.Store(true) - return connect.NewResponse(&ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "rs-1", - }, - }, - }), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentDeployments(ctx) - - require.Eventually(t, func() bool { - return deleteCalled.Load() - }, 2*time.Second, 50*time.Millisecond, "expected delete to be called") -} - -func TestRefreshCurrentDeployments_HandlesGetDesiredStateError(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - rs1 := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "rs-1", - Namespace: "ns-1", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_1"), - }, - } - rs2 := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "rs-2", - Namespace: "ns-2", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_2"), - }, - } - - client := fake.NewSimpleClientset(rs1, rs2) - - var getDesiredCalls atomic.Int32 - mockCluster := &MockClusterClient{ - GetDesiredDeploymentStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) { - count := getDesiredCalls.Add(1) - if count == 1 { - return nil, fmt.Errorf("simulated control plane error") - } - return connect.NewResponse(&ctrlv1.DeploymentState{}), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentDeployments(ctx) - - require.Eventually(t, func() bool { - return getDesiredCalls.Load() >= 2 - }, 2*time.Second, 50*time.Millisecond, "expected both replicasets to be processed despite first error") -} - -func TestRefreshCurrentDeployments_HandlesMissingDeploymentID(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - rsWithoutID := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "rs-no-id", - Namespace: "ns-1", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(), - }, - } - rsWithID := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "rs-with-id", - Namespace: "ns-2", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_1"), - }, - } - - client := fake.NewSimpleClientset(rsWithoutID, rsWithID) - - var getDesiredCalls atomic.Int32 - mockCluster := &MockClusterClient{ - GetDesiredDeploymentStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) { - getDesiredCalls.Add(1) - return connect.NewResponse(&ctrlv1.DeploymentState{}), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentDeployments(ctx) - - require.Eventually(t, func() bool { - return getDesiredCalls.Load() >= 1 - }, 2*time.Second, 50*time.Millisecond, "expected only replicaset with ID to be processed") - - time.Sleep(200 * time.Millisecond) - require.Equal(t, int32(1), getDesiredCalls.Load(), "only one replicaset should be processed (the one with ID)") -} diff --git a/svc/krane/internal/reconciler/refresh_current_sentinels.go b/svc/krane/internal/reconciler/refresh_current_sentinels.go deleted file mode 100644 index f1717a5344..0000000000 --- a/svc/krane/internal/reconciler/refresh_current_sentinels.go +++ /dev/null @@ -1,79 +0,0 @@ -package reconciler - -import ( - "context" - "time" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/repeat" - "github.com/unkeyed/unkey/svc/krane/pkg/labels" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// refreshCurrentSentinels periodically reconciles all sentinel Deployments with -// their desired state from the control plane. -// -// This function runs every minute as a consistency safety net. While -// [Reconciler.watchCurrentSentinels] handles real-time events, watches can miss -// events during network partitions, controller restarts, or if the watch channel -// buffer overflows. This refresh loop guarantees eventual consistency by querying -// the control plane for each existing sentinel and applying any needed changes. -// -// The function paginates through all krane-managed sentinel Deployments across all -// namespaces to handle clusters with large numbers of environments. -func (r *Reconciler) refreshCurrentSentinels(ctx context.Context) { - - repeat.Every(1*time.Minute, func() { - - cursor := "" - for { - - deployments, err := r.clientSet.AppsV1().Deployments(NamespaceSentinel).List(ctx, metav1.ListOptions{ - LabelSelector: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - ToString(), - Continue: cursor, - }) - - if err != nil { - r.logger.Error("unable to list deployments", "error", err.Error()) - return - } - - for _, deployment := range deployments.Items { - - sentinelID, ok := labels.GetSentinelID(deployment.Labels) - if !ok { - r.logger.Error("unable to get sentinel ID", "error", "deployment", deployment) - continue - } - - res, err := r.cluster.GetDesiredSentinelState(ctx, connect.NewRequest(&ctrlv1.GetDesiredSentinelStateRequest{ - SentinelId: sentinelID, - })) - if err != nil { - r.logger.Error("unable to get desired sentinel state", "error", err.Error(), "sentinel_id", sentinelID) - continue - } - - switch res.Msg.GetState().(type) { - case *ctrlv1.SentinelState_Apply: - if err := r.ApplySentinel(ctx, res.Msg.GetApply()); err != nil { - r.logger.Error("unable to apply sentinel", "error", err.Error(), "sentinel_id", sentinelID) - } - case *ctrlv1.SentinelState_Delete: - if err := r.DeleteSentinel(ctx, res.Msg.GetDelete()); err != nil { - r.logger.Error("unable to delete sentinel", "error", err.Error(), "sentinel_id", sentinelID) - } - } - } - cursor = deployments.Continue - if cursor == "" { - break - } - } - - }) -} diff --git a/svc/krane/internal/reconciler/refresh_current_sentinels_test.go b/svc/krane/internal/reconciler/refresh_current_sentinels_test.go deleted file mode 100644 index af815c88ef..0000000000 --- a/svc/krane/internal/reconciler/refresh_current_sentinels_test.go +++ /dev/null @@ -1,362 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "sync/atomic" - "testing" - "time" - - "connectrpc.com/connect" - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/circuitbreaker" - "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/svc/krane/pkg/labels" - appsv1 "k8s.io/api/apps/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/kubernetes/fake" - k8stesting "k8s.io/client-go/testing" -) - -func TestRefreshCurrentSentinels_ListsAllResources(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - dep1 := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "dep-1", - Namespace: NamespaceSentinel, - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_1"), - }, - } - dep2 := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "dep-2", - Namespace: NamespaceSentinel, - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_2"), - }, - } - - client := fake.NewSimpleClientset(dep1, dep2) - - var getDesiredCalls atomic.Int32 - mockCluster := &MockClusterClient{ - GetDesiredSentinelStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) { - getDesiredCalls.Add(1) - return connect.NewResponse(&ctrlv1.SentinelState{}), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentSentinels(ctx) - - require.Eventually(t, func() bool { - return getDesiredCalls.Load() >= 2 - }, 2*time.Second, 50*time.Millisecond, "expected GetDesiredSentinelState to be called for each deployment") -} - -func TestRefreshCurrentSentinels_HandlesEmptyList(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - - var listCalled atomic.Bool - client.PrependReactor("list", "deployments", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - listCalled.Store(true) - return false, nil, nil - }) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentSentinels(ctx) - - require.Eventually(t, func() bool { - return listCalled.Load() - }, 2*time.Second, 50*time.Millisecond, "expected list to be called") - - require.Len(t, mockCluster.UpdateSentinelStateCalls, 0, "no updates should be made for empty list") -} - -func TestRefreshCurrentSentinels_HandlesListError(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - - var listErrorCalled atomic.Bool - client.PrependReactor("list", "deployments", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - listErrorCalled.Store(true) - return true, nil, fmt.Errorf("simulated list error") - }) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentSentinels(ctx) - - require.Eventually(t, func() bool { - return listErrorCalled.Load() - }, 2*time.Second, 50*time.Millisecond, "expected list to be called with error") -} - -func TestRefreshCurrentSentinels_PeriodicRefresh(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - - var listCallCount atomic.Int32 - client.PrependReactor("list", "deployments", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - listCallCount.Add(1) - return false, nil, nil - }) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentSentinels(ctx) - - require.Eventually(t, func() bool { - return listCallCount.Load() >= 1 - }, 2*time.Second, 50*time.Millisecond, "expected at least one list call (immediate execution)") -} - -func TestRefreshCurrentSentinels_AppliesDesiredState(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - dep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "dep-1", - Namespace: NamespaceSentinel, - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_1"), - }, - } - - client := fake.NewSimpleClientset(dep) - - var applyCalled atomic.Bool - mockCluster := &MockClusterClient{ - GetDesiredSentinelStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) { - applyCalled.Store(true) - return connect.NewResponse(&ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - SentinelId: "sent_1", - K8SName: "dep-1", - Image: "unkey/sentinel:v1.0", - Replicas: 1, - CpuMillicores: 100, - MemoryMib: 128, - }, - }, - }), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentSentinels(ctx) - - require.Eventually(t, func() bool { - return applyCalled.Load() - }, 2*time.Second, 50*time.Millisecond, "expected apply to be called") -} - -func TestRefreshCurrentSentinels_DeletesDesiredState(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - dep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "dep-1", - Namespace: NamespaceSentinel, - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_1"), - }, - } - - client := fake.NewSimpleClientset(dep) - - var deleteCalled atomic.Bool - mockCluster := &MockClusterClient{ - GetDesiredSentinelStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) { - deleteCalled.Store(true) - return connect.NewResponse(&ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Delete{ - Delete: &ctrlv1.DeleteSentinel{ - K8SName: "dep-1", - }, - }, - }), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentSentinels(ctx) - - require.Eventually(t, func() bool { - return deleteCalled.Load() - }, 2*time.Second, 50*time.Millisecond, "expected delete to be called") -} - -func TestRefreshCurrentSentinels_HandlesGetDesiredStateError(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - dep1 := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "dep-1", - Namespace: NamespaceSentinel, - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_1"), - }, - } - dep2 := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "dep-2", - Namespace: NamespaceSentinel, - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_2"), - }, - } - - client := fake.NewSimpleClientset(dep1, dep2) - - var getDesiredCalls atomic.Int32 - mockCluster := &MockClusterClient{ - GetDesiredSentinelStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) { - count := getDesiredCalls.Add(1) - if count == 1 { - return nil, fmt.Errorf("simulated control plane error") - } - return connect.NewResponse(&ctrlv1.SentinelState{}), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentSentinels(ctx) - - require.Eventually(t, func() bool { - return getDesiredCalls.Load() >= 2 - }, 2*time.Second, 50*time.Millisecond, "expected both deployments to be processed despite first error") -} - -func TestRefreshCurrentSentinels_HandlesMissingSentinelID(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - depWithoutID := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "dep-no-id", - Namespace: NamespaceSentinel, - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(), - }, - } - depWithID := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "dep-with-id", - Namespace: NamespaceSentinel, - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_1"), - }, - } - - client := fake.NewSimpleClientset(depWithoutID, depWithID) - - var getDesiredCalls atomic.Int32 - mockCluster := &MockClusterClient{ - GetDesiredSentinelStateFunc: func(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) { - getDesiredCalls.Add(1) - return connect.NewResponse(&ctrlv1.SentinelState{}), nil - }, - } - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - go r.refreshCurrentSentinels(ctx) - - require.Eventually(t, func() bool { - return getDesiredCalls.Load() >= 1 - }, 2*time.Second, 50*time.Millisecond, "expected only deployment with ID to be processed") - - time.Sleep(200 * time.Millisecond) - require.Equal(t, int32(1), getDesiredCalls.Load(), "only one deployment should be processed (the one with ID)") -} diff --git a/svc/krane/internal/reconciler/test_helpers_test.go b/svc/krane/internal/reconciler/test_helpers_test.go deleted file mode 100644 index 54d464e592..0000000000 --- a/svc/krane/internal/reconciler/test_helpers_test.go +++ /dev/null @@ -1,263 +0,0 @@ -package reconciler - -import ( - "encoding/json" - "testing" - - "github.com/unkeyed/unkey/pkg/otel/logging" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/types" - dynamicfake "k8s.io/client-go/dynamic/fake" - "k8s.io/client-go/kubernetes/fake" - k8stesting "k8s.io/client-go/testing" -) - -// ----------------------------------------------------------------------------- -// Fake Client Setup -// ----------------------------------------------------------------------------- - -// NewFakeClient creates a fake Kubernetes client pre-seeded with a "test-namespace" -// namespace and any additional objects passed as arguments. -func NewFakeClient(t *testing.T, objects ...runtime.Object) *fake.Clientset { - t.Helper() - allObjects := append([]runtime.Object{newTestNamespace()}, objects...) - return fake.NewSimpleClientset(allObjects...) -} - -// NewFakeClientWithoutNamespace creates a fake Kubernetes client with only the -// provided objects. Use this when testing namespace creation behavior. -func NewFakeClientWithoutNamespace(t *testing.T, objects ...runtime.Object) *fake.Clientset { - t.Helper() - return fake.NewSimpleClientset(objects...) -} - -func newTestNamespace() *corev1.Namespace { - return &corev1.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-namespace", - }, - } -} - -// ----------------------------------------------------------------------------- -// Reconciler Setup -// ----------------------------------------------------------------------------- - -// NewFakeDynamicClient creates a fake dynamic client for testing CiliumNetworkPolicy operations. -// It includes a reactor that handles server-side apply (patch) operations for CiliumNetworkPolicy. -func NewFakeDynamicClient() *dynamicfake.FakeDynamicClient { - scheme := runtime.NewScheme() - gvrToListKind := map[schema.GroupVersionResource]string{ - {Group: "cilium.io", Version: "v2", Resource: "ciliumnetworkpolicies"}: "CiliumNetworkPolicyList", - } - client := dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme, gvrToListKind) - // Add a reactor to handle server-side apply (patch) operations - client.PrependReactor("patch", "ciliumnetworkpolicies", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - patchAction := action.(k8stesting.PatchAction) - obj := &unstructured.Unstructured{} - obj.SetAPIVersion("cilium.io/v2") - obj.SetKind("CiliumNetworkPolicy") - obj.SetName(patchAction.GetName()) - obj.SetNamespace(patchAction.GetNamespace()) - return true, obj, nil - }) - return client -} - -// NewTestReconciler creates a Reconciler with the provided fake client and mock -// control plane client. If controlPlane is nil, a new MockClusterClient is used. -func NewTestReconciler(client *fake.Clientset, controlPlane *MockClusterClient) *Reconciler { - if controlPlane == nil { - controlPlane = &MockClusterClient{} - } - return New(Config{ - ClientSet: client, - DynamicClient: NewFakeDynamicClient(), - Logger: logging.NewNoop(), - Cluster: controlPlane, - Region: "test-region", - }) -} - -// ----------------------------------------------------------------------------- -// Reactor Utilities - Add behaviors to capture or intercept K8s operations -// ----------------------------------------------------------------------------- - -// ReplicaSetCapture holds a captured ReplicaSet from a patch operation. -type ReplicaSetCapture struct { - Applied *appsv1.ReplicaSet -} - -// AddReplicaSetPatchReactor adds a reactor that captures server-side apply patches -// for ReplicaSets. Returns a capture struct to access the applied resource. -func AddReplicaSetPatchReactor(client *fake.Clientset) *ReplicaSetCapture { - capture := &ReplicaSetCapture{} - client.PrependReactor("patch", "replicasets", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - patchAction := action.(k8stesting.PatchAction) - if patchAction.GetPatchType() != types.ApplyPatchType { - return false, nil, nil - } - - var rs appsv1.ReplicaSet - if err := json.Unmarshal(patchAction.GetPatch(), &rs); err != nil { - return true, nil, err - } - - capture.Applied = &rs - rs.Namespace = patchAction.GetNamespace() - return true, &rs, nil - }) - return capture -} - -// DeploymentCapture holds a captured Deployment from a patch operation. -type DeploymentCapture struct { - Applied *appsv1.Deployment -} - -// AddDeploymentPatchReactor adds a reactor that captures server-side apply patches -// for Deployments. Returns a capture struct to access the applied resource. -func AddDeploymentPatchReactor(client *fake.Clientset) *DeploymentCapture { - capture := &DeploymentCapture{} - client.PrependReactor("patch", "deployments", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - patchAction := action.(k8stesting.PatchAction) - if patchAction.GetPatchType() != types.ApplyPatchType { - return false, nil, nil - } - - var dep appsv1.Deployment - if err := json.Unmarshal(patchAction.GetPatch(), &dep); err != nil { - return true, nil, err - } - - capture.Applied = &dep - dep.Namespace = patchAction.GetNamespace() - dep.UID = "test-uid-12345" - return true, &dep, nil - }) - return capture -} - -// ServiceCapture holds a captured Service from a patch operation. -type ServiceCapture struct { - Applied *corev1.Service -} - -// AddServicePatchReactor adds a reactor that captures server-side apply patches -// for Services. Returns a capture struct to access the applied resource. -func AddServicePatchReactor(client *fake.Clientset) *ServiceCapture { - capture := &ServiceCapture{} - client.PrependReactor("patch", "services", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - patchAction := action.(k8stesting.PatchAction) - if patchAction.GetPatchType() != types.ApplyPatchType { - return false, nil, nil - } - - var svc corev1.Service - if err := json.Unmarshal(patchAction.GetPatch(), &svc); err != nil { - return true, nil, err - } - - capture.Applied = &svc - svc.Namespace = patchAction.GetNamespace() - return true, &svc, nil - }) - return capture -} - -// DeleteCapture tracks delete operations in order. -type DeleteCapture struct { - Actions []string -} - -// AddDeleteTracker adds a reactor that tracks all delete operations. -// Returns a capture struct with the ordered list of deleted resource types. -func AddDeleteTracker(client *fake.Clientset) *DeleteCapture { - capture := &DeleteCapture{Actions: []string{}} - client.PrependReactor("delete", "*", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - deleteAction := action.(k8stesting.DeleteAction) - capture.Actions = append(capture.Actions, deleteAction.GetResource().Resource) - return false, nil, nil - }) - return capture -} - -// NamespaceCreateCapture tracks namespace creation. -type NamespaceCreateCapture struct { - Created bool -} - -// AddNamespaceCreateTracker adds a reactor that tracks namespace creation. -func AddNamespaceCreateTracker(client *fake.Clientset) *NamespaceCreateCapture { - capture := &NamespaceCreateCapture{} - client.PrependReactor("create", "namespaces", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - capture.Created = true - createAction := action.(k8stesting.CreateAction) - return false, createAction.GetObject(), nil - }) - return capture -} - -// AddErrorReactor adds a reactor that returns an error for the specified verb and resource. -func AddErrorReactor(client *fake.Clientset, verb, resource string, err error) { - client.PrependReactor(verb, resource, func(action k8stesting.Action) (handled bool, ret runtime.Object, retErr error) { - return true, nil, err - }) -} - -// AddPatchErrorReactor adds a reactor that returns an error only for SSA patch operations -// on the specified resource. -func AddPatchErrorReactor(client *fake.Clientset, resource string, err error) { - client.PrependReactor("patch", resource, func(action k8stesting.Action) (handled bool, ret runtime.Object, retErr error) { - patchAction := action.(k8stesting.PatchAction) - if patchAction.GetPatchType() != types.ApplyPatchType { - return false, nil, nil - } - return true, nil, err - }) -} - -// ----------------------------------------------------------------------------- -// TestHarness - Convenience struct for tests that need everything -// ----------------------------------------------------------------------------- - -// TestHarness provides a pre-configured Reconciler with fake Kubernetes and control -// plane clients for unit testing. Use the composable utilities above for more -// fine-grained control over test setup. -type TestHarness struct { - Reconciler *Reconciler - Client *fake.Clientset - ControlPlane *MockClusterClient - - ReplicaSets *ReplicaSetCapture - Deployments *DeploymentCapture - Services *ServiceCapture - Deletes *DeleteCapture -} - -// NewTestHarness creates a TestHarness with all capture reactors pre-configured. -// For simpler tests or more control, use the composable utilities directly. -func NewTestHarness(t *testing.T, objects ...runtime.Object) *TestHarness { - t.Helper() - - client := NewFakeClient(t, objects...) - controlPlane := &MockClusterClient{} - - h := &TestHarness{ - Client: client, - ControlPlane: controlPlane, - ReplicaSets: AddReplicaSetPatchReactor(client), - Deployments: AddDeploymentPatchReactor(client), - Services: AddServicePatchReactor(client), - Deletes: AddDeleteTracker(client), - } - - h.Reconciler = NewTestReconciler(client, controlPlane) - - return h -} diff --git a/svc/krane/internal/reconciler/tolerations.go b/svc/krane/internal/reconciler/tolerations.go deleted file mode 100644 index e1cdb65842..0000000000 --- a/svc/krane/internal/reconciler/tolerations.go +++ /dev/null @@ -1,17 +0,0 @@ -package reconciler - -import corev1 "k8s.io/api/core/v1" - -var untrustedToleration = corev1.Toleration{ - Key: "node-class", - Operator: corev1.TolerationOpEqual, - Value: CustomerNodeClass, - Effect: corev1.TaintEffectNoSchedule, -} - -var sentinelToleration = corev1.Toleration{ - Key: "node-class", - Operator: corev1.TolerationOpEqual, - Value: SentinelNodeClass, - Effect: corev1.TaintEffectNoSchedule, -} diff --git a/svc/krane/internal/reconciler/update_state.go b/svc/krane/internal/reconciler/update_state.go deleted file mode 100644 index 168f908366..0000000000 --- a/svc/krane/internal/reconciler/update_state.go +++ /dev/null @@ -1,109 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "strings" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// updateDeploymentState pushes deployment state to the control plane through the circuit -// breaker. The circuit breaker prevents cascading failures during control plane outages -// by failing fast after repeated errors rather than blocking all reconciliation. -func (r *Reconciler) updateDeploymentState(ctx context.Context, state *ctrlv1.UpdateDeploymentStateRequest) error { - _, err := r.cb.Do(ctx, func(innerCtx context.Context) (any, error) { - return r.cluster.UpdateDeploymentState(innerCtx, connect.NewRequest(state)) - }) - if err != nil { - return fmt.Errorf("failed to update deployment state: %w", err) - } - return nil -} - -// updateSentinelState pushes sentinel state to the control plane through the circuit breaker. -func (r *Reconciler) updateSentinelState(ctx context.Context, state *ctrlv1.UpdateSentinelStateRequest) error { - _, err := r.cb.Do(ctx, func(innerCtx context.Context) (any, error) { - return r.cluster.UpdateSentinelState(innerCtx, connect.NewRequest(state)) - }) - if err != nil { - return fmt.Errorf("failed to update sentinel state: %w", err) - } - return nil -} - -// getDeploymentState queries the pods belonging to a ReplicaSet and builds a state -// update request containing each pod's address, resource allocation, and phase. -// Pods without an IP address are skipped since they can't receive traffic yet. -// The address is formatted as a cluster-local DNS name for in-cluster routing. -func (r *Reconciler) getDeploymentState(ctx context.Context, replicaset *appsv1.ReplicaSet) (*ctrlv1.UpdateDeploymentStateRequest, error) { - selector, err := metav1.LabelSelectorAsSelector(replicaset.Spec.Selector) - if err != nil { - return nil, err - } - - pods, err := r.clientSet.CoreV1().Pods(replicaset.Namespace).List(ctx, metav1.ListOptions{ - LabelSelector: selector.String(), - }) - if err != nil { - return nil, fmt.Errorf("failed to list pods: %w", err) - } - - update := &ctrlv1.UpdateDeploymentStateRequest_Update{ - K8SName: replicaset.Name, - Instances: make([]*ctrlv1.UpdateDeploymentStateRequest_Update_Instance, 0, len(pods.Items)), - } - - for _, pod := range pods.Items { - if pod.Status.PodIP == "" { - continue - } - - instance := &ctrlv1.UpdateDeploymentStateRequest_Update_Instance{ - K8SName: pod.GetName(), - Address: fmt.Sprintf("%s.%s.pod.cluster.local:%d", strings.ReplaceAll(pod.Status.PodIP, ".", "-"), pod.Namespace, DeploymentPort), - CpuMillicores: 0, - MemoryMib: 0, - Status: ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_UNSPECIFIED, - } - if pod.Spec.Resources != nil { - instance.CpuMillicores = pod.Spec.Resources.Limits.Cpu().MilliValue() - instance.MemoryMib = pod.Spec.Resources.Limits.Memory().Value() / (1024 * 1024) - } - - switch pod.Status.Phase { - case corev1.PodPending: - instance.Status = ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_PENDING - case corev1.PodRunning: - // Check if all containers are ready to determine running vs failed - allReady := true - for _, cond := range pod.Status.Conditions { - if cond.Type == corev1.ContainersReady && cond.Status != corev1.ConditionTrue { - allReady = false - break - } - } - if allReady { - instance.Status = ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_RUNNING - } else { - instance.Status = ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_FAILED - } - case corev1.PodFailed: - instance.Status = ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_FAILED - case corev1.PodSucceeded, corev1.PodUnknown: - instance.Status = ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_UNSPECIFIED - } - - update.Instances = append(update.Instances, instance) - } - - return &ctrlv1.UpdateDeploymentStateRequest{ - Change: &ctrlv1.UpdateDeploymentStateRequest_Update_{ - Update: update, - }, - }, nil -} diff --git a/svc/krane/internal/reconciler/update_state_test.go b/svc/krane/internal/reconciler/update_state_test.go deleted file mode 100644 index 2c258fb26c..0000000000 --- a/svc/krane/internal/reconciler/update_state_test.go +++ /dev/null @@ -1,339 +0,0 @@ -package reconciler - -import ( - "context" - "fmt" - "testing" - - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/circuitbreaker" - "github.com/unkeyed/unkey/pkg/otel/logging" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" -) - -func TestGetDeploymentState_PodStatusMapping(t *testing.T) { - tests := []struct { - name string - phase corev1.PodPhase - expectedStatus ctrlv1.UpdateDeploymentStateRequest_Update_Instance_Status - }{ - { - name: "pending pod", - phase: corev1.PodPending, - expectedStatus: ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_PENDING, - }, - { - name: "running pod", - phase: corev1.PodRunning, - expectedStatus: ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_RUNNING, - }, - { - name: "failed pod", - phase: corev1.PodFailed, - expectedStatus: ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_FAILED, - }, - { - name: "succeeded pod", - phase: corev1.PodSucceeded, - expectedStatus: ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_UNSPECIFIED, - }, - { - name: "unknown pod", - phase: corev1.PodUnknown, - expectedStatus: ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_UNSPECIFIED, - }, - { - name: "default case", - phase: corev1.PodPhase("SomeUnknownPhase"), - expectedStatus: ctrlv1.UpdateDeploymentStateRequest_Update_Instance_STATUS_UNSPECIFIED, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - ctx := context.Background() - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-rs", - Namespace: "test-namespace", - }, - Spec: appsv1.ReplicaSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{"app": "test"}, - }, - }, - } - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{"app": "test"}, - }, - Status: corev1.PodStatus{ - Phase: tt.phase, - PodIP: "10.0.0.1", - }, - } - - client := fake.NewSimpleClientset(rs, pod) - r := &Reconciler{ - clientSet: client, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - state, err := r.getDeploymentState(ctx, rs) - require.NoError(t, err) - - update := state.GetUpdate() - require.NotNil(t, update) - require.Len(t, update.GetInstances(), 1) - require.Equal(t, tt.expectedStatus, update.GetInstances()[0].GetStatus()) - }) - } -} - -func TestGetDeploymentState_AddressFormatting(t *testing.T) { - ctx := context.Background() - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-rs", - Namespace: "my-namespace", - }, - Spec: appsv1.ReplicaSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{"app": "test"}, - }, - }, - } - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "my-namespace", - Labels: map[string]string{"app": "test"}, - }, - Status: corev1.PodStatus{ - Phase: corev1.PodRunning, - PodIP: "10.0.0.1", - }, - } - - client := fake.NewSimpleClientset(rs, pod) - r := &Reconciler{ - clientSet: client, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - state, err := r.getDeploymentState(ctx, rs) - require.NoError(t, err) - - update := state.GetUpdate() - require.NotNil(t, update) - require.Len(t, update.GetInstances(), 1) - require.Equal(t, fmt.Sprintf("10-0-0-1.my-namespace.pod.cluster.local:%d", DeploymentPort), update.GetInstances()[0].GetAddress()) -} - -func TestGetDeploymentState_ResourceExtraction(t *testing.T) { - ctx := context.Background() - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-rs", - Namespace: "test-namespace", - }, - Spec: appsv1.ReplicaSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{"app": "test"}, - }, - }, - } - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "test-namespace", - Labels: map[string]string{"app": "test"}, - }, - Spec: corev1.PodSpec{ - Resources: &corev1.ResourceRequirements{ - // nolint:exhaustive - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("256Mi"), - }, - }, - }, - Status: corev1.PodStatus{ - Phase: corev1.PodRunning, - PodIP: "10.0.0.1", - }, - } - - client := fake.NewSimpleClientset(rs, pod) - r := &Reconciler{ - clientSet: client, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - state, err := r.getDeploymentState(ctx, rs) - require.NoError(t, err) - - update := state.GetUpdate() - require.NotNil(t, update) - require.Len(t, update.GetInstances(), 1) - require.Equal(t, int64(500), update.GetInstances()[0].GetCpuMillicores()) - require.Equal(t, int64(256), update.GetInstances()[0].GetMemoryMib()) -} - -func TestGetDeploymentState_MultiplePods(t *testing.T) { - ctx := context.Background() - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-rs", - Namespace: "test-namespace", - }, - Spec: appsv1.ReplicaSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{"app": "test"}, - }, - }, - } - - pod1 := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod-1", - Namespace: "test-namespace", - Labels: map[string]string{"app": "test"}, - }, - Status: corev1.PodStatus{ - Phase: corev1.PodRunning, - PodIP: "10.0.0.1", - }, - } - - pod2 := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod-2", - Namespace: "test-namespace", - Labels: map[string]string{"app": "test"}, - }, - Status: corev1.PodStatus{ - Phase: corev1.PodPending, - PodIP: "10.0.0.2", - }, - } - - pod3 := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod-3", - Namespace: "test-namespace", - Labels: map[string]string{"app": "test"}, - }, - Status: corev1.PodStatus{ - Phase: corev1.PodFailed, - PodIP: "10.0.0.3", - }, - } - - client := fake.NewSimpleClientset(rs, pod1, pod2, pod3) - r := &Reconciler{ - clientSet: client, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - state, err := r.getDeploymentState(ctx, rs) - require.NoError(t, err) - - update := state.GetUpdate() - require.NotNil(t, update) - require.Len(t, update.GetInstances(), 3) - - podNames := make(map[string]bool) - for _, instance := range update.GetInstances() { - podNames[instance.GetK8SName()] = true - } - require.True(t, podNames["test-pod-1"]) - require.True(t, podNames["test-pod-2"]) - require.True(t, podNames["test-pod-3"]) - - require.Equal(t, "test-rs", update.GetK8SName()) -} - -func TestGetDeploymentState_EmptyPodIP(t *testing.T) { - ctx := context.Background() - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-rs", - Namespace: "test-namespace", - }, - Spec: appsv1.ReplicaSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{"app": "test"}, - }, - }, - } - - podWithIP := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod-with-ip", - Namespace: "test-namespace", - Labels: map[string]string{"app": "test"}, - }, - Status: corev1.PodStatus{ - Phase: corev1.PodRunning, - PodIP: "10.0.0.1", - }, - } - - podWithoutIP := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod-without-ip", - Namespace: "test-namespace", - Labels: map[string]string{"app": "test"}, - }, - Status: corev1.PodStatus{ - Phase: corev1.PodPending, - PodIP: "", - }, - } - - client := fake.NewSimpleClientset(rs, podWithIP, podWithoutIP) - r := &Reconciler{ - clientSet: client, - cluster: &MockClusterClient{}, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - state, err := r.getDeploymentState(ctx, rs) - require.NoError(t, err) - - update := state.GetUpdate() - require.NotNil(t, update) - require.Len(t, update.GetInstances(), 1, "pods with empty PodIP should be skipped") - require.Equal(t, "pod-with-ip", update.GetInstances()[0].GetK8SName()) - require.Equal(t, fmt.Sprintf("10-0-0-1.test-namespace.pod.cluster.local:%d", DeploymentPort), update.GetInstances()[0].GetAddress()) -} diff --git a/svc/krane/internal/reconciler/version_tracking_test.go b/svc/krane/internal/reconciler/version_tracking_test.go deleted file mode 100644 index 88e23c2e2f..0000000000 --- a/svc/krane/internal/reconciler/version_tracking_test.go +++ /dev/null @@ -1,116 +0,0 @@ -package reconciler - -import ( - "context" - "testing" - - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/ptr" -) - -// Tests for version tracking behavior. -// -// The reconciler uses a two-phase commit for version tracking: -// 1. HandleState processes state and returns the version (but doesn't commit it) -// 2. versionLastSeen is updated only after clean stream close -// -// This ensures atomic bootstrap: if a stream breaks mid-bootstrap, the client -// retries from version 0 rather than skipping resources that were never received. - -func TestHandleState_ReturnsVersion(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - require.Equal(t, uint64(0), r.versionLastSeen, "initial version should be 0") - - state := &ctrlv1.State{ - Version: 42, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - DeploymentId: "dep_123", - K8SNamespace: "test-namespace", - K8SName: "test-deployment", - Image: "nginx:1.19", - Replicas: 1, - CpuMillicores: 100, - MemoryMib: 128, - BuildId: ptr.P("build_123"), - }, - }, - }, - }, - } - - ver, err := r.HandleState(ctx, state) - require.NoError(t, err) - require.Equal(t, uint64(42), ver, "should return state version") - require.Equal(t, uint64(0), r.versionLastSeen, "versionLastSeen should not change until stream closes cleanly") -} - -func TestHandleState_DoesNotCommitVersion(t *testing.T) { - ctx := context.Background() - h := NewTestHarness(t) - r := h.Reconciler - - states := []*ctrlv1.State{ - { - Version: 100, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-namespace", - K8SName: "test-deployment", - }, - }, - }, - }, - }, - { - Version: 200, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - WorkspaceId: "ws_123", - ProjectId: "prj_123", - EnvironmentId: "env_123", - SentinelId: "sentinel_123", - K8SName: "test-sentinel", - Image: "sentinel:1.0", - Replicas: 2, - CpuMillicores: 100, - MemoryMib: 128, - }, - }, - }, - }, - }, - { - Version: 300, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Delete{ - Delete: &ctrlv1.DeleteSentinel{ - K8SName: "test-sentinel", - }, - }, - }, - }, - }, - } - - for _, state := range states { - _, err := r.HandleState(ctx, state) - require.NoError(t, err) - } - - require.Equal(t, uint64(0), r.versionLastSeen, "versionLastSeen should remain 0 until stream closes cleanly") -} diff --git a/svc/krane/internal/reconciler/watch_current_deployments.go b/svc/krane/internal/reconciler/watch_current_deployments.go deleted file mode 100644 index 63155f8216..0000000000 --- a/svc/krane/internal/reconciler/watch_current_deployments.go +++ /dev/null @@ -1,77 +0,0 @@ -package reconciler - -import ( - "context" - - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/svc/krane/pkg/labels" - appsv1 "k8s.io/api/apps/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/watch" -) - -// watchCurrentDeployments starts a Kubernetes watch for deployment ReplicaSets and -// reports state changes back to the control plane in real-time. -// -// The watch filters for resources with the "managed-by: krane" and "component: deployment" -// labels, ignoring resources created by other controllers. When a ReplicaSet is added, -// modified, or deleted, the method queries pod status and pushes an update to the -// control plane so routing tables stay synchronized with actual cluster state. -// -// This complements [Reconciler.refreshCurrentDeployments] which handles consistency -// for events that might be missed during network partitions or restarts. -func (r *Reconciler) watchCurrentDeployments(ctx context.Context) error { - - w, err := r.clientSet.AppsV1().ReplicaSets("").Watch(ctx, metav1.ListOptions{ - LabelSelector: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - ToString(), - }) - if err != nil { - return err - } - go func() { - for event := range w.ResultChan() { - switch event.Type { - case watch.Error: - r.logger.Error("error watching deployment", "event", event.Object) - case watch.Bookmark: - case watch.Added, watch.Modified: - replicaset, ok := event.Object.(*appsv1.ReplicaSet) - if !ok { - r.logger.Error("unable to cast object to replicaset") - continue - } - state, err := r.getDeploymentState(ctx, replicaset) - if err != nil { - r.logger.Error("unable to get state", "error", err.Error()) - continue - } - err = r.updateDeploymentState(ctx, state) - if err != nil { - r.logger.Error("unable to update state", "error", err.Error()) - continue - } - case watch.Deleted: - replicaset, ok := event.Object.(*appsv1.ReplicaSet) - if !ok { - r.logger.Error("unable to cast object to replicaset") - continue - } - err := r.updateDeploymentState(ctx, &ctrlv1.UpdateDeploymentStateRequest{ - Change: &ctrlv1.UpdateDeploymentStateRequest_Delete_{ - Delete: &ctrlv1.UpdateDeploymentStateRequest_Delete{ - K8SName: replicaset.Name, - }, - }, - }) - if err != nil { - r.logger.Error("unable to update state", "error", err.Error()) - continue - } - } - } - }() - return nil -} diff --git a/svc/krane/internal/reconciler/watch_current_deployments_test.go b/svc/krane/internal/reconciler/watch_current_deployments_test.go deleted file mode 100644 index bc08b69ee8..0000000000 --- a/svc/krane/internal/reconciler/watch_current_deployments_test.go +++ /dev/null @@ -1,299 +0,0 @@ -package reconciler - -import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/pkg/circuitbreaker" - "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/svc/krane/pkg/labels" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/watch" - "k8s.io/client-go/kubernetes/fake" - k8stesting "k8s.io/client-go/testing" -) - -func TestWatchCurrentDeployments_SetupSucceeds(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - mockCluster := &MockClusterClient{} - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentDeployments(ctx) - require.NoError(t, err) -} - -func TestWatchCurrentDeployments_AddEventTriggersStateUpdate(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("replicasets", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentDeployments(ctx) - require.NoError(t, err) - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-replicaset", - Namespace: "default", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_123"), - }, - Spec: appsv1.ReplicaSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app": "test", - }, - }, - }, - } - - fakeWatch.Add(rs) - - require.Eventually(t, func() bool { - return len(mockCluster.UpdateDeploymentStateCalls) >= 1 - }, 2*time.Second, 10*time.Millisecond) - - require.NotEmpty(t, mockCluster.UpdateDeploymentStateCalls) -} - -func TestWatchCurrentDeployments_ModifyEventTriggersStateUpdate(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("replicasets", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentDeployments(ctx) - require.NoError(t, err) - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-replicaset", - Namespace: "default", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_123"), - }, - Spec: appsv1.ReplicaSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app": "test", - }, - }, - }, - } - - fakeWatch.Modify(rs) - - require.Eventually(t, func() bool { - return len(mockCluster.UpdateDeploymentStateCalls) >= 1 - }, 2*time.Second, 10*time.Millisecond) -} - -func TestWatchCurrentDeployments_DeleteEventTriggersStateUpdate(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("replicasets", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentDeployments(ctx) - require.NoError(t, err) - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-replicaset", - Namespace: "default", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_123"), - }, - Spec: appsv1.ReplicaSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app": "test", - }, - }, - }, - } - - fakeWatch.Delete(rs) - - require.Eventually(t, func() bool { - return len(mockCluster.UpdateDeploymentStateCalls) >= 1 - }, 2*time.Second, 10*time.Millisecond) - - call := mockCluster.UpdateDeploymentStateCalls[0] - deleteReq := call.GetDelete() - require.NotNil(t, deleteReq) - require.Equal(t, "test-replicaset", deleteReq.GetK8SName()) -} - -func TestWatchCurrentDeployments_ChannelClosure(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("replicasets", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentDeployments(ctx) - require.NoError(t, err) - - fakeWatch.Stop() - - time.Sleep(100 * time.Millisecond) -} - -func TestWatchCurrentDeployments_ContextCancellation(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("replicasets", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentDeployments(ctx) - require.NoError(t, err) - - cancel() - - time.Sleep(100 * time.Millisecond) -} - -func TestWatchCurrentDeployments_WithPods(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "default", - Labels: map[string]string{ - "app": "test", - }, - }, - Status: corev1.PodStatus{ - Phase: corev1.PodRunning, - PodIP: "10.0.0.1", - }, - } - - client := fake.NewSimpleClientset(pod) - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("replicasets", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentDeployments(ctx) - require.NoError(t, err) - - rs := &appsv1.ReplicaSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-replicaset", - Namespace: "default", - Labels: labels.New(). - ManagedByKrane(). - ComponentDeployment(). - DeploymentID("dep_123"), - }, - Spec: appsv1.ReplicaSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app": "test", - }, - }, - }, - } - - fakeWatch.Add(rs) - - require.Eventually(t, func() bool { - return len(mockCluster.UpdateDeploymentStateCalls) >= 1 - }, 2*time.Second, 10*time.Millisecond) - - call := mockCluster.UpdateDeploymentStateCalls[0] - update := call.GetUpdate() - require.NotNil(t, update) - require.Equal(t, "test-replicaset", update.GetK8SName()) - require.Len(t, update.GetInstances(), 1) -} diff --git a/svc/krane/internal/reconciler/watch_current_sentinels_test.go b/svc/krane/internal/reconciler/watch_current_sentinels_test.go deleted file mode 100644 index 6ce4c9300d..0000000000 --- a/svc/krane/internal/reconciler/watch_current_sentinels_test.go +++ /dev/null @@ -1,289 +0,0 @@ -package reconciler - -import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/pkg/circuitbreaker" - "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/svc/krane/pkg/labels" - appsv1 "k8s.io/api/apps/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/watch" - "k8s.io/client-go/kubernetes/fake" - k8stesting "k8s.io/client-go/testing" -) - -func TestWatchCurrentSentinels_SetupSucceeds(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - mockCluster := &MockClusterClient{} - - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentSentinels(ctx) - require.NoError(t, err) -} - -func TestWatchCurrentSentinels_AddEventTriggersStateUpdate(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("deployments", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentSentinels(ctx) - require.NoError(t, err) - - deployment := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: "default", - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_123"), - }, - Status: appsv1.DeploymentStatus{ - AvailableReplicas: 3, - }, - } - - fakeWatch.Add(deployment) - - require.Eventually(t, func() bool { - return len(mockCluster.UpdateSentinelStateCalls) >= 1 - }, 2*time.Second, 10*time.Millisecond) - - call := mockCluster.UpdateSentinelStateCalls[0] - require.Equal(t, "test-sentinel", call.GetK8SName()) - require.Equal(t, int32(3), call.GetAvailableReplicas()) -} - -func TestWatchCurrentSentinels_ModifyEventTriggersStateUpdate(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("deployments", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentSentinels(ctx) - require.NoError(t, err) - - deployment := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: "default", - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_123"), - }, - Status: appsv1.DeploymentStatus{ - AvailableReplicas: 5, - }, - } - - fakeWatch.Modify(deployment) - - require.Eventually(t, func() bool { - return len(mockCluster.UpdateSentinelStateCalls) >= 1 - }, 2*time.Second, 10*time.Millisecond) - - call := mockCluster.UpdateSentinelStateCalls[0] - require.Equal(t, "test-sentinel", call.GetK8SName()) - require.Equal(t, int32(5), call.GetAvailableReplicas()) -} - -func TestWatchCurrentSentinels_DeleteEventTriggersStateUpdate(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("deployments", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentSentinels(ctx) - require.NoError(t, err) - - deployment := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-sentinel", - Namespace: "default", - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_123"), - }, - Status: appsv1.DeploymentStatus{ - AvailableReplicas: 3, - }, - } - - fakeWatch.Delete(deployment) - - require.Eventually(t, func() bool { - return len(mockCluster.UpdateSentinelStateCalls) >= 1 - }, 2*time.Second, 10*time.Millisecond) - - call := mockCluster.UpdateSentinelStateCalls[0] - require.Equal(t, "test-sentinel", call.GetK8SName()) - require.Equal(t, int32(0), call.GetAvailableReplicas()) -} - -func TestWatchCurrentSentinels_ChannelClosure(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("deployments", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentSentinels(ctx) - require.NoError(t, err) - - fakeWatch.Stop() - - time.Sleep(100 * time.Millisecond) -} - -func TestWatchCurrentSentinels_ContextCancellation(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("deployments", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentSentinels(ctx) - require.NoError(t, err) - - cancel() - - time.Sleep(100 * time.Millisecond) -} - -func TestWatchCurrentSentinels_MultipleEvents(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - client := fake.NewSimpleClientset() - fakeWatch := watch.NewFake() - - client.PrependWatchReactor("deployments", k8stesting.DefaultWatchReactor(fakeWatch, nil)) - - mockCluster := &MockClusterClient{} - r := &Reconciler{ - clientSet: client, - cluster: mockCluster, - cb: circuitbreaker.New[any]("test"), - logger: logging.NewNoop(), - region: "test-region", - } - - err := r.watchCurrentSentinels(ctx) - require.NoError(t, err) - - deployment1 := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "sentinel-1", - Namespace: "default", - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_1"), - }, - Status: appsv1.DeploymentStatus{ - AvailableReplicas: 1, - }, - } - - deployment2 := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "sentinel-2", - Namespace: "default", - Labels: labels.New(). - ManagedByKrane(). - ComponentSentinel(). - SentinelID("sent_2"), - }, - Status: appsv1.DeploymentStatus{ - AvailableReplicas: 2, - }, - } - - fakeWatch.Add(deployment1) - fakeWatch.Add(deployment2) - - require.Eventually(t, func() bool { - return len(mockCluster.UpdateSentinelStateCalls) >= 2 - }, 2*time.Second, 10*time.Millisecond) - - names := make(map[string]bool) - for _, call := range mockCluster.UpdateSentinelStateCalls { - names[call.GetK8SName()] = true - } - require.True(t, names["sentinel-1"]) - require.True(t, names["sentinel-2"]) -} diff --git a/svc/krane/internal/reconciler/watcher.go b/svc/krane/internal/reconciler/watcher.go deleted file mode 100644 index f6ba10d558..0000000000 --- a/svc/krane/internal/reconciler/watcher.go +++ /dev/null @@ -1,55 +0,0 @@ -package reconciler - -import ( - "context" - "math/rand/v2" - "time" - - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" -) - -func (r *Reconciler) Watch(ctx context.Context) { - intervalMin := time.Second - intervalMax := 5 * time.Second - - for { - interval := intervalMin + time.Millisecond*time.Duration(rand.Float64()*float64(intervalMax.Milliseconds()-intervalMin.Milliseconds())) - time.Sleep(interval) - - err := r.watch(ctx) - if err != nil { - r.logger.Error("error while watching for state changes", "error", err) - } - } -} - -func (r *Reconciler) watch(ctx context.Context) error { - r.logger.Info("starting watch") - - stream, err := r.cluster.Sync(ctx, connect.NewRequest(&ctrlv1.SyncRequest{ - Region: r.region, - VersionLastSeen: r.versionLastSeen, - })) - if err != nil { - return err - } - - for stream.Receive() { - r.logger.Info("received message") - version, err := r.HandleState(ctx, stream.Msg()) - if err != nil { - return err - } - if version > r.versionLastSeen { - r.versionLastSeen = version - } - } - - if err := stream.Close(); err != nil { - r.logger.Error("unable to close stream", "error", err) - return err - } - - return nil -} diff --git a/svc/krane/internal/reconciler/watcher_test.go b/svc/krane/internal/reconciler/watcher_test.go deleted file mode 100644 index 6009c3adfc..0000000000 --- a/svc/krane/internal/reconciler/watcher_test.go +++ /dev/null @@ -1,590 +0,0 @@ -// Package reconciler provides the krane reconciler that synchronizes control -// plane state with Kubernetes resources. -// -// These tests verify the watch/sync behavior of the reconciler, specifically: -// - How it forms Sync requests to the control plane -// - How it processes State messages via HandleState -// - How it tracks version numbers for reconnection -// -// # Test Approach -// -// Due to connect.ServerStreamForClient being a struct (not an interface), we -// cannot mock the actual stream returned by Sync(). Instead, we test: -// - Request formation (capture the SyncRequest sent to the mock) -// - HandleState processing (call HandleState directly with test messages) -// - Error handling (return errors from the mock Sync function) -// -// # Key Invariants -// -// - versionLastSeen is only updated after clean stream close (atomic bootstrap) -// - HandleState returns the version but does not commit it -// - Apply messages create/update Kubernetes resources -// - Delete messages remove Kubernetes resources -package reconciler - -import ( - "context" - "errors" - "net/http" - "sync" - "testing" - - "connectrpc.com/connect" - "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/pkg/ptr" - "k8s.io/client-go/kubernetes/fake" -) - -// mockServerStream implements connect.ServerStreamForClient for testing. -type mockServerStream struct { - messages []*ctrlv1.State - index int - err error - closed bool - mu sync.Mutex -} - -func newMockServerStream(messages []*ctrlv1.State) *mockServerStream { - return &mockServerStream{ - messages: messages, - index: 0, - } -} - -func (m *mockServerStream) Receive() bool { - m.mu.Lock() - defer m.mu.Unlock() - - if m.closed || m.index >= len(m.messages) { - return false - } - m.index++ - return true -} - -func (m *mockServerStream) Msg() *ctrlv1.State { - m.mu.Lock() - defer m.mu.Unlock() - - if m.index == 0 || m.index > len(m.messages) { - return nil - } - return m.messages[m.index-1] -} - -func (m *mockServerStream) Err() error { - return m.err -} - -func (m *mockServerStream) Close() error { - m.mu.Lock() - defer m.mu.Unlock() - m.closed = true - return nil -} - -func (m *mockServerStream) ResponseHeader() http.Header { - return make(http.Header) -} - -func (m *mockServerStream) ResponseTrailer() http.Header { - return make(http.Header) -} - -// ============================================================================= -// Sync Request Formation Tests -// ============================================================================= -// -// These tests verify that the reconciler sends correctly formed Sync requests -// to the control plane. The request must include the region and the last-seen -// sequence number. -// ============================================================================= - -// TestWatch_SendsCorrectSyncRequest verifies that watch() sends a Sync request -// with the correct region and version number. -// -// Scenario: Reconciler has previously processed messages up to version 500. -// It calls watch() which should send a Sync request with that version. -// -// Guarantees: -// - SyncRequest.Region matches the reconciler's configured region -// - SyncRequest.VersionLastSeen matches versionLastSeen from previous session -// -// This is critical for reconnection: the version tells the server where to -// resume streaming from. -func TestWatch_SendsCorrectSyncRequest(t *testing.T) { - client := fake.NewSimpleClientset() - AddReplicaSetPatchReactor(client) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - AddDeleteTracker(client) - - var capturedRequest *ctrlv1.SyncRequest - - mockCluster := &MockClusterClient{ - SyncFunc: func(_ context.Context, req *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { - capturedRequest = req.Msg - return nil, errors.New("end test") - }, - } - - r := New(Config{ - ClientSet: client, - DynamicClient: NewFakeDynamicClient(), - Logger: logging.NewNoop(), - Cluster: mockCluster, - Region: "us-west-2", - }) - - r.versionLastSeen = 500 - - ctx := context.Background() - _ = r.watch(ctx) - - require.NotNil(t, capturedRequest) - require.Equal(t, "us-west-2", capturedRequest.GetRegion()) - require.Equal(t, uint64(500), capturedRequest.GetVersionLastSeen()) -} - -// TestWatch_InitialSyncWithZeroVersion verifies that a fresh reconciler sends -// version=0 to trigger a full bootstrap from the server. -// -// Scenario: A newly created reconciler (never received any messages) calls watch(). -// -// Guarantees: -// - SyncRequest.VersionLastSeen is 0 -// - This triggers the server to perform full bootstrap -// -// version=0 is the "I have nothing" signal that tells the server to send -// all current state before entering the watch loop. -func TestWatch_InitialSyncWithZeroVersion(t *testing.T) { - client := fake.NewSimpleClientset() - AddReplicaSetPatchReactor(client) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - AddDeleteTracker(client) - - var capturedRequest *ctrlv1.SyncRequest - - mockCluster := &MockClusterClient{ - SyncFunc: func(_ context.Context, req *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { - capturedRequest = req.Msg - return nil, errors.New("end test") - }, - } - - r := New(Config{ - ClientSet: client, - DynamicClient: NewFakeDynamicClient(), - Logger: logging.NewNoop(), - Cluster: mockCluster, - Region: "eu-central-1", - }) - - ctx := context.Background() - _ = r.watch(ctx) - - require.NotNil(t, capturedRequest) - require.Equal(t, "eu-central-1", capturedRequest.GetRegion()) - require.Equal(t, uint64(0), capturedRequest.GetVersionLastSeen(), "initial sync should have version 0") -} - -// ============================================================================= -// HandleState Processing Tests -// ============================================================================= -// -// These tests verify that HandleState correctly processes different message -// types and updates both Kubernetes resources and the version tracker. -// ============================================================================= - -// TestWatch_ProcessesStreamMessages verifies that HandleState correctly -// processes deployment apply messages and returns versions. -// -// Scenario: A stream contains two deployment apply messages (ver=10, ver=20). -// -// Guarantees: -// - The deployment is applied to Kubernetes (ReplicaSet is created) -// - HandleState returns the version from each message -// - versionLastSeen is only updated after stream closes cleanly -// -// This tests the basic happy path: apply resources and track version. -func TestWatch_ProcessesStreamMessages(t *testing.T) { - client := fake.NewSimpleClientset() - rsCapture := AddReplicaSetPatchReactor(client) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - AddDeleteTracker(client) - - messages := []*ctrlv1.State{ - { - Version: 10, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_1", - ProjectId: "prj_1", - EnvironmentId: "env_1", - DeploymentId: "dep_1", - K8SNamespace: "test-ns", - K8SName: "dep-1", - Image: "nginx:1.19", - Replicas: 1, - CpuMillicores: 100, - MemoryMib: 128, - BuildId: ptr.P("build_1"), - }, - }, - }, - }, - }, - { - Version: 20, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_1", - ProjectId: "prj_1", - EnvironmentId: "env_1", - DeploymentId: "dep_2", - K8SNamespace: "test-ns", - K8SName: "dep-2", - Image: "nginx:1.20", - Replicas: 1, - CpuMillicores: 100, - MemoryMib: 128, - BuildId: ptr.P("build_2"), - }, - }, - }, - }, - }, - } - - stream := newMockServerStream(messages) - - mockCluster := &MockClusterClient{ - SyncFunc: func(_ context.Context, _ *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { - // Return our mock stream wrapped as the expected interface - return (*connect.ServerStreamForClient[ctrlv1.State])(nil), nil - }, - } - - r := New(Config{ - ClientSet: client, - DynamicClient: NewFakeDynamicClient(), - Logger: logging.NewNoop(), - Cluster: mockCluster, - Region: "test-region", - }) - - // Process messages directly to test HandleState integration - ctx := context.Background() - var maxVersion uint64 - for stream.Receive() { - seq, err := r.HandleState(ctx, stream.Msg()) - require.NoError(t, err) - if seq > maxVersion { - maxVersion = seq - } - } - - require.NotNil(t, rsCapture.Applied, "deployment should have been applied") - require.Equal(t, uint64(0), r.versionLastSeen, "sequence should not be updated until CommitSequence") - - // Simulate clean stream close - if maxVersion > r.versionLastSeen { - r.versionLastSeen = maxVersion - } - require.Equal(t, uint64(20), r.versionLastSeen, "sequence should be updated after CommitSequence") -} - -// TestWatch_IncrementalUpdates verifies that HandleState correctly processes -// a sequence of incremental updates including applies and deletes. -// -// Scenario: Starting from sequence 100 (simulating reconnect after bootstrap), -// the reconciler receives: apply deployment (101), delete deployment (102), -// delete sentinel (103). -// -// Guarantees: -// - HandleState returns the sequence from each message -// - versionLastSeen is updated to 103 after CommitSequence -// - Deployment delete triggers ReplicaSet deletion -// - Sentinel delete triggers Deployment deletion (sentinels run as k8s Deployments) -// -// This tests the watch loop after bootstrap: processing incremental changes -// as they happen in the control plane. -func TestWatch_IncrementalUpdates(t *testing.T) { - client := fake.NewSimpleClientset() - AddReplicaSetPatchReactor(client) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - deletes := AddDeleteTracker(client) - - messages := []*ctrlv1.State{ - { - Version: 101, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_1", - ProjectId: "prj_1", - EnvironmentId: "env_1", - DeploymentId: "dep_new", - K8SNamespace: "test-ns", - K8SName: "new-deployment", - Image: "myapp:v2", - Replicas: 3, - CpuMillicores: 500, - MemoryMib: 512, - }, - }, - }, - }, - }, - { - Version: 102, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "test-ns", - K8SName: "old-deployment", - }, - }, - }, - }, - }, - { - Version: 103, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Delete{ - Delete: &ctrlv1.DeleteSentinel{ - K8SName: "old-sentinel", - }, - }, - }, - }, - }, - } - - mockCluster := &MockClusterClient{} - - r := New(Config{ - ClientSet: client, - DynamicClient: NewFakeDynamicClient(), - Logger: logging.NewNoop(), - Cluster: mockCluster, - Region: "test-region", - }) - - // Start with sequence 100 (simulating reconnect after bootstrap) - r.versionLastSeen = 100 - - ctx := context.Background() - stream := newMockServerStream(messages) - var maxVersion uint64 - for stream.Receive() { - seq, err := r.HandleState(ctx, stream.Msg()) - require.NoError(t, err) - if seq > maxVersion { - maxVersion = seq - } - } - - require.Equal(t, uint64(100), r.versionLastSeen, "sequence should not change until CommitSequence") - if maxVersion > r.versionLastSeen { - r.versionLastSeen = maxVersion - } - require.Equal(t, uint64(103), r.versionLastSeen) - require.Contains(t, deletes.Actions, "replicasets", "deployment delete should be processed (deletes ReplicaSet)") - require.Contains(t, deletes.Actions, "deployments", "sentinel delete should be processed (deletes Deployment)") -} - -// ============================================================================= -// Configuration Tests -// ============================================================================= - -// TestWatch_RegionIsPersisted verifies that the region from Config is correctly -// stored in the reconciler. -// -// Guarantees: -// - New() correctly sets the region field from Config -// - The region is available for use in Sync requests -func TestWatch_RegionIsPersisted(t *testing.T) { - cfg := Config{ - ClientSet: fake.NewSimpleClientset(), - Logger: logging.NewNoop(), - Cluster: &MockClusterClient{}, - Region: "ap-southeast-1", - } - - r := New(cfg) - require.Equal(t, "ap-southeast-1", r.region) -} - -// ============================================================================= -// Error Handling Tests -// ============================================================================= - -// TestWatch_SyncConnectionError verifies that connection errors from Sync() -// are properly propagated back to the caller. -// -// Scenario: The control plane is unreachable (connection refused). -// -// Guarantees: -// - The error from Sync() is returned by watch() -// - The caller (Watch loop) can handle reconnection logic -// -// This tests the error path: what happens when the control plane is down. -// The Watch() outer loop will retry with exponential backoff. -func TestWatch_SyncConnectionError(t *testing.T) { - client := fake.NewSimpleClientset() - AddReplicaSetPatchReactor(client) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - AddDeleteTracker(client) - - expectedErr := errors.New("connection refused") - - mockCluster := &MockClusterClient{ - SyncFunc: func(_ context.Context, req *connect.Request[ctrlv1.SyncRequest]) (*connect.ServerStreamForClient[ctrlv1.State], error) { - return nil, expectedErr - }, - } - - r := New(Config{ - ClientSet: client, - DynamicClient: NewFakeDynamicClient(), - Logger: logging.NewNoop(), - Cluster: mockCluster, - Region: "error-test-region", - }) - - ctx := context.Background() - err := r.watch(ctx) - - require.Error(t, err) - require.Equal(t, expectedErr, err) -} - -// ============================================================================= -// End-to-End Message Flow Tests -// ============================================================================= - -// TestWatch_FullMessageProcessingFlow verifies the complete message processing -// flow including multiple resource types and operations. -// -// Scenario: A full sync stream containing: -// - Apply deployment (seq=10) -// - Apply sentinel (seq=20) -// - Delete deployment (seq=30) -// -// Guarantees: -// - Deployment is applied to Kubernetes (ReplicaSet created with correct name) -// - Sentinel is applied (as a k8s Deployment - captured separately) -// - Deployment delete is processed (ReplicaSet deleted) -// - versionLastSeen ends at 30 (the highest sequence) -// -// This is a comprehensive integration test of HandleState covering all major -// message types in a realistic sequence. -func TestWatch_FullMessageProcessingFlow(t *testing.T) { - client := fake.NewSimpleClientset() - rsCapture := AddReplicaSetPatchReactor(client) - AddDeploymentPatchReactor(client) - AddServicePatchReactor(client) - deletes := AddDeleteTracker(client) - - r := New(Config{ - ClientSet: client, - DynamicClient: NewFakeDynamicClient(), - Logger: logging.NewNoop(), - Cluster: &MockClusterClient{}, - Region: "full-flow-region", - }) - - ctx := context.Background() - - messages := []*ctrlv1.State{ - { - Version: 10, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Apply{ - Apply: &ctrlv1.ApplyDeployment{ - WorkspaceId: "ws_flow", - ProjectId: "prj_flow", - EnvironmentId: "env_flow", - DeploymentId: "dep_flow", - K8SNamespace: "flow-ns", - K8SName: "flow-deployment", - Image: "myapp:v1", - Replicas: 2, - CpuMillicores: 200, - MemoryMib: 256, - }, - }, - }, - }, - }, - { - Version: 20, - Kind: &ctrlv1.State_Sentinel{ - Sentinel: &ctrlv1.SentinelState{ - State: &ctrlv1.SentinelState_Apply{ - Apply: &ctrlv1.ApplySentinel{ - K8SName: "flow-sentinel", - WorkspaceId: "ws_flow", - EnvironmentId: "env_flow", - ProjectId: "prj_flow", - SentinelId: "sen_flow", - Image: "sentinel:v1", - Replicas: 1, - CpuMillicores: 100, - MemoryMib: 128, - }, - }, - }, - }, - }, - { - Version: 30, - Kind: &ctrlv1.State_Deployment{ - Deployment: &ctrlv1.DeploymentState{ - State: &ctrlv1.DeploymentState_Delete{ - Delete: &ctrlv1.DeleteDeployment{ - K8SNamespace: "flow-ns", - K8SName: "old-deployment", - }, - }, - }, - }, - }, - } - - var maxVersion uint64 - for _, msg := range messages { - seq, err := r.HandleState(ctx, msg) - require.NoError(t, err) - if seq > maxVersion { - maxVersion = seq - } - } - - require.NotNil(t, rsCapture.Applied, "deployment should have been applied") - require.Equal(t, "flow-deployment", rsCapture.Applied.Name) - - require.Contains(t, deletes.Actions, "replicasets", "deployment delete should have been processed") - - require.Equal(t, uint64(0), r.versionLastSeen, "sequence should not be updated until CommitSequence") - - if maxVersion > r.versionLastSeen { - r.versionLastSeen = maxVersion - } - require.Equal(t, uint64(30), r.versionLastSeen, "sequence should be updated after CommitSequence") -} diff --git a/svc/krane/internal/sentinel/BUILD.bazel b/svc/krane/internal/sentinel/BUILD.bazel new file mode 100644 index 0000000000..1a6ddfd1d5 --- /dev/null +++ b/svc/krane/internal/sentinel/BUILD.bazel @@ -0,0 +1,51 @@ +load("@rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "sentinel", + srcs = [ + "actual_state_report.go", + "apply.go", + "consts.go", + "controller.go", + "delete.go", + "desired_state_apply.go", + "doc.go", + "resync.go", + ], + importpath = "github.com/unkeyed/unkey/svc/krane/internal/sentinel", + visibility = ["//svc/krane:__subpackages__"], + deps = [ + "//gen/proto/ctrl/v1:ctrl", + "//gen/proto/ctrl/v1/ctrlv1connect", + "//pkg/assert", + "//pkg/circuitbreaker", + "//pkg/otel/logging", + "//pkg/ptr", + "//pkg/repeat", + "//svc/krane/pkg/labels", + "@com_connectrpc_connect//:connect", + "@io_k8s_api//apps/v1:apps", + "@io_k8s_api//core/v1:core", + "@io_k8s_apimachinery//pkg/api/errors", + "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", + "@io_k8s_apimachinery//pkg/types", + "@io_k8s_apimachinery//pkg/util/intstr", + "@io_k8s_apimachinery//pkg/watch", + "@io_k8s_client_go//kubernetes", + "@io_k8s_sigs_controller_runtime//pkg/client", + ], +) + +go_test( + name = "sentinel_test", + srcs = ["controller_test.go"], + embed = [":sentinel"], + deps = [ + "//pkg/otel/logging", + "//svc/krane/internal/testutil", + "@com_github_stretchr_testify//require", + "@io_k8s_api//core/v1:core", + "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", + "@io_k8s_client_go//kubernetes/fake", + ], +) diff --git a/svc/krane/internal/reconciler/watch_current_sentinels.go b/svc/krane/internal/sentinel/actual_state_report.go similarity index 58% rename from svc/krane/internal/reconciler/watch_current_sentinels.go rename to svc/krane/internal/sentinel/actual_state_report.go index 470a2c267d..347de8511a 100644 --- a/svc/krane/internal/reconciler/watch_current_sentinels.go +++ b/svc/krane/internal/sentinel/actual_state_report.go @@ -1,4 +1,4 @@ -package reconciler +package sentinel import ( "context" @@ -10,18 +10,14 @@ import ( "k8s.io/apimachinery/pkg/watch" ) -// watchCurrentSentinels starts a Kubernetes watch for sentinel Deployments and -// reports replica availability back to the control plane in real-time. +// runActualStateReportLoop starts a Kubernetes watch for sentinel Deployments +// and reports actual state changes back to the control plane in real-time. // // The watch filters for resources with the "managed-by: krane" and "component: sentinel" -// labels. When a Deployment's available replica count changes, the method notifies -// the control plane so it knows which sentinels are ready to receive traffic. -// -// This complements [Reconciler.refreshCurrentSentinels] which handles consistency -// for events that might be missed during network partitions or restarts. -func (r *Reconciler) watchCurrentSentinels(ctx context.Context) error { - - w, err := r.clientSet.AppsV1().Deployments(NamespaceSentinel).Watch(ctx, metav1.ListOptions{ +// labels. When a Deployment's available replica count changes, the method reports +// the actual state to the control plane so it knows which sentinels are ready. +func (c *Controller) runActualStateReportLoop(ctx context.Context) error { + w, err := c.clientSet.AppsV1().Deployments(NamespaceSentinel).Watch(ctx, metav1.ListOptions{ LabelSelector: labels.New(). ManagedByKrane(). ComponentSentinel(). @@ -35,15 +31,15 @@ func (r *Reconciler) watchCurrentSentinels(ctx context.Context) error { for event := range w.ResultChan() { switch event.Type { case watch.Error: - r.logger.Error("error watching sentinel", "event", event.Object) + c.logger.Error("error watching sentinel", "event", event.Object) case watch.Bookmark: case watch.Added, watch.Modified: sentinel, ok := event.Object.(*appsv1.Deployment) if !ok { - r.logger.Error("unable to cast object to deployment") + c.logger.Error("unable to cast object to deployment") continue } - r.logger.Info("sentinel added/modified", "name", sentinel.Name) + c.logger.Info("sentinel added/modified", "name", sentinel.Name) desiredReplicas := int32(0) if sentinel.Spec.Replicas != nil { @@ -59,33 +55,32 @@ func (r *Reconciler) watchCurrentSentinels(ctx context.Context) error { health = ctrlv1.Health_HEALTH_UNHEALTHY } - err := r.updateSentinelState(ctx, &ctrlv1.UpdateSentinelStateRequest{ + err := c.reportSentinelStatus(ctx, &ctrlv1.ReportSentinelStatusRequest{ K8SName: sentinel.Name, AvailableReplicas: sentinel.Status.AvailableReplicas, Health: health, }) if err != nil { - r.logger.Error("error updating sentinel state", "error", err.Error()) + c.logger.Error("error reporting sentinel status", "error", err.Error()) } case watch.Deleted: sentinel, ok := event.Object.(*appsv1.Deployment) if !ok { - r.logger.Error("unable to cast object to deployment") + c.logger.Error("unable to cast object to deployment") continue } - r.logger.Info("sentinel deleted", "name", sentinel.Name) - err := r.updateSentinelState(ctx, &ctrlv1.UpdateSentinelStateRequest{ + c.logger.Info("sentinel deleted", "name", sentinel.Name) + err := c.reportSentinelStatus(ctx, &ctrlv1.ReportSentinelStatusRequest{ K8SName: sentinel.Name, AvailableReplicas: 0, Health: ctrlv1.Health_HEALTH_UNHEALTHY, }) if err != nil { - r.logger.Error("error updating sentinel state", "error", err.Error()) + c.logger.Error("error reporting sentinel status", "error", err.Error()) } } } }() return nil - } diff --git a/svc/krane/internal/reconciler/apply_sentinel.go b/svc/krane/internal/sentinel/apply.go similarity index 83% rename from svc/krane/internal/reconciler/apply_sentinel.go rename to svc/krane/internal/sentinel/apply.go index 88e31de51b..f314307d80 100644 --- a/svc/krane/internal/reconciler/apply_sentinel.go +++ b/svc/krane/internal/sentinel/apply.go @@ -1,4 +1,4 @@ -package reconciler +package sentinel import ( "context" @@ -12,6 +12,7 @@ import ( "github.com/unkeyed/unkey/svc/krane/pkg/labels" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" @@ -26,13 +27,13 @@ import ( // // ApplySentinel reports the available replica count back to the control plane after // applying, so the platform knows when the sentinel is ready to receive traffic. -func (r *Reconciler) ApplySentinel(ctx context.Context, req *ctrlv1.ApplySentinel) error { - - r.logger.Info("applying sentinel", +func (c *Controller) ApplySentinel(ctx context.Context, req *ctrlv1.ApplySentinel) error { + c.logger.Info("applying sentinel", "namespace", NamespaceSentinel, "name", req.GetK8SName(), "sentinel_id", req.GetSentinelId(), ) + err := assert.All( assert.NotEmpty(req.GetWorkspaceId(), "Workspace ID is required"), assert.NotEmpty(req.GetProjectId(), "Project ID is required"), @@ -48,17 +49,16 @@ func (r *Reconciler) ApplySentinel(ctx context.Context, req *ctrlv1.ApplySentine return err } - // Sentinel namespace is shared across workspaces, no per-namespace policy needed - if err := r.ensureNamespaceExists(ctx, NamespaceSentinel, "", ""); err != nil { + if err := c.ensureNamespaceExists(ctx); err != nil { return err } - sentinel, err := r.ensureSentinelExists(ctx, req) + sentinel, err := c.ensureSentinelExists(ctx, req) if err != nil { return err } - _, err = r.ensureServiceExists(ctx, req, sentinel) + _, err = c.ensureServiceExists(ctx, req, sentinel) if err != nil { return err } @@ -72,24 +72,37 @@ func (r *Reconciler) ApplySentinel(ctx context.Context, req *ctrlv1.ApplySentine health = ctrlv1.Health_HEALTH_UNHEALTHY } - err = r.updateSentinelState(ctx, &ctrlv1.UpdateSentinelStateRequest{ + err = c.reportSentinelStatus(ctx, &ctrlv1.ReportSentinelStatusRequest{ K8SName: req.GetK8SName(), AvailableReplicas: sentinel.Status.AvailableReplicas, Health: health, }) if err != nil { - r.logger.Error("failed to reconcile sentinel", "sentinel_id", req.GetSentinelId(), "error", err) + c.logger.Error("failed to reconcile sentinel", "sentinel_id", req.GetSentinelId(), "error", err) return err } return nil } +// ensureNamespaceExists creates the sentinel namespace if it doesn't already exist. +func (c *Controller) ensureNamespaceExists(ctx context.Context) error { + _, err := c.clientSet.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: NamespaceSentinel, + }, + }, metav1.CreateOptions{}) + if err != nil && !errors.IsAlreadyExists(err) { + return err + } + return nil +} + // ensureSentinelExists creates or updates the sentinel's Kubernetes Deployment using // server-side apply. Returns the resulting Deployment so the caller can extract // its UID for setting owner references on related resources. -func (r *Reconciler) ensureSentinelExists(ctx context.Context, sentinel *ctrlv1.ApplySentinel) (*appsv1.Deployment, error) { - client := r.clientSet.AppsV1().Deployments(NamespaceSentinel) +func (c *Controller) ensureSentinelExists(ctx context.Context, sentinel *ctrlv1.ApplySentinel) (*appsv1.Deployment, error) { + client := c.clientSet.AppsV1().Deployments(NamespaceSentinel) desired := &appsv1.Deployment{ TypeMeta: metav1.TypeMeta{ @@ -157,7 +170,7 @@ func (r *Reconciler) ensureSentinelExists(ctx context.Context, sentinel *ctrlv1. {Name: "UNKEY_PROJECT_ID", Value: sentinel.GetProjectId()}, {Name: "UNKEY_ENVIRONMENT_ID", Value: sentinel.GetEnvironmentId()}, {Name: "UNKEY_SENTINEL_ID", Value: sentinel.GetSentinelId()}, - {Name: "UNKEY_REGION", Value: r.region}, + {Name: "UNKEY_REGION", Value: c.region}, }, Ports: []corev1.ContainerPort{{ @@ -225,8 +238,8 @@ func (r *Reconciler) ensureSentinelExists(ctx context.Context, sentinel *ctrlv1. // addressing for the sentinel's pods. The Service is owned by the Deployment, which // means Kubernetes garbage collection will delete the Service when the Deployment // is deleted. -func (r *Reconciler) ensureServiceExists(ctx context.Context, sentinel *ctrlv1.ApplySentinel, deployment *appsv1.Deployment) (*corev1.Service, error) { - client := r.clientSet.CoreV1().Services(NamespaceSentinel) +func (c *Controller) ensureServiceExists(ctx context.Context, sentinel *ctrlv1.ApplySentinel, deployment *appsv1.Deployment) (*corev1.Service, error) { + client := c.clientSet.CoreV1().Services(NamespaceSentinel) desired := &corev1.Service{ TypeMeta: metav1.TypeMeta{ @@ -274,3 +287,18 @@ func (r *Reconciler) ensureServiceExists(ctx context.Context, sentinel *ctrlv1.A FieldManager: fieldManagerKrane, }) } + +// sentinelTopologySpread returns topology spread constraints for sentinel pods. +// Spreads pods evenly across availability zones with maxSkew of 1. +func sentinelTopologySpread(sentinelID string) []corev1.TopologySpreadConstraint { + return []corev1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: topologyKeyZone, + WhenUnsatisfiable: corev1.ScheduleAnyway, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: labels.New().SentinelID(sentinelID), + }, + }, + } +} diff --git a/svc/krane/internal/sentinel/consts.go b/svc/krane/internal/sentinel/consts.go new file mode 100644 index 0000000000..7ab7935d9f --- /dev/null +++ b/svc/krane/internal/sentinel/consts.go @@ -0,0 +1,28 @@ +package sentinel + +import corev1 "k8s.io/api/core/v1" + +const ( + // NamespaceSentinel is the Kubernetes namespace where sentinel pods run. + NamespaceSentinel = "sentinel" + + // SentinelPort is the port sentinel pods listen on. + SentinelPort = 8040 + + // SentinelNodeClass is the node class for sentinel workloads. + SentinelNodeClass = "sentinel" + + // fieldManagerKrane identifies krane as the server-side apply field manager. + fieldManagerKrane = "krane" + + // topologyKeyZone is the standard Kubernetes topology key for availability zones + topologyKeyZone = "topology.kubernetes.io/zone" +) + +// sentinelToleration allows sentinel pods to be scheduled on sentinel nodes. +var sentinelToleration = corev1.Toleration{ + Key: "node-class", + Operator: corev1.TolerationOpEqual, + Value: SentinelNodeClass, + Effect: corev1.TaintEffectNoSchedule, +} diff --git a/svc/krane/internal/sentinel/controller.go b/svc/krane/internal/sentinel/controller.go new file mode 100644 index 0000000000..e6c510f0ee --- /dev/null +++ b/svc/krane/internal/sentinel/controller.go @@ -0,0 +1,93 @@ +package sentinel + +import ( + "context" + "fmt" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" + "github.com/unkeyed/unkey/pkg/circuitbreaker" + "github.com/unkeyed/unkey/pkg/otel/logging" + "k8s.io/client-go/kubernetes" +) + +// Controller manages sentinel Deployments and Services in a Kubernetes cluster. +// +// It maintains bidirectional state synchronization with the control plane: +// receiving desired state via WatchSentinels and reporting actual state +// via ReportSentinelStatus. The controller operates independently from +// the DeploymentController with its own version cursor and circuit breaker. +type Controller struct { + clientSet kubernetes.Interface + logger logging.Logger + cluster ctrlv1connect.ClusterServiceClient + cb circuitbreaker.CircuitBreaker[any] + done chan struct{} + region string + versionLastSeen uint64 +} + +// Config holds the configuration required to create a new [Controller]. +type Config struct { + ClientSet kubernetes.Interface + Logger logging.Logger + Cluster ctrlv1connect.ClusterServiceClient + Region string +} + +// New creates a [Controller] ready to be started with [Controller.Start]. +func New(cfg Config) *Controller { + return &Controller{ + clientSet: cfg.ClientSet, + logger: cfg.Logger.With("controller", "sentinels"), + cluster: cfg.Cluster, + cb: circuitbreaker.New[any]("sentinel_state_update"), + done: make(chan struct{}), + region: cfg.Region, + versionLastSeen: 0, + } +} + +// Start launches the three background control loops: +// +// - [Controller.runDesiredStateApplyLoop]: Receives desired state from the +// control plane's SyncSentinels stream and applies it to Kubernetes. +// +// - [Controller.runActualStateReportLoop]: Watches Kubernetes for Deployment +// changes and reports actual state back to the control plane. +// +// - [Controller.runResyncLoop]: Periodically re-queries the control plane for +// each existing sentinel to ensure eventual consistency. +// +// All loops continue until the context is cancelled or [Controller.Stop] is called. +func (c *Controller) Start(ctx context.Context) error { + go c.runResyncLoop(ctx) + + if err := c.runActualStateReportLoop(ctx); err != nil { + return err + } + + go c.runDesiredStateApplyLoop(ctx) + + return nil +} + +// Stop signals all background goroutines to terminate. +func (c *Controller) Stop() error { + close(c.done) + return nil +} + +// reportSentinelStatus pushes sentinel status to the control plane through +// the circuit breaker. The circuit breaker prevents cascading failures during +// control plane outages by failing fast after repeated errors. +func (c *Controller) reportSentinelStatus(ctx context.Context, status *ctrlv1.ReportSentinelStatusRequest) error { + _, err := c.cb.Do(ctx, func(innerCtx context.Context) (any, error) { + return c.cluster.ReportSentinelStatus(innerCtx, connect.NewRequest(status)) + }) + if err != nil { + return fmt.Errorf("failed to report sentinel status: %w", err) + } + return nil +} diff --git a/svc/krane/internal/reconciler/reconciler_test.go b/svc/krane/internal/sentinel/controller_test.go similarity index 57% rename from svc/krane/internal/reconciler/reconciler_test.go rename to svc/krane/internal/sentinel/controller_test.go index 07280378a5..a1011fa57b 100644 --- a/svc/krane/internal/reconciler/reconciler_test.go +++ b/svc/krane/internal/sentinel/controller_test.go @@ -1,16 +1,17 @@ -package reconciler +package sentinel import ( "testing" "github.com/stretchr/testify/require" "github.com/unkeyed/unkey/pkg/otel/logging" + "github.com/unkeyed/unkey/svc/krane/internal/testutil" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/fake" ) -func TestNew_CreatesReconcilerWithCorrectFields(t *testing.T) { +func TestNew_CreatesControllerWithCorrectFields(t *testing.T) { namespace := &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ Name: "test-namespace", @@ -18,7 +19,7 @@ func TestNew_CreatesReconcilerWithCorrectFields(t *testing.T) { } client := fake.NewSimpleClientset(namespace) logger := logging.NewNoop() - mockCluster := &MockClusterClient{} + mockCluster := &testutil.MockClusterClient{} cfg := Config{ ClientSet: client, @@ -27,85 +28,79 @@ func TestNew_CreatesReconcilerWithCorrectFields(t *testing.T) { Region: "us-east-1", } - r := New(cfg) + ctrl := New(cfg) - require.NotNil(t, r) - require.Equal(t, client, r.clientSet) - require.Equal(t, logger, r.logger) - require.Equal(t, mockCluster, r.cluster) - require.Equal(t, "us-east-1", r.region) + require.NotNil(t, ctrl) + require.Equal(t, client, ctrl.clientSet) + require.Equal(t, mockCluster, ctrl.cluster) + require.Equal(t, "us-east-1", ctrl.region) } -func TestNew_CreatesCircuitBreaker(t *testing.T) { +func TestNew_CreatesOwnCircuitBreaker(t *testing.T) { client := fake.NewSimpleClientset() cfg := Config{ ClientSet: client, Logger: logging.NewNoop(), - Cluster: &MockClusterClient{}, + Cluster: &testutil.MockClusterClient{}, Region: "us-east-1", } - r := New(cfg) + ctrl := New(cfg) - require.NotNil(t, r.cb, "circuit breaker should not be nil") + require.NotNil(t, ctrl.cb, "circuit breaker should not be nil") } -func TestNew_CreatesDoneChannel(t *testing.T) { +func TestNew_InitializesVersionCursorToZero(t *testing.T) { client := fake.NewSimpleClientset() cfg := Config{ ClientSet: client, Logger: logging.NewNoop(), - Cluster: &MockClusterClient{}, + Cluster: &testutil.MockClusterClient{}, Region: "us-east-1", } - r := New(cfg) - - require.NotNil(t, r.done, "done channel should not be nil") + ctrl := New(cfg) - select { - case <-r.done: - t.Fatal("done channel should not be closed initially") - default: - } + require.Equal(t, uint64(0), ctrl.versionLastSeen, "version cursor should start at 0") } -func TestStop_ClosesDoneChannel(t *testing.T) { +func TestNew_CreatesDoneChannel(t *testing.T) { client := fake.NewSimpleClientset() cfg := Config{ ClientSet: client, Logger: logging.NewNoop(), - Cluster: &MockClusterClient{}, + Cluster: &testutil.MockClusterClient{}, Region: "us-east-1", } - r := New(cfg) + ctrl := New(cfg) - err := r.Stop() - require.NoError(t, err) + require.NotNil(t, ctrl.done, "done channel should not be nil") select { - case <-r.done: + case <-ctrl.done: + t.Fatal("done channel should not be closed initially") default: - t.Fatal("done channel should be closed after Stop") } } -func TestStop_IsIdempotent(t *testing.T) { +func TestStop_ClosesDoneChannel(t *testing.T) { client := fake.NewSimpleClientset() cfg := Config{ ClientSet: client, Logger: logging.NewNoop(), - Cluster: &MockClusterClient{}, + Cluster: &testutil.MockClusterClient{}, Region: "us-east-1", } - r := New(cfg) + ctrl := New(cfg) - err := r.Stop() + err := ctrl.Stop() require.NoError(t, err) - require.Panics(t, func() { - _ = r.Stop() - }, "calling Stop twice should panic when closing already closed channel") + select { + case <-ctrl.done: + default: + t.Fatal("done channel should be closed after Stop") + } } diff --git a/svc/krane/internal/reconciler/delete_sentinel.go b/svc/krane/internal/sentinel/delete.go similarity index 72% rename from svc/krane/internal/reconciler/delete_sentinel.go rename to svc/krane/internal/sentinel/delete.go index 3f7e8939c5..415bc80cee 100644 --- a/svc/krane/internal/reconciler/delete_sentinel.go +++ b/svc/krane/internal/sentinel/delete.go @@ -1,13 +1,12 @@ -package reconciler +package sentinel import ( "context" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" - - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" ) // DeleteSentinel removes a sentinel's Service and Deployment from the cluster. @@ -15,23 +14,26 @@ import ( // Both resources are deleted explicitly rather than relying on owner reference // cascading, ensuring cleanup completes even if ownership wasn't set correctly. // Not-found errors are ignored since the desired end state is already achieved. -func (r *Reconciler) DeleteSentinel(ctx context.Context, req *ctrlv1.DeleteSentinel) error { - r.logger.Info("deleting sentinel", +func (c *Controller) DeleteSentinel(ctx context.Context, req *ctrlv1.DeleteSentinel) error { + c.logger.Info("deleting sentinel", "namespace", NamespaceSentinel, "name", req.GetK8SName(), ) - err := r.clientSet.CoreV1().Services(NamespaceSentinel).Delete(ctx, req.GetK8SName(), metav1.DeleteOptions{}) + err := c.clientSet.CoreV1().Services(NamespaceSentinel).Delete(ctx, req.GetK8SName(), metav1.DeleteOptions{}) if err != nil && !apierrors.IsNotFound(err) { return err } - err = r.clientSet.AppsV1().Deployments(NamespaceSentinel).Delete(ctx, req.GetK8SName(), metav1.DeleteOptions{}) + + err = c.clientSet.AppsV1().Deployments(NamespaceSentinel).Delete(ctx, req.GetK8SName(), metav1.DeleteOptions{}) if err != nil && !apierrors.IsNotFound(err) { return err } - err = r.updateSentinelState(ctx, &ctrlv1.UpdateSentinelStateRequest{ + + err = c.reportSentinelStatus(ctx, &ctrlv1.ReportSentinelStatusRequest{ K8SName: req.GetK8SName(), AvailableReplicas: 0, + Health: ctrlv1.Health_HEALTH_UNHEALTHY, }) if err != nil { return err diff --git a/svc/krane/internal/sentinel/desired_state_apply.go b/svc/krane/internal/sentinel/desired_state_apply.go new file mode 100644 index 0000000000..4ab9f038b9 --- /dev/null +++ b/svc/krane/internal/sentinel/desired_state_apply.go @@ -0,0 +1,72 @@ +package sentinel + +import ( + "context" + "math/rand/v2" + "time" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" +) + +// runDesiredStateApplyLoop connects to the control plane's SyncSentinels stream +// and applies desired state updates to the Kubernetes cluster. +// +// The loop automatically reconnects with jittered backoff on stream errors. +// Each received state is processed via applyDesiredState, and the version cursor +// is advanced on successful processing. +func (c *Controller) runDesiredStateApplyLoop(ctx context.Context) { + intervalMin := time.Second + intervalMax := 5 * time.Second + + for { + interval := intervalMin + time.Millisecond*time.Duration(rand.Float64()*float64(intervalMax.Milliseconds()-intervalMin.Milliseconds())) + time.Sleep(interval) + + err := c.streamDesiredStateOnce(ctx) + if err != nil { + c.logger.Error("error streaming desired state from control plane", "error", err) + } + } +} + +// streamDesiredStateOnce opens a single connection to the control plane's +// WatchSentinels stream, processes all received states until the stream +// closes or errors, then returns. The caller handles reconnection. +func (c *Controller) streamDesiredStateOnce(ctx context.Context) error { + c.logger.Info("connecting to control plane for desired state") + + stream, err := c.cluster.WatchSentinels(ctx, connect.NewRequest(&ctrlv1.WatchSentinelsRequest{ + Region: c.region, + VersionLastSeen: c.versionLastSeen, + })) + if err != nil { + return err + } + + for stream.Receive() { + c.logger.Info("received desired state from control plane") + state := stream.Msg() + + switch op := state.GetState().(type) { + case *ctrlv1.SentinelState_Apply: + if err := c.ApplySentinel(ctx, op.Apply); err != nil { + return err + } + case *ctrlv1.SentinelState_Delete: + if err := c.DeleteSentinel(ctx, op.Delete); err != nil { + return err + } + } + if state.GetVersion() > c.versionLastSeen { + c.versionLastSeen = state.GetVersion() + } + } + + if err := stream.Close(); err != nil { + c.logger.Error("unable to close control plane stream", "error", err) + return err + } + + return nil +} diff --git a/svc/krane/internal/sentinel/doc.go b/svc/krane/internal/sentinel/doc.go new file mode 100644 index 0000000000..ac4cae0d0c --- /dev/null +++ b/svc/krane/internal/sentinel/doc.go @@ -0,0 +1,51 @@ +// Package sentinel provides the SentinelController for managing sentinel +// Deployments and Services in Kubernetes. +// +// The SentinelController is one half of krane's split control loop architecture. +// It operates independently from the DeploymentController, with its own: +// - Control plane sync stream (SyncSentinels) +// - Version cursor for resumable streaming +// - Circuit breaker for failure isolation +// - Kubernetes watch and refresh loops +// +// # Architecture +// +// The controller runs three loops for reliability: +// +// - [Controller.runDesiredStateApplyLoop]: Receives desired state from the +// control plane's SyncSentinels stream and applies it to Kubernetes. +// +// - [Controller.runActualStateReportLoop]: Watches Kubernetes for Deployment +// changes and reports actual state back to the control plane. +// +// - [Controller.runResyncLoop]: Periodically re-queries the control plane +// for each existing sentinel to ensure eventual consistency. +// +// # Resource Management +// +// Sentinels are infrastructure proxies that route traffic to user deployments. +// Each sentinel gets both a Kubernetes Deployment (for the actual pods) and a +// ClusterIP Service (for stable in-cluster addressing). The Service is owned +// by the Deployment, so deleting the Deployment automatically cleans up the +// Service. +// +// # Failure Isolation +// +// By running as an independent controller, sentinel reconciliation continues +// even if deployment reconciliation is experiencing failures. Each controller +// has its own circuit breaker, so errors in one don't affect the other. +// +// # Usage +// +// ctrl := sentinel.New(sentinel.Config{ +// ClientSet: kubeClient, +// Logger: logger.With("controller", "sentinels"), +// Cluster: clusterClient, +// Region: "us-east-1", +// }) +// +// if err := ctrl.Start(ctx); err != nil { +// return fmt.Errorf("failed to start sentinel controller: %w", err) +// } +// defer ctrl.Stop() +package sentinel diff --git a/svc/krane/internal/sentinel/resync.go b/svc/krane/internal/sentinel/resync.go new file mode 100644 index 0000000000..3968d5e73b --- /dev/null +++ b/svc/krane/internal/sentinel/resync.go @@ -0,0 +1,74 @@ +package sentinel + +import ( + "context" + "time" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/repeat" + "github.com/unkeyed/unkey/svc/krane/pkg/labels" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// runResyncLoop periodically reconciles all sentinel Deployments with their +// desired state from the control plane. +// +// This loop runs every minute as a consistency safety net. While +// [Controller.runActualStateReportLoop] handles real-time K8s events and +// [Controller.runDesiredStateApplyLoop] handles streaming updates, both can miss +// events during network partitions, controller restarts, or buffer overflows. +// This resync loop guarantees eventual consistency by querying the control plane +// for each existing sentinel and applying any needed changes. +func (c *Controller) runResyncLoop(ctx context.Context) { + repeat.Every(1*time.Minute, func() { + c.logger.Info("running periodic resync") + + cursor := "" + for { + deployments, err := c.clientSet.AppsV1().Deployments(NamespaceSentinel).List(ctx, metav1.ListOptions{ + LabelSelector: labels.New(). + ManagedByKrane(). + ComponentSentinel(). + ToString(), + Continue: cursor, + }) + if err != nil { + c.logger.Error("unable to list deployments", "error", err.Error()) + return + } + + for _, deployment := range deployments.Items { + sentinelID, ok := labels.GetSentinelID(deployment.Labels) + if !ok { + c.logger.Error("unable to get sentinel ID", "deployment", deployment.Name) + continue + } + + res, err := c.cluster.GetDesiredSentinelState(ctx, connect.NewRequest(&ctrlv1.GetDesiredSentinelStateRequest{ + SentinelId: sentinelID, + })) + if err != nil { + c.logger.Error("unable to get desired sentinel state", "error", err.Error(), "sentinel_id", sentinelID) + continue + } + + switch res.Msg.GetState().(type) { + case *ctrlv1.SentinelState_Apply: + if err := c.ApplySentinel(ctx, res.Msg.GetApply()); err != nil { + c.logger.Error("unable to apply sentinel", "error", err.Error(), "sentinel_id", sentinelID) + } + case *ctrlv1.SentinelState_Delete: + if err := c.DeleteSentinel(ctx, res.Msg.GetDelete()); err != nil { + c.logger.Error("unable to delete sentinel", "error", err.Error(), "sentinel_id", sentinelID) + } + } + } + + cursor = deployments.Continue + if cursor == "" { + break + } + } + }) +} diff --git a/svc/krane/internal/testutil/BUILD.bazel b/svc/krane/internal/testutil/BUILD.bazel new file mode 100644 index 0000000000..705c116189 --- /dev/null +++ b/svc/krane/internal/testutil/BUILD.bazel @@ -0,0 +1,13 @@ +load("@rules_go//go:def.bzl", "go_library") + +go_library( + name = "testutil", + srcs = ["mock_cluster_client.go"], + importpath = "github.com/unkeyed/unkey/svc/krane/internal/testutil", + visibility = ["//svc/krane:__subpackages__"], + deps = [ + "//gen/proto/ctrl/v1:ctrl", + "//gen/proto/ctrl/v1/ctrlv1connect", + "@com_connectrpc_connect//:connect", + ], +) diff --git a/svc/krane/internal/testutil/mock_cluster_client.go b/svc/krane/internal/testutil/mock_cluster_client.go new file mode 100644 index 0000000000..5e2eda3c83 --- /dev/null +++ b/svc/krane/internal/testutil/mock_cluster_client.go @@ -0,0 +1,73 @@ +// Package testutil provides test utilities shared across krane controller tests. +package testutil + +import ( + "context" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" +) + +var _ ctrlv1connect.ClusterServiceClient = (*MockClusterClient)(nil) + +// MockClusterClient is a test double for the control plane's cluster service. +// +// Each method has an optional function field that tests can set to customize +// behavior. If the function is nil, the method returns a sensible default. +// The mock also records ReportDeploymentStatus and ReportSentinelStatus calls +// so tests can verify the controller reported the correct status. +type MockClusterClient struct { + WatchDeploymentsFunc func(context.Context, *connect.Request[ctrlv1.WatchDeploymentsRequest]) (*connect.ServerStreamForClient[ctrlv1.DeploymentState], error) + WatchSentinelsFunc func(context.Context, *connect.Request[ctrlv1.WatchSentinelsRequest]) (*connect.ServerStreamForClient[ctrlv1.SentinelState], error) + GetDesiredSentinelStateFunc func(context.Context, *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) + ReportSentinelStatusFunc func(context.Context, *connect.Request[ctrlv1.ReportSentinelStatusRequest]) (*connect.Response[ctrlv1.ReportSentinelStatusResponse], error) + GetDesiredDeploymentStateFunc func(context.Context, *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) + ReportDeploymentStatusFunc func(context.Context, *connect.Request[ctrlv1.ReportDeploymentStatusRequest]) (*connect.Response[ctrlv1.ReportDeploymentStatusResponse], error) + ReportDeploymentStatusCalls []*ctrlv1.ReportDeploymentStatusRequest + ReportSentinelStatusCalls []*ctrlv1.ReportSentinelStatusRequest +} + +func (m *MockClusterClient) WatchDeployments(ctx context.Context, req *connect.Request[ctrlv1.WatchDeploymentsRequest]) (*connect.ServerStreamForClient[ctrlv1.DeploymentState], error) { + if m.WatchDeploymentsFunc != nil { + return m.WatchDeploymentsFunc(ctx, req) + } + return nil, nil +} + +func (m *MockClusterClient) WatchSentinels(ctx context.Context, req *connect.Request[ctrlv1.WatchSentinelsRequest]) (*connect.ServerStreamForClient[ctrlv1.SentinelState], error) { + if m.WatchSentinelsFunc != nil { + return m.WatchSentinelsFunc(ctx, req) + } + return nil, nil +} + +func (m *MockClusterClient) GetDesiredSentinelState(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) { + if m.GetDesiredSentinelStateFunc != nil { + return m.GetDesiredSentinelStateFunc(ctx, req) + } + return connect.NewResponse(&ctrlv1.SentinelState{}), nil +} + +func (m *MockClusterClient) ReportSentinelStatus(ctx context.Context, req *connect.Request[ctrlv1.ReportSentinelStatusRequest]) (*connect.Response[ctrlv1.ReportSentinelStatusResponse], error) { + m.ReportSentinelStatusCalls = append(m.ReportSentinelStatusCalls, req.Msg) + if m.ReportSentinelStatusFunc != nil { + return m.ReportSentinelStatusFunc(ctx, req) + } + return connect.NewResponse(&ctrlv1.ReportSentinelStatusResponse{}), nil +} + +func (m *MockClusterClient) GetDesiredDeploymentState(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) { + if m.GetDesiredDeploymentStateFunc != nil { + return m.GetDesiredDeploymentStateFunc(ctx, req) + } + return connect.NewResponse(&ctrlv1.DeploymentState{}), nil +} + +func (m *MockClusterClient) ReportDeploymentStatus(ctx context.Context, req *connect.Request[ctrlv1.ReportDeploymentStatusRequest]) (*connect.Response[ctrlv1.ReportDeploymentStatusResponse], error) { + m.ReportDeploymentStatusCalls = append(m.ReportDeploymentStatusCalls, req.Msg) + if m.ReportDeploymentStatusFunc != nil { + return m.ReportDeploymentStatusFunc(ctx, req) + } + return connect.NewResponse(&ctrlv1.ReportDeploymentStatusResponse{}), nil +} diff --git a/svc/krane/proto/krane/v1/scheduler.proto b/svc/krane/proto/krane/v1/scheduler.proto deleted file mode 100644 index 78bbdb5450..0000000000 --- a/svc/krane/proto/krane/v1/scheduler.proto +++ /dev/null @@ -1,174 +0,0 @@ -syntax = "proto3"; - -package krane.v1; - -option go_package = "github.com/unkeyed/unkey/gen/proto/krane/v1;kranev1"; - -service SchedulerService { - rpc ApplyDeployment(ApplyDeploymentRequest) returns (ApplyDeploymentResponse); - rpc DeleteDeployment(DeleteDeploymentRequest) returns (DeleteDeploymentResponse); - - rpc ApplySentinel(ApplySentinelRequest) returns (ApplySentinelResponse); - rpc DeleteSentinel(DeleteSentinelRequest) returns (DeleteSentinelResponse); - - rpc Watch(WatchRequest) returns (stream State); - - rpc ScrapeOpenApiSchema(ScrapeOpenApiSchemaRequest) returns (ScrapeOpenApiSchemaResponse); -} - -message ScrapeOpenApiSchemaRequest { - string namespace = 1; - string name = 2; - string path = 3; -} - -message ScrapeOpenApiSchemaResponse { - string spec = 1; -} - -message WatchRequest {} - -message State { - message Deployment { - message Instance { - enum Status { - STATUS_UNSPECIFIED = 0; - STATUS_PENDING = 1; // Deployment request accepted, container/pod creation in progress - STATUS_RUNNING = 2; // Container/pod is running and healthy - STATUS_FAILED = 3; // Container/pod failed to start - } - string instance_k8s_name = 1; - string address = 2; - int64 cpu_millicores = 3; - int64 memory_mib = 4; - Status status = 5; - } - - string deployment_k8s_name = 2; - repeated Instance instances = 3; - optional string readiness_id = 11; - } - message Sentinel { - string k8s_name = 1; - int32 available_replicas = 2; - } - - oneof state { - Deployment deployment = 1; - Sentinel sentinel = 2; - } -} - -// ApplySentinel contains the desired configuration for a sentinel. -// -// The cluster agent will ensure a sentinel exists with this exact configuration, creating it if -// it doesn't exist or updating it if it does. All fields except namespace are required. -// The control plane ensures that sentinel_id is unique within the namespace. -message ApplySentinelRequest { - // namespace is the Kubernetes namespace in which the sentinel should exist. - string k8s_namespace = 1; - - string k8s_name = 2; - // workspace_id identifies the workspace that owns this sentinel. - string workspace_id = 3; - - // project_id identifies the project within the workspace. - string project_id = 4; - - // environment_id in which the sentinel should exist. - string environment_id = 5; - - // sentinel_id is the unique identifier for this sentinel globally - string sentinel_id = 6; - - string image = 7; - int32 replicas = 8; - int64 cpu_millicores = 9; - int64 memory_mib = 10; -} - -// ApplySentinelResponse is the response to ApplySentinelRequest. -message ApplySentinelResponse {} - -// DeleteSentinel identifies a sentinel to remove from the cluster. -// -// The sentinel and all its resources (pods, services, frontline) will be deleted. -// In-flight requests may be disrupted unless proper connection draining is configured. -message DeleteSentinelRequest { - string k8s_namespace = 1; - string k8s_name = 2; -} -message DeleteSentinelResponse {} - -// ApplyDeployment contains the desired configuration for a deployment. -// -// The cluster agent will ensure a deployment exists with this exact configuration, creating it if -// it doesn't exist or updating it if it does. All fields except namespace are required. -// The control plane ensures that deployment_id is unique within the namespace. -message ApplyDeploymentRequest { - // namespace is the Kubernetes namespace in which the deployment should exist. - string k8s_namespace = 1; - - string k8s_name = 2; - // workspace_id identifies the workspace that owns this deployment. - // Used for multi-tenancy and access control. - string workspace_id = 3; - - // project_id identifies the project within the workspace. - // Deployments are scoped to projects for organizational purposes. - string project_id = 4; - - // environment_id specifies the environment . - // Used for environment-specific configuration and isolation. - string environment_id = 5; - - // deployment_id is the unique identifier for this deployment within the namespace. - string deployment_id = 6; - - // image is the container image to deploy. - // Must be a valid container registry URL accessible by the cluster. - // Example: "gcr.io/myproject/app:v2.1.0" - string image = 7; - - // replicas is the desired number of pod instances. - // Must be at least 1. Set higher for increased availability and load distribution. - int32 replicas = 8; - - // cpu_millicores is the CPU request/limit in millicores (1000 = 1 CPU core). - // This ensures each pod has sufficient CPU resources. - // Example: 250 = 0.25 CPU cores - int64 cpu_millicores = 9; - - // memory_mib is the memory request/limit in mebibytes. - // This ensures each pod has sufficient memory. - // Example: 256 = 256 MiB - int64 memory_mib = 10; - - // build_id is the unique identifier for this build from depot - // if we did not build this image via depot, no buildID exists and we - // assume kubernetes will pull from a public registry - optional string build_id = 11; - - // Encrypted secrets blob to be decrypted at runtime by inject. - // This is set as UNKEY_ENCRYPTED_ENV env var in the container. - // inject calls krane's DecryptSecretsBlob RPC to decrypt. - bytes encrypted_environment_variables = 12; - - // An opaque identifier used in a restate awakable. - // If set, the cluster must add this as annotation and report back during Watch checks - optional string readiness_id = 13; -} - -message ApplyDeploymentResponse {} - -// DeleteDeployment identifies a deployment to remove from the cluster. -// -// The deployment and all its pods will be terminated gracefully according to -// the configured termination grace period. All associated resources (services, -// configmaps specific to this deployment) will also be cleaned up. -message DeleteDeploymentRequest { - string k8s_namespace = 1; - string k8s_name = 2; -} - -message DeleteDeploymentResponse {} diff --git a/svc/krane/run.go b/svc/krane/run.go index d5c027dfe3..8ec89baef6 100644 --- a/svc/krane/run.go +++ b/svc/krane/run.go @@ -15,7 +15,8 @@ import ( "github.com/unkeyed/unkey/pkg/vault" "github.com/unkeyed/unkey/pkg/vault/storage" pkgversion "github.com/unkeyed/unkey/pkg/version" - "github.com/unkeyed/unkey/svc/krane/internal/reconciler" + "github.com/unkeyed/unkey/svc/krane/internal/deployment" + "github.com/unkeyed/unkey/svc/krane/internal/sentinel" "github.com/unkeyed/unkey/svc/krane/pkg/controlplane" "github.com/unkeyed/unkey/svc/krane/secrets" "github.com/unkeyed/unkey/svc/krane/secrets/token" @@ -85,18 +86,30 @@ func Run(ctx context.Context, cfg Config) error { return fmt.Errorf("failed to create k8s dynamic client: %w", err) } - r := reconciler.New(reconciler.Config{ + // Start the deployment controller (independent control loop) + deploymentCtrl := deployment.New(deployment.Config{ ClientSet: clientset, DynamicClient: dynamicClient, Logger: logger, Cluster: cluster, Region: cfg.Region, }) - if err := r.Start(ctx); err != nil { - return fmt.Errorf("failed to start reconciler: %w", err) + if err := deploymentCtrl.Start(ctx); err != nil { + return fmt.Errorf("failed to start deployment controller: %w", err) } - - shutdowns.Register(r.Stop) + shutdowns.Register(deploymentCtrl.Stop) + + // Start the sentinel controller (independent control loop) + sentinelCtrl := sentinel.New(sentinel.Config{ + ClientSet: clientset, + Logger: logger, + Cluster: cluster, + Region: cfg.Region, + }) + if err := sentinelCtrl.Start(ctx); err != nil { + return fmt.Errorf("failed to start sentinel controller: %w", err) + } + shutdowns.Register(sentinelCtrl.Stop) // Create vault service for secrets decryption diff --git a/web/apps/engineering/content/docs/architecture/services/ctrl/index.mdx b/web/apps/engineering/content/docs/architecture/services/ctrl/index.mdx index ad7da514f4..a32c7473d4 100644 --- a/web/apps/engineering/content/docs/architecture/services/ctrl/index.mdx +++ b/web/apps/engineering/content/docs/architecture/services/ctrl/index.mdx @@ -33,11 +33,11 @@ The ctrl service is built on Connect RPC for service-to-service communication us ### Cluster Service -The cluster service implements sequence-based synchronization for coordinating deployments across multiple regions. Rather than pushing events to connected agents, it exposes a `Sync` RPC that Krane instances poll for state changes. This design makes the control plane stateless with respect to connected clients. +The cluster service implements version-based synchronization for coordinating deployments across multiple regions. Rather than pushing events to connected agents, it exposes `WatchDeployments` and `WatchSentinels` RPCs that Krane instances use to stream state changes. This design makes the control plane stateless with respect to connected clients. -The service provides these key RPCs. `Sync` establishes a server-streaming connection where the control plane polls the `state_changes` table and streams new entries to Krane. For fresh connections (sequence=0), it first streams the complete desired state as a bootstrap, then switches to incremental mode. `GetDesiredDeploymentState` and `GetDesiredSentinelState` return current desired state for individual resources. `UpdateDeploymentState` and `UpdateSentinelState` receive pod status updates from agents. +The service provides these key RPCs. `WatchDeployments` and `WatchSentinels` establish server-streaming connections for receiving deployment and sentinel state changes respectively. For fresh connections (version=0), they first stream the complete desired state as a bootstrap, then switch to incremental mode. `GetDesiredDeploymentState` and `GetDesiredSentinelState` return current desired state for individual resources. `ReportDeploymentStatus` and `ReportSentinelStatus` receive pod status updates from agents. -When resources are created, updated, or deleted, the deploy workflow inserts a row into `state_changes` with a monotonically increasing sequence number. Krane instances polling for that region receive the change and apply it locally. This decouples the control plane from connection management and enables reliable at-least-once delivery through sequence-based resumption. +When resources are created, updated, or deleted, the deploy workflow updates the resource with a monotonically increasing version number. Krane instances watching that region receive the change and apply it locally. This decouples the control plane from connection management and enables reliable at-least-once delivery through version-based resumption. [Read detailed Pull-Based Provisioning docs →](./pull-based-infra) @@ -85,7 +85,7 @@ Workflows are implemented as Restate services for durable execution. The Deploym The ctrl service uses a single MySQL database (`unkey`) that stores all data: projects, environments, and workspaces, along with deployments and deployment history, deployment topology for regional distribution, instances tracking individual pods, domains and SSL certificates, ACME users and challenges, sentinel configurations, and certificate storage in PEM format. -The `state_changes` table is the changelog that drives Krane synchronization. Each row represents a create, update, or delete operation on a deployment or sentinel, with a monotonically increasing sequence number per region. Krane instances poll this table via the `Sync` RPC to receive incremental updates. Rows are indexed by `(region, sequence)` for efficient polling and retained for 7 days before cleanup. +Resource tables (deployment_topology, sentinels) include a `version` column that drives Krane synchronization. Each mutation updates the version via the VersioningService singleton, providing a monotonically increasing version across all resources. Krane instances stream changes via `WatchDeployments` and `WatchSentinels` RPCs to receive incremental updates. Rows are indexed by `(region, version)` for efficient streaming. ## Monitoring diff --git a/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx b/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx index a98223e184..cedb478534 100644 --- a/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx +++ b/web/apps/engineering/content/docs/architecture/services/ctrl/pull-based-infra.mdx @@ -22,9 +22,9 @@ sequenceDiagram participant K1 as Krane Agent (Region A) participant K8s as Kubernetes API - K1->>CP: Sync(region=a, version=0) + K1->>CP: WatchDeployments(region=a, version=0) Note over K1,CP: Bootstrap: stream full state - CP->>K1: Stream all deployments/sentinels + CP->>K1: Stream all deployments Note over K1: Stream closes, max version=42 U->>CP: Create deployment @@ -32,14 +32,14 @@ sequenceDiagram VS-->>CP: version=43 CP->>DB: Store topology (version=43) - Note over K1: Polling every 1-5s - K1->>CP: Sync(region=a, version=42) + Note over K1: Reconnects with last version + K1->>CP: WatchDeployments(region=a, version=42) CP->>DB: SELECT WHERE version > 42 - CP->>K1: Stream State(version=43, deploy) + CP->>K1: Stream DeploymentState(version=43) K1->>K8s: Apply deployment K8s-->>K1: Pod status change - K1->>CP: UpdateInstance(status=running) + K1->>CP: ReportDeploymentStatus(running) CP->>DB: Upsert Instance table" /> @@ -55,9 +55,9 @@ Before any mutation to a deployment topology or sentinel, the control plane call This eliminates the need for a separate changelog table—the version embedded in each resource row is the source of truth for synchronization. -## Sync Protocol +## Watch Protocol -The synchronization protocol uses a single `Sync` RPC that handles both initial bootstrap and incremental updates. This design eliminates the complexity of managing separate "synthetic" and "live" modes, and removes the need for the control plane to track connected clients in memory. +The synchronization protocol uses `WatchDeployments` and `WatchSentinels` RPCs that handle both initial bootstrap and incremental updates. This design eliminates the complexity of managing separate "synthetic" and "live" modes, and removes the need for the control plane to track connected clients in memory. ### Version-Based Tracking @@ -106,33 +106,28 @@ graph TB S[(sentinels)] end subgraph 'Krane Agent' - W[Watcher] - R[Reconciler] DC[Deployment Controller] GC[Sentinel Controller] - - W -->|HandleState| R - R --> DC - R --> GC end subgraph Kubernetes K8s[K8s API Server] end - W -.->|Sync stream| CP + DC -.->|WatchDeployments| CP + GC -.->|WatchSentinels| CP CP -.->|query by version| DT CP -.->|query by version| S DC -->|Apply/Delete| K8s GC -->|Apply/Delete| K8s K8s -->|Watch Events| DC - DC -->|UpdateInstance| CP - GC -->|UpdateSentinel| CP" + DC -->|ReportDeploymentStatus| CP + GC -->|ReportSentinelStatus| CP" /> -The Watcher maintains the Sync stream, reconnecting with jittered backoff (1-5 seconds) on failure. It passes received `State` messages to the Reconciler, which dispatches to the appropriate controller based on resource type. The Reconciler tracks `versionLastSeen` and updates it only after successfully processing state changes and receiving a clean stream close. +Each controller maintains its watch stream, reconnecting with jittered backoff (1-5 seconds) on failure. Controllers process received state messages and track `versionLastSeen`, updating it only after successfully processing state changes and receiving a clean stream close. -Status updates flow back to the control plane through unary RPCs (`UpdateDeploymentState`, `UpdateSentinelState`) with buffering, retries, and circuit breakers for reliability. +Status updates flow back to the control plane through unary RPCs (`ReportDeploymentStatus`, `ReportSentinelStatus`) with buffering, retries, and circuit breakers for reliability. ## Deployment Workflow @@ -155,8 +150,8 @@ sequenceDiagram VS-->>Workflow: version=N Workflow->>DB: Create topology (version=N) - Note over Krane: Polls via Sync(version > lastSeen) - Krane->>Krane: Receive State(Apply) + Note over Krane: Receives via WatchDeployments + Krane->>Krane: Receive DeploymentState(Apply) Krane->>K8s: Apply deployment K8s-->>Krane: Pod created diff --git a/web/apps/engineering/content/docs/architecture/services/krane/index.mdx b/web/apps/engineering/content/docs/architecture/services/krane/index.mdx index ba0b541c56..c344da2a17 100644 --- a/web/apps/engineering/content/docs/architecture/services/krane/index.mdx +++ b/web/apps/engineering/content/docs/architecture/services/krane/index.mdx @@ -15,17 +15,15 @@ Krane pulls desired state from ctrl and ensures the actual cluster state matches ### Pull-Based Model -Krane implements a polling-based architecture where agents in each cluster connect to ctrl's ClusterService via the `Sync` RPC, which establishes a server-streaming connection. The control plane polls its `state_changes` table and streams new entries to connected agents. Krane processes each state change, applies it to Kubernetes, and updates its sequence watermark. On reconnection, Krane sends its last-seen sequence to resume incrementally without missing events. +Krane implements a streaming architecture where agents in each cluster connect to ctrl's ClusterService via `WatchDeployments` and `WatchSentinels` RPCs. These establish server-streaming connections where the control plane queries resource tables directly and streams state changes. Krane processes each state change, applies it to Kubernetes, and updates its version watermark. On reconnection, Krane sends its last-seen version to resume incrementally without missing events. This model eliminates the need for the control plane to track connected clients in memory, simplifying horizontal scaling and removing a class of connection state bugs. -### Sequence-Based Synchronization +### Version-Based Synchronization -The sync engine uses sequence numbers to track state changes. Every modification to deployments or sentinels is recorded in the `state_changes` table with a monotonically increasing sequence number per region. Krane maintains a `sequenceLastSeen` watermark and polls for changes after that sequence. +The sync engine uses version numbers embedded in resource tables to track state changes. Every modification to deployments or sentinels updates a `version` column via the Restate VersioningService singleton, providing a globally unique, monotonically increasing version across all resources. Krane maintains a `versionLastSeen` watermark and requests changes after that version. -On fresh start (sequence=0), Krane receives the complete desired state followed by a `Bookmark` message containing the current maximum sequence. After bootstrap, the stream switches to incremental mode, receiving only new changes as they occur. - -State changes are retained for 7 days. If Krane's last-seen sequence falls behind the retention window, it must perform a full bootstrap. This handles long disconnections gracefully while keeping storage bounded. +On fresh start (version=0), Krane receives the complete desired state as a bootstrap. After bootstrap, the stream switches to incremental mode, receiving only new changes as they occur. ### Why StatefulSets Instead of Deployments? @@ -49,14 +47,13 @@ This is a known design compromise. Future versions might move instance addressin participant Pod User->>Ctrl: Create deployment request - Ctrl->>DB: Store desired state - Ctrl->>DB: Insert state_change(seq=N) + Ctrl->>DB: Store desired state with version=N - Note over Krane: Sync stream polling + Note over Krane: WatchDeployments stream - Krane->>Ctrl: (poll for changes > seq) - Ctrl->>DB: Query state_changes - Ctrl->>Krane: State(seq=N, ApplyDeployment) + Krane->>Ctrl: WatchDeployments(version > lastSeen) + Ctrl->>DB: Query deployment_topology + Ctrl->>Krane: DeploymentState(version=N, ApplyDeployment) Krane->>K8s: Create/Update Service K8s->>Krane: Service ready @@ -67,7 +64,7 @@ This is a known design compromise. Future versions might move instance addressin K8s->>Pod: Pull image & start container Pod->>Pod: Container running - Note over Krane: sequenceLastSeen = N + Note over Krane: versionLastSeen = N `} /> @@ -139,16 +136,16 @@ When querying deployments, Krane verifies the `unkey.managed.by` label matches ` The control plane exposes a `ClusterService` with these key RPCs: -**Sync** establishes a server-streaming connection for receiving state changes. Krane sends its region and last-seen sequence number; the control plane streams bootstrap state (if sequence=0) followed by incremental changes. The control plane polls its `state_changes` table and streams new entries as they appear. +**WatchDeployments** and **WatchSentinels** establish server-streaming connections for receiving state changes. Krane sends its region and last-seen version; the control plane streams bootstrap state (if version=0) followed by incremental changes by querying resource tables directly. **GetDesiredDeploymentState** and **GetDesiredSentinelState** return the current desired state for a specific resource. Used for on-demand reconciliation when Kubernetes reports unexpected state. -**UpdateDeploymentState** and **UpdateSentinelState** receive status updates from Krane about actual Kubernetes state (pod running, pod failed, etc.). +**ReportDeploymentStatus** and **ReportSentinelStatus** receive status updates from Krane about actual Kubernetes state (pod running, pod failed, etc.). ### State Change Distribution -When deployment changes occur, the control plane stores the desired state in the database and inserts a row into `state_changes` with the resource ID, operation type (upsert/delete), and region. Krane instances polling for that region receive the change on their next poll cycle. Each Krane instance independently applies changes to its local cluster. +When deployment changes occur, the control plane stores the desired state in the database with an updated version number. Krane instances watching that region receive the change on their stream. Each Krane instance independently applies changes to its local cluster. ### Multi-Region Support -The `state_changes` table is partitioned by region, so each Krane instance only receives changes relevant to its cluster. The control plane doesn't need to know which Krane instances exist or are connected; it simply writes changes to the database, and any Krane polling that region will receive them. +Resource tables include a region column, so each Krane instance only receives changes relevant to its cluster. The control plane doesn't need to know which Krane instances exist or are connected; it simply writes changes to the database, and any Krane watching that region will receive them. diff --git a/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx b/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx index a8727e8d8b..067a36e4b3 100644 --- a/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx +++ b/web/apps/engineering/content/docs/architecture/services/krane/sync-engine.mdx @@ -13,16 +13,11 @@ The Krane Sync Engine implements a Kubernetes-style List+Watch pattern for synch chart=" graph TB subgraph 'Krane Agent' - W[Watcher] - R[Reconciler] DC[Deployment Controller] GC[Sentinel Controller] IUB[Instance Update Buffer] GUB[Sentinel Update Buffer] - W -->|HandleState| R - R --> DC - R --> GC DC --> IUB GC --> GUB end @@ -43,9 +38,10 @@ graph TB W2[StatefulSet Watcher] end - W -.->|Sync stream| CS - IUB -->|UpdateDeploymentState| CS - GUB -->|UpdateSentinelState| CS + DC -.->|WatchDeployments| CS + GC -.->|WatchSentinels| CS + IUB -->|ReportDeploymentStatus| CS + GUB -->|ReportSentinelStatus| CS DC -->|Apply/Delete| API GC -->|Apply/Delete| API @@ -55,35 +51,35 @@ graph TB ## Sync Protocol -The sync engine uses a single `Sync` RPC to receive state changes from the control plane. This RPC establishes a server-streaming connection where the control plane sends `State` messages containing deployment or sentinel operations. +The sync engine uses `WatchDeployments` and `WatchSentinels` RPCs to receive state changes from the control plane. These RPCs establish server-streaming connections where the control plane sends `DeploymentState` and `SentinelState` messages containing deployment or sentinel operations. ### Version Tracking Each resource (deployment_topology, sentinel) has a `version` column updated on every mutation via the Restate VersioningService singleton. This provides a globally unique, monotonically increasing version across all resources. -The reconciler maintains a `versionLastSeen` field that tracks the highest version successfully processed. On startup, this is zero. After processing each `State` message, the reconciler tracks the max version seen but only commits it after a clean stream close. When reconnecting after a failure, Krane sends its last-seen version in the `SyncRequest`, allowing the control plane to resume from the correct position. +Each controller maintains a `versionLastSeen` field that tracks the highest version successfully processed. On startup, this is zero. After processing each state message, the controller tracks the max version seen but only commits it after a clean stream close. When reconnecting after a failure, Krane sends its last-seen version in the watch request, allowing the control plane to resume from the correct position. ### Message Types -The `State` message contains a version number and one of two payloads: +Each state message contains a version number and one of two payloads: **DeploymentState** contains either an `ApplyDeployment` (create or update a StatefulSet with the specified image, replicas, and resource limits) or `DeleteDeployment` (remove the StatefulSet and its associated Service). **SentinelState** contains either an `ApplySentinel` (create or update a sentinel deployment) or `DeleteSentinel` (remove the sentinel). -Stream close signals that the current batch (or bootstrap) is complete. The client tracks the highest version from received messages and uses it for the next sync request. +Stream close signals that the current batch (or bootstrap) is complete. The client tracks the highest version from received messages and uses it for the next watch request. -## Watcher Loop +## Controller Loops -The Watcher runs a continuous loop with jittered reconnection timing (1-5 seconds between attempts). Each iteration establishes a Sync stream and processes messages until the stream closes or an error occurs. +Each controller (deployment and sentinel) runs a continuous loop with jittered reconnection timing (1-5 seconds between attempts). Each iteration establishes a watch stream and processes messages until the stream closes or an error occurs. ``` for { sleep(random(1s, 5s)) - stream = cluster.Sync(region, versionLastSeen) + stream = cluster.WatchDeployments(region, versionLastSeen) maxVersion = 0 for message in stream { - ver = reconciler.HandleState(message) + ver = controller.HandleState(message) maxVersion = max(maxVersion, ver) } if stream.closed_cleanly: @@ -95,17 +91,13 @@ This design prioritizes simplicity and reliability over latency. The jittered ti ## State Handling -The `HandleState` method on the Reconciler dispatches each state message to the appropriate controller: +Each controller handles its own state messages: ``` HandleState(state): - switch state.Kind: - case Deployment: - if Apply: ApplyDeployment(state.Apply) - if Delete: DeleteDeployment(state.Delete) - case Sentinel: - if Apply: ApplySentinel(state.Apply) - if Delete: DeleteSentinel(state.Delete) + switch state.Type: + case Apply: ApplyDeployment(state.Apply) + case Delete: DeleteDeployment(state.Delete) return state.Version ``` @@ -120,7 +112,7 @@ This eliminates the need for a separate changelog table. The resource tables the ## Kubernetes Watchers -In addition to receiving desired state from the control plane, Krane watches Kubernetes for actual state changes. Pod and StatefulSet watchers notify the controllers when resources change (pod becomes ready, pod fails, etc.). The controllers then report these changes back to the control plane through `UpdateDeploymentState` and `UpdateSentinelState` RPCs. +In addition to receiving desired state from the control plane, Krane watches Kubernetes for actual state changes. Pod and StatefulSet watchers notify the controllers when resources change (pod becomes ready, pod fails, etc.). The controllers then report these changes back to the control plane through `ReportDeploymentStatus` and `ReportSentinelStatus` RPCs. This bidirectional flow ensures the control plane always knows the actual state of resources, enabling the UI to show accurate deployment status and the workflow to detect when deployments are ready. From 3769fead29d2014a2d84ff558e0d69e869d02b6b Mon Sep 17 00:00:00 2001 From: chronark Date: Tue, 20 Jan 2026 18:59:57 +0100 Subject: [PATCH 07/32] fix: comment --- dev/k8s/manifests/cilium-policies.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/k8s/manifests/cilium-policies.yaml b/dev/k8s/manifests/cilium-policies.yaml index bd2d08a21d..f1056a4421 100644 --- a/dev/k8s/manifests/cilium-policies.yaml +++ b/dev/k8s/manifests/cilium-policies.yaml @@ -102,7 +102,7 @@ spec: - port: "8080" protocol: TCP --- -# 5b. Allow sentinel egress to MySQL and DNS +# 5b. Allow sentinel egress to MySQL and ClickHouse # Sentinels need database access for routing and state management apiVersion: cilium.io/v2 kind: CiliumClusterwideNetworkPolicy From 4200fc51a7681655a74b26907056887510329394 Mon Sep 17 00:00:00 2001 From: GitButler Date: Tue, 20 Jan 2026 21:50:29 +0100 Subject: [PATCH 08/32] GitButler Workspace Commit This is a merge commit the virtual branches in your workspace. Due to GitButler managing multiple virtual branches, you cannot switch back and forth between git branches and virtual branches easily. If you switch to another branch, GitButler will need to be reinitialized. If you commit on this branch, GitButler will throw it away. Here are the branches that are currently applied: - decouple-control-workers (refs/gitbutler/decouple-control-workers) branch head: 015bad2d7a822be20a8a9c57ffb40c8a634d7158 For more information about what we're doing here, check out our docs: https://docs.gitbutler.com/features/branch-management/integration-branch From 5210c8e7c76174f4807ecf2c4fac8c54f1fc47c0 Mon Sep 17 00:00:00 2001 From: chronark Date: Tue, 20 Jan 2026 22:37:00 +0100 Subject: [PATCH 09/32] refactor: move restate workflow logic into separate svc/worker this allows us to use restates operator to deploy it independently of our control plane --- cmd/run/BUILD.bazel | 1 + cmd/run/main.go | 4 + cmd/worker/BUILD.bazel | 14 + cmd/worker/main.go | 232 ++++++++ dev/Tiltfile | 11 + dev/docker-compose.yaml | 83 ++- dev/k8s/manifests/ctrl.yaml | 11 +- dev/k8s/manifests/worker.yaml | 169 ++++++ svc/ctrl/BUILD.bazel | 14 +- svc/ctrl/internal/caches/BUILD.bazel | 17 - svc/ctrl/internal/caches/caches.go | 84 --- svc/ctrl/internal/caches/doc.go | 43 -- svc/ctrl/run.go | 337 +----------- svc/worker/BUILD.bazel | 42 ++ .../certificate/BUILD.bazel | 2 +- .../workflows => worker}/certificate/doc.go | 0 .../certificate/process_challenge_handler.go | 0 .../certificate/renew_handler.go | 0 .../certificate/service.go | 0 svc/worker/config.go | 424 ++++++++++++++ .../workflows => worker}/deploy/BUILD.bazel | 2 +- .../deploy/deploy_handler.go | 0 svc/{ctrl/workflows => worker}/deploy/doc.go | 0 .../workflows => worker}/deploy/domains.go | 0 .../workflows => worker}/deploy/helpers.go | 0 .../deploy/promote_handler.go | 0 .../deploy/rollback_handler.go | 0 .../workflows => worker}/deploy/service.go | 0 svc/worker/doc.go | 78 +++ .../workflows => worker}/routing/BUILD.bazel | 2 +- .../routing/assign_domains_handler.go | 0 svc/{ctrl/workflows => worker}/routing/doc.go | 0 .../workflows => worker}/routing/service.go | 0 svc/worker/run.go | 517 ++++++++++++++++++ .../versioning/BUILD.bazel | 2 +- .../workflows => worker}/versioning/doc.go | 0 .../versioning/next_version_handler.go | 0 .../versioning/service.go | 0 38 files changed, 1605 insertions(+), 484 deletions(-) create mode 100644 cmd/worker/BUILD.bazel create mode 100644 cmd/worker/main.go create mode 100644 dev/k8s/manifests/worker.yaml delete mode 100644 svc/ctrl/internal/caches/BUILD.bazel delete mode 100644 svc/ctrl/internal/caches/caches.go delete mode 100644 svc/ctrl/internal/caches/doc.go create mode 100644 svc/worker/BUILD.bazel rename svc/{ctrl/workflows => worker}/certificate/BUILD.bazel (90%) rename svc/{ctrl/workflows => worker}/certificate/doc.go (100%) rename svc/{ctrl/workflows => worker}/certificate/process_challenge_handler.go (100%) rename svc/{ctrl/workflows => worker}/certificate/renew_handler.go (100%) rename svc/{ctrl/workflows => worker}/certificate/service.go (100%) create mode 100644 svc/worker/config.go rename svc/{ctrl/workflows => worker}/deploy/BUILD.bazel (91%) rename svc/{ctrl/workflows => worker}/deploy/deploy_handler.go (100%) rename svc/{ctrl/workflows => worker}/deploy/doc.go (100%) rename svc/{ctrl/workflows => worker}/deploy/domains.go (100%) rename svc/{ctrl/workflows => worker}/deploy/helpers.go (100%) rename svc/{ctrl/workflows => worker}/deploy/promote_handler.go (100%) rename svc/{ctrl/workflows => worker}/deploy/rollback_handler.go (100%) rename svc/{ctrl/workflows => worker}/deploy/service.go (100%) create mode 100644 svc/worker/doc.go rename svc/{ctrl/workflows => worker}/routing/BUILD.bazel (84%) rename svc/{ctrl/workflows => worker}/routing/assign_domains_handler.go (100%) rename svc/{ctrl/workflows => worker}/routing/doc.go (100%) rename svc/{ctrl/workflows => worker}/routing/service.go (100%) create mode 100644 svc/worker/run.go rename svc/{ctrl/workflows => worker}/versioning/BUILD.bazel (81%) rename svc/{ctrl/workflows => worker}/versioning/doc.go (100%) rename svc/{ctrl/workflows => worker}/versioning/next_version_handler.go (100%) rename svc/{ctrl/workflows => worker}/versioning/service.go (100%) diff --git a/cmd/run/BUILD.bazel b/cmd/run/BUILD.bazel index 8cb77eb3f1..05397a3220 100644 --- a/cmd/run/BUILD.bazel +++ b/cmd/run/BUILD.bazel @@ -13,6 +13,7 @@ go_library( "//cmd/preflight", "//cmd/sentinel", "//cmd/vault", + "//cmd/worker", "//pkg/cli", ], ) diff --git a/cmd/run/main.go b/cmd/run/main.go index 1108fbd303..72647b9f70 100644 --- a/cmd/run/main.go +++ b/cmd/run/main.go @@ -11,6 +11,7 @@ import ( "github.com/unkeyed/unkey/cmd/preflight" "github.com/unkeyed/unkey/cmd/sentinel" "github.com/unkeyed/unkey/cmd/vault" + "github.com/unkeyed/unkey/cmd/worker" "github.com/unkeyed/unkey/pkg/cli" ) @@ -33,6 +34,7 @@ AVAILABLE SERVICES: - frontline: Multi-tenant frontline service for TLS termination and routing - sentinel: Environment tenant sentinel service for routing requests to the actual instances - vault: Secret management service for encryption +- worker: Restate worker service for background jobs and workflows EXAMPLES: unkey run api # Run the API server @@ -49,6 +51,7 @@ unkey run api --port 8080 --env production # Run API server with custom con sentinel.Cmd, preflight.Cmd, vault.Cmd, + worker.Cmd, }, Action: runAction, } @@ -62,6 +65,7 @@ func runAction(ctx context.Context, cmd *cli.Command) error { fmt.Println(" sentinel - Environment tenant gateway service for routing requests to the actual instances") fmt.Println(" preflight - Kubernetes mutating webhook for secrets and credentials injection") fmt.Println(" vault - Encryption service for sensitive data") + fmt.Println(" worker - Restate worker service for background jobs and workflows") fmt.Println() fmt.Println("Use 'unkey run ' to start a specific service") fmt.Println("Use 'unkey run --help' for service-specific options") diff --git a/cmd/worker/BUILD.bazel b/cmd/worker/BUILD.bazel new file mode 100644 index 0000000000..ea7a65a962 --- /dev/null +++ b/cmd/worker/BUILD.bazel @@ -0,0 +1,14 @@ +load("@rules_go//go:def.bzl", "go_library") + +go_library( + name = "worker", + srcs = ["main.go"], + importpath = "github.com/unkeyed/unkey/cmd/worker", + visibility = ["//visibility:public"], + deps = [ + "//pkg/cli", + "//pkg/clock", + "//pkg/uid", + "//svc/worker", + ], +) diff --git a/cmd/worker/main.go b/cmd/worker/main.go new file mode 100644 index 0000000000..08e31faa7d --- /dev/null +++ b/cmd/worker/main.go @@ -0,0 +1,232 @@ +package worker + +import ( + "context" + + "github.com/unkeyed/unkey/pkg/cli" + "github.com/unkeyed/unkey/pkg/clock" + "github.com/unkeyed/unkey/pkg/uid" + "github.com/unkeyed/unkey/svc/worker" +) + +// Cmd is the worker command that runs the Unkey Restate worker service for +// handling background jobs, deployments, builds, and certificate management. +var Cmd = &cli.Command{ + Version: "", + Commands: []*cli.Command{}, + Aliases: []string{}, + Description: "", + Name: "worker", + Usage: "Run the Unkey Restate worker service for background jobs and workflows", + Flags: []cli.Flag{ + // Server Configuration + cli.Int("http-port", "HTTP port for the health endpoint. Default: 7092", + cli.Default(7092), cli.EnvVar("UNKEY_WORKER_HTTP_PORT")), + cli.Int("prometheus-port", "Port for Prometheus metrics, set to 0 to disable.", + cli.Default(0), cli.EnvVar("UNKEY_PROMETHEUS_PORT")), + + // Instance Identification + cli.String("instance-id", "Unique identifier for this instance. Auto-generated if not provided.", + cli.Default(uid.New(uid.InstancePrefix, 4)), cli.EnvVar("UNKEY_INSTANCE_ID")), + + // Database Configuration + cli.String("database-primary", "MySQL connection string for primary database. Required for all deployments. Example: user:pass@host:3306/unkey?parseTime=true", + cli.Required(), cli.EnvVar("UNKEY_DATABASE_PRIMARY")), + + // Authentication + cli.String("auth-token", "Authentication token for worker API access.", + cli.EnvVar("UNKEY_AUTH_TOKEN")), + + // Vault Configuration - General secrets (env vars, API keys) + cli.StringSlice("vault-master-keys", "Vault master keys for encryption (general vault)", + cli.Required(), cli.EnvVar("UNKEY_VAULT_MASTER_KEYS")), + cli.String("vault-s3-url", "S3 endpoint URL for general vault", + cli.EnvVar("UNKEY_VAULT_S3_URL")), + cli.String("vault-s3-bucket", "S3 bucket for general vault (env vars, API keys)", + cli.EnvVar("UNKEY_VAULT_S3_BUCKET")), + cli.String("vault-s3-access-key-id", "S3 access key ID for general vault", + cli.EnvVar("UNKEY_VAULT_S3_ACCESS_KEY_ID")), + cli.String("vault-s3-access-key-secret", "S3 secret access key for general vault", + cli.EnvVar("UNKEY_VAULT_S3_ACCESS_KEY_SECRET")), + + // ACME Vault Configuration - Let's Encrypt certificates + cli.StringSlice("acme-vault-master-keys", "Vault master keys for encryption (ACME vault)", + cli.EnvVar("UNKEY_ACME_VAULT_MASTER_KEYS")), + cli.String("acme-vault-s3-url", "S3 endpoint URL for ACME vault", + cli.EnvVar("UNKEY_ACME_VAULT_S3_URL")), + cli.String("acme-vault-s3-bucket", "S3 bucket for ACME vault (Let's Encrypt certs)", + cli.EnvVar("UNKEY_ACME_VAULT_S3_BUCKET")), + cli.String("acme-vault-s3-access-key-id", "S3 access key ID for ACME vault", + cli.EnvVar("UNKEY_ACME_VAULT_S3_ACCESS_KEY_ID")), + cli.String("acme-vault-s3-access-key-secret", "S3 secret access key for ACME vault", + cli.EnvVar("UNKEY_ACME_VAULT_S3_ACCESS_KEY_SECRET")), + + // Build Configuration + cli.String("build-backend", "Build backend to use: 'docker' for local, 'depot' for production. Default: depot", + cli.Default("depot"), cli.EnvVar("UNKEY_BUILD_BACKEND")), + cli.String("build-s3-url", "S3 Compatible Endpoint URL for build contexts", + cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_URL")), + cli.String("build-s3-bucket", "S3 bucket name for build contexts", + cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_BUCKET")), + cli.String("build-s3-access-key-id", "S3 access key ID for build contexts", + cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_ACCESS_KEY_ID")), + cli.String("build-s3-access-key-secret", "S3 secret access key for build contexts", + cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_ACCESS_KEY_SECRET")), + cli.String("build-platform", "Run builds on this platform ('dynamic', 'linux/amd64', 'linux/arm64')", + cli.Default("linux/amd64"), cli.EnvVar("UNKEY_BUILD_PLATFORM")), + + // Registry Configuration + cli.String("registry-url", "URL of the container registry for pulling images. Example: registry.depot.dev", + cli.EnvVar("UNKEY_REGISTRY_URL")), + cli.String("registry-username", "Username for authenticating with the container registry.", + cli.EnvVar("UNKEY_REGISTRY_USERNAME")), + cli.String("registry-password", "Password/token for authenticating with the container registry.", + cli.EnvVar("UNKEY_REGISTRY_PASSWORD")), + + // Depot Build Backend Configuration + cli.String("depot-api-url", "Depot API endpoint URL", + cli.EnvVar("UNKEY_DEPOT_API_URL")), + cli.String("depot-project-region", "Build data will be stored in the chosen region ('us-east-1','eu-central-1')", + cli.EnvVar("UNKEY_DEPOT_PROJECT_REGION"), cli.Default("us-east-1")), + + // ACME Configuration + cli.Bool("acme-enabled", "Enable Let's Encrypt for acme challenges", cli.EnvVar("UNKEY_ACME_ENABLED")), + cli.String("acme-email-domain", "Domain for ACME registration emails (workspace_id@domain)", cli.Default("unkey.com"), cli.EnvVar("UNKEY_ACME_EMAIL_DOMAIN")), + + // Cloudflare DNS provider + cli.Bool("acme-cloudflare-enabled", "Enable Cloudflare for wildcard certificates", cli.EnvVar("UNKEY_ACME_CLOUDFLARE_ENABLED")), + cli.String("acme-cloudflare-api-token", "Cloudflare API token for Let's Encrypt", cli.EnvVar("UNKEY_ACME_CLOUDFLARE_API_TOKEN")), + + // Route53 DNS provider + cli.Bool("acme-route53-enabled", "Enable Route53 for DNS-01 challenges", cli.EnvVar("UNKEY_ACME_ROUTE53_ENABLED")), + cli.String("acme-route53-access-key-id", "AWS access key ID for Route53", cli.EnvVar("UNKEY_ACME_ROUTE53_ACCESS_KEY_ID")), + cli.String("acme-route53-secret-access-key", "AWS secret access key for Route53", cli.EnvVar("UNKEY_ACME_ROUTE53_SECRET_ACCESS_KEY")), + cli.String("acme-route53-region", "AWS region for Route53", cli.Default("us-east-1"), cli.EnvVar("UNKEY_ACME_ROUTE53_REGION")), + cli.String("acme-route53-hosted-zone-id", "Route53 hosted zone ID (bypasses auto-discovery, required when wildcard CNAMEs exist)", cli.EnvVar("UNKEY_ACME_ROUTE53_HOSTED_ZONE_ID")), + + cli.String("default-domain", "Default domain for auto-generated hostnames", cli.Default("unkey.app"), cli.EnvVar("UNKEY_DEFAULT_DOMAIN")), + + // Restate Configuration + cli.String("restate-url", "URL of the Restate ingress endpoint for invoking workflows. Example: http://restate:8080", + cli.Default("http://restate:8080"), cli.EnvVar("UNKEY_RESTATE_INGRESS_URL")), + cli.String("restate-admin-url", "URL of the Restate admin endpoint for service registration. Example: http://restate:9070", + cli.Default("http://restate:9070"), cli.EnvVar("UNKEY_RESTATE_ADMIN_URL")), + cli.Int("restate-http-port", "Port where we listen for Restate HTTP requests. Example: 9080", + cli.Default(9080), cli.EnvVar("UNKEY_RESTATE_HTTP_PORT")), + cli.String("restate-register-as", "URL of this service for self-registration with Restate. Example: http://worker:9080", + cli.EnvVar("UNKEY_RESTATE_REGISTER_AS")), + cli.String("restate-api-key", "API key for Restate ingress requests", + cli.EnvVar("UNKEY_RESTATE_API_KEY")), + + // ClickHouse Configuration + cli.String("clickhouse-url", "ClickHouse connection string for analytics. Required. Example: clickhouse://user:pass@host:9000/unkey", + cli.EnvVar("UNKEY_CLICKHOUSE_URL")), + + // Sentinel configuration + cli.String("sentinel-image", "The image new sentinels get deployed with", cli.Default("ghcr.io/unkeyed/unkey:local"), cli.EnvVar("UNKEY_SENTINEL_IMAGE")), + cli.StringSlice("available-regions", "Available regions for deployment", cli.EnvVar("UNKEY_AVAILABLE_REGIONS"), cli.Default([]string{"local.dev"})), + }, + Action: action, +} + +func action(ctx context.Context, cmd *cli.Command) error { + config := worker.Config{ + // Basic configuration + HttpPort: cmd.Int("http-port"), + PrometheusPort: cmd.Int("prometheus-port"), + InstanceID: cmd.String("instance-id"), + + // Database configuration + DatabasePrimary: cmd.String("database-primary"), + + // Authentication + AuthToken: cmd.String("auth-token"), + + // Vault configuration - General secrets + VaultMasterKeys: cmd.StringSlice("vault-master-keys"), + VaultS3: worker.S3Config{ + URL: cmd.String("vault-s3-url"), + Bucket: cmd.String("vault-s3-bucket"), + AccessKeyID: cmd.String("vault-s3-access-key-id"), + AccessKeySecret: cmd.String("vault-s3-access-key-secret"), + ExternalURL: "", + }, + + // ACME Vault configuration - Let's Encrypt certificates + AcmeVaultMasterKeys: cmd.StringSlice("acme-vault-master-keys"), + AcmeVaultS3: worker.S3Config{ + URL: cmd.String("acme-vault-s3-url"), + Bucket: cmd.String("acme-vault-s3-bucket"), + AccessKeyID: cmd.String("acme-vault-s3-access-key-id"), + AccessKeySecret: cmd.String("acme-vault-s3-access-key-secret"), + ExternalURL: "", + }, + + // Build configuration + BuildBackend: worker.BuildBackend(cmd.String("build-backend")), + BuildS3: worker.S3Config{ + URL: cmd.String("build-s3-url"), + Bucket: cmd.String("build-s3-bucket"), + AccessKeyID: cmd.String("build-s3-access-key-id"), + AccessKeySecret: cmd.String("build-s3-access-key-secret"), + ExternalURL: "", + }, + BuildPlatform: cmd.String("build-platform"), + + // Registry configuration + RegistryURL: cmd.String("registry-url"), + RegistryUsername: cmd.String("registry-username"), + RegistryPassword: cmd.String("registry-password"), + + // Depot build backend configuration + Depot: worker.DepotConfig{ + APIUrl: cmd.String("depot-api-url"), + ProjectRegion: cmd.String("depot-project-region"), + }, + + // Acme configuration + Acme: worker.AcmeConfig{ + Enabled: cmd.Bool("acme-enabled"), + EmailDomain: cmd.String("acme-email-domain"), + Cloudflare: worker.CloudflareConfig{ + Enabled: cmd.Bool("acme-cloudflare-enabled"), + ApiToken: cmd.String("acme-cloudflare-api-token"), + }, + Route53: worker.Route53Config{ + Enabled: cmd.Bool("acme-route53-enabled"), + AccessKeyID: cmd.String("acme-route53-access-key-id"), + SecretAccessKey: cmd.String("acme-route53-secret-access-key"), + Region: cmd.String("acme-route53-region"), + HostedZoneID: cmd.String("acme-route53-hosted-zone-id"), + }, + }, + + DefaultDomain: cmd.String("default-domain"), + + // Restate configuration + Restate: worker.RestateConfig{ + URL: cmd.String("restate-url"), + AdminURL: cmd.String("restate-admin-url"), + HttpPort: cmd.Int("restate-http-port"), + RegisterAs: cmd.String("restate-register-as"), + APIKey: cmd.String("restate-api-key"), + }, + + // Clickhouse Configuration + ClickhouseURL: cmd.String("clickhouse-url"), + + // Common + Clock: clock.New(), + + // Sentinel configuration + SentinelImage: cmd.String("sentinel-image"), + AvailableRegions: cmd.RequireStringSlice("available-regions"), + } + + err := config.Validate() + if err != nil { + return err + } + + return worker.Run(ctx, config) +} diff --git a/dev/Tiltfile b/dev/Tiltfile index da2dfecefd..72df746b05 100644 --- a/dev/Tiltfile +++ b/dev/Tiltfile @@ -139,6 +139,17 @@ k8s_resource( trigger_mode=TRIGGER_MODE_AUTO ) +# Worker service (Restate workflow handlers) +k8s_yaml('k8s/manifests/worker.yaml') +k8s_resource( + 'worker', + port_forwards=['7092:7092', '9080:9080'], + resource_deps=['mysql', 'clickhouse', 'restate', 'build-unkey', 'depot-credentials'], + labels=['unkey'], + auto_init=True, + trigger_mode=TRIGGER_MODE_AUTO +) + # Krane service k8s_yaml('k8s/manifests/krane.yaml') k8s_resource( diff --git a/dev/docker-compose.yaml b/dev/docker-compose.yaml index c9691f14eb..41f1f5db68 100644 --- a/dev/docker-compose.yaml +++ b/dev/docker-compose.yaml @@ -323,7 +323,6 @@ services: command: ["run", "ctrl"] ports: - "7091:7091" - - "9080:9080" # Restate workflow service port depends_on: mysql: condition: service_healthy @@ -346,11 +345,9 @@ services: UNKEY_HTTP_PORT: "7091" UNKEY_DEFAULT_DOMAIN: "unkey.local" - # Restate configuration + # Restate configuration (ctrl only needs ingress client, not server) UNKEY_RESTATE_INGRESS_URL: "http://restate:8080" UNKEY_RESTATE_ADMIN_URL: "http://restate:9070" - UNKEY_RESTATE_HTTP_PORT: "9080" - UNKEY_RESTATE_REGISTER_AS: "http://ctrl:9080" UNKEY_RESTATE_API_KEY: "" # Vault - General secrets (env vars, API keys) @@ -389,6 +386,84 @@ services: UNKEY_CLICKHOUSE_URL: "clickhouse://default:password@clickhouse:9000?secure=false&skip_verify=true" + worker: + networks: + - default + build: + context: ../ + dockerfile: Dockerfile + args: + VERSION: "latest" + container_name: worker + command: ["run", "worker"] + ports: + - "7092:7092" # Health endpoint + - "9080:9080" # Restate workflow service port + depends_on: + mysql: + condition: service_healthy + required: true + s3: + condition: service_healthy + required: true + restate: + condition: service_healthy + required: true + clickhouse: + condition: service_healthy + required: true + volumes: + - /var/run/docker.sock:/var/run/docker.sock + environment: + UNKEY_DATABASE_PRIMARY: "unkey:password@tcp(mysql:3306)/unkey?parseTime=true&interpolateParams=true" + + # Worker configuration + UNKEY_WORKER_HTTP_PORT: "7092" + UNKEY_DEFAULT_DOMAIN: "unkey.local" + + # Restate configuration + UNKEY_RESTATE_INGRESS_URL: "http://restate:8080" + UNKEY_RESTATE_ADMIN_URL: "http://restate:9070" + UNKEY_RESTATE_HTTP_PORT: "9080" + UNKEY_RESTATE_REGISTER_AS: "http://worker:9080" + UNKEY_RESTATE_API_KEY: "" + + # Vault - General secrets (env vars, API keys) + UNKEY_VAULT_S3_URL: "http://s3:3902" + UNKEY_VAULT_S3_BUCKET: "vault" + UNKEY_VAULT_S3_ACCESS_KEY_ID: "minio_root_user" + UNKEY_VAULT_S3_ACCESS_KEY_SECRET: "minio_root_password" + UNKEY_VAULT_MASTER_KEYS: "Ch9rZWtfMmdqMFBJdVhac1NSa0ZhNE5mOWlLSnBHenFPENTt7an5MRogENt9Si6wms4pQ2XIvqNSIgNpaBenJmXgcInhu6Nfv2U=" + # ACME Vault - Let's Encrypt certificates + UNKEY_ACME_VAULT_MASTER_KEYS: "Ch9rZWtfMmdqMFBJdVhac1NSa0ZhNE5mOWlLSnBHenFPENTt7an5MRogENt9Si6wms4pQ2XIvqNSIgNpaBenJmXgcInhu6Nfv2U=" + UNKEY_ACME_VAULT_S3_URL: "http://s3:3902" + UNKEY_ACME_VAULT_S3_BUCKET: "acme-vault" + UNKEY_ACME_VAULT_S3_ACCESS_KEY_ID: "minio_root_user" + UNKEY_ACME_VAULT_S3_ACCESS_KEY_SECRET: "minio_root_password" + + # Build configuration + UNKEY_BUILD_S3_URL: "${UNKEY_BUILD_S3_URL:-http://s3:3902}" + UNKEY_BUILD_S3_BUCKET: "build-contexts" + UNKEY_BUILD_S3_ACCESS_KEY_ID: "${UNKEY_BUILD_S3_ACCESS_KEY_ID:-minio_root_user}" + UNKEY_BUILD_S3_ACCESS_KEY_SECRET: "${UNKEY_BUILD_S3_ACCESS_KEY_SECRET:-minio_root_password}" + + # Build backend configuration + UNKEY_BUILD_BACKEND: "${UNKEY_BUILD_BACKEND:-docker}" + UNKEY_BUILD_PLATFORM: "linux/amd64" + UNKEY_DOCKER_SOCKET: "/var/run/docker.sock" + # Registry configuration (used by both Docker and Depot backends) + UNKEY_REGISTRY_URL: "${UNKEY_REGISTRY_URL:-registry.depot.dev}" + UNKEY_REGISTRY_USERNAME: "${UNKEY_REGISTRY_USERNAME:-x-token}" + UNKEY_REGISTRY_PASSWORD: "${UNKEY_REGISTRY_PASSWORD:-${DEPOT_TOKEN:-}}" + # Depot-specific configuration (only needed when UNKEY_BUILD_BACKEND=depot) + UNKEY_DEPOT_API_URL: "https://api.depot.dev" + UNKEY_DEPOT_PROJECT_REGION: "us-east-1" + + UNKEY_CLICKHOUSE_URL: "clickhouse://default:password@clickhouse:9000?secure=false&skip_verify=true" + + # API key for cluster service + UNKEY_AUTH_TOKEN: "your-local-dev-key" + otel: networks: - default diff --git a/dev/k8s/manifests/ctrl.yaml b/dev/k8s/manifests/ctrl.yaml index c628373e23..0b5813adab 100644 --- a/dev/k8s/manifests/ctrl.yaml +++ b/dev/k8s/manifests/ctrl.yaml @@ -29,7 +29,6 @@ spec: imagePullPolicy: Never # Use local images ports: - containerPort: 7091 - - containerPort: 9080 env: # Server Configuration - name: UNKEY_HTTP_PORT @@ -123,15 +122,11 @@ spec: - name: UNKEY_DEFAULT_DOMAIN value: "unkey.local" - # Restate Configuration + # Restate Configuration (ctrl only needs ingress client) - name: UNKEY_RESTATE_INGRESS_URL value: "http://restate:8080" - name: UNKEY_RESTATE_ADMIN_URL value: "http://restate:9070" - - name: UNKEY_RESTATE_HTTP_PORT - value: "9080" - - name: UNKEY_RESTATE_REGISTER_AS - value: "http://ctrl:9080" - name: UNKEY_RESTATE_API_KEY value: "" @@ -174,8 +169,4 @@ spec: port: 7091 targetPort: 7091 protocol: TCP - - name: restate - port: 9080 - targetPort: 9080 - protocol: TCP type: LoadBalancer diff --git a/dev/k8s/manifests/worker.yaml b/dev/k8s/manifests/worker.yaml new file mode 100644 index 0000000000..7ff4c0b9d9 --- /dev/null +++ b/dev/k8s/manifests/worker.yaml @@ -0,0 +1,169 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: worker + namespace: unkey + labels: + app: worker +spec: + replicas: 1 + selector: + matchLabels: + app: worker + template: + metadata: + labels: + app: worker + spec: + serviceAccountName: unkey-serviceaccount + volumes: + - name: docker-socket + hostPath: + path: /var/run/docker.sock + type: Socket + containers: + - name: worker + image: unkey/go:latest + args: ["run", "worker"] + imagePullPolicy: Never # Use local images + ports: + - containerPort: 7092 + - containerPort: 9080 + env: + # Server Configuration + - name: UNKEY_WORKER_HTTP_PORT + value: "7092" + - name: UNKEY_LOGS_COLOR + value: "true" + # Instance Identification + - name: UNKEY_PLATFORM + value: "kubernetes" + - name: UNKEY_REGION + value: "local" + - name: UNKEY_INSTANCE_ID + value: "worker-dev" + # Database Configuration + - name: UNKEY_DATABASE_PRIMARY + value: "unkey:password@tcp(mysql:3306)/unkey?parseTime=true&interpolateParams=true" + + # Observability - DISABLED for development + - name: UNKEY_OTEL + value: "false" + + # Vault Configuration (required) + - name: UNKEY_VAULT_MASTER_KEYS + value: "Ch9rZWtfMmdqMFBJdVhac1NSa0ZhNE5mOWlLSnBHenFPENTt7an5MRogENt9Si6wms4pQ2XIvqNSIgNpaBenJmXgcInhu6Nfv2U=" + - name: UNKEY_VAULT_S3_URL + value: "http://s3:3902" + - name: UNKEY_VAULT_S3_BUCKET + value: "acme-vault" + - name: UNKEY_VAULT_S3_ACCESS_KEY_ID + value: "minio_root_user" + - name: UNKEY_VAULT_S3_ACCESS_KEY_SECRET + value: "minio_root_password" + + # Build Configuration + - name: UNKEY_BUILD_BACKEND + value: "depot" + - name: UNKEY_BUILD_PLATFORM + value: "linux/arm64" + # Build S3 Storage (from depot-credentials secret) + - name: UNKEY_BUILD_S3_URL + valueFrom: + secretKeyRef: + name: depot-credentials + key: s3-url + - name: UNKEY_BUILD_S3_BUCKET + value: "build-contexts" + - name: UNKEY_BUILD_S3_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: depot-credentials + key: s3-access-key-id + - name: UNKEY_BUILD_S3_ACCESS_KEY_SECRET + valueFrom: + secretKeyRef: + name: depot-credentials + key: s3-access-key-secret + + # Registry Configuration + - name: UNKEY_REGISTRY_URL + value: "registry.depot.dev" + - name: UNKEY_REGISTRY_USERNAME + value: "x-token" + - name: UNKEY_REGISTRY_PASSWORD + valueFrom: + secretKeyRef: + name: depot-credentials + key: token + + # Depot-Specific Configuration + - name: UNKEY_DEPOT_API_URL + value: "https://api.depot.dev" + - name: UNKEY_DEPOT_PROJECT_REGION + value: "us-east-1" + + # ACME Configuration + - name: UNKEY_ACME_ENABLED + value: "false" + + - name: UNKEY_DEFAULT_DOMAIN + value: "unkey.local" + + # Restate Configuration + - name: UNKEY_RESTATE_INGRESS_URL + value: "http://restate:8080" + - name: UNKEY_RESTATE_ADMIN_URL + value: "http://restate:9070" + - name: UNKEY_RESTATE_HTTP_PORT + value: "9080" + - name: UNKEY_RESTATE_REGISTER_AS + value: "http://worker:9080" + - name: UNKEY_RESTATE_API_KEY + value: "" + + # API Key for cluster service + - name: UNKEY_AUTH_TOKEN + value: "your-local-dev-key" + + # ClickHouse Configuration + - name: UNKEY_CLICKHOUSE_URL + value: "clickhouse://default:password@clickhouse:9000?secure=false&skip_verify=true" + + - name: UNKEY_SENTINEL_IMAGE + value: "unkey/sentinel:latest" + volumeMounts: + - name: docker-socket + mountPath: /var/run/docker.sock + initContainers: + - name: wait-for-dependencies + image: busybox:1.36 + command: + [ + "sh", + "-c", + "until nc -z mysql 3306 && nc -z s3 3902 && nc -z restate 8080; do echo waiting for dependencies; sleep 2; done;", + ] + +--- +apiVersion: v1 +kind: Service +metadata: + name: worker + namespace: unkey + labels: + app: worker +spec: + selector: + app: worker + ports: + - name: health + port: 7092 + targetPort: 7092 + protocol: TCP + - name: restate + port: 9080 + targetPort: 9080 + protocol: TCP + type: ClusterIP diff --git a/svc/ctrl/BUILD.bazel b/svc/ctrl/BUILD.bazel index 60f2645d29..0bdea90254 100644 --- a/svc/ctrl/BUILD.bazel +++ b/svc/ctrl/BUILD.bazel @@ -11,24 +11,18 @@ go_library( visibility = ["//visibility:public"], deps = [ "//gen/proto/ctrl/v1/ctrlv1connect", - "//gen/proto/hydra/v1:hydra", "//pkg/assert", + "//pkg/cache", "//pkg/clickhouse", "//pkg/clock", "//pkg/db", "//pkg/otel", "//pkg/otel/logging", "//pkg/prometheus", - "//pkg/retry", "//pkg/shutdown", "//pkg/tls", - "//pkg/uid", - "//pkg/vault", - "//pkg/vault/storage", "//pkg/version", - "//svc/ctrl/internal/caches", "//svc/ctrl/services/acme", - "//svc/ctrl/services/acme/providers", "//svc/ctrl/services/build/backend/depot", "//svc/ctrl/services/build/backend/docker", "//svc/ctrl/services/build/storage", @@ -36,14 +30,8 @@ go_library( "//svc/ctrl/services/ctrl", "//svc/ctrl/services/deployment", "//svc/ctrl/services/openapi", - "//svc/ctrl/workflows/certificate", - "//svc/ctrl/workflows/deploy", - "//svc/ctrl/workflows/routing", - "//svc/ctrl/workflows/versioning", - "@com_github_go_acme_lego_v4//challenge", "@com_github_restatedev_sdk_go//:sdk-go", "@com_github_restatedev_sdk_go//ingress", - "@com_github_restatedev_sdk_go//server", "@org_golang_x_net//http2", "@org_golang_x_net//http2/h2c", ], diff --git a/svc/ctrl/internal/caches/BUILD.bazel b/svc/ctrl/internal/caches/BUILD.bazel deleted file mode 100644 index d02c5f4d4a..0000000000 --- a/svc/ctrl/internal/caches/BUILD.bazel +++ /dev/null @@ -1,17 +0,0 @@ -load("@rules_go//go:def.bzl", "go_library") - -go_library( - name = "caches", - srcs = [ - "caches.go", - "doc.go", - ], - importpath = "github.com/unkeyed/unkey/svc/ctrl/internal/caches", - visibility = ["//svc/ctrl:__subpackages__"], - deps = [ - "//pkg/cache", - "//pkg/clock", - "//pkg/db", - "//pkg/otel/logging", - ], -) diff --git a/svc/ctrl/internal/caches/caches.go b/svc/ctrl/internal/caches/caches.go deleted file mode 100644 index 830a417f94..0000000000 --- a/svc/ctrl/internal/caches/caches.go +++ /dev/null @@ -1,84 +0,0 @@ -package caches - -import ( - "time" - - "github.com/unkeyed/unkey/pkg/cache" - "github.com/unkeyed/unkey/pkg/clock" - "github.com/unkeyed/unkey/pkg/db" - "github.com/unkeyed/unkey/pkg/otel/logging" -) - -// Caches holds all shared cache instances for the ctrl application. -// Caches holds all shared cache instances for ctrl application. -// -// This struct provides centralized access to performance-critical caches -// used throughout the control plane for ACME operations and -// domain validation. -type Caches struct { - // Domains cache stores custom domain data for ACME challenges. - // Reduces database queries during domain validation and ownership checks. - Domains cache.Cache[string, db.CustomDomain] - - // Challenges cache stores ACME challenge tokens and authorizations. - // Has short TTL due to rapid state changes during certificate issuance. - Challenges cache.Cache[string, db.AcmeChallenge] -} - -// Config holds configuration for cache initialization. -// -// Provides logger and clock dependencies for cache instances -// with proper configuration for different data types. -type Config struct { - // Logger for cache operations and error reporting. - Logger logging.Logger - - // Clock provides time operations for TTL calculations. - // Uses clock.New() if not provided for production use. - Clock clock.Clock -} - -// New creates configured cache instances for control plane operations. -// -// This function initializes both domain and challenge caches with -// optimized TTL values based on data access patterns. Domains -// use longer TTL due to infrequent changes, while challenges -// use short TTL due to rapid state changes during ACME flows. -// -// Returns configured Caches struct or error if cache creation fails. -func New(cfg Config) (*Caches, error) { - clk := cfg.Clock - if clk == nil { - clk = clock.New() - } - - domains, err := cache.New(cache.Config[string, db.CustomDomain]{ - Fresh: 5 * time.Minute, - Stale: 10 * time.Minute, - MaxSize: 10000, - Logger: cfg.Logger, - Resource: "domains", - Clock: clk, - }) - if err != nil { - return nil, err - } - - // Short TTL for challenges since they change during ACME flow - challenges, err := cache.New(cache.Config[string, db.AcmeChallenge]{ - Fresh: 10 * time.Second, - Stale: 30 * time.Second, - MaxSize: 1000, - Logger: cfg.Logger, - Resource: "acme_challenges", - Clock: clk, - }) - if err != nil { - return nil, err - } - - return &Caches{ - Domains: domains, - Challenges: challenges, - }, nil -} diff --git a/svc/ctrl/internal/caches/doc.go b/svc/ctrl/internal/caches/doc.go deleted file mode 100644 index a9d6beb3b3..0000000000 --- a/svc/ctrl/internal/caches/doc.go +++ /dev/null @@ -1,43 +0,0 @@ -// Package caches provides shared cache instances for control plane operations. -// -// This package manages in-memory caches for ACME challenge data and -// domain information. These caches improve performance by reducing database -// queries for frequently accessed data during certificate issuance -// and challenge processing. -// -// # Cache Types -// -// [Domains]: Cache for custom domain lookups during ACME challenges. -// Used to validate domain ownership and prevent duplicate registrations. -// -// [Challenges]: Cache for ACME challenge token tracking. -// Stores challenge tokens and authorization data with short TTL due to -// rapid changes during certificate issuance process. -// -// # Configuration -// -// Both caches use different TTL values based on data volatility: -// - Domain cache: 5 minutes fresh, 10 minutes stale -// - Challenge cache: 10 seconds fresh, 30 seconds stale -// -// # Key Types -// -// [Caches]: Container holding all cache instances -// [Config]: Configuration for cache initialization -// -// # Usage -// -// Creating caches for control plane: -// -// caches, err := caches.New(caches.Config{ -// Logger: logger, -// Clock: clock.New(), -// }) -// if err != nil { -// log.Fatal(err) -// } -// -// // Access caches -// domain, found := caches.Domains.Get("example.com") -// challenge, found := caches.Challenges.Get("challenge-token") -package caches diff --git a/svc/ctrl/run.go b/svc/ctrl/run.go index 525dd3b35f..57ed13acb4 100644 --- a/svc/ctrl/run.go +++ b/svc/ctrl/run.go @@ -1,37 +1,26 @@ package ctrl import ( - "bytes" "context" - "database/sql" "fmt" "log/slog" "net" "net/http" - "os" "time" - "github.com/go-acme/lego/v4/challenge" restate "github.com/restatedev/sdk-go" restateIngress "github.com/restatedev/sdk-go/ingress" - restateServer "github.com/restatedev/sdk-go/server" "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" - hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" + "github.com/unkeyed/unkey/pkg/cache" "github.com/unkeyed/unkey/pkg/clickhouse" "github.com/unkeyed/unkey/pkg/clock" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/otel" "github.com/unkeyed/unkey/pkg/otel/logging" "github.com/unkeyed/unkey/pkg/prometheus" - "github.com/unkeyed/unkey/pkg/retry" "github.com/unkeyed/unkey/pkg/shutdown" - "github.com/unkeyed/unkey/pkg/uid" - "github.com/unkeyed/unkey/pkg/vault" - "github.com/unkeyed/unkey/pkg/vault/storage" pkgversion "github.com/unkeyed/unkey/pkg/version" - ctrlCaches "github.com/unkeyed/unkey/svc/ctrl/internal/caches" "github.com/unkeyed/unkey/svc/ctrl/services/acme" - "github.com/unkeyed/unkey/svc/ctrl/services/acme/providers" "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/depot" "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/docker" buildStorage "github.com/unkeyed/unkey/svc/ctrl/services/build/storage" @@ -39,10 +28,6 @@ import ( "github.com/unkeyed/unkey/svc/ctrl/services/ctrl" "github.com/unkeyed/unkey/svc/ctrl/services/deployment" "github.com/unkeyed/unkey/svc/ctrl/services/openapi" - "github.com/unkeyed/unkey/svc/ctrl/workflows/certificate" - "github.com/unkeyed/unkey/svc/ctrl/workflows/deploy" - "github.com/unkeyed/unkey/svc/ctrl/workflows/routing" - "github.com/unkeyed/unkey/svc/ctrl/workflows/versioning" "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" ) @@ -53,13 +38,9 @@ import ( // It performs these major initialization steps: // 1. Validates configuration and initializes structured logging // 2. Sets up OpenTelemetry if enabled -// 3. Creates vault services for secrets and ACME certificates -// 4. Initializes database and build storage -// 5. Starts Restate workflow engine with service bindings -// 6. Configures ACME challenge providers (HTTP-01, DNS-01) -// 7. Registers with Restate admin API for service discovery -// 8. Starts HTTP/2 server with all Connect handlers -// 9. Boots up cluster management and starts certificate renewal +// 3. Initializes database and build storage +// 4. Creates Restate ingress client for invoking workflows +// 5. Starts HTTP/2 server with all Connect handlers // // The server handles graceful shutdown when context is cancelled, properly // closing all services and database connections. @@ -73,11 +54,6 @@ func Run(ctx context.Context, cfg Config) error { return fmt.Errorf("bad config: %w", err) } - // Disable CNAME following in lego to prevent it from following wildcard CNAMEs - // (e.g., *.example.com -> loadbalancer.aws.com) and failing Route53 zone lookup. - // Must be set before creating any ACME DNS providers. - _ = os.Setenv("LEGO_DISABLE_CNAME_SUPPORT", "true") - shutdowns := shutdown.New() if cfg.OtelEnabled { @@ -113,56 +89,6 @@ func Run(ctx context.Context, cfg Config) error { logger.Info("TLS is enabled, server will use HTTPS") } - // Create vault service for general secrets (env vars, API keys, etc.) - var vaultSvc *vault.Service - if len(cfg.VaultMasterKeys) > 0 && cfg.VaultS3.URL != "" { - vaultStorage, vaultStorageErr := storage.NewS3(storage.S3Config{ - Logger: logger, - S3URL: cfg.VaultS3.URL, - S3Bucket: cfg.VaultS3.Bucket, - S3AccessKeyID: cfg.VaultS3.AccessKeyID, - S3AccessKeySecret: cfg.VaultS3.AccessKeySecret, - }) - if vaultStorageErr != nil { - return fmt.Errorf("unable to create vault storage: %w", vaultStorageErr) - } - - vaultSvc, err = vault.New(vault.Config{ - Logger: logger, - Storage: vaultStorage, - MasterKeys: cfg.VaultMasterKeys, - }) - if err != nil { - return fmt.Errorf("unable to create vault service: %w", err) - } - logger.Info("Vault service initialized", "bucket", cfg.VaultS3.Bucket) - } - - // Create separate vault service for ACME certificates - var acmeVaultSvc *vault.Service - if len(cfg.AcmeVaultMasterKeys) > 0 && cfg.AcmeVaultS3.URL != "" { - acmeVaultStorage, acmeStorageErr := storage.NewS3(storage.S3Config{ - Logger: logger, - S3URL: cfg.AcmeVaultS3.URL, - S3Bucket: cfg.AcmeVaultS3.Bucket, - S3AccessKeyID: cfg.AcmeVaultS3.AccessKeyID, - S3AccessKeySecret: cfg.AcmeVaultS3.AccessKeySecret, - }) - if acmeStorageErr != nil { - return fmt.Errorf("unable to create ACME vault storage: %w", acmeStorageErr) - } - - acmeVaultSvc, err = vault.New(vault.Config{ - Logger: logger, - Storage: acmeVaultStorage, - MasterKeys: cfg.AcmeVaultMasterKeys, - }) - if err != nil { - return fmt.Errorf("unable to create ACME vault service: %w", err) - } - logger.Info("ACME vault service initialized", "bucket", cfg.AcmeVaultS3.Bucket) - } - // Initialize database database, err := db.New(db.Config{ PrimaryDSN: cfg.DatabasePrimary, @@ -227,13 +153,12 @@ func Run(ctx context.Context, cfg Config) error { return fmt.Errorf("unknown build backend: %s (must be 'docker' or 'depot')", cfg.BuildBackend) } - // Restate Client and Server + // Restate ingress client for invoking workflows restateClientOpts := []restate.IngressClientOption{} if cfg.Restate.APIKey != "" { restateClientOpts = append(restateClientOpts, restate.WithAuthKey(cfg.Restate.APIKey)) } restateClient := restateIngress.NewClient(cfg.Restate.URL, restateClientOpts...) - restateSrv := restateServer.NewRestate() c := cluster.New(cluster.Config{ Database: database, @@ -241,170 +166,30 @@ func Run(ctx context.Context, cfg Config) error { Bearer: cfg.AuthToken, }) - restateSrv.Bind(hydrav1.NewDeploymentServiceServer(deploy.New(deploy.Config{ - Logger: logger, - DB: database, - BuildClient: buildService, - DefaultDomain: cfg.DefaultDomain, - Vault: vaultSvc, - Cluster: c, - SentinelImage: cfg.SentinelImage, - AvailableRegions: cfg.AvailableRegions, - Bearer: cfg.AuthToken, - }))) - - restateSrv.Bind(hydrav1.NewRoutingServiceServer(routing.New(routing.Config{ - Logger: logger, - DB: database, - DefaultDomain: cfg.DefaultDomain, - }), restate.WithIngressPrivate(true))) - - restateSrv.Bind(hydrav1.NewVersioningServiceServer(versioning.New(), restate.WithIngressPrivate(true))) - - // Initialize shared caches for ACME (needed for verification endpoint regardless of provider config) - caches, cacheErr := ctrlCaches.New(ctrlCaches.Config{ - Logger: logger, - Clock: clock.New(), + // Initialize caches for ACME service (needed for certificate verification endpoint) + clk := clock.New() + domainCache, err := cache.New(cache.Config[string, db.CustomDomain]{ + Fresh: 5 * time.Minute, + Stale: 10 * time.Minute, + MaxSize: 10000, + Logger: logger, + Resource: "domains", + Clock: clk, }) - if cacheErr != nil { - return fmt.Errorf("failed to create ACME caches: %w", cacheErr) - } - - // Setup ACME challenge providers - var dnsProvider challenge.Provider - var httpProvider challenge.Provider - if cfg.Acme.Enabled { - // HTTP-01 provider for regular (non-wildcard) domains - httpProv, httpErr := providers.NewHTTPProvider(providers.HTTPConfig{ - DB: database, - Logger: logger, - DomainCache: caches.Domains, - }) - if httpErr != nil { - return fmt.Errorf("failed to create HTTP-01 provider: %w", httpErr) - } - httpProvider = httpProv - logger.Info("ACME HTTP-01 provider enabled") - - // DNS-01 provider for wildcard domains (requires DNS provider config) - if cfg.Acme.Cloudflare.Enabled { - cfProvider, cfErr := providers.NewCloudflareProvider(providers.CloudflareConfig{ - DB: database, - Logger: logger, - APIToken: cfg.Acme.Cloudflare.ApiToken, - DomainCache: caches.Domains, - }) - if cfErr != nil { - return fmt.Errorf("failed to create Cloudflare DNS provider: %w", cfErr) - } - dnsProvider = cfProvider - logger.Info("ACME Cloudflare DNS-01 provider enabled for wildcard certs") - } else if cfg.Acme.Route53.Enabled { - r53Provider, r53Err := providers.NewRoute53Provider(providers.Route53Config{ - DB: database, - Logger: logger, - AccessKeyID: cfg.Acme.Route53.AccessKeyID, - SecretAccessKey: cfg.Acme.Route53.SecretAccessKey, - Region: cfg.Acme.Route53.Region, - HostedZoneID: cfg.Acme.Route53.HostedZoneID, - DomainCache: caches.Domains, - }) - if r53Err != nil { - return fmt.Errorf("failed to create Route53 DNS provider: %w", r53Err) - } - dnsProvider = r53Provider - logger.Info("ACME Route53 DNS-01 provider enabled for wildcard certs") - } + if err != nil { + return fmt.Errorf("failed to create domain cache: %w", err) } - // Certificate service needs a longer timeout for ACME DNS-01 challenges - // which can take 5-10 minutes for DNS propagation - restateSrv.Bind(hydrav1.NewCertificateServiceServer(certificate.New(certificate.Config{ - Logger: logger, - DB: database, - Vault: acmeVaultSvc, - EmailDomain: cfg.Acme.EmailDomain, - DefaultDomain: cfg.DefaultDomain, - DNSProvider: dnsProvider, - HTTPProvider: httpProvider, - }), restate.WithInactivityTimeout(15*time.Minute))) - - go func() { - addr := fmt.Sprintf(":%d", cfg.Restate.HttpPort) - logger.Info("Starting Restate server", "addr", addr) - if startErr := restateSrv.Start(ctx, addr); startErr != nil { - logger.Error("failed to start restate server", "error", startErr.Error()) - } - }() - - // Register with Restate admin API if RegisterAs is configured - if cfg.Restate.RegisterAs != "" { - go func() { - // Wait a moment for the restate server to be ready - time.Sleep(2 * time.Second) - - registerURL := fmt.Sprintf("%s/deployments", cfg.Restate.AdminURL) - payload := fmt.Sprintf(`{"uri": "%s"}`, cfg.Restate.RegisterAs) - - logger.Info("Registering with Restate", "admin_url", registerURL, "service_uri", cfg.Restate.RegisterAs) - - retrier := retry.New( - retry.Attempts(10), - retry.Backoff(func(n int) time.Duration { - return 5 * time.Second - }), - ) - - err := retrier.Do(func() error { - req, err := http.NewRequestWithContext(ctx, http.MethodPost, registerURL, bytes.NewBufferString(payload)) - if err != nil { - return fmt.Errorf("failed to create registration request: %w", err) - } - - req.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(req) - if err != nil { - return fmt.Errorf("failed to register with Restate: %w", err) - } - defer func() { _ = resp.Body.Close() }() - - if resp.StatusCode >= 200 && resp.StatusCode < 300 { - return nil - } - - return fmt.Errorf("registration returned status %d", resp.StatusCode) - }) - - if err != nil { - logger.Error("failed to register with Restate after retries", "error", err.Error()) - } else { - logger.Info("Successfully registered with Restate") - - // Bootstrap wildcard certificate for default domain if ACME is enabled - if cfg.Acme.Enabled && dnsProvider != nil && cfg.DefaultDomain != "" { - bootstrapWildcardDomain(ctx, database, logger, cfg.DefaultDomain) - } - - // Start the certificate renewal cron job if ACME is enabled - // Use Send with idempotency key so multiple restarts don't create duplicate crons - if cfg.Acme.Enabled && dnsProvider != nil { - certClient := hydrav1.NewCertificateServiceIngressClient(restateClient, "global") - _, startErr := certClient.RenewExpiringCertificates().Send( - ctx, - &hydrav1.RenewExpiringCertificatesRequest{ - DaysBeforeExpiry: 30, - }, - restate.WithIdempotencyKey("cert-renewal-cron-startup"), - ) - if startErr != nil { - logger.Warn("failed to start certificate renewal cron", "error", startErr) - } else { - logger.Info("Certificate renewal cron job started") - } - } - } - }() + challengeCache, err := cache.New(cache.Config[string, db.AcmeChallenge]{ + Fresh: 10 * time.Second, + Stale: 30 * time.Second, + MaxSize: 1000, + Logger: logger, + Resource: "acme_challenges", + Clock: clk, + }) + if err != nil { + return fmt.Errorf("failed to create challenge cache: %w", err) } // Create the connect handler @@ -428,8 +213,8 @@ func Run(ctx context.Context, cfg Config) error { mux.Handle(ctrlv1connect.NewAcmeServiceHandler(acme.New(acme.Config{ DB: database, Logger: logger, - DomainCache: caches.Domains, - ChallengeCache: caches.Challenges, + DomainCache: domainCache, + ChallengeCache: challengeCache, }))) mux.Handle(ctrlv1connect.NewClusterServiceHandler(c)) @@ -514,69 +299,3 @@ func Run(ctx context.Context, cfg Config) error { logger.Info("Ctrl server shut down successfully") return nil } - -// bootstrapWildcardDomain ensures a wildcard domain and ACME challenge exist for the default domain. -// -// This helper function creates the necessary database records for automatic -// wildcard certificate issuance during startup. It checks if the wildcard -// domain already exists and creates both the custom domain record and -// ACME challenge record if needed. -// -// The function uses "unkey_internal" as the workspace ID for -// platform-managed resources, ensuring separation from user workspaces. -// -// This is called during control plane startup when ACME is enabled and -// a default domain is configured, allowing the renewal cron job to -// automatically issue wildcard certificates without manual intervention. -func bootstrapWildcardDomain(ctx context.Context, database db.Database, logger logging.Logger, defaultDomain string) { - wildcardDomain := "*." + defaultDomain - - // Check if the wildcard domain already exists - _, err := db.Query.FindCustomDomainByDomain(ctx, database.RO(), wildcardDomain) - if err == nil { - logger.Info("Wildcard domain already exists", "domain", wildcardDomain) - return - } - if !db.IsNotFound(err) { - logger.Error("Failed to check for existing wildcard domain", "error", err, "domain", wildcardDomain) - return - } - - // Create the custom domain record - domainID := uid.New(uid.DomainPrefix) - now := time.Now().UnixMilli() - - // Use "unkey_internal" as the workspace for platform-managed resources - workspaceID := "unkey_internal" - err = db.Query.UpsertCustomDomain(ctx, database.RW(), db.UpsertCustomDomainParams{ - ID: domainID, - WorkspaceID: workspaceID, - Domain: wildcardDomain, - ChallengeType: db.CustomDomainsChallengeTypeDNS01, - CreatedAt: now, - UpdatedAt: sql.NullInt64{Int64: now, Valid: true}, - }) - if err != nil { - logger.Error("Failed to create wildcard domain", "error", err, "domain", wildcardDomain) - return - } - - // Create the ACME challenge record with status 'waiting' so the renewal cron picks it up - err = db.Query.InsertAcmeChallenge(ctx, database.RW(), db.InsertAcmeChallengeParams{ - WorkspaceID: workspaceID, - DomainID: domainID, - Token: "", - Authorization: "", - Status: db.AcmeChallengesStatusWaiting, - ChallengeType: db.AcmeChallengesChallengeTypeDNS01, - CreatedAt: now, - UpdatedAt: sql.NullInt64{Int64: now, Valid: true}, - ExpiresAt: 0, // Will be set when certificate is issued - }) - if err != nil { - logger.Error("Failed to create ACME challenge for wildcard domain", "error", err, "domain", wildcardDomain) - return - } - - logger.Info("Bootstrapped wildcard domain for certificate issuance", "domain", wildcardDomain) -} diff --git a/svc/worker/BUILD.bazel b/svc/worker/BUILD.bazel new file mode 100644 index 0000000000..2e5722a70b --- /dev/null +++ b/svc/worker/BUILD.bazel @@ -0,0 +1,42 @@ +load("@rules_go//go:def.bzl", "go_library") + +go_library( + name = "worker", + srcs = [ + "config.go", + "doc.go", + "run.go", + ], + importpath = "github.com/unkeyed/unkey/svc/worker", + visibility = ["//visibility:public"], + deps = [ + "//gen/proto/ctrl/v1/ctrlv1connect", + "//gen/proto/hydra/v1:hydra", + "//pkg/assert", + "//pkg/cache", + "//pkg/clickhouse", + "//pkg/clock", + "//pkg/db", + "//pkg/otel/logging", + "//pkg/prometheus", + "//pkg/retry", + "//pkg/shutdown", + "//pkg/uid", + "//pkg/vault", + "//pkg/vault/storage", + "//pkg/zen", + "//svc/ctrl/services/acme/providers", + "//svc/ctrl/services/build/backend/depot", + "//svc/ctrl/services/build/backend/docker", + "//svc/ctrl/services/build/storage", + "//svc/ctrl/services/cluster", + "//svc/worker/certificate", + "//svc/worker/deploy", + "//svc/worker/routing", + "//svc/worker/versioning", + "@com_github_go_acme_lego_v4//challenge", + "@com_github_restatedev_sdk_go//:sdk-go", + "@com_github_restatedev_sdk_go//ingress", + "@com_github_restatedev_sdk_go//server", + ], +) diff --git a/svc/ctrl/workflows/certificate/BUILD.bazel b/svc/worker/certificate/BUILD.bazel similarity index 90% rename from svc/ctrl/workflows/certificate/BUILD.bazel rename to svc/worker/certificate/BUILD.bazel index fc14f43c2f..bbbe22e336 100644 --- a/svc/ctrl/workflows/certificate/BUILD.bazel +++ b/svc/worker/certificate/BUILD.bazel @@ -8,7 +8,7 @@ go_library( "renew_handler.go", "service.go", ], - importpath = "github.com/unkeyed/unkey/svc/ctrl/workflows/certificate", + importpath = "github.com/unkeyed/unkey/svc/worker/certificate", visibility = ["//visibility:public"], deps = [ "//gen/proto/hydra/v1:hydra", diff --git a/svc/ctrl/workflows/certificate/doc.go b/svc/worker/certificate/doc.go similarity index 100% rename from svc/ctrl/workflows/certificate/doc.go rename to svc/worker/certificate/doc.go diff --git a/svc/ctrl/workflows/certificate/process_challenge_handler.go b/svc/worker/certificate/process_challenge_handler.go similarity index 100% rename from svc/ctrl/workflows/certificate/process_challenge_handler.go rename to svc/worker/certificate/process_challenge_handler.go diff --git a/svc/ctrl/workflows/certificate/renew_handler.go b/svc/worker/certificate/renew_handler.go similarity index 100% rename from svc/ctrl/workflows/certificate/renew_handler.go rename to svc/worker/certificate/renew_handler.go diff --git a/svc/ctrl/workflows/certificate/service.go b/svc/worker/certificate/service.go similarity index 100% rename from svc/ctrl/workflows/certificate/service.go rename to svc/worker/certificate/service.go diff --git a/svc/worker/config.go b/svc/worker/config.go new file mode 100644 index 0000000000..9af9b1d472 --- /dev/null +++ b/svc/worker/config.go @@ -0,0 +1,424 @@ +package worker + +import ( + "fmt" + "strings" + + "github.com/unkeyed/unkey/pkg/assert" + "github.com/unkeyed/unkey/pkg/clock" +) + +// BuildBackend specifies the container image build backend system. +// +// Determines which service will be used for building container images +// from application source code. Each backend has different capabilities +// and integration requirements. +type BuildBackend string + +const ( + // BuildBackendDepot uses Depot.dev for container builds. + // Provides optimized cloud-native builds with caching and + // integrated registry management. + BuildBackendDepot BuildBackend = "depot" + // BuildBackendDocker uses local Docker daemon for builds. + // Provides on-premises builds with direct Docker integration. + BuildBackendDocker BuildBackend = "docker" +) + +// S3Config holds S3 configuration for storage backends. +// +// This configuration is used by vault, build storage, and other services +// that need to store data in S3-compatible object storage. +type S3Config struct { + // URL is the S3 endpoint URL including protocol and region. + // Examples: "https://s3.amazonaws.com" or "https://s3.us-west-2.amazonaws.com". + URL string + + // Bucket is the S3 bucket name for storing objects. + // Must exist and be accessible with the provided credentials. + Bucket string + + // AccessKeyID is the AWS access key ID for S3 authentication. + // Must have appropriate permissions for bucket operations. + AccessKeyID string + + // AccessKeySecret is the AWS secret access key for S3 authentication. + // Should be stored securely and rotated regularly. + AccessKeySecret string + + // ExternalURL is the public-facing URL for accessing S3 objects. + // Used when objects need to be accessed from outside the AWS network. + // Optional - can be empty for internal-only access. + ExternalURL string +} + +// CloudflareConfig holds Cloudflare API configuration for ACME DNS-01 challenges. +// +// This configuration enables automatic DNS record creation for wildcard +// TLS certificates through Cloudflare's DNS API. +type CloudflareConfig struct { + // Enabled determines whether Cloudflare DNS-01 challenges are used. + // When true, wildcard certificates can be automatically obtained. + Enabled bool + + // ApiToken is the Cloudflare API token for DNS management. + // Requires Zone:Read and DNS:Edit permissions for the target zones. + ApiToken string +} + +// Route53Config holds AWS Route53 configuration for ACME DNS-01 challenges. +// +// This configuration enables automatic DNS record creation for wildcard +// TLS certificates through AWS Route53 DNS API. +type Route53Config struct { + // Enabled determines whether Route53 DNS-01 challenges are used. + // When true, wildcard certificates can be automatically obtained. + Enabled bool + + // AccessKeyID is the AWS access key ID for Route53 API access. + AccessKeyID string + + // SecretAccessKey is the AWS secret access key for Route53 API access. + SecretAccessKey string + + // Region is the AWS region where Route53 hosted zones are located. + // Example: "us-east-1", "us-west-2". + Region string + + // HostedZoneID overrides automatic zone discovery. + // Required when domains have complex CNAME setups that confuse + // automatic zone lookup (e.g., wildcard CNAMEs to load balancers). + HostedZoneID string +} + +// AcmeConfig holds configuration for ACME TLS certificate management. +// +// This configuration enables automatic certificate issuance and renewal +// through ACME protocol with support for multiple DNS providers. +type AcmeConfig struct { + // Enabled determines whether ACME certificate management is active. + // When true, certificates are automatically obtained and renewed. + Enabled bool + + // EmailDomain is the domain used for ACME account emails. + // Used for Let's Encrypt account registration and recovery. + // Example: "unkey.com" creates "admin@unkey.com" for ACME account. + EmailDomain string + + // Cloudflare configures DNS-01 challenges through Cloudflare API. + // Enables wildcard certificates for domains hosted on Cloudflare. + Cloudflare CloudflareConfig + + // Route53 configures DNS-01 challenges through AWS Route53 API. + // Enables wildcard certificates for domains hosted on Route53. + Route53 Route53Config +} + +// RestateConfig holds configuration for Restate workflow engine integration. +// +// This configuration enables asynchronous workflow execution through +// the Restate distributed system for deployment and certificate operations. +type RestateConfig struct { + // URL is the Restate ingress endpoint URL for workflow invocation. + // Used by clients to start and interact with workflow executions. + // Example: "http://restate:8080". + URL string + + // AdminURL is the Restate admin endpoint URL for service registration. + // Used by the worker to register its workflow services. + // Example: "http://restate:9070". + AdminURL string + + // HttpPort is the port where the worker listens for Restate requests. + // This is the internal Restate server port, not the health check port. + HttpPort int + + // RegisterAs is the service URL used for self-registration with Restate. + // Allows Restate to discover and invoke this worker's services. + // Example: "http://worker:9080". + RegisterAs string + + // APIKey is the authentication key for Restate ingress requests. + // If set, this key will be sent with all requests to the Restate ingress. + APIKey string +} + +// DepotConfig holds configuration for Depot.dev build service integration. +// +// This configuration enables cloud-native container builds through +// Depot's managed build infrastructure with optimized caching. +type DepotConfig struct { + // APIUrl is the Depot API endpoint URL for build operations. + // Example: "https://api.depot.dev". + APIUrl string + + // ProjectRegion is the geographic region for build storage. + // Affects build performance and data residency. + // Options: "us-east-1", "eu-central-1". Default: "us-east-1". + ProjectRegion string +} + +// RegistryConfig holds container registry authentication configuration. +// +// This configuration provides credentials for accessing container registries +// used by build backends for pushing and pulling images. +type RegistryConfig struct { + // URL is the container registry endpoint URL. + // Example: "registry.depot.dev" or "https://registry.example.com". + URL string + + // Username is the registry authentication username. + // Common values: "x-token" for token-based auth, or actual username. + Username string + + // Password is the registry password or authentication token. + // Should be stored securely and rotated regularly. + Password string +} + +// BuildPlatform represents parsed container build platform specification. +// +// Contains the validated platform string separated into OS and architecture +// components for build backend integration. +type BuildPlatform struct { + // Platform is the original build platform string. + // Example: "linux/amd64". + Platform string + + // Architecture is the CPU architecture component. + // Example: "amd64", "arm64". + Architecture string +} + +// Config holds configuration for the Restate worker service. +// +// This comprehensive configuration structure defines all aspects of worker +// operation including database connections, vault integration, build backends, +// ACME certificate management, and Restate integration. +type Config struct { + // InstanceID is the unique identifier for this worker instance. + // Used for logging, tracing, and cluster coordination. + InstanceID string + + // HttpPort defines the HTTP port for the health check endpoint. + // Default: 7092. Cannot be 0. + HttpPort int + + // PrometheusPort specifies the port for exposing Prometheus metrics. + // Set to 0 to disable metrics exposure. When enabled, metrics are served + // on all interfaces (0.0.0.0) on the specified port. + PrometheusPort int + + // DatabasePrimary is the primary database connection string. + // Used for both read and write operations to persistent storage. + DatabasePrimary string + + // VaultMasterKeys are encryption keys for the general vault service. + // Used for encrypting/decrypting environment variables, API keys, etc. + VaultMasterKeys []string + + // VaultS3 configures S3 storage for the general vault. + // Stores encrypted secrets data with the provided master keys. + VaultS3 S3Config + + // AcmeVaultMasterKeys are encryption keys for the ACME vault service. + // Separate vault for TLS certificate storage and ACME account data. + AcmeVaultMasterKeys []string + + // AcmeVaultS3 configures S3 storage for the ACME vault. + // Stores encrypted TLS certificates and ACME challenge data. + AcmeVaultS3 S3Config + + // Acme configures automatic TLS certificate management. + // Enables Let's Encrypt integration for domain certificates. + Acme AcmeConfig + + // DefaultDomain is the fallback domain for system operations. + // Used for sentinel deployment and automatic certificate bootstrapping. + DefaultDomain string + + // Restate configures workflow engine integration. + // Enables asynchronous deployment and certificate renewal workflows. + Restate RestateConfig + + // BuildBackend selects the container build system. + // Options: BuildBackendDepot or BuildBackendDocker. + BuildBackend BuildBackend + + // BuildS3 configures storage for build artifacts and outputs. + // Used by both Depot and Docker build backends. + BuildS3 S3Config + + // BuildPlatform defines the target architecture for container builds. + // Format: "linux/amd64", "linux/arm64". Only "linux" OS supported. + BuildPlatform string + + // Depot configures Depot.dev build service integration. + // Required when using BuildBackendDepot. + Depot DepotConfig + + // RegistryURL is the container registry URL for pulling images. + // Example: "registry.depot.dev" or "https://registry.example.com". + RegistryURL string + + // RegistryUsername is the username for container registry authentication. + // Common values: "x-token" for token-based auth or actual username. + RegistryUsername string + + // RegistryPassword is the password/token for container registry authentication. + // Should be stored securely (environment variable or secret management). + RegistryPassword string + + // ClickhouseURL is the ClickHouse database connection string. + // Used for analytics and operational metrics storage. + ClickhouseURL string + + // AuthToken is the authentication token for cluster service API access. + // Used by the cluster service to authenticate requests. + AuthToken string + + // SentinelImage is the container image used for new sentinel deployments. + // Overrides default sentinel image with custom build or registry. + SentinelImage string + + // AvailableRegions is a list of available regions for deployments. + // typically in the format "region.provider", ie "us-east-1.aws", "local.dev" + AvailableRegions []string + + // Clock provides time operations for testing and scheduling. + // Use clock.RealClock{} for production deployments. + Clock clock.Clock +} + +// parseBuildPlatform validates and parses a build platform string. +// +// This function validates that the build platform follows the expected +// format "linux/{architecture}" and parses it into components. +// Only "linux" OS is currently supported. +// +// Returns BuildPlatform with parsed components or error if format is invalid +// or OS is not supported. +func parseBuildPlatform(buildPlatform string) (BuildPlatform, error) { + buildPlatform = strings.TrimPrefix(buildPlatform, "/") + parts := strings.Split(buildPlatform, "/") + + if err := assert.All( + assert.Equal(len(parts), 2, fmt.Sprintf("invalid build platform format: %s (expected format: linux/amd64)", buildPlatform)), + assert.Equal(parts[0], "linux", fmt.Sprintf("unsupported OS: %s (only linux is supported)", parts[0])), + ); err != nil { + return BuildPlatform{}, err + } + + return BuildPlatform{ + Platform: buildPlatform, + Architecture: parts[1], + }, nil +} + +// GetBuildPlatform returns the parsed build platform. +// +// This method returns the parsed BuildPlatform from the configured +// BuildPlatform string. Should only be called after Validate() succeeds +// to ensure the platform string is valid. +// +// Returns BuildPlatform with parsed platform and architecture components. +func (c Config) GetBuildPlatform() BuildPlatform { + parsed, _ := parseBuildPlatform(c.BuildPlatform) + return parsed +} + +// GetRegistryConfig returns the registry configuration. +// +// This method builds a RegistryConfig from the individual registry +// settings in the main Config struct. Should only be called after +// Validate() succeeds to ensure all required fields are present. +// +// Returns RegistryConfig with URL, username, and password for container registry access. +func (c Config) GetRegistryConfig() RegistryConfig { + return RegistryConfig{ + URL: c.RegistryURL, + Username: c.RegistryUsername, + Password: c.RegistryPassword, + } +} + +// GetDepotConfig returns the depot configuration. +// +// This method returns the DepotConfig from the main Config struct. +// Should only be called after Validate() succeeds to ensure +// depot configuration is complete and valid. +// +// Returns the DepotConfig containing API URL and project region. +func (c Config) GetDepotConfig() DepotConfig { + return c.Depot +} + +// Validate checks the configuration for required fields and logical consistency. +// +// This method performs comprehensive validation of all configuration sections +// including build backend, ACME providers, database connections, and +// required credentials. It ensures that conditional configuration +// (like ACME providers) has all necessary dependencies. +// +// Returns an error if required fields are missing, invalid, or inconsistent. +// Provides detailed error messages to help identify configuration issues. +func (c Config) Validate() error { + // Validate Cloudflare configuration if enabled + if c.Acme.Enabled && c.Acme.Cloudflare.Enabled { + if err := assert.NotEmpty(c.Acme.Cloudflare.ApiToken, "cloudflare API token is required when cloudflare is enabled"); err != nil { + return err + } + } + + // Validate Route53 configuration if enabled + if c.Acme.Enabled && c.Acme.Route53.Enabled { + if err := assert.All( + assert.NotEmpty(c.Acme.Route53.AccessKeyID, "route53 access key ID is required when route53 is enabled"), + assert.NotEmpty(c.Acme.Route53.SecretAccessKey, "route53 secret access key is required when route53 is enabled"), + assert.NotEmpty(c.Acme.Route53.Region, "route53 region is required when route53 is enabled"), + ); err != nil { + return err + } + } + + if err := assert.NotEmpty(c.ClickhouseURL, "ClickhouseURL is required"); err != nil { + return err + } + + // Validate build platform format + _, platformErr := parseBuildPlatform(c.BuildPlatform) + + // Validate registry configuration + registryErr := assert.All( + assert.NotEmpty(c.RegistryURL, "registry URL is required"), + assert.NotEmpty(c.RegistryUsername, "registry username is required"), + assert.NotEmpty(c.RegistryPassword, "registry password is required"), + ) + + switch c.BuildBackend { + case BuildBackendDepot: + return assert.All( + platformErr, + registryErr, + assert.NotEmpty(c.BuildPlatform, "build platform is required"), + assert.NotEmpty(c.BuildS3.URL, "build S3 URL is required when using Depot backend"), + assert.NotEmpty(c.BuildS3.Bucket, "build S3 bucket is required when using Depot backend"), + assert.NotEmpty(c.BuildS3.AccessKeyID, "build S3 access key ID is required when using Depot backend"), + assert.NotEmpty(c.BuildS3.AccessKeySecret, "build S3 access key secret is required when using Depot backend"), + assert.NotEmpty(c.Depot.APIUrl, "Depot API URL is required when using Depot backend"), + assert.NotEmpty(c.Depot.ProjectRegion, "Depot project region is required when using Depot backend"), + ) + case BuildBackendDocker: + return assert.All( + platformErr, + assert.NotEmpty(c.BuildPlatform, "build platform is required"), + assert.NotEmpty(c.BuildS3.URL, "build S3 URL is required when using Docker backend"), + assert.NotEmpty(c.BuildS3.ExternalURL, "build S3 external URL is required when using Docker backend"), + assert.NotEmpty(c.BuildS3.Bucket, "build S3 bucket is required when using Docker backend"), + assert.NotEmpty(c.BuildS3.AccessKeyID, "build S3 access key ID is required when using Docker backend"), + assert.NotEmpty(c.BuildS3.AccessKeySecret, "build S3 access key secret is required when using Docker backend"), + ) + default: + return fmt.Errorf("build backend must be either 'depot' or 'docker', got: %s", c.BuildBackend) + } +} diff --git a/svc/ctrl/workflows/deploy/BUILD.bazel b/svc/worker/deploy/BUILD.bazel similarity index 91% rename from svc/ctrl/workflows/deploy/BUILD.bazel rename to svc/worker/deploy/BUILD.bazel index 5d2c771721..3ffab0fea2 100644 --- a/svc/ctrl/workflows/deploy/BUILD.bazel +++ b/svc/worker/deploy/BUILD.bazel @@ -11,7 +11,7 @@ go_library( "rollback_handler.go", "service.go", ], - importpath = "github.com/unkeyed/unkey/svc/ctrl/workflows/deploy", + importpath = "github.com/unkeyed/unkey/svc/worker/deploy", visibility = ["//visibility:public"], deps = [ "//gen/proto/ctrl/v1:ctrl", diff --git a/svc/ctrl/workflows/deploy/deploy_handler.go b/svc/worker/deploy/deploy_handler.go similarity index 100% rename from svc/ctrl/workflows/deploy/deploy_handler.go rename to svc/worker/deploy/deploy_handler.go diff --git a/svc/ctrl/workflows/deploy/doc.go b/svc/worker/deploy/doc.go similarity index 100% rename from svc/ctrl/workflows/deploy/doc.go rename to svc/worker/deploy/doc.go diff --git a/svc/ctrl/workflows/deploy/domains.go b/svc/worker/deploy/domains.go similarity index 100% rename from svc/ctrl/workflows/deploy/domains.go rename to svc/worker/deploy/domains.go diff --git a/svc/ctrl/workflows/deploy/helpers.go b/svc/worker/deploy/helpers.go similarity index 100% rename from svc/ctrl/workflows/deploy/helpers.go rename to svc/worker/deploy/helpers.go diff --git a/svc/ctrl/workflows/deploy/promote_handler.go b/svc/worker/deploy/promote_handler.go similarity index 100% rename from svc/ctrl/workflows/deploy/promote_handler.go rename to svc/worker/deploy/promote_handler.go diff --git a/svc/ctrl/workflows/deploy/rollback_handler.go b/svc/worker/deploy/rollback_handler.go similarity index 100% rename from svc/ctrl/workflows/deploy/rollback_handler.go rename to svc/worker/deploy/rollback_handler.go diff --git a/svc/ctrl/workflows/deploy/service.go b/svc/worker/deploy/service.go similarity index 100% rename from svc/ctrl/workflows/deploy/service.go rename to svc/worker/deploy/service.go diff --git a/svc/worker/doc.go b/svc/worker/doc.go new file mode 100644 index 0000000000..6865aecd5b --- /dev/null +++ b/svc/worker/doc.go @@ -0,0 +1,78 @@ +// Package worker provides the Restate workflow worker service. +// +// This package implements the Restate worker that handles asynchronous workflow +// execution for the unkey platform. It runs as a separate service from the main +// control plane, allowing independent scaling and deployment of workflow handlers. +// +// # Architecture +// +// The worker service consists of: +// - Restate server for workflow handler execution +// - Health check endpoint for orchestration +// - Certificate renewal cron job for ACME management +// +// # Workflow Services +// +// The worker binds and executes the following Restate workflow services: +// - [DeploymentService]: Orchestrates application deployment workflows +// - [RoutingService]: Manages domain assignment and traffic routing +// - [VersioningService]: Handles version management operations +// - [CertificateService]: Processes ACME challenges and certificate lifecycle +// +// # ACME Integration +// +// The worker supports multiple ACME challenge providers: +// - HTTP-01 challenges for regular domains +// - DNS-01 challenges through Cloudflare for wildcard certificates +// - DNS-01 challenges through AWS Route53 for wildcard certificates +// +// Certificate renewal is managed through a cron job that runs after the worker +// successfully registers with the Restate admin API. +// +// # Configuration +// +// The worker is configured through [Config] which includes: +// - Database and vault configuration for persistence +// - Build backend settings for container operations +// - ACME provider configuration for certificate management +// - Restate configuration for workflow registration +// +// # Usage +// +// Basic worker setup: +// +// cfg := worker.Config{ +// InstanceID: "worker-prod-001", +// HttpPort: 7092, +// DatabasePrimary: "user:pass@tcp(db:3306)/unkey", +// VaultMasterKeys: []string{"master-key-1"}, +// VaultS3: worker.S3Config{ +// URL: "https://s3.amazonaws.com", +// Bucket: "unkey-vault", +// AccessKeyID: "access-key", +// AccessKeySecret: "secret-key", +// }, +// BuildBackend: worker.BuildBackendDepot, +// BuildPlatform: "linux/amd64", +// Restate: worker.RestateConfig{ +// URL: "http://restate:8080", +// AdminURL: "http://restate:9070", +// HttpPort: 9080, +// RegisterAs: "http://worker:9080", +// }, +// } +// err := worker.Run(context.Background(), cfg) +// +// The worker will: +// 1. Initialize all services (database, vault, build backend, etc.) +// 2. Start Restate server with workflow service bindings +// 3. Register with Restate admin API for service discovery +// 4. Bootstrap wildcard domain and start certificate renewal cron +// 5. Start health check endpoint on configured port +// 6. Handle graceful shutdown on context cancellation +// +// # Observability +// +// The worker integrates with OpenTelemetry for metrics, traces, and structured logging. +// It exposes health endpoints and Prometheus metrics for monitoring workflow execution. +package worker diff --git a/svc/ctrl/workflows/routing/BUILD.bazel b/svc/worker/routing/BUILD.bazel similarity index 84% rename from svc/ctrl/workflows/routing/BUILD.bazel rename to svc/worker/routing/BUILD.bazel index a938062c2f..301a22f017 100644 --- a/svc/ctrl/workflows/routing/BUILD.bazel +++ b/svc/worker/routing/BUILD.bazel @@ -7,7 +7,7 @@ go_library( "doc.go", "service.go", ], - importpath = "github.com/unkeyed/unkey/svc/ctrl/workflows/routing", + importpath = "github.com/unkeyed/unkey/svc/worker/routing", visibility = ["//visibility:public"], deps = [ "//gen/proto/hydra/v1:hydra", diff --git a/svc/ctrl/workflows/routing/assign_domains_handler.go b/svc/worker/routing/assign_domains_handler.go similarity index 100% rename from svc/ctrl/workflows/routing/assign_domains_handler.go rename to svc/worker/routing/assign_domains_handler.go diff --git a/svc/ctrl/workflows/routing/doc.go b/svc/worker/routing/doc.go similarity index 100% rename from svc/ctrl/workflows/routing/doc.go rename to svc/worker/routing/doc.go diff --git a/svc/ctrl/workflows/routing/service.go b/svc/worker/routing/service.go similarity index 100% rename from svc/ctrl/workflows/routing/service.go rename to svc/worker/routing/service.go diff --git a/svc/worker/run.go b/svc/worker/run.go new file mode 100644 index 0000000000..f13caf3e5c --- /dev/null +++ b/svc/worker/run.go @@ -0,0 +1,517 @@ +package worker + +import ( + "bytes" + "context" + "database/sql" + "fmt" + "log/slog" + "net" + "net/http" + "os" + "time" + + "github.com/go-acme/lego/v4/challenge" + restate "github.com/restatedev/sdk-go" + restateIngress "github.com/restatedev/sdk-go/ingress" + restateServer "github.com/restatedev/sdk-go/server" + "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" + hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" + "github.com/unkeyed/unkey/pkg/cache" + "github.com/unkeyed/unkey/pkg/clickhouse" + "github.com/unkeyed/unkey/pkg/clock" + "github.com/unkeyed/unkey/pkg/db" + "github.com/unkeyed/unkey/pkg/otel/logging" + "github.com/unkeyed/unkey/pkg/prometheus" + "github.com/unkeyed/unkey/pkg/retry" + "github.com/unkeyed/unkey/pkg/shutdown" + "github.com/unkeyed/unkey/pkg/uid" + "github.com/unkeyed/unkey/pkg/vault" + "github.com/unkeyed/unkey/pkg/vault/storage" + + "github.com/unkeyed/unkey/pkg/zen" + "github.com/unkeyed/unkey/svc/ctrl/services/acme/providers" + "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/depot" + "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/docker" + buildStorage "github.com/unkeyed/unkey/svc/ctrl/services/build/storage" + "github.com/unkeyed/unkey/svc/ctrl/services/cluster" + "github.com/unkeyed/unkey/svc/worker/certificate" + "github.com/unkeyed/unkey/svc/worker/deploy" + "github.com/unkeyed/unkey/svc/worker/routing" + "github.com/unkeyed/unkey/svc/worker/versioning" +) + +// Run starts the Restate worker service with the provided configuration. +// +// This function initializes all required services and starts the Restate server +// for workflow execution. It performs these major initialization steps: +// 1. Validates configuration and initializes structured logging +// 2. Creates vault services for secrets and ACME certificates +// 3. Initializes database and build storage +// 4. Creates build service (docker/depot backend) +// 5. Initializes ACME caches and providers (HTTP-01, DNS-01) +// 6. Starts Restate server with workflow service bindings +// 7. Registers with Restate admin API for service discovery +// 8. Bootstraps wildcard domain and starts certificate renewal cron +// 9. Starts health check endpoint +// 10. Optionally starts Prometheus metrics server +// +// The worker handles graceful shutdown when context is cancelled, properly +// closing all services and database connections. +// +// Returns an error if configuration validation fails, service initialization +// fails, or during server startup. Context cancellation results in +// clean shutdown with nil error. +func Run(ctx context.Context, cfg Config) error { + err := cfg.Validate() + if err != nil { + return fmt.Errorf("bad config: %w", err) + } + + // Disable CNAME following in lego to prevent it from following wildcard CNAMEs + // (e.g., *.example.com -> loadbalancer.aws.com) and failing Route53 zone lookup. + // Must be set before creating any ACME DNS providers. + _ = os.Setenv("LEGO_DISABLE_CNAME_SUPPORT", "true") + + shutdowns := shutdown.New() + + logger := logging.New() + if cfg.InstanceID != "" { + logger = logger.With(slog.String("instanceID", cfg.InstanceID)) + } + + // Create vault service for general secrets (env vars, API keys, etc.) + var vaultSvc *vault.Service + if len(cfg.VaultMasterKeys) > 0 && cfg.VaultS3.URL != "" { + vaultStorage, vaultStorageErr := storage.NewS3(storage.S3Config{ + Logger: logger, + S3URL: cfg.VaultS3.URL, + S3Bucket: cfg.VaultS3.Bucket, + S3AccessKeyID: cfg.VaultS3.AccessKeyID, + S3AccessKeySecret: cfg.VaultS3.AccessKeySecret, + }) + if vaultStorageErr != nil { + return fmt.Errorf("unable to create vault storage: %w", vaultStorageErr) + } + + vaultSvc, err = vault.New(vault.Config{ + Logger: logger, + Storage: vaultStorage, + MasterKeys: cfg.VaultMasterKeys, + }) + if err != nil { + return fmt.Errorf("unable to create vault service: %w", err) + } + logger.Info("Vault service initialized", "bucket", cfg.VaultS3.Bucket) + } + + // Create separate vault service for ACME certificates + var acmeVaultSvc *vault.Service + if len(cfg.AcmeVaultMasterKeys) > 0 && cfg.AcmeVaultS3.URL != "" { + acmeVaultStorage, acmeStorageErr := storage.NewS3(storage.S3Config{ + Logger: logger, + S3URL: cfg.AcmeVaultS3.URL, + S3Bucket: cfg.AcmeVaultS3.Bucket, + S3AccessKeyID: cfg.AcmeVaultS3.AccessKeyID, + S3AccessKeySecret: cfg.AcmeVaultS3.AccessKeySecret, + }) + if acmeStorageErr != nil { + return fmt.Errorf("unable to create ACME vault storage: %w", acmeStorageErr) + } + + acmeVaultSvc, err = vault.New(vault.Config{ + Logger: logger, + Storage: acmeVaultStorage, + MasterKeys: cfg.AcmeVaultMasterKeys, + }) + if err != nil { + return fmt.Errorf("unable to create ACME vault service: %w", err) + } + logger.Info("ACME vault service initialized", "bucket", cfg.AcmeVaultS3.Bucket) + } + + // Initialize database + database, err := db.New(db.Config{ + PrimaryDSN: cfg.DatabasePrimary, + ReadOnlyDSN: "", + Logger: logger, + }) + if err != nil { + return fmt.Errorf("unable to create db: %w", err) + } + + shutdowns.Register(database.Close) + + bldStorage, err := buildStorage.NewS3(buildStorage.S3Config{ + Logger: logger, + S3URL: cfg.BuildS3.URL, + S3PresignURL: cfg.BuildS3.ExternalURL, + S3Bucket: cfg.BuildS3.Bucket, + S3AccessKeyID: cfg.BuildS3.AccessKeyID, + S3AccessKeySecret: cfg.BuildS3.AccessKeySecret, + }) + if err != nil { + return fmt.Errorf("unable to create build storage: %w", err) + } + + var ch clickhouse.ClickHouse = clickhouse.NewNoop() + if cfg.ClickhouseURL != "" { + ch, err = clickhouse.New(clickhouse.Config{ + URL: cfg.ClickhouseURL, + Logger: logger, + }) + if err != nil { + return fmt.Errorf("unable to create clickhouse: %w", err) + } + } + + var buildService ctrlv1connect.BuildServiceClient + switch cfg.BuildBackend { + case BuildBackendDocker: + buildService = docker.New(docker.Config{ + InstanceID: cfg.InstanceID, + DB: database, + Logger: logger, + BuildPlatform: docker.BuildPlatform(cfg.GetBuildPlatform()), + Storage: bldStorage, + }) + logger.Info("Using Docker build backend", "presign_url", cfg.BuildS3.ExternalURL) + + case BuildBackendDepot: + buildService = depot.New(depot.Config{ + InstanceID: cfg.InstanceID, + DB: database, + RegistryConfig: depot.RegistryConfig(cfg.GetRegistryConfig()), + BuildPlatform: depot.BuildPlatform(cfg.GetBuildPlatform()), + DepotConfig: depot.DepotConfig(cfg.GetDepotConfig()), + Clickhouse: ch, + Logger: logger, + Storage: bldStorage, + }) + logger.Info("Using Depot build backend") + + default: + return fmt.Errorf("unknown build backend: %s (must be 'docker' or 'depot')", cfg.BuildBackend) + } + + // Restate Client and Server + restateClientOpts := []restate.IngressClientOption{} + if cfg.Restate.APIKey != "" { + restateClientOpts = append(restateClientOpts, restate.WithAuthKey(cfg.Restate.APIKey)) + } + restateClient := restateIngress.NewClient(cfg.Restate.URL, restateClientOpts...) + restateSrv := restateServer.NewRestate() + + c := cluster.New(cluster.Config{ + Database: database, + Logger: logger, + Bearer: cfg.AuthToken, + }) + + restateSrv.Bind(hydrav1.NewDeploymentServiceServer(deploy.New(deploy.Config{ + Logger: logger, + DB: database, + BuildClient: buildService, + DefaultDomain: cfg.DefaultDomain, + Vault: vaultSvc, + Cluster: c, + SentinelImage: cfg.SentinelImage, + AvailableRegions: cfg.AvailableRegions, + Bearer: cfg.AuthToken, + }))) + + restateSrv.Bind(hydrav1.NewRoutingServiceServer(routing.New(routing.Config{ + Logger: logger, + DB: database, + DefaultDomain: cfg.DefaultDomain, + }), restate.WithIngressPrivate(true))) + + restateSrv.Bind(hydrav1.NewVersioningServiceServer(versioning.New(), restate.WithIngressPrivate(true))) + + // Initialize domain cache for ACME providers + clk := clock.New() + domainCache, domainCacheErr := cache.New(cache.Config[string, db.CustomDomain]{ + Fresh: 5 * time.Minute, + Stale: 10 * time.Minute, + MaxSize: 10000, + Logger: logger, + Resource: "domains", + Clock: clk, + }) + if domainCacheErr != nil { + return fmt.Errorf("failed to create domain cache: %w", domainCacheErr) + } + + // Setup ACME challenge providers + var dnsProvider challenge.Provider + var httpProvider challenge.Provider + if cfg.Acme.Enabled { + // HTTP-01 provider for regular (non-wildcard) domains + httpProv, httpErr := providers.NewHTTPProvider(providers.HTTPConfig{ + DB: database, + Logger: logger, + DomainCache: domainCache, + }) + if httpErr != nil { + return fmt.Errorf("failed to create HTTP-01 provider: %w", httpErr) + } + httpProvider = httpProv + logger.Info("ACME HTTP-01 provider enabled") + + // DNS-01 provider for wildcard domains (requires DNS provider config) + if cfg.Acme.Cloudflare.Enabled { + cfProvider, cfErr := providers.NewCloudflareProvider(providers.CloudflareConfig{ + DB: database, + Logger: logger, + APIToken: cfg.Acme.Cloudflare.ApiToken, + DomainCache: domainCache, + }) + if cfErr != nil { + return fmt.Errorf("failed to create Cloudflare DNS provider: %w", cfErr) + } + dnsProvider = cfProvider + logger.Info("ACME Cloudflare DNS-01 provider enabled for wildcard certs") + } else if cfg.Acme.Route53.Enabled { + r53Provider, r53Err := providers.NewRoute53Provider(providers.Route53Config{ + DB: database, + Logger: logger, + AccessKeyID: cfg.Acme.Route53.AccessKeyID, + SecretAccessKey: cfg.Acme.Route53.SecretAccessKey, + Region: cfg.Acme.Route53.Region, + HostedZoneID: cfg.Acme.Route53.HostedZoneID, + DomainCache: domainCache, + }) + if r53Err != nil { + return fmt.Errorf("failed to create Route53 DNS provider: %w", r53Err) + } + dnsProvider = r53Provider + logger.Info("ACME Route53 DNS-01 provider enabled for wildcard certs") + } + } + + // Certificate service needs a longer timeout for ACME DNS-01 challenges + // which can take 5-10 minutes for DNS propagation + restateSrv.Bind(hydrav1.NewCertificateServiceServer(certificate.New(certificate.Config{ + Logger: logger, + DB: database, + Vault: acmeVaultSvc, + EmailDomain: cfg.Acme.EmailDomain, + DefaultDomain: cfg.DefaultDomain, + DNSProvider: dnsProvider, + HTTPProvider: httpProvider, + }), restate.WithInactivityTimeout(15*time.Minute))) + + go func() { + addr := fmt.Sprintf(":%d", cfg.Restate.HttpPort) + logger.Info("Starting Restate server", "addr", addr) + if startErr := restateSrv.Start(ctx, addr); startErr != nil { + logger.Error("failed to start restate server", "error", startErr.Error()) + } + }() + + // Register with Restate admin API + go func() { + // Wait a moment for the restate server to be ready + time.Sleep(2 * time.Second) + + registerURL := fmt.Sprintf("%s/deployments", cfg.Restate.AdminURL) + payload := fmt.Sprintf(`{"uri": "%s"}`, cfg.Restate.RegisterAs) + + logger.Info("Registering with Restate", "admin_url", registerURL, "service_uri", cfg.Restate.RegisterAs) + + retrier := retry.New( + retry.Attempts(10), + retry.Backoff(func(n int) time.Duration { + return 5 * time.Second + }), + ) + + retryErr := retrier.Do(func() error { + req, reqErr := http.NewRequestWithContext(ctx, http.MethodPost, registerURL, bytes.NewBufferString(payload)) + if reqErr != nil { + return fmt.Errorf("failed to create registration request: %w", reqErr) + } + + req.Header.Set("Content-Type", "application/json") + + resp, doErr := http.DefaultClient.Do(req) + if doErr != nil { + return fmt.Errorf("failed to register with Restate: %w", doErr) + } + + status := resp.StatusCode + closeErr := resp.Body.Close() + if closeErr != nil { + return fmt.Errorf("failed to close response body: %w", closeErr) + } + + if status >= 200 && status < 300 { + return nil + } + + return fmt.Errorf("registration returned status %d", status) + }) + + if retryErr != nil { + logger.Error("failed to register with Restate after retries", "error", retryErr.Error()) + } else { + logger.Info("Successfully registered with Restate") + + // Bootstrap wildcard certificate for default domain if ACME is enabled + if cfg.Acme.Enabled && dnsProvider != nil && cfg.DefaultDomain != "" { + bootstrapWildcardDomain(ctx, database, logger, cfg.DefaultDomain) + } + + // Start the certificate renewal cron job if ACME is enabled + // Use Send with idempotency key so multiple restarts don't create duplicate crons + if cfg.Acme.Enabled && dnsProvider != nil { + certClient := hydrav1.NewCertificateServiceIngressClient(restateClient, "global") + _, startErr := certClient.RenewExpiringCertificates().Send( + ctx, + &hydrav1.RenewExpiringCertificatesRequest{ + DaysBeforeExpiry: 30, + }, + restate.WithIdempotencyKey("cert-renewal-cron-startup"), + ) + if startErr != nil { + logger.Warn("failed to start certificate renewal cron", "error", startErr) + } else { + logger.Info("Certificate renewal cron job started") + } + } + } + }() + + // Create zen health server + healthServer, err := zen.New(zen.Config{ + Logger: logger, + TLS: nil, + Flags: nil, + EnableH2C: false, + MaxRequestBodySize: 0, + ReadTimeout: 0, + WriteTimeout: 0, + }) + if err != nil { + return fmt.Errorf("failed to create health server: %w", err) + } + + healthServer.RegisterRoute( + nil, + zen.NewRoute("GET", "/v1/liveness", func(_ context.Context, sess *zen.Session) error { + return sess.Send(http.StatusOK, nil) + }), + ) + + go func() { + healthAddr := fmt.Sprintf(":%d", cfg.HttpPort) + ln, lnErr := net.Listen("tcp", healthAddr) + if lnErr != nil { + logger.Error("failed to listen on health port", "error", lnErr, "port", cfg.HttpPort) + return + } + logger.Info("Starting health server", "addr", healthAddr) + if serveErr := healthServer.Serve(ctx, ln); serveErr != nil { + logger.Error("health server failed", "error", serveErr) + } + }() + + shutdowns.RegisterCtx(healthServer.Shutdown) + + if cfg.PrometheusPort > 0 { + prom, promErr := prometheus.New(prometheus.Config{ + Logger: logger, + }) + if promErr != nil { + return fmt.Errorf("failed to create prometheus server: %w", promErr) + } + + shutdowns.RegisterCtx(prom.Shutdown) + ln, lnErr := net.Listen("tcp", fmt.Sprintf(":%d", cfg.PrometheusPort)) + if lnErr != nil { + return fmt.Errorf("unable to listen on port %d: %w", cfg.PrometheusPort, lnErr) + } + go func() { + logger.Info("prometheus started", "port", cfg.PrometheusPort) + if serveErr := prom.Serve(ctx, ln); serveErr != nil { + logger.Error("failed to start prometheus server", "error", serveErr) + } + }() + } + + // Wait for signal and handle shutdown + logger.Info("Worker started successfully") + if err := shutdowns.WaitForSignal(ctx); err != nil { + logger.Error("Shutdown failed", "error", err) + return err + } + + logger.Info("Worker shut down successfully") + return nil +} + +// bootstrapWildcardDomain ensures a wildcard domain and ACME challenge exist for the default domain. +// +// This helper function creates the necessary database records for automatic +// wildcard certificate issuance during startup. It checks if the wildcard +// domain already exists and creates both the custom domain record and +// ACME challenge record if needed. +// +// The function uses "unkey_internal" as the workspace ID for +// platform-managed resources, ensuring separation from user workspaces. +// +// This is called during worker startup when ACME is enabled and +// a default domain is configured, allowing the renewal cron job to +// automatically issue wildcard certificates without manual intervention. +func bootstrapWildcardDomain(ctx context.Context, database db.Database, logger logging.Logger, defaultDomain string) { + wildcardDomain := "*." + defaultDomain + + // Check if the wildcard domain already exists + _, err := db.Query.FindCustomDomainByDomain(ctx, database.RO(), wildcardDomain) + if err == nil { + logger.Info("Wildcard domain already exists", "domain", wildcardDomain) + return + } + if !db.IsNotFound(err) { + logger.Error("Failed to check for existing wildcard domain", "error", err, "domain", wildcardDomain) + return + } + + // Create the custom domain record + domainID := uid.New(uid.DomainPrefix) + now := time.Now().UnixMilli() + + // Use "unkey_internal" as the workspace for platform-managed resources + workspaceID := "unkey_internal" + err = db.Query.UpsertCustomDomain(ctx, database.RW(), db.UpsertCustomDomainParams{ + ID: domainID, + WorkspaceID: workspaceID, + Domain: wildcardDomain, + ChallengeType: db.CustomDomainsChallengeTypeDNS01, + CreatedAt: now, + UpdatedAt: sql.NullInt64{Int64: now, Valid: true}, + }) + if err != nil { + logger.Error("Failed to create wildcard domain", "error", err, "domain", wildcardDomain) + return + } + + // Create the ACME challenge record with status 'waiting' so the renewal cron picks it up + err = db.Query.InsertAcmeChallenge(ctx, database.RW(), db.InsertAcmeChallengeParams{ + WorkspaceID: workspaceID, + DomainID: domainID, + Token: "", + Authorization: "", + Status: db.AcmeChallengesStatusWaiting, + ChallengeType: db.AcmeChallengesChallengeTypeDNS01, + CreatedAt: now, + UpdatedAt: sql.NullInt64{Int64: now, Valid: true}, + ExpiresAt: 0, // Will be set when certificate is issued + }) + if err != nil { + logger.Error("Failed to create ACME challenge for wildcard domain", "error", err, "domain", wildcardDomain) + return + } + + logger.Info("Bootstrapped wildcard domain for certificate issuance", "domain", wildcardDomain) +} diff --git a/svc/ctrl/workflows/versioning/BUILD.bazel b/svc/worker/versioning/BUILD.bazel similarity index 81% rename from svc/ctrl/workflows/versioning/BUILD.bazel rename to svc/worker/versioning/BUILD.bazel index d5c14e82dc..ac05cb0caa 100644 --- a/svc/ctrl/workflows/versioning/BUILD.bazel +++ b/svc/worker/versioning/BUILD.bazel @@ -7,7 +7,7 @@ go_library( "next_version_handler.go", "service.go", ], - importpath = "github.com/unkeyed/unkey/svc/ctrl/workflows/versioning", + importpath = "github.com/unkeyed/unkey/svc/worker/versioning", visibility = ["//visibility:public"], deps = [ "//gen/proto/hydra/v1:hydra", diff --git a/svc/ctrl/workflows/versioning/doc.go b/svc/worker/versioning/doc.go similarity index 100% rename from svc/ctrl/workflows/versioning/doc.go rename to svc/worker/versioning/doc.go diff --git a/svc/ctrl/workflows/versioning/next_version_handler.go b/svc/worker/versioning/next_version_handler.go similarity index 100% rename from svc/ctrl/workflows/versioning/next_version_handler.go rename to svc/worker/versioning/next_version_handler.go diff --git a/svc/ctrl/workflows/versioning/service.go b/svc/worker/versioning/service.go similarity index 100% rename from svc/ctrl/workflows/versioning/service.go rename to svc/worker/versioning/service.go From 356b1f990977f01d0e77a8d4eb3e6b6c23707999 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 08:00:16 +0100 Subject: [PATCH 10/32] fix: remove cloudflare --- cmd/worker/main.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 08e31faa7d..57482ed179 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -188,10 +188,6 @@ func action(ctx context.Context, cmd *cli.Command) error { Acme: worker.AcmeConfig{ Enabled: cmd.Bool("acme-enabled"), EmailDomain: cmd.String("acme-email-domain"), - Cloudflare: worker.CloudflareConfig{ - Enabled: cmd.Bool("acme-cloudflare-enabled"), - ApiToken: cmd.String("acme-cloudflare-api-token"), - }, Route53: worker.Route53Config{ Enabled: cmd.Bool("acme-route53-enabled"), AccessKeyID: cmd.String("acme-route53-access-key-id"), From 374e8895880bde89522fa1bebcae573bc213cd34 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 08:02:41 +0100 Subject: [PATCH 11/32] fix: handle error --- svc/worker/deploy/deploy_handler.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/svc/worker/deploy/deploy_handler.go b/svc/worker/deploy/deploy_handler.go index 7bec0910ea..607d0ead59 100644 --- a/svc/worker/deploy/deploy_handler.go +++ b/svc/worker/deploy/deploy_handler.go @@ -209,6 +209,9 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy return db.BulkQuery.InsertDeploymentTopologies(txCtx, tx, topologies) }) }, restate.WithName("insert deployment topologies")) + if err != nil { + return nil, fmt.Errorf("failed to insert deployment topologies: %w", err) + } // Ensure sentinels exist in each region for this deployment From 3059d35189026dce2c1dcadc9d31b5b9f57eec10 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 15:14:29 +0100 Subject: [PATCH 12/32] refactor: split ctrl into api and worker --- Makefile | 4 +- cmd/ctrl/BUILD.bazel | 9 +- cmd/ctrl/api.go | 189 ++++++++++ cmd/ctrl/main.go | 265 +------------ cmd/{worker/main.go => ctrl/worker.go} | 41 +- cmd/run/BUILD.bazel | 1 - cmd/run/main.go | 3 - cmd/worker/BUILD.bazel | 14 - dev/.env.depot.example | 11 +- dev/Tiltfile | 10 +- dev/docker-compose.yaml | 106 ++++-- dev/k8s/manifests/api.yaml | 2 +- .../manifests/{ctrl.yaml => ctrl-api.yaml} | 88 +---- .../{worker.yaml => ctrl-worker.yaml} | 28 +- dev/k8s/manifests/krane.yaml | 2 +- dev/k8s/manifests/preflight.yaml | 2 +- gen/proto/ctrl/v1/BUILD.bazel | 1 - gen/proto/ctrl/v1/build.pb.go | 349 ------------------ gen/proto/ctrl/v1/ctrlv1connect/BUILD.bazel | 1 - .../ctrl/v1/ctrlv1connect/build.connect.go | 138 ------- .../v1/ctrlv1connect/deployment.connect.go | 37 +- gen/proto/ctrl/v1/deployment.pb.go | 173 +++++++-- gen/proto/hydra/v1/BUILD.bazel | 2 + gen/proto/hydra/v1/build.pb.go | 241 ++++++++++++ gen/proto/hydra/v1/build_restate.pb.go | 103 ++++++ svc/api/internal/testutil/http.go | 10 - svc/api/routes/register.go | 66 ++-- svc/api/routes/services.go | 1 - .../v2_deploy_generate_upload_url/200_test.go | 6 +- .../v2_deploy_generate_upload_url/400_test.go | 2 +- .../v2_deploy_generate_upload_url/401_test.go | 2 +- .../v2_deploy_generate_upload_url/403_test.go | 2 +- .../v2_deploy_generate_upload_url/404_test.go | 2 +- .../v2_deploy_generate_upload_url/handler.go | 10 +- svc/api/run.go | 31 +- svc/ctrl/BUILD.bazel | 32 +- svc/ctrl/api/BUILD.bazel | 33 ++ svc/ctrl/{ => api}/config.go | 217 +---------- svc/ctrl/{ => api}/run.go | 80 +--- .../backend/depot => pkg/build}/BUILD.bazel | 12 +- .../create_build.go => pkg/build/build.go} | 195 ++++------ .../build/backend/depot => pkg/build}/doc.go | 2 +- .../backend/depot => pkg/build}/service.go | 26 +- svc/ctrl/pkg/hash/BUILD.bazel | 12 - svc/ctrl/pkg/hash/doc.go | 34 -- svc/ctrl/pkg/hash/hash.go | 50 --- .../build/storage => pkg/s3}/BUILD.bazel | 20 + .../{services/build/storage => pkg/s3}/doc.go | 2 +- svc/ctrl/pkg/s3/interface.go | 14 + .../{services/build/storage => pkg/s3}/s3.go | 2 +- svc/ctrl/proto/ctrl/v1/build.proto | 33 -- svc/ctrl/proto/ctrl/v1/deployment.proto | 10 + svc/ctrl/proto/hydra/v1/build.proto | 27 ++ svc/ctrl/services/build/BUILD.bazel | 8 - .../backend/depot/generate_upload_url.go | 55 --- .../services/build/backend/docker/BUILD.bazel | 25 -- .../build/backend/docker/create_build.go | 170 --------- svc/ctrl/services/build/backend/docker/doc.go | 52 --- .../backend/docker/generate_upload_url.go | 55 --- .../services/build/backend/docker/service.go | 42 --- svc/ctrl/services/build/doc.go | 59 --- .../services/cluster/rpc_watch_sentinels.go | 6 +- svc/ctrl/services/deployment/BUILD.bazel | 2 + .../deployment/create_s3_upload_url.go | 28 ++ svc/ctrl/services/deployment/service.go | 7 +- svc/{ => ctrl}/worker/BUILD.bazel | 18 +- svc/{ => ctrl}/worker/certificate/BUILD.bazel | 2 +- .../certificate/bootstrap_infra_certs.go | 0 svc/{ => ctrl}/worker/certificate/doc.go | 0 .../certificate/process_challenge_handler.go | 0 .../worker/certificate/renew_handler.go | 0 svc/{ => ctrl}/worker/certificate/service.go | 0 svc/{ => ctrl}/worker/config.go | 4 - svc/{ => ctrl}/worker/deploy/BUILD.bazel | 7 +- .../worker/deploy/deploy_handler.go | 54 ++- svc/{ => ctrl}/worker/deploy/doc.go | 0 svc/{ => ctrl}/worker/deploy/domains.go | 0 svc/{ => ctrl}/worker/deploy/helpers.go | 0 .../worker/deploy/promote_handler.go | 0 .../worker/deploy/rollback_handler.go | 0 svc/{ => ctrl}/worker/deploy/service.go | 18 +- svc/{ => ctrl}/worker/routing/BUILD.bazel | 2 +- .../worker/routing/assign_domains_handler.go | 0 svc/{ => ctrl}/worker/routing/doc.go | 0 svc/{ => ctrl}/worker/routing/service.go | 0 svc/{ => ctrl}/worker/run.go | 70 ++-- svc/{ => ctrl}/worker/versioning/BUILD.bazel | 2 +- svc/{ => ctrl}/worker/versioning/doc.go | 0 .../worker/versioning/next_version_handler.go | 0 svc/{ => ctrl}/worker/versioning/service.go | 0 svc/worker/doc.go | 78 ---- 91 files changed, 1241 insertions(+), 2276 deletions(-) create mode 100644 cmd/ctrl/api.go rename cmd/{worker/main.go => ctrl/worker.go} (84%) delete mode 100644 cmd/worker/BUILD.bazel rename dev/k8s/manifests/{ctrl.yaml => ctrl-api.yaml} (53%) rename dev/k8s/manifests/{worker.yaml => ctrl-worker.yaml} (91%) delete mode 100644 gen/proto/ctrl/v1/build.pb.go delete mode 100644 gen/proto/ctrl/v1/ctrlv1connect/build.connect.go create mode 100644 gen/proto/hydra/v1/build.pb.go create mode 100644 gen/proto/hydra/v1/build_restate.pb.go create mode 100644 svc/ctrl/api/BUILD.bazel rename svc/ctrl/{ => api}/config.go (52%) rename svc/ctrl/{ => api}/run.go (78%) rename svc/ctrl/{services/build/backend/depot => pkg/build}/BUILD.bazel (77%) rename svc/ctrl/{services/build/backend/depot/create_build.go => pkg/build/build.go} (60%) rename svc/ctrl/{services/build/backend/depot => pkg/build}/doc.go (99%) rename svc/ctrl/{services/build/backend/depot => pkg/build}/service.go (58%) delete mode 100644 svc/ctrl/pkg/hash/BUILD.bazel delete mode 100644 svc/ctrl/pkg/hash/doc.go delete mode 100644 svc/ctrl/pkg/hash/hash.go rename svc/ctrl/{services/build/storage => pkg/s3}/BUILD.bazel (51%) rename svc/ctrl/{services/build/storage => pkg/s3}/doc.go (98%) create mode 100644 svc/ctrl/pkg/s3/interface.go rename svc/ctrl/{services/build/storage => pkg/s3}/s3.go (99%) delete mode 100644 svc/ctrl/proto/ctrl/v1/build.proto create mode 100644 svc/ctrl/proto/hydra/v1/build.proto delete mode 100644 svc/ctrl/services/build/BUILD.bazel delete mode 100644 svc/ctrl/services/build/backend/depot/generate_upload_url.go delete mode 100644 svc/ctrl/services/build/backend/docker/BUILD.bazel delete mode 100644 svc/ctrl/services/build/backend/docker/create_build.go delete mode 100644 svc/ctrl/services/build/backend/docker/doc.go delete mode 100644 svc/ctrl/services/build/backend/docker/generate_upload_url.go delete mode 100644 svc/ctrl/services/build/backend/docker/service.go delete mode 100644 svc/ctrl/services/build/doc.go create mode 100644 svc/ctrl/services/deployment/create_s3_upload_url.go rename svc/{ => ctrl}/worker/BUILD.bazel (64%) rename svc/{ => ctrl}/worker/certificate/BUILD.bazel (91%) rename svc/{ => ctrl}/worker/certificate/bootstrap_infra_certs.go (100%) rename svc/{ => ctrl}/worker/certificate/doc.go (100%) rename svc/{ => ctrl}/worker/certificate/process_challenge_handler.go (100%) rename svc/{ => ctrl}/worker/certificate/renew_handler.go (100%) rename svc/{ => ctrl}/worker/certificate/service.go (100%) rename svc/{ => ctrl}/worker/config.go (99%) rename svc/{ => ctrl}/worker/deploy/BUILD.bazel (69%) rename svc/{ => ctrl}/worker/deploy/deploy_handler.go (89%) rename svc/{ => ctrl}/worker/deploy/doc.go (100%) rename svc/{ => ctrl}/worker/deploy/domains.go (100%) rename svc/{ => ctrl}/worker/deploy/helpers.go (100%) rename svc/{ => ctrl}/worker/deploy/promote_handler.go (100%) rename svc/{ => ctrl}/worker/deploy/rollback_handler.go (100%) rename svc/{ => ctrl}/worker/deploy/service.go (81%) rename svc/{ => ctrl}/worker/routing/BUILD.bazel (84%) rename svc/{ => ctrl}/worker/routing/assign_domains_handler.go (100%) rename svc/{ => ctrl}/worker/routing/doc.go (100%) rename svc/{ => ctrl}/worker/routing/service.go (100%) rename svc/{ => ctrl}/worker/run.go (89%) rename svc/{ => ctrl}/worker/versioning/BUILD.bazel (82%) rename svc/{ => ctrl}/worker/versioning/doc.go (100%) rename svc/{ => ctrl}/worker/versioning/next_version_handler.go (100%) rename svc/{ => ctrl}/worker/versioning/service.go (100%) delete mode 100644 svc/worker/doc.go diff --git a/Makefile b/Makefile index 69249f22a0..3628ee2b95 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ pull: ## Pull latest Docker images for services .PHONY: up up: pull ## Start all infrastructure services - @docker compose -f ./dev/docker-compose.yaml up -d planetscale mysql redis clickhouse s3 otel kafka restate ctrl --wait + @docker compose -f ./dev/docker-compose.yaml up -d planetscale mysql redis clickhouse s3 otel kafka restate ctrl-api --wait .PHONY: clean clean: ## Stop and remove all services with volumes @@ -86,7 +86,7 @@ generate: generate-sql ## Generate code from protobuf and other sources .PHONY: test test: ## Run tests with bazel - docker compose -f ./dev/docker-compose.yaml up -d mysql clickhouse s3 kafka restate ctrl --wait + docker compose -f ./dev/docker-compose.yaml up -d mysql clickhouse s3 kafka restate ctrl-api ctrl-worker --wait bazel test //... make clean-docker-test diff --git a/cmd/ctrl/BUILD.bazel b/cmd/ctrl/BUILD.bazel index 3703674221..f447284b1c 100644 --- a/cmd/ctrl/BUILD.bazel +++ b/cmd/ctrl/BUILD.bazel @@ -2,7 +2,11 @@ load("@rules_go//go:def.bzl", "go_library") go_library( name = "ctrl", - srcs = ["main.go"], + srcs = [ + "api.go", + "main.go", + "worker.go", + ], importpath = "github.com/unkeyed/unkey/cmd/ctrl", visibility = ["//visibility:public"], deps = [ @@ -10,6 +14,7 @@ go_library( "//pkg/clock", "//pkg/tls", "//pkg/uid", - "//svc/ctrl", + "//svc/ctrl/api", + "//svc/ctrl/worker", ], ) diff --git a/cmd/ctrl/api.go b/cmd/ctrl/api.go new file mode 100644 index 0000000000..b0e3fe535f --- /dev/null +++ b/cmd/ctrl/api.go @@ -0,0 +1,189 @@ +package ctrl + +import ( + "context" + + "github.com/unkeyed/unkey/pkg/cli" + "github.com/unkeyed/unkey/pkg/clock" + "github.com/unkeyed/unkey/pkg/tls" + "github.com/unkeyed/unkey/pkg/uid" + ctrlapi "github.com/unkeyed/unkey/svc/ctrl/api" +) + +// Cmd is the ctrl command that runs the Unkey control plane service for managing +// infrastructure, deployments, builds, and service orchestration. +var apiCmd = &cli.Command{ + Version: "", + Commands: []*cli.Command{}, + Aliases: []string{}, + Description: "", + Name: "api", + Usage: "Run the Unkey control plane service for managing infrastructure and services", + Flags: []cli.Flag{ + // Server Configuration + cli.Int("http-port", "HTTP port for the control plane server to listen on. Default: 8080", + cli.Default(8080), cli.EnvVar("UNKEY_HTTP_PORT")), + cli.Int("prometheus-port", "Port for Prometheus metrics, set to 0 to disable.", + cli.Default(0), cli.EnvVar("UNKEY_PROMETHEUS_PORT")), + cli.Bool("color", "Enable colored log output. Default: true", + cli.Default(true), cli.EnvVar("UNKEY_LOGS_COLOR")), + + // Instance Identification + cli.String("platform", "Cloud platform identifier for this node. Used for logging and metrics.", + cli.EnvVar("UNKEY_PLATFORM")), + cli.String("region", "Geographic region identifier. Used for logging and routing. Default: unknown", + cli.Default("unknown"), cli.EnvVar("UNKEY_REGION"), cli.EnvVar("AWS_REGION")), + cli.String("instance-id", "Unique identifier for this instance. Auto-generated if not provided.", + cli.Default(uid.New(uid.InstancePrefix, 4)), cli.EnvVar("UNKEY_INSTANCE_ID")), + + // Database Configuration + cli.String("database-primary", "MySQL connection string for primary database. Required for all deployments. Example: user:pass@host:3306/unkey?parseTime=true", + cli.Required(), cli.EnvVar("UNKEY_DATABASE_PRIMARY")), + + // Observability + cli.Bool("otel", "Enable OpenTelemetry tracing and metrics", + cli.EnvVar("UNKEY_OTEL")), + cli.Float("otel-trace-sampling-rate", "Sampling rate for OpenTelemetry traces (0.0-1.0). Only used when --otel is provided. Default: 0.25", + cli.Default(0.25), cli.EnvVar("UNKEY_OTEL_TRACE_SAMPLING_RATE")), + + // TLS Configuration + cli.String("tls-cert-file", "Path to TLS certificate file for HTTPS. Both cert and key must be provided to enable HTTPS.", + cli.EnvVar("UNKEY_TLS_CERT_FILE")), + cli.String("tls-key-file", "Path to TLS key file for HTTPS. Both cert and key must be provided to enable HTTPS.", + cli.EnvVar("UNKEY_TLS_KEY_FILE")), + + // Control Plane Specific + cli.String("auth-token", "Authentication token for control plane API access. Required for secure deployments.", + cli.EnvVar("UNKEY_AUTH_TOKEN")), + cli.String("spiffe-socket-path", "Path to SPIFFE agent socket for mTLS authentication. Default: /var/lib/spire/agent/agent.sock", + cli.Default("/var/lib/spire/agent/agent.sock"), cli.EnvVar("UNKEY_SPIFFE_SOCKET_PATH")), + + cli.String("vault-url", "Url where vault is availab;e", + cli.EnvVar("UNKEY_VAULT_URL"), cli.Default("https://vault.unkey.cloud")), + + cli.String("vault-token", "Authentication for vault", + cli.EnvVar("UNKEY_VAULT_TOKEN")), + + cli.Bool("acme-enabled", "Enable Let's Encrypt for acme challenges", cli.EnvVar("UNKEY_ACME_ENABLED")), + cli.String("acme-email-domain", "Domain for ACME registration emails (workspace_id@domain)", cli.Default("unkey.com"), cli.EnvVar("UNKEY_ACME_EMAIL_DOMAIN")), + + // Route53 DNS provider + cli.Bool("acme-route53-enabled", "Enable Route53 for DNS-01 challenges", cli.EnvVar("UNKEY_ACME_ROUTE53_ENABLED")), + cli.String("acme-route53-access-key-id", "AWS access key ID for Route53", cli.EnvVar("UNKEY_ACME_ROUTE53_ACCESS_KEY_ID")), + cli.String("acme-route53-secret-access-key", "AWS secret access key for Route53", cli.EnvVar("UNKEY_ACME_ROUTE53_SECRET_ACCESS_KEY")), + cli.String("acme-route53-region", "AWS region for Route53", cli.Default("us-east-1"), cli.EnvVar("UNKEY_ACME_ROUTE53_REGION")), + cli.String("acme-route53-hosted-zone-id", "Route53 hosted zone ID (bypasses auto-discovery, required when wildcard CNAMEs exist)", cli.EnvVar("UNKEY_ACME_ROUTE53_HOSTED_ZONE_ID")), + + cli.String("default-domain", "Default domain for auto-generated hostnames", cli.Default("unkey.app"), cli.EnvVar("UNKEY_DEFAULT_DOMAIN")), + cli.String("regional-apex-domain", "Apex domain for cross-region frontline communication (e.g., unkey.cloud). Certs are provisioned for *.{region}.{regional-apex-domain}", cli.EnvVar("UNKEY_REGIONAL_APEX_DOMAIN")), + + // Restate Configuration + cli.String("restate-url", "URL of the Restate ingress endpoint for invoking workflows. Example: http://restate:8080", + cli.Default("http://restate:8080"), cli.EnvVar("UNKEY_RESTATE_INGRESS_URL")), + cli.String("restate-admin-url", "URL of the Restate admin endpoint for service registration. Example: http://restate:9070", + cli.Default("http://restate:9070"), cli.EnvVar("UNKEY_RESTATE_ADMIN_URL")), + cli.Int("restate-http-port", "Port where we listen for Restate HTTP requests. Example: 9080", + cli.Default(9080), cli.EnvVar("UNKEY_RESTATE_HTTP_PORT")), + cli.String("restate-register-as", "URL of this service for self-registration with Restate. Example: http://ctrl:9080", + cli.EnvVar("UNKEY_RESTATE_REGISTER_AS")), + cli.String("restate-api-key", "API key for Restate ingress requests", + cli.EnvVar("UNKEY_RESTATE_API_KEY")), + cli.String("clickhouse-url", "ClickHouse connection string for analytics. Recommended for production. Example: clickhouse://user:pass@host:9000/unkey", + cli.EnvVar("UNKEY_CLICKHOUSE_URL")), + + // Build S3 configuration + cli.String("build-s3-url", "S3 URL for build storage", + cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_URL")), + cli.String("build-s3-external-url", "External S3 URL for presigned URLs", + cli.EnvVar("UNKEY_BUILD_S3_EXTERNAL_URL")), + cli.String("build-s3-bucket", "S3 bucket for build storage", + cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_BUCKET")), + cli.String("build-s3-access-key-id", "S3 access key ID", + cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_ACCESS_KEY_ID")), + cli.String("build-s3-access-key-secret", "S3 access key secret", + cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_ACCESS_KEY_SECRET")), + + // The image new sentinels get deployed with + cli.String("sentinel-image", "The image new sentinels get deployed with", cli.Default("ghcr.io/unkeyed/unkey:local"), cli.EnvVar("UNKEY_SENTINEL_IMAGE")), + cli.StringSlice("available-regions", "Available regions for deployment", cli.EnvVar("UNKEY_AVAILABLE_REGIONS"), cli.Default([]string{"local.dev"})), + }, + Action: apiAction, +} + +func apiAction(ctx context.Context, cmd *cli.Command) error { + // Check if TLS flags are properly set (both or none) + tlsCertFile := cmd.String("tls-cert-file") + tlsKeyFile := cmd.String("tls-key-file") + if (tlsCertFile == "" && tlsKeyFile != "") || (tlsCertFile != "" && tlsKeyFile == "") { + return cli.Exit("Both --tls-cert-file and --tls-key-file must be provided to enable HTTPS", 1) + } + + // Initialize TLS config if TLS flags are provided + var tlsConfig *tls.Config + if tlsCertFile != "" && tlsKeyFile != "" { + var err error + tlsConfig, err = tls.NewFromFiles(tlsCertFile, tlsKeyFile) + if err != nil { + return cli.Exit("Failed to load TLS configuration: "+err.Error(), 1) + } + } + + config := ctrlapi.Config{ + // Basic configuration + Image: cmd.String("image"), + HttpPort: cmd.Int("http-port"), + PrometheusPort: cmd.Int("prometheus-port"), + Region: cmd.String("region"), + InstanceID: cmd.String("instance-id"), + + // Database configuration + DatabasePrimary: cmd.String("database-primary"), + + // Observability + OtelEnabled: cmd.Bool("otel"), + OtelTraceSamplingRate: cmd.Float("otel-trace-sampling-rate"), + + // TLS Configuration + TLSConfig: tlsConfig, + + // Control Plane Specific + AuthToken: cmd.String("auth-token"), + + Vault: ctrlapi.VaultConfig{ + Url: cmd.RequireString("vault-url"), + Token: cmd.RequireString("vault-token"), + }, + + // Build configuration + BuildS3: ctrlapi.S3Config{ + URL: cmd.String("build-s3-url"), + ExternalURL: cmd.String("build-s3-external-url"), + Bucket: cmd.String("build-s3-bucket"), + AccessKeySecret: cmd.String("build-s3-access-key-secret"), + AccessKeyID: cmd.String("build-s3-access-key-id"), + }, + + // Restate configuration + Restate: ctrlapi.RestateConfig{ + URL: cmd.String("restate-url"), + AdminURL: cmd.String("restate-admin-url"), + HttpPort: cmd.Int("restate-http-port"), + RegisterAs: cmd.String("restate-register-as"), + APIKey: cmd.String("restate-api-key"), + }, + + // Common + Clock: clock.New(), + + // Sentinel configuration + SentinelImage: cmd.String("sentinel-image"), + AvailableRegions: cmd.RequireStringSlice("available-regions"), + } + + err := config.Validate() + if err != nil { + return err + } + + return ctrlapi.Run(ctx, config) +} diff --git a/cmd/ctrl/main.go b/cmd/ctrl/main.go index 32e65eb9fd..fb131e4c6b 100644 --- a/cmd/ctrl/main.go +++ b/cmd/ctrl/main.go @@ -1,269 +1,20 @@ package ctrl import ( - "context" - "github.com/unkeyed/unkey/pkg/cli" - "github.com/unkeyed/unkey/pkg/clock" - "github.com/unkeyed/unkey/pkg/tls" - "github.com/unkeyed/unkey/pkg/uid" - "github.com/unkeyed/unkey/svc/ctrl" ) -// Cmd is the ctrl command that runs the Unkey control plane service for managing -// infrastructure, deployments, builds, and service orchestration. var Cmd = &cli.Command{ - Version: "", - Commands: []*cli.Command{}, + Version: "", + Commands: []*cli.Command{ + apiCmd, + workerCmd, + }, Aliases: []string{}, Description: "", Name: "ctrl", Usage: "Run the Unkey control plane service for managing infrastructure and services", - Flags: []cli.Flag{ - // Server Configuration - cli.Int("http-port", "HTTP port for the control plane server to listen on. Default: 8080", - cli.Default(8080), cli.EnvVar("UNKEY_HTTP_PORT")), - cli.Int("prometheus-port", "Port for Prometheus metrics, set to 0 to disable.", - cli.Default(0), cli.EnvVar("UNKEY_PROMETHEUS_PORT")), - cli.Bool("color", "Enable colored log output. Default: true", - cli.Default(true), cli.EnvVar("UNKEY_LOGS_COLOR")), - - // Instance Identification - cli.String("platform", "Cloud platform identifier for this node. Used for logging and metrics.", - cli.EnvVar("UNKEY_PLATFORM")), - cli.String("region", "Geographic region identifier. Used for logging and routing. Default: unknown", - cli.Default("unknown"), cli.EnvVar("UNKEY_REGION"), cli.EnvVar("AWS_REGION")), - cli.String("instance-id", "Unique identifier for this instance. Auto-generated if not provided.", - cli.Default(uid.New(uid.InstancePrefix, 4)), cli.EnvVar("UNKEY_INSTANCE_ID")), - - // Database Configuration - cli.String("database-primary", "MySQL connection string for primary database. Required for all deployments. Example: user:pass@host:3306/unkey?parseTime=true", - cli.Required(), cli.EnvVar("UNKEY_DATABASE_PRIMARY")), - - // Observability - cli.Bool("otel", "Enable OpenTelemetry tracing and metrics", - cli.EnvVar("UNKEY_OTEL")), - cli.Float("otel-trace-sampling-rate", "Sampling rate for OpenTelemetry traces (0.0-1.0). Only used when --otel is provided. Default: 0.25", - cli.Default(0.25), cli.EnvVar("UNKEY_OTEL_TRACE_SAMPLING_RATE")), - - // TLS Configuration - cli.String("tls-cert-file", "Path to TLS certificate file for HTTPS. Both cert and key must be provided to enable HTTPS.", - cli.EnvVar("UNKEY_TLS_CERT_FILE")), - cli.String("tls-key-file", "Path to TLS key file for HTTPS. Both cert and key must be provided to enable HTTPS.", - cli.EnvVar("UNKEY_TLS_KEY_FILE")), - - // Control Plane Specific - cli.String("auth-token", "Authentication token for control plane API access. Required for secure deployments.", - cli.EnvVar("UNKEY_AUTH_TOKEN")), - cli.String("spiffe-socket-path", "Path to SPIFFE agent socket for mTLS authentication. Default: /var/lib/spire/agent/agent.sock", - cli.Default("/var/lib/spire/agent/agent.sock"), cli.EnvVar("UNKEY_SPIFFE_SOCKET_PATH")), - - // Vault Configuration - General secrets (env vars, API keys) - cli.StringSlice("vault-master-keys", "Vault master keys for encryption (general vault)", - cli.Required(), cli.EnvVar("UNKEY_VAULT_MASTER_KEYS")), - cli.String("vault-s3-url", "S3 endpoint URL for general vault", - cli.EnvVar("UNKEY_VAULT_S3_URL")), - cli.String("vault-s3-bucket", "S3 bucket for general vault (env vars, API keys)", - cli.EnvVar("UNKEY_VAULT_S3_BUCKET")), - cli.String("vault-s3-access-key-id", "S3 access key ID for general vault", - cli.EnvVar("UNKEY_VAULT_S3_ACCESS_KEY_ID")), - cli.String("vault-s3-access-key-secret", "S3 secret access key for general vault", - cli.EnvVar("UNKEY_VAULT_S3_ACCESS_KEY_SECRET")), - - // ACME Vault Configuration - Let's Encrypt certificates - cli.StringSlice("acme-vault-master-keys", "Vault master keys for encryption (ACME vault)", - cli.EnvVar("UNKEY_ACME_VAULT_MASTER_KEYS")), - cli.String("acme-vault-s3-url", "S3 endpoint URL for ACME vault", - cli.EnvVar("UNKEY_ACME_VAULT_S3_URL")), - cli.String("acme-vault-s3-bucket", "S3 bucket for ACME vault (Let's Encrypt certs)", - cli.EnvVar("UNKEY_ACME_VAULT_S3_BUCKET")), - cli.String("acme-vault-s3-access-key-id", "S3 access key ID for ACME vault", - cli.EnvVar("UNKEY_ACME_VAULT_S3_ACCESS_KEY_ID")), - cli.String("acme-vault-s3-access-key-secret", "S3 secret access key for ACME vault", - cli.EnvVar("UNKEY_ACME_VAULT_S3_ACCESS_KEY_SECRET")), - - // Build Configuration - cli.String("build-backend", "Build backend to use: 'docker' for local, 'depot' for production. Default: depot", - cli.Default("depot"), cli.EnvVar("UNKEY_BUILD_BACKEND")), - cli.String("build-s3-url", "S3 Compatible Endpoint URL for build contexts (internal)", - cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_URL")), - cli.String("build-s3-external-url", "S3 Compatible Endpoint URL for build contexts (external/public)", - cli.EnvVar("UNKEY_BUILD_S3_EXTERNAL_URL")), - cli.String("build-s3-bucket", "S3 bucket name for build contexts", - cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_BUCKET")), - cli.String("build-s3-access-key-id", "S3 access key ID for build contexts", - cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_ACCESS_KEY_ID")), - cli.String("build-s3-access-key-secret", "S3 secret access key for build contexts", - cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_ACCESS_KEY_SECRET")), - - cli.String("registry-url", "URL of the container registry for pulling images. Example: registry.depot.dev", - cli.EnvVar("UNKEY_REGISTRY_URL")), - cli.String("registry-username", "Username for authenticating with the container registry.", - cli.EnvVar("UNKEY_REGISTRY_USERNAME")), - cli.String("registry-password", "Password/token for authenticating with the container registry.", - cli.EnvVar("UNKEY_REGISTRY_PASSWORD")), - cli.String("build-platform", "Run builds on this platform ('dynamic', 'linux/amd64', 'linux/arm64')", - cli.EnvVar("UNKEY_BUILD_PLATFORM"), cli.Default("linux/amd64")), - // Depot Build Backend Configuration - cli.String("depot-api-url", "Depot API endpoint URL", - cli.EnvVar("UNKEY_DEPOT_API_URL")), - cli.String("depot-project-region", "Build data will be stored in the chosen region ('us-east-1','eu-central-1')", - cli.EnvVar("UNKEY_DEPOT_PROJECT_REGION"), cli.Default("us-east-1")), - - cli.Bool("acme-enabled", "Enable Let's Encrypt for acme challenges", cli.EnvVar("UNKEY_ACME_ENABLED")), - cli.String("acme-email-domain", "Domain for ACME registration emails (workspace_id@domain)", cli.Default("unkey.com"), cli.EnvVar("UNKEY_ACME_EMAIL_DOMAIN")), - - // Route53 DNS provider - cli.Bool("acme-route53-enabled", "Enable Route53 for DNS-01 challenges", cli.EnvVar("UNKEY_ACME_ROUTE53_ENABLED")), - cli.String("acme-route53-access-key-id", "AWS access key ID for Route53", cli.EnvVar("UNKEY_ACME_ROUTE53_ACCESS_KEY_ID")), - cli.String("acme-route53-secret-access-key", "AWS secret access key for Route53", cli.EnvVar("UNKEY_ACME_ROUTE53_SECRET_ACCESS_KEY")), - cli.String("acme-route53-region", "AWS region for Route53", cli.Default("us-east-1"), cli.EnvVar("UNKEY_ACME_ROUTE53_REGION")), - cli.String("acme-route53-hosted-zone-id", "Route53 hosted zone ID (bypasses auto-discovery, required when wildcard CNAMEs exist)", cli.EnvVar("UNKEY_ACME_ROUTE53_HOSTED_ZONE_ID")), - - cli.String("default-domain", "Default domain for auto-generated hostnames", cli.Default("unkey.app"), cli.EnvVar("UNKEY_DEFAULT_DOMAIN")), - cli.String("regional-apex-domain", "Apex domain for cross-region frontline communication (e.g., unkey.cloud). Certs are provisioned for *.{region}.{regional-apex-domain}", cli.EnvVar("UNKEY_REGIONAL_APEX_DOMAIN")), - - // Restate Configuration - cli.String("restate-url", "URL of the Restate ingress endpoint for invoking workflows. Example: http://restate:8080", - cli.Default("http://restate:8080"), cli.EnvVar("UNKEY_RESTATE_INGRESS_URL")), - cli.String("restate-admin-url", "URL of the Restate admin endpoint for service registration. Example: http://restate:9070", - cli.Default("http://restate:9070"), cli.EnvVar("UNKEY_RESTATE_ADMIN_URL")), - cli.Int("restate-http-port", "Port where we listen for Restate HTTP requests. Example: 9080", - cli.Default(9080), cli.EnvVar("UNKEY_RESTATE_HTTP_PORT")), - cli.String("restate-register-as", "URL of this service for self-registration with Restate. Example: http://ctrl:9080", - cli.EnvVar("UNKEY_RESTATE_REGISTER_AS")), - cli.String("restate-api-key", "API key for Restate ingress requests", - cli.EnvVar("UNKEY_RESTATE_API_KEY")), - cli.String("clickhouse-url", "ClickHouse connection string for analytics. Recommended for production. Example: clickhouse://user:pass@host:9000/unkey", - cli.EnvVar("UNKEY_CLICKHOUSE_URL")), - - // The image new sentinels get deployed with - cli.String("sentinel-image", "The image new sentinels get deployed with", cli.Default("ghcr.io/unkeyed/unkey:local"), cli.EnvVar("UNKEY_SENTINEL_IMAGE")), - cli.StringSlice("available-regions", "Available regions for deployment", cli.EnvVar("UNKEY_AVAILABLE_REGIONS"), cli.Default([]string{"local.dev"})), - }, - Action: action, -} - -func action(ctx context.Context, cmd *cli.Command) error { - // Check if TLS flags are properly set (both or none) - tlsCertFile := cmd.String("tls-cert-file") - tlsKeyFile := cmd.String("tls-key-file") - if (tlsCertFile == "" && tlsKeyFile != "") || (tlsCertFile != "" && tlsKeyFile == "") { - return cli.Exit("Both --tls-cert-file and --tls-key-file must be provided to enable HTTPS", 1) - } - - // Initialize TLS config if TLS flags are provided - var tlsConfig *tls.Config - if tlsCertFile != "" && tlsKeyFile != "" { - var err error - tlsConfig, err = tls.NewFromFiles(tlsCertFile, tlsKeyFile) - if err != nil { - return cli.Exit("Failed to load TLS configuration: "+err.Error(), 1) - } - } - - config := ctrl.Config{ - // Basic configuration - Platform: cmd.String("platform"), - BuildPlatform: cmd.String("build-platform"), - Image: cmd.String("image"), - HttpPort: cmd.Int("http-port"), - PrometheusPort: cmd.Int("prometheus-port"), - Region: cmd.String("region"), - InstanceID: cmd.String("instance-id"), - RegistryURL: cmd.String("registry-url"), - RegistryUsername: cmd.String("registry-username"), - RegistryPassword: cmd.String("registry-password"), - - // Database configuration - DatabasePrimary: cmd.String("database-primary"), - - // Observability - OtelEnabled: cmd.Bool("otel"), - OtelTraceSamplingRate: cmd.Float("otel-trace-sampling-rate"), - - // TLS Configuration - TLSConfig: tlsConfig, - - // Control Plane Specific - AuthToken: cmd.String("auth-token"), - SPIFFESocketPath: cmd.String("spiffe-socket-path"), - - // Vault configuration - General secrets - VaultMasterKeys: cmd.StringSlice("vault-master-keys"), - VaultS3: ctrl.S3Config{ - URL: cmd.String("vault-s3-url"), - Bucket: cmd.String("vault-s3-bucket"), - AccessKeyID: cmd.String("vault-s3-access-key-id"), - AccessKeySecret: cmd.String("vault-s3-access-key-secret"), - ExternalURL: "", - }, - // ACME Vault configuration - Let's Encrypt certificates - AcmeVaultMasterKeys: cmd.StringSlice("acme-vault-master-keys"), - AcmeVaultS3: ctrl.S3Config{ - URL: cmd.String("acme-vault-s3-url"), - Bucket: cmd.String("acme-vault-s3-bucket"), - AccessKeyID: cmd.String("acme-vault-s3-access-key-id"), - AccessKeySecret: cmd.String("acme-vault-s3-access-key-secret"), - ExternalURL: "", - }, - - // Build configuration - BuildBackend: ctrl.BuildBackend(cmd.String("build-backend")), - BuildS3: ctrl.S3Config{ - URL: cmd.String("build-s3-url"), - ExternalURL: cmd.String("build-s3-external-url"), - Bucket: cmd.String("build-s3-bucket"), - AccessKeySecret: cmd.String("build-s3-access-key-secret"), - AccessKeyID: cmd.String("build-s3-access-key-id"), - }, - - // Depot build backend configuration - Depot: ctrl.DepotConfig{ - APIUrl: cmd.String("depot-api-url"), - ProjectRegion: cmd.String("depot-project-region"), - }, - - // Acme configuration - Acme: ctrl.AcmeConfig{ - Enabled: cmd.Bool("acme-enabled"), - EmailDomain: cmd.String("acme-email-domain"), - Route53: ctrl.Route53Config{ - Enabled: cmd.Bool("acme-route53-enabled"), - AccessKeyID: cmd.String("acme-route53-access-key-id"), - SecretAccessKey: cmd.String("acme-route53-secret-access-key"), - Region: cmd.String("acme-route53-region"), - HostedZoneID: cmd.String("acme-route53-hosted-zone-id"), - }, - }, - - DefaultDomain: cmd.String("default-domain"), - RegionalApexDomain: cmd.String("regional-apex-domain"), - - // Restate configuration - Restate: ctrl.RestateConfig{ - URL: cmd.String("restate-url"), - AdminURL: cmd.String("restate-admin-url"), - HttpPort: cmd.Int("restate-http-port"), - RegisterAs: cmd.String("restate-register-as"), - APIKey: cmd.String("restate-api-key"), - }, - - // Clickhouse Configuration - ClickhouseURL: cmd.String("clickhouse-url"), - - // Common - Clock: clock.New(), - - // Sentinel configuration - SentinelImage: cmd.String("sentinel-image"), - AvailableRegions: cmd.RequireStringSlice("available-regions"), - } - - err := config.Validate() - if err != nil { - return err - } - - return ctrl.Run(ctx, config) + Flags: []cli.Flag{}, + Action: nil, + AcceptsArgs: false, } diff --git a/cmd/worker/main.go b/cmd/ctrl/worker.go similarity index 84% rename from cmd/worker/main.go rename to cmd/ctrl/worker.go index 57482ed179..53166390b9 100644 --- a/cmd/worker/main.go +++ b/cmd/ctrl/worker.go @@ -1,4 +1,4 @@ -package worker +package ctrl import ( "context" @@ -6,12 +6,12 @@ import ( "github.com/unkeyed/unkey/pkg/cli" "github.com/unkeyed/unkey/pkg/clock" "github.com/unkeyed/unkey/pkg/uid" - "github.com/unkeyed/unkey/svc/worker" + "github.com/unkeyed/unkey/svc/ctrl/worker" ) // Cmd is the worker command that runs the Unkey Restate worker service for // handling background jobs, deployments, builds, and certificate management. -var Cmd = &cli.Command{ +var workerCmd = &cli.Command{ Version: "", Commands: []*cli.Command{}, Aliases: []string{}, @@ -37,29 +37,11 @@ var Cmd = &cli.Command{ cli.String("auth-token", "Authentication token for worker API access.", cli.EnvVar("UNKEY_AUTH_TOKEN")), - // Vault Configuration - General secrets (env vars, API keys) - cli.StringSlice("vault-master-keys", "Vault master keys for encryption (general vault)", - cli.Required(), cli.EnvVar("UNKEY_VAULT_MASTER_KEYS")), - cli.String("vault-s3-url", "S3 endpoint URL for general vault", - cli.EnvVar("UNKEY_VAULT_S3_URL")), - cli.String("vault-s3-bucket", "S3 bucket for general vault (env vars, API keys)", - cli.EnvVar("UNKEY_VAULT_S3_BUCKET")), - cli.String("vault-s3-access-key-id", "S3 access key ID for general vault", - cli.EnvVar("UNKEY_VAULT_S3_ACCESS_KEY_ID")), - cli.String("vault-s3-access-key-secret", "S3 secret access key for general vault", - cli.EnvVar("UNKEY_VAULT_S3_ACCESS_KEY_SECRET")), - - // ACME Vault Configuration - Let's Encrypt certificates - cli.StringSlice("acme-vault-master-keys", "Vault master keys for encryption (ACME vault)", - cli.EnvVar("UNKEY_ACME_VAULT_MASTER_KEYS")), - cli.String("acme-vault-s3-url", "S3 endpoint URL for ACME vault", - cli.EnvVar("UNKEY_ACME_VAULT_S3_URL")), - cli.String("acme-vault-s3-bucket", "S3 bucket for ACME vault (Let's Encrypt certs)", - cli.EnvVar("UNKEY_ACME_VAULT_S3_BUCKET")), - cli.String("acme-vault-s3-access-key-id", "S3 access key ID for ACME vault", - cli.EnvVar("UNKEY_ACME_VAULT_S3_ACCESS_KEY_ID")), - cli.String("acme-vault-s3-access-key-secret", "S3 secret access key for ACME vault", - cli.EnvVar("UNKEY_ACME_VAULT_S3_ACCESS_KEY_SECRET")), + cli.String("vault-url", "Url where vault is availab;e", + cli.EnvVar("UNKEY_VAULT_URL"), cli.Default("https://vault.unkey.cloud")), + + cli.String("vault-token", "Authentication for vault", + cli.EnvVar("UNKEY_VAULT_TOKEN")), // Build Configuration cli.String("build-backend", "Build backend to use: 'docker' for local, 'depot' for production. Default: depot", @@ -126,10 +108,10 @@ var Cmd = &cli.Command{ cli.String("sentinel-image", "The image new sentinels get deployed with", cli.Default("ghcr.io/unkeyed/unkey:local"), cli.EnvVar("UNKEY_SENTINEL_IMAGE")), cli.StringSlice("available-regions", "Available regions for deployment", cli.EnvVar("UNKEY_AVAILABLE_REGIONS"), cli.Default([]string{"local.dev"})), }, - Action: action, + Action: workerAction, } -func action(ctx context.Context, cmd *cli.Command) error { +func workerAction(ctx context.Context, cmd *cli.Command) error { config := worker.Config{ // Basic configuration HttpPort: cmd.Int("http-port"), @@ -139,9 +121,6 @@ func action(ctx context.Context, cmd *cli.Command) error { // Database configuration DatabasePrimary: cmd.String("database-primary"), - // Authentication - AuthToken: cmd.String("auth-token"), - // Vault configuration - General secrets VaultMasterKeys: cmd.StringSlice("vault-master-keys"), VaultS3: worker.S3Config{ diff --git a/cmd/run/BUILD.bazel b/cmd/run/BUILD.bazel index 05397a3220..8cb77eb3f1 100644 --- a/cmd/run/BUILD.bazel +++ b/cmd/run/BUILD.bazel @@ -13,7 +13,6 @@ go_library( "//cmd/preflight", "//cmd/sentinel", "//cmd/vault", - "//cmd/worker", "//pkg/cli", ], ) diff --git a/cmd/run/main.go b/cmd/run/main.go index 72647b9f70..1e1b1a307d 100644 --- a/cmd/run/main.go +++ b/cmd/run/main.go @@ -11,7 +11,6 @@ import ( "github.com/unkeyed/unkey/cmd/preflight" "github.com/unkeyed/unkey/cmd/sentinel" "github.com/unkeyed/unkey/cmd/vault" - "github.com/unkeyed/unkey/cmd/worker" "github.com/unkeyed/unkey/pkg/cli" ) @@ -38,7 +37,6 @@ AVAILABLE SERVICES: EXAMPLES: unkey run api # Run the API server -unkey run ctrl # Run the control plane unkey run frontline # Run the frontline service unkey run sentinel # Run the tenant sentinel service unkey run --help # Show available services and their options @@ -51,7 +49,6 @@ unkey run api --port 8080 --env production # Run API server with custom con sentinel.Cmd, preflight.Cmd, vault.Cmd, - worker.Cmd, }, Action: runAction, } diff --git a/cmd/worker/BUILD.bazel b/cmd/worker/BUILD.bazel deleted file mode 100644 index ea7a65a962..0000000000 --- a/cmd/worker/BUILD.bazel +++ /dev/null @@ -1,14 +0,0 @@ -load("@rules_go//go:def.bzl", "go_library") - -go_library( - name = "worker", - srcs = ["main.go"], - importpath = "github.com/unkeyed/unkey/cmd/worker", - visibility = ["//visibility:public"], - deps = [ - "//pkg/cli", - "//pkg/clock", - "//pkg/uid", - "//svc/worker", - ], -) diff --git a/dev/.env.depot.example b/dev/.env.depot.example index 3fc46fcaa5..0f7cccb5be 100644 --- a/dev/.env.depot.example +++ b/dev/.env.depot.example @@ -1,6 +1,5 @@ -# yes these are lowercase, sue me - -token= -s3-url= -s3-access-key-id= -s3-access-key-secret= +UNKEY_DEPOT_TOKEN= +UNKEY_BUILD_S3_URL= +UNKEY_BUILD_S3_ACCESS_KEY_ID= +UNKEY_BUILD_S3_ACCESS_KEY_SECRET= +UNKEY_REGISTRY_PASSWORD= diff --git a/dev/Tiltfile b/dev/Tiltfile index 72df746b05..907c3c74f2 100644 --- a/dev/Tiltfile +++ b/dev/Tiltfile @@ -129,9 +129,9 @@ k8s_resource( ) # Ctrl service -k8s_yaml('k8s/manifests/ctrl.yaml') +k8s_yaml('k8s/manifests/ctrl-api.yaml') k8s_resource( - 'ctrl', + 'ctrl-api', port_forwards='7091:7091', resource_deps=['mysql', 'clickhouse', 'restate', 'build-unkey', 'depot-credentials'], labels=['unkey'], @@ -140,9 +140,9 @@ k8s_resource( ) # Worker service (Restate workflow handlers) -k8s_yaml('k8s/manifests/worker.yaml') +k8s_yaml('k8s/manifests/ctrl-worker.yaml') k8s_resource( - 'worker', + 'ctrl-worker', port_forwards=['7092:7092', '9080:9080'], resource_deps=['mysql', 'clickhouse', 'restate', 'build-unkey', 'depot-credentials'], labels=['unkey'], @@ -155,7 +155,7 @@ k8s_yaml('k8s/manifests/krane.yaml') k8s_resource( 'krane', port_forwards='8070:8070', - resource_deps=['ctrl', 'build-unkey', 'rbac', 'namespace', 'sentinel', 'cilium-policies', 'build-sentinel-image'], + resource_deps=['ctrl-api', 'build-unkey', 'rbac', 'namespace', 'sentinel', 'cilium-policies', 'build-sentinel-image'], labels=['unkey'], auto_init=True, trigger_mode=TRIGGER_MODE_AUTO diff --git a/dev/docker-compose.yaml b/dev/docker-compose.yaml index 41f1f5db68..b8838ce70c 100644 --- a/dev/docker-compose.yaml +++ b/dev/docker-compose.yaml @@ -78,7 +78,7 @@ services: condition: service_healthy kafka: condition: service_started - ctrl: + ctrl-api: condition: service_started environment: UNKEY_HTTP_PORT: 7070 @@ -94,7 +94,7 @@ services: UNKEY_VAULT_MASTER_KEYS: "Ch9rZWtfMmdqMFBJdVhac1NSa0ZhNE5mOWlLSnBHenFPENTt7an5MRogENt9Si6wms4pQ2XIvqNSIgNpaBenJmXgcInhu6Nfv2U=" UNKEY_KAFKA_BROKERS: "kafka:9092" UNKEY_CLICKHOUSE_ANALYTICS_URL: "http://clickhouse:8123/default" - UNKEY_CTRL_URL: "http://ctrl:7091" + UNKEY_CTRL_URL: "http://ctrl-api:7091" UNKEY_CTRL_TOKEN: "your-local-dev-key" UNKEY_PPROF_ENABLED: "true" UNKEY_PPROF_USERNAME: "admin" @@ -280,7 +280,7 @@ services: environment: # Server configuration UNKEY_REGION: "local.dev" # currently required to receive filtered events from ctrl - UNKEY_CONTROL_PLANE_URL: "http://ctrl:7091" + UNKEY_CONTROL_PLANE_URL: "http://ctrl-api:7091" UNKEY_CONTROL_PLANE_BEARER: "your-local-dev-key" # Backend configuration - use Docker backend for development @@ -311,7 +311,7 @@ services: start_period: 10s interval: 5s - ctrl: + ctrl-api: networks: - default build: @@ -319,8 +319,8 @@ services: dockerfile: Dockerfile args: VERSION: "latest" - container_name: ctrl - command: ["run", "ctrl"] + container_name: ctrl-api + command: ["run", "ctrl", "api"] ports: - "7091:7091" depends_on: @@ -336,18 +336,69 @@ services: clickhouse: condition: service_healthy required: true - volumes: - - /var/run/docker.sock:/var/run/docker.sock environment: UNKEY_DATABASE_PRIMARY: "unkey:password@tcp(mysql:3306)/unkey?parseTime=true&interpolateParams=true" # Control plane configuration UNKEY_HTTP_PORT: "7091" + + # Restate configuration (ctrl api only needs ingress client, not server) + UNKEY_RESTATE_INGRESS_URL: "http://restate:8080" + UNKEY_RESTATE_ADMIN_URL: "http://restate:9070" + UNKEY_RESTATE_API_KEY: "" + + # Build configuration (for presigned URLs) + UNKEY_BUILD_S3_URL: "${UNKEY_BUILD_S3_URL:-http://s3:3902}" + UNKEY_BUILD_S3_EXTERNAL_URL: "${UNKEY_BUILD_S3_EXTERNAL_URL:-http://localhost:3902}" + UNKEY_BUILD_S3_BUCKET: "build-contexts" + UNKEY_BUILD_S3_ACCESS_KEY_ID: "${UNKEY_BUILD_S3_ACCESS_KEY_ID:-minio_root_user}" + UNKEY_BUILD_S3_ACCESS_KEY_SECRET: "${UNKEY_BUILD_S3_ACCESS_KEY_SECRET:-minio_root_password}" + + # API key for simple authentication + UNKEY_AUTH_TOKEN: "your-local-dev-key" + + ctrl-worker: + networks: + - default + build: + context: ../ + dockerfile: Dockerfile + args: + VERSION: "latest" + container_name: ctrl-worker + command: ["run", "ctrl", "worker"] + env_file: + - .env.depot + ports: + - "7092:7092" + - "9080:9080" + depends_on: + mysql: + condition: service_healthy + required: true + s3: + condition: service_healthy + required: true + restate: + condition: service_healthy + required: true + clickhouse: + condition: service_healthy + required: true + volumes: + - /var/run/docker.sock:/var/run/docker.sock + environment: + UNKEY_DATABASE_PRIMARY: "unkey:password@tcp(mysql:3306)/unkey?parseTime=true&interpolateParams=true" + + # Worker configuration + UNKEY_WORKER_HTTP_PORT: "7092" UNKEY_DEFAULT_DOMAIN: "unkey.local" - # Restate configuration (ctrl only needs ingress client, not server) + # Restate configuration UNKEY_RESTATE_INGRESS_URL: "http://restate:8080" UNKEY_RESTATE_ADMIN_URL: "http://restate:9070" + UNKEY_RESTATE_HTTP_PORT: "9080" + UNKEY_RESTATE_REGISTER_AS: "http://ctrl-worker:9080" UNKEY_RESTATE_API_KEY: "" # Vault - General secrets (env vars, API keys) @@ -356,36 +407,31 @@ services: UNKEY_VAULT_S3_ACCESS_KEY_ID: "minio_root_user" UNKEY_VAULT_S3_ACCESS_KEY_SECRET: "minio_root_password" UNKEY_VAULT_MASTER_KEYS: "Ch9rZWtfMmdqMFBJdVhac1NSa0ZhNE5mOWlLSnBHenFPENTt7an5MRogENt9Si6wms4pQ2XIvqNSIgNpaBenJmXgcInhu6Nfv2U=" - # ACME Vault - Let's Encrypt certificates - UNKEY_ACME_VAULT_MASTER_KEYS: "Ch9rZWtfMmdqMFBJdVhac1NSa0ZhNE5mOWlLSnBHenFPENTt7an5MRogENt9Si6wms4pQ2XIvqNSIgNpaBenJmXgcInhu6Nfv2U=" - UNKEY_ACME_VAULT_S3_URL: "http://s3:3902" - UNKEY_ACME_VAULT_S3_BUCKET: "acme-vault" - UNKEY_ACME_VAULT_S3_ACCESS_KEY_ID: "minio_root_user" - UNKEY_ACME_VAULT_S3_ACCESS_KEY_SECRET: "minio_root_password" - # Build configuration - UNKEY_BUILD_S3_URL: "${UNKEY_BUILD_S3_URL:-http://s3:3902}" - UNKEY_BUILD_S3_EXTERNAL_URL: "${UNKEY_BUILD_S3_EXTERNAL_URL:-http://localhost:3902}" # For CLI/external access + # Build configuration (loaded from .env.depot) UNKEY_BUILD_S3_BUCKET: "build-contexts" - UNKEY_BUILD_S3_ACCESS_KEY_ID: "${UNKEY_BUILD_S3_ACCESS_KEY_ID:-minio_root_user}" - UNKEY_BUILD_S3_ACCESS_KEY_SECRET: "${UNKEY_BUILD_S3_ACCESS_KEY_SECRET:-minio_root_password}" - # API key for simple authentication (temporary, will be replaced with JWT) - UNKEY_API_KEY: "your-local-dev-key" # Build backend configuration - UNKEY_BUILD_BACKEND: "${UNKEY_BUILD_BACKEND:-docker}" + UNKEY_BUILD_BACKEND: "depot" UNKEY_BUILD_PLATFORM: "linux/amd64" - UNKEY_DOCKER_SOCKET: "/var/run/docker.sock" - # Registry configuration (used by both Docker and Depot backends) - UNKEY_REGISTRY_URL: "${UNKEY_REGISTRY_URL:-registry.depot.dev}" - UNKEY_REGISTRY_USERNAME: "${UNKEY_REGISTRY_USERNAME:-x-token}" - UNKEY_REGISTRY_PASSWORD: "${UNKEY_REGISTRY_PASSWORD:-${DEPOT_TOKEN:-}}" - # Depot-specific configuration (only needed when UNKEY_BUILD_BACKEND=depot) + + # Registry configuration (UNKEY_REGISTRY_PASSWORD loaded from .env.depot) + UNKEY_REGISTRY_URL: "registry.depot.dev" + UNKEY_REGISTRY_USERNAME: "x-token" + + # Depot-specific configuration UNKEY_DEPOT_API_URL: "https://api.depot.dev" UNKEY_DEPOT_PROJECT_REGION: "us-east-1" + # ClickHouse UNKEY_CLICKHOUSE_URL: "clickhouse://default:password@clickhouse:9000?secure=false&skip_verify=true" + # API key for authentication + UNKEY_AUTH_TOKEN: "your-local-dev-key" + + # Sentinel image for deployments + UNKEY_SENTINEL_IMAGE: "unkey/sentinel:latest" + worker: networks: - default @@ -517,7 +563,7 @@ services: CLICKHOUSE_URL: "http://default:password@clickhouse:8123" # Environment NODE_ENV: "production" - CTRL_URL: "http://ctrl:7091" + CTRL_URL: "http://ctrl-api:7091" CTRL_API_KEY: "your-local-dev-key" # Bootstrap workspace/API IDs # Reading from env file, no override necessary diff --git a/dev/k8s/manifests/api.yaml b/dev/k8s/manifests/api.yaml index 94b35988ef..5003b78c98 100644 --- a/dev/k8s/manifests/api.yaml +++ b/dev/k8s/manifests/api.yaml @@ -71,7 +71,7 @@ spec: value: "chproxy-test-token-123" # Control Plane Configuration - name: UNKEY_CTRL_URL - value: "http://ctrl:7091" + value: "http://ctrl-api:7091" - name: UNKEY_CTRL_TOKEN value: "your-local-dev-key" # Request Body Configuration diff --git a/dev/k8s/manifests/ctrl.yaml b/dev/k8s/manifests/ctrl-api.yaml similarity index 53% rename from dev/k8s/manifests/ctrl.yaml rename to dev/k8s/manifests/ctrl-api.yaml index 0b5813adab..53b879977f 100644 --- a/dev/k8s/manifests/ctrl.yaml +++ b/dev/k8s/manifests/ctrl-api.yaml @@ -2,30 +2,25 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: ctrl + name: ctrl-api namespace: unkey labels: - app: ctrl + app: ctrl-api spec: replicas: 1 selector: matchLabels: - app: ctrl + app: ctrl-api template: metadata: labels: - app: ctrl + app: ctrl-api spec: serviceAccountName: unkey-serviceaccount - volumes: - - name: docker-socket - hostPath: - path: /var/run/docker.sock - type: Socket containers: - - name: ctrl + - name: ctrl-api image: unkey/go:latest - args: ["run", "ctrl"] + args: ["run", "ctrl", "api"] imagePullPolicy: Never # Use local images ports: - containerPort: 7091 @@ -41,7 +36,7 @@ spec: - name: UNKEY_REGION value: "local" - name: UNKEY_INSTANCE_ID - value: "ctrl-dev" + value: "ctrl-api-dev" # Database Configuration - name: UNKEY_DATABASE_PRIMARY value: "unkey:password@tcp(mysql:3306)/unkey?parseTime=true&interpolateParams=true" @@ -51,76 +46,38 @@ spec: value: "false" # Control Plane Specific # Vault Configuration (required) - - name: UNKEY_VAULT_MASTER_KEYS - value: "Ch9rZWtfMmdqMFBJdVhac1NSa0ZhNE5mOWlLSnBHenFPENTt7an5MRogENt9Si6wms4pQ2XIvqNSIgNpaBenJmXgcInhu6Nfv2U=" - - name: UNKEY_VAULT_S3_URL - value: "http://s3:3902" - - name: UNKEY_VAULT_S3_BUCKET - value: "acme-vault" - - name: UNKEY_VAULT_S3_ACCESS_KEY_ID - value: "minio_root_user" - - name: UNKEY_VAULT_S3_ACCESS_KEY_SECRET - value: "minio_root_password" - # Build Configuration - - name: UNKEY_BUILD_BACKEND - value: "depot" # Changed to docker as default for local k8s - - name: UNKEY_BUILD_PLATFORM - value: "linux/arm64" - # Build S3 Storage (from depot-credentials secret) - #kubectl create secret generic depot-credentials \ - # --from-literal=token=xxxx \ - # --from-literal=s3-url=xxx \ - # --from-literal=s3-access-key-id=xxx \ - # --from-literal=s3-access-key-secret=xxx \ - # --namespace=unkey + - name: UNKEY_VAULT_URL + value: http://vault:8060 + - name: UNKEY_VAULT_TOKEN + value: vault-test-token-123 - name: UNKEY_BUILD_S3_URL valueFrom: secretKeyRef: name: depot-credentials - key: s3-url + key: UNKEY_BUILD_S3_URL - name: UNKEY_BUILD_S3_EXTERNAL_URL valueFrom: secretKeyRef: name: depot-credentials - key: s3-url + key: UNKEY_BUILD_S3_URL - name: UNKEY_BUILD_S3_BUCKET value: "build-contexts" - name: UNKEY_BUILD_S3_ACCESS_KEY_ID valueFrom: secretKeyRef: name: depot-credentials - key: s3-access-key-id + key: UNKEY_BUILD_S3_ACCESS_KEY_ID - name: UNKEY_BUILD_S3_ACCESS_KEY_SECRET valueFrom: secretKeyRef: name: depot-credentials - key: s3-access-key-secret + key: UNKEY_BUILD_S3_ACCESS_KEY_SECRET # Registry Configuration (used by both Docker and Depot backends) #kubectl create secret docker-registry depot-registry \ # --docker-server=registry.depot.dev \ # --docker-username=x-token \ # --docker-password=xxx \ # --namespace=unkey - - name: UNKEY_REGISTRY_URL - value: "registry.depot.dev" - - name: UNKEY_REGISTRY_USERNAME - value: "x-token" - - name: UNKEY_REGISTRY_PASSWORD - valueFrom: - secretKeyRef: - name: depot-credentials - key: token - # Depot-Specific Configuration (only used when UNKEY_BUILD_BACKEND=depot) - - name: UNKEY_DEPOT_API_URL - value: "https://api.depot.dev" - - name: UNKEY_DEPOT_PROJECT_REGION - value: "us-east-1" - # ACME Configuration - - name: UNKEY_ACME_ENABLED - value: "false" - - - name: UNKEY_DEFAULT_DOMAIN - value: "unkey.local" # Restate Configuration (ctrl only needs ingress client) - name: UNKEY_RESTATE_INGRESS_URL @@ -134,15 +91,6 @@ spec: - name: UNKEY_AUTH_TOKEN value: "your-local-dev-key" - # ClickHouse Configuration - - name: UNKEY_CLICKHOUSE_URL - value: "clickhouse://default:password@clickhouse:9000?secure=false&skip_verify=true" - - - name: UNKEY_SENTINEL_IMAGE - value: "unkey/sentinel:latest" - volumeMounts: - - name: docker-socket - mountPath: /var/run/docker.sock initContainers: - name: wait-for-dependencies image: busybox:1.36 @@ -157,13 +105,13 @@ spec: apiVersion: v1 kind: Service metadata: - name: ctrl + name: ctrl-api namespace: unkey labels: - app: ctrl + app: ctrl-api spec: selector: - app: ctrl + app: ctrl-api ports: - name: http port: 7091 diff --git a/dev/k8s/manifests/worker.yaml b/dev/k8s/manifests/ctrl-worker.yaml similarity index 91% rename from dev/k8s/manifests/worker.yaml rename to dev/k8s/manifests/ctrl-worker.yaml index 7ff4c0b9d9..fa0338dc8c 100644 --- a/dev/k8s/manifests/worker.yaml +++ b/dev/k8s/manifests/ctrl-worker.yaml @@ -2,19 +2,19 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: worker + name: ctrl-worker namespace: unkey labels: - app: worker + app: ctrl-worker spec: replicas: 1 selector: matchLabels: - app: worker + app: ctrl-worker template: metadata: labels: - app: worker + app: ctrl-worker spec: serviceAccountName: unkey-serviceaccount volumes: @@ -23,9 +23,9 @@ spec: path: /var/run/docker.sock type: Socket containers: - - name: worker + - name: ctrl-worker image: unkey/go:latest - args: ["run", "worker"] + args: ["run", "ctrl", "worker"] imagePullPolicy: Never # Use local images ports: - containerPort: 7092 @@ -73,19 +73,19 @@ spec: valueFrom: secretKeyRef: name: depot-credentials - key: s3-url + key: UNKEY_BUILD_S3_URL - name: UNKEY_BUILD_S3_BUCKET value: "build-contexts" - name: UNKEY_BUILD_S3_ACCESS_KEY_ID valueFrom: secretKeyRef: name: depot-credentials - key: s3-access-key-id + key: UNKEY_BUILD_S3_ACCESS_KEY_ID - name: UNKEY_BUILD_S3_ACCESS_KEY_SECRET valueFrom: secretKeyRef: name: depot-credentials - key: s3-access-key-secret + key: UNKEY_BUILD_S3_ACCESS_KEY_SECRET # Registry Configuration - name: UNKEY_REGISTRY_URL @@ -96,7 +96,7 @@ spec: valueFrom: secretKeyRef: name: depot-credentials - key: token + key: UNKEY_DEPOT_TOKEN # Depot-Specific Configuration - name: UNKEY_DEPOT_API_URL @@ -119,7 +119,7 @@ spec: - name: UNKEY_RESTATE_HTTP_PORT value: "9080" - name: UNKEY_RESTATE_REGISTER_AS - value: "http://worker:9080" + value: "http://ctrl-worker:9080" - name: UNKEY_RESTATE_API_KEY value: "" @@ -150,13 +150,13 @@ spec: apiVersion: v1 kind: Service metadata: - name: worker + name: ctrl-worker namespace: unkey labels: - app: worker + app: ctrl-worker spec: selector: - app: worker + app: ctrl-worker ports: - name: health port: 7092 diff --git a/dev/k8s/manifests/krane.yaml b/dev/k8s/manifests/krane.yaml index 7822852d81..d7f7684947 100644 --- a/dev/k8s/manifests/krane.yaml +++ b/dev/k8s/manifests/krane.yaml @@ -46,7 +46,7 @@ spec: - name: UNKEY_REGION value: "local.dev" - name: "UNKEY_CONTROL_PLANE_URL" - value: "http://ctrl:7091" + value: "http://ctrl-api:7091" - name: "UNKEY_CONTROL_PLANE_BEARER" value: "your-local-dev-key" # Vault configuration for SecretsService diff --git a/dev/k8s/manifests/preflight.yaml b/dev/k8s/manifests/preflight.yaml index 7eef0ea4dc..aa76559235 100644 --- a/dev/k8s/manifests/preflight.yaml +++ b/dev/k8s/manifests/preflight.yaml @@ -108,7 +108,7 @@ spec: valueFrom: secretKeyRef: name: depot-credentials - key: token + key: UNKEY_DEPOT_TOKEN ports: - containerPort: 8443 name: https diff --git a/gen/proto/ctrl/v1/BUILD.bazel b/gen/proto/ctrl/v1/BUILD.bazel index d277051e9a..44c80e0b6b 100644 --- a/gen/proto/ctrl/v1/BUILD.bazel +++ b/gen/proto/ctrl/v1/BUILD.bazel @@ -4,7 +4,6 @@ go_library( name = "ctrl", srcs = [ "acme.pb.go", - "build.pb.go", "cluster.pb.go", "deployment.pb.go", "environment.pb.go", diff --git a/gen/proto/ctrl/v1/build.pb.go b/gen/proto/ctrl/v1/build.pb.go deleted file mode 100644 index d5365c4d8f..0000000000 --- a/gen/proto/ctrl/v1/build.pb.go +++ /dev/null @@ -1,349 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.36.8 -// protoc (unknown) -// source: ctrl/v1/build.proto - -package ctrlv1 - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" - unsafe "unsafe" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type CreateBuildRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - BuildContextPath string `protobuf:"bytes,1,opt,name=build_context_path,json=buildContextPath,proto3" json:"build_context_path,omitempty"` // S3 key of the uploaded tar file - DockerfilePath *string `protobuf:"bytes,2,opt,name=dockerfile_path,json=dockerfilePath,proto3,oneof" json:"dockerfile_path,omitempty"` // Path to Dockerfile within the tar - UnkeyProjectId string `protobuf:"bytes,3,opt,name=unkey_project_id,json=unkeyProjectId,proto3" json:"unkey_project_id,omitempty"` // Your internal user/project ID - DeploymentId string `protobuf:"bytes,4,opt,name=deployment_id,json=deploymentId,proto3" json:"deployment_id,omitempty"` - WorkspaceId string `protobuf:"bytes,5,opt,name=workspace_id,json=workspaceId,proto3" json:"workspace_id,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *CreateBuildRequest) Reset() { - *x = CreateBuildRequest{} - mi := &file_ctrl_v1_build_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *CreateBuildRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*CreateBuildRequest) ProtoMessage() {} - -func (x *CreateBuildRequest) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_build_proto_msgTypes[0] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use CreateBuildRequest.ProtoReflect.Descriptor instead. -func (*CreateBuildRequest) Descriptor() ([]byte, []int) { - return file_ctrl_v1_build_proto_rawDescGZIP(), []int{0} -} - -func (x *CreateBuildRequest) GetBuildContextPath() string { - if x != nil { - return x.BuildContextPath - } - return "" -} - -func (x *CreateBuildRequest) GetDockerfilePath() string { - if x != nil && x.DockerfilePath != nil { - return *x.DockerfilePath - } - return "" -} - -func (x *CreateBuildRequest) GetUnkeyProjectId() string { - if x != nil { - return x.UnkeyProjectId - } - return "" -} - -func (x *CreateBuildRequest) GetDeploymentId() string { - if x != nil { - return x.DeploymentId - } - return "" -} - -func (x *CreateBuildRequest) GetWorkspaceId() string { - if x != nil { - return x.WorkspaceId - } - return "" -} - -type CreateBuildResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - ImageName string `protobuf:"bytes,1,opt,name=image_name,json=imageName,proto3" json:"image_name,omitempty"` // Full image tag (registry.depot.dev/project:tag) - BuildId string `protobuf:"bytes,2,opt,name=build_id,json=buildId,proto3" json:"build_id,omitempty"` // Depot build ID for tracking - DepotProjectId string `protobuf:"bytes,3,opt,name=depot_project_id,json=depotProjectId,proto3" json:"depot_project_id,omitempty"` // Depot project ID - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *CreateBuildResponse) Reset() { - *x = CreateBuildResponse{} - mi := &file_ctrl_v1_build_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *CreateBuildResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*CreateBuildResponse) ProtoMessage() {} - -func (x *CreateBuildResponse) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_build_proto_msgTypes[1] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use CreateBuildResponse.ProtoReflect.Descriptor instead. -func (*CreateBuildResponse) Descriptor() ([]byte, []int) { - return file_ctrl_v1_build_proto_rawDescGZIP(), []int{1} -} - -func (x *CreateBuildResponse) GetImageName() string { - if x != nil { - return x.ImageName - } - return "" -} - -func (x *CreateBuildResponse) GetBuildId() string { - if x != nil { - return x.BuildId - } - return "" -} - -func (x *CreateBuildResponse) GetDepotProjectId() string { - if x != nil { - return x.DepotProjectId - } - return "" -} - -type GenerateUploadURLRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - UnkeyProjectId string `protobuf:"bytes,1,opt,name=unkey_project_id,json=unkeyProjectId,proto3" json:"unkey_project_id,omitempty"` // Your internal user/project ID - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *GenerateUploadURLRequest) Reset() { - *x = GenerateUploadURLRequest{} - mi := &file_ctrl_v1_build_proto_msgTypes[2] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *GenerateUploadURLRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*GenerateUploadURLRequest) ProtoMessage() {} - -func (x *GenerateUploadURLRequest) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_build_proto_msgTypes[2] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use GenerateUploadURLRequest.ProtoReflect.Descriptor instead. -func (*GenerateUploadURLRequest) Descriptor() ([]byte, []int) { - return file_ctrl_v1_build_proto_rawDescGZIP(), []int{2} -} - -func (x *GenerateUploadURLRequest) GetUnkeyProjectId() string { - if x != nil { - return x.UnkeyProjectId - } - return "" -} - -type GenerateUploadURLResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - UploadUrl string `protobuf:"bytes,1,opt,name=upload_url,json=uploadUrl,proto3" json:"upload_url,omitempty"` // Presigned PUT URL - BuildContextPath string `protobuf:"bytes,2,opt,name=build_context_path,json=buildContextPath,proto3" json:"build_context_path,omitempty"` // S3 key to use in CreateBuild - ExpiresIn int64 `protobuf:"varint,3,opt,name=expires_in,json=expiresIn,proto3" json:"expires_in,omitempty"` // Seconds until URL expires - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *GenerateUploadURLResponse) Reset() { - *x = GenerateUploadURLResponse{} - mi := &file_ctrl_v1_build_proto_msgTypes[3] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *GenerateUploadURLResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*GenerateUploadURLResponse) ProtoMessage() {} - -func (x *GenerateUploadURLResponse) ProtoReflect() protoreflect.Message { - mi := &file_ctrl_v1_build_proto_msgTypes[3] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use GenerateUploadURLResponse.ProtoReflect.Descriptor instead. -func (*GenerateUploadURLResponse) Descriptor() ([]byte, []int) { - return file_ctrl_v1_build_proto_rawDescGZIP(), []int{3} -} - -func (x *GenerateUploadURLResponse) GetUploadUrl() string { - if x != nil { - return x.UploadUrl - } - return "" -} - -func (x *GenerateUploadURLResponse) GetBuildContextPath() string { - if x != nil { - return x.BuildContextPath - } - return "" -} - -func (x *GenerateUploadURLResponse) GetExpiresIn() int64 { - if x != nil { - return x.ExpiresIn - } - return 0 -} - -var File_ctrl_v1_build_proto protoreflect.FileDescriptor - -const file_ctrl_v1_build_proto_rawDesc = "" + - "\n" + - "\x13ctrl/v1/build.proto\x12\actrl.v1\"\xf6\x01\n" + - "\x12CreateBuildRequest\x12,\n" + - "\x12build_context_path\x18\x01 \x01(\tR\x10buildContextPath\x12,\n" + - "\x0fdockerfile_path\x18\x02 \x01(\tH\x00R\x0edockerfilePath\x88\x01\x01\x12(\n" + - "\x10unkey_project_id\x18\x03 \x01(\tR\x0eunkeyProjectId\x12#\n" + - "\rdeployment_id\x18\x04 \x01(\tR\fdeploymentId\x12!\n" + - "\fworkspace_id\x18\x05 \x01(\tR\vworkspaceIdB\x12\n" + - "\x10_dockerfile_path\"y\n" + - "\x13CreateBuildResponse\x12\x1d\n" + - "\n" + - "image_name\x18\x01 \x01(\tR\timageName\x12\x19\n" + - "\bbuild_id\x18\x02 \x01(\tR\abuildId\x12(\n" + - "\x10depot_project_id\x18\x03 \x01(\tR\x0edepotProjectId\"D\n" + - "\x18GenerateUploadURLRequest\x12(\n" + - "\x10unkey_project_id\x18\x01 \x01(\tR\x0eunkeyProjectId\"\x87\x01\n" + - "\x19GenerateUploadURLResponse\x12\x1d\n" + - "\n" + - "upload_url\x18\x01 \x01(\tR\tuploadUrl\x12,\n" + - "\x12build_context_path\x18\x02 \x01(\tR\x10buildContextPath\x12\x1d\n" + - "\n" + - "expires_in\x18\x03 \x01(\x03R\texpiresIn2\xb8\x01\n" + - "\fBuildService\x12J\n" + - "\vCreateBuild\x12\x1b.ctrl.v1.CreateBuildRequest\x1a\x1c.ctrl.v1.CreateBuildResponse\"\x00\x12\\\n" + - "\x11GenerateUploadURL\x12!.ctrl.v1.GenerateUploadURLRequest\x1a\".ctrl.v1.GenerateUploadURLResponse\"\x00B\x89\x01\n" + - "\vcom.ctrl.v1B\n" + - "BuildProtoP\x01Z1github.com/unkeyed/unkey/gen/proto/ctrl/v1;ctrlv1\xa2\x02\x03CXX\xaa\x02\aCtrl.V1\xca\x02\aCtrl\\V1\xe2\x02\x13Ctrl\\V1\\GPBMetadata\xea\x02\bCtrl::V1b\x06proto3" - -var ( - file_ctrl_v1_build_proto_rawDescOnce sync.Once - file_ctrl_v1_build_proto_rawDescData []byte -) - -func file_ctrl_v1_build_proto_rawDescGZIP() []byte { - file_ctrl_v1_build_proto_rawDescOnce.Do(func() { - file_ctrl_v1_build_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_ctrl_v1_build_proto_rawDesc), len(file_ctrl_v1_build_proto_rawDesc))) - }) - return file_ctrl_v1_build_proto_rawDescData -} - -var file_ctrl_v1_build_proto_msgTypes = make([]protoimpl.MessageInfo, 4) -var file_ctrl_v1_build_proto_goTypes = []any{ - (*CreateBuildRequest)(nil), // 0: ctrl.v1.CreateBuildRequest - (*CreateBuildResponse)(nil), // 1: ctrl.v1.CreateBuildResponse - (*GenerateUploadURLRequest)(nil), // 2: ctrl.v1.GenerateUploadURLRequest - (*GenerateUploadURLResponse)(nil), // 3: ctrl.v1.GenerateUploadURLResponse -} -var file_ctrl_v1_build_proto_depIdxs = []int32{ - 0, // 0: ctrl.v1.BuildService.CreateBuild:input_type -> ctrl.v1.CreateBuildRequest - 2, // 1: ctrl.v1.BuildService.GenerateUploadURL:input_type -> ctrl.v1.GenerateUploadURLRequest - 1, // 2: ctrl.v1.BuildService.CreateBuild:output_type -> ctrl.v1.CreateBuildResponse - 3, // 3: ctrl.v1.BuildService.GenerateUploadURL:output_type -> ctrl.v1.GenerateUploadURLResponse - 2, // [2:4] is the sub-list for method output_type - 0, // [0:2] is the sub-list for method input_type - 0, // [0:0] is the sub-list for extension type_name - 0, // [0:0] is the sub-list for extension extendee - 0, // [0:0] is the sub-list for field type_name -} - -func init() { file_ctrl_v1_build_proto_init() } -func file_ctrl_v1_build_proto_init() { - if File_ctrl_v1_build_proto != nil { - return - } - file_ctrl_v1_build_proto_msgTypes[0].OneofWrappers = []any{} - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: unsafe.Slice(unsafe.StringData(file_ctrl_v1_build_proto_rawDesc), len(file_ctrl_v1_build_proto_rawDesc)), - NumEnums: 0, - NumMessages: 4, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_ctrl_v1_build_proto_goTypes, - DependencyIndexes: file_ctrl_v1_build_proto_depIdxs, - MessageInfos: file_ctrl_v1_build_proto_msgTypes, - }.Build() - File_ctrl_v1_build_proto = out.File - file_ctrl_v1_build_proto_goTypes = nil - file_ctrl_v1_build_proto_depIdxs = nil -} diff --git a/gen/proto/ctrl/v1/ctrlv1connect/BUILD.bazel b/gen/proto/ctrl/v1/ctrlv1connect/BUILD.bazel index d6547de76a..e8be7675ae 100644 --- a/gen/proto/ctrl/v1/ctrlv1connect/BUILD.bazel +++ b/gen/proto/ctrl/v1/ctrlv1connect/BUILD.bazel @@ -4,7 +4,6 @@ go_library( name = "ctrlv1connect", srcs = [ "acme.connect.go", - "build.connect.go", "cluster.connect.go", "deployment.connect.go", "environment.connect.go", diff --git a/gen/proto/ctrl/v1/ctrlv1connect/build.connect.go b/gen/proto/ctrl/v1/ctrlv1connect/build.connect.go deleted file mode 100644 index 9f4cc3ec3f..0000000000 --- a/gen/proto/ctrl/v1/ctrlv1connect/build.connect.go +++ /dev/null @@ -1,138 +0,0 @@ -// Code generated by protoc-gen-connect-go. DO NOT EDIT. -// -// Source: ctrl/v1/build.proto - -package ctrlv1connect - -import ( - connect "connectrpc.com/connect" - context "context" - errors "errors" - v1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - http "net/http" - strings "strings" -) - -// This is a compile-time assertion to ensure that this generated file and the connect package are -// compatible. If you get a compiler error that this constant is not defined, this code was -// generated with a version of connect newer than the one compiled into your binary. You can fix the -// problem by either regenerating this code with an older version of connect or updating the connect -// version compiled into your binary. -const _ = connect.IsAtLeastVersion1_13_0 - -const ( - // BuildServiceName is the fully-qualified name of the BuildService service. - BuildServiceName = "ctrl.v1.BuildService" -) - -// These constants are the fully-qualified names of the RPCs defined in this package. They're -// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. -// -// Note that these are different from the fully-qualified method names used by -// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to -// reflection-formatted method names, remove the leading slash and convert the remaining slash to a -// period. -const ( - // BuildServiceCreateBuildProcedure is the fully-qualified name of the BuildService's CreateBuild - // RPC. - BuildServiceCreateBuildProcedure = "/ctrl.v1.BuildService/CreateBuild" - // BuildServiceGenerateUploadURLProcedure is the fully-qualified name of the BuildService's - // GenerateUploadURL RPC. - BuildServiceGenerateUploadURLProcedure = "/ctrl.v1.BuildService/GenerateUploadURL" -) - -// BuildServiceClient is a client for the ctrl.v1.BuildService service. -type BuildServiceClient interface { - CreateBuild(context.Context, *connect.Request[v1.CreateBuildRequest]) (*connect.Response[v1.CreateBuildResponse], error) - GenerateUploadURL(context.Context, *connect.Request[v1.GenerateUploadURLRequest]) (*connect.Response[v1.GenerateUploadURLResponse], error) -} - -// NewBuildServiceClient constructs a client for the ctrl.v1.BuildService service. By default, it -// uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, and sends -// uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the connect.WithGRPC() or -// connect.WithGRPCWeb() options. -// -// The URL supplied here should be the base URL for the Connect or gRPC server (for example, -// http://api.acme.com or https://acme.com/grpc). -func NewBuildServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) BuildServiceClient { - baseURL = strings.TrimRight(baseURL, "/") - buildServiceMethods := v1.File_ctrl_v1_build_proto.Services().ByName("BuildService").Methods() - return &buildServiceClient{ - createBuild: connect.NewClient[v1.CreateBuildRequest, v1.CreateBuildResponse]( - httpClient, - baseURL+BuildServiceCreateBuildProcedure, - connect.WithSchema(buildServiceMethods.ByName("CreateBuild")), - connect.WithClientOptions(opts...), - ), - generateUploadURL: connect.NewClient[v1.GenerateUploadURLRequest, v1.GenerateUploadURLResponse]( - httpClient, - baseURL+BuildServiceGenerateUploadURLProcedure, - connect.WithSchema(buildServiceMethods.ByName("GenerateUploadURL")), - connect.WithClientOptions(opts...), - ), - } -} - -// buildServiceClient implements BuildServiceClient. -type buildServiceClient struct { - createBuild *connect.Client[v1.CreateBuildRequest, v1.CreateBuildResponse] - generateUploadURL *connect.Client[v1.GenerateUploadURLRequest, v1.GenerateUploadURLResponse] -} - -// CreateBuild calls ctrl.v1.BuildService.CreateBuild. -func (c *buildServiceClient) CreateBuild(ctx context.Context, req *connect.Request[v1.CreateBuildRequest]) (*connect.Response[v1.CreateBuildResponse], error) { - return c.createBuild.CallUnary(ctx, req) -} - -// GenerateUploadURL calls ctrl.v1.BuildService.GenerateUploadURL. -func (c *buildServiceClient) GenerateUploadURL(ctx context.Context, req *connect.Request[v1.GenerateUploadURLRequest]) (*connect.Response[v1.GenerateUploadURLResponse], error) { - return c.generateUploadURL.CallUnary(ctx, req) -} - -// BuildServiceHandler is an implementation of the ctrl.v1.BuildService service. -type BuildServiceHandler interface { - CreateBuild(context.Context, *connect.Request[v1.CreateBuildRequest]) (*connect.Response[v1.CreateBuildResponse], error) - GenerateUploadURL(context.Context, *connect.Request[v1.GenerateUploadURLRequest]) (*connect.Response[v1.GenerateUploadURLResponse], error) -} - -// NewBuildServiceHandler builds an HTTP handler from the service implementation. It returns the -// path on which to mount the handler and the handler itself. -// -// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf -// and JSON codecs. They also support gzip compression. -func NewBuildServiceHandler(svc BuildServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { - buildServiceMethods := v1.File_ctrl_v1_build_proto.Services().ByName("BuildService").Methods() - buildServiceCreateBuildHandler := connect.NewUnaryHandler( - BuildServiceCreateBuildProcedure, - svc.CreateBuild, - connect.WithSchema(buildServiceMethods.ByName("CreateBuild")), - connect.WithHandlerOptions(opts...), - ) - buildServiceGenerateUploadURLHandler := connect.NewUnaryHandler( - BuildServiceGenerateUploadURLProcedure, - svc.GenerateUploadURL, - connect.WithSchema(buildServiceMethods.ByName("GenerateUploadURL")), - connect.WithHandlerOptions(opts...), - ) - return "/ctrl.v1.BuildService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - switch r.URL.Path { - case BuildServiceCreateBuildProcedure: - buildServiceCreateBuildHandler.ServeHTTP(w, r) - case BuildServiceGenerateUploadURLProcedure: - buildServiceGenerateUploadURLHandler.ServeHTTP(w, r) - default: - http.NotFound(w, r) - } - }) -} - -// UnimplementedBuildServiceHandler returns CodeUnimplemented from all methods. -type UnimplementedBuildServiceHandler struct{} - -func (UnimplementedBuildServiceHandler) CreateBuild(context.Context, *connect.Request[v1.CreateBuildRequest]) (*connect.Response[v1.CreateBuildResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.BuildService.CreateBuild is not implemented")) -} - -func (UnimplementedBuildServiceHandler) GenerateUploadURL(context.Context, *connect.Request[v1.GenerateUploadURLRequest]) (*connect.Response[v1.GenerateUploadURLResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.BuildService.GenerateUploadURL is not implemented")) -} diff --git a/gen/proto/ctrl/v1/ctrlv1connect/deployment.connect.go b/gen/proto/ctrl/v1/ctrlv1connect/deployment.connect.go index 090d187c1e..05c7d5c17a 100644 --- a/gen/proto/ctrl/v1/ctrlv1connect/deployment.connect.go +++ b/gen/proto/ctrl/v1/ctrlv1connect/deployment.connect.go @@ -33,6 +33,9 @@ const ( // reflection-formatted method names, remove the leading slash and convert the remaining slash to a // period. const ( + // DeploymentServiceCreateS3UploadURLProcedure is the fully-qualified name of the + // DeploymentService's CreateS3UploadURL RPC. + DeploymentServiceCreateS3UploadURLProcedure = "/ctrl.v1.DeploymentService/CreateS3UploadURL" // DeploymentServiceCreateDeploymentProcedure is the fully-qualified name of the DeploymentService's // CreateDeployment RPC. DeploymentServiceCreateDeploymentProcedure = "/ctrl.v1.DeploymentService/CreateDeployment" @@ -49,6 +52,7 @@ const ( // DeploymentServiceClient is a client for the ctrl.v1.DeploymentService service. type DeploymentServiceClient interface { + CreateS3UploadURL(context.Context, *connect.Request[v1.CreateS3UploadURLRequest]) (*connect.Response[v1.CreateS3UploadURLResponse], error) // Create a new deployment CreateDeployment(context.Context, *connect.Request[v1.CreateDeploymentRequest]) (*connect.Response[v1.CreateDeploymentResponse], error) // Get deployment details @@ -70,6 +74,12 @@ func NewDeploymentServiceClient(httpClient connect.HTTPClient, baseURL string, o baseURL = strings.TrimRight(baseURL, "/") deploymentServiceMethods := v1.File_ctrl_v1_deployment_proto.Services().ByName("DeploymentService").Methods() return &deploymentServiceClient{ + createS3UploadURL: connect.NewClient[v1.CreateS3UploadURLRequest, v1.CreateS3UploadURLResponse]( + httpClient, + baseURL+DeploymentServiceCreateS3UploadURLProcedure, + connect.WithSchema(deploymentServiceMethods.ByName("CreateS3UploadURL")), + connect.WithClientOptions(opts...), + ), createDeployment: connect.NewClient[v1.CreateDeploymentRequest, v1.CreateDeploymentResponse]( httpClient, baseURL+DeploymentServiceCreateDeploymentProcedure, @@ -99,10 +109,16 @@ func NewDeploymentServiceClient(httpClient connect.HTTPClient, baseURL string, o // deploymentServiceClient implements DeploymentServiceClient. type deploymentServiceClient struct { - createDeployment *connect.Client[v1.CreateDeploymentRequest, v1.CreateDeploymentResponse] - getDeployment *connect.Client[v1.GetDeploymentRequest, v1.GetDeploymentResponse] - rollback *connect.Client[v1.RollbackRequest, v1.RollbackResponse] - promote *connect.Client[v1.PromoteRequest, v1.PromoteResponse] + createS3UploadURL *connect.Client[v1.CreateS3UploadURLRequest, v1.CreateS3UploadURLResponse] + createDeployment *connect.Client[v1.CreateDeploymentRequest, v1.CreateDeploymentResponse] + getDeployment *connect.Client[v1.GetDeploymentRequest, v1.GetDeploymentResponse] + rollback *connect.Client[v1.RollbackRequest, v1.RollbackResponse] + promote *connect.Client[v1.PromoteRequest, v1.PromoteResponse] +} + +// CreateS3UploadURL calls ctrl.v1.DeploymentService.CreateS3UploadURL. +func (c *deploymentServiceClient) CreateS3UploadURL(ctx context.Context, req *connect.Request[v1.CreateS3UploadURLRequest]) (*connect.Response[v1.CreateS3UploadURLResponse], error) { + return c.createS3UploadURL.CallUnary(ctx, req) } // CreateDeployment calls ctrl.v1.DeploymentService.CreateDeployment. @@ -127,6 +143,7 @@ func (c *deploymentServiceClient) Promote(ctx context.Context, req *connect.Requ // DeploymentServiceHandler is an implementation of the ctrl.v1.DeploymentService service. type DeploymentServiceHandler interface { + CreateS3UploadURL(context.Context, *connect.Request[v1.CreateS3UploadURLRequest]) (*connect.Response[v1.CreateS3UploadURLResponse], error) // Create a new deployment CreateDeployment(context.Context, *connect.Request[v1.CreateDeploymentRequest]) (*connect.Response[v1.CreateDeploymentResponse], error) // Get deployment details @@ -144,6 +161,12 @@ type DeploymentServiceHandler interface { // and JSON codecs. They also support gzip compression. func NewDeploymentServiceHandler(svc DeploymentServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { deploymentServiceMethods := v1.File_ctrl_v1_deployment_proto.Services().ByName("DeploymentService").Methods() + deploymentServiceCreateS3UploadURLHandler := connect.NewUnaryHandler( + DeploymentServiceCreateS3UploadURLProcedure, + svc.CreateS3UploadURL, + connect.WithSchema(deploymentServiceMethods.ByName("CreateS3UploadURL")), + connect.WithHandlerOptions(opts...), + ) deploymentServiceCreateDeploymentHandler := connect.NewUnaryHandler( DeploymentServiceCreateDeploymentProcedure, svc.CreateDeployment, @@ -170,6 +193,8 @@ func NewDeploymentServiceHandler(svc DeploymentServiceHandler, opts ...connect.H ) return "/ctrl.v1.DeploymentService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { + case DeploymentServiceCreateS3UploadURLProcedure: + deploymentServiceCreateS3UploadURLHandler.ServeHTTP(w, r) case DeploymentServiceCreateDeploymentProcedure: deploymentServiceCreateDeploymentHandler.ServeHTTP(w, r) case DeploymentServiceGetDeploymentProcedure: @@ -187,6 +212,10 @@ func NewDeploymentServiceHandler(svc DeploymentServiceHandler, opts ...connect.H // UnimplementedDeploymentServiceHandler returns CodeUnimplemented from all methods. type UnimplementedDeploymentServiceHandler struct{} +func (UnimplementedDeploymentServiceHandler) CreateS3UploadURL(context.Context, *connect.Request[v1.CreateS3UploadURLRequest]) (*connect.Response[v1.CreateS3UploadURLResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.DeploymentService.CreateS3UploadURL is not implemented")) +} + func (UnimplementedDeploymentServiceHandler) CreateDeployment(context.Context, *connect.Request[v1.CreateDeploymentRequest]) (*connect.Response[v1.CreateDeploymentResponse], error) { return nil, connect.NewError(connect.CodeUnimplemented, errors.New("ctrl.v1.DeploymentService.CreateDeployment is not implemented")) } diff --git a/gen/proto/ctrl/v1/deployment.pb.go b/gen/proto/ctrl/v1/deployment.pb.go index 514a2ca11f..b7d8f2aced 100644 --- a/gen/proto/ctrl/v1/deployment.pb.go +++ b/gen/proto/ctrl/v1/deployment.pb.go @@ -1125,6 +1125,102 @@ func (*PromoteResponse) Descriptor() ([]byte, []int) { return file_ctrl_v1_deployment_proto_rawDescGZIP(), []int{13} } +type CreateS3UploadURLRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + UnkeyProjectId string `protobuf:"bytes,1,opt,name=unkey_project_id,json=unkeyProjectId,proto3" json:"unkey_project_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *CreateS3UploadURLRequest) Reset() { + *x = CreateS3UploadURLRequest{} + mi := &file_ctrl_v1_deployment_proto_msgTypes[14] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CreateS3UploadURLRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CreateS3UploadURLRequest) ProtoMessage() {} + +func (x *CreateS3UploadURLRequest) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_deployment_proto_msgTypes[14] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CreateS3UploadURLRequest.ProtoReflect.Descriptor instead. +func (*CreateS3UploadURLRequest) Descriptor() ([]byte, []int) { + return file_ctrl_v1_deployment_proto_rawDescGZIP(), []int{14} +} + +func (x *CreateS3UploadURLRequest) GetUnkeyProjectId() string { + if x != nil { + return x.UnkeyProjectId + } + return "" +} + +type CreateS3UploadURLResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + UploadUrl string `protobuf:"bytes,1,opt,name=upload_url,json=uploadUrl,proto3" json:"upload_url,omitempty"` // Presigned PUT URL + BuildContextPath string `protobuf:"bytes,2,opt,name=build_context_path,json=buildContextPath,proto3" json:"build_context_path,omitempty"` // S3 key to use in CreateBuild + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *CreateS3UploadURLResponse) Reset() { + *x = CreateS3UploadURLResponse{} + mi := &file_ctrl_v1_deployment_proto_msgTypes[15] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CreateS3UploadURLResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CreateS3UploadURLResponse) ProtoMessage() {} + +func (x *CreateS3UploadURLResponse) ProtoReflect() protoreflect.Message { + mi := &file_ctrl_v1_deployment_proto_msgTypes[15] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CreateS3UploadURLResponse.ProtoReflect.Descriptor instead. +func (*CreateS3UploadURLResponse) Descriptor() ([]byte, []int) { + return file_ctrl_v1_deployment_proto_rawDescGZIP(), []int{15} +} + +func (x *CreateS3UploadURLResponse) GetUploadUrl() string { + if x != nil { + return x.UploadUrl + } + return "" +} + +func (x *CreateS3UploadURLResponse) GetBuildContextPath() string { + if x != nil { + return x.BuildContextPath + } + return "" +} + var File_ctrl_v1_deployment_proto protoreflect.FileDescriptor const file_ctrl_v1_deployment_proto_rawDesc = "" + @@ -1219,7 +1315,13 @@ const file_ctrl_v1_deployment_proto_rawDesc = "" + "\x10RollbackResponse\"B\n" + "\x0ePromoteRequest\x120\n" + "\x14target_deployment_id\x18\x01 \x01(\tR\x12targetDeploymentId\"\x11\n" + - "\x0fPromoteResponse*\xef\x01\n" + + "\x0fPromoteResponse\"D\n" + + "\x18CreateS3UploadURLRequest\x12(\n" + + "\x10unkey_project_id\x18\x01 \x01(\tR\x0eunkeyProjectId\"h\n" + + "\x19CreateS3UploadURLResponse\x12\x1d\n" + + "\n" + + "upload_url\x18\x01 \x01(\tR\tuploadUrl\x12,\n" + + "\x12build_context_path\x18\x02 \x01(\tR\x10buildContextPath*\xef\x01\n" + "\x10DeploymentStatus\x12!\n" + "\x1dDEPLOYMENT_STATUS_UNSPECIFIED\x10\x00\x12\x1d\n" + "\x19DEPLOYMENT_STATUS_PENDING\x10\x01\x12\x1e\n" + @@ -1232,8 +1334,9 @@ const file_ctrl_v1_deployment_proto_rawDesc = "" + "SourceType\x12\x1b\n" + "\x17SOURCE_TYPE_UNSPECIFIED\x10\x00\x12\x13\n" + "\x0fSOURCE_TYPE_GIT\x10\x01\x12\x1a\n" + - "\x16SOURCE_TYPE_CLI_UPLOAD\x10\x022\xc3\x02\n" + - "\x11DeploymentService\x12Y\n" + + "\x16SOURCE_TYPE_CLI_UPLOAD\x10\x022\xa1\x03\n" + + "\x11DeploymentService\x12\\\n" + + "\x11CreateS3UploadURL\x12!.ctrl.v1.CreateS3UploadURLRequest\x1a\".ctrl.v1.CreateS3UploadURLResponse\"\x00\x12Y\n" + "\x10CreateDeployment\x12 .ctrl.v1.CreateDeploymentRequest\x1a!.ctrl.v1.CreateDeploymentResponse\"\x00\x12P\n" + "\rGetDeployment\x12\x1d.ctrl.v1.GetDeploymentRequest\x1a\x1e.ctrl.v1.GetDeploymentResponse\"\x00\x12A\n" + "\bRollback\x12\x18.ctrl.v1.RollbackRequest\x1a\x19.ctrl.v1.RollbackResponse\"\x00\x12>\n" + @@ -1253,25 +1356,27 @@ func file_ctrl_v1_deployment_proto_rawDescGZIP() []byte { } var file_ctrl_v1_deployment_proto_enumTypes = make([]protoimpl.EnumInfo, 2) -var file_ctrl_v1_deployment_proto_msgTypes = make([]protoimpl.MessageInfo, 15) +var file_ctrl_v1_deployment_proto_msgTypes = make([]protoimpl.MessageInfo, 17) var file_ctrl_v1_deployment_proto_goTypes = []any{ - (DeploymentStatus)(0), // 0: ctrl.v1.DeploymentStatus - (SourceType)(0), // 1: ctrl.v1.SourceType - (*CreateDeploymentRequest)(nil), // 2: ctrl.v1.CreateDeploymentRequest - (*BuildContext)(nil), // 3: ctrl.v1.BuildContext - (*GitCommitInfo)(nil), // 4: ctrl.v1.GitCommitInfo - (*CreateDeploymentResponse)(nil), // 5: ctrl.v1.CreateDeploymentResponse - (*GetDeploymentRequest)(nil), // 6: ctrl.v1.GetDeploymentRequest - (*GetDeploymentResponse)(nil), // 7: ctrl.v1.GetDeploymentResponse - (*Deployment)(nil), // 8: ctrl.v1.Deployment - (*DeploymentStep)(nil), // 9: ctrl.v1.DeploymentStep - (*Topology)(nil), // 10: ctrl.v1.Topology - (*RegionalConfig)(nil), // 11: ctrl.v1.RegionalConfig - (*RollbackRequest)(nil), // 12: ctrl.v1.RollbackRequest - (*RollbackResponse)(nil), // 13: ctrl.v1.RollbackResponse - (*PromoteRequest)(nil), // 14: ctrl.v1.PromoteRequest - (*PromoteResponse)(nil), // 15: ctrl.v1.PromoteResponse - nil, // 16: ctrl.v1.Deployment.EnvironmentVariablesEntry + (DeploymentStatus)(0), // 0: ctrl.v1.DeploymentStatus + (SourceType)(0), // 1: ctrl.v1.SourceType + (*CreateDeploymentRequest)(nil), // 2: ctrl.v1.CreateDeploymentRequest + (*BuildContext)(nil), // 3: ctrl.v1.BuildContext + (*GitCommitInfo)(nil), // 4: ctrl.v1.GitCommitInfo + (*CreateDeploymentResponse)(nil), // 5: ctrl.v1.CreateDeploymentResponse + (*GetDeploymentRequest)(nil), // 6: ctrl.v1.GetDeploymentRequest + (*GetDeploymentResponse)(nil), // 7: ctrl.v1.GetDeploymentResponse + (*Deployment)(nil), // 8: ctrl.v1.Deployment + (*DeploymentStep)(nil), // 9: ctrl.v1.DeploymentStep + (*Topology)(nil), // 10: ctrl.v1.Topology + (*RegionalConfig)(nil), // 11: ctrl.v1.RegionalConfig + (*RollbackRequest)(nil), // 12: ctrl.v1.RollbackRequest + (*RollbackResponse)(nil), // 13: ctrl.v1.RollbackResponse + (*PromoteRequest)(nil), // 14: ctrl.v1.PromoteRequest + (*PromoteResponse)(nil), // 15: ctrl.v1.PromoteResponse + (*CreateS3UploadURLRequest)(nil), // 16: ctrl.v1.CreateS3UploadURLRequest + (*CreateS3UploadURLResponse)(nil), // 17: ctrl.v1.CreateS3UploadURLResponse + nil, // 18: ctrl.v1.Deployment.EnvironmentVariablesEntry } var file_ctrl_v1_deployment_proto_depIdxs = []int32{ 3, // 0: ctrl.v1.CreateDeploymentRequest.build_context:type_name -> ctrl.v1.BuildContext @@ -1279,20 +1384,22 @@ var file_ctrl_v1_deployment_proto_depIdxs = []int32{ 0, // 2: ctrl.v1.CreateDeploymentResponse.status:type_name -> ctrl.v1.DeploymentStatus 8, // 3: ctrl.v1.GetDeploymentResponse.deployment:type_name -> ctrl.v1.Deployment 0, // 4: ctrl.v1.Deployment.status:type_name -> ctrl.v1.DeploymentStatus - 16, // 5: ctrl.v1.Deployment.environment_variables:type_name -> ctrl.v1.Deployment.EnvironmentVariablesEntry + 18, // 5: ctrl.v1.Deployment.environment_variables:type_name -> ctrl.v1.Deployment.EnvironmentVariablesEntry 10, // 6: ctrl.v1.Deployment.topology:type_name -> ctrl.v1.Topology 9, // 7: ctrl.v1.Deployment.steps:type_name -> ctrl.v1.DeploymentStep 11, // 8: ctrl.v1.Topology.regions:type_name -> ctrl.v1.RegionalConfig - 2, // 9: ctrl.v1.DeploymentService.CreateDeployment:input_type -> ctrl.v1.CreateDeploymentRequest - 6, // 10: ctrl.v1.DeploymentService.GetDeployment:input_type -> ctrl.v1.GetDeploymentRequest - 12, // 11: ctrl.v1.DeploymentService.Rollback:input_type -> ctrl.v1.RollbackRequest - 14, // 12: ctrl.v1.DeploymentService.Promote:input_type -> ctrl.v1.PromoteRequest - 5, // 13: ctrl.v1.DeploymentService.CreateDeployment:output_type -> ctrl.v1.CreateDeploymentResponse - 7, // 14: ctrl.v1.DeploymentService.GetDeployment:output_type -> ctrl.v1.GetDeploymentResponse - 13, // 15: ctrl.v1.DeploymentService.Rollback:output_type -> ctrl.v1.RollbackResponse - 15, // 16: ctrl.v1.DeploymentService.Promote:output_type -> ctrl.v1.PromoteResponse - 13, // [13:17] is the sub-list for method output_type - 9, // [9:13] is the sub-list for method input_type + 16, // 9: ctrl.v1.DeploymentService.CreateS3UploadURL:input_type -> ctrl.v1.CreateS3UploadURLRequest + 2, // 10: ctrl.v1.DeploymentService.CreateDeployment:input_type -> ctrl.v1.CreateDeploymentRequest + 6, // 11: ctrl.v1.DeploymentService.GetDeployment:input_type -> ctrl.v1.GetDeploymentRequest + 12, // 12: ctrl.v1.DeploymentService.Rollback:input_type -> ctrl.v1.RollbackRequest + 14, // 13: ctrl.v1.DeploymentService.Promote:input_type -> ctrl.v1.PromoteRequest + 17, // 14: ctrl.v1.DeploymentService.CreateS3UploadURL:output_type -> ctrl.v1.CreateS3UploadURLResponse + 5, // 15: ctrl.v1.DeploymentService.CreateDeployment:output_type -> ctrl.v1.CreateDeploymentResponse + 7, // 16: ctrl.v1.DeploymentService.GetDeployment:output_type -> ctrl.v1.GetDeploymentResponse + 13, // 17: ctrl.v1.DeploymentService.Rollback:output_type -> ctrl.v1.RollbackResponse + 15, // 18: ctrl.v1.DeploymentService.Promote:output_type -> ctrl.v1.PromoteResponse + 14, // [14:19] is the sub-list for method output_type + 9, // [9:14] is the sub-list for method input_type 9, // [9:9] is the sub-list for extension type_name 9, // [9:9] is the sub-list for extension extendee 0, // [0:9] is the sub-list for field type_name @@ -1314,7 +1421,7 @@ func file_ctrl_v1_deployment_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_ctrl_v1_deployment_proto_rawDesc), len(file_ctrl_v1_deployment_proto_rawDesc)), NumEnums: 2, - NumMessages: 15, + NumMessages: 17, NumExtensions: 0, NumServices: 1, }, diff --git a/gen/proto/hydra/v1/BUILD.bazel b/gen/proto/hydra/v1/BUILD.bazel index 8496802a6b..c21d70cab6 100644 --- a/gen/proto/hydra/v1/BUILD.bazel +++ b/gen/proto/hydra/v1/BUILD.bazel @@ -3,6 +3,8 @@ load("@rules_go//go:def.bzl", "go_library") go_library( name = "hydra", srcs = [ + "build.pb.go", + "build_restate.pb.go", "certificate.pb.go", "certificate_restate.pb.go", "deployment.pb.go", diff --git a/gen/proto/hydra/v1/build.pb.go b/gen/proto/hydra/v1/build.pb.go new file mode 100644 index 0000000000..10f181642f --- /dev/null +++ b/gen/proto/hydra/v1/build.pb.go @@ -0,0 +1,241 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.8 +// protoc (unknown) +// source: hydra/v1/build.proto + +package hydrav1 + +import ( + _ "github.com/restatedev/sdk-go/generated/dev/restate/sdk" + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type BuildDockerImageRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + S3Url string `protobuf:"bytes,1,opt,name=s3_url,json=s3Url,proto3" json:"s3_url,omitempty"` + BuildContextPath string `protobuf:"bytes,2,opt,name=build_context_path,json=buildContextPath,proto3" json:"build_context_path,omitempty"` + DockerfilePath string `protobuf:"bytes,3,opt,name=dockerfile_path,json=dockerfilePath,proto3" json:"dockerfile_path,omitempty"` + ProjectId string `protobuf:"bytes,4,opt,name=project_id,json=projectId,proto3" json:"project_id,omitempty"` + DeploymentId string `protobuf:"bytes,5,opt,name=deployment_id,json=deploymentId,proto3" json:"deployment_id,omitempty"` + WorkspaceId string `protobuf:"bytes,6,opt,name=workspace_id,json=workspaceId,proto3" json:"workspace_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *BuildDockerImageRequest) Reset() { + *x = BuildDockerImageRequest{} + mi := &file_hydra_v1_build_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *BuildDockerImageRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*BuildDockerImageRequest) ProtoMessage() {} + +func (x *BuildDockerImageRequest) ProtoReflect() protoreflect.Message { + mi := &file_hydra_v1_build_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use BuildDockerImageRequest.ProtoReflect.Descriptor instead. +func (*BuildDockerImageRequest) Descriptor() ([]byte, []int) { + return file_hydra_v1_build_proto_rawDescGZIP(), []int{0} +} + +func (x *BuildDockerImageRequest) GetS3Url() string { + if x != nil { + return x.S3Url + } + return "" +} + +func (x *BuildDockerImageRequest) GetBuildContextPath() string { + if x != nil { + return x.BuildContextPath + } + return "" +} + +func (x *BuildDockerImageRequest) GetDockerfilePath() string { + if x != nil { + return x.DockerfilePath + } + return "" +} + +func (x *BuildDockerImageRequest) GetProjectId() string { + if x != nil { + return x.ProjectId + } + return "" +} + +func (x *BuildDockerImageRequest) GetDeploymentId() string { + if x != nil { + return x.DeploymentId + } + return "" +} + +func (x *BuildDockerImageRequest) GetWorkspaceId() string { + if x != nil { + return x.WorkspaceId + } + return "" +} + +type BuildDockerImageResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + DepotProjectId string `protobuf:"bytes,1,opt,name=depot_project_id,json=depotProjectId,proto3" json:"depot_project_id,omitempty"` + DepotBuildId string `protobuf:"bytes,2,opt,name=depot_build_id,json=depotBuildId,proto3" json:"depot_build_id,omitempty"` + ImageName string `protobuf:"bytes,3,opt,name=image_name,json=imageName,proto3" json:"image_name,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *BuildDockerImageResponse) Reset() { + *x = BuildDockerImageResponse{} + mi := &file_hydra_v1_build_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *BuildDockerImageResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*BuildDockerImageResponse) ProtoMessage() {} + +func (x *BuildDockerImageResponse) ProtoReflect() protoreflect.Message { + mi := &file_hydra_v1_build_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use BuildDockerImageResponse.ProtoReflect.Descriptor instead. +func (*BuildDockerImageResponse) Descriptor() ([]byte, []int) { + return file_hydra_v1_build_proto_rawDescGZIP(), []int{1} +} + +func (x *BuildDockerImageResponse) GetDepotProjectId() string { + if x != nil { + return x.DepotProjectId + } + return "" +} + +func (x *BuildDockerImageResponse) GetDepotBuildId() string { + if x != nil { + return x.DepotBuildId + } + return "" +} + +func (x *BuildDockerImageResponse) GetImageName() string { + if x != nil { + return x.ImageName + } + return "" +} + +var File_hydra_v1_build_proto protoreflect.FileDescriptor + +const file_hydra_v1_build_proto_rawDesc = "" + + "\n" + + "\x14hydra/v1/build.proto\x12\bhydra.v1\x1a\x18dev/restate/sdk/go.proto\"\xee\x01\n" + + "\x17BuildDockerImageRequest\x12\x15\n" + + "\x06s3_url\x18\x01 \x01(\tR\x05s3Url\x12,\n" + + "\x12build_context_path\x18\x02 \x01(\tR\x10buildContextPath\x12'\n" + + "\x0fdockerfile_path\x18\x03 \x01(\tR\x0edockerfilePath\x12\x1d\n" + + "\n" + + "project_id\x18\x04 \x01(\tR\tprojectId\x12#\n" + + "\rdeployment_id\x18\x05 \x01(\tR\fdeploymentId\x12!\n" + + "\fworkspace_id\x18\x06 \x01(\tR\vworkspaceId\"\x89\x01\n" + + "\x18BuildDockerImageResponse\x12(\n" + + "\x10depot_project_id\x18\x01 \x01(\tR\x0edepotProjectId\x12$\n" + + "\x0edepot_build_id\x18\x02 \x01(\tR\fdepotBuildId\x12\x1d\n" + + "\n" + + "image_name\x18\x03 \x01(\tR\timageName2q\n" + + "\fBuildService\x12[\n" + + "\x10BuildDockerImage\x12!.hydra.v1.BuildDockerImageRequest\x1a\".hydra.v1.BuildDockerImageResponse\"\x00\x1a\x04\x98\x80\x01\x00B\x90\x01\n" + + "\fcom.hydra.v1B\n" + + "BuildProtoP\x01Z3github.com/unkeyed/unkey/gen/proto/hydra/v1;hydrav1\xa2\x02\x03HXX\xaa\x02\bHydra.V1\xca\x02\bHydra\\V1\xe2\x02\x14Hydra\\V1\\GPBMetadata\xea\x02\tHydra::V1b\x06proto3" + +var ( + file_hydra_v1_build_proto_rawDescOnce sync.Once + file_hydra_v1_build_proto_rawDescData []byte +) + +func file_hydra_v1_build_proto_rawDescGZIP() []byte { + file_hydra_v1_build_proto_rawDescOnce.Do(func() { + file_hydra_v1_build_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_hydra_v1_build_proto_rawDesc), len(file_hydra_v1_build_proto_rawDesc))) + }) + return file_hydra_v1_build_proto_rawDescData +} + +var file_hydra_v1_build_proto_msgTypes = make([]protoimpl.MessageInfo, 2) +var file_hydra_v1_build_proto_goTypes = []any{ + (*BuildDockerImageRequest)(nil), // 0: hydra.v1.BuildDockerImageRequest + (*BuildDockerImageResponse)(nil), // 1: hydra.v1.BuildDockerImageResponse +} +var file_hydra_v1_build_proto_depIdxs = []int32{ + 0, // 0: hydra.v1.BuildService.BuildDockerImage:input_type -> hydra.v1.BuildDockerImageRequest + 1, // 1: hydra.v1.BuildService.BuildDockerImage:output_type -> hydra.v1.BuildDockerImageResponse + 1, // [1:2] is the sub-list for method output_type + 0, // [0:1] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_hydra_v1_build_proto_init() } +func file_hydra_v1_build_proto_init() { + if File_hydra_v1_build_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_hydra_v1_build_proto_rawDesc), len(file_hydra_v1_build_proto_rawDesc)), + NumEnums: 0, + NumMessages: 2, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_hydra_v1_build_proto_goTypes, + DependencyIndexes: file_hydra_v1_build_proto_depIdxs, + MessageInfos: file_hydra_v1_build_proto_msgTypes, + }.Build() + File_hydra_v1_build_proto = out.File + file_hydra_v1_build_proto_goTypes = nil + file_hydra_v1_build_proto_depIdxs = nil +} diff --git a/gen/proto/hydra/v1/build_restate.pb.go b/gen/proto/hydra/v1/build_restate.pb.go new file mode 100644 index 0000000000..56f6a2a9dc --- /dev/null +++ b/gen/proto/hydra/v1/build_restate.pb.go @@ -0,0 +1,103 @@ +// Code generated by protoc-gen-go-restate. DO NOT EDIT. +// versions: +// - protoc-gen-go-restate v0.1 +// - protoc (unknown) +// source: hydra/v1/build.proto + +package hydrav1 + +import ( + fmt "fmt" + sdk_go "github.com/restatedev/sdk-go" + encoding "github.com/restatedev/sdk-go/encoding" + ingress "github.com/restatedev/sdk-go/ingress" +) + +// BuildServiceClient is the client API for hydra.v1.BuildService service. +type BuildServiceClient interface { + BuildDockerImage(opts ...sdk_go.ClientOption) sdk_go.Client[*BuildDockerImageRequest, *BuildDockerImageResponse] +} + +type buildServiceClient struct { + ctx sdk_go.Context + options []sdk_go.ClientOption +} + +func NewBuildServiceClient(ctx sdk_go.Context, opts ...sdk_go.ClientOption) BuildServiceClient { + cOpts := append([]sdk_go.ClientOption{sdk_go.WithProtoJSON}, opts...) + return &buildServiceClient{ + ctx, + cOpts, + } +} +func (c *buildServiceClient) BuildDockerImage(opts ...sdk_go.ClientOption) sdk_go.Client[*BuildDockerImageRequest, *BuildDockerImageResponse] { + cOpts := c.options + if len(opts) > 0 { + cOpts = append(append([]sdk_go.ClientOption{}, cOpts...), opts...) + } + return sdk_go.WithRequestType[*BuildDockerImageRequest](sdk_go.Service[*BuildDockerImageResponse](c.ctx, "hydra.v1.BuildService", "BuildDockerImage", cOpts...)) +} + +// BuildServiceIngressClient is the ingress client API for hydra.v1.BuildService service. +// +// This client is used to call the service from outside of a Restate context. +type BuildServiceIngressClient interface { + BuildDockerImage() ingress.Requester[*BuildDockerImageRequest, *BuildDockerImageResponse] +} + +type buildServiceIngressClient struct { + client *ingress.Client + serviceName string +} + +func NewBuildServiceIngressClient(client *ingress.Client) BuildServiceIngressClient { + return &buildServiceIngressClient{ + client, + "hydra.v1.BuildService", + } +} + +func (c *buildServiceIngressClient) BuildDockerImage() ingress.Requester[*BuildDockerImageRequest, *BuildDockerImageResponse] { + codec := encoding.ProtoJSONCodec + return ingress.NewRequester[*BuildDockerImageRequest, *BuildDockerImageResponse](c.client, c.serviceName, "BuildDockerImage", nil, &codec) +} + +// BuildServiceServer is the server API for hydra.v1.BuildService service. +// All implementations should embed UnimplementedBuildServiceServer +// for forward compatibility. +type BuildServiceServer interface { + BuildDockerImage(ctx sdk_go.Context, req *BuildDockerImageRequest) (*BuildDockerImageResponse, error) +} + +// UnimplementedBuildServiceServer should be embedded to have +// forward compatible implementations. +// +// NOTE: this should be embedded by value instead of pointer to avoid a nil +// pointer dereference when methods are called. +type UnimplementedBuildServiceServer struct{} + +func (UnimplementedBuildServiceServer) BuildDockerImage(ctx sdk_go.Context, req *BuildDockerImageRequest) (*BuildDockerImageResponse, error) { + return nil, sdk_go.TerminalError(fmt.Errorf("method BuildDockerImage not implemented"), 501) +} +func (UnimplementedBuildServiceServer) testEmbeddedByValue() {} + +// UnsafeBuildServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to BuildServiceServer will +// result in compilation errors. +type UnsafeBuildServiceServer interface { + mustEmbedUnimplementedBuildServiceServer() +} + +func NewBuildServiceServer(srv BuildServiceServer, opts ...sdk_go.ServiceDefinitionOption) sdk_go.ServiceDefinition { + // If the following call panics, it indicates UnimplementedBuildServiceServer was + // embedded by pointer and is nil. This will cause panics if an + // unimplemented method is ever invoked, so we test this at initialization + // time to prevent it from happening at runtime later due to I/O. + if t, ok := srv.(interface{ testEmbeddedByValue() }); ok { + t.testEmbeddedByValue() + } + sOpts := append([]sdk_go.ServiceDefinitionOption{sdk_go.WithProtoJSON}, opts...) + router := sdk_go.NewService("hydra.v1.BuildService", sOpts...) + router = router.Handler("BuildDockerImage", sdk_go.NewServiceHandler(srv.BuildDockerImage)) + return router +} diff --git a/svc/api/internal/testutil/http.go b/svc/api/internal/testutil/http.go index 9c741cfaae..8178949a85 100644 --- a/svc/api/internal/testutil/http.go +++ b/svc/api/internal/testutil/http.go @@ -61,7 +61,6 @@ type Harness struct { Vault *vault.Service AnalyticsConnectionManager analytics.ConnectionManager CtrlDeploymentClient ctrlv1connect.DeploymentServiceClient - CtrlBuildClient ctrlv1connect.BuildServiceClient seeder *seed.Seeder } @@ -199,14 +198,6 @@ func NewHarness(t *testing.T) *Harness { })), ) - ctrlBuildClient := ctrlv1connect.NewBuildServiceClient( - http.DefaultClient, - ctrlURL, - connect.WithInterceptors(interceptor.NewHeaderInjector(map[string]string{ - "Authorization": fmt.Sprintf("Bearer %s", ctrlToken), - })), - ) - audit, err := auditlogs.New(auditlogs.Config{ DB: db, Logger: logger, @@ -228,7 +219,6 @@ func NewHarness(t *testing.T) *Harness { Clock: clk, AnalyticsConnectionManager: analyticsConnManager, CtrlDeploymentClient: ctrlDeploymentClient, - CtrlBuildClient: ctrlBuildClient, Auditlogs: audit, Caches: caches, middleware: []zen.Middleware{ diff --git a/svc/api/routes/register.go b/svc/api/routes/register.go index 8efcc112b9..28533e62fd 100644 --- a/svc/api/routes/register.go +++ b/svc/api/routes/register.go @@ -326,40 +326,38 @@ func Register(srv *zen.Server, svc *Services, info zen.InstanceInfo) { // --------------------------------------------------------------------------- // v2/deploy - if svc.CtrlBuildClient != nil { - // v2/deploy.createDeployment - srv.RegisterRoute( - defaultMiddlewares, - &v2DeployCreateDeployment.Handler{ - Logger: svc.Logger, - DB: svc.Database, - Keys: svc.Keys, - CtrlClient: svc.CtrlDeploymentClient, - }, - ) - - // v2/deploy.getDeployment - srv.RegisterRoute( - defaultMiddlewares, - &v2DeployGetDeployment.Handler{ - Logger: svc.Logger, - DB: svc.Database, - Keys: svc.Keys, - CtrlClient: svc.CtrlDeploymentClient, - }, - ) - - // v2/deploy.generateUploadUrl - srv.RegisterRoute( - defaultMiddlewares, - &v2DeployGenerateUploadUrl.Handler{ - Logger: svc.Logger, - DB: svc.Database, - Keys: svc.Keys, - CtrlClient: svc.CtrlBuildClient, - }, - ) - } + // v2/deploy.createDeployment + srv.RegisterRoute( + defaultMiddlewares, + &v2DeployCreateDeployment.Handler{ + Logger: svc.Logger, + DB: svc.Database, + Keys: svc.Keys, + CtrlClient: svc.CtrlDeploymentClient, + }, + ) + + // v2/deploy.getDeployment + srv.RegisterRoute( + defaultMiddlewares, + &v2DeployGetDeployment.Handler{ + Logger: svc.Logger, + DB: svc.Database, + Keys: svc.Keys, + CtrlClient: svc.CtrlDeploymentClient, + }, + ) + + // v2/deploy.generateUploadUrl + srv.RegisterRoute( + defaultMiddlewares, + &v2DeployGenerateUploadUrl.Handler{ + Logger: svc.Logger, + DB: svc.Database, + Keys: svc.Keys, + CtrlClient: svc.CtrlDeploymentClient, + }, + ) // --------------------------------------------------------------------------- // v2/permissions diff --git a/svc/api/routes/services.go b/svc/api/routes/services.go index 6de3bd9364..e728b19410 100644 --- a/svc/api/routes/services.go +++ b/svc/api/routes/services.go @@ -27,7 +27,6 @@ type Services struct { Vault *vault.Service ChproxyToken string CtrlDeploymentClient ctrlv1connect.DeploymentServiceClient - CtrlBuildClient ctrlv1connect.BuildServiceClient PprofEnabled bool PprofUsername string PprofPassword string diff --git a/svc/api/routes/v2_deploy_generate_upload_url/200_test.go b/svc/api/routes/v2_deploy_generate_upload_url/200_test.go index 6612bc6261..cee6bfe6fa 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/200_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/200_test.go @@ -17,7 +17,7 @@ func TestGenerateUploadUrlSuccessfully(t *testing.T) { Logger: h.Logger, DB: h.DB, Keys: h.Keys, - CtrlClient: h.CtrlBuildClient, + CtrlClient: h.CtrlDeploymentClient, } h.Register(route) @@ -58,7 +58,7 @@ func TestGenerateUploadUrlWithWildcardPermission(t *testing.T) { Logger: h.Logger, DB: h.DB, Keys: h.Keys, - CtrlClient: h.CtrlBuildClient, + CtrlClient: h.CtrlDeploymentClient, } h.Register(route) @@ -89,7 +89,7 @@ func TestGenerateUploadUrlWithSpecificProjectPermission(t *testing.T) { Logger: h.Logger, DB: h.DB, Keys: h.Keys, - CtrlClient: h.CtrlBuildClient, + CtrlClient: h.CtrlDeploymentClient, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/400_test.go b/svc/api/routes/v2_deploy_generate_upload_url/400_test.go index cf80aa02dc..b970fe12fd 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/400_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/400_test.go @@ -18,7 +18,7 @@ func TestBadRequests(t *testing.T) { Logger: h.Logger, DB: h.DB, Keys: h.Keys, - CtrlClient: h.CtrlBuildClient, + CtrlClient: h.CtrlDeploymentClient, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/401_test.go b/svc/api/routes/v2_deploy_generate_upload_url/401_test.go index 48257786a1..454c7f2ede 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/401_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/401_test.go @@ -16,7 +16,7 @@ func TestUnauthorizedAccess(t *testing.T) { Logger: h.Logger, DB: h.DB, Keys: h.Keys, - CtrlClient: h.CtrlBuildClient, + CtrlClient: h.CtrlDeploymentClient, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/403_test.go b/svc/api/routes/v2_deploy_generate_upload_url/403_test.go index 5089010be8..d0bb73b948 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/403_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/403_test.go @@ -20,7 +20,7 @@ func TestGenerateUploadUrlInsufficientPermissions(t *testing.T) { Logger: h.Logger, DB: h.DB, Keys: h.Keys, - CtrlClient: h.CtrlBuildClient, + CtrlClient: h.CtrlDeploymentClient, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/404_test.go b/svc/api/routes/v2_deploy_generate_upload_url/404_test.go index f61df9a248..4b58a6b720 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/404_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/404_test.go @@ -19,7 +19,7 @@ func TestNotFound(t *testing.T) { Logger: h.Logger, DB: h.DB, Keys: h.Keys, - CtrlClient: h.CtrlBuildClient, + CtrlClient: h.CtrlDeploymentClient, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/handler.go b/svc/api/routes/v2_deploy_generate_upload_url/handler.go index 06ada9dc77..f6955ed4f7 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/handler.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/handler.go @@ -27,7 +27,7 @@ type Handler struct { Logger logging.Logger DB db.Database Keys keys.KeyService - CtrlClient ctrlv1connect.BuildServiceClient + CtrlClient ctrlv1connect.DeploymentServiceClient } func (h *Handler) Path() string { @@ -86,13 +86,9 @@ func (h *Handler) Handle(ctx context.Context, s *zen.Session) error { ) } - ctrlReq := &ctrlv1.GenerateUploadURLRequest{ + ctrlResp, err := h.CtrlClient.CreateS3UploadURL(ctx, connect.NewRequest(&ctrlv1.CreateS3UploadURLRequest{ UnkeyProjectId: req.ProjectId, - } - - connectReq := connect.NewRequest(ctrlReq) - - ctrlResp, err := h.CtrlClient.GenerateUploadURL(ctx, connectReq) + })) if err != nil { return ctrlclient.HandleError(err, "generate upload URL") } diff --git a/svc/api/run.go b/svc/api/run.go index 8fcb528778..46c2235090 100644 --- a/svc/api/run.go +++ b/svc/api/run.go @@ -305,27 +305,15 @@ func Run(ctx context.Context, cfg Config) error { } // Initialize CTRL deployment client using bufconnect - var ctrlDeploymentClient ctrlv1connect.DeploymentServiceClient - var ctrlBuildClient ctrlv1connect.BuildServiceClient - if cfg.CtrlURL != "" { - ctrlDeploymentClient = ctrlv1connect.NewDeploymentServiceClient( - &http.Client{}, - cfg.CtrlURL, - connect.WithInterceptors(interceptor.NewHeaderInjector(map[string]string{ - "Authorization": fmt.Sprintf("Bearer %s", cfg.CtrlToken), - })), - ) - ctrlBuildClient = ctrlv1connect.NewBuildServiceClient( - &http.Client{}, - cfg.CtrlURL, - connect.WithInterceptors(interceptor.NewHeaderInjector(map[string]string{ - "Authorization": fmt.Sprintf("Bearer %s", cfg.CtrlToken), - })), - ) - logger.Info("CTRL clients initialized", "url", cfg.CtrlURL) - } else { - logger.Warn("CTRL URL not configured, deployment and build endpoints will be unavailable") - } + ctrlDeploymentClient := ctrlv1connect.NewDeploymentServiceClient( + &http.Client{}, + cfg.CtrlURL, + connect.WithInterceptors(interceptor.NewHeaderInjector(map[string]string{ + "Authorization": fmt.Sprintf("Bearer %s", cfg.CtrlToken), + })), + ) + + logger.Info("CTRL clients initialized", "url", cfg.CtrlURL) routes.Register(srv, &routes.Services{ Logger: logger, @@ -339,7 +327,6 @@ func Run(ctx context.Context, cfg Config) error { Vault: vaultSvc, ChproxyToken: cfg.ChproxyToken, CtrlDeploymentClient: ctrlDeploymentClient, - CtrlBuildClient: ctrlBuildClient, PprofEnabled: cfg.PprofEnabled, PprofUsername: cfg.PprofUsername, PprofPassword: cfg.PprofPassword, diff --git a/svc/ctrl/BUILD.bazel b/svc/ctrl/BUILD.bazel index 0bdea90254..6e048c8d83 100644 --- a/svc/ctrl/BUILD.bazel +++ b/svc/ctrl/BUILD.bazel @@ -2,37 +2,7 @@ load("@rules_go//go:def.bzl", "go_library") go_library( name = "ctrl", - srcs = [ - "config.go", - "doc.go", - "run.go", - ], + srcs = ["doc.go"], importpath = "github.com/unkeyed/unkey/svc/ctrl", visibility = ["//visibility:public"], - deps = [ - "//gen/proto/ctrl/v1/ctrlv1connect", - "//pkg/assert", - "//pkg/cache", - "//pkg/clickhouse", - "//pkg/clock", - "//pkg/db", - "//pkg/otel", - "//pkg/otel/logging", - "//pkg/prometheus", - "//pkg/shutdown", - "//pkg/tls", - "//pkg/version", - "//svc/ctrl/services/acme", - "//svc/ctrl/services/build/backend/depot", - "//svc/ctrl/services/build/backend/docker", - "//svc/ctrl/services/build/storage", - "//svc/ctrl/services/cluster", - "//svc/ctrl/services/ctrl", - "//svc/ctrl/services/deployment", - "//svc/ctrl/services/openapi", - "@com_github_restatedev_sdk_go//:sdk-go", - "@com_github_restatedev_sdk_go//ingress", - "@org_golang_x_net//http2", - "@org_golang_x_net//http2/h2c", - ], ) diff --git a/svc/ctrl/api/BUILD.bazel b/svc/ctrl/api/BUILD.bazel new file mode 100644 index 0000000000..8c5bf93094 --- /dev/null +++ b/svc/ctrl/api/BUILD.bazel @@ -0,0 +1,33 @@ +load("@rules_go//go:def.bzl", "go_library") + +go_library( + name = "api", + srcs = [ + "config.go", + "run.go", + ], + importpath = "github.com/unkeyed/unkey/svc/ctrl/api", + visibility = ["//visibility:public"], + deps = [ + "//gen/proto/ctrl/v1/ctrlv1connect", + "//pkg/cache", + "//pkg/clock", + "//pkg/db", + "//pkg/otel", + "//pkg/otel/logging", + "//pkg/prometheus", + "//pkg/shutdown", + "//pkg/tls", + "//pkg/version", + "//svc/ctrl/pkg/s3", + "//svc/ctrl/services/acme", + "//svc/ctrl/services/cluster", + "//svc/ctrl/services/ctrl", + "//svc/ctrl/services/deployment", + "//svc/ctrl/services/openapi", + "@com_github_restatedev_sdk_go//:sdk-go", + "@com_github_restatedev_sdk_go//ingress", + "@org_golang_x_net//http2", + "@org_golang_x_net//http2/h2c", + ], +) diff --git a/svc/ctrl/config.go b/svc/ctrl/api/config.go similarity index 52% rename from svc/ctrl/config.go rename to svc/ctrl/api/config.go index 11be309a3f..925539cbd7 100644 --- a/svc/ctrl/config.go +++ b/svc/ctrl/api/config.go @@ -1,10 +1,6 @@ -package ctrl +package api import ( - "fmt" - "strings" - - "github.com/unkeyed/unkey/pkg/assert" "github.com/unkeyed/unkey/pkg/clock" "github.com/unkeyed/unkey/pkg/tls" ) @@ -159,6 +155,11 @@ type RegistryConfig struct { Password string } +type VaultConfig struct { + Url string + Token string +} + // Config holds configuration for the control plane server. // // This comprehensive configuration structure defines all aspects of control plane @@ -169,15 +170,12 @@ type Config struct { // Used for logging, tracing, and cluster coordination. InstanceID string - // Platform identifies the cloud platform where the node is running. - // Affects integration with cloud-specific services and monitoring. - // Examples: "aws", "gcp", "hetzner". - Platform string - // Image specifies the container image identifier including repository and tag. // Used for control plane deployment and sentinel image configuration. Image string + Region string + // HttpPort defines the HTTP port for the control plane server. // Default: 8080. Cannot be 0. HttpPort int @@ -187,22 +185,6 @@ type Config struct { // on all interfaces (0.0.0.0) on the specified port. PrometheusPort int - // Region identifies the geographic region where this node is deployed. - // Used for observability, compliance, and service routing. - Region string - - // RegistryURL is the container registry URL for pulling images. - // Example: "registry.depot.dev" or "https://registry.example.com". - RegistryURL string - - // RegistryUsername is the username for container registry authentication. - // Common values: "x-token" for token-based auth or actual username. - RegistryUsername string - - // RegistryPassword is the password/token for container registry authentication. - // Should be stored securely (environment variable or secret management). - RegistryPassword string - // --- Database configuration --- // DatabasePrimary is the primary database connection string. @@ -227,75 +209,23 @@ type Config struct { // Used by clients and services to authenticate with this control plane. AuthToken string - // SPIFFESocketPath is the path to the SPIFFE agent socket. - // Enables mTLS authentication with SPIFFE-based identity system. - SPIFFESocketPath string - // Clock provides time operations for testing and scheduling. // Use clock.RealClock{} for production deployments. Clock clock.Clock // --- Vault Configuration --- - // VaultMasterKeys are encryption keys for the general vault service. - // Used for encrypting/decrypting environment variables, API keys, etc. - VaultMasterKeys []string - - // VaultS3 configures S3 storage for the general vault. - // Stores encrypted secrets data with the provided master keys. - VaultS3 S3Config - - // AcmeVaultMasterKeys are encryption keys for the ACME vault service. - // Separate vault for TLS certificate storage and ACME account data. - AcmeVaultMasterKeys []string - - // AcmeVaultS3 configures S3 storage for the ACME vault. - // Stores encrypted TLS certificates and ACME challenge data. - AcmeVaultS3 S3Config - + Vault VaultConfig // --- ACME Configuration --- - // Acme configures automatic TLS certificate management. - // Enables Let's Encrypt integration for domain certificates. - Acme AcmeConfig - - // DefaultDomain is the fallback domain for system operations. - // Used for sentinel deployment and automatic certificate bootstrapping. - DefaultDomain string - - // RegionalApexDomain is the base domain for cross-region frontline communication. - // Certs are provisioned for *.{region}.{RegionalApexDomain} for each available region. - // Example: "unkey.cloud" results in certs for "*.us-west-2.aws.unkey.cloud", etc. - RegionalApexDomain string - // Restate configures workflow engine integration. // Enables asynchronous deployment and certificate renewal workflows. Restate RestateConfig - // --- Build Configuration --- - - // BuildBackend selects the container build system. - // Options: BuildBackendDepot or BuildBackendDocker. - BuildBackend BuildBackend - // BuildS3 configures storage for build artifacts and outputs. // Used by both Depot and Docker build backends. BuildS3 S3Config - // BuildPlatform defines the target architecture for container builds. - // Format: "linux/amd64", "linux/arm64". Only "linux" OS supported. - BuildPlatform string - - // Depot configures Depot.dev build service integration. - // Required when using BuildBackendDepot. - Depot DepotConfig - - // --- Analytics Configuration --- - - // ClickhouseURL is the ClickHouse database connection string. - // Used for analytics and operational metrics storage. - ClickhouseURL string - // --- Sentinel Configuration --- // SentinelImage is the container image used for new sentinel deployments. @@ -307,83 +237,6 @@ type Config struct { AvailableRegions []string } -// BuildPlatform represents parsed container build platform specification. -// -// Contains the validated platform string separated into OS and architecture -// components for build backend integration. -type BuildPlatform struct { - // Platform is the original build platform string. - // Example: "linux/amd64". - Platform string - - // Architecture is the CPU architecture component. - // Example: "amd64", "arm64". - Architecture string -} - -// parseBuildPlatform validates and parses a build platform string. -// -// This function validates that the build platform follows the expected -// format "linux/{architecture}" and parses it into components. -// Only "linux" OS is currently supported. -// -// Returns BuildPlatform with parsed components or error if format is invalid -// or OS is not supported. -func parseBuildPlatform(buildPlatform string) (BuildPlatform, error) { - buildPlatform = strings.TrimPrefix(buildPlatform, "/") - parts := strings.Split(buildPlatform, "/") - - if err := assert.All( - assert.Equal(len(parts), 2, fmt.Sprintf("invalid build platform format: %s (expected format: linux/amd64)", buildPlatform)), - assert.Equal(parts[0], "linux", fmt.Sprintf("unsupported OS: %s (only linux is supported)", parts[0])), - ); err != nil { - return BuildPlatform{}, err - } - - return BuildPlatform{ - Platform: buildPlatform, - Architecture: parts[1], - }, nil -} - -// GetBuildPlatform returns the parsed build platform. -// -// This method returns the parsed BuildPlatform from the configured -// BuildPlatform string. Should only be called after Validate() succeeds -// to ensure the platform string is valid. -// -// Returns BuildPlatform with parsed platform and architecture components. -func (c Config) GetBuildPlatform() BuildPlatform { - parsed, _ := parseBuildPlatform(c.BuildPlatform) - return parsed -} - -// GetRegistryConfig returns the registry configuration. -// -// This method builds a RegistryConfig from the individual registry -// settings in the main Config struct. Should only be called after -// Validate() succeeds to ensure all required fields are present. -// -// Returns RegistryConfig with URL, username, and password for container registry access. -func (c Config) GetRegistryConfig() RegistryConfig { - return RegistryConfig{ - URL: c.RegistryURL, - Username: c.RegistryUsername, - Password: c.RegistryPassword, - } -} - -// GetDepotConfig returns the depot configuration. -// -// This method returns the DepotConfig from the main Config struct. -// Should only be called after Validate() succeeds to ensure -// depot configuration is complete and valid. -// -// Returns the DepotConfig containing API URL and project region. -func (c Config) GetDepotConfig() DepotConfig { - return c.Depot -} - // Validate checks the configuration for required fields and logical consistency. // // This method performs comprehensive validation of all configuration sections @@ -394,55 +247,5 @@ func (c Config) GetDepotConfig() DepotConfig { // Returns an error if required fields are missing, invalid, or inconsistent. // Provides detailed error messages to help identify configuration issues. func (c Config) Validate() error { - // Validate Route53 configuration if enabled - if c.Acme.Enabled && c.Acme.Route53.Enabled { - if err := assert.All( - assert.NotEmpty(c.Acme.Route53.AccessKeyID, "route53 access key ID is required when route53 is enabled"), - assert.NotEmpty(c.Acme.Route53.SecretAccessKey, "route53 secret access key is required when route53 is enabled"), - assert.NotEmpty(c.Acme.Route53.Region, "route53 region is required when route53 is enabled"), - ); err != nil { - return err - } - } - - if err := assert.NotEmpty(c.ClickhouseURL, "ClickhouseURL is required"); err != nil { - return err - } - - // Validate build platform format - _, platformErr := parseBuildPlatform(c.BuildPlatform) - - // Validate registry configuration - registryErr := assert.All( - assert.NotEmpty(c.RegistryURL, "registry URL is required"), - assert.NotEmpty(c.RegistryUsername, "registry username is required"), - assert.NotEmpty(c.RegistryPassword, "registry password is required"), - ) - - switch c.BuildBackend { - case BuildBackendDepot: - return assert.All( - platformErr, - registryErr, - assert.NotEmpty(c.BuildPlatform, "build platform is required"), - assert.NotEmpty(c.BuildS3.URL, "build S3 URL is required when using Depot backend"), - assert.NotEmpty(c.BuildS3.Bucket, "build S3 bucket is required when using Depot backend"), - assert.NotEmpty(c.BuildS3.AccessKeyID, "build S3 access key ID is required when using Depot backend"), - assert.NotEmpty(c.BuildS3.AccessKeySecret, "build S3 access key secret is required when using Depot backend"), - assert.NotEmpty(c.Depot.APIUrl, "Depot API URL is required when using Depot backend"), - assert.NotEmpty(c.Depot.ProjectRegion, "Depot project region is required when using Depot backend"), - ) - case BuildBackendDocker: - return assert.All( - platformErr, - assert.NotEmpty(c.BuildPlatform, "build platform is required"), - assert.NotEmpty(c.BuildS3.URL, "build S3 URL is required when using Docker backend"), - assert.NotEmpty(c.BuildS3.ExternalURL, "build S3 external URL is required when using Docker backend"), - assert.NotEmpty(c.BuildS3.Bucket, "build S3 bucket is required when using Docker backend"), - assert.NotEmpty(c.BuildS3.AccessKeyID, "build S3 access key ID is required when using Docker backend"), - assert.NotEmpty(c.BuildS3.AccessKeySecret, "build S3 access key secret is required when using Docker backend"), - ) - default: - return fmt.Errorf("build backend must be either 'depot' or 'docker', got: %s", c.BuildBackend) - } + return nil } diff --git a/svc/ctrl/run.go b/svc/ctrl/api/run.go similarity index 78% rename from svc/ctrl/run.go rename to svc/ctrl/api/run.go index 57ed13acb4..4367d11acf 100644 --- a/svc/ctrl/run.go +++ b/svc/ctrl/api/run.go @@ -1,4 +1,4 @@ -package ctrl +package api import ( "context" @@ -12,7 +12,6 @@ import ( restateIngress "github.com/restatedev/sdk-go/ingress" "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" "github.com/unkeyed/unkey/pkg/cache" - "github.com/unkeyed/unkey/pkg/clickhouse" "github.com/unkeyed/unkey/pkg/clock" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/otel" @@ -20,10 +19,8 @@ import ( "github.com/unkeyed/unkey/pkg/prometheus" "github.com/unkeyed/unkey/pkg/shutdown" pkgversion "github.com/unkeyed/unkey/pkg/version" + "github.com/unkeyed/unkey/svc/ctrl/pkg/s3" "github.com/unkeyed/unkey/svc/ctrl/services/acme" - "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/depot" - "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/docker" - buildStorage "github.com/unkeyed/unkey/svc/ctrl/services/build/storage" "github.com/unkeyed/unkey/svc/ctrl/services/cluster" "github.com/unkeyed/unkey/svc/ctrl/services/ctrl" "github.com/unkeyed/unkey/svc/ctrl/services/deployment" @@ -75,9 +72,6 @@ func Run(ctx context.Context, cfg Config) error { if cfg.InstanceID != "" { logger = logger.With(slog.String("instanceID", cfg.InstanceID)) } - if cfg.Platform != "" { - logger = logger.With(slog.String("platform", cfg.Platform)) - } if cfg.Region != "" { logger = logger.With(slog.String("region", cfg.Region)) } @@ -89,6 +83,18 @@ func Run(ctx context.Context, cfg Config) error { logger.Info("TLS is enabled, server will use HTTPS") } + buildStorage, err := s3.NewS3(s3.S3Config{ + S3PresignURL: "", + S3URL: cfg.BuildS3.URL, + S3Bucket: cfg.BuildS3.Bucket, + S3AccessKeyID: cfg.BuildS3.AccessKeyID, + S3AccessKeySecret: cfg.BuildS3.AccessKeySecret, + Logger: logger, + }) + if err != nil { + return fmt.Errorf("unable to create build storage backend: %w", err) + } + // Initialize database database, err := db.New(db.Config{ PrimaryDSN: cfg.DatabasePrimary, @@ -101,58 +107,6 @@ func Run(ctx context.Context, cfg Config) error { shutdowns.Register(database.Close) - buildStorage, err := buildStorage.NewS3(buildStorage.S3Config{ - Logger: logger, - S3URL: cfg.BuildS3.URL, - S3PresignURL: cfg.BuildS3.ExternalURL, // Empty for Depot, set for Docker - S3Bucket: cfg.BuildS3.Bucket, - S3AccessKeyID: cfg.BuildS3.AccessKeyID, - S3AccessKeySecret: cfg.BuildS3.AccessKeySecret, - }) - if err != nil { - return fmt.Errorf("unable to create build storage: %w", err) - } - - var ch clickhouse.ClickHouse = clickhouse.NewNoop() - if cfg.ClickhouseURL != "" { - ch, err = clickhouse.New(clickhouse.Config{ - URL: cfg.ClickhouseURL, - Logger: logger, - }) - if err != nil { - return fmt.Errorf("unable to create clickhouse: %w", err) - } - } - - var buildService ctrlv1connect.BuildServiceClient - switch cfg.BuildBackend { - case BuildBackendDocker: - buildService = docker.New(docker.Config{ - InstanceID: cfg.InstanceID, - DB: database, - Logger: logger, - BuildPlatform: docker.BuildPlatform(cfg.GetBuildPlatform()), - Storage: buildStorage, - }) - logger.Info("Using Docker build backend", "presign_url", cfg.BuildS3.ExternalURL) - - case BuildBackendDepot: - buildService = depot.New(depot.Config{ - InstanceID: cfg.InstanceID, - DB: database, - RegistryConfig: depot.RegistryConfig(cfg.GetRegistryConfig()), - BuildPlatform: depot.BuildPlatform(cfg.GetBuildPlatform()), - DepotConfig: depot.DepotConfig(cfg.GetDepotConfig()), - Clickhouse: ch, - Logger: logger, - Storage: buildStorage, - }) - logger.Info("Using Depot build backend") - - default: - return fmt.Errorf("unknown build backend: %s (must be 'docker' or 'depot')", cfg.BuildBackend) - } - // Restate ingress client for invoking workflows restateClientOpts := []restate.IngressClientOption{} if cfg.Restate.APIKey != "" { @@ -196,19 +150,19 @@ func Run(ctx context.Context, cfg Config) error { mux := http.NewServeMux() // Health check endpoint for load balancers and orchestrators - mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { + mux.HandleFunc("/v1/liveness", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) }) - mux.Handle(ctrlv1connect.NewBuildServiceHandler(buildService)) mux.Handle(ctrlv1connect.NewCtrlServiceHandler(ctrl.New(cfg.InstanceID, database))) mux.Handle(ctrlv1connect.NewDeploymentServiceHandler(deployment.New(deployment.Config{ Database: database, Restate: restateClient, - BuildService: buildService, Logger: logger, AvailableRegions: cfg.AvailableRegions, + BuildStorage: buildStorage, }))) + mux.Handle(ctrlv1connect.NewOpenApiServiceHandler(openapi.New(database, logger))) mux.Handle(ctrlv1connect.NewAcmeServiceHandler(acme.New(acme.Config{ DB: database, diff --git a/svc/ctrl/services/build/backend/depot/BUILD.bazel b/svc/ctrl/pkg/build/BUILD.bazel similarity index 77% rename from svc/ctrl/services/build/backend/depot/BUILD.bazel rename to svc/ctrl/pkg/build/BUILD.bazel index 77013c9837..673dbe1b16 100644 --- a/svc/ctrl/services/build/backend/depot/BUILD.bazel +++ b/svc/ctrl/pkg/build/BUILD.bazel @@ -1,25 +1,22 @@ load("@rules_go//go:def.bzl", "go_library") go_library( - name = "depot", + name = "build", srcs = [ - "create_build.go", + "build.go", "doc.go", - "generate_upload_url.go", "service.go", ], - importpath = "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/depot", + importpath = "github.com/unkeyed/unkey/svc/ctrl/pkg/build", visibility = ["//visibility:public"], deps = [ - "//gen/proto/ctrl/v1:ctrl", - "//gen/proto/ctrl/v1/ctrlv1connect", + "//gen/proto/hydra/v1:hydra", "//pkg/assert", "//pkg/clickhouse", "//pkg/clickhouse/schema", "//pkg/db", "//pkg/otel/logging", "//pkg/ptr", - "//svc/ctrl/services/build/storage", "@build_buf_gen_go_depot_api_connectrpc_go//depot/core/v1/corev1connect", "@build_buf_gen_go_depot_api_protocolbuffers_go//depot/core/v1:core", "@com_connectrpc_connect//:connect", @@ -32,5 +29,6 @@ go_library( "@com_github_moby_buildkit//session", "@com_github_moby_buildkit//session/auth/authprovider", "@com_github_opencontainers_go_digest//:go-digest", + "@com_github_restatedev_sdk_go//:sdk-go", ], ) diff --git a/svc/ctrl/services/build/backend/depot/create_build.go b/svc/ctrl/pkg/build/build.go similarity index 60% rename from svc/ctrl/services/build/backend/depot/create_build.go rename to svc/ctrl/pkg/build/build.go index 235d219778..c767f7e67a 100644 --- a/svc/ctrl/services/build/backend/depot/create_build.go +++ b/svc/ctrl/pkg/build/build.go @@ -1,4 +1,4 @@ -package depot +package build import ( "context" @@ -19,8 +19,9 @@ import ( "github.com/moby/buildkit/session" "github.com/moby/buildkit/session/auth/authprovider" "github.com/opencontainers/go-digest" + restate "github.com/restatedev/sdk-go" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" "github.com/unkeyed/unkey/pkg/assert" "github.com/unkeyed/unkey/pkg/clickhouse/schema" "github.com/unkeyed/unkey/pkg/db" @@ -44,153 +45,121 @@ const ( // Prepare build context and configuration // Execute the build with status logging // Return build metadata -func (s *Depot) CreateBuild( - ctx context.Context, - req *connect.Request[ctrlv1.CreateBuildRequest], -) (*connect.Response[ctrlv1.CreateBuildResponse], error) { - buildContextPath := req.Msg.GetBuildContextPath() - unkeyProjectID := req.Msg.GetUnkeyProjectId() - deploymentID := req.Msg.GetDeploymentId() +func (s *Depot) BuildDockerImage( + ctx restate.Context, + req *hydrav1.BuildDockerImageRequest, +) (*hydrav1.BuildDockerImageResponse, error) { + + unkeyProjectID := req.GetProjectId() if err := assert.All( - assert.NotEmpty(buildContextPath, "build_context_path is required"), - assert.NotEmpty(unkeyProjectID, "unkey_project_id is required"), - assert.NotEmpty(deploymentID, "deploymentID is required"), + assert.NotEmpty(req.GetS3Url(), "s3_url is required"), + assert.NotEmpty(req.GetBuildContextPath(), "build_context_path is required"), + assert.NotEmpty(unkeyProjectID, "project_id is required"), + assert.NotEmpty(req.GetDeploymentId(), "deployment_id is required"), + assert.NotEmpty(req.GetDockerfilePath(), "dockerfile_path is required"), ); err != nil { - return nil, connect.NewError(connect.CodeInvalidArgument, err) + return nil, restate.TerminalError(err) } platform := s.buildPlatform.Platform architecture := s.buildPlatform.Architecture s.logger.Info("Starting build process - getting presigned URL for build context", - "build_context_path", buildContextPath, + "build_context_path", req.GetBuildContextPath(), "unkey_project_id", unkeyProjectID, "platform", platform, "architecture", architecture) - contextURL, err := s.storage.GenerateDownloadURL(ctx, buildContextPath, 15*time.Minute) - if err != nil { - s.logger.Error("Failed to get presigned URL", - "error", err, - "build_context_path", buildContextPath, - "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to get presigned URL: %w", err)) - } - - depotProjectID, err := s.getOrCreateDepotProject(ctx, unkeyProjectID) + depotProjectID, err := restate.Run(ctx, func(runCtx restate.RunContext) (string, error) { + return s.getOrCreateDepotProject(runCtx, unkeyProjectID) + }, restate.WithName("get or create depot project")) if err != nil { - s.logger.Error("Failed to get/create depot project", - "error", err, - "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to get/create depot project: %w", err)) + return nil, fmt.Errorf("failed to get/create depot project: %w", err) } s.logger.Info("Creating depot build", "depot_project_id", depotProjectID, "unkey_project_id", unkeyProjectID) - buildResp, err := build.NewBuild(ctx, &cliv1.CreateBuildRequest{ - Options: nil, - ProjectId: depotProjectID, - }, s.registryConfig.Password) - if err != nil { - s.logger.Error("Creating depot build failed", - "error", err, - "depot_project_id", depotProjectID, - "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to create build: %w", err)) - } + return restate.Run(ctx, func(runCtx restate.RunContext) (*hydrav1.BuildDockerImageResponse, error) { - s.logger.Info("Depot build created", - "build_id", buildResp.ID, - "depot_project_id", depotProjectID, - "unkey_project_id", unkeyProjectID) - - var buildErr error - defer buildResp.Finish(buildErr) - - s.logger.Info("Acquiring build machine", - "build_id", buildResp.ID, - "architecture", architecture, - "unkey_project_id", unkeyProjectID) + depotBuild, err := build.NewBuild(runCtx, &cliv1.CreateBuildRequest{ + Options: nil, + ProjectId: depotProjectID, + }, s.registryConfig.Password) + if err != nil { + return nil, fmt.Errorf("failed to create build: %w", err) + } + defer depotBuild.Finish(err) - var buildkit *machine.Machine - buildkit, buildErr = machine.Acquire(ctx, buildResp.ID, buildResp.Token, architecture) - if buildErr != nil { - s.logger.Error("Acquiring depot build failed", - "error", buildErr, - "build_id", buildResp.ID, + s.logger.Info("Depot build created", + "build_id", depotBuild.ID, "depot_project_id", depotProjectID, "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to acquire machine: %w", buildErr)) - } - //nolint: all - defer buildkit.Release() - - s.logger.Info("Build machine acquired, connecting to buildkit", - "build_id", buildResp.ID, - "unkey_project_id", unkeyProjectID) - var buildkitClient *client.Client - buildkitClient, buildErr = buildkit.Connect(ctx) - if buildErr != nil { - s.logger.Error("Connection to depot build failed", - "error", buildErr, - "build_id", buildResp.ID, - "depot_project_id", depotProjectID, + s.logger.Info("Acquiring build machine", + "build_id", depotBuild.ID, + "architecture", architecture, "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to connect to buildkit: %w", buildErr)) - } - defer func() { - if err := buildkitClient.Close(); err != nil { - s.logger.Error("failed to close buildkit client", "error", err) + + buildkit, err := machine.Acquire(runCtx, depotBuild.ID, depotBuild.Token, architecture) + if err != nil { + return nil, fmt.Errorf("failed to acquire machine: %w", err) } - }() + defer func() { + if releaseErr := buildkit.Release(); releaseErr != nil { + s.logger.Error("unable to release buildkit", "error", releaseErr) + } + }() - imageName := fmt.Sprintf("%s/%s:%s-%s", s.registryConfig.URL, depotProjectID, unkeyProjectID, deploymentID) + s.logger.Info("Build machine acquired, connecting to buildkit", + "build_id", depotBuild.ID, + "unkey_project_id", unkeyProjectID) - dockerfilePath := req.Msg.GetDockerfilePath() - if dockerfilePath == "" { - dockerfilePath = "Dockerfile" - } + buildClient, err := buildkit.Connect(runCtx) + if err != nil { + return nil, fmt.Errorf("unable to create build client: %w", err) + } + defer func() { + if closeErr := buildClient.Close(); closeErr != nil { + s.logger.Error("unable to close client", "error", closeErr) + } + }() - s.logger.Info("Starting build execution", - "image_name", imageName, - "dockerfile", dockerfilePath, - "platform", platform, - "architecture", architecture, - "build_id", buildResp.ID, - "unkey_project_id", unkeyProjectID) + imageName := fmt.Sprintf("%s/%s:%s-%s", s.registryConfig.URL, depotProjectID, unkeyProjectID, req.GetDeploymentId()) - buildStatusCh := make(chan *client.SolveStatus, 100) - go s.processBuildStatus(buildStatusCh, req.Msg.GetWorkspaceId(), unkeyProjectID, deploymentID) + dockerfilePath := req.GetDockerfilePath() + if dockerfilePath == "" { + dockerfilePath = "Dockerfile" + } - solverOptions := s.buildSolverOptions(platform, contextURL, dockerfilePath, imageName) - _, buildErr = buildkitClient.Solve(ctx, nil, solverOptions, buildStatusCh) - if buildErr != nil { - s.logger.Error("Build failed", - "error", buildErr, + s.logger.Info("Starting build execution", "image_name", imageName, - "build_id", buildResp.ID, - "depot_project_id", depotProjectID, + "dockerfile", dockerfilePath, + "platform", platform, + "architecture", architecture, + "build_id", depotBuild.ID, "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("build failed: %w", buildErr)) - } - s.logger.Info("Build completed successfully") + buildStatusCh := make(chan *client.SolveStatus, 100) + go s.processBuildStatus(buildStatusCh, req.GetWorkspaceId(), unkeyProjectID, req.GetDeploymentId()) + + solverOptions := s.buildSolverOptions(platform, req.GetS3Url(), dockerfilePath, imageName) - return connect.NewResponse(&ctrlv1.CreateBuildResponse{ - ImageName: imageName, - BuildId: buildResp.ID, - DepotProjectId: depotProjectID, - }), nil + _, err = buildClient.Solve(runCtx, nil, solverOptions, buildStatusCh) + if err != nil { + return nil, fmt.Errorf("build failed: %w", err) + } + + s.logger.Info("Build completed successfully") + + return &hydrav1.BuildDockerImageResponse{ + ImageName: imageName, + DepotBuildId: depotBuild.ID, + DepotProjectId: depotProjectID, + }, nil + }) } func (s *Depot) buildSolverOptions( diff --git a/svc/ctrl/services/build/backend/depot/doc.go b/svc/ctrl/pkg/build/doc.go similarity index 99% rename from svc/ctrl/services/build/backend/depot/doc.go rename to svc/ctrl/pkg/build/doc.go index d6249c60bb..19c9da96da 100644 --- a/svc/ctrl/services/build/backend/depot/doc.go +++ b/svc/ctrl/pkg/build/doc.go @@ -60,4 +60,4 @@ // // Provides comprehensive error handling with proper HTTP status // codes for API communication failures and build errors. -package depot +package build diff --git a/svc/ctrl/services/build/backend/depot/service.go b/svc/ctrl/pkg/build/service.go similarity index 58% rename from svc/ctrl/services/build/backend/depot/service.go rename to svc/ctrl/pkg/build/service.go index 14cb9bf9ed..1bd94042f2 100644 --- a/svc/ctrl/services/build/backend/depot/service.go +++ b/svc/ctrl/pkg/build/service.go @@ -1,12 +1,11 @@ // Package depot is used to build images and store them in their registry using depot.dev. This gives us isolated and cached builds. -package depot +package build import ( - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" + hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" "github.com/unkeyed/unkey/pkg/clickhouse" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/svc/ctrl/services/build/storage" ) type BuildPlatform struct { @@ -26,10 +25,8 @@ type RegistryConfig struct { } type Depot struct { - ctrlv1connect.UnimplementedBuildServiceHandler instanceID string db db.Database - storage *storage.S3 depotConfig DepotConfig registryConfig RegistryConfig buildPlatform BuildPlatform @@ -37,10 +34,11 @@ type Depot struct { logger logging.Logger } +var _ hydrav1.BuildServiceServer = (*Depot)(nil) + type Config struct { InstanceID string DB db.Database - Storage *storage.S3 DepotConfig DepotConfig Clickhouse clickhouse.ClickHouse // Clickhouse for telemetry RegistryConfig RegistryConfig @@ -50,14 +48,12 @@ type Config struct { func New(cfg Config) *Depot { return &Depot{ - UnimplementedBuildServiceHandler: ctrlv1connect.UnimplementedBuildServiceHandler{}, - instanceID: cfg.InstanceID, - db: cfg.DB, - storage: cfg.Storage, - depotConfig: cfg.DepotConfig, - clickhouse: cfg.Clickhouse, - registryConfig: cfg.RegistryConfig, - buildPlatform: cfg.BuildPlatform, - logger: cfg.Logger, + instanceID: cfg.InstanceID, + db: cfg.DB, + depotConfig: cfg.DepotConfig, + clickhouse: cfg.Clickhouse, + registryConfig: cfg.RegistryConfig, + buildPlatform: cfg.BuildPlatform, + logger: cfg.Logger, } } diff --git a/svc/ctrl/pkg/hash/BUILD.bazel b/svc/ctrl/pkg/hash/BUILD.bazel deleted file mode 100644 index e8610a5979..0000000000 --- a/svc/ctrl/pkg/hash/BUILD.bazel +++ /dev/null @@ -1,12 +0,0 @@ -load("@rules_go//go:def.bzl", "go_library") - -go_library( - name = "hash", - srcs = [ - "doc.go", - "hash.go", - ], - importpath = "github.com/unkeyed/unkey/svc/ctrl/pkg/hash", - visibility = ["//visibility:public"], - deps = ["//pkg/db"], -) diff --git a/svc/ctrl/pkg/hash/doc.go b/svc/ctrl/pkg/hash/doc.go deleted file mode 100644 index 4600029527..0000000000 --- a/svc/ctrl/pkg/hash/doc.go +++ /dev/null @@ -1,34 +0,0 @@ -// Package hash provides deterministic hashing for control plane resources. -// -// This package implements SHA-256 based hashing for sentinel and -// deployment resources. Hashes are used to detect configuration -// changes and ensure deterministic identification for workflow -// processing and caching. -// -// # Hashing Strategy -// -// The package uses SHA-256 for cryptographically secure -// hashing of resource configurations. The hash includes all -// relevant configuration fields to ensure that any change in -// deployment or sentinel parameters results in a different hash. -// -// # Resource Types -// -// Sentinel: Hash includes ID, image, replicas, CPU, and memory -// Deployment: Hash includes ID, replicas, image, region, resources, and desired state -// -// # Usage -// -// Creating resource hashes: -// -// sentinelHash := hash.Sentinel(sentinelDB) -// deploymentHash := hash.Deployment(deploymentDB) -// -// These hashes can be used for: -// - Configuration change detection -// - Cache key generation -// - Resource identification in workflows -// - Deterministic sorting and comparison -// -// The hash output is a hex-encoded SHA-256 digest. -package hash diff --git a/svc/ctrl/pkg/hash/hash.go b/svc/ctrl/pkg/hash/hash.go deleted file mode 100644 index 02460a519f..0000000000 --- a/svc/ctrl/pkg/hash/hash.go +++ /dev/null @@ -1,50 +0,0 @@ -package hash - -import ( - "crypto/sha256" - "fmt" - - "github.com/unkeyed/unkey/pkg/db" -) - -// Sentinel creates a deterministic hash for sentinel configuration. -// -// This function hashes all relevant sentinel fields including ID, -// image, replica count, CPU allocation, and memory -// configuration. The hash can be used to detect configuration -// changes and uniquely identify sentinel resources. -// -// Returns a hex-encoded SHA-256 hash of the sentinel configuration. -func Sentinel(sentinel db.Sentinel) string { - hash := fmt.Sprintf("%x", sha256.Sum256(fmt.Appendf(nil, "%v", []any{ - sentinel.ID, - sentinel.Image, - sentinel.DesiredReplicas, - sentinel.CpuMillicores, - sentinel.MemoryMib, - }))) - - return hash -} - -// Deployment creates a deterministic hash for deployment configuration. -// -// This function hashes all relevant deployment fields including ID, -// replica count, image, region, resources, and desired state. -// The hash can be used to detect configuration changes and -// uniquely identify deployment resources. -// -// Returns a hex-encoded SHA-256 hash of the deployment configuration. -func Deployment(deployment db.FindDeploymentTopologyByIDAndRegionRow) string { - hash := fmt.Sprintf("%x", sha256.Sum256(fmt.Appendf(nil, "%v", []any{ - deployment.ID, - deployment.DesiredReplicas, - deployment.Image, - deployment.Region, - deployment.CpuMillicores, - deployment.MemoryMib, - deployment.DesiredState, - }))) - - return hash -} diff --git a/svc/ctrl/services/build/storage/BUILD.bazel b/svc/ctrl/pkg/s3/BUILD.bazel similarity index 51% rename from svc/ctrl/services/build/storage/BUILD.bazel rename to svc/ctrl/pkg/s3/BUILD.bazel index 38cffd73cf..63940e168e 100644 --- a/svc/ctrl/services/build/storage/BUILD.bazel +++ b/svc/ctrl/pkg/s3/BUILD.bazel @@ -18,3 +18,23 @@ go_library( "@com_github_aws_aws_sdk_go_v2_service_s3//:s3", ], ) + +go_library( + name = "s3", + srcs = [ + "doc.go", + "interface.go", + "s3.go", + ], + importpath = "github.com/unkeyed/unkey/svc/ctrl/pkg/s3", + visibility = ["//visibility:public"], + deps = [ + "//pkg/fault", + "//pkg/otel/logging", + "@com_github_aws_aws_sdk_go_v2//aws", + "@com_github_aws_aws_sdk_go_v2//aws/signer/v4:signer", + "@com_github_aws_aws_sdk_go_v2_config//:config", + "@com_github_aws_aws_sdk_go_v2_credentials//:credentials", + "@com_github_aws_aws_sdk_go_v2_service_s3//:s3", + ], +) diff --git a/svc/ctrl/services/build/storage/doc.go b/svc/ctrl/pkg/s3/doc.go similarity index 98% rename from svc/ctrl/services/build/storage/doc.go rename to svc/ctrl/pkg/s3/doc.go index 7aee50ddd5..6c5397a94d 100644 --- a/svc/ctrl/services/build/storage/doc.go +++ b/svc/ctrl/pkg/s3/doc.go @@ -43,4 +43,4 @@ // The package provides comprehensive error handling for S3 operations // including network failures, permission errors, and invalid // configurations. -package storage +package s3 diff --git a/svc/ctrl/pkg/s3/interface.go b/svc/ctrl/pkg/s3/interface.go new file mode 100644 index 0000000000..052bef8497 --- /dev/null +++ b/svc/ctrl/pkg/s3/interface.go @@ -0,0 +1,14 @@ +package s3 + +import ( + "context" + "time" +) + +// Storage defines the interface for object storage operations. +type Storage interface { + GenerateDownloadURL(ctx context.Context, key string, expiresIn time.Duration) (string, error) + GenerateUploadURL(ctx context.Context, key string, expiresIn time.Duration) (string, error) +} + +var _ Storage = (*S3)(nil) diff --git a/svc/ctrl/services/build/storage/s3.go b/svc/ctrl/pkg/s3/s3.go similarity index 99% rename from svc/ctrl/services/build/storage/s3.go rename to svc/ctrl/pkg/s3/s3.go index 3a849c56bb..bbce746c5d 100644 --- a/svc/ctrl/services/build/storage/s3.go +++ b/svc/ctrl/pkg/s3/s3.go @@ -1,4 +1,4 @@ -package storage +package s3 import ( "context" diff --git a/svc/ctrl/proto/ctrl/v1/build.proto b/svc/ctrl/proto/ctrl/v1/build.proto deleted file mode 100644 index 7c7d93dfd1..0000000000 --- a/svc/ctrl/proto/ctrl/v1/build.proto +++ /dev/null @@ -1,33 +0,0 @@ -syntax = "proto3"; -package ctrl.v1; - -option go_package = "github.com/unkeyed/unkey/gen/proto/ctrl/v1;ctrlv1"; - -service BuildService { - rpc CreateBuild(CreateBuildRequest) returns (CreateBuildResponse) {} - rpc GenerateUploadURL(GenerateUploadURLRequest) returns (GenerateUploadURLResponse) {} -} - -message CreateBuildRequest { - string build_context_path = 1; // S3 key of the uploaded tar file - optional string dockerfile_path = 2; // Path to Dockerfile within the tar - string unkey_project_id = 3; // Your internal user/project ID - string deployment_id = 4; - string workspace_id = 5; -} - -message CreateBuildResponse { - string image_name = 1; // Full image tag (registry.depot.dev/project:tag) - string build_id = 2; // Depot build ID for tracking - string depot_project_id = 3; // Depot project ID -} - -message GenerateUploadURLRequest { - string unkey_project_id = 1; // Your internal user/project ID -} - -message GenerateUploadURLResponse { - string upload_url = 1; // Presigned PUT URL - string build_context_path = 2; // S3 key to use in CreateBuild - int64 expires_in = 3; // Seconds until URL expires -} diff --git a/svc/ctrl/proto/ctrl/v1/deployment.proto b/svc/ctrl/proto/ctrl/v1/deployment.proto index 4fec123d37..75c773a868 100644 --- a/svc/ctrl/proto/ctrl/v1/deployment.proto +++ b/svc/ctrl/proto/ctrl/v1/deployment.proto @@ -151,7 +151,17 @@ message PromoteRequest { message PromoteResponse {} +message CreateS3UploadURLRequest { + string unkey_project_id = 1; +} + +message CreateS3UploadURLResponse { + string upload_url = 1; // Presigned PUT URL + string build_context_path = 2; // S3 key to use in CreateBuild +} + service DeploymentService { + rpc CreateS3UploadURL(CreateS3UploadURLRequest) returns (CreateS3UploadURLResponse) {} // Create a new deployment rpc CreateDeployment(CreateDeploymentRequest) returns (CreateDeploymentResponse) {} diff --git a/svc/ctrl/proto/hydra/v1/build.proto b/svc/ctrl/proto/hydra/v1/build.proto new file mode 100644 index 0000000000..e7fe0002cc --- /dev/null +++ b/svc/ctrl/proto/hydra/v1/build.proto @@ -0,0 +1,27 @@ +syntax = "proto3"; + +package hydra.v1; + +import "dev/restate/sdk/go.proto"; + +option go_package = "github.com/unkeyed/unkey/gen/proto/hydra/v1;hydrav1"; + +service BuildService { + option (dev.restate.sdk.go.service_type) = SERVICE; + + rpc BuildDockerImage(BuildDockerImageRequest) returns (BuildDockerImageResponse) {} +} + +message BuildDockerImageRequest { + string s3_url = 1; + string build_context_path = 2; + string dockerfile_path = 3; + string project_id = 4; + string deployment_id = 5; + string workspace_id = 6; +} +message BuildDockerImageResponse { + string depot_project_id = 1; + string depot_build_id = 2; + string image_name = 3; +} diff --git a/svc/ctrl/services/build/BUILD.bazel b/svc/ctrl/services/build/BUILD.bazel deleted file mode 100644 index ba27a0c31f..0000000000 --- a/svc/ctrl/services/build/BUILD.bazel +++ /dev/null @@ -1,8 +0,0 @@ -load("@rules_go//go:def.bzl", "go_library") - -go_library( - name = "build", - srcs = ["doc.go"], - importpath = "github.com/unkeyed/unkey/svc/ctrl/services/build", - visibility = ["//visibility:public"], -) diff --git a/svc/ctrl/services/build/backend/depot/generate_upload_url.go b/svc/ctrl/services/build/backend/depot/generate_upload_url.go deleted file mode 100644 index c3302ad108..0000000000 --- a/svc/ctrl/services/build/backend/depot/generate_upload_url.go +++ /dev/null @@ -1,55 +0,0 @@ -package depot - -import ( - "context" - "fmt" - "time" - - "connectrpc.com/connect" - - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/db" -) - -func (s *Depot) GenerateUploadURL( - ctx context.Context, - req *connect.Request[ctrlv1.GenerateUploadURLRequest], -) (*connect.Response[ctrlv1.GenerateUploadURLResponse], error) { - unkeyProjectID := req.Msg.GetUnkeyProjectId() - if unkeyProjectID == "" { - return nil, connect.NewError(connect.CodeInvalidArgument, - fmt.Errorf("unkeyProjectID is required")) - } - - // This ensures the project exists. Without this check, callers could provide - // arbitrary projectIds and generate unlimited upload URLs. - _, err := db.Query.FindProjectById(ctx, s.db.RO(), unkeyProjectID) - if err != nil { - if db.IsNotFound(err) { - return nil, connect.NewError(connect.CodeNotFound, - fmt.Errorf("project not found: %s", unkeyProjectID)) - } - return nil, connect.NewError(connect.CodeInternal, err) - } - - // Generate unique S3 key for this build context - buildContextPath := fmt.Sprintf("%s/%d.tar.gz", - unkeyProjectID, - time.Now().UnixNano()) - - // Generate presigned URL (15 minutes expiration) - uploadURL, err := s.storage.GenerateUploadURL(ctx, buildContextPath, 15*time.Minute) - if err != nil { - s.logger.Error("Failed to generate presigned URL", "error", err, "context_key", buildContextPath) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to generate presigned URL: %w", err)) - } - - s.logger.Info("Generated upload URL", "context_key", buildContextPath, "unkey_project_id", unkeyProjectID) - - return connect.NewResponse(&ctrlv1.GenerateUploadURLResponse{ - UploadUrl: uploadURL, - BuildContextPath: buildContextPath, - ExpiresIn: 900, // 15 minutes - }), nil -} diff --git a/svc/ctrl/services/build/backend/docker/BUILD.bazel b/svc/ctrl/services/build/backend/docker/BUILD.bazel deleted file mode 100644 index 0a08676b8e..0000000000 --- a/svc/ctrl/services/build/backend/docker/BUILD.bazel +++ /dev/null @@ -1,25 +0,0 @@ -load("@rules_go//go:def.bzl", "go_library") - -go_library( - name = "docker", - srcs = [ - "create_build.go", - "doc.go", - "generate_upload_url.go", - "service.go", - ], - importpath = "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/docker", - visibility = ["//visibility:public"], - deps = [ - "//gen/proto/ctrl/v1:ctrl", - "//gen/proto/ctrl/v1/ctrlv1connect", - "//pkg/assert", - "//pkg/db", - "//pkg/otel/logging", - "//pkg/uid", - "//svc/ctrl/services/build/storage", - "@com_connectrpc_connect//:connect", - "@com_github_docker_docker//api/types/build", - "@com_github_docker_docker//client", - ], -) diff --git a/svc/ctrl/services/build/backend/docker/create_build.go b/svc/ctrl/services/build/backend/docker/create_build.go deleted file mode 100644 index ea229f1483..0000000000 --- a/svc/ctrl/services/build/backend/docker/create_build.go +++ /dev/null @@ -1,170 +0,0 @@ -package docker - -import ( - "bufio" - "context" - "encoding/json" - "fmt" - "strings" - "time" - - "connectrpc.com/connect" - "github.com/docker/docker/api/types/build" - "github.com/docker/docker/client" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/assert" - "github.com/unkeyed/unkey/pkg/uid" -) - -type dockerBuildResponse struct { - Stream string `json:"stream,omitempty"` - Error string `json:"error,omitempty"` - ErrorDetail struct { - Code int `json:"code"` - Message string `json:"message"` - } `json:"errorDetail"` -} - -func (d *Docker) CreateBuild( - ctx context.Context, - req *connect.Request[ctrlv1.CreateBuildRequest], -) (*connect.Response[ctrlv1.CreateBuildResponse], error) { - buildContextPath := req.Msg.GetBuildContextPath() - unkeyProjectID := req.Msg.GetUnkeyProjectId() - deploymentID := req.Msg.GetDeploymentId() - - if err := assert.All( - assert.NotEmpty(buildContextPath, "build_context_path is required"), - assert.NotEmpty(unkeyProjectID, "unkey_project_id is required"), - assert.NotEmpty(deploymentID, "deploymentID is required"), - ); err != nil { - return nil, connect.NewError(connect.CodeInvalidArgument, err) - } - - // Use configured platform from config - platform := d.buildPlatform.Platform - architecture := d.buildPlatform.Architecture - - d.logger.Info("Getting presigned URL for build context", - "build_context_path", buildContextPath, - "unkey_project_id", unkeyProjectID, - "platform", platform, - "architecture", architecture) - - contextURL, err := d.storage.GenerateDownloadURL(ctx, buildContextPath, 15*time.Minute) - if err != nil { - d.logger.Error("Failed to get presigned URL", - "error", err, - "build_context_path", buildContextPath, - "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to get presigned URL: %w", err)) - } - - dockerClient, err := client.NewClientWithOpts( - client.FromEnv, - client.WithAPIVersionNegotiation(), - ) - if err != nil { - d.logger.Error("Failed to create docker client", - "error", err, - "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to create docker client: %w", err)) - } - defer func() { - if err := dockerClient.Close(); err != nil { - d.logger.Error("failed to close docker client", "error", err) - } - }() - - // Docker requires lowercase repository names - imageName := strings.ToLower(fmt.Sprintf("%s-%s", - unkeyProjectID, - deploymentID, - )) - - dockerfilePath := req.Msg.GetDockerfilePath() - if dockerfilePath == "" { - dockerfilePath = "Dockerfile" - } - - d.logger.Info("Starting Docker build", - "image_name", imageName, - "dockerfile", dockerfilePath, - "platform", platform, - "architecture", architecture, - "unkey_project_id", unkeyProjectID) - - //nolint: exhaustruct - buildOptions := build.ImageBuildOptions{ - Tags: []string{imageName}, - Dockerfile: dockerfilePath, - Platform: platform, - Remove: true, - RemoteContext: contextURL, - } - - buildResponse, err := dockerClient.ImageBuild(ctx, nil, buildOptions) - if err != nil { - d.logger.Error("Docker build failed", - "error", err, - "image_name", imageName, - "dockerfile", dockerfilePath, - "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to start build: %w", err)) - } - defer func() { - if err := buildResponse.Body.Close(); err != nil { - d.logger.Error("failed to close build response body", "error", err) - } - }() - - scanner := bufio.NewScanner(buildResponse.Body) - var buildError error - - for scanner.Scan() { - var resp dockerBuildResponse - if err := json.Unmarshal(scanner.Bytes(), &resp); err != nil { - continue - } - - if resp.Error != "" { - buildError = fmt.Errorf("%s", resp.ErrorDetail.Message) - d.logger.Error("Build failed", - "error", resp.ErrorDetail.Message, - "image_name", imageName, - "unkey_project_id", unkeyProjectID) - break - } - } - - if err := scanner.Err(); err != nil { - d.logger.Error("Failed to read build output", - "error", err, - "image_name", imageName, - "unkey_project_id", unkeyProjectID) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to read build output: %w", err)) - } - - if buildError != nil { - return nil, connect.NewError(connect.CodeInternal, buildError) - } - - buildID := uid.New(uid.BuildPrefix) - - d.logger.Info("Build completed successfully", - "image_name", imageName, - "build_id", buildID, - "platform", platform, - "architecture", architecture, - "unkey_project_id", unkeyProjectID) - - return connect.NewResponse(&ctrlv1.CreateBuildResponse{ - DepotProjectId: "", - ImageName: imageName, - BuildId: buildID, - }), nil -} diff --git a/svc/ctrl/services/build/backend/docker/doc.go b/svc/ctrl/services/build/backend/docker/doc.go deleted file mode 100644 index dfa4ca4864..0000000000 --- a/svc/ctrl/services/build/backend/docker/doc.go +++ /dev/null @@ -1,52 +0,0 @@ -// Package docker provides local Docker build backend integration. -// -// This package implements container image building through the local -// Docker daemon. It supports builds from Dockerfile paths -// and direct container image management without external dependencies. -// -// # Architecture -// -// The Docker backend provides: -// - Local Docker daemon integration for container builds -// - Docker image creation from source code -// - Build artifact storage in S3-compatible systems -// - Real-time build progress tracking -// - Integration with unkey deployment workflows -// -// # Key Features -// -// - Local build execution without external service dependencies -// - Docker daemon communication for container management -// - Build from Dockerfile paths for reproducible builds -// - Container image creation and management -// - S3 storage for build artifact sharing -// - Integration with deployment service for automatic updates -// -// # Usage -// -// Creating Docker build backend: -// -// dockerBackend := docker.New(docker.Config{ -// InstanceID: "build-instance-001", -// DB: database, -// BuildPlatform: docker.BuildPlatform{ -// Platform: "linux/amd64", -// Architecture: "amd64", -// }, -// Storage: buildStorage, -// Logger: logger, -// }) -// -// # Build Operations -// -// The backend implements standard BuildService interface methods: -// - CreateBuild: Start new container build from Dockerfile -// - GenerateUploadUrl: Generate pre-signed URLs for Docker images -// - GetBuild: Get build status and metadata -// - GetBuildLogs: Stream real-time build logs -// -// # Error Handling -// -// Provides comprehensive error handling with proper HTTP status -// codes for Docker daemon communication failures and build errors. -package docker diff --git a/svc/ctrl/services/build/backend/docker/generate_upload_url.go b/svc/ctrl/services/build/backend/docker/generate_upload_url.go deleted file mode 100644 index e190c3693d..0000000000 --- a/svc/ctrl/services/build/backend/docker/generate_upload_url.go +++ /dev/null @@ -1,55 +0,0 @@ -package docker - -import ( - "context" - "fmt" - "time" - - "connectrpc.com/connect" - - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/pkg/db" -) - -func (s *Docker) GenerateUploadURL( - ctx context.Context, - req *connect.Request[ctrlv1.GenerateUploadURLRequest], -) (*connect.Response[ctrlv1.GenerateUploadURLResponse], error) { - unkeyProjectID := req.Msg.GetUnkeyProjectId() - if unkeyProjectID == "" { - return nil, connect.NewError(connect.CodeInvalidArgument, - fmt.Errorf("unkeyProjectID is required")) - } - - // This ensures the project exists. Without this check, callers could provide - // arbitrary projectIds and generate unlimited upload URLs. - _, err := db.Query.FindProjectById(ctx, s.db.RO(), unkeyProjectID) - if err != nil { - if db.IsNotFound(err) { - return nil, connect.NewError(connect.CodeNotFound, - fmt.Errorf("project not found: %s", unkeyProjectID)) - } - return nil, connect.NewError(connect.CodeInternal, err) - } - - // Generate unique S3 key for this build context - buildContextPath := fmt.Sprintf("%s/%d.tar.gz", - unkeyProjectID, - time.Now().UnixNano()) - - // Generate presigned URL (15 minutes expiration) - uploadURL, err := s.storage.GenerateUploadURL(ctx, buildContextPath, 15*time.Minute) - if err != nil { - s.logger.Error("Failed to generate presigned URL", "error", err, "context_key", buildContextPath) - return nil, connect.NewError(connect.CodeInternal, - fmt.Errorf("failed to generate presigned URL: %w", err)) - } - - s.logger.Info("Generated upload URL", "context_key", buildContextPath, "unkey_project_id", unkeyProjectID) - - return connect.NewResponse(&ctrlv1.GenerateUploadURLResponse{ - UploadUrl: uploadURL, - BuildContextPath: buildContextPath, - ExpiresIn: 900, // 15 minutes - }), nil -} diff --git a/svc/ctrl/services/build/backend/docker/service.go b/svc/ctrl/services/build/backend/docker/service.go deleted file mode 100644 index 8623a3936b..0000000000 --- a/svc/ctrl/services/build/backend/docker/service.go +++ /dev/null @@ -1,42 +0,0 @@ -// Package docker used for local testing using docker only -package docker - -import ( - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" - "github.com/unkeyed/unkey/pkg/db" - "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/svc/ctrl/services/build/storage" -) - -type BuildPlatform struct { - Platform string - Architecture string -} - -type Docker struct { - ctrlv1connect.UnimplementedBuildServiceHandler - instanceID string - db db.Database - buildPlatform BuildPlatform - storage *storage.S3 - logger logging.Logger -} - -type Config struct { - InstanceID string - DB db.Database - BuildPlatform BuildPlatform - Storage *storage.S3 - Logger logging.Logger -} - -func New(cfg Config) *Docker { - return &Docker{ - UnimplementedBuildServiceHandler: ctrlv1connect.UnimplementedBuildServiceHandler{}, - instanceID: cfg.InstanceID, - db: cfg.DB, - buildPlatform: cfg.BuildPlatform, - storage: cfg.Storage, - logger: cfg.Logger, - } -} diff --git a/svc/ctrl/services/build/doc.go b/svc/ctrl/services/build/doc.go deleted file mode 100644 index 75e057672b..0000000000 --- a/svc/ctrl/services/build/doc.go +++ /dev/null @@ -1,59 +0,0 @@ -// Package build provides container image building services. -// -// This package implements multiple build backends for container -// image creation and storage. It supports both cloud-native -// builds through Depot.dev and local builds through Docker daemon. -// -// # Architecture -// -// The package provides a unified BuildService interface with -// multiple backend implementations: -// -// - Depot Backend: Cloud-native builds with optimized caching -// - Docker Backend: Local builds with direct Docker integration -// -// Each backend implements: -// - Container image creation from source code -// - Registry pushing and management -// - Build artifact storage in S3-compatible systems -// - Build progress tracking and status reporting -// -// # Key Components -// -// [Depot Backend]: Integration with depot.dev for cloud builds -// -// [Docker Backend]: Local Docker daemon integration -// [Storage]: S3-compatible storage abstraction for build artifacts -// -// # Configuration -// -// Backends are configured through: -// - Build platform specifications (linux/amd64, linux/arm64) -// - Registry credentials for image pushing -// - S3 storage configuration for build artifacts -// - Platform-specific settings (Depot project region, Docker host access) -// -// # Usage -// -// Creating build service: -// -// switch cfg.BuildBackend { -// case ctrl.BuildBackendDepot: -// buildService = depot.New(depot.Config{ -// // Depot-specific configuration -// }) -// case ctrl.BuildBackendDocker: -// buildService = docker.New(docker.Config{ -// // Docker-specific configuration -// }) -// } -// -// The service provides consistent interface regardless of backend -// selection, enabling seamless switching between build systems. -// -// # Error Handling -// -// All backends provide comprehensive error handling with proper -// Connect error codes for client communication and detailed -// logging of build failures and progress. -package build diff --git a/svc/ctrl/services/cluster/rpc_watch_sentinels.go b/svc/ctrl/services/cluster/rpc_watch_sentinels.go index 424945aff1..0b99907658 100644 --- a/svc/ctrl/services/cluster/rpc_watch_sentinels.go +++ b/svc/ctrl/services/cluster/rpc_watch_sentinels.go @@ -30,9 +30,13 @@ func (s *Service) WatchSentinels( if err := assert.NotEmpty(region, "region is required"); err != nil { return connect.NewError(connect.CodeInvalidArgument, err) } - versionCursor := req.Msg.GetVersionLastSeen() + s.logger.Info("krane watching sentinels", + "region", region, + "version", versionCursor, + ) + for { select { case <-ctx.Done(): diff --git a/svc/ctrl/services/deployment/BUILD.bazel b/svc/ctrl/services/deployment/BUILD.bazel index 398889169e..f1c4525475 100644 --- a/svc/ctrl/services/deployment/BUILD.bazel +++ b/svc/ctrl/services/deployment/BUILD.bazel @@ -4,6 +4,7 @@ go_library( name = "deployment", srcs = [ "create_deployment.go", + "create_s3_upload_url.go", "doc.go", "get_deployment.go", "promote.go", @@ -19,6 +20,7 @@ go_library( "//pkg/db", "//pkg/otel/logging", "//pkg/uid", + "//svc/ctrl/pkg/s3", "@com_connectrpc_connect//:connect", "@com_github_restatedev_sdk_go//ingress", "@org_golang_google_protobuf//encoding/protojson", diff --git a/svc/ctrl/services/deployment/create_s3_upload_url.go b/svc/ctrl/services/deployment/create_s3_upload_url.go new file mode 100644 index 0000000000..a62c462d5c --- /dev/null +++ b/svc/ctrl/services/deployment/create_s3_upload_url.go @@ -0,0 +1,28 @@ +package deployment + +import ( + "context" + "fmt" + "time" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/pkg/uid" +) + +func (s *Service) CreateS3UploadURL( + ctx context.Context, + req *connect.Request[ctrlv1.CreateS3UploadURLRequest], +) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + + buildContextPath := fmt.Sprintf("%s/%s.tar.gz", req.Msg.GetUnkeyProjectId(), uid.New("build")) + + url, err := s.buildStorage.GenerateUploadURL(ctx, buildContextPath, 15*time.Minute) + if err != nil { + return nil, connect.NewError(connect.CodeInternal, err) + } + return connect.NewResponse(&ctrlv1.CreateS3UploadURLResponse{ + UploadUrl: url, + BuildContextPath: buildContextPath, + }), nil +} diff --git a/svc/ctrl/services/deployment/service.go b/svc/ctrl/services/deployment/service.go index ac3c670dcf..77420585c3 100644 --- a/svc/ctrl/services/deployment/service.go +++ b/svc/ctrl/services/deployment/service.go @@ -12,15 +12,16 @@ import ( hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/otel/logging" + "github.com/unkeyed/unkey/svc/ctrl/pkg/s3" ) type Service struct { ctrlv1connect.UnimplementedDeploymentServiceHandler db db.Database restate *restateingress.Client - buildService ctrlv1connect.BuildServiceClient logger logging.Logger availableRegions []string + buildStorage s3.Storage } // deploymentClient creates a typed Restate ingress client for the DeploymentService @@ -32,9 +33,9 @@ func (s *Service) deploymentClient(projectID string) hydrav1.DeploymentServiceIn type Config struct { Database db.Database Restate *restateingress.Client - BuildService ctrlv1connect.BuildServiceClient Logger logging.Logger AvailableRegions []string + BuildStorage s3.Storage } func New(cfg Config) *Service { @@ -42,8 +43,8 @@ func New(cfg Config) *Service { UnimplementedDeploymentServiceHandler: ctrlv1connect.UnimplementedDeploymentServiceHandler{}, db: cfg.Database, restate: cfg.Restate, - buildService: cfg.BuildService, logger: cfg.Logger, availableRegions: cfg.AvailableRegions, + buildStorage: cfg.BuildStorage, } } diff --git a/svc/worker/BUILD.bazel b/svc/ctrl/worker/BUILD.bazel similarity index 64% rename from svc/worker/BUILD.bazel rename to svc/ctrl/worker/BUILD.bazel index 2e5722a70b..8fc6285186 100644 --- a/svc/worker/BUILD.bazel +++ b/svc/ctrl/worker/BUILD.bazel @@ -4,13 +4,11 @@ go_library( name = "worker", srcs = [ "config.go", - "doc.go", "run.go", ], - importpath = "github.com/unkeyed/unkey/svc/worker", + importpath = "github.com/unkeyed/unkey/svc/ctrl/worker", visibility = ["//visibility:public"], deps = [ - "//gen/proto/ctrl/v1/ctrlv1connect", "//gen/proto/hydra/v1:hydra", "//pkg/assert", "//pkg/cache", @@ -25,15 +23,13 @@ go_library( "//pkg/vault", "//pkg/vault/storage", "//pkg/zen", + "//svc/ctrl/pkg/build", + "//svc/ctrl/pkg/s3", "//svc/ctrl/services/acme/providers", - "//svc/ctrl/services/build/backend/depot", - "//svc/ctrl/services/build/backend/docker", - "//svc/ctrl/services/build/storage", - "//svc/ctrl/services/cluster", - "//svc/worker/certificate", - "//svc/worker/deploy", - "//svc/worker/routing", - "//svc/worker/versioning", + "//svc/ctrl/worker/certificate", + "//svc/ctrl/worker/deploy", + "//svc/ctrl/worker/routing", + "//svc/ctrl/worker/versioning", "@com_github_go_acme_lego_v4//challenge", "@com_github_restatedev_sdk_go//:sdk-go", "@com_github_restatedev_sdk_go//ingress", diff --git a/svc/worker/certificate/BUILD.bazel b/svc/ctrl/worker/certificate/BUILD.bazel similarity index 91% rename from svc/worker/certificate/BUILD.bazel rename to svc/ctrl/worker/certificate/BUILD.bazel index 1c71248811..bfb2529531 100644 --- a/svc/worker/certificate/BUILD.bazel +++ b/svc/ctrl/worker/certificate/BUILD.bazel @@ -9,7 +9,7 @@ go_library( "renew_handler.go", "service.go", ], - importpath = "github.com/unkeyed/unkey/svc/worker/certificate", + importpath = "github.com/unkeyed/unkey/svc/ctrl/worker/certificate", visibility = ["//visibility:public"], deps = [ "//gen/proto/hydra/v1:hydra", diff --git a/svc/worker/certificate/bootstrap_infra_certs.go b/svc/ctrl/worker/certificate/bootstrap_infra_certs.go similarity index 100% rename from svc/worker/certificate/bootstrap_infra_certs.go rename to svc/ctrl/worker/certificate/bootstrap_infra_certs.go diff --git a/svc/worker/certificate/doc.go b/svc/ctrl/worker/certificate/doc.go similarity index 100% rename from svc/worker/certificate/doc.go rename to svc/ctrl/worker/certificate/doc.go diff --git a/svc/worker/certificate/process_challenge_handler.go b/svc/ctrl/worker/certificate/process_challenge_handler.go similarity index 100% rename from svc/worker/certificate/process_challenge_handler.go rename to svc/ctrl/worker/certificate/process_challenge_handler.go diff --git a/svc/worker/certificate/renew_handler.go b/svc/ctrl/worker/certificate/renew_handler.go similarity index 100% rename from svc/worker/certificate/renew_handler.go rename to svc/ctrl/worker/certificate/renew_handler.go diff --git a/svc/worker/certificate/service.go b/svc/ctrl/worker/certificate/service.go similarity index 100% rename from svc/worker/certificate/service.go rename to svc/ctrl/worker/certificate/service.go diff --git a/svc/worker/config.go b/svc/ctrl/worker/config.go similarity index 99% rename from svc/worker/config.go rename to svc/ctrl/worker/config.go index eb506bc7dc..855ffe7f22 100644 --- a/svc/worker/config.go +++ b/svc/ctrl/worker/config.go @@ -255,10 +255,6 @@ type Config struct { // Used for analytics and operational metrics storage. ClickhouseURL string - // AuthToken is the authentication token for cluster service API access. - // Used by the cluster service to authenticate requests. - AuthToken string - // SentinelImage is the container image used for new sentinel deployments. // Overrides default sentinel image with custom build or registry. SentinelImage string diff --git a/svc/worker/deploy/BUILD.bazel b/svc/ctrl/worker/deploy/BUILD.bazel similarity index 69% rename from svc/worker/deploy/BUILD.bazel rename to svc/ctrl/worker/deploy/BUILD.bazel index 3ffab0fea2..5c175d0c2f 100644 --- a/svc/worker/deploy/BUILD.bazel +++ b/svc/ctrl/worker/deploy/BUILD.bazel @@ -11,19 +11,16 @@ go_library( "rollback_handler.go", "service.go", ], - importpath = "github.com/unkeyed/unkey/svc/worker/deploy", + importpath = "github.com/unkeyed/unkey/svc/ctrl/worker/deploy", visibility = ["//visibility:public"], deps = [ "//gen/proto/ctrl/v1:ctrl", - "//gen/proto/ctrl/v1/ctrlv1connect", "//gen/proto/hydra/v1:hydra", "//pkg/db", "//pkg/otel/logging", "//pkg/uid", "//pkg/vault", - "//svc/ctrl/services/cluster", - "@com_connectrpc_connect//:connect", + "//svc/ctrl/pkg/s3", "@com_github_restatedev_sdk_go//:sdk-go", - "@org_golang_google_protobuf//proto", ], ) diff --git a/svc/worker/deploy/deploy_handler.go b/svc/ctrl/worker/deploy/deploy_handler.go similarity index 89% rename from svc/worker/deploy/deploy_handler.go rename to svc/ctrl/worker/deploy/deploy_handler.go index 607d0ead59..e2c4c050ce 100644 --- a/svc/worker/deploy/deploy_handler.go +++ b/svc/ctrl/worker/deploy/deploy_handler.go @@ -7,13 +7,11 @@ import ( "fmt" "time" - "connectrpc.com/connect" restate "github.com/restatedev/sdk-go" ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/uid" - "google.golang.org/protobuf/proto" ) const ( @@ -119,38 +117,35 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy return nil, err } - result, err := restate.Run(ctx, func(stepCtx restate.RunContext) (*ctrlv1.CreateBuildResponse, error) { - w.logger.Info("starting docker build", - "deployment_id", deployment.ID, - "build_context_path", req.GetBuildContextPath()) - - buildReq := connect.NewRequest(&ctrlv1.CreateBuildRequest{ - UnkeyProjectId: deployment.ProjectID, - WorkspaceId: deployment.WorkspaceID, - DeploymentId: deployment.ID, - BuildContextPath: req.GetBuildContextPath(), - DockerfilePath: proto.String(req.GetDockerfilePath()), - }) - - var buildResp *connect.Response[ctrlv1.CreateBuildResponse] - buildResp, err = w.buildClient.CreateBuild(stepCtx, buildReq) - if err != nil { - return &ctrlv1.CreateBuildResponse{}, fmt.Errorf("build failed: %w", err) - } + s3DownloadURL, err := restate.Run(ctx, func(stepCtx restate.RunContext) (string, error) { + return w.buildStorage.GenerateDownloadURL(stepCtx, req.GetBuildContextPath(), 1*time.Hour) + }, restate.WithName("generate s3 download url")) + if err != nil { + return nil, fmt.Errorf("failed to generate s3 download url: %w", err) + } - w.logger.Info("docker build completed", "deployment_id", deployment.ID, "image_name", buildResp.Msg.GetImageName()) + w.logger.Info("starting docker build", + "deployment_id", deployment.ID, + "build_context_path", req.GetBuildContextPath()) + + build, err := hydrav1.NewBuildServiceClient(ctx).BuildDockerImage().Request(&hydrav1.BuildDockerImageRequest{ + S3Url: s3DownloadURL, + BuildContextPath: req.GetBuildContextPath(), + DockerfilePath: req.GetDockerfilePath(), + ProjectId: deployment.ProjectID, + DeploymentId: deployment.ID, + WorkspaceId: deployment.WorkspaceID, + }) - return buildResp.Msg, nil - }, restate.WithName("building docker image")) if err != nil { return nil, fmt.Errorf("failed to build docker image: %w", err) } - dockerImage = result.GetImageName() + dockerImage = build.GetImageName() err = restate.RunVoid(ctx, func(stepCtx restate.RunContext) error { return db.Query.UpdateDeploymentBuildID(stepCtx, w.db.RW(), db.UpdateDeploymentBuildIDParams{ ID: deployment.ID, - BuildID: sql.NullString{Valid: true, String: result.GetBuildId()}, + BuildID: sql.NullString{Valid: true, String: build.GetDepotBuildId()}, UpdatedAt: sql.NullInt64{Valid: true, Int64: time.Now().UnixMilli()}, }) }) @@ -185,12 +180,9 @@ func (w *Workflow) Deploy(ctx restate.WorkflowSharedContext, req *hydrav1.Deploy topologies := make([]db.InsertDeploymentTopologyParams, len(w.availableRegions)) for i, region := range w.availableRegions { - versioningClient := hydrav1.NewVersioningServiceClient(ctx, region) - versionResp, versionErr := restate.Run(ctx, func(runCtx restate.RunContext) (*hydrav1.NextVersionResponse, error) { - return versioningClient.NextVersion().Request(&hydrav1.NextVersionRequest{}) - }, restate.WithName("get next version")) - if versionErr != nil { - return nil, fmt.Errorf("failed to get next version: %w", versionErr) + versionResp, err := hydrav1.NewVersioningServiceClient(ctx, region).NextVersion().Request(&hydrav1.NextVersionRequest{}) + if err != nil { + return nil, fmt.Errorf("failed to get next version: %w", err) } topologies[i] = db.InsertDeploymentTopologyParams{ diff --git a/svc/worker/deploy/doc.go b/svc/ctrl/worker/deploy/doc.go similarity index 100% rename from svc/worker/deploy/doc.go rename to svc/ctrl/worker/deploy/doc.go diff --git a/svc/worker/deploy/domains.go b/svc/ctrl/worker/deploy/domains.go similarity index 100% rename from svc/worker/deploy/domains.go rename to svc/ctrl/worker/deploy/domains.go diff --git a/svc/worker/deploy/helpers.go b/svc/ctrl/worker/deploy/helpers.go similarity index 100% rename from svc/worker/deploy/helpers.go rename to svc/ctrl/worker/deploy/helpers.go diff --git a/svc/worker/deploy/promote_handler.go b/svc/ctrl/worker/deploy/promote_handler.go similarity index 100% rename from svc/worker/deploy/promote_handler.go rename to svc/ctrl/worker/deploy/promote_handler.go diff --git a/svc/worker/deploy/rollback_handler.go b/svc/ctrl/worker/deploy/rollback_handler.go similarity index 100% rename from svc/worker/deploy/rollback_handler.go rename to svc/ctrl/worker/deploy/rollback_handler.go diff --git a/svc/worker/deploy/service.go b/svc/ctrl/worker/deploy/service.go similarity index 81% rename from svc/worker/deploy/service.go rename to svc/ctrl/worker/deploy/service.go index a9d98c47a8..3a7672a279 100644 --- a/svc/worker/deploy/service.go +++ b/svc/ctrl/worker/deploy/service.go @@ -1,12 +1,11 @@ package deploy import ( - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/otel/logging" "github.com/unkeyed/unkey/pkg/vault" - "github.com/unkeyed/unkey/svc/ctrl/services/cluster" + "github.com/unkeyed/unkey/svc/ctrl/pkg/s3" ) // Workflow orchestrates deployment lifecycle operations. @@ -24,13 +23,11 @@ type Workflow struct { db db.Database logger logging.Logger - cluster *cluster.Service - - buildClient ctrlv1connect.BuildServiceClient defaultDomain string vault *vault.Service sentinelImage string availableRegions []string + buildStorage s3.Storage } var _ hydrav1.DeploymentServiceServer = (*Workflow)(nil) @@ -43,25 +40,19 @@ type Config struct { // DB is the main database connection for workspace, project, and deployment data. DB db.Database - // BuildClient is the client for building Docker images from source. - BuildClient ctrlv1connect.BuildServiceClient - // DefaultDomain is the apex domain for generated deployment URLs (e.g., "unkey.app"). DefaultDomain string // Vault provides encryption/decryption services for secrets. Vault *vault.Service - Cluster *cluster.Service - // SentinelImage is the Docker image used for sentinel containers. SentinelImage string // AvailableRegions is the list of available regions for deployments. AvailableRegions []string - // Bearer is the bearer token for authentication. - Bearer string + BuildStorage s3.Storage } // New creates a new deployment workflow instance. @@ -70,11 +61,10 @@ func New(cfg Config) *Workflow { UnimplementedDeploymentServiceServer: hydrav1.UnimplementedDeploymentServiceServer{}, db: cfg.DB, logger: cfg.Logger, - cluster: cfg.Cluster, - buildClient: cfg.BuildClient, defaultDomain: cfg.DefaultDomain, vault: cfg.Vault, sentinelImage: cfg.SentinelImage, availableRegions: cfg.AvailableRegions, + buildStorage: cfg.BuildStorage, } } diff --git a/svc/worker/routing/BUILD.bazel b/svc/ctrl/worker/routing/BUILD.bazel similarity index 84% rename from svc/worker/routing/BUILD.bazel rename to svc/ctrl/worker/routing/BUILD.bazel index 301a22f017..1aa7e499a1 100644 --- a/svc/worker/routing/BUILD.bazel +++ b/svc/ctrl/worker/routing/BUILD.bazel @@ -7,7 +7,7 @@ go_library( "doc.go", "service.go", ], - importpath = "github.com/unkeyed/unkey/svc/worker/routing", + importpath = "github.com/unkeyed/unkey/svc/ctrl/worker/routing", visibility = ["//visibility:public"], deps = [ "//gen/proto/hydra/v1:hydra", diff --git a/svc/worker/routing/assign_domains_handler.go b/svc/ctrl/worker/routing/assign_domains_handler.go similarity index 100% rename from svc/worker/routing/assign_domains_handler.go rename to svc/ctrl/worker/routing/assign_domains_handler.go diff --git a/svc/worker/routing/doc.go b/svc/ctrl/worker/routing/doc.go similarity index 100% rename from svc/worker/routing/doc.go rename to svc/ctrl/worker/routing/doc.go diff --git a/svc/worker/routing/service.go b/svc/ctrl/worker/routing/service.go similarity index 100% rename from svc/worker/routing/service.go rename to svc/ctrl/worker/routing/service.go diff --git a/svc/worker/run.go b/svc/ctrl/worker/run.go similarity index 89% rename from svc/worker/run.go rename to svc/ctrl/worker/run.go index ffb499ec6d..5ccb69c26b 100644 --- a/svc/worker/run.go +++ b/svc/ctrl/worker/run.go @@ -5,6 +5,7 @@ import ( "context" "database/sql" "fmt" + "io" "log/slog" "net" "net/http" @@ -15,7 +16,6 @@ import ( restate "github.com/restatedev/sdk-go" restateIngress "github.com/restatedev/sdk-go/ingress" restateServer "github.com/restatedev/sdk-go/server" - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" "github.com/unkeyed/unkey/pkg/cache" "github.com/unkeyed/unkey/pkg/clickhouse" @@ -30,15 +30,13 @@ import ( "github.com/unkeyed/unkey/pkg/vault/storage" "github.com/unkeyed/unkey/pkg/zen" + "github.com/unkeyed/unkey/svc/ctrl/pkg/build" + "github.com/unkeyed/unkey/svc/ctrl/pkg/s3" "github.com/unkeyed/unkey/svc/ctrl/services/acme/providers" - "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/depot" - "github.com/unkeyed/unkey/svc/ctrl/services/build/backend/docker" - buildStorage "github.com/unkeyed/unkey/svc/ctrl/services/build/storage" - "github.com/unkeyed/unkey/svc/ctrl/services/cluster" - "github.com/unkeyed/unkey/svc/worker/certificate" - "github.com/unkeyed/unkey/svc/worker/deploy" - "github.com/unkeyed/unkey/svc/worker/routing" - "github.com/unkeyed/unkey/svc/worker/versioning" + "github.com/unkeyed/unkey/svc/ctrl/worker/certificate" + "github.com/unkeyed/unkey/svc/ctrl/worker/deploy" + "github.com/unkeyed/unkey/svc/ctrl/worker/routing" + "github.com/unkeyed/unkey/svc/ctrl/worker/versioning" ) // Run starts the Restate worker service with the provided configuration. @@ -142,7 +140,7 @@ func Run(ctx context.Context, cfg Config) error { shutdowns.Register(database.Close) - bldStorage, err := buildStorage.NewS3(buildStorage.S3Config{ + imageStore, err := s3.NewS3(s3.S3Config{ Logger: logger, S3URL: cfg.BuildS3.URL, S3PresignURL: cfg.BuildS3.ExternalURL, @@ -165,35 +163,6 @@ func Run(ctx context.Context, cfg Config) error { } } - var buildService ctrlv1connect.BuildServiceClient - switch cfg.BuildBackend { - case BuildBackendDocker: - buildService = docker.New(docker.Config{ - InstanceID: cfg.InstanceID, - DB: database, - Logger: logger, - BuildPlatform: docker.BuildPlatform(cfg.GetBuildPlatform()), - Storage: bldStorage, - }) - logger.Info("Using Docker build backend", "presign_url", cfg.BuildS3.ExternalURL) - - case BuildBackendDepot: - buildService = depot.New(depot.Config{ - InstanceID: cfg.InstanceID, - DB: database, - RegistryConfig: depot.RegistryConfig(cfg.GetRegistryConfig()), - BuildPlatform: depot.BuildPlatform(cfg.GetBuildPlatform()), - DepotConfig: depot.DepotConfig(cfg.GetDepotConfig()), - Clickhouse: ch, - Logger: logger, - Storage: bldStorage, - }) - logger.Info("Using Depot build backend") - - default: - return fmt.Errorf("unknown build backend: %s (must be 'docker' or 'depot')", cfg.BuildBackend) - } - // Restate Client and Server restateClientOpts := []restate.IngressClientOption{} if cfg.Restate.APIKey != "" { @@ -202,22 +171,24 @@ func Run(ctx context.Context, cfg Config) error { restateClient := restateIngress.NewClient(cfg.Restate.URL, restateClientOpts...) restateSrv := restateServer.NewRestate() - c := cluster.New(cluster.Config{ - Database: database, - Logger: logger, - Bearer: cfg.AuthToken, - }) + restateSrv.Bind(hydrav1.NewBuildServiceServer(build.New(build.Config{ + InstanceID: cfg.InstanceID, + DB: database, + RegistryConfig: build.RegistryConfig(cfg.GetRegistryConfig()), + BuildPlatform: build.BuildPlatform(cfg.GetBuildPlatform()), + DepotConfig: build.DepotConfig(cfg.GetDepotConfig()), + Clickhouse: ch, + Logger: logger, + }))) restateSrv.Bind(hydrav1.NewDeploymentServiceServer(deploy.New(deploy.Config{ Logger: logger, DB: database, - BuildClient: buildService, DefaultDomain: cfg.DefaultDomain, Vault: vaultSvc, - Cluster: c, SentinelImage: cfg.SentinelImage, AvailableRegions: cfg.AvailableRegions, - Bearer: cfg.AuthToken, + BuildStorage: imageStore, }))) restateSrv.Bind(hydrav1.NewRoutingServiceServer(routing.New(routing.Config{ @@ -327,16 +298,21 @@ func Run(ctx context.Context, cfg Config) error { return fmt.Errorf("failed to register with Restate: %w", doErr) } + body, err := io.ReadAll(resp.Body) status := resp.StatusCode closeErr := resp.Body.Close() if closeErr != nil { return fmt.Errorf("failed to close response body: %w", closeErr) } + if err != nil { + return fmt.Errorf("failed to read body: %w", err) + } if status >= 200 && status < 300 { return nil } + logger.Info("restate register response", "body", string(body)) return fmt.Errorf("registration returned status %d", status) }) diff --git a/svc/worker/versioning/BUILD.bazel b/svc/ctrl/worker/versioning/BUILD.bazel similarity index 82% rename from svc/worker/versioning/BUILD.bazel rename to svc/ctrl/worker/versioning/BUILD.bazel index ac05cb0caa..544e6e9be4 100644 --- a/svc/worker/versioning/BUILD.bazel +++ b/svc/ctrl/worker/versioning/BUILD.bazel @@ -7,7 +7,7 @@ go_library( "next_version_handler.go", "service.go", ], - importpath = "github.com/unkeyed/unkey/svc/worker/versioning", + importpath = "github.com/unkeyed/unkey/svc/ctrl/worker/versioning", visibility = ["//visibility:public"], deps = [ "//gen/proto/hydra/v1:hydra", diff --git a/svc/worker/versioning/doc.go b/svc/ctrl/worker/versioning/doc.go similarity index 100% rename from svc/worker/versioning/doc.go rename to svc/ctrl/worker/versioning/doc.go diff --git a/svc/worker/versioning/next_version_handler.go b/svc/ctrl/worker/versioning/next_version_handler.go similarity index 100% rename from svc/worker/versioning/next_version_handler.go rename to svc/ctrl/worker/versioning/next_version_handler.go diff --git a/svc/worker/versioning/service.go b/svc/ctrl/worker/versioning/service.go similarity index 100% rename from svc/worker/versioning/service.go rename to svc/ctrl/worker/versioning/service.go diff --git a/svc/worker/doc.go b/svc/worker/doc.go deleted file mode 100644 index 6865aecd5b..0000000000 --- a/svc/worker/doc.go +++ /dev/null @@ -1,78 +0,0 @@ -// Package worker provides the Restate workflow worker service. -// -// This package implements the Restate worker that handles asynchronous workflow -// execution for the unkey platform. It runs as a separate service from the main -// control plane, allowing independent scaling and deployment of workflow handlers. -// -// # Architecture -// -// The worker service consists of: -// - Restate server for workflow handler execution -// - Health check endpoint for orchestration -// - Certificate renewal cron job for ACME management -// -// # Workflow Services -// -// The worker binds and executes the following Restate workflow services: -// - [DeploymentService]: Orchestrates application deployment workflows -// - [RoutingService]: Manages domain assignment and traffic routing -// - [VersioningService]: Handles version management operations -// - [CertificateService]: Processes ACME challenges and certificate lifecycle -// -// # ACME Integration -// -// The worker supports multiple ACME challenge providers: -// - HTTP-01 challenges for regular domains -// - DNS-01 challenges through Cloudflare for wildcard certificates -// - DNS-01 challenges through AWS Route53 for wildcard certificates -// -// Certificate renewal is managed through a cron job that runs after the worker -// successfully registers with the Restate admin API. -// -// # Configuration -// -// The worker is configured through [Config] which includes: -// - Database and vault configuration for persistence -// - Build backend settings for container operations -// - ACME provider configuration for certificate management -// - Restate configuration for workflow registration -// -// # Usage -// -// Basic worker setup: -// -// cfg := worker.Config{ -// InstanceID: "worker-prod-001", -// HttpPort: 7092, -// DatabasePrimary: "user:pass@tcp(db:3306)/unkey", -// VaultMasterKeys: []string{"master-key-1"}, -// VaultS3: worker.S3Config{ -// URL: "https://s3.amazonaws.com", -// Bucket: "unkey-vault", -// AccessKeyID: "access-key", -// AccessKeySecret: "secret-key", -// }, -// BuildBackend: worker.BuildBackendDepot, -// BuildPlatform: "linux/amd64", -// Restate: worker.RestateConfig{ -// URL: "http://restate:8080", -// AdminURL: "http://restate:9070", -// HttpPort: 9080, -// RegisterAs: "http://worker:9080", -// }, -// } -// err := worker.Run(context.Background(), cfg) -// -// The worker will: -// 1. Initialize all services (database, vault, build backend, etc.) -// 2. Start Restate server with workflow service bindings -// 3. Register with Restate admin API for service discovery -// 4. Bootstrap wildcard domain and start certificate renewal cron -// 5. Start health check endpoint on configured port -// 6. Handle graceful shutdown on context cancellation -// -// # Observability -// -// The worker integrates with OpenTelemetry for metrics, traces, and structured logging. -// It exposes health endpoints and Prometheus metrics for monitoring workflow execution. -package worker From 78425c9e28f3c4705918e6d9f2c9741877d96958 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 16:31:52 +0100 Subject: [PATCH 13/32] fix: idk stuff works again --- cmd/ctrl/BUILD.bazel | 1 + cmd/ctrl/api.go | 11 +- cmd/ctrl/doc.go | 47 ++++++++ cmd/ctrl/main.go | 3 + cmd/ctrl/worker.go | 9 +- svc/api/internal/testutil/BUILD.bazel | 5 +- svc/api/internal/testutil/doc.go | 38 ++++++ svc/api/internal/testutil/http.go | 77 ++++++++++-- svc/api/internal/testutil/seed/BUILD.bazel | 5 +- svc/api/internal/testutil/seed/doc.go | 42 +++++++ svc/api/internal/testutil/seed/seed.go | 57 ++++++++- svc/api/routes/register.go | 15 ++- svc/api/routes/services.go | 76 +++++++++--- .../v2_deploy_generate_upload_url/BUILD.bazel | 5 +- .../v2_deploy_generate_upload_url/doc.go | 31 +++++ .../v2_deploy_generate_upload_url/handler.go | 18 ++- svc/ctrl/api/BUILD.bazel | 1 + svc/ctrl/api/config.go | 34 +++--- svc/ctrl/api/doc.go | 52 +++++++++ svc/ctrl/doc.go | 110 +++++------------- svc/ctrl/pkg/build/build.go | 48 +++++--- svc/ctrl/pkg/build/doc.go | 89 +++++++------- svc/ctrl/pkg/build/service.go | 49 ++++++-- svc/ctrl/pkg/s3/doc.go | 53 ++++----- svc/ctrl/pkg/s3/interface.go | 8 +- svc/ctrl/pkg/s3/s3.go | 40 ++++++- svc/ctrl/services/cluster/auth.go | 7 +- svc/ctrl/services/cluster/doc.go | 50 ++++---- .../rpc_get_desired_deployment_state.go | 17 +-- .../cluster/rpc_get_desired_sentinel_state.go | 16 ++- .../cluster/rpc_report_deployment_status.go | 23 ++-- .../cluster/rpc_report_sentinel_status.go | 15 ++- .../services/cluster/rpc_watch_deployments.go | 24 +++- .../services/cluster/rpc_watch_sentinels.go | 23 +++- svc/ctrl/services/cluster/service.go | 19 ++- .../services/deployment/create_deployment.go | 19 ++- .../deployment/create_s3_upload_url.go | 6 + svc/ctrl/services/deployment/doc.go | 93 ++++++--------- .../services/deployment/get_deployment.go | 7 ++ svc/ctrl/services/deployment/promote.go | 6 +- svc/ctrl/services/deployment/rollback.go | 7 +- svc/ctrl/services/deployment/service.go | 25 ++-- svc/ctrl/worker/BUILD.bazel | 1 + .../certificate/bootstrap_infra_certs.go | 39 ++++--- svc/ctrl/worker/certificate/doc.go | 102 +++++++--------- .../certificate/process_challenge_handler.go | 51 ++++++-- svc/ctrl/worker/certificate/renew_handler.go | 27 +++-- svc/ctrl/worker/certificate/service.go | 50 +++++--- svc/ctrl/worker/deploy/deploy_handler.go | 5 +- svc/ctrl/worker/deploy/domains.go | 12 +- svc/ctrl/worker/deploy/service.go | 1 + svc/ctrl/worker/doc.go | 67 +++++++++++ .../worker/routing/assign_domains_handler.go | 9 ++ svc/ctrl/worker/routing/doc.go | 69 +++-------- svc/ctrl/worker/routing/service.go | 25 ++-- svc/ctrl/worker/versioning/doc.go | 35 ++++-- .../worker/versioning/next_version_handler.go | 21 +++- svc/ctrl/worker/versioning/service.go | 7 +- .../deployment/actual_state_report.go | 11 +- svc/krane/internal/deployment/apply.go | 22 ++-- svc/krane/internal/deployment/consts.go | 16 ++- svc/krane/internal/deployment/controller.go | 58 ++++++--- svc/krane/internal/deployment/delete.go | 7 +- .../deployment/desired_state_apply.go | 20 +++- svc/krane/internal/deployment/doc.go | 50 ++++---- svc/krane/internal/deployment/namespace.go | 28 ++++- svc/krane/internal/deployment/resync.go | 13 ++- svc/krane/internal/deployment/scheduling.go | 22 +++- svc/krane/internal/deployment/state.go | 13 ++- 69 files changed, 1401 insertions(+), 661 deletions(-) create mode 100644 cmd/ctrl/doc.go create mode 100644 svc/api/internal/testutil/doc.go create mode 100644 svc/api/internal/testutil/seed/doc.go create mode 100644 svc/api/routes/v2_deploy_generate_upload_url/doc.go create mode 100644 svc/ctrl/api/doc.go create mode 100644 svc/ctrl/worker/doc.go diff --git a/cmd/ctrl/BUILD.bazel b/cmd/ctrl/BUILD.bazel index f447284b1c..782470fb8e 100644 --- a/cmd/ctrl/BUILD.bazel +++ b/cmd/ctrl/BUILD.bazel @@ -4,6 +4,7 @@ go_library( name = "ctrl", srcs = [ "api.go", + "doc.go", "main.go", "worker.go", ], diff --git a/cmd/ctrl/api.go b/cmd/ctrl/api.go index b0e3fe535f..d87a682c09 100644 --- a/cmd/ctrl/api.go +++ b/cmd/ctrl/api.go @@ -10,8 +10,11 @@ import ( ctrlapi "github.com/unkeyed/unkey/svc/ctrl/api" ) -// Cmd is the ctrl command that runs the Unkey control plane service for managing -// infrastructure, deployments, builds, and service orchestration. +// apiCmd defines the "api" subcommand for running the control plane HTTP server. +// The server handles infrastructure management, build orchestration, and service +// coordination. It requires a MySQL database (--database-primary) and S3 storage +// for build artifacts. Optional integrations include Vault for secrets, Restate +// for workflows, and ACME for automatic TLS certificates. var apiCmd = &cli.Command{ Version: "", Commands: []*cli.Command{}, @@ -110,6 +113,10 @@ var apiCmd = &cli.Command{ Action: apiAction, } +// apiAction validates configuration and starts the control plane API server. +// It returns an error if TLS is partially configured (only cert or only key), +// if required configuration is missing, or if the server fails to start. +// The function blocks until the context is cancelled or the server exits. func apiAction(ctx context.Context, cmd *cli.Command) error { // Check if TLS flags are properly set (both or none) tlsCertFile := cmd.String("tls-cert-file") diff --git a/cmd/ctrl/doc.go b/cmd/ctrl/doc.go new file mode 100644 index 0000000000..adeba57e17 --- /dev/null +++ b/cmd/ctrl/doc.go @@ -0,0 +1,47 @@ +// Package ctrl provides CLI commands for running the Unkey control plane. +// +// The control plane consists of two services that work together to manage +// Unkey's infrastructure: an API server for handling requests and a worker +// for executing background jobs. Both services are designed for distributed +// deployment and integrate with Restate for durable workflow execution. +// +// # Commands +// +// The package exposes a single [Cmd] that contains two subcommands: +// +// - api: Runs the control plane API server +// - worker: Runs the background job processor +// +// # API Server +// +// The api subcommand starts an HTTP server that handles control plane requests +// including infrastructure provisioning, build management, and service +// orchestration. It requires a MySQL database connection and integrates with +// S3-compatible storage for build artifacts, Vault for secrets management, +// and Restate for workflow coordination. +// +// TLS is optional but both --tls-cert-file and --tls-key-file must be provided +// together to enable HTTPS. The server validates this constraint and exits +// with an error if only one is provided. +// +// ACME support enables automatic TLS certificate provisioning via Let's Encrypt +// using Route53 DNS-01 challenges for domain validation. +// +// # Worker +// +// The worker subcommand starts a background processor that handles durable +// workflows including deployments, container builds, and certificate management. +// It supports two build backends: "docker" for local development and "depot" +// for production builds. The worker registers itself with Restate for receiving +// workflow invocations. +// +// The worker supports both Cloudflare and Route53 DNS providers for ACME +// certificate challenges, allowing flexibility based on where domains are hosted. +// +// # Configuration +// +// Both services accept configuration through CLI flags and environment variables. +// Required flags will cause the service to fail on startup if not provided. +// See the individual flag definitions in [Cmd] for defaults and environment +// variable mappings. +package ctrl diff --git a/cmd/ctrl/main.go b/cmd/ctrl/main.go index fb131e4c6b..115aab7146 100644 --- a/cmd/ctrl/main.go +++ b/cmd/ctrl/main.go @@ -4,6 +4,9 @@ import ( "github.com/unkeyed/unkey/pkg/cli" ) +// Cmd is the root command for the Unkey control plane. It provides subcommands +// for running the API server (api) and background worker (worker). Use this +// command as a subcommand of the main Unkey CLI binary. var Cmd = &cli.Command{ Version: "", Commands: []*cli.Command{ diff --git a/cmd/ctrl/worker.go b/cmd/ctrl/worker.go index 53166390b9..6e0c69165b 100644 --- a/cmd/ctrl/worker.go +++ b/cmd/ctrl/worker.go @@ -9,8 +9,10 @@ import ( "github.com/unkeyed/unkey/svc/ctrl/worker" ) -// Cmd is the worker command that runs the Unkey Restate worker service for -// handling background jobs, deployments, builds, and certificate management. +// workerCmd defines the "worker" subcommand for running the background job +// processor. The worker handles durable workflows via Restate including container +// builds, deployments, and ACME certificate provisioning. It supports two build +// backends: "docker" for local development and "depot" for production. var workerCmd = &cli.Command{ Version: "", Commands: []*cli.Command{}, @@ -111,6 +113,9 @@ var workerCmd = &cli.Command{ Action: workerAction, } +// workerAction validates configuration and starts the background worker service. +// It returns an error if required configuration is missing or if the worker fails +// to start. The function blocks until the context is cancelled or the worker exits. func workerAction(ctx context.Context, cmd *cli.Command) error { config := worker.Config{ // Basic configuration diff --git a/svc/api/internal/testutil/BUILD.bazel b/svc/api/internal/testutil/BUILD.bazel index 88772b62f3..bd0f10126e 100644 --- a/svc/api/internal/testutil/BUILD.bazel +++ b/svc/api/internal/testutil/BUILD.bazel @@ -2,7 +2,10 @@ load("@rules_go//go:def.bzl", "go_library") go_library( name = "testutil", - srcs = ["http.go"], + srcs = [ + "doc.go", + "http.go", + ], importpath = "github.com/unkeyed/unkey/svc/api/internal/testutil", visibility = ["//svc/api:__subpackages__"], deps = [ diff --git a/svc/api/internal/testutil/doc.go b/svc/api/internal/testutil/doc.go new file mode 100644 index 0000000000..8f0752613b --- /dev/null +++ b/svc/api/internal/testutil/doc.go @@ -0,0 +1,38 @@ +// Package testutil provides integration test infrastructure for the API service. +// +// This package creates a complete, isolated test environment with real dependencies +// (MySQL, Redis, ClickHouse, S3, control plane) running in Docker containers. Tests +// using this package verify end-to-end behavior rather than mocking service boundaries. +// +// # Key Types +// +// The main entry point is [Harness], which orchestrates container startup, database +// seeding, and provides access to all services. Use [NewHarness] to create one. +// [TestResponse] wraps HTTP responses with typed body parsing for assertions. +// +// # Usage +// +// Create a harness at the start of your test. The harness handles container lifecycle +// and provides methods to create test data and make HTTP requests: +// +// func TestMyEndpoint(t *testing.T) { +// h := testutil.NewHarness(t) +// h.Register(myRoute) +// +// ws := h.CreateWorkspace() +// rootKey := h.CreateRootKey(ws.ID, "api.keys.create") +// +// resp := testutil.CallRoute[RequestType, ResponseType](h, myRoute, headers, req) +// require.Equal(t, 200, resp.Status) +// } +// +// For deployment-related tests, use [Harness.CreateTestDeploymentSetup] to create +// a workspace, project, environment, and root key in one call. +// +// # Container Dependencies +// +// The harness starts MySQL, Redis, ClickHouse, and MinIO (S3-compatible) containers. +// These are shared across tests within a package for speed, but each test gets fresh +// database state through the seeder. Container startup is parallelized to minimize +// test latency. +package testutil diff --git a/svc/api/internal/testutil/http.go b/svc/api/internal/testutil/http.go index 8178949a85..f9481585e9 100644 --- a/svc/api/internal/testutil/http.go +++ b/svc/api/internal/testutil/http.go @@ -40,9 +40,17 @@ import ( "github.com/unkeyed/unkey/svc/api/internal/testutil/seed" ) +// Harness provides a complete integration test environment with real dependencies. +// It manages Docker containers for MySQL, Redis, ClickHouse, and S3, seeds baseline +// test data, and exposes all services needed to test API endpoints. +// +// The exported fields provide direct access to services when tests need to verify +// side effects or set up complex scenarios beyond what the helper methods offer. type Harness struct { t *testing.T + // Clock is a controllable clock for time-dependent tests. Advancing the clock + // affects rate limiting windows, token expiration, and other time-based behavior. Clock *clock.TestClock srv *zen.Server @@ -50,6 +58,8 @@ type Harness struct { middleware []zen.Middleware + // DB provides direct database access for verifying side effects or setting up + // test data that the seeder methods don't cover. DB db.Database Caches caches.Caches Logger logging.Logger @@ -64,6 +74,10 @@ type Harness struct { seeder *seed.Seeder } +// NewHarness creates a fully initialized test harness with all dependencies started. +// Container startup is parallelized, and the database is seeded with baseline data +// including a root workspace and key space. The harness is tied to the test lifecycle +// and containers are cleaned up when the test completes. func NewHarness(t *testing.T) *Harness { clk := clock.NewTestClock() logger := logging.New() @@ -232,8 +246,9 @@ func NewHarness(t *testing.T) *Harness { return &h } -// Register registers a route with the harness. -// You can override the middleware by passing a list of middleware. +// Register adds a route to the test server with the standard middleware stack. +// Pass custom middleware to override the defaults, which include observability, +// logging, error handling, and validation. Passing no middleware uses the defaults. func (h *Harness) Register(route zen.Route, middleware ...zen.Middleware) { if len(middleware) == 0 { middleware = h.middleware @@ -245,48 +260,63 @@ func (h *Harness) Register(route zen.Route, middleware ...zen.Middleware) { ) } -// CreateRootKey creates a root key with the specified permissions +// CreateRootKey creates a root key that authorizes operations on the given workspace. +// The returned string is the raw key value for use in Authorization headers. Pass +// permission names to restrict what the key can do; omitting permissions grants no +// permissions (the key can authenticate but not authorize any operations). func (h *Harness) CreateRootKey(workspaceID string, permissions ...string) string { return h.seeder.CreateRootKey(context.Background(), workspaceID, permissions...) } +// CreateWorkspace creates a new workspace with auto-generated IDs and names. func (h *Harness) CreateWorkspace() db.Workspace { return h.seeder.CreateWorkspace(context.Background()) } +// CreateApi creates an API with the specified configuration. The API's key space +// is created automatically. See [seed.CreateApiRequest] for available options. func (h *Harness) CreateApi(req seed.CreateApiRequest) db.Api { return h.seeder.CreateAPI(context.Background(), req) } +// CreateKey creates a key in the specified key space with optional permissions, +// roles, rate limits, and other configuration. Returns both the key ID (for database +// lookups) and the raw key value (for authentication). See [seed.CreateKeyRequest]. func (h *Harness) CreateKey(req seed.CreateKeyRequest) seed.CreateKeyResponse { return h.seeder.CreateKey(context.Background(), req) } +// CreateIdentity creates an identity with optional rate limits attached. func (h *Harness) CreateIdentity(req seed.CreateIdentityRequest) db.Identity { return h.seeder.CreateIdentity(context.Background(), req) } +// CreateRatelimit creates a rate limit configuration attached to either a key or identity. func (h *Harness) CreateRatelimit(req seed.CreateRatelimitRequest) db.Ratelimit { return h.seeder.CreateRatelimit(context.Background(), req) } +// CreateRole creates a role with optional permissions attached. func (h *Harness) CreateRole(req seed.CreateRoleRequest) db.Role { return h.seeder.CreateRole(context.Background(), req) } +// CreatePermission creates a permission that can be attached to keys or roles. func (h *Harness) CreatePermission(req seed.CreatePermissionRequest) db.Permission { return h.seeder.CreatePermission(context.Background(), req) } +// CreateProject creates a project within a workspace. func (h *Harness) CreateProject(req seed.CreateProjectRequest) db.Project { return h.seeder.CreateProject(context.Background(), req) } +// CreateEnvironment creates an environment within a project. func (h *Harness) CreateEnvironment(req seed.CreateEnvironmentRequest) db.Environment { return h.seeder.CreateEnvironment(h.t.Context(), req) } -// DeploymentTestSetup contains all resources needed for deployment tests +// DeploymentTestSetup contains all resources needed for deployment tests. type DeploymentTestSetup struct { Workspace db.Workspace RootKey string @@ -294,7 +324,8 @@ type DeploymentTestSetup struct { Environment db.Environment } -// CreateTestDeploymentSetupOptions allows customization of the test setup +// CreateTestDeploymentSetupOptions configures the resources created by +// [Harness.CreateTestDeploymentSetup]. type CreateTestDeploymentSetupOptions struct { ProjectName string ProjectSlug string @@ -303,7 +334,11 @@ type CreateTestDeploymentSetupOptions struct { Permissions []string } -// CreateTestDeploymentSetup creates workspace, root key, project, and environment with sensible defaults +// CreateTestDeploymentSetup creates a complete deployment test environment with a +// workspace, root key, project, and environment. This is a convenience method for +// tests that need all these resources together. Pass [CreateTestDeploymentSetupOptions] +// to customize names, slugs, or skip environment creation. Defaults to project name +// "test-project", slugs "production", and full permissions unless specified. func (h *Harness) CreateTestDeploymentSetup(opts ...CreateTestDeploymentSetupOptions) DeploymentTestSetup { h.t.Helper() @@ -371,6 +406,7 @@ func (h *Harness) CreateTestDeploymentSetup(opts ...CreateTestDeploymentSetupOpt } } +// SetupAnalyticsOption configures analytics settings for [Harness.SetupAnalytics]. type SetupAnalyticsOption func(*setupAnalyticsConfig) type setupAnalyticsConfig struct { @@ -383,36 +419,49 @@ type setupAnalyticsConfig struct { RetentionDays int32 } +// WithMaxQueryResultRows sets the maximum number of rows a query can return. +// Default is 10,000,000. func WithMaxQueryResultRows(rows int32) SetupAnalyticsOption { return func(c *setupAnalyticsConfig) { c.MaxQueryResultRows = rows } } +// WithMaxQueryMemoryBytes sets the maximum memory a query can use. +// Default is 1,000,000,000 (1GB). func WithMaxQueryMemoryBytes(bytes int64) SetupAnalyticsOption { return func(c *setupAnalyticsConfig) { c.MaxQueryMemoryBytes = bytes } } +// WithMaxQueriesPerWindow sets the maximum queries allowed per quota window. +// Default is 1,000. func WithMaxQueriesPerWindow(queries int32) SetupAnalyticsOption { return func(c *setupAnalyticsConfig) { c.MaxQueriesPerWindow = queries } } +// WithMaxExecutionTimePerWindow sets the maximum total execution time per quota window. +// Default is 1,800 seconds (30 minutes). func WithMaxExecutionTimePerWindow(seconds int32) SetupAnalyticsOption { return func(c *setupAnalyticsConfig) { c.MaxExecutionTimePerWindow = seconds } } +// WithRetentionDays sets how long analytics data is retained. Default is 30 days. func WithRetentionDays(days int32) SetupAnalyticsOption { return func(c *setupAnalyticsConfig) { c.RetentionDays = days } } +// SetupAnalytics configures a ClickHouse user and analytics settings for a workspace. +// This creates the user in ClickHouse, encrypts the password in the vault, and stores +// the settings in the database. Use the With* options to customize query limits and +// retention. Tests that query analytics data must call this before making requests. func (h *Harness) SetupAnalytics(workspaceID string, opts ...SetupAnalyticsOption) { ctx := context.Background() @@ -486,10 +535,14 @@ func (h *Harness) SetupAnalytics(workspaceID string, opts ...SetupAnalyticsOptio require.NoError(h.t, err) } +// Resources returns the baseline seed data created during harness initialization. +// This includes the root workspace, root key space, and user workspace that exist +// before any test-specific data is created. func (h *Harness) Resources() seed.Resources { return h.seeder.Resources } +// TestResponse wraps an HTTP response with typed body parsing for test assertions. type TestResponse[TBody any] struct { Status int Headers http.Header @@ -497,6 +550,10 @@ type TestResponse[TBody any] struct { RawBody string } +// CallRaw executes an HTTP request against the test server and returns the parsed +// response. Use this when you need full control over the request, such as setting +// path parameters or custom headers. The response body is JSON-unmarshaled into the +// type parameter. func CallRaw[Res any](h *Harness, req *http.Request) TestResponse[Res] { rr := httptest.NewRecorder() @@ -519,6 +576,10 @@ func CallRaw[Res any](h *Harness, req *http.Request) TestResponse[Res] { return res } +// CallRoute executes a request against a registered route and returns the typed +// response. The request body is JSON-encoded from req, and the response is unmarshaled +// into Res. This is the primary way to test API endpoints. Pass nil headers to use +// an empty header set. func CallRoute[Req any, Res any](h *Harness, route zen.Route, headers http.Header, req Req) TestResponse[Res] { h.t.Helper() @@ -555,7 +616,9 @@ func CallRoute[Req any, Res any](h *Harness, route zen.Route, headers http.Heade return res } -// UnmarshalBody is a helper function to unmarshal the response body +// UnmarshalBody decodes a JSON response body into the provided pointer. This is +// useful when working directly with httptest.ResponseRecorder rather than using +// [CallRoute] or [CallRaw]. func UnmarshalBody[Body any](t *testing.T, r *httptest.ResponseRecorder, body *Body) { err := json.Unmarshal(r.Body.Bytes(), &body) require.NoError(t, err) diff --git a/svc/api/internal/testutil/seed/BUILD.bazel b/svc/api/internal/testutil/seed/BUILD.bazel index 5351a230d2..b0dc6681a8 100644 --- a/svc/api/internal/testutil/seed/BUILD.bazel +++ b/svc/api/internal/testutil/seed/BUILD.bazel @@ -2,7 +2,10 @@ load("@rules_go//go:def.bzl", "go_library") go_library( name = "seed", - srcs = ["seed.go"], + srcs = [ + "doc.go", + "seed.go", + ], importpath = "github.com/unkeyed/unkey/svc/api/internal/testutil/seed", visibility = ["//visibility:public"], deps = [ diff --git a/svc/api/internal/testutil/seed/doc.go b/svc/api/internal/testutil/seed/doc.go new file mode 100644 index 0000000000..e4b48cdf89 --- /dev/null +++ b/svc/api/internal/testutil/seed/doc.go @@ -0,0 +1,42 @@ +// Package seed provides database seeding utilities for integration tests. +// +// This package handles creating test data in the database with proper relationships +// between entities. It generates unique IDs, handles foreign key constraints, and +// provides sensible defaults while allowing full customization. +// +// # Key Types +// +// [Seeder] is the main type that provides methods to create test entities. It holds +// a database connection and vault service for encrypting keys. [Resources] contains +// the baseline entities created during initial seeding. +// +// # Usage +// +// The seeder is typically used through [testutil.Harness], which wraps it with +// context management. For direct usage: +// +// seeder := seed.New(t, database, vaultService) +// seeder.Seed(ctx) // Creates baseline data +// +// api := seeder.CreateAPI(ctx, seed.CreateApiRequest{ +// WorkspaceID: seeder.Resources.UserWorkspace.ID, +// }) +// +// key := seeder.CreateKey(ctx, seed.CreateKeyRequest{ +// WorkspaceID: api.WorkspaceID, +// KeySpaceID: api.KeyAuthID.String, +// Permissions: []seed.CreatePermissionRequest{{Name: "read", Slug: "read", WorkspaceID: api.WorkspaceID}}, +// }) +// +// # Entity Relationships +// +// The seeder handles cascading entity creation. For example, [CreateKeyRequest] can +// include permissions, roles, and rate limits which are created and linked automatically. +// Similarly, [CreateRoleRequest] can include permissions to attach. +// +// # Request Types +// +// Each Create* method has a corresponding request struct that documents all available +// options. Required fields are typically WorkspaceID and identifiers. Optional fields +// use pointers to distinguish between "not set" and "set to zero value". +package seed diff --git a/svc/api/internal/testutil/seed/seed.go b/svc/api/internal/testutil/seed/seed.go index 75daaf5751..23a55425b6 100644 --- a/svc/api/internal/testutil/seed/seed.go +++ b/svc/api/internal/testutil/seed/seed.go @@ -19,7 +19,9 @@ import ( "github.com/unkeyed/unkey/pkg/vault" ) -// Resources represents seed data created for tests +// Resources contains the baseline entities created during [Seeder.Seed]. These +// represent the "system" workspace used for root keys and a user workspace for +// test-specific data. type Resources struct { RootWorkspace db.Workspace RootKeySpace db.KeyAuth @@ -27,7 +29,8 @@ type Resources struct { UserWorkspace db.Workspace } -// Seeder provides methods to seed test data +// Seeder provides methods to create test entities in the database. It ensures proper +// foreign key relationships and generates unique IDs for all entities. type Seeder struct { t *testing.T DB db.Database @@ -35,7 +38,8 @@ type Seeder struct { Resources Resources } -// New creates a new Seeder instance +// New creates a Seeder with the given database and vault service. Call [Seeder.Seed] +// after creation to populate baseline data. func New(t *testing.T, database db.Database, vault *vault.Service) *Seeder { return &Seeder{ t: t, @@ -45,6 +49,8 @@ func New(t *testing.T, database db.Database, vault *vault.Service) *Seeder { } } +// CreateWorkspace creates a new workspace with auto-generated IDs for the workspace, +// org, name, and slug. func (s *Seeder) CreateWorkspace(ctx context.Context) db.Workspace { params := db.InsertWorkspaceParams{ ID: uid.New("test_ws"), @@ -63,7 +69,9 @@ func (s *Seeder) CreateWorkspace(ctx context.Context) db.Workspace { return ws } -// Seed initializes the database with test data +// Seed initializes the database with baseline test data. This creates a root workspace +// (for issuing root keys), a root API with its key space, and a user workspace for +// test-specific entities. The created resources are stored in [Seeder.Resources]. func (s *Seeder) Seed(ctx context.Context) { s.Resources.UserWorkspace = s.CreateWorkspace(ctx) s.Resources.RootWorkspace = s.CreateWorkspace(ctx) @@ -81,6 +89,7 @@ func (s *Seeder) Seed(ctx context.Context) { s.Resources.RootKeySpace = keySpace } +// CreateApiRequest configures the API to create. type CreateApiRequest struct { WorkspaceID string IpWhitelist string @@ -91,6 +100,9 @@ type CreateApiRequest struct { DefaultBytes *int32 } +// CreateAPI creates an API and its associated key space. The key space is created +// first since the API references it. Returns the created API which includes the +// KeyAuthID linking to the key space. func (s *Seeder) CreateAPI(ctx context.Context, req CreateApiRequest) db.Api { keySpaceID := uid.New(uid.KeySpacePrefix) err := db.Query.InsertKeySpace(ctx, s.DB.RW(), db.InsertKeySpaceParams{ @@ -121,6 +133,7 @@ func (s *Seeder) CreateAPI(ctx context.Context, req CreateApiRequest) db.Api { return api } +// CreateProjectRequest configures the project to create. type CreateProjectRequest struct { ID string WorkspaceID string @@ -131,6 +144,8 @@ type CreateProjectRequest struct { DeleteProtection bool } +// CreateProject creates a project within a workspace. The ID should be generated +// with [uid.New] using [uid.ProjectPrefix]. func (h *Seeder) CreateProject(ctx context.Context, req CreateProjectRequest) db.Project { err := db.Query.InsertProject(ctx, h.DB.RW(), db.InsertProjectParams{ ID: req.ID, @@ -166,6 +181,7 @@ func (h *Seeder) CreateProject(ctx context.Context, req CreateProjectRequest) db } } +// CreateEnvironmentRequest configures the environment to create. type CreateEnvironmentRequest struct { ID string WorkspaceID string @@ -176,6 +192,8 @@ type CreateEnvironmentRequest struct { DeleteProtection bool } +// CreateEnvironment creates an environment within a project. If SentinelConfig is +// nil or empty, it defaults to "{}". func (s *Seeder) CreateEnvironment(ctx context.Context, req CreateEnvironmentRequest) db.Environment { sentinelConfig := []byte("{}") if len(req.SentinelConfig) > 0 { @@ -211,7 +229,10 @@ func (s *Seeder) CreateEnvironment(ctx context.Context, req CreateEnvironmentReq } } -// CreateRootKey creates a root key with optional permissions +// CreateRootKey creates a root key that authorizes operations on the specified +// workspace. The key is created in the root key space (from baseline seed data). +// Pass permission names to grant; if a permission already exists, it reuses the +// existing one. Returns the raw key value for use in Authorization headers. func (s *Seeder) CreateRootKey(ctx context.Context, workspaceID string, permissions ...string) string { key := uid.New("test_root_key") @@ -277,6 +298,8 @@ func (s *Seeder) CreateRootKey(ctx context.Context, workspaceID string, permissi return key } +// CreateKeyRequest configures the key to create. WorkspaceID and KeySpaceID are +// required. The key is enabled by default unless Disabled is true. type CreateKeyRequest struct { Disabled bool WorkspaceID string @@ -287,7 +310,7 @@ type CreateKeyRequest struct { Expires *time.Time Name *string Deleted bool - ForWorkspaceID *string // For creating root keys that target a specific workspace + ForWorkspaceID *string Recoverable bool @@ -299,6 +322,8 @@ type CreateKeyRequest struct { Ratelimits []CreateRatelimitRequest } +// CreateKeyResponse contains the created key's ID and raw value, plus the IDs of +// any roles and permissions that were created and attached. type CreateKeyResponse struct { KeyID string Key string @@ -307,6 +332,10 @@ type CreateKeyResponse struct { PermissionIds []string } +// CreateKey creates a key with the specified configuration. If Permissions, Roles, +// or Ratelimits are provided, they are created and linked to the key. If Deleted +// is true, the key is soft-deleted after creation. If Recoverable is true and a +// Vault service is configured, the key is encrypted and stored for recovery. func (s *Seeder) CreateKey(ctx context.Context, req CreateKeyRequest) CreateKeyResponse { keyID := uid.New(uid.KeyPrefix) key := uid.New("") @@ -399,6 +428,8 @@ func (s *Seeder) CreateKey(ctx context.Context, req CreateKeyRequest) CreateKeyR return res } +// CreateRatelimitRequest configures the rate limit to create. Either IdentityID or +// KeyID must be set to attach the rate limit to an entity. type CreateRatelimitRequest struct { Name string WorkspaceID string @@ -409,6 +440,9 @@ type CreateRatelimitRequest struct { KeyID *string } +// CreateRatelimit creates a rate limit attached to either a key or identity. The +// rate limit allows Limit requests per Duration (in milliseconds). If AutoApply is +// true, the rate limit is automatically applied during key verification. func (s *Seeder) CreateRatelimit(ctx context.Context, req CreateRatelimitRequest) db.Ratelimit { ratelimitID := uid.New(uid.RatelimitPrefix) createdAt := time.Now().UnixMilli() @@ -458,6 +492,8 @@ func (s *Seeder) CreateRatelimit(ctx context.Context, req CreateRatelimitRequest } } +// CreateIdentityRequest configures the identity to create. ExternalID and +// WorkspaceID are required. type CreateIdentityRequest struct { WorkspaceID string ExternalID string @@ -465,6 +501,9 @@ type CreateIdentityRequest struct { Ratelimits []CreateRatelimitRequest } +// CreateIdentity creates an identity with optional rate limits attached. If Meta +// is nil or empty, it defaults to "{}". Any rate limits in Ratelimits are created +// and linked to this identity. func (s *Seeder) CreateIdentity(ctx context.Context, req CreateIdentityRequest) db.Identity { metaBytes := []byte("{}") if len(req.Meta) > 0 { @@ -503,6 +542,7 @@ func (s *Seeder) CreateIdentity(ctx context.Context, req CreateIdentityRequest) } } +// CreateRoleRequest configures the role to create. Name and WorkspaceID are required. type CreateRoleRequest struct { Name string Description *string @@ -511,6 +551,8 @@ type CreateRoleRequest struct { Permissions []CreatePermissionRequest } +// CreateRole creates a role with optional permissions attached. Any permissions in +// Permissions are created and linked to this role. func (s *Seeder) CreateRole(ctx context.Context, req CreateRoleRequest) db.Role { require.NoError(s.t, assert.NotEmpty(req.WorkspaceID, "Role WorkspaceID must be set")) require.NoError(s.t, assert.NotEmpty(req.Name, "Role Name must be set")) @@ -549,6 +591,8 @@ func (s *Seeder) CreateRole(ctx context.Context, req CreateRoleRequest) db.Role } } +// CreatePermissionRequest configures the permission to create. Name, Slug, and +// WorkspaceID are required. type CreatePermissionRequest struct { Name string Slug string @@ -556,6 +600,7 @@ type CreatePermissionRequest struct { WorkspaceID string } +// CreatePermission creates a permission that can be attached to keys or roles. func (s *Seeder) CreatePermission(ctx context.Context, req CreatePermissionRequest) db.Permission { require.NoError(s.t, assert.NotEmpty(req.WorkspaceID, "Permission WorkspaceID must be set")) require.NoError(s.t, assert.NotEmpty(req.Name, "Permission Name must be set")) diff --git a/svc/api/routes/register.go b/svc/api/routes/register.go index 28533e62fd..e428f9d494 100644 --- a/svc/api/routes/register.go +++ b/svc/api/routes/register.go @@ -66,8 +66,19 @@ import ( zen "github.com/unkeyed/unkey/pkg/zen" ) -// here we register all of the routes. -// this function runs during startup. +// Register wires up all API route handlers with their dependencies and middleware +// chains. This function runs once during server startup; routes cannot be added +// or removed after initialization. +// +// The function applies a default middleware stack to most routes: panic recovery, +// observability (tracing), metrics collection to ClickHouse, structured logging, +// error handling, a one-minute request timeout, and request validation. Internal +// endpoints (chproxy, pprof) use reduced middleware stacks appropriate to their +// needs. +// +// Conditional routes are registered based on [Services] configuration. Chproxy +// endpoints require a non-empty ChproxyToken, and pprof endpoints require +// PprofEnabled to be true. func Register(srv *zen.Server, svc *Services, info zen.InstanceInfo) { withObservability := zen.WithObservability() withMetrics := zen.WithMetrics(svc.ClickHouse, info) diff --git a/svc/api/routes/services.go b/svc/api/routes/services.go index e728b19410..f01bbf75e8 100644 --- a/svc/api/routes/services.go +++ b/svc/api/routes/services.go @@ -15,21 +15,67 @@ import ( "github.com/unkeyed/unkey/pkg/zen/validation" ) +// Services aggregates all dependencies required by API route handlers. It acts +// as a dependency injection container, allowing [Register] to wire up handlers +// without exposing individual dependencies throughout the codebase. +// +// This struct is constructed during server startup and passed to [Register]. +// All fields except the optional configuration fields (ChproxyToken, Pprof*) +// must be non-nil for the API to function correctly. type Services struct { - Logger logging.Logger - Database db.Database - Keys keys.KeyService - ClickHouse clickhouse.ClickHouse - Validator *validation.Validator - Ratelimit ratelimit.Service - Auditlogs auditlogs.AuditLogService - Caches caches.Caches - Vault *vault.Service - ChproxyToken string - CtrlDeploymentClient ctrlv1connect.DeploymentServiceClient - PprofEnabled bool - PprofUsername string - PprofPassword string - UsageLimiter usagelimiter.Service + // Logger provides structured logging for all route handlers. + Logger logging.Logger + + // Database provides access to the primary MySQL database for persistence. + Database db.Database + + // Keys handles API key authentication, verification, and authorization + // checks for incoming requests. + Keys keys.KeyService + + // ClickHouse stores analytics data including verification events, + // rate limit events, and request metrics. + ClickHouse clickhouse.ClickHouse + + // Validator performs request payload validation using struct tags. + Validator *validation.Validator + + // Ratelimit provides distributed rate limiting across API requests. + Ratelimit ratelimit.Service + + // Auditlogs records security-relevant events for compliance and debugging. + Auditlogs auditlogs.AuditLogService + + // Caches holds various cache instances for performance optimization, + // including API metadata, key data, and rate limit namespace caches. + Caches caches.Caches + + // Vault provides encrypted storage for sensitive key material. + Vault *vault.Service + + // ChproxyToken authenticates requests to internal chproxy endpoints. + // When empty, chproxy routes are not registered. + ChproxyToken string + + // CtrlDeploymentClient communicates with the control plane for deployment + // operations like creating and managing deployments. + CtrlDeploymentClient ctrlv1connect.DeploymentServiceClient + + // PprofEnabled controls whether pprof profiling endpoints are registered. + PprofEnabled bool + + // PprofUsername is the HTTP basic auth username for pprof endpoints. + // Required when PprofEnabled is true. + PprofUsername string + + // PprofPassword is the HTTP basic auth password for pprof endpoints. + // Required when PprofEnabled is true. + PprofPassword string + + // UsageLimiter tracks and enforces usage limits on API keys. + UsageLimiter usagelimiter.Service + + // AnalyticsConnectionManager manages connections to analytics backends + // for retrieving verification and usage data. AnalyticsConnectionManager analytics.ConnectionManager } diff --git a/svc/api/routes/v2_deploy_generate_upload_url/BUILD.bazel b/svc/api/routes/v2_deploy_generate_upload_url/BUILD.bazel index dc35d482aa..d1370e7822 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/BUILD.bazel +++ b/svc/api/routes/v2_deploy_generate_upload_url/BUILD.bazel @@ -2,7 +2,10 @@ load("@rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "v2_deploy_generate_upload_url", - srcs = ["handler.go"], + srcs = [ + "doc.go", + "handler.go", + ], importpath = "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_generate_upload_url", visibility = ["//visibility:public"], deps = [ diff --git a/svc/api/routes/v2_deploy_generate_upload_url/doc.go b/svc/api/routes/v2_deploy_generate_upload_url/doc.go new file mode 100644 index 0000000000..71d655d10b --- /dev/null +++ b/svc/api/routes/v2_deploy_generate_upload_url/doc.go @@ -0,0 +1,31 @@ +// Package handler implements the POST /v2/deploy.generateUploadUrl endpoint +// for generating pre-signed S3 URLs used to upload deployment build contexts. +// +// This endpoint is part of the deployment workflow where clients upload their +// build artifacts to S3 before triggering a deployment. The handler delegates +// URL generation to the control plane service via gRPC, ensuring all upload +// URLs are centrally managed and consistently configured. +// +// # Authentication and Authorization +// +// Requests must include a valid root key in the Authorization header. The root +// key must have either wildcard project permission (project.*.generate_upload_url) +// or specific permission for the target project (project..generate_upload_url). +// +// The handler also verifies that the requested project belongs to the workspace +// associated with the root key. Requests for projects in other workspaces return +// 404 to avoid leaking information about project existence. +// +// # Request Flow +// +// The handler validates the root key, binds and validates the request body, +// checks RBAC permissions, verifies project ownership, then calls the control +// plane to generate the upload URL. On success, it returns both the pre-signed +// upload URL and the build context path where the uploaded artifact will be stored. +// +// # Error Responses +// +// The handler returns 400 for missing or invalid request body, 401 for invalid +// root keys, 403 for insufficient permissions, and 404 when the project does +// not exist or belongs to a different workspace. +package handler diff --git a/svc/api/routes/v2_deploy_generate_upload_url/handler.go b/svc/api/routes/v2_deploy_generate_upload_url/handler.go index f6955ed4f7..5329b349f8 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/handler.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/handler.go @@ -19,10 +19,18 @@ import ( ) type ( - Request = openapi.V2DeployGenerateUploadUrlRequestBody + // Request is the request body for generating an upload URL, containing the + // target project ID. Aliased from [openapi.V2DeployGenerateUploadUrlRequestBody]. + Request = openapi.V2DeployGenerateUploadUrlRequestBody + + // Response is the response body containing the pre-signed upload URL and + // build context path. Aliased from [openapi.V2DeployGenerateUploadUrlResponseBody]. Response = openapi.V2DeployGenerateUploadUrlResponseBody ) +// Handler generates pre-signed S3 upload URLs for deployment build contexts. +// It validates authentication, checks RBAC permissions, verifies project ownership, +// and delegates URL generation to the control plane service. type Handler struct { Logger logging.Logger DB db.Database @@ -30,14 +38,22 @@ type Handler struct { CtrlClient ctrlv1connect.DeploymentServiceClient } +// Path returns the URL path for this endpoint. func (h *Handler) Path() string { return "/v2/deploy.generateUploadUrl" } +// Method returns the HTTP method for this endpoint. func (h *Handler) Method() string { return "POST" } +// Handle processes a request to generate a pre-signed S3 upload URL. It +// authenticates via root key, verifies the caller has generate_upload_url +// permission on the project, confirms the project belongs to the caller's +// workspace, then returns an upload URL from the control plane. Returns 400 +// for invalid input, 401 for invalid root key, 403 for missing permissions, +// or 404 if the project does not exist in the caller's workspace. func (h *Handler) Handle(ctx context.Context, s *zen.Session) error { auth, emit, err := h.Keys.GetRootKey(ctx, s) defer emit() diff --git a/svc/ctrl/api/BUILD.bazel b/svc/ctrl/api/BUILD.bazel index 8c5bf93094..f2e244b85d 100644 --- a/svc/ctrl/api/BUILD.bazel +++ b/svc/ctrl/api/BUILD.bazel @@ -4,6 +4,7 @@ go_library( name = "api", srcs = [ "config.go", + "doc.go", "run.go", ], importpath = "github.com/unkeyed/unkey/svc/ctrl/api", diff --git a/svc/ctrl/api/config.go b/svc/ctrl/api/config.go index 925539cbd7..136a31dd7a 100644 --- a/svc/ctrl/api/config.go +++ b/svc/ctrl/api/config.go @@ -155,8 +155,18 @@ type RegistryConfig struct { Password string } +// VaultConfig holds configuration for HashiCorp Vault integration. +// +// Vault is used for secret management and encryption key storage. +// The control plane uses Vault to securely store and retrieve +// sensitive configuration data for deployed applications. type VaultConfig struct { - Url string + // Url is the Vault server address including protocol. + // Example: "https://vault.example.com:8200". + Url string + + // Token is the Vault authentication token. + // Must have appropriate policies for secret operations. Token string } @@ -174,6 +184,8 @@ type Config struct { // Used for control plane deployment and sentinel image configuration. Image string + // Region is the geographic region where this control plane instance runs. + // Used for logging, tracing, and region-aware routing decisions. Region string // HttpPort defines the HTTP port for the control plane server. @@ -185,14 +197,10 @@ type Config struct { // on all interfaces (0.0.0.0) on the specified port. PrometheusPort int - // --- Database configuration --- - // DatabasePrimary is the primary database connection string. // Used for both read and write operations to persistent storage. DatabasePrimary string - // --- OpenTelemetry configuration --- - // OtelEnabled enables sending telemetry data to collector endpoint. // When true, enables metrics, traces, and structured logs. OtelEnabled bool @@ -213,10 +221,8 @@ type Config struct { // Use clock.RealClock{} for production deployments. Clock clock.Clock - // --- Vault Configuration --- - + // Vault configures HashiCorp Vault integration for secret management. Vault VaultConfig - // --- ACME Configuration --- // Restate configures workflow engine integration. // Enables asynchronous deployment and certificate renewal workflows. @@ -226,8 +232,6 @@ type Config struct { // Used by both Depot and Docker build backends. BuildS3 S3Config - // --- Sentinel Configuration --- - // SentinelImage is the container image used for new sentinel deployments. // Overrides default sentinel image with custom build or registry. SentinelImage string @@ -239,13 +243,9 @@ type Config struct { // Validate checks the configuration for required fields and logical consistency. // -// This method performs comprehensive validation of all configuration sections -// including build backend, ACME providers, database connections, and -// required credentials. It ensures that conditional configuration -// (like ACME providers) has all necessary dependencies. -// -// Returns an error if required fields are missing, invalid, or inconsistent. -// Provides detailed error messages to help identify configuration issues. +// Currently this method performs no validation and always returns nil. Future +// implementations should validate required fields like DatabasePrimary, HttpPort, +// and conditional dependencies between configuration sections. func (c Config) Validate() error { return nil } diff --git a/svc/ctrl/api/doc.go b/svc/ctrl/api/doc.go new file mode 100644 index 0000000000..8cb1891491 --- /dev/null +++ b/svc/ctrl/api/doc.go @@ -0,0 +1,52 @@ +// Package api provides the control plane HTTP/2 server for Unkey's distributed infrastructure. +// +// The control plane coordinates deployment workflows, certificate management, and cluster +// operations across the Unkey platform. It exposes Connect RPC services over HTTP/2 and +// integrates with Restate for durable workflow execution. +// +// # Architecture +// +// The control plane sits at the center of Unkey's infrastructure, coordinating between: +// - Sentinel instances that run customer workloads +// - Restate for durable async workflow execution +// - S3-compatible storage for build artifacts +// - ACME providers for automatic TLS certificates +// +// # Services +// +// The server exposes several Connect RPC services: +// +// - [ctrl.Ctrl] - Core control plane operations +// - [deployment.Deployment] - Application deployment workflows +// - [acme.Acme] - ACME certificate management and HTTP-01 challenges +// - [openapi.OpenApi] - OpenAPI specification management +// - [cluster.Cluster] - Cluster coordination and sentinel management +// +// # Usage +// +// Configure and start the control plane: +// +// cfg := api.Config{ +// InstanceID: "ctrl-1", +// HttpPort: 8080, +// DatabasePrimary: "postgres://...", +// Clock: clock.RealClock{}, +// Restate: api.RestateConfig{ +// URL: "http://restate:8080", +// AdminURL: "http://restate:9070", +// }, +// } +// if err := api.Run(ctx, cfg); err != nil { +// log.Fatal(err) +// } +// +// The server supports both HTTP/2 cleartext (h2c) for development and TLS for production. +// When [Config.TLSConfig] is set, the server uses HTTPS; otherwise it uses h2c to allow +// HTTP/2 without TLS. +// +// # Shutdown +// +// The [Run] function handles graceful shutdown when the provided context is cancelled. +// All active connections are drained, database connections closed, and telemetry flushed +// before the function returns. +package api diff --git a/svc/ctrl/doc.go b/svc/ctrl/doc.go index 8be2b6ad05..391acb26ec 100644 --- a/svc/ctrl/doc.go +++ b/svc/ctrl/doc.go @@ -1,91 +1,43 @@ -// Package ctrl provides the main control plane service for the unkey platform. +// Package ctrl is the root package for the Unkey control plane service. // -// This package implements the central control plane that orchestrates deployments, -// manages TLS certificates through ACME, handles build operations, and provides -// API services for the unkey ecosystem. It integrates with multiple backend -// services including Restate for workflow orchestration, vault for secrets management, -// and container registries for build operations. +// This package serves as an organizational namespace and does not contain +// executable code. The control plane implementation is split across several +// subpackages, each responsible for a distinct concern. // -// # Architecture +// # Subpackages // -// The control plane consists of several integrated components: -// - HTTP/2 Connect server for API endpoints -// - Restate workflow engine for asynchronous operations -// - Vault services for secrets and certificate encryption -// - Database layer for persistent storage -// - ACME providers for automatic TLS certificate management -// - Build backends (Depot or Docker) for container image building +// The api subpackage provides the HTTP/2 Connect server that exposes the +// control plane's public API. It handles authentication, request routing, +// and serves as the external interface for clients interacting with the +// control plane. // -// # Key Services +// The worker subpackage implements the Restate workflow engine integration. +// It hosts long-running asynchronous operations including deployment +// orchestration, TLS certificate lifecycle management via ACME, and +// container image builds. The worker registers itself with the Restate +// admin API for service discovery. // -// [Deployment Service]: Manages application deployment workflows through Restate -// [Certificate Service]: Handles ACME challenges and TLS certificate lifecycle -// [Build Service]: Orchestrates container image builds via Depot or Docker -// [Routing Service]: Manages domain assignment and traffic routing -// [Cluster Object Service]: Handles cluster metadata and bootstrapping -// [OpenAPI Service]: Provides OpenAPI specification and schema documentation -// [Ctrl Service]: Control plane health and management operations +// The services subpackage contains domain-specific service implementations +// that are called by both the API and worker layers. These include the +// deployment service for managing application deployments, the ACME service +// for certificate challenges, and the cluster service for node metadata. // -// # ACME Integration +// The db subpackage provides the database access layer, including schema +// definitions and query functions for persistent storage operations. // -// The system supports ACME challenge providers: -// - HTTP-01 challenges for regular domains -// - DNS-01 challenges through AWS Route53 for wildcard certificates +// The middleware subpackage contains HTTP middleware components used by +// the API layer, including authentication and request logging. // -// # Configuration +// The pkg subpackage holds shared utilities used across the control plane, +// including the build backend abstraction that supports both Depot and +// Docker for container image building. // -// The control plane is highly configurable through [Config] which includes: -// - Database and vault configuration for persistence -// - Registry and build backend settings for container operations -// - ACME provider configuration for certificate management -// - Restate integration for workflow orchestration -// - TLS and authentication settings for secure operation +// The workflows subpackage defines Restate workflow definitions that +// orchestrate multi-step operations with durable execution guarantees. // -// # Usage +// The proto subpackage contains Protocol Buffer definitions and generated +// code for the control plane's gRPC and Connect interfaces. // -// Basic control plane setup: -// -// cfg := ctrl.Config{ -// InstanceID: "ctrl-prod-001", -// Platform: "aws", -// Region: "us-west-2", -// HttpPort: 8080, -// DatabasePrimary: "user:pass@tcp(db:3306)/unkey", -// RegistryURL: "registry.depot.dev", -// RegistryUsername: "x-token", -// RegistryPassword: "depot-token", -// VaultMasterKeys: []string{"master-key-1"}, -// VaultS3: ctrl.S3Config{ -// URL: "https://s3.amazonaws.com", -// Bucket: "unkey-vault", -// AccessKeyID: "access-key", -// AccessKeySecret: "secret-key", -// }, -// BuildBackend: ctrl.BuildBackendDepot, -// BuildPlatform: "linux/amd64", -// Acme: ctrl.AcmeConfig{ -// Enabled: true, -// EmailDomain: "unkey.com", -// Route53: ctrl.Route53Config{ -// Enabled: true, -// AccessKeyID: "aws-key", -// SecretAccessKey: "aws-secret", -// Region: "us-east-1", -// }, -// }, -// } -// err := ctrl.Run(context.Background(), cfg) -// -// The control plane will: -// 1. Initialize all services (database, vault, build backend, etc.) -// 2. Start Restate workflow engine with all service bindings -// 3. Register with Restate admin API for service discovery -// 4. Start HTTP/2 Connect server on configured port -// 5. Handle graceful shutdown on context cancellation -// -// # Observability -// -// The control plane integrates with OpenTelemetry for metrics, traces, and structured logging. -// It exposes health endpoints and provides comprehensive monitoring of all operations -// including deployment status, certificate renewal, and build progress. +// The integration subpackage provides integration test infrastructure +// for validating the control plane's behavior against real dependencies. package ctrl diff --git a/svc/ctrl/pkg/build/build.go b/svc/ctrl/pkg/build/build.go index c767f7e67a..ba1974f1e4 100644 --- a/svc/ctrl/pkg/build/build.go +++ b/svc/ctrl/pkg/build/build.go @@ -29,22 +29,29 @@ import ( ) const ( - // Cache policy constants for Depot projects - defaultCacheKeepGB = 50 + // defaultCacheKeepGB is the maximum cache size in gigabytes for new Depot + // projects. Depot evicts least-recently-used cache entries when exceeded. + defaultCacheKeepGB = 50 + + // defaultCacheKeepDays is the maximum age in days for cached build layers. + // Layers older than this are evicted regardless of cache size. defaultCacheKeepDays = 14 ) -// CreateBuild orchestrates the container image build process using Depot. +// BuildDockerImage builds a container image using Depot and pushes it to the +// configured registry. +// +// The method retrieves or creates a Depot project for the Unkey project, +// acquires a remote build machine, and executes the build. Build progress +// is streamed to ClickHouse for observability. On success, returns the +// fully-qualified image name and Depot metadata. // -// Steps: +// Required request fields: S3Url (build context), BuildContextPath, ProjectId, +// DeploymentId, and DockerfilePath. All fields are validated; missing fields +// result in a terminal error. // -// Get or create Depot project -// Register a new build with Depot -// Acquire a build machine -// Connect to the buildkit instance -// Prepare build context and configuration -// Execute the build with status logging -// Return build metadata +// Returns a terminal error for validation failures. Other errors may be +// retried by Restate. func (s *Depot) BuildDockerImage( ctx restate.Context, req *hydrav1.BuildDockerImageRequest, @@ -162,6 +169,9 @@ func (s *Depot) BuildDockerImage( }) } +// buildSolverOptions constructs the buildkit solver configuration for a build. +// It configures the dockerfile frontend, sets the platform and context URL, +// attaches registry authentication, and configures image export with push. func (s *Depot) buildSolverOptions( platform, contextURL, dockerfilePath, imageName string, ) client.SolveOpt { @@ -199,13 +209,12 @@ func (s *Depot) buildSolverOptions( } } -// getOrCreateDepotProject retrieves or creates a Depot project for the given Unkey project. +// getOrCreateDepotProject retrieves the Depot project ID for an Unkey project, +// creating one if it doesn't exist. The mapping is persisted to the database +// so subsequent builds reuse the same Depot project and its cache. // -// Steps: -// -// Check database for existing project mapping -// Create new Depot project if not found -// Store project mapping in database +// New projects are named "unkey-{projectID}" and created in the region +// specified by [DepotConfig.ProjectRegion] with the default cache policy. func (s *Depot) getOrCreateDepotProject(ctx context.Context, unkeyProjectID string) (string, error) { project, err := db.Query.FindProjectById(ctx, s.db.RO(), unkeyProjectID) if err != nil { @@ -268,6 +277,11 @@ func (s *Depot) getOrCreateDepotProject(ctx context.Context, unkeyProjectID stri return depotProjectID, nil } +// processBuildStatus consumes build status events from buildkit and writes +// telemetry to ClickHouse. It tracks completed vertices (build steps) and +// their logs, buffering them for batch insertion. +// +// This method runs in its own goroutine and exits when statusCh is closed. func (s *Depot) processBuildStatus( statusCh <-chan *client.SolveStatus, workspaceID, projectID, deploymentID string, diff --git a/svc/ctrl/pkg/build/doc.go b/svc/ctrl/pkg/build/doc.go index 19c9da96da..b843b29a22 100644 --- a/svc/ctrl/pkg/build/doc.go +++ b/svc/ctrl/pkg/build/doc.go @@ -1,63 +1,54 @@ -// Package depot provides Depot.dev build backend integration. +// Package build provides container image building via [Depot.dev]. // -// This package implements cloud-native container builds through -// the Depot.dev platform. It provides optimized builds with -// automatic caching, parallel execution, and direct registry -// integration for production workflows. +// Unkey uses Depot for container builds because it provides isolated build +// environments with automatic caching, eliminating the need to manage buildkit +// infrastructure. Each Unkey project gets a dedicated Depot project, ensuring +// cache isolation between tenants while sharing cache within a project. // // # Architecture // -// The Depot backend integrates with: -// - Depot.dev API for build orchestration -// - ClickHouse for build telemetry and analytics -// - S3 storage for build artifact persistence -// - Registry integration for container image management +// The build service operates as a Restate workflow step within the deployment +// pipeline. When a deployment requires building from source, the deploy worker +// calls [Depot.BuildDockerImage] which: // -// # Key Features -// -// - Cloud-native builds with automatic scaling -// - Build caching for faster repeated builds -// - Parallel build execution -// - Direct registry pushing and management -// - Build artifact storage and sharing -// - Real-time build progress tracking -// - Integration with unkey platform deployment workflows +// 1. Creates or retrieves a Depot project for the Unkey project +// 2. Acquires a build machine from Depot's infrastructure +// 3. Connects to the buildkit instance on that machine +// 4. Streams build context from S3 and executes the build +// 5. Pushes the resulting image to the configured registry +// 6. Records build step telemetry to ClickHouse // // # Usage // -// Creating Depot build backend: -// -// depotBackend := depot.New(depot.Config{ -// InstanceID: "build-instance-001", -// DB: database, -// RegistryConfig: depot.RegistryConfig{ -// URL: "https://registry.depot.dev", -// Username: "x-token", -// Password: "depot-api-token", -// }, -// BuildPlatform: depot.BuildPlatform{ -// Platform: "linux/amd64", -// Architecture: "amd64", -// }, -// DepotConfig: depot.DepotConfig{ -// APIUrl: "https://api.depot.dev", -// ProjectRegion: "us-east-1", -// }, -// Clickhouse: clickhouseClient, -// Storage: buildStorage, -// Logger: logger, +// Create a Depot backend and register it with Restate: +// +// backend := build.New(build.Config{ +// InstanceID: "build-instance-001", +// DB: database, +// DepotConfig: build.DepotConfig{ +// APIUrl: "https://api.depot.dev", +// ProjectRegion: "us-east-1", +// }, +// RegistryConfig: build.RegistryConfig{ +// URL: "registry.depot.dev", +// Username: "x-token", +// Password: depotToken, +// }, +// BuildPlatform: build.BuildPlatform{ +// Platform: "linux/amd64", +// Architecture: "amd64", +// }, +// Clickhouse: clickhouseClient, +// Logger: logger, // }) // -// # Build Operations +// The backend implements [hydrav1.BuildServiceServer] and exposes +// [Depot.BuildDockerImage] as an RPC endpoint. // -// The backend implements standard BuildService interface methods: -// - CreateBuild: Start new container build -// - GenerateUploadUrl: Generate pre-signed URLs for build artifacts -// - GetBuild: Get build status and metadata -// - GetBuildLogs: Stream real-time build logs +// # Cache Policy // -// # Error Handling +// New Depot projects are created with a cache policy of 50GB retained for 14 +// days. This balances build speed (cache hits) against storage costs. // -// Provides comprehensive error handling with proper HTTP status -// codes for API communication failures and build errors. +// [Depot.dev]: https://depot.dev package build diff --git a/svc/ctrl/pkg/build/service.go b/svc/ctrl/pkg/build/service.go index 1bd94042f2..a7dbe17eff 100644 --- a/svc/ctrl/pkg/build/service.go +++ b/svc/ctrl/pkg/build/service.go @@ -1,4 +1,3 @@ -// Package depot is used to build images and store them in their registry using depot.dev. This gives us isolated and cached builds. package build import ( @@ -8,22 +7,39 @@ import ( "github.com/unkeyed/unkey/pkg/otel/logging" ) +// BuildPlatform specifies the target platform for container builds. +// Platform is the full platform string (e.g., "linux/amd64") while +// Architecture is just the architecture portion (e.g., "amd64") used +// when requesting build machines from Depot. type BuildPlatform struct { Platform string Architecture string } +// DepotConfig holds configuration for connecting to the Depot.dev API. type DepotConfig struct { - APIUrl string + // APIUrl is the base URL for the Depot API, typically "https://api.depot.dev". + APIUrl string + + // ProjectRegion determines where Depot projects are created. Build machines + // run in this region, so choose one close to your registry for faster pushes. ProjectRegion string } +// RegistryConfig holds credentials for the container registry where built +// images are pushed. The Password field is also used as the Depot API token +// for authentication. type RegistryConfig struct { URL string Username string Password string } +// Depot orchestrates container builds using the Depot.dev platform. It +// implements [hydrav1.BuildServiceServer] for integration with Restate +// workflows. +// +// Create instances with [New]. The zero value is not usable. type Depot struct { instanceID string db db.Database @@ -36,16 +52,33 @@ type Depot struct { var _ hydrav1.BuildServiceServer = (*Depot)(nil) +// Config holds all dependencies required to create a [Depot] service. +// All fields are required. type Config struct { - InstanceID string - DB db.Database - DepotConfig DepotConfig - Clickhouse clickhouse.ClickHouse // Clickhouse for telemetry + // InstanceID identifies this service instance in logs and telemetry. + InstanceID string + + // DB provides database access for reading and updating project mappings. + DB db.Database + + // DepotConfig configures the Depot API connection. + DepotConfig DepotConfig + + // Clickhouse receives build step telemetry for observability. + Clickhouse clickhouse.ClickHouse + + // RegistryConfig provides credentials for the container registry. RegistryConfig RegistryConfig - BuildPlatform BuildPlatform - Logger logging.Logger + + // BuildPlatform specifies the target platform for all builds. + BuildPlatform BuildPlatform + + // Logger is used for structured logging throughout the build process. + Logger logging.Logger } +// New creates a [Depot] service from the provided configuration. All fields +// in [Config] must be set; the function does not validate inputs. func New(cfg Config) *Depot { return &Depot{ instanceID: cfg.InstanceID, diff --git a/svc/ctrl/pkg/s3/doc.go b/svc/ctrl/pkg/s3/doc.go index 6c5397a94d..2470435877 100644 --- a/svc/ctrl/pkg/s3/doc.go +++ b/svc/ctrl/pkg/s3/doc.go @@ -1,46 +1,33 @@ -// Package storage provides S3-compatible storage for build artifacts. +// Package s3 provides pre-signed URL generation for S3-compatible object storage. // -// This package implements S3-compatible object storage for -// container build artifacts. It supports multiple S3 endpoints -// and provides pre-signed URL generation for secure artifact access. +// The package supports separate internal and external S3 endpoints, which is +// necessary when the service runs inside a Docker network but clients access +// storage from outside. For example, the service may communicate with MinIO at +// http://minio:9000 internally, while clients need URLs pointing to +// http://localhost:9000. // -// # Architecture +// # Key Types // -// The storage package provides: -// - S3 client abstraction for multiple providers -// - Pre-signed URL generation for secure artifact downloads -// - Upload functionality for build artifact storage -// - Integration with S3-compatible storage backends -// -// # Key Features -// -// - Multiple S3 provider support (AWS, MinIO, localstack, etc.) -// - Secure pre-signed URL generation with configurable TTL -// - Multipart upload support for large artifacts -// - Error handling with retry logic -// - Logging for storage operations debugging +// [S3] is the main client that implements the [Storage] interface. Create one +// with [NewS3] using [S3Config] for configuration. // // # Usage // -// Creating S3 storage: -// -// storage, err := storage.NewS3(storage.S3Config{ +// s3Client, err := s3.NewS3(s3.S3Config{ // Logger: logger, -// S3URL: "https://s3.amazonaws.com", -// S3Bucket: "build-artifacts", +// S3URL: "http://minio:9000", +// S3PresignURL: "http://localhost:9000", +// S3Bucket: "artifacts", // S3AccessKeyID: "access-key", // S3AccessKeySecret: "secret-key", // }) +// if err != nil { +// // Handle error - bucket creation or AWS config failed +// } // -// // Upload build artifact -// err = storage.Upload(ctx, "build-artifact.tar.gz", buildArtifactData) -// -// // Generate download URL -// url, err := storage.GeneratePresignedURL(ctx, "build-artifact.tar.gz", time.Hour) -// -// # Error Handling +// // Generate a URL for clients to upload an artifact +// uploadURL, err := s3Client.GenerateUploadURL(ctx, "builds/123/artifact.tar.gz", time.Hour) // -// The package provides comprehensive error handling for S3 operations -// including network failures, permission errors, and invalid -// configurations. +// // Generate a URL for clients to download an artifact +// downloadURL, err := s3Client.GenerateDownloadURL(ctx, "builds/123/artifact.tar.gz", time.Hour) package s3 diff --git a/svc/ctrl/pkg/s3/interface.go b/svc/ctrl/pkg/s3/interface.go index 052bef8497..078cd0a3bf 100644 --- a/svc/ctrl/pkg/s3/interface.go +++ b/svc/ctrl/pkg/s3/interface.go @@ -5,9 +5,15 @@ import ( "time" ) -// Storage defines the interface for object storage operations. +// Storage defines the interface for generating pre-signed URLs for object storage. +// Implementations must be safe for concurrent use. type Storage interface { + // GenerateDownloadURL returns a pre-signed URL that allows downloading the + // object at key. The URL expires after expiresIn. GenerateDownloadURL(ctx context.Context, key string, expiresIn time.Duration) (string, error) + + // GenerateUploadURL returns a pre-signed URL that allows uploading an object + // to key using HTTP PUT. The URL expires after expiresIn. GenerateUploadURL(ctx context.Context, key string, expiresIn time.Duration) (string, error) } diff --git a/svc/ctrl/pkg/s3/s3.go b/svc/ctrl/pkg/s3/s3.go index bbce746c5d..6b749b1565 100644 --- a/svc/ctrl/pkg/s3/s3.go +++ b/svc/ctrl/pkg/s3/s3.go @@ -15,21 +15,45 @@ import ( "github.com/unkeyed/unkey/pkg/otel/logging" ) +// S3 generates pre-signed URLs for S3-compatible object storage. It implements +// the [Storage] interface and is safe for concurrent use. type S3 struct { presigner *awsS3.PresignClient config S3Config logger logging.Logger } +// S3Config holds configuration for connecting to an S3-compatible storage backend. type S3Config struct { - S3URL string // Internal URL for S3 operations (e.g., http://s3:3902) - S3PresignURL string // Optional: External URL for presigned URLs when clients are outside Docker network. Defaults to S3URL. - S3Bucket string - S3AccessKeyID string + // S3URL is the endpoint URL used for S3 operations like bucket creation. + // This should be the internal network address when running in Docker. + S3URL string + + // S3PresignURL is the endpoint URL embedded in pre-signed URLs. Clients use + // this URL to access objects, so it must be reachable from outside the + // Docker network. Defaults to S3URL if empty. + S3PresignURL string + + // S3Bucket is the name of the bucket to use. The bucket is created + // automatically by [NewS3] if it does not exist. + S3Bucket string + + // S3AccessKeyID is the access key for S3 authentication. + S3AccessKeyID string + + // S3AccessKeySecret is the secret key for S3 authentication. S3AccessKeySecret string - Logger logging.Logger + + // Logger is used for logging storage operations. + Logger logging.Logger } +// NewS3 creates an S3 client configured for pre-signed URL generation. It +// creates the configured bucket if it does not already exist, ignoring the +// "BucketAlreadyOwnedByYou" error. +// +// Returns an error if the AWS configuration cannot be loaded or if bucket +// creation fails for reasons other than the bucket already existing. func NewS3(config S3Config) (*S3, error) { logger := config.Logger.With("service", "storage") @@ -110,10 +134,16 @@ func NewS3(config S3Config) (*S3, error) { }, nil } +// GenerateDownloadURL returns a pre-signed URL for downloading the object at +// key. The URL is valid for expiresIn duration and uses the S3PresignURL +// endpoint configured in [S3Config]. func (s *S3) GenerateDownloadURL(ctx context.Context, key string, expiresIn time.Duration) (string, error) { return s.presign(ctx, key, expiresIn, "GET") } +// GenerateUploadURL returns a pre-signed URL for uploading an object to key +// using HTTP PUT. The URL is valid for expiresIn duration and uses the +// S3PresignURL endpoint configured in [S3Config]. func (s *S3) GenerateUploadURL(ctx context.Context, key string, expiresIn time.Duration) (string, error) { return s.presign(ctx, key, expiresIn, "PUT") } diff --git a/svc/ctrl/services/cluster/auth.go b/svc/ctrl/services/cluster/auth.go index f3d855b7f5..ebda486eae 100644 --- a/svc/ctrl/services/cluster/auth.go +++ b/svc/ctrl/services/cluster/auth.go @@ -9,12 +9,15 @@ import ( "connectrpc.com/connect" ) +// request abstracts over Connect request types to extract HTTP headers for authentication. type request interface { Header() http.Header } -// authenticate validates the bearer token from the request's Authorization header. -// Returns a connect.CodeUnauthenticated error if the token is missing, malformed, or invalid. +// authenticate validates the bearer token from the request's Authorization header using +// constant-time comparison to prevent timing attacks. Returns connect.CodeUnauthenticated +// if the Authorization header is missing, does not start with "Bearer ", or contains an +// invalid token. func (s *Service) authenticate(req request) error { header := req.Header().Get("Authorization") diff --git a/svc/ctrl/services/cluster/doc.go b/svc/ctrl/services/cluster/doc.go index e93ba4a66c..7330a32bca 100644 --- a/svc/ctrl/services/cluster/doc.go +++ b/svc/ctrl/services/cluster/doc.go @@ -1,37 +1,43 @@ -// Package cluster implements the Connect ClusterService for synchronizing desired state to krane agents. +// Package cluster implements the Connect ClusterService for synchronizing desired state +// between the control plane and krane agents running in regional Kubernetes clusters. // // # Overview // -// Krane agents (similar to kubelets in Kubernetes) run in each region and act as managers -// for their respective Kubernetes clusters. They connect to the control plane and request -// state synchronization via the Sync RPC. The control plane streams the desired state -// for deployments and sentinels that should run in that region. +// Krane agents run in each region and manage their local Kubernetes clusters. They maintain +// long-lived streaming connections to the control plane, receiving desired state for +// deployments and sentinels via [Service.WatchDeployments] and [Service.WatchSentinels]. +// Agents report observed state back through [Service.ReportDeploymentStatus] and +// [Service.ReportSentinelStatus], enabling drift detection and health tracking. // // # State Synchronization Model // -// The synchronization uses a version-based approach: +// Synchronization uses version-based cursors for resumable streaming. Each resource +// (deployment_topology, sentinel) has a version column updated on every mutation. Agents +// track the maximum version seen and reconnect with that version to resume without +// replaying history. When version is 0 (new agent or reset), all resources are streamed +// in version order for a full bootstrap. // -// 1. Each resource (deployment_topology, sentinel) has a version column that is updated -// on every mutation via the Restate VersioningService singleton. +// The streaming RPCs poll the database every second when no new versions are available. +// Each poll fetches up to 100 resources with versions greater than the cursor. // -// 2. Krane agents track the last version they've seen. On reconnect, they request changes -// after that version. +// # Convergence Guarantees // -// 3. If version is 0 (new agent or reset), a full bootstrap is performed: all resources -// are streamed ordered by version. Stream close signals completion. +// The system achieves eventual consistency through idempotent operations: agents can +// safely apply the same state multiple times. Deletes use soft-delete semantics by +// setting desired state to archived or standby, preserving the version for streaming. +// After bootstrap, agents garbage-collect any Kubernetes resources not present in the +// stream, ensuring convergence even if messages were missed. // -// # Convergence Guarantees +// # Authentication // -// The system guarantees eventual consistency through: -// - Idempotent apply/delete operations: applying the same state multiple times is safe -// - Soft-delete semantics: "deletes" set desired_replicas=0, keeping the row with its version -// - Bootstrap + GC: after bootstrap, agents delete any k8s resources not in the stream -// - Reconnection with last-seen version: agents catch up on missed changes +// All RPCs require bearer token authentication via the Authorization header. Agents must +// also provide their region in the X-Krane-Region header for region-scoped operations. // // # Key Types // -// The main service type is [Service], which implements [ctrlv1connect.ClusterServiceHandler]. -// The primary RPCs are [Service.WatchDeployments] and [Service.WatchSentinels] for streaming -// state changes, and [Service.ReportDeploymentStatus] and [Service.ReportSentinelStatus] for -// receiving agent status updates. +// [Service] implements [ctrlv1connect.ClusterServiceHandler] with six RPCs: two streaming +// watchers ([Service.WatchDeployments], [Service.WatchSentinels]), two point queries +// ([Service.GetDesiredDeploymentState], [Service.GetDesiredSentinelState]), and two +// status reporters ([Service.ReportDeploymentStatus], [Service.ReportSentinelStatus]). +// Configuration is provided through [Config]. package cluster diff --git a/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go b/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go index 5823b880cb..86677a70c1 100644 --- a/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go +++ b/svc/ctrl/services/cluster/rpc_get_desired_deployment_state.go @@ -10,14 +10,17 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) -// GetDesiredDeploymentState returns the target state for a deployment in the caller's region. -// Krane agents use this to determine whether to apply or delete a deployment. The response -// contains either an ApplyDeployment (for running state) or DeleteDeployment (for archived -// or standby states) based on the deployment's desired_state in the database. +// GetDesiredDeploymentState returns the target state for a single deployment in the caller's +// region. This is a point query alternative to [Service.WatchDeployments] for cases where +// an agent needs to fetch state for a specific deployment rather than streaming all changes. // -// Requires bearer token authentication and the X-Krane-Region header to identify the -// requesting agent's region. Returns CodeNotFound if the deployment doesn't exist in -// the specified region, or CodeInvalidArgument if the region header is missing. +// The response contains either an ApplyDeployment (for running state) or DeleteDeployment +// (for archived or standby states) based on the deployment_topology's desired_state in the +// database. Unhandled desired states result in CodeInternal. +// +// Returns CodeUnauthenticated if bearer token is invalid, CodeInvalidArgument if the +// X-Krane-Region header is missing, CodeNotFound if no deployment exists with the given +// ID in the specified region, or CodeInternal for database errors or unhandled states. func (s *Service) GetDesiredDeploymentState(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredDeploymentStateRequest]) (*connect.Response[ctrlv1.DeploymentState], error) { if err := s.authenticate(req); err != nil { diff --git a/svc/ctrl/services/cluster/rpc_get_desired_sentinel_state.go b/svc/ctrl/services/cluster/rpc_get_desired_sentinel_state.go index 380eab265d..b40aca50b2 100644 --- a/svc/ctrl/services/cluster/rpc_get_desired_sentinel_state.go +++ b/svc/ctrl/services/cluster/rpc_get_desired_sentinel_state.go @@ -10,13 +10,17 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) -// GetDesiredSentinelState returns the target state for a sentinel resource. Krane agents -// use this to determine whether to apply or delete a sentinel. The response contains either -// an ApplySentinel (for running state) or DeleteSentinel (for archived or standby states) -// based on the sentinel's desired_state in the database. +// GetDesiredSentinelState returns the target state for a single sentinel resource. This is +// a point query alternative to [Service.WatchSentinels] for cases where an agent needs to +// fetch state for a specific sentinel rather than streaming all changes. // -// Requires bearer token authentication and the X-Krane-Region header. Returns CodeNotFound -// if the sentinel doesn't exist, or CodeInvalidArgument if the region header is missing. +// The response contains either an ApplySentinel (for running state) or DeleteSentinel +// (for archived or standby states) based on the sentinel's desired_state in the database. +// Unhandled desired states result in CodeInternal. +// +// Returns CodeUnauthenticated if bearer token is invalid, CodeInvalidArgument if the +// X-Krane-Region header is missing, CodeNotFound if no sentinel exists with the given ID, +// or CodeInternal for database errors or unhandled states. func (s *Service) GetDesiredSentinelState(ctx context.Context, req *connect.Request[ctrlv1.GetDesiredSentinelStateRequest]) (*connect.Response[ctrlv1.SentinelState], error) { if err := s.authenticate(req); err != nil { diff --git a/svc/ctrl/services/cluster/rpc_report_deployment_status.go b/svc/ctrl/services/cluster/rpc_report_deployment_status.go index 93ba67b7aa..45334aa3ab 100644 --- a/svc/ctrl/services/cluster/rpc_report_deployment_status.go +++ b/svc/ctrl/services/cluster/rpc_report_deployment_status.go @@ -12,14 +12,19 @@ import ( // ReportDeploymentStatus reconciles the observed deployment state reported by a krane agent. // This is the feedback loop for convergence: agents report what's actually running so the -// control plane can track instance health and detect drift. +// control plane can track instance health and detect drift from desired state. // -// For update requests, instances are upserted and any instances no longer reported by the -// agent are deleted (garbage collection). For delete requests, all instances for the -// deployment in that region are removed. The operation runs within a retryable transaction -// to handle transient database errors. +// The request contains either an Update or Delete change. For Update, the method upserts +// all reported instances and garbage-collects any instances in the database that were not +// included in the report. For Delete, all instances for the deployment in that region are +// removed. Both operations run within a retryable transaction to handle transient database +// errors using [db.TxRetry]. // -// Requires bearer token authentication and the X-Krane-Region header. +// Instance status is mapped from proto values to database enums via [ctrlDeploymentStatusToDbStatus]. +// Unspecified or unknown statuses default to inactive. +// +// Returns CodeUnauthenticated if bearer token is invalid. Database errors during the +// transaction are returned as-is (not wrapped in Connect error codes). func (s *Service) ReportDeploymentStatus(ctx context.Context, req *connect.Request[ctrlv1.ReportDeploymentStatusRequest]) (*connect.Response[ctrlv1.ReportDeploymentStatusResponse], error) { s.logger.Info("reporting deployment status", "req", req.Msg) @@ -114,8 +119,10 @@ func (s *Service) ReportDeploymentStatus(ctx context.Context, req *connect.Reque } -// ctrlDeploymentStatusToDbStatus maps proto instance status to database enum values. -// Unspecified or unknown statuses are treated as inactive. +// ctrlDeploymentStatusToDbStatus maps proto instance status values to database enum values. +// STATUS_PENDING maps to InstancesStatusPending, STATUS_RUNNING to InstancesStatusRunning, +// STATUS_FAILED to InstancesStatusFailed. STATUS_UNSPECIFIED and any unknown values default +// to InstancesStatusInactive. func ctrlDeploymentStatusToDbStatus(status ctrlv1.ReportDeploymentStatusRequest_Update_Instance_Status) db.InstancesStatus { switch status { case ctrlv1.ReportDeploymentStatusRequest_Update_Instance_STATUS_UNSPECIFIED: diff --git a/svc/ctrl/services/cluster/rpc_report_sentinel_status.go b/svc/ctrl/services/cluster/rpc_report_sentinel_status.go index 27096f4eed..b258c47740 100644 --- a/svc/ctrl/services/cluster/rpc_report_sentinel_status.go +++ b/svc/ctrl/services/cluster/rpc_report_sentinel_status.go @@ -11,12 +11,17 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) -// ReportSentinelStatus records the observed replica count for a sentinel as reported by a -// krane agent. This updates the available_replicas and health fields in the database, -// allowing the control plane to track which sentinels are actually running and healthy. -// A sentinel is considered healthy if it has at least one available replica. +// ReportSentinelStatus records the observed replica count and health for a sentinel as +// reported by a krane agent. This updates the available_replicas, health, and updated_at +// fields in the database, enabling the control plane to track which sentinels are actually +// running and their current health state. // -// Requires bearer token authentication and the X-Krane-Region header. +// The health proto value is mapped to database enums: HEALTH_HEALTHY to SentinelsHealthHealthy, +// HEALTH_UNHEALTHY to SentinelsHealthUnhealthy, HEALTH_PAUSED to SentinelsHealthPaused, and +// HEALTH_UNSPECIFIED to SentinelsHealthUnknown. +// +// Returns CodeUnauthenticated if bearer token is invalid, or CodeInternal if the database +// update fails. func (s *Service) ReportSentinelStatus(ctx context.Context, req *connect.Request[ctrlv1.ReportSentinelStatusRequest]) (*connect.Response[ctrlv1.ReportSentinelStatusResponse], error) { if err := s.authenticate(req); err != nil { diff --git a/svc/ctrl/services/cluster/rpc_watch_deployments.go b/svc/ctrl/services/cluster/rpc_watch_deployments.go index d94170bf3f..a830a6d04b 100644 --- a/svc/ctrl/services/cluster/rpc_watch_deployments.go +++ b/svc/ctrl/services/cluster/rpc_watch_deployments.go @@ -11,12 +11,20 @@ import ( ) // WatchDeployments streams deployment state changes from the control plane to agents. -// Each deployment controller maintains its own version cursor for resumable streaming. -// The agent applies received state to Kubernetes to converge actual state toward desired state. +// This is the primary mechanism for agents to receive desired state updates for their region. +// Agents apply received state to Kubernetes to converge actual state toward desired state. // -// This is a long-lived streaming RPC. The server polls the database for new deployment -// versions and streams them to the client. The client should track the max version seen -// and reconnect with that version to resume from where it left off. +// The stream uses version-based cursors for resumability. The client provides version_last_seen +// in the request, and the server streams all deployments with versions greater than that cursor. +// Clients should track the maximum version received and use it to reconnect without replaying +// history. When no new versions are available, the server polls the database every second. +// +// Each poll fetches up to 100 deployment topology rows ordered by version. The desired_status +// field determines whether to send an ApplyDeployment (for started/starting states) or +// DeleteDeployment (for stopped/stopping states). Rows with unhandled statuses are logged +// and skipped. +// +// Returns when the context is cancelled, or on database or stream errors. func (s *Service) WatchDeployments( ctx context.Context, req *connect.Request[ctrlv1.WatchDeploymentsRequest], @@ -61,6 +69,9 @@ func (s *Service) WatchDeployments( } } +// fetchDeploymentStates queries the database for deployment topologies in the given region +// with versions greater than afterVersion, returning up to 100 results. Rows that fail +// conversion are logged and skipped rather than failing the entire batch. func (s *Service) fetchDeploymentStates(ctx context.Context, region string, afterVersion uint64) ([]*ctrlv1.DeploymentState, error) { rows, err := db.Query.ListDeploymentTopologyByRegion(ctx, s.db.RO(), db.ListDeploymentTopologyByRegionParams{ Region: region, @@ -84,6 +95,9 @@ func (s *Service) fetchDeploymentStates(ctx context.Context, region string, afte return states, nil } +// deploymentRowToState converts a database row to a proto DeploymentState message. Returns +// a DeleteDeployment for stopped/stopping statuses and an ApplyDeployment for started/starting +// statuses. Returns (nil, nil) for unhandled statuses, which the caller should skip. func (s *Service) deploymentRowToState(row db.ListDeploymentTopologyByRegionRow) (*ctrlv1.DeploymentState, error) { switch row.DeploymentTopology.DesiredStatus { case db.DeploymentTopologyDesiredStatusStopped, db.DeploymentTopologyDesiredStatusStopping: diff --git a/svc/ctrl/services/cluster/rpc_watch_sentinels.go b/svc/ctrl/services/cluster/rpc_watch_sentinels.go index 0b99907658..5e1469a690 100644 --- a/svc/ctrl/services/cluster/rpc_watch_sentinels.go +++ b/svc/ctrl/services/cluster/rpc_watch_sentinels.go @@ -11,12 +11,19 @@ import ( ) // WatchSentinels streams sentinel state changes from the control plane to agents. -// Each sentinel controller maintains its own version cursor for resumable streaming. -// The agent applies received state to Kubernetes to converge actual state toward desired state. +// This is the primary mechanism for agents to receive desired state updates for their region. +// Agents apply received state to Kubernetes to converge actual state toward desired state. // -// This is a long-lived streaming RPC. The server polls the database for new sentinel -// versions and streams them to the client. The client should track the max version seen -// and reconnect with that version to resume from where it left off. +// The stream uses version-based cursors for resumability. The client provides version_last_seen +// in the request, and the server streams all sentinels with versions greater than that cursor. +// Clients should track the maximum version received and use it to reconnect without replaying +// history. When no new versions are available, the server polls the database every second. +// +// Each poll fetches up to 100 sentinel rows ordered by version. The desired_state field +// determines whether to send an ApplySentinel (for running state) or DeleteSentinel (for +// archived or standby states). Rows with unhandled states are logged and skipped. +// +// Returns when the context is cancelled, or on database or stream errors. func (s *Service) WatchSentinels( ctx context.Context, req *connect.Request[ctrlv1.WatchSentinelsRequest], @@ -65,6 +72,9 @@ func (s *Service) WatchSentinels( } } +// fetchSentinelStates queries the database for sentinels in the given region with versions +// greater than afterVersion, returning up to 100 results. Rows with unhandled desired_state +// values are skipped rather than failing the entire batch. func (s *Service) fetchSentinelStates(ctx context.Context, region string, afterVersion uint64) ([]*ctrlv1.SentinelState, error) { rows, err := db.Query.ListSentinelsByRegion(ctx, s.db.RO(), db.ListSentinelsByRegionParams{ Region: region, @@ -86,6 +96,9 @@ func (s *Service) fetchSentinelStates(ctx context.Context, region string, afterV return states, nil } +// sentinelRowToState converts a database sentinel row to a proto SentinelState message. +// Returns a DeleteSentinel for archived or standby states and an ApplySentinel for running +// state. Returns nil for unhandled states, which the caller should skip. func (s *Service) sentinelRowToState(sentinel db.Sentinel) *ctrlv1.SentinelState { switch sentinel.DesiredState { case db.SentinelsDesiredStateArchived, db.SentinelsDesiredStateStandby: diff --git a/svc/ctrl/services/cluster/service.go b/svc/ctrl/services/cluster/service.go index 0261b14132..917e475248 100644 --- a/svc/ctrl/services/cluster/service.go +++ b/svc/ctrl/services/cluster/service.go @@ -6,7 +6,10 @@ import ( "github.com/unkeyed/unkey/pkg/otel/logging" ) -// Service implements the ClusterService Connect interface for state synchronization. +// Service implements [ctrlv1connect.ClusterServiceHandler] to synchronize desired state +// between the control plane and krane agents. It provides streaming RPCs for watching +// deployment and sentinel changes, point queries for fetching individual resource states, +// and status reporting endpoints for agents to report observed state back to the control plane. type Service struct { ctrlv1connect.UnimplementedClusterServiceHandler db db.Database @@ -14,14 +17,20 @@ type Service struct { bearer string } -// Config holds the configuration for creating a new cluster Service. +// Config holds the configuration for creating a new cluster [Service]. type Config struct { + // Database provides read and write access for querying and updating resource state. Database db.Database - Logger logging.Logger - Bearer string + + // Logger is used for structured logging throughout the service. + Logger logging.Logger + + // Bearer is the authentication token that agents must provide in the Authorization header. + Bearer string } -// New creates a new cluster Service with the given configuration. +// New creates a new cluster [Service] with the given configuration. The returned service +// is ready to be registered with a Connect server. func New(cfg Config) *Service { return &Service{ UnimplementedClusterServiceHandler: ctrlv1connect.UnimplementedClusterServiceHandler{}, diff --git a/svc/ctrl/services/deployment/create_deployment.go b/svc/ctrl/services/deployment/create_deployment.go index 2f0568563f..3273b0f6fe 100644 --- a/svc/ctrl/services/deployment/create_deployment.go +++ b/svc/ctrl/services/deployment/create_deployment.go @@ -18,11 +18,26 @@ import ( ) const ( - maxCommitMessageLength = 10240 + // maxCommitMessageLength limits commit messages to prevent oversized database entries. + maxCommitMessageLength = 10240 + // maxCommitAuthorHandleLength limits author handles (e.g., GitHub usernames). maxCommitAuthorHandleLength = 256 + // maxCommitAuthorAvatarLength limits avatar URL length. maxCommitAuthorAvatarLength = 512 ) +// CreateDeployment creates a new deployment record and initiates an async Restate +// workflow. The deployment source must be either a build context (S3 path to a +// tar.gz archive with an optional Dockerfile path) or a prebuilt Docker image. +// +// The method looks up the project to infer the workspace, validates the +// environment exists, fetches environment variables, and persists the deployment +// with status "pending" before triggering the workflow. Git commit metadata is +// optional but validated when provided: timestamps must be Unix epoch milliseconds +// and cannot be more than one hour in the future. +// +// The workflow runs asynchronously keyed by project ID, so only one deployment +// per project executes at a time. Returns the deployment ID and initial status. func (s *Service) CreateDeployment( ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest], @@ -259,6 +274,8 @@ func (s *Service) CreateDeployment( return res, nil } +// trimLength truncates s to the specified number of characters. Note this +// operates on bytes, not runes, so multi-byte UTF-8 characters may be split. func trimLength(s string, characters int) string { if len(s) > characters { return s[:characters] diff --git a/svc/ctrl/services/deployment/create_s3_upload_url.go b/svc/ctrl/services/deployment/create_s3_upload_url.go index a62c462d5c..1e34050881 100644 --- a/svc/ctrl/services/deployment/create_s3_upload_url.go +++ b/svc/ctrl/services/deployment/create_s3_upload_url.go @@ -10,6 +10,12 @@ import ( "github.com/unkeyed/unkey/pkg/uid" ) +// CreateS3UploadURL generates a presigned S3 URL for uploading a build context +// archive. The URL is valid for 15 minutes. The build context path is generated +// using the project ID and a unique build ID, formatted as +// "{project_id}/{build_id}.tar.gz". Clients should upload a tar.gz archive +// containing the application source code to this URL, then pass the returned +// BuildContextPath to [CreateDeployment]. func (s *Service) CreateS3UploadURL( ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest], diff --git a/svc/ctrl/services/deployment/doc.go b/svc/ctrl/services/deployment/doc.go index dc9b3e4468..0322860396 100644 --- a/svc/ctrl/services/deployment/doc.go +++ b/svc/ctrl/services/deployment/doc.go @@ -1,75 +1,52 @@ -// Package deployment provides complete deployment lifecycle orchestration. +// Package deployment provides the control-plane deployment service for managing +// application deployments, promotions, and rollbacks. // -// This package implements the core deployment functionality of the -// unkey platform, managing the entire deployment process from -// container creation through scaling, routing, and promotion. -// It coordinates with Krane agents for container orchestration -// and integrates with build services for container image creation. +// This package implements a ConnectRPC service that orchestrates deployment +// workflows through Restate for durable execution. It acts as the API layer +// between clients (CLI, dashboard) and the underlying Hydra deployment workflows. // -// # Architecture +// # Concurrency Model // -// The deployment service provides comprehensive workflow orchestration: -// - Deploy new container images and configure routing -// - Scale deployments by adjusting replica counts -// - Promote successful deployments to production traffic -// - Rollback failed deployments to previous working versions -// - Manage domain assignments and sticky behavior -// - Coordinate with sentinel configurations for edge routing +// All operations use Restate virtual objects keyed by project ID, ensuring only +// one deployment operation runs per project at a time. This prevents race +// conditions when multiple deployments, promotions, or rollbacks are triggered +// simultaneously for the same project. // -// # Built on Restate +// # Deployment Sources // -// All deployment workflows use Restate for durable execution: -// - Automatic retries on transient failures -// - Exactly-once guarantees for each workflow step -// - Durable state that survives process crashes and restarts -// - Virtual object concurrency control keyed by project ID +// [CreateDeployment] supports two deployment sources: // -// # Key Workflow Types +// - Build from source: provide a build context path (S3 key to a tar.gz archive) +// and optionally a Dockerfile path (defaults to "./Dockerfile") +// - Prebuilt image: provide a Docker image reference directly // -// [Workflow.Deploy]: Deploy new applications from container images -// [Workflow.Rollback]: Revert to previous deployment version -// [Workflow.Promote]: Mark deployment as production-ready +// # Workflow Lifecycle // -// # Deployment Sources +// Deployments follow this lifecycle: // -// The service supports multiple deployment sources: -// - Source from build: Create from existing container image -// - Source from git: Build from repository with Dockerfile +// 1. [CreateDeployment] validates the request, stores metadata in the database +// with status "pending", and triggers an async Restate workflow +// 2. The Hydra workflow (separate service) builds the image, deploys containers, +// and configures networking +// 3. [GetDeployment] retrieves current deployment status and metadata +// 4. [Promote] switches traffic to the target deployment +// 5. [Rollback] reverts traffic to a previous deployment // -// # Integration Points +// # Error Handling // -// - Build Services: Coordinates with Depot or Docker backends -// - Krane Agents: Container orchestration and deployment -// - Database: Persistent state and metadata management -// - Routing Service: Domain assignment and traffic management -// - Vault Service: Secure storage of secrets and certificates +// All methods return Connect error codes following standard conventions: +// [connect.CodeInvalidArgument] for validation errors, [connect.CodeNotFound] +// for missing resources, and [connect.CodeInternal] for system failures. // // # Usage // -// Creating deployment service: -// -// deploymentSvc := deployment.New(deployment.Config{ -// Database: mainDB, -// Restate: restateClient, -// BuildService: buildService, -// Logger: logger, -// DefaultDomain: "unkey.app", -// }) +// Creating the deployment service: // -// // Deploy new application -// _, err := deploymentSvc.Deploy(ctx, &hydrav1.DeployRequest{ -// DeploymentId: "deploy-123", -// Source: &hydrav1.DeployRequest_Git{ -// Git: &hydrav1.DeployRequest_Git_Source{ -// Repository: "https://github.com/user/repo.git", -// Branch: "main", -// CommitSha: "abc123def456", -// }, -// }, +// svc := deployment.New(deployment.Config{ +// Database: db, +// Restate: restateClient, +// Logger: logger, +// AvailableRegions: []string{"us-east-1", "eu-west-1"}, +// BuildStorage: s3Storage, // }) -// -// # Error Handling -// -// The service uses comprehensive error handling with proper HTTP -// status codes and database transaction management. package deployment diff --git a/svc/ctrl/services/deployment/get_deployment.go b/svc/ctrl/services/deployment/get_deployment.go index c6d81f3d08..5b65a71fcd 100644 --- a/svc/ctrl/services/deployment/get_deployment.go +++ b/svc/ctrl/services/deployment/get_deployment.go @@ -9,6 +9,11 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) +// GetDeployment retrieves a deployment by ID including its current status, +// git metadata, and associated hostnames. Returns [connect.CodeNotFound] if the +// deployment does not exist. Hostnames are fetched separately from frontline +// routes; if that lookup fails, the response still succeeds but with an empty +// hostname list. func (s *Service) GetDeployment( ctx context.Context, req *connect.Request[ctrlv1.GetDeploymentRequest], @@ -88,6 +93,8 @@ func (s *Service) GetDeployment( return res, nil } +// convertDbStatusToProto maps database deployment status to the proto enum. +// Returns DEPLOYMENT_STATUS_UNSPECIFIED for unknown status values. func convertDbStatusToProto(status db.DeploymentsStatus) ctrlv1.DeploymentStatus { switch status { case db.DeploymentsStatusPending: diff --git a/svc/ctrl/services/deployment/promote.go b/svc/ctrl/services/deployment/promote.go index d97f158757..834154e2df 100644 --- a/svc/ctrl/services/deployment/promote.go +++ b/svc/ctrl/services/deployment/promote.go @@ -10,7 +10,11 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) -// Promote reassigns all domains to a deployment and removes the rolled back state via Restate workflow +// Promote reassigns all domains to the target deployment via a Restate workflow. +// This is typically used after a rollback to restore the original deployment, or +// to switch traffic to a new deployment that was previously in a preview state. +// The workflow runs synchronously (blocking until complete) and is keyed by +// project ID to prevent concurrent promotion operations on the same project. func (s *Service) Promote(ctx context.Context, req *connect.Request[ctrlv1.PromoteRequest]) (*connect.Response[ctrlv1.PromoteResponse], error) { s.logger.Info("initiating promotion via Restate", "target", req.Msg.GetTargetDeploymentId(), diff --git a/svc/ctrl/services/deployment/rollback.go b/svc/ctrl/services/deployment/rollback.go index 2322b0fabb..a6d2a5e539 100644 --- a/svc/ctrl/services/deployment/rollback.go +++ b/svc/ctrl/services/deployment/rollback.go @@ -10,8 +10,11 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) -// Rollback performs a rollback to a previous deployment via Restate workflow -// This is the main rollback implementation that the dashboard will call +// Rollback switches traffic from the source deployment to a previous target +// deployment via a Restate workflow. This is typically called from the dashboard +// when a deployment needs to be reverted. The workflow runs synchronously +// (blocking until complete) and is keyed by project ID to prevent concurrent +// rollback operations on the same project. func (s *Service) Rollback(ctx context.Context, req *connect.Request[ctrlv1.RollbackRequest]) (*connect.Response[ctrlv1.RollbackResponse], error) { s.logger.Info("initiating rollback via Restate", "source", req.Msg.GetSourceDeploymentId(), diff --git a/svc/ctrl/services/deployment/service.go b/svc/ctrl/services/deployment/service.go index 77420585c3..9ca353b87d 100644 --- a/svc/ctrl/services/deployment/service.go +++ b/svc/ctrl/services/deployment/service.go @@ -1,9 +1,3 @@ -// Package deployment manages the full deployment lifecycle including creation, -// promotion, and rollback operations. All operations are keyed by project ID -// in Restate to ensure only one operation runs per project at a time. -// -// Supports two deployment sources: build from source (with build context and -// Dockerfile path) or prebuilt Docker images. package deployment import ( @@ -15,6 +9,9 @@ import ( "github.com/unkeyed/unkey/svc/ctrl/pkg/s3" ) +// Service implements the DeploymentService ConnectRPC API. It coordinates +// deployment operations by persisting state to the database and delegating +// workflow execution to Restate. type Service struct { ctrlv1connect.UnimplementedDeploymentServiceHandler db db.Database @@ -30,14 +27,22 @@ func (s *Service) deploymentClient(projectID string) hydrav1.DeploymentServiceIn return hydrav1.NewDeploymentServiceIngressClient(s.restate, projectID) } +// Config holds the configuration for creating a new [Service]. type Config struct { - Database db.Database - Restate *restateingress.Client - Logger logging.Logger + // Database provides read/write access to deployment metadata. + Database db.Database + // Restate is the ingress client for triggering durable workflows. + Restate *restateingress.Client + // Logger is used for structured logging throughout the service. + Logger logging.Logger + // AvailableRegions lists the regions where deployments can be created. AvailableRegions []string - BuildStorage s3.Storage + // BuildStorage provides presigned URL generation for build context uploads. + BuildStorage s3.Storage } +// New creates a new [Service] with the given configuration. All fields in +// [Config] are required. func New(cfg Config) *Service { return &Service{ UnimplementedDeploymentServiceHandler: ctrlv1connect.UnimplementedDeploymentServiceHandler{}, diff --git a/svc/ctrl/worker/BUILD.bazel b/svc/ctrl/worker/BUILD.bazel index 8fc6285186..0c3a170ff0 100644 --- a/svc/ctrl/worker/BUILD.bazel +++ b/svc/ctrl/worker/BUILD.bazel @@ -4,6 +4,7 @@ go_library( name = "worker", srcs = [ "config.go", + "doc.go", "run.go", ], importpath = "github.com/unkeyed/unkey/svc/ctrl/worker", diff --git a/svc/ctrl/worker/certificate/bootstrap_infra_certs.go b/svc/ctrl/worker/certificate/bootstrap_infra_certs.go index 3afa476cf7..f629fe72ce 100644 --- a/svc/ctrl/worker/certificate/bootstrap_infra_certs.go +++ b/svc/ctrl/worker/certificate/bootstrap_infra_certs.go @@ -12,36 +12,45 @@ import ( "github.com/unkeyed/unkey/pkg/uid" ) -// InfraWorkspaceID is the workspace ID used for infrastructure certificates. +// InfraWorkspaceID is the workspace ID for infrastructure certificates. Infrastructure +// certs are owned by this synthetic workspace rather than a customer workspace, allowing +// them to be managed separately and avoiding conflicts with customer domain records. const InfraWorkspaceID = "unkey_internal" -// BootstrapConfig holds configuration for infrastructure certificate bootstrapping. +// BootstrapConfig holds configuration for [Service.BootstrapInfraCerts]. type BootstrapConfig struct { - // DefaultDomain is the base domain for deployments (e.g., "unkey.app", "unkey.fun"). - // Results in a wildcard cert for "*.unkey.app". + // DefaultDomain is the base domain for customer deployments. If set, a wildcard + // certificate for "*.{DefaultDomain}" is provisioned to terminate TLS for all + // customer subdomains. DefaultDomain string - // RegionalApexDomain is the base domain for cross-region frontline communication (e.g., "unkey.cloud"). + // RegionalApexDomain is the base domain for cross-region communication between + // frontline instances. Combined with each entry in Regions to create per-region + // wildcard certificates. RegionalApexDomain string - // Regions is the list of available regions (e.g., ["us-west-2.aws", "eu-central-1.aws"]). - // Combined with RegionalApexDomain to create certs like "*.us-west-2.aws.unkey.cloud". + // Regions lists all deployment regions. For each region, a wildcard certificate + // is created as "*.{region}.{RegionalApexDomain}" to secure inter-region traffic. Regions []string - // Restate is the raw Restate ingress client. + // Restate is the ingress client used to trigger [Service.ProcessChallenge] workflows. + // Infrastructure cert bootstrapping delegates to the standard challenge flow rather + // than implementing separate certificate logic. Restate *restateIngress.Client } -// BootstrapInfraCerts ensures infrastructure wildcard certificates are provisioned. +// BootstrapInfraCerts provisions wildcard certificates for platform infrastructure. // -// This creates custom_domain and acme_challenge records for each infrastructure domain, -// then triggers the existing ProcessChallenge Restate workflow to obtain the certs. +// This method ensures the platform has valid TLS certificates for its own domains +// before serving customer traffic. It creates database records for each infrastructure +// domain and triggers [Service.ProcessChallenge] via Restate to obtain certificates. // -// Handles: -// - Default domain wildcard (e.g., "*.unkey.app") for deployment TLS -// - Per-region wildcards (e.g., "*.us-west-2.aws.unkey.cloud") for cross-region frontline +// The method is idempotent: domains with existing valid certificates are skipped, +// and domains with pending challenges are not re-triggered. This makes it safe to +// call on every service startup without risking duplicate certificate requests. // -// Idempotent - skips domains that already have certs or pending challenges. +// Returns nil without error if DNSProvider is not configured, since infrastructure +// certs require DNS-01 challenges for wildcards. Logs a warning in this case. func (s *Service) BootstrapInfraCerts(ctx context.Context, cfg BootstrapConfig) error { if s.dnsProvider == nil { s.logger.Warn("DNS provider not configured, skipping infrastructure cert bootstrap") diff --git a/svc/ctrl/worker/certificate/doc.go b/svc/ctrl/worker/certificate/doc.go index 728167a190..e8220a518e 100644 --- a/svc/ctrl/worker/certificate/doc.go +++ b/svc/ctrl/worker/certificate/doc.go @@ -1,78 +1,62 @@ -// Package certificate implements ACME certificate challenge workflows for SSL/TLS provisioning. +// Package certificate implements ACME certificate workflows for SSL/TLS provisioning. // -// This package handles the complete lifecycle of certificate provisioning using the ACME -// (Automatic Certificate Management Environment) protocol. It coordinates with certificate -// authorities to validate domain ownership and obtain SSL/TLS certificates. +// This package orchestrates the complete certificate lifecycle using the ACME protocol, +// coordinating with certificate authorities like Let's Encrypt to validate domain +// ownership and obtain certificates. It supports both HTTP-01 challenges for regular +// domains and DNS-01 challenges for wildcard certificates. // -// # Built on Restate +// # Why Restate // -// All workflows in this package are built on top of Restate (restate.dev) for durable -// execution. This provides critical guarantees: +// Certificate issuance involves multiple external dependencies (ACME servers, DNS +// propagation, database writes) that can fail independently. We use Restate for durable +// execution because ACME challenges have strict timing requirements and rate limits. +// If a challenge fails partway through, we cannot simply restart from the beginning +// without risking Let's Encrypt rate limits. Restate's exactly-once execution semantics +// allow the workflow to resume from the last completed step after crashes or network +// failures. // -// - Automatic retries on transient failures -// - Exactly-once execution semantics for each workflow step -// - Durable state that survives process crashes and restarts -// - Virtual object concurrency control keyed by domain name -// -// The virtual object model ensures that only one certificate challenge runs per domain -// at any given time, preventing race conditions and duplicate certificate requests that -// could trigger rate limits from certificate authorities. +// The virtual object model keys workflows by domain name. This prevents race conditions +// where two processes might simultaneously request certificates for the same domain, +// which would trigger ACME duplicate certificate rate limits. // // # Key Types // -// [Service] is the main entry point that implements the ACME certificate workflow. -// It handles the [Service.ProcessChallenge] method which orchestrates the entire -// certificate issuance process. -// -// # Usage -// -// The service is typically initialized with database connections and a vault service -// for secure storage of private keys: -// -// svc := certificate.New(certificate.Config{ -// DB: mainDB, -// Vault: vaultService, -// Logger: logger, -// }) +// [Service] is the main entry point implementing hydrav1.CertificateServiceServer. +// Configure it via [Config] and create instances with [New]. The service exposes two +// primary handlers: [Service.ProcessChallenge] for obtaining certificates and +// [Service.RenewExpiringCertificates] as a self-scheduling renewal cron job. // -// Certificate challenges are processed through the ProcessChallenge RPC: +// [EncryptedCertificate] holds certificate data with the private key encrypted via +// the vault service before database storage. // -// resp, err := svc.ProcessChallenge(ctx, &hydrav1.ProcessChallengeRequest{ -// WorkspaceId: "ws_123", -// Domain: "api.example.com", -// }) -// if err != nil { -// // Handle error -// } -// if resp.Status == "success" { -// // Certificate issued successfully -// } +// [BootstrapConfig] configures infrastructure certificate bootstrapping for wildcard +// certificates needed by the platform itself. // -// # ACME Challenge Flow +// # Challenge Types // -// The certificate challenge process follows these steps: +// The service automatically selects the appropriate ACME challenge type based on the +// domain. Wildcard domains (e.g., "*.example.com") require DNS-01 challenges because +// HTTP-01 cannot prove control over all possible subdomains. Regular domains use +// HTTP-01 which is faster since it avoids DNS propagation delays. // -// 1. Domain validation - Verify the domain exists and belongs to the workspace -// 2. Challenge claiming - Acquire exclusive lock on the domain challenge -// 3. ACME client setup - Get or create an ACME account for the workspace -// 4. Certificate obtain/renew - Request certificate from the CA -// 5. Certificate persistence - Store certificate and encrypted private key -// 6. Challenge completion - Mark the challenge as verified with expiry time +// # Rate Limit Handling // -// Each step is wrapped in a restate.Run call, making it durable and retryable. If the -// workflow crashes at any point, Restate will resume from the last completed step rather -// than restarting from the beginning. This ensures that ACME challenges can complete -// reliably even in the face of system failures, network partitions, or process restarts. +// Let's Encrypt enforces rate limits that cannot be bypassed. When rate limited, +// [Service.ProcessChallenge] uses Restate's durable sleep to wait until the retry-after +// time, then automatically retries. This prevents the workflow from consuming retry +// budget while waiting. Sleep duration is capped at 2 hours with a 1 minute buffer +// added to the retry-after time. // -// # Security Considerations +// # Security // -// Private keys are encrypted before storage using the vault service. Certificates -// are stored in the database for fast access by sentinels. ACME account -// credentials are workspace-scoped to prevent cross-workspace access. +// Private keys are encrypted using the vault service before storage. The encryption +// is workspace-scoped via keyring isolation. Certificates themselves are stored +// unencrypted for fast retrieval by sentinels that terminate TLS. // // # Error Handling // -// The package uses Restate's error handling model. Terminal errors with appropriate -// HTTP status codes are returned for client errors (invalid input, not found, etc.). -// System errors are returned for unexpected failures that may be retried. +// The package distinguishes between retryable errors (network timeouts, temporary +// ACME server issues) and terminal errors (invalid credentials, unauthorized domains). +// Terminal errors use restate.TerminalError to prevent infinite retry loops. +// Rate limit errors are handled specially with durable sleeps rather than retries. package certificate diff --git a/svc/ctrl/worker/certificate/process_challenge_handler.go b/svc/ctrl/worker/certificate/process_challenge_handler.go index fed91c8f93..cf53a07bf8 100644 --- a/svc/ctrl/worker/certificate/process_challenge_handler.go +++ b/svc/ctrl/worker/certificate/process_challenge_handler.go @@ -16,22 +16,45 @@ import ( "github.com/unkeyed/unkey/svc/ctrl/services/acme" ) -// EncryptedCertificate holds a certificate and its encrypted private key. +// EncryptedCertificate holds a certificate with its private key encrypted for storage. +// The private key is encrypted using the vault service with the workspace ID as the +// keyring, ensuring keys can only be decrypted by the owning workspace. type EncryptedCertificate struct { - CertificateID string - Certificate string + // CertificateID is the unique identifier for this certificate, generated using + // uid.New with the certificate prefix. + CertificateID string + + // Certificate contains the PEM-encoded certificate chain including intermediates. + Certificate string + + // EncryptedPrivateKey is the vault-encrypted PEM-encoded private key. EncryptedPrivateKey string - ExpiresAt int64 + + // ExpiresAt is the certificate expiration time as Unix milliseconds. Parsed from + // the certificate's NotAfter field; defaults to 90 days from issuance if parsing + // fails. + ExpiresAt int64 } -// ProcessChallenge handles the complete ACME certificate challenge flow. +// ProcessChallenge obtains or renews an SSL/TLS certificate for a domain. +// +// This is a Restate virtual object handler keyed by domain name, ensuring only one +// certificate challenge runs per domain at any time. The workflow consists of durable +// steps that survive process restarts: domain resolution, challenge claiming, certificate +// obtainment, persistence, and verification marking. +// +// The method uses the saga pattern for error handling. If any step fails after claiming +// the challenge, a deferred compensation function marks the challenge as failed in the +// database. This prevents the challenge from being stuck in "pending" state indefinitely. // -// This method implements a multi-step durable workflow using Restate to obtain or renew -// an SSL/TLS certificate for a domain. Each step is wrapped in restate.Run for durability, -// allowing the workflow to resume from the last completed step if interrupted. +// Rate limit handling is special: when Let's Encrypt returns a rate limit error with a +// retry-after time, the workflow performs a Restate durable sleep until that time plus +// a 1-minute buffer (capped at 2 hours), then retries. This uses at most 3 rate limit +// retries before failing. For transient errors, Restate's standard retry with exponential +// backoff applies (30s initial, 2x factor, 5m max, 5 attempts). // -// Uses the saga pattern: if any step fails after claiming the challenge, the deferred -// compensation marks the challenge as failed. +// Returns a response with Status "success" and the certificate ID on success, or Status +// "failed" with empty certificate ID on failure. System errors return (nil, error). func (s *Service) ProcessChallenge( ctx restate.ObjectContext, req *hydrav1.ProcessChallengeRequest, @@ -171,10 +194,14 @@ func (s *Service) ProcessChallenge( }, nil } -// globalAcmeUserID is the fixed user ID for the single global ACME account +// globalAcmeUserID identifies the single shared ACME account used for all certificate +// requests. Using one account avoids per-workspace ACME account registration and +// simplifies key management, while staying well under Let's Encrypt's account limits. const globalAcmeUserID = "acme" -// isWildcard returns true if the domain starts with "*." +// isWildcard reports whether domain is a wildcard domain pattern. Wildcard domains +// start with "*." and require DNS-01 challenges since HTTP-01 cannot validate control +// over arbitrary subdomains. func isWildcard(domain string) bool { return len(domain) > 2 && domain[0] == '*' && domain[1] == '.' } diff --git a/svc/ctrl/worker/certificate/renew_handler.go b/svc/ctrl/worker/certificate/renew_handler.go index e96fdf5006..3933465b6a 100644 --- a/svc/ctrl/worker/certificate/renew_handler.go +++ b/svc/ctrl/worker/certificate/renew_handler.go @@ -9,19 +9,32 @@ import ( ) const ( - // renewalInterval is how often the certificate renewal check runs + // renewalInterval determines how frequently the certificate renewal cron runs. + // Set to 24 hours because Let's Encrypt certificates are valid for 90 days and + // we trigger renewal 30 days before expiry, giving ample time for retries. renewalInterval = 24 * time.Hour - // renewalKey is the virtual object key for the singleton renewal job + // renewalKey is the Restate virtual object key for the singleton renewal job. + // Using a fixed key ensures only one renewal job runs globally, preventing + // duplicate work across service instances. renewalKey = "global" ) -// RenewExpiringCertificates scans for certificates expiring soon and triggers renewal. -// This is a self-scheduling Restate cron job - after completing, it schedules itself -// to run again after renewalInterval (24 hours). +// RenewExpiringCertificates is a self-scheduling Restate cron that renews certificates +// before they expire. // -// To start the cron job, call this handler once with key "global". It will then -// automatically reschedule itself forever. +// This handler queries for certificates expiring within 30 days (based on the +// acme_challenges table) and triggers [Service.ProcessChallenge] for each one via +// fire-and-forget Restate calls. The ProcessChallenge handler handles actual renewal. +// +// The cron pattern works by scheduling itself to run again after [renewalInterval] +// at the end of each execution. To bootstrap the cron, call this handler once with +// key "global" - it will then reschedule itself indefinitely. The idempotency key +// includes the next run date to prevent duplicate schedules if the handler is called +// multiple times on the same day. +// +// A 100ms delay is inserted between renewal triggers to avoid overwhelming the system +// when many certificates need renewal simultaneously. func (s *Service) RenewExpiringCertificates( ctx restate.ObjectContext, req *hydrav1.RenewExpiringCertificatesRequest, diff --git a/svc/ctrl/worker/certificate/service.go b/svc/ctrl/worker/certificate/service.go index 0d89840f78..1c0813934e 100644 --- a/svc/ctrl/worker/certificate/service.go +++ b/svc/ctrl/worker/certificate/service.go @@ -8,15 +8,21 @@ import ( "github.com/unkeyed/unkey/pkg/vault" ) -// Service handles ACME certificate challenge workflows. +// Service orchestrates ACME certificate issuance and renewal. // -// This service orchestrates the complete certificate issuance process including -// domain validation, challenge claiming, ACME protocol communication, and certificate -// storage. It implements the hydrav1.CertificateServiceServer interface. +// Service implements hydrav1.CertificateServiceServer with two main handlers: +// [Service.ProcessChallenge] for obtaining/renewing individual certificates, and +// [Service.RenewExpiringCertificates] as a self-scheduling cron job. It also provides +// [Service.BootstrapInfraCerts] for provisioning infrastructure wildcard certificates +// at startup. // -// The service uses Restate virtual objects keyed by domain name to ensure that only -// one certificate challenge runs per domain at any time, preventing duplicate requests -// and rate limit violations. +// The service uses a single global ACME account (not per-workspace) to simplify +// key management and avoid hitting Let's Encrypt's account rate limits. Challenge +// type selection is automatic: wildcard domains use DNS-01, regular domains use +// HTTP-01 for faster issuance. +// +// Not safe for concurrent use on the same domain. Concurrency control is handled +// by Restate's virtual object model which keys handlers by domain name. type Service struct { hydrav1.UnimplementedCertificateServiceServer db db.Database @@ -24,37 +30,45 @@ type Service struct { logger logging.Logger emailDomain string defaultDomain string - dnsProvider challenge.Provider // For DNS-01 challenges (wildcard certs) - httpProvider challenge.Provider // For HTTP-01 challenges (regular certs) + dnsProvider challenge.Provider + httpProvider challenge.Provider } var _ hydrav1.CertificateServiceServer = (*Service)(nil) -// Config holds the configuration for creating a certificate service. +// Config holds configuration for creating a [Service] instance. type Config struct { - // DB is the main database connection for workspace and domain data. + // DB provides database access for domain, certificate, and ACME challenge records. DB db.Database - // Vault provides encryption services for private key storage. + // Vault encrypts private keys before database storage. Keys are encrypted using + // the workspace ID as the keyring identifier. Vault *vault.Service - // Logger for structured logging. + // Logger receives structured log output from certificate operations. Logger logging.Logger - // EmailDomain is the domain used for ACME account emails (workspace_id@domain) + // EmailDomain forms the email address for ACME account registration. The service + // constructs emails as "acme@{EmailDomain}" for the global ACME account. EmailDomain string - // DefaultDomain is the base domain for wildcard certificates + // DefaultDomain is the base domain for infrastructure wildcard certificates, + // used by [Service.BootstrapInfraCerts] to provision platform TLS. DefaultDomain string - // DNSProvider is the challenge provider for DNS-01 challenges (wildcard certs) + // DNSProvider handles DNS-01 challenges required for wildcard certificates. + // Must be set to issue wildcard certs; ignored for regular domain certificates. DNSProvider challenge.Provider - // HTTPProvider is the challenge provider for HTTP-01 challenges (regular certs) + // HTTPProvider handles HTTP-01 challenges for regular (non-wildcard) certificates. + // Must be set to issue regular certs; ignored for wildcard certificates. HTTPProvider challenge.Provider } -// New creates a new certificate service instance. +// New creates a [Service] with the given configuration. The returned service is +// ready to handle certificate requests but does not start any background processes. +// Call [Service.BootstrapInfraCerts] at startup to provision infrastructure certs, +// and trigger [Service.RenewExpiringCertificates] once to start the renewal cron. func New(cfg Config) *Service { return &Service{ UnimplementedCertificateServiceServer: hydrav1.UnimplementedCertificateServiceServer{}, diff --git a/svc/ctrl/worker/deploy/deploy_handler.go b/svc/ctrl/worker/deploy/deploy_handler.go index e2c4c050ce..151ed74702 100644 --- a/svc/ctrl/worker/deploy/deploy_handler.go +++ b/svc/ctrl/worker/deploy/deploy_handler.go @@ -15,8 +15,11 @@ import ( ) const ( + // sentinelNamespace is the Kubernetes namespace where sentinel containers are deployed. sentinelNamespace = "sentinel" - sentinelPort = 8040 + + // sentinelPort is the port that sentinel containers listen on for traffic routing. + sentinelPort = 8040 ) // Deploy executes a full deployment workflow for a new application version. diff --git a/svc/ctrl/worker/deploy/domains.go b/svc/ctrl/worker/deploy/domains.go index a8c79bf6be..ca73a1d6a4 100644 --- a/svc/ctrl/worker/deploy/domains.go +++ b/svc/ctrl/worker/deploy/domains.go @@ -10,8 +10,15 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) +// newDomain represents a domain to be created for a deployment, including its +// stickiness behavior for routing updates. type newDomain struct { + // domain is the fully qualified domain name (e.g., "myapp-production-acme.unkey.app"). domain string + + // sticky determines how this domain behaves across deployments. Non-sticky domains + // remain pinned to their original deployment, while sticky domains automatically + // update to point to new deployments matching their criteria. sticky db.FrontlineRoutesSticky } @@ -80,8 +87,11 @@ func buildDomains(workspaceSlug, projectSlug, environmentSlug, gitSha, branchNam } var ( + // nonAlphanumericRegex matches any character that is not a letter, digit, or whitespace. nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z0-9\s]`) - multipleSpacesRegex = regexp.MustCompile(`\s+`) + + // multipleSpacesRegex matches one or more consecutive whitespace characters. + multipleSpacesRegex = regexp.MustCompile(`\s+`) ) // sluggify converts a string into a URL-safe slug. diff --git a/svc/ctrl/worker/deploy/service.go b/svc/ctrl/worker/deploy/service.go index 3a7672a279..977bdffa64 100644 --- a/svc/ctrl/worker/deploy/service.go +++ b/svc/ctrl/worker/deploy/service.go @@ -52,6 +52,7 @@ type Config struct { // AvailableRegions is the list of available regions for deployments. AvailableRegions []string + // BuildStorage provides access to S3-compatible storage for build context archives. BuildStorage s3.Storage } diff --git a/svc/ctrl/worker/doc.go b/svc/ctrl/worker/doc.go new file mode 100644 index 0000000000..1271eed9f7 --- /dev/null +++ b/svc/ctrl/worker/doc.go @@ -0,0 +1,67 @@ +// Package worker implements the Restate workflow worker for Unkey's control plane. +// +// The worker is the execution engine for long-running operations in Unkey's infrastructure, +// handling container builds, deployments, certificate management, and routing configuration +// through the Restate distributed workflow engine. It provides durable execution guarantees +// for operations that span multiple services and may take minutes to complete. +// +// # Architecture +// +// The worker acts as a Restate service host, binding multiple workflow services that handle +// distinct operational concerns. Each service is implemented as a separate sub-package: +// +// - [deploy] handles container deployments across multiple regions +// - [certificate] manages TLS certificates via ACME (Let's Encrypt) +// - [routing] configures traffic routing for custom domains +// - [versioning] manages application version lifecycle +// +// The worker maintains connections to several infrastructure components: the primary database +// for persistent state, two separate vault services (one for application secrets, one for +// ACME certificates), S3-compatible storage for build artifacts, and ClickHouse for analytics. +// +// # Configuration +// +// Configuration is provided through the [Config] struct, which validates all settings on startup. +// The worker supports two build backends ([BuildBackendDepot] for cloud builds and +// [BuildBackendDocker] for local builds), each with different requirements validated by +// [Config.Validate]. +// +// # Usage +// +// The worker is started with [Run], which blocks until the context is cancelled or a fatal +// error occurs: +// +// cfg := worker.Config{ +// InstanceID: "worker-1", +// HttpPort: 7092, +// DatabasePrimary: "mysql://...", +// BuildBackend: worker.BuildBackendDepot, +// // ... additional configuration +// } +// +// if err := worker.Run(ctx, cfg); err != nil { +// log.Fatal(err) +// } +// +// # Startup Sequence +// +// [Run] performs initialization in a specific order: configuration validation, vault services +// creation, database connection, build storage initialization, ACME provider setup, Restate +// server binding, admin registration with retry, wildcard certificate bootstrapping, health +// endpoint startup, and optional Prometheus metrics exposure. +// +// # Graceful Shutdown +// +// When the context passed to [Run] is cancelled, the worker performs graceful shutdown by +// stopping the health server, closing database connections, and allowing in-flight Restate +// workflows to complete. The shutdown sequence is managed through a shutdown handler that +// reverses the startup order. +// +// # ACME Certificate Management +// +// When ACME is enabled in configuration, the worker automatically manages TLS certificates +// using Let's Encrypt. It supports HTTP-01 challenges for regular domains and DNS-01 +// challenges (via Route53) for wildcard certificates. On startup with a configured default +// domain, [Run] calls [bootstrapWildcardDomain] to ensure the platform's wildcard certificate +// can be automatically renewed. +package worker diff --git a/svc/ctrl/worker/routing/assign_domains_handler.go b/svc/ctrl/worker/routing/assign_domains_handler.go index c826d9ffa7..0cd2ccb55c 100644 --- a/svc/ctrl/worker/routing/assign_domains_handler.go +++ b/svc/ctrl/worker/routing/assign_domains_handler.go @@ -10,6 +10,15 @@ import ( "github.com/unkeyed/unkey/pkg/db" ) +// AssignFrontlineRoutes reassigns a set of frontline routes to a new deployment. +// +// Each route in FrontlineRouteIds is updated to point at DeploymentId. The updates +// are executed sequentially, with each wrapped in [restate.Run] for durability. If +// any update fails, the operation returns an error and Restate will retry the +// entire handler from the last successful checkpoint. +// +// Returns an empty response on success. Database errors from the route updates +// propagate directly to the caller. func (s *Service) AssignFrontlineRoutes(ctx restate.ObjectContext, req *hydrav1.AssignFrontlineRoutesRequest) (*hydrav1.AssignFrontlineRoutesResponse, error) { s.logger.Info("assigning domains", "deployment_id", req.GetDeploymentId(), diff --git a/svc/ctrl/worker/routing/doc.go b/svc/ctrl/worker/routing/doc.go index 5bff141888..be709e7743 100644 --- a/svc/ctrl/worker/routing/doc.go +++ b/svc/ctrl/worker/routing/doc.go @@ -1,49 +1,32 @@ -// Package routing manages domain assignment and traffic routing workflows. +// Package routing manages frontline route assignment for deployments. // -// This package orchestrates the relationship between domains, -// deployments, and sentinel configurations. It handles creating -// new domain assignments during deployments and switching -// domains during rollback or promotion operations. +// This package provides a Restate-based service for reassigning frontline routes +// (the edge routing layer) to point at different deployments. When a deployment +// is promoted or traffic needs to shift, this service updates the database records +// that control which deployment receives traffic for each route. // // # Architecture // -// The routing service manages domain lifecycle and ensures -// traffic is routed to the correct deployments. It provides: -// - Domain assignment during deployment creation -// - Sticky domain behavior for different deployment types -// - Atomic domain switching during operations -// - Integration with sentinel configurations +// The routing service implements [hydrav1.RoutingServiceServer] and runs as a +// Restate virtual object. Virtual objects provide serialized access per key, +// preventing race conditions when multiple operations target the same routes. // -// # Built on Restate +// # Restate Integration // -// All routing workflows use Restate for durable execution: -// - Automatic retries on transient failures -// - Exactly-once guarantees for each workflow step -// - Durable state that survives process crashes -// - Virtual object concurrency control keyed by project ID +// All route reassignment operations use Restate's durable execution model. +// Each route update is wrapped in [restate.Run], which provides automatic retries +// on transient failures and exactly-once execution guarantees. If the service +// crashes mid-operation, Restate replays completed steps and resumes from where +// it left off. // // # Key Operations // -// [AssignDomains]: Create domain assignments for new deployments -// [SwitchDomains]: Reassign domains during rollback/promote operations -// -// # Domain Behavior Types -// -// - UNSPECIFIED: Per-commit domains (immutable, never reassigned) -// - BRANCH: Per-branch domains (follows latest deployment for branch) -// - ENVIRONMENT: Per-environment domains (follows latest deployment for environment) -// - LIVE: Per-live domains (follows current production deployment) -// -// # Sentinel Configuration -// -// Sentinel configs are automatically created for all domains -// (except local development hostnames) and stored as JSON -// in the database. Each config includes deployment ID, -// VM addresses for load balancing, and optional auth/validation configs. +// [Service.AssignFrontlineRoutes] reassigns a set of frontline routes to a new +// deployment by updating the deployment_id column in the frontline_routes table. // // # Usage // -// Creating routing service: +// Create a routing service: // // routingSvc := routing.New(routing.Config{ // DB: mainDB, @@ -51,20 +34,6 @@ // DefaultDomain: "unkey.app", // }) // -// Assign domains during deployment: -// -// resp, err := routingSvc.AssignDomains(ctx, &hydrav1.AssignDomainsRequest{ -// WorkspaceId: "ws_123", -// ProjectId: "proj_456", -// DeploymentId: "dep_abc", -// Domains: []*hydrav1.DomainToAssign{ -// {Name: "api.example.com", Sticky: hydrav1.DomainSticky_ENVIRONMENT}, -// }, -// IsRolledBack: false, -// }) -// -// # Error Handling -// -// The service ensures atomic operations and provides detailed error -// reporting for routing failures and domain conflicts. +// Register with Restate and invoke via the generated client to reassign routes +// during deployment promotion or rollback operations. package routing diff --git a/svc/ctrl/worker/routing/service.go b/svc/ctrl/worker/routing/service.go index c611895f0b..3eaf269b44 100644 --- a/svc/ctrl/worker/routing/service.go +++ b/svc/ctrl/worker/routing/service.go @@ -6,15 +6,11 @@ import ( "github.com/unkeyed/unkey/pkg/otel/logging" ) -// Service handles routing operations - domain assignment and sentinel configuration. +// Service implements the routing service for frontline route management. // -// This service manages the relationship between domains, deployments, and sentinel -// configurations. It handles creating new domain assignments during deployments and -// switching existing domains between deployments during rollback/promote operations. -// -// The service uses Restate virtual objects keyed by project ID to ensure that domain -// operations are serialized, preventing race conditions that could create inconsistent -// routing state. +// Service embeds [hydrav1.UnimplementedRoutingServiceServer] to satisfy the gRPC +// interface. It uses Restate virtual objects to serialize route reassignment +// operations, preventing concurrent modifications to the same routes. type Service struct { hydrav1.UnimplementedRoutingServiceServer db db.Database @@ -24,19 +20,14 @@ type Service struct { var _ hydrav1.RoutingServiceServer = (*Service)(nil) -// Config holds the configuration for creating a routing service. +// Config holds the configuration for creating a [Service]. type Config struct { - // Logger for structured logging. - Logger logging.Logger - - // DB is the main database connection for domain data. - DB db.Database - - // DefaultDomain is the apex domain used to identify production domains (e.g., "unkey.app"). + Logger logging.Logger + DB db.Database DefaultDomain string } -// New creates a new routing service instance. +// New creates a new [Service] with the provided configuration. func New(cfg Config) *Service { return &Service{ UnimplementedRoutingServiceServer: hydrav1.UnimplementedRoutingServiceServer{}, diff --git a/svc/ctrl/worker/versioning/doc.go b/svc/ctrl/worker/versioning/doc.go index df5a40d588..c95d12a461 100644 --- a/svc/ctrl/worker/versioning/doc.go +++ b/svc/ctrl/worker/versioning/doc.go @@ -1,25 +1,38 @@ // Package versioning provides per-region version counters for state synchronization. // -// The VersioningService is a Restate virtual object that generates monotonically -// increasing version numbers. These versions are used to track state changes in -// deployments and sentinels tables, enabling efficient incremental synchronization -// between the control plane and edge agents (krane). +// The [Service] is a Restate virtual object that generates monotonically increasing +// version numbers. These versions are used to track state changes in deployments and +// sentinels tables, enabling efficient incremental synchronization between the +// control plane and edge agents (krane). +// +// # Why Per-Region Versioning +// +// This service uses the region name as the virtual object key, creating one version +// counter per region. This design allows version requests for different regions to +// be processed in parallel while maintaining strict ordering within each region. +// A global counter would serialize all writes across all regions, creating a +// bottleneck. The per-region approach matches the data partitioning in the +// deployments and sentinels tables. // // # Usage // // Before mutating a deployment or sentinel, pass the region as the virtual object key: // // client := hydrav1.NewVersioningServiceClient(ctx, region) -// resp, err := client.NextVersion(ctx, &hydrav1.NextVersionRequest{}) -// // Use resp.Version when updating the resource row +// resp, err := client.NextVersion().Request(&hydrav1.NextVersionRequest{}) +// if err != nil { +// // Restate errors indicate infrastructure problems; fail the operation +// return err +// } +// // Use resp.Version when inserting/updating the resource row // -// Edge agents track their last-seen version and request changes after it: +// Edge agents track their last-seen version and request changes since then: // // SELECT * FROM deployments WHERE region = ? AND version > ? ORDER BY version // -// # Per-Region Pattern +// # Stale Cursor Detection // -// This service uses the region name as the virtual object key, creating one -// version counter per region. This allows version requests for different regions -// to be processed in parallel while maintaining ordering within each region. +// If a client's cursor version is older than the minimum retained version in the +// database (due to compaction or cleanup), it must perform a full bootstrap. Use +// [Service.GetVersion] to check the current version without incrementing. package versioning diff --git a/svc/ctrl/worker/versioning/next_version_handler.go b/svc/ctrl/worker/versioning/next_version_handler.go index bdbf1d341c..0def2f3ce5 100644 --- a/svc/ctrl/worker/versioning/next_version_handler.go +++ b/svc/ctrl/worker/versioning/next_version_handler.go @@ -5,12 +5,21 @@ import ( hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" ) +// versionStateKey is the Restate state key used to store the version counter. +// Each virtual object instance (keyed by region) has its own isolated state, +// so this key is scoped to the region. const versionStateKey = "version" // NextVersion atomically increments and returns the next version number. // -// The version is durably stored in Restate's virtual object state per region, -// guaranteeing monotonically increasing values within each region. +// The version is durably stored in Restate's virtual object state, guaranteeing +// monotonically increasing values within each region. Version numbers start at 1 +// (the first call to a new region returns 1, not 0). Restate guarantees exactly-once +// execution, so retried invocations return the same version that was originally +// assigned. +// +// Returns an error only if Restate state operations fail, which indicates an +// infrastructure problem. On success, the returned version is always positive. func (s *Service) NextVersion(ctx restate.ObjectContext, _ *hydrav1.NextVersionRequest) (*hydrav1.NextVersionResponse, error) { current, err := restate.Get[uint64](ctx, versionStateKey) if err != nil { @@ -27,8 +36,12 @@ func (s *Service) NextVersion(ctx restate.ObjectContext, _ *hydrav1.NextVersionR // GetVersion returns the current version without incrementing. // -// Useful for stale cursor detection: if a client's version is older than the -// minimum retained version in the database, they must perform a full bootstrap. +// This is useful for stale cursor detection: if a client's cursor version is +// older than the minimum retained version in the database (due to row deletion +// or compaction), the client must perform a full bootstrap instead of +// incremental sync. +// +// Returns 0 if no versions have been generated yet for this region. func (s *Service) GetVersion(ctx restate.ObjectContext, _ *hydrav1.GetVersionRequest) (*hydrav1.GetVersionResponse, error) { current, err := restate.Get[uint64](ctx, versionStateKey) if err != nil { diff --git a/svc/ctrl/worker/versioning/service.go b/svc/ctrl/worker/versioning/service.go index bb41fd8a5a..c42e30d5bf 100644 --- a/svc/ctrl/worker/versioning/service.go +++ b/svc/ctrl/worker/versioning/service.go @@ -7,14 +7,17 @@ import ( // Service provides per-region, monotonically increasing versions for state sync. // // This is a Restate virtual object that maintains a durable counter per region. -// Each call to NextVersion atomically increments and returns the next version -// number for that region, with exactly-once semantics guaranteed by Restate. +// Each call to [Service.NextVersion] atomically increments and returns the next +// version number for that region, with exactly-once semantics guaranteed by Restate. +// The service is stateless; all state is stored in Restate's virtual object storage. type Service struct { hydrav1.UnimplementedVersioningServiceServer } var _ hydrav1.VersioningServiceServer = (*Service)(nil) +// New creates a new versioning service instance. The returned service should be +// registered with a Restate router using [hydrav1.NewVersioningServiceServer]. func New() *Service { return &Service{ UnimplementedVersioningServiceServer: hydrav1.UnimplementedVersioningServiceServer{}, diff --git a/svc/krane/internal/deployment/actual_state_report.go b/svc/krane/internal/deployment/actual_state_report.go index 83ff2e07e0..10f0b51850 100644 --- a/svc/krane/internal/deployment/actual_state_report.go +++ b/svc/krane/internal/deployment/actual_state_report.go @@ -14,9 +14,14 @@ import ( // and reports actual state changes back to the control plane in real-time. // // The watch filters for resources with the "managed-by: krane" and "component: deployment" -// labels, ignoring resources created by other controllers. When a ReplicaSet is added, -// modified, or deleted, the method queries pod status and reports the actual state to -// the control plane so routing tables stay synchronized with what's running in the cluster. +// labels, ignoring resources created by other controllers. When a ReplicaSet is added +// or modified, the method calls [Controller.buildDeploymentStatus] to query pod state +// and reports via [Controller.reportDeploymentStatus]. Deletions are reported directly +// so the control plane can remove the deployment from its routing tables. +// +// The method returns an error if the initial watch setup fails. Once started, watch +// errors are logged but the goroutine continues processing events. The watch runs +// until the context is cancelled. func (c *Controller) runActualStateReportLoop(ctx context.Context) error { w, err := c.clientSet.AppsV1().ReplicaSets("").Watch(ctx, metav1.ListOptions{ LabelSelector: labels.New(). diff --git a/svc/krane/internal/deployment/apply.go b/svc/krane/internal/deployment/apply.go index a31b7ee4d5..4e7022d48a 100644 --- a/svc/krane/internal/deployment/apply.go +++ b/svc/krane/internal/deployment/apply.go @@ -19,14 +19,19 @@ import ( // ApplyDeployment creates or updates a user workload as a Kubernetes ReplicaSet. // -// The deployment represents a specific build of user code. ApplyDeployment uses -// server-side apply to create or update the ReplicaSet, which allows concurrent -// modifications from different sources without conflicts. After applying, it -// queries the resulting pods and reports their addresses and status back to the -// control plane so the routing layer knows where to send traffic. +// The method uses server-side apply to create or update the ReplicaSet, enabling +// concurrent modifications from different sources without conflicts. After applying, +// it queries the resulting pods and reports their addresses and status to the control +// plane so the routing layer knows where to send traffic. // -// The namespace is created automatically if it doesn't exist. Pods run with gVisor -// isolation (RuntimeClass "gvisor") for security since they execute untrusted user code. +// ApplyDeployment validates all required fields and returns an error if any are missing +// or invalid: WorkspaceId, ProjectId, EnvironmentId, DeploymentId, K8sNamespace, K8sName, +// and Image must be non-empty; Replicas must be >= 0; CpuMillicores and MemoryMib must be > 0. +// +// The namespace is created automatically if it doesn't exist, along with a +// CiliumNetworkPolicy restricting ingress to matching sentinels. Pods run with gVisor +// isolation (RuntimeClass "gvisor") since they execute untrusted user code, and are +// scheduled on Karpenter-managed untrusted nodes with zone-spread constraints. func (c *Controller) ApplyDeployment(ctx context.Context, req *ctrlv1.ApplyDeployment) error { c.logger.Info("applying deployment", "namespace", req.GetK8SNamespace(), @@ -136,6 +141,9 @@ func (c *Controller) ApplyDeployment(ctx context.Context, req *ctrlv1.ApplyDeplo return nil } +// buildDeploymentEnv constructs the environment variables injected into deployment +// containers. It includes the PORT, workspace/project/environment/deployment IDs, +// and optionally the base64-encoded encrypted environment variables if present. func buildDeploymentEnv(req *ctrlv1.ApplyDeployment) []corev1.EnvVar { env := []corev1.EnvVar{ {Name: "PORT", Value: strconv.Itoa(DeploymentPort)}, diff --git a/svc/krane/internal/deployment/consts.go b/svc/krane/internal/deployment/consts.go index c0c8f57210..d9908c4c6d 100644 --- a/svc/krane/internal/deployment/consts.go +++ b/svc/krane/internal/deployment/consts.go @@ -3,20 +3,26 @@ package deployment import corev1 "k8s.io/api/core/v1" const ( - // DeploymentPort is the port user deployments listen on. + // DeploymentPort is the port all user deployment containers expose. The routing + // layer and sentinel proxies use this port to forward traffic to user code. DeploymentPort = 8080 - // runtimeClassGvisor specifies the gVisor sandbox for untrusted user workloads. + // runtimeClassGvisor specifies the gVisor sandbox RuntimeClass for running + // untrusted user workloads with kernel-level isolation. runtimeClassGvisor = "gvisor" - // fieldManagerKrane identifies krane as the server-side apply field manager. + // fieldManagerKrane identifies krane as the server-side apply field manager, + // enabling conflict-free concurrent updates from multiple sources. fieldManagerKrane = "krane" - // CustomerNodeClass is the node class for untrusted customer workloads. + // CustomerNodeClass is the Karpenter nodepool name for untrusted customer + // workloads. Nodes in this pool have additional isolation and monitoring. CustomerNodeClass = "untrusted" ) -// untrustedToleration allows pods to be scheduled on untrusted nodes. +// untrustedToleration allows deployment pods to be scheduled on nodes tainted +// for untrusted workloads. Without this toleration, pods would be rejected by +// the Karpenter-managed nodepool's NoSchedule taint. var untrustedToleration = corev1.Toleration{ Key: "karpenter.sh/nodepool", Operator: corev1.TolerationOpEqual, diff --git a/svc/krane/internal/deployment/controller.go b/svc/krane/internal/deployment/controller.go index 8a7b90d0d9..836a67e696 100644 --- a/svc/krane/internal/deployment/controller.go +++ b/svc/krane/internal/deployment/controller.go @@ -13,12 +13,16 @@ import ( "k8s.io/client-go/kubernetes" ) -// Controller manages deployment ReplicaSets in a Kubernetes cluster. +// Controller manages deployment ReplicaSets in a Kubernetes cluster by maintaining +// bidirectional state synchronization with the control plane. // -// It maintains bidirectional state synchronization with the control plane: -// receiving desired state via WatchDeployments and reporting actual -// state via ReportDeploymentStatus. The controller operates independently -// from the SentinelController with its own version cursor and circuit breaker. +// The controller receives desired state via the WatchDeployments stream and reports +// actual state via ReportDeploymentStatus. It operates independently from the sentinel +// controller with its own version cursor and circuit breaker, ensuring that failures +// in one controller don't cascade to the other. +// +// Create a Controller with [New] and start it with [Controller.Start]. The controller +// runs until the context is cancelled or [Controller.Stop] is called. type Controller struct { clientSet kubernetes.Interface dynamicClient dynamic.Interface @@ -31,15 +35,34 @@ type Controller struct { } // Config holds the configuration required to create a new [Controller]. +// +// All fields are required. The ClientSet and DynamicClient are used for Kubernetes +// operations, while Cluster provides the control plane RPC client for state +// synchronization. Region determines which deployments this controller manages. type Config struct { - ClientSet kubernetes.Interface + // ClientSet provides typed Kubernetes API access for ReplicaSet and Pod operations. + ClientSet kubernetes.Interface + + // DynamicClient provides unstructured Kubernetes API access for CiliumNetworkPolicy + // resources that don't have generated Go types. DynamicClient dynamic.Interface - Logger logging.Logger - Cluster ctrlv1connect.ClusterServiceClient - Region string + + // Logger is the structured logger for controller operations. + Logger logging.Logger + + // Cluster is the control plane RPC client for WatchDeployments and + // ReportDeploymentStatus calls. + Cluster ctrlv1connect.ClusterServiceClient + + // Region identifies the cluster region for filtering deployment streams. + Region string } // New creates a [Controller] ready to be started with [Controller.Start]. +// +// The controller initializes with versionLastSeen=0, meaning it will receive all +// pending deployments on first connection. The circuit breaker starts in a closed +// (healthy) state. func New(cfg Config) *Controller { return &Controller{ clientSet: cfg.ClientSet, @@ -53,16 +76,12 @@ func New(cfg Config) *Controller { } } -// Start launches the three background control loops: -// -// - [Controller.runDesiredStateApplyLoop]: Receives desired state from the -// control plane's SyncDeployments stream and applies it to Kubernetes. -// -// - [Controller.runActualStateReportLoop]: Watches Kubernetes for ReplicaSet -// changes and reports actual state back to the control plane. +// Start launches the three background control loops and blocks until they're initialized. // -// - [Controller.runResyncLoop]: Periodically re-queries the control plane for -// each existing ReplicaSet to ensure eventual consistency. +// The method starts [Controller.runResyncLoop] and [Controller.runDesiredStateApplyLoop] +// as background goroutines, and initializes [Controller.runActualStateReportLoop]'s +// Kubernetes watch before returning. If watch initialization fails, Start returns +// the error and no goroutines are left running. // // All loops continue until the context is cancelled or [Controller.Stop] is called. func (c *Controller) Start(ctx context.Context) error { @@ -77,7 +96,8 @@ func (c *Controller) Start(ctx context.Context) error { return nil } -// Stop signals all background goroutines to terminate. +// Stop signals all background goroutines to terminate by closing the done channel. +// Returns nil; the error return exists for interface compatibility. func (c *Controller) Stop() error { close(c.done) return nil diff --git a/svc/krane/internal/deployment/delete.go b/svc/krane/internal/deployment/delete.go index 34f046d8d9..801ddd6d3b 100644 --- a/svc/krane/internal/deployment/delete.go +++ b/svc/krane/internal/deployment/delete.go @@ -12,8 +12,11 @@ import ( // DeleteDeployment removes a user workload's ReplicaSet from the cluster. // // Not-found errors are ignored since the desired end state (resource gone) is -// already achieved. After deletion, the method notifies the control plane so it -// can update routing tables and stop sending traffic to this deployment. +// already achieved. After deletion, the method reports the deletion to the control +// plane so it can update routing tables and stop sending traffic to this deployment. +// +// The method is idempotent: calling it multiple times for the same deployment +// succeeds without error. func (c *Controller) DeleteDeployment(ctx context.Context, req *ctrlv1.DeleteDeployment) error { c.logger.Info("deleting deployment", "namespace", req.GetK8SNamespace(), diff --git a/svc/krane/internal/deployment/desired_state_apply.go b/svc/krane/internal/deployment/desired_state_apply.go index 64f93e38f5..db1f36372a 100644 --- a/svc/krane/internal/deployment/desired_state_apply.go +++ b/svc/krane/internal/deployment/desired_state_apply.go @@ -12,9 +12,13 @@ import ( // runDesiredStateApplyLoop connects to the control plane's WatchDeployments // stream and applies desired state updates to the Kubernetes cluster. // -// The loop automatically reconnects with jittered backoff on stream errors. -// Each received state is processed via applyDesiredState, and the version cursor -// is advanced on successful processing. +// The loop automatically reconnects with jittered backoff (1-5 seconds) on stream +// errors or disconnections. Each received state is processed via [Controller.ApplyDeployment] +// or [Controller.DeleteDeployment], and the version cursor is advanced on successful +// processing to enable resumable streaming. +// +// The loop runs indefinitely until the context is cancelled. It does not use the +// done channel since the jittered sleep handles graceful reconnection. func (c *Controller) runDesiredStateApplyLoop(ctx context.Context) { intervalMin := time.Second intervalMax := 5 * time.Second @@ -32,7 +36,15 @@ func (c *Controller) runDesiredStateApplyLoop(ctx context.Context) { // streamDesiredStateOnce opens a single connection to the control plane's // WatchDeployments stream, processes all received states until the stream -// closes or errors, then returns. The caller handles reconnection. +// closes or errors, then returns. +// +// The method sends the current versionLastSeen to resume from where it left off, +// avoiding reprocessing of already-applied states. On each received message, it +// dispatches to [Controller.ApplyDeployment] or [Controller.DeleteDeployment] based +// on the state type. If processing fails, the method returns the error without +// updating the version cursor, ensuring the state will be retried. +// +// The caller ([Controller.runDesiredStateApplyLoop]) handles reconnection on error. func (c *Controller) streamDesiredStateOnce(ctx context.Context) error { c.logger.Info("connecting to control plane for desired state") diff --git a/svc/krane/internal/deployment/doc.go b/svc/krane/internal/deployment/doc.go index ac2389a4e1..2830059e5b 100644 --- a/svc/krane/internal/deployment/doc.go +++ b/svc/krane/internal/deployment/doc.go @@ -1,44 +1,52 @@ -// Package deployment provides the DeploymentController for managing user workload -// ReplicaSets in Kubernetes. +// Package deployment manages user workload ReplicaSets in Kubernetes as part of +// krane's split control loop architecture. // -// The DeploymentController is one half of krane's split control loop architecture. -// It operates independently from the SentinelController, with its own: -// - Control plane sync stream (SyncDeployments) -// - Version cursor for resumable streaming -// - Circuit breaker for failure isolation -// - Kubernetes watch and refresh loops +// The package provides [Controller], which operates independently from the sentinel +// controller with its own control plane stream, version cursor, and circuit breaker. +// This separation ensures deployment reconciliation continues even when sentinel +// reconciliation experiences failures. // // # Architecture // -// The controller runs three loops for reliability: +// The controller runs three concurrent loops for reliability: // -// - [Controller.runDesiredStateApplyLoop]: Receives desired state from the -// control plane's SyncDeployments stream and applies it to Kubernetes. +// [Controller.runDesiredStateApplyLoop] streams desired state from the control plane's +// WatchDeployments RPC and applies changes to Kubernetes. It uses a version cursor +// for resumable streaming and automatically reconnects with jittered backoff on errors. // -// - [Controller.runActualStateReportLoop]: Watches Kubernetes for ReplicaSet -// changes and reports actual state back to the control plane. +// [Controller.runActualStateReportLoop] watches Kubernetes for ReplicaSet changes and +// reports actual state back to the control plane via ReportDeploymentStatus. This keeps +// the control plane's routing tables synchronized with what's actually running. // -// - [Controller.runResyncLoop]: Periodically re-queries the control plane -// for each existing ReplicaSet to ensure eventual consistency. +// [Controller.runResyncLoop] runs every minute as a consistency safety net. While the +// other loops handle real-time events, they can miss updates during network partitions, +// controller restarts, or buffer overflows. The resync loop queries the control plane +// for each existing ReplicaSet and applies any drift. // -// # Failure Isolation +// # Security // -// By running as an independent controller, deployment reconciliation continues -// even if sentinel reconciliation is experiencing failures. Each controller -// has its own circuit breaker, so errors in one don't affect the other. +// All user workloads run with gVisor isolation (RuntimeClass "gvisor") since they +// execute untrusted code. Each namespace gets a CiliumNetworkPolicy that restricts +// ingress to only sentinels with matching workspace and environment IDs. +// +// # Scheduling +// +// Deployment pods are spread across availability zones using TopologySpreadConstraints +// with maxSkew=1 for high availability. Pod affinity prefers scheduling in the same +// zone as the environment's sentinels to minimize cross-AZ latency. // // # Usage // // ctrl := deployment.New(deployment.Config{ // ClientSet: kubeClient, // DynamicClient: dynamicClient, -// Logger: logger.With("controller", "deployments"), +// Logger: logger, // Cluster: clusterClient, // Region: "us-east-1", // }) // // if err := ctrl.Start(ctx); err != nil { -// return fmt.Errorf("failed to start deployment controller: %w", err) +// return fmt.Errorf("start deployment controller: %w", err) // } // defer ctrl.Stop() package deployment diff --git a/svc/krane/internal/deployment/namespace.go b/svc/krane/internal/deployment/namespace.go index 5256fe15fe..345ef27fd0 100644 --- a/svc/krane/internal/deployment/namespace.go +++ b/svc/krane/internal/deployment/namespace.go @@ -17,13 +17,20 @@ import ( ) const ( - // NamespaceSentinel is the namespace where sentinel pods run. + // NamespaceSentinel is the namespace where all sentinel pods run, separate + // from customer deployment namespaces for isolation. NamespaceSentinel = "sentinel" ) -// ensureNamespaceExists creates the namespace if it doesn't already exist. -// For customer namespaces (non-sentinel), it also creates a CiliumNetworkPolicy -// to allow ingress only from the matching sentinel. +// ensureNamespaceExists creates the namespace if it doesn't already exist and +// configures network policies for customer workloads. +// +// For customer namespaces (anything except "sentinel"), the method also applies +// a CiliumNetworkPolicy that restricts ingress to only sentinels with matching +// workspace and environment IDs. This ensures customer code can only be reached +// by its designated sentinel, not by other tenants' workloads. +// +// The method is idempotent: calling it for an existing namespace succeeds without error. func (c *Controller) ensureNamespaceExists(ctx context.Context, namespace, workspaceID, environmentID string) error { _, err := c.clientSet.CoreV1().Namespaces().Create(ctx, &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ @@ -43,8 +50,14 @@ func (c *Controller) ensureNamespaceExists(ctx context.Context, namespace, works return nil } -// applyCiliumPolicyForNamespace creates a CiliumNetworkPolicy that allows ingress -// only from sentinels with matching workspace and environment IDs. +// applyCiliumPolicyForNamespace creates or updates a CiliumNetworkPolicy that +// restricts ingress to pods matching specific labels. +// +// The policy allows TCP traffic on [DeploymentPort] only from pods in the "sentinel" +// namespace with matching workspace and environment ID labels. This isolates each +// tenant's deployments from other tenants' sentinels and from direct external access. +// +// The method uses server-side apply, making it safe to call repeatedly. // //nolint:exhaustruct func (c *Controller) applyCiliumPolicyForNamespace(ctx context.Context, namespace, workspaceID, environmentID string) error { @@ -117,6 +130,9 @@ func (c *Controller) applyCiliumPolicyForNamespace(ctx context.Context, namespac return err } +// toUnstructured converts a typed Kubernetes object to an unstructured representation +// for use with the dynamic client. This is needed for CRDs like CiliumNetworkPolicy +// that may not have generated client types available. func toUnstructured(obj any) (*unstructured.Unstructured, error) { data, err := json.Marshal(obj) if err != nil { diff --git a/svc/krane/internal/deployment/resync.go b/svc/krane/internal/deployment/resync.go index 4a672d0c45..6393e5f18e 100644 --- a/svc/krane/internal/deployment/resync.go +++ b/svc/krane/internal/deployment/resync.go @@ -14,12 +14,17 @@ import ( // runResyncLoop periodically reconciles all deployment ReplicaSets with their // desired state from the control plane. // -// This loop runs every minute as a consistency safety net. While -// [Controller.runActualStateReportLoop] handles real-time K8s events and +// The loop runs every minute as a consistency safety net. While +// [Controller.runActualStateReportLoop] handles real-time Kubernetes events and // [Controller.runDesiredStateApplyLoop] handles streaming updates, both can miss -// events during network partitions, controller restarts, or buffer overflows. +// events during network partitions, controller restarts, or watch buffer overflows. // This resync loop guarantees eventual consistency by querying the control plane -// for each existing ReplicaSet and applying any needed changes. +// for each existing ReplicaSet and applying any drift. +// +// The loop paginates through all krane-managed deployment ReplicaSets across all +// namespaces, calling GetDesiredDeploymentState for each and applying or deleting +// as directed. Errors are logged but don't stop the loop from processing remaining +// ReplicaSets. func (c *Controller) runResyncLoop(ctx context.Context) { repeat.Every(1*time.Minute, func() { c.logger.Info("running periodic resync") diff --git a/svc/krane/internal/deployment/scheduling.go b/svc/krane/internal/deployment/scheduling.go index 84f9b37dfd..d4794a5784 100644 --- a/svc/krane/internal/deployment/scheduling.go +++ b/svc/krane/internal/deployment/scheduling.go @@ -7,12 +7,18 @@ import ( ) const ( - // topologyKeyZone is the standard Kubernetes topology key for availability zones + // topologyKeyZone is the standard Kubernetes label for availability zones, + // used for spreading pods across zones for high availability. topologyKeyZone = "topology.kubernetes.io/zone" ) -// deploymentTopologySpread returns topology spread constraints for customer deployment pods. -// Spreads pods evenly across availability zones with maxSkew of 1. +// deploymentTopologySpread returns topology spread constraints that distribute +// deployment pods evenly across availability zones. +// +// The constraints use maxSkew=1 with WhenUnsatisfiable=ScheduleAnyway, meaning +// the scheduler prefers even distribution but won't block scheduling if zones +// are imbalanced. This ensures deployments remain schedulable even in degraded +// cluster states while still achieving zone redundancy under normal conditions. func deploymentTopologySpread(deploymentID string) []corev1.TopologySpreadConstraint { return []corev1.TopologySpreadConstraint{ { @@ -26,9 +32,13 @@ func deploymentTopologySpread(deploymentID string) []corev1.TopologySpreadConstr } } -// deploymentAffinity returns affinity rules for customer deployment pods. -// Prefers scheduling in the same AZ as sentinels for the given environment -// to minimize cross-AZ latency between sentinel and customer code. +// deploymentAffinity returns pod affinity rules that prefer co-locating deployment +// pods with their environment's sentinel pods in the same availability zone. +// +// This optimization reduces cross-AZ latency between sentinels and the user code +// they proxy to. The affinity is a soft preference (weight=100) rather than a hard +// requirement, so deployments can still schedule if no sentinel-local zones have +// capacity. func deploymentAffinity(environmentID string) *corev1.Affinity { return &corev1.Affinity{ PodAffinity: &corev1.PodAffinity{ diff --git a/svc/krane/internal/deployment/state.go b/svc/krane/internal/deployment/state.go index 86b5e329e1..01266c540e 100644 --- a/svc/krane/internal/deployment/state.go +++ b/svc/krane/internal/deployment/state.go @@ -12,9 +12,16 @@ import ( ) // buildDeploymentStatus queries the pods belonging to a ReplicaSet and builds a -// status report containing each pod's address, resource allocation, and phase. -// Pods without an IP address are skipped since they can't receive traffic yet. -// The address is formatted as a cluster-local DNS name for in-cluster routing. +// status report for the control plane. +// +// The report includes each pod's cluster-local DNS address, CPU and memory limits, +// and health status. Pods without an IP address are excluded since they can't +// receive traffic yet. The address format is "{ip-with-dashes}.{namespace}.pod.cluster.local:{port}" +// which enables in-cluster DNS resolution without a headless Service. +// +// Pod phase is mapped to instance status: Running pods with all containers ready +// become STATUS_RUNNING, Pending pods become STATUS_PENDING, and Failed pods or +// Running pods with unready containers become STATUS_FAILED. func (c *Controller) buildDeploymentStatus(ctx context.Context, replicaset *appsv1.ReplicaSet) (*ctrlv1.ReportDeploymentStatusRequest, error) { selector, err := metav1.LabelSelectorAsSelector(replicaset.Spec.Selector) if err != nil { From 3ca8148447320f4a10dd90cae46d7002cd9888e0 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 17:39:42 +0100 Subject: [PATCH 14/32] fix: use vault --- cmd/ctrl/worker.go | 22 +----- svc/ctrl/services/acme/BUILD.bazel | 2 +- svc/ctrl/services/acme/user.go | 17 ++--- svc/ctrl/worker/BUILD.bazel | 5 +- svc/ctrl/worker/certificate/BUILD.bazel | 3 +- .../certificate/process_challenge_handler.go | 7 +- svc/ctrl/worker/certificate/service.go | 6 +- svc/ctrl/worker/config.go | 20 ++---- svc/ctrl/worker/deploy/BUILD.bazel | 2 +- svc/ctrl/worker/deploy/service.go | 6 +- svc/ctrl/worker/run.go | 68 +++++-------------- 11 files changed, 51 insertions(+), 107 deletions(-) diff --git a/cmd/ctrl/worker.go b/cmd/ctrl/worker.go index 6e0c69165b..ab7f8d06db 100644 --- a/cmd/ctrl/worker.go +++ b/cmd/ctrl/worker.go @@ -126,25 +126,9 @@ func workerAction(ctx context.Context, cmd *cli.Command) error { // Database configuration DatabasePrimary: cmd.String("database-primary"), - // Vault configuration - General secrets - VaultMasterKeys: cmd.StringSlice("vault-master-keys"), - VaultS3: worker.S3Config{ - URL: cmd.String("vault-s3-url"), - Bucket: cmd.String("vault-s3-bucket"), - AccessKeyID: cmd.String("vault-s3-access-key-id"), - AccessKeySecret: cmd.String("vault-s3-access-key-secret"), - ExternalURL: "", - }, - - // ACME Vault configuration - Let's Encrypt certificates - AcmeVaultMasterKeys: cmd.StringSlice("acme-vault-master-keys"), - AcmeVaultS3: worker.S3Config{ - URL: cmd.String("acme-vault-s3-url"), - Bucket: cmd.String("acme-vault-s3-bucket"), - AccessKeyID: cmd.String("acme-vault-s3-access-key-id"), - AccessKeySecret: cmd.String("acme-vault-s3-access-key-secret"), - ExternalURL: "", - }, + // Vault configuration + VaultURL: cmd.String("vault-url"), + VaultToken: cmd.String("vault-token"), // Build configuration BuildBackend: worker.BuildBackend(cmd.String("build-backend")), diff --git a/svc/ctrl/services/acme/BUILD.bazel b/svc/ctrl/services/acme/BUILD.bazel index 4105eacc94..3d474bb445 100644 --- a/svc/ctrl/services/acme/BUILD.bazel +++ b/svc/ctrl/services/acme/BUILD.bazel @@ -16,12 +16,12 @@ go_library( "//gen/proto/ctrl/v1:ctrl", "//gen/proto/ctrl/v1/ctrlv1connect", "//gen/proto/vault/v1:vault", + "//gen/proto/vault/v1/vaultv1connect", "//internal/services/caches", "//pkg/cache", "//pkg/db", "//pkg/otel/logging", "//pkg/uid", - "//pkg/vault", "@com_connectrpc_connect//:connect", "@com_github_go_acme_lego_v4//acme", "@com_github_go_acme_lego_v4//lego", diff --git a/svc/ctrl/services/acme/user.go b/svc/ctrl/services/acme/user.go index da74f99998..94e736c01e 100644 --- a/svc/ctrl/services/acme/user.go +++ b/svc/ctrl/services/acme/user.go @@ -10,13 +10,14 @@ import ( "fmt" "time" + "connectrpc.com/connect" "github.com/go-acme/lego/v4/lego" "github.com/go-acme/lego/v4/registration" vaultv1 "github.com/unkeyed/unkey/gen/proto/vault/v1" + "github.com/unkeyed/unkey/gen/proto/vault/v1/vaultv1connect" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/otel/logging" "github.com/unkeyed/unkey/pkg/uid" - "github.com/unkeyed/unkey/pkg/vault" ) type AcmeUser struct { @@ -41,7 +42,7 @@ func (u *AcmeUser) GetPrivateKey() crypto.PrivateKey { type UserConfig struct { DB db.Database Logger logging.Logger - Vault *vault.Service + Vault vaultv1connect.VaultServiceClient WorkspaceID string EmailDomain string // Domain for ACME registration emails (e.g., "unkey.com") } @@ -55,15 +56,15 @@ func GetOrCreateUser(ctx context.Context, cfg UserConfig) (*lego.Client, error) return nil, fmt.Errorf("failed to find acme user: %w", err) } - resp, err := cfg.Vault.Decrypt(ctx, &vaultv1.DecryptRequest{ + resp, err := cfg.Vault.Decrypt(ctx, connect.NewRequest(&vaultv1.DecryptRequest{ Keyring: cfg.WorkspaceID, Encrypted: foundUser.EncryptedKey, - }) + })) if err != nil { return nil, fmt.Errorf("failed to decrypt private key: %w", err) } - key, err := stringToPrivateKey(resp.GetPlaintext()) + key, err := stringToPrivateKey(resp.Msg.GetPlaintext()) if err != nil { return nil, fmt.Errorf("failed to convert private key: %w", err) } @@ -131,10 +132,10 @@ func register(ctx context.Context, cfg UserConfig) (*lego.Client, error) { return nil, fmt.Errorf("failed to serialize private key: %w", err) } - resp, err := cfg.Vault.Encrypt(ctx, &vaultv1.EncryptRequest{ + resp, err := cfg.Vault.Encrypt(ctx, connect.NewRequest(&vaultv1.EncryptRequest{ Keyring: cfg.WorkspaceID, Data: privKeyString, - }) + })) if err != nil { return nil, fmt.Errorf("failed to encrypt private key: %w", err) } @@ -143,7 +144,7 @@ func register(ctx context.Context, cfg UserConfig) (*lego.Client, error) { err = db.Query.InsertAcmeUser(ctx, cfg.DB.RW(), db.InsertAcmeUserParams{ ID: id, WorkspaceID: cfg.WorkspaceID, - EncryptedKey: resp.GetEncrypted(), + EncryptedKey: resp.Msg.GetEncrypted(), CreatedAt: time.Now().UnixMilli(), }) if err != nil { diff --git a/svc/ctrl/worker/BUILD.bazel b/svc/ctrl/worker/BUILD.bazel index 0c3a170ff0..b4e3199476 100644 --- a/svc/ctrl/worker/BUILD.bazel +++ b/svc/ctrl/worker/BUILD.bazel @@ -11,6 +11,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//gen/proto/hydra/v1:hydra", + "//gen/proto/vault/v1/vaultv1connect", "//pkg/assert", "//pkg/cache", "//pkg/clickhouse", @@ -19,10 +20,9 @@ go_library( "//pkg/otel/logging", "//pkg/prometheus", "//pkg/retry", + "//pkg/rpc/interceptor", "//pkg/shutdown", "//pkg/uid", - "//pkg/vault", - "//pkg/vault/storage", "//pkg/zen", "//svc/ctrl/pkg/build", "//svc/ctrl/pkg/s3", @@ -31,6 +31,7 @@ go_library( "//svc/ctrl/worker/deploy", "//svc/ctrl/worker/routing", "//svc/ctrl/worker/versioning", + "@com_connectrpc_connect//:connect", "@com_github_go_acme_lego_v4//challenge", "@com_github_restatedev_sdk_go//:sdk-go", "@com_github_restatedev_sdk_go//ingress", diff --git a/svc/ctrl/worker/certificate/BUILD.bazel b/svc/ctrl/worker/certificate/BUILD.bazel index bfb2529531..41c83d3958 100644 --- a/svc/ctrl/worker/certificate/BUILD.bazel +++ b/svc/ctrl/worker/certificate/BUILD.bazel @@ -14,11 +14,12 @@ go_library( deps = [ "//gen/proto/hydra/v1:hydra", "//gen/proto/vault/v1:vault", + "//gen/proto/vault/v1/vaultv1connect", "//pkg/db", "//pkg/otel/logging", "//pkg/uid", - "//pkg/vault", "//svc/ctrl/services/acme", + "@com_connectrpc_connect//:connect", "@com_github_go_acme_lego_v4//certificate", "@com_github_go_acme_lego_v4//challenge", "@com_github_go_acme_lego_v4//lego", diff --git a/svc/ctrl/worker/certificate/process_challenge_handler.go b/svc/ctrl/worker/certificate/process_challenge_handler.go index cf53a07bf8..b22a32eaed 100644 --- a/svc/ctrl/worker/certificate/process_challenge_handler.go +++ b/svc/ctrl/worker/certificate/process_challenge_handler.go @@ -6,6 +6,7 @@ import ( "fmt" "time" + "connectrpc.com/connect" "github.com/go-acme/lego/v4/certificate" "github.com/go-acme/lego/v4/lego" restate "github.com/restatedev/sdk-go" @@ -293,10 +294,10 @@ func (s *Service) obtainCertificate(ctx context.Context, _ string, dom db.Custom } // Encrypt the private key before storage - encryptResp, err := s.vault.Encrypt(ctx, &vaultv1.EncryptRequest{ + encryptResp, err := s.vault.Encrypt(ctx, connect.NewRequest(&vaultv1.EncryptRequest{ Keyring: dom.WorkspaceID, Data: string(certificates.PrivateKey), - }) + })) if err != nil { return EncryptedCertificate{}, fmt.Errorf("failed to encrypt private key: %w", err) } @@ -304,7 +305,7 @@ func (s *Service) obtainCertificate(ctx context.Context, _ string, dom db.Custom return EncryptedCertificate{ CertificateID: uid.New(uid.CertificatePrefix), Certificate: string(certificates.Certificate), - EncryptedPrivateKey: encryptResp.GetEncrypted(), + EncryptedPrivateKey: encryptResp.Msg.GetEncrypted(), ExpiresAt: expiresAt, }, nil } diff --git a/svc/ctrl/worker/certificate/service.go b/svc/ctrl/worker/certificate/service.go index 1c0813934e..547ca57f59 100644 --- a/svc/ctrl/worker/certificate/service.go +++ b/svc/ctrl/worker/certificate/service.go @@ -3,9 +3,9 @@ package certificate import ( "github.com/go-acme/lego/v4/challenge" hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" + "github.com/unkeyed/unkey/gen/proto/vault/v1/vaultv1connect" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/pkg/vault" ) // Service orchestrates ACME certificate issuance and renewal. @@ -26,7 +26,7 @@ import ( type Service struct { hydrav1.UnimplementedCertificateServiceServer db db.Database - vault *vault.Service + vault vaultv1connect.VaultServiceClient logger logging.Logger emailDomain string defaultDomain string @@ -43,7 +43,7 @@ type Config struct { // Vault encrypts private keys before database storage. Keys are encrypted using // the workspace ID as the keyring identifier. - Vault *vault.Service + Vault vaultv1connect.VaultServiceClient // Logger receives structured log output from certificate operations. Logger logging.Logger diff --git a/svc/ctrl/worker/config.go b/svc/ctrl/worker/config.go index 855ffe7f22..c64ebd07a9 100644 --- a/svc/ctrl/worker/config.go +++ b/svc/ctrl/worker/config.go @@ -195,21 +195,13 @@ type Config struct { // Used for both read and write operations to persistent storage. DatabasePrimary string - // VaultMasterKeys are encryption keys for the general vault service. - // Used for encrypting/decrypting environment variables, API keys, etc. - VaultMasterKeys []string + // VaultURL is the URL of the remote vault service for secret encryption. + // Example: "https://vault.unkey.cloud". + VaultURL string - // VaultS3 configures S3 storage for the general vault. - // Stores encrypted secrets data with the provided master keys. - VaultS3 S3Config - - // AcmeVaultMasterKeys are encryption keys for the ACME vault service. - // Separate vault for TLS certificate storage and ACME account data. - AcmeVaultMasterKeys []string - - // AcmeVaultS3 configures S3 storage for the ACME vault. - // Stores encrypted TLS certificates and ACME challenge data. - AcmeVaultS3 S3Config + // VaultToken is the authentication token for the remote vault service. + // Used for bearer authentication when calling vault RPCs. + VaultToken string // Acme configures automatic TLS certificate management. // Enables Let's Encrypt integration for domain certificates. diff --git a/svc/ctrl/worker/deploy/BUILD.bazel b/svc/ctrl/worker/deploy/BUILD.bazel index 5c175d0c2f..0212ffc5e0 100644 --- a/svc/ctrl/worker/deploy/BUILD.bazel +++ b/svc/ctrl/worker/deploy/BUILD.bazel @@ -16,10 +16,10 @@ go_library( deps = [ "//gen/proto/ctrl/v1:ctrl", "//gen/proto/hydra/v1:hydra", + "//gen/proto/vault/v1/vaultv1connect", "//pkg/db", "//pkg/otel/logging", "//pkg/uid", - "//pkg/vault", "//svc/ctrl/pkg/s3", "@com_github_restatedev_sdk_go//:sdk-go", ], diff --git a/svc/ctrl/worker/deploy/service.go b/svc/ctrl/worker/deploy/service.go index 977bdffa64..87932e244b 100644 --- a/svc/ctrl/worker/deploy/service.go +++ b/svc/ctrl/worker/deploy/service.go @@ -2,9 +2,9 @@ package deploy import ( hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" + "github.com/unkeyed/unkey/gen/proto/vault/v1/vaultv1connect" "github.com/unkeyed/unkey/pkg/db" "github.com/unkeyed/unkey/pkg/otel/logging" - "github.com/unkeyed/unkey/pkg/vault" "github.com/unkeyed/unkey/svc/ctrl/pkg/s3" ) @@ -24,7 +24,7 @@ type Workflow struct { logger logging.Logger defaultDomain string - vault *vault.Service + vault vaultv1connect.VaultServiceClient sentinelImage string availableRegions []string buildStorage s3.Storage @@ -44,7 +44,7 @@ type Config struct { DefaultDomain string // Vault provides encryption/decryption services for secrets. - Vault *vault.Service + Vault vaultv1connect.VaultServiceClient // SentinelImage is the Docker image used for sentinel containers. SentinelImage string diff --git a/svc/ctrl/worker/run.go b/svc/ctrl/worker/run.go index 5ccb69c26b..0407975dfe 100644 --- a/svc/ctrl/worker/run.go +++ b/svc/ctrl/worker/run.go @@ -12,11 +12,13 @@ import ( "os" "time" + "connectrpc.com/connect" "github.com/go-acme/lego/v4/challenge" restate "github.com/restatedev/sdk-go" restateIngress "github.com/restatedev/sdk-go/ingress" restateServer "github.com/restatedev/sdk-go/server" hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" + "github.com/unkeyed/unkey/gen/proto/vault/v1/vaultv1connect" "github.com/unkeyed/unkey/pkg/cache" "github.com/unkeyed/unkey/pkg/clickhouse" "github.com/unkeyed/unkey/pkg/clock" @@ -24,10 +26,9 @@ import ( "github.com/unkeyed/unkey/pkg/otel/logging" "github.com/unkeyed/unkey/pkg/prometheus" "github.com/unkeyed/unkey/pkg/retry" + "github.com/unkeyed/unkey/pkg/rpc/interceptor" "github.com/unkeyed/unkey/pkg/shutdown" "github.com/unkeyed/unkey/pkg/uid" - "github.com/unkeyed/unkey/pkg/vault" - "github.com/unkeyed/unkey/pkg/vault/storage" "github.com/unkeyed/unkey/pkg/zen" "github.com/unkeyed/unkey/svc/ctrl/pkg/build" @@ -78,54 +79,17 @@ func Run(ctx context.Context, cfg Config) error { logger = logger.With(slog.String("instanceID", cfg.InstanceID)) } - // Create vault service for general secrets (env vars, API keys, etc.) - var vaultSvc *vault.Service - if len(cfg.VaultMasterKeys) > 0 && cfg.VaultS3.URL != "" { - vaultStorage, vaultStorageErr := storage.NewS3(storage.S3Config{ - Logger: logger, - S3URL: cfg.VaultS3.URL, - S3Bucket: cfg.VaultS3.Bucket, - S3AccessKeyID: cfg.VaultS3.AccessKeyID, - S3AccessKeySecret: cfg.VaultS3.AccessKeySecret, - }) - if vaultStorageErr != nil { - return fmt.Errorf("unable to create vault storage: %w", vaultStorageErr) - } - - vaultSvc, err = vault.New(vault.Config{ - Logger: logger, - Storage: vaultStorage, - MasterKeys: cfg.VaultMasterKeys, - }) - if err != nil { - return fmt.Errorf("unable to create vault service: %w", err) - } - logger.Info("Vault service initialized", "bucket", cfg.VaultS3.Bucket) - } - - // Create separate vault service for ACME certificates - var acmeVaultSvc *vault.Service - if len(cfg.AcmeVaultMasterKeys) > 0 && cfg.AcmeVaultS3.URL != "" { - acmeVaultStorage, acmeStorageErr := storage.NewS3(storage.S3Config{ - Logger: logger, - S3URL: cfg.AcmeVaultS3.URL, - S3Bucket: cfg.AcmeVaultS3.Bucket, - S3AccessKeyID: cfg.AcmeVaultS3.AccessKeyID, - S3AccessKeySecret: cfg.AcmeVaultS3.AccessKeySecret, - }) - if acmeStorageErr != nil { - return fmt.Errorf("unable to create ACME vault storage: %w", acmeStorageErr) - } - - acmeVaultSvc, err = vault.New(vault.Config{ - Logger: logger, - Storage: acmeVaultStorage, - MasterKeys: cfg.AcmeVaultMasterKeys, - }) - if err != nil { - return fmt.Errorf("unable to create ACME vault service: %w", err) - } - logger.Info("ACME vault service initialized", "bucket", cfg.AcmeVaultS3.Bucket) + // Create vault client for remote vault service + var vaultClient vaultv1connect.VaultServiceClient + if cfg.VaultURL != "" { + vaultClient = vaultv1connect.NewVaultServiceClient( + http.DefaultClient, + cfg.VaultURL, + connect.WithInterceptors(interceptor.NewHeaderInjector(map[string]string{ + "Authorization": "Bearer " + cfg.VaultToken, + })), + ) + logger.Info("Vault client initialized", "url", cfg.VaultURL) } // Initialize database @@ -185,7 +149,7 @@ func Run(ctx context.Context, cfg Config) error { Logger: logger, DB: database, DefaultDomain: cfg.DefaultDomain, - Vault: vaultSvc, + Vault: vaultClient, SentinelImage: cfg.SentinelImage, AvailableRegions: cfg.AvailableRegions, BuildStorage: imageStore, @@ -253,7 +217,7 @@ func Run(ctx context.Context, cfg Config) error { restateSrv.Bind(hydrav1.NewCertificateServiceServer(certificate.New(certificate.Config{ Logger: logger, DB: database, - Vault: acmeVaultSvc, + Vault: vaultClient, EmailDomain: cfg.Acme.EmailDomain, DefaultDomain: cfg.DefaultDomain, DNSProvider: dnsProvider, From 8ff17bf34338863d8f33d293a6f4ec66c30f1f71 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 18:13:15 +0100 Subject: [PATCH 15/32] fix: remove duplicated code --- svc/ctrl/workflows/versioning/BUILD.bazel | 16 -------- svc/ctrl/workflows/versioning/doc.go | 25 ----------- .../versioning/next_version_handler.go | 41 ------------------- svc/ctrl/workflows/versioning/service.go | 22 ---------- 4 files changed, 104 deletions(-) delete mode 100644 svc/ctrl/workflows/versioning/BUILD.bazel delete mode 100644 svc/ctrl/workflows/versioning/doc.go delete mode 100644 svc/ctrl/workflows/versioning/next_version_handler.go delete mode 100644 svc/ctrl/workflows/versioning/service.go diff --git a/svc/ctrl/workflows/versioning/BUILD.bazel b/svc/ctrl/workflows/versioning/BUILD.bazel deleted file mode 100644 index d5c14e82dc..0000000000 --- a/svc/ctrl/workflows/versioning/BUILD.bazel +++ /dev/null @@ -1,16 +0,0 @@ -load("@rules_go//go:def.bzl", "go_library") - -go_library( - name = "versioning", - srcs = [ - "doc.go", - "next_version_handler.go", - "service.go", - ], - importpath = "github.com/unkeyed/unkey/svc/ctrl/workflows/versioning", - visibility = ["//visibility:public"], - deps = [ - "//gen/proto/hydra/v1:hydra", - "@com_github_restatedev_sdk_go//:sdk-go", - ], -) diff --git a/svc/ctrl/workflows/versioning/doc.go b/svc/ctrl/workflows/versioning/doc.go deleted file mode 100644 index df5a40d588..0000000000 --- a/svc/ctrl/workflows/versioning/doc.go +++ /dev/null @@ -1,25 +0,0 @@ -// Package versioning provides per-region version counters for state synchronization. -// -// The VersioningService is a Restate virtual object that generates monotonically -// increasing version numbers. These versions are used to track state changes in -// deployments and sentinels tables, enabling efficient incremental synchronization -// between the control plane and edge agents (krane). -// -// # Usage -// -// Before mutating a deployment or sentinel, pass the region as the virtual object key: -// -// client := hydrav1.NewVersioningServiceClient(ctx, region) -// resp, err := client.NextVersion(ctx, &hydrav1.NextVersionRequest{}) -// // Use resp.Version when updating the resource row -// -// Edge agents track their last-seen version and request changes after it: -// -// SELECT * FROM deployments WHERE region = ? AND version > ? ORDER BY version -// -// # Per-Region Pattern -// -// This service uses the region name as the virtual object key, creating one -// version counter per region. This allows version requests for different regions -// to be processed in parallel while maintaining ordering within each region. -package versioning diff --git a/svc/ctrl/workflows/versioning/next_version_handler.go b/svc/ctrl/workflows/versioning/next_version_handler.go deleted file mode 100644 index bdbf1d341c..0000000000 --- a/svc/ctrl/workflows/versioning/next_version_handler.go +++ /dev/null @@ -1,41 +0,0 @@ -package versioning - -import ( - restate "github.com/restatedev/sdk-go" - hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" -) - -const versionStateKey = "version" - -// NextVersion atomically increments and returns the next version number. -// -// The version is durably stored in Restate's virtual object state per region, -// guaranteeing monotonically increasing values within each region. -func (s *Service) NextVersion(ctx restate.ObjectContext, _ *hydrav1.NextVersionRequest) (*hydrav1.NextVersionResponse, error) { - current, err := restate.Get[uint64](ctx, versionStateKey) - if err != nil { - return nil, err - } - - next := current + 1 - restate.Set(ctx, versionStateKey, next) - - return &hydrav1.NextVersionResponse{ - Version: next, - }, nil -} - -// GetVersion returns the current version without incrementing. -// -// Useful for stale cursor detection: if a client's version is older than the -// minimum retained version in the database, they must perform a full bootstrap. -func (s *Service) GetVersion(ctx restate.ObjectContext, _ *hydrav1.GetVersionRequest) (*hydrav1.GetVersionResponse, error) { - current, err := restate.Get[uint64](ctx, versionStateKey) - if err != nil { - return nil, err - } - - return &hydrav1.GetVersionResponse{ - Version: current, - }, nil -} diff --git a/svc/ctrl/workflows/versioning/service.go b/svc/ctrl/workflows/versioning/service.go deleted file mode 100644 index bb41fd8a5a..0000000000 --- a/svc/ctrl/workflows/versioning/service.go +++ /dev/null @@ -1,22 +0,0 @@ -package versioning - -import ( - hydrav1 "github.com/unkeyed/unkey/gen/proto/hydra/v1" -) - -// Service provides per-region, monotonically increasing versions for state sync. -// -// This is a Restate virtual object that maintains a durable counter per region. -// Each call to NextVersion atomically increments and returns the next version -// number for that region, with exactly-once semantics guaranteed by Restate. -type Service struct { - hydrav1.UnimplementedVersioningServiceServer -} - -var _ hydrav1.VersioningServiceServer = (*Service)(nil) - -func New() *Service { - return &Service{ - UnimplementedVersioningServiceServer: hydrav1.UnimplementedVersioningServiceServer{}, - } -} From 4a51c28625e6b456219caa960c880c81081ec428 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 18:18:42 +0100 Subject: [PATCH 16/32] ci: start correct container --- .github/workflows/job_bazel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/job_bazel.yaml b/.github/workflows/job_bazel.yaml index 66fddc460d..68f2c3fec6 100644 --- a/.github/workflows/job_bazel.yaml +++ b/.github/workflows/job_bazel.yaml @@ -33,7 +33,7 @@ jobs: # Running containers is temporary until we moved them inside of bazel, # at that point they are only created if they are actually needed - name: Start for containers - run: docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql ctrl restate -d --wait + run: docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql ctrl-api restate -d --wait - name: Run tests run: bazel test //... --test_output=errors From 15bd60a5c97249ab516b2449cc661648f2cca7b7 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 18:19:16 +0100 Subject: [PATCH 17/32] ci: fix containers --- .github/workflows/job_bazel.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/job_bazel.yaml b/.github/workflows/job_bazel.yaml index 68f2c3fec6..90ff3930bf 100644 --- a/.github/workflows/job_bazel.yaml +++ b/.github/workflows/job_bazel.yaml @@ -32,8 +32,7 @@ jobs: # Running containers is temporary until we moved them inside of bazel, # at that point they are only created if they are actually needed - - name: Start for containers + - name: Start containers run: docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql ctrl-api restate -d --wait - - name: Run tests run: bazel test //... --test_output=errors From 1fb5fa78e9d4a7b804f9618da63674db12d2022a Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 18:26:10 +0100 Subject: [PATCH 18/32] ci: add worker --- .github/workflows/job_bazel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/job_bazel.yaml b/.github/workflows/job_bazel.yaml index 90ff3930bf..a3a02c481e 100644 --- a/.github/workflows/job_bazel.yaml +++ b/.github/workflows/job_bazel.yaml @@ -33,6 +33,6 @@ jobs: # Running containers is temporary until we moved them inside of bazel, # at that point they are only created if they are actually needed - name: Start containers - run: docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql ctrl-api restate -d --wait + run: docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql ctrl-api ctrl-worker restate -d --wait - name: Run tests run: bazel test //... --test_output=errors From 507f85f203a9da29e4a93b6e468b72dca4df9fb4 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 18:32:05 +0100 Subject: [PATCH 19/32] ci: hack --- .github/workflows/job_bazel.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/job_bazel.yaml b/.github/workflows/job_bazel.yaml index a3a02c481e..0a09e4423d 100644 --- a/.github/workflows/job_bazel.yaml +++ b/.github/workflows/job_bazel.yaml @@ -33,6 +33,13 @@ jobs: # Running containers is temporary until we moved them inside of bazel, # at that point they are only created if they are actually needed - name: Start containers - run: docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql ctrl-api ctrl-worker restate -d --wait + run: | + echo "UNKEY_DEPOT_TOKEN=fake" >> ./dev/.env.depot + echo "UNKEY_BUILD_S3_URL=fake" >> ./dev/.env.depot + echo "UNKEY_BUILD_S3_ACCESS_KEY_ID=fake" >> ./dev/.env.depot + echo "UNKEY_BUILD_S3_ACCESS_KEY_SECRET=fake" >> ./dev/.env.depot + echo "UNKEY_REGISTRY_PASSWORD=fake" >> ./dev/.env.depot + + docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql ctrl-api ctrl-worker restate -d --wait - name: Run tests run: bazel test //... --test_output=errors From 6dbe055ea520073fc48b7ad77131ce23ddde608e Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 18:45:03 +0100 Subject: [PATCH 20/32] ci: sleep --- .github/workflows/job_bazel.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/job_bazel.yaml b/.github/workflows/job_bazel.yaml index 0a09e4423d..bd95d8b870 100644 --- a/.github/workflows/job_bazel.yaml +++ b/.github/workflows/job_bazel.yaml @@ -41,5 +41,6 @@ jobs: echo "UNKEY_REGISTRY_PASSWORD=fake" >> ./dev/.env.depot docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql ctrl-api ctrl-worker restate -d --wait + sleep 10 - name: Run tests run: bazel test //... --test_output=errors From 62f7876966fbca9ff57205ccb1e2191c250cce79 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 19:26:16 +0100 Subject: [PATCH 21/32] test: use mock --- .github/workflows/job_bazel.yaml | 10 +- Makefile | 2 +- svc/api/internal/testutil/BUILD.bazel | 3 +- svc/api/internal/testutil/http.go | 23 +--- .../testutil/mock_deployment_client.go | 69 +++++++++++ svc/api/internal/testutil/seed/seed.go | 47 ++++++++ svc/api/routes/register.go | 7 +- .../v2_deploy_create_deployment/200_test.go | 39 ++++-- .../v2_deploy_create_deployment/400_test.go | 15 ++- .../v2_deploy_create_deployment/401_test.go | 15 ++- .../v2_deploy_create_deployment/403_test.go | 15 ++- .../v2_deploy_create_deployment/404_test.go | 111 ++++++++++------- .../v2_deploy_create_deployment/BUILD.bazel | 2 + .../v2_deploy_generate_upload_url/200_test.go | 48 ++++++-- .../v2_deploy_generate_upload_url/400_test.go | 18 ++- .../v2_deploy_generate_upload_url/401_test.go | 18 ++- .../v2_deploy_generate_upload_url/403_test.go | 18 ++- .../v2_deploy_generate_upload_url/404_test.go | 18 ++- .../v2_deploy_generate_upload_url/BUILD.bazel | 2 + .../v2_deploy_get_deployment/200_test.go | 107 ++++++++--------- .../v2_deploy_get_deployment/400_test.go | 7 +- .../v2_deploy_get_deployment/401_test.go | 7 +- .../v2_deploy_get_deployment/403_test.go | 29 +++-- .../v2_deploy_get_deployment/404_test.go | 10 +- .../v2_deploy_get_deployment/BUILD.bazel | 9 +- .../v2_deploy_get_deployment/handler.go | 112 ++++-------------- 26 files changed, 449 insertions(+), 312 deletions(-) create mode 100644 svc/api/internal/testutil/mock_deployment_client.go diff --git a/.github/workflows/job_bazel.yaml b/.github/workflows/job_bazel.yaml index bd95d8b870..f04af1972a 100644 --- a/.github/workflows/job_bazel.yaml +++ b/.github/workflows/job_bazel.yaml @@ -33,14 +33,6 @@ jobs: # Running containers is temporary until we moved them inside of bazel, # at that point they are only created if they are actually needed - name: Start containers - run: | - echo "UNKEY_DEPOT_TOKEN=fake" >> ./dev/.env.depot - echo "UNKEY_BUILD_S3_URL=fake" >> ./dev/.env.depot - echo "UNKEY_BUILD_S3_ACCESS_KEY_ID=fake" >> ./dev/.env.depot - echo "UNKEY_BUILD_S3_ACCESS_KEY_SECRET=fake" >> ./dev/.env.depot - echo "UNKEY_REGISTRY_PASSWORD=fake" >> ./dev/.env.depot - - docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql ctrl-api ctrl-worker restate -d --wait - sleep 10 + run: docker compose -f ./dev/docker-compose.yaml up s3 clickhouse kafka mysql -d --wait - name: Run tests run: bazel test //... --test_output=errors diff --git a/Makefile b/Makefile index 24bbb27863..fd4b3277de 100644 --- a/Makefile +++ b/Makefile @@ -86,7 +86,7 @@ generate: generate-sql ## Generate code from protobuf and other sources .PHONY: test test: ## Run tests with bazel - docker compose -f ./dev/docker-compose.yaml up -d mysql clickhouse s3 kafka restate ctrl-api ctrl-worker --wait + docker compose -f ./dev/docker-compose.yaml up -d mysql clickhouse s3 kafka --wait bazel test //... make clean-docker-test diff --git a/svc/api/internal/testutil/BUILD.bazel b/svc/api/internal/testutil/BUILD.bazel index bd0f10126e..9b2eeb00e7 100644 --- a/svc/api/internal/testutil/BUILD.bazel +++ b/svc/api/internal/testutil/BUILD.bazel @@ -5,10 +5,12 @@ go_library( srcs = [ "doc.go", "http.go", + "mock_deployment_client.go", ], importpath = "github.com/unkeyed/unkey/svc/api/internal/testutil", visibility = ["//svc/api:__subpackages__"], deps = [ + "//gen/proto/ctrl/v1:ctrl", "//gen/proto/ctrl/v1/ctrlv1connect", "//gen/proto/vault/v1:vault", "//internal/services/analytics", @@ -24,7 +26,6 @@ go_library( "//pkg/dockertest", "//pkg/otel/logging", "//pkg/rbac", - "//pkg/rpc/interceptor", "//pkg/testutil/containers", "//pkg/uid", "//pkg/vault", diff --git a/svc/api/internal/testutil/http.go b/svc/api/internal/testutil/http.go index f9481585e9..cb07c8f21a 100644 --- a/svc/api/internal/testutil/http.go +++ b/svc/api/internal/testutil/http.go @@ -5,15 +5,12 @@ import ( "context" "database/sql" "encoding/json" - "fmt" "net/http" "net/http/httptest" "testing" "time" - "connectrpc.com/connect" "github.com/stretchr/testify/require" - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" vaultv1 "github.com/unkeyed/unkey/gen/proto/vault/v1" "github.com/unkeyed/unkey/internal/services/analytics" "github.com/unkeyed/unkey/internal/services/auditlogs" @@ -28,7 +25,6 @@ import ( "github.com/unkeyed/unkey/pkg/dockertest" "github.com/unkeyed/unkey/pkg/otel/logging" "github.com/unkeyed/unkey/pkg/rbac" - "github.com/unkeyed/unkey/pkg/rpc/interceptor" "github.com/unkeyed/unkey/pkg/testutil/containers" "github.com/unkeyed/unkey/pkg/uid" "github.com/unkeyed/unkey/pkg/vault" @@ -70,7 +66,6 @@ type Harness struct { Ratelimit ratelimit.Service Vault *vault.Service AnalyticsConnectionManager analytics.ConnectionManager - CtrlDeploymentClient ctrlv1connect.DeploymentServiceClient seeder *seed.Seeder } @@ -200,18 +195,6 @@ func NewHarness(t *testing.T) *Harness { seeder.Seed(context.Background()) - // Get CTRL service URL and token - ctrlURL, ctrlToken := containers.ControlPlane(t) - - // Create CTRL clients - ctrlDeploymentClient := ctrlv1connect.NewDeploymentServiceClient( - http.DefaultClient, - ctrlURL, - connect.WithInterceptors(interceptor.NewHeaderInjector(map[string]string{ - "Authorization": fmt.Sprintf("Bearer %s", ctrlToken), - })), - ) - audit, err := auditlogs.New(auditlogs.Config{ DB: db, Logger: logger, @@ -232,7 +215,6 @@ func NewHarness(t *testing.T) *Harness { seeder: seeder, Clock: clk, AnalyticsConnectionManager: analyticsConnManager, - CtrlDeploymentClient: ctrlDeploymentClient, Auditlogs: audit, Caches: caches, middleware: []zen.Middleware{ @@ -316,6 +298,11 @@ func (h *Harness) CreateEnvironment(req seed.CreateEnvironmentRequest) db.Enviro return h.seeder.CreateEnvironment(h.t.Context(), req) } +// CreateDeployment creates a deployment within a project and environment. +func (h *Harness) CreateDeployment(req seed.CreateDeploymentRequest) db.Deployment { + return h.seeder.CreateDeployment(context.Background(), req) +} + // DeploymentTestSetup contains all resources needed for deployment tests. type DeploymentTestSetup struct { Workspace db.Workspace diff --git a/svc/api/internal/testutil/mock_deployment_client.go b/svc/api/internal/testutil/mock_deployment_client.go new file mode 100644 index 0000000000..841ad99c36 --- /dev/null +++ b/svc/api/internal/testutil/mock_deployment_client.go @@ -0,0 +1,69 @@ +package testutil + +import ( + "context" + + "connectrpc.com/connect" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" + "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" +) + +var _ ctrlv1connect.DeploymentServiceClient = (*MockDeploymentClient)(nil) + +// MockDeploymentClient is a test double for the control plane's deployment service. +// +// Each method has an optional function field that tests can set to customize +// behavior. If the function is nil, the method returns a sensible default. +// The mock also records calls so tests can verify the correct requests were made. +type MockDeploymentClient struct { + CreateS3UploadURLFunc func(context.Context, *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) + CreateDeploymentFunc func(context.Context, *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) + GetDeploymentFunc func(context.Context, *connect.Request[ctrlv1.GetDeploymentRequest]) (*connect.Response[ctrlv1.GetDeploymentResponse], error) + RollbackFunc func(context.Context, *connect.Request[ctrlv1.RollbackRequest]) (*connect.Response[ctrlv1.RollbackResponse], error) + PromoteFunc func(context.Context, *connect.Request[ctrlv1.PromoteRequest]) (*connect.Response[ctrlv1.PromoteResponse], error) + CreateS3UploadURLCalls []*ctrlv1.CreateS3UploadURLRequest + CreateDeploymentCalls []*ctrlv1.CreateDeploymentRequest + GetDeploymentCalls []*ctrlv1.GetDeploymentRequest + RollbackCalls []*ctrlv1.RollbackRequest + PromoteCalls []*ctrlv1.PromoteRequest +} + +func (m *MockDeploymentClient) CreateS3UploadURL(ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + m.CreateS3UploadURLCalls = append(m.CreateS3UploadURLCalls, req.Msg) + if m.CreateS3UploadURLFunc != nil { + return m.CreateS3UploadURLFunc(ctx, req) + } + return connect.NewResponse(&ctrlv1.CreateS3UploadURLResponse{}), nil +} + +func (m *MockDeploymentClient) CreateDeployment(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + m.CreateDeploymentCalls = append(m.CreateDeploymentCalls, req.Msg) + if m.CreateDeploymentFunc != nil { + return m.CreateDeploymentFunc(ctx, req) + } + return connect.NewResponse(&ctrlv1.CreateDeploymentResponse{}), nil +} + +func (m *MockDeploymentClient) GetDeployment(ctx context.Context, req *connect.Request[ctrlv1.GetDeploymentRequest]) (*connect.Response[ctrlv1.GetDeploymentResponse], error) { + m.GetDeploymentCalls = append(m.GetDeploymentCalls, req.Msg) + if m.GetDeploymentFunc != nil { + return m.GetDeploymentFunc(ctx, req) + } + return connect.NewResponse(&ctrlv1.GetDeploymentResponse{}), nil +} + +func (m *MockDeploymentClient) Rollback(ctx context.Context, req *connect.Request[ctrlv1.RollbackRequest]) (*connect.Response[ctrlv1.RollbackResponse], error) { + m.RollbackCalls = append(m.RollbackCalls, req.Msg) + if m.RollbackFunc != nil { + return m.RollbackFunc(ctx, req) + } + return connect.NewResponse(&ctrlv1.RollbackResponse{}), nil +} + +func (m *MockDeploymentClient) Promote(ctx context.Context, req *connect.Request[ctrlv1.PromoteRequest]) (*connect.Response[ctrlv1.PromoteResponse], error) { + m.PromoteCalls = append(m.PromoteCalls, req.Msg) + if m.PromoteFunc != nil { + return m.PromoteFunc(ctx, req) + } + return connect.NewResponse(&ctrlv1.PromoteResponse{}), nil +} diff --git a/svc/api/internal/testutil/seed/seed.go b/svc/api/internal/testutil/seed/seed.go index 23a55425b6..1beb1f2fc0 100644 --- a/svc/api/internal/testutil/seed/seed.go +++ b/svc/api/internal/testutil/seed/seed.go @@ -600,6 +600,53 @@ type CreatePermissionRequest struct { WorkspaceID string } +// CreateDeploymentRequest configures the deployment to create. +type CreateDeploymentRequest struct { + ID string + WorkspaceID string + ProjectID string + EnvironmentID string + GitBranch string +} + +// CreateDeployment creates a deployment within a project and environment. +func (s *Seeder) CreateDeployment(ctx context.Context, req CreateDeploymentRequest) db.Deployment { + require.NoError(s.t, assert.NotEmpty(req.ID, "Deployment ID must be set")) + require.NoError(s.t, assert.NotEmpty(req.WorkspaceID, "Deployment WorkspaceID must be set")) + require.NoError(s.t, assert.NotEmpty(req.ProjectID, "Deployment ProjectID must be set")) + require.NoError(s.t, assert.NotEmpty(req.EnvironmentID, "Deployment EnvironmentID must be set")) + + createdAt := time.Now().UnixMilli() + err := db.Query.InsertDeployment(ctx, s.DB.RW(), db.InsertDeploymentParams{ + ID: req.ID, + K8sName: "test-" + req.ID, + WorkspaceID: req.WorkspaceID, + ProjectID: req.ProjectID, + EnvironmentID: req.EnvironmentID, + GitCommitSha: sql.NullString{Valid: false}, + GitBranch: sql.NullString{String: req.GitBranch, Valid: req.GitBranch != ""}, + SentinelConfig: []byte("{}"), + GitCommitMessage: sql.NullString{Valid: false}, + GitCommitAuthorHandle: sql.NullString{Valid: false}, + GitCommitAuthorAvatarUrl: sql.NullString{Valid: false}, + GitCommitTimestamp: sql.NullInt64{Valid: false}, + OpenapiSpec: sql.NullString{Valid: false}, + EncryptedEnvironmentVariables: []byte{}, + Command: []byte("[]"), + Status: db.DeploymentsStatusPending, + CpuMillicores: 100, + MemoryMib: 128, + CreatedAt: createdAt, + UpdatedAt: sql.NullInt64{Valid: false}, + }) + require.NoError(s.t, err) + + deployment, err := db.Query.FindDeploymentById(ctx, s.DB.RO(), req.ID) + require.NoError(s.t, err) + + return deployment +} + // CreatePermission creates a permission that can be attached to keys or roles. func (s *Seeder) CreatePermission(ctx context.Context, req CreatePermissionRequest) db.Permission { require.NoError(s.t, assert.NotEmpty(req.WorkspaceID, "Permission WorkspaceID must be set")) diff --git a/svc/api/routes/register.go b/svc/api/routes/register.go index e428f9d494..2e01e59847 100644 --- a/svc/api/routes/register.go +++ b/svc/api/routes/register.go @@ -352,10 +352,9 @@ func Register(srv *zen.Server, svc *Services, info zen.InstanceInfo) { srv.RegisterRoute( defaultMiddlewares, &v2DeployGetDeployment.Handler{ - Logger: svc.Logger, - DB: svc.Database, - Keys: svc.Keys, - CtrlClient: svc.CtrlDeploymentClient, + Logger: svc.Logger, + DB: svc.Database, + Keys: svc.Keys, }, ) diff --git a/svc/api/routes/v2_deploy_create_deployment/200_test.go b/svc/api/routes/v2_deploy_create_deployment/200_test.go index f906feab65..287778fb82 100644 --- a/svc/api/routes/v2_deploy_create_deployment/200_test.go +++ b/svc/api/routes/v2_deploy_create_deployment/200_test.go @@ -1,11 +1,14 @@ package handler_test import ( + "context" "fmt" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/pkg/ptr" "github.com/unkeyed/unkey/svc/api/internal/testutil" "github.com/unkeyed/unkey/svc/api/openapi" @@ -16,10 +19,14 @@ func TestCreateDeploymentSuccessfully(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateDeploymentFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + return connect.NewResponse(&ctrlv1.CreateDeploymentResponse{DeploymentId: "test-deployment-id"}), nil + }, + }, } h.Register(route) @@ -142,10 +149,14 @@ func TestCreateDeploymentWithWildcardPermission(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateDeploymentFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + return connect.NewResponse(&ctrlv1.CreateDeploymentResponse{DeploymentId: "test-deployment-id"}), nil + }, + }, } h.Register(route) @@ -178,10 +189,14 @@ func TestCreateDeploymentWithSpecificProjectPermission(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateDeploymentFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + return connect.NewResponse(&ctrlv1.CreateDeploymentResponse{DeploymentId: "test-deployment-id"}), nil + }, + }, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_create_deployment/400_test.go b/svc/api/routes/v2_deploy_create_deployment/400_test.go index 6f8a2ef7a0..07cb966465 100644 --- a/svc/api/routes/v2_deploy_create_deployment/400_test.go +++ b/svc/api/routes/v2_deploy_create_deployment/400_test.go @@ -1,11 +1,14 @@ package handler_test import ( + "context" "fmt" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/svc/api/internal/testutil" "github.com/unkeyed/unkey/svc/api/openapi" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_create_deployment" @@ -15,10 +18,14 @@ func TestBadRequests(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateDeploymentFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + return connect.NewResponse(&ctrlv1.CreateDeploymentResponse{DeploymentId: "test-deployment-id"}), nil + }, + }, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_create_deployment/401_test.go b/svc/api/routes/v2_deploy_create_deployment/401_test.go index b2a716d0d2..69479f78e6 100644 --- a/svc/api/routes/v2_deploy_create_deployment/401_test.go +++ b/svc/api/routes/v2_deploy_create_deployment/401_test.go @@ -1,10 +1,13 @@ package handler_test import ( + "context" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/svc/api/internal/testutil" "github.com/unkeyed/unkey/svc/api/openapi" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_create_deployment" @@ -14,10 +17,14 @@ func TestUnauthorizedAccess(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateDeploymentFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + return connect.NewResponse(&ctrlv1.CreateDeploymentResponse{DeploymentId: "test-deployment-id"}), nil + }, + }, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_create_deployment/403_test.go b/svc/api/routes/v2_deploy_create_deployment/403_test.go index 8369e1b3d2..e1044f2c52 100644 --- a/svc/api/routes/v2_deploy_create_deployment/403_test.go +++ b/svc/api/routes/v2_deploy_create_deployment/403_test.go @@ -1,11 +1,14 @@ package handler_test import ( + "context" "fmt" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/svc/api/internal/testutil" "github.com/unkeyed/unkey/svc/api/openapi" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_create_deployment" @@ -17,10 +20,14 @@ func TestCreateDeploymentInsufficientPermissions(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateDeploymentFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + return connect.NewResponse(&ctrlv1.CreateDeploymentResponse{DeploymentId: "test-deployment-id"}), nil + }, + }, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_create_deployment/404_test.go b/svc/api/routes/v2_deploy_create_deployment/404_test.go index 8c4514d86f..d327edb81a 100644 --- a/svc/api/routes/v2_deploy_create_deployment/404_test.go +++ b/svc/api/routes/v2_deploy_create_deployment/404_test.go @@ -1,75 +1,100 @@ package handler_test import ( + "context" "fmt" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/pkg/uid" "github.com/unkeyed/unkey/svc/api/internal/testutil" "github.com/unkeyed/unkey/svc/api/openapi" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_create_deployment" ) -func TestNotFound(t *testing.T) { +func TestProjectNotFound(t *testing.T) { h := testutil.NewHarness(t) + setup := h.CreateTestDeploymentSetup(testutil.CreateTestDeploymentSetupOptions{ + Permissions: []string{"project.*.create_deployment"}, + }) + route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateDeploymentFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + return connect.NewResponse(&ctrlv1.CreateDeploymentResponse{DeploymentId: "test-deployment-id"}), nil + }, + }, } h.Register(route) + headers := http.Header{ + "Content-Type": {"application/json"}, + "Authorization": {fmt.Sprintf("Bearer %s", setup.RootKey)}, + } + + req := handler.Request{ + ProjectId: uid.New(uid.ProjectPrefix), // Non-existent project ID + Branch: "main", + EnvironmentSlug: "production", + } + + err := req.FromV2DeployImageSource(openapi.V2DeployImageSource{ + Image: "nginx:latest", + }) + + require.NoError(t, err, "failed to set image source") + + res := testutil.CallRoute[handler.Request, openapi.NotFoundErrorResponse](h, route, headers, req) + require.Equal(t, http.StatusNotFound, res.Status, "expected 404, received: %s", res.RawBody) + require.NotNil(t, res.Body) + require.Equal(t, "https://unkey.com/docs/errors/unkey/data/project_not_found", res.Body.Error.Type) + require.Equal(t, http.StatusNotFound, res.Body.Error.Status) + require.Equal(t, "The requested project does not exist or has been deleted.", res.Body.Error.Detail) +} + +func TestEnvironmentNotFound(t *testing.T) { + h := testutil.NewHarness(t) + setup := h.CreateTestDeploymentSetup(testutil.CreateTestDeploymentSetupOptions{ Permissions: []string{"project.*.create_deployment"}, }) + route := &handler.Handler{ + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateDeploymentFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + return nil, connect.NewError(connect.CodeNotFound, fmt.Errorf("environment not found")) + }, + }, + } + h.Register(route) + headers := http.Header{ "Content-Type": {"application/json"}, "Authorization": {fmt.Sprintf("Bearer %s", setup.RootKey)}, } - t.Run("project not found", func(t *testing.T) { - req := handler.Request{ - ProjectId: uid.New(uid.ProjectPrefix), // Non-existent project ID - Branch: "main", - EnvironmentSlug: "production", - } - - err := req.FromV2DeployImageSource(openapi.V2DeployImageSource{ - Image: "nginx:latest", - }) - - require.NoError(t, err, "failed to set image source") - - res := testutil.CallRoute[handler.Request, openapi.NotFoundErrorResponse](h, route, headers, req) - require.Equal(t, http.StatusNotFound, res.Status, "expected 404, received: %s", res.RawBody) - require.NotNil(t, res.Body) - require.Equal(t, "https://unkey.com/docs/errors/unkey/data/project_not_found", res.Body.Error.Type) - require.Equal(t, http.StatusNotFound, res.Body.Error.Status) - require.Equal(t, "The requested project does not exist or has been deleted.", res.Body.Error.Detail) - }) + req := handler.Request{ + ProjectId: setup.Project.ID, + Branch: "main", + EnvironmentSlug: "nonexistent-env", // Non-existent environment + } - t.Run("environment not found", func(t *testing.T) { - req := handler.Request{ - ProjectId: setup.Project.ID, - Branch: "main", - EnvironmentSlug: "nonexistent-env", // Non-existent environment - } - - err := req.FromV2DeployImageSource(openapi.V2DeployImageSource{ - Image: "nginx:latest", - }) - require.NoError(t, err, "failed to set image source") - - res := testutil.CallRoute[handler.Request, openapi.NotFoundErrorResponse](h, route, headers, req) - require.Equal(t, http.StatusNotFound, res.Status, "expected 404, received: %s", res.RawBody) - require.NotNil(t, res.Body) - require.Equal(t, "https://unkey.com/docs/errors/unkey/data/project_not_found", res.Body.Error.Type) - require.Equal(t, http.StatusNotFound, res.Body.Error.Status) - require.Equal(t, "Project not found.", res.Body.Error.Detail) + err := req.FromV2DeployImageSource(openapi.V2DeployImageSource{ + Image: "nginx:latest", }) + require.NoError(t, err, "failed to set image source") + + res := testutil.CallRoute[handler.Request, openapi.NotFoundErrorResponse](h, route, headers, req) + require.Equal(t, http.StatusNotFound, res.Status, "expected 404, received: %s", res.RawBody) + require.NotNil(t, res.Body) } diff --git a/svc/api/routes/v2_deploy_create_deployment/BUILD.bazel b/svc/api/routes/v2_deploy_create_deployment/BUILD.bazel index 3b8df72e39..34caca2429 100644 --- a/svc/api/routes/v2_deploy_create_deployment/BUILD.bazel +++ b/svc/api/routes/v2_deploy_create_deployment/BUILD.bazel @@ -32,10 +32,12 @@ go_test( ], deps = [ ":v2_deploy_create_deployment", + "//gen/proto/ctrl/v1:ctrl", "//pkg/ptr", "//pkg/uid", "//svc/api/internal/testutil", "//svc/api/openapi", + "@com_connectrpc_connect//:connect", "@com_github_stretchr_testify//require", ], ) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/200_test.go b/svc/api/routes/v2_deploy_generate_upload_url/200_test.go index cee6bfe6fa..a98922b0a9 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/200_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/200_test.go @@ -1,11 +1,14 @@ package handler_test import ( + "context" "fmt" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/svc/api/internal/testutil" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_generate_upload_url" ) @@ -14,10 +17,17 @@ func TestGenerateUploadUrlSuccessfully(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateS3UploadURLFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + return connect.NewResponse(&ctrlv1.CreateS3UploadURLResponse{ + UploadUrl: "https://s3.example.com/upload", + BuildContextPath: "s3://bucket/path/to/context.tar.gz", + }), nil + }, + }, } h.Register(route) @@ -55,10 +65,17 @@ func TestGenerateUploadUrlWithWildcardPermission(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateS3UploadURLFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + return connect.NewResponse(&ctrlv1.CreateS3UploadURLResponse{ + UploadUrl: "https://s3.example.com/upload", + BuildContextPath: "s3://bucket/path/to/context.tar.gz", + }), nil + }, + }, } h.Register(route) @@ -86,10 +103,17 @@ func TestGenerateUploadUrlWithSpecificProjectPermission(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateS3UploadURLFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + return connect.NewResponse(&ctrlv1.CreateS3UploadURLResponse{ + UploadUrl: "https://s3.example.com/upload", + BuildContextPath: "s3://bucket/path/to/context.tar.gz", + }), nil + }, + }, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/400_test.go b/svc/api/routes/v2_deploy_generate_upload_url/400_test.go index b970fe12fd..5f94a46911 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/400_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/400_test.go @@ -1,11 +1,14 @@ package handler_test import ( + "context" "fmt" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/svc/api/internal/testutil" "github.com/unkeyed/unkey/svc/api/openapi" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_generate_upload_url" @@ -15,10 +18,17 @@ func TestBadRequests(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateS3UploadURLFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + return connect.NewResponse(&ctrlv1.CreateS3UploadURLResponse{ + UploadUrl: "https://s3.example.com/upload", + BuildContextPath: "s3://bucket/path/to/context.tar.gz", + }), nil + }, + }, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/401_test.go b/svc/api/routes/v2_deploy_generate_upload_url/401_test.go index 454c7f2ede..0944caf86f 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/401_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/401_test.go @@ -1,10 +1,13 @@ package handler_test import ( + "context" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/svc/api/internal/testutil" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_generate_upload_url" ) @@ -13,10 +16,17 @@ func TestUnauthorizedAccess(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateS3UploadURLFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + return connect.NewResponse(&ctrlv1.CreateS3UploadURLResponse{ + UploadUrl: "https://s3.example.com/upload", + BuildContextPath: "s3://bucket/path/to/context.tar.gz", + }), nil + }, + }, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/403_test.go b/svc/api/routes/v2_deploy_generate_upload_url/403_test.go index d0bb73b948..279915c15c 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/403_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/403_test.go @@ -1,11 +1,14 @@ package handler_test import ( + "context" "fmt" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/svc/api/internal/testutil" "github.com/unkeyed/unkey/svc/api/openapi" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_generate_upload_url" @@ -17,10 +20,17 @@ func TestGenerateUploadUrlInsufficientPermissions(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateS3UploadURLFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + return connect.NewResponse(&ctrlv1.CreateS3UploadURLResponse{ + UploadUrl: "https://s3.example.com/upload", + BuildContextPath: "s3://bucket/path/to/context.tar.gz", + }), nil + }, + }, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/404_test.go b/svc/api/routes/v2_deploy_generate_upload_url/404_test.go index 4b58a6b720..64f6b249fe 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/404_test.go +++ b/svc/api/routes/v2_deploy_generate_upload_url/404_test.go @@ -1,11 +1,14 @@ package handler_test import ( + "context" "fmt" "net/http" "testing" + "connectrpc.com/connect" "github.com/stretchr/testify/require" + ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" "github.com/unkeyed/unkey/pkg/uid" "github.com/unkeyed/unkey/svc/api/internal/testutil" "github.com/unkeyed/unkey/svc/api/openapi" @@ -16,10 +19,17 @@ func TestNotFound(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + CtrlClient: &testutil.MockDeploymentClient{ + CreateS3UploadURLFunc: func(ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + return connect.NewResponse(&ctrlv1.CreateS3UploadURLResponse{ + UploadUrl: "https://s3.example.com/upload", + BuildContextPath: "s3://bucket/path/to/context.tar.gz", + }), nil + }, + }, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_generate_upload_url/BUILD.bazel b/svc/api/routes/v2_deploy_generate_upload_url/BUILD.bazel index d1370e7822..ff532c23fa 100644 --- a/svc/api/routes/v2_deploy_generate_upload_url/BUILD.bazel +++ b/svc/api/routes/v2_deploy_generate_upload_url/BUILD.bazel @@ -35,9 +35,11 @@ go_test( ], deps = [ ":v2_deploy_generate_upload_url", + "//gen/proto/ctrl/v1:ctrl", "//pkg/uid", "//svc/api/internal/testutil", "//svc/api/openapi", + "@com_connectrpc_connect//:connect", "@com_github_stretchr_testify//require", ], ) diff --git a/svc/api/routes/v2_deploy_get_deployment/200_test.go b/svc/api/routes/v2_deploy_get_deployment/200_test.go index 45942e10ba..e347c70caf 100644 --- a/svc/api/routes/v2_deploy_get_deployment/200_test.go +++ b/svc/api/routes/v2_deploy_get_deployment/200_test.go @@ -1,36 +1,40 @@ package handler_test import ( - "context" "fmt" "net/http" "testing" - "connectrpc.com/connect" "github.com/stretchr/testify/require" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" + "github.com/unkeyed/unkey/pkg/uid" "github.com/unkeyed/unkey/svc/api/internal/testutil" + "github.com/unkeyed/unkey/svc/api/internal/testutil/seed" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_get_deployment" ) func TestGetDeploymentSuccessfully(t *testing.T) { h := testutil.NewHarness(t) - route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, - } - h.Register(route) - t.Run("get existing deployment successfully", func(t *testing.T) { setup := h.CreateTestDeploymentSetup(testutil.CreateTestDeploymentSetupOptions{ Permissions: []string{"project.*.create_deployment", "project.*.read_deployment"}, }) - deploymentID := createTestDeployment(t, h.CtrlDeploymentClient, setup.Project.ID, setup.RootKey) + deploymentID := uid.New(uid.DeploymentPrefix) + h.CreateDeployment(seed.CreateDeploymentRequest{ + ID: deploymentID, + WorkspaceID: setup.Workspace.ID, + ProjectID: setup.Project.ID, + EnvironmentID: setup.Environment.ID, + GitBranch: "main", + }) + + route := &handler.Handler{ + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + } + h.Register(route) headers := http.Header{ "Content-Type": {"application/json"}, @@ -60,22 +64,26 @@ func TestGetDeploymentWithWildcardPermission(t *testing.T) { t.Parallel() h := testutil.NewHarness(t) - route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, - } - h.Register(route) - - // Create setup with create_deployment permission to create a test deployment setupCreate := h.CreateTestDeploymentSetup(testutil.CreateTestDeploymentSetupOptions{ Permissions: []string{"project.*.create_deployment"}, }) - deploymentID := createTestDeployment(t, h.CtrlDeploymentClient, setupCreate.Project.ID, setupCreate.RootKey) + deploymentID := uid.New(uid.DeploymentPrefix) + h.CreateDeployment(seed.CreateDeploymentRequest{ + ID: deploymentID, + WorkspaceID: setupCreate.Workspace.ID, + ProjectID: setupCreate.Project.ID, + EnvironmentID: setupCreate.Environment.ID, + GitBranch: "main", + }) + + route := &handler.Handler{ + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + } + h.Register(route) - // Now create a separate key with wildcard read_deployment permission for the actual test rootKey := h.CreateRootKey(setupCreate.Workspace.ID, "project.*.read_deployment") headers := http.Header{ @@ -96,22 +104,26 @@ func TestGetDeploymentWithSpecificProjectPermission(t *testing.T) { t.Parallel() h := testutil.NewHarness(t) - route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, - } - h.Register(route) - - // Create setup with create_deployment permission to create a test deployment setupCreate := h.CreateTestDeploymentSetup(testutil.CreateTestDeploymentSetupOptions{ Permissions: []string{"project.*.create_deployment"}, }) - deploymentID := createTestDeployment(t, h.CtrlDeploymentClient, setupCreate.Project.ID, setupCreate.RootKey) + deploymentID := uid.New(uid.DeploymentPrefix) + h.CreateDeployment(seed.CreateDeploymentRequest{ + ID: deploymentID, + WorkspaceID: setupCreate.Workspace.ID, + ProjectID: setupCreate.Project.ID, + EnvironmentID: setupCreate.Environment.ID, + GitBranch: "main", + }) + + route := &handler.Handler{ + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + } + h.Register(route) - // Now create a separate key with project-specific read_deployment permission for the actual test rootKey := h.CreateRootKey(setupCreate.Workspace.ID, fmt.Sprintf("project.%s.read_deployment", setupCreate.Project.ID)) headers := http.Header{ @@ -127,28 +139,3 @@ func TestGetDeploymentWithSpecificProjectPermission(t *testing.T) { require.Equal(t, http.StatusOK, res.Status, "Expected 200, got: %d", res.Status) require.NotNil(t, res.Body) } - -func createTestDeployment(t *testing.T, client ctrlv1connect.DeploymentServiceClient, projectID, rootKey string) string { - t.Helper() - - req := &ctrlv1.CreateDeploymentRequest{ - ProjectId: projectID, - Branch: "main", - EnvironmentSlug: "production", - Source: &ctrlv1.CreateDeploymentRequest_DockerImage{ - DockerImage: "nginx:latest", - }, - GitCommit: &ctrlv1.GitCommitInfo{ - CommitSha: "abc123", - }, - } - - connectReq := connect.NewRequest(req) - connectReq.Header().Set("Authorization", fmt.Sprintf("Bearer %s", rootKey)) - - resp, err := client.CreateDeployment(context.Background(), connectReq) - require.NoError(t, err) - require.NotEmpty(t, resp.Msg.GetDeploymentId()) - - return resp.Msg.GetDeploymentId() -} diff --git a/svc/api/routes/v2_deploy_get_deployment/400_test.go b/svc/api/routes/v2_deploy_get_deployment/400_test.go index 3ab0ada892..a561136085 100644 --- a/svc/api/routes/v2_deploy_get_deployment/400_test.go +++ b/svc/api/routes/v2_deploy_get_deployment/400_test.go @@ -15,10 +15,9 @@ func TestBadRequests(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_get_deployment/401_test.go b/svc/api/routes/v2_deploy_get_deployment/401_test.go index 2c16f1b450..6b990f27ec 100644 --- a/svc/api/routes/v2_deploy_get_deployment/401_test.go +++ b/svc/api/routes/v2_deploy_get_deployment/401_test.go @@ -13,10 +13,9 @@ func TestUnauthorizedAccess(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, } h.Register(route) diff --git a/svc/api/routes/v2_deploy_get_deployment/403_test.go b/svc/api/routes/v2_deploy_get_deployment/403_test.go index 415e7a4632..435d0db38b 100644 --- a/svc/api/routes/v2_deploy_get_deployment/403_test.go +++ b/svc/api/routes/v2_deploy_get_deployment/403_test.go @@ -6,7 +6,9 @@ import ( "testing" "github.com/stretchr/testify/require" + "github.com/unkeyed/unkey/pkg/uid" "github.com/unkeyed/unkey/svc/api/internal/testutil" + "github.com/unkeyed/unkey/svc/api/internal/testutil/seed" "github.com/unkeyed/unkey/svc/api/openapi" handler "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_get_deployment" ) @@ -16,23 +18,26 @@ func TestGetDeploymentInsufficientPermissions(t *testing.T) { h := testutil.NewHarness(t) - route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, - } - h.Register(route) - - // Create setup with create_deployment permission to create a test deployment setupCreate := h.CreateTestDeploymentSetup(testutil.CreateTestDeploymentSetupOptions{ Permissions: []string{"project.*.create_deployment"}, }) - // Create an actual deployment - deploymentID := createTestDeployment(t, h.CtrlDeploymentClient, setupCreate.Project.ID, setupCreate.RootKey) + deploymentID := uid.New(uid.DeploymentPrefix) + h.CreateDeployment(seed.CreateDeploymentRequest{ + ID: deploymentID, + WorkspaceID: setupCreate.Workspace.ID, + ProjectID: setupCreate.Project.ID, + EnvironmentID: setupCreate.Environment.ID, + GitBranch: "main", + }) + + route := &handler.Handler{ + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, + } + h.Register(route) - // Now create a key with insufficient permissions (no read_deployment) rootKeyWithoutRead := h.CreateRootKey(setupCreate.Workspace.ID, "project.*.create_deployment") headers := http.Header{ diff --git a/svc/api/routes/v2_deploy_get_deployment/404_test.go b/svc/api/routes/v2_deploy_get_deployment/404_test.go index d843a12ae8..5ece49dd4f 100644 --- a/svc/api/routes/v2_deploy_get_deployment/404_test.go +++ b/svc/api/routes/v2_deploy_get_deployment/404_test.go @@ -14,10 +14,9 @@ func TestNotFound(t *testing.T) { h := testutil.NewHarness(t) route := &handler.Handler{ - Logger: h.Logger, - DB: h.DB, - Keys: h.Keys, - CtrlClient: h.CtrlDeploymentClient, + Logger: h.Logger, + DB: h.DB, + Keys: h.Keys, } h.Register(route) @@ -37,8 +36,7 @@ func TestNotFound(t *testing.T) { res := testutil.CallRoute[handler.Request, handler.Response](h, route, headers, req) - // CTRL service returns 500 for not found errors, not 404 - require.Equal(t, http.StatusNotFound, res.Status, "expected 504, received: %s", res) + require.Equal(t, http.StatusNotFound, res.Status, "expected 404, received: %s", res) require.NotNil(t, res.Body) }) } diff --git a/svc/api/routes/v2_deploy_get_deployment/BUILD.bazel b/svc/api/routes/v2_deploy_get_deployment/BUILD.bazel index 84e16d15f9..e30af0d7e5 100644 --- a/svc/api/routes/v2_deploy_get_deployment/BUILD.bazel +++ b/svc/api/routes/v2_deploy_get_deployment/BUILD.bazel @@ -6,8 +6,6 @@ go_library( importpath = "github.com/unkeyed/unkey/svc/api/routes/v2_deploy_get_deployment", visibility = ["//visibility:public"], deps = [ - "//gen/proto/ctrl/v1:ctrl", - "//gen/proto/ctrl/v1/ctrlv1connect", "//internal/services/keys", "//pkg/codes", "//pkg/db", @@ -15,9 +13,7 @@ go_library( "//pkg/otel/logging", "//pkg/rbac", "//pkg/zen", - "//svc/api/internal/ctrlclient", "//svc/api/openapi", - "@com_connectrpc_connect//:connect", ], ) @@ -32,11 +28,10 @@ go_test( ], deps = [ ":v2_deploy_get_deployment", - "//gen/proto/ctrl/v1:ctrl", - "//gen/proto/ctrl/v1/ctrlv1connect", + "//pkg/uid", "//svc/api/internal/testutil", + "//svc/api/internal/testutil/seed", "//svc/api/openapi", - "@com_connectrpc_connect//:connect", "@com_github_stretchr_testify//require", ], ) diff --git a/svc/api/routes/v2_deploy_get_deployment/handler.go b/svc/api/routes/v2_deploy_get_deployment/handler.go index 19230048a2..66cfc656d5 100644 --- a/svc/api/routes/v2_deploy_get_deployment/handler.go +++ b/svc/api/routes/v2_deploy_get_deployment/handler.go @@ -3,10 +3,8 @@ package handler import ( "context" "net/http" + "strings" - "connectrpc.com/connect" - ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" - "github.com/unkeyed/unkey/gen/proto/ctrl/v1/ctrlv1connect" "github.com/unkeyed/unkey/internal/services/keys" "github.com/unkeyed/unkey/pkg/codes" "github.com/unkeyed/unkey/pkg/db" @@ -14,7 +12,6 @@ import ( "github.com/unkeyed/unkey/pkg/otel/logging" "github.com/unkeyed/unkey/pkg/rbac" "github.com/unkeyed/unkey/pkg/zen" - "github.com/unkeyed/unkey/svc/api/internal/ctrlclient" "github.com/unkeyed/unkey/svc/api/openapi" ) @@ -24,10 +21,9 @@ type ( ) type Handler struct { - Logger logging.Logger - DB db.Database - Keys keys.KeyService - CtrlClient ctrlv1connect.DeploymentServiceClient + Logger logging.Logger + DB db.Database + Keys keys.KeyService } func (h *Handler) Path() string { @@ -50,7 +46,7 @@ func (h *Handler) Handle(ctx context.Context, s *zen.Session) error { return err } - dbDeployment, err := db.Query.FindDeploymentById(ctx, h.DB.RO(), req.DeploymentId) + deployment, err := db.Query.FindDeploymentById(ctx, h.DB.RO(), req.DeploymentId) if err != nil { if db.IsNotFound(err) { return fault.New("deployment not found", @@ -63,7 +59,7 @@ func (h *Handler) Handle(ctx context.Context, s *zen.Session) error { } // Verify deployment belongs to the authenticated workspace - if dbDeployment.WorkspaceID != auth.AuthorizedWorkspaceID { + if deployment.WorkspaceID != auth.AuthorizedWorkspaceID { return fault.New("wrong workspace", fault.Code(codes.Data.Project.NotFound.URN()), fault.Internal("wrong workspace, masking as 404"), @@ -72,7 +68,7 @@ func (h *Handler) Handle(ctx context.Context, s *zen.Session) error { } // Extract projectID from deployment - projectID := dbDeployment.ProjectID + projectID := deployment.ProjectID err = auth.VerifyRootKey(ctx, keys.WithPermissions(rbac.Or( rbac.T(rbac.Tuple{ @@ -90,76 +86,27 @@ func (h *Handler) Handle(ctx context.Context, s *zen.Session) error { return err } - ctrlReq := &ctrlv1.GetDeploymentRequest{ - DeploymentId: req.DeploymentId, - } - connectReq := connect.NewRequest(ctrlReq) - - ctrlResp, err := h.CtrlClient.GetDeployment(ctx, connectReq) - if err != nil { - return ctrlclient.HandleError(err, "get deployment") - } - - deployment := ctrlResp.Msg.GetDeployment() - - // Transform status enum to string - statusStr := deploymentStatusToString(deployment.GetStatus()) - - // Transform steps - var steps *[]openapi.V2DeployDeploymentStep - if deployment.GetSteps() != nil { - stepsSlice := make([]openapi.V2DeployDeploymentStep, len(deployment.GetSteps())) - for i, protoStep := range deployment.GetSteps() { - step := openapi.V2DeployDeploymentStep{ - ErrorMessage: nil, - CreatedAt: nil, - Message: nil, - Status: nil, - } - - if protoStep.GetStatus() != "" { - status := protoStep.GetStatus() - step.Status = &status - } - if protoStep.GetMessage() != "" { - message := protoStep.GetMessage() - step.Message = &message - } - if protoStep.GetErrorMessage() != "" { - errMessage := protoStep.GetErrorMessage() - step.ErrorMessage = &errMessage - } - if protoStep.GetCreatedAt() != 0 { - createdAt := protoStep.GetCreatedAt() - step.CreatedAt = &createdAt - } - stepsSlice[i] = step - } - steps = &stepsSlice - } - + // Build response directly from database model responseData := openapi.V2DeployGetDeploymentResponseData{ - Id: deployment.GetId(), - Status: openapi.V2DeployGetDeploymentResponseDataStatus(statusStr), + Id: deployment.ID, + Status: dbStatusToOpenAPI(deployment.Status), Steps: nil, ErrorMessage: nil, Hostnames: nil, } - if deployment.GetErrorMessage() != "" { - errorMessage := deployment.GetErrorMessage() - responseData.ErrorMessage = &errorMessage - } - - if len(deployment.GetHostnames()) > 0 { - hostnames := deployment.GetHostnames() + // Fetch hostnames from frontline routes + routes, routesErr := db.Query.FindFrontlineRoutesByDeploymentID(ctx, h.DB.RO(), req.DeploymentId) + if routesErr != nil { + h.Logger.Warn("failed to fetch frontline routes for deployment", "error", routesErr, "deployment_id", deployment.ID) + } else if len(routes) > 0 { + hostnames := make([]string, len(routes)) + for i, route := range routes { + hostnames[i] = route.FullyQualifiedDomainName + } responseData.Hostnames = &hostnames } - if steps != nil { - responseData.Steps = steps - } - return s.JSON(http.StatusOK, Response{ Meta: openapi.Meta{ RequestId: s.RequestID(), @@ -168,23 +115,6 @@ func (h *Handler) Handle(ctx context.Context, s *zen.Session) error { }) } -func deploymentStatusToString(status ctrlv1.DeploymentStatus) string { - switch status { - case ctrlv1.DeploymentStatus_DEPLOYMENT_STATUS_UNSPECIFIED: - return "UNSPECIFIED" - case ctrlv1.DeploymentStatus_DEPLOYMENT_STATUS_PENDING: - return "PENDING" - case ctrlv1.DeploymentStatus_DEPLOYMENT_STATUS_BUILDING: - return "BUILDING" - case ctrlv1.DeploymentStatus_DEPLOYMENT_STATUS_DEPLOYING: - return "DEPLOYING" - case ctrlv1.DeploymentStatus_DEPLOYMENT_STATUS_NETWORK: - return "NETWORK" - case ctrlv1.DeploymentStatus_DEPLOYMENT_STATUS_READY: - return "READY" - case ctrlv1.DeploymentStatus_DEPLOYMENT_STATUS_FAILED: - return "FAILED" - default: - return "UNSPECIFIED" - } +func dbStatusToOpenAPI(status db.DeploymentsStatus) openapi.V2DeployGetDeploymentResponseDataStatus { + return openapi.V2DeployGetDeploymentResponseDataStatus(strings.ToUpper(string(status))) } From 3e57b63c819f20a110bc65dc9e2b1a5991bd68d3 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 22:04:23 +0100 Subject: [PATCH 22/32] fix: use vault service in docker --- dev/docker-compose.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dev/docker-compose.yaml b/dev/docker-compose.yaml index b8838ce70c..1e7246db01 100644 --- a/dev/docker-compose.yaml +++ b/dev/docker-compose.yaml @@ -385,6 +385,9 @@ services: clickhouse: condition: service_healthy required: true + vault: + condition: service_healthy + required: true volumes: - /var/run/docker.sock:/var/run/docker.sock environment: @@ -401,12 +404,9 @@ services: UNKEY_RESTATE_REGISTER_AS: "http://ctrl-worker:9080" UNKEY_RESTATE_API_KEY: "" - # Vault - General secrets (env vars, API keys) - UNKEY_VAULT_S3_URL: "http://s3:3902" - UNKEY_VAULT_S3_BUCKET: "vault" - UNKEY_VAULT_S3_ACCESS_KEY_ID: "minio_root_user" - UNKEY_VAULT_S3_ACCESS_KEY_SECRET: "minio_root_password" - UNKEY_VAULT_MASTER_KEYS: "Ch9rZWtfMmdqMFBJdVhac1NSa0ZhNE5mOWlLSnBHenFPENTt7an5MRogENt9Si6wms4pQ2XIvqNSIgNpaBenJmXgcInhu6Nfv2U=" + # Vault service for secret encryption + UNKEY_VAULT_URL: "http://vault:8060" + UNKEY_VAULT_TOKEN: "vault-test-token-123" # Build configuration (loaded from .env.depot) UNKEY_BUILD_S3_BUCKET: "build-contexts" From 79cca56c7d7325344092609b411724159106a0ff Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 22:23:48 +0100 Subject: [PATCH 23/32] fix: drop unused indices --- .../db/src/schema/deployment_topology.ts | 31 ++++++++++++------- web/internal/db/src/schema/sentinels.ts | 5 +-- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/web/internal/db/src/schema/deployment_topology.ts b/web/internal/db/src/schema/deployment_topology.ts index 98dd7a3343..4955380f01 100644 --- a/web/internal/db/src/schema/deployment_topology.ts +++ b/web/internal/db/src/schema/deployment_topology.ts @@ -15,7 +15,9 @@ import { workspaces } from "./workspaces"; export const deploymentTopology = mysqlTable( "deployment_topology", { - pk: bigint("pk", { mode: "number", unsigned: true }).autoincrement().primaryKey(), + pk: bigint("pk", { mode: "number", unsigned: true }) + .autoincrement() + .primaryKey(), workspaceId: varchar("workspace_id", { length: 64 }).notNull(), deploymentId: varchar("deployment_id", { length: 64 }).notNull(), @@ -39,21 +41,26 @@ export const deploymentTopology = mysqlTable( ...lifecycleDates, }, (table) => [ - uniqueIndex("unique_region_per_deployment").on(table.deploymentId, table.region), + uniqueIndex("unique_region_per_deployment").on( + table.deploymentId, + table.region, + ), uniqueIndex("unique_version_per_region").on(table.region, table.version), index("workspace_idx").on(table.workspaceId), - index("region_idx").on(table.region), index("status_idx").on(table.desiredStatus), ], ); -export const deploymentTopologyRelations = relations(deploymentTopology, ({ one }) => ({ - workspace: one(workspaces, { - fields: [deploymentTopology.workspaceId], - references: [workspaces.id], +export const deploymentTopologyRelations = relations( + deploymentTopology, + ({ one }) => ({ + workspace: one(workspaces, { + fields: [deploymentTopology.workspaceId], + references: [workspaces.id], + }), + delpoyment: one(deployments, { + fields: [deploymentTopology.deploymentId], + references: [deployments.id], + }), }), - delpoyment: one(deployments, { - fields: [deploymentTopology.deploymentId], - references: [deployments.id], - }), -})); +); diff --git a/web/internal/db/src/schema/sentinels.ts b/web/internal/db/src/schema/sentinels.ts index 917ea40c0e..dc034d2a4f 100644 --- a/web/internal/db/src/schema/sentinels.ts +++ b/web/internal/db/src/schema/sentinels.ts @@ -19,7 +19,9 @@ import { workspaces } from "./workspaces"; export const sentinels = mysqlTable( "sentinels", { - pk: bigint("pk", { mode: "number", unsigned: true }).autoincrement().primaryKey(), + pk: bigint("pk", { mode: "number", unsigned: true }) + .autoincrement() + .primaryKey(), id: varchar("id", { length: 64 }).notNull().unique(), workspaceId: varchar("workspace_id", { length: 255 }).notNull(), projectId: varchar("project_id", { length: 255 }).notNull(), @@ -53,7 +55,6 @@ export const sentinels = mysqlTable( }, (table) => [ index("idx_environment_id").on(table.environmentId), - index("region_version_idx").on(table.region, table.version), uniqueIndex("one_env_per_region").on(table.environmentId, table.region), uniqueIndex("unique_version_per_region").on(table.region, table.version), ], From 0c654a9bec994d165922f994217a5db2df4311e8 Mon Sep 17 00:00:00 2001 From: chronark Date: Thu, 22 Jan 2026 22:24:02 +0100 Subject: [PATCH 24/32] fix: mock client is now concurrency safe --- .../internal/testutil/mock_deployment_client.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/svc/api/internal/testutil/mock_deployment_client.go b/svc/api/internal/testutil/mock_deployment_client.go index 841ad99c36..8691f064a1 100644 --- a/svc/api/internal/testutil/mock_deployment_client.go +++ b/svc/api/internal/testutil/mock_deployment_client.go @@ -2,6 +2,7 @@ package testutil import ( "context" + "sync" "connectrpc.com/connect" ctrlv1 "github.com/unkeyed/unkey/gen/proto/ctrl/v1" @@ -15,7 +16,10 @@ var _ ctrlv1connect.DeploymentServiceClient = (*MockDeploymentClient)(nil) // Each method has an optional function field that tests can set to customize // behavior. If the function is nil, the method returns a sensible default. // The mock also records calls so tests can verify the correct requests were made. +// +// This mock is safe for concurrent use. All call recording is protected by a mutex. type MockDeploymentClient struct { + mu sync.Mutex CreateS3UploadURLFunc func(context.Context, *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) CreateDeploymentFunc func(context.Context, *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) GetDeploymentFunc func(context.Context, *connect.Request[ctrlv1.GetDeploymentRequest]) (*connect.Response[ctrlv1.GetDeploymentResponse], error) @@ -29,7 +33,9 @@ type MockDeploymentClient struct { } func (m *MockDeploymentClient) CreateS3UploadURL(ctx context.Context, req *connect.Request[ctrlv1.CreateS3UploadURLRequest]) (*connect.Response[ctrlv1.CreateS3UploadURLResponse], error) { + m.mu.Lock() m.CreateS3UploadURLCalls = append(m.CreateS3UploadURLCalls, req.Msg) + m.mu.Unlock() if m.CreateS3UploadURLFunc != nil { return m.CreateS3UploadURLFunc(ctx, req) } @@ -37,7 +43,9 @@ func (m *MockDeploymentClient) CreateS3UploadURL(ctx context.Context, req *conne } func (m *MockDeploymentClient) CreateDeployment(ctx context.Context, req *connect.Request[ctrlv1.CreateDeploymentRequest]) (*connect.Response[ctrlv1.CreateDeploymentResponse], error) { + m.mu.Lock() m.CreateDeploymentCalls = append(m.CreateDeploymentCalls, req.Msg) + m.mu.Unlock() if m.CreateDeploymentFunc != nil { return m.CreateDeploymentFunc(ctx, req) } @@ -45,7 +53,9 @@ func (m *MockDeploymentClient) CreateDeployment(ctx context.Context, req *connec } func (m *MockDeploymentClient) GetDeployment(ctx context.Context, req *connect.Request[ctrlv1.GetDeploymentRequest]) (*connect.Response[ctrlv1.GetDeploymentResponse], error) { + m.mu.Lock() m.GetDeploymentCalls = append(m.GetDeploymentCalls, req.Msg) + m.mu.Unlock() if m.GetDeploymentFunc != nil { return m.GetDeploymentFunc(ctx, req) } @@ -53,7 +63,9 @@ func (m *MockDeploymentClient) GetDeployment(ctx context.Context, req *connect.R } func (m *MockDeploymentClient) Rollback(ctx context.Context, req *connect.Request[ctrlv1.RollbackRequest]) (*connect.Response[ctrlv1.RollbackResponse], error) { + m.mu.Lock() m.RollbackCalls = append(m.RollbackCalls, req.Msg) + m.mu.Unlock() if m.RollbackFunc != nil { return m.RollbackFunc(ctx, req) } @@ -61,7 +73,9 @@ func (m *MockDeploymentClient) Rollback(ctx context.Context, req *connect.Reques } func (m *MockDeploymentClient) Promote(ctx context.Context, req *connect.Request[ctrlv1.PromoteRequest]) (*connect.Response[ctrlv1.PromoteResponse], error) { + m.mu.Lock() m.PromoteCalls = append(m.PromoteCalls, req.Msg) + m.mu.Unlock() if m.PromoteFunc != nil { return m.PromoteFunc(ctx, req) } From 1ebde0fad1cf0b825fd476af87d0ef9d8367e028 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 23 Jan 2026 06:56:41 +0000 Subject: [PATCH 25/32] [autofix.ci] apply automated fixes --- .../db/src/schema/deployment_topology.ts | 30 +++++++------------ web/internal/db/src/schema/sentinels.ts | 4 +-- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/web/internal/db/src/schema/deployment_topology.ts b/web/internal/db/src/schema/deployment_topology.ts index 4955380f01..37ab0382d5 100644 --- a/web/internal/db/src/schema/deployment_topology.ts +++ b/web/internal/db/src/schema/deployment_topology.ts @@ -15,9 +15,7 @@ import { workspaces } from "./workspaces"; export const deploymentTopology = mysqlTable( "deployment_topology", { - pk: bigint("pk", { mode: "number", unsigned: true }) - .autoincrement() - .primaryKey(), + pk: bigint("pk", { mode: "number", unsigned: true }).autoincrement().primaryKey(), workspaceId: varchar("workspace_id", { length: 64 }).notNull(), deploymentId: varchar("deployment_id", { length: 64 }).notNull(), @@ -41,26 +39,20 @@ export const deploymentTopology = mysqlTable( ...lifecycleDates, }, (table) => [ - uniqueIndex("unique_region_per_deployment").on( - table.deploymentId, - table.region, - ), + uniqueIndex("unique_region_per_deployment").on(table.deploymentId, table.region), uniqueIndex("unique_version_per_region").on(table.region, table.version), index("workspace_idx").on(table.workspaceId), index("status_idx").on(table.desiredStatus), ], ); -export const deploymentTopologyRelations = relations( - deploymentTopology, - ({ one }) => ({ - workspace: one(workspaces, { - fields: [deploymentTopology.workspaceId], - references: [workspaces.id], - }), - delpoyment: one(deployments, { - fields: [deploymentTopology.deploymentId], - references: [deployments.id], - }), +export const deploymentTopologyRelations = relations(deploymentTopology, ({ one }) => ({ + workspace: one(workspaces, { + fields: [deploymentTopology.workspaceId], + references: [workspaces.id], }), -); + delpoyment: one(deployments, { + fields: [deploymentTopology.deploymentId], + references: [deployments.id], + }), +})); diff --git a/web/internal/db/src/schema/sentinels.ts b/web/internal/db/src/schema/sentinels.ts index dc034d2a4f..28ad261aba 100644 --- a/web/internal/db/src/schema/sentinels.ts +++ b/web/internal/db/src/schema/sentinels.ts @@ -19,9 +19,7 @@ import { workspaces } from "./workspaces"; export const sentinels = mysqlTable( "sentinels", { - pk: bigint("pk", { mode: "number", unsigned: true }) - .autoincrement() - .primaryKey(), + pk: bigint("pk", { mode: "number", unsigned: true }).autoincrement().primaryKey(), id: varchar("id", { length: 64 }).notNull().unique(), workspaceId: varchar("workspace_id", { length: 255 }).notNull(), projectId: varchar("project_id", { length: 255 }).notNull(), From d7477a04c9583fa902f96d0a147fe7b0965ab3d2 Mon Sep 17 00:00:00 2001 From: Andreas Thomas Date: Fri, 23 Jan 2026 12:17:19 +0100 Subject: [PATCH 26/32] Update cmd/ctrl/worker.go Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- cmd/ctrl/worker.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/ctrl/worker.go b/cmd/ctrl/worker.go index ab7f8d06db..a5f853b342 100644 --- a/cmd/ctrl/worker.go +++ b/cmd/ctrl/worker.go @@ -39,7 +39,7 @@ var workerCmd = &cli.Command{ cli.String("auth-token", "Authentication token for worker API access.", cli.EnvVar("UNKEY_AUTH_TOKEN")), - cli.String("vault-url", "Url where vault is availab;e", + cli.String("vault-url", "Url where vault is available", cli.EnvVar("UNKEY_VAULT_URL"), cli.Default("https://vault.unkey.cloud")), cli.String("vault-token", "Authentication for vault", From 7a53c3e3f01359e7c48c016dc0c7148f58d22c57 Mon Sep 17 00:00:00 2001 From: Andreas Thomas Date: Fri, 23 Jan 2026 12:17:59 +0100 Subject: [PATCH 27/32] Update svc/krane/internal/deployment/consts.go Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- svc/krane/internal/deployment/consts.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/svc/krane/internal/deployment/consts.go b/svc/krane/internal/deployment/consts.go index d9908c4c6d..84b55fe7ed 100644 --- a/svc/krane/internal/deployment/consts.go +++ b/svc/krane/internal/deployment/consts.go @@ -12,7 +12,7 @@ const ( runtimeClassGvisor = "gvisor" // fieldManagerKrane identifies krane as the server-side apply field manager, - // enabling conflict-free concurrent updates from multiple sources. + // so field ownership/conflict detection is tracked per manager. fieldManagerKrane = "krane" // CustomerNodeClass is the Karpenter nodepool name for untrusted customer From 30470e25bcf6bd0c111e1550ab287776a48ce962 Mon Sep 17 00:00:00 2001 From: Andreas Thomas Date: Fri, 23 Jan 2026 12:18:13 +0100 Subject: [PATCH 28/32] Update svc/ctrl/worker/run.go Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- svc/ctrl/worker/run.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/svc/ctrl/worker/run.go b/svc/ctrl/worker/run.go index 0407975dfe..c03b809012 100644 --- a/svc/ctrl/worker/run.go +++ b/svc/ctrl/worker/run.go @@ -257,7 +257,8 @@ func Run(ctx context.Context, cfg Config) error { req.Header.Set("Content-Type", "application/json") - resp, doErr := http.DefaultClient.Do(req) + client := &http.Client{Timeout: 30 * time.Second} + resp, doErr := client.Do(req) if doErr != nil { return fmt.Errorf("failed to register with Restate: %w", doErr) } From fae4f2aa7419cfe3068b53bd62dda6159c3358ac Mon Sep 17 00:00:00 2001 From: Andreas Thomas Date: Fri, 23 Jan 2026 12:22:34 +0100 Subject: [PATCH 29/32] Update cmd/ctrl/api.go Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- cmd/ctrl/api.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/ctrl/api.go b/cmd/ctrl/api.go index d87a682c09..9a56057cfb 100644 --- a/cmd/ctrl/api.go +++ b/cmd/ctrl/api.go @@ -61,7 +61,7 @@ var apiCmd = &cli.Command{ cli.String("spiffe-socket-path", "Path to SPIFFE agent socket for mTLS authentication. Default: /var/lib/spire/agent/agent.sock", cli.Default("/var/lib/spire/agent/agent.sock"), cli.EnvVar("UNKEY_SPIFFE_SOCKET_PATH")), - cli.String("vault-url", "Url where vault is availab;e", + cli.String("vault-url", "URL where Vault is available", cli.EnvVar("UNKEY_VAULT_URL"), cli.Default("https://vault.unkey.cloud")), cli.String("vault-token", "Authentication for vault", From 51e2ac684d598a4f5b4c7450ceed3093928a93d5 Mon Sep 17 00:00:00 2001 From: chronark Date: Fri, 23 Jan 2026 12:28:10 +0100 Subject: [PATCH 30/32] fix: flags --- cmd/ctrl/api.go | 27 +++++++++------------------ cmd/ctrl/worker.go | 17 +++++++---------- 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/cmd/ctrl/api.go b/cmd/ctrl/api.go index d87a682c09..dc66a8c7a0 100644 --- a/cmd/ctrl/api.go +++ b/cmd/ctrl/api.go @@ -57,38 +57,29 @@ var apiCmd = &cli.Command{ // Control Plane Specific cli.String("auth-token", "Authentication token for control plane API access. Required for secure deployments.", + cli.Required(), cli.EnvVar("UNKEY_AUTH_TOKEN")), - cli.String("spiffe-socket-path", "Path to SPIFFE agent socket for mTLS authentication. Default: /var/lib/spire/agent/agent.sock", - cli.Default("/var/lib/spire/agent/agent.sock"), cli.EnvVar("UNKEY_SPIFFE_SOCKET_PATH")), - cli.String("vault-url", "Url where vault is availab;e", - cli.EnvVar("UNKEY_VAULT_URL"), cli.Default("https://vault.unkey.cloud")), + cli.String("vault-url", "URL where Vault is available", + cli.Required(), + cli.EnvVar("UNKEY_VAULT_URL"), + cli.Default("https://vault.unkey.cloud"), + ), cli.String("vault-token", "Authentication for vault", - cli.EnvVar("UNKEY_VAULT_TOKEN")), + cli.Required(), + cli.EnvVar("UNKEY_VAULT_TOKEN"), + ), cli.Bool("acme-enabled", "Enable Let's Encrypt for acme challenges", cli.EnvVar("UNKEY_ACME_ENABLED")), cli.String("acme-email-domain", "Domain for ACME registration emails (workspace_id@domain)", cli.Default("unkey.com"), cli.EnvVar("UNKEY_ACME_EMAIL_DOMAIN")), - // Route53 DNS provider - cli.Bool("acme-route53-enabled", "Enable Route53 for DNS-01 challenges", cli.EnvVar("UNKEY_ACME_ROUTE53_ENABLED")), - cli.String("acme-route53-access-key-id", "AWS access key ID for Route53", cli.EnvVar("UNKEY_ACME_ROUTE53_ACCESS_KEY_ID")), - cli.String("acme-route53-secret-access-key", "AWS secret access key for Route53", cli.EnvVar("UNKEY_ACME_ROUTE53_SECRET_ACCESS_KEY")), - cli.String("acme-route53-region", "AWS region for Route53", cli.Default("us-east-1"), cli.EnvVar("UNKEY_ACME_ROUTE53_REGION")), - cli.String("acme-route53-hosted-zone-id", "Route53 hosted zone ID (bypasses auto-discovery, required when wildcard CNAMEs exist)", cli.EnvVar("UNKEY_ACME_ROUTE53_HOSTED_ZONE_ID")), - cli.String("default-domain", "Default domain for auto-generated hostnames", cli.Default("unkey.app"), cli.EnvVar("UNKEY_DEFAULT_DOMAIN")), cli.String("regional-apex-domain", "Apex domain for cross-region frontline communication (e.g., unkey.cloud). Certs are provisioned for *.{region}.{regional-apex-domain}", cli.EnvVar("UNKEY_REGIONAL_APEX_DOMAIN")), // Restate Configuration cli.String("restate-url", "URL of the Restate ingress endpoint for invoking workflows. Example: http://restate:8080", cli.Default("http://restate:8080"), cli.EnvVar("UNKEY_RESTATE_INGRESS_URL")), - cli.String("restate-admin-url", "URL of the Restate admin endpoint for service registration. Example: http://restate:9070", - cli.Default("http://restate:9070"), cli.EnvVar("UNKEY_RESTATE_ADMIN_URL")), - cli.Int("restate-http-port", "Port where we listen for Restate HTTP requests. Example: 9080", - cli.Default(9080), cli.EnvVar("UNKEY_RESTATE_HTTP_PORT")), - cli.String("restate-register-as", "URL of this service for self-registration with Restate. Example: http://ctrl:9080", - cli.EnvVar("UNKEY_RESTATE_REGISTER_AS")), cli.String("restate-api-key", "API key for Restate ingress requests", cli.EnvVar("UNKEY_RESTATE_API_KEY")), cli.String("clickhouse-url", "ClickHouse connection string for analytics. Recommended for production. Example: clickhouse://user:pass@host:9000/unkey", diff --git a/cmd/ctrl/worker.go b/cmd/ctrl/worker.go index a5f853b342..9dcd5f1643 100644 --- a/cmd/ctrl/worker.go +++ b/cmd/ctrl/worker.go @@ -35,19 +35,18 @@ var workerCmd = &cli.Command{ cli.String("database-primary", "MySQL connection string for primary database. Required for all deployments. Example: user:pass@host:3306/unkey?parseTime=true", cli.Required(), cli.EnvVar("UNKEY_DATABASE_PRIMARY")), - // Authentication - cli.String("auth-token", "Authentication token for worker API access.", - cli.EnvVar("UNKEY_AUTH_TOKEN")), - cli.String("vault-url", "Url where vault is available", - cli.EnvVar("UNKEY_VAULT_URL"), cli.Default("https://vault.unkey.cloud")), + cli.Required(), + cli.EnvVar("UNKEY_VAULT_URL"), + cli.Default("https://vault.unkey.cloud"), + ), cli.String("vault-token", "Authentication for vault", - cli.EnvVar("UNKEY_VAULT_TOKEN")), + cli.Required(), + cli.EnvVar("UNKEY_VAULT_TOKEN"), + ), // Build Configuration - cli.String("build-backend", "Build backend to use: 'docker' for local, 'depot' for production. Default: depot", - cli.Default("depot"), cli.EnvVar("UNKEY_BUILD_BACKEND")), cli.String("build-s3-url", "S3 Compatible Endpoint URL for build contexts", cli.Required(), cli.EnvVar("UNKEY_BUILD_S3_URL")), cli.String("build-s3-bucket", "S3 bucket name for build contexts", @@ -91,8 +90,6 @@ var workerCmd = &cli.Command{ cli.String("default-domain", "Default domain for auto-generated hostnames", cli.Default("unkey.app"), cli.EnvVar("UNKEY_DEFAULT_DOMAIN")), // Restate Configuration - cli.String("restate-url", "URL of the Restate ingress endpoint for invoking workflows. Example: http://restate:8080", - cli.Default("http://restate:8080"), cli.EnvVar("UNKEY_RESTATE_INGRESS_URL")), cli.String("restate-admin-url", "URL of the Restate admin endpoint for service registration. Example: http://restate:9070", cli.Default("http://restate:9070"), cli.EnvVar("UNKEY_RESTATE_ADMIN_URL")), cli.Int("restate-http-port", "Port where we listen for Restate HTTP requests. Example: 9080", From 9d46a620343361d4e8cfe3331ebb0815a133f8ac Mon Sep 17 00:00:00 2001 From: chronark Date: Fri, 23 Jan 2026 14:05:59 +0100 Subject: [PATCH 31/32] fix: comments --- cmd/ctrl/api.go | 1 - .../routes/v2_deploy_get_deployment/handler.go | 18 ++++++++++++++++-- svc/ctrl/api/config.go | 4 ---- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/cmd/ctrl/api.go b/cmd/ctrl/api.go index dc66a8c7a0..7950540867 100644 --- a/cmd/ctrl/api.go +++ b/cmd/ctrl/api.go @@ -128,7 +128,6 @@ func apiAction(ctx context.Context, cmd *cli.Command) error { config := ctrlapi.Config{ // Basic configuration - Image: cmd.String("image"), HttpPort: cmd.Int("http-port"), PrometheusPort: cmd.Int("prometheus-port"), Region: cmd.String("region"), diff --git a/svc/api/routes/v2_deploy_get_deployment/handler.go b/svc/api/routes/v2_deploy_get_deployment/handler.go index 66cfc656d5..c0ab2773f2 100644 --- a/svc/api/routes/v2_deploy_get_deployment/handler.go +++ b/svc/api/routes/v2_deploy_get_deployment/handler.go @@ -3,7 +3,6 @@ package handler import ( "context" "net/http" - "strings" "github.com/unkeyed/unkey/internal/services/keys" "github.com/unkeyed/unkey/pkg/codes" @@ -116,5 +115,20 @@ func (h *Handler) Handle(ctx context.Context, s *zen.Session) error { } func dbStatusToOpenAPI(status db.DeploymentsStatus) openapi.V2DeployGetDeploymentResponseDataStatus { - return openapi.V2DeployGetDeploymentResponseDataStatus(strings.ToUpper(string(status))) + switch status { + case db.DeploymentsStatusPending: + return openapi.PENDING + case db.DeploymentsStatusBuilding: + return openapi.BUILDING + case db.DeploymentsStatusDeploying: + return openapi.DEPLOYING + case db.DeploymentsStatusNetwork: + return openapi.NETWORK + case db.DeploymentsStatusReady: + return openapi.READY + case db.DeploymentsStatusFailed: + return openapi.FAILED + default: + return openapi.UNSPECIFIED + } } diff --git a/svc/ctrl/api/config.go b/svc/ctrl/api/config.go index 136a31dd7a..173b954662 100644 --- a/svc/ctrl/api/config.go +++ b/svc/ctrl/api/config.go @@ -180,10 +180,6 @@ type Config struct { // Used for logging, tracing, and cluster coordination. InstanceID string - // Image specifies the container image identifier including repository and tag. - // Used for control plane deployment and sentinel image configuration. - Image string - // Region is the geographic region where this control plane instance runs. // Used for logging, tracing, and region-aware routing decisions. Region string From 37301051deac682724d8edff58208c67179fc317 Mon Sep 17 00:00:00 2001 From: chronark Date: Fri, 23 Jan 2026 15:21:01 +0100 Subject: [PATCH 32/32] ci: nuke ci for the old api --- .github/workflows/deploy.yaml | 44 --------------- .github/workflows/job_detect_changes.yaml | 13 ----- .github/workflows/job_test_api_canary.yaml | 49 ----------------- .github/workflows/job_test_api_local.yaml | 61 --------------------- .github/workflows/job_test_api_staging.yaml | 53 ------------------ .github/workflows/pr.yaml | 5 -- 6 files changed, 225 deletions(-) delete mode 100644 .github/workflows/job_test_api_canary.yaml delete mode 100644 .github/workflows/job_test_api_local.yaml delete mode 100644 .github/workflows/job_test_api_staging.yaml diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index db6b0d06ee..6b8afb3343 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -11,50 +11,6 @@ concurrency: jobs: detect_changes: uses: ./.github/workflows/job_detect_changes.yaml - api_local_test: - name: Test API - uses: ./.github/workflows/job_test_api_local.yaml - api_preview_deployment: - needs: - - api_local_test - uses: ./.github/workflows/job_deploy_api_staging.yaml - secrets: - CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} - api_preview_test: - needs: - - api_preview_deployment - uses: ./.github/workflows/job_test_api_staging.yaml - with: - UNKEY_BASE_URL: https://preview-api.unkey.dev - secrets: - DATABASE_HOST: ${{ secrets.DATABASE_HOST }} - DATABASE_USERNAME: ${{ secrets.DATABASE_USERNAME }} - DATABASE_PASSWORD: ${{ secrets.DATABASE_PASSWORD }} - CLICKHOUSE_URL: ${{ secrets.CLICKHOUSE_URL }} - api_canary_deployment: - needs: - - api_local_test - - api_preview_test - uses: ./.github/workflows/job_deploy_api_canary.yaml - secrets: - CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} - api_canary_test: - needs: - - api_canary_deployment - uses: ./.github/workflows/job_test_api_canary.yaml - with: - UNKEY_BASE_URL: https://canary.unkey.dev - secrets: - DATABASE_HOST: ${{ secrets.DATABASE_HOST }} - DATABASE_USERNAME: ${{ secrets.DATABASE_USERNAME }} - DATABASE_PASSWORD: ${{ secrets.DATABASE_PASSWORD }} - CLICKHOUSE_URL: ${{ secrets.CLICKHOUSE_URL }} - api_production_deployment: - needs: - - api_canary_test - uses: ./.github/workflows/job_deploy_api_production.yaml - secrets: - CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} mintlify_deployment: runs-on: depot-ubuntu-24.04-4 needs: diff --git a/.github/workflows/job_detect_changes.yaml b/.github/workflows/job_detect_changes.yaml index 0aeca2ada3..c10f00203a 100644 --- a/.github/workflows/job_detect_changes.yaml +++ b/.github/workflows/job_detect_changes.yaml @@ -2,9 +2,6 @@ name: Detect Changes on: workflow_call: outputs: - api: - description: "Whether API has changed" - value: ${{ jobs.build.outputs.api }} dashboard: description: "Whether Dashboard has changed" value: ${{ jobs.build.outputs.dashboard }} @@ -52,16 +49,6 @@ jobs: with: filters: | # API application and its direct dependencies - api: - - 'web/apps/api/**' - - 'web/internal/db/**' - - 'web/internal/encoding/**' - - 'web/internal/encryption/**' - - 'web/internal/hash/**' - - 'web/internal/id/**' - - 'web/internal/keys/**' - - 'web/internal/validation/**' - - 'web/internal/vault/**' # Dashboard application and its dependencies dashboard: diff --git a/.github/workflows/job_test_api_canary.yaml b/.github/workflows/job_test_api_canary.yaml deleted file mode 100644 index 5bf417237e..0000000000 --- a/.github/workflows/job_test_api_canary.yaml +++ /dev/null @@ -1,49 +0,0 @@ -name: Test API Canary -permissions: - contents: read -on: - workflow_call: - inputs: - UNKEY_BASE_URL: - type: string - required: true - secrets: - DATABASE_HOST: - required: true - DATABASE_USERNAME: - required: true - DATABASE_PASSWORD: - required: true - CLICKHOUSE_URL: - required: true -jobs: - test: - environment: Canary - name: API Test Canary - timeout-minutes: 60 - runs-on: depot-ubuntu-24.04-4 - strategy: - fail-fast: false - matrix: - shard: ["1/9", "2/9", "3/9", "4/9", "5/9", "6/9", "7/9", "8/9", "9/9"] - steps: - - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4 - - name: Setup Node - uses: ./.github/actions/setup-node - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - - name: Build - run: pnpm turbo run build --filter=./apps/api - env: - CI: 1 - working-directory: web - - name: Test - run: pnpm vitest run -c vitest.integration.ts --bail=1 --shard=${{ matrix.shard }} - working-directory: web/apps/api - env: - CI: 1 - UNKEY_BASE_URL: ${{ inputs.UNKEY_BASE_URL }} - DATABASE_HOST: ${{ secrets.DATABASE_HOST }} - DATABASE_USERNAME: ${{ secrets.DATABASE_USERNAME }} - DATABASE_PASSWORD: ${{ secrets.DATABASE_PASSWORD }} - CLICKHOUSE_URL: ${{ secrets.CLICKHOUSE_URL }} diff --git a/.github/workflows/job_test_api_local.yaml b/.github/workflows/job_test_api_local.yaml deleted file mode 100644 index 2ba2c34a89..0000000000 --- a/.github/workflows/job_test_api_local.yaml +++ /dev/null @@ -1,61 +0,0 @@ -name: Test API Local -on: - workflow_call: -permissions: - contents: read -jobs: - test: - name: API Test Local - timeout-minutes: 90 - runs-on: depot-ubuntu-24.04-4 - steps: - - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1 - with: - version: v0.29.1 - install: true - - name: Create dashboard env file for Docker Compose - run: | - mkdir -p ./web/apps/dashboard - touch ./web/apps/dashboard/.env - - name: Run containers - run: docker compose -f ./dev/docker-compose.yaml up mysql redis clickhouse planetscale agent s3 apiv2 api -d --wait - env: - DOCKER_BUILDKIT: 1 - COMPOSE_DOCKER_CLI_BUILD: 1 - timeout-minutes: 8 - - name: Setup Node - uses: ./.github/actions/setup-node - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - - name: Setup Go - uses: ./.github/actions/setup-go - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - - name: Build - run: pnpm turbo run build --filter=./apps/api... - env: - CI: 1 - working-directory: web - - name: Test - run: pnpm vitest run -c vitest.integration.ts --bail=1 - working-directory: web/apps/api - env: - UNKEY_BASE_URL: http://localhost:8787 - DATABASE_HOST: localhost:3900 - DATABASE_USERNAME: unkey - DATABASE_PASSWORD: password - CLICKHOUSE_URL: http://default:password@localhost:8123 - TEST_LOCAL: true - CI: 1 - - name: Dump logs - if: always() - run: docker compose -f ./dev/docker-compose.yaml logs --no-color > ./docker.logs - - name: Upload logs - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - if: always() - with: - name: ${{github.run_id}}-${{github.run_number}}-api.logs - path: docker.logs - retention-days: 7 diff --git a/.github/workflows/job_test_api_staging.yaml b/.github/workflows/job_test_api_staging.yaml deleted file mode 100644 index ea7d78fe95..0000000000 --- a/.github/workflows/job_test_api_staging.yaml +++ /dev/null @@ -1,53 +0,0 @@ -permissions: - contents: read -name: API Test Staging -on: - workflow_call: - inputs: - UNKEY_BASE_URL: - type: string - required: true - secrets: - DATABASE_HOST: - required: true - DATABASE_USERNAME: - required: true - DATABASE_PASSWORD: - required: true - CLICKHOUSE_URL: - required: true -jobs: - test: - name: API Test Staging - environment: Preview - timeout-minutes: 60 - runs-on: depot-ubuntu-24.04-4 - strategy: - fail-fast: false - matrix: - shard: ["1/9", "2/9", "3/9", "4/9", "5/9", "6/9", "7/9", "8/9", "9/9"] - steps: - - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4 - # Our staging server goes to sleep to save money, this will wake it up - # before running our tests - - name: Wake ClickHouse - run: curl -X GET ${{ secrets.CLICKHOUSE_URL }}/ping - - name: Setup Node - uses: ./.github/actions/setup-node - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - - name: Build - run: pnpm turbo run build --filter=./apps/api - env: - CI: 1 - working-directory: web - - name: Test - run: pnpm vitest run -c vitest.integration.ts --bail=1 --shard=${{ matrix.shard }} - working-directory: web/apps/api - env: - CI: 1 - UNKEY_BASE_URL: ${{ inputs.UNKEY_BASE_URL }} - DATABASE_HOST: ${{ secrets.DATABASE_HOST }} - DATABASE_USERNAME: ${{ secrets.DATABASE_USERNAME }} - DATABASE_PASSWORD: ${{ secrets.DATABASE_PASSWORD }} - CLICKHOUSE_URL: ${{ secrets.CLICKHOUSE_URL }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c493453327..6b362573f8 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,11 +20,6 @@ jobs: if: ((github.event_name == 'pull_request' && github.event.pull_request.draft == false) || github.event_name != 'pull_request') && needs.detect_changes.result == 'success' && (needs.detect_changes.outputs.packages == 'true' || needs.detect_changes.outputs.dependencies == 'true') needs: [detect_changes] uses: ./.github/workflows/job_test_web.yaml - test_api: - name: Test API - if: ((github.event_name == 'pull_request' && github.event.pull_request.draft == false) || github.event_name != 'pull_request') && needs.detect_changes.result == 'success' && (needs.detect_changes.outputs.api == 'true' || needs.detect_changes.outputs.packages == 'true') - needs: [detect_changes] - uses: ./.github/workflows/job_test_api_local.yaml bazel: name: Bazel uses: ./.github/workflows/job_bazel.yaml