Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CromwellOnAzure On AKS #354

Merged
merged 127 commits into from
Sep 16, 2022
Merged
Show file tree
Hide file tree
Changes from 111 commits
Commits
Show all changes
127 commits
Select commit Hold shift + click to select a range
bec8ebf
WIP
jsaun Jan 26, 2022
bd37f9a
Working blob storage driver
jsaun Feb 22, 2022
58886d1
Remove redudancies, update proj file
jsaun Feb 23, 2022
cb30e84
Fix blobfuse
jsaun Apr 23, 2022
ef68490
Clean up
jsaun Apr 23, 2022
144b6f1
Update reclaim policy to retain
jsaun Apr 23, 2022
852b41d
Use CoA Vnet for AKS
jsaun Apr 26, 2022
bc26b37
Support dynamic blob container mounting
jsaun Apr 26, 2022
97e9a9e
Typo and cleanup
jsaun Apr 26, 2022
54e6452
Fix LogAnalytics
jsaun Apr 28, 2022
65c4b13
Use CoA ManagedId for AKS
jsaun Apr 28, 2022
f5575a8
Make namespace configurable
jsaun May 5, 2022
fd6be7d
Create namespace if it doesn't exist
jsaun May 6, 2022
d1c8c82
Update Deployer.cs
jsaun May 6, 2022
1cbea5f
Refactoring to enable upgrade of AKS deployments
jsaun May 7, 2022
fd20144
Require storage account if upgrading aks deployment
jsaun May 7, 2022
44b2c7c
Switch namespace
jsaun May 10, 2022
d6a8f94
Allow updating TES deployment to reload containers
jsaun May 10, 2022
25e041a
Update args validation for AKS
jsaun May 10, 2022
3ec9629
Minor cleanup
jsaun May 13, 2022
e8bc4d0
Persist env- settings to storage account
jsaun May 13, 2022
a214e96
Allow updating deployed image versions
jsaun May 17, 2022
1723b9d
Move kubernetes functions to separate class
jsaun May 18, 2022
3f27bd7
Remove unused method
jsaun May 18, 2022
3658d7c
Mount containers for cromwell and refactorings
jsaun May 21, 2022
d2215be
Merge branch 'main' into jsaun/coa-on-aks
jsaun May 21, 2022
f315b50
Add compatibility for for AKS with managed mysql
jsaun May 22, 2022
2965318
Fix whitespace?
jsaun May 22, 2022
af35194
Add private networking flag for AKS
jsaun May 23, 2022
cc3229a
PR clean up
jsaun May 23, 2022
15d6d86
Clean up
jsaun May 23, 2022
a5819c9
Style
jsaun May 24, 2022
f37d0bb
Missed file
jsaun May 24, 2022
380a055
Line endings
jsaun May 24, 2022
22f1b66
Make pool size configurable
jsaun May 24, 2022
0d53a6c
Add mysql data disk without caching
jsaun May 24, 2022
dfe3c49
Separate settings into two files
jsaun May 24, 2022
fd81526
Unlock change log on update not creation.
jsaun May 24, 2022
22eed0c
Update docs
jsaun Jun 15, 2022
7450030
Sort imports
jsaun Jun 16, 2022
f810167
PR fixes
jsaun Jun 21, 2022
1ffe375
Skip install if CSI driver already exists
jsaun Jun 21, 2022
3a11f7d
Fix missing import
jsaun Jun 21, 2022
924ae0c
Fix silent print handler
jsaun Jun 21, 2022
419685c
Merge branch 'main' into jsaun/coa-on-aks
jsaun Jun 22, 2022
ff384da
Fix merge issue
jsaun Jun 24, 2022
05ecf8b
Bind AKS and postgres features
jsaun Jun 24, 2022
1d7b20f
Fix postgres setup from K8
jsaun Jun 28, 2022
96f99cd
Initial helm chart
jsaun Jul 15, 2022
ed9899e
Working helm with dynamic pvcs
jsaun Jul 22, 2022
0da8c41
Remove kube api code, and cleanup
jsaun Jul 22, 2022
9dce5f9
Add missing external storage containers env var
jsaun Jul 26, 2022
7f95f86
Add flag for debug logging
jsaun Jul 27, 2022
8168f11
Move helm mysql deployment file
jsaun Jul 28, 2022
64d882b
Print postgresql command for manual helm deployment
jsaun Jul 29, 2022
05613b3
Cleanup
jsaun Jul 29, 2022
8f647b5
Supply namespace to Helm
jsaun Aug 1, 2022
ba17d47
Update helm values for manual deployment
jsaun Aug 1, 2022
cd28eed
Remove Helm MySQL config
jsaun Aug 1, 2022
fc53019
Pass custom images to Helm
jsaun Aug 2, 2022
f938c6d
Address PR comments
jsaun Aug 2, 2022
9093eea
Remove unused imports
jsaun Aug 2, 2022
6f091dd
Add postgres / aks feature coupling again
jsaun Aug 2, 2022
ee0b90b
Clean up and fix
jsaun Aug 3, 2022
7b0930c
Fix exception handling
jsaun Aug 3, 2022
674090b
Make batch/aks account query safe
jsaun Aug 3, 2022
723055e
Remove unneeded print
jsaun Aug 3, 2022
bbaff87
Make clustername optional for manual helm deployment
jsaun Aug 3, 2022
efc0064
Rename mysql subnet to sql subnet
jsaun Aug 4, 2022
e464584
Pass remaining parameters
jsaun Aug 4, 2022
0527566
Allow passing existing postgresql server
jsaun Aug 5, 2022
186de62
Use storage account key based access rather than managed identity
jsaun Aug 13, 2022
a9c3464
Update blob-fuse to Retain instead of Delete
jsaun Aug 15, 2022
778f963
Add option to use postgresql single server
jsaun Aug 18, 2022
b391fb3
Use aadpodidentity plugin
jsaun Aug 18, 2022
4387e56
Fix Identity plugin
jsaun Aug 25, 2022
231d21d
Move storage key to azure keyvault
jsaun Aug 25, 2022
e829d0d
Add missing params
jsaun Aug 25, 2022
d3dccd2
Name helm chart
jsaun Aug 29, 2022
4f2ceed
Update saved settings
jsaun Sep 1, 2022
c09a9ce
Make postgresql flag accessible
jsaun Sep 1, 2022
964012a
Add AKS documentation
jsaun Sep 1, 2022
c8d9e58
Default to null to fix validation
jsaun Sep 1, 2022
f140a7c
Fix update
jsaun Sep 1, 2022
158dc6e
Move billing reader assignment before compute
jsaun Sep 1, 2022
42136fd
Remove blob-csi / aad-plugin install from update path.
jsaun Sep 1, 2022
a28908c
Fix volume handle for updates and include outputs by default.
jsaun Sep 1, 2022
c4d285b
Merge branch 'develop' into jsaun/coa-on-aks
jsaun Sep 2, 2022
0712a7f
Fix merge issues
jsaun Sep 2, 2022
12fd45c
Suppress credscan
jsaun Sep 2, 2022
a20cf08
Only provision key vault for AKS deployments
jsaun Sep 2, 2022
1252b3d
PR clean up
jsaun Sep 2, 2022
e6fe8d7
Add new configuration to list
jsaun Sep 2, 2022
c108810
Remove original k8s files
jsaun Sep 2, 2022
25a3f73
Update proj file
jsaun Sep 2, 2022
10cb598
Revert launchSettings
jsaun Sep 2, 2022
5d735e3
Fix change
jsaun Sep 2, 2022
8346b17
Fix proj setting
jsaun Sep 2, 2022
7afe8b5
Cleanup
jsaun Sep 2, 2022
c8eccdb
Whitespace
jsaun Sep 2, 2022
b550cdb
Merge branch 'jsaun/coa-on-aks' of github.com:microsoft/CromwellOnAzu…
jsaun Sep 2, 2022
b5a55db
Missed method signature
jsaun Sep 2, 2022
d0a7641
More whitespace
jsaun Sep 2, 2022
728e9b3
Fix merge error
jsaun Sep 2, 2022
5a1aee2
Style
jsaun Sep 2, 2022
f8ddce9
Merge branch 'develop' into jsaun/coa-on-aks
jsaun Sep 2, 2022
dab7fb8
Typo
jsaun Sep 9, 2022
710dc3e
Add retries to command execution instead of waiting for 40 secs.
jsaun Sep 9, 2022
8874a67
Fix line endings
jsaun Sep 9, 2022
2c0ff98
Merge branch 'jsaun/coa-on-aks' of github.com:microsoft/CromwellOnAzu…
jsaun Sep 9, 2022
734d5c0
Address PR comments
jsaun Sep 10, 2022
2b703fa
Add UserObjectId to the documentation
jsaun Sep 12, 2022
7eb6b82
Use Polly for retry loops
jsaun Sep 12, 2022
2782ee4
Rename helmexe to helmbinary
jsaun Sep 12, 2022
2ddead5
Address PR feedback
jsaun Sep 13, 2022
cc24818
Update kubernetes client
jsaun Sep 13, 2022
aa36b21
Fix missed paths
jsaun Sep 13, 2022
ad46d1f
Add trailing separator to externalSasContainers list
jsaun Sep 13, 2022
e812e6c
Log exceptions
jsaun Sep 13, 2022
579da10
Duplicate line with LF to register with Git
jsaun Sep 13, 2022
16a4ac8
Remove duplicate
jsaun Sep 13, 2022
7072a93
Merge branch 'develop' into jsaun/coa-on-aks
jsaun Sep 14, 2022
134a62e
Merge remote-tracking branch 'origin/develop' into jsaun/coa-on-aks
jsaun Sep 14, 2022
f1baa5c
Merge branch 'jsaun/coa-on-aks' of github.com:microsoft/CromwellOnAzu…
jsaun Sep 14, 2022
8f33470
Remove references to getting the objectId from rbac graph client for …
jsaun Sep 14, 2022
85f713a
Fix existing key vault functionality
jsaun Sep 14, 2022
60ba008
CoA on AKS - async updates (#479)
MattMcL4475 Sep 15, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions docs/coa-aks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
## Cromwell on AKS Instructions and Troubleshooting

### Deployment Dependencies
The CoA deployer requires the user to have Helm 3 installed locally to deploy with AKS. Use the flag "--HelmExePath HELM_PATH" to give the deployer the path to the helm executable. If no flag is passed, the deployer will assume Helm is installed with Chocolatey at "C:\\ProgramData\\chocolatey\\bin\\helm.exe".
jsaun marked this conversation as resolved.
Show resolved Hide resolved

### Deployment Models

- ### CoA provisioned AKS account
Add the flag "--UseAks true" and the deployer will provision an AKS account and run its containers in AKS rather than provisioning a VM.
- ### Shared AKS account with CoA namespace
Add the flags "--UseAks true --AksClusterName {existingClusterName}", where the user has admin access to the existing AKS account. The deployer will deploy blob-csi-driver and aad-pod-identity to the kube-system namespace, and then deploy CoA to the namespace "coa". Add the flag "--AksCoANamespace {namespace}" to override the default namespace.
jsaun marked this conversation as resolved.
Show resolved Hide resolved
- ### Shared AKS account without developer access.
If the user is required to use an AKS account, but does not have the required access, the deployer will produce a Helm chart that can then be installed by an admin or existing CI/CD pipeline. Add the flags "--UseAks true --ManualHelmDeployment". The deployer will print a postgresql command. This command would typically be run on the kubernetes node to set up the cromwell user; however, the user will need to run it manually since the deployer won't directly access the AKS account.

- Run the deployer with supplied flags.
- Deployer will create initial resources and pause once it's time to deploy the Helm chart.
- Ensure the blob-csi-driver and aad-pod-identity are installed.
- Install the CoA Helm chart.
- Run the postgresql command to create the cromwell user.
- Press enter in the deployer console to finish the deployment and run a test workflow.

### Dependent Kubernetes Packages
jsaun marked this conversation as resolved.
Show resolved Hide resolved
These packages will be deployed into the kube-system namespace.
- ### Blob CSI Driver - https://github.com/kubernetes-sigs/blob-csi-driver/
This is used to mount the storage account to the containers.
- ### AAD Pod Identity - https://github.com/Azure/aad-pod-identity
This is used to assign managed identities to the containers.

### External storage accounts
Typically in CromwellOnAzure you can add storage accounts with input data to the containers-to-mount file. For AKS, you need to modify the values.yaml file of the helm chart and redeploy. The values-template.yaml file has examples of externalContainers and externalSasContainers.

### Logs and troubleshooting
For troubleshooting any of the CoA services, you can log in directly to the pods or get logs using the kubectl program. The deployer will write a kubeconfig to the working directory; either copy that file to ~/.kube/config or reference it manually for each command with --kubeconfig {coa-directory}/kubeconfig.txt. You can also run the command `az aks get-credentials --resource-group {coa-resource-group} --name {aks-account} --subscription {subscription-id} --file kubeconfig.txt` to get the file.
jsaun marked this conversation as resolved.
Show resolved Hide resolved

1. Get the exact name of the pods.

`kubectl get pods --namespace coa`
2. Get logs for the tes pod.

`kubectl logs tes-68d6dc4789-mvvwj --namespace coa`
3. SSH to pod to troubleshoot storage or network connectivity.

`kubectl exec --namespace coa --stdin --tty tes-68d6dc4789-mvvwj -- /bin/bash`

### Updating settings and environment variables.

For VM based CoA deployments, you can ssh into the VM host, update the environment files, and restart the VM.
To update settings for AKS, you will need to redeploy the helm chart. If you still have the chart locally,
you can update the values.yaml file and redeploy with:

`helm upgrade cromwellonazure ./scripts/helm --kubeconfig kubeconfig.txt --namespace coa`

If the original chart is lost, you can regenerate it by running the deployer again with the "--update true" and "--AksClusterName {existingClusterName}" flags.
25 changes: 23 additions & 2 deletions docs/troubleshooting-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ Before deploying, you can choose to customize some input parameters to use exist
.\deploy-cromwell-on-azure.exe --SubscriptionId <Your subscription ID> --RegionName <Your region> --MainIdentifierPrefix <Your string> --VmSize "Standard_D2_v2"
```

Here is the summary of all configuration parameters:
Here is the summary of common configuration parameters:

Configuration parameter | Has default | Validated | Used by update | Comment
-- | -- | -- | -- | --
Expand Down Expand Up @@ -239,7 +239,28 @@ bool Update = false; | Y | Y | Y | Set to true if you want to update your
bool PrivateNetworking = false; | Y | Y | N | Available starting version 2.2. Set to true to create the host VM without public IP address. If set, VnetResourceGroupName, VnetName and SubnetName must be provided (and already exist). The deployment must be initiated from a machine that has access to that subnet.
bool KeepSshPortOpen = false; | Y | Y | Y | Available starting version 3.0. Set to true if you need to keep the SSH port accessible on the host VM while deployer is not running (not recommended).
string LogAnalyticsArmId | Y | N | N | Arm resource id for an existing Log Analytics workspace, workspace is used for App Insights - Not required, a workspace will be generated automatically if not provided.
bool ProvisionPostgreSqlOnAzure = false; | Y | N | N | COMING SOON in version 4.0. Triggers whether to use Docker MySQL or Azure PostgreSQL when provisioning the database.
bool ProvisionPostgreSqlOnAzure = false; | Y | N | N | Triggers whether to use Docker MySQL or Azure PostgreSQL when provisioning the database. Required for AKS deployment.
bool UseAks = false; | Y | N | N | Uses Azure Kubernetes Service rather than a VM to run the CoA system services Cromwell/TES/TriggerService.
string AksClusterName | Y | Y | N | Cluster name of existing Azure Kubernetes Service cluster to use rather than provisioning a new one.
string AksCoANamespace = "coa" | Y | N | N | Kubernetes namespace.
bool ManualHelmDeployment | Y | N | N | For use if user doesn't have direct access to existing AKS cluster.
string HelmExePath = "C:\\ProgramData\\chocolatey\\bin\\helm.exe" | Y | N | N | Path to helm executable for AKS deployment.
int AksPoolSize = 2 | Y | N | N | Size of AKS node pool, two nodes are recommended for reliability, however a minimum of one can be used to save COGS.
bool DebugLogging = false | Y | N | N | Prints all log information.
string PostgreSqlServerName | Y | Y | N | Name of existing postgresql server.
bool UsePostgreSqlSingleServer = false | Y | N | N | Use PostgreSQL Single Server rather than Flexible Server; only recommended if you need to use private endpoints.
string KeyVaultName | Y | Y | N | Name of an existing key vault

The following are more advanced configuration parameters:

Configuration parameter | Has default | Validated | Used by update | Comment
-- | -- | -- | -- | --
string VnetAddressSpace = "10.1.0.0/16" | Y | N | N | Total address space for CoA vnet.
string VmSubnetAddressSpace = "10.1.0.0/24" | Y | N | N | Address space for compute, VM or AKS.
string PostgreSqlSubnetAddressSpace = "10.1.1.0/24" | Y | N | N | Address space for database.
string KubernetesServiceCidr = "10.1.4.0/22" | Y | N | N | Address space for kubernetes system services, must not overlap with any subnets.
string KubernetesDnsServiceIP = "10.1.4.10" | Y | N | N | Kubernetes DNS service IP Address.
string KubernetesDockerBridgeCidr = "172.17.0.1/16" | Y | N | N | Kubernetes Docker bridge CIDR.

### Use a specific Cromwell version
#### Before deploying Cromwell on Azure
Expand Down
30 changes: 24 additions & 6 deletions src/deploy-cromwell-on-azure/Configuration.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ namespace CromwellOnAzureDeployer
{
public class Configuration : UserAccessibleConfiguration
{
public string PostgreSqlServerName { get; set; }
public string PostgreSqlCromwellDatabaseName { get; set; } = "cromwell_db";
public string PostgreSqlTesDatabaseName { get; set; } = "tes_db";
public string PostgreSqlAdministratorLogin { get; set; } = "coa_admin";
Expand All @@ -23,11 +22,11 @@ public class Configuration : UserAccessibleConfiguration
public string PostgreSqlSkuName { get; set; } = "Standard_B2s";
public string PostgreSqlTier { get; set; } = "Burstable";
public string DefaultVmSubnetName { get; set; } = "vmsubnet";
public string DefaultPostgreSqlSubnetName { get; set; } = "mysqlsubnet";
public string PostgreSqlVersion { get; set; } = "11";
public string DefaultPostgreSqlSubnetName { get; set; } = "sqlsubnet";
public int PostgreSqlStorageSize { get; set; } = 128; // GiB
public bool? ProvisionPostgreSqlOnAzure { get; set; } // Will be accessible in 4.0 release
}

public abstract class UserAccessibleConfiguration
{
public string SubscriptionId { get; set; }
Expand All @@ -37,9 +36,15 @@ public abstract class UserAccessibleConfiguration
public string VmOsName { get; set; } = "UbuntuServer";
public string VmOsVersion { get; set; } = "18.04-LTS";
public string VmSize { get; set; } = "Standard_D3_v2";
public string VnetAddressSpace { get; set; } = "10.1.0.0/16";
public string VmSubnetAddressSpace { get; set; } = "10.1.0.0/24";
public string PostgreSqlSubnetAddressSpace { get; set; } = "10.1.1.0/24";
public string VnetAddressSpace { get; set; } = "10.1.0.0/16"; // 10.1.0.0 - 10.1.255.255, 65536 IPs
// Address space for CoA services.
public string VmSubnetAddressSpace { get; set; } = "10.1.0.0/24"; // 10.1.0.0 - 10.1.0.255, 256 IPs
public string PostgreSqlSubnetAddressSpace { get; set; } = "10.1.1.0/24"; // 10.1.1.0 - 10.1.1.255, 256 IPs
// Address space for kubernetes system services, must not overlap with any subnet.
public string KubernetesServiceCidr = "10.1.4.0/22"; // 10.1.4.0 -> 10.1.7.255, 1024 IPs
public string KubernetesDnsServiceIP = "10.1.4.10";
public string KubernetesDockerBridgeCidr = "172.17.0.1/16"; // 172.17.0.0 - 172.17.255.255, 65536 IPs

public string VmUsername { get; set; } = "vmadmin";
public string VmPassword { get; set; }
public string ResourceGroupName { get; set; }
Expand All @@ -50,6 +55,12 @@ public abstract class UserAccessibleConfiguration
public string LogAnalyticsArmId { get; set; }
public string ApplicationInsightsAccountName { get; set; }
public string VmName { get; set; }
public bool UseAks { get; set; }
jsaun marked this conversation as resolved.
Show resolved Hide resolved
public string AksClusterName { get; set; }
jsaun marked this conversation as resolved.
Show resolved Hide resolved
public string AksCoANamespace { get; set; } = "coa";
public bool ManualHelmDeployment { get; set; }
public string HelmExePath { get; set; } = "C:\\ProgramData\\chocolatey\\bin\\helm.exe";
public int AksPoolSize { get; set; } = 2;
public bool Silent { get; set; }
public bool DeleteResourceGroupOnFailure { get; set; }
public string CromwellVersion { get; set; }
Expand All @@ -72,6 +83,13 @@ public abstract class UserAccessibleConfiguration
public string BlobxferImageName { get; set; } = null;
public bool? DisableBatchNodesPublicIpAddress { get; set; } = null;
public bool? KeepSshPortOpen { get; set; } = null;
public bool DebugLogging { get; set; } = false;
public bool? ProvisionPostgreSqlOnAzure { get; set; } = null;
public string PostgreSqlServerName { get; set; }
public bool UsePostgreSqlSingleServer { get; set; } = false;
public string KeyVaultName { get; set; }
// Temporary workaround until I can get Azure Graph RBAC client working.
jsaun marked this conversation as resolved.
Show resolved Hide resolved
public string UserObjectId { get; set; }

public static Configuration BuildConfiguration(string[] args)
{
Expand Down
Loading