diff --git a/.openpublishing.redirection.json b/.openpublishing.redirection.json index e62e769e5c..a64c443c2b 100644 --- a/.openpublishing.redirection.json +++ b/.openpublishing.redirection.json @@ -1,5 +1,20 @@ { "redirections": [ + { + "source_path": "docs/scenarios/azure-hpc/energy/azure-billing-active-directory-tenant.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/finance/azure-billing-active-directory-tenant.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/manufacturing/azure-billing-active-directory-tenant.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, { "source_path": "docs/manage/hybrid/server/best-practices/arc-vm-extension-mma.md", "redirect_url": "/azure/azure-arc/servers/manage-vm-extensions-template", @@ -2231,6 +2246,21 @@ "redirect_document_id": false }, { + "source_path": "docs/scenarios/azure-hpc/energy/storage.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/storage", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/finance/storage.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/storage", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/manufacturing/storage.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/storage", + "redirect_document_id": false + }, + { "source_path": "docs/secure/security-best-practices-introduction.md", "redirect_url": "/security/zero-trust/deploy/infrastructure", "redirect_document_id": false @@ -2356,6 +2386,96 @@ "redirect_document_id": false }, { + "source_path": "docs/scenarios/azure-hpc/energy/management.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/finance/management.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/manufacturing/management.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/energy/platform-automation-devops.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/finance/platform-automation-devops.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/energy/network-topology-connectivity.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/network-topology-connectivity", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/finance/network-topology-connectivity.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/network-topology-connectivity", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/manufacturing/network-topology-connectivity.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/network-topology-connectivity", + "redirect_document_id": false + }, + { + "source_path": 
"docs/scenarios/azure-hpc/manufacturing/platform-automation-devops.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/energy/security-governance-compliance.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/finance/security-governance-compliance.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/manufacturing/security-governance-compliance.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/energy/security.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/finance/security.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/manufacturing/security.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/energy/compute.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/compute", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/finance/compute.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/compute", + "redirect_document_id": false + }, + { + "source_path": "docs/scenarios/azure-hpc/manufacturing/compute.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/compute", + "redirect_document_id": false + }, + { "source_path": "docs/scenarios/oracle-iaas/enterprise-enrollment-oracle.md", "redirect_url": "/azure/cloud-adoption-framework/scenarios/oracle-iaas", "redirect_document_id": false diff --git a/.openpublishing.redirection.scenarios.json b/.openpublishing.redirection.scenarios.json index 90efe061f4..76aef75edc 100644 --- a/.openpublishing.redirection.scenarios.json +++ b/.openpublishing.redirection.scenarios.json @@ -1,5 +1,35 @@ { "redirections": [ + { + "source_path_from_root": "/docs/scenarios/azure-hpc/energy/resource-organization.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/resource-organization", + "redirect_document_id": false + }, + { + "source_path_from_root": "/docs/scenarios/azure-hpc/finance/resource-organization.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/resource-organization", + "redirect_document_id": false + }, + { + "source_path_from_root": "/docs/scenarios/azure-hpc/manufacturing/resource-organization.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/resource-organization", + "redirect_document_id": false + }, + { + "source_path_from_root": "/docs/scenarios/azure-hpc/energy/identity-access-management.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/identity-access-management", + "redirect_document_id": false + }, + { + "source_path_from_root": "/docs/scenarios/azure-hpc/finance/identity-access-management.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/identity-access-management", + "redirect_document_id": false + }, + { + "source_path_from_root": 
"/docs/scenarios/azure-hpc/manufacturing/identity-access-management.md", + "redirect_url": "/azure/cloud-adoption-framework/scenarios/azure-hpc/identity-access-management", + "redirect_document_id": false + }, { "source_path_from_root": "/docs/scenarios/cloud-scale-analytics/architectures/reference-architecture-data-mesh.md", "redirect_url": "/docs/scenarios/cloud-scale-analytics/architectures/data-mesh-scenario", diff --git a/docs/get-started/whats-new.md b/docs/get-started/whats-new.md index b3aab64a33..00ff24f573 100644 --- a/docs/get-started/whats-new.md +++ b/docs/get-started/whats-new.md @@ -357,7 +357,6 @@ Find new articles about Azure landing zones in the Ready methodology. - [Abbreviation recommendations for Azure resources](../ready/azure-best-practices/resource-abbreviations.md): Find updates about the data-collection and alert-processing rules. - [Zero Trust configuration for multitenant defense organizations](../scenarios/defense/identity/multi-tenant/zero-trust-configuration.md): Review a new multitenant architecture diagram and information about Microsoft Entra ID Protection. - [The Azure Well-Architected Framework for HPC](../scenarios/azure-hpc/well-architected-framework.md): Explore updates related to ExpressRoute. -- [Manufacturing HPC storage in Azure](../scenarios/azure-hpc/manufacturing/storage.md): Learn about updates related to Azure Managed Lustre. ## March 2024 @@ -369,7 +368,7 @@ Find new articles about Azure landing zones in the Ready methodology. - [Select Azure regions](../ready/azure-setup-guide/regions.md): We added guidance about how to plan Azure resource group deployments. - [Transition an existing Azure environment to the Azure landing zone conceptual architecture](../ready/enterprise-scale/transition.md): Find tip to help you reduce the impact of regional outages. - [Security guidelines for Oracle on Azure Virtual Machines landing zone accelerator](../scenarios/oracle-iaas/oracle-security-guideline-landing-zone.md): Review new use cases for centralized identity management. These use cases include using Azure Key Vault to store credentials and using hardened operating system images. -- [Storage for Azure HPC in the finance sector](../scenarios/azure-hpc/finance/storage.md): Find new data to help you compare Azure Managed Lustre with Blob Storage, Azure Files, and Azure NetApp Files. +- [Storage for Azure HPC](../scenarios/azure-hpc/storage.md): Find new data to help you compare Azure Managed Lustre with Blob Storage, Azure Files, and Azure NetApp Files. - [Network topology and connectivity for an SAP migration](../scenarios/sap/eslz-network-topology-and-connectivity.md): Explore design recommendations for Azure ExpressRoute. 
## February 2024 diff --git a/docs/scenarios/azure-hpc/azure-hpc-landing-zone-accelerator.md b/docs/scenarios/azure-hpc/azure-hpc-landing-zone-accelerator.md index 70c5222ce5..4515948885 100644 --- a/docs/scenarios/azure-hpc/azure-hpc-landing-zone-accelerator.md +++ b/docs/scenarios/azure-hpc/azure-hpc-landing-zone-accelerator.md @@ -31,44 +31,16 @@ The approach to landing zones of the HPC landing zone accelerator provides the f - An implementation that includes: - A deployable reference capable of creating the environment for your HPC deployment - A Microsoft-approved HPC reference implementation to test the deployed environment + + ## Design guidelines for energy, manufacturing, and finance -## Design guidelines for energy, manufacturing, and finance - -The architectures of landing zones vary by business sector, in addition to varying by organization. This section lists articles by sector that provide guidelines for creating your landing zone: - -- Energy (Oil and Gas) - - [Azure Billing and Microsoft Entra tenants for energy HPC](./energy/azure-billing-active-directory-tenant.md) - - [Identity and access management for Azure HPC in energy](./energy/identity-access-management.md) - - [Management for Azure HPC in energy](./energy/management.md) - - [Network topology and connectivity for Azure HPC in energy](./energy/network-topology-connectivity.md) - - [Platform automation and DevOps for Azure HPC in energy](./energy/platform-automation-devops.md) - - [Resource organization for HPC in the energy industry](./energy/resource-organization.md) - - [Governance for HPC in energy industries](./energy/security-governance-compliance.md) - - [Security for Azure HPC in energy](./energy/security.md) - - [Compute large-scale HPC application workloads in Azure VMs](./energy/compute.md) - - [Storage for HPC energy environments](./energy/storage.md) - -- Manufacturing - - [Manufacturing HPC Azure billing and Active Directory tenants](./manufacturing/azure-billing-active-directory-tenant.md) - - [Azure identity and access management for HPC in manufacturing](./manufacturing/identity-access-management.md) - - [Management for HPC in the manufacturing industry](./manufacturing/management.md) - - [Manufacturing HPC network topology and connectivity](./manufacturing/network-topology-connectivity.md) - - [Platform automation and DevOps for Azure HPC in the manufacturing industry](./manufacturing/platform-automation-devops.md) - - [Manufacturing HPC resource organization](./manufacturing/resource-organization.md) - - [Azure governance for manufacturing HPC](./manufacturing/security-governance-compliance.md) - - [Security for HPC in manufacturing industries](./manufacturing/security.md) - - [Manufacturing HPC storage](./manufacturing/storage.md) - -- Finance - - [Azure billing offers and Active Directory tenants for finance HPC](./finance/azure-billing-active-directory-tenant.md) - - [Finance HPC Azure identity and access management](./finance/identity-access-management.md) - - [Management for HPC in the finance sector](./finance/management.md) - - [Network topology and connectivity for HPC in the finance sector](./finance/network-topology-connectivity.md) - - [Platform automation and DevOps for HPC in the finance sector](./finance/platform-automation-devops.md) - - [Resource organization for Azure HPC in the finance sector](./finance/resource-organization.md) - - [Governance for finance HPC](./finance/security-governance-compliance.md) - - [Security for HPC in the finance sector](./finance/security.md) - - 
[Storage for HPC in the finance sector](./finance/storage.md)
+Landing zone architectures vary by business sector and by organization. This section lists articles that provide guidelines for creating your landing zone:
+
+ - [Identity and access management for Azure HPC](identity-access-management.md)
+ - [Network topology and connectivity for Azure HPC](network-topology-connectivity.md)
+ - [Resource organization for HPC](resource-organization.md)
+ - [Compute large-scale HPC application workloads in Azure VMs](compute.md)
+ - [Storage for HPC environments](storage.md)
 
 ### Design Guidelines for choosing HPC compute for AI workloads
 
@@ -106,4 +78,4 @@ The HPC landing zone accelerator is available on GitHub: [Azure HPC OnDemand Pla
 
 ## Next steps
 
-For considerations and recommendations for your HPC landing zone accelerator architecture, review the critical design areas of the HPC landing zone accelerator in [Azure Identity and Access Management](./energy/identity-access-management.md).
+For considerations and recommendations for your HPC landing zone accelerator architecture, review the critical design areas of the HPC landing zone accelerator in [Azure Identity and Access Management](identity-access-management.md).
diff --git a/docs/scenarios/azure-hpc/compute.md b/docs/scenarios/azure-hpc/compute.md
new file mode 100644
index 0000000000..c643a3c716
--- /dev/null
+++ b/docs/scenarios/azure-hpc/compute.md
@@ -0,0 +1,153 @@
+---
+title: Compute Large-Scale HPC Application Workloads in Azure Virtual Machines
+description: Learn about the ideal Azure VM instances for large-scale HPC application workloads, use cases, reference architecture, and design considerations and recommendations.
+author: Rajani-Janaki-Ram
+ms.author: rajanaki
+ms.topic: conceptual
+ms.custom: think-tank
+ms.date: 11/14/2024
+---
+
+# Compute large-scale HPC application workloads in Azure Virtual Machines
+
+High-performance computing (HPC) workloads, also known as big compute applications, are large-scale workloads that require many cores. HPC can help industries like energy, finance, and manufacturing at every stage of the product development process.
+
+Big compute applications typically have the following characteristics:
+
+- You can divide the workload into discrete tasks that can be run simultaneously across many cores.
+- Each task takes input, processes it, and produces output. The entire application runs for a finite amount of time.
+- The application doesn't need to run constantly, but it must be able to handle node failures and crashes.
+- Tasks can be independent or tightly coupled. Tightly coupled tasks require high-speed networking technologies like InfiniBand and remote direct memory access (RDMA) connectivity.
+- You can use compute-intensive virtual machine (VM) sizes such as H16r, H16mr, and A9. Your selection depends on the workload.
+
+:::image type="content" source="./media/tasks.png" alt-text="Diagram that shows how a job queue moves from the client to the scheduler and the parallel and tightly coupled Azure tasks." lightbox="./media/tasks.png" border="false":::
+
+Azure provides a range of VM instances that are optimized for CPU-intensive and GPU-intensive workloads. These VMs can run in Azure Virtual Machine Scale Sets to provide resiliency and load balancing. Azure is also the only cloud platform that offers InfiniBand-enabled hardware. InfiniBand provides a significant performance advantage for tasks such as financial risk modeling, engineering stress analysis, and running reservoir simulation and seismic workloads. This advantage results in performance that approaches or exceeds current on-premises infrastructure performance.
+
+Azure provides various VM sizes for HPC and GPU-optimized computing. It's important to select a VM size that's appropriate for your workload. To find the best fit, see [Sizes for virtual machines in Azure](/azure/virtual-machines/sizes) and [Virtual machines selector tool](https://azure.microsoft.com/pricing/vm-selector/).
+
+Keep in mind that not all Azure products are available in all regions. To see what's available in your area, see [Products available by region](https://azure.microsoft.com/explore/global-infrastructure/products-by-region/).
+
+For more information about Azure compute options, see the [Azure compute blog](https://techcommunity.microsoft.com/t5/azure-compute-blog/bg-p/AzureCompute) or [Choose an Azure compute service](/azure/architecture/guide/technology-choices/compute-decision-tree).
+
+Azure provides both CPU-based and GPU-enabled VMs. The N-series VMs feature NVIDIA GPUs that are designed for compute-intensive or graphics-intensive applications such as AI, deep learning, and visualization.
+
+HPC products are designed for high-performance scenarios. But other products, such as the E and F series, are also suitable for specific workloads.
+
+## Design considerations
+
+When you design your HPC infrastructure, several tools and services are available to help you manage and schedule your workloads.
+
+- [Azure Batch](/azure/batch/) is a managed service for running large-scale HPC applications. Use Batch to configure a VM pool and upload the applications and data files. Then the Batch service configures the VMs, assigns tasks to the VMs, runs the tasks, and monitors progress. Batch can automatically scale VMs up and down in response to changing workloads. Batch also provides job-scheduling functionality. A sketch of this flow follows this list.
+
+- [Azure CycleCloud](/azure/cyclecloud/) is a tool for creating, managing, operating, and optimizing HPC and big compute clusters in Azure. Use Azure CycleCloud to dynamically configure HPC Azure clusters and orchestrate data and jobs for hybrid and cloud workflows. Azure CycleCloud provides the simplest way to manage HPC workloads by using a workload manager. Azure CycleCloud supports workload managers such as Grid Engine, Microsoft HPC Pack, HTCondor, LSF, PBS Pro, Slurm, and Symphony.
+
+- [Azure Logic Apps](/azure/logic-apps/logic-apps-overview) is a workflow automation service. You can use it to schedule and orchestrate compute jobs, such as triggering job submission on a schedule or in response to events.
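+
+The following is a minimal sketch of the Batch flow just described, using the Batch Python SDK (the `azure-batch` package): create a pool, create a job, and submit independent tasks for Batch to schedule onto the pool's nodes. The account details, VM image, node agent SKU, pool size, and task command line are illustrative placeholders, not recommended values; verify the image and agent SKU combinations that are supported for your account before use.
+
+```python
+# Sketch only: create a pool, a job, and a set of independent tasks.
+# All names, keys, and image values are placeholders for illustration.
+from azure.batch import BatchServiceClient
+from azure.batch.batch_auth import SharedKeyCredentials
+import azure.batch.models as batchmodels
+
+credentials = SharedKeyCredentials("<batch-account>", "<account-key>")
+client = BatchServiceClient(
+    credentials, batch_url="https://<batch-account>.<region>.batch.azure.com")
+
+# Pool of compute nodes. The image reference and node agent SKU are
+# placeholders; list the supported images for your account to pick real ones.
+client.pool.add(batchmodels.PoolAddParameter(
+    id="hpc-pool",
+    vm_size="Standard_HB120rs_v3",  # an RDMA-capable HPC size; choose per workload
+    virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
+        image_reference=batchmodels.ImageReference(
+            publisher="canonical", offer="0001-com-ubuntu-server-focal",
+            sku="20_04-lts", version="latest"),
+        node_agent_sku_id="batch.node.ubuntu 20.04"),
+    target_dedicated_nodes=4))
+
+# A job groups tasks and binds them to the pool.
+client.job.add(batchmodels.JobAddParameter(
+    id="hpc-job", pool_info=batchmodels.PoolInformation(pool_id="hpc-pool")))
+
+# Each task is one discrete unit of work: take input, process it, produce output.
+client.task.add_collection("hpc-job", [
+    batchmodels.TaskAddParameter(
+        id=f"task-{i}",
+        command_line=f"/bin/bash -c 'myapp --input input-{i}.dat'")
+    for i in range(100)])
+```
+
+From here, Batch assigns the tasks to nodes, retries failed tasks per your task policy, and can scale the pool automatically if you attach an autoscale formula instead of a fixed node count.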
+The following sections describe reference architectures for the energy, finance, and manufacturing industries.
+
+## Energy reference architecture
+
+Consider the following recommendations and use cases when you design an architecture for energy workloads. A sketch that checks VM size availability by region follows this list.
+
+### Design recommendations
+
+- Understand that reservoir and seismic workflows typically have similar requirements for compute and job scheduling.
+
+- Consider your network needs. Azure HPC provides HBv2-series and HBv3-series VM sizes for memory-intensive seismic imaging and reservoir simulations.
+
+- Use HB-series VMs for memory bandwidth-bound applications and HC-series VMs for compute-bound reservoir simulations.
+
+- Use NV-series VMs for 3D reservoir modeling and visualizing seismic data.
+
+- Use NCv4-series VMs for GPU-accelerated seismic full-waveform inversion (FWI) analysis.
+
+  For data-intensive reverse time migration (RTM) processing, the NDv4 VM size is the best option because it provides Non-Volatile Memory Express (NVMe) drives that have a cumulative capacity of 7 TB.
+
+  To get the best possible performance on HB-series VMs with Message Passing Interface (MPI) workloads, do optimal process pinning to the processors' cores. For more information, see [Optimal MPI process placement for Azure HB-series VMs](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/optimal-mpi-process-placement-for-azure-hb-series-vms/ba-p/2450663).
+
+  NCv4-series VMs also provide dedicated tools to ensure the correct pinning of parallel application processes.
+
+- Because of the complex architecture of NDv4-series VMs, pay attention when you configure the VMs to ensure that you launch the GPU-accelerated applications optimally. For more information, see [Azure scalable GPU VM](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/azure-offers-the-most-scalable-gpu-vm-in-the-cloud-with-the-nd/ba-p/2524369).
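+
+Because size availability varies by region and subscription, it can help to enumerate the HPC-oriented sizes that are actually deployable before you commit to one of the recommendations above. The following is a small sketch that assumes the `azure-identity` and `azure-mgmt-compute` packages are installed; the region, size-name prefixes, and capability names shown are illustrative.
+
+```python
+# Sketch only: list HPC-oriented VM sizes available in one region,
+# with a few capabilities that matter for the recommendations above.
+from azure.identity import DefaultAzureCredential
+from azure.mgmt.compute import ComputeManagementClient
+
+client = ComputeManagementClient(DefaultAzureCredential(), "<subscription-id>")
+
+for sku in client.resource_skus.list(filter="location eq 'eastus'"):
+    if sku.resource_type != "virtualMachines":
+        continue
+    if not sku.name.startswith(("Standard_HB", "Standard_HC", "Standard_NC",
+                                "Standard_ND", "Standard_NV")):
+        continue
+    capabilities = {c.name: c.value for c in (sku.capabilities or [])}
+    restricted = any(r.reason_code for r in (sku.restrictions or []))
+    print(f"{sku.name}: vCPUs={capabilities.get('vCPUs')}, "
+          f"RDMA={capabilities.get('RdmaEnabled')}, restricted={restricted}")
+```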
+### Use cases for the oil and gas seismic and reservoir simulation reference architecture
+
+Reservoir and seismic workflows usually have similar requirements for compute and job scheduling. However, seismic workloads challenge the infrastructure's storage capabilities. They sometimes need multiple petabytes (PB) of storage, with throughput requirements that might be measured in hundreds of GB per second. For example, a single seismic processing project might start with 500 TB of raw data, which requires potentially several PB of long-term storage.
+
+See the following reference architectures that can help you successfully meet your goals for running your application in Azure.
+
+#### Reference architecture for seismic processing
+
+Seismic processing and imaging are fundamental for the oil and gas industry because they create a model of the subsurface based on the exploration data. Geoscientists typically conduct the process of qualifying and quantifying what might be in the subsurface. Geoscientists usually use datacenter and cloud-bound software. Occasionally they access the software remotely or in the cloud by using virtual desktop technology.
+
+The quality of the subsurface model and the quality and resolution of the data are crucial to making the right business decisions about bidding on leases or deciding where to drill. Interpretation of seismic images can improve the positioning of wells and reduce the risk of drilling a *dry hole*. For oil and gas companies, having a better understanding of subsurface structures translates directly to reducing exploration risk. Basically, the higher the accuracy of the company's view of the geological area, the better its chance of striking oil when it drills.
+
+This job is data-intensive and compute-intensive. The company needs to process TBs of data. This data processing requires massive and fast computation power, which includes fast networking. Because of the data-intensive and compute-intensive nature of seismic imaging, companies use parallel computing to process data and reduce time to completion.
+
+Companies relentlessly process large volumes of seismic acquisition data to locate and accurately quantify and qualify the hydrocarbon content in reservoirs that they discover in the subsurface before they begin recovery operations. Acquisition data is unstructured and can easily reach PBs of storage for one potential oil and gas field. Because of these factors, you can only complete seismic processing activity within a reasonable timeframe by using HPC and appropriate data management strategies.
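+
+Conceptually, the parallel decomposition is straightforward: each shot gather (or each slice of the survey) is an independent task, so the work scales out across cores and nodes. The following schematic sketch shows the pattern on a single node by using only the Python standard library; the file layout and per-task processing are placeholders, and on Azure a scheduler such as Batch or CycleCloud distributes the same pattern across many VMs.
+
+```python
+# Schematic only: fan independent seismic-processing tasks out across local cores.
+from concurrent.futures import ProcessPoolExecutor
+from pathlib import Path
+
+def process_shot_gather(path: Path) -> str:
+    # Placeholder for the real per-task work (filtering, velocity analysis,
+    # migration, and so on): read one input file, write one output file.
+    output = path.with_suffix(".processed")
+    output.write_bytes(path.read_bytes())  # stand-in for the actual transform
+    return output.name
+
+if __name__ == "__main__":
+    gathers = sorted(Path("raw_survey").glob("*.segy"))
+    with ProcessPoolExecutor() as pool:  # defaults to one worker per core
+        for name in pool.map(process_shot_gather, gathers):
+            print("finished", name)
+```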
+:::image type="content" source="./media/network-interconnect.png" alt-text="Diagram of the network interconnect compute and storage for seismic interpretation and modeling." lightbox="./media/network-interconnect.png" border="false":::
+
+:::image type="content" source="./media/network-reference-architecture-seismic-processing.png" alt-text="Diagram of the network reference architecture for seismic processing." lightbox="./media/network-reference-architecture-seismic-processing.png" border="false":::
+
+#### Reference architecture for reservoir simulation and modeling
+
+Physical subsurface characteristics, such as water saturation, porosity, and permeability, are also valuable data in reservoir modeling. This data is important to determine what kind of recovery approach and equipment to deploy and, ultimately, where to position wells.
+
+A reservoir modeling workload is also an area of reservoir engineering. The workload combines physics, mathematics, and computer programming in a reservoir model to analyze and predict fluid behavior in the reservoir over time. This analysis requires high computation power and big compute capacity, including fast networking.
+
+:::image type="content" source="./media/network-reference-architecture-reservoir-simulation.png" alt-text="Diagram of the network reference architecture for reservoir simulation." lightbox="./media/network-reference-architecture-reservoir-simulation.png" border="false":::
+
+:::image type="content" source="./media/network-interconnect-compute-and-storage-seismic-analysis.png" alt-text="Diagram of the network interconnect compute and storage seismic analysis." lightbox="./media/network-interconnect-compute-and-storage-seismic-analysis.png" border="false":::
+
+## Finance reference architecture
+
+The following architecture is an example of how to use VMs in HPC for finance workloads.
+
+:::image type="content" alt-text="Architecture diagram that shows a finance HPC workload that uses HPC Pack HB-series VMs." source="./media/hpc-finance-architecture-example.svg" lightbox="./media/hpc-finance-architecture-example.svg" border="false":::
+
+This workload uses HPC Pack HB-series compute nodes.
+
+The [HB-series VMs](/azure/virtual-machines/hb-series) are optimized for HPC applications, such as financial analysis, weather simulation, and silicon register-transfer level (RTL) modeling. HB VMs feature:
+
+- Up to 120 AMD EPYC™ 7003-series CPU cores.
+- 448 GB of RAM.
+- No hyperthreading.
+
+HB-series VMs also provide:
+
+- 350 GB per second of memory bandwidth.
+- Up to 32 MB of L3 cache per core.
+- Up to 7 GB per second of block device solid-state drive (SSD) performance.
+- Clock frequencies of up to 3.675 GHz.
+
+For the HPC head node, the workload uses a different-sized VM. Specifically, it uses a D16s_v4 VM, a general-purpose VM size.
+
+## Manufacturing reference architecture
+
+The following architecture is an example of how to use VMs in HPC in manufacturing.
source="./media/hpc-manufacturing-architecture-example.svg" lightbox="./media/hpc-manufacturing-architecture-example.svg" border="false"::: + +This architecture uses Azure Files shares and Azure Storage accounts that are connected to an Azure Private Link subnet. + +The architecture uses Azure CycleCloud in its own subnet. HC-series VMs are used in an arrangement of cluster nodes. + +The HC-series VMs are optimized for HPC applications that use intensive computation. Examples include implicit and finite element analysis, reservoir simulation, and computational chemistry applications. HC VMs feature 44 Intel Xeon Platinum 8168 processor cores, 8 GB of RAM per CPU core, no hyperthreading, and up to four managed disks. The Intel Xeon Platinum platform supports Intel's rich ecosystem of software tools and features and an all-cores clock speed of 3.4 GHz for most workloads. + +## Next steps + +For more information about applications that support the use cases in this article, see the following resources: + +- [Virtual machine series](https://azure.microsoft.com/pricing/details/virtual-machines/series/). +- [Azure HPC certification.github.io](https://github.com/AzureHPC-Certification/AzureHPC-Certification.github.io/). +- [Microsoft Azure HPC OnDemand Platform](https://techcommunity.microsoft.com/t5/azure-global/azure-hpc-ondemand-platform-cloud-hpc-made-easy/ba-p/2537338). This standalone reference architecture might not be compliant with the Azure landing zone paradigm. + +The following articles provide guidance for various stages of the cloud adoption process. These resources can help you succeed in adopting manufacturing HPC environments for the cloud. + +- [Identity and access management](./identity-access-management.md) +- [Network topology and connectivity](./network-topology-connectivity.md) +- [Resource organization](./resource-organization.md) +- [Storage](./storage.md) +- [HPC landing zone accelerator](./azure-hpc-landing-zone-accelerator.md) +- [Spot virtual machines](/azure/architecture/guide/spot/spot-eviction) \ No newline at end of file diff --git a/docs/scenarios/azure-hpc/energy/azure-billing-active-directory-tenant.md b/docs/scenarios/azure-hpc/energy/azure-billing-active-directory-tenant.md deleted file mode 100644 index dc661b8002..0000000000 --- a/docs/scenarios/azure-hpc/energy/azure-billing-active-directory-tenant.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -title: Azure billing and Microsoft Entra tenants for energy HPC -description: Learn about Azure billing and Microsoft Entra tenants for energy HPC. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/08/2022 ---- - -# Azure billing and Microsoft Entra tenants for energy HPC - -Azure landing zones for HPC don't have any specific considerations or recommendations that affect billing offer decisions, enterprise enrollment, or Active Directory tenant decisions. - -## Previous recommendations - -Keep in mind that it might be important to understand any decisions made previously by the cloud platform team so you know about existing [enterprise enrollment or Microsoft Entra tenant decisions](../../../ready/landing-zone/design-area/azure-billing-microsoft-entra-tenant.md). - -## Next steps - -The following articles provide guidance that you might find helpful at various points throughout the cloud adoption process. They can help you succeed in your cloud adoption for energy HPC environments. 
- -- [Identity and access management for Azure HPC in energy](./identity-access-management.md) -- [Management for Azure HPC in energy](./management.md) -- [Network topology and connectivity for Azure HPC in energy](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in energy](./platform-automation-devops.md) -- [Resource organization for HPC in the energy industry](./resource-organization.md) -- [Governance for HPC in energy industries](./security-governance-compliance.md) -- [Security for Azure HPC in energy](./security.md) -- [Compute large-scale HPC application workloads in Azure VMs](./compute.md) -- [Storage for HPC energy environments](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/energy/compute.md b/docs/scenarios/azure-hpc/energy/compute.md deleted file mode 100644 index 76fa146548..0000000000 --- a/docs/scenarios/azure-hpc/energy/compute.md +++ /dev/null @@ -1,114 +0,0 @@ ---- -title: 'Compute large-scale HPC application workloads in Azure VMs' -description: Learn about the Azure VM instances that are ideal for large-scale HPC application workloads in the oil and gas industry, plus HPC use cases, reference architecture, design considerations, and design recommendations. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/14/2022 ---- - -# Compute large-scale HPC application workloads in Azure VMs - -The term big compute (used in reference to HPC) describes large-scale workloads that require a large number of cores, often numbering in the hundreds or thousands. Scenarios include image rendering, fluid dynamics, financial risk modeling, oil exploration, drug design, and engineering stress analysis, among others. - -The following are typical characteristics of big compute applications: - -- The work can be split into discrete tasks, which can be run across many cores simultaneously. -- Each task is finite. It takes some input, does some processing, and produces output. The entire application runs for a finite amount of time (minutes to days). A common pattern is to set up many cores in a burst, and then spin down to zero once the application completes. -- The application doesn't need to stay up 24/7. But the system must handle node failures or application crashes. -- For some applications, tasks are independent and can run in parallel. In other cases, tasks are tightly coupled, meaning they must interact or exchange intermediate results. In that case, consider using high-speed networking technologies such as InfiniBand and remote direct memory access (RDMA). -- Depending on your workload, you might use compute-intensive VM sizes (H16r, H16mr, and A9). - -:::image type="content" source="../media/tasks.png" alt-text="Diagram of Azure tasks." lightbox="../media/tasks.png"::: - -Azure offers a range of VM instances that are optimized for both CPU- and GPU-intensive workloads (both compute and visualization). The VMs are ideal for running oil and gas workloads. - -Azure is the only cloud platform that offers VM instances with InfiniBand-enabled hardware. This feature provides a significant performance advantage for running reservoir simulation and seismic workloads. The improved performance narrows the performance gap and results in near or better performance than current on-premises infrastructures. - -Azure VMs have many different options, known as VM sizes. 
There are different series of VM sizes for HPC and GPU-optimized computing. Select the appropriate VM size for the workload you want to use. For more information on selecting VM sizes, see the [Sizes for VMs in Azure selector tool](/azure/virtual-machines/sizes). - -Not all Azure products are available in all Azure regions. For more information, see the current [list of products available by region](https://azure.microsoft.com/explore/global-infrastructure/products-by-region/). - -For best practices on your choices in Azure compute, see the [Azure compute blog](https://techcommunity.microsoft.com/t5/azure-compute-blog/bg-p/AzureCompute) or see the [Azure compute service](/azure/architecture/guide/technology-choices/compute-decision-tree) content to choose a service. - -**CPU-based virtual machines** - -- [Linux VMs](/azure/virtual-machines/linux/sizes-hpc) -- [Windows VMs](/azure/virtual-machines/windows/sizes-hpc) - -**GPU-enabled virtual machines** - -N-series VMs feature NVIDIA GPUs designed for compute-intensive or graphics-intensive applications including artificial intelligence (AI), learning, and visualization. - -- [Linux VMs](/azure/virtual-machines/linux/sizes-gpu) -- [Windows VMs](/azure/virtual-machines/windows/sizes-gpu) - -HPC SKUs are built specially for high-performance scenarios. But Azure also offers other SKUs that might be suitable for certain workloads you run on your HPC infrastructure. You can run these SKUs effectively on less expensive hardware. Some commonly used compute SKUs are the E and F series. - -## HPC design considerations - -[Job Scheduler](/azure/logic-apps/logic-apps-overview) is a specialized service for scheduling compute-intensive work to run on a managed pool of virtual machines. You can automatically scale compute resources to meet the needs of your jobs. - -[Azure Batch](/azure/batch/) is a managed service for running large-scale HPC applications. Using Azure Batch, you configure a VM pool, and then you upload the applications and data files. Then the Batch service configures the VMs, assigns tasks to the VMs, runs the tasks, and monitors the progress. Batch can automatically scale VMs up and down in response to changing workloads. Batch also provides a job-scheduling functionality. - -[Azure CycleCloud](/azure/cyclecloud/) is a tool for creating, managing, operating, and optimizing HPC and Big Compute clusters in Azure. With Azure CycleCloud, users can dynamically configure HPC Azure clusters and orchestrate data and jobs for hybrid and cloud workflows. Azure CycleCloud provides the simplest way to manage HPC workloads, by using various work load managers (such as Grid Engine, HPC Pack, HTCondor, LSF, PBS Pro, Slurm, or Symphony) on Azure. - -## HPC design recommendations - -- Both reservoir and seismic workflows typically have similar requirements for compute and job scheduling. -- While you consider your network needs, for the memory-intensive seismic imaging and reservoir simulations, Azure HPC offers HBv2 and HBv3 VM sizes. -- Use HB VMs for memory bandwidth-bound applications and HC VMs for compute-bound reservoir simulations. -- Use NV VMs for 3D reservoir modeling and visualizing seismic data. -- For GPU accelerated seismic FWI analysis, NCv4 is the recommended solution. For more data intensive RTM processing, the NDv4 SKU is the best option thanks to the availability of NVMe drives with a cumulative capacity of 7 TB. 
To get the best possible performance on HB series VMs with MPI workloads, do optimal process pinning to the processors' cores. For more information, see the [Optimal MPI process placement for Azure HB series VMs](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/optimal-mpi-process-placement-for-azure-hb-series-vms/ba-p/2450663) blog post. Dedicated tools are also provided to ensure the correct pinning of parallel application processes as described here. -- Due to the complex architecture of NDv4 series VMs, be sure to pay particular attention when configuring the VMs to ensure you launch the GPU-accelerated applications optimally. For more information about Azure high-performance computing, see the [Azure scalable GPU VM](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/azure-offers-the-most-scalable-gpu-vm-in-the-cloud-with-the-nd/ba-p/2524369) blog post. - -## HPC reference architecture - -The following are the use case and reference architectures for energy HPC environments. - -### Oil and gas seismic and reservoir simulation reference architecture use cases - -Commonly, both reservoir and seismic workflows have similar requirements for compute and job scheduling. However, seismic workloads challenge the infrastructure on storage with potentially multi-PB storage and throughput requirements that might be measured in the hundreds of GB. For example, a single seismic processing project might start with 500 TB of raw data, which requires potentially several PB of long-term storage. Following are a few reference architectures available today that can help you successfully meet your goals for running your application in Azure. - -### Use case and reference architecture for seismic processing - -Seismic processing and imaging are fundamental for the oil and gas business because they create a model of the subsurface out of the exploration data. The process of qualifying and quantifying what might be in the subsurface is typically conducted by geoscientists. Geoscientists use software that's often data center and cloud bound. Occasionally they access the software using virtual desktop technology remotely or in the cloud. - -The quality of the subsurface model and the quality and resolution of the data is crucial to make the right business decisions regarding bidding on leases or deciding where to drill. Seismic image interpretation images can improve the position of wells and reduce the risk of drilling a “dry hole”. For oil and gas companies, having a better understanding of subsurface structures translates directly to reducing exploration risk. Basically, the higher the accuracy of the company’s view of the geological area, the better its chance of striking oil when it drills. - -This job is data- and compute-intensive. The company needs to process terabytes of data, requiring massive and fast computation power, which includes fast networking. Due to the data- and computing-intensive nature of seismic imaging, companies use parallel computing to process data and reduce the time compilation and completion. Companies relentlessly process large volumes of seismic acquisition data to locate, accurately quantify, and qualify the hydrocarbon content in reservoirs discovered in the subsurface before recovery operations commence. 
As acquisition data is unstructured and can easily reach petabyte levels for one potential oil and gas field, seismic processing activity can only be completed within a meaningful timescale by using HPC and appropriate data management strategies. - -:::image type="content" source="../media/network-interconnect.png" alt-text="Diagram of network interconnect compute and storage - seismic interpretation and modeling." lightbox="../media/network-interconnect.png"::: - -:::image type="content" source="../media/network-reference-architecture-seismic-processing.png" alt-text="Diagram of network reference architecture - seismic processing." lightbox="../media/network-reference-architecture-seismic-processing.png"::: - -### Use case and reference architecture for reservoir simulation and modeling - -Reservoir modeling also places values on physical subsurface characteristics such as water saturation, porosity, and permeability. This data is important in determining what kind of recovery approach and equipment to deploy, and ultimately where best to position wells. - -A reservoir modeling workload is also an area of reservoir engineering. The workload combines physics, mathematics, and computer programming in a reservoir model to analyze and predict fluid behavior in the reservoir over time. This analysis requires high computation power and typically big compute workload demands including fast networking. - -:::image type="content" source="../media/network-reference-architecture-reservoir-simulation.png" alt-text="Diagram of network reference architecture - reservoir simulation." lightbox="../media/network-reference-architecture-reservoir-simulation.png"::: - -:::image type="content" source="../media/network-interconnect-compute-and-storage-seismic-analysis.png" alt-text="Diagram of network interconnect compute and storage seismic analysis." lightbox="../media/network-interconnect-compute-and-storage-seismic-analysis.png"::: - -For more information on reference architecture or cookbooks for relevant HPC ISV applications that support HPC for energy use cases, see: - -- [Azure HPC certification.github.io](https://github.com/AzureHPC-Certification/AzureHPC-Certification.github.io/) -- [Microsoft Azure HPC OnDemand Platform](https://techcommunity.microsoft.com/t5/azure-global/azure-hpc-ondemand-platform-cloud-hpc-made-easy/ba-p/2537338). Standalone reference architecture might not be compliant with the ALZ paradigm. - -## Next steps - -The following articles provide guidance on each step in the cloud adoption journey for energy HPC environments. 
- -- [Azure Billing and Microsoft Entra tenants for energy HPC](./azure-billing-active-directory-tenant.md) -- [Identity and access management for Azure HPC in energy](./identity-access-management.md) -- [Management for Azure HPC in energy](./management.md) -- [Network topology and connectivity for Azure HPC in energy](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in energy](./platform-automation-devops.md) -- [Resource organization for HPC in the energy industry](./resource-organization.md) -- [Governance for HPC in energy industries](./security-governance-compliance.md) -- [Security for Azure HPC in energy](./security.md) -- [Storage for HPC energy environments](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/energy/identity-access-management.md b/docs/scenarios/azure-hpc/energy/identity-access-management.md deleted file mode 100644 index 34eaec72c3..0000000000 --- a/docs/scenarios/azure-hpc/energy/identity-access-management.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: Identity and access management for Azure HPC in energy -description: Learn about identity and access management for high-performance computing (HPC) using this energy industry scenario that builds on the Azure landing zone design area. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/09/2022 ---- - -# Identity and access management for Azure HPC in energy - -The guidance in this article can help you examine design considerations and recommendations that relate to identity and access management for high-performance computing (HPC). This scenario is specific to the deployment of an HPC application for the energy industry. For more information about design considerations and recommendations, see the [Azure landing zone design area for identity and access management](../../../ready/landing-zone/design-area/identity-access.md). - -[Microsoft Entra Domain Services](/azure/active-directory-domain-services/overview) (Microsoft Entra Domain Services) can make use of managed domain services such as domain join, group policy, and access to legacy authentication protocols like lightweight directory access protocol (LDAP) and Kerberos/NTLM authentication. Microsoft Entra Domain Services integrates with your existing Microsoft Entra tenant, so users can sign into services and applications connected to the managed domain using their Microsoft Entra credentials. You can also use existing groups and user accounts to secure access to resources. These features provide a smoother lift-and-shift of on-premises resources to Azure, especially for a hybrid environment. - -For more information, see [design recommendations for platform access](../../../ready/landing-zone/design-area/identity-access-platform-access.md#design-recommendations-for-platform-access) and [Azure identity and access for landing zones](../../../ready/landing-zone/design-area/identity-access-landing-zones.md). - -## Design considerations - -HPC deployment uses the Azure landing zone infrastructure setup for security identity and access management needs. - -Two common deployment types in oil and gas industry workloads are *cloud only* and *hybrid cloud* models. 
While it's less complex to have all of your compute, storage, and visualization resources in the cloud, our customers sometimes use a hybrid model due to multiple business constraints for seismic and reservoir-simulation HPC workloads. - -Both the cloud only and hybrid cloud models might have their own unique identity and access needs that affect which type of active directory solution to adopt. - -Workloads in the cloud only deployment model use Microsoft Entra ID for Azure service fabric authentication, while the HPC hybrid cloud model uses the [Microsoft Entra hybrid identity solution](/azure/active-directory/hybrid/choose-ad-authn) for authentication. Regardless of the deployment type, Linux clients and POSIX-compliant storage solutions require legacy active directory support through Microsoft Entra Domain Services. - -A typical HPC setup includes a frontend for submitting jobs, a job scheduler or orchestrator, a compute cluster, and shared storage. The jobs can be submitted from on-premises and/or in the cloud. Identity and access management considerations for users and visualization devices might vary depending on the enterprise standards. - -Review the Azure administration and management activities that you require from your teams. Consider your HPC needs on Azure resources. Determine the best possible distribution of responsibilities within your organization. - -## Design recommendations - -Depending on the HPC compute resource orchestrator that you choose, different types of authentication methods are supported: - -- [Azure CycleCloud](/azure/cyclecloud/overview) offers three methods of authentication: a built-in database with encryption, active directory, or LDAP. -- [Azure Batch](/azure/batch/batch-technical-overview) supports two methods of authentication: shared key and Microsoft Entra ID. -- [Microsoft HPC Pack](/powershell/high-performance-computing/overview-of-microsoft-hpc-pack): Currently all HPC Pack nodes must be joined into an active directory domain. If you deploy the HPC Pack cluster in a virtual network that has a site-to-site VPN or ExpressRoute connection with your corporate network, there's usually an existing active directory domain. If you don't have an active directory domain in your virtual network yet, you can choose to create one by promoting the head node as domain controller. - -## Next steps - -The following articles provide guidance for specific steps in the cloud adoption journey for energy HPC environments. 
- -- [Azure Billing and Microsoft Entra tenants for energy HPC](./azure-billing-active-directory-tenant.md) -- [Management for Azure HPC in energy](./management.md) -- [Network topology and connectivity for Azure HPC in energy](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in energy](./platform-automation-devops.md) -- [Resource organization for HPC in the energy industry](./resource-organization.md) -- [Governance for HPC in energy industries](./security-governance-compliance.md) -- [Security for Azure HPC in energy](./security.md) -- [Compute large-scale HPC application workloads in Azure VMs](./compute.md) -- [Storage for HPC energy environments](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/energy/management.md b/docs/scenarios/azure-hpc/energy/management.md deleted file mode 100644 index 38a009cd89..0000000000 --- a/docs/scenarios/azure-hpc/energy/management.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -title: Management for Azure HPC in energy -description: Learn about management considerations in the Azure landing zone for high-performance computing (HPC) in the energy sector. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/09/2022 ---- - -# Management for Azure HPC in energy - -The Azure landing zone for high-performance computing (HPC) doesn't have any specific considerations or recommendations that impact management-related decisions in the energy industry. - -## General recommendations - -It's important to understand any decisions that were previously made by the cloud platform team regarding [management recommendations](../../../ready/landing-zone/design-area/management.md). - -## Next steps - -The following articles provide guidance for specific steps in the cloud adoption journey for energy HPC environments. - -- [Azure Billing and Microsoft Entra tenants for energy HPC](./azure-billing-active-directory-tenant.md) -- [Identity and access management for Azure HPC in energy](./identity-access-management.md) -- [Network topology and connectivity for Azure HPC in energy](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in energy](./platform-automation-devops.md) -- [Resource organization for HPC in the energy industry](./resource-organization.md) -- [Governance for HPC in energy industries](./security-governance-compliance.md) -- [Security for Azure HPC in energy](./security.md) -- [Compute large-scale HPC application workloads in Azure VMs](./compute.md) -- [Storage for HPC energy environments](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/energy/network-topology-connectivity.md b/docs/scenarios/azure-hpc/energy/network-topology-connectivity.md deleted file mode 100644 index 4cea0e4fca..0000000000 --- a/docs/scenarios/azure-hpc/energy/network-topology-connectivity.md +++ /dev/null @@ -1,139 +0,0 @@ ---- -title: Network topology and connectivity for Azure HPC in energy -description: Learn about network topology and connectivity considerations in the Azure landing zone for high-performance computing (HPC) in the energy industry. 
-author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/11/2022 ---- - -# Network topology and connectivity for Azure HPC in energy - -The guidance in this article can help you examine design considerations and best practices that relate to networking and connectivity for Microsoft Azure and high-performance computing (HPC) deployments. The following suggestions build on considerations and recommendations defined in the Azure landing zone article for [network topology and connectivity](../../../ready/landing-zone/design-area/network-topology-and-connectivity.md). - -## IP addressing, virtual networks, and subnets - -It's vital to plan for IP addressing in Azure to ensure that: - -- The IP address space doesn't overlap across on-premises locations and Azure regions. -- Future virtual network (VNet) peering to existing or planned VNets is possible. -- The VNet contains the right address space. -- Proper planning for subnet configuration happens in advance. -- Sufficient excess addressing is considered for future expansion or other services. - -## Design considerations - -Consider creating separate subnets to assign IP addresses across functional components of the environment. For example, a dedicated HPC VNet could include the following subnets: -- Compute -- Storage -- Infrastructure -- Visualization -- Sign in -- Azure NetApp Files -- Azure HPC Cache - -Services like Azure NetApp Files, Azure HPC Cache, and future storage offerings require dedicated delegated subnets for proper operation. Ensure that appropriate addressing space is planned if any of these services are under consideration. - -## DNS and name resolution for on-premises and Azure resources - -Domain name system (DNS) is a critical design topic in the overall Azure landing zone architecture. Some organizations might want to use their existing investments in DNS, while others might see cloud adoption as an opportunity to modernize their internal DNS infrastructure and use native Azure capabilities. - -**DNS design considerations**: Follow these recommendations when a virtual machine's DNS or virtual name doesn't change during migration. - -- Background DNS and virtual names connect many system interfaces in HPC environments, and customers are only sometimes aware of the interfaces that developers define over time. Connection challenges arise between various systems when virtual or DNS names change after migrations, so you should retain DNS aliases to prevent these types of difficulties. -- Use different DNS zones to distinguish environments from each other, such as sandbox, development, preproduction, and production. The exception is for HPC deployments with their own VNet, which might not require private DNS zones. -- DNS support is mandatory while using HPC cache so they can access storage and other resources. - -## High-performance network services - -- **Accelerated networking**: Many HPC workloads, such as seismic processing, process large amounts of data stored in shared file systems like Azure Blob, Azure NetApp Files, Lustre ClusterStor, and other custom storage solutions that are accessed through the network. A high-performance network is critical to reduce the time for data transfers. - - [Accelerated networking](/azure/virtual-network/accelerated-networking-overview) provides a high-throughput, low-latency connection between the VMs and to Azure services. Other benefits include reduced jitter and minimal CPU utilization. 
- -- **InfiniBand**: Parallel HPC applications that rely on Message Passing Interface (MPI) libraries might need to transfer significant amounts of data between many VMs. The InfiniBand interconnect, available on RDMA capable [H-series](/azure/virtual-machines/sizes-hpc) and [N-series](/azure/virtual-machines/sizes-gpu) VMs, provides a low-latency, high-bandwidth connection to maximize the performance and scalability of HPC and deep learning applications. - - :::image type="content" alt-text="Diagram of InfiniBand connection between VMs." source="../media/infiniband.png" lightbox="../media/infiniband.png"::: - - Some examples of MPI jobs include molecular dynamics, computational fluid dynamics, oil and gas reservoir simulation, and emerging distributed machine learning workloads. - - InfiniBand connections are possible only between VMs allocated within the same [placement group](/azure/virtual-machine-scale-sets/virtual-machine-scale-sets-placement-groups). - -- **Azure ExpressRoute**: In the case of a burst application like a hybrid setup for reservoir simulation and modeling, where on-premises datasets are shared and the Azure compute becomes an extension, ExpressRoute connects your on-premises environment to the Microsoft cloud over a private connection. ExpressRoute provides enterprise-grade resiliency and availability, and the advantage of a Global ExpressRoute partner ecosystem. For information on how to connect your network to Microsoft using ExpressRoute, see [ExpressRoute connectivity models](/azure/expressroute/expressroute-connectivity-models). - - ExpressRoute connections don't go over the public internet, and they offer more reliability, faster speeds, and lower latency than typical internet connections. For point-to-site VPN and site-to-site VPN, you can connect on-premises devices or networks to a virtual network using any combination of these VPN options and Azure ExpressRoute. - -## Defining an Azure network topology - -Enterprise-scale landing zones support two network topologies: one based on Azure Virtual WAN, and the other on a traditional network topology based on hub-and-spoke architecture. This section recommends HPC configurations and practices for both deployment models. - -- **Azure Virtual WAN**: Use a network topology based on a virtual WAN if your organization plans to: - - Deploy resources across several Azure regions and connect your global locations to both Azure and on-premises. - - Fully integrate software-defined WAN deployments with Azure. - - Deploy up to 2,000 VM workloads across all VNets connected to one virtual WAN hub. - - Organizations use Azure Virtual WAN to meet large-scale interconnectivity requirements. Microsoft manages this service, which helps to reduce overall network complexity and modernize your organization's network. - -- **Hub-and-spoke architecture**: Use a traditional Azure network topology based on [hub-and-spoke architecture](/azure/architecture/reference-architectures/hybrid-networking/hub-spoke?tabs=cli) if your organization: - - Plans to deploy resources in only select Azure regions. - - Doesn't need a global, interconnected network. - - Has few remote or branch locations per region and needs fewer than 30 IP security (IPsec) tunnels. - - Requires full control and granularity to manually configure your Azure network. - - Local and global VNet peering provide connectivity and are the preferred approaches to ensure connectivity between landing zones for HPC deployments across multiple Azure regions. 
-
-## Inbound and outbound internet connectivity
-
-Because Azure-native network security services like Azure Firewall, Azure Web Application Firewall on Application Gateway, and Azure Front Door are fully managed services, you don't incur the operational and management costs associated with infrastructure deployments, which can become complex at scale.
-
-Design recommendations for HPC implementation:
-
-- For customers with a global footprint, Azure Front Door helps HPC deployments by using Azure Web Application Firewall policies to deliver and protect global HTTP/S applications across Azure regions.
-- Take advantage of Web Application Firewall policies in Azure Front Door when you're using this service and Application Gateway to protect HTTP/S applications. Lock down Application Gateway to receive traffic only from Azure Front Door.
-
-## Network encryption requirements
-
-Design considerations for HPC implementations:
-
-- Traffic isn't currently encrypted when Azure ExpressRoute is used to configure private peering.
-- Traffic over ExpressRoute for HPC deployments doesn't need to be encrypted. IPsec tunnels encrypt internet traffic by default, and encryption or decryption could negatively affect the traffic's performance.
-
-Key recommendations for encrypting networks between on-premises and Azure, and across Azure regions:
-
-- Determine whether HPC traffic should be encrypted. Explore network topology and connectivity to understand network encryption options in enterprise-scale landing zones.
-- Plan for IP addressing in Azure to ensure that:
-  - The IP address space doesn't overlap across on-premises locations and Azure regions.
-  - The VNet contains the right address space.
-  - Proper planning for subnet configuration happens in advance.
-
-## Throughput, latency, and bandwidth network requirements
-
-Both cloud-only and hybrid HPC deployment models have their own latency and throughput requirements, depending on how the energy workloads are submitted and executed in on-premises environments compared to cloud environments. Users can submit HPC jobs in many deployment modes, from on-premises or in the cloud:
-
-- Single jobs
-  - On-premises to Azure connectivity considerations if a remote visualization desktop is used
-- Burst jobs
-  - Network considerations for scheduler setups that submit jobs in the cloud
-  - Azure Batch network considerations
-- Parallel workflows, both on-premises and in the cloud
-- Hybrid
-  - Azure HPC Cache
-- Cloud native
-  - Azure Kubernetes Service containers
-  - Azure Functions
-
-MPI environments are dedicated because they require low-latency communication between nodes. The nodes are connected via a high-speed interconnect and can't be shared with other workloads. MPI applications use the entire high-performance interconnect through pass-through mode in virtualized environments. Storage for MPI nodes is usually a parallel file system like Lustre, also accessed via the high-speed interconnect.
-
-## Next steps
-
-The following articles provide guidance for each step in the cloud adoption journey for energy HPC environments.
- -- [Azure Billing and Microsoft Entra tenants for energy HPC](./azure-billing-active-directory-tenant.md) -- [Identity and access management for Azure HPC in energy](./identity-access-management.md) -- [Management for Azure HPC in energy](./management.md) -- [Platform automation and DevOps for Azure HPC in energy](./platform-automation-devops.md) -- [Resource organization for HPC in the energy industry](./resource-organization.md) -- [Governance for HPC in energy industries](./security-governance-compliance.md) -- [Security for Azure HPC in energy](./security.md) -- [Compute large-scale HPC application workloads in Azure VMs](./compute.md) -- [Storage for HPC energy environments](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/energy/platform-automation-devops.md b/docs/scenarios/azure-hpc/energy/platform-automation-devops.md deleted file mode 100644 index 500cf58fb0..0000000000 --- a/docs/scenarios/azure-hpc/energy/platform-automation-devops.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -title: Platform automation and DevOps for Azure HPC in energy -description: Learn about platform automation considerations in the Azure landing zone for high-performance computing (HPC) in the energy industry. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/10/2022 ---- - -# Platform automation and DevOps for Azure HPC in energy - -The Azure landing zone for high-performance computing (HPC) doesn't have any specific considerations or recommendations that impact platform automation or DevOps-related decisions in the energy industry. - -## General recommendations - -It's important to understand any decisions that were previously made by the cloud platform team regarding [platform automation and DevOps](../../../ready/landing-zone/design-area/platform-automation-devops.md). - -## Next steps - -The following articles provide guidance for specific steps in the cloud adoption journey for energy HPC environments. - -- [Azure Billing and Microsoft Entra tenants for energy HPC](./azure-billing-active-directory-tenant.md) -- [Identity and access management for Azure HPC in energy](./identity-access-management.md) -- [Management for Azure HPC in energy](./management.md) -- [Network topology and connectivity for Azure HPC in energy](./network-topology-connectivity.md) -- [Resource organization for HPC in the energy industry](./resource-organization.md) -- [Governance for HPC in energy industries](./security-governance-compliance.md) -- [Security for Azure HPC in energy](./security.md) -- [Compute large-scale HPC application workloads in Azure VMs](./compute.md) -- [Storage for HPC energy environments](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/energy/resource-organization.md b/docs/scenarios/azure-hpc/energy/resource-organization.md deleted file mode 100644 index 58a4df3c4a..0000000000 --- a/docs/scenarios/azure-hpc/energy/resource-organization.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -title: Resource organization for Azure HPC in the energy industry -description: This article provides naming and tagging recommendations to help you align HPC implementations in the energy industry with Cloud Adoption Framework methodologies. 
-author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/14/2022 ---- - -# Resource organization for HPC in the energy industry - -To align with the [Ready methodology](../../../ready/index.md) of the Cloud Adoption Framework for Azure, implement a naming and tagging strategy that includes business and operational details as components of resource names and metadata tags. - -## Use Azure resource naming and tagging conventions - -The business side of this strategy ensures that resource names and tags include the organizational information that you need to identify the associated teams. Include the business owners who are responsible for resource costs. The operational side ensures that names and tags include information that IT teams can use to identify the workload, application, environment, criticality, and other information that's useful for managing resources. - -Resources to name include VMs, load balancers, DNS labels, availability sets, virtual networks, subnets, Azure ExpressRoute, NSGs, application security groups, tags, route tables, managed disks, and public IPs. For example, you could label all development VMs with the tag `Dev`. Doing so makes it easier to pull billing reports and other reports for development VMs. For more information, see [Develop your naming and tagging strategy for Azure resources](../../../ready/azure-best-practices/naming-and-tagging.md). - -## Next steps - -The following articles provide guidance that you might find helpful at various points during your cloud adoption process. They can help you succeed in your cloud adoption scenario for HPC in the energy industry. - -- [Azure Billing and Microsoft Entra tenants for energy HPC](./azure-billing-active-directory-tenant.md) -- [Identity and access management for Azure HPC in energy](./identity-access-management.md) -- [Management for Azure HPC in energy](./management.md) -- [Network topology and connectivity for Azure HPC in energy](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in energy](./platform-automation-devops.md) -- [Governance for HPC in energy industries](./security-governance-compliance.md) -- [Security for Azure HPC in energy](./security.md) -- [Compute large-scale HPC application workloads in Azure VMs](./compute.md) -- [Storage for HPC energy environments](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/energy/security-governance-compliance.md b/docs/scenarios/azure-hpc/energy/security-governance-compliance.md deleted file mode 100644 index 65dd604a0a..0000000000 --- a/docs/scenarios/azure-hpc/energy/security-governance-compliance.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -title: Governance for Azure HPC in energy industries -description: Azure landing zones for HPC in energy have no specific considerations or recommendations that affect decisions about governance. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/23/2022 ---- - -# Governance for HPC in energy industries - -Azure landing zones for high-performance computing (HPC) in energy industries don't have any specific considerations or recommendations that affect decisions about governance. - -However, it might be important to understand any decisions that your cloud platform team has made so that you're aware of existing recommendations. 
For more information, see [Design area: Azure governance](../../../ready/landing-zone/design-area/governance.md). - -## Next steps - -The following articles provide guidance for specific points in cloud adoption of HPC in energy industries. - -- [Azure Billing and Microsoft Entra tenants for energy HPC](./azure-billing-active-directory-tenant.md) -- [Identity and access management for Azure HPC in energy](./identity-access-management.md) -- [Management for Azure HPC in energy](./management.md) -- [Network topology and connectivity for Azure HPC in energy](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in energy](./platform-automation-devops.md) -- [Resource organization for HPC in the energy industry](./resource-organization.md) -- [Security for Azure HPC in energy](./security.md) -- [Compute large-scale HPC application workloads in Azure VMs](./compute.md) -- [Storage for HPC energy environments](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/energy/security.md b/docs/scenarios/azure-hpc/energy/security.md deleted file mode 100644 index 75aa7a2be0..0000000000 --- a/docs/scenarios/azure-hpc/energy/security.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: Security for Azure HPC in energy -description: Learn about security considerations in the Azure landing zone for high-performance computing (HPC) in the energy sector. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/15/2022 ---- - -# Security for Azure HPC in energy - -The Azure landing zone for high-performance computing (HPC) doesn't have any specific recommendations that affect security-related decisions in the energy industry. - -However, it might be important to understand any decisions that were previously made by the cloud platform team regarding security recommendations. - -## Design considerations - -Security rules and policies can be defined and applied based on environment, workflow, virtual machine (VM), physical server, and operator, including: -- Actions controlled through user permissions and logged for audit reporting. For example, root access privileges are only granted as needed and are based on the specified VMs, preventing compromise of other HPC workflows. -- Isolated workflows where sensitive data can't be shared with other HPC environments, workflows, or users running on the same underlying hardware. - -For CycleCloud deployments, extra security practices should be followed: -- CycleCloud should be installed on a drive with only admin-group access, which prevents non-admin users from accessing non-encrypted data. Non-admin users shouldn't be included in this group. Ideally, access to the CycleCloud installation should be limited to only administrators. -- CycleCloud installations shouldn't be shared across trust boundaries. -- The role-based access controls within a single CycleCloud installation might not be sufficient in a true multi-tenant environment. Use separate and isolated CycleCloud installations for each tenant with critical data. - -## Next steps - -The following articles provide guidance for each step in the cloud adoption journey for energy HPC environments. 
-
-- [Azure Billing and Microsoft Entra tenants for energy HPC](./azure-billing-active-directory-tenant.md)
-- [Identity and access management for Azure HPC in energy](./identity-access-management.md)
-- [Management for Azure HPC in energy](./management.md)
-- [Network topology and connectivity for Azure HPC in energy](./network-topology-connectivity.md)
-- [Platform automation and DevOps for Azure HPC in energy](./platform-automation-devops.md)
-- [Resource organization for HPC in the energy industry](./resource-organization.md)
-- [Governance for HPC in energy industries](./security-governance-compliance.md)
-- [Compute large-scale HPC application workloads in Azure VMs](./compute.md)
-- [Storage for HPC energy environments](./storage.md)
-- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md)
diff --git a/docs/scenarios/azure-hpc/energy/storage.md b/docs/scenarios/azure-hpc/energy/storage.md
deleted file mode 100644
index d1ec855960..0000000000
--- a/docs/scenarios/azure-hpc/energy/storage.md
+++ /dev/null
@@ -1,98 +0,0 @@
----
-title: Storage for Azure HPC in the energy industry
-description: 'Large-scale HPC energy workloads have demands for data storage and access that exceed the capabilities of traditional cloud file systems.'
-author: Rajani-Janaki-Ram
-ms.author: rajanaki
-ms.topic: conceptual
-ms.custom: think-tank
-ms.date: 09/23/2022
----
-
-# Storage for HPC energy environments
-
-Large-scale HPC workloads have demands for data storage and access that exceed the capabilities of traditional cloud file systems.
-
-Consider and identify the following factors about your application requirements to decide on which storage solution to use:
-
-- Latency
-- IOPS
-- Throughput
-- File sizes and count
-- Job run time
-- Associated cost
-- Affinity for storage location: on-premises versus Azure
-
-For a deeper understanding of the factors that influence HPC storage selection in Azure, see "Understand factors that influence HPC storage selection in Azure."
-
-The following decision tree guides the choice of a specific HPC storage system.
-
-:::image type="content" alt-text="Diagram showing a decision tree of considerations when choosing a storage solution." source="../media/storage-selection-flow.png" lightbox="../media/storage-selection-flow.png":::
-
-## HPC design considerations
-
-Oil and gas companies must be able to effectively manage and store exabytes of seismic data, well data, maps, leases, and more. To put this data to use, they need a high-performance infrastructure that can process and deliver real-time analytics to help optimize production, reduce environmental risks, and enhance operational safety.
-
-[Data storage](/azure/architecture/topics/high-performance-computing#storage) and access needs vary widely, depending on workload scale. Azure supports several approaches for managing the speed and capacity of HPC applications.
-
-Large-scale Batch and HPC workloads in the energy industry have demands for data storage and access that exceed the capabilities of traditional cloud file systems. The high-performance I/O requirements and massive scalability needs of [high-performance computing (HPC)](https://azure.microsoft.com/solutions/high-performance-computing/) introduce unique challenges for data storage and access.
-
-HPC is used to solve complex problems, such as seismic and reservoir simulation and modeling, that aren't practical or cost-effective to handle with traditional computing techniques. It does this through a combination of parallel processing and massive scalability to perform large and complicated computing tasks quickly, efficiently, and reliably.
-
-Additionally, in Azure HPC clusters, compute nodes are virtual machines that can be spun up as needed to perform whatever jobs the cluster has been assigned. These nodes spread computation tasks across the cluster to achieve the high-performance parallel processing needed to solve the complex problems HPC is applied to. Compute nodes need to perform read/write operations on shared working storage while executing jobs. The way nodes access this storage falls on a continuum between these two scenarios:
-
-- **One set of data to many compute nodes**: In this scenario, there's a single data source on the network that all the compute nodes access for working data. Although structurally simple, all I/O operations are limited by the I/O capacity of the storage location.
-- **Many sets of data to many compute nodes**: In this scenario, the working data is spread across multiple data sources, and each compute node (or set of nodes) accesses its own set of data. Aggregate I/O capacity scales with the number of storage locations, but distributing the data and coordinating the jobs adds complexity.
-
-## HPC design recommendations
-
-Choose the solution best suited to your unique I/O and capacity requirements.
-
-### Network file system (NFS)
-
-NFS is commonly used to provide access to shared storage locations. With NFS, a server VM shares out its local file system, which in the case of Azure is stored on one or more virtual hard disks (VHD) hosted in Azure Storage. Clients can then mount the server's shared files and access the shared location directly.
-
-NFS is often used for home directories and project space mounted across all nodes. It can also provide a space for research groups that share data. In general, the throughput workloads are horizontally scalable with little dependency between individual tasks. Job schedulers divvy up the work across nodes and coordinate the activity. NFS is the typical shared storage across the nodes, accessed via TCP/IP networks.
-
-NFS has the advantage of being easy to set up and maintain and is supported on both Linux and Windows operating systems. Multiple NFS servers can be used to spread storage across a network, but individual files are only accessible through a single server.
-
-For low-scale workloads, consider running NFS on the head node by using a [storage-optimized VM](/azure/virtual-machines/sizes-storage) with large ephemeral disks, or D-series VMs with Azure Premium Storage, depending on your requirements. This solution suits workloads with 500 cores or fewer.
-
-In HPC scenarios, the file server can often be a bottleneck that throttles overall performance. Attempts to access uncached data from a single NFS server at rates higher than the documented per-VM maximum IOPS and throughput will result in throttling.
-
-In a scenario where dozens of clients attempt to work on data stored on a single NFS server, these limits can easily be reached, causing your entire application's performance to suffer. The closer your HPC application is to a pure one-to-many scenario, the sooner you run up against these limitations.
-
-### Parallel file systems on Azure
-
-Parallel file systems distribute block-level storage across multiple networked storage nodes. File data is spread among these nodes, which means that it's spread among multiple storage devices. This arrangement pools individual storage I/O requests across multiple storage nodes that are accessible through a common namespace.
-
-Multiple storage devices and multiple paths to data are used to provide a high degree of parallelism, which reduces the bottlenecks imposed by accessing only a single node at a time. However, parallel I/O can be difficult to coordinate and optimize when working directly at the level of the API or POSIX I/O interface. By introducing intermediate data access and coordination layers, parallel file systems provide application developers with a high-level interface between the application layer and the I/O layer.
-
-Energy MPI workloads have unique requirements, including the need for low-latency communication between nodes. The nodes are connected via a high-speed interconnect and aren't amenable to sharing with other workloads. MPI applications use the entire high-performance interconnect through pass-through mode in virtualized environments. Storage for MPI nodes is usually a parallel file system like Lustre, also accessed via the high-speed interconnect. Lustre or BeeGFS is typically used to handle the large throughput requirements of primarily seismic processing, but also reservoir simulation.
-
-Parallel file systems such as Lustre are used for HPC energy workloads that require access to large files, simultaneous access from multiple compute nodes, and massive amounts of data. The implementation of parallel file systems makes it easy to scale in terms of capability and performance. Such file systems take advantage of RDMA transfers with large bandwidth and reduced CPU usage. The parallel file system is usually used as scratch space and is intended for work that requires optimized I/O. Examples include workload setup, pre-processing, running, and post-processing.
-
-An orchestrated parallel file service, such as Azure Managed Lustre, works for 50,000 or more cores, with read/write rates up to 500 GB/s, and up to 12.5 PiB of storage upon request.
-
-For more information about parallel virtual file systems on Azure, see [Parallel Virtual File Systems on Microsoft Azure - Part 1: Overview - Microsoft Tech Community](https://techcommunity.microsoft.com/t5/azure-global/parallel-virtual-file-systems-on-microsoft-azure-part-1-overview/ba-p/306487).
-
-- Azure NetApp Files and local disks are typically used to handle the more latency-sensitive and IOPS-sensitive workloads, like seismic interpretation, model preparation, and visualization. Consider using them for workloads of up to 4,000 cores, with throughput up to 6.5 GiB/s, and for workloads that benefit from or require multiprotocol (NFS/SMB) access to the same data set.
-- Azure Managed Lustre provides faster and higher-capacity storage for HPC workloads. This solution works for medium to very large workloads and can support 50,000 or more cores, with throughput up to 500 GB/s and storage capacity up to 12.5 PiB upon request.
-- Standard or Premium Azure Blob Storage is cost effective because it's the lowest-cost cloud offering. This service provides exabyte-scale, high-throughput, low-latency access where necessary, a familiar file system, and multiprotocol access (REST, HDFS, NFS). You can use NFS v3.0 at the blob service endpoint for high-throughput, read-heavy workloads. You can optimize costs by moving data to cooler tiers, with the ability to perform lifecycle management based on last update or last access time, and intelligent tiering with customizable policies, as shown in the sketch that follows.
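-
-  As a minimal sketch of such a policy, the following Python example uses the `azure-mgmt-storage` SDK to tier block blobs to cool after 90 days without access. The subscription ID, resource group, and account names are placeholder assumptions, and last-access-time tracking must already be enabled on the account.
-
-  ```python
-  # Hypothetical sketch: lifecycle rule that cools blobs unused for 90 days.
-  from azure.identity import DefaultAzureCredential
-  from azure.mgmt.storage import StorageManagementClient
-
-  storage_client = StorageManagementClient(
-      DefaultAzureCredential(), "<subscription-id>"
-  )
-
-  storage_client.management_policies.create_or_update(
-      "hpc-rg", "hpcenergydata", "default",
-      {
-          "policy": {
-              "rules": [{
-                  "enabled": True,
-                  "name": "cool-stale-results",
-                  "type": "Lifecycle",
-                  "definition": {
-                      "filters": {"blob_types": ["blockBlob"]},
-                      "actions": {"base_blob": {"tier_to_cool": {
-                          "days_after_last_access_time_greater_than": 90
-                      }}},
-                  },
-              }]
-          }
-      },
-  )
-  ```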
-
-- Oil and gas energy workloads might also require the transfer of large data sizes and volumes between on-premises systems and the cloud, which you can achieve in the following ways:
-  - Offline: device-based migration, by using Azure Data Box.
-  - Online: migration over the network, by using Azure ExpressRoute.
-
-## Next steps
-
-The following articles provide guidance for specific points throughout the cloud adoption journey, to help you succeed in the cloud adoption scenario for energy HPC environments.
-
-- [Azure Billing and Microsoft Entra tenants for energy HPC](./azure-billing-active-directory-tenant.md)
-- [Identity and access management for Azure HPC in energy](./identity-access-management.md)
-- [Management for Azure HPC in energy](./management.md)
-- [Network topology and connectivity for Azure HPC in energy](./network-topology-connectivity.md)
-- [Platform automation and DevOps for Azure HPC in energy](./platform-automation-devops.md)
-- [Resource organization for HPC in the energy industry](./resource-organization.md)
-- [Governance for HPC in energy industries](./security-governance-compliance.md)
-- [Security for Azure HPC in energy](./security.md)
-- [Compute large-scale HPC application workloads in Azure VMs](./compute.md)
-- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md)
diff --git a/docs/scenarios/azure-hpc/finance/azure-billing-active-directory-tenant.md b/docs/scenarios/azure-hpc/finance/azure-billing-active-directory-tenant.md
deleted file mode 100644
index be73f3373f..0000000000
--- a/docs/scenarios/azure-hpc/finance/azure-billing-active-directory-tenant.md
+++ /dev/null
@@ -1,31 +0,0 @@
----
-title: Azure billing offers and Active Directory tenants for finance HPC
-description: Azure landing zones for HPC in finance don't have any specific considerations or recommendations that affect billing offer decisions, enterprise enrollment, or Active Directory tenant decisions.
-author: Rajani-Janaki-Ram
-ms.author: rajanaki
-ms.topic: conceptual
-ms.custom: think-tank
-ms.date: 11/08/2022
----
-
-# Azure billing offers and Active Directory tenants for finance HPC
-
-Azure landing zones for HPC in finance don't have any specific considerations or recommendations that affect billing offer decisions, enterprise enrollment, or Active Directory tenant decisions.
-
-## Previous recommendations
-
-Keep in mind that it could be important to understand any decisions made previously by the cloud platform team so that you know about existing [enterprise enrollment or Microsoft Entra tenant decisions](../../../ready/landing-zone/design-area/azure-billing-microsoft-entra-tenant.md).
-
-## Next steps
-
-The following articles provide guidance that you might find helpful at various points throughout the cloud adoption process. They can help you succeed in your cloud adoption scenario for finance HPC environments.
- -- [Finance HPC Azure identity and access management](./identity-access-management.md) -- [Management for HPC in the finance sector](./management.md) -- [Network topology and connectivity for HPC in the finance sector](./network-topology-connectivity.md) -- [Platform automation and DevOps for HPC in the finance sector](./platform-automation-devops.md) -- [Resource organization for Azure HPC in the finance sector](./resource-organization.md) -- [Governance for finance HPC](./security-governance-compliance.md) -- [Security for HPC in the finance sector](./security.md) -- [Storage for HPC in the finance sector](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) \ No newline at end of file diff --git a/docs/scenarios/azure-hpc/finance/compute.md b/docs/scenarios/azure-hpc/finance/compute.md deleted file mode 100644 index d32120a69f..0000000000 --- a/docs/scenarios/azure-hpc/finance/compute.md +++ /dev/null @@ -1,72 +0,0 @@ ---- -title: Large-scale finance HPC workloads on Azure VMs -description: See the optimized virtual machine (VM) sizes that Azure offers for CPU-intensive and GPU-intensive finance workloads, including InfiniBand-enabled hardware. -author: bsantacruz -ms.author: bsantacruz -ms.topic: conceptual -ms.custom: think-tank -ms.date: 10/17/2023 ---- - -# Run large-scale HPC application workloads for the finance industry on Azure virtual machines - -High-performance computing (HPC) workloads, also known as big compute applications, require many cores. These workloads can include tasks such as financial risk modeling and engineering stress analysis. - -Big compute applications typically have the following characteristics: - -- The workload can be divided into discrete tasks that can be run simultaneously across many cores. -- Each task takes input, processes it, and produces output. The entire application runs for a finite amount of time. -- The application doesn't need to run constantly, but it must be able to handle node failures and crashes. -- Tasks can be independent or tightly coupled, requiring high-speed networking technologies like InfiniBand and remote direct memory access (RDMA) connectivity. -- Compute-intensive virtual machine (VM) sizes such as H16r, H16mr, and A9 can be used. The selection depends on the workload. - -Azure offers a range of VM instances that are optimized for CPU-intensive and GPU-intensive workloads. These VMs can run in Azure Virtual Machine Scale Sets to provide resiliency and load balancing. Azure is also the only cloud platform that offers InfiniBand-enabled hardware. InfiniBand provides a significant performance advantage for tasks such as financial risk modeling and engineering stress analysis. This advantage results in performance that approaches or exceeds current on-premises infrastructure performance. - -Azure VMs offer various options, known as VM sizes, for HPC and GPU-optimized computing. It's important to select a VM size that's appropriate for your workload. To find the size that's the best fit, see [Sizes for virtual machines in Azure](/azure/virtual-machines/sizes). Also see the selector tool in [Virtual machines selector](https://azure.microsoft.com/pricing/vm-selector/). - -Keep in mind that not all Azure products are available in all regions. To see what's available in your area, see [Products available by region](https://azure.microsoft.com/explore/global-infrastructure/products-by-region/). 
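-
-Region availability also applies to individual VM sizes. As a rough illustration, the following Python sketch uses the `azure-mgmt-compute` SDK to list the H-series and N-series sizes that your subscription can see in one region; the subscription ID and region are placeholder assumptions.
-
-```python
-# Hypothetical sketch: list HPC-oriented VM sizes available in one region.
-from azure.identity import DefaultAzureCredential
-from azure.mgmt.compute import ComputeManagementClient
-
-compute_client = ComputeManagementClient(
-    DefaultAzureCredential(), "<subscription-id>"
-)
-
-for size in compute_client.virtual_machine_sizes.list(location="eastus"):
-    # Filter to the H-series (CPU) and N-series (GPU) families.
-    if size.name.startswith(("Standard_H", "Standard_N")):
-        print(f"{size.name}: {size.number_of_cores} cores, "
-              f"{size.memory_in_mb / 1024:.0f} GiB RAM")
-```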
-
-For best practices for choosing Azure compute options, see the [Azure compute blog](https://techcommunity.microsoft.com/t5/azure-compute-blog/bg-p/AzureCompute) or the [Azure compute service decision tree](/azure/architecture/guide/technology-choices/compute-decision-tree).
-
-Azure offers both CPU-based and GPU-enabled VMs. The N-series VMs feature NVIDIA GPUs that are designed for compute-intensive or graphics-intensive applications such as AI, deep learning, and visualization.
-
-HPC SKUs are designed for high-performance scenarios. But other SKUs, such as the E and F series, are also suitable for certain workloads.
-
-## Design considerations
-
-When you design your HPC infrastructure, several tools and services are available to help you manage and schedule your workloads.
-
-[Azure CycleCloud](/azure/cyclecloud/) is a tool for creating, managing, operating, and optimizing HPC and big compute clusters in Azure. With Azure CycleCloud, users can dynamically configure HPC Azure clusters and orchestrate data and jobs for hybrid and cloud workflows. Azure CycleCloud provides the simplest way to use a workload manager to manage HPC workloads in Azure. Azure CycleCloud supports workload managers such as Grid Engine, Microsoft HPC Pack, HTCondor, LSF, PBS Pro, Slurm, and Symphony.
-
-### Finance architecture design example
-
-The following architecture provides an example of using VMs in HPC for finance workloads.
-
-:::image type="content" alt-text="Architecture diagram that shows a finance HPC workload that uses HPC Pack HB-series VMs." source="./media/hpc-finance-architecture-example.svg" lightbox="./media/hpc-finance-architecture-example.svg":::
-
-This workload uses HPC Pack HB-series compute nodes.
-
-The [HB-series VMs](/azure/virtual-machines/hb-series) are optimized for HPC applications, such as financial analysis, weather simulation, and silicon register-transfer level (RTL) modeling. HB VMs feature up to 120 AMD EPYC™ 7003-series CPU cores, 448 GB of RAM, and no hyperthreading. HB-series VMs also provide 350 GB/sec of memory bandwidth, up to 32 MB of L3 cache per core, up to 7 GB/s of block device solid-state drive (SSD) performance, and clock frequencies of up to 3.675 GHz.
-
-For the HPC head node, the workload uses a different-sized VM. Specifically, it uses a D16s_v4 VM, a general-purpose SKU.
-
-For reference architectures and cookbooks for deploying HPC independent software vendor (ISV) applications that support use cases in the finance sector, see the following resources:
-
-- [Virtual machine series](https://azure.microsoft.com/pricing/details/virtual-machines/series/).
-- [Azure HPC certification.github.io](https://github.com/AzureHPC-Certification/AzureHPC-Certification.github.io/).
-- [Microsoft Azure HPC OnDemand Platform](https://techcommunity.microsoft.com/t5/azure-global/azure-hpc-ondemand-platform-cloud-hpc-made-easy/ba-p/2537338). This standalone reference architecture might not be compliant with the Azure landing zone paradigm.
-
-## Next steps
-
-The following articles provide guidance for various stages of the cloud adoption process. These resources can help you succeed in adopting finance sector HPC environments for the cloud.
- -- [Azure billing offers and Active Directory tenants](./azure-billing-active-directory-tenant.md) -- [Identity and access management](./identity-access-management.md) -- [Management](./management.md) -- [Platform automation and DevOps](./platform-automation-devops.md) -- [Resource organization](./resource-organization.md) -- [Governance](./security-governance-compliance.md) -- [Security](./security.md) -- [Storage](./storage.md) -- [HPC landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) -- [Spot virtual machines](/azure/architecture/guide/spot/spot-eviction) diff --git a/docs/scenarios/azure-hpc/finance/identity-access-management.md b/docs/scenarios/azure-hpc/finance/identity-access-management.md deleted file mode 100644 index ffc2a2f00b..0000000000 --- a/docs/scenarios/azure-hpc/finance/identity-access-management.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -title: 'Finance HPC Azure identity and access management' -description: Use identity and access management design considerations and recommendations to deploy an HPC application on Microsoft Azure for the financial industry. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/11/2022 ---- - -# Finance HPC Azure identity and access management - -This article builds on considerations and recommendations that are defined in the Azure landing zone article [Azure landing zone design area for identity and access management](../../../ready/landing-zone/design-area/identity-access.md). Following the guidance in this article helps you use the identity and access management design considerations and recommendations to deploy a high-performance computing (HPC) application on Microsoft Azure for the financial industry. - -## Design considerations - -Keep the following design considerations in mind when you deploy your HPC application: - -- Determine the Azure resource administration that's required by various members of the team. Consider providing those team members with elevated Azure resource administration access in a non-production environment. - - For example, give them a [Virtual Machine Contributor](/azure/role-based-access-control/built-in-roles#virtual-machine-contributor) role. - - You can also give team members partially elevated administration access like a partial Virtual Machine Contributor role in a production environment. Both options achieve a good balance between separation of duties and operational efficiency. -- Review the Azure administration and management activities that you require your teams to do. Consider your HPC on Azure landscape. Determine the best possible distribution of responsibilities within your organization. - - Here are the common Azure activities for administration and management. - - | Azure resource | Azure resource provider | Activities | - |--|--|--| - | Virtual machine (VM) | Microsoft.Compute/virtualMachines | Start, stop, restart, deallocate, deploy, redeploy, change, and resize VMs. Manage extensions, availability sets, and proximity placement groups. | - | VMs | Microsoft.Compute/disks | Read and write to disk. | - | Storage | Microsoft.Storage | Read and make changes on storage accounts, for example, a boot diagnostics storage account. | - | Storage | Microsoft.NetApp | Read and make changes on NetApp capacity pools and volumes. | - | Storage | Microsoft.NetApp | Take Azure NetApp Files snapshots. | - | Storage | Microsoft.NetApp | Use Azure NetApp Files cross-region replication. 
|
- | Networking | Microsoft.Network/networkInterfaces | Read, create, and change network interfaces. |
- | Networking | Microsoft.Network/loadBalancers | Read, create, and change load balancers. |
- | Networking | Microsoft.Network/networkSecurityGroups | Read network security groups. |
- | Networking | Microsoft.Network/azureFirewalls | Read firewalls. |
- | Networking | Microsoft.Network/virtualNetworks | Read, create, and change virtual networks.

Consider the relevant access that's needed for the resource group of the virtual network, and related access if it's different from the resource group of the VMs. |
-
-- Consider the Microsoft service that you use: Azure CycleCloud, Azure Batch, or a hybrid environment with HPC VMs in the cloud.
-
-## Recommendations
-
-- If you use Azure CycleCloud, there are three methods of authentication: a built-in database with encryption, Microsoft Entra ID, or Lightweight Directory Access Protocol (LDAP). For more information, see [User authentication](/azure/cyclecloud/how-to/user-authentication). For more information about service principals in Azure CycleCloud, see [Using service principals](/azure/cyclecloud/how-to/service-principals?view=cyclecloud-8&preserve-view=true).
-- If you use Batch, you can authenticate with Microsoft Entra ID in two different ways: integrated authentication or a service principal. For more information about how to use these different approaches, see [Azure Batch authentication](/azure/batch/batch-aad-auth). If you use the [user subscription mode](/azure/batch/best-practices#pool-configuration-and-naming) and not the Batch service mode, grant access to Batch so that it can access the subscription. For more information, see [Allow Batch to access the subscription](/azure/batch/batch-account-create-portal#allow-azure-batch-to-access-the-subscription-one-time-operation).
-- If you want to extend your on-premises capabilities to a hybrid environment, you can authenticate through Active Directory with a read-only domain controller that's hosted in Azure. This approach minimizes traffic across the link. This integration provides a way for users to use their existing credentials to sign in to services and applications that are connected to the managed domain. You can also use existing groups and user accounts to help secure access to resources. These features provide a smoother lift-and-shift of on-premises resources to Azure.
-
-For more information, see [Design recommendations for platform access](../../../ready/landing-zone/design-area/identity-access-platform-access.md#design-recommendations-for-platform-access) and [Azure identity and access for landing zones](../../../ready/landing-zone/design-area/identity-access-landing-zones.md).
-
-## Next steps
-
-The following articles provide guidance for various stages of the cloud adoption process. These resources can help you succeed in adopting finance sector HPC environments for the cloud.
-
-- [Azure billing offers and Active Directory tenants](./azure-billing-active-directory-tenant.md)
-- [Identity and access management](./identity-access-management.md)
-- [Management](./management.md)
-- [Platform automation and DevOps](./platform-automation-devops.md)
-- [Resource organization](./resource-organization.md)
-- [Governance](./security-governance-compliance.md)
-- [Security](./security.md)
-- [Storage](./storage.md)
-- [HPC landing zone accelerator](../azure-hpc-landing-zone-accelerator.md)
diff --git a/docs/scenarios/azure-hpc/finance/management.md b/docs/scenarios/azure-hpc/finance/management.md
deleted file mode 100644
index bc45dbb2dd..0000000000
--- a/docs/scenarios/azure-hpc/finance/management.md
+++ /dev/null
@@ -1,31 +0,0 @@
----
-title: Management for Azure HPC in the finance sector
-description: Azure landing zones for HPC in finance don't have any specific considerations or recommendations that influence management-related decisions.
-author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/11/2022 ---- - -# Management for HPC in the finance sector - -Azure [landing zones for HPC](../ready.md) in the finance sector don't have any specific considerations or recommendations that influence management-related decisions. Management is one of several design areas considered in this documentation. - -## Previous recommendations - -Keep in mind that it could be important to understand any decisions made previously by the cloud platform team so you know about existing [management recommendations](../../../ready/landing-zone/design-area/management.md). - -## Next steps - -The following articles provide guidance that you might find helpful at various points throughout the cloud adoption process. They can help you succeed in the cloud adoption scenario for HPC environments in the finance sector. - -- [Azure billing offers and Active Directory tenants for finance HPC](./azure-billing-active-directory-tenant.md) -- [Finance HPC Azure identity and access management](./identity-access-management.md) -- [Network topology and connectivity for HPC in the finance sector](./network-topology-connectivity.md) -- [Platform automation and DevOps for HPC in the finance sector](./platform-automation-devops.md) -- [Resource organization for Azure HPC in the finance sector](./resource-organization.md) -- [Governance for finance HPC](./security-governance-compliance.md) -- [Security for HPC in the finance sector](./security.md) -- [Storage for HPC in the finance sector](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) \ No newline at end of file diff --git a/docs/scenarios/azure-hpc/finance/network-topology-connectivity.md b/docs/scenarios/azure-hpc/finance/network-topology-connectivity.md deleted file mode 100644 index 45187a2d4e..0000000000 --- a/docs/scenarios/azure-hpc/finance/network-topology-connectivity.md +++ /dev/null @@ -1,103 +0,0 @@ ---- -title: Network topology and connectivity for Azure HPC in the finance sector -description: This finance HPC guidance builds on considerations and recommendations described in the Azure landing zone article for network topology and connectivity. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/11/2022 ---- - -# Network topology and connectivity for HPC in the finance sector - -This article builds on considerations and recommendations that are described in the Azure landing zone article for [network topology and connectivity](../../../ready/landing-zone/design-area/network-topology-and-connectivity.md). The guidance in this article can help you examine key design considerations and best practices for networking and connectivity to, from, and within Azure and HPC deployments. - -## Plan for IP addressing - -It's important to plan for IP addressing on Azure to ensure that: - - - The IP address space doesn't overlap across on-premises locations and Azure regions. - - The virtual network contains the right address space. - - Proper planning for subnet configuration happens in advance. - -### Design considerations and recommendations - - - Delegated subnets are required if you want to implement Azure NetApp Files, which is used frequently in HPC deployments with shared file systems. 
You can [dedicate](/azure/virtual-network/virtual-network-for-azure-services#services-that-can-be-deployed-into-a-virtual-network) and delegate subnets to certain services and then create instances of those services within subnets. Although Azure lets you create multiple delegated subnets in a virtual network, only one delegated subnet can exist in a virtual network for Azure NetApp Files. Attempts to create a new volume will fail if you use more than one delegated subnet for Azure NetApp Files.
- - You need to create a dedicated subnet if you use Azure HPC Cache for storage. For more information about this subnet prerequisite, see [Cache subnet](/azure/hpc-cache/hpc-cache-prerequisites#cache-subnet). To learn more about how to create a subnet, see [Add a virtual network subnet](/azure/virtual-network/virtual-network-manage-subnet).
-
-## Configure DNS and name resolution for on-premises and Azure resources
-
-Domain Name System (DNS) is a critical design element in the Azure landing zone architecture. Some organizations prefer to use their existing investments in DNS. Others see cloud adoption as an opportunity to modernize their internal DNS infrastructure and use native Azure capabilities.
-
-### Design recommendations
-
-The following recommendations apply to scenarios in which a virtual machine's DNS or virtual name doesn't change during migration.
-
- - Background DNS and virtual names connect many system interfaces in HPC environments. You might not be aware of all the interfaces that developers define over time. Connection challenges arise between various systems when virtual machine or DNS names change after migrations. We recommend that you retain DNS aliases to prevent these difficulties.
 - Use different DNS zones to distinguish the environments (sandbox, development, preproduction, and production) from each other. The exception is for HPC deployments that have their own virtual networks. In these deployments, private DNS zones might not be necessary.
 - DNS support is required when you use HPC Cache. DNS enables it to access storage and other resources.
 - DNS and name resolution are critical in the finance sector when you use resource location and SRV records. We recommend that you use the DNS resolution provided by the Microsoft Entra Domain Services domain controller. For more information, see [Deploy Microsoft Entra Domain Services in an Azure virtual network](/azure/architecture/reference-architectures/identity/adds-extend-domain).
-
-## High-performance network services
-
-### InfiniBand
-
- - If you run financial applications that need low latency between machines, and information must be transferred between nodes to get results, you need low-latency, high-throughput interconnections. [RDMA-capable H-series](/azure/virtual-machines/sizes-hpc#rdma-capable-instances) and [N-series](/azure/virtual-machines/sizes-gpu) VMs communicate over the low-latency and high-bandwidth InfiniBand network. The RDMA network capability over such a connection is critical to boost the scalability and performance of distributed-node HPC and AI workloads. This network can improve the performance of applications that run under Microsoft MPI or Intel MPI. For more information, see [Enable InfiniBand](/azure/virtual-machines/workloads/hpc/enable-infiniband). To learn how to set up MPI, see [Set up Message Passing Interface for HPC](/azure/virtual-machines/workloads/hpc/setup-mpi). A minimal MPI connectivity check follows.
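-
-To sanity-check MPI connectivity and latency between two nodes, a small ping-pong test is a common first step. This sketch uses `mpi4py`, which is an assumption rather than part of the guidance above; run it across two ranks, for example with `mpirun -np 2 python pingpong.py`.
-
-```python
-# Hypothetical sketch: measure average round-trip latency between two ranks.
-from mpi4py import MPI
-import numpy as np
-
-comm = MPI.COMM_WORLD
-rank = comm.Get_rank()
-buf = np.zeros(8, dtype=np.uint8)  # tiny message, so timing is latency-bound
-iterations = 1000
-
-comm.Barrier()  # start both ranks together
-start = MPI.Wtime()
-for _ in range(iterations):
-    if rank == 0:
-        comm.Send(buf, dest=1)
-        comm.Recv(buf, source=1)
-    elif rank == 1:
-        comm.Recv(buf, source=0)
-        comm.Send(buf, dest=0)
-elapsed = MPI.Wtime() - start
-
-if rank == 0:
-    # Each iteration is one full round trip.
-    print(f"average round-trip latency: {elapsed / iterations * 1e6:.1f} us")
-```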
- -### Azure ExpressRoute - - - For hybrid applications like risk grid computing solutions, where your on-premises trading systems and analytics are functional and Azure becomes an extension, you can use ExpressRoute to connect your on-premises environment to Azure over a private connection, with the help of a connectivity provider. ExpressRoute provides enterprise-grade resiliency and availability and the advantage of a global ExpressRoute partner ecosystem. For information on how to connect your network to Azure by using ExpressRoute, see [ExpressRoute connectivity models](/azure/expressroute/expressroute-connectivity-models). - - ExpressRoute connections don't use the public internet, and they provide more reliability, faster speeds, and lower latencies than typical internet connections. For point-to-site VPN and site-to-site VPN, you can connect on-premises devices or networks to a virtual network by using any combination of these VPN options and ExpressRoute. - -## Define an Azure network topology - -Enterprise-scale landing zones support two network topologies: one based on Azure Virtual WAN and the other a traditional network topology that's based on hub-and-spoke architecture. This section provides recommended HPC configurations and practices for both deployment models. - -Use a network topology that's based on Virtual WAN if your organization plans to: - - - Deploy resources across several Azure regions and connect your global locations to both Azure and on-premises. - - Fully integrate software-defined WAN deployments with Azure. - - Deploy as many as 2,000 virtual machine workloads across all virtual networks connected to one Virtual WAN hub. - -Organizations use Virtual WAN to meet large-scale interconnectivity requirements. Microsoft manages this service, which helps you reduce overall network complexity and modernize your organization's network. - -Use a traditional Azure network topology based on [hub-and-spoke architecture](/azure/architecture/reference-architectures/hybrid-networking/hub-spoke?tabs=cli) if your organization: - - - Plans to deploy resources in only select Azure regions. - - Doesn't need a global, interconnected network. - - Has few remote or branch locations per region and needs fewer than 30 IP security (IPsec) tunnels. - - Requires full control and granularity to manually configure your Azure network. - -Document your network topology and firewall rules. Network security groups (NSGs) are often implemented with considerable complexity. Use application security groups when it makes sense to label traffic at a greater granularity than virtual networks can provide. Understand NSG prioritization rules and which rules take precedence over others. - -## Plan for inbound and outbound internet connectivity - -This section describes recommended connectivity models for inbound and outbound connectivity to and from the public internet. Because Azure-native network security services like Azure Firewall, Azure Web Application Firewall on Azure Application Gateway, and Azure Front Door are fully managed services, you don't incur the operational and management costs associated with infrastructure deployments, which can become complex at scale. - -### Design considerations and recommendations - - - If your organization has a global footprint, [Azure Front Door](/azure/frontdoor/front-door-overview) can be helpful in your HPC deployment. 
Azure Front Door uses [Azure Web Application Firewall policies](/azure/web-application-firewall/ag/policy-overview) to deliver and help protect global HTTP(S) applications across Azure regions. - - Take advantage of [Web Application Firewall policies](/azure/web-application-firewall/ag/create-waf-policy-ag) in Azure Front Door when you're using Azure Front Door and Application Gateway to help protect HTTP(S) applications. Lock down Application Gateway to receive traffic only from Azure Front Door. For more information, see [How do I lock down access?](/azure/frontdoor/front-door-faq#how-do-i-lock-down-the-access-to-my-backend-to-only-azure-front-door-). - - Use local and global virtual network peering connectivity. These are the preferred methods for ensuring connectivity between landing zones for HPC deployments across multiple Azure regions. - -## Define network encryption requirements - -This section provides key recommendations for encrypting networks between on-premises environments and Azure, and across Azure regions. - -### Design considerations and recommendations - - - Traffic performance is an important consideration when you enable encryption. IPsec tunnels encrypt internet traffic by default. Any additional encryption or decryption can negatively affect performance. When you use ExpressRoute, traffic isn't encrypted by default. You need to determine whether HPC traffic should be encrypted. Explore [network topology](../../../ready/azure-best-practices/define-an-azure-network-topology.md) and [connectivity](../../../ready/azure-best-practices/connectivity-to-azure.md) to understand network encryption options in enterprise-scale landing zones. - -## Next steps - -The following articles provide guidance that you might find helpful during various stages of the cloud adoption process. They can help you succeed in your cloud adoption scenario for HPC environments in the finance sector. - -- [Azure billing offers and Active Directory tenants for finance HPC](./azure-billing-active-directory-tenant.md) -- [Finance HPC Azure identity and access management](./identity-access-management.md) -- [Management for HPC in the finance sector](./management.md) -- [Platform automation and DevOps for HPC in the finance sector](./platform-automation-devops.md) -- [Resource organization for Azure HPC in the finance sector](./resource-organization.md) -- [Governance for finance HPC](./security-governance-compliance.md) -- [Security for HPC in the finance sector](./security.md) -- [Storage for HPC in the finance sector](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/finance/platform-automation-devops.md b/docs/scenarios/azure-hpc/finance/platform-automation-devops.md deleted file mode 100644 index a1010aa5ed..0000000000 --- a/docs/scenarios/azure-hpc/finance/platform-automation-devops.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -title: Platform automation and DevOps for Azure HPC in the finance sector -description: Azure landing zones for HPC in finance don't have any specific considerations or recommendations that affect platform automation and DevOps decisions. 
-author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/14/2022 ---- - -# Platform automation and DevOps for HPC in the finance sector - - Azure [landing zones for HPC](../ready.md) in the finance sector don't have any specific considerations or recommendations that affect decisions related to platform automation and DevOps. Platform automation and DevOps are two of several design areas considered in this documentation. - -## Previous recommendations - -Keep in mind that it could be important to understand any decisions made previously by the cloud platform team so you know about existing recommendations for [platform automation and DevOps](../../../ready/landing-zone/design-area/platform-automation-devops.md). - -## Next steps - -The following articles provide guidance that you might find helpful at various points during your cloud adoption process. They can help you succeed in the cloud adoption scenario for HPC environments in the finance sector. - -- [Azure billing offers and Active Directory tenants for finance HPC](./azure-billing-active-directory-tenant.md) -- [Finance HPC Azure identity and access management](./identity-access-management.md) -- [Management for HPC in the finance sector](./management.md) -- [Network topology and connectivity for HPC in the finance sector](./network-topology-connectivity.md) -- [Resource organization for Azure HPC in the finance sector](./resource-organization.md) -- [Governance for finance HPC](./security-governance-compliance.md) -- [Security for HPC in the finance sector](./security.md) -- [Storage for HPC in the finance sector](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) \ No newline at end of file diff --git a/docs/scenarios/azure-hpc/finance/security-governance-compliance.md b/docs/scenarios/azure-hpc/finance/security-governance-compliance.md deleted file mode 100644 index 915862d4dd..0000000000 --- a/docs/scenarios/azure-hpc/finance/security-governance-compliance.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -title: Governance for Azure HPC in the finance sector -description: Learn about governance for HPC workloads in the finance sector. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/10/2022 ---- - -# Governance for finance HPC - -Azure landing zones for HPC in finance don't have any specific considerations or recommendations that influence governance-related decisions. - -## Previous recommendations - -It might be important to be aware of existing [governance recommendations](../../../ready/landing-zone/design-area/governance.md). - -## Next steps - -The following articles provide guidance that you might find helpful at various points throughout the cloud adoption process. They can help you succeed in your cloud adoption scenario for finance HPC environments. 
- -- [Azure billing offers and Active Directory tenants for finance HPC](./azure-billing-active-directory-tenant.md) -- [Finance HPC Azure identity and access management](./identity-access-management.md) -- [Management for HPC in the finance sector](./management.md) -- [Network topology and connectivity for HPC in the finance sector](./network-topology-connectivity.md) -- [Platform automation and DevOps for HPC in the finance sector](./platform-automation-devops.md) -- [Resource organization for Azure HPC in the finance sector](./resource-organization.md) -- [Security for HPC in the finance sector](./security.md) -- [Storage for HPC in the finance sector](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/finance/security.md b/docs/scenarios/azure-hpc/finance/security.md deleted file mode 100644 index 836723658f..0000000000 --- a/docs/scenarios/azure-hpc/finance/security.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -title: Security for Azure HPC in the finance sector -description: This article provides recommendations for implementing security in HPC environments for the finance sector. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/15/2022 ---- - -# Security for HPC in the finance sector - -This article provides recommendations for implementing security in HPC environments for the finance sector. - -It's important to understand the previous [security recommendations](../../../ready/landing-zone/design-area/security.md) outlined by the Microsoft cloud platform team. - -## Security baseline recommendations - -We also recommend that you consider security baseline recommendations for various services that are used in HPC environments, including: - - - [Azure HPC Cache](/security/benchmark/azure/baselines/hpc-cache-security-baseline) - - [Azure CycleCloud](/azure/cyclecloud/concepts/security-best-practices?view=cyclecloud-8&preserve-view=true) - - [Azure Batch](/azure/batch/security-controls-policy) - -## Next steps - -The following articles provide guidance that you might find helpful at various points during your cloud adoption process. They can help you succeed in your cloud adoption scenario for HPC in the finance sector. - -- [Azure billing offers and Active Directory tenants for finance HPC](./azure-billing-active-directory-tenant.md) -- [Finance HPC Azure identity and access management](./identity-access-management.md) -- [Management for HPC in the finance sector](./management.md) -- [Network topology and connectivity for HPC in the finance sector](./network-topology-connectivity.md) -- [Platform automation and DevOps for HPC in the finance sector](./platform-automation-devops.md) -- [Resource organization for Azure HPC in the finance sector](./resource-organization.md) -- [Governance for finance HPC](./security-governance-compliance.md) -- [Storage for HPC in the finance sector](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/finance/storage.md b/docs/scenarios/azure-hpc/finance/storage.md deleted file mode 100644 index 56c5c74b37..0000000000 --- a/docs/scenarios/azure-hpc/finance/storage.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: Storage for Azure HPC in the finance sector -description: This article provides recommendations for implementing storage in HPC environments for the finance sector. 
-author: Rajani-Janaki-Ram
-ms.author: rajanaki
-ms.topic: conceptual
-ms.custom: think-tank
-ms.date: 11/15/2022
----
-
-# Storage for HPC in the finance sector
-
-This article provides recommendations for implementing storage in HPC environments for the finance sector. Large-scale HPC workloads in finance environments create demands for data storage and access that exceed the capabilities of traditional cloud file systems.
-
-## Design considerations
-
-To decide which storage solution to use, take into account the following application requirements:
-
- - Latency
- - IOPS
- - Throughput
- - File sizes and file counts
- - Job runtime
- - Associated costs
- - Affinity for storage location: on-premises versus Azure
-
-## Design recommendations
-
-- Use [Standard or Premium Azure Blob Storage](/azure/storage/blobs/storage-blobs-introduction) for high-throughput, low-latency storage. It offers these benefits:
-
-  - It provides exabyte-scale, high-throughput, low-latency access where necessary, a familiar file system, and multi-protocol access (REST, HDFS, NFS).
-  - It's cost effective.
-  - You can mount Blob Storage as a file system by using [BlobFuse](/azure/storage/blobs/storage-how-to-mount-container-linux). Doing so makes it easy to allow multiple nodes to mount the same container for read-only scenarios.
-  - It supports NFS 3.0 at the blob service endpoint for high-throughput, read-heavy workloads.
-  - You can optimize costs by using lifecycle management to move data to cooler tiers, based on last-update or last-access time, with customizable policies.
-
-- Use [Azure NetApp Files](/azure/azure-netapp-files) for ReadWriteMany (unique) or write-once, read-once applications. It provides these benefits:
-
-  - A wide choice of file protocols (NFSv3, NFSv4.1, SMB3).
-  - Performance that's comparable with on-premises performance, with multiple tiers (Ultra, Premium, Standard).
-  - Deployment in minutes, with a wide range of tiers and flexibility.
-  - Flexible capacity pool types and performance, where the QoS per volume is automatically assigned based on the tier of the pool and the volume quota.
-
-The following table provides a comparison of Blob Storage, Azure Files, Azure Managed Lustre, and Azure NetApp Files.
-
-| | Blob Storage | Azure Files | Azure Managed Lustre | Azure NetApp Files |
-| -- | -- | -- | -- | -- |
-| **Use cases** | Best suited for large-scale read-heavy sequential access workloads where data is ingested once and minimally modified.<br><br>Low total cost of ownership, if there's light maintenance. | A highly available service that's best suited for random access workloads.<br><br>For NFS shares, Azure Files provides full POSIX file system support. The built-in CSI driver enables you to easily use it from container platforms like Azure Container Instances and Azure Kubernetes Service (AKS), in addition to VM-based platforms. | Azure Managed Lustre is a fully managed parallel file system best suited to medium to large HPC workloads.<br><br>Enables HPC applications in the cloud without breaking application compatibility by providing familiar Lustre parallel file system functionality, behaviors, and performance, securing long-term application investments. | A fully managed file service in the cloud, powered by NetApp, with advanced management capabilities.<br><br>Azure NetApp Files is suited for workloads that require random access. It provides broad protocol support and improved data protection. |
-| **Available protocols** | NFS 3.0<br><br>REST<br><br>Azure Data Lake Storage | SMB<br><br>NFS 4.1<br><br>(No interoperability between the two protocols.) | Lustre | NFS 3.0 and 4.1<br><br>SMB |
-| **Key features** | Integrated with Azure HPC Cache for low-latency workloads.<br><br>Integrated management, including lifecycle management, immutable blobs, data failover, and metadata index. | Zonally redundant for high availability.<br><br>Consistent single-digit millisecond latency.<br><br>Predictable performance and cost that scales with capacity. | High storage capacity up to 12.5 PiB upon request.<br><br>Low (~2 ms) latency.<br><br>Spin up new clusters in minutes.<br><br>Supports containerized workloads with AKS. | Extremely low latency (as low as sub-millisecond).<br><br>Rich NetApp ONTAP management capability, like SnapMirror Cloud.<br><br>Consistent hybrid cloud experience. |
-| **Performance (per volume)** | As much as 20,000 IOPS. As much as 100 GiB/s throughput. | As much as 100,000 IOPS. As much as 80 GiB/s throughput. | As much as 1M IOPS. As much as 500 GiB/s throughput. | As much as 460,000 IOPS. As much as 36 GiB/s throughput. |
-| **Scale** | As much as 2 PiB for a single volume.<br><br>As much as ~4.75 TiB for a single file.<br><br>No minimum capacity requirements. | As much as 100 TiB for a single volume.<br><br>As much as 4 TiB for a single file.<br><br>100 GiB minimum capacity. | As much as 12.5 PiB upon request for a single volume.<br><br>As much as 31.25 PiB for a single file.<br><br>4 TiB minimum capacity. | As much as 100 TiB for a single volume.<br><br>As much as 16 TiB for a single file.<br><br>4 TiB minimum capacity. |
-| **Pricing** | [Azure Blob Storage pricing](https://azure.microsoft.com/pricing/details/storage/blobs) | [Azure Files pricing](https://azure.microsoft.com/pricing/details/storage/files) | [Azure Managed Lustre pricing](https://azure.microsoft.com/pricing/details/managed-lustre) | [Azure NetApp Files pricing](https://azure.microsoft.com/pricing/details/netapp) |
-
-## Next steps
-
-The following articles provide guidance that you might find helpful at various points during your cloud adoption process. They can help you succeed in your cloud adoption scenario for HPC in the finance sector.
-
-- [Azure billing offers and Active Directory tenants for finance HPC](./azure-billing-active-directory-tenant.md)
-- [Finance HPC Azure identity and access management](./identity-access-management.md)
-- [Management for HPC in the finance sector](./management.md)
-- [Network topology and connectivity for HPC in the finance sector](./network-topology-connectivity.md)
-- [Platform automation and DevOps for HPC in the finance sector](./platform-automation-devops.md)
-- [Resource organization for Azure HPC in the finance sector](./resource-organization.md)
-- [Governance for finance HPC](./security-governance-compliance.md)
-- [Security for HPC in the finance sector](./security.md)
-- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md)
diff --git a/docs/scenarios/azure-hpc/govern.md b/docs/scenarios/azure-hpc/govern.md
index 4f0efd86ae..b7e509074d 100644
--- a/docs/scenarios/azure-hpc/govern.md
+++ b/docs/scenarios/azure-hpc/govern.md
@@ -36,7 +36,7 @@ Ensure the five disciplines of cloud governance are implemented:
 
 For HPC governance examples, organizations in the manufacturing or automotive space often rely on additional compliance and regulation standards like [Trusted Information Security Assessment Exchange (TISAX).](/azure/compliance/offerings/offering-tisax) Following the TISAX standard ensures HPC cloud solutions with strong information security and data protection.
 
-Banking organizations will also have [data security considerations](/azure/architecture/industries/finance/risk-grid-banking-overview#data-security-considerations-for-risk-grid-computing) that include conducting security risk assessments before placing workloads in the cloud. Risk grid computing also follows both Microsoft Azure ExpressRoute and VPN Gateway as the thousands of ingested documents typically live in on-premises locations.
+Banking organizations will also have data security considerations that include conducting security risk assessments before placing workloads in the cloud. Risk grid computing also relies on both Microsoft Azure ExpressRoute and VPN Gateway, because the thousands of ingested documents typically live in on-premises locations.
 
 ## Next steps
diff --git a/docs/scenarios/azure-hpc/identity-access-management.md b/docs/scenarios/azure-hpc/identity-access-management.md
new file mode 100644
index 0000000000..34042d9e39
--- /dev/null
+++ b/docs/scenarios/azure-hpc/identity-access-management.md
@@ -0,0 +1,88 @@
+---
+title: Identity and Access Management for Azure HPC
+description: Learn about identity and access management for high-performance computing (HPC) using this scenario that builds on the Azure landing zone design area.
+author: Rajani-Janaki-Ram
+ms.author: rajanaki
+ms.topic: conceptual
+ms.custom: think-tank
+ms.date: 11/15/2024
+---
+
+# Identity and access management for Azure HPC
+
+This article builds on considerations and recommendations that are described in the article [Azure identity and access management design](../../ready/landing-zone/design-area/identity-access.md). It can help you examine design considerations for identity and access management that are specific to deploying HPC applications on Azure.
+
+[Microsoft Entra Domain Services](/azure/active-directory-domain-services/overview) provides managed domain services like domain join and Group Policy. It also provides access to legacy authentication protocols like Lightweight Directory Access Protocol (LDAP) and Kerberos/NTLM authentication. Microsoft Entra Domain Services integrates with your existing Microsoft Entra tenant. This integration enables users to sign in to services and applications connected to the managed domain by using their existing credentials in Microsoft Entra ID. You can also use existing groups and user accounts to help secure access to resources. These features provide a smoother lift-and-shift of on-premises resources to Azure, especially for hybrid environments.
+
+For more information, see [design recommendations for platform access](../../ready/landing-zone/design-area/identity-access-platform-access.md#design-recommendations-for-platform-access) and [Azure identity and access for landing zones](../../ready/landing-zone/design-area/identity-access-landing-zones.md).
+
+## Design considerations
+
+HPC deployment uses the Azure landing zone infrastructure setup for security, identity, and access management. Keep the following design considerations in mind when you deploy your HPC application:
+
+- Determine the Azure resource administration that's required by various members of the team. Consider providing those team members with elevated Azure resource administration access in a nonproduction environment.
+  - For example, give them a [Virtual Machine Contributor](/azure/role-based-access-control/built-in-roles#virtual-machine-contributor) role.
+  - You can also give team members partially elevated administration access, like a partial Virtual Machine Contributor role in a production environment.
+
+  Both options achieve a good balance between separation of duties and operational efficiency.
+- Review the Azure administration and management activities that you require your teams to do. Consider your HPC on Azure landscape. Determine the best possible distribution of responsibilities within your organization.
+
+  Here are the common Azure activities for administration and management:
+
+  | Azure resource | Azure resource provider | Activities |
+  |--|--|--|
+  | Virtual machines (VMs) | Microsoft.Compute/virtualMachines | Start, stop, restart, deallocate, deploy, redeploy, change, and resize VMs. Manage extensions, availability sets, and proximity placement groups. |
+  | VMs | Microsoft.Compute/disks | Read and write to disk. |
+  | Storage | Microsoft.Storage | Read and make changes on storage accounts, for example, a boot diagnostics storage account. |
+  | Storage | Microsoft.NetApp | Read and make changes on NetApp capacity pools and volumes. |
+  | Storage | Microsoft.NetApp | Take Azure NetApp Files snapshots. |
+  | Storage | Microsoft.NetApp | Use Azure NetApp Files cross-region replication. |
+  | Networking | Microsoft.Network/networkInterfaces | Read, create, and change network interfaces. |
+  | Networking | Microsoft.Network/loadBalancers | Read, create, and change load balancers. |
+  | Networking | Microsoft.Network/networkSecurityGroups | Read network security groups. |
+  | Networking | Microsoft.Network/azureFirewalls | Read firewalls. |
+  | Networking | Microsoft.Network/virtualNetworks | Read, create, and change virtual networks.<br><br>Consider the relevant access that's needed for the resource group of the virtual network, and related access if it's different from the resource group of the VMs. |
+
+- A typical HPC setup includes a front end for submitting jobs, a job scheduler or orchestrator, a compute cluster, and shared storage. Jobs can be submitted from on-premises, from the cloud, or both. Identity and access management considerations for users and visualization devices might vary depending on the enterprise standards.
+
+- Consider the Microsoft authentication service that you use. Depending on the HPC compute resource orchestrator that you use, various authentication methods are supported, as described here.
+
+  - [Azure CycleCloud](/azure/cyclecloud/overview?view=cyclecloud-8&preserve-view=true) provides [three authentication methods](/azure/cyclecloud/how-to/user-authentication?view=cyclecloud-8&preserve-view=true): a built-in database with encryption, Active Directory, and LDAP.
+  - [Azure Batch](/azure/batch/batch-technical-overview) supports [two authentication methods](/azure/batch/security-best-practices): Shared Key and Microsoft Entra ID.
+  - If you want to extend your on-premises capabilities to a hybrid environment, you can authenticate through Active Directory with a read-only domain controller that's hosted on Azure. This approach minimizes traffic across the link. This integration provides a way for users to use their existing credentials to sign in to services and applications that are connected to the managed domain. You can also use existing groups and user accounts to help secure access to resources. These features provide a smoother lift-and-shift of on-premises resources to Azure.
+  - Currently, [HPC Pack](/azure/cyclecloud/hpcpack?view=cyclecloud-8&preserve-view=true) nodes must be joined to an Active Directory domain. If you deploy the HPC Pack cluster in a virtual network that has a site-to-site VPN or Azure ExpressRoute connection to your corporate network, and firewall rules allow access to Active Directory domain controllers, there's typically already an Active Directory domain. If you don't have an Active Directory domain in your virtual network, you can create one by promoting the head node to domain controller. Another option is to use Microsoft Entra Domain Services so that the HPC Pack nodes can be domain joined to that service instead of to on-premises Active Directory domain controllers. If the head nodes will be deployed on Azure, determine whether remote on-premises users will submit jobs. If they will, use Active Directory, because it provides a better experience and allows certificates to be used properly for authentication. Otherwise, if you use Microsoft Entra Domain Services instead of Active Directory, remote clients need to use the REST API service to submit jobs.
+
+For more information, see [Design recommendations for platform access](../../ready/landing-zone/design-area/identity-access-platform-access.md#design-recommendations-for-platform-access) and [Azure identity and access for landing zones](../../ready/landing-zone/design-area/identity-access-landing-zones.md).
+
+### Design considerations for the energy industry
+
+In addition to the preceding considerations, take the following into account.
+
+Two common deployment types in oil and gas industry workloads are *cloud only* and *hybrid cloud* models.
While it's less complex to have all of your compute, storage, and visualization resources in the cloud, enterprises sometimes use a hybrid model due to multiple business constraints for seismic and reservoir-simulation HPC workloads. + +Both the cloud only and hybrid cloud models might have their own unique identity and access needs that affect the type of Active Directory solution to adopt. + +Workloads in the cloud only deployment model use Microsoft Entra ID for Azure service fabric authentication, while the HPC hybrid cloud model uses the [Microsoft Entra hybrid identity solution](/azure/active-directory/hybrid/choose-ad-authn) for authentication. Regardless of the deployment type, Linux clients and POSIX-compliant storage solutions require legacy Active Directory support through Microsoft Entra Domain Services. + +### Design considerations for the manufacturing industry + +The following diagram shows a manufacturing reference architecture that uses CycleCloud for authentication: + +:::image type="content" source="../azure-hpc/media/hpc-identity-access-management-cyclecloud.png" alt-text="Diagram that shows a manufacturing reference architecture that uses Azure CycleCloud." lightbox="../azure-hpc/media/hpc-identity-access-management-cyclecloud.png"::: + +This diagram shows a manufacturing architecture that uses Batch for authentication: + +:::image type="content" source="../azure-hpc/media/hpc-identity-access-management-batch.png" alt-text="Diagram that shows a manufacturing reference architecture that uses Azure Batch." lightbox="../azure-hpc/media/hpc-identity-access-management-batch.png"::: + +## Next steps + +The following articles provide guidance for various stages of the cloud adoption process. These resources can help you succeed in adopting HPC environments for the cloud. + +- [Azure billing offers and Microsoft Entra tenants](../../ready/landing-zone/design-area/azure-billing-microsoft-entra-tenant.md) +- [Management](manage.md) +- [Resource organization](resource-organization.md) +- [Secure](secure.md) +- [Storage](storage.md) +- [HPC landing zone accelerator](azure-hpc-landing-zone-accelerator.md) +- [HPC network topology and connectivity](network-topology-connectivity.md) +- [Compute large-scale HPC application workloads in Azure VMs](compute.md) diff --git a/docs/scenarios/azure-hpc/manufacturing/azure-billing-active-directory-tenant.md b/docs/scenarios/azure-hpc/manufacturing/azure-billing-active-directory-tenant.md deleted file mode 100644 index 672bcfe3d0..0000000000 --- a/docs/scenarios/azure-hpc/manufacturing/azure-billing-active-directory-tenant.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: 'Manufacturing HPC Azure billing and Active Directory tenants' -description: Azure landing zones for HPC don't have specific considerations or recommendations that affect billing offer decisions, enterprise enrollment, or Active Directory tenant decisions. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/11/2022 ---- - -# Manufacturing HPC Azure billing and Active Directory tenants - -Azure landing zones for HPC don't have specific considerations or recommendations that affect billing offer decisions, enterprise enrollment, or Microsoft Entra tenant decisions. - -But it might be important to understand decisions previously made by the cloud platform team and the existing [enterprise enrollment or Microsoft Entra tenant decisions](../../../ready/landing-zone/design-area/azure-billing-microsoft-entra-tenant.md). 
- -## Next steps - -The following articles provide guidance on each step in the cloud adoption journey for manufacturing HPC environments. - -- [Azure identity and access management for HPC in manufacturing](./identity-access-management.md) -- [Management for HPC in the manufacturing industry](./management.md) -- [Manufacturing HPC network topology and connectivity](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in the manufacturing industry](./platform-automation-devops.md) -- [Manufacturing HPC resource organization](./resource-organization.md) -- [Azure governance for manufacturing HPC](./security-governance-compliance.md) -- [Security for HPC in manufacturing industries](./security.md) -- [Manufacturing HPC storage](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/manufacturing/compute.md b/docs/scenarios/azure-hpc/manufacturing/compute.md deleted file mode 100644 index 86ac55b47e..0000000000 --- a/docs/scenarios/azure-hpc/manufacturing/compute.md +++ /dev/null @@ -1,72 +0,0 @@ ---- -title: Big compute in the manufacturing industry -description: See virtual machine (VM) sizes that are appropriate for big compute workloads in manufacturing, which require a large number of processing cores. -author: bsantacruz -ms.author: bsantacruz -ms.topic: conceptual -ms.custom: think-tank -ms.date: 10/17/2023 ---- - -# Big compute in the manufacturing industry - -High-performance computing (HPC) is used in the manufacturing industry to run large-scale workloads, also known as big compute applications. HPC can help manufacturers at every stage of product development processes and supply chains. For instance, you can use HPC to run advanced design simulations and also to automate processes and predict maintenance issues. - -Big compute workloads involve tasks that require many cores to process. These workloads can include financial risk modeling, engineering stress analysis, and other compute-intensive tasks. Big compute workloads have the following characteristics: - -- The workload can be divided into discrete tasks that can be run simultaneously across many cores. -- Each task takes input, processes it, and produces output. The entire application runs for a finite amount of time. -- The application doesn't need to run constantly, but it must be able to handle node failures and crashes. -- Tasks can be independent or tightly coupled, requiring high-speed networking technologies like InfiniBand and remote direct memory access (RDMA) connectivity. - -You can use compute-intensive virtual machine (VM) sizes such as H16r, H16mr, and A9. Your selection depends on your workload. - -Azure offers a range of VM instances that are optimized for CPU-intensive and GPU-intensive workloads. It's also the only cloud platform that offers InfiniBand-enabled hardware. InfiniBand provides a significant performance advantage for reservoir simulation and seismic workload runs. This advantage results in performance that approaches or exceeds current on-premises infrastructure performance. - -Azure VMs offer various options, known as VM sizes, for HPC and GPU-optimized computing. It's important to select a VM size that's appropriate for your workload. To find the size that's the best fit, see [Sizes for virtual machines in Azure](/azure/virtual-machines/sizes). Also see the selector tool in [Virtual machines selector](https://azure.microsoft.com/pricing/vm-selector/). 
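To make that triage concrete, here's a minimal sketch in Python. It isn't an official sizing tool; the family suggestions are illustrative examples drawn from the series named in this article (H-series for tightly coupled MPI work, N-series for GPU work, E and F series for less specialized jobs), so always confirm against the current Azure VM size documentation.

```python
# Illustrative only: map coarse workload traits to candidate Azure VM families.
# The family names mirror the guidance in this article; verify current SKUs
# and regional availability before choosing a size.

def suggest_vm_family(tightly_coupled: bool, needs_gpu: bool, memory_gib_per_core: float) -> str:
    if needs_gpu:
        return "N-series (NVIDIA GPU-enabled)"
    if tightly_coupled:
        # MPI jobs need RDMA/InfiniBand-capable sizes.
        return "H-series, for example HC (RDMA-capable, InfiniBand)"
    if memory_gib_per_core > 8:
        return "E-series (memory-optimized)"
    return "F-series (compute-optimized)"

# Example: a tightly coupled CPU solver with modest memory needs.
print(suggest_vm_family(tightly_coupled=True, needs_gpu=False, memory_gib_per_core=4))
```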
-Keep in mind that not all Azure products are available in all regions. To see what's available in your area, see [Products available by region](https://azure.microsoft.com/explore/global-infrastructure/products-by-region/).
-
-For best practices for choosing Azure compute options, see the [Azure compute blog](https://techcommunity.microsoft.com/t5/azure-compute-blog/bg-p/AzureCompute) or [Choose an Azure compute service](/azure/architecture/guide/technology-choices/compute-decision-tree).
-
-Azure offers both CPU-based and GPU-enabled VMs. The N-series VMs feature NVIDIA GPUs that are designed for compute-intensive or graphics-intensive applications such as AI, machine learning, and visualization.
-
-HPC SKUs are designed for high-performance scenarios. But other SKUs, such as the E and F series, are also suitable for certain workloads. To help ensure optimal performance, carefully evaluate your workload requirements and choose an appropriate VM size.
-
-## Design considerations
-
-When you design your HPC infrastructure, several tools and services are available to help you manage and schedule your workloads.
-
-[Azure CycleCloud](/azure/cyclecloud/) is a tool for creating, managing, operating, and optimizing HPC and big compute clusters in Azure. With Azure CycleCloud, users can dynamically configure HPC Azure clusters and orchestrate data and jobs for hybrid and cloud workflows. Azure CycleCloud provides the simplest way to manage HPC workloads in Azure when you use a workload manager. Azure CycleCloud supports workload managers such as Grid Engine, Microsoft HPC Pack, HTCondor, LSF, PBS Pro, SLURM, and Symphony.
-
-### Manufacturing architecture design example
-
-The following architecture provides an example of using VMs in HPC in manufacturing.
-
-:::image type="content" alt-text="Architecture diagram that shows a manufacturing HPC workload that uses Azure CycleCloud and HC-series VMs." source="./media/hpc-manufacturing-architecture-example.svg" lightbox="./media/hpc-manufacturing-architecture-example.svg":::
-
-This architecture uses Azure Files shares and Azure Storage accounts that are connected to an Azure Private Link subnet.
-
-The architecture uses Azure CycleCloud in its own subnet. HC-series VMs are used in an arrangement of cluster nodes.
-
-The HC-series VMs are optimized for HPC applications that are driven by intensive computation. Examples include implicit finite element analysis, reservoir simulation, and computational chemistry applications. HC VMs feature 44 Intel Xeon Platinum 8168 processor cores, 8 GB of RAM per CPU core, no hyperthreading, and up to four managed disks. The Intel Xeon Platinum platform supports Intel's rich ecosystem of software tools and features and an all-cores clock speed of 3.4 GHz for most workloads. A back-of-envelope sizing sketch that uses these figures follows the resource list below.
-
-For reference architectures and cookbooks for deploying HPC independent software vendor (ISV) applications that support manufacturing use cases, see the following resources:
-
-- [Virtual machine series](https://azure.microsoft.com/pricing/details/virtual-machines/series/)
-- [Azure HPC certification.github.io](https://github.com/AzureHPC-Certification/AzureHPC-Certification.github.io/)
-- [Microsoft Azure HPC OnDemand Platform](https://techcommunity.microsoft.com/t5/azure-global/azure-hpc-ondemand-platform-cloud-hpc-made-easy/ba-p/2537338). This standalone reference architecture might not be compliant with the Azure landing zone paradigm.
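Using the HC-series figures cited above (44 cores per VM and 8 GB of RAM per core, roughly 352 GB per VM), a back-of-envelope cluster-size estimate might look like the following sketch. The helper and its example numbers are illustrative only, not a capacity-planning tool.

```python
import math

# Back-of-envelope cluster sizing from the HC-series specs cited above:
# 44 physical cores per VM and 8 GB of RAM per core (~352 GB per VM).
CORES_PER_NODE = 44
RAM_GB_PER_NODE = 44 * 8

def nodes_needed(total_cores: int, total_ram_gb: int) -> int:
    # Take the stricter of the core and memory requirements.
    by_cores = math.ceil(total_cores / CORES_PER_NODE)
    by_ram = math.ceil(total_ram_gb / RAM_GB_PER_NODE)
    return max(by_cores, by_ram)

# Example: a solver that wants 1,024 cores and 6 TB of aggregate memory.
print(nodes_needed(1024, 6144))  # -> 24 nodes (core-bound in this case)
```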
- -## Next steps - -The following articles provide guidance for various stages of the cloud adoption process. These resources can help you succeed in adopting manufacturing HPC environments for the cloud. - -- [Azure billing and Active Directory tenants](./azure-billing-active-directory-tenant.md) -- [Management](./management.md) -- [Network topology and connectivity](./network-topology-connectivity.md) -- [Platform automation and DevOps](./platform-automation-devops.md) -- [Resource organization](./resource-organization.md) -- [Governance](./security-governance-compliance.md) -- [Security](./security.md) -- [Storage](./storage.md) -- [HPC landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/manufacturing/identity-access-management.md b/docs/scenarios/azure-hpc/manufacturing/identity-access-management.md deleted file mode 100644 index 3c56a1b946..0000000000 --- a/docs/scenarios/azure-hpc/manufacturing/identity-access-management.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -title: Azure identity and access management for HPC in manufacturing -description: This article describes considerations and provides recommendations for identity and access management in manufacturing HPC environments. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/09/2022 ---- - -# Azure identity and access management for HPC in manufacturing - -This article builds on considerations and recommendations that are described in the article [Azure identity and access management design](../../../ready/landing-zone/design-area/identity-access.md). It can help you examine design considerations for identity and access management that are specific to the deployment, on Azure, of HPC applications designed for the manufacturing industry. - -[Microsoft Entra Domain Services](/azure/active-directory-domain-services/overview) (Microsoft Entra Domain Services) provides managed domain services like domain join and Group Policy. It also provides access to legacy authentication protocols like lightweight directory access protocol (LDAP) and Kerberos/NTLM authentication. Microsoft Entra Domain Services integrates with your existing Microsoft Entra tenant. This integration enables users to sign in to services and applications connected to the managed domain by using their existing credentials in Microsoft Entra ID. You can also use existing groups and user accounts to help secure access to resources. These features provide a smoother lift-and-shift of on-premises resources to Azure, especially for hybrid environments. - -For more information, see [design recommendations for platform access](../../../ready/landing-zone/design-area/identity-access-platform-access.md#design-recommendations-for-platform-access) and [Azure identity and access for landing zones](../../../ready/landing-zone/design-area/identity-access-landing-zones.md). - - - -## HPC design considerations - -Depending on the HPC compute resource orchestrator that you use, various authentication methods are supported, as described here. - - - [Azure CycleCloud](/azure/cyclecloud/overview?view=cyclecloud-8&preserve-view=true). CycleCloud provides [three authentication methods](/azure/cyclecloud/how-to/user-authentication?view=cyclecloud-8&preserve-view=true): a built-in database with encryption, Active Directory, and LDAP. - - [Azure Batch](/azure/batch/batch-technical-overview). 
Batch account access supports [two authentication methods](/azure/batch/security-best-practices): Shared Key and Microsoft Entra ID. - - [HPC Pack](/azure/cyclecloud/hpcpack?view=cyclecloud-8&preserve-view=true). Currently, all HPC Pack nodes must be joined into an Active Directory domain. If you're deploying the HPC Pack cluster in a virtual network that has a site-to-site VPN or Azure ExpressRoute connection to your corporate network (and firewall rules allow access to Active Directory domain Controllers), there's typically already an Active Directory domain. If you don't have an Active Directory domain in your virtual network, you can choose to create one by promoting the head node as domain controller. Another option would be to utilize Microsoft Entra Domain Services to allow the HPC Pack nodes to be domain joined to this service vs. on-premises Active Directory domain controllers. If the Head Nodes will be deployed in Azure, it is important to determine if remote users on-premises will be submitting jobs. If remote users are submitting jobs it would be recommended to use Active Directory as this will allow a better experience and allow certificates to be used properly for authentication. Otherwise, if Active Directory is not utilized and Microsoft Entra Domain Services is used instead, the remote clients will need to use the REST API service to submit jobs. - -The following diagram shows a manufacturing reference architecture that uses CycleCloud: - -:::image type="content" source="./media/hpc-identity-access-management-cyclecloud.png" alt-text="Diagram that shows a manufacturing reference architecture, which uses Azure CycleCloud." lightbox="./media/hpc-identity-access-management-cyclecloud.png"::: - -This diagram shows a manufacturing architecture that uses Batch: - -:::image type="content" source="./media/hpc-identity-access-management-batch.png" alt-text="Diagram that shows a manufacturing reference architecture, which uses Azure Batch." lightbox="./media/hpc-identity-access-management-batch.png"::: - -## Next steps - -The following articles provide guidance that you might find helpful at various points during your cloud adoption process. They can help you succeed in your cloud adoption scenario for manufacturing HPC environments. - -- [Manufacturing HPC Azure billing and Active Directory tenants](./azure-billing-active-directory-tenant.md) -- [Management for HPC in the manufacturing industry](./management.md) -- [Manufacturing HPC network topology and connectivity](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in the manufacturing industry](./platform-automation-devops.md) -- [Manufacturing HPC resource organization](./resource-organization.md) -- [Azure governance for manufacturing HPC](./security-governance-compliance.md) -- [Security for HPC in manufacturing industries](./security.md) -- [Manufacturing HPC storage](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/manufacturing/management.md b/docs/scenarios/azure-hpc/manufacturing/management.md deleted file mode 100644 index 0497d4338f..0000000000 --- a/docs/scenarios/azure-hpc/manufacturing/management.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -title: Management for Azure HPC in the manufacturing industry -description: Azure landing zones for HPC in the manufacturing industry don't have any specific considerations or recommendations that affect management decisions. 
-author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/11/2022 ---- - -# Management for HPC in the manufacturing industry - -Azure [landing zones for HPC](../ready.md) in the manufacturing industry don't have any specific considerations or recommendations that influence management decisions. Management is one of several design areas considered in this documentation. - -## Previous recommendations - -Keep in mind that it could be important to understand any decisions made previously by the cloud platform team so you know about existing [management recommendations](../../../ready/landing-zone/design-area/management.md). - -## Next steps - -The following articles provide guidance that you might find helpful at various points throughout the cloud adoption process. They can help you succeed in the cloud adoption scenario for HPC environments in manufacturing. - -- [Manufacturing HPC Azure billing and Active Directory tenants](./azure-billing-active-directory-tenant.md) -- [Azure identity and access management for HPC in manufacturing](./identity-access-management.md) -- [Manufacturing HPC network topology and connectivity](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in the manufacturing industry](./platform-automation-devops.md) -- [Manufacturing HPC resource organization](./resource-organization.md) -- [Azure governance for manufacturing HPC](./security-governance-compliance.md) -- [Security for HPC in manufacturing industries](./security.md) -- [Manufacturing HPC storage](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/manufacturing/network-topology-connectivity.md b/docs/scenarios/azure-hpc/manufacturing/network-topology-connectivity.md deleted file mode 100644 index 4f8dfd6878..0000000000 --- a/docs/scenarios/azure-hpc/manufacturing/network-topology-connectivity.md +++ /dev/null @@ -1,156 +0,0 @@ ---- -title: 'Manufacturing HPC network topology and connectivity in Azure' -description: Learn about the key design considerations and best practices for networking and connectivity to, from, and within Microsoft Azure and HPC deployments. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/17/2022 ---- - -# Manufacturing HPC network topology and connectivity - -This guidance builds on considerations and recommendations defined in the Azure landing zone article for [network topology and connectivity](../../../ready/landing-zone/design-area/network-topology-and-connectivity.md). Following the guidance in this article helps you examine key design considerations and best practices for networking and connectivity to, from, and within Microsoft Azure and HPC deployments. - -## Plan for IP address, virtual network, and subnets - -It's vital to plan for IP address needs in Azure to ensure that: - - - The IP address space doesn't overlap across on-premises locations and Azure regions. - - Future VNet peering to existing or planned VNets is possible. - - The virtual network (VNet) contains the right address space. - - Proper planning for subnet configuration happens in advance. - - Sufficient excess addressing is considered for future expansion or other services - -## HPC manufacturing design considerations - -Consider creating separate subnets to assign IP addresses across functional components of the environment. 
-For example, a dedicated HPC VNet could include the following subnets:
-
-* Compute
-* Storage
-* Infrastructure
-* Visualization
-* Sign-in
-* ANF
-* HPC Cache
-
-Several services like Azure NetApp Files, Azure HPC Cache, and future storage offerings require dedicated delegated subnets for proper operation. Ensure that you plan appropriate addressing space if you're considering any of these services.
-
-## Configure DNS and name resolution for on-premises and Azure resources
-
-Domain Name System (DNS) is a critical design aspect in the overall Azure landing zone architecture. Some organizations might want to use their existing investments in DNS. Others might see cloud adoption as an opportunity to modernize their internal DNS infrastructure and use native Azure capabilities.
-
-## HPC networking design considerations
-
-The following recommendations apply when a virtual machine's DNS or virtual name doesn't change during migration.
-
-Use case:
-
- - Background DNS and virtual names connect many system interfaces in HPC environments. Customers aren't always aware of the interfaces that developers define over time. Connection challenges arise between various systems when virtual or DNS names change after migrations. Retain DNS aliases to prevent these kinds of difficulties.
- - Use different DNS zones to distinguish each environment (sandbox, development, preproduction, and production) from the others. The exception is for HPC deployments that have their own VNet, where private DNS zones might not be necessary.
- - DNS support is mandatory when you use HPC Cache so that it can access storage and other resources.
-
-## High-performance network services
-
-**Accelerated networking**
-
-Many HPC workloads (for example, seismic processing) require processing a large amount of data. The data is stored in large shared file systems like Azure Blob, Azure NetApp Files, Lustre ClusterStor, and other custom storage solutions that you access through the network. It's paramount to rely on a high-performance network to reduce the time for data transfers.
-
-Enabling [accelerated networking](/azure/virtual-network/accelerated-networking-overview) gives VMs a high-throughput, low-latency connection between each other and to and from Azure services, together with reduced jitter and minimal CPU utilization.
-
-**InfiniBand**
-
-Parallel HPC applications that rely on Message Passing Interface (MPI) libraries might require a significant amount of information to be transferred between many VMs. The InfiniBand interconnect available on RDMA-capable [H-series](/azure/virtual-machines/sizes-hpc) and [N-series](/azure/virtual-machines/sizes-gpu) VMs provides the low latency and high bandwidth required to maximize the performance and scalability of HPC and AI applications.
-
-Some examples of MPI jobs include:
-
-* Molecular dynamics
-* Computational fluid dynamics
-* Oil and gas reservoir simulation
-* Emerging distributed machine learning workloads in manufacturing
-
-InfiniBand connections are possible only between VMs allocated within the same [placement group](/azure/virtual-machine-scale-sets/virtual-machine-scale-sets-placement-groups).
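To make the tightly coupled pattern concrete, here's a minimal MPI sketch that uses the open-source mpi4py binding (mpi4py isn't mentioned in this article and is assumed here purely for illustration). Each rank contributes a partial result, and a collective allreduce combines them across nodes; on RDMA-capable VM sizes, the underlying MPI library carries this latency-sensitive traffic over InfiniBand.

```python
# Minimal tightly coupled MPI example using mpi4py (assumed for illustration).
# Launch with, for example: mpirun -np 4 python allreduce_demo.py
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# Each rank computes a partial result (here, just its own rank id repeated).
local = np.full(4, float(rank))

# Combine partial results across all ranks. This collective is the kind of
# communication that benefits from a low-latency InfiniBand interconnect.
total = np.empty(4)
comm.Allreduce(local, total, op=MPI.SUM)

if rank == 0:
    print(f"{size} ranks, elementwise sum: {total}")
```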
-**Azure ExpressRoute**
-
- - If there's a burst application, like a hybrid setup for reservoir simulation and modeling where the on-premises data sets are shared and the Azure compute becomes an extension, ExpressRoute helps you connect your on-premises environment to the Microsoft cloud over a private connection with the help of a connectivity provider. It provides enterprise-grade resiliency and availability, and the advantage of a global ExpressRoute partner ecosystem. For information on how to connect your network to Microsoft by using ExpressRoute, see [ExpressRoute connectivity models](/azure/expressroute/expressroute-connectivity-models).
- - ExpressRoute connections don't go over the public internet, and they offer more reliability, faster speeds, and lower latencies than typical internet connections. For point-to-site VPN and site-to-site VPN, you can connect on-premises devices or networks to a virtual network by using any combination of these VPN options and Azure ExpressRoute.
-
-## Define an Azure network topology
-
-Enterprise-scale landing zones support two network topologies: one based on Azure Virtual WAN and the other a traditional network topology based on hub-and-spoke architecture. This section recommends HPC configurations and practices for both deployment models.
-
-Use a network topology based on Virtual WAN if your organization plans to:
-
- - Deploy resources across several Azure regions and connect your global locations to both Azure and on-premises.
- - Fully integrate software-defined WAN deployments with Azure.
- - Deploy up to 50,000 virtual machine workloads across all VNets connected to one Virtual WAN hub.
-
-Organizations use Virtual WAN to meet large-scale interconnectivity requirements. Microsoft manages this service, which helps to reduce overall network complexity and modernize your organization's network.
-
-Use a traditional Azure network topology based on [hub-and-spoke architecture](/azure/architecture/reference-architectures/hybrid-networking/hub-spoke?tabs=cli) if your organization:
-
- - Plans to deploy resources in only select Azure regions.
- - Doesn't need a global, interconnected network.
- - Has few remote or branch locations per region and needs fewer than 30 IP security (IPsec) tunnels.
- - Requires full control and granularity to manually configure your Azure network.
- - Uses local and global VNet peering to provide connectivity. Local and global VNet peering are the preferred approaches to ensure connectivity between landing zones for HPC deployments across multiple Azure regions.
-
-## Plan for inbound and outbound internet connectivity
-
-This section recommends connectivity models for inbound and outbound connectivity to and from the public internet. Azure-native network security services like Azure Firewall, Azure Web Application Firewall on Application Gateway, and Azure Front Door are fully managed services. So, you don't incur the operational and management costs associated with infrastructure deployments, which can become complex at scale.
-
-Design recommendations for HPC implementation:
-
- - For customers with a global footprint, [Azure Front Door](/azure/frontdoor/front-door-overview) helps HPC deployments by using Azure Web Application Firewall policies to deliver and protect global HTTP/S applications across Azure regions.
- - Take advantage of Web Application Firewall policies in Azure Front Door when you're using this service and Azure Application Gateway to protect HTTP/S applications.
-Lock down Azure Application Gateway to receive traffic only from Azure Front Door.
-
-## Define network encryption requirements
-
-This section explores key recommendations for encrypting networks between on-premises and Azure and across Azure regions.
-
-Design considerations for HPC implementations:
-
- - Traffic isn't currently encrypted when you use Azure ExpressRoute to configure private peering.
- - It isn't necessary to encrypt traffic over ExpressRoute for HPC deployments. IPsec tunnels encrypt internet traffic by default. Any additional encryption or decryption could negatively affect performance.
-
-It's up to the customer to determine whether HPC traffic should be encrypted. Explore [network topology and connectivity](../../../ready/landing-zone/design-area/network-topology-and-connectivity.md) to understand network encryption options in enterprise-scale landing zones.
-
-## Define throughput, latency, and bandwidth network requirements
-
- - Both the HPC cloud-only and hybrid cloud deployment models have their own networking and connectivity latency and throughput needs, depending on how you submit and run the manufacturing workflow and workload jobs on-premises versus in the cloud. Users can submit HPC jobs in many deployment modes (from on-premises or the cloud):
-   - Single jobs
-     - On-premises to Azure connectivity considerations if you use a remote visualization desktop
-   - Burst jobs
-     - Network considerations for scheduler setups that submit jobs in the cloud
-     - Azure Batch network considerations
-   - Parallel workflows (both on-premises and cloud)
-   - Hybrid
-     - HPC Cache
-   - Cloud native
-     - AKS containers
-     - Functions
- - MPI environments are dedicated because they require low-latency communication between nodes. The nodes connect through a high-speed interconnect and aren't amenable to sharing with other workloads. MPI applications use the entire high-performance interconnect by way of pass-through mode in virtualized environments. Storage for MPI nodes is usually a parallel file system like Lustre that's also accessed through the high-speed interconnect.
-
-:::image type="content" alt-text="Diagram that shows InfiniBand." source="../media/infiniband.png" lightbox="../media/infiniband.png":::
-
-## Next steps
-
-The following articles provide guidance on each step in the cloud adoption journey for manufacturing HPC environments.
- -- [Manufacturing HPC Azure billing and Active Directory tenants](./azure-billing-active-directory-tenant.md) -- [Azure identity and access management for HPC in manufacturing](./identity-access-management.md) -- [Management for HPC in the manufacturing industry](./management.md) -- [Platform automation and DevOps for Azure HPC in the manufacturing industry](./platform-automation-devops.md) -- [Manufacturing HPC resource organization](./resource-organization.md) -- [Azure governance for manufacturing HPC](./security-governance-compliance.md) -- [Security for HPC in manufacturing industries](./security.md) -- [Manufacturing HPC storage](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) - diff --git a/docs/scenarios/azure-hpc/manufacturing/platform-automation-devops.md b/docs/scenarios/azure-hpc/manufacturing/platform-automation-devops.md deleted file mode 100644 index b2aa6dd52d..0000000000 --- a/docs/scenarios/azure-hpc/manufacturing/platform-automation-devops.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -title: Platform automation and DevOps for Azure HPC in the manufacturing industry -description: Azure landing zones for HPC in manufacturing don't have any specific considerations or recommendations that affect platform automation and DevOps decisions. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/14/2022 ---- - -# Platform automation and DevOps for HPC in the manufacturing industry - -Azure [landing zones for HPC](../ready.md) in the manufacturing industry don't have any specific considerations or recommendations that affect decisions related to platform automation and DevOps. Platform automation and DevOps are two of several design areas considered in this documentation. - -## Previous recommendations - -Keep in mind that it could be important to understand any decisions made previously by the cloud platform team so you know about existing recommendations for [platform automation and DevOps](../../../ready/landing-zone/design-area/platform-automation-devops.md). - -## Next steps - -The following articles provide guidance that you might find helpful at various points during your cloud adoption process. They can help you succeed in the cloud adoption scenario for HPC in the manufacturing industry. 
-
-- [Manufacturing HPC Azure billing and Active Directory tenants](./azure-billing-active-directory-tenant.md)
-- [Azure identity and access management for HPC in manufacturing](./identity-access-management.md)
-- [Management for HPC in the manufacturing industry](./management.md)
-- [Manufacturing HPC network topology and connectivity](./network-topology-connectivity.md)
-- [Manufacturing HPC resource organization](./resource-organization.md)
-- [Azure governance for manufacturing HPC](./security-governance-compliance.md)
-- [Security for HPC in manufacturing industries](./security.md)
-- [Manufacturing HPC storage](./storage.md)
-- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md)
diff --git a/docs/scenarios/azure-hpc/manufacturing/resource-organization.md b/docs/scenarios/azure-hpc/manufacturing/resource-organization.md
deleted file mode 100644
index f9998d98e8..0000000000
--- a/docs/scenarios/azure-hpc/manufacturing/resource-organization.md
+++ /dev/null
@@ -1,57 +0,0 @@
----
-title: 'Manufacturing HPC Resource organization in Azure'
-description: Learn how to align with the Ready methodology of the Cloud Adoption Framework by using a naming and tagging strategy that includes business and operations details.
-author: Rajani-Janaki-Ram
-ms.author: rajanaki
-ms.topic: conceptual
-ms.custom: think-tank
-ms.date: 11/15/2022
----
-
-# Manufacturing HPC resource organization
-
-To align with the [Ready methodology](../../../ready/index.md) of the Cloud Adoption Framework, implement a naming and tagging strategy that includes business and operational details as components of resource names and metadata tags.
-
-## Use single versus multiple Azure subscriptions
-
- - In Microsoft Entra ID, a tenant represents an organization. It's a dedicated instance of the Microsoft Entra service that an organization receives and owns when it signs up to use Azure. Each Microsoft Entra tenant is distinct and separate from other Microsoft Entra tenants. An Azure tenant can have multiple subscriptions, and each subscription can use the same Microsoft Entra ID.
- - You can choose single or multiple subscriptions based on your organization's needs.
- - Multiple subscriptions let you easily view billing for each subscription and limit who can access the Microsoft Azure services associated with that subscription. Examples include production versus nonproduction subscriptions and internal versus external subscriptions.
- - Using multiple subscriptions also helps you plan for subscription limits. For more information on the decision-making process for subscriptions, see [Subscription decision guide](../../../ready/landing-zone/design-area/resource-org-subscriptions.md?source=recommendations).
- - Consider Azure Resource Manager throttle limits when you choose between multiple and single subscriptions. For more information on throttle limits, see [Throttling Resource Manager requests](/azure/azure-resource-manager/management/request-limits-and-throttling).
-
-## Use Azure resource naming and tagging conventions
-
-The business side of this strategy ensures that resource names and tags include the organizational information that you need to identify the teams, along with the business owners who are responsible for resource costs.
The operational side ensures that names and tags include information that IT teams use to identify the workload, application, environment, criticality, and other useful information for managing resources. - -Resources to name include: - -- VMs -- Load balancers -- DNS labels -- Availability sets -- Virtual networks -- Subnets -- ExpressRoute -- NSGs -- Application security groups -- Tags -- Route tables -- Managed disks -- Public IPs - -A sample use case would be to tag all Azure development VMs with the tag of Dev. This tag eases reporting and billing operations because you can pull a report for all things Dev. For more information on a naming and tagging strategy, see [Develop your naming and tagging strategy for Azure resources](../../../ready/azure-best-practices/naming-and-tagging.md). - -## Next steps - -The following articles provide guidance on each step in the cloud adoption journey for manufacturing HPC environments. - -- [Manufacturing HPC Azure billing and Active Directory tenants](./azure-billing-active-directory-tenant.md) -- [Azure identity and access management for HPC in manufacturing](./identity-access-management.md) -- [Management for HPC in the manufacturing industry](./management.md) -- [Manufacturing HPC network topology and connectivity](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in the manufacturing industry](./platform-automation-devops.md) -- [Azure governance for manufacturing HPC](./security-governance-compliance.md) -- [Security for HPC in manufacturing industries](./security.md) -- [Manufacturing HPC storage](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/manufacturing/security-governance-compliance.md b/docs/scenarios/azure-hpc/manufacturing/security-governance-compliance.md deleted file mode 100644 index f42d241837..0000000000 --- a/docs/scenarios/azure-hpc/manufacturing/security-governance-compliance.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -title: Azure governance for manufacturing HPC -description: Learn about governance for HPC workloads. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/08/2022 ---- - -# Azure governance for manufacturing HPC - -Governance is an important part of planning for HPC workloads. Organizations in the manufacturing and automotive industries often rely on compliance and regulation assessments, like [Trusted Information Security Assessment Exchange (TISAX)](/azure/compliance/offerings/offering-tisax). - -Keep in mind that it could be important to understand any decisions previously made by the cloud platform team so that you're aware of existing  governance recommendations. - -## Azure governance in the Azure HPC landing zone accelerator - -The Azure HPC landing zone accelerator can help your organization get mature governance controls. - -For example: - -- A management group hierarchy that groups resources by function or workload type might encourage best practices for resource consistency. -- A rich set of Azure policies might enable governance controls at management-group level to ensure all resources are in scope. - -## Next steps - -The following articles provide guidance that you might find helpful at various points during the cloud adoption process. They can help you succeed in your cloud adoption scenario for manufacturing HPC environments. 
- -- [Manufacturing HPC Azure billing and Active Directory tenants](./azure-billing-active-directory-tenant.md) -- [Azure identity and access management for HPC in manufacturing](./identity-access-management.md) -- [Management for HPC in the manufacturing industry](./management.md) -- [Manufacturing HPC network topology and connectivity](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in the manufacturing industry](./platform-automation-devops.md) -- [Manufacturing HPC resource organization](./resource-organization.md) -- [Security for HPC in manufacturing industries](./security.md) -- [Manufacturing HPC storage](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/manufacturing/security.md b/docs/scenarios/azure-hpc/manufacturing/security.md deleted file mode 100644 index c639034f86..0000000000 --- a/docs/scenarios/azure-hpc/manufacturing/security.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: Security for HPC in manufacturing industries -description: Azure landing zones in manufacturing industries don't have any specific considerations or recommendations that affect decisions about security. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/23/2022 ---- - -# Security for HPC in manufacturing industries - -Azure landing zones for high performance computing (HPC) in manufacturing industries don't have any specific considerations or recommendations that affect decisions about security. - -However, it might be important to understand any decisions that your cloud platform team has made so that you're aware of existing recommendations. For more information, see [Design area: Security](../../../ready/landing-zone/design-area/security.md). - -## Design consideration for security - -You can define and apply security rules and policies that are based on environment, workflow, virtual machine (VM), physical server, and operator. Examples of rules and policies include: - - - Actions that are controlled by user permissions and that are logged for audit reporting. For example, root access is granted only as needed and based on the specified VMs. Scoping access in this way helps to prevent compromising other HPC workflows. - - - Isolated workflows where sensitive data can't be shared with other HPC environments, workflows, or users even if they're on the same underlying hardware. - - - Additional security practices for CycleCloud deployments: - - - Only administrators should have access to the drive on which CycleCloud is installed. Limiting access to the administrators' group prevents users who aren't administrators from accessing non-encrypted data. - - - Don't share an installation of CycleCloud across trust boundaries. For more information about trust relationships, see [How trust relationships work for forests in Active Directory](/azure/active-directory-domain-services/concepts-forest-trust). - - - The Azure role-based access control (Azure RBAC) within a single installation of CycleCloud installation might not be sufficient in a true multi-tenant environment. Use separate and isolated installations of CycleCloud for each tenant that has critical data. 
For more information about Azure RBAC, see [What is Azure role-based access control (Azure RBAC)?](/azure/role-based-access-control/overview) - -## HPC Pack user roles and security - - - There are several [roles](/powershell/high-performance-computing/understanding-user-roles) that users may be added to Administrator, User, Job Administrator, and Job Operator. - - - - Active Directory Domain Services (AD DS) or Microsoft Entra Domain Services is a prerequisite to installing Microsoft HPC Pack because the authentication process for users and computers relies on the services provided by AD DS. - - - - At installation time, the HPC cluster administrator role is automatically granted to all members of the local Administrators security group on the head node. When an identity is added as an HPC cluster administrator, it is also added to the local Administrators security group. - - - -## Next steps - -The following articles provide guidance for specific points in cloud adoption of HPC in manufacturing industries. - -- [Manufacturing HPC Azure billing and Active Directory tenants](./azure-billing-active-directory-tenant.md) -- [Azure identity and access management for HPC in manufacturing](./identity-access-management.md) -- [Management for HPC in the manufacturing industry](./management.md) -- [Manufacturing HPC network topology and connectivity](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in the manufacturing industry](./platform-automation-devops.md) -- [Manufacturing HPC resource organization](./resource-organization.md) -- [Azure governance for manufacturing HPC](./security-governance-compliance.md) -- [Manufacturing HPC storage](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/manufacturing/storage.md b/docs/scenarios/azure-hpc/manufacturing/storage.md deleted file mode 100644 index 132ada26ff..0000000000 --- a/docs/scenarios/azure-hpc/manufacturing/storage.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -title: 'Manufacturing HPC storage in Azure' -description: Learn about storage access, various storage solution capabilities, and how to streamline your storage decision process as a part of planning for HPC workload performance. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/16/2022 ---- - -# Manufacturing HPC storage - -Storage access is an important part of planning for HPC workload performance. The following materials help to streamline your decision process and minimize any misunderstandings around a particular storage solution's capabilities (or lack of capabilities). - -## Design considerations - - It's important to ensure that the required data gets to the HPC cluster machines at the right time. You also want to make sure results from those individual machines are quickly saved and available for further analysis. - -## Distribution of workload traffic - -Account for the types of traffic your HPC environment generates and processes. This step is especially important if you plan to run multiple types of workloads and plan to use the storage for other purposes. Consider and record the following traffic types: - -- Single stream versus multiple streams -- Ratio of read traffic to write traffic -- Average file sizes and counts -- Random versus sequential access patterns - -## Data locality - -The next category accounts for the location of the data. 
Locality awareness helps you determine whether you can use copying, caching, or synchronization as your data movement strategy. The following are locality items to check beforehand: - -- Source data on-premises, in Azure, or both? -- Results data on-premises, in Azure, or both? -- HPC workloads in Azure to be coordinated with source-data modification timelines? -- Sensitive/HIPAA data? - -## Performance requirements - -Performance requirements for storage solutions are generally summarized as follows: - -- Single-stream throughput (in Gb/ps) -- Multi-stream throughput (in Gb/ps) -- Expected maximum IOPS -- Average latency (ms) - -Every consideration affects performance, so these numbers represent a guide that a particular solution should achieve. For example, you might have an HPC workload that does extensive file creation and deletion as part of the workflow. Those operations could affect the overall throughput. - -## Access methods - -Account for the client access protocol required and be clear about what features of the protocol you need. There are different versions of NFS and SMB. - -Here are some things to consider: - -- NFS/SMB versions required -- Expected protocol features (ACLs, encryption) -- Parallel file system solution - -## Total capacity requirement - -Storage capacity in Azure is the next consideration. It helps to inform the overall cost of the solution. If you plan to store a large amount of data for a long time, you might want to consider tiering as part of the storage solution. Tiering provides lower-cost storage options combined with higher-cost but higher-performance storage in a hot tier. So, evaluate the capacity requirements as follows: - -- Total capacity required -- Total hot-tier capacity required -- Total warm-tier capacity required -- Total cold-tier capacity required - -## Authentication and authorization method - -Regarding authentication and authorization requirements, like using an LDAP server or Active Directory environment, ensures you include the appropriate supporting systems for the architecture. If you need to support capabilities like UID/GID mapping to Active Directory users, confirm that the storage solution supports that capability. - -Here are some things to consider: - -- Local (UID/GID on file server only) -- Directory (LDAP, Active Directory) -- UID/GID mapping to Active Directory users? - -## Common Azure storage solutions comparison - -| Category | Azure Blob Storage | Azure Files| Azure Managed Lustre | Azure NetApp Files | -|--|--|--|--|--| -| Use cases | Azure Blob Storage is best suited for large-scale, read-heavy sequential access workloads where data is ingested once with few or no further modifications.

Blob Storage offers the lowest total cost of ownership, if there's little or no maintenance.

Some example scenarios are: Large scale analytical data, throughput sensitive high-performance computing, backup and archive, autonomous driving, media rendering, or genomic sequencing. | Azure Files is a highly available service best suited for random access workloads.

For NFS shares, Azure Files provides full POSIX file system support. You can easily use it from container platforms like Azure Container Instance (ACI) and Azure Kubernetes Service (AKS) with the built-in CSI driver, and VM-based platforms.

Some example scenarios are: Shared files, databases, home directories, traditional applications, ERP, CMS, NAS migrations that don't require advanced management, and custom applications requiring scale-out file storage. | Azure Managed Lustre is a fully managed parallel file system best suited to medium to large HPC workloads.

Enables HPC applications in the cloud without breaking application compatibility by providing familiar Lustre parallel file system functionality, behaviors, and performance, securing long-term application investments. | Fully managed file service in the cloud, powered by NetApp, with advanced management capabilities.

NetApp Files is suited for workloads that require random access and provides broad protocol support and data protection capabilities.

Some example scenarios are: On-premises enterprise NAS migration that requires rich management capabilities, latency sensitive workloads like SAP HANA, latency-sensitive or IOPS intensive high-performance compute, or workloads that require simultaneous multi-protocol access. | -| Available protocols | NFS 3.0

REST

Data Lake Storage Gen2 | SMB

NFS 4.1

(No interoperability between either protocol) | Lustre | NFS 3.0 and 4.1

SMB | -| Key features | Integrated with HPC cache for low-latency workloads.

Integrated management, including lifecycle, immutable blobs, data failover, and metadata index. | Zonally redundant for high availability.

Consistent single-digit millisecond latency.

Predictable performance and cost that scales with capacity. | High storage capacity up to 12.5 PiB upon request.

Low (~2ms) latency.

Spin up new clusters in minutes.

Supports containerized workloads with AKS. | Extremely low latency (as low as sub-ms).

Rich NetApp ONTAP management capability such as SnapMirror in cloud.

Consistent hybrid cloud experience. | -| Performance (Per volume) | Up to 20,000 IOPS, up to 100 GiB/s throughput. | Up to 100,000 IOPS, up to 80 GiB/s throughput. | Up to 1M IOPS, up to 500 GiB/s throughput. | Up to 460,000 IOPS, up to 36 GiB/s throughput. | -| Pricing | [Azure Blob Storage pricing](https://azure.microsoft.com/pricing/details/storage/blobs/) | [Azure Files pricing](https://azure.microsoft.com/pricing/details/storage/files/) | [Azure Managed Lustre pricing](https://azure.microsoft.com/pricing/details/managed-lustre) | [Azure NetApp Files pricing](https://azure.microsoft.com/pricing/details/netapp/) | - -## Roll-your-own parallel file system - -As with NFS, you can create a multi-node BeeGFS or Lustre file system. Performance of such systems is largely dependent on the type of Virtual Machines you select. You can use images found in the Azure Marketplace for [BeeGFS](https://azuremarketplace.microsoft.com/marketplace/apps/beegfs.beegfs-ubuntu-free?tab=Overview), or a Lustre implementation by DDN called [Whamcloud](https://whamcloud.com/). Using third-party images from vendors such as [BeeGFS](https://www.beegfs.io/content/) or DDN lets you purchase their support. Otherwise, you can use both BeeGFS and Lustre by way of their GPL licenses without other charges (beyond the machines and disks). These tools are easy to roll out using the [Azure HPC scripts](https://github.com/Azure/azurehpc/tree/master/examples) with either ephemeral local disks (for scratch) or Premium / Ultra SSD for persistent storage. - -## Cray ClusterStor - -One of the biggest challenges with larger workloads is replicating the pure “bare-metal” performance of large compute clusters working alongside large Lustre environments (in terms of TB/s throughput, and possibly Petabytes of storage). You can now run these workloads with the Azure Cray ClusterStor solution. This approach is a pure bare-metal Lustre deployment placed in the relevant Azure data center. Parallel file systems such as BeeGFS and Lustre provide the highest performance due to their architecture. But that architecture comes with a high management price and so does the use of these technologies. - -## Next steps - -The following articles provide guidance on each step in the cloud adoption journey for manufacturing HPC environments. 
- -- [Manufacturing HPC Azure billing and Active Directory tenants](./azure-billing-active-directory-tenant.md) -- [Azure identity and access management for HPC in manufacturing](./identity-access-management.md) -- [Management for HPC in the manufacturing industry](./management.md) -- [Manufacturing HPC network topology and connectivity](./network-topology-connectivity.md) -- [Platform automation and DevOps for Azure HPC in the manufacturing industry](./platform-automation-devops.md) -- [Manufacturing HPC resource organization](./resource-organization.md) -- [Azure governance for manufacturing HPC](./security-governance-compliance.md) -- [Security for HPC in manufacturing industries](./security.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/finance/media/hpc-finance-architecture-example.svg b/docs/scenarios/azure-hpc/media/hpc-finance-architecture-example.svg similarity index 100% rename from docs/scenarios/azure-hpc/finance/media/hpc-finance-architecture-example.svg rename to docs/scenarios/azure-hpc/media/hpc-finance-architecture-example.svg diff --git a/docs/scenarios/azure-hpc/manufacturing/media/hpc-identity-access-management-batch.png b/docs/scenarios/azure-hpc/media/hpc-identity-access-management-batch.png similarity index 100% rename from docs/scenarios/azure-hpc/manufacturing/media/hpc-identity-access-management-batch.png rename to docs/scenarios/azure-hpc/media/hpc-identity-access-management-batch.png diff --git a/docs/scenarios/azure-hpc/manufacturing/media/hpc-identity-access-management-cyclecloud.png b/docs/scenarios/azure-hpc/media/hpc-identity-access-management-cyclecloud.png similarity index 100% rename from docs/scenarios/azure-hpc/manufacturing/media/hpc-identity-access-management-cyclecloud.png rename to docs/scenarios/azure-hpc/media/hpc-identity-access-management-cyclecloud.png diff --git a/docs/scenarios/azure-hpc/manufacturing/media/hpc-manufacturing-architecture-example.svg b/docs/scenarios/azure-hpc/media/hpc-manufacturing-architecture-example.svg similarity index 100% rename from docs/scenarios/azure-hpc/manufacturing/media/hpc-manufacturing-architecture-example.svg rename to docs/scenarios/azure-hpc/media/hpc-manufacturing-architecture-example.svg diff --git a/docs/scenarios/azure-hpc/network-topology-connectivity.md b/docs/scenarios/azure-hpc/network-topology-connectivity.md new file mode 100644 index 0000000000..7d46488c64 --- /dev/null +++ b/docs/scenarios/azure-hpc/network-topology-connectivity.md @@ -0,0 +1,160 @@ +--- +title: Network Topology and Connectivity for Azure HPC +description: Learn about network topology and connectivity considerations in the Azure landing zone for high-performance computing (HPC). +author: Rajani-Janaki-Ram +ms.author: rajanaki +ms.topic: conceptual +ms.custom: think-tank +ms.date: 12/05/2024 +--- + +# Network topology and connectivity for Azure HPC + +The guidance in this article can help you examine design considerations and best practices that relate to networking and connectivity for Microsoft Azure and high-performance computing (HPC) deployments. + +## Plan for IP addressing + +It's crucial that you plan for IP addressing in Azure to help ensure that: + +- The IP address space doesn't overlap across on-premises locations and Azure regions. +- Future virtual network peering to existing or planned virtual networks is possible. +- The virtual network contains the right address space. 
+- Proper planning for subnet configuration happens in advance. +- Sufficient excess addressing is considered for future expansion or other services. + +### Design considerations and recommendations + +- Consider creating separate subnets to assign IP addresses across functional components of the environment. For example, a dedicated HPC virtual network could include the following subnets: + + - Compute + - Storage + - Infrastructure + - Visualization + - Sign in + - Azure NetApp Files + - Azure HPC Cache + +- Several services like Azure NetApp Files, HPC Cache, and future storage offerings require dedicated delegated subnets for proper operation. If you consider using any of these services, make sure that you plan for the appropriate address space. Delegated subnets are required if you want to implement Azure NetApp Files, which is used frequently in HPC deployments with shared file systems. You can [dedicate](/azure/virtual-network/virtual-network-for-azure-services#services-that-can-be-deployed-into-a-virtual-network) and delegate subnets to specific services and then create instances of those services within subnets. A brief subnet-planning sketch appears after the DNS section that follows. + +- Azure lets you create multiple delegated subnets in a virtual network, but only one delegated subnet can exist in a virtual network for Azure NetApp Files. Attempts to create a new volume fail if you use more than one delegated subnet for Azure NetApp Files. If you use HPC Cache for storage, create a dedicated subnet. For more information about this subnet prerequisite, see [Cache subnet](/azure/hpc-cache/hpc-cache-prerequisites#cache-subnet). To learn more about how to create a subnet, see [Add a virtual network subnet](/azure/virtual-network/virtual-network-manage-subnet). + +## DNS and name resolution for on-premises and Azure resources + +Domain name system (DNS) is a crucial design element in the overall Azure landing zone architecture. Some organizations might want to use their existing investments in DNS. Other organizations might see cloud adoption as an opportunity to modernize their internal DNS infrastructure and use native Azure capabilities. The following recommendations apply when a virtual machine's DNS or virtual name doesn't change during migration. + +### Design considerations and recommendations + +- Behind the scenes, DNS and virtual names connect multiple system interfaces in HPC environments. Customers aren't always aware of all the interfaces that developers define over time. Connection challenges can occur between various systems when virtual or DNS names change after migrations, so you should retain DNS aliases to prevent these types of problems. + +- Use different DNS zones to distinguish environments from each other. These environments include sandbox, development, preproduction, and production. The exception is for HPC deployments that have their own virtual network, which might not require private DNS zones. + +- DNS support is mandatory when you use HPC Cache so that the cache can access storage and other resources. + +- DNS and name resolution are crucial in the finance sector when you use resource location and service records. We recommend that you use the DNS resolution that the Microsoft Entra Domain Services domain controller provides. For more information, see [Deploy Microsoft Entra Domain Services in an Azure virtual network](/azure/architecture/reference-architectures/identity/adds-extend-domain).
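To make the IP address planning described above concrete, the following minimal Python sketch (standard library only) carves a hypothetical /16 for a dedicated HPC virtual network into the functional subnets listed earlier and checks that the range doesn't overlap an assumed on-premises block. All names, ranges, and prefix lengths are illustrative assumptions, not prescribed values.

```python
import ipaddress

# Hypothetical ranges; replace with your organization's real address plan.
ON_PREMISES_RANGES = [ipaddress.ip_network("10.0.0.0/16")]
HPC_VNET = ipaddress.ip_network("10.1.0.0/16")

# Functional subnets from the list earlier in this article. Azure NetApp
# Files and Azure HPC Cache each get their own dedicated or delegated subnet.
SUBNET_NAMES = [
    "compute", "storage", "infrastructure", "visualization",
    "sign-in", "netapp-delegated", "hpc-cache",
]

def check_no_overlap(vnet: ipaddress.IPv4Network, ranges) -> None:
    """Fail early if the virtual network collides with on-premises space."""
    for rng in ranges:
        if vnet.overlaps(rng):
            raise ValueError(f"{vnet} overlaps on-premises range {rng}")

def plan_subnets(vnet: ipaddress.IPv4Network, names, new_prefix: int = 24):
    """Carve equally sized subnets; the unused remainder allows expansion."""
    pool = vnet.subnets(new_prefix=new_prefix)
    return {name: next(pool) for name in names}

check_no_overlap(HPC_VNET, ON_PREMISES_RANGES)
for name, subnet in plan_subnets(HPC_VNET, SUBNET_NAMES).items():
    print(f"{name:18} {subnet}")
```

A layout like this leaves most of the /16 unallocated, which follows the guidance above about reserving sufficient excess address space for future expansion and additional delegated services.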
+ +## High-performance network services + +- **Accelerated networking:** Many HPC workloads, including seismic processing, handle vast amounts of data stored in shared file systems such as Azure Blob, Azure NetApp Files, and Lustre ClusterStor. These storage solutions and custom solutions are accessed through the network. A high-performance network is crucial to reduce the time for data transfers. [Accelerated networking](/azure/virtual-network/accelerated-networking-overview) provides a high-throughput, low-latency connection between the virtual machines (VMs) and to Azure services. Other benefits include reduced jitter and minimal CPU utilization. + +- **InfiniBand:** Parallel HPC applications that rely on Message Passing Interface (MPI) libraries might need to transfer significant amounts of data between multiple VMs. The InfiniBand interconnect, available on remote direct memory access (RDMA)-capable [H-series](/azure/virtual-machines/sizes-hpc) and [N-series](/azure/virtual-machines/sizes-gpu) VMs, provides a low-latency, high-bandwidth connection to maximize the performance and scalability of HPC and deep learning applications. + + - If you run financial applications that require low latency between machines, and information must be transferred between nodes to get results, use low-latency and high-throughput interconnections. [RDMA-capable H-series](/azure/virtual-machines/sizes-hpc#rdma-capable-instances) and [N-series](/azure/virtual-machines/sizes-gpu) VMs communicate over the low-latency and high-bandwidth InfiniBand network. The RDMA network capability over such a connection is crucial to boost the scalability and performance of distributed-node HPC and AI workloads. This network can improve the performance of applications that run under Microsoft MPI or Intel MPI. Some examples of MPI jobs include molecular dynamics, computational fluid dynamics, oil and gas reservoir simulation, and emerging distributed machine learning workloads. + + - InfiniBand connections are possible only between VMs that are allocated within the same [placement group](/azure/virtual-machine-scale-sets/virtual-machine-scale-sets-placement-groups). For more information, see [Enable InfiniBand](/azure/virtual-machines/workloads/hpc/enable-infiniband). To learn how to set up MPI, see [Set up Message Passing Interface for HPC](/azure/virtual-machines/workloads/hpc/setup-mpi). + + :::image type="content" alt-text="Diagram that shows InfiniBand connection between VMs." source="media/infiniband.png" lightbox="media/infiniband.png"::: + +- **Azure ExpressRoute:** ExpressRoute connections don't use the public internet, and they provide more reliability, faster speeds, and lower latencies than typical internet connections. For point-to-site VPN and site-to-site VPN, you can connect on-premises devices or networks to a virtual network by using any combination of these VPN options and ExpressRoute. + + - For burst applications like a hybrid setup for reservoir simulation and modeling, where on-premises datasets are shared and the Azure compute becomes an extension, ExpressRoute connects your on-premises environment to the Microsoft cloud over a private connection. ExpressRoute provides enterprise-grade resiliency and availability, and the advantage of a Global ExpressRoute partner ecosystem. For more information about how to connect your network to Microsoft by using ExpressRoute, see [ExpressRoute connectivity models](/azure/expressroute/expressroute-connectivity-models). 
+ + - For hybrid applications like risk grid computing solutions, where your on-premises trading systems and analytics are functional and Azure becomes an extension, you can use ExpressRoute to connect your on-premises environment to Azure over a private connection, with the help of a connectivity provider. ExpressRoute provides enterprise-grade resiliency and availability and the advantage of a global ExpressRoute partner ecosystem. For information about how to connect your network to Azure by using ExpressRoute, see [ExpressRoute connectivity models](/azure/expressroute/expressroute-connectivity-models). + +## Define an Azure network topology + +Enterprise-scale landing zones support two network topologies. One topology is based on Azure Virtual WAN and the other on a traditional network topology that's based on hub-and-spoke architecture. This section recommends HPC configurations and practices for both deployment models. + +Use a network topology that's based on a virtual WAN if your organization plans to: + +- Deploy resources across several Azure regions and connect your global locations to both Azure and on-premises environments. + +- Integrate software-defined WAN deployments fully with Azure. + +- Deploy up to 2,000 VM workloads across all virtual networks connected to one virtual WAN hub. + +Organizations use Azure Virtual WAN to meet large-scale interconnectivity requirements. Microsoft manages this service, which helps to reduce overall network complexity and modernize your organization's network. Use a traditional Azure network topology based on the [hub-and-spoke architecture](/azure/architecture/reference-architectures/hybrid-networking/hub-spoke?tabs=cli) if your organization: + +- Plans to deploy resources in only select Azure regions. + +- Has no need for a global, interconnected network. + +- Has few remote or branch locations per region and needs fewer than 30 IP security (IPsec) tunnels. + +- Requires full control and granularity to manually configure your Azure network. + +- Uses local and global virtual network peering to provide connectivity. + +Local and global virtual network peering provide connectivity and are the preferred approaches to help ensure connectivity between landing zones for HPC deployments across multiple Azure regions. Document your network topology and firewall rules. Network security groups (NSGs) are often implemented with considerable complexity. Use application security groups when it makes sense to label traffic at a greater granularity than virtual networks can provide. Understand NSG prioritization rules and which rules take precedence over others. + +## Inbound and outbound internet connectivity + +The following section describes recommended connectivity models for inbound and outbound connectivity to and from the public internet. Because Azure-native network security services like Azure Firewall, Azure Web Application Firewall on Azure Application Gateway, and Azure Front Door are fully managed services, you don't incur the operational and management costs that are associated with infrastructure deployments, which can become complex at scale. + +### Design considerations and recommendations + +- Consider using [Azure Front Door](/azure/frontdoor/front-door-overview) for your HPC deployment if your organization has a global footprint. Azure Front Door uses [Azure Web Application Firewall policies](/azure/web-application-firewall/ag/policy-overview) to deliver and help protect global HTTP and HTTPS applications across Azure regions. 
+ +- Take advantage of [Web Application Firewall policies](/azure/web-application-firewall/ag/create-waf-policy-ag) when you use Azure Front Door and Application Gateway to help protect HTTP and HTTPS applications. Lock down Application Gateway to receive traffic only from Azure Front Door. For more information, see [How do I lock down access?](/azure/frontdoor/front-door-faq#what-are-the-steps-to-restrict-the-access-to-my-backend-to-only-azure-front-door-). + +- Use local and global virtual network peering connectivity. These methods are preferred to help ensure connectivity between landing zones for HPC deployments across multiple Azure regions. + +## Define network encryption requirements + +The following section provides key recommendations for encrypting networks between on-premises environments and Azure, and across Azure regions. + +### Design considerations and recommendations + +- Consider traffic performance when you enable encryption. IPsec tunnels encrypt internet traffic by default. Any extra encryption or decryption can negatively affect performance. When you use ExpressRoute, traffic isn't encrypted by default. Determine whether you should encrypt HPC traffic. For more information about network encryption options in enterprise-scale landing zones, see [network topology](../../ready/azure-best-practices/define-an-azure-network-topology.md) and [connectivity](../../ready/azure-best-practices/connectivity-to-azure.md). + +The following recommendations are for encrypting networks between on-premises and Azure, and across Azure regions: + +- Determine whether HPC traffic should be encrypted. For more information, see [Network topology and connectivity](../../ready/landing-zone/design-area/network-topology-and-connectivity.md). + +- Plan for IP addressing in Azure to help ensure that: + + - The IP address space doesn't overlap across on-premises locations and Azure regions. + - The virtual network contains the right address space. + - Proper planning for subnet configuration happens in advance. + +## Define network throughput, latency, and bandwidth requirements + +The cloud-only and hybrid HPC deployment models each have their own latency and throughput needs for networking and connectivity. These needs depend on how you submit and run workflow and workload jobs on-premises versus in the cloud. Users can submit HPC jobs in many deployment modes from on-premises or the cloud. + +- Single jobs + - On-premises to Azure connectivity considerations if you use a remote visualization desktop + +- Burst jobs + - Network considerations for the scheduler setup that submits jobs in the cloud + - Azure Batch network considerations + +- Parallel workflows for on-premises and cloud environments +- Hybrid + - HPC Cache + +- Cloud native + - AKS containers + - Azure Functions + +MPI environments are dedicated because they require low-latency communication between nodes. The nodes connect via a high-speed interconnect and aren't amenable to sharing with other workloads. MPI applications use the entire high-performance interconnect by way of pass-through mode in virtualized environments. Storage for MPI nodes is usually a parallel file system like Lustre that's also accessed via the high-speed interconnect. + +## Next steps + +The following articles provide guidance for each step in the cloud adoption journey for HPC environments.
+ +- [Azure Billing and Microsoft Entra tenants for energy HPC](/azure/cloud-adoption-framework/scenarios/azure-hpc/energy/azure-billing-active-directory-tenant) +- [Resource organization for HPC in the energy industry](/azure/cloud-adoption-framework/scenarios/azure-hpc/energy/resource-organization) +- [Governance for HPC in energy industries](/azure/cloud-adoption-framework/scenarios/azure-hpc/energy/security-governance-compliance) +- [Security for Azure HPC in energy](/azure/cloud-adoption-framework/scenarios/azure-hpc/energy/security) +- [Compute large-scale HPC application workloads in Azure VMs](/azure/cloud-adoption-framework/scenarios/azure-hpc/energy/compute) +- [Storage for HPC energy environments](/azure/cloud-adoption-framework/scenarios/azure-hpc/energy/storage) +- [Azure high-performance computing (HPC) landing zone accelerator](azure-hpc-landing-zone-accelerator.md) diff --git a/docs/scenarios/azure-hpc/plan.md b/docs/scenarios/azure-hpc/plan.md index 0d473a8912..93d5e9e23e 100644 --- a/docs/scenarios/azure-hpc/plan.md +++ b/docs/scenarios/azure-hpc/plan.md @@ -24,7 +24,7 @@ Evaluate the attributes of your HPC environment in terms of an [asset-driven](.. When planning cloud adoption for your HPC environment, review the Azure DevOps [strategy and plan template](../../plan/template.md#align-the-cloud-adoption-plan) to align your tasks and organize objectives through epics, features, and user stories. -Also plan your [HPC network topology](../azure-hpc/energy/network-topology-connectivity.md) to follow best practices surrounding networking and connectivity for Azure HPC deployments. +Also plan your [HPC network topology](network-topology-connectivity.md) to follow best practices surrounding networking and connectivity for Azure HPC deployments. For skilling your IT teams on adopting Azure HPC services, [identify gaps](../../plan/adapt-roles-skills-processes.md#identify-gaps) when forming your digital transformation. diff --git a/docs/scenarios/azure-hpc/ready.md b/docs/scenarios/azure-hpc/ready.md index 0335345473..d401370328 100644 --- a/docs/scenarios/azure-hpc/ready.md +++ b/docs/scenarios/azure-hpc/ready.md @@ -16,7 +16,7 @@ The [Ready methodology](../../ready/index.md) of the Cloud Adoption Framework fo As you prepare your environment for sustained cloud adoption, you can use an Azure landing zone conceptual architecture that represents a target end state. 
For HPC, there are multiple conceptual architecture references that are based on the three industries that the Cloud Adoption Framework currently addresses: -- [Energy (oil and gas)](../azure-hpc/energy/compute.md#use-case-and-reference-architecture-for-seismic-processing) +- [Energy (oil and gas)](../azure-hpc/compute.md#reference-architecture-for-seismic-processing) - [Finance](./azure-hpc-landing-zone-accelerator.md#example-conceptual-reference-architecture-for-energy) - [Manufacturing](./azure-hpc-landing-zone-accelerator.md#example-conceptual-reference-architecture-for-finance) diff --git a/docs/scenarios/azure-hpc/finance/resource-organization.md b/docs/scenarios/azure-hpc/resource-organization.md similarity index 58% rename from docs/scenarios/azure-hpc/finance/resource-organization.md rename to docs/scenarios/azure-hpc/resource-organization.md index 87906f3ad3..78cb1e5696 100644 --- a/docs/scenarios/azure-hpc/finance/resource-organization.md +++ b/docs/scenarios/azure-hpc/resource-organization.md @@ -1,48 +1,47 @@ ---- -title: Resource organization for Azure HPC in the finance sector -description: This article provides recommendations to help you align HPC implementations in the finance sector with Cloud Adoption Framework methodologies. -author: Rajani-Janaki-Ram -ms.author: rajanaki -ms.topic: conceptual -ms.custom: think-tank -ms.date: 11/15/2022 ---- - -# Resource organization for HPC in the finance sector - -This article provides recommendations that can help you align HPC implementations in the finance sector with the [Ready methodology](../../../ready/index.md) of the Cloud Adoption Framework for Azure. - -## Single vs. multiple Azure subscriptions - -In Microsoft Entra ID, a tenant represents an organization. It's a dedicated instance of Microsoft Entra ID that an organization receives and owns when it signs up to use Azure. Each Microsoft Entra tenant is distinct and separate from other Microsoft Entra tenants. An Azure tenant can have multiple subscriptions, and each subscription can use the same Microsoft Entra instance. - -The needs of your organization determine whether you should use a single subscription or multiple subscriptions: - -- Multiple subscriptions enable you to easily view billing for each subscription. They also allow you to limit who can access the Azure services associated with each subscription. For example, you could have production and non-production subscriptions, or internal and external subscriptions. -- Using multiple subscriptions can help you plan for [subscriptions limits](/azure/azure-resource-manager/management/azure-subscription-service-limits). - -For more information about the decision-making process, see [Subscription decision guide](../../../ready/landing-zone/design-area/resource-org-subscriptions.md). - -We also recommend that you consider ARM throttling limits when you make decisions about subscriptions. For more information, see [Throttling Resource Manager requests](/azure/azure-resource-manager/management/request-limits-and-throttling). - -## Use Azure resource naming and tagging conventions - -Implement a naming and tagging strategy that includes business and operational details as components of resource names and metadata tags. - -The business side of this strategy ensures that resource names and tags include the organizational information that you need to identify the associated teams. Include the business owners who are responsible for resource costs. 
The operational side ensures that names and tags include information that IT teams can use to identify the workload, application, environment, criticality, and other information that's useful for managing resources. - -Resources to name include VMs, load balancers, DNS labels, availability sets, virtual networks, subnets, Azure ExpressRoute, NSGs, application security groups, tags, route tables, managed disks, and public IPs. For example, you could label all development VMs with the tag `Dev`. Doing so makes it easier to pull billing reports and other reports for development VMs. For more information, see [Develop your naming and tagging strategy for Azure resources](../../../ready/azure-best-practices/naming-and-tagging.md). - -## Next steps - -The following articles provide guidance that you might find helpful at various points during your cloud adoption process. They can help you succeed in your cloud adoption scenario for HPC environments in the finance sector. - -- [Azure billing offers and Active Directory tenants for finance HPC](./azure-billing-active-directory-tenant.md) -- [Finance HPC Azure identity and access management](./identity-access-management.md) -- [Management for HPC in the finance sector](./management.md) -- [Network topology and connectivity for HPC in the finance sector](./network-topology-connectivity.md) -- [Platform automation and DevOps for HPC in the finance sector](./platform-automation-devops.md) -- [Governance for finance HPC](./security-governance-compliance.md) -- [Security for HPC in the finance sector](./security.md) -- [Storage for HPC in the finance sector](./storage.md) -- [Azure high-performance computing (HPC) landing zone accelerator](../azure-hpc-landing-zone-accelerator.md) +--- +title: Resource Organization for Azure HPC +description: This article provides naming and tagging recommendations to help you align HPC implementations with Cloud Adoption Framework methodologies. +author: Rajani-Janaki-Ram +ms.author: rajanaki +ms.topic: conceptual +ms.custom: think-tank +ms.date: 11/14/2024 +--- + +# Resource organization for Azure HPC + +This article provides recommendations that can help you align HPC implementations with the [Ready methodology](../../ready/index.md) of the Cloud Adoption Framework for Azure. + +## Single vs. multiple Azure subscriptions + +In Microsoft Entra ID, a tenant represents an organization. It's a dedicated instance of Microsoft Entra ID that an organization receives and owns when it signs up to use Azure. Each Microsoft Entra tenant is distinct and separate from other Microsoft Entra tenants. An Azure tenant can have multiple subscriptions, and each subscription can use the same Microsoft Entra instance. + +The needs of your organization determine whether you should use a single subscription or multiple subscriptions: + +- Multiple subscriptions enable you to easily view billing for each subscription. They also allow you to limit who can access the Azure services associated with each subscription. For example, you could have production and nonproduction subscriptions, or internal and external subscriptions. +- Using multiple subscriptions can help you plan for [subscription limits](/azure/azure-resource-manager/management/azure-subscription-service-limits). + +For more information about the decision-making process, see [Subscription decision guide](../../ready/landing-zone/design-area/resource-org-subscriptions.md). + +We also recommend that you consider Azure Resource Manager (ARM) throttling limits when you make decisions about subscriptions.
For more information, see [Throttling Resource Manager requests](/azure/azure-resource-manager/management/request-limits-and-throttling). + +## Use Azure resource naming and tagging conventions + +Implement a naming and tagging strategy that includes business and operational details as components of resource names and metadata tags. + +The business side of this strategy ensures that resource names and tags include the organizational information that you need to identify the associated teams. Include the business owners who are responsible for resource costs. The operational side ensures that names and tags include information that IT teams can use to identify the workload, application, environment, criticality, and other information that's useful for managing resources. + +Resources to name include VMs, load balancers, DNS labels, availability sets, virtual networks, subnets, Azure ExpressRoute, NSGs, application security groups, tags, route tables, managed disks, and public IPs. For example, you could label all development VMs with the tag `Dev`. Doing so makes it easier to pull billing reports and other reports for development VMs. For more information, see [Develop your naming and tagging strategy for Azure resources](../../ready/azure-best-practices/naming-and-tagging.md). + +## Next steps + +The following articles provide guidance that you might find helpful at various points during your cloud adoption process. They can help you succeed in your cloud adoption scenario for HPC environments. + +- [Azure billing offers and Microsoft Entra tenants](../../ready/landing-zone/design-area/azure-billing-microsoft-entra-tenant.md) +- [Identity and access management](identity-access-management.md) +- [Management](./manage.md) +- [HPC network topology and connectivity](./network-topology-connectivity.md) +- [Security](./secure.md) +- [Compute large-scale HPC application workloads in Azure VMs](./compute.md) +- [Storage](./storage.md) +- [HPC landing zone accelerator](../azure-hpc/azure-hpc-landing-zone-accelerator.md) \ No newline at end of file diff --git a/docs/scenarios/azure-hpc/storage.md b/docs/scenarios/azure-hpc/storage.md new file mode 100644 index 0000000000..73d771d8f5 --- /dev/null +++ b/docs/scenarios/azure-hpc/storage.md @@ -0,0 +1,201 @@ +--- +title: Storage for Azure HPC Workloads +description: Learn how to use Azure Storage for high-performance computing (HPC) workloads in energy environments, the finance sector, and for manufacturing. +author: Rajani-Janaki-Ram +ms.author: rajanaki +ms.topic: conceptual +ms.custom: think-tank +ms.date: 12/06/2024 +--- + +# Storage for Azure HPC workloads + +Storage access is a crucial factor to consider when you plan for high-performance computing (HPC) workload performance. Large-scale HPC workloads in certain environments can create demands for data storage and access that exceed the capabilities of traditional cloud file systems. This article provides recommendations to help you choose the correct storage for your Azure HPC workloads. It also provides recommendations about storage for HPC workloads in the energy, finance, and manufacturing industries. 
+ +Consider the following factors related to your application requirements to help decide which storage solution to use: + +- Latency +- Input/output operations per second (IOPS) +- Throughput +- File sizes and count +- Job runtime +- Cost +- Storage location (on-premises versus Azure) + +For more information, see [Understand factors that influence HPC storage selection in Azure](/training/modules/hpc-storage-considerations/). + +The following diagram shows a decision tree for choosing a specific HPC storage system. + +:::image type="content" source="media/storage-selection-flow.png" alt-text="Diagram that shows a decision tree of considerations when you choose a storage solution." lightbox="media/storage-selection-flow.png" border="false"::: + +## HPC considerations + +Oil and gas companies must be able to effectively manage and store exabytes of seismic data, well data, maps, and leases. To put this data to use, they need a high-performance infrastructure that can process and deliver real-time analytics to help optimize production, reduce environmental risks, and enhance operational safety. + +[Data storage](/azure/architecture/topics/high-performance-computing#storage) and access needs vary widely, depending on workload scale. Azure supports several approaches for managing the speed and capacity of HPC applications. + +Large-scale batch and HPC workloads in the energy industry have demands for data storage and access that exceed the capabilities of traditional cloud file systems. The high-performance input/output (I/O) requirements and massive scalability needs of [HPC](https://azure.microsoft.com/solutions/high-performance-computing/) introduce unique challenges for data storage and access. + +HPC solves complex problems like seismic and reservoir simulations and modeling, which aren't practical or cost-effective to handle with traditional computing techniques. HPC solves these problems through a combination of parallel processing and massive scalability to perform large and complicated computing tasks quickly, efficiently, and reliably. + +In Azure HPC clusters, compute nodes are virtual machines (VMs) that you can quickly create to perform jobs that are assigned to the cluster. These nodes distribute computation tasks across the cluster. This distribution helps achieve the high-performance parallel processing that's required to solve complex HPC problems. Compute nodes need to perform read and write operations on shared working storage when they run jobs. Nodes access this storage in a range of scenarios that lie between the following two extremes: + +- **One set of data to many compute nodes.** In this scenario, there's a single data source on the network that all the compute nodes access for working data. Though structurally simple, this arrangement limits I/O to the capacity of the single storage location. + +- **Many sets of data to many compute nodes.** In this scenario, there are many data sources on the network that the compute nodes access for working data. Spreading the I/O load in this way scales better, but it makes data placement and coordination more complex. + +## HPC design recommendations + +Choose the solution that's best suited for your unique I/O and capacity requirements. + +### Network File System + +Network File System (NFS) is often used to provide access to shared storage locations. A server VM that uses NFS shares its local file system. In Azure, this file system is stored on one or more virtual hard disks (VHDs) that are hosted in Azure Storage.
Clients can then mount the server's shared files and access the shared location directly. + +NFS is often used for home directories and project spaces that require access across all nodes. It can provide a space for research groups that share data. In general, throughput workloads are horizontally scalable with little dependency between individual tasks. Job schedulers divide the work across nodes and coordinate the activity. NFS, accessed over TCP/IP networks, is the typical shared storage across the nodes. + +NFS has the advantage of being easy to set up and maintain and is supported on both Linux and Windows operating systems. Multiple NFS servers can be used to spread storage across a network, but individual files are only accessible through a single server. + +For low-scale workloads, consider running NFS on the head node by using a [storage](/azure/virtual-machines/sizes-storage)-optimized VM that has large ephemeral disks or D-series VMs with Azure Premium Storage, depending on your requirements. This solution suits workloads that have 500 cores or fewer. + +In HPC scenarios, the file server often becomes a bottleneck that throttles overall performance. Attempts to access uncached data from a single NFS server at rates higher than the documented per-VM maximum IOPS and throughput result in throttling. + +In a scenario where dozens of clients attempt to work on data stored on a single NFS server, you can easily reach these limits. These limits can cause your entire application's performance to suffer. The closer your HPC application comes to a pure one-to-many scenario, the sooner you encounter these limitations. + +### Parallel file systems on Azure + +Parallel file systems distribute block-level storage across multiple networked storage nodes. File data is spread among these nodes, which means that it's also spread among multiple storage devices. This distribution pools individual storage I/O requests across multiple storage nodes that are accessible through a common namespace. + +Multiple storage devices and multiple paths to data are used to provide a high degree of parallelism. This approach reduces the number of bottlenecks imposed by accessing only a single node at a time. However, parallel I/O can be difficult to coordinate and optimize if you work directly at the level of the API or POSIX I/O interface. By introducing intermediate data access and coordination layers, parallel file systems provide application developers with a high-level interface between the application layer and the I/O layer. + +Message Passing Interface (MPI) workloads in the energy industry have unique requirements, including the need for low-latency communication between nodes. The nodes are connected via a high-speed interconnect and aren't easily adaptable for sharing with other workloads. MPI applications use the entire high-performance interconnect via pass-through mode in virtualized environments. Storage for MPI nodes is usually a parallel file system like [Lustre](https://www.lustre.org/) that's also accessed via the high-speed interconnect. Lustre and BeeGFS are typically used to handle the large throughput requirements of seismic processing. To a lesser extent, they're also used for reservoir simulation. + +Parallel file systems such as Lustre are used for HPC energy workloads that require access to large files, simultaneous access from multiple compute nodes, and massive amounts of data. The implementation of parallel file systems makes it easy to scale in terms of capability and performance.
These file systems take advantage of remote direct memory access transfers with large bandwidth and reduced CPU usage. The parallel file system is often used as scratch space and is intended for work that requires optimized I/O. Examples include workload setup, preprocessing, running, and post-processing. + +An orchestrated parallel file service, such as Azure Managed Lustre, works for 50,000 or more cores, with read/write rates up to 500 GBps and 2.5-PiB storage. For more information, see [Parallel virtual file systems on Microsoft Azure](https://techcommunity.microsoft.com/t5/azure-global/parallel-virtual-file-systems-on-microsoft-azure-part-1-overview/ba-p/306487). + +## HPC components + +- Azure NetApp Files and local disks are typically used to handle the more latency- and IOPS-sensitive workloads, like seismic interpretation, model preparation, and visualization. Consider using Azure NetApp Files for workloads of up to 4,000 cores, with throughput up to 6.5 GiBps, and workloads that benefit from or require multiprotocol NFS and Server Message Block (SMB) access to the same data set. + +- Managed Lustre provides faster and higher capacity storage for HPC workloads. This solution works for medium to large workloads and can support 50,000 or more cores, with throughput up to 500 GBps, and storage capacity up to 2.5 PiB. + +- Standard or Premium Azure Blob Storage is cost effective because it's the lowest-cost cloud offering. This service provides exabyte-scale, high-throughput, low-latency access where needed, a familiar file system interface, and multi-protocol access (REST, HDFS, NFS). You can use NFS v3.0 at the blob service endpoint for high-throughput and read-heavy workloads. You can optimize costs by moving to cooler storage tiers. This approach allows for lifecycle management based on the last update or access time and intelligent tiering with customizable policies. + +- Oil and gas energy workloads might require you to transfer large volumes of data between on-premises systems and the cloud. Offline migration uses device-based services like Azure Data Box. Online migration uses network-based services like Azure ExpressRoute. + +The following table provides a comparison of Blob Storage, Azure Files, Managed Lustre, and Azure NetApp Files. + +|Category |Blob Storage | Azure Files | Managed Lustre | Azure NetApp Files | +|--|--|--|--|--| +| Use cases | Best suited for large-scale read-heavy sequential access workloads where data is ingested once and modified minimally.

Low total cost of ownership, if there's light maintenance.

Some example scenarios include large scale analytical data, throughput sensitive high-performance computing, backup and archive, autonomous driving, media rendering, and genomic sequencing.| A highly available service that's best suited for random-access workloads.

For NFS shares, Azure Files provides full POSIX file system support. The built-in CSI driver allows you to easily use it from VM-based platforms and container platforms like Azure Container Instances and Azure Kubernetes Service (AKS).

Some example scenarios include shared files, databases, home directories, traditional applications, ERP, CMS, NAS migrations that don't require advanced management, and custom applications that require scale-out file storage. | Managed Lustre is a fully managed parallel file system that's best suited for medium to large HPC workloads.

Enables HPC applications in the cloud without breaking application compatibility by providing familiar Lustre parallel file system functionality, behaviors, and performance. This service helps secure long-term application investments. | A fully managed file service in the cloud, powered by NetApp, that has advanced management capabilities.

Azure NetApp Files is suited for workloads that require random access. It provides broad protocol support and improved data protection.

Some example scenarios include on-premises enterprise NAS migration that requires rich management capabilities, latency sensitive workloads like SAP HANA, latency-sensitive or IOPS intensive high-performance compute, or workloads that require simultaneous multi-protocol access. | +| Available protocols | NFS 3.0

REST

Azure Data Lake Storage | SMB

NFS 4.1

(No interoperability between the two protocols) | Lustre | NFS 3.0 and 4.1

SMB


| +| Key features | Integrated with Azure HPC Cache for low-latency workloads.

Integrated management, including lifecycle management, immutable blobs, data failover, and metadata index. | Zonally redundant for high availability.

Consistent single-digit millisecond latency.

Predictable performance and cost that scales with capacity. | High storage capacity up to 2.5 PiB.

Low latency, about 2 ms.

Create new clusters in minutes.

Supports containerized workloads with AKS. | Extremely low latency, as low as a submillisecond.

Rich NetApp ONTAP management capability, like SnapMirror Cloud.

Consistent hybrid cloud experience. | +| Performance (per volume) | As much as 20,000 IOPS. As much as 100 GiBps throughput. | As much as 100,000 IOPS. As much as 80 GiBps throughput. | As much as 100,000 IOPS. As much as 500 GiBps throughput. | As much as 460,000 IOPS. As much as 36 GiBps throughput. | +| Scale | As much as 2 PiB for a single volume.

As much as roughly 4.75 TiB for a single file.

No minimum capacity requirements. | As much as 100 TiB for a single volume.

As much as 4 TiB for a single file.

100-GiB minimum capacity. | As much as 2.5 PiB for a single volume.

As much as 32 PB for a single file.

4-TiB minimum capacity. | As much as 100 TiB for a single volume.

As much as 16 TiB for a single file.

Consistent hybrid cloud experience. | +| Pricing | [Blob Storage pricing](https://azure.microsoft.com/pricing/details/storage/blobs) | [Azure Files pricing](https://azure.microsoft.com/pricing/details/storage/files) | [Managed Lustre pricing](https://azure.microsoft.com/pricing/details/managed-lustre) | [Azure NetApp Files pricing](https://azure.microsoft.com/pricing/details/netapp) | + +## Finance design recommendations + +- Use [Standard or Premium Blob Storage](/azure/storage/blobs/storage-blobs-introduction) for high-throughput, low-latency storage. It provides the following benefits: + + - It provides exabyte-scale, high-throughput, low-latency access, a familiar file system, and multi-protocol access, including REST, HDFS, NFS. + + - It's cost effective. + + - You can mount Blob Storage as a file system by using [BlobFuse](/azure/storage/blobs/storage-how-to-mount-container-linux). Doing so makes it easy to allow multiple nodes to mount the same container for read-only scenarios. + + - It supports NFS 3.0 at the blob service endpoint for high-throughput, read-heavy workloads. + + - You can optimize costs by moving data to cooler storage tiers. This optimization is possible through lifecycle management that's based on the last update or access time and intelligent tiering with customizable policies. + +- Use [Azure NetApp Files](/azure/azure-netapp-files) for ReadWriteMany (unique) or write-once, read-once applications. It provides the following benefits: + + - A wide range of file protocols, such as NFSv3, NFSv4.1, and SMB3 + + - Performance that's comparable with on-premises performance, with multiple tiers (Ultra, Premium, Standard) + + - Deploys in minutes and provides a wide range of tiers and flexibility + + - Flexible capacity pool types and performance, where the QoS per volume is automatically assigned based on the tier of the pool and the volume quota + +## Manufacturing considerations + +It's important to ensure that the required data reaches the HPC cluster machines at the right time. You also want to make sure that results from those individual machines are quickly saved and available for further analysis. + +### Distribution of workload traffic + +Consider the types of traffic that your HPC environment generates and processes. This step is especially important if you plan to run multiple types of workloads and plan to use the storage for other purposes. Consider and record the following traffic types: + +- Single stream versus multiple streams +- Ratio of read traffic to write traffic +- Average file sizes and counts +- Random versus sequential access patterns + +### Data locality + +This category accounts for the location of the data. Locality awareness helps you determine whether you can use copying, caching, or synchronization as your data-movement strategy. Check the following locality items in advance: + +- If source data is on-premises, in Azure, or both +- If results data is on-premises, in Azure, or both +- If HPC workloads in Azure need to be coordinated with source-data modification timelines +- If sensitive or Health Insurance Portability and Accountability Act data is included + +### Performance requirements + +Performance requirements for storage solutions are typically summarized as follows: + +- Single-stream throughput +- Multi-stream throughput +- Expected maximum IOPS +- Average latency + +Every factor affects performance, so these numbers serve as a guide for the expected results of a specific solution. 
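+The tiering recommendation above can be automated. The following is a minimal sketch, not part of the original guidance: it assumes the `azure-identity` and `azure-mgmt-storage` Python packages, placeholder resource names (`hpc-storage-rg`, `hpcresultsdata`), and a hypothetical `results/` prefix for job output. It tiers matching blobs to cool after 30 days without modification and deletes them after a year.
+
+```python
+# pip install azure-identity azure-mgmt-storage
+from azure.identity import DefaultAzureCredential
+from azure.mgmt.storage import StorageManagementClient
+from azure.mgmt.storage.models import (
+    DateAfterModification,
+    ManagementPolicy,
+    ManagementPolicyAction,
+    ManagementPolicyBaseBlob,
+    ManagementPolicyDefinition,
+    ManagementPolicyFilter,
+    ManagementPolicyRule,
+    ManagementPolicySchema,
+)
+
+# Placeholder values: substitute your own subscription, resource group, and account.
+SUBSCRIPTION_ID = "<subscription-id>"
+RESOURCE_GROUP = "hpc-storage-rg"
+STORAGE_ACCOUNT = "hpcresultsdata"
+
+client = StorageManagementClient(DefaultAzureCredential(), SUBSCRIPTION_ID)
+
+# Tier job output to cool after 30 days without modification; delete after a year.
+rule = ManagementPolicyRule(
+    name="tier-hpc-results",
+    type="Lifecycle",
+    enabled=True,
+    definition=ManagementPolicyDefinition(
+        filters=ManagementPolicyFilter(
+            blob_types=["blockBlob"],
+            prefix_match=["results/"],  # hypothetical prefix for HPC job output
+        ),
+        actions=ManagementPolicyAction(
+            base_blob=ManagementPolicyBaseBlob(
+                tier_to_cool=DateAfterModification(days_after_modification_greater_than=30),
+                delete=DateAfterModification(days_after_modification_greater_than=365),
+            )
+        ),
+    ),
+)
+
+client.management_policies.create_or_update(
+    RESOURCE_GROUP,
+    STORAGE_ACCOUNT,
+    "default",  # the lifecycle policy is a singleton named "default"
+    ManagementPolicy(policy=ManagementPolicySchema(rules=[rule])),
+)
+```
+
+Exact model names and parameters can vary between `azure-mgmt-storage` versions, so verify this sketch against the SDK version that you use.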
+## Manufacturing considerations
+
+It's important to ensure that the required data reaches the HPC cluster machines at the right time. You also want to make sure that results from those individual machines are quickly saved and made available for further analysis.
+
+### Distribution of workload traffic
+
+Consider the types of traffic that your HPC environment generates and processes. This step is especially important if you plan to run multiple types of workloads and plan to use the storage for other purposes. Consider and record the following traffic types:
+
+- Single stream versus multiple streams
+- Ratio of read traffic to write traffic
+- Average file sizes and counts
+- Random versus sequential access patterns
+
+### Data locality
+
+This category accounts for the location of the data. Locality awareness helps you determine whether you can use copying, caching, or synchronization as your data-movement strategy. Check the following locality items in advance:
+
+- Whether source data is on-premises, in Azure, or both
+- Whether results data is on-premises, in Azure, or both
+- Whether HPC workloads in Azure need to be coordinated with source-data modification timelines
+- Whether sensitive or Health Insurance Portability and Accountability Act (HIPAA) data is included
+
+### Performance requirements
+
+Performance requirements for storage solutions are typically summarized as follows:
+
+- Single-stream throughput
+- Multi-stream throughput
+- Expected maximum IOPS
+- Average latency
+
+Every factor affects performance, so these numbers serve only as a guide for the expected results of a specific solution. For instance, an HPC workload might include extensive file creation and deletion as part of the workflow, and those operations can affect the overall throughput. To baseline these numbers against a candidate file system, see the sketch at the end of this section.
+
+### Access methods
+
+Account for the client access protocol that's required, and be clear about which features of the protocol you need. There are different versions of NFS and SMB. Consider the following requirements:
+
+- The NFS or SMB versions that are required
+- Expected protocol features, such as access control lists or encryption
+- Whether a parallel file system solution is required
+
+### Total capacity requirement
+
+Storage capacity in Azure is the next consideration. It helps to inform the overall cost of the solution. If you plan to store a large amount of data for a long time, you might want to consider tiering as part of the storage solution. Tiering combines lower-cost storage options with higher-cost, higher-performance storage in a hot tier. Consider the following capacity requirements:
+
+- Total capacity required
+- Total hot-tier capacity required
+- Total warm-tier capacity required
+- Total cold-tier capacity required
+
+### Authentication and authorization method
+
+For authentication and authorization requirements, such as using an LDAP server or Windows Server Active Directory, make sure to include the necessary supporting systems in your architecture. If you need to support capabilities like UID or GID mapping to Windows Server Active Directory users, confirm that the storage solution supports that capability.
+
+Consider the following authentication and authorization methods:
+
+- Local (UID or GID on the file server only)
+- Directory (LDAP or Windows Server Active Directory)
+- Whether UID or GID mapping to Windows Server Active Directory users is required
+
+### Roll-your-own parallel file system
+
+Similar to NFS, you can create a multi-node BeeGFS or Lustre file system. The performance of these systems depends mostly on the type of VMs that you choose. You can use images found in Azure Marketplace for [BeeGFS](https://azuremarketplace.microsoft.com/marketplace/apps/beegfs.beegfs-ubuntu-free) or a Lustre implementation by DDN called [Whamcloud](https://whamcloud.com/). If you use non-Microsoft images from vendors such as [BeeGFS](https://www.beegfs.io/content/) or DDN, you can purchase their support services. You can use BeeGFS and Lustre under their GPL licenses without extra charges, aside from the costs for machines and disks. These tools are easy to roll out by using [Azure HPC scripts](https://github.com/Azure/azurehpc/tree/master/examples) with either ephemeral local disks for scratch space or Azure Premium SSD or Azure Ultra Disk Storage for persistent storage.
+
+### Cray ClusterStor
+
+For larger workloads, it's a challenge to replicate the *bare-metal* performance of large compute clusters that are paired with large Lustre environments. Other challenges include achieving high throughput in terms of TBps and handling petabytes of storage. You can now run these workloads by using the Cray ClusterStor in Azure solution, a pure bare-metal Lustre deployment that's placed in the relevant Azure datacenter. Parallel file systems such as BeeGFS and Lustre provide the highest performance because of their architecture, but that architecture and the use of these technologies come with a high management cost.
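+Whichever storage option you choose, validate it against the performance requirements that you recorded earlier. The following sketch, which uses only the Python standard library, measures a rough single-stream sequential-write baseline for a mounted file system; the `/mnt/scratch` path is a hypothetical example. For rigorous benchmarking, including multi-stream throughput, IOPS, and latency, use a dedicated tool such as fio.
+
+```python
+import os
+import time
+
+MOUNT_POINT = "/mnt/scratch"   # hypothetical mount path: substitute your file system
+FILE_SIZE_GIB = 4              # large enough to exceed client-side caches
+BLOCK_SIZE = 4 * 1024 * 1024   # 4-MiB blocks, written sequentially
+
+
+def single_stream_write_gibps(path: str) -> float:
+    """Write FILE_SIZE_GIB of data sequentially and return throughput in GiB/s."""
+    block = os.urandom(BLOCK_SIZE)
+    block_count = (FILE_SIZE_GIB * 1024**3) // BLOCK_SIZE
+    test_file = os.path.join(path, "throughput-test.bin")
+    start = time.perf_counter()
+    with open(test_file, "wb", buffering=0) as f:
+        for _ in range(block_count):
+            f.write(block)
+        os.fsync(f.fileno())   # include the time to commit data to storage
+    elapsed = time.perf_counter() - start
+    os.remove(test_file)
+    return FILE_SIZE_GIB / elapsed
+
+
+if __name__ == "__main__":
+    print(f"Single-stream sequential write: {single_stream_write_gibps(MOUNT_POINT):.2f} GiB/s")
+```
+
+Run the sketch from a compute node in the same virtual network as the storage, and size the test file to exceed any client-side cache so that the result reflects the file system rather than local memory.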
+## Next steps
+
+The following articles provide guidance to help you at various points during your cloud adoption journey.
+
+- [Introduction to the Azure high-performance computing (HPC) scenario](/azure/cloud-adoption-framework/scenarios/azure-hpc/)
+- [Identity and access management for Azure HPC](/azure/cloud-adoption-framework/scenarios/azure-hpc/identity-access-management)
+- [Network topology and connectivity for Azure HPC in energy](/azure/cloud-adoption-framework/scenarios/azure-hpc/network-topology-connectivity)
+- [Resource organization for HPC in the energy industry](/azure/cloud-adoption-framework/scenarios/azure-hpc/resource-organization)
+- [Compute large-scale HPC application workloads in Azure VMs](/azure/cloud-adoption-framework/scenarios/azure-hpc/compute)
+- [Azure high-performance computing (HPC) landing zone accelerator](/azure/cloud-adoption-framework/scenarios/azure-hpc/azure-hpc-landing-zone-accelerator)
diff --git a/docs/scenarios/azure-hpc/toc.yml b/docs/scenarios/azure-hpc/toc.yml
index 5689bf513f..562b2bcb14 100644
--- a/docs/scenarios/azure-hpc/toc.yml
+++ b/docs/scenarios/azure-hpc/toc.yml
@@ -9,85 +9,22 @@ items:
   items:
   - name: Azure landing zone for HPC
     href: ready.md
-  - name: Energy
+  - name: Design areas
     items:
-    - name: Design areas
-      items:
-      - name: Billing and Active Directory tenant
-        href: energy/azure-billing-active-directory-tenant.md
-      - name: Identity and access management
-        href: energy/identity-access-management.md
-      - name: Network topology and connectivity
-        href: energy/network-topology-connectivity.md
-      - name: Compute
-        href: energy/compute.md
-      - name: Storage
-        href: energy/storage.md
-      - name: Resource organization
-        href: energy/resource-organization.md
-      - name: Security
-        href: energy/security.md
-      - name: Management
-        href: energy/management.md
-      - name: Governance
-        href: energy/security-governance-compliance.md
-      - name: Platform automation and DevOps
-        href: energy/platform-automation-devops.md
-  - name: Finance
-    items:
-    - name: Design areas
-      items:
-      - name: Billing and Active Directory tenant
-        href: finance/azure-billing-active-directory-tenant.md
-      - name: Identity and access management
-        href: finance/identity-access-management.md
-      - name: Network topology and connectivity
-        href: finance/network-topology-connectivity.md
-      - name: Compute
-        href: finance/compute.md
-      - name: Storage
-        href: finance/storage.md
-      - name: Resource organization
-        href: finance/resource-organization.md
-      - name: Security
-        href: finance/security.md
-      - name: Management
-        href: finance/management.md
-      - name: Governance
-        href: finance/security-governance-compliance.md
-      - name: Platform automation and DevOps
-        href: finance/platform-automation-devops.md
-  - name: Manufacturing
-    items:
-    - name: Design areas
-      items:
-      - name: Billing and Active Directory tenant
-        href: manufacturing/azure-billing-active-directory-tenant.md
-      - name: Identity and access management
-        href: manufacturing/identity-access-management.md
-      - name: Network topology and connectivity
-        href: manufacturing/network-topology-connectivity.md
-      - name: Compute
-        href: manufacturing/compute.md
-      - name: Storage
-        href: manufacturing/storage.md
-      - name: Resource organization
-        href: manufacturing/resource-organization.md
-      - name: Security
-        href: manufacturing/security.md
-      - name: Management
-        href: manufacturing/management.md
-      - name: Governance
-        href: manufacturing/security-governance-compliance.md
-      - name: Platform automation and DevOps
-        href: manufacturing/platform-automation-devops.md
+    - name: Identity and access management
+      href: identity-access-management.md
+    - name: Network topology and connectivity
+      href: network-topology-connectivity.md
+    - name: Compute
+      href: compute.md
+    - name: Storage
+      href: storage.md
+    - name: Resource organization
+      href: resource-organization.md
   - name: Implementation options
     items:
     - name: HPC landing zone accelerator
       href: azure-hpc-landing-zone-accelerator.md
-    - name: Compute manufacturing architecture
-      href: /azure/architecture/industries/manufacturing/compute-manufacturing-overview
-      maintainContext: true
     - name: Condition monitoring architecture
       href: /azure/architecture/solution-ideas/articles/condition-monitoring
       maintainContext: true
@@ -114,12 +51,6 @@ items:
   items:
   - name: Overview
     href: /azure/architecture/topics/high-performance-computing#example-hpc-architectures
-  - name: Energy example architecture
-    href: energy/compute.md#hpc-reference-architecture
-  - name: Finance example architecture
-    href: azure-hpc-landing-zone-accelerator.md#example-conceptual-reference-architecture-for-finance
-  - name: Manufacturing example architecture
-    href: azure-hpc-landing-zone-accelerator.md#example-conceptual-reference-architecture-for-manufacturing
   - name: 3D video rendering
     href: /azure/architecture/example-scenario/infrastructure/video-rendering
     maintainContext: true