Azure · j82w · Jun 2, 2020 · Feb 28, 2020 · Feb 28, 2020 · Mar 17, 2020
diff --git a/Microsoft.Azure.Cosmos/src/Resource/CosmosExceptions/CosmosException.cs b/Microsoft.Azure.Cosmos/src/Resource/CosmosExceptions/CosmosException.cs
@@ -242,4 +242,4 @@ private string ToStringHelper(
             return stringBuilder.ToString();
         }
     }
-}
+}
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs
@@ -1304,7 +1304,8 @@ public async Task ItemReplaceAsyncTest()
                 partitionKey: new Cosmos.PartitionKey(originalStatus),
                 item: testItem);
                 Assert.Fail("Replace changing partition key is not supported.");
-            }catch(CosmosException ce)
+            }
+            catch (CosmosException ce)
             {
                 Assert.AreEqual((HttpStatusCode)400, ce.StatusCode);
             }

diff --git a/TroubleshootingGuides/CosmosMacSignature.md b/TroubleshootingGuides/CosmosMacSignature.md
@@ -0,0 +1,32 @@
+## CosmosUnauthorized
+
+|   |   |   |
+|---|---|---|
+|TypeName|CosmosUnauthorized|
+|Status|401_0000|
+|Category|Service|
+
+## Description
+HTTP 401: The MAC signature found in the HTTP request is not the same as the computed signature.
+If you received the following 401 error message: "The MAC signature found in the HTTP request is not the same as the computed signature.", it can be caused by the following scenarios.
+
+## Troubleshooting steps
+
+### 1. Key was not properly rotated.
+
+    Symptom: 401 MAC signature is seen shortly after a key rotation and eventually stops without any changes. 
+
+    Cause: The key was rotated and did not follow the [best practices](secure-access-to-data.md#key-rotation). This is usually the case. Cosmos DB account key rotation can take anywhere from a few seconds to possibly days depending on the Cosmos DB account size.
+
+### 2. The key is misconfigured
+
+    Symptoms: The 401 MAC signature issue is consistent and happens for all calls using that key.
+
+    Cause: The key is misconfigured on the application, so the key does not match the account or the entire key was not copied.
+
+
+### 3. Race condition with create container
+
+    Symptoms: The 401 MAC signature issue is seen shortly after a container creation, and only occurs until the container creation is completed.
+
+    Cause: There is a race condition with container creation. An application instance is trying to access the container before container creation is complete. The most common scenario for this is when the container is deleted and recreated with the same name while the application is running. The SDK will attempt to use the new container, but the container creation is still in progress so it does not have the keys.
diff --git a/TroubleshootingGuides/CosmosNotFound.md b/TroubleshootingGuides/CosmosNotFound.md
@@ -0,0 +1,42 @@
+## CosmosNotFound
+
+|   |   |   |
+|---|---|---|
+|TypeName|CosmosNotFound|
+|Status|404_0000|
+|Category|Service|
+
+## Description
+
+This status code represents that the resource no longer exists. 
+
+## Known causes
+
+The document does exist, but the request still returns a 404.
+
+### 1. Race condition
+    Cause: There are multiple SDK client instances and the read happened before the write.
+
+    Fix:
+    1. For session consistency the create item will return a session token that can be passed between SDK instances to guarantee that the read request is reading from a replica with that change.
+    2. Change the [consistency level](https://docs.microsoft.com/azure/cosmos-db/consistency-levels-choosing) to a [stronger level](https://docs.microsoft.com/azure/cosmos-db/consistency-levels-tradeoffs)
+
+### 2. Invalid Partition Key and ID combination
+    Cause: The partition key and id combination are not valid.
+
+    Fix: Fix the application logic that is causing the incorrect combination. 
+
+### 3. TTL purge
+    Cause: The item had the [Time To Live (TTL)](https://docs.microsoft.com/azure/cosmos-db/time-to-live) property set. The item was purged because the time to live had expired.
+
+    Fix: Change the Time To Live to prevent the item from getting purged.
+
+### 4. Lazy indexing
+    Cause: The [lazy indexing](https://docs.microsoft.com/azure/cosmos-db/index-policy#indexing-mode) has not caught up.
+
+    Fix: Wait for the indexing to catch up or change the indexing policy.
+
+### 5. Parent resource deleted
+    Cause: The database and/or container that the item exists in has been deleted.
+
+    Fix: [Restore](https://docs.microsoft.com/azure/cosmos-db/online-backup-and-restore#backup-retention-period) the parent resource or recreate the resources.
diff --git a/TroubleshootingGuides/CosmosNotModified.md b/TroubleshootingGuides/CosmosNotModified.md
@@ -0,0 +1,14 @@
+## CosmosNotModified
+
+|   |   |   |
+|---|---|---|
+|TypeName|CosmosNotModified|
+|Status|304_0000|
+|Category|Service|
+
+## Description
+
+This status code in change feed simply means there are no new items to process. This is expected and the SDK is designed to handle it.
+
+## Related documentation
+* [Change feed overview](https://docs.microsoft.com/azure/cosmos-db/change-feed)
diff --git a/TroubleshootingGuides/CosmosRequestHeaderTooLarge.md b/TroubleshootingGuides/CosmosRequestHeaderTooLarge.md
@@ -0,0 +1,33 @@
+## CosmosRequestHeaderTooLarge
+
+|   |   |   |
+|---|---|---|
+|TypeName|CosmosRequestHeaderTooLarge|
+|Status|400_0000|
+|Category|Service|
+
+## Description
+The size of the header has grown too large and is exceeding the maximum allowed size. It's always recommended to use the latest SDK. Make sure to use at least version 3.x or 2.x, which adds header size tracing to the exception message.
+
+## Troubleshooting steps
+
+### 1. Session Token too large
+    Symptoms: The 400 bad request is happening on point operations where the continuation token is not being used. The exception started without making any changes to the application.
+
+    Cause: The session token grows as the number of partitions increases in the container. The number of partitions increases as the amount of data increases or if the throughput is increased.
+
+    Temporary mitigation: Restarting the application will reset all the session tokens. The session token will eventually grow back to the previous size that causes the issue.
+
+    Fixes:
+    1. Follow the performance tips and convert the application to Direct + TCP connection mode. Direct + TCP does not have the header size restriction like HTTP does which avoids this issue. Make sure to use SDK version greater than 2.9.3 which has a fix for query operations when the service interop is not available.
+    2. If the application cannot be converted to Direct + TCP and the session token is the cause, then mitigation can be done by changing the client consistency level. The session token is only used for session consistency which is the default for Cosmos DB. Any other consistency level will not use the session token.
+
+
+### 2. Continuation token too large
+    Symptoms: The 400 bad request is happening on query operations where the continuation token is being passed in.
+
+    Cause: The continuation token has grown too large. Different queries will have different continuation token sizes.
+
+    Fixes:
+     1. Follow the performance tips and convert the application to Direct + TCP connection mode. Direct + TCP does not have the header size restriction like HTTP does which avoids this issue.
+     2. If the application cannot be converted to Direct + TCP and the continuation token is the cause, then try setting the ResponseContinuationTokenLimitInKb option. The option can be found in the FeedOptions for v2 or the QueryRequestOptions in v3.
diff --git a/TroubleshootingGuides/CosmosRequestRateTooLarge.md b/TroubleshootingGuides/CosmosRequestRateTooLarge.md
@@ -0,0 +1,19 @@
+## CosmosRequestRateTooLarge
+
+|   |   |   |
+|---|---|---|
+|TypeName|CosmosRequestRateTooLarge|
+|Status|429_0000|
+|Category|Service|
+
+## Issue
+
+'Request rate too large' or error code 429 indicates that your requests are being throttled, because the consumed throughput (RU/s) has exceeded the [provisioned throughput](https://docs.microsoft.com/azure/cosmos-db/set-throughput). The SDK will automatically retry requests based on the specified retry policy. If you get this failure often, consider increasing the throughput on the collection. Check the portal's metrics to see if you are getting 429 errors. Review your partition key to ensure it results in an [even distribution of storage and request volume](https://docs.microsoft.com/azure/cosmos-db/partition-data).
+
+## Solution
+
+Use the portal or the SDK to increase the provisioned throughput.
+
+## Related documentation
+* [Provision throughput on containers and databases](https://docs.microsoft.com/azure/cosmos-db/set-throughput)
+* [Request units in Azure Cosmos DB](https://docs.microsoft.com/azure/cosmos-db/request-units)
diff --git a/TroubleshootingGuides/CosmosRequestTimeoutClient.md b/TroubleshootingGuides/CosmosRequestTimeoutClient.md
@@ -0,0 +1,45 @@
+## CosmosRequestTimeoutClient
+
+|   |   |   |
+|---|---|---|
+|TypeName|CosmosRequestTimeoutClient|
+|Status|408_0000|
+|Category|Connectivity|
+
+
+## Issue
+
+The SDK was not able to connect to the Azure Cosmos DB service.
+
+## Troubleshooting steps
+These are the known causes for this issue.
+
+### 1. High CPU utilization (most common case)
+    Cause: For optimal latency it is recommended that CPU usage should be roughly 40%. It is recommended to look at CPU utilization at 10 second intervals. If the interval is larger then CPU spikes can be missed by getting averaged in with lower values. This is more common with cross partition queries where it might do multiple connections for a single request.
+
+    Fix: The application should be scaled up/out.
+
+### 2. Socket / Port availability might be low
+    Cause: When running in Azure, clients using the .NET SDK can hit Azure SNAT (PAT) port exhaustion.
+
+    Fix: Follow the CosmosSNATPortExhaustion guide.
+
+### 3. Creating multiple Client instances
+    Cause: This might lead to connection contention and timeout issues.
+
+    Fix: Follow the [performance tips](https://docs.microsoft.com/azure/cosmos-db/performance-tips), and use a single CosmosClient instance across an entire process.
+
+### 4. Hot partition key
+    Cause: Azure Cosmos DB distributes the overall provisioned throughput evenly across physical partitions. One partition is having all of its resources consumed while other partitions go unused. Check portal metrics to see if the workload is encountering a hot [partition key](https://docs.microsoft.com/azure/cosmos-db/partition-data). This will cause the aggregate consumed throughput (RU/s) to appear to be under the provisioned RUs, but a single partition's consumed throughput (RU/s) will exceed the provisioned throughput.
+
+    Fix: The partition key should be changed to avoid the heavily used value.
+
+### 5. High degree of concurrency
+    Cause: The application is doing a high level of concurrency, which can lead to contention on the channel.
+
+    Fix: Try to scale the application up/out.
+
+### 6. Large requests and/or responses
+    Cause: Large requests or responses can lead to head-of-line blocking on the channel and exacerbate contention, even with a relatively low degree of concurrency.
+
+    Fix: Try to scale the application up/out.
diff --git a/TroubleshootingGuides/CosmosRequestTimeoutService.md b/TroubleshootingGuides/CosmosRequestTimeoutService.md
@@ -0,0 +1,22 @@
+## CosmosRequestTimeoutService
+
+|   |   |   |
+|---|---|---|
+|TypeName|CosmosRequestTimeoutService|
+|Status|408_0000|
+|Category|Service|
+
+## Issue
+
+The SDK was able to connect to the Azure Cosmos DB service, but the request timed out.
+
+## Troubleshooting steps
+
+### 1. Check the portal metrics
+    Use the [Azure monitoring](https://docs.microsoft.com/azure/cosmos-db/monitor-cosmos-db) to check if the 408 request timeout was from the service.
+
+### 2. Failure rate is within Cosmos DB SLA
+    The application should be able to handle transient failures and retry when necessary.
+
+### 3. Failure rate is violating the Cosmos DB SLA
+    Please contact Azure support.
diff --git a/TroubleshootingGuides/CosmosSNATPortExhaustion.md b/TroubleshootingGuides/CosmosSNATPortExhaustion.md
@@ -0,0 +1,25 @@
+## CosmosSNATPortExhaustion
+
+|   |   |   |
+|---|---|---|
+|TypeName|CosmosSNATPortExhaustion|
+|Status|503_0000|
+|Category|Connectivity|
+
+## Issue
+
+If your app is deployed on Azure Virtual Machines without a public IP address, by default [Azure SNAT ports](https://docs.microsoft.com/azure/load-balancer/load-balancer-outbound-connections#preallocatedports) establish connections to any endpoint outside of your VM. The number of connections allowed from the VM to the Azure Cosmos DB endpoint is limited by the [Azure SNAT configuration](https://docs.microsoft.com/azure/load-balancer/load-balancer-outbound-connections#preallocatedports).
+
+Azure SNAT ports are used only when your VM has a private IP address and a process from the VM tries to connect to a public IP address.
+
+## Troubleshooting steps
+
+There are two workarounds to avoid the Azure SNAT limitation:
+
+* Add your Azure Cosmos DB service endpoint to the subnet of your Azure Virtual Machines virtual network. For more information, see [Azure Virtual Network service endpoints](https://docs.microsoft.com/azure/virtual-network/virtual-network-service-endpoints-overview). 
+
+    When the service endpoint is enabled, the requests are no longer sent from a public IP to Azure Cosmos DB. Instead, the virtual network and subnet identity are sent. This change might result in firewall drops if only public IPs are allowed. If you use a firewall, when you enable the service endpoint, add a subnet to the firewall by using [Virtual Network ACLs](https://docs.microsoft.com/azure/virtual-network/virtual-networks-acl).
+* Assign a public IP to your Azure VM.
+
+## Related documentation
+* [Diagnose and troubleshoot issues when using Azure Cosmos DB .NET SDK](https://docs.microsoft.com/azure/cosmos-db/troubleshoot-dot-net-sdk)
-Original file line number
+Diff line change
@@ Expand Up / @@ -242,4 +242,4 @@ private string ToStringHelper( @@
                 return stringBuilder.ToString();
             }
         }
-    }
+    }