From a440bfad0a47d58eb7a81b5f20c243fb35b3c6a5 Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Thu, 25 Jul 2024 17:20:58 -0600
Subject: [PATCH 1/5] Add link to flood-stage watermark exception message

---
 .../disk-usage-exceeded.asciidoc              | 73 +++++++++++++------
 .../geoip/EnterpriseGeoIpDownloaderTests.java |  5 +-
 .../ingest/geoip/GeoIpDownloaderTests.java    |  5 +-
 .../indices/delete/DeleteIndexBlocksIT.java   |  5 +-
 .../cluster/metadata/IndexMetadata.java       |  4 +-
 .../elasticsearch/common/ReferenceDocs.java   |  1 +
 .../common/reference-docs-links.json          |  3 +-
 7 files changed, 67 insertions(+), 29 deletions(-)
diff --git a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
index 2b3fcc1b6df9f..83e3092d62d7a 100644
--- a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
@@ -4,31 +4,40 @@
 ++++
 <titleabbrev>Watermark errors</titleabbrev>
 ++++
-:keywords: {es}, high watermark, low watermark, full disk
+:keywords: {es}, high watermark, low watermark, full disk, flood stage watermark
 
 When a data node is critically low on disk space and has reached the
 <<cluster-routing-flood-stage,flood-stage disk usage watermark>>, the following
-error is logged: `Error: disk usage exceeded flood-stage watermark, index has read-only-allow-delete block`. 
+error is logged: `Error: disk usage exceeded flood-stage watermark, index has 
+read-only-allow-delete block`. 
 
-To prevent a full disk, when a node reaches this watermark, {es} blocks writes
+To prevent a full disk, when a node reaches this watermark, {es} <<index-block-settings,blocks writes>>
 to any index with a shard on the node. If the block affects related system
-indices, {kib} and other {stack} features may become unavailable.
+indices, {kib} and other {stack} features may become unavailable. For example, 
+this could induce {kib}'s `Kibana Server is not Ready yet` 
+{kibana-ref}/access.html#not-ready[error message]. 
 
 {es} will automatically remove the write block when the affected node's disk
-usage goes below the <<cluster-routing-watermark-high,high disk watermark>>. To
-achieve this, {es} automatically moves some of the affected node's shards to
-other nodes in the same data tier.
+usage falls below the <<cluster-routing-watermark-high,high disk watermark>>. 
+To achieve this, {es} attempts to rebalances some of the affected node's shards 
+to other nodes in the same data tier. 
 
-To verify that shards are moving off the affected node, use the <<cat-shards,cat
-shards API>>.
+[[fix-watermark-errors-rebalance]]
+==== Monitor rebalancing
+
+To verify that shards are moving off the affected node until it falls below the
+high watermark, use the <<cat-shards,cat shards API>> and <<cat-recovery,cat recovery API>>:
 
 [source,console]
 ----
 GET _cat/shards?v=true
+
+GET _cat/recovery?v=true&active_only=true
 ----
 
-If shards remain on the node, use the <<cluster-allocation-explain,cluster
-allocation explanation API>> to get an explanation for their allocation status.
+If shards remain on the node, keeping it above the high watermark, use the
+<<cluster-allocation-explain,cluster allocation explanation API>> to get an 
+explanation for their allocation status.
 
 [source,console]
 ----
@@ -44,8 +53,12 @@ GET _cluster/allocation/explain
 // TEST[s/"primary": false,/"primary": false/]
 // TEST[s/"current_node": "my-node"//]
 
-To immediately restore write operations, you can temporarily increase the disk
-watermarks and remove the write block.
+[[fix-watermark-errors-temporary]]
+==== Temporary relief
+
+To immediately restore write operations, you can temporarily increase the 
+<<disk-based-shard-allocation,disk watermarks>> and remove the 
+<<index-block-settings,write block>>.
 
 [source,console]
 ----
@@ -67,21 +80,15 @@ PUT */_settings?expand_wildcards=all
 {
   "index.blocks.read_only_allow_delete": null
 }
-----
-// TEST[s/^/PUT my-index\n/]
-
-As a long-term solution, we recommend you add nodes to the affected data tiers
-or upgrade existing nodes to increase disk space. To free up additional disk
-space, you can delete unneeded indices using the <<indices-delete-index,delete
-index API>>.
 
-[source,console]
-----
-DELETE my-index
+PUT .kibana*/_settings?expand_wildcards=all
+{
+  "index.blocks.write": null
+}
 ----
 // TEST[s/^/PUT my-index\n/]
 
-When a long-term solution is in place, reset or reconfigure the disk watermarks.
+When a long-term solution is in place, reset or reconfigure the disk watermarks:
 
 [source,console]
 ----
@@ -99,3 +106,21 @@ PUT _cluster/settings
   }
 }
 ----
+
+[[fix-watermark-errors-resolve]]
+==== Resolve
+
+As a long-term solution, we recommend you choose whichever of the following
+options best suits your use case:
+
+* add nodes to the affected <<data-tiers,data tiers>>
+
+* upgrade existing nodes to increase disk space
++
+TIP: On {ess}, https://support.elastic.co[Elastic Support] intervention may 
+become necessary if <<cluster-health,cluster health>> reaches `status:red`. 
+
+* delete unneeded indices using the <<indices-delete-index,delete index API>>
+
+* update related <<index-lifecycle-management,ILM policy>> to push indices 
+through to later <<data-tiers,data tiers>>
diff --git a/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderTests.java b/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderTests.java
index 58cb566165db2..203ecaea72c0e 100644
--- a/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderTests.java
+++ b/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/EnterpriseGeoIpDownloaderTests.java
@@ -27,6 +27,7 @@
 import org.elasticsearch.cluster.block.ClusterBlocks;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.hash.MessageDigests;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Settings;
@@ -476,7 +477,9 @@ public void testUpdateDatabasesWriteBlock() {
                 "index ["
                     + geoIpIndex
                     + "] blocked by: [TOO_MANY_REQUESTS/12/disk usage exceeded flood-stage watermark, "
-                    + "index has read-only-allow-delete block];"
+                    + "index has read-only-allow-delete block; for more information, see "
+                    + ReferenceDocs.FLOOD_STAGE_WATERMARK
+                    + "];"
             )
         );
         verifyNoInteractions(httpClient);
diff --git a/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpDownloaderTests.java b/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpDownloaderTests.java
index 06b2605bd6d41..984bd37181fe7 100644
--- a/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpDownloaderTests.java
+++ b/modules/ingest-geoip/src/test/java/org/elasticsearch/ingest/geoip/GeoIpDownloaderTests.java
@@ -28,6 +28,7 @@
 import org.elasticsearch.cluster.block.ClusterBlocks;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.reindex.BulkByScrollResponse;
@@ -590,7 +591,9 @@ public void testUpdateDatabasesWriteBlock() {
                 "index ["
                     + geoIpIndex
                     + "] blocked by: [TOO_MANY_REQUESTS/12/disk usage exceeded flood-stage watermark, "
-                    + "index has read-only-allow-delete block];"
+                    + "index has read-only-allow-delete block; for more information, see "
+                    + ReferenceDocs.FLOOD_STAGE_WATERMARK
+                    + "];"
             )
         );
         verifyNoInteractions(httpClient);
diff --git a/server/src/internalClusterTest/java/org/elasticsearch/action/admin/indices/delete/DeleteIndexBlocksIT.java b/server/src/internalClusterTest/java/org/elasticsearch/action/admin/indices/delete/DeleteIndexBlocksIT.java
index 3560b74189d1d..415cfff459a67 100644
--- a/server/src/internalClusterTest/java/org/elasticsearch/action/admin/indices/delete/DeleteIndexBlocksIT.java
+++ b/server/src/internalClusterTest/java/org/elasticsearch/action/admin/indices/delete/DeleteIndexBlocksIT.java
@@ -12,6 +12,7 @@
 import org.elasticsearch.cluster.block.ClusterBlockException;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.metadata.Metadata;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.ESIntegTestCase;
 
@@ -68,7 +69,9 @@ public void testClusterBlockMessageHasIndexName() {
             ClusterBlockException e = expectThrows(ClusterBlockException.class, prepareIndex("test").setId("1").setSource("foo", "bar"));
             assertEquals(
                 "index [test] blocked by: [TOO_MANY_REQUESTS/12/disk usage exceeded flood-stage watermark, "
-                    + "index has read-only-allow-delete block];",
+                    + "index has read-only-allow-delete block; for more information, see "
+                    + ReferenceDocs.FLOOD_STAGE_WATERMARK
+                    + "];",
                 e.getMessage()
             );
         } finally {
diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java b/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java
index 32a3af0c341e5..742439c9a2484 100644
--- a/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java
+++ b/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java
@@ -26,6 +26,7 @@
 import org.elasticsearch.cluster.routing.allocation.IndexMetadataUpdater;
 import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider;
 import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.collect.ImmutableOpenMap;
 import org.elasticsearch.common.compress.CompressedXContent;
 import org.elasticsearch.common.io.stream.StreamInput;
@@ -129,7 +130,8 @@ public class IndexMetadata implements Diffable<IndexMetadata>, ToXContentFragmen
     );
     public static final ClusterBlock INDEX_READ_ONLY_ALLOW_DELETE_BLOCK = new ClusterBlock(
         12,
-        "disk usage exceeded flood-stage watermark, index has read-only-allow-delete block",
+        "disk usage exceeded flood-stage watermark, index has read-only-allow-delete block; for more information, see "
+            + ReferenceDocs.FLOOD_STAGE_WATERMARK,
         false,
         false,
         true,
diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
index a87f3b3d4bda0..c11d369e3cc76 100644
--- a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
+++ b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
@@ -79,6 +79,7 @@ public enum ReferenceDocs {
     S3_COMPATIBLE_REPOSITORIES,
     LUCENE_MAX_DOCS_LIMIT,
     MAX_SHARDS_PER_NODE,
+    FLOOD_STAGE_WATERMARK,
     // this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
     ;
 
diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json
index 0d11629803ced..e325aea9f1089 100644
--- a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json
+++ b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json
@@ -39,5 +39,6 @@
   "SNAPSHOT_REPOSITORY_ANALYSIS": "repo-analysis-api.html",
   "S3_COMPATIBLE_REPOSITORIES": "repository-s3.html#repository-s3-compatible-services",
   "LUCENE_MAX_DOCS_LIMIT": "size-your-shards.html#troubleshooting-max-docs-limit",
-  "MAX_SHARDS_PER_NODE": "size-your-shards.html#troubleshooting-max-shards-open"
+  "MAX_SHARDS_PER_NODE": "size-your-shards.html#troubleshooting-max-shards-open",
+  "FLOOD_STAGE_WATERMARK": "fix-watermark-errors.html"
 }

From a789f01ad48d39ef521ea8ee5b8cd40a4184e48f Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Thu, 25 Jul 2024 17:28:39 -0600
Subject: [PATCH 2/5] Update docs/changelog/111315.yaml

---
 docs/changelog/111315.yaml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 docs/changelog/111315.yaml

diff --git a/docs/changelog/111315.yaml b/docs/changelog/111315.yaml
new file mode 100644
index 0000000000000..0e2e56898b51c
--- /dev/null
+++ b/docs/changelog/111315.yaml
@@ -0,0 +1,5 @@
+pr: 111315
+summary: Add link to flood-stage watermark exception message
+area: Allocation
+type: enhancement
+issues: []

From 585b31277f3dd307d8c0ec21190db392d6bf2f8b Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Thu, 25 Jul 2024 17:30:02 -0600
Subject: [PATCH 3/5] typo

---
 .../troubleshooting/common-issues/disk-usage-exceeded.asciidoc  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
index 83e3092d62d7a..e7c3db1c60458 100644
--- a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
@@ -19,7 +19,7 @@ this could induce {kib}'s `Kibana Server is not Ready yet`
 
 {es} will automatically remove the write block when the affected node's disk
 usage falls below the <<cluster-routing-watermark-high,high disk watermark>>. 
-To achieve this, {es} attempts to rebalances some of the affected node's shards 
+To achieve this, {es} attempts to rebalance some of the affected node's shards 
 to other nodes in the same data tier. 
 
 [[fix-watermark-errors-rebalance]]

From e215ff90ea6e1de1675f3bc310ee2a98695d6295 Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Fri, 26 Jul 2024 13:58:31 -0600
Subject: [PATCH 4/5] question

---
 .../common-issues/disk-usage-exceeded.asciidoc               | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
index e7c3db1c60458..369528987ed86 100644
--- a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
@@ -80,11 +80,6 @@ PUT */_settings?expand_wildcards=all
 {
   "index.blocks.read_only_allow_delete": null
 }
-
-PUT .kibana*/_settings?expand_wildcards=all
-{
-  "index.blocks.write": null
-}
 ----
 // TEST[s/^/PUT my-index\n/]
 

From 061109a5efb30a33c2d841cf8d23e151746f58b5 Mon Sep 17 00:00:00 2001
From: Stef Nestor <26751266+stefnestor@users.noreply.github.com>
Date: Wed, 31 Jul 2024 17:12:20 -0600
Subject: [PATCH 5/5] feedback

Co-authored-by: David Turner <david.turner@elastic.co>
---
 .../common-issues/disk-usage-exceeded.asciidoc               | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
index 369528987ed86..728d805db7a30 100644
--- a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
+++ b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc
@@ -8,8 +8,7 @@
 
 When a data node is critically low on disk space and has reached the
 <<cluster-routing-flood-stage,flood-stage disk usage watermark>>, the following
-error is logged: `Error: disk usage exceeded flood-stage watermark, index has 
-read-only-allow-delete block`. 
+error is logged: `Error: disk usage exceeded flood-stage watermark, index has read-only-allow-delete block`. 
 
 To prevent a full disk, when a node reaches this watermark, {es} <<index-block-settings,blocks writes>>
 to any index with a shard on the node. If the block affects related system
@@ -20,7 +19,7 @@ this could induce {kib}'s `Kibana Server is not Ready yet`
 {es} will automatically remove the write block when the affected node's disk
 usage falls below the <<cluster-routing-watermark-high,high disk watermark>>. 
 To achieve this, {es} attempts to rebalance some of the affected node's shards 
-to other nodes in the same data tier. 
+to other nodes in the same data tier.
 
 [[fix-watermark-errors-rebalance]]
 ==== Monitor rebalancing