Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,13 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
// Ex: If volume has 1000GB and minFreeSpace is configured as 10GB,
// In this case when availableSpace is 10GB or below, volume is assumed as full
public static final String HDDS_DATANODE_VOLUME_MIN_FREE_SPACE = "hdds.datanode.volume.min.free.space";
public static final String HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_DEFAULT = "5GB";
public static final String HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_DEFAULT = "20GB";
// Minimum percent of space should be left on volume.
// Ex: If volume has 1000GB and minFreeSpacePercent is configured as 2%,
// In this case when availableSpace is 20GB(2% of 1000) or below, volume is assumed as full
public static final String HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT =
"hdds.datanode.volume.min.free.space.percent";
static final byte MIN_FREE_SPACE_UNSET = -1;
public static final float HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT_DEFAULT = 0.001f;

public static final String WAIT_ON_ALL_FOLLOWERS = "hdds.datanode.wait.on.all.followers";
public static final String CONTAINER_SCHEMA_V3_ENABLED = "hdds.datanode.container.schema.v3.enabled";
Expand Down Expand Up @@ -280,10 +280,9 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
" When the difference between volume capacity and used reaches this number," +
" containers that reside on this volume will be closed and no new containers" +
" would be allocated on this volume." +
" Either of min.free.space or min.free.space.percent should be configured, when both are set then" +
" min.free.space will be used."
" Max of min.free.space and min.free.space.percent will be used as final value."
)
private long minFreeSpace = MIN_FREE_SPACE_UNSET;
private long minFreeSpace = getDefaultFreeSpace();

@Config(key = "hdds.datanode.volume.min.free.space.percent",
defaultValue = "-1",
Expand All @@ -293,10 +292,9 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
" When the difference between volume capacity and used reaches (free.space.percent of volume capacity)," +
" containers that reside on this volume will be closed and no new containers" +
" would be allocated on this volume." +
" Either of min.free.space or min.free.space.percent should be configured, when both are set then" +
" min.free.space will be used."
" Max of min.free.space or min.free.space.percent will be used as final value."
)
private float minFreeSpaceRatio = MIN_FREE_SPACE_UNSET;
private float minFreeSpaceRatio = HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT_DEFAULT;

@Config(key = "periodic.disk.check.interval.minutes",
defaultValue = "60",
Expand Down Expand Up @@ -683,39 +681,18 @@ public void validate() {
}

/**
* If 'hdds.datanode.volume.min.free.space' is defined,
* it will be honored first. If it is not defined and
* 'hdds.datanode.volume.min.free.space.percent' is defined, it will honor this
* else it will fall back to 'hdds.datanode.volume.min.free.space.default'
* validate value of 'hdds.datanode.volume.min.free.space' and 'hdds.datanode.volume.min.free.space.percent'
* and update with default value if not within range.
*/
private void validateMinFreeSpace() {
if (minFreeSpaceRatio > 1) {
if (minFreeSpaceRatio > 1 || minFreeSpaceRatio < 0) {
LOG.warn("{} = {} is invalid, should be between 0 and 1",
HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT,
minFreeSpaceRatio);

minFreeSpaceRatio = MIN_FREE_SPACE_UNSET;
}

final boolean minFreeSpaceConfigured = minFreeSpace >= 0;
final boolean minFreeSpaceRatioConfigured = minFreeSpaceRatio >= 0;

if (minFreeSpaceConfigured && minFreeSpaceRatioConfigured) {
// Only one property should be configured.
// Since both properties are configured, HDDS_DATANODE_VOLUME_MIN_FREE_SPACE is used to determine minFreeSpace
LOG.warn("Only one of {}={} and {}={} should be set. With both set, {} value will be used.",
HDDS_DATANODE_VOLUME_MIN_FREE_SPACE,
minFreeSpace,
HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT,
minFreeSpaceRatio,
HDDS_DATANODE_VOLUME_MIN_FREE_SPACE);

minFreeSpaceRatio = MIN_FREE_SPACE_UNSET;
minFreeSpaceRatio = HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT_DEFAULT;
}

if (!minFreeSpaceConfigured && !minFreeSpaceRatioConfigured) {
// If both are not configured use defaultFreeSpace
minFreeSpaceRatio = MIN_FREE_SPACE_UNSET;
if (minFreeSpace < 0) {
minFreeSpace = getDefaultFreeSpace();
}
}
Expand Down Expand Up @@ -781,9 +758,7 @@ public void setContainerCloseThreads(int containerCloseThreads) {
}

public long getMinFreeSpace(long capacity) {
return minFreeSpaceRatio >= 0
? ((long) (capacity * minFreeSpaceRatio))
: minFreeSpace;
return Math.max((long) (capacity * minFreeSpaceRatio), minFreeSpace);
}

public long getMinFreeSpace() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import static org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration.FAILED_DB_VOLUMES_TOLERATED_KEY;
import static org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration.FAILED_METADATA_VOLUMES_TOLERATED_KEY;
import static org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration.FAILED_VOLUMES_TOLERATED_DEFAULT;
import static org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT_DEFAULT;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a new unit test which doesn't explicitly set any of the two properties.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is considered in org.apache.hadoop.ozone.container.common.statemachine.TestDatanodeConfiguration#isCreatedWitDefaultValues

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isCreatedWitDefaultValues unsets DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unset ensure default value is used in ozone configuration, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unset is done for ozone-site.xml as defined in test module, so that it can use default value if not defined. comment added.

import static org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration.PERIODIC_DISK_CHECK_INTERVAL_MINUTES_DEFAULT;
import static org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration.PERIODIC_DISK_CHECK_INTERVAL_MINUTES_KEY;
import static org.junit.jupiter.api.Assertions.assertEquals;
Expand Down Expand Up @@ -153,6 +154,7 @@ public void overridesInvalidValues() {
public void isCreatedWitDefaultValues() {
// GIVEN
OzoneConfiguration conf = new OzoneConfiguration();
// unset the overriding configuration from ozone-site.xml defined for the test module
conf.unset(DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE); // set in ozone-site.xml

// WHEN
Expand All @@ -176,7 +178,13 @@ public void isCreatedWitDefaultValues() {
assertEquals(BLOCK_DELETE_COMMAND_WORKER_INTERVAL_DEFAULT,
subject.getBlockDeleteCommandWorkerInterval());
assertEquals(DatanodeConfiguration.getDefaultFreeSpace(), subject.getMinFreeSpace());
assertEquals(DatanodeConfiguration.MIN_FREE_SPACE_UNSET, subject.getMinFreeSpaceRatio());
assertEquals(HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT_DEFAULT, subject.getMinFreeSpaceRatio());
final long oneGB = 1024 * 1024 * 1024;
// capacity is less, consider default min_free_space
assertEquals(DatanodeConfiguration.getDefaultFreeSpace(), subject.getMinFreeSpace(oneGB));
// capacity is large, consider min_free_space_percent, max(min_free_space, min_free_space_percent * capacity)
assertEquals(HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT_DEFAULT * oneGB * oneGB,
subject.getMinFreeSpace(oneGB * oneGB));
}

@Test
Expand All @@ -186,11 +194,11 @@ void rejectsInvalidMinFreeSpaceRatio() {

DatanodeConfiguration subject = conf.getObject(DatanodeConfiguration.class);

assertEquals(DatanodeConfiguration.MIN_FREE_SPACE_UNSET, subject.getMinFreeSpaceRatio());
assertEquals(HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT_DEFAULT, subject.getMinFreeSpaceRatio());
}

@Test
void useMinFreeSpaceIfBothMinFreeSpacePropertiesSet() {
void useMaxIfBothMinFreeSpacePropertiesSet() {
OzoneConfiguration conf = new OzoneConfiguration();
int minFreeSpace = 10000;
conf.setLong(DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE, minFreeSpace);
Expand All @@ -199,10 +207,11 @@ void useMinFreeSpaceIfBothMinFreeSpacePropertiesSet() {
DatanodeConfiguration subject = conf.getObject(DatanodeConfiguration.class);

assertEquals(minFreeSpace, subject.getMinFreeSpace());
assertEquals(DatanodeConfiguration.MIN_FREE_SPACE_UNSET, subject.getMinFreeSpaceRatio());
assertEquals(.5f, subject.getMinFreeSpaceRatio());

for (long capacity : CAPACITIES) {
assertEquals(minFreeSpace, subject.getMinFreeSpace(capacity));
// disk percent is higher than minFreeSpace configured 10000 bytes
assertEquals((long)(capacity * 0.5f), subject.getMinFreeSpace(capacity));
}
}

Expand All @@ -211,11 +220,12 @@ void useMinFreeSpaceIfBothMinFreeSpacePropertiesSet() {
void usesFixedMinFreeSpace(long bytes) {
OzoneConfiguration conf = new OzoneConfiguration();
conf.setLong(DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE, bytes);
// keeping %cent low so that min free space is picked up
conf.setFloat(DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT, 0.00001f);

DatanodeConfiguration subject = conf.getObject(DatanodeConfiguration.class);

assertEquals(bytes, subject.getMinFreeSpace());
assertEquals(DatanodeConfiguration.MIN_FREE_SPACE_UNSET, subject.getMinFreeSpaceRatio());

for (long capacity : CAPACITIES) {
assertEquals(bytes, subject.getMinFreeSpace(capacity));
Expand All @@ -226,7 +236,8 @@ void usesFixedMinFreeSpace(long bytes) {
@ValueSource(ints = {1, 10, 100})
void calculatesMinFreeSpaceRatio(int percent) {
OzoneConfiguration conf = new OzoneConfiguration();
conf.unset(DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE); // set in ozone-site.xml
// keeping min free space low so that %cent is picked up after calculation
conf.set(DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE, "1000"); // set in ozone-site.xml
conf.setFloat(DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT, percent / 100.0f);

DatanodeConfiguration subject = conf.getObject(DatanodeConfiguration.class);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

import static org.apache.hadoop.hdds.scm.ScmConfigKeys.HDDS_DATANODE_DIR_DU_RESERVED_PERCENT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.HDDS_DATANODE_DIR_DU_RESERVED_PERCENT_DEFAULT;
import static org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE;
import static org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
Expand Down Expand Up @@ -202,12 +201,12 @@ public void testMinFreeSpaceCalculator() throws Exception {
assertEquals(minSpace, conf.getObject(DatanodeConfiguration.class).getMinFreeSpace(capacity));

conf.setFloat(HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT, 0.01f);
// When both are set, minSpace will be used
// When both are set, max(minSpace, %cent), minSpace will be used
assertEquals(minSpace, conf.getObject(DatanodeConfiguration.class).getMinFreeSpace(capacity));

// capacity * 1% = 10
conf.unset(HDDS_DATANODE_VOLUME_MIN_FREE_SPACE);
assertEquals(10, conf.getObject(DatanodeConfiguration.class).getMinFreeSpace(capacity));
conf.setFloat(HDDS_DATANODE_VOLUME_MIN_FREE_SPACE_PERCENT, 1f);
// When both are set, max(minSpace, %cent), hence %cent will be used
assertEquals(1000, conf.getObject(DatanodeConfiguration.class).getMinFreeSpace(capacity));
}

private long getExpectedDefaultReserved(HddsVolume volume) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,9 @@ public void testReplicationImportReserveSpace(ContainerLayoutVersion layout)
// Initially volume has 0 used space
assertEquals(0, usedSpace);
// Increase committed bytes so that volume has only remaining 3 times container size space
long initialCommittedBytes = vol1.getCurrentUsage().getCapacity() - containerMaxSize * 3;
long minFreeSpace =
conf.getObject(DatanodeConfiguration.class).getMinFreeSpace(vol1.getCurrentUsage().getCapacity());
long initialCommittedBytes = vol1.getCurrentUsage().getCapacity() - containerMaxSize * 3 - minFreeSpace;
vol1.incCommittedBytes(initialCommittedBytes);
ContainerReplicator replicator =
new DownloadAndImportReplicator(conf, set, importer, moc);
Expand Down
108 changes: 108 additions & 0 deletions hadoop-hdds/docs/content/design/dn-min-space-configuration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
---
title: Minimum free space configuration for datanode volumes
summary: Describe proposal for minimum free space configuration which volume must have to function correctly.
date: 2025-05-05
jira: HDDS-12928
status: implemented
author: Sumit Agrawal
---
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->

# Abstract
A volume in the datanode stores the container data and metadata (RocksDB co-located on the volume).
Various operations run in parallel, such as importing and exporting containers, writing and deleting data blocks,
repairing containers, and creating and deleting containers. Space is also required for the volume DB to perform compaction at regular intervals.
It is hard to capture the exact usage and the available free space. Therefore, a minimum free space must be configured
so that datanode operations can proceed without corruption or the environment getting stuck, and so that reads of data remain supported.

This free space is used to allow volume allocation only if `required space < (volume available space - free space - reserved space - committed space)`.
Any container creation or container import needs to ensure that this constraint is met, and block byte writes need to ensure that the `free space` is available.
Note: Any issue related to ensuring free space is tracked in a separate JIRA.

# Existing configuration (before HDDS-12928)
Two configurations are provided,
- hdds.datanode.volume.min.free.space (default: 5GB)
- hdds.datanode.volume.min.free.space.percent

1. If nothing is configured, the default value of 5GB is used.
2. If both are configured, hdds.datanode.volume.min.free.space takes priority.
3. Otherwise, the respective configured value is used.

# Problem Statement

- With the 5GB default configuration, the full-disk scenario is not avoided, due to errors in ensuring free space availability.
This is because the size of a container being imported is 5GB, which is near the boundary, and because of other parallel operations.
- Volume DB size can increase with disk space, as a larger disk can hold more containers and blocks, and hence more metadata.
- Volume DB size can also vary with the mix of small and big files, as more small files can lead to more metadata.

The solution involves
- an appropriate default min free space
- a value that depends on disk size variation

# Approach 1 Combination of minimum free space and percent increase on disk size

Configuration:
1. Minimum free space: hdds.datanode.volume.min.free.space: default value `20GB`
2. disk size variation: hdds.datanode.volume.min.free.space.percent: default 0.1% or 0.001 ratio

Minimum free space = Max (`<Min free space>`, `<percent disk space>`)

| Disk space | Min Free Space (percent: 1%) | Min Free Space ( percent: 0.1%) |
| -- |------------------------------|---------------------------------|
| 100 GB | 20 GB | 20 GB (min space default) |
| 1 TB | 20 GB | 20 GB (min space default) |
| 10 TB | 100 GB | 20 GB (min space default) |
| 100 TB | 1 TB | 100 GB |

Considering the above table with this solution,
- 0.1% should be sufficient to cover almost all cases, as no DN volume DB has been observed to be more than 1-2 GB

# Approach 2 Only minimum free space configuration

Considering the above approach, 20 GB as the default should be sufficient for most disks, as the disk size usually seen is 10-15TB.
Larger disks are rarely used; instead, multiple volumes backed by multiple disks are attached to the same DN.

Considering this scenario, Minimum free space: `hdds.datanode.volume.min.free.space` itself is enough and
percent based configuration can be removed.

### Compatibility
If `hdds.datanode.volume.min.free.space.percent` is configured, this should not have any impact,
as the default value is increased to 20GB, which covers most use cases.

# Approach 3 Combination of maximum free space and percent configuration on disk size

Configuration:
1. Maximum free space: hdds.datanode.volume.min.free.space: default value `20GB`
2. disk size variation: hdds.datanode.volume.min.free.space.percent: default 10% or 0.1 ratio

Minimum free space = **Min** (`<Max free space>`, `<percent disk space>`)
> The difference from Approach 1 is that the Min function is applied over the two configurations above

| Disk space | Min Free Space (20GB, 10% of disk) |
| -- |------------------------------------|
| 10 GB | 1 GB (=Min(20GB, 1GB)) |
| 100 GB | 10 GB (=Min(20GB, 10GB)) |
| 1 TB | 20 GB (=Min(20GB, 100GB)) |
| 10 TB | 20 GB (=Min(20GB, 1TB)) |
| 100 TB | 20 GB (=Min(20GB, 10TB)) |

This case is more useful for test environments where disk space is limited, as no additional configuration is needed.

# Conclusion
1. Going with Approach 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think supporting both explicit size and percent is good, but there's a few issues still not addressed:

  • Do we only want to support setting one global size for all volumes or supporting individual volume configs?
  • If we are adjusting how hdds.datanode.volume.min.free.space works, we should also adjust hdds.datanode.dir.du.reserved to support configuration in a consistent way.
  • It is bad UX to have two different configs (percent and value) for the same thing. The user has no intuition as to what happens when both are configured.
    • Having a max function buried in the code to resolve this instead of making them exclusive is even worse.

Probably the most user friendly thing to do is deprecate the percent config keys and have one config that takes either a size or percent based value. Whether we want to continue supporting individual volume mappings in the config is still an open question that needs to be resolved in this proposal.

Copy link
Contributor Author

@sumitagrawl sumitagrawl May 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@errose28
The config is applicable for each volume as global config only, do not support individual volume. As min-free-space as ozone maintain is global in nature for each volume.
hdds.datanode.dir.du.reserved config simplification is not in scope of this JIRA/PR.

Using 2 config has been discussed in community meeting, and concluded to have both. Any concern now, need re-discuss over community again.

Single config: Approach "2" is not being opted with majority, and hence went with Approach 1 as max of 2. I have updated in design doc for both Approach 1 and approach 2 pros/cons.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using 2 config has been discussed in community meeting, and concluded to have both. Any concern now, need re-discuss over community again.

Community meetings are for synchronous discussion, not definitive decisions. There are many other forums (mailing list, PRs, Jira, Github discussion). I think this kind of issue is fine for discussion in PR. If you are concerned about visibility, please discuss on mailing list.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@errose28 after discussion over community, will go with Approach 1 only.

  1. purpose of du.reserved config is identifying the disk to be reserved for the application sharing the disk, and hence its at disk level. But here, since its ozone managed space, this needs to be flat configuration. So both need not be same.

  2. For simplicity for min.free.space config, its at global level, and may not be required to be disk level similar to reserved.

  3. Max of min.free space and percent is done, min.free space represent min threshold for most of the disk ranges, and percent to be if some disk are exceptionally higher size.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me try to add some guiding principles for config modifications which can help us compare one decision or another. The following are usability issues that can occur with config keys:

  1. Inconsistent config format: Configs that operate on similar entities (space usage, address + port, percentages) that read those values differently.
  2. Hidden config dependencies: When one configuration whose value is unchanged functions differently based on the value applied to a different config.
    • This does not include invalid config combinations that fail component startup, since that is easily caught and called out with an error message. We know that no actively running system will have this configuration.

Both hdds.datanode.du.reserved{.percent} and hdds.datanode.min.free.space{.percent} have issues here, and this is our chance to fix them. Now let's look at how our options either help or hurt the above points.

Inconsistent config format

hdds.datanode.du.reserved and hdds.datanode.min.free.space are both used to configure space reservation on datanode drives, so as stated in point 1 it is most intuitive if they accept the same value format. It is ok if one format is more useful for one than another. For example per-volume configuration may be required for hdds.datanode.du.reserved but not for hdds.datanode.min.free.space. It's still ok for both to have that option because it is not invalid for hdds.datanode.min.free.space, there is still only one set of formatting options for users to remember, and only one parser in the code. If we pick and choose different valid formats for each config we will have two formats to remember and two parsers in the code. Therefore even removing allowed config formats from hdds.datanode.min.free.space that are still present in hdds.datanode.du.reserved actually adds complexity. Based on this hdds.datanode.du.reserved and hdds.datanode.min.free.space must accept values of the same format to avoid introducing new config usability problems.

Hidden config dependencies

Next let's look at how the percent variations affect point 2. Anything other than failing startup if the percent and non-percent variations are specified creates this problem, so if a percent and non-percent config key are given like hdds.datanode.min.free.space.percent and hdds.datanode.min.free.space it must be considered invalid and fail the datanode.

There is another option though: get rid of the percentage specific config keys but still support percentage based configuration with the one hdds.datanode.min.free.space config. Let's look at why this works:

  • hdds.datanode.du.reserved needs to support volume specific configuration in the form of <volume-path>:reserved-size since not all volumes may be used as spill for compute, or the volumes may be utilized differently.
    • This means we will always have a parsing method like VolumeUsage#getReserved to handle converting config strings into long values for a volume.
  • hdds.datanode.min.free.space and hdds.datanode.du.reserved should support the same value format, so hdds.datanode.min.free.space also needs to use this same parser.
  • If we are already need a string parser for both configs, we might as well make it differentiate between percentage and size based configs too.

Proposal to address all requirements

The following layout meets all the constraints defined above:

  • Only two config keys: hdds.datanode.min.free.space and hdds.datanode.du.reserved
  • The valid formats for either config key are:
    • A fixed size, like 20GB
    • A percentage as a float, like 0.001. The lack of a unit differentiates it from the first option.
    • A mapping of volumes to sizes, like /data/hdds1:20GB,/data/hdds2:10GB
  • Only one parser is required for both types of configs.
    • This is not new since a parser is already required and cannot be removed without removing support for per-volume configuration in hdds.datanode.du.reserved.

We should never introduce usability issues in our configurations. We have enough of them already : ) If you can show how an alternate proposal meets all the configuration requirements without impacting usability we can consider that as well, but currently none of the proposals in the doc satisfy this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@errose28 You mean we need have another config for min.free.space?
min.free.space.volumes:/data/hdds1:20GB,/data/hdds2:10GB. -- to be similar to reserve? as similar to du.reserve

I do not feel being in name of similar config for space, we should go with this approach, These are if different purpose. Making similar just in name of both represent free space will make configuration complex for min.free.space as user need config for all disk. There is no usecase till not for min.free.space for this.

I do not agree with this approach. In future if there is a need for this for volume mapping for min.free.space, we can ad as separate requirement and handle.

Share your suggestion for this PR if can be merged .....

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding one more to @errose28's list of requirements: cross-compatibility. When extending the possible values allowed for existing configuration, e.g.:

  • adding suffix
  • starting to support percentage
  • allowing list of items instead of single one

we need to consider that even old version may encounter values understood only by new one, and fail. (See HDDS-13077 for a specific example.)

In such cases it may be better to deprecate the existing config properties and add new one(s).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sumitagrawl please re-read the Proposal to address all requirements section in my reply. I think this very clearly states the proposal but the things you are referring to in your reply are not mentioned there.

You mean we need have another config for min.free.space?

No, two configs, one for min free space and one for DU reserved that each use the same value schema. I very clearly said in the previous response "Only two config keys: hdds.datanode.min.free.space and hdds.datanode.du.reserved".

I do not feel being in name of similar config for space, we should go with this approach, These are if different purpose.

This is your take as developer. You need to look at this from a user's perspective. Our consistent failure to consider this perspective is why the system is difficult to use. Configs representing the same "type" of configuration, be it an address, percentage, disk space, time duration, etc must accept the same types of values. Users are not going to understand the nuance of why two similar configs accept different value formats, and in a few months I probably won't either.

Making similar just in name of both represent free space will make configuration complex for min.free.space as user need config for all disk.

This is not part of the proposal. Please re-read it. Min space can be configured with one value across all disks, OR it can use a volume mapping.

There is no usecase till not for min.free.space for this.

Lack of use case is not a valid reason to create a separate value schema for configs that work on the same type. There is also no use case for setting hdds.heartbeat.interval to 7d, but the same value makes perfect sense for hdds.container.scrub.data.scan.interval. Yet they use the same value schema because they both represent time intervals. Your suggestion is analogous to rejecting the d suffix for hdds.heartbeat.interval because it would never be set that long.

@adoroszlai

we need to consider that even old version may encounter values understood only by new one, and fail.

We definitely need to formalize our configuration compatibility guarantees. This probably warrants a dedicated discussion somewhere more visible. My initial take is that we should always support "new software old config", but that supporting "old software new config" is not sustainable because it closes our config for extensions. Especially on the server side this would seem like a deployment error. Maybe our client side config compat guarantees would be different from the server.

Copy link
Contributor Author

@sumitagrawl sumitagrawl May 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@errose28

No, two configs, one for min free space and one for DU reserved that each use the same value schema

DU reserved is special case carried from Hadoop, for case of disk sharing by other application. This may not be required to have same value Schema. This needs user input over various disk as sharing may differ, so this schema is specialized. They are not of same type.

This is your take as developer. You need to look at this from a user's perspective. Our consistent failure to consider this perspective is why the system is difficult to use.

From user perspective only, user have no knowledge how to configure the min-free-space, this is more internal to Ozone working.

volume mapping

This might be additional config can be added later on on need basis. May be we should not add just based on
intuition, as this may go to be dead config.
Please share any possible use case in practical env, we can take up this as enhancement.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Per disk configuration is an abomination that stems from needing to run other applications on nodes/drives along with HDFS in the past. It makes sense for the du config where essentially we tell the Datanode to spare a few drives. This is very different from the min configurations, which has to do with operations and uptime of applications. We must keep configurations for min same across all drives as it has to do with space for repairs and recovery and nothing to do with configuration of the cluster with regards to co-existing with peer applications.

I am all for consistency but in this case it implies a capability that I am not sure we wish to implement.

- Approach 2 is simple, setting only min-free-space, but it does not scale with larger disk sizes.
- Approach 3 is more applicable to test environments where disk space is limited; otherwise it is the same as Approach 2.
- So Approach 1 is selected, considering its advantage that a higher free space can be configured by default.
2. The default minimum free space will be 20GB


Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data:
OZONE-SITE.XML_hdds.datanode.dir: "/data/storage"
OZONE-SITE.XML_hdds.scm.safemode.min.datanode: "3"
OZONE-SITE.XML_ozone.datanode.pipeline.limit: "1"
OZONE-SITE.XML_hdds.datanode.volume.min.free.space: "1GB"
OZONE-SITE.XML_ozone.metadata.dirs: "/data/metadata"
OZONE-SITE.XML_ozone.om.address: "om-0.om"
OZONE-SITE.XML_ozone.recon.address: "recon-0.recon"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data:
OZONE-SITE.XML_hdds.datanode.dir: /data/storage
OZONE-SITE.XML_hdds.scm.safemode.min.datanode: "3"
OZONE-SITE.XML_ozone.datanode.pipeline.limit: "1"
OZONE-SITE.XML_hdds.datanode.volume.min.free.space: "1GB"
OZONE-SITE.XML_ozone.metadata.dirs: /data/metadata
OZONE-SITE.XML_ozone.om.address: om-0.om
OZONE-SITE.XML_ozone.recon.address: recon-0.recon
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data:
OZONE-SITE.XML_hdds.datanode.dir: /data/storage
OZONE-SITE.XML_hdds.scm.safemode.min.datanode: "3"
OZONE-SITE.XML_ozone.datanode.pipeline.limit: "1"
OZONE-SITE.XML_hdds.datanode.volume.min.free.space: "1GB"
OZONE-SITE.XML_ozone.metadata.dirs: /data/metadata
OZONE-SITE.XML_ozone.om.address: om-0.om
OZONE-SITE.XML_ozone.recon.address: recon-0.recon
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data:
OZONE-SITE.XML_hdds.datanode.dir: /data/storage
OZONE-SITE.XML_hdds.scm.safemode.min.datanode: "3"
OZONE-SITE.XML_ozone.datanode.pipeline.limit: "1"
OZONE-SITE.XML_hdds.datanode.volume.min.free.space: "1GB"
OZONE-SITE.XML_ozone.metadata.dirs: /data/metadata
OZONE-SITE.XML_ozone.om.address: om-0.om
OZONE-SITE.XML_ozone.recon.address: recon-0.recon
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data:
OZONE-SITE.XML_hdds.datanode.dir: /data/storage
OZONE-SITE.XML_hdds.scm.safemode.min.datanode: "3"
OZONE-SITE.XML_ozone.datanode.pipeline.limit: "1"
OZONE-SITE.XML_hdds.datanode.volume.min.free.space: "1GB"
OZONE-SITE.XML_ozone.metadata.dirs: /data/metadata
OZONE-SITE.XML_ozone.om.address: om-0.om
OZONE-SITE.XML_ozone.recon.address: recon-0.recon
Expand Down
Loading