Skip to content

Commit

Permalink
Feature/md5 usedforsecurity (#17866)
Browse files Browse the repository at this point in the history
## Summary & Motivation
For FIPS enabled systems the MD5 function is disabled in `openssl`.
Since Dagster is using `hashlib.md5` in various locations (`dagster`,
`dagster-dbt`, and `dagster-k8s`), on a FIPS enabled environment the UI
will deliver the following error when trying to load the code location:

```
ValueError: [digital envelope routines: EVP_DigestInit_ex] disabled for FIPS

  File "/opt/dagster/app/venv/lib/python3.11/site-packages/dagster/_grpc/server.py", line 609, in _get_serialized_external_repository_data
    external_repository_data_from_def(
  File "/opt/dagster/app/venv/lib/python3.11/site-packages/dagster/_core/host_representation/external_data.py", line 1341, in external_repository_data_from_def
    asset_graph = external_asset_graph_from_defs(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/dagster/app/venv/lib/python3.11/site-packages/dagster/_core/host_representation/external_data.py", line 1531, in external_asset_graph_from_defs
    atomic_execution_unit_id = assets_def.unique_id
                               ^^^^^^^^^^^^^^^^^^^^
  File "/opt/dagster/app/venv/lib/python3.11/site-packages/dagster/_core/definitions/assets.py", line 1254, in unique_id
    return hashlib.md5((json.dumps(sorted(self.keys))).encode("utf-8")).hexdigest()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
```

A web search [indicates](s3tools/s3cmd#1005)
that flagging such `hashlib.md5` uses with the `usedforsecurity=False`
parameter will resolve this error. As far as I can ascertain, each of
the modified usages are indeed NOT used for the security of the md5
algorithm but instead used to determine the uniqueness of the item(s)
being hashed. If this is not the case, my PR will need to be corrected.

## How I Tested These Changes
I have deployed these changes on my own companies FIPS-enabled,
k8s-based systems and seen the error resolved.
  • Loading branch information
jlloyd-widen authored Dec 5, 2023
1 parent 9b0f031 commit 55f7522
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 9 deletions.
4 changes: 2 additions & 2 deletions python_modules/dagster/dagster/_core/definitions/assets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import hashlib
import json
import warnings
from typing import (
Expand Down Expand Up @@ -45,6 +44,7 @@
)
from dagster._utils import IHasInternalInit
from dagster._utils.merger import merge_dicts
from dagster._utils.security import non_secure_md5_hash_str
from dagster._utils.warnings import (
disable_dagster_warnings,
)
Expand Down Expand Up @@ -1348,7 +1348,7 @@ def __str__(self):
@property
def unique_id(self) -> str:
"""A unique identifier for the AssetsDefinition that's stable across processes."""
return hashlib.md5((json.dumps(sorted(self.keys))).encode("utf-8")).hexdigest()
return non_secure_md5_hash_str((json.dumps(sorted(self.keys))).encode("utf-8"))

def with_resources(self, resource_defs: Mapping[str, ResourceDefinition]) -> "AssetsDefinition":
attributes_dict = self.get_attributes_dict()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import hashlib
import os
import shutil
import sys
Expand All @@ -22,6 +21,7 @@
from dagster._serdes import ConfigurableClass, ConfigurableClassData
from dagster._seven import json
from dagster._utils import ensure_dir, ensure_file, touch_file
from dagster._utils.security import non_secure_md5_hash_str

from .captured_log_manager import (
CapturedLogContext,
Expand Down Expand Up @@ -225,7 +225,7 @@ def get_captured_local_path(self, log_key: Sequence[str], extension: str, partia
if partial:
filename = f"{filename}.partial"
if len(filename) > MAX_FILENAME_LENGTH:
filename = "{}.{}".format(hashlib.md5(filebase.encode("utf-8")).hexdigest(), extension)
filename = "{}.{}".format(non_secure_md5_hash_str(filebase.encode("utf-8")), extension)
location = os.path.join(self._base_dir, *namespace, filename)
location = os.path.abspath(location)
if not location.startswith(self._base_dir):
Expand Down
12 changes: 12 additions & 0 deletions python_modules/dagster/dagster/_utils/security.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import hashlib
import sys
from typing import Union


def non_secure_md5_hash_str(s: Union[bytes, bytearray, memoryview]) -> str:
"""Drop in replacement md5 hash function marking it for a non-security purpose."""
# check python version, use usedforsecurity flag if possible.
if sys.version_info[0] <= 3 and sys.version_info[1] <= 8:
return hashlib.md5(s).hexdigest()
else:
return hashlib.md5(s, usedforsecurity=False).hexdigest() # type: ignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import hashlib
import json
import os
from pathlib import Path
Expand Down Expand Up @@ -44,6 +43,7 @@
from dagster._core.definitions.metadata import MetadataUserInput, RawMetadataValue
from dagster._core.errors import DagsterInvalidSubsetError
from dagster._utils.merger import deep_merge_dicts
from dagster._utils.security import non_secure_md5_hash_str
from dagster._utils.warnings import (
deprecation_warning,
normalize_renamed_param,
Expand Down Expand Up @@ -430,7 +430,7 @@ def _dbt_nodes_to_assets(
if not op_name:
op_name = f"run_dbt_{project_id}"
if select != "fqn:*" or exclude:
op_name += "_" + hashlib.md5(select.encode() + exclude.encode()).hexdigest()[-5:]
op_name += "_" + non_secure_md5_hash_str(select.encode() + exclude.encode())[-5:]

check_outs_by_output_name: Mapping[str, Out] = {}
if check_specs_by_output_name:
Expand Down
6 changes: 3 additions & 3 deletions python_modules/libraries/dagster-k8s/dagster_k8s/job.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import copy
import hashlib
import json
import random
import string
Expand All @@ -21,6 +20,7 @@
from dagster._core.utils import parse_env_var
from dagster._serdes import whitelist_for_serdes
from dagster._utils.merger import merge_dicts
from dagster._utils.security import non_secure_md5_hash_str

from .models import k8s_model_from_dict, k8s_snake_case_dict
from .utils import get_common_labels, sanitize_k8s_label
Expand Down Expand Up @@ -892,6 +892,6 @@ def get_k8s_job_name(input_1, input_2=None):
input_2 = "".join(random.choice(letters) for i in range(20))

# Creates 32-bit signed int, so could be negative
name_hash = hashlib.md5((input_1 + input_2).encode("utf-8"))
name_hash = non_secure_md5_hash_str((input_1 + input_2).encode("utf-8"))

return name_hash.hexdigest()
return name_hash

0 comments on commit 55f7522

Please sign in to comment.