From bd391404827aafeff11913d2632452d9e353f445 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 21 Aug 2024 23:13:21 -0700 Subject: [PATCH 01/24] Change top-level flow decorator options to be prefixed by METAFLOW_FLOW_ Previously, options like `branch` and `name` (injected by the project decorator for example) could be set using `METAFLOW_BRANCH`. They now need to be set using `METAFLOW_FLOW_BRANCH`. This change is made to prevent clashes between regular metaflow configuration settings and decorator level options. No other changes are made so `METAFLOW_RUN_MAX_WORKERS` still works as expected and `METAFLOW_PYLINT` as well. --- metaflow/decorators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metaflow/decorators.py b/metaflow/decorators.py index 262102551e4..efd27e2ffeb 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -218,6 +218,7 @@ def add_decorator_options(cmd): ) raise MetaflowInternalError(msg) else: + kwargs["envvar"] = "METAFLOW_FLOW_%s" % option.upper() seen[option] = deco.name cmd.params.insert(0, click.Option(("--" + option,), **kwargs)) return cmd From 13a67cca32bbad6e0c19837204f1be722e462b17 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 15 Aug 2024 09:59:09 -0700 Subject: [PATCH 02/24] Initial Config object --- metaflow/metaflow_environment.py | 11 +- metaflow/parameters.py | 2 + metaflow/user_configs.py | 266 +++++++++++++++++++++++++++++++ 3 files changed, 276 insertions(+), 3 deletions(-) create mode 100644 metaflow/user_configs.py diff --git a/metaflow/metaflow_environment.py b/metaflow/metaflow_environment.py index 0ac1ca3266c..cea6e18697b 100644 --- a/metaflow/metaflow_environment.py +++ b/metaflow/metaflow_environment.py @@ -6,6 +6,7 @@ from . import metaflow_version from metaflow.exception import MetaflowException from metaflow.extension_support import dump_module_info +from metaflow.user_configs import dump_config_values from metaflow.mflog import BASH_MFLOG from . 
import R @@ -18,7 +19,7 @@ class MetaflowEnvironment(object): TYPE = "local" def __init__(self, flow): - pass + self._flow = flow def init_environment(self, echo): """ @@ -177,7 +178,7 @@ def get_package_commands(self, code_package_url, datastore_type): ] return cmds - def get_environment_info(self, include_ext_info=False): + def get_environment_info(self, full_info=False): # note that this dict goes into the code package # so variables here should be relatively stable (no # timestamps) so the hash won't change all the time @@ -198,10 +199,14 @@ def get_environment_info(self, include_ext_info=False): env["metaflow_r_version"] = R.metaflow_r_version() env["r_version"] = R.r_version() env["r_version_code"] = R.r_version_code() - if include_ext_info: + if full_info: # Information about extension modules (to load them in the proper order) ext_key, ext_val = dump_module_info() env[ext_key] = ext_val + # Information about configurations (to be able to reload them) + user_configs = dump_config_values(self._flow) + if user_configs: + env[user_configs[0]] = user_configs[1] return env def executable(self, step_name, default=None): diff --git a/metaflow/parameters.py b/metaflow/parameters.py index fe0dabbda3f..30e2e0f623b 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -204,6 +204,8 @@ def __repr__(self): def deploy_time_eval(value): if isinstance(value, DeployTimeField): return value(deploy_time=True) + elif isinstance(value, DelayedEvaluationParameter): + return value(return_str=True) else: return value diff --git a/metaflow/user_configs.py b/metaflow/user_configs.py new file mode 100644 index 00000000000..1570af25b55 --- /dev/null +++ b/metaflow/user_configs.py @@ -0,0 +1,266 @@ +import json +import os + +from typing import Any, Dict, Optional, Union, TYPE_CHECKING + +from metaflow import INFO_FILE +from metaflow._vendor import click + +from .exception import MetaflowException +from .parameters import ( + DelayedEvaluationParameter, + Parameter, + 
current_flow, +) +import functools + +if TYPE_CHECKING: + from metaflow import FlowSpec + +# _tracefunc_depth = 0 + + +# def tracefunc(func): +# """Decorates a function to show its trace.""" + +# @functools.wraps(func) +# def tracefunc_closure(*args, **kwargs): +# global _tracefunc_depth +# """The closure.""" +# print(f"{_tracefunc_depth}: {func.__name__}(args={args}, kwargs={kwargs})") +# _tracefunc_depth += 1 +# result = func(*args, **kwargs) +# _tracefunc_depth -= 1 +# print(f"{_tracefunc_depth} => {result}") +# return result + +# return tracefunc_closure + + +def dump_config_values(flow: FlowSpec): + if hasattr(flow, "_user_configs"): + return "user_configs", flow._user_configs + return None, None + + +def load_config_values() -> Optional[Dict[str, Any]]: + try: + with open(INFO_FILE, encoding="utf-8") as contents: + return json.load(contents).get("user_configs", {}) + except IOError: + return None + + +class ConfigValue(object): + # Thin wrapper to allow configuration values to be accessed using a "." notation + # as well as a [] notation. + + def __init__(self, data: Dict[str, Any]): + self._data = data + + for key, value in data.items(): + if isinstance(value, dict): + value = ConfigValue(value) + elif isinstance(value, list): + value = [ConfigValue(v) for v in value] + setattr(self, key, value) + + def __getitem__(self, key): + value = self._data[key] + if isinstance(value, dict): + value = ConfigValue(value) + elif isinstance(value, list): + value = [ConfigValue(v) for v in value] + return value + + def __repr__(self): + return repr(self._data) + + +class ConfigInput(click.ParamType): + name = "ConfigInput" + + # Contains the values loaded from the INFO file. We make this a class method + # so that if there are multiple configs, we just need to read the file once. 
+ # It is OK to be globally unique because this is only evoked in scenario A.2 (see + # convert method) which means we are already just executing a single task and so + # there is no concern about it "leaking" to things running with Runner for example + # (ie: even if Runner is evoked in that task, we won't "share" this global value's + # usage). + loaded_configs = None # type: Optional[Dict[str, Dict[str, Any]]] + + def __init__(self): + self._flow_cls = getattr(current_flow, "flow_cls", None) + if self._flow_cls is None: + raise MetaflowException("ConfigInput can only be used inside a flow") + if not hasattr(self._flow_cls, "_user_configs"): + self._flow_cls._user_configs = {} + + @staticmethod + def _make_key_name(name: str) -> str: + return "kv." + name.lower() + + @classmethod + def get_config(cls, config_name: str) -> Optional[Dict[str, Any]]: + if cls.loaded_configs is None: + all_configs = load_config_values() + if all_configs is None: + raise MetaflowException( + "Could not load expected configuration values " + "the INFO file. This is a Metaflow bug. Please contact support." + ) + cls.loaded_configs = all_configs + return cls.loaded_configs.get(config_name, None) + + def convert(self, value, param, ctx): + # Click can call convert multiple times, so we need to make sure to only + # convert once. + if isinstance(value, (ConfigValue, DelayedEvaluationParameter)): + return value + + # There are two paths we need to worry about: + # - Scenario A: deploying to a scheduler + # A.1 In this case, when deploying (using `step-functions create` for example), + # the value passed to click (or the default value) will be converted and we + # will: + # - store the configuration in the flow object under _user_configs (so that it + # can later be dumped to the INFO file when packaging) + # - return a DelayedEvaluationParameter object so that when the scheduler + # evaluates it (with return_str set to True), it gets back the *string* + # kv. 
which indicates that this + # configuration should be fetched from INFO + # A.2 When the scheduler runs the flow, the value returned in A.1 (the kv. + # string) will be passed to convert again. This time, we directly return a + # ConfigValue after having fetched/loaded the configuration from INFO. + # + # - Scenario B: running with the native Runtime + # The value passed in will be similarly stored under _user_configs. We also + # return a DelayedEvaluationParameter object but when the _set_constants in + # the runtime calls it, it calls it with return_str set to False and it will + # return a ConfigValue directly which can then be persisted in the artifact + # store. + + # The value we get in to convert can be: + # - a dictionary + # - a path to a YAML or JSON file + # - the string representation of a YAML or JSON file + # In all cases, we also store the configuration in the flow object under _user_configs. + # It will *not* be stored as an artifact but is a good place to store it so we + # can access it when packaging to store it in the INFO file. The config itself + # will be stored as regular artifacts (the ConfigValue object basically) + + def _delay_eval(name: str, value: ConfigValue, return_str=False): + if return_str: + # Scenario A.1 when deploy_time_eval is called by the scheduler + # (or, in some cases, some schedulers directly identify the + # DelayedEvaluationParameter value and call it directory with + # return_str=True) + return name + # Scenario B + return value + + if isinstance(value, dict): + # Scenario A.1 or B. 
+ self._flow_cls._user_configs[self._make_key_name(param.name)] = value + return DelayedEvaluationParameter( + param.name, "value", functools.partial(_delay_eval, param.name, value) + ) + elif not isinstance(value, str): + raise MetaflowException( + "Configuration value for '%s' must be a string or a dictionary" + % param.name + ) + + # Here we are sure we have a string + if value.startswith("kv."): + # This is scenario A.2 + value = self.get_config(value) + if value is None: + raise MetaflowException( + "Could not find configuration '%s' in INFO file" % value + ) + return ConfigValue(value) + + elif os.path.isfile(value): + try: + with open(value, "r") as f: + content = f.read() + except OSError as e: + raise MetaflowException( + "Could not read configuration file '%s'" % value + ) from e + try: + value = json.loads(content) + except json.JSONDecodeError as e: + raise MetaflowException( + "Configuration file '%s' is not valid JSON" % value + ) from e + # TODO: Support YAML + self._flow_cls._user_configs[self._make_key_name(param.name)] = value + else: + try: + value = json.loads(value) + except json.JSONDecodeError as e: + raise MetaflowException( + "Configuration value for '%s' is not valid JSON" % param.name + ) from e + # TODO: Support YAML + self._flow_cls._user_configs[self._make_key_name(param.name)] = value + return DelayedEvaluationParameter( + param.name, "value", functools.partial(_delay_eval, param.name, value) + ) + + def __str__(self): + return repr(self) + + def __repr__(self): + return "ConfigInput" + + +ConfigArgType = Union[str, Dict[str, Any]] + + +class Config(Parameter): + """ + Includes a configuration for this flow. 
+ + `Config` is a special type of `Parameter` but differs in a few key areas: + - it is immutable and determined at deploy time (or prior to running if not deploying + to a scheduler) + - as such, it can be used anywhere in your code including in Metaflow decorators + + + Parameters + ---------- + name : str + User-visible parameter name. + default : Union[ConfigArgType, Callable[[ParameterContext], ConfigArgType]] + Default configuration either as a path to a file, the string representation of + a YAML or JSON file or a dictionary. If specified as a function, the function + will be evaluated to get the value to use. + required : bool, default False + Require that the user specified a value for the parameter. + `required=True` implies that the `default` value is ignored. + help : str, optional + Help text to show in `run --help`. + show_default : bool, default True + If True, show the default value in the help text. + """ + + def __init__( + self, + name: str, + required: bool = False, + help: Optional[str] = None, + **kwargs: Dict[str, str] + ): + super(Config, self).__init__( + name, + required=required, + help=help, + type=ConfigInput(), + **kwargs, + ) + + def load_parameter(self, v): + return v From 49c931ceeb4cdbe791df4885fcbe9e712461957f Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 20 Aug 2024 00:56:13 -0700 Subject: [PATCH 03/24] Move from -- to --config --- metaflow/__init__.py | 2 + metaflow/cli.py | 27 +- metaflow/cli_components/init_cmd.py | 2 +- metaflow/cli_components/run_cmds.py | 3 +- metaflow/cli_components/step_cmd.py | 1 + metaflow/decorators.py | 45 +- metaflow/flowspec.py | 140 +++++- metaflow/package.py | 2 +- metaflow/parameters.py | 16 +- metaflow/plugins/aws/batch/batch_decorator.py | 4 +- .../kubernetes/kubernetes_decorator.py | 4 +- metaflow/plugins/pypi/conda_decorator.py | 13 +- metaflow/plugins/timeout_decorator.py | 4 +- metaflow/runner/click_api.py | 4 +- metaflow/runtime.py | 180 ++++--- metaflow/user_configs.py | 438 
+++++++++++++----- metaflow/util.py | 11 + 17 files changed, 668 insertions(+), 228 deletions(-) diff --git a/metaflow/__init__.py b/metaflow/__init__.py index 409922a49d6..951b9acde0c 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -103,6 +103,8 @@ class and related decorators. from .parameters import Parameter, JSONTypeClass, JSONType +from .user_configs import Config, FlowConfig, config_expr, eval_config + # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as # metaflow.datatools. S3 is also a tool that has historically been available at the diff --git a/metaflow/cli.py b/metaflow/cli.py index 192a26f4855..962b80fa2f7 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -35,6 +35,7 @@ from .pylint_wrapper import PyLint from .R import metaflow_r_version, use_r from .util import resolve_identity +from .user_configs import LocalFileInput, config_options ERASE_TO_EOL = "\033[K" HIGHLIGHT = "red" @@ -223,7 +224,10 @@ def version(obj): echo_always(obj.version) +# NOTE: add_decorator_options should be TL because it checks to make sure +# that no option conflict with the ones below @decorators.add_decorator_options +@config_options @click.command( cls=LazyPluginCommandCollection, sources=[cli], @@ -293,6 +297,15 @@ def version(obj): type=click.Choice(MONITOR_SIDECARS), help="Monitoring backend type", ) +@click.option( + "--local-info-file", + type=LocalFileInput(exists=True, readable=True, dir_okay=False, resolve_path=True), + required=False, + default=None, + help="A filename containing a subset of the INFO file. 
Internal use only.", + hidden=True, + is_eager=True, +) @click.pass_context def start( ctx, @@ -306,6 +319,8 @@ def start( pylint=None, event_logger=None, monitor=None, + local_info_file=None, + config_options=None, **deco_options ): if quiet: @@ -322,11 +337,17 @@ def start( echo(" executing *%s*" % ctx.obj.flow.name, fg="magenta", nl=False) echo(" for *%s*" % resolve_identity(), fg="magenta") + # At this point, we are able to resolve the user-configuration options so we can + # process all those decorators that the user added that will modify the flow based + # on those configurations. It is important to do this as early as possible since it + # actually modifies the flow itself + ctx.obj.flow = ctx.obj.flow._process_config_funcs(config_options) + cli_args._set_top_kwargs(ctx.params) ctx.obj.echo = echo ctx.obj.echo_always = echo_always ctx.obj.is_quiet = quiet - ctx.obj.graph = FlowGraph(ctx.obj.flow.__class__) + ctx.obj.graph = ctx.obj.flow._graph ctx.obj.logger = logger ctx.obj.pylint = pylint ctx.obj.check = functools.partial(_check, echo) @@ -377,6 +398,10 @@ def start( ctx.obj.monitor, ) + ctx.obj.config_options = config_options + + decorators._resolve_configs(ctx.obj.flow) + # It is important to initialize flow decorators early as some of the # things they provide may be used by some of the objects initialized after. 
decorators._init_flow_decorators( diff --git a/metaflow/cli_components/init_cmd.py b/metaflow/cli_components/init_cmd.py index 404a3a6911e..fdd64bdcc54 100644 --- a/metaflow/cli_components/init_cmd.py +++ b/metaflow/cli_components/init_cmd.py @@ -47,5 +47,5 @@ def init(obj, run_id=None, task_id=None, tags=None, **kwargs): obj.monitor, run_id=run_id, ) - obj.flow._set_constants(obj.graph, kwargs) + obj.flow._set_constants(obj.graph, kwargs, obj.config_options) runtime.persist_constants(task_id=task_id) diff --git a/metaflow/cli_components/run_cmds.py b/metaflow/cli_components/run_cmds.py index fefd2cdfbe1..5d4701c4e13 100644 --- a/metaflow/cli_components/run_cmds.py +++ b/metaflow/cli_components/run_cmds.py @@ -40,6 +40,7 @@ def before_run(obj, tags, decospecs): ) if all_decospecs: decorators._attach_decorators(obj.flow, all_decospecs) + decorators._init(obj.flow, only_non_static=True) obj.graph = FlowGraph(obj.flow.__class__) obj.check(obj.graph, obj.flow, obj.environment, pylint=obj.pylint) @@ -326,7 +327,7 @@ def run( write_latest_run_id(obj, runtime.run_id) write_file(run_id_file, runtime.run_id) - obj.flow._set_constants(obj.graph, kwargs) + obj.flow._set_constants(obj.graph, kwargs, obj.config_options) current._update_env( { "run_id": runtime.run_id, diff --git a/metaflow/cli_components/step_cmd.py b/metaflow/cli_components/step_cmd.py index 88f300bd679..9302870fec6 100644 --- a/metaflow/cli_components/step_cmd.py +++ b/metaflow/cli_components/step_cmd.py @@ -138,6 +138,7 @@ def step( if decospecs: decorators._attach_decorators_to_step(func, decospecs) + decorators._init(ctx.obj.flow, only_non_static=True) step_kwargs = ctx.params # Remove argument `step_name` from `step_kwargs`. 
diff --git a/metaflow/decorators.py b/metaflow/decorators.py index efd27e2ffeb..8df702ebf10 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -12,6 +12,7 @@ ) from .parameters import current_flow +from .user_configs import DelayEvaluator from metaflow._vendor import click @@ -123,6 +124,30 @@ def __init__(self, attributes=None, statically_defined=False): else: raise InvalidDecoratorAttribute(self.name, k, self.defaults) + def resolve_configs(self): + """ + Resolve any configuration options that may be set in the decorator's attributes + """ + + def _resolve_delayed_evaluator(v): + if isinstance(v, DelayEvaluator): + return v() + if isinstance(v, dict): + return { + _resolve_delayed_evaluator(k): _resolve_delayed_evaluator(v) + for k, v in v.items() + } + if isinstance(v, list): + return [_resolve_delayed_evaluator(x) for x in v] + if isinstance(v, tuple): + return tuple(_resolve_delayed_evaluator(x) for x in v) + if isinstance(v, set): + return {_resolve_delayed_evaluator(x) for x in v} + return v + + for k, v in self.attributes.items(): + self.attributes[k] = _resolve_delayed_evaluator(v) + @classmethod def _parse_decorator_spec(cls, deco_spec): if len(deco_spec) == 0: @@ -203,10 +228,13 @@ def get_top_level_options(self): # compare this to parameters.add_custom_parameters def add_decorator_options(cmd): - seen = {} flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: return cmd + + seen = {} + existing_params = set(p.name.lower() for p in cmd.params) + # Add decorator options for deco in flow_decorators(flow_cls): for option, kwargs in deco.options.items(): if option in seen: @@ -217,6 +245,11 @@ def add_decorator_options(cmd): % (deco.name, option, seen[option]) ) raise MetaflowInternalError(msg) + elif deco.name.lower() in existing_params: + raise MetaflowInternalError( + "Flow decorator '%s' uses an option '%s' which is a reserved " + "keyword. Please use a different option name." 
% (deco.name, option) + ) else: kwargs["envvar"] = "METAFLOW_FLOW_%s" % option.upper() seen[option] = deco.name @@ -511,6 +544,16 @@ def _attach_decorators_to_step(step, decospecs): step.decorators.append(deco) +def _resolve_configs(flow): + # We get the datastore for the _parameters step which can contain + for decorators in flow._flow_decorators.values(): + for deco in decorators: + deco.resolve_configs() + for step in flow: + for deco in step.decorators: + deco.resolve_configs() + + def _init_flow_decorators( flow, graph, environment, flow_datastore, metadata, logger, echo, deco_options ): diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 0c7ffd1f128..b549345d76c 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -6,7 +6,7 @@ from itertools import islice from types import FunctionType, MethodType -from typing import Any, Callable, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Tuple from . import cmd_with_io, parameters from .parameters import DelayedEvaluationParameter, Parameter @@ -20,6 +20,7 @@ from .graph import FlowGraph from .unbounded_foreach import UnboundedForeachInput +from .user_configs import ConfigInput, ConfigValue from .util import to_pod from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS @@ -70,10 +71,66 @@ def __new__(cls, name, bases, dct): # This makes sure to give _flow_decorators to each # child class (and not share it with the FlowSpec base # class). This is important to not make a "global" - # _flow_decorators + # _flow_decorators. 
Same deal with user configurations f._flow_decorators = {} + f._user_configs = {} + + # We also cache parameter names to avoid having to recompute what is a parameter + # in the dir of a flow + f._cached_parameters = None + + # Finally attach all functions that need to be evaluated once user configurations + # are available + f._config_funcs = [] + return f + @property + def configs(cls) -> Generator[Tuple[str, "ConfigValue"], None, None]: + """ + Iterate over all user configurations in this flow + + Use this to parameterize your flow based on configuration. As an example: + ``` + def parametrize(flow): + val = next(flow.configs)[1].steps.start.cpu + flow.start = environment(vars={'mycpu': val})(flow.start) + return flow + + @parametrize + class TestFlow(FlowSpec): + config = Config('myconfig.json') + + @step + def start(self): + pass + ``` + can be used to add an environment decorator to the `start` step. + + Yields + ------ + Tuple[str, ConfigValue] + Iterates over the configurations of the flow + """ + # When configs are parsed, they are loaded in _user_configs + for name, value in cls._user_configs.items(): + yield name, ConfigValue(value) + + @property + def steps(cls) -> Generator[Tuple[str, Any], None, None]: + """ + Iterate over all the steps in this flow + + Yields + ------ + Tuple[str, Any] + A tuple with the step name and the step itself + """ + for var in dir(cls): + potential_step = getattr(cls, var) + if callable(potential_step) and hasattr(potential_step, "is_step"): + yield var, potential_step + class FlowSpec(metaclass=FlowSpecMeta): """ @@ -95,6 +152,9 @@ class FlowSpec(metaclass=FlowSpecMeta): "_cached_input", "_graph", "_flow_decorators", + "_user_configs", + "_cached_parameters", + "_config_funcs", "_steps", "index", "input", @@ -147,14 +207,7 @@ def script_name(self) -> str: fname = fname[:-1] return os.path.basename(fname) - def _set_constants(self, graph, kwargs): - from metaflow.decorators import ( - flow_decorators, - ) # To prevent 
circular dependency - - # Persist values for parameters and other constants (class level variables) - # only once. This method is called before persist_constants is called to - # persist all values set using setattr + def _check_parameters(self): seen = set() for var, param in self._get_parameters(): norm = param.name.lower() @@ -165,13 +218,69 @@ def _set_constants(self, graph, kwargs): "case-insensitive." % param.name ) seen.add(norm) - seen.clear() + + def _process_config_funcs(self, config_options): + current_cls = self.__class__ + + # Fast path for no user configurations + if not self._config_funcs: + return self + + # We need to convert all the user configurations from DelayedEvaluationParameters + # to actual values so they can be used as is in the config functions. + + # We then reset them to be proper parameters so they can be re-evaluated in + # _set_constants + to_reset_params = [] + self._check_parameters() + for var, param in self._get_parameters(): + if not param.IS_FLOW_PARAMETER: + continue + to_reset_params.append((var, param)) + val = config_options[param.name.replace("-", "_").lower()] + if isinstance(val, DelayedEvaluationParameter): + val = val() + setattr(current_cls, var, val) + + # Run all the functions. They will now be able to access the configuration + # values directly from the class + for func in self._config_funcs: + current_cls = func(current_cls) + + # Reset all configs that were already present in the class. + # TODO: This means that users can't override configs directly. 
Not sure if this + # is a pattern we want to support + for var, param in to_reset_params: + setattr(current_cls, var, param) + + # We reset cached_parameters on the very off chance that the user added + # more configurations based on the configuration + current_cls._cached_parameters = None + + # Set the current flow class we are in (the one we just created) + parameters.replace_flow_context(current_cls) + return current_cls(use_cli=False) + + def _set_constants(self, graph, kwargs, config_options): + from metaflow.decorators import ( + flow_decorators, + ) # To prevent circular dependency + + # Persist values for parameters and other constants (class level variables) + # only once. This method is called before persist_constants is called to + # persist all values set using setattr + self._check_parameters() + + seen = set() self._success = True parameters_info = [] for var, param in self._get_parameters(): seen.add(var) - val = kwargs[param.name.replace("-", "_").lower()] + if param.IS_FLOW_PARAMETER: + val = config_options[param.name.replace("-", "_").lower()] + else: + val = kwargs[param.name.replace("-", "_").lower()] # Support for delayed evaluation of parameters. 
if isinstance(val, DelayedEvaluationParameter): val = val() @@ -217,6 +326,11 @@ def _set_constants(self, graph, kwargs): @classmethod def _get_parameters(cls): + if cls._cached_parameters is not None: + for var in cls._cached_parameters: + yield var, getattr(cls, var) + return + build_list = [] for var in dir(cls): if var[0] == "_" or var in cls._NON_PARAMETERS: continue @@ -225,7 +339,9 @@ def _get_parameters(cls): except: continue if isinstance(val, Parameter): + build_list.append(var) yield var, val + cls._cached_parameters = build_list def _set_datastore(self, datastore): self._datastore = datastore diff --git a/metaflow/package.py b/metaflow/package.py index 30435dce47f..55968a03d65 100644 --- a/metaflow/package.py +++ b/metaflow/package.py @@ -153,7 +153,7 @@ def path_tuples(self): def _add_info(self, tar): info = tarfile.TarInfo(os.path.basename(INFO_FILE)) - env = self.environment.get_environment_info(include_ext_info=True) + env = self.environment.get_environment_info(full_info=True) buf = BytesIO() buf.write(json.dumps(env).encode("utf-8")) buf.seek(0) diff --git a/metaflow/parameters.py b/metaflow/parameters.py index 30e2e0f623b..81fae124b2e 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -72,6 +72,16 @@ def flow_context(flow_cls): context_proto = None +def replace_flow_context(flow_cls): + """ + Replace the current flow context with a new flow class. This is used + when we change the current flow class after having run user configuration functions + """ + current_flow.flow_cls_stack = current_flow.flow_cls_stack[1:] + current_flow.flow_cls_stack.insert(0, flow_cls) + current_flow.flow_cls = current_flow.flow_cls_stack[0] + + class JSONTypeClass(click.ParamType): name = "JSON" @@ -293,6 +303,8 @@ class MyFlow(FlowSpec): If True, show the default value in the help text. 
""" + IS_FLOW_PARAMETER = False + def __init__( self, name: str, @@ -433,7 +445,9 @@ def wrapper(cmd): flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: return cmd - parameters = [p for _, p in flow_cls._get_parameters()] + parameters = [ + p for _, p in flow_cls._get_parameters() if not p.IS_FLOW_PARAMETER + ] for arg in parameters[::-1]: kwargs = arg.option_kwargs(deploy_mode) cmd.params.insert(0, click.Option(("--" + arg.name,), **kwargs)) diff --git a/metaflow/plugins/aws/batch/batch_decorator.py b/metaflow/plugins/aws/batch/batch_decorator.py index 52291d49cd5..16ca5c6f768 100644 --- a/metaflow/plugins/aws/batch/batch_decorator.py +++ b/metaflow/plugins/aws/batch/batch_decorator.py @@ -138,8 +138,8 @@ class BatchDecorator(StepDecorator): supports_conda_environment = True target_platform = "linux-64" - def __init__(self, attributes=None, statically_defined=False): - super(BatchDecorator, self).__init__(attributes, statically_defined) + def resolve_configs(self): + super(BatchDecorator, self).resolve_configs() # If no docker image is explicitly specified, impute a default image. 
if not self.attributes["image"]: diff --git a/metaflow/plugins/kubernetes/kubernetes_decorator.py b/metaflow/plugins/kubernetes/kubernetes_decorator.py index 7852685bdcc..20b56175149 100644 --- a/metaflow/plugins/kubernetes/kubernetes_decorator.py +++ b/metaflow/plugins/kubernetes/kubernetes_decorator.py @@ -145,8 +145,8 @@ class KubernetesDecorator(StepDecorator): supports_conda_environment = True target_platform = "linux-64" - def __init__(self, attributes=None, statically_defined=False): - super(KubernetesDecorator, self).__init__(attributes, statically_defined) + def resolve_configs(self): + super(KubernetesDecorator, self).resolve_configs() if not self.attributes["namespace"]: self.attributes["namespace"] = KUBERNETES_NAMESPACE diff --git a/metaflow/plugins/pypi/conda_decorator.py b/metaflow/plugins/pypi/conda_decorator.py index 74418ae9f54..b6ac1b91d88 100644 --- a/metaflow/plugins/pypi/conda_decorator.py +++ b/metaflow/plugins/pypi/conda_decorator.py @@ -49,7 +49,9 @@ class CondaStepDecorator(StepDecorator): # CONDA_CHANNELS in their environment. For pinning specific packages to specific # conda channels, users can specify channel::package as the package name. - def __init__(self, attributes=None, statically_defined=False): + def resolve_configs(self): + super(CondaStepDecorator, self).resolve_configs() + self._user_defined_attributes = ( attributes.copy() if attributes is not None else {} ) @@ -172,9 +174,7 @@ def runtime_init(self, flow, graph, package, run_id): encoding="utf-8", ) as f: f.write( - json.dumps( - self.environment.get_environment_info(include_ext_info=True) - ) + json.dumps(self.environment.get_environment_info(full_info=True)) ) # Support metaflow extensions. 
@@ -332,11 +332,8 @@ class CondaFlowDecorator(FlowDecorator):
         "disabled": None,
     }
 
-    def __init__(self, attributes=None, statically_defined=False):
-        self._user_defined_attributes = (
-            attributes.copy() if attributes is not None else {}
-        )
-        super(CondaFlowDecorator, self).__init__(attributes, statically_defined)
+    def resolve_configs(self):
+        super(CondaFlowDecorator, self).resolve_configs()
 
         # Support legacy 'libraries=' attribute for the decorator.
         self.attributes["packages"] = {
diff --git a/metaflow/plugins/timeout_decorator.py b/metaflow/plugins/timeout_decorator.py
index e2c04dbcb31..648e318d36a 100644
--- a/metaflow/plugins/timeout_decorator.py
+++ b/metaflow/plugins/timeout_decorator.py
@@ -37,8 +37,8 @@ class TimeoutDecorator(StepDecorator):
     name = "timeout"
     defaults = {"seconds": 0, "minutes": 0, "hours": 0}
 
-    def __init__(self, *args, **kwargs):
-        super(TimeoutDecorator, self).__init__(*args, **kwargs)
+    def resolve_configs(self):
+        super().resolve_configs()
         # Initialize secs in __init__ so other decorators could safely use this
         # value without worrying about decorator order.
# Convert values in attributes to type:int since they can be type:str diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 47692113804..83d7d3f9081 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -224,7 +224,9 @@ def name(self): @classmethod def from_cli(cls, flow_file: str, cli_collection: Callable) -> Callable: flow_cls = extract_flow_class_from_file(flow_file) - flow_parameters = [p for _, p in flow_cls._get_parameters()] + flow_parameters = [ + p for _, p in flow_cls._get_parameters() if not p.IS_FLOW_PARAMETER + ] with flow_context(flow_cls) as _: add_decorator_options(cli_collection) diff --git a/metaflow/runtime.py b/metaflow/runtime.py index 5f86210aa67..e4cb07caddb 100644 --- a/metaflow/runtime.py +++ b/metaflow/runtime.py @@ -6,9 +6,11 @@ """ from __future__ import print_function +import json import os import sys import fcntl +import tempfile import time import subprocess from datetime import datetime @@ -39,6 +41,9 @@ UBF_CONTROL, UBF_TASK, ) + +from .user_configs import ConfigInput, dump_config_values + import metaflow.tracing as tracing MAX_WORKERS = 16 @@ -417,82 +422,95 @@ def execute(self): else: self._queue_push("start", {}) progress_tstamp = time.time() - try: - # main scheduling loop - exception = None - while self._run_queue or self._active_tasks[0] > 0 or self._cloned_tasks: - # 1. are any of the current workers finished? - if self._cloned_tasks: - finished_tasks = self._cloned_tasks - # reset the list of cloned tasks and let poll_workers handle - # the remaining transition - self._cloned_tasks = [] - else: - finished_tasks = list(self._poll_workers()) - # 2. push new tasks triggered by the finished tasks to the queue - self._queue_tasks(finished_tasks) - # 3. if there are available worker slots, pop and start tasks - # from the queue. 
- self._launch_workers() - - if time.time() - progress_tstamp > PROGRESS_INTERVAL: - progress_tstamp = time.time() - tasks_print = ", ".join( - [ - "%s (%d running; %d done)" % (k, v[0], v[1]) - for k, v in self._active_tasks.items() - if k != 0 and v[0] > 0 - ] - ) - if self._active_tasks[0] == 0: - msg = "No tasks are running." + with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file: + # Configurations are passed through a file to avoid overloading the + # command-line. We only need to create this file once and it can be reused + # for any task launch + config_key, config_value = dump_config_values(self._flow) + if config_value: + json.dump({config_key: config_value}, config_file) + config_file.flush() + self._config_file_name = config_file.name + else: + self._config_file_name = None + try: + # main scheduling loop + exception = None + while ( + self._run_queue or self._active_tasks[0] > 0 or self._cloned_tasks + ): + # 1. are any of the current workers finished? + if self._cloned_tasks: + finished_tasks = self._cloned_tasks + # reset the list of cloned tasks and let poll_workers handle + # the remaining transition + self._cloned_tasks = [] else: - if self._active_tasks[0] == 1: - msg = "1 task is running: " + finished_tasks = list(self._poll_workers()) + # 2. push new tasks triggered by the finished tasks to the queue + self._queue_tasks(finished_tasks) + # 3. if there are available worker slots, pop and start tasks + # from the queue. + self._launch_workers() + + if time.time() - progress_tstamp > PROGRESS_INTERVAL: + progress_tstamp = time.time() + tasks_print = ", ".join( + [ + "%s (%d running; %d done)" % (k, v[0], v[1]) + for k, v in self._active_tasks.items() + if k != 0 and v[0] > 0 + ] + ) + if self._active_tasks[0] == 0: + msg = "No tasks are running." else: - msg = "%d tasks are running: " % self._active_tasks[0] - msg += "%s." 
% tasks_print + if self._active_tasks[0] == 1: + msg = "1 task is running: " + else: + msg = "%d tasks are running: " % self._active_tasks[0] + msg += "%s." % tasks_print - self._logger(msg, system_msg=True) + self._logger(msg, system_msg=True) - if len(self._run_queue) == 0: - msg = "No tasks are waiting in the queue." - else: - if len(self._run_queue) == 1: - msg = "1 task is waiting in the queue: " + if len(self._run_queue) == 0: + msg = "No tasks are waiting in the queue." else: - msg = "%d tasks are waiting in the queue." % len( - self._run_queue - ) + if len(self._run_queue) == 1: + msg = "1 task is waiting in the queue: " + else: + msg = "%d tasks are waiting in the queue." % len( + self._run_queue + ) - self._logger(msg, system_msg=True) - if len(self._unprocessed_steps) > 0: - if len(self._unprocessed_steps) == 1: - msg = "%s step has not started" % ( - next(iter(self._unprocessed_steps)), - ) - else: - msg = "%d steps have not started: " % len( - self._unprocessed_steps - ) - msg += "%s." % ", ".join(self._unprocessed_steps) self._logger(msg, system_msg=True) - - except KeyboardInterrupt as ex: - self._logger("Workflow interrupted.", system_msg=True, bad=True) - self._killall() - exception = ex - raise - except Exception as ex: - self._logger("Workflow failed.", system_msg=True, bad=True) - self._killall() - exception = ex - raise - finally: - # on finish clean tasks - for step in self._flow: - for deco in step.decorators: - deco.runtime_finished(exception) + if len(self._unprocessed_steps) > 0: + if len(self._unprocessed_steps) == 1: + msg = "%s step has not started" % ( + next(iter(self._unprocessed_steps)), + ) + else: + msg = "%d steps have not started: " % len( + self._unprocessed_steps + ) + msg += "%s." 
% ", ".join(self._unprocessed_steps) + self._logger(msg, system_msg=True) + + except KeyboardInterrupt as ex: + self._logger("Workflow interrupted.", system_msg=True, bad=True) + self._killall() + exception = ex + raise + except Exception as ex: + self._logger("Workflow failed.", system_msg=True, bad=True) + self._killall() + exception = ex + raise + finally: + # on finish clean tasks + for step in self._flow: + for deco in step.decorators: + deco.runtime_finished(exception) # assert that end was executed and it was successful if ("end", ()) in self._finished: @@ -901,7 +919,7 @@ def _launch_worker(self, task): ) return - worker = Worker(task, self._max_log_size) + worker = Worker(task, self._max_log_size, self._config_file_name) for fd in worker.fds(): self._workers[fd] = worker self._poll.add(fd) @@ -1181,7 +1199,6 @@ def __init__( # Open the output datastore only if the task is not being cloned. if not self._is_cloned: self.new_attempt() - for deco in decos: deco.runtime_task_created( self._ds, @@ -1448,6 +1465,14 @@ def __init__(self, task): for deco in flow_decorators(self.task.flow): self.top_level_options.update(deco.get_top_level_options()) + # We also pass configuration options using the kv. 
syntax which will cause + # the configuration options to be loaded from the INFO file (or local-info-file + # in the case of the local runtime) + if self.task.flow._user_configs: + self.top_level_options["config"] = [ + (k, ConfigInput.make_key_name(k)) for k in self.task.flow._user_configs + ] + self.commands = ["step"] self.command_args = [self.task.step] self.command_options = { @@ -1481,7 +1506,9 @@ def _options(mapping): for value in v: yield "--%s" % k if not isinstance(value, bool): - yield to_unicode(value) + value = value if isinstance(value, tuple) else (value,) + for vv in value: + yield to_unicode(vv) args = list(self.entrypoint) args.extend(_options(self.top_level_options)) @@ -1498,8 +1525,9 @@ def __str__(self): class Worker(object): - def __init__(self, task, max_logs_size): + def __init__(self, task, max_logs_size, config_file_name): self.task = task + self._config_file_name = config_file_name self._proc = self._launch() if task.retries > task.user_code_retries: @@ -1551,6 +1579,12 @@ def _launch(self): self.task.user_code_retries, self.task.ubf_context, ) + + # Add user configurations using a file to avoid using up too much space on the + # command line + if self._config_file_name: + args.top_level_options["local-info-file"] = self._config_file_name + # Pass configuration options env.update(args.get_env()) env["PYTHONUNBUFFERED"] = "x" tracing.inject_tracing_vars(env) diff --git a/metaflow/user_configs.py b/metaflow/user_configs.py index 1570af25b55..ed255e18996 100644 --- a/metaflow/user_configs.py +++ b/metaflow/user_configs.py @@ -1,15 +1,16 @@ import json import os -from typing import Any, Dict, Optional, Union, TYPE_CHECKING +from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING from metaflow import INFO_FILE from metaflow._vendor import click -from .exception import MetaflowException +from .exception import MetaflowException, MetaflowInternalError from .parameters import ( DelayedEvaluationParameter, Parameter, + 
ParameterContext, current_flow, ) import functools @@ -37,21 +38,23 @@ # return tracefunc_closure -def dump_config_values(flow: FlowSpec): - if hasattr(flow, "_user_configs"): +def dump_config_values(flow: "FlowSpec"): + if flow._user_configs: return "user_configs", flow._user_configs return None, None -def load_config_values() -> Optional[Dict[str, Any]]: +def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[str, Any]]: + if info_file is None: + info_file = INFO_FILE try: - with open(INFO_FILE, encoding="utf-8") as contents: + with open(info_file, encoding="utf-8") as contents: return json.load(contents).get("user_configs", {}) except IOError: return None -class ConfigValue(object): +class ConfigValue: # Thin wrapper to allow configuration values to be accessed using a "." notation # as well as a [] notation. @@ -61,25 +64,47 @@ def __init__(self, data: Dict[str, Any]): for key, value in data.items(): if isinstance(value, dict): value = ConfigValue(value) - elif isinstance(value, list): - value = [ConfigValue(v) for v in value] setattr(self, key, value) def __getitem__(self, key): value = self._data[key] if isinstance(value, dict): value = ConfigValue(value) - elif isinstance(value, list): - value = [ConfigValue(v) for v in value] return value def __repr__(self): return repr(self._data) + def __str__(self): + return json.dumps(self._data) + -class ConfigInput(click.ParamType): +class PathOrStr(click.ParamType): name = "ConfigInput" + def convert(self, value, param, ctx): + if value is None: + return None + + if isinstance(value, dict): + return "converted:" + json.dumps(value) + + if value.startswith("converted:"): + return value + + if os.path.isfile(value): + try: + with open(value, "r") as f: + content = f.read() + except OSError as e: + raise click.UsageError( + "Could not read configuration file '%s'" % value + ) from e + return "converted:" + content + return "converted:" + value + + +class ConfigInput: # Contains the values loaded from 
the INFO file. We make this a class method # so that if there are multiple configs, we just need to read the file once. # It is OK to be globally unique because this is only evoked in scenario A.2 (see @@ -88,127 +113,81 @@ class ConfigInput(click.ParamType): # (ie: even if Runner is evoked in that task, we won't "share" this global value's # usage). loaded_configs = None # type: Optional[Dict[str, Dict[str, Any]]] + info_file = None # type: Optional[str] - def __init__(self): - self._flow_cls = getattr(current_flow, "flow_cls", None) - if self._flow_cls is None: - raise MetaflowException("ConfigInput can only be used inside a flow") - if not hasattr(self._flow_cls, "_user_configs"): - self._flow_cls._user_configs = {} + def __init__( + self, + req_configs: List[str], + parsers: Dict[str, Callable[[str], Dict[str, Any]]], + ): + self._req_configs = req_configs + self._parsers = parsers @staticmethod - def _make_key_name(name: str) -> str: + def make_key_name(name: str) -> str: return "kv." + name.lower() + @classmethod + def set_info_file(cls, info_file: str): + cls.info_file = info_file + @classmethod def get_config(cls, config_name: str) -> Optional[Dict[str, Any]]: if cls.loaded_configs is None: - all_configs = load_config_values() + all_configs = load_config_values(cls.info_file) if all_configs is None: raise MetaflowException( "Could not load expected configuration values " - "the INFO file. This is a Metaflow bug. Please contact support." + "from the INFO file. This is a Metaflow bug. Please contact support." ) cls.loaded_configs = all_configs return cls.loaded_configs.get(config_name, None) - def convert(self, value, param, ctx): - # Click can call convert multiple times, so we need to make sure to only - # convert once. 
- if isinstance(value, (ConfigValue, DelayedEvaluationParameter)): - return value - - # There are two paths we need to worry about: - # - Scenario A: deploying to a scheduler - # A.1 In this case, when deploying (using `step-functions create` for example), - # the value passed to click (or the default value) will be converted and we - # will: - # - store the configuration in the flow object under _user_configs (so that it - # can later be dumped to the INFO file when packaging) - # - return a DelayedEvaluationParameter object so that when the scheduler - # evaluates it (with return_str set to True), it gets back the *string* - # kv. which indicates that this - # configuration should be fetched from INFO - # A.2 When the scheduler runs the flow, the value returned in A.1 (the kv. - # string) will be passed to convert again. This time, we directly return a - # ConfigValue after having fetched/loaded the configuration from INFO. - # - # - Scenario B: running with the native Runtime - # The value passed in will be similarly stored under _user_configs. We also - # return a DelayedEvaluationParameter object but when the _set_constants in - # the runtime calls it, it calls it with return_str set to False and it will - # return a ConfigValue directly which can then be persisted in the artifact - # store. - - # The value we get in to convert can be: - # - a dictionary - # - a path to a YAML or JSON file - # - the string representation of a YAML or JSON file - # In all cases, we also store the configuration in the flow object under _user_configs. - # It will *not* be stored as an artifact but is a good place to store it so we - # can access it when packaging to store it in the INFO file. 
The config itself - # will be stored as regular artifacts (the ConfigValue object basically) - - def _delay_eval(name: str, value: ConfigValue, return_str=False): - if return_str: - # Scenario A.1 when deploy_time_eval is called by the scheduler - # (or, in some cases, some schedulers directly identify the - # DelayedEvaluationParameter value and call it directory with - # return_str=True) - return name - # Scenario B - return value - - if isinstance(value, dict): - # Scenario A.1 or B. - self._flow_cls._user_configs[self._make_key_name(param.name)] = value - return DelayedEvaluationParameter( - param.name, "value", functools.partial(_delay_eval, param.name, value) - ) - elif not isinstance(value, str): - raise MetaflowException( - "Configuration value for '%s' must be a string or a dictionary" - % param.name + def process_configs(self, ctx, param, value): + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + # This is an error + raise MetaflowInternalError( + "Config values should be processed for a FlowSpec" ) - # Here we are sure we have a string - if value.startswith("kv."): - # This is scenario A.2 - value = self.get_config(value) - if value is None: - raise MetaflowException( - "Could not find configuration '%s' in INFO file" % value - ) - return ConfigValue(value) + # First validate if we have all the required parameters + # Here value is a list of tuples. Each tuple has the name of the configuration + # and the string representation of the config (it was already read + # from a file if applicable). 
+ missing = set(self._req_configs) - set([v[0] for v in value]) + if missing: + raise click.UsageError( + "Missing required configuration values: %s" % ", ".join(missing) + ) - elif os.path.isfile(value): - try: - with open(value, "r") as f: - content = f.read() - except OSError as e: - raise MetaflowException( - "Could not read configuration file '%s'" % value - ) from e - try: - value = json.loads(content) - except json.JSONDecodeError as e: - raise MetaflowException( - "Configuration file '%s' is not valid JSON" % value - ) from e - # TODO: Support YAML - self._flow_cls._user_configs[self._make_key_name(param.name)] = value - else: - try: - value = json.loads(value) - except json.JSONDecodeError as e: - raise MetaflowException( - "Configuration value for '%s' is not valid JSON" % param.name - ) from e - # TODO: Support YAML - self._flow_cls._user_configs[self._make_key_name(param.name)] = value - return DelayedEvaluationParameter( - param.name, "value", functools.partial(_delay_eval, param.name, value) - ) + to_return = {} + for name, val in value: + name = name.lower() + val = val[10:] # Remove the "converted:" prefix + if val.startswith("kv."): + # This means to load it from a file + read_value = self.get_config(val[3:]) + if read_value is None: + raise click.UsageError( + "Could not find configuration '%s' in INFO file" % val + ) + flow_cls._user_configs[name] = read_value + to_return[name] = ConfigValue(read_value) + else: + if self._parsers[name]: + read_value = self._parsers[name](val) + else: + try: + read_value = json.loads(val) + except json.JSONDecodeError as e: + raise click.UsageError( + "Configuration value for '%s' is not valid JSON" % name + ) from e + # TODO: Support YAML + flow_cls._user_configs[name] = read_value + to_return[name] = ConfigValue(read_value) + return to_return def __str__(self): return repr(self) @@ -217,9 +196,140 @@ def __repr__(self): return "ConfigInput" +class LocalFileInput(click.Path): + name = "LocalFileInput" + + def 
convert(self, value, param, ctx): + super().convert(value, param, ctx) + ConfigInput.set_info_file(value) + # This purposefully returns None which means it is *not* passed down + # when commands use ctx.parent.parent.params to get all the configuration + # values. + + def __str__(self): + return repr(self) + + def __repr__(self): + return "LocalFileInput" + + ConfigArgType = Union[str, Dict[str, Any]] +class DelayEvaluator: + """ + Small wrapper that allows the evaluation of a Config() value in a delayed manner. + This is used when we want to use config.* values in decorators for example. + """ + + def __init__(self, config_expr: str, is_var_only=True): + self._config_expr = config_expr + if is_var_only: + self._access = [] + else: + self._access = None + self._is_var_only = is_var_only + + def __getattr__(self, name): + if self._access is None: + raise AttributeError() + self._access.append(name) + return self + + def __call__(self): + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + # We are not executing inside a flow (ie: not the CLI) + raise MetaflowException( + "Config object can only be used directly in the FlowSpec defining them. " + "If using outside of the FlowSpec, please use ConfigEval" + ) + if self._access is not None: + # Build the final expression by adding all the fields in access as . fields + self._config_expr = ".".join([self._config_expr] + self._access) + # Evaluate the expression setting the config values as local variables + return eval( + self._config_expr, + globals(), + {k: ConfigValue(v) for k, v in flow_cls._user_configs.items()}, + ) + + +def config_expr(expr: str) -> DelayEvaluator: + return DelayEvaluator(expr) + + +def eval_config(f: Callable[["FlowSpec"], "FlowSpec"]) -> "FlowSpec": + """ + Decorator to allow you to add Python decorators to a FlowSpec that makes use of + user configurations. 
+ + As an example: + + ``` + def parameterize(f): + for s in f: + # Iterate over all the steps + if s.name in f.config.add_env_to_steps: + setattr(f, s.name) = environment(vars={**f.config.env_vars})(s) + return f + + @eval_config(parameterize) + class MyFlow(FlowSpec): + config = Config("config") + ... + ``` + + allows you to add an environment decorator to all steps in `add_env_to_steps`. Both + the steps to add this decorator to and the values to add are extracted from the + configuration passed to the Flow through config. + + Parameters + ---------- + f : Callable[[FlowSpec], FlowSpec] + Decorator function + + Returns + ------- + FlowSpec + The modified FlowSpec + """ + + def _wrapper(flow_spec: "FlowSpec"): + flow_spec._config_funcs.append(f) + return flow_spec + + return _wrapper + + +class FlowConfig(DelayEvaluator): + def __init__(self, config_name: str): + """ + Small wrapper to allow you to refer to a flow's configuration in a flow-level + decorator. + + As an example: + + @project(name=FlowConfig("config").project.name) + class MyFlow(FlowSpec): + config = Config("config") + ... + + This will allow you to specify a `project.name` value in your configuration + and have it used in the flow-level decorator. + + Without this construct, it would be difficult to access `config` inside the + arguments of the decorator. + + Parameters + ---------- + config_name : str + Name of the configuration being used. This should be the name given to + the `Config` constructor. + """ + super().__init__(config_name, is_var_only=True) + + class Config(Parameter): """ Includes a configuration for this flow. @@ -233,34 +343,110 @@ class Config(Parameter): Parameters ---------- name : str - User-visible parameter name. - default : Union[ConfigArgType, Callable[[ParameterContext], ConfigArgType]] - Default configuration either as a path to a file, the string representation of - a YAML or JSON file or a dictionary. 
If specified as a function, the function - will be evaluated to get the value to use. + User-visible configuration name. + default : Union[str, Dict[str, Any], Callable[[ParameterContext], Union[str, Dict[str, Any]]]], optional, default None + Default value for the parameter. A function + implies that the value will be computed using that function. + help : str, optional, default None + Help text to show in `run --help`. required : bool, default False Require that the user specified a value for the parameter. - `required=True` implies that the `default` value is ignored. - help : str, optional - Help text to show in `run --help`. + `required=True` implies that the `default` is not used. + parser : Callable[[str], Dict[str, Any]], optional, default None show_default : bool, default True If True, show the default value in the help text. """ + IS_FLOW_PARAMETER = True + def __init__( self, name: str, - required: bool = False, + default: Optional[ + Union[ + str, + Dict[str, Any], + Callable[[ParameterContext], Union[str, Dict[str, Any]]], + ] + ] = None, help: Optional[str] = None, - **kwargs: Dict[str, str] + required: bool = False, + parser: Optional[Callable[[str], Dict[str, Any]]] = None, + **kwargs: Dict[str, str], ): super(Config, self).__init__( name, + default=default, required=required, help=help, - type=ConfigInput(), + type=str, **kwargs, ) + if isinstance(kwargs.get("default", None), str): + kwargs["default"] = json.dumps(kwargs["default"]) + self.parser = parser def load_parameter(self, v): return v + + def __getattr__(self, name): + ev = DelayEvaluator(self.name, is_var_only=True) + return ev.__getattr__(name) + + +def config_options(cmd): + help_strs = [] + required_names = [] + defaults = [] + config_seen = set() + parsers = {} + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + return cmd + + parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] + # List all the configuration options + for arg in 
parameters[::-1]: + kwargs = arg.option_kwargs(False) + if arg.name.lower() in config_seen: + msg = ( + "Multiple configurations use the same name '%s'. Note that names are " + "case-insensitive. Please change the " + "names of some of your configurations" % arg.name + ) + raise MetaflowException(msg) + config_seen.add(arg.name.lower()) + if kwargs["required"]: + required_names.append(arg.name) + if kwargs.get("default") is not None: + defaults.append((arg.name.lower(), kwargs["default"])) + else: + defaults.append(None) + help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) + parsers[arg.name.lower()] = arg.parser + + print("DEFAULTS %s" % defaults) + if not config_seen: + # No configurations -- don't add anything + return cmd + + help_str = ( + "Configuration options for the flow. " + "Multiple configurations can be specified." + ) + help_str = "\n\n".join([help_str] + help_strs) + cmd.params.insert( + 0, + click.Option( + ["--config", "config_options"], + nargs=2, + multiple=True, + type=click.Tuple([click.Choice(config_seen), PathOrStr()]), + callback=ConfigInput(required_names, parsers).process_configs, + help=help_str, + envvar="METAFLOW_FLOW_CONFIG", + show_default=False, + default=defaults, + ), + ) + return cmd diff --git a/metaflow/util.py b/metaflow/util.py index e95ab9f0f87..69f3be5db25 100644 --- a/metaflow/util.py +++ b/metaflow/util.py @@ -296,6 +296,9 @@ def get_metaflow_root(): def dict_to_cli_options(params): + # Prevent circular imports + from metaflow.user_configs import ConfigInput, ConfigValue + for k, v in params.items(): # Omit boolean options set to false or None, but preserve options with an empty # string argument. @@ -304,6 +307,14 @@ def dict_to_cli_options(params): # keyword in Python, so we call it 'decospecs' in click args if k == "decospecs": k = "with" + if k == "config_options": + # Special handling here since we gather them all in one option but actually + # need to send them one at a time using --config kv. 
+ for config_name in v.keys(): + yield "--config" + yield to_unicode(config_name) + yield to_unicode(ConfigInput.make_key_name(config_name)) + continue k = k.replace("_", "-") v = v if isinstance(v, (list, tuple, set)) else [v] for value in v: From 5ef7c3b9a3ccdcfcee4ee3318c1f17038126b07e Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 28 Aug 2024 23:51:01 -0700 Subject: [PATCH 04/24] Fix runner use of configs --- metaflow/runner/click_api.py | 2 ++ metaflow/user_configs.py | 9 ++------- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 83d7d3f9081..4b69d7b61e3 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -39,6 +39,7 @@ from metaflow.exception import MetaflowException from metaflow.includefile import FilePathClass from metaflow.parameters import JSONTypeClass, flow_context +from metaflow.user_configs import LocalFileInput # Define a recursive type alias for JSON JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None] @@ -56,6 +57,7 @@ File: str, JSONTypeClass: JSON, FilePathClass: str, + LocalFileInput: str, } diff --git a/metaflow/user_configs.py b/metaflow/user_configs.py index ed255e18996..bd3f7a176d5 100644 --- a/metaflow/user_configs.py +++ b/metaflow/user_configs.py @@ -372,15 +372,10 @@ def __init__( help: Optional[str] = None, required: bool = False, parser: Optional[Callable[[str], Dict[str, Any]]] = None, - **kwargs: Dict[str, str], + **kwargs: Dict[str, str] ): super(Config, self).__init__( - name, - default=default, - required=required, - help=help, - type=str, - **kwargs, + name, default=default, required=required, help=help, type=str, **kwargs ) if isinstance(kwargs.get("default", None), str): kwargs["default"] = json.dumps(kwargs["default"]) From 2f1ab934e1254df2111d42c546f124b4b1c6f9a3 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Fri, 6 Sep 2024 01:53:54 -0700 Subject: [PATCH 05/24] Multiple fix plus 
sample flow Several fixes: - fixed an issue with default values - better handling of parameter defaults as configs - handle config defaults as functions - ConfigValue is more "dict"-like - made .configs and .steps work properly - renamed resolve_configs to init --- metaflow/__init__.py | 2 +- metaflow/cli.py | 13 +- metaflow/decorators.py | 11 +- metaflow/flowspec.py | 7 +- metaflow/includefile.py | 2 + metaflow/parameters.py | 15 +- metaflow/plugins/aws/batch/batch_decorator.py | 4 +- .../kubernetes/kubernetes_decorator.py | 4 +- metaflow/plugins/pypi/conda_decorator.py | 16 +- metaflow/plugins/timeout_decorator.py | 4 +- metaflow/user_configs.py | 185 ++++++++++++------ test/core/metaflow_test/__init__.py | 1 + test/core/metaflow_test/formatter.py | 6 +- test_config/config2.json | 4 + test_config/helloconfig.py | 139 +++++++++++++ 15 files changed, 323 insertions(+), 90 deletions(-) create mode 100644 test_config/config2.json create mode 100644 test_config/helloconfig.py diff --git a/metaflow/__init__.py b/metaflow/__init__.py index 951b9acde0c..4f58772c78d 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -103,7 +103,7 @@ class and related decorators. from .parameters import Parameter, JSONTypeClass, JSONType -from .user_configs import Config, FlowConfig, config_expr, eval_config +from .user_configs import Config, config_expr, eval_config # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as diff --git a/metaflow/cli.py b/metaflow/cli.py index 962b80fa2f7..13c81082bb6 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -235,11 +235,15 @@ def version(obj): invoke_without_command=True, ) @tracing.cli_entrypoint("cli/start") +# Quiet is eager to make sure it is available when processing --config options since +# we need it to construct a context to pass to any DeployTimeField for the default +# value. 
@click.option( "--quiet/--not-quiet", show_default=True, default=False, help="Suppress unnecessary messages", + is_eager=True, ) @click.option( "--metadata", @@ -255,12 +259,14 @@ def version(obj): type=click.Choice(["local"] + [m.TYPE for m in ENVIRONMENTS]), help="Execution environment type", ) +# See comment for --quiet @click.option( "--datastore", default=DEFAULT_DATASTORE, show_default=True, type=click.Choice([d.TYPE for d in DATASTORES]), help="Data backend type", + is_eager=True, ) @click.option("--datastore-root", help="Root path for datastore") @click.option( @@ -400,7 +406,7 @@ def start( ctx.obj.config_options = config_options - decorators._resolve_configs(ctx.obj.flow) + decorators._init(ctx.obj.flow) # It is important to initialize flow decorators early as some of the # things they provide may be used by some of the objects initialized after. @@ -424,7 +430,10 @@ def start( # initialize current and parameter context for deploy-time parameters current._set_env(flow=ctx.obj.flow, is_running=False) parameters.set_parameter_context( - ctx.obj.flow.name, ctx.obj.echo, ctx.obj.flow_datastore + ctx.obj.flow.name, + ctx.obj.echo, + ctx.obj.flow_datastore, + dict(ctx.obj.flow.configs), ) if ctx.invoked_subcommand not in ("run", "resume"): diff --git a/metaflow/decorators.py b/metaflow/decorators.py index 8df702ebf10..b9bf32a21a7 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -124,9 +124,10 @@ def __init__(self, attributes=None, statically_defined=False): else: raise InvalidDecoratorAttribute(self.name, k, self.defaults) - def resolve_configs(self): + def init(self): """ - Resolve any configuration options that may be set in the decorator's attributes + Initializes the decorator. In general, any operation you would do in __init__ + should be done here. 
""" def _resolve_delayed_evaluator(v): @@ -544,14 +545,14 @@ def _attach_decorators_to_step(step, decospecs): step.decorators.append(deco) -def _resolve_configs(flow): +def _init(flow): # We get the datastore for the _parameters step which can contain for decorators in flow._flow_decorators.values(): for deco in decorators: - deco.resolve_configs() + deco.init() for step in flow: for deco in step.decorators: - deco.resolve_configs() + deco.init() def _init_flow_decorators( diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index b549345d76c..8cd25710aa6 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -237,7 +237,8 @@ def _process_config_funcs(self, config_options): if not param.IS_FLOW_PARAMETER: continue to_reset_params.append((var, param)) - val = config_options[param.name.replace("-", "_").lower()] + # Note that a config with no default and not required will be None + val = config_options.get(param.name.replace("-", "_").lower()) if isinstance(val, DelayedEvaluationParameter): val = val() setattr(current_cls, var, val) @@ -278,7 +279,7 @@ def _set_constants(self, graph, kwargs, config_options): for var, param in self._get_parameters(): seen.add(var) if param.IS_FLOW_PARAMETER: - val = config_options[param.name.replace("-", "_").lower()] + val = config_options.get(param.name.replace("-", "_").lower()) else: val = kwargs[param.name.replace("-", "_").lower()] # Support for delayed evaluation of parameters. @@ -360,6 +361,8 @@ def __iter__(self): return iter(self._steps) def __getattr__(self, name: str): + if name in ("configs", "steps"): + return getattr(self.__class__, name) if self._datastore and name in self._datastore: # load the attribute from the datastore... 
x = self._datastore[name] diff --git a/metaflow/includefile.py b/metaflow/includefile.py index 4bc16172863..b93f8c49224 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -20,6 +20,7 @@ ) from .plugins import DATACLIENTS +from .user_configs import ConfigValue from .util import get_username import functools @@ -136,6 +137,7 @@ def convert(self, value, param, ctx): parameter_name=param.name, logger=ctx.obj.echo, ds_type=ctx.obj.datastore_impl.TYPE, + configs=ConfigValue(dict(ctx.obj.flow.__class__.configs)), ) if len(value) > 0 and (value.startswith("{") or value.startswith('"{')): diff --git a/metaflow/parameters.py b/metaflow/parameters.py index 81fae124b2e..090e339491c 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from threading import local -from typing import Any, Callable, Dict, NamedTuple, Optional, Type, Union +from typing import Any, Callable, Dict, NamedTuple, Optional, TYPE_CHECKING, Type, Union from metaflow._vendor import click @@ -14,6 +14,9 @@ MetaflowException, ) +if TYPE_CHECKING: + from .user_configs import ConfigValue + try: # Python2 strtype = basestring @@ -32,6 +35,7 @@ ("parameter_name", str), ("logger", Callable[..., None]), ("ds_type", str), + ("configs", "ConfigValue"), ], ) @@ -221,7 +225,9 @@ def deploy_time_eval(value): # this is called by cli.main -def set_parameter_context(flow_name, echo, datastore): +def set_parameter_context(flow_name, echo, datastore, configs): + from .user_configs import ConfigValue # Prevent circular dependency + global context_proto context_proto = ParameterContext( flow_name=flow_name, @@ -229,6 +235,7 @@ def set_parameter_context(flow_name, echo, datastore): parameter_name=None, logger=echo, ds_type=datastore.TYPE, + configs=ConfigValue(dict(configs)), ) @@ -297,8 +304,8 @@ class MyFlow(FlowSpec): help : str, optional Help text to show in `run --help`. 
required : bool, default False - Require that the user specified a value for the parameter. - `required=True` implies that the `default` is not used. + Require that the user specified a value for the parameter. If a non-None + default is specified, that default will be used if no other value is provided show_default : bool, default True If True, show the default value in the help text. """ diff --git a/metaflow/plugins/aws/batch/batch_decorator.py b/metaflow/plugins/aws/batch/batch_decorator.py index 16ca5c6f768..e24591dda32 100644 --- a/metaflow/plugins/aws/batch/batch_decorator.py +++ b/metaflow/plugins/aws/batch/batch_decorator.py @@ -138,8 +138,8 @@ class BatchDecorator(StepDecorator): supports_conda_environment = True target_platform = "linux-64" - def resolve_configs(self): - super(BatchDecorator, self).resolve_configs() + def init(self): + super(BatchDecorator, self).init() # If no docker image is explicitly specified, impute a default image. if not self.attributes["image"]: diff --git a/metaflow/plugins/kubernetes/kubernetes_decorator.py b/metaflow/plugins/kubernetes/kubernetes_decorator.py index 20b56175149..d6efa18b753 100644 --- a/metaflow/plugins/kubernetes/kubernetes_decorator.py +++ b/metaflow/plugins/kubernetes/kubernetes_decorator.py @@ -145,8 +145,8 @@ class KubernetesDecorator(StepDecorator): supports_conda_environment = True target_platform = "linux-64" - def resolve_configs(self): - super(KubernetesDecorator, self).resolve_configs() + def init(self): + super(KubernetesDecorator, self).init() if not self.attributes["namespace"]: self.attributes["namespace"] = KUBERNETES_NAMESPACE diff --git a/metaflow/plugins/pypi/conda_decorator.py b/metaflow/plugins/pypi/conda_decorator.py index b6ac1b91d88..5c7859f24a2 100644 --- a/metaflow/plugins/pypi/conda_decorator.py +++ b/metaflow/plugins/pypi/conda_decorator.py @@ -49,8 +49,8 @@ class CondaStepDecorator(StepDecorator): # CONDA_CHANNELS in their environment. 
For pinning specific packages to specific # conda channels, users can specify channel::package as the package name. - def resolve_configs(self): - super(CondaStepDecorator, self).resolve_configs() + def init(self): + super(CondaStepDecorator, self).init() self._user_defined_attributes = ( attributes.copy() if attributes is not None else {} @@ -332,16 +332,12 @@ class CondaFlowDecorator(FlowDecorator): "disabled": None, } -<<<<<<< HEAD - def __init__(self, attributes=None, statically_defined=False): + def init(self): + super(CondaFlowDecorator, self).init() + self._user_defined_attributes = ( - attributes.copy() if attributes is not None else {} + self.attributes.copy() if self.attributes is not None else {} ) - super(CondaFlowDecorator, self).__init__(attributes, statically_defined) -======= - def resolve_configs(self): - super(CondaFlowDecorator, self).resolve_configs() ->>>>>>> 99d06ae8 (More WIP) # Support legacy 'libraries=' attribute for the decorator. self.attributes["packages"] = { diff --git a/metaflow/plugins/timeout_decorator.py b/metaflow/plugins/timeout_decorator.py index 648e318d36a..de50281b3ac 100644 --- a/metaflow/plugins/timeout_decorator.py +++ b/metaflow/plugins/timeout_decorator.py @@ -37,8 +37,8 @@ class TimeoutDecorator(StepDecorator): name = "timeout" defaults = {"seconds": 0, "minutes": 0, "hours": 0} - def resolve_configs(self): - super().resolve_configs() + def init(self): + super().init() # Initialize secs in __init__ so other decorators could safely use this # value without worrying about decorator order. 
# Convert values in attributes to type:int since they can be type:str diff --git a/metaflow/user_configs.py b/metaflow/user_configs.py index bd3f7a176d5..18f26ee6eec 100644 --- a/metaflow/user_configs.py +++ b/metaflow/user_configs.py @@ -1,5 +1,7 @@ +import collections.abc import json import os +import re from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING @@ -8,12 +10,13 @@ from .exception import MetaflowException, MetaflowInternalError from .parameters import ( - DelayedEvaluationParameter, + DeployTimeField, Parameter, ParameterContext, current_flow, ) -import functools + +from .util import get_username if TYPE_CHECKING: from metaflow import FlowSpec @@ -54,7 +57,7 @@ def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[str, An return None -class ConfigValue: +class ConfigValue(collections.abc.Mapping): # Thin wrapper to allow configuration values to be accessed using a "." notation # as well as a [] notation. @@ -72,6 +75,12 @@ def __getitem__(self, key): value = ConfigValue(value) return value + def __len__(self): + return len(self._data) + + def __iter__(self): + return iter(self._data) + def __repr__(self): return repr(self._data) @@ -80,9 +89,10 @@ def __str__(self): class PathOrStr(click.ParamType): - name = "ConfigInput" + name = "PathOrStr" - def convert(self, value, param, ctx): + @staticmethod + def convert_value(value): if value is None: return None @@ -94,7 +104,7 @@ def convert(self, value, param, ctx): if os.path.isfile(value): try: - with open(value, "r") as f: + with open(value, "r", encoding="utf-8") as f: content = f.read() except OSError as e: raise click.UsageError( @@ -103,6 +113,9 @@ def convert(self, value, param, ctx): return "converted:" + content return "converted:" + value + def convert(self, value, param, ctx): + return self.convert_value(value) + class ConfigInput: # Contains the values loaded from the INFO file. 
We make this a class method @@ -118,9 +131,11 @@ class ConfigInput: def __init__( self, req_configs: List[str], + defaults: Dict[str, Union[str, Dict[str, Any]]], parsers: Dict[str, Callable[[str], Dict[str, Any]]], ): - self._req_configs = req_configs + self._req_configs = set(req_configs) + self._defaults = defaults self._parsers = parsers @staticmethod @@ -144,6 +159,8 @@ def get_config(cls, config_name: str) -> Optional[Dict[str, Any]]: return cls.loaded_configs.get(config_name, None) def process_configs(self, ctx, param, value): + from .cli import echo_always, echo_dev_null # Prevent circular import + flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: # This is an error @@ -151,19 +168,51 @@ def process_configs(self, ctx, param, value): "Config values should be processed for a FlowSpec" ) - # First validate if we have all the required parameters - # Here value is a list of tuples. Each tuple has the name of the configuration - # and the string representation of the config (it was already read - # from a file if applicable). - missing = set(self._req_configs) - set([v[0] for v in value]) - if missing: - raise click.UsageError( - "Missing required configuration values: %s" % ", ".join(missing) - ) - + # value is a list of tuples (name, value). + # Click will provide: + # - all the defaults if nothing is provided on the command line + # - provide *just* the passed in value if anything is provided on the command + # line. + # + # We therefore "merge" the defaults with what we are provided by click to form + # a full set of values + # We therefore get a full set of values where: + # - the name will correspond to the configuration name + # - the value will be the default (including None if there is no default) or + # the string representation of the value (this will always include + # the "converted:" prefix as it will have gone through the PathOrStr + # conversion function). 
A value of None basically means that the config has + # no default and was not specified on the command line. to_return = {} + + merged_configs = dict(self._defaults) for name, val in value: + # Don't replace by None -- this is needed to avoid replacing a function + # default + if val: + merged_configs[name] = val + + print("PARAMS: %s" % str(ctx.params)) + missing_configs = set() + for name, val in merged_configs.items(): name = name.lower() + # convert is idempotent so if it is already converted, it will just return + # the value. This is used to make sure we process the defaults + if isinstance(val, DeployTimeField): + # We will form our own context and pass it down + param_ctx = ParameterContext( + flow_name=ctx.obj.flow.name, + user_name=get_username(), + parameter_name=name, + logger=echo_dev_null if ctx.params["quiet"] else echo_always, + ds_type=ctx.params["datastore"], + configs=None, + ) + val = val.fun(param_ctx) + val = PathOrStr.convert_value(val) + if val is None: + missing_configs.add(name) + continue val = val[10:] # Remove the "converted:" prefix if val.startswith("kv."): # This means to load it from a file @@ -187,6 +236,11 @@ def process_configs(self, ctx, param, value): # TODO: Support YAML flow_cls._user_configs[name] = read_value to_return[name] = ConfigValue(read_value) + + if missing_configs.intersection(self._req_configs): + raise click.UsageError( + "Missing configuration values for %s" % ", ".join(missing_configs) + ) return to_return def __str__(self): @@ -222,13 +276,17 @@ class DelayEvaluator: This is used when we want to use config.* values in decorators for example. 
""" - def __init__(self, config_expr: str, is_var_only=True): - self._config_expr = config_expr - if is_var_only: + id_pattern = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") + + def __init__(self, ex: str): + self._config_expr = ex + if self.id_pattern.match(self._config_expr): + # This is a variable only so allow things like config_expr("config").var + self._is_var_only = True self._access = [] else: + self._is_var_only = False self._access = None - self._is_var_only = is_var_only def __getattr__(self, name): if self._access is None: @@ -236,7 +294,9 @@ def __getattr__(self, name): self._access.append(name) return self - def __call__(self): + def __call__(self, ctx=None, deploy_time=False): + # Two additional arguments are only used by DeployTimeField which will call + # this function with those two additional arguments. They are ignored. flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: # We are not executing inside a flow (ie: not the CLI) @@ -256,6 +316,32 @@ def __call__(self): def config_expr(expr: str) -> DelayEvaluator: + """ + Function to allow you to use an expression involving a config parameter in + places where it may not be directory accessible or if you want a more complicated + expression than just a single variable. + + You can use it as follows: + - When the config is not directly accessible: + + @project(name=config_expr("config").project.name) + class MyFlow(FlowSpec): + config = Config("config") + ... + - When you want a more complex expression: + class MyFlow(FlowSpec): + config = Config("config") + + @environment(vars={"foo": config_expr("config.bar.baz.lower()")}) + @step + def start(self): + ... + + Parameters + ---------- + expr : str + Expression using the config values. 
+ """ return DelayEvaluator(expr) @@ -302,34 +388,6 @@ def _wrapper(flow_spec: "FlowSpec"): return _wrapper -class FlowConfig(DelayEvaluator): - def __init__(self, config_name: str): - """ - Small wrapper to allow you to refer to a flow's configuration in a flow-level - decorator. - - As an example: - - @project(name=FlowConfig("config").project.name) - class MyFlow(FlowSpec): - config = Config("config") - ... - - This will allow you to specify a `project.name` value in your configuration - and have it used in the flow-level decorator. - - Without this construct, it would be difficult to access `config` inside the - arguments of the decorator. - - Parameters - ---------- - config_name : str - Name of the configuration being used. This should be the name given to - the `Config` constructor. - """ - super().__init__(config_name, is_var_only=True) - - class Config(Parameter): """ Includes a configuration for this flow. @@ -374,9 +432,12 @@ def __init__( parser: Optional[Callable[[str], Dict[str, Any]]] = None, **kwargs: Dict[str, str] ): + + print("Config %s, default is %s" % (name, default)) super(Config, self).__init__( name, default=default, required=required, help=help, type=str, **kwargs ) + if isinstance(kwargs.get("default", None), str): kwargs["default"] = json.dumps(kwargs["default"]) self.parser = parser @@ -385,14 +446,14 @@ def load_parameter(self, v): return v def __getattr__(self, name): - ev = DelayEvaluator(self.name, is_var_only=True) + ev = DelayEvaluator(self.name) return ev.__getattr__(name) def config_options(cmd): help_strs = [] required_names = [] - defaults = [] + defaults = {} config_seen = set() parsers = {} flow_cls = getattr(current_flow, "flow_cls", None) @@ -400,8 +461,10 @@ def config_options(cmd): return cmd parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] + config_opt_required = False # List all the configuration options for arg in parameters[::-1]: + save_default = arg.kwargs.get("default", None) kwargs = 
arg.option_kwargs(False) if arg.name.lower() in config_seen: msg = ( @@ -413,14 +476,17 @@ def config_options(cmd): config_seen.add(arg.name.lower()) if kwargs["required"]: required_names.append(arg.name) - if kwargs.get("default") is not None: - defaults.append((arg.name.lower(), kwargs["default"])) - else: - defaults.append(None) + if save_default is None: + # We need at least one option if we have a required configuration. + config_opt_required = True + defaults[arg.name.lower()] = save_default help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) parsers[arg.name.lower()] = arg.parser - print("DEFAULTS %s" % defaults) + print( + "DEFAULTS %s" + % str(dict((k, v if not callable(v) else "FUNC") for k, v in defaults.items())) + ) if not config_seen: # No configurations -- don't add anything return cmd @@ -437,11 +503,12 @@ def config_options(cmd): nargs=2, multiple=True, type=click.Tuple([click.Choice(config_seen), PathOrStr()]), - callback=ConfigInput(required_names, parsers).process_configs, + callback=ConfigInput(required_names, defaults, parsers).process_configs, help=help_str, envvar="METAFLOW_FLOW_CONFIG", show_default=False, - default=defaults, + default=[(k, v if not callable(v) else None) for k, v in defaults.items()], + required=config_opt_required, ), ) return cmd diff --git a/test/core/metaflow_test/__init__.py b/test/core/metaflow_test/__init__.py index 19263df26ff..f2e574fe627 100644 --- a/test/core/metaflow_test/__init__.py +++ b/test/core/metaflow_test/__init__.py @@ -155,6 +155,7 @@ class MetaflowTest(object): PRIORITY = 999999999 PARAMETERS = {} INCLUDE_FILES = {} + CONFIGS = {} CLASS_VARS = {} HEADER = "" diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index 4461645c217..0da52b49501 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -85,7 +85,7 @@ def _flow_lines(self): tags.extend(tag.split("(")[0] for tag in step.tags) yield 0, "# -*- 
coding: utf-8 -*-" - yield 0, "from metaflow import FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" + yield 0, "from metaflow import Config, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" yield 0, "from metaflow_test import assert_equals, assert_equals_metadata, assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, TestRetry, try_to_get_card" if tags: yield 0, "from metaflow import %s" % ",".join(tags) @@ -104,6 +104,10 @@ def _flow_lines(self): kwargs = ["%s=%s" % (k, v) for k, v in include.items()] yield 1, '%s = IncludeFile("%s", %s)' % (var, var, ",".join(kwargs)) + for var, include in self.test.CONFIGS.items(): + kwargs = ["%s=%s" % (k, v) for k, v in include.items()] + yield 1, '%s = Config("%s", %s)' % (var, var, ",".join(kwargs)) + for name, node in self.graphspec["graph"].items(): step = self._choose_step(name, node) self.used.add(step) diff --git a/test_config/config2.json b/test_config/config2.json new file mode 100644 index 00000000000..12ec1d8f996 --- /dev/null +++ b/test_config/config2.json @@ -0,0 +1,4 @@ +{ + "default_param": 456, + "default_param2": 789 +} diff --git a/test_config/helloconfig.py b/test_config/helloconfig.py new file mode 100644 index 00000000000..be8246cc6b2 --- /dev/null +++ b/test_config/helloconfig.py @@ -0,0 +1,139 @@ +import os + +from metaflow import ( + Config, + FlowSpec, + Parameter, + environment, + step, + project, + config_expr, + eval_config, + titus, +) + + +def silly_parser(s): + k, v = s.split(":") + return {k: v} + + +def param_func(ctx): + return ctx.configs.config2.default_param2 + 1 + + +def config_func(ctx): + return {"val": 123} + + +default_config = { + "run_on_titus": ["hello"], + "cpu_count": 2, + "env_to_start": "Romain", + "magic_value": 42, + "project_name": "hirec", +} + +silly_config = "baz:awesome" + + +def titus_or_not(flow): + to_replace = [] + for name, s in flow.steps: + if name in flow.config.run_on_titus: + 
to_replace.append((name, titus(cpu=flow.config.cpu_count)(s))) + for name, val in to_replace: + setattr(flow, name, val) + return flow + + +def add_env_to_start(flow): + # Add a decorator directly to a step + flow.start = environment(vars={"hello": config_expr("config").env_to_start})( + flow.start + ) + return flow + + +@eval_config(titus_or_not) +@add_env_to_start +@project(name=config_expr("config").project_name) +class HelloConfig(FlowSpec): + """ + A flow where Metaflow prints 'Hi'. + + Run this flow to validate that Metaflow is installed correctly. + + """ + + default_from_config = Parameter( + "default_from_config", default=config_expr("config2").default_param, type=int + ) + + default_from_func = Parameter("default_from_func", default=param_func, type=int) + + config = Config("config", default=default_config, help="Help for config") + sconfig = Config( + "sconfig", + default="sillyconfig.txt", + parser=silly_parser, + help="Help for sconfig", + required=True, + ) + config2 = Config("config2", required=True) + + config3 = Config("config3", default=config_func) + + @step + def start(self): + """ + This is the 'start' step. All flows must have a step named 'start' that + is the first step in the flow. + + """ + print("HelloConfig is %s (should be awesome)" % self.sconfig.baz) + print( + "Environment variable hello %s (should be Romain)" % os.environ.get("hello") + ) + + print( + "Parameters are: default_from_config: %s, default_from_func: %s" + % (self.default_from_config, self.default_from_func) + ) + + print("Config3 has value: %s" % self.config3.val) + self.next(self.hello) + + @environment( + vars={ + "normal": config.env_to_start, + "stringify": config_expr("str(config.magic_value)"), + } + ) + @step + def hello(self): + """ + A step for metaflow to introduce itself. 
+ + """ + print( + "In this step, we got a normal variable %s, one that is stringified %s" + % ( + os.environ.get("normal"), + os.environ.get("stringify"), + ) + ) + self.next(self.end) + + @step + def end(self): + """ + This is the 'end' step. All flows must have an 'end' step, which is the + last step in the flow. + + """ + print("HelloFlow is all done") + + +if __name__ == "__main__": + HelloConfig() From daf067ba9d97e5b07aa6e959e5a4838794268519 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 10 Sep 2024 00:58:34 -0700 Subject: [PATCH 06/24] Addressed comments. Added more documentation/explanation Specifically: - moved things out of the INFO file - added to_dict - renamed user_configs to config_parameters --- metaflow/__init__.py | 2 +- metaflow/cli.py | 6 +- .../{user_configs.py => config_parameters.py} | 135 +++++++++++++----- metaflow/decorators.py | 2 +- metaflow/flowspec.py | 2 +- metaflow/includefile.py | 4 +- metaflow/metaflow_environment.py | 11 +- metaflow/package.py | 20 ++- metaflow/parameters.py | 4 +- metaflow/plugins/pypi/conda_decorator.py | 4 +- metaflow/runner/click_api.py | 2 +- metaflow/runtime.py | 10 +- metaflow/util.py | 2 +- 13 files changed, 139 insertions(+), 65 deletions(-) rename metaflow/{user_configs.py => config_parameters.py} (76%) diff --git a/metaflow/__init__.py b/metaflow/__init__.py index 4f58772c78d..fbc08342d9c 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -103,7 +103,7 @@ class and related decorators. 
from .parameters import Parameter, JSONTypeClass, JSONType -from .user_configs import Config, config_expr, eval_config +from .config_parameters import Config, config_expr, eval_config # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as diff --git a/metaflow/cli.py b/metaflow/cli.py index 13c81082bb6..da418ad20cd 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -35,7 +35,7 @@ from .pylint_wrapper import PyLint from .R import metaflow_r_version, use_r from .util import resolve_identity -from .user_configs import LocalFileInput, config_options +from .config_parameters import LocalFileInput, config_options ERASE_TO_EOL = "\033[K" HIGHLIGHT = "red" @@ -304,11 +304,11 @@ def version(obj): help="Monitoring backend type", ) @click.option( - "--local-info-file", + "--local-config-file", type=LocalFileInput(exists=True, readable=True, dir_okay=False, resolve_path=True), required=False, default=None, - help="A filename containing a subset of the INFO file. Internal use only.", + help="A filename containing the dumped configuration values. 
Internal use only.", hidden=True, is_eager=True, ) diff --git a/metaflow/user_configs.py b/metaflow/config_parameters.py similarity index 76% rename from metaflow/user_configs.py rename to metaflow/config_parameters.py index 18f26ee6eec..3a76500e6dd 100644 --- a/metaflow/user_configs.py +++ b/metaflow/config_parameters.py @@ -5,7 +5,6 @@ from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING -from metaflow import INFO_FILE from metaflow._vendor import click from .exception import MetaflowException, MetaflowInternalError @@ -40,16 +39,20 @@ # return tracefunc_closure +CONFIG_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "CONFIG_PARAMETERS" +) + def dump_config_values(flow: "FlowSpec"): if flow._user_configs: - return "user_configs", flow._user_configs - return None, None + return {"user_configs": flow._user_configs} + return {} -def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[str, Any]]: +def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[Any, Any]]: if info_file is None: - info_file = INFO_FILE + info_file = os.path.basename(CONFIG_FILE) try: with open(info_file, encoding="utf-8") as contents: return json.load(contents).get("user_configs", {}) @@ -58,10 +61,19 @@ def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[str, An class ConfigValue(collections.abc.Mapping): + """ + ConfigValue is a thin wrapper around an arbitrarily nested dictionary-like + configuration object. It allows you to access elements of this nested structure + using either a "." notation or a [] notation. As an example, if your configuration + object is: + {"foo": {"bar": 42}} + you can access the value 42 using either config["foo"]["bar"] or config.foo.bar. + """ + # Thin wrapper to allow configuration values to be accessed using a "." notation # as well as a [] notation. 
- def __init__(self, data: Dict[str, Any]): + def __init__(self, data: Dict[Any, Any]): self._data = data for key, value in data.items(): @@ -69,7 +81,20 @@ def __init__(self, data: Dict[str, Any]): value = ConfigValue(value) setattr(self, key, value) - def __getitem__(self, key): + def __getitem__(self, key: Any) -> Any: + """ + Access an element of this configuration + + Parameters + ---------- + key : Any + Element to access + + Returns + ------- + Any + Element of the configuration + """ value = self._data[key] if isinstance(value, dict): value = ConfigValue(value) @@ -87,12 +112,31 @@ def __repr__(self): def __str__(self): return json.dumps(self._data) + def to_dict(self) -> Dict[Any, Any]: + """ + Returns a dictionary representation of this configuration object. + + Returns + ------- + Dict[Any, Any] + Dictionary equivalent of this configuration object. + """ + return dict(self._data) + class PathOrStr(click.ParamType): + # Click parameter type for a configuration value -- it can either be the string + # representation of the configuration value (like a JSON string or any other + # string that the configuration parser can parse) or the path to a file containing + # such a content. The value will be initially assumed to be that of a file and will + # only be considered not a file if no file exists. name = "PathOrStr" @staticmethod def convert_value(value): + # Click requires this to be idempotent. We therefore check if the value + # starts with "converted:" which is our marker for "we already processed this + # value". if value is None: return None @@ -118,21 +162,22 @@ def convert(self, value, param, ctx): class ConfigInput: - # Contains the values loaded from the INFO file. We make this a class method - # so that if there are multiple configs, we just need to read the file once. 
- # It is OK to be globally unique because this is only evoked in scenario A.2 (see - # convert method) which means we are already just executing a single task and so - # there is no concern about it "leaking" to things running with Runner for example - # (ie: even if Runner is evoked in that task, we won't "share" this global value's - # usage). - loaded_configs = None # type: Optional[Dict[str, Dict[str, Any]]] - info_file = None # type: Optional[str] + # ConfigInput is an internal class responsible for processing all the --config + # options. It gathers information from the --local-config-file (to figure out + # where options are stored) and is also responsible for processing any `--config` + # options and processing the default value of `Config(...)` objects. + + # It will then store this information in the flow spec for use later in processing. + # It is stored in the flow spec to avoid being global to support the Runner. + + loaded_configs = None # type: Optional[Dict[str, Dict[Any, Any]]] + config_file = None # type: Optional[str] def __init__( self, req_configs: List[str], - defaults: Dict[str, Union[str, Dict[str, Any]]], - parsers: Dict[str, Callable[[str], Dict[str, Any]]], + defaults: Dict[str, Union[str, Dict[Any, Any]]], + parsers: Dict[str, Callable[[str], Dict[Any, Any]]], ): self._req_configs = set(req_configs) self._defaults = defaults @@ -140,20 +185,24 @@ def __init__( @staticmethod def make_key_name(name: str) -> str: + # Special mark to indicate that the configuration value is not content or a file + # name but a value that should be read in the config file (effectively where + # the value has already been materialized). return "kv." 
+ name.lower() @classmethod - def set_info_file(cls, info_file: str): - cls.info_file = info_file + def set_config_file(cls, config_file: str): + cls.config_file = config_file @classmethod - def get_config(cls, config_name: str) -> Optional[Dict[str, Any]]: + def get_config(cls, config_name: str) -> Optional[Dict[Any, Any]]: if cls.loaded_configs is None: - all_configs = load_config_values(cls.info_file) + all_configs = load_config_values(cls.config_file) if all_configs is None: raise MetaflowException( "Could not load expected configuration values " - "from the INFO file. This is a Metaflow bug. Please contact support." + "from the CONFIG_PARAMETERS file. This is a Metaflow bug. " + "Please contact support." ) cls.loaded_configs = all_configs return cls.loaded_configs.get(config_name, None) @@ -168,7 +217,8 @@ def process_configs(self, ctx, param, value): "Config values should be processed for a FlowSpec" ) - # value is a list of tuples (name, value). + # This function is called by click when processing all the --config options. + # The value passed in is a list of tuples (name, value). # Click will provide: # - all the defaults if nothing is provided on the command line # - provide *just* the passed in value if anything is provided on the command @@ -197,9 +247,15 @@ def process_configs(self, ctx, param, value): for name, val in merged_configs.items(): name = name.lower() # convert is idempotent so if it is already converted, it will just return - # the value. This is used to make sure we process the defaults + # the value. This is used to make sure we process the defaults which do + # NOT make it through the PathOrStr convert function if isinstance(val, DeployTimeField): - # We will form our own context and pass it down + # This supports a default value that is a deploy-time field (similar + # to Parameter).) 
+ # We will form our own context and pass it down -- note that you cannot + # use configs in the default value of configs as this introduces a bit + # of circularity. Note also that quiet and datastore are *eager* + # options so are available here. param_ctx = ParameterContext( flow_name=ctx.obj.flow.name, user_name=get_username(), @@ -251,14 +307,19 @@ def __repr__(self): class LocalFileInput(click.Path): + # Small wrapper around click.Path to set the value from which to read configuration + # values. This is set immediately upon processing the --local-config-file + # option and will therefore then be available when processing any of the other + # --config options (which will call ConfigInput.process_configs name = "LocalFileInput" def convert(self, value, param, ctx): super().convert(value, param, ctx) - ConfigInput.set_info_file(value) + ConfigInput.set_config_file(value) # This purposefully returns None which means it is *not* passed down # when commands use ctx.parent.parent.params to get all the configuration - # values. + # values (it becomes hidden because its only purpose is to update the + # config file in ConfigInput) def __str__(self): return repr(self) @@ -267,7 +328,7 @@ def __repr__(self): return "LocalFileInput" -ConfigArgType = Union[str, Dict[str, Any]] +ConfigArgType = Union[str, Dict[Any, Any]] class DelayEvaluator: @@ -402,15 +463,17 @@ class Config(Parameter): ---------- name : str User-visible configuration name. - default : Union[str, Dict[str, Any], Callable[[ParameterContext], Union[str, Dict[str, Any]]]], optional, default None + default : Union[str, Dict[Any, Any], Callable[[ParameterContext], Union[str, Dict[Any, Any]]]], optional, default None Default value for the parameter. A function implies that the value will be computed using that function. help : str, optional, default None Help text to show in `run --help`. required : bool, default False - Require that the user specified a value for the parameter. 
- `required=True` implies that the `default` is not used. - parser : Callable[[str], Dict[str, Any]], optional, default None + Require that the user specified a value for the parameter. Note that if + a default is provided, the required flag is ignored. + parser : Callable[[str], Dict[Any, Any]], optional, default None + An optional function that can parse the configuration string into an arbitrarily + nested dictionary. show_default : bool, default True If True, show the default value in the help text. """ @@ -423,13 +486,13 @@ def __init__( default: Optional[ Union[ str, - Dict[str, Any], - Callable[[ParameterContext], Union[str, Dict[str, Any]]], + Dict[Any, Any], + Callable[[ParameterContext], Union[str, Dict[Any, Any]]], ] ] = None, help: Optional[str] = None, required: bool = False, - parser: Optional[Callable[[str], Dict[str, Any]]] = None, + parser: Optional[Callable[[str], Dict[Any, Any]]] = None, **kwargs: Dict[str, str] ): diff --git a/metaflow/decorators.py b/metaflow/decorators.py index b9bf32a21a7..b852ade200f 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -12,7 +12,7 @@ ) from .parameters import current_flow -from .user_configs import DelayEvaluator +from .config_parameters import DelayEvaluator from metaflow._vendor import click diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 8cd25710aa6..d82478301b1 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -20,7 +20,7 @@ from .graph import FlowGraph from .unbounded_foreach import UnboundedForeachInput -from .user_configs import ConfigInput, ConfigValue +from .config_parameters import ConfigInput, ConfigValue from .util import to_pod from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS diff --git a/metaflow/includefile.py b/metaflow/includefile.py index b93f8c49224..499b4cd6a90 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -20,7 +20,7 @@ ) from .plugins import DATACLIENTS -from .user_configs import 
ConfigValue +from .config_parameters import ConfigValue from .util import get_username import functools @@ -137,7 +137,7 @@ def convert(self, value, param, ctx): parameter_name=param.name, logger=ctx.obj.echo, ds_type=ctx.obj.datastore_impl.TYPE, - configs=ConfigValue(dict(ctx.obj.flow.__class__.configs)), + configs=ConfigValue(dict(ctx.obj.flow.configs)), ) if len(value) > 0 and (value.startswith("{") or value.startswith('"{')): diff --git a/metaflow/metaflow_environment.py b/metaflow/metaflow_environment.py index cea6e18697b..0ac1ca3266c 100644 --- a/metaflow/metaflow_environment.py +++ b/metaflow/metaflow_environment.py @@ -6,7 +6,6 @@ from . import metaflow_version from metaflow.exception import MetaflowException from metaflow.extension_support import dump_module_info -from metaflow.user_configs import dump_config_values from metaflow.mflog import BASH_MFLOG from . import R @@ -19,7 +18,7 @@ class MetaflowEnvironment(object): TYPE = "local" def __init__(self, flow): - self._flow = flow + pass def init_environment(self, echo): """ @@ -178,7 +177,7 @@ def get_package_commands(self, code_package_url, datastore_type): ] return cmds - def get_environment_info(self, full_info=False): + def get_environment_info(self, include_ext_info=False): # note that this dict goes into the code package # so variables here should be relatively stable (no # timestamps) so the hash won't change all the time @@ -199,14 +198,10 @@ def get_environment_info(self, full_info=False): env["metaflow_r_version"] = R.metaflow_r_version() env["r_version"] = R.r_version() env["r_version_code"] = R.r_version_code() - if full_info: + if include_ext_info: # Information about extension modules (to load them in the proper order) ext_key, ext_val = dump_module_info() env[ext_key] = ext_val - # Information about configurations (to be able to reload them) - user_configs = dump_config_values(self._flow) - if user_configs: - env[user_configs[0]] = user_configs[1] return env def executable(self, step_name, 
default=None): diff --git a/metaflow/package.py b/metaflow/package.py index 55968a03d65..9666d929d08 100644 --- a/metaflow/package.py +++ b/metaflow/package.py @@ -6,6 +6,7 @@ import json from io import BytesIO +from .config_parameters import CONFIG_FILE, dump_config_values from .extension_support import EXT_PKG, package_mfext_all from .metaflow_config import DEFAULT_PACKAGE_SUFFIXES from .exception import MetaflowException @@ -151,11 +152,23 @@ def path_tuples(self): for path_tuple in self._walk(flowdir, suffixes=self.suffixes): yield path_tuple + def _add_configs(self, tar): + buf = BytesIO() + buf.write(json.dumps(dump_config_values(self._flow)).encode("utf-8")) + self._add_file(tar, os.path.basename(CONFIG_FILE), buf) + def _add_info(self, tar): - info = tarfile.TarInfo(os.path.basename(INFO_FILE)) - env = self.environment.get_environment_info(full_info=True) buf = BytesIO() - buf.write(json.dumps(env).encode("utf-8")) + buf.write( + json.dumps( + self.environment.get_environment_info(include_ext_info=True) + ).encode("utf-8") + ) + self._add_file(tar, os.path.basename(INFO_FILE), buf) + + @staticmethod + def _add_file(tar, filename, buf): + info = tarfile.TarInfo(filename) buf.seek(0) info.size = len(buf.getvalue()) # Setting this default to Dec 3, 2019 @@ -175,6 +188,7 @@ def no_mtime(tarinfo): fileobj=buf, mode="w:gz", compresslevel=3, dereference=True ) as tar: self._add_info(tar) + self._add_configs(tar) for path, arcname in self.path_tuples(): tar.add(path, arcname=arcname, recursive=False, filter=no_mtime) diff --git a/metaflow/parameters.py b/metaflow/parameters.py index 090e339491c..da9669243bf 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -15,7 +15,7 @@ ) if TYPE_CHECKING: - from .user_configs import ConfigValue + from .config_parameters import ConfigValue try: # Python2 @@ -226,7 +226,7 @@ def deploy_time_eval(value): # this is called by cli.main def set_parameter_context(flow_name, echo, datastore, configs): - from 
.user_configs import ConfigValue # Prevent circular dependency + from .config_parameters import ConfigValue # Prevent circular dependency global context_proto context_proto = ParameterContext( diff --git a/metaflow/plugins/pypi/conda_decorator.py b/metaflow/plugins/pypi/conda_decorator.py index 5c7859f24a2..49b42f7e55e 100644 --- a/metaflow/plugins/pypi/conda_decorator.py +++ b/metaflow/plugins/pypi/conda_decorator.py @@ -174,7 +174,9 @@ def runtime_init(self, flow, graph, package, run_id): encoding="utf-8", ) as f: f.write( - json.dumps(self.environment.get_environment_info(full_info=True)) + json.dumps( + self.environment.get_environment_info(include_ext_info=True) + ) ) # Support metaflow extensions. diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 4b69d7b61e3..0001ed39a95 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -39,7 +39,7 @@ from metaflow.exception import MetaflowException from metaflow.includefile import FilePathClass from metaflow.parameters import JSONTypeClass, flow_context -from metaflow.user_configs import LocalFileInput +from metaflow.config_parameters import LocalFileInput # Define a recursive type alias for JSON JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None] diff --git a/metaflow/runtime.py b/metaflow/runtime.py index e4cb07caddb..724029e37f6 100644 --- a/metaflow/runtime.py +++ b/metaflow/runtime.py @@ -42,7 +42,7 @@ UBF_TASK, ) -from .user_configs import ConfigInput, dump_config_values +from .config_parameters import ConfigInput, dump_config_values import metaflow.tracing as tracing @@ -426,9 +426,9 @@ def execute(self): # Configurations are passed through a file to avoid overloading the # command-line. 
We only need to create this file once and it can be reused # for any task launch - config_key, config_value = dump_config_values(self._flow) + config_value = dump_config_values(self._flow) if config_value: - json.dump({config_key: config_value}, config_file) + json.dump(config_value, config_file) config_file.flush() self._config_file_name = config_file.name else: @@ -1466,7 +1466,7 @@ def __init__(self, task): self.top_level_options.update(deco.get_top_level_options()) # We also pass configuration options using the kv. syntax which will cause - # the configuration options to be loaded from the INFO file (or local-info-file + # the configuration options to be loaded from the CONFIG file (or local-config-file # in the case of the local runtime) if self.task.flow._user_configs: self.top_level_options["config"] = [ @@ -1583,7 +1583,7 @@ def _launch(self): # Add user configurations using a file to avoid using up too much space on the # command line if self._config_file_name: - args.top_level_options["local-info-file"] = self._config_file_name + args.top_level_options["local-config-file"] = self._config_file_name # Pass configuration options env.update(args.get_env()) env["PYTHONUNBUFFERED"] = "x" diff --git a/metaflow/util.py b/metaflow/util.py index 69f3be5db25..e1b7d48f214 100644 --- a/metaflow/util.py +++ b/metaflow/util.py @@ -297,7 +297,7 @@ def get_metaflow_root(): def dict_to_cli_options(params): # Prevent circular imports - from metaflow.user_configs import ConfigInput, ConfigValue + from metaflow.config_parameters import ConfigInput, ConfigValue for k, v in params.items(): # Omit boolean options set to false or None, but preserve options with an empty From 1d77249ba36be190ecf23f728bf9f9c52a54b8c5 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 10 Sep 2024 12:36:14 -0700 Subject: [PATCH 07/24] Added test, more cleanup Specifically: - made config values immutable - cleaned up state stored in FlowSpec - added a test exercising configs in various places --- 
metaflow/config_parameters.py | 77 ++++++++++++--- metaflow/flowspec.py | 50 +++++----- metaflow/runtime.py | 6 +- test/core/metaflow_test/formatter.py | 2 +- test/core/tests/basic_config_parameters.py | 106 +++++++++++++++++++++ 5 files changed, 203 insertions(+), 38 deletions(-) create mode 100644 test/core/tests/basic_config_parameters.py diff --git a/metaflow/config_parameters.py b/metaflow/config_parameters.py index 3a76500e6dd..eb20fd6f1f2 100644 --- a/metaflow/config_parameters.py +++ b/metaflow/config_parameters.py @@ -8,6 +8,7 @@ from metaflow._vendor import click from .exception import MetaflowException, MetaflowInternalError + from .parameters import ( DeployTimeField, Parameter, @@ -45,8 +46,11 @@ def dump_config_values(flow: "FlowSpec"): - if flow._user_configs: - return {"user_configs": flow._user_configs} + from .flowspec import _FlowState # Prevent circular import + + configs = flow._flow_state.get(_FlowState.CONFIGS) + if configs: + return {"user_configs": configs} return {} @@ -76,10 +80,34 @@ class ConfigValue(collections.abc.Mapping): def __init__(self, data: Dict[Any, Any]): self._data = data - for key, value in data.items(): - if isinstance(value, dict): - value = ConfigValue(value) - setattr(self, key, value) + def __getattr__(self, key: str) -> Any: + """ + Access an element of this configuration + + Parameters + ---------- + key : str + Element to access + + Returns + ------- + Any + Element of the configuration + """ + if key == "_data": + # Called during unpickling. Special case to not run into infinite loop + # below. 
+ raise AttributeError(key) + + if key in self._data: + return self[key] + raise AttributeError(key) + + def __setattr__(self, name: str, value: Any) -> None: + # Prevent configuration modification + if name == "_data": + return super().__setattr__(name, value) + raise TypeError("ConfigValue is immutable") def __getitem__(self, key: Any) -> Any: """ @@ -209,6 +237,7 @@ def get_config(cls, config_name: str) -> Optional[Dict[Any, Any]]: def process_configs(self, ctx, param, value): from .cli import echo_always, echo_dev_null # Prevent circular import + from .flowspec import _FlowState # Prevent circular import flow_cls = getattr(current_flow, "flow_cls", None) if flow_cls is None: @@ -216,7 +245,7 @@ def process_configs(self, ctx, param, value): raise MetaflowInternalError( "Config values should be processed for a FlowSpec" ) - + flow_cls._flow_state[_FlowState.CONFIGS] = {} # This function is called by click when processing all the --config options. # The value passed in is a list of tuples (name, value). # Click will provide: @@ -277,7 +306,7 @@ def process_configs(self, ctx, param, value): raise click.UsageError( "Could not find configuration '%s' in INFO file" % val ) - flow_cls._user_configs[name] = read_value + flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value to_return[name] = ConfigValue(read_value) else: if self._parsers[name]: @@ -290,7 +319,7 @@ def process_configs(self, ctx, param, value): "Configuration value for '%s' is not valid JSON" % name ) from e # TODO: Support YAML - flow_cls._user_configs[name] = read_value + flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value to_return[name] = ConfigValue(read_value) if missing_configs.intersection(self._req_configs): @@ -331,6 +360,23 @@ def __repr__(self): ConfigArgType = Union[str, Dict[Any, Any]] +class MultipleTuple(click.Tuple): + # Small wrapper around a click.Tuple to allow the environment variable for + # configurations to be a JSON string. 
Otherwise the default behavior is splitting + # by whitespace which is totally not what we want + # You can now pass multiple configuration options through an environment variable + # using something like: + # METAFLOW_FLOW_CONFIG='{"config1": "filenameforconfig1.json", "config2": {"key1": "value1"}}' + + def split_envvar_value(self, rv): + loaded = json.loads(rv) + return list( + item if isinstance(item, str) else json.dumps(item) + for pair in loaded.items() + for item in pair + ) + + class DelayEvaluator: """ Small wrapper that allows the evaluation of a Config() value in a delayed manner. @@ -356,6 +402,8 @@ def __getattr__(self, name): return self def __call__(self, ctx=None, deploy_time=False): + from .flowspec import _FlowState # Prevent circular import + # Two additional arguments are only used by DeployTimeField which will call # this function with those two additional arguments. They are ignored. flow_cls = getattr(current_flow, "flow_cls", None) @@ -372,7 +420,10 @@ def __call__(self, ctx=None, deploy_time=False): return eval( self._config_expr, globals(), - {k: ConfigValue(v) for k, v in flow_cls._user_configs.items()}, + { + k: ConfigValue(v) + for k, v in flow_cls._flow_state.get(_FlowState.CONFIGS, {}).items() + }, ) @@ -443,7 +494,9 @@ class MyFlow(FlowSpec): """ def _wrapper(flow_spec: "FlowSpec"): - flow_spec._config_funcs.append(f) + from .flowspec import _FlowState + + flow_spec._flow_state.setdefault(_FlowState.CONFIG_FUNCS, []).append(f) return flow_spec return _wrapper @@ -565,7 +618,7 @@ def config_options(cmd): ["--config", "config_options"], nargs=2, multiple=True, - type=click.Tuple([click.Choice(config_seen), PathOrStr()]), + type=MultipleTuple([click.Choice(config_seen), PathOrStr()]), callback=ConfigInput(required_names, defaults, parsers).process_configs, help=help_str, envvar="METAFLOW_FLOW_CONFIG", diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index d82478301b1..9fc65e0a276 100644 --- a/metaflow/flowspec.py +++ 
b/metaflow/flowspec.py @@ -4,6 +4,7 @@ import traceback import reprlib +from enum import Enum from itertools import islice from types import FunctionType, MethodType from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Tuple @@ -65,23 +66,26 @@ def __getitem__(self, item): return item or 0 # item is None for the control task, but it is also split 0 +class _FlowState(Enum): + CONFIGS = 1 + CONFIG_FUNCS = 2 + CACHED_PARAMETERS = 3 + + class FlowSpecMeta(type): def __new__(cls, name, bases, dct): f = super().__new__(cls, name, bases, dct) - # This makes sure to give _flow_decorators to each - # child class (and not share it with the FlowSpec base - # class). This is important to not make a "global" - # _flow_decorators. Same deal with user configurations - f._flow_decorators = {} - f._user_configs = {} + # We store some state in the flow class itself. This is primarily used to + # attach global state to a flow. It is *not* an actual global because of + # Runner/NBRunner. This is also created here in the meta class to avoid it being + # shared between different children classes. - # We also cache parameter names to avoid having to recompute what is a parameter - # in the dir of a flow - f._cached_parameters = None + # We should move _flow_decorators into this structure as well but keeping it + # out to limit the changes for now. 
+ f._flow_decorators = {} - # Finally attach all functions that need to be evaluated once user configurations - # are available - f._config_funcs = [] + # Keys are _FlowState enum values + f._flow_state = {} return f @@ -112,8 +116,8 @@ def start(self): Tuple[str, ConfigValue] Iterates over the configurations of the flow """ - # When configs are parsed, they are loaded in _user_configs - for name, value in cls._user_configs.items(): + # When configs are parsed, they are loaded in _flow_state[_FlowState.CONFIGS] + for name, value in cls._flow_state.get(_FlowState.CONFIGS, {}).items(): yield name, ConfigValue(value) @property @@ -152,9 +156,7 @@ class FlowSpec(metaclass=FlowSpecMeta): "_cached_input", "_graph", "_flow_decorators", - "_user_configs", - "_cached_parameters", - "_config_funcs", + "_flow_state", "_steps", "index", "input", @@ -223,7 +225,7 @@ def _process_config_funcs(self, config_options): current_cls = self.__class__ # Fast path for no user configurations - if not self._config_funcs: + if not self._flow_state.get(_FlowState.CONFIG_FUNCS): return self # We need to convert all the user configurations from DelayedEvaluationParameters @@ -245,7 +247,7 @@ def _process_config_funcs(self, config_options): # Run all the functions. They will now be able to access the configuration # values directly from the class - for func in self._config_funcs: + for func in self._flow_state[_FlowState.CONFIG_FUNCS]: current_cls = func(current_cls) # Reset all configs that were already present in the class. 
@@ -256,7 +258,8 @@ def _process_config_funcs(self, config_options): # We reset cached_parameters on the very off chance that the user added # more configurations based on the configuration - current_cls._cached_parameters = None + if _FlowState.CACHED_PARAMETERS in current_cls._flow_state: + del current_cls._flow_state[_FlowState.CACHED_PARAMETERS] # Set the current flow class we are in (the one we just created) parameters.replace_flow_context(current_cls) @@ -327,8 +330,9 @@ def _set_constants(self, graph, kwargs, config_options): @classmethod def _get_parameters(cls): - if cls._cached_parameters is not None: - for var in cls._cached_parameters: + cached = cls._flow_state.get(_FlowState.CACHED_PARAMETERS) + if cached is not None: + for var in cached: yield var, getattr(cls, var) return build_list = [] @@ -342,7 +346,7 @@ def _get_parameters(cls): if isinstance(val, Parameter): build_list.append(var) yield var, val - cls._cached_parameters = build_list + cls._flow_state[_FlowState.CACHED_PARAMETERS] = build_list def _set_datastore(self, datastore): self._datastore = datastore diff --git a/metaflow/runtime.py b/metaflow/runtime.py index 724029e37f6..fd1e8fb3a01 100644 --- a/metaflow/runtime.py +++ b/metaflow/runtime.py @@ -33,6 +33,7 @@ from .datastore import TaskDataStoreSet from .debug import debug from .decorators import flow_decorators +from .flowspec import _FlowState from .mflog import mflog, RUNTIME_LOG_SOURCE from .util import to_unicode, compress_list, unicode_type from .clone_util import clone_task_helper @@ -1468,9 +1469,10 @@ def __init__(self, task): # We also pass configuration options using the kv. 
syntax which will cause # the configuration options to be loaded from the CONFIG file (or local-config-file # in the case of the local runtime) - if self.task.flow._user_configs: + configs = self.task.flow._flow_state.get(_FlowState.CONFIGS) + if configs: self.top_level_options["config"] = [ - (k, ConfigInput.make_key_name(k)) for k in self.task.flow._user_configs + (k, ConfigInput.make_key_name(k)) for k in configs ] self.commands = ["step"] diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index 0da52b49501..096afe78ebb 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -85,7 +85,7 @@ def _flow_lines(self): tags.extend(tag.split("(")[0] for tag in step.tags) yield 0, "# -*- coding: utf-8 -*-" - yield 0, "from metaflow import Config, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" + yield 0, "from metaflow import Config, config_expr, eval_config, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" yield 0, "from metaflow_test import assert_equals, assert_equals_metadata, assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, TestRetry, try_to_get_card" if tags: yield 0, "from metaflow import %s" % ",".join(tags) diff --git a/test/core/tests/basic_config_parameters.py b/test/core/tests/basic_config_parameters.py new file mode 100644 index 00000000000..dc367bef524 --- /dev/null +++ b/test/core/tests/basic_config_parameters.py @@ -0,0 +1,106 @@ +from metaflow_test import MetaflowTest, ExpectationFailed, steps, tag + + +class BasicConfigTest(MetaflowTest): + PRIORITY = 1 + PARAMETERS = { + "default_from_config": { + "default": "config_expr('config2').default_param", + "type": "int", + }, + "default_from_func": {"default": "param_default", "type": "int"}, + } + CONFIGS = { + "config": {"default": "default_config"}, + "silly_config": {"required": True, "parser": "silly_parser"}, + "config2": {}, + "config3": {"default": 
"config_default"}, + } + HEADER = """ +import json +import os + +os.environ['METAFLOW_FLOW_CONFIG'] = json.dumps( + { + "config2": {"default_param": 123}, + "silly_config": "baz:amazing" + } +) + +def silly_parser(s): + k, v = s.split(":") + return {k: v} + +default_config = { + "value": 42, + "str_value": "foobar", + "project_name": "test_config", + "nested": {"value": 43}, +} + +def param_default(ctx): + return ctx.configs.config2.default_param + 1 + +def config_default(ctx): + return {"val": 456} + +# Test flow-level decorator configurations +@project(name=config_expr("config").project_name) +""" + + # Test step level decorators with configs + @tag( + "environment(vars={'normal': config.str_value, 'stringify': config_expr('str(config.value)')})" + ) + @steps(0, ["all"]) + def step_all(self): + # Test flow-level decorator configs + assert_equals(current.project_name, "test_config") + + # Test step-level decorator configs + assert_equals(os.environ["normal"], "foobar") + assert_equals(os.environ["stringify"], "42") + + # Test parameters reading configs + assert_equals(self.default_from_config, 123) + assert_equals(self.default_from_func, 124) + + # Test configs are accessible as artifacts + assert_equals(self.config.value, 42) + assert_equals(self.config["value"], 42) + assert_equals(self.config.nested.value, 43) + assert_equals(self.config["nested"]["value"], 43) + assert_equals(self.config.nested["value"], 43) + assert_equals(self.config["nested"].value, 43) + + assert_equals(self.silly_config.baz, "amazing") + assert_equals(self.silly_config["baz"], "amazing") + + assert_equals(self.config3.val, 456) + + try: + self.config3["val"] = 5 + raise ExpectationFailed(TypeError, "configs should be immutable") + except TypeError: + pass + + try: + self.config3.val = 5 + raise ExpectationFailed(TypeError, "configs should be immutable") + except TypeError: + pass + + def check_results(self, flow, checker): + for step in flow: + checker.assert_artifact( + step.name, + 
"config", + { + "value": 42, + "str_value": "foobar", + "project_name": "test_config", + "nested": {"value": 43}, + }, + ) + checker.assert_artifact(step.name, "config2", {"default_param": 123}) + checker.assert_artifact(step.name, "silly_config", {"baz": "amazing"}) From 739b87a51c94d1a942bf9beb3a8ad6b225e65207 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 10 Sep 2024 12:54:48 -0700 Subject: [PATCH 08/24] Fixup conda decorator --- metaflow/plugins/pypi/conda_decorator.py | 31 +++++++++++++++++++----- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/metaflow/plugins/pypi/conda_decorator.py b/metaflow/plugins/pypi/conda_decorator.py index 49b42f7e55e..f881f7f6b05 100644 --- a/metaflow/plugins/pypi/conda_decorator.py +++ b/metaflow/plugins/pypi/conda_decorator.py @@ -49,14 +49,23 @@ class CondaStepDecorator(StepDecorator): # CONDA_CHANNELS in their environment. For pinning specific packages to specific # conda channels, users can specify channel::package as the package name. - def init(self): - super(CondaStepDecorator, self).init() - + def __init__(self, attributes=None, statically_defined=False): self._user_defined_attributes = ( attributes.copy() if attributes is not None else {} ) super(CondaStepDecorator, self).__init__(attributes, statically_defined) + def init(self): + super(CondaStepDecorator, self).init() + + # We have to go back and fixup _user_defined_attributes for potential + # config resolution + self._user_defined_attributes = { + k: v + for k, v in self.attributes.items() + if k in self._user_defined_attributes + } + # Support legacy 'libraries=' attribute for the decorator. 
self.attributes["packages"] = { **self.attributes["libraries"], @@ -334,12 +343,22 @@ class CondaFlowDecorator(FlowDecorator): "disabled": None, } + def __init__(self, attributes=None, statically_defined=False): + self._user_defined_attributes = ( + attributes.copy() if attributes is not None else {} + ) + super(CondaFlowDecorator, self).__init__(attributes, statically_defined) + def init(self): super(CondaFlowDecorator, self).init() - self._user_defined_attributes = ( - self.attributes.copy() if self.attributes is not None else {} - ) + # We have to go back and fixup _user_defined_attributes for potential + # config resolution + self._user_defined_attributes = { + k: v + for k, v in self.attributes.items() + if k in self._user_defined_attributes + } # Support legacy 'libraries=' attribute for the decorator. self.attributes["packages"] = { From 2e5f7a4c432192df66b5091c95a281339f074519 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 10 Sep 2024 15:15:48 -0700 Subject: [PATCH 09/24] Fix parallel tests --- metaflow/cli.py | 2 +- metaflow/cli_args.py | 15 +++++++++++++++ metaflow/config_parameters.py | 7 ++----- metaflow/util.py | 4 ++++ 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/metaflow/cli.py b/metaflow/cli.py index da418ad20cd..e44baf9ecde 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -325,7 +325,7 @@ def start( pylint=None, event_logger=None, monitor=None, - local_info_file=None, + local_config_file=None, config_options=None, **deco_options ): diff --git a/metaflow/cli_args.py b/metaflow/cli_args.py index 40918f984ff..62f997b8566 100644 --- a/metaflow/cli_args.py +++ b/metaflow/cli_args.py @@ -12,7 +12,14 @@ # well as the converting of options in runtime.py. We should make it so that we # can properly shlex things and un-shlex when using. Ideally this should all be # done in one place. +# +# NOTE: There is an important between these two as well: +# - this one will include local_config_file whereas the other one WILL NOT. 
+# This is because this is used when constructing the parallel UBF command which +# executes locally and therefore needs the local_config_file but the other (remote) +# commands do not. +from .config_parameters import ConfigInput from .util import to_unicode @@ -65,6 +72,14 @@ def _options(mapping): # keyword in Python, so we call it 'decospecs' in click args if k == "decospecs": k = "with" + if k == "config_options": + # Special handling here since we gather them all in one option but actually + # need to send them one at a time using --config kv. + for config_name in v.keys(): + yield "--config" + yield to_unicode(config_name) + yield to_unicode(ConfigInput.make_key_name(config_name)) + continue k = k.replace("_", "-") v = v if isinstance(v, (list, tuple, set)) else [v] for value in v: diff --git a/metaflow/config_parameters.py b/metaflow/config_parameters.py index eb20fd6f1f2..fe823d1e1d8 100644 --- a/metaflow/config_parameters.py +++ b/metaflow/config_parameters.py @@ -343,12 +343,9 @@ class LocalFileInput(click.Path): name = "LocalFileInput" def convert(self, value, param, ctx): - super().convert(value, param, ctx) + v = super().convert(value, param, ctx) ConfigInput.set_config_file(value) - # This purposefully returns None which means it is *not* passed down - # when commands use ctx.parent.parent.params to get all the configuration - # values (it becomes hidden because its only purpose is to update the - # config file in ConfigInput) + return v def __str__(self): return repr(self) diff --git a/metaflow/util.py b/metaflow/util.py index e1b7d48f214..6695b987d2b 100644 --- a/metaflow/util.py +++ b/metaflow/util.py @@ -315,6 +315,10 @@ def dict_to_cli_options(params): yield to_unicode(config_name) yield to_unicode(ConfigInput.make_key_name(config_name)) continue + if k == "local_config_file": + # Skip this value -- it should only be used locally and never when + # forming another command line + continue k = k.replace("_", "-") v = v if isinstance(v, (list, 
tuple, set)) else [v] for value in v: From 70e153d044c081239750e7f9963adbb1b24c9ff4 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Tue, 10 Sep 2024 15:48:50 -0700 Subject: [PATCH 10/24] Fix current singleton test (conflict with `steps`) --- test/core/tests/current_singleton.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/core/tests/current_singleton.py b/test/core/tests/current_singleton.py index b8064b7f9ee..036005796e3 100644 --- a/test/core/tests/current_singleton.py +++ b/test/core/tests/current_singleton.py @@ -22,7 +22,7 @@ def step_start(self): self.flow_names = {current.flow_name} self.run_ids = {current.run_id} self.origin_run_ids = {current.origin_run_id} - self.steps = {current.step_name} + self.seen_steps = {current.step_name} self.step_name = current.step_name self.namespaces = {current.namespace} self.usernames = {current.username} @@ -49,7 +49,7 @@ def step_join(self): self.flow_names = set(chain(*(i.flow_names for i in inputs))) self.run_ids = set(chain(*(i.run_ids for i in inputs))) self.origin_run_ids = set(chain(*(i.origin_run_ids for i in inputs))) - self.steps = set(chain(*(i.steps for i in inputs))) + self.seen_steps = set(chain(*(i.seen_steps for i in inputs))) self.namespaces = set(chain(*(i.namespaces for i in inputs))) self.usernames = set(chain(*(i.usernames for i in inputs))) self.task_data = {} @@ -68,7 +68,7 @@ def step_join(self): self.origin_run_ids.add(current.origin_run_id) self.namespaces.add(current.namespace) self.usernames.add(current.username) - self.steps.add(current.step_name) + self.seen_steps.add(current.step_name) self.uuid = str(uuid4()) self.task_data[current.pathspec] = self.uuid self.tags.update(current.tags) @@ -90,7 +90,7 @@ def step_all(self): self.namespaces.add(current.namespace) self.usernames.add(current.username) self.step_name = current.step_name - self.steps.add(current.step_name) + self.seen_steps.add(current.step_name) self.uuid = str(uuid4()) 
self.task_data[current.pathspec] = self.uuid self.tags.update(current.tags) From b04cabecc9e3f7f794f01acd7f42a3a96d1ec47f Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 12 Sep 2024 08:24:24 -0700 Subject: [PATCH 11/24] Call decorator init method on non-static decorators --- metaflow/cli.py | 1 + metaflow/decorators.py | 12 +++++++----- metaflow/plugins/airflow/airflow_cli.py | 1 + metaflow/plugins/argo/argo_workflows_cli.py | 1 + .../plugins/aws/step_functions/step_functions_cli.py | 1 + metaflow/plugins/pypi/pypi_decorator.py | 1 + 6 files changed, 12 insertions(+), 5 deletions(-) diff --git a/metaflow/cli.py b/metaflow/cli.py index e44baf9ecde..55c38ab8a18 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -444,6 +444,7 @@ def start( ) if all_decospecs: decorators._attach_decorators(ctx.obj.flow, all_decospecs) + decorators._init(ctx.obj.flow, only_non_static=True) # Regenerate graph if we attached more decorators ctx.obj.graph = FlowGraph(ctx.obj.flow.__class__) diff --git a/metaflow/decorators.py b/metaflow/decorators.py index b852ade200f..11b7308c7f0 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -545,14 +545,16 @@ def _attach_decorators_to_step(step, decospecs): step.decorators.append(deco) -def _init(flow): +def _init(flow, only_non_static=False): # We get the datastore for the _parameters step which can contain for decorators in flow._flow_decorators.values(): for deco in decorators: - deco.init() - for step in flow: - for deco in step.decorators: - deco.init() + if not only_non_static or not deco.statically_defined: + deco.init() + for flowstep in flow: + for deco in flowstep.decorators: + if not only_non_static or not deco.statically_defined: + deco.init() def _init_flow_decorators( diff --git a/metaflow/plugins/airflow/airflow_cli.py b/metaflow/plugins/airflow/airflow_cli.py index b80d82fbcee..33e0e0f48d6 100644 --- a/metaflow/plugins/airflow/airflow_cli.py +++ b/metaflow/plugins/airflow/airflow_cli.py @@ -283,6 +283,7 @@ 
def make_flow( ): # Attach @kubernetes. decorators._attach_decorators(obj.flow, [KubernetesDecorator.name]) + decorators._init(obj.flow, only_non_static=True) decorators._init_step_decorators( obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger diff --git a/metaflow/plugins/argo/argo_workflows_cli.py b/metaflow/plugins/argo/argo_workflows_cli.py index bd755684102..734c50d21ee 100644 --- a/metaflow/plugins/argo/argo_workflows_cli.py +++ b/metaflow/plugins/argo/argo_workflows_cli.py @@ -470,6 +470,7 @@ def make_flow( decorators._attach_decorators( obj.flow, [KubernetesDecorator.name, EnvironmentDecorator.name] ) + decorators._init(obj.flow, only_non_static=True) decorators._init_step_decorators( obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger diff --git a/metaflow/plugins/aws/step_functions/step_functions_cli.py b/metaflow/plugins/aws/step_functions/step_functions_cli.py index efa4e7f35f4..373b5daef1c 100644 --- a/metaflow/plugins/aws/step_functions/step_functions_cli.py +++ b/metaflow/plugins/aws/step_functions/step_functions_cli.py @@ -326,6 +326,7 @@ def make_flow( # Attach AWS Batch decorator to the flow decorators._attach_decorators(obj.flow, [BatchDecorator.name]) + decorators._init(obj.flow, only_non_static=True) decorators._init_step_decorators( obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger ) diff --git a/metaflow/plugins/pypi/pypi_decorator.py b/metaflow/plugins/pypi/pypi_decorator.py index 17925446bde..3f650281dc5 100644 --- a/metaflow/plugins/pypi/pypi_decorator.py +++ b/metaflow/plugins/pypi/pypi_decorator.py @@ -140,6 +140,7 @@ def flow_init( from metaflow import decorators decorators._attach_decorators(flow, ["pypi"]) + decorators._init(flow, only_non_static=True) # @pypi uses a conda environment to create a virtual environment. # The conda environment can be created through micromamba. 
From b1e9ce1d7faea97a180ab5052605e9fbaa851425 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Mon, 30 Sep 2024 13:43:07 -0700 Subject: [PATCH 12/24] Several fixes - Separate out value and file (so default and default_value and --config and --config-value) - Provide classes for step and flow config decorators with proxy objects - Split things into several files (it was getting too long) - Addressed all bugs discussed --- metaflow/__init__.py | 3 +- metaflow/cli.py | 21 +- metaflow/cli_args.py | 10 +- metaflow/config_parameters.py | 627 --------------------- metaflow/decorators.py | 27 +- metaflow/flowspec.py | 87 ++- metaflow/includefile.py | 2 +- metaflow/package.py | 2 +- metaflow/parameters.py | 10 +- metaflow/runner/click_api.py | 2 +- metaflow/runtime.py | 5 +- metaflow/user_configs/__init__.py | 0 metaflow/user_configs/config_decorators.py | 214 +++++++ metaflow/user_configs/config_options.py | 384 +++++++++++++ metaflow/user_configs/config_parameters.py | 350 ++++++++++++ metaflow/util.py | 10 +- test/core/metaflow_test/formatter.py | 2 +- test_config/helloconfig.py | 41 +- 18 files changed, 1071 insertions(+), 726 deletions(-) delete mode 100644 metaflow/config_parameters.py create mode 100644 metaflow/user_configs/__init__.py create mode 100644 metaflow/user_configs/config_decorators.py create mode 100644 metaflow/user_configs/config_options.py create mode 100644 metaflow/user_configs/config_parameters.py diff --git a/metaflow/__init__.py b/metaflow/__init__.py index fbc08342d9c..2ce16d0b909 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -103,7 +103,8 @@ class and related decorators. 
from .parameters import Parameter, JSONTypeClass, JSONType -from .config_parameters import Config, config_expr, eval_config +from .user_configs.config_parameters import Config, config_expr +from .user_configs.config_decorators import FlowConfigDecorator, StepConfigDecorator # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as diff --git a/metaflow/cli.py b/metaflow/cli.py index 55c38ab8a18..4f9d2759930 100644 --- a/metaflow/cli.py +++ b/metaflow/cli.py @@ -12,6 +12,7 @@ from .cli_components.utils import LazyGroup, LazyPluginCommandCollection from .datastore import FlowDataStore from .exception import CommandException, MetaflowException +from .flowspec import _FlowState from .graph import FlowGraph from .metaflow_config import ( DECOSPECS, @@ -35,7 +36,8 @@ from .pylint_wrapper import PyLint from .R import metaflow_r_version, use_r from .util import resolve_identity -from .config_parameters import LocalFileInput, config_options +from .user_configs.config_options import LocalFileInput, config_options +from .user_configs.config_parameters import ConfigValue ERASE_TO_EOL = "\033[K" HIGHLIGHT = "red" @@ -326,7 +328,8 @@ def start( event_logger=None, monitor=None, local_config_file=None, - config_options=None, + config_file_options=None, + config_value_options=None, **deco_options ): if quiet: @@ -347,7 +350,12 @@ def start( # process all those decorators that the user added that will modify the flow based # on those configurations. It is important to do this as early as possible since it # actually modifies the flow itself - ctx.obj.flow = ctx.obj.flow._process_config_funcs(config_options) + + # When we process the options, the first one processed will return None and the + # second one processed will return the actual options. The order of processing + # depends on what (and in what order) the user specifies on the command line. 
+ config_options = config_file_options or config_value_options + ctx.obj.flow = ctx.obj.flow._process_config_decorators(config_options) cli_args._set_top_kwargs(ctx.params) ctx.obj.echo = echo @@ -433,7 +441,12 @@ def start( ctx.obj.flow.name, ctx.obj.echo, ctx.obj.flow_datastore, - dict(ctx.obj.flow.configs), + { + k: ConfigValue(v) + for k, v in ctx.obj.flow.__class__._flow_state.get( + _FlowState.CONFIGS, {} + ).items() + }, ) if ctx.invoked_subcommand not in ("run", "resume"): diff --git a/metaflow/cli_args.py b/metaflow/cli_args.py index 62f997b8566..3c411627395 100644 --- a/metaflow/cli_args.py +++ b/metaflow/cli_args.py @@ -19,7 +19,7 @@ # executes locally and therefore needs the local_config_file but the other (remote) # commands do not. -from .config_parameters import ConfigInput +from .user_configs.config_options import ConfigInput from .util import to_unicode @@ -72,11 +72,13 @@ def _options(mapping): # keyword in Python, so we call it 'decospecs' in click args if k == "decospecs": k = "with" - if k == "config_options": + if k in ("config_file_options", "config_value_options"): # Special handling here since we gather them all in one option but actually - # need to send them one at a time using --config kv. + # need to send them one at a time using --config-value kv.. + # Note it can be either config_file_options or config_value_options depending + # on click processing order. 
for config_name in v.keys(): - yield "--config" + yield "--config-value" yield to_unicode(config_name) yield to_unicode(ConfigInput.make_key_name(config_name)) continue diff --git a/metaflow/config_parameters.py b/metaflow/config_parameters.py deleted file mode 100644 index fe823d1e1d8..00000000000 --- a/metaflow/config_parameters.py +++ /dev/null @@ -1,627 +0,0 @@ -import collections.abc -import json -import os -import re - -from typing import Any, Callable, Dict, List, Optional, Union, TYPE_CHECKING - -from metaflow._vendor import click - -from .exception import MetaflowException, MetaflowInternalError - -from .parameters import ( - DeployTimeField, - Parameter, - ParameterContext, - current_flow, -) - -from .util import get_username - -if TYPE_CHECKING: - from metaflow import FlowSpec - -# _tracefunc_depth = 0 - - -# def tracefunc(func): -# """Decorates a function to show its trace.""" - -# @functools.wraps(func) -# def tracefunc_closure(*args, **kwargs): -# global _tracefunc_depth -# """The closure.""" -# print(f"{_tracefunc_depth}: {func.__name__}(args={args}, kwargs={kwargs})") -# _tracefunc_depth += 1 -# result = func(*args, **kwargs) -# _tracefunc_depth -= 1 -# print(f"{_tracefunc_depth} => {result}") -# return result - -# return tracefunc_closure - -CONFIG_FILE = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "CONFIG_PARAMETERS" -) - - -def dump_config_values(flow: "FlowSpec"): - from .flowspec import _FlowState # Prevent circular import - - configs = flow._flow_state.get(_FlowState.CONFIGS) - if configs: - return {"user_configs": configs} - return {} - - -def load_config_values(info_file: Optional[str] = None) -> Optional[Dict[Any, Any]]: - if info_file is None: - info_file = os.path.basename(CONFIG_FILE) - try: - with open(info_file, encoding="utf-8") as contents: - return json.load(contents).get("user_configs", {}) - except IOError: - return None - - -class ConfigValue(collections.abc.Mapping): - """ - ConfigValue is a thin wrapper around 
an arbitrarily nested dictionary-like - configuration object. It allows you to access elements of this nested structure - using either a "." notation or a [] notation. As an example, if your configuration - object is: - {"foo": {"bar": 42}} - you can access the value 42 using either config["foo"]["bar"] or config.foo.bar. - """ - - # Thin wrapper to allow configuration values to be accessed using a "." notation - # as well as a [] notation. - - def __init__(self, data: Dict[Any, Any]): - self._data = data - - def __getattr__(self, key: str) -> Any: - """ - Access an element of this configuration - - Parameters - ---------- - key : str - Element to access - - Returns - ------- - Any - Element of the configuration - """ - if key == "_data": - # Called during unpickling. Special case to not run into infinite loop - # below. - raise AttributeError(key) - - if key in self._data: - return self[key] - raise AttributeError(key) - - def __setattr__(self, name: str, value: Any) -> None: - # Prevent configuration modification - if name == "_data": - return super().__setattr__(name, value) - raise TypeError("ConfigValue is immutable") - - def __getitem__(self, key: Any) -> Any: - """ - Access an element of this configuration - - Parameters - ---------- - key : Any - Element to access - - Returns - ------- - Any - Element of the configuration - """ - value = self._data[key] - if isinstance(value, dict): - value = ConfigValue(value) - return value - - def __len__(self): - return len(self._data) - - def __iter__(self): - return iter(self._data) - - def __repr__(self): - return repr(self._data) - - def __str__(self): - return json.dumps(self._data) - - def to_dict(self) -> Dict[Any, Any]: - """ - Returns a dictionary representation of this configuration object. - - Returns - ------- - Dict[Any, Any] - Dictionary equivalent of this configuration object. 
- """ - return dict(self._data) - - -class PathOrStr(click.ParamType): - # Click parameter type for a configuration value -- it can either be the string - # representation of the configuration value (like a JSON string or any other - # string that the configuration parser can parse) or the path to a file containing - # such a content. The value will be initially assumed to be that of a file and will - # only be considered not a file if no file exists. - name = "PathOrStr" - - @staticmethod - def convert_value(value): - # Click requires this to be idempotent. We therefore check if the value - # starts with "converted:" which is our marker for "we already processed this - # value". - if value is None: - return None - - if isinstance(value, dict): - return "converted:" + json.dumps(value) - - if value.startswith("converted:"): - return value - - if os.path.isfile(value): - try: - with open(value, "r", encoding="utf-8") as f: - content = f.read() - except OSError as e: - raise click.UsageError( - "Could not read configuration file '%s'" % value - ) from e - return "converted:" + content - return "converted:" + value - - def convert(self, value, param, ctx): - return self.convert_value(value) - - -class ConfigInput: - # ConfigInput is an internal class responsible for processing all the --config - # options. It gathers information from the --local-config-file (to figure out - # where options are stored) and is also responsible for processing any `--config` - # options and processing the default value of `Config(...)` objects. - - # It will then store this information in the flow spec for use later in processing. - # It is stored in the flow spec to avoid being global to support the Runner. 
- - loaded_configs = None # type: Optional[Dict[str, Dict[Any, Any]]] - config_file = None # type: Optional[str] - - def __init__( - self, - req_configs: List[str], - defaults: Dict[str, Union[str, Dict[Any, Any]]], - parsers: Dict[str, Callable[[str], Dict[Any, Any]]], - ): - self._req_configs = set(req_configs) - self._defaults = defaults - self._parsers = parsers - - @staticmethod - def make_key_name(name: str) -> str: - # Special mark to indicate that the configuration value is not content or a file - # name but a value that should be read in the config file (effectively where - # the value has already been materialized). - return "kv." + name.lower() - - @classmethod - def set_config_file(cls, config_file: str): - cls.config_file = config_file - - @classmethod - def get_config(cls, config_name: str) -> Optional[Dict[Any, Any]]: - if cls.loaded_configs is None: - all_configs = load_config_values(cls.config_file) - if all_configs is None: - raise MetaflowException( - "Could not load expected configuration values " - "from the CONFIG_PARAMETERS file. This is a Metaflow bug. " - "Please contact support." - ) - cls.loaded_configs = all_configs - return cls.loaded_configs.get(config_name, None) - - def process_configs(self, ctx, param, value): - from .cli import echo_always, echo_dev_null # Prevent circular import - from .flowspec import _FlowState # Prevent circular import - - flow_cls = getattr(current_flow, "flow_cls", None) - if flow_cls is None: - # This is an error - raise MetaflowInternalError( - "Config values should be processed for a FlowSpec" - ) - flow_cls._flow_state[_FlowState.CONFIGS] = {} - # This function is called by click when processing all the --config options. - # The value passed in is a list of tuples (name, value). - # Click will provide: - # - all the defaults if nothing is provided on the command line - # - provide *just* the passed in value if anything is provided on the command - # line. 
- # - # We therefore "merge" the defaults with what we are provided by click to form - # a full set of values - # We therefore get a full set of values where: - # - the name will correspond to the configuration name - # - the value will be the default (including None if there is no default) or - # the string representation of the value (this will always include - # the "converted:" prefix as it will have gone through the PathOrStr - # conversion function). A value of None basically means that the config has - # no default and was not specified on the command line. - to_return = {} - - merged_configs = dict(self._defaults) - for name, val in value: - # Don't replace by None -- this is needed to avoid replacing a function - # default - if val: - merged_configs[name] = val - - print("PARAMS: %s" % str(ctx.params)) - missing_configs = set() - for name, val in merged_configs.items(): - name = name.lower() - # convert is idempotent so if it is already converted, it will just return - # the value. This is used to make sure we process the defaults which do - # NOT make it through the PathOrStr convert function - if isinstance(val, DeployTimeField): - # This supports a default value that is a deploy-time field (similar - # to Parameter).) - # We will form our own context and pass it down -- note that you cannot - # use configs in the default value of configs as this introduces a bit - # of circularity. Note also that quiet and datastore are *eager* - # options so are available here. 
- param_ctx = ParameterContext( - flow_name=ctx.obj.flow.name, - user_name=get_username(), - parameter_name=name, - logger=echo_dev_null if ctx.params["quiet"] else echo_always, - ds_type=ctx.params["datastore"], - configs=None, - ) - val = val.fun(param_ctx) - val = PathOrStr.convert_value(val) - if val is None: - missing_configs.add(name) - continue - val = val[10:] # Remove the "converted:" prefix - if val.startswith("kv."): - # This means to load it from a file - read_value = self.get_config(val[3:]) - if read_value is None: - raise click.UsageError( - "Could not find configuration '%s' in INFO file" % val - ) - flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value - to_return[name] = ConfigValue(read_value) - else: - if self._parsers[name]: - read_value = self._parsers[name](val) - else: - try: - read_value = json.loads(val) - except json.JSONDecodeError as e: - raise click.UsageError( - "Configuration value for '%s' is not valid JSON" % name - ) from e - # TODO: Support YAML - flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value - to_return[name] = ConfigValue(read_value) - - if missing_configs.intersection(self._req_configs): - raise click.UsageError( - "Missing configuration values for %s" % ", ".join(missing_configs) - ) - return to_return - - def __str__(self): - return repr(self) - - def __repr__(self): - return "ConfigInput" - - -class LocalFileInput(click.Path): - # Small wrapper around click.Path to set the value from which to read configuration - # values. 
This is set immediately upon processing the --local-config-file - # option and will therefore then be available when processing any of the other - # --config options (which will call ConfigInput.process_configs - name = "LocalFileInput" - - def convert(self, value, param, ctx): - v = super().convert(value, param, ctx) - ConfigInput.set_config_file(value) - return v - - def __str__(self): - return repr(self) - - def __repr__(self): - return "LocalFileInput" - - -ConfigArgType = Union[str, Dict[Any, Any]] - - -class MultipleTuple(click.Tuple): - # Small wrapper around a click.Tuple to allow the environment variable for - # configurations to be a JSON string. Otherwise the default behavior is splitting - # by whitespace which is totally not what we want - # You can now pass multiple configuration options through an environment variable - # using something like: - # METAFLOW_FLOW_CONFIG='{"config1": "filenameforconfig1.json", "config2": {"key1": "value1"}}' - - def split_envvar_value(self, rv): - loaded = json.loads(rv) - return list( - item if isinstance(item, str) else json.dumps(item) - for pair in loaded.items() - for item in pair - ) - - -class DelayEvaluator: - """ - Small wrapper that allows the evaluation of a Config() value in a delayed manner. - This is used when we want to use config.* values in decorators for example. 
- """ - - id_pattern = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") - - def __init__(self, ex: str): - self._config_expr = ex - if self.id_pattern.match(self._config_expr): - # This is a variable only so allow things like config_expr("config").var - self._is_var_only = True - self._access = [] - else: - self._is_var_only = False - self._access = None - - def __getattr__(self, name): - if self._access is None: - raise AttributeError() - self._access.append(name) - return self - - def __call__(self, ctx=None, deploy_time=False): - from .flowspec import _FlowState # Prevent circular import - - # Two additional arguments are only used by DeployTimeField which will call - # this function with those two additional arguments. They are ignored. - flow_cls = getattr(current_flow, "flow_cls", None) - if flow_cls is None: - # We are not executing inside a flow (ie: not the CLI) - raise MetaflowException( - "Config object can only be used directly in the FlowSpec defining them. " - "If using outside of the FlowSpec, please use ConfigEval" - ) - if self._access is not None: - # Build the final expression by adding all the fields in access as . fields - self._config_expr = ".".join([self._config_expr] + self._access) - # Evaluate the expression setting the config values as local variables - return eval( - self._config_expr, - globals(), - { - k: ConfigValue(v) - for k, v in flow_cls._flow_state.get(_FlowState.CONFIGS, {}).items() - }, - ) - - -def config_expr(expr: str) -> DelayEvaluator: - """ - Function to allow you to use an expression involving a config parameter in - places where it may not be directory accessible or if you want a more complicated - expression than just a single variable. - - You can use it as follows: - - When the config is not directly accessible: - - @project(name=config_expr("config").project.name) - class MyFlow(FlowSpec): - config = Config("config") - ... 
- - When you want a more complex expression: - class MyFlow(FlowSpec): - config = Config("config") - - @environment(vars={"foo": config_expr("config.bar.baz.lower()")}) - @step - def start(self): - ... - - Parameters - ---------- - expr : str - Expression using the config values. - """ - return DelayEvaluator(expr) - - -def eval_config(f: Callable[["FlowSpec"], "FlowSpec"]) -> "FlowSpec": - """ - Decorator to allow you to add Python decorators to a FlowSpec that makes use of - user configurations. - - As an example: - - ``` - def parameterize(f): - for s in f: - # Iterate over all the steps - if s.name in f.config.add_env_to_steps: - setattr(f, s.name) = environment(vars={**f.config.env_vars})(s) - return f - - @eval_config(parameterize) - class MyFlow(FlowSpec): - config = Config("config") - ... - ``` - - allows you to add an environment decorator to all steps in `add_env_to_steps`. Both - the steps to add this decorator to and the values to add are extracted from the - configuration passed to the Flow through config. - - Parameters - ---------- - f : Callable[[FlowSpec], FlowSpec] - Decorator function - - Returns - ------- - FlowSpec - The modified FlowSpec - """ - - def _wrapper(flow_spec: "FlowSpec"): - from .flowspec import _FlowState - - flow_spec._flow_state.setdefault(_FlowState.CONFIG_FUNCS, []).append(f) - return flow_spec - - return _wrapper - - -class Config(Parameter): - """ - Includes a configuration for this flow. - - `Config` is a special type of `Parameter` but differs in a few key areas: - - it is immutable and determined at deploy time (or prior to running if not deploying - to a scheduler) - - as such, it can be used anywhere in your code including in Metaflow decorators - - - Parameters - ---------- - name : str - User-visible configuration name. - default : Union[str, Dict[Any, Any], Callable[[ParameterContext], Union[str, Dict[Any, Any]]]], optional, default None - Default value for the parameter. 
A function - implies that the value will be computed using that function. - help : str, optional, default None - Help text to show in `run --help`. - required : bool, default False - Require that the user specified a value for the parameter. Note that if - a default is provided, the required flag is ignored. - parser : Callable[[str], Dict[Any, Any]], optional, default None - An optional function that can parse the configuration string into an arbitrarily - nested dictionary. - show_default : bool, default True - If True, show the default value in the help text. - """ - - IS_FLOW_PARAMETER = True - - def __init__( - self, - name: str, - default: Optional[ - Union[ - str, - Dict[Any, Any], - Callable[[ParameterContext], Union[str, Dict[Any, Any]]], - ] - ] = None, - help: Optional[str] = None, - required: bool = False, - parser: Optional[Callable[[str], Dict[Any, Any]]] = None, - **kwargs: Dict[str, str] - ): - - print("Config %s, default is %s" % (name, default)) - super(Config, self).__init__( - name, default=default, required=required, help=help, type=str, **kwargs - ) - - if isinstance(kwargs.get("default", None), str): - kwargs["default"] = json.dumps(kwargs["default"]) - self.parser = parser - - def load_parameter(self, v): - return v - - def __getattr__(self, name): - ev = DelayEvaluator(self.name) - return ev.__getattr__(name) - - -def config_options(cmd): - help_strs = [] - required_names = [] - defaults = {} - config_seen = set() - parsers = {} - flow_cls = getattr(current_flow, "flow_cls", None) - if flow_cls is None: - return cmd - - parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] - config_opt_required = False - # List all the configuration options - for arg in parameters[::-1]: - save_default = arg.kwargs.get("default", None) - kwargs = arg.option_kwargs(False) - if arg.name.lower() in config_seen: - msg = ( - "Multiple configurations use the same name '%s'. Note that names are " - "case-insensitive. 
Please change the " - "names of some of your configurations" % arg.name - ) - raise MetaflowException(msg) - config_seen.add(arg.name.lower()) - if kwargs["required"]: - required_names.append(arg.name) - if save_default is None: - # We need at least one option if we have a required configuration. - config_opt_required = True - defaults[arg.name.lower()] = save_default - help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) - parsers[arg.name.lower()] = arg.parser - - print( - "DEFAULTS %s" - % str(dict((k, v if not callable(v) else "FUNC") for k, v in defaults.items())) - ) - if not config_seen: - # No configurations -- don't add anything - return cmd - - help_str = ( - "Configuration options for the flow. " - "Multiple configurations can be specified." - ) - help_str = "\n\n".join([help_str] + help_strs) - cmd.params.insert( - 0, - click.Option( - ["--config", "config_options"], - nargs=2, - multiple=True, - type=MultipleTuple([click.Choice(config_seen), PathOrStr()]), - callback=ConfigInput(required_names, defaults, parsers).process_configs, - help=help_str, - envvar="METAFLOW_FLOW_CONFIG", - show_default=False, - default=[(k, v if not callable(v) else None) for k, v in defaults.items()], - required=config_opt_required, - ), - ) - return cmd diff --git a/metaflow/decorators.py b/metaflow/decorators.py index 11b7308c7f0..b615d5744ac 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -12,7 +12,7 @@ ) from .parameters import current_flow -from .config_parameters import DelayEvaluator +from .user_configs.config_parameters import DelayEvaluator from metaflow._vendor import click @@ -116,10 +116,12 @@ class Decorator(object): def __init__(self, attributes=None, statically_defined=False): self.attributes = self.defaults.copy() self.statically_defined = statically_defined + self._user_defined_attributes = set() if attributes: for k, v in attributes.items(): - if k in self.defaults: + self._user_defined_attributes.add(k) + if k in 
self.defaults or k.startswith("_unpacked_delayed_"): self.attributes[k] = v else: raise InvalidDecoratorAttribute(self.name, k, self.defaults) @@ -146,7 +148,27 @@ def _resolve_delayed_evaluator(v): return {_resolve_delayed_evaluator(x) for x in v} return v + # Expand any eventual _unpacked_delayed_ attributes. These are special attributes + # that allow the delay unpacking of configuration values. + delayed_upack_keys = [ + k for k in self.attributes if k.startswith("_unpacked_delayed_") + ] + if delayed_upack_keys: + for k in delayed_upack_keys: + unpacked = _resolve_delayed_evaluator(self.attributes[k]) + for uk, uv in unpacked.items(): + if uk in self._user_defined_attributes: + raise SyntaxError( + "keyword argument repeated: %s" % uk, "", 0, "" + ) + self._user_defined_attributes.add(uk) + self.attributes[uk] = uv + del self.attributes[k] + + # Now resolve all attributes for k, v in self.attributes.items(): + # This is a special attribute that means we are going to unpack + # the configuration valu self.attributes[k] = _resolve_delayed_evaluator(v) @classmethod @@ -460,6 +482,7 @@ def _base_step_decorator(decotype, *args, **kwargs): Decorator prototype for all step decorators. This function gets specialized and imported for all decorators types by _import_plugin_decorators(). """ + if args: # No keyword arguments specified for the decorator, e.g. @foobar. # The first argument is the function to be decorated. 
diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 9fc65e0a276..e819450a019 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -14,6 +14,7 @@ from .exception import ( MetaflowException, MissingInMergeArtifactsException, + MetaflowInternalError, UnhandledInMergeArtifactsException, ) @@ -21,7 +22,12 @@ from .graph import FlowGraph from .unbounded_foreach import UnboundedForeachInput -from .config_parameters import ConfigInput, ConfigValue +from .user_configs.config_decorators import ( + FlowConfigDecorator, + FlowSpecProxy, + StepConfigDecorator, + StepProxy, +) from .util import to_pod from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS @@ -68,7 +74,7 @@ def __getitem__(self, item): class _FlowState(Enum): CONFIGS = 1 - CONFIG_FUNCS = 2 + CONFIG_DECORATORS = 2 CACHED_PARAMETERS = 3 @@ -89,52 +95,6 @@ def __new__(cls, name, bases, dct): return f - @property - def configs(cls) -> Generator[Tuple[str, "ConfigValue"], None, None]: - """ - Iterate over all user configurations in this flow - - Use this to parameterize your flow based on configuration. As an example: - ``` - def parametrize(flow): - val = next(flow.configs)[1].steps.start.cpu - flow.start = environment(vars={'mycpu': val})(flow.start) - return flow - - @parametrize - class TestFlow(FlowSpec): - config = Config('myconfig.json') - - @step - def start(self): - pass - ``` - can be used to add an environment decorator to the `start` step. 
- - Yields - ------ - Tuple[str, ConfigValue] - Iterates over the configurations of the flow - """ - # When configs are parsed, they are loaded in _flow_state[_FlowState.CONFIGS] - for name, value in cls._flow_state.get(_FlowState.CONFIGS, {}).items(): - yield name, ConfigValue(value) - - @property - def steps(cls) -> Generator[Tuple[str, Any], None, None]: - """ - Iterate over all the steps in this flow - - Yields - ------ - Tuple[str, Any] - A tuple with the step name and the step itself - """ - for var in dir(cls): - potential_step = getattr(cls, var) - if callable(potential_step) and hasattr(potential_step, "is_step"): - yield var, potential_step - class FlowSpec(metaclass=FlowSpecMeta): """ @@ -221,11 +181,11 @@ def _check_parameters(self): ) seen.add(norm) - def _process_config_funcs(self, config_options): + def _process_config_decorators(self, config_options): current_cls = self.__class__ # Fast path for no user configurations - if not self._flow_state.get(_FlowState.CONFIG_FUNCS): + if not self._flow_state.get(_FlowState.CONFIG_DECORATORS): return self # We need to convert all the user configurations from DelayedEvaluationParameters @@ -245,10 +205,29 @@ def _process_config_funcs(self, config_options): val = val() setattr(current_cls, var, val) - # Run all the functions. 
They will now be able to access the configuration - # values directly from the class - for func in self._flow_state[_FlowState.CONFIG_FUNCS]: - current_cls = func(current_cls) + # Run all the decorators + for deco in self._flow_state[_FlowState.CONFIG_DECORATORS]: + if isinstance(deco, FlowConfigDecorator): + # Sanity check to make sure we are applying the decorator to the right + # class + if not deco._flow_cls == current_cls and not issubclass( + current_cls, deco._flow_cls + ): + raise MetaflowInternalError( + "FlowConfigDecorator registered on the wrong flow -- " + "expected %s but got %s" + % (deco._flow_cls.__name__, current_cls.__name__) + ) + deco.evaluate(FlowSpecProxy(current_cls)) + elif isinstance(deco, StepConfigDecorator): + # Again some sanity checks + if deco._flow_cls != current_cls: + raise MetaflowInternalError( + "StepConfigDecorator registered on the wrong flow -- " + "expected %s but got %s" + % (deco._flow_cls.__name__, current_cls.__name__) + ) + deco.evaluate(StepConfigDecorator(deco._my_step)) # Reset all configs that were already present in the class. # TODO: This means that users can't override configs directly. 
Not sure if this diff --git a/metaflow/includefile.py b/metaflow/includefile.py index 499b4cd6a90..c81a701dfb3 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -20,7 +20,7 @@ ) from .plugins import DATACLIENTS -from .config_parameters import ConfigValue +from .user_configs.config_parameters import ConfigValue from .util import get_username import functools diff --git a/metaflow/package.py b/metaflow/package.py index 9666d929d08..1385883d5a7 100644 --- a/metaflow/package.py +++ b/metaflow/package.py @@ -6,7 +6,7 @@ import json from io import BytesIO -from .config_parameters import CONFIG_FILE, dump_config_values +from .user_configs.config_parameters import CONFIG_FILE, dump_config_values from .extension_support import EXT_PKG, package_mfext_all from .metaflow_config import DEFAULT_PACKAGE_SUFFIXES from .exception import MetaflowException diff --git a/metaflow/parameters.py b/metaflow/parameters.py index da9669243bf..d1413f47247 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -15,7 +15,7 @@ ) if TYPE_CHECKING: - from .config_parameters import ConfigValue + from .user_configs.config_parameters import ConfigValue try: # Python2 @@ -226,7 +226,9 @@ def deploy_time_eval(value): # this is called by cli.main def set_parameter_context(flow_name, echo, datastore, configs): - from .config_parameters import ConfigValue # Prevent circular dependency + from .user_configs.config_parameters import ( + ConfigValue, + ) # Prevent circular dependency global context_proto context_proto = ParameterContext( @@ -322,7 +324,9 @@ def __init__( int, bool, Dict[str, Any], - Callable[[], Union[str, float, int, bool, Dict[str, Any]]], + Callable[ + [ParameterContext], Union[str, float, int, bool, Dict[str, Any]] + ], ] ] = None, type: Optional[ diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 0001ed39a95..96faa235281 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -39,7 +39,7 @@ from 
metaflow.exception import MetaflowException from metaflow.includefile import FilePathClass from metaflow.parameters import JSONTypeClass, flow_context -from metaflow.config_parameters import LocalFileInput +from metaflow.config_options import LocalFileInput # Define a recursive type alias for JSON JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None] diff --git a/metaflow/runtime.py b/metaflow/runtime.py index fd1e8fb3a01..2154690718c 100644 --- a/metaflow/runtime.py +++ b/metaflow/runtime.py @@ -43,7 +43,8 @@ UBF_TASK, ) -from .config_parameters import ConfigInput, dump_config_values +from .user_configs.config_options import ConfigInput +from .user_configs.config_parameters import dump_config_values import metaflow.tracing as tracing @@ -1471,7 +1472,7 @@ def __init__(self, task): # in the case of the local runtime) configs = self.task.flow._flow_state.get(_FlowState.CONFIGS) if configs: - self.top_level_options["config"] = [ + self.top_level_options["config-value"] = [ (k, ConfigInput.make_key_name(k)) for k in configs ] diff --git a/metaflow/user_configs/__init__.py b/metaflow/user_configs/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/metaflow/user_configs/config_decorators.py b/metaflow/user_configs/config_decorators.py new file mode 100644 index 00000000000..16e9476e08e --- /dev/null +++ b/metaflow/user_configs/config_decorators.py @@ -0,0 +1,214 @@ +from functools import partial +from typing import Any, Callable, Generator, TYPE_CHECKING, Tuple, Union + +from metaflow.exception import MetaflowException +from metaflow.parameters import Parameter +from metaflow.user_configs.config_parameters import ConfigValue + +if TYPE_CHECKING: + from metaflow.flowspec import FlowSpec + from metaflow.decorators import FlowSpecDerived, StepDecorator + + +class StepProxy: + """ + A StepProxy is a wrapper passed to the `StepConfigDecorator`'s `evaluate` method + to allow the decorator to interact with the step and providing 
easy methods to + modify the behavior of the step. + """ + + def __init__( + self, + step: Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ], + ): + self._my_step = step + + def remove_decorator(self, deco_name: str, all: bool = True, **kwargs) -> bool: + """ + Remove one or more Metaflow decorators from a step. + + Some decorators can be applied multiple times to a step. This method allows you + to choose which decorator to remove or just remove all of them or one of them. + + Parameters + ---------- + deco_name : str + Name of the decorator to remove + all : bool, default True + If True, remove all instances of the decorator that match the filters + passed using kwargs (or all the instances of the decorator if no filters are + passed). If False, removes only the first found instance of the decorator. + + Returns + ------- + bool + Returns True if at least one decorator was removed. + """ + new_deco_list = [] + did_remove = False + for deco in self._my_step.decorators: + if deco.name == deco_name: + # Check filters + match_ok = True + if kwargs: + for k, v in kwargs.items(): + match_ok = k in deco.attributes and deco.attributes[k] == v + if match_ok is False: + break + if match_ok: + did_remove = True + else: + new_deco_list.append(deco) + else: + new_deco_list.append(deco) + if did_remove and not all: + break + + self._my_step.decorators = new_deco_list + return did_remove + + def add_decorator(self, deco_type: partial, **kwargs) -> None: + """ + Add a Metaflow decorator to a step. 
+ + Parameters + ---------- + deco_type : partial + The decorator class to add to this step + """ + # Prevent circular import + from metaflow.decorators import DuplicateStepDecoratorException, StepDecorator + + # Validate deco_type + if ( + not isinstance(deco_type, partial) + or len(deco_type.args) != 1 + or not issubclass(deco_type.args[0], StepDecorator) + ): + raise TypeError("add_decorator takes a StepDecorator") + + deco_type = deco_type.args[0] + if ( + deco_type.name in [deco.name for deco in self._my_step.decorators] + and not deco_type.allow_multiple + ): + raise DuplicateStepDecoratorException(deco_type.name, self._my_step) + + self._my_step.decorators.append( + deco_type(attributes=kwargs, statically_defined=True) + ) + + +class FlowSpecProxy: + def __init__(self, flow_spec: "FlowSpec"): + self._flow_cls = flow_spec + + @property + def configs(self) -> Generator[Tuple[str, ConfigValue], None, None]: + """ + Iterate over all user configurations in this flow + + Use this to parameterize your flow based on configuration. As an example, the + `evaluate` method of your `FlowConfigDecorator` can use this to add an + environment decorator. + ``` + class MyDecorator(FlowConfigDecorator): + def evaluate(flow: FlowSpecProxy): + val = next(flow.configs)[1].steps.start.cpu + flow.start.add_decorator(environment, vars={'mycpu': val}) + return flow + + @MyDecorator() + class TestFlow(FlowSpec): + config = Config('myconfig.json') + + @step + def start(self): + pass + ``` + can be used to add an environment decorator to the `start` step. 
+ + Yields + ------ + Tuple[str, ConfigValue] + Iterates over the configurations of the flow + """ + from metaflow.flowspec import _FlowState + + # When configs are parsed, they are loaded in _flow_state[_FlowState.CONFIGS] + for name, value in self._flow_cls._flow_state.get( + _FlowState.CONFIGS, {} + ).items(): + yield name, ConfigValue(value) + + @property + def steps(self) -> Generator[Tuple[str, StepProxy], None, None]: + """ + Iterate over all the steps in this flow + + Yields + ------ + Tuple[str, StepProxy] + A tuple with the step name and the step proxy + """ + for var in dir(self._flow_cls): + potential_step = getattr(self._flow_cls, var) + if callable(potential_step) and hasattr(potential_step, "is_step"): + yield var, StepProxy(potential_step) + + def __getattr__(self, name): + # We allow direct access to the steps, configs and parameters but nothing else + attr = getattr(self._flow_cls, name) + if attr: + # Steps + if callable(attr) and hasattr(attr, "is_step"): + return StepProxy(attr) + if name[0] == "_" or name in self._flow_cls._NON_PARAMETERS: + raise AttributeError(self, name) + return attr + raise AttributeError(self, name) + + +class FlowConfigDecorator: + def __call__(self, flow_spec: "FlowSpec") -> "FlowSpec": + from ..flowspec import _FlowState + + flow_spec._flow_state.setdefault(_FlowState.CONFIG_DECORATORS, []).append(self) + self._flow_cls = flow_spec + return flow_spec + + def evaluate(self, flow_proxy: FlowSpecProxy) -> None: + raise NotImplementedError() + + +class StepConfigDecorator: + def __call__( + self, + step: Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ], + ) -> Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ]: + from ..flowspec import _FlowState + + if not hasattr(step, "is_step"): + raise MetaflowException( + "StepConfigDecorators must be applied to a step function" + ) + self._my_step = step + # Get the flow + flow_spec = 
step.__globals__[step.__qualname__.rsplit(".", 1)[0]] + flow_spec._flow_state.setdefault(_FlowState.CONFIG_DECORATORS, []).append(self) + + self._flow_cls = flow_spec + + return step + + def evaluate(self, step_proxy: StepProxy) -> None: + raise NotImplementedError() diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py new file mode 100644 index 00000000000..2af0cc8d807 --- /dev/null +++ b/metaflow/user_configs/config_options.py @@ -0,0 +1,384 @@ +import json +import os + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +from metaflow._vendor import click + +from .config_parameters import CONFIG_FILE, ConfigValue +from ..exception import MetaflowException, MetaflowInternalError +from ..parameters import DeployTimeField, ParameterContext, current_flow +from ..util import get_username + + +def _load_config_values(info_file: Optional[str] = None) -> Optional[Dict[Any, Any]]: + if info_file is None: + info_file = os.path.basename(CONFIG_FILE) + try: + with open(info_file, encoding="utf-8") as contents: + return json.load(contents).get("user_configs", {}) + except IOError: + return None + + +class ConvertPath(click.Path): + name = "ConvertPath" + + def convert(self, value, param, ctx): + if isinstance(value, str) and value.startswith("converted:"): + return value + v = super().convert(value, param, ctx) + return self.convert_value(v) + + @staticmethod + def convert_value(value): + if value is None: + return None + try: + with open(value, "r", encoding="utf-8") as f: + content = f.read() + except OSError: + return "converted:!!NO_FILE!!%s" % value + return "converted:" + content + + +class ConvertDictOrStr(click.ParamType): + name = "ConvertDictOrStr" + + def convert(self, value, param, ctx): + return self.convert_value(value) + + @staticmethod + def convert_value(value): + if value is None: + return None + + if isinstance(value, dict): + return "converted:" + json.dumps(value) + + if 
value.startswith("converted:"): + return value + + return "converted:" + value + + +class MultipleTuple(click.Tuple): + # Small wrapper around a click.Tuple to allow the environment variable for + # configurations to be a JSON string. Otherwise the default behavior is splitting + # by whitespace which is totally not what we want + # You can now pass multiple configuration options through an environment variable + # using something like: + # METAFLOW_FLOW_CONFIG='{"config1": "filenameforconfig1.json", "config2": {"key1": "value1"}}' + + def split_envvar_value(self, rv): + loaded = json.loads(rv) + return list( + item if isinstance(item, str) else json.dumps(item) + for pair in loaded.items() + for item in pair + ) + + +class ConfigInput: + # ConfigInput is an internal class responsible for processing all the --config and + # --config-value options. + # It gathers information from the --local-config-file (to figure out + # where options are stored) and is also responsible for processing any `--config` or + # `--config-value` options. Note that the process_configs function will be called + # *twice* (once for the configs and another for the config-values). This makes + # this function a little bit more tricky. We need to wait for both calls before + # being able to process anything. + + # It will then store this information in the flow spec for use later in processing. + # It is stored in the flow spec to avoid being global to support the Runner. 
+ + loaded_configs = None # type: Optional[Dict[str, Dict[Any, Any]]] + config_file = None # type: Optional[str] + + def __init__( + self, + req_configs: List[str], + defaults: Dict[str, Tuple[Union[str, Dict[Any, Any]], bool]], + parsers: Dict[str, Callable[[str], Dict[Any, Any]]], + ): + self._req_configs = set(req_configs) + self._defaults = defaults + self._parsers = parsers + self._path_values = None + self._value_values = None + + @staticmethod + def make_key_name(name: str) -> str: + # Special mark to indicate that the configuration value is not content or a file + # name but a value that should be read in the config file (effectively where + # the value has already been materialized). + return "kv." + name.lower() + + @classmethod + def set_config_file(cls, config_file: str): + cls.config_file = config_file + + @classmethod + def get_config(cls, config_name: str) -> Optional[Dict[Any, Any]]: + if cls.loaded_configs is None: + all_configs = _load_config_values(cls.config_file) + if all_configs is None: + raise MetaflowException( + "Could not load expected configuration values " + "from the CONFIG_PARAMETERS file. This is a Metaflow bug. " + "Please contact support." + ) + cls.loaded_configs = all_configs + return cls.loaded_configs.get(config_name, None) + + def process_configs(self, ctx, param, value): + from ..cli import echo_always, echo_dev_null # Prevent circular import + from ..flowspec import _FlowState # Prevent circular import + + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + # This is an error + raise MetaflowInternalError( + "Config values should be processed for a FlowSpec" + ) + + # This function is called by click when processing all the --config and + # --config-value options. + # The value passed in is a list of tuples (name, value). + # Click will provide: + # - all the defaults if nothing is provided on the command line + # - provide *just* the passed in value if anything is provided on the command + # line. 
+ # + # We need to get all config and config-value options and click will call this + # function twice. We will first get all the values on the command line and + # *then* merge with the defaults to form a full set of values. + # We therefore get a full set of values where: + # - the name will correspond to the configuration name + # - the value will be the default (including None if there is no default) or + # the string representation of the value (this will always include + # the "converted:" prefix as it will have gone through the ConvertPath or + # ConvertDictOrStr conversion function). + # A value of None basically means that the config has + # no default and was not specified on the command line. + + print("Got arg name %s and values %s" % (param.name, str(value))) + do_return = self._value_values is None and self._path_values is None + if param.name == "config_value_options": + self._value_values = {k.lower(): v for k, v in value if v is not None} + else: + self._path_values = {k.lower(): v for k, v in value if v is not None} + if do_return: + # One of config_value_options or config_file_options will be None + return None + + # The second go around, we process all the values and merge them. + # Check that the user didn't provide *both* a path and a value. We know that + # defaults are not both non None (this is an error) so if they are both + # non-None (and actual files) here, it means the user explicitly provided both. + common_keys = set(self._value_values or []).intersection( + [ + k + for k, v in self._path_values.items() + if v and not v.startswith("converted:!!NO_FILE!!") + ] + or [] + ) + if common_keys: + raise click.UsageError( + "Cannot provide both a value and a file for the same configuration. " + "Found such values for '%s'" % "', '".join(common_keys) + ) + + # NOTE: Important to start with _path_values as they can have the + # NO_FILE special value. They will be used (and trigger an error) iff there is + # no other value provided. 
+ all_values = dict(self._path_values or {}) + all_values.update(self._value_values or {}) + + print("Got all values: %s" % str(all_values)) + flow_cls._flow_state[_FlowState.CONFIGS] = {} + + to_return = {} + merged_configs = {} + for name, (val, is_path) in self._defaults.items(): + n = name.lower() + if n in all_values: + merged_configs[n] = all_values[n] + else: + if isinstance(val, DeployTimeField): + # This supports a default value that is a deploy-time field (similar + # to Parameter).) + # We will form our own context and pass it down -- note that you cannot + # use configs in the default value of configs as this introduces a bit + # of circularity. Note also that quiet and datastore are *eager* + # options so are available here. + param_ctx = ParameterContext( + flow_name=ctx.obj.flow.name, + user_name=get_username(), + parameter_name=n, + logger=echo_dev_null if ctx.params["quiet"] else echo_always, + ds_type=ctx.params["datastore"], + configs=None, + ) + val = val.fun(param_ctx) + if is_path: + # This is a file path + merged_configs[n] = ConvertPath.convert_value(val) + else: + # This is a value + merged_configs[n] = ConvertDictOrStr.convert_value(val) + + missing_configs = set() + no_file = [] + msgs = [] + for name, val in merged_configs.items(): + if val is None: + missing_configs.add(name) + continue + if val.startswith("converted:!!NO_FILE!!"): + no_file.append(name) + continue + val = val[10:] # Remove the "converted:" prefix + if val.startswith("kv."): + # This means to load it from a file + read_value = self.get_config(val[3:]) + if read_value is None: + raise click.UsageError( + "Could not find configuration '%s' in INFO file" % val + ) + flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value + to_return[name] = ConfigValue(read_value) + else: + if self._parsers[name]: + read_value = self._parsers[name](val) + else: + try: + read_value = json.loads(val) + except json.JSONDecodeError as e: + msgs.append( + "configuration value for '%s' is 
not valid JSON: %s" + % (name, e) + ) + continue + # TODO: Support YAML + flow_cls._flow_state[_FlowState.CONFIGS][name] = read_value + to_return[name] = ConfigValue(read_value) + + reqs = missing_configs.intersection(self._req_configs) + for missing in reqs: + msgs.append("missing configuration for '%s'" % missing) + for missing in no_file: + msgs.append( + "configuration file '%s' could not be read for '%s'" + % (merged_configs[missing][21:], missing) + ) + if msgs: + raise click.UsageError( + "Bad values passed for configuration options: %s" % ", ".join(msgs) + ) + return to_return + + def __str__(self): + return repr(self) + + def __repr__(self): + return "ConfigInput" + + +class LocalFileInput(click.Path): + # Small wrapper around click.Path to set the value from which to read configuration + # values. This is set immediately upon processing the --local-config-file + # option and will therefore then be available when processing any of the other + # --config options (which will call ConfigInput.process_configs + name = "LocalFileInput" + + def convert(self, value, param, ctx): + v = super().convert(value, param, ctx) + ConfigInput.set_config_file(value) + return v + + def __str__(self): + return repr(self) + + def __repr__(self): + return "LocalFileInput" + + +def config_options(cmd): + help_strs = [] + required_names = [] + defaults = {} + config_seen = set() + parsers = {} + flow_cls = getattr(current_flow, "flow_cls", None) + if flow_cls is None: + return cmd + + parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] + # List all the configuration options + for arg in parameters[::-1]: + save_default = arg.kwargs.get("default", None) + kwargs = arg.option_kwargs(False) + if arg.name.lower() in config_seen: + msg = ( + "Multiple configurations use the same name '%s'. Note that names are " + "case-insensitive. 
Please change the " + "names of some of your configurations" % arg.name + ) + raise MetaflowException(msg) + config_seen.add(arg.name.lower()) + if kwargs["required"]: + required_names.append(arg.name) + defaults[arg.name.lower()] = (save_default, arg._default_is_file) + help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) + parsers[arg.name.lower()] = arg.parser + + if not config_seen: + # No configurations -- don't add anything + return cmd + + help_str = ( + "Configuration options for the flow. " + "Multiple configurations can be specified." + ) + help_str = "\n\n".join([help_str] + help_strs) + cb_func = ConfigInput(required_names, defaults, parsers).process_configs + + cmd.params.insert( + 0, + click.Option( + ["--config-value", "config_value_options"], + nargs=2, + multiple=True, + type=MultipleTuple([click.Choice(config_seen), ConvertDictOrStr()]), + callback=cb_func, + help=help_str, + envvar="METAFLOW_FLOW_CONFIG_VALUE", + show_default=False, + default=[ + (k, v[0] if not callable(v[0]) and not v[1] else None) + for k, v in defaults.items() + ], + required=False, + ), + ) + cmd.params.insert( + 0, + click.Option( + ["--config", "config_file_options"], + nargs=2, + multiple=True, + type=MultipleTuple([click.Choice(config_seen), ConvertPath()]), + callback=cb_func, + help=help_str, + envvar="METAFLOW_FLOW_CONFIG", + show_default=False, + default=[ + (k, v[0] if not callable(v) and v[1] else None) + for k, v in defaults.items() + ], + required=False, + ), + ) + return cmd diff --git a/metaflow/user_configs/config_parameters.py b/metaflow/user_configs/config_parameters.py new file mode 100644 index 00000000000..4a312487829 --- /dev/null +++ b/metaflow/user_configs/config_parameters.py @@ -0,0 +1,350 @@ +import collections.abc +import json +import os +import re + +from typing import Any, Callable, Dict, Optional, TYPE_CHECKING, Union + + +from ..exception import MetaflowException + +from ..parameters import ( + Parameter, + 
ParameterContext,
+    current_flow,
+)
+
+if TYPE_CHECKING:
+    from metaflow import FlowSpec
+
+# _tracefunc_depth = 0
+
+
+# def tracefunc(func):
+#     """Decorates a function to show its trace."""

+#     @functools.wraps(func)
+#     def tracefunc_closure(*args, **kwargs):
+#         global _tracefunc_depth
+#         """The closure."""
+#         print(f"{_tracefunc_depth}: {func.__name__}(args={args}, kwargs={kwargs})")
+#         _tracefunc_depth += 1
+#         result = func(*args, **kwargs)
+#         _tracefunc_depth -= 1
+#         print(f"{_tracefunc_depth} => {result}")
+#         return result

+#     return tracefunc_closure
+
+CONFIG_FILE = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "CONFIG_PARAMETERS"
+)
+
+ID_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
+
+
+def dump_config_values(flow: "FlowSpec"):
+    from ..flowspec import _FlowState  # Prevent circular import
+
+    configs = flow._flow_state.get(_FlowState.CONFIGS)
+    if configs:
+        return {"user_configs": configs}
+    return {}
+
+
+class ConfigValue(collections.abc.Mapping):
+    """
+    ConfigValue is a thin wrapper around an arbitrarily nested dictionary-like
+    configuration object. It allows you to access elements of this nested structure
+    using either a "." notation or a [] notation. As an example, if your configuration
+    object is:
+    {"foo": {"bar": 42}}
+    you can access the value 42 using either config["foo"]["bar"] or config.foo.bar.
+
+    All "keys" need to be valid Python identifiers
+    """
+
+    # Thin wrapper to allow configuration values to be accessed using a "." notation
+    # as well as a [] notation.
+ + def __init__(self, data: Dict[str, Any]): + if any(not ID_PATTERN.match(k) for k in data.keys()): + raise MetaflowException( + "All keys in the configuration must be valid Python identifiers" + ) + self._data = data + + def __getattr__(self, key: str) -> Any: + """ + Access an element of this configuration + + Parameters + ---------- + key : str + Element to access + + Returns + ------- + Any + Element of the configuration + """ + if key == "_data": + # Called during unpickling. Special case to not run into infinite loop + # below. + raise AttributeError(key) + + if key in self._data: + return self[key] + raise AttributeError(key) + + def __setattr__(self, name: str, value: Any) -> None: + # Prevent configuration modification + if name == "_data": + return super().__setattr__(name, value) + raise TypeError("ConfigValue is immutable") + + def __getitem__(self, key: Any) -> Any: + """ + Access an element of this configuration + + Parameters + ---------- + key : Any + Element to access + + Returns + ------- + Any + Element of the configuration + """ + value = self._data[key] + if isinstance(value, dict): + value = ConfigValue(value) + return value + + def __len__(self): + return len(self._data) + + def __iter__(self): + return iter(self._data) + + def __repr__(self): + return repr(self._data) + + def __str__(self): + return json.dumps(self._data) + + def to_dict(self) -> Dict[Any, Any]: + """ + Returns a dictionary representation of this configuration object. + + Returns + ------- + Dict[Any, Any] + Dictionary equivalent of this configuration object. + """ + return dict(self._data) + + +class DelayEvaluator(collections.abc.Mapping): + """ + Small wrapper that allows the evaluation of a Config() value in a delayed manner. + This is used when we want to use config.* values in decorators for example. 
+
+    It also allows the following "delayed" access on an obj that is a DelayEvaluator
+    - obj.x.y.z (ie: accessing members of DelayEvaluator; accesses will be delayed until
+      the DelayEvaluator is evaluated)
+    - **obj (ie: unpacking the DelayEvaluator as a dictionary). Note that this requires
+      special handling in whatever this is being unpacked into, specifically the handling
+      of _unpacked_delayed_*
+    """
+
+    def __init__(self, ex: str):
+        self._config_expr = ex
+        if ID_PATTERN.match(self._config_expr):
+            # This is a variable only so allow things like config_expr("config").var
+            self._is_var_only = True
+            self._access = []
+        else:
+            self._is_var_only = False
+            self._access = None
+
+    def __iter__(self):
+        yield "_unpacked_delayed_%d" % id(self)
+
+    def __getitem__(self, key):
+        if key == "_unpacked_delayed_%d" % id(self):
+            return self
+        raise KeyError(key)
+
+    def __len__(self):
+        return 1
+
+    def __getattr__(self, name):
+        if self._access is None:
+            raise AttributeError()
+        self._access.append(name)
+        return self
+
+    def __call__(self, ctx=None, deploy_time=False):
+        from ..flowspec import _FlowState  # Prevent circular import
+
+        # Two additional arguments are only used by DeployTimeField which will call
+        # this function with those two additional arguments. They are ignored.
+        flow_cls = getattr(current_flow, "flow_cls", None)
+        if flow_cls is None:
+            # We are not executing inside a flow (ie: not the CLI)
+            raise MetaflowException(
+                "Config object can only be used directly in the FlowSpec defining them. "
+                "If using outside of the FlowSpec, please use ConfigEval"
+            )
+        if self._access is not None:
+            # Build the final expression by adding all the fields in access as .
fields + self._config_expr = ".".join([self._config_expr] + self._access) + # Evaluate the expression setting the config values as local variables + try: + return eval( + self._config_expr, + globals(), + { + k: ConfigValue(v) + for k, v in flow_cls._flow_state.get(_FlowState.CONFIGS, {}).items() + }, + ) + except NameError as e: + potential_config_name = self._config_expr.split(".")[0] + if potential_config_name not in flow_cls._flow_state.get( + _FlowState.CONFIGS, {} + ): + raise MetaflowException( + "Config '%s' not found in the flow (maybe not required and not " + "provided?)" % potential_config_name + ) from e + raise + + +def config_expr(expr: str) -> DelayEvaluator: + """ + Function to allow you to use an expression involving a config parameter in + places where it may not be directory accessible or if you want a more complicated + expression than just a single variable. + + You can use it as follows: + - When the config is not directly accessible: + + @project(name=config_expr("config").project.name) + class MyFlow(FlowSpec): + config = Config("config") + ... + - When you want a more complex expression: + class MyFlow(FlowSpec): + config = Config("config") + + @environment(vars={"foo": config_expr("config.bar.baz.lower()")}) + @step + def start(self): + ... + + Parameters + ---------- + expr : str + Expression using the config values. + """ + return DelayEvaluator(expr) + + +class Config(Parameter, collections.abc.Mapping): + """ + Includes a configuration for this flow. + + `Config` is a special type of `Parameter` but differs in a few key areas: + - it is immutable and determined at deploy time (or prior to running if not deploying + to a scheduler) + - as such, it can be used anywhere in your code including in Metaflow decorators + + The value of the configuration is determines as follows: + - use the user-provided file path or value. 
It is an error to provide both + - if none are present: + - if a default file path (default) is provided, attempt to read this file + - if the file is present, use that value. Note that the file will be used + even if it has an invalid syntax + - if the file is not present, and a default value is present, use that + - if still None and is required, this is an error. + + Parameters + ---------- + name : str + User-visible configuration name. + default : Union[str, Callable[[ParameterContext], str], optional, default None + Default path from where to read this configuration. A function implies that the + value will be computed using that function. + You can only specify default or default_value. + default_value : Union[str, Dict[str, Any], Callable[[ParameterContext, Union[str, Dict[str, Any]]], Any], optional, default None + Default value for the parameter. A function + implies that the value will be computed using that function. + You can only specify default or default_value. + help : str, optional, default None + Help text to show in `run --help`. + required : bool, default False + Require that the user specified a value for the parameter. Note that if + a default is provided, the required flag is ignored. + parser : Callable[[str], Dict[Any, Any]], optional, default None + An optional function that can parse the configuration string into an arbitrarily + nested dictionary. + show_default : bool, default True + If True, show the default value in the help text. 
+ """ + + IS_FLOW_PARAMETER = True + + def __init__( + self, + name: str, + default: Optional[Union[str, Callable[[ParameterContext], str]]] = None, + default_value: Optional[ + Union[ + str, + Dict[str, Any], + Callable[[ParameterContext], Union[str, Dict[str, Any]]], + ] + ] = None, + help: Optional[str] = None, + required: bool = False, + parser: Optional[Callable[[str], Dict[Any, Any]]] = None, + **kwargs: Dict[str, str] + ): + + if default and default_value: + raise MetaflowException( + "For config '%s', you can only specify default or default_value, not both" + % name + ) + self._default_is_file = default is not None + default = default or default_value + + super(Config, self).__init__( + name, default=default, required=required, help=help, type=str, **kwargs + ) + + if isinstance(kwargs.get("default", None), str): + kwargs["default"] = json.dumps(kwargs["default"]) + self.parser = parser + + self._delay_self = DelayEvaluator(name.lower()) + + def load_parameter(self, v): + return v + + # Support . 
syntax + def __getattr__(self, name): + return self._delay_self.__getattr__(name) + + # Next three methods are to implement mapping to support ** syntax + def __iter__(self): + return iter(self._delay_self) + + def __len__(self): + return len(self._delay_self) + + def __getitem__(self, key): + return self._delay_self[key] diff --git a/metaflow/util.py b/metaflow/util.py index 6695b987d2b..445c5a8d096 100644 --- a/metaflow/util.py +++ b/metaflow/util.py @@ -297,7 +297,7 @@ def get_metaflow_root(): def dict_to_cli_options(params): # Prevent circular imports - from metaflow.config_parameters import ConfigInput, ConfigValue + from .user_configs.config_options import ConfigInput for k, v in params.items(): # Omit boolean options set to false or None, but preserve options with an empty @@ -307,11 +307,13 @@ def dict_to_cli_options(params): # keyword in Python, so we call it 'decospecs' in click args if k == "decospecs": k = "with" - if k == "config_options": + if k in ("config_file_options", "config_value_options"): # Special handling here since we gather them all in one option but actually - # need to send them one at a time using --config kv. + # need to send them one at a time using --config-value kv. + # Note it can be either config_file_options or config_value_options depending + # on click processing order. 
for config_name in v.keys(): - yield "--config" + yield "--config-value" yield to_unicode(config_name) yield to_unicode(ConfigInput.make_key_name(config_name)) continue diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index 096afe78ebb..f9e39d9e983 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -85,7 +85,7 @@ def _flow_lines(self): tags.extend(tag.split("(")[0] for tag in step.tags) yield 0, "# -*- coding: utf-8 -*-" - yield 0, "from metaflow import Config, config_expr, eval_config, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" + yield 0, "from metaflow import Config, config_expr, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" yield 0, "from metaflow_test import assert_equals, assert_equals_metadata, assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, TestRetry, try_to_get_card" if tags: yield 0, "from metaflow import %s" % ",".join(tags) diff --git a/test_config/helloconfig.py b/test_config/helloconfig.py index be8246cc6b2..897d3167204 100644 --- a/test_config/helloconfig.py +++ b/test_config/helloconfig.py @@ -8,7 +8,8 @@ step, project, config_expr, - eval_config, + FlowConfigDecorator, + StepConfigDecorator, titus, ) @@ -37,26 +38,21 @@ def config_func(ctx): silly_config = "baz:awesome" -def titus_or_not(flow): - to_replace = [] - for name, s in flow.steps: - if name in flow.config.run_on_titus: - to_replace.append((name, titus(cpu=flow.config.cpu_count)(s))) - for name, val in to_replace: - setattr(flow, name, val) - return flow +class TitusOrNot(FlowConfigDecorator): + def evaluate(self, flow_proxy): + for name, s in flow_proxy.steps: + if name in flow_proxy.config.run_on_titus: + s.add_decorator(titus, cpu=flow_proxy.config.cpu_count) -def add_env_to_start(flow): - # Add a decorator directly to a step - flow.start = environment(vars={"hello": config_expr("config").env_to_start})( - flow.start - ) 
- return flow +class AddEnvToStart(FlowConfigDecorator): + def evaluate(self, flow_proxy): + s = flow_proxy.start + s.add_decorator(environment, vars={"hello": flow_proxy.config.env_to_start}) -@eval_config(titus_or_not) -@add_env_to_start +@TitusOrNot() +@AddEnvToStart() @project(name=config_expr("config").project_name) class HelloConfig(FlowSpec): """ @@ -72,7 +68,7 @@ class HelloConfig(FlowSpec): default_from_func = Parameter("default_from_func", default=param_func, type=int) - config = Config("config", default=default_config, help="Help for config") + config = Config("config", default_value=default_config, help="Help for config") sconfig = Config( "sconfig", default="sillyconfig.txt", @@ -80,9 +76,11 @@ class HelloConfig(FlowSpec): help="Help for sconfig", required=True, ) - config2 = Config("config2", required=True) + config2 = Config("config2") + + config3 = Config("config3", default_value=config_func) - config3 = Config("config3", default=config_func) + env_config = Config("env_config", default_value={"vars": {"name": "Romain"}}) @step def start(self): @@ -125,6 +123,7 @@ def hello(self): ) self.next(self.end) + @environment(**env_config) @step def end(self): """ @@ -132,7 +131,7 @@ def end(self): last step in the flow. 
""" - print("HelloFlow is all done") + print("HelloFlow is all done for %s" % os.environ.get("name")) if __name__ == "__main__": From 64b4b6985e9a33e358774fd0586f8c63d6e8a545 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 16 Oct 2024 13:40:48 -0700 Subject: [PATCH 13/24] Better handling of default options --- metaflow/user_configs/config_options.py | 141 +++++++++++++++++------- 1 file changed, 101 insertions(+), 40 deletions(-) diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py index 2af0cc8d807..8a24905c137 100644 --- a/metaflow/user_configs/config_options.py +++ b/metaflow/user_configs/config_options.py @@ -11,6 +11,15 @@ from ..util import get_username +_CONVERT_PREFIX = "@!c!@:" +_DEFAULT_PREFIX = "@!d!@:" +_NO_FILE = "@!n!@" + +_CONVERTED_DEFAULT = _CONVERT_PREFIX + _DEFAULT_PREFIX +_CONVERTED_NO_FILE = _CONVERT_PREFIX + _NO_FILE +_CONVERTED_DEFAULT_NO_FILE = _CONVERTED_DEFAULT + _NO_FILE + + def _load_config_values(info_file: Optional[str] = None) -> Optional[Dict[Any, Any]]: if info_file is None: info_file = os.path.basename(CONFIG_FILE) @@ -25,41 +34,67 @@ class ConvertPath(click.Path): name = "ConvertPath" def convert(self, value, param, ctx): - if isinstance(value, str) and value.startswith("converted:"): + if isinstance(value, str) and value.startswith(_CONVERT_PREFIX): return value - v = super().convert(value, param, ctx) - return self.convert_value(v) + is_default = False + if value and value.startswith(_DEFAULT_PREFIX): + is_default = True + value = super().convert(value[len(_DEFAULT_PREFIX) :], param, ctx) + return self.convert_value(value, is_default) @staticmethod - def convert_value(value): + def mark_as_default(value): + if value is None: + return None + return _DEFAULT_PREFIX + str(value) + + @staticmethod + def convert_value(value, is_default): + default_str = _DEFAULT_PREFIX if is_default else "" if value is None: return None try: with open(value, "r", encoding="utf-8") as f: content = 
f.read() except OSError: - return "converted:!!NO_FILE!!%s" % value - return "converted:" + content + return _CONVERT_PREFIX + default_str + _NO_FILE + value + return _CONVERT_PREFIX + default_str + content class ConvertDictOrStr(click.ParamType): name = "ConvertDictOrStr" def convert(self, value, param, ctx): - return self.convert_value(value) + is_default = False + if isinstance(value, str): + if value.startswith(_CONVERT_PREFIX): + return value + if value.startswith(_DEFAULT_PREFIX): + is_default = True + + return self.convert_value(value, is_default) @staticmethod - def convert_value(value): + def convert_value(value, is_default): + default_str = _DEFAULT_PREFIX if is_default else "" if value is None: return None if isinstance(value, dict): - return "converted:" + json.dumps(value) + return _CONVERT_PREFIX + default_str + json.dumps(value) - if value.startswith("converted:"): + if value.startswith(_CONVERT_PREFIX): return value - return "converted:" + value + return _CONVERT_PREFIX + default_str + value + + @staticmethod + def mark_as_default(value): + if value is None: + return None + if isinstance(value, dict): + return _DEFAULT_PREFIX + json.dumps(value) + return _DEFAULT_PREFIX + str(value) class MultipleTuple(click.Tuple): @@ -155,34 +190,37 @@ def process_configs(self, ctx, param, value): # *then* merge with the defaults to form a full set of values. # We therefore get a full set of values where: # - the name will correspond to the configuration name - # - the value will be the default (including None if there is no default) or - # the string representation of the value (this will always include - # the "converted:" prefix as it will have gone through the ConvertPath or - # ConvertDictOrStr conversion function). - # A value of None basically means that the config has - # no default and was not specified on the command line. + # - the value will be: + # - the default (including None if there is no default). 
If the default is + # not None, it will start with _CONVERTED_DEFAULT since Click will make + # the value go through ConvertPath or ConvertDictOrStr + # - the actual value passed through prefixed with _CONVERT_PREFIX print("Got arg name %s and values %s" % (param.name, str(value))) do_return = self._value_values is None and self._path_values is None + # We only keep around non default values. We could simplify by checking just one + # value and if it is default it means all are but this doesn't seem much more effort + # and is clearer if param.name == "config_value_options": - self._value_values = {k.lower(): v for k, v in value if v is not None} + self._value_values = { + k.lower(): v + for k, v in value + if v is not None and not v.startswith(_CONVERTED_DEFAULT) + } else: - self._path_values = {k.lower(): v for k, v in value if v is not None} + self._path_values = { + k.lower(): v + for k, v in value + if v is not None and not v.startswith(_CONVERTED_DEFAULT) + } if do_return: # One of config_value_options or config_file_options will be None return None # The second go around, we process all the values and merge them. - # Check that the user didn't provide *both* a path and a value. We know that - # defaults are not both non None (this is an error) so if they are both - # non-None (and actual files) here, it means the user explicitly provided both. + # Check that the user didn't provide *both* a path and a value. common_keys = set(self._value_values or []).intersection( - [ - k - for k, v in self._path_values.items() - if v and not v.startswith("converted:!!NO_FILE!!") - ] - or [] + [k for k, v in self._path_values.items()] or [] ) if common_keys: raise click.UsageError( @@ -190,9 +228,6 @@ def process_configs(self, ctx, param, value): "Found such values for '%s'" % "', '".join(common_keys) ) - # NOTE: Important to start with _path_values as they can have the - # NO_FILE special value. 
They will be used (and trigger an error) iff there is - # no other value provided. all_values = dict(self._path_values or {}) all_values.update(self._value_values or {}) @@ -224,22 +259,26 @@ def process_configs(self, ctx, param, value): val = val.fun(param_ctx) if is_path: # This is a file path - merged_configs[n] = ConvertPath.convert_value(val) + merged_configs[n] = ConvertPath.convert_value(val, False) else: # This is a value - merged_configs[n] = ConvertDictOrStr.convert_value(val) + merged_configs[n] = ConvertDictOrStr.convert_value(val, False) missing_configs = set() no_file = [] + no_default_file = [] msgs = [] for name, val in merged_configs.items(): if val is None: missing_configs.add(name) continue - if val.startswith("converted:!!NO_FILE!!"): + if val.startswith(_CONVERTED_NO_FILE): no_file.append(name) continue - val = val[10:] # Remove the "converted:" prefix + if val.startswith(_CONVERTED_DEFAULT_NO_FILE): + no_default_file.append(name) + continue + val = val[len(_CONVERT_PREFIX) :] # Remove the _CONVERT_PREFIX if val.startswith("kv."): # This means to load it from a file read_value = self.get_config(val[3:]) @@ -271,7 +310,12 @@ def process_configs(self, ctx, param, value): for missing in no_file: msgs.append( "configuration file '%s' could not be read for '%s'" - % (merged_configs[missing][21:], missing) + % (merged_configs[missing][len(_CONVERTED_NO_FILE) :], missing) + ) + for missing in no_default_file: + msgs.append( + "default configuration file '%s' could not be read for '%s'" + % (merged_configs[missing][len(_CONVERTED_DEFAULT_NO_FILE) :], missing) ) if msgs: raise click.UsageError( @@ -318,7 +362,6 @@ def config_options(cmd): parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER] # List all the configuration options for arg in parameters[::-1]: - save_default = arg.kwargs.get("default", None) kwargs = arg.option_kwargs(False) if arg.name.lower() in config_seen: msg = ( @@ -330,7 +373,11 @@ def config_options(cmd): 
config_seen.add(arg.name.lower()) if kwargs["required"]: required_names.append(arg.name) - defaults[arg.name.lower()] = (save_default, arg._default_is_file) + + defaults[arg.name.lower()] = ( + arg.kwargs.get("default", None), + arg._default_is_file, + ) help_strs.append(" - %s: %s" % (arg.name.lower(), kwargs.get("help", ""))) parsers[arg.name.lower()] = arg.parser @@ -357,7 +404,14 @@ def config_options(cmd): envvar="METAFLOW_FLOW_CONFIG_VALUE", show_default=False, default=[ - (k, v[0] if not callable(v[0]) and not v[1] else None) + ( + k, + ( + ConvertDictOrStr.mark_as_default(v[0]) + if not callable(v[0]) and not v[1] + else None + ), + ) for k, v in defaults.items() ], required=False, @@ -375,7 +429,14 @@ def config_options(cmd): envvar="METAFLOW_FLOW_CONFIG", show_default=False, default=[ - (k, v[0] if not callable(v) and v[1] else None) + ( + k, + ( + ConvertPath.mark_as_default(v[0]) + if not callable(v) and v[1] + else None + ), + ) for k, v in defaults.items() ], required=False, From ba3578babc2376605855b67747ce0f3fb1e79966 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 17 Oct 2024 00:22:16 -0700 Subject: [PATCH 14/24] Changed names --- metaflow/__init__.py | 2 +- metaflow/flowspec.py | 20 +++++++-------- metaflow/user_configs/config_decorators.py | 30 +++++++++++----------- test_config/helloconfig.py | 8 +++--- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/metaflow/__init__.py b/metaflow/__init__.py index 2ce16d0b909..b4f0d8d2253 100644 --- a/metaflow/__init__.py +++ b/metaflow/__init__.py @@ -104,7 +104,7 @@ class and related decorators. 
from .parameters import Parameter, JSONTypeClass, JSONType from .user_configs.config_parameters import Config, config_expr -from .user_configs.config_decorators import FlowConfigDecorator, StepConfigDecorator +from .user_configs.config_decorators import CustomFlowDecorator, CustomStepDecorator # data layer # For historical reasons, we make metaflow.plugins.datatools accessible as diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index e819450a019..32c854104c9 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -23,10 +23,10 @@ from .graph import FlowGraph from .unbounded_foreach import UnboundedForeachInput from .user_configs.config_decorators import ( - FlowConfigDecorator, - FlowSpecProxy, - StepConfigDecorator, - StepProxy, + CustomFlowDecorator, + CustomStepDecorator, + MutableFlow, + MutableStep, ) from .util import to_pod from .metaflow_config import INCLUDE_FOREACH_STACK, MAXIMUM_FOREACH_VALUE_CHARS @@ -207,27 +207,27 @@ def _process_config_decorators(self, config_options): # Run all the decorators for deco in self._flow_state[_FlowState.CONFIG_DECORATORS]: - if isinstance(deco, FlowConfigDecorator): + if isinstance(deco, CustomFlowDecorator): # Sanity check to make sure we are applying the decorator to the right # class if not deco._flow_cls == current_cls and not issubclass( current_cls, deco._flow_cls ): raise MetaflowInternalError( - "FlowConfigDecorator registered on the wrong flow -- " + "CustomFlowDecorator registered on the wrong flow -- " "expected %s but got %s" % (deco._flow_cls.__name__, current_cls.__name__) ) - deco.evaluate(FlowSpecProxy(current_cls)) - elif isinstance(deco, StepConfigDecorator): + deco.evaluate(MutableFlow(current_cls)) + elif isinstance(deco, CustomStepDecorator): # Again some sanity checks if deco._flow_cls != current_cls: raise MetaflowInternalError( - "StepConfigDecorator registered on the wrong flow -- " + "CustomStepDecorator registered on the wrong flow -- " "expected %s but got %s" % 
(deco._flow_cls.__name__, current_cls.__name__) ) - deco.evaluate(StepConfigDecorator(deco._my_step)) + deco.evaluate(CustomStepDecorator(deco._my_step)) # Reset all configs that were already present in the class. # TODO: This means that users can't override configs directly. Not sure if this diff --git a/metaflow/user_configs/config_decorators.py b/metaflow/user_configs/config_decorators.py index 16e9476e08e..58be6bc9c42 100644 --- a/metaflow/user_configs/config_decorators.py +++ b/metaflow/user_configs/config_decorators.py @@ -10,9 +10,9 @@ from metaflow.decorators import FlowSpecDerived, StepDecorator -class StepProxy: +class MutableStep: """ - A StepProxy is a wrapper passed to the `StepConfigDecorator`'s `evaluate` method + A MutableStep is a wrapper passed to the `CustomStepDecorator`'s `evaluate` method to allow the decorator to interact with the step and providing easy methods to modify the behavior of the step. """ @@ -102,7 +102,7 @@ def add_decorator(self, deco_type: partial, **kwargs) -> None: ) -class FlowSpecProxy: +class MutableFlow: def __init__(self, flow_spec: "FlowSpec"): self._flow_cls = flow_spec @@ -112,11 +112,11 @@ def configs(self) -> Generator[Tuple[str, ConfigValue], None, None]: Iterate over all user configurations in this flow Use this to parameterize your flow based on configuration. As an example, the - `evaluate` method of your `FlowConfigDecorator` can use this to add an + `evaluate` method of your `CustomFlowDecorator` can use this to add an environment decorator. 
``` - class MyDecorator(FlowConfigDecorator): - def evaluate(flow: FlowSpecProxy): + class MyDecorator(CustomFlowDecorator): + def evaluate(flow: MutableFlow): val = next(flow.configs)[1].steps.start.cpu flow.start.add_decorator(environment, vars={'mycpu': val}) return flow @@ -145,19 +145,19 @@ def start(self): yield name, ConfigValue(value) @property - def steps(self) -> Generator[Tuple[str, StepProxy], None, None]: + def steps(self) -> Generator[Tuple[str, MutableStep], None, None]: """ Iterate over all the steps in this flow Yields ------ - Tuple[str, StepProxy] + Tuple[str, MutableStep] A tuple with the step name and the step proxy """ for var in dir(self._flow_cls): potential_step = getattr(self._flow_cls, var) if callable(potential_step) and hasattr(potential_step, "is_step"): - yield var, StepProxy(potential_step) + yield var, MutableStep(potential_step) def __getattr__(self, name): # We allow direct access to the steps, configs and parameters but nothing else @@ -165,14 +165,14 @@ def __getattr__(self, name): if attr: # Steps if callable(attr) and hasattr(attr, "is_step"): - return StepProxy(attr) + return MutableStep(attr) if name[0] == "_" or name in self._flow_cls._NON_PARAMETERS: raise AttributeError(self, name) return attr raise AttributeError(self, name) -class FlowConfigDecorator: +class CustomFlowDecorator: def __call__(self, flow_spec: "FlowSpec") -> "FlowSpec": from ..flowspec import _FlowState @@ -180,11 +180,11 @@ def __call__(self, flow_spec: "FlowSpec") -> "FlowSpec": self._flow_cls = flow_spec return flow_spec - def evaluate(self, flow_proxy: FlowSpecProxy) -> None: + def evaluate(self, mutable_flow: MutableFlow) -> None: raise NotImplementedError() -class StepConfigDecorator: +class CustomStepDecorator: def __call__( self, step: Union[ @@ -199,7 +199,7 @@ def __call__( if not hasattr(step, "is_step"): raise MetaflowException( - "StepConfigDecorators must be applied to a step function" + "CustomStepDecorator must be applied to a step 
function" ) self._my_step = step # Get the flow @@ -210,5 +210,5 @@ def __call__( return step - def evaluate(self, step_proxy: StepProxy) -> None: + def evaluate(self, mutable_step: MutableStep) -> None: raise NotImplementedError() diff --git a/test_config/helloconfig.py b/test_config/helloconfig.py index 897d3167204..c357685a837 100644 --- a/test_config/helloconfig.py +++ b/test_config/helloconfig.py @@ -8,8 +8,8 @@ step, project, config_expr, - FlowConfigDecorator, - StepConfigDecorator, + CustomFlowDecorator, + CustomStepDecorator, titus, ) @@ -38,14 +38,14 @@ def config_func(ctx): silly_config = "baz:awesome" -class TitusOrNot(FlowConfigDecorator): +class TitusOrNot(CustomFlowDecorator): def evaluate(self, flow_proxy): for name, s in flow_proxy.steps: if name in flow_proxy.config.run_on_titus: s.add_decorator(titus, cpu=flow_proxy.config.cpu_count) -class AddEnvToStart(FlowConfigDecorator): +class AddEnvToStart(CustomFlowDecorator): def evaluate(self, flow_proxy): s = flow_proxy.start s.add_decorator(environment, vars={"hello": flow_proxy.config.env_to_start}) From 48dcde1aab292ff058327739a1b3f71e169e4941 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 17 Oct 2024 13:13:17 -0700 Subject: [PATCH 15/24] Fix includefile --- metaflow/includefile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metaflow/includefile.py b/metaflow/includefile.py index c81a701dfb3..d599de3f1a8 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -137,7 +137,7 @@ def convert(self, value, param, ctx): parameter_name=param.name, logger=ctx.obj.echo, ds_type=ctx.obj.datastore_impl.TYPE, - configs=ConfigValue(dict(ctx.obj.flow.configs)), + configs=None, ) if len(value) > 0 and (value.startswith("{") or value.startswith('"{')): From 0a8fb13f9d2f39aa6510bcece434284a97ba0f75 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Thu, 17 Oct 2024 17:54:27 -0700 Subject: [PATCH 16/24] Remove more old code --- metaflow/flowspec.py | 2 -- 
metaflow/parameters.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 32c854104c9..0c8f3aab7bd 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -344,8 +344,6 @@ def __iter__(self): return iter(self._steps) def __getattr__(self, name: str): - if name in ("configs", "steps"): - return getattr(self.__class__, name) if self._datastore and name in self._datastore: # load the attribute from the datastore... x = self._datastore[name] diff --git a/metaflow/parameters.py b/metaflow/parameters.py index d1413f47247..d37d1802501 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -35,7 +35,7 @@ ("parameter_name", str), ("logger", Callable[..., None]), ("ds_type", str), - ("configs", "ConfigValue"), + ("configs", Optional["ConfigValue"]), ], ) From 18e40eb72ef979b90570352f5d242ec74812a84b Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 23 Oct 2024 00:52:30 -0700 Subject: [PATCH 17/24] Bug fixes and better Custom*Decorator behavior. Fixed some typos and updated test to reflect latest code. 
Fixed a few other issues: - fixed an issue where a config was used in different decorators causing it to produce an incorrect access string - made the decorators work with or without arguments --- metaflow/runner/click_api.py | 2 +- metaflow/user_configs/config_decorators.py | 109 +++++++++++++++++++-- metaflow/user_configs/config_options.py | 8 +- metaflow/user_configs/config_parameters.py | 15 ++- test/core/metaflow_test/formatter.py | 12 ++- test/core/tests/basic_config_parameters.py | 6 +- test_config/helloconfig.py | 18 ++-- 7 files changed, 133 insertions(+), 37 deletions(-) diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 96faa235281..2d946fe7133 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -39,7 +39,7 @@ from metaflow.exception import MetaflowException from metaflow.includefile import FilePathClass from metaflow.parameters import JSONTypeClass, flow_context -from metaflow.config_options import LocalFileInput +from metaflow.user_configs.config_options import LocalFileInput # Define a recursive type alias for JSON JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None] diff --git a/metaflow/user_configs/config_decorators.py b/metaflow/user_configs/config_decorators.py index 58be6bc9c42..2e5202406ce 100644 --- a/metaflow/user_configs/config_decorators.py +++ b/metaflow/user_configs/config_decorators.py @@ -1,13 +1,13 @@ from functools import partial -from typing import Any, Callable, Generator, TYPE_CHECKING, Tuple, Union +from typing import Any, Callable, Generator, Optional, TYPE_CHECKING, Tuple, Union from metaflow.exception import MetaflowException from metaflow.parameters import Parameter from metaflow.user_configs.config_parameters import ConfigValue if TYPE_CHECKING: - from metaflow.flowspec import FlowSpec - from metaflow.decorators import FlowSpecDerived, StepDecorator + from metaflow.flowspec import _FlowSpecMeta + from metaflow.decorators import FlowSpecDerived class 
MutableStep: @@ -173,19 +173,105 @@ def __getattr__(self, name): class CustomFlowDecorator: - def __call__(self, flow_spec: "FlowSpec") -> "FlowSpec": + def __init__(self, *args, **kwargs): + from ..flowspec import FlowSpec, _FlowSpecMeta + + if args and isinstance(args[0], (CustomFlowDecorator, _FlowSpecMeta)): + # This means the decorator is bare like @MyDecorator + # and the first argument is the FlowSpec or another decorator (they + # can be stacked) + if isinstance(args[0], _FlowSpecMeta): + self._set_flow_cls(args[0]) + else: + self._set_flow_cls(args[0]._flow_cls) + else: + # The arguments are actually passed to the init function for this decorator + self._args = args + self._kwargs = kwargs + + def __call__(self, flow_spec: Optional["_FlowSpecMeta"] = None) -> "_FlowSpecMeta": + if flow_spec: + # This is the case of a decorator @MyDecorator(foo=1, bar=2) and so + # we already called __init__ and saved foo and bar and are now calling + # this on the flow itself. + self.init(*self._args, **self._kwargs) + return self._set_flow_cls(flow_spec) + elif not self._flow_cls: + # This means that somehow the initialization did not happen properly + # so this may have been applied to a non flow + raise MetaflowException( + "A CustomFlowDecorator can only be applied to a FlowSpec" + ) + return self._flow_cls() + + def _set_flow_cls(self, flow_spec: "_FlowSpecMeta") -> "_FlowSpecMeta": from ..flowspec import _FlowState flow_spec._flow_state.setdefault(_FlowState.CONFIG_DECORATORS, []).append(self) self._flow_cls = flow_spec return flow_spec + def init(self, *args, **kwargs): + """ + This method is intended to be optionally overridden if you need to + have an initializer. + + Raises + ------ + NotImplementedError: If the method is not overridden in a subclass. 
+ """ + raise NotImplementedError() + def evaluate(self, mutable_flow: MutableFlow) -> None: raise NotImplementedError() class CustomStepDecorator: + def __init__(self, *args, **kwargs): + if args and ( + isinstance(args[0], CustomStepDecorator) + or callable(args[0]) + and hasattr(args[0], "is_step") + ): + # This means the decorator is bare like @MyDecorator + # and the first argument is the step or another decorator (they + # can be stacked) + if isinstance(args[0], CustomStepDecorator): + self._set_my_step(args[0]._my_step) + else: + self._set_my_step(args[0]) + else: + # The arguments are actually passed to the init function for this decorator + self._args = args + self._kwargs = kwargs + def __call__( + self, + step: Optional[ + Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ] + ] = None, + ) -> Union[ + Callable[["FlowSpecDerived"], None], + Callable[["FlowSpecDerived", Any], None], + ]: + if step: + # This is the case of a decorator @MyDecorator(foo=1, bar=2) and so + # we already called __init__ and saved foo and bar and are now calling + # this on the step itself. 
+ self.init(*self._args, **self._kwargs) + return self._set_my_step(step) + elif not self._my_step: + # This means that somehow the initialization did not happen properly + # so this may have been applied to a non step + raise MetaflowException( + "A CustomStepDecorator can only be applied to a step function" + ) + return self._my_step + + def _set_my_step( self, step: Union[ Callable[["FlowSpecDerived"], None], @@ -197,10 +283,6 @@ def __call__( ]: from ..flowspec import _FlowState - if not hasattr(step, "is_step"): - raise MetaflowException( - "CustomStepDecorator must be applied to a step function" - ) self._my_step = step # Get the flow flow_spec = step.__globals__[step.__qualname__.rsplit(".", 1)[0]] @@ -208,7 +290,16 @@ def __call__( self._flow_cls = flow_spec - return step + def init(self, *args, **kwargs): + """ + This method is intended to be optionally overridden if you need to + have an initializer. + + Raises + ------ + NotImplementedError: If the method is not overridden in a subclass. + """ + raise NotImplementedError() def evaluate(self, mutable_step: MutableStep) -> None: raise NotImplementedError() diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py index 8a24905c137..21cf2fcfaec 100644 --- a/metaflow/user_configs/config_options.py +++ b/metaflow/user_configs/config_options.py @@ -196,7 +196,7 @@ def process_configs(self, ctx, param, value): # the value go through ConvertPath or ConvertDictOrStr # - the actual value passed through prefixed with _CONVERT_PREFIX - print("Got arg name %s and values %s" % (param.name, str(value))) + # print("Got arg name %s and values %s" % (param.name, str(value))) do_return = self._value_values is None and self._path_values is None # We only keep around non default values. 
We could simplify by checking just one # value and if it is default it means all are but this doesn't seem much more effort @@ -231,7 +231,7 @@ def process_configs(self, ctx, param, value): all_values = dict(self._path_values or {}) all_values.update(self._value_values or {}) - print("Got all values: %s" % str(all_values)) + # print("Got all values: %s" % str(all_values)) flow_cls._flow_state[_FlowState.CONFIGS] = {} to_return = {} @@ -263,7 +263,7 @@ def process_configs(self, ctx, param, value): else: # This is a value merged_configs[n] = ConvertDictOrStr.convert_value(val, False) - + # print("Merged configs: %s" % str(merged_configs)) missing_configs = set() no_file = [] no_default_file = [] @@ -433,7 +433,7 @@ def config_options(cmd): k, ( ConvertPath.mark_as_default(v[0]) - if not callable(v) and v[1] + if not callable(v[0]) and v[1] else None ), ) diff --git a/metaflow/user_configs/config_parameters.py b/metaflow/user_configs/config_parameters.py index 4a312487829..e6173bd762d 100644 --- a/metaflow/user_configs/config_parameters.py +++ b/metaflow/user_configs/config_parameters.py @@ -320,31 +320,28 @@ def __init__( % name ) self._default_is_file = default is not None - default = default or default_value - + kwargs["default"] = default or default_value super(Config, self).__init__( - name, default=default, required=required, help=help, type=str, **kwargs + name, required=required, help=help, type=str, **kwargs ) if isinstance(kwargs.get("default", None), str): kwargs["default"] = json.dumps(kwargs["default"]) self.parser = parser - self._delay_self = DelayEvaluator(name.lower()) - def load_parameter(self, v): return v # Support . 
syntax def __getattr__(self, name): - return self._delay_self.__getattr__(name) + return DelayEvaluator(self.name.lower()).__getattr__(name) # Next three methods are to implement mapping to support ** syntax def __iter__(self): - return iter(self._delay_self) + return iter(DelayEvaluator(self.name.lower())) def __len__(self): - return len(self._delay_self) + return len(DelayEvaluator(self.name.lower())) def __getitem__(self, key): - return self._delay_self[key] + return DelayEvaluator(self.name.lower())[key] diff --git a/test/core/metaflow_test/formatter.py b/test/core/metaflow_test/formatter.py index f9e39d9e983..f7df9660f6b 100644 --- a/test/core/metaflow_test/formatter.py +++ b/test/core/metaflow_test/formatter.py @@ -85,8 +85,16 @@ def _flow_lines(self): tags.extend(tag.split("(")[0] for tag in step.tags) yield 0, "# -*- coding: utf-8 -*-" - yield 0, "from metaflow import Config, config_expr, FlowSpec, step, Parameter, project, IncludeFile, JSONType, current, parallel" - yield 0, "from metaflow_test import assert_equals, assert_equals_metadata, assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, TestRetry, try_to_get_card" + yield 0, ( + "from metaflow import Config, config_expr, FlowSpec, step, Parameter, " + "project, IncludeFile, JSONType, current, parallel, CustomFlowDecorator, " + "CustomStepDecorator" + ) + yield 0, ( + "from metaflow_test import assert_equals, assert_equals_metadata, " + "assert_exception, ExpectationFailed, is_resumed, ResumeFromHere, " + "TestRetry, try_to_get_card" + ) if tags: yield 0, "from metaflow import %s" % ",".join(tags) diff --git a/test/core/tests/basic_config_parameters.py b/test/core/tests/basic_config_parameters.py index dc367bef524..e333b645a73 100644 --- a/test/core/tests/basic_config_parameters.py +++ b/test/core/tests/basic_config_parameters.py @@ -11,16 +11,16 @@ class BasicConfigTest(MetaflowTest): "default_from_func": {"default": "param_default", "type": "int"}, } CONFIGS = { - "config": {"default": 
"default_config"}, + "config": {"default_value": "default_config"}, "silly_config": {"required": True, "parser": "silly_parser"}, "config2": {}, - "config3": {"default": "config_default"}, + "config3": {"default_value": "config_default"}, } HEADER = """ import json import os -os.environ['METAFLOW_FLOW_CONFIG'] = json.dumps( +os.environ['METAFLOW_FLOW_CONFIG_VALUE'] = json.dumps( { "config2": {"default_param": 123}, "silly_config": "baz:amazing" diff --git a/test_config/helloconfig.py b/test_config/helloconfig.py index c357685a837..e62e795f615 100644 --- a/test_config/helloconfig.py +++ b/test_config/helloconfig.py @@ -39,20 +39,20 @@ def config_func(ctx): class TitusOrNot(CustomFlowDecorator): - def evaluate(self, flow_proxy): - for name, s in flow_proxy.steps: - if name in flow_proxy.config.run_on_titus: - s.add_decorator(titus, cpu=flow_proxy.config.cpu_count) + def evaluate(self, mutable_flow): + for name, s in mutable_flow.steps: + if name in mutable_flow.config.run_on_titus: + s.add_decorator(titus, cpu=mutable_flow.config.cpu_count) class AddEnvToStart(CustomFlowDecorator): - def evaluate(self, flow_proxy): - s = flow_proxy.start - s.add_decorator(environment, vars={"hello": flow_proxy.config.env_to_start}) + def evaluate(self, mutable_flow): + s = mutable_flow.start + s.add_decorator(environment, vars={"hello": mutable_flow.config.env_to_start}) -@TitusOrNot() -@AddEnvToStart() +@TitusOrNot +@AddEnvToStart @project(name=config_expr("config").project_name) class HelloConfig(FlowSpec): """ From 22083c4b1a1d7880d4f5252cee42cbb8e7b57d79 Mon Sep 17 00:00:00 2001 From: Romain Cledat Date: Wed, 23 Oct 2024 13:39:36 -0700 Subject: [PATCH 18/24] WIP --- metaflow/decorators.py | 61 +++++--------- metaflow/flowspec.py | 23 ++++-- metaflow/includefile.py | 65 ++++++++++----- metaflow/parameters.py | 63 ++++++++++----- metaflow/runner/click_api.py | 2 +- metaflow/user_configs/config_decorators.py | 94 ++++++++++++++++------ metaflow/user_configs/config_options.py | 2 +- 
metaflow/user_configs/config_parameters.py | 38 ++++++++- test/core/tests/basic_config_parameters.py | 35 +++++++- test/core/tests/basic_config_silly.txt | 1 + test/core/tests/custom_decorators.py | 0 11 files changed, 261 insertions(+), 123 deletions(-) create mode 100644 test/core/tests/basic_config_silly.txt create mode 100644 test/core/tests/custom_decorators.py diff --git a/metaflow/decorators.py b/metaflow/decorators.py index b615d5744ac..11743278d13 100644 --- a/metaflow/decorators.py +++ b/metaflow/decorators.py @@ -12,7 +12,11 @@ ) from .parameters import current_flow -from .user_configs.config_parameters import DelayEvaluator +from .user_configs.config_parameters import ( + UNPACK_KEY, + resolve_delayed_evaluator, + unpack_delayed_evaluator, +) from metaflow._vendor import click @@ -117,12 +121,14 @@ def __init__(self, attributes=None, statically_defined=False): self.attributes = self.defaults.copy() self.statically_defined = statically_defined self._user_defined_attributes = set() + self._ran_init = False if attributes: for k, v in attributes.items(): - self._user_defined_attributes.add(k) - if k in self.defaults or k.startswith("_unpacked_delayed_"): + if k in self.defaults or k.startswith(UNPACK_KEY): self.attributes[k] = v + if not k.startswith(UNPACK_KEY): + self._user_defined_attributes.add(k) else: raise InvalidDecoratorAttribute(self.name, k, self.defaults) @@ -132,44 +138,17 @@ def init(self): should be done here. """ - def _resolve_delayed_evaluator(v): - if isinstance(v, DelayEvaluator): - return v() - if isinstance(v, dict): - return { - _resolve_delayed_evaluator(k): _resolve_delayed_evaluator(v) - for k, v in v.items() - } - if isinstance(v, list): - return [_resolve_delayed_evaluator(x) for x in v] - if isinstance(v, tuple): - return tuple(_resolve_delayed_evaluator(x) for x in v) - if isinstance(v, set): - return {_resolve_delayed_evaluator(x) for x in v} - return v - - # Expand any eventual _unpacked_delayed_ attributes. 
These are special attributes - # that allow the delay unpacking of configuration values. - delayed_upack_keys = [ - k for k in self.attributes if k.startswith("_unpacked_delayed_") - ] - if delayed_upack_keys: - for k in delayed_upack_keys: - unpacked = _resolve_delayed_evaluator(self.attributes[k]) - for uk, uv in unpacked.items(): - if uk in self._user_defined_attributes: - raise SyntaxError( - "keyword argument repeated: %s" % uk, "", 0, "" - ) - self._user_defined_attributes.add(uk) - self.attributes[uk] = uv - del self.attributes[k] - - # Now resolve all attributes - for k, v in self.attributes.items(): - # This is a special attribute that means we are going to unpack - # the configuration valu - self.attributes[k] = _resolve_delayed_evaluator(v) + # In some cases (specifically when using remove_decorator), we may need to call + # init multiple times. Short-circuit re-evaluating. + if self._ran_init: + return + + # Note that by design, later values override previous ones. + self.attributes = unpack_delayed_evaluator(self.attributes) + self._user_defined_attributes.update(self.attributes.keys()) + self.attributes = resolve_delayed_evaluator(self.attributes) + + self._ran_init = True @classmethod def _parse_decorator_spec(cls, deco_spec): diff --git a/metaflow/flowspec.py b/metaflow/flowspec.py index 0c8f3aab7bd..8b72acf299a 100644 --- a/metaflow/flowspec.py +++ b/metaflow/flowspec.py @@ -169,9 +169,11 @@ def script_name(self) -> str: fname = fname[:-1] return os.path.basename(fname) - def _check_parameters(self): + def _check_parameters(self, config_parameters=False): seen = set() - for var, param in self._get_parameters(): + for _, param in self._get_parameters(): + if param.IS_CONFIG_PARAMETER != config_parameters: + continue norm = param.name.lower() if norm in seen: raise MetaflowException( @@ -189,20 +191,20 @@ def _process_config_decorators(self, config_options): return self # We need to convert all the user configurations from DelayedEvaluationParameters 
- # to actual values so they can be used as is in the config functions. + # to actual values so they can be used as is in the config decorators. # We then reset them to be proper parameters so they can be re-evaluated in # _set_constants to_reset_params = [] - self._check_parameters() + self._check_parameters(config_parameters=True) for var, param in self._get_parameters(): - if not param.IS_FLOW_PARAMETER: + if not param.IS_CONFIG_PARAMETER: continue - to_reset_params.append((var, param)) # Note that a config with no default and not required will be None val = config_options.get(param.name.replace("-", "_").lower()) if isinstance(val, DelayedEvaluationParameter): val = val() + to_reset_params.append((var, param, val)) setattr(current_cls, var, val) # Run all the decorators @@ -229,6 +231,11 @@ def _process_config_decorators(self, config_options): ) deco.evaluate(CustomStepDecorator(deco._my_step)) + # Process parameters to allow them to also use config values easily + for var, param in self._get_parameters(): + if param.IS_CONFIG_PARAMETER: + continue + param.init() # Reset all configs that were already present in the class. # TODO: This means that users can't override configs directly. Not sure if this # is a pattern we want to support @@ -252,7 +259,7 @@ def _set_constants(self, graph, kwargs, config_options): # Persist values for parameters and other constants (class level variables) # only once. 
This method is called before persist_constants is called to # persist all values set using setattr - self._check_parameters() + self._check_parameters(config_parameters=False) seen = set() self._success = True @@ -260,7 +267,7 @@ def _set_constants(self, graph, kwargs, config_options): parameters_info = [] for var, param in self._get_parameters(): seen.add(var) - if param.IS_FLOW_PARAMETER: + if param.IS_CONFIG_PARAMETER: val = config_options.get(param.name.replace("-", "_").lower()) else: val = kwargs[param.name.replace("-", "_").lower()] diff --git a/metaflow/includefile.py b/metaflow/includefile.py index d599de3f1a8..3c5863485af 100644 --- a/metaflow/includefile.py +++ b/metaflow/includefile.py @@ -245,29 +245,58 @@ class IncludeFile(Parameter): default : Union[str, Callable[ParameterContext, str]] Default path to a local file. A function implies that the parameter corresponds to a *deploy-time parameter*. - is_text : bool, default True + is_text : bool, optional, default None Convert the file contents to a string using the provided `encoding`. - If False, the artifact is stored in `bytes`. - encoding : str, optional, default 'utf-8' - Use this encoding to decode the file contexts if `is_text=True`. - required : bool, default False + If False, the artifact is stored in `bytes`. A value of None is equivalent to + True. + encoding : str, optional, default None + Use this encoding to decode the file contexts if `is_text=True`. A value of None + is equivalent to "utf-8". + required : bool, optional, default None Require that the user specified a value for the parameter. - `required=True` implies that the `default` is not used. + `required=True` implies that the `default` is not used. A value of None is + equivalent to False help : str, optional Help text to show in `run --help`. show_default : bool, default True - If True, show the default value in the help text. + If True, show the default value in the help text. A value of None is equivalent + to True. 
""" def __init__( self, name: str, - required: bool = False, - is_text: bool = True, - encoding: str = "utf-8", + required: Optional[bool] = None, + is_text: Optional[bool] = None, + encoding: Optional[str] = None, help: Optional[str] = None, **kwargs: Dict[str, str] ): + self._includefile_overrides = {} + if is_text is not None: + self._includefile_overrides["is_text"] = is_text + if encoding is not None: + self._includefile_overrides["encoding"] = encoding + super(IncludeFile, self).__init__( + name, + required=required, + help=help, + type=FilePathClass(is_text, encoding), + **kwargs, + ) + + def init(self): + super(IncludeFile, self).init() + + # This will use the values set explicitly in the args if present, else will + # use and remove from kwargs else will use True/utf-8 + is_text = self._includefile_overrides.get( + "is_text", self.kwargs.pop("is_text", True) + ) + encoding = self._includefile_overrides.get( + "encoding", self.kwargs.pop("encoding", "utf-8") + ) + # If a default is specified, it needs to be uploaded when the flow is deployed # (for example when doing a `step-functions create`) so we make the default # be a DeployTimeField. This means that it will be evaluated in two cases: @@ -277,7 +306,7 @@ def __init__( # In the first case, we will need to fully upload the file whereas in the # second case, we can just return the string as the FilePath.convert method # will take care of evaluating things. 
- v = kwargs.get("default") + v = self.kwargs.get("default") if v is not None: # If the default is a callable, we have two DeployTimeField: # - the callable nature of the default will require us to "call" the default @@ -290,23 +319,15 @@ def __init__( # (call the default) if callable(v) and not isinstance(v, DeployTimeField): # If default is a callable, make it a DeployTimeField (the inner one) - v = DeployTimeField(name, str, "default", v, return_str=True) - kwargs["default"] = DeployTimeField( - name, + v = DeployTimeField(self.name, str, "default", v, return_str=True) + self.kwargs["default"] = DeployTimeField( + self.name, str, "default", IncludeFile._eval_default(is_text, encoding, v), print_representation=v, ) - super(IncludeFile, self).__init__( - name, - required=required, - help=help, - type=FilePathClass(is_text, encoding), - **kwargs, - ) - def load_parameter(self, v): if v is None: return v diff --git a/metaflow/parameters.py b/metaflow/parameters.py index d37d1802501..f507bf4afe6 100644 --- a/metaflow/parameters.py +++ b/metaflow/parameters.py @@ -305,14 +305,15 @@ class MyFlow(FlowSpec): to the type of `default` or `str` if none specified. help : str, optional Help text to show in `run --help`. - required : bool, default False - Require that the user specified a value for the parameter. If a non-None - default is specified, that default will be used if no other value is provided - show_default : bool, default True - If True, show the default value in the help text. + required : bool, optional, default None + Require that the user specified a value for the parameter. `required=True` implies + that `default` is not used. A value of None is equivalent to False. + show_default : bool, optional, default None + If True, show the default value in the help text. A value of None is equivalent + to True. 
""" - IS_FLOW_PARAMETER = False + IS_CONFIG_PARAMETER = False def __init__( self, @@ -333,24 +334,44 @@ def __init__( Union[Type[str], Type[float], Type[int], Type[bool], JSONTypeClass] ] = None, help: Optional[str] = None, - required: bool = False, - show_default: bool = True, + required: Optional[bool] = None, + show_default: Optional[bool] = None, **kwargs: Dict[str, Any] ): self.name = name self.kwargs = kwargs - for k, v in { + self._override_kwargs = { "default": default, "type": type, "help": help, "required": required, "show_default": show_default, - }.items(): - if v is not None: - self.kwargs[k] = v + } + + def init(self): + # Prevent circular import + from .user_configs.config_parameters import ( + resolve_delayed_evaluator, + unpack_delayed_evaluator, + ) + + # Resolve any value from configurations + self.kwargs = unpack_delayed_evaluator(self.kwargs) + self.kwargs = resolve_delayed_evaluator(self.kwargs) + + # This was the behavior before configs: values specified in args would override + # stuff in kwargs which is what we implement here as well + for key, value in self._override_kwargs.items(): + if value is not None: + self.kwargs[key] = value + # Set two default values if no-one specified them + self.kwargs.setdefault("required", False) + self.kwargs.setdefault("show_default", True) + + # Continue processing kwargs free of any configuration values :) # TODO: check that the type is one of the supported types - param_type = self.kwargs["type"] = self._get_type(kwargs) + param_type = self.kwargs["type"] = self._get_type(self.kwargs) reserved_params = [ "params", @@ -375,23 +396,27 @@ def __init__( raise MetaflowException( "Parameter name '%s' is a reserved " "word. Please use a different " - "name for your parameter." % (name) + "name for your parameter." 
% (self.name) ) # make sure the user is not trying to pass a function in one of the # fields that don't support function-values yet for field in ("show_default", "separator", "required"): - if callable(kwargs.get(field)): + if callable(self.kwargs.get(field)): raise MetaflowException( "Parameter *%s*: Field '%s' cannot " - "have a function as its value" % (name, field) + "have a function as its value" % (self.name, field) ) # default can be defined as a function default_field = self.kwargs.get("default") if callable(default_field) and not isinstance(default_field, DeployTimeField): self.kwargs["default"] = DeployTimeField( - name, param_type, "default", self.kwargs["default"], return_str=True + self.name, + param_type, + "default", + self.kwargs["default"], + return_str=True, ) # note that separator doesn't work with DeployTimeFields unless you @@ -400,7 +425,7 @@ def __init__( if self.separator and not self.is_string_type: raise MetaflowException( "Parameter *%s*: Separator is only allowed " - "for string parameters." % name + "for string parameters." 
% self.name ) def __repr__(self): @@ -457,7 +482,7 @@ def wrapper(cmd): if flow_cls is None: return cmd parameters = [ - p for _, p in flow_cls._get_parameters() if not p.IS_FLOW_PARAMETER + p for _, p in flow_cls._get_parameters() if not p.IS_CONFIG_PARAMETER ] for arg in parameters[::-1]: kwargs = arg.option_kwargs(deploy_mode) diff --git a/metaflow/runner/click_api.py b/metaflow/runner/click_api.py index 2d946fe7133..49f5d1902fb 100644 --- a/metaflow/runner/click_api.py +++ b/metaflow/runner/click_api.py @@ -227,7 +227,7 @@ def name(self): def from_cli(cls, flow_file: str, cli_collection: Callable) -> Callable: flow_cls = extract_flow_class_from_file(flow_file) flow_parameters = [ - p for _, p in flow_cls._get_parameters() if not p.IS_FLOW_PARAMETER + p for _, p in flow_cls._get_parameters() if not p.IS_CONFIG_PARAMETER ] with flow_context(flow_cls) as _: add_decorator_options(cli_collection) diff --git a/metaflow/user_configs/config_decorators.py b/metaflow/user_configs/config_decorators.py index 2e5202406ce..2360f7ea969 100644 --- a/metaflow/user_configs/config_decorators.py +++ b/metaflow/user_configs/config_decorators.py @@ -2,12 +2,14 @@ from typing import Any, Callable, Generator, Optional, TYPE_CHECKING, Tuple, Union from metaflow.exception import MetaflowException -from metaflow.parameters import Parameter -from metaflow.user_configs.config_parameters import ConfigValue +from metaflow.user_configs.config_parameters import ( + ConfigValue, + resolve_delayed_evaluator, +) if TYPE_CHECKING: - from metaflow.flowspec import _FlowSpecMeta - from metaflow.decorators import FlowSpecDerived + import metaflow.flowspec + import metaflow.decorators class MutableStep: @@ -20,8 +22,8 @@ class MutableStep: def __init__( self, step: Union[ - Callable[["FlowSpecDerived"], None], - Callable[["FlowSpecDerived", Any], None], + Callable[["metaflow.decorators.FlowSpecDerived"], None], + Callable[["metaflow.decorators.FlowSpecDerived", Any], None], ], ): self._my_step = step 
@@ -51,6 +53,9 @@ def remove_decorator(self, deco_name: str, all: bool = True, **kwargs) -> bool: did_remove = False for deco in self._my_step.decorators: if deco.name == deco_name: + # Evaluate all the configuration values if any + deco.init() + # Check filters match_ok = True if kwargs: @@ -144,6 +149,10 @@ def start(self): ).items(): yield name, ConfigValue(value) + @property + def parameters(self) -> Generator[Tuple[str, Any], None, None]: + pass + @property def steps(self) -> Generator[Tuple[str, MutableStep], None, None]: """ @@ -174,13 +183,22 @@ def __getattr__(self, name): class CustomFlowDecorator: def __init__(self, *args, **kwargs): - from ..flowspec import FlowSpec, _FlowSpecMeta + from ..flowspec import FlowSpecMeta - if args and isinstance(args[0], (CustomFlowDecorator, _FlowSpecMeta)): + if args and isinstance(args[0], (CustomFlowDecorator, FlowSpecMeta)): # This means the decorator is bare like @MyDecorator # and the first argument is the FlowSpec or another decorator (they # can be stacked) - if isinstance(args[0], _FlowSpecMeta): + + # If we have a init function, we call it with no arguments -- this can + # happen if the user defines a function with default parameters for example + try: + self.init() + except NotImplementedError: + pass + + # Now set the flow class we apply to + if isinstance(args[0], FlowSpecMeta): self._set_flow_cls(args[0]) else: self._set_flow_cls(args[0]._flow_cls) @@ -189,12 +207,25 @@ def __init__(self, *args, **kwargs): self._args = args self._kwargs = kwargs - def __call__(self, flow_spec: Optional["_FlowSpecMeta"] = None) -> "_FlowSpecMeta": + def __call__( + self, flow_spec: Optional["metaflow.flowspec.FlowSpecMeta"] = None + ) -> "metaflow.flowspec.FlowSpecMeta": if flow_spec: # This is the case of a decorator @MyDecorator(foo=1, bar=2) and so - # we already called __init__ and saved foo and bar and are now calling - # this on the flow itself. 
- self.init(*self._args, **self._kwargs) + # we already called __init__ and saved foo and bar in self._args and + # self._kwargs and are now calling this on the flow itself. + + # You can use config values in the arguments to a CustomFlowDecorator + # so we resolve those as well + new_args = [resolve_delayed_evaluator(arg) for arg in self._args] + try: + self.init(*self._args, **self._kwargs) + except NotImplementedError as e: + raise MetaflowException( + "CustomFlowDecorator '%s' is used with arguments " + "but does not implement init" % str(self.__class__) + ) from e + return self._set_flow_cls(flow_spec) elif not self._flow_cls: # This means that somehow the initialization did not happen properly @@ -204,7 +235,9 @@ def __call__(self, flow_spec: Optional["_FlowSpecMeta"] = None) -> "_FlowSpecMet ) return self._flow_cls() - def _set_flow_cls(self, flow_spec: "_FlowSpecMeta") -> "_FlowSpecMeta": + def _set_flow_cls( + self, flow_spec: "metaflow.flowspec.FlowSpecMeta" + ) -> "metaflow.flowspec.FlowSpecMeta": from ..flowspec import _FlowState flow_spec._flow_state.setdefault(_FlowState.CONFIG_DECORATORS, []).append(self) @@ -223,6 +256,19 @@ def init(self, *args, **kwargs): raise NotImplementedError() def evaluate(self, mutable_flow: MutableFlow) -> None: + """ + Implement this method to act on the flow and modify it as needed. 
+
+        Parameters
+        ----------
+        mutable_flow : MutableFlow
+            The flow, wrapped in a MutableFlow, that this decorator can inspect
+            and modify.
+
+        Raises
+        ------
+        NotImplementedError
+            If the subclass does not implement this method.
+        """
         raise NotImplementedError()


@@ -249,18 +295,18 @@ def __call__(
         self,
         step: Optional[
             Union[
-                Callable[["FlowSpecDerived"], None],
-                Callable[["FlowSpecDerived", Any], None],
+                Callable[["metaflow.decorators.FlowSpecDerived"], None],
+                Callable[["metaflow.decorators.FlowSpecDerived", Any], None],
             ]
         ] = None,
     ) -> Union[
-        Callable[["FlowSpecDerived"], None],
-        Callable[["FlowSpecDerived", Any], None],
+        Callable[["metaflow.decorators.FlowSpecDerived"], None],
+        Callable[["metaflow.decorators.FlowSpecDerived", Any], None],
     ]:
         if step:
             # This is the case of a decorator @MyDecorator(foo=1, bar=2) and so
-            # we already called __init__ and saved foo and bar and are now calling
-            # this on the step itself.
+            # we already called __init__ and saved foo and bar into self._args and
+            # self._kwargs and are now calling this on the step itself.
             self.init(*self._args, **self._kwargs)
             return self._set_my_step(step)
         elif not self._my_step:
@@ -274,12 +320,12 @@ def __call__(
     def _set_my_step(
         self,
         step: Union[
-            Callable[["FlowSpecDerived"], None],
-            Callable[["FlowSpecDerived", Any], None],
+            Callable[["metaflow.decorators.FlowSpecDerived"], None],
+            Callable[["metaflow.decorators.FlowSpecDerived", Any], None],
         ],
     ) -> Union[
-        Callable[["FlowSpecDerived"], None],
-        Callable[["FlowSpecDerived", Any], None],
+        Callable[["metaflow.decorators.FlowSpecDerived"], None],
+        Callable[["metaflow.decorators.FlowSpecDerived", Any], None],
     ]:
         from ..flowspec import _FlowState

diff --git a/metaflow/user_configs/config_options.py b/metaflow/user_configs/config_options.py
index 21cf2fcfaec..9502e0eab66 100644
--- a/metaflow/user_configs/config_options.py
+++ b/metaflow/user_configs/config_options.py
@@ -359,7 +359,7 @@ def config_options(cmd):
     if flow_cls is None:
         return cmd

-    parameters = [p for _, p in flow_cls._get_parameters() if p.IS_FLOW_PARAMETER]
+
parameters = [p for _, p in flow_cls._get_parameters() if p.IS_CONFIG_PARAMETER] # List all the configuration options for arg in parameters[::-1]: kwargs = arg.option_kwargs(False) diff --git a/metaflow/user_configs/config_parameters.py b/metaflow/user_configs/config_parameters.py index e6173bd762d..81e1a0f8738 100644 --- a/metaflow/user_configs/config_parameters.py +++ b/metaflow/user_configs/config_parameters.py @@ -42,6 +42,8 @@ ID_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") +UNPACK_KEY = "_unpacked_delayed_" + def dump_config_values(flow: "FlowSpec"): from ..flowspec import _FlowState # Prevent circular import @@ -152,7 +154,7 @@ class DelayEvaluator(collections.abc.Mapping): This is used when we want to use config.* values in decorators for example. It also allows the following "delayed" access on an obj that is a DelayEvaluation - - obj.x.y.z (ie: accessing members of DelayEvaluator; acesses will be delayed until + - obj.x.y.z (ie: accessing members of DelayEvaluator; accesses will be delayed until the DelayEvaluator is evaluated) - **obj (ie: unpacking the DelayEvaluator as a dictionary). Note that this requires special handling in whatever this is being unpacked into, specifically the handling @@ -170,10 +172,10 @@ def __init__(self, ex: str): self._access = None def __iter__(self): - yield "_unpacked_delayed_%d" % id(self) + yield "%s%d" % (UNPACK_KEY, id(self)) def __getitem__(self, key): - if key == "_unpacked_delayed_%d" % id(self): + if key == "%s%d" % (UNPACK_KEY, id(self)): return self raise KeyError(key) @@ -295,7 +297,7 @@ class Config(Parameter, collections.abc.Mapping): If True, show the default value in the help text. 
""" - IS_FLOW_PARAMETER = True + IS_CONFIG_PARAMETER = True def __init__( self, @@ -345,3 +347,31 @@ def __len__(self): def __getitem__(self, key): return DelayEvaluator(self.name.lower())[key] + + +def resolve_delayed_evaluator(v: Any) -> Any: + if isinstance(v, DelayEvaluator): + return v() + if isinstance(v, dict): + return { + resolve_delayed_evaluator(k): resolve_delayed_evaluator(v) + for k, v in v.items() + } + if isinstance(v, list): + return [resolve_delayed_evaluator(x) for x in v] + if isinstance(v, tuple): + return tuple(resolve_delayed_evaluator(x) for x in v) + if isinstance(v, set): + return {resolve_delayed_evaluator(x) for x in v} + return v + + +def unpack_delayed_evaluator(to_unpack: Dict[str, Any]) -> Dict[str, Any]: + result = {} + for k, v in to_unpack.items(): + if not isinstance(k, str) or not k.startswith(UNPACK_KEY): + result[k] = v + else: + # k.startswith(UNPACK_KEY) + result.update(resolve_delayed_evaluator(v[k])) + return result diff --git a/test/core/tests/basic_config_parameters.py b/test/core/tests/basic_config_parameters.py index e333b645a73..b8e91a0f259 100644 --- a/test/core/tests/basic_config_parameters.py +++ b/test/core/tests/basic_config_parameters.py @@ -11,19 +11,36 @@ class BasicConfigTest(MetaflowTest): "default_from_func": {"default": "param_default", "type": "int"}, } CONFIGS = { + # Test a default value as a dict "config": {"default_value": "default_config"}, - "silly_config": {"required": True, "parser": "silly_parser"}, + # Test parser, various arguments and overriden default + "silly_config": { + "required": True, + "parser": "silly_parser", + "default": "silly.txt", + }, "config2": {}, + # Test using a function to get the value "config3": {"default_value": "config_default"}, + # Test ** notation + "config_env": {}, } HEADER = """ import json import os +# Test passing values directly on the command line os.environ['METAFLOW_FLOW_CONFIG_VALUE'] = json.dumps( { - "config2": {"default_param": 123}, - "silly_config": 
"baz:amazing" + "config2": {"default_param": 123} + "config_env": {"vars": {"var1": "value1", "var2": "value2"}} + } +) + +# Test overriding a file (the default one) +os.environ['METAFLOW_FLOW_CONFIG'] = json.dumps( + { + "silly_config": "basic_config_silly.txt" } ) @@ -73,6 +90,7 @@ def step_all(self): assert_equals(self.config.nested["value"], 43) assert_equals(self.config["nested"].value, 43) + # Test parser assert_equals(self.silly_config.baz, "amazing") assert_equals(self.silly_config["baz"], "amazing") @@ -90,6 +108,13 @@ def step_all(self): except TypeError: pass + @tag("environment(**config_env)") + @steps(0, ["start"]) + def step_start(self): + # Here we check the environment based on the ** notation + assert_equals(os.environ["var1"], "value1") + assert_equals(os.environ["var2"], "value2") + def check_results(self, flow, checker): for step in flow: checker.assert_artifact( @@ -103,4 +128,8 @@ def check_results(self, flow, checker): }, ) checker.assert_artifact(step.name, "config2", {"default_param": 123}) + checker.assert_artifact(step.name, "config3", {"val": 456}) checker.assert_artifact(step.name, "silly_config", {"baz": "amazing"}) + checker.assert_artifact( + step.name, "config_env", {"vars": {"var1": "value1", "var2": "value2"}} + ) diff --git a/test/core/tests/basic_config_silly.txt b/test/core/tests/basic_config_silly.txt new file mode 100644 index 00000000000..c438d89d5e0 --- /dev/null +++ b/test/core/tests/basic_config_silly.txt @@ -0,0 +1 @@ +baz:amazing diff --git a/test/core/tests/custom_decorators.py b/test/core/tests/custom_decorators.py new file mode 100644 index 00000000000..e69de29bb2d From 99869ce8e53c813da10bcd3c6ada4150bd198406 Mon Sep 17 00:00:00 2001 From: Sakari Ikonen Date: Thu, 17 Oct 2024 23:15:13 +0300 Subject: [PATCH 19/24] do not map config parameters to CLI command for argo/step functions --- metaflow/plugins/argo/argo_workflows.py | 2 ++ metaflow/plugins/aws/step_functions/step_functions.py | 2 ++ 2 files changed, 4 
insertions(+) diff --git a/metaflow/plugins/argo/argo_workflows.py b/metaflow/plugins/argo/argo_workflows.py index c4e8cbd6c77..94cdf00abb1 100644 --- a/metaflow/plugins/argo/argo_workflows.py +++ b/metaflow/plugins/argo/argo_workflows.py @@ -445,6 +445,8 @@ def _process_parameters(self): has_schedule = self.flow._flow_decorators.get("schedule") is not None seen = set() for var, param in self.flow._get_parameters(): + if param.IS_FLOW_PARAMETER: + continue # Throw an exception if the parameter is specified twice. norm = param.name.lower() if norm in seen: diff --git a/metaflow/plugins/aws/step_functions/step_functions.py b/metaflow/plugins/aws/step_functions/step_functions.py index 0534cd2179d..79b2843c2de 100644 --- a/metaflow/plugins/aws/step_functions/step_functions.py +++ b/metaflow/plugins/aws/step_functions/step_functions.py @@ -476,6 +476,8 @@ def _process_parameters(self): has_schedule = self._cron() is not None seen = set() for var, param in self.flow._get_parameters(): + if param.IS_FLOW_PARAMETER: + continue # Throw an exception if the parameter is specified twice. 
norm = param.name.lower() if norm in seen: From ccc41b8c96610aabe1e17cf78a3aa750838f7a01 Mon Sep 17 00:00:00 2001 From: Sakari Ikonen Date: Wed, 23 Oct 2024 14:42:26 +0300 Subject: [PATCH 20/24] fix argo config mappings --- metaflow/plugins/argo/argo_workflows.py | 52 +++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/metaflow/plugins/argo/argo_workflows.py b/metaflow/plugins/argo/argo_workflows.py index 94cdf00abb1..4eb05313379 100644 --- a/metaflow/plugins/argo/argo_workflows.py +++ b/metaflow/plugins/argo/argo_workflows.py @@ -59,6 +59,7 @@ ) from metaflow.plugins.kubernetes.kubernetes_jobsets import KubernetesArgoJobSet from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK +from metaflow.user_configs.config_options import ConfigInput from metaflow.util import ( compress_list, dict_to_cli_options, @@ -167,6 +168,7 @@ def __init__( self.enable_heartbeat_daemon = enable_heartbeat_daemon self.enable_error_msg_capture = enable_error_msg_capture self.parameters = self._process_parameters() + self.config_parameters = self._process_config_parameters() self.triggers, self.trigger_options = self._process_triggers() self._schedule, self._timezone = self._get_schedule() @@ -489,6 +491,7 @@ def _process_parameters(self): # execution - which needs to be fixed imminently. if not is_required or default_value is not None: default_value = json.dumps(default_value) + parameters[param.name] = dict( name=param.name, value=default_value, @@ -499,6 +502,41 @@ def _process_parameters(self): ) return parameters + def _process_config_parameters(self): + parameters = {} + seen = set() + for var, param in self.flow._get_parameters(): + if not param.IS_FLOW_PARAMETER: + continue + # Throw an exception if the parameter is specified twice. + norm = param.name.lower() + if norm in seen: + raise MetaflowException( + "Parameter *%s* is specified twice. " + "Note that parameter names are " + "case-insensitive." 
% param.name + ) + seen.add(norm) + + is_required = param.kwargs.get("required", False) + + default_value = deploy_time_eval(param.kwargs.get("default")) + # If the value is not required and the value is None, we set the value to + # the JSON equivalent of None to please argo-workflows. Unfortunately it + # has the side effect of casting the parameter value to string null during + # execution - which needs to be fixed imminently. + if not is_required or default_value is not None: + default_value = json.dumps(default_value) + + parameters[param.name] = dict( + name=param.name, + kv_name=ConfigInput.make_key_name(param.name), + value=default_value, + description=param.kwargs.get("help"), + is_required=is_required, + ) + return parameters + def _process_triggers(self): # Impute triggers for Argo Workflow Template specified through @trigger and # @trigger_on_finish decorators @@ -1500,6 +1538,12 @@ def _container_templates(self): % (parameter["name"], parameter["name"]) for parameter in self.parameters.values() ] + # + [ + # ( + # "--config-value %s %s" % (config_param["name"], config_param["kv_name"]) + # ) + # for config_param in self.config_parameters.values() + # ] ) if self.tags: init.extend("--tag %s" % tag for tag in self.tags) @@ -1717,6 +1761,14 @@ def _container_templates(self): metaflow_version["production_token"] = self.production_token env["METAFLOW_VERSION"] = json.dumps(metaflow_version) + # map config values + cfg_env = { + param["name"]: param["kv_name"] + for param in self.config_parameters.values() + } + if cfg_env: + env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env) + # Set the template inputs and outputs for passing state. Very simply, # the container template takes in input-paths as input and outputs # the task-id (which feeds in as input-paths to the subsequent task). 
From 003829169e533c214d9c07deaacd90ea94ebd34a Mon Sep 17 00:00:00 2001 From: Sakari Ikonen Date: Wed, 23 Oct 2024 15:04:24 +0300 Subject: [PATCH 21/24] fix step function configparam mappings as well --- .../aws/step_functions/step_functions.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/metaflow/plugins/aws/step_functions/step_functions.py b/metaflow/plugins/aws/step_functions/step_functions.py index 79b2843c2de..e4656577506 100644 --- a/metaflow/plugins/aws/step_functions/step_functions.py +++ b/metaflow/plugins/aws/step_functions/step_functions.py @@ -18,6 +18,7 @@ SFN_S3_DISTRIBUTED_MAP_OUTPUT_PATH, ) from metaflow.parameters import deploy_time_eval +from metaflow.user_configs.config_options import ConfigInput from metaflow.util import dict_to_cli_options, to_pascalcase from ..batch.batch import Batch @@ -71,6 +72,7 @@ def __init__( self.username = username self.max_workers = max_workers self.workflow_timeout = workflow_timeout + self.config_parameters = self._process_config_parameters() # https://aws.amazon.com/blogs/aws/step-functions-distributed-map-a-serverless-solution-for-large-scale-parallel-data-processing/ self.use_distributed_map = use_distributed_map @@ -503,6 +505,27 @@ def _process_parameters(self): parameters.append(dict(name=param.name, value=value)) return parameters + def _process_config_parameters(self): + parameters = [] + seen = set() + for var, param in self.flow._get_parameters(): + if not param.IS_FLOW_PARAMETER: + continue + # Throw an exception if the parameter is specified twice. + norm = param.name.lower() + if norm in seen: + raise MetaflowException( + "Parameter *%s* is specified twice. " + "Note that parameter names are " + "case-insensitive." % param.name + ) + seen.add(norm) + + parameters.append( + dict(name=param.name, kv_name=ConfigInput.make_key_name(param.name)) + ) + return parameters + def _batch(self, node): attrs = { # metaflow.user is only used for setting the AWS Job Name. 
@@ -749,6 +772,11 @@ def _batch(self, node): metaflow_version["production_token"] = self.production_token env["METAFLOW_VERSION"] = json.dumps(metaflow_version) + # map config values + cfg_env = {param["name"]: param["kv_name"] for param in self.config_parameters} + if cfg_env: + env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env) + # Set AWS DynamoDb Table Name for state tracking for for-eaches. # There are three instances when metaflow runtime directly interacts # with AWS DynamoDB. From bfffe5a5ec49fa9c156b3d7cebb6ba9431b2c39b Mon Sep 17 00:00:00 2001 From: Sakari Ikonen Date: Thu, 24 Oct 2024 02:06:05 +0300 Subject: [PATCH 22/24] fix argo events with config params --- metaflow/plugins/argo/argo_workflows.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/metaflow/plugins/argo/argo_workflows.py b/metaflow/plugins/argo/argo_workflows.py index 4eb05313379..dad35ca1a3e 100644 --- a/metaflow/plugins/argo/argo_workflows.py +++ b/metaflow/plugins/argo/argo_workflows.py @@ -560,7 +560,11 @@ def _process_triggers(self): # insensitive. 
seen = set() params = set( - [param.name.lower() for var, param in self.flow._get_parameters()] + [ + param.name.lower() + for var, param in self.flow._get_parameters() + if not param.IS_FLOW_PARAMETER + ] ) for event in self.flow._flow_decorators.get("trigger")[0].triggers: parameters = {} From a79f7546fa002ffdc1bbcae2ebfdb1524514611c Mon Sep 17 00:00:00 2001 From: Sakari Ikonen Date: Thu, 24 Oct 2024 02:22:18 +0300 Subject: [PATCH 23/24] add notes and cleanup --- metaflow/plugins/argo/argo_workflows.py | 32 ++++--------------- .../aws/step_functions/step_functions.py | 2 ++ 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/metaflow/plugins/argo/argo_workflows.py b/metaflow/plugins/argo/argo_workflows.py index dad35ca1a3e..f789d7a9aed 100644 --- a/metaflow/plugins/argo/argo_workflows.py +++ b/metaflow/plugins/argo/argo_workflows.py @@ -447,6 +447,8 @@ def _process_parameters(self): has_schedule = self.flow._flow_decorators.get("schedule") is not None seen = set() for var, param in self.flow._get_parameters(): + # NOTE: We skip config parameters as these do not have dynamic values, + # and need to be treated differently. if param.IS_FLOW_PARAMETER: continue # Throw an exception if the parameter is specified twice. @@ -503,7 +505,7 @@ def _process_parameters(self): return parameters def _process_config_parameters(self): - parameters = {} + parameters = [] seen = set() for var, param in self.flow._get_parameters(): if not param.IS_FLOW_PARAMETER: @@ -518,22 +520,8 @@ def _process_config_parameters(self): ) seen.add(norm) - is_required = param.kwargs.get("required", False) - - default_value = deploy_time_eval(param.kwargs.get("default")) - # If the value is not required and the value is None, we set the value to - # the JSON equivalent of None to please argo-workflows. Unfortunately it - # has the side effect of casting the parameter value to string null during - # execution - which needs to be fixed imminently. 
- if not is_required or default_value is not None: - default_value = json.dumps(default_value) - - parameters[param.name] = dict( - name=param.name, - kv_name=ConfigInput.make_key_name(param.name), - value=default_value, - description=param.kwargs.get("help"), - is_required=is_required, + parameters.append( + dict(name=param.name, kv_name=ConfigInput.make_key_name(param.name)) ) return parameters @@ -559,6 +547,7 @@ def _process_triggers(self): # convert them to lower case since Metaflow parameters are case # insensitive. seen = set() + # NOTE: We skip config parameters as their values can not be set through event payloads params = set( [ param.name.lower() @@ -1542,12 +1531,6 @@ def _container_templates(self): % (parameter["name"], parameter["name"]) for parameter in self.parameters.values() ] - # + [ - # ( - # "--config-value %s %s" % (config_param["name"], config_param["kv_name"]) - # ) - # for config_param in self.config_parameters.values() - # ] ) if self.tags: init.extend("--tag %s" % tag for tag in self.tags) @@ -1767,8 +1750,7 @@ def _container_templates(self): # map config values cfg_env = { - param["name"]: param["kv_name"] - for param in self.config_parameters.values() + param["name"]: param["kv_name"] for param in self.config_parameters } if cfg_env: env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env) diff --git a/metaflow/plugins/aws/step_functions/step_functions.py b/metaflow/plugins/aws/step_functions/step_functions.py index e4656577506..491fa5bd47f 100644 --- a/metaflow/plugins/aws/step_functions/step_functions.py +++ b/metaflow/plugins/aws/step_functions/step_functions.py @@ -478,6 +478,8 @@ def _process_parameters(self): has_schedule = self._cron() is not None seen = set() for var, param in self.flow._get_parameters(): + # NOTE: We skip config parameters as these do not have dynamic values, + # and need to be treated differently. if param.IS_FLOW_PARAMETER: continue # Throw an exception if the parameter is specified twice. 
From 21f0c3a5b20c1b19fa71a7be527d012e449c0773 Mon Sep 17 00:00:00 2001 From: Sakari Ikonen Date: Thu, 24 Oct 2024 22:18:39 +0300 Subject: [PATCH 24/24] validate that config param and parameter names don't conflict on argo and sfn. --- metaflow/plugins/argo/argo_workflows.py | 8 ++++---- metaflow/plugins/aws/step_functions/step_functions.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/metaflow/plugins/argo/argo_workflows.py b/metaflow/plugins/argo/argo_workflows.py index f789d7a9aed..f0efce40645 100644 --- a/metaflow/plugins/argo/argo_workflows.py +++ b/metaflow/plugins/argo/argo_workflows.py @@ -447,10 +447,6 @@ def _process_parameters(self): has_schedule = self.flow._flow_decorators.get("schedule") is not None seen = set() for var, param in self.flow._get_parameters(): - # NOTE: We skip config parameters as these do not have dynamic values, - # and need to be treated differently. - if param.IS_FLOW_PARAMETER: - continue # Throw an exception if the parameter is specified twice. norm = param.name.lower() if norm in seen: @@ -460,6 +456,10 @@ def _process_parameters(self): "case-insensitive." % param.name ) seen.add(norm) + # NOTE: We skip config parameters as these do not have dynamic values, + # and need to be treated differently. + if param.IS_FLOW_PARAMETER: + continue extra_attrs = {} if param.kwargs.get("type") == JSONType: diff --git a/metaflow/plugins/aws/step_functions/step_functions.py b/metaflow/plugins/aws/step_functions/step_functions.py index 491fa5bd47f..61862183883 100644 --- a/metaflow/plugins/aws/step_functions/step_functions.py +++ b/metaflow/plugins/aws/step_functions/step_functions.py @@ -478,10 +478,6 @@ def _process_parameters(self): has_schedule = self._cron() is not None seen = set() for var, param in self.flow._get_parameters(): - # NOTE: We skip config parameters as these do not have dynamic values, 
- if param.IS_FLOW_PARAMETER: - continue # Throw an exception if the parameter is specified twice. norm = param.name.lower() if norm in seen: @@ -491,6 +487,10 @@ def _process_parameters(self): "case-insensitive." % param.name ) seen.add(norm) + # NOTE: We skip config parameters as these do not have dynamic values, + # and need to be treated differently. + if param.IS_FLOW_PARAMETER: + continue is_required = param.kwargs.get("required", False) # Throw an exception if a schedule is set for a flow with required