Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enrich logging through context vars #452

Merged
merged 86 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
86 commits
Select commit Hold shift + click to select a range
4655cc3
mvp experiment logging to file
ankona Dec 7, 2023
ba2f13c
separate out/error
ankona Dec 7, 2023
57a32a6
fmt
ankona Dec 7, 2023
a24a0d9
linting
ankona Dec 7, 2023
4bb2a61
Add filter to avoid logging info to err file
ankona Dec 8, 2023
1aba404
use exp.start to add file handlers
ankona Dec 8, 2023
df09729
remove commented code, fix linting/formatting issues, add docstrings
ankona Dec 8, 2023
c4563ab
mypy/fmt/lint
ankona Dec 8, 2023
49de836
add test cases for new logging code
ankona Dec 8, 2023
94501fa
add out/err file paths to experiment serializer
ankona Dec 8, 2023
02a86b2
minor doc formatting
ankona Dec 8, 2023
86e8b36
minor formatting
ankona Dec 8, 2023
81fe30d
Fix incorrect naming & docstring
ankona Dec 11, 2023
6e6cdeb
Ensure logs can be written if path paths are supplied
ankona Dec 11, 2023
a1476cb
linting
ankona Dec 11, 2023
8d961ff
test bug fix
ankona Dec 11, 2023
0bba1fa
poc contextvars for loggers by experiment
ankona Dec 12, 2023
21eb87d
execute job manager in context aware thread
ankona Dec 12, 2023
4fc3e75
contextualize more methods in experiment for testing
ankona Dec 16, 2023
8922bbd
cleanup
ankona Dec 18, 2023
79931a9
cleanup
ankona Dec 18, 2023
480454b
use filter to inject context info to loggers
ankona Dec 18, 2023
94533fb
bugfix
ankona Dec 18, 2023
f684263
formatting & bugs
ankona Dec 18, 2023
d6c58d7
formatting/linting
ankona Dec 18, 2023
1052334
revert edit
ankona Dec 18, 2023
b112447
revert extraneous logging
ankona Dec 18, 2023
fa0a61f
revert jobmanager thread mod
ankona Dec 18, 2023
bf3073f
ensure job manager contextualization
ankona Dec 18, 2023
766499c
contextualize taskmanager
ankona Dec 18, 2023
142d2bc
formatting/linting
ankona Dec 18, 2023
cc5a1ae
remove recursive log init bug
ankona Dec 18, 2023
e801838
update tests
ankona Dec 18, 2023
b5e96ec
rough draft of context as a decorator
MattToast Dec 19, 2023
a6787c8
nicer typing
MattToast Dec 19, 2023
2301be3
get past lint
MattToast Dec 19, 2023
04c9a6f
tweaked contextualization decorator
ankona Jan 6, 2024
5e9a257
remove commented code
ankona Jan 6, 2024
0eae1f3
cleanup, add docstrings, revert to dynamic contextualizer
ankona Jan 8, 2024
f0409b2
convert create_[ensemble|model] into instance methods; ignore no-self…
ankona Jan 8, 2024
dc583e4
apply isort to modified files
ankona Jan 8, 2024
e8a30e3
add logging tests, fix minor typo
ankona Jan 8, 2024
9714a54
re-use one method to produce exp log file names
ankona Jan 8, 2024
5899a7a
name parametrized tests
ankona Jan 8, 2024
6f4cd34
linter
ankona Jan 8, 2024
41e299c
revert test logging
ankona Jan 8, 2024
2b38e76
fix incorrect docstrings
ankona Jan 9, 2024
52f2c8f
move telemetry subdir constant, update tests
ankona Jan 9, 2024
2efb830
clarify docstring
ankona Jan 9, 2024
ef2c048
use sys.executable to avoid using python2 during tests
ankona Jan 9, 2024
9cc5c11
avoid re-creating experiment output paths
ankona Jan 9, 2024
b4c2514
lint
ankona Jan 9, 2024
86ad215
fix typehint issue
ankona Jan 9, 2024
7183e3a
fix missing logging dir bug
ankona Jan 9, 2024
3d27eda
use config for telmon subdir constant
ankona Jan 10, 2024
df64fe9
simplify filter evaluation function
ankona Jan 12, 2024
50e5c80
reduce code dupe w/fixture
ankona Jan 12, 2024
7c52303
rename decorator
ankona Jan 12, 2024
c770c92
fix typo
ankona Jan 12, 2024
46dc95f
invert guard check to reduce nesting
ankona Jan 12, 2024
b99f197
fix incorrect docstring
ankona Jan 12, 2024
52d5aba
update black pin, reformat
ankona Jan 12, 2024
306c277
fix docstring spacing
ankona Jan 12, 2024
c6961fc
remove unnecessary optional
ankona Jan 12, 2024
1d1f06f
ensure context var doesn't leak from test
ankona Jan 12, 2024
b670b9b
remove invalid test edge cases
ankona Jan 12, 2024
1a284f8
remove unused import
ankona Jan 12, 2024
2f93eb8
parametrize low-pass filter tests
ankona Jan 12, 2024
d3bbe96
fix level integer mypy err
ankona Jan 12, 2024
ffbfc8e
add telmon enable/disable check on log writes
ankona Jan 12, 2024
8aa436c
test context var leakage w/exception
ankona Jan 12, 2024
0a653a9
remove from future import
ankona Jan 16, 2024
4e385d2
add typing_extensions to extras
ankona Jan 16, 2024
8dbee00
remove extraneous validation
ankona Jan 16, 2024
b1ecc62
revert changes to log_to_file
ankona Jan 16, 2024
d241046
remove commented import
ankona Jan 16, 2024
8523337
revert
ankona Jan 16, 2024
13aa2b9
linter
ankona Jan 16, 2024
238971c
Merge branch 'CrayLabs:develop' into 563ctx
ankona Jan 16, 2024
1026a20
tweak Concatenate typehints
ankona Jan 16, 2024
0d8a91d
revert type ignore removal
ankona Jan 16, 2024
b8f4623
move log filter creation into ContextAwareLogger
ankona Jan 17, 2024
d78162e
revert whitespace change
ankona Jan 17, 2024
f237237
fix typo, fix type hint
ankona Jan 17, 2024
4012dad
run formatters, improve docstrings
ankona Jan 17, 2024
89049f7
fix duplicated log messages
ankona Jan 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,3 +698,7 @@ def setup_test_colo(
assert colo_model.colocated
# Check to make sure that limit_db_cpus made it into the colo settings
return colo_model

@pytest.fixture
def config() -> smartsim._core.config.Config:
return CONFIG
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ def has_ext_modules(_placeholder):
"types-tqdm",
"types-tensorflow==2.12.0.9",
"types-setuptools",
"typing_extensions>=4.9.0",
],
# see smartsim/_core/_install/buildenv.py for more details
**versions.ml_extras_required()
Expand Down
2 changes: 1 addition & 1 deletion smartsim/_core/_cli/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu")
"Torch version not found in python environment. "
"Attempting to install via `pip`"
)
wheel_device = device if device == "cpu" else device_suffix.replace("+","")
wheel_device = device if device == "cpu" else device_suffix.replace("+", "")
pip(
"install",
"--extra-index-url",
Expand Down
4 changes: 4 additions & 0 deletions smartsim/_core/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,10 @@ def telemetry_enabled(self) -> bool:
def telemetry_cooldown(self) -> int:
return int(os.environ.get("SMARTSIM_TELEMETRY_COOLDOWN", 90))

@property
def telemetry_subdir(self) -> str:
return ".smartsim/telemetry"


@lru_cache(maxsize=128, typed=False)
def get_config() -> Config:
Expand Down
3 changes: 2 additions & 1 deletion smartsim/_core/control/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,11 +842,11 @@
:param exp_dir: An experiment directory
:type exp_dir: str
"""
logger.debug("Starting telemetry monitor process")
if (
self._telemetry_monitor is None
or self._telemetry_monitor.returncode is not None
):
logger.debug("Starting telemetry monitor process")

Check warning on line 849 in smartsim/_core/control/controller.py

View check run for this annotation

Codecov / codecov/patch

smartsim/_core/control/controller.py#L849

Added line #L849 was not covered by tests
cmd = [
sys.executable,
"-m",
Expand All @@ -866,6 +866,7 @@
cwd=str(pathlib.Path(__file__).parent.parent.parent),
shell=False,
)
logger.debug("Telemetry monitor started")

Check warning on line 869 in smartsim/_core/control/controller.py

View check run for this annotation

Codecov / codecov/patch

smartsim/_core/control/controller.py#L869

Added line #L869 was not covered by tests


class _AnonymousBatchJob(EntityList[Model]):
Expand Down
4 changes: 2 additions & 2 deletions smartsim/_core/control/jobmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

from ...database import Orchestrator
from ...entity import DBNode, EntitySequence, SmartSimEntity
from ...log import get_logger
from ...log import ContextThread, get_logger
from ...status import STATUS_NEVER_STARTED, TERMINAL_STATUSES
from ..config import CONFIG
from ..launcher import Launcher, LocalLauncher
Expand Down Expand Up @@ -80,7 +80,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None:

def start(self) -> None:
"""Start a thread for the job manager"""
self.monitor = Thread(name="JobManager", daemon=True, target=self.run)
self.monitor = ContextThread(name="JobManager", daemon=True, target=self.run)
self.monitor.start()

def run(self) -> None:
Expand Down
3 changes: 2 additions & 1 deletion smartsim/_core/control/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from ...database import Orchestrator
from ...entity import DBNode, Ensemble, EntitySequence, Model, SmartSimEntity
from ...error import SmartSimError
from ..config import CONFIG
from ..utils import helpers as _helpers
from ..utils import serialize as _serialize

Expand Down Expand Up @@ -343,7 +344,7 @@ def finalize(self) -> LaunchedManifest[_T]:
def _format_exp_telemetry_path(
exp_path: t.Union[str, "os.PathLike[str]"]
) -> pathlib.Path:
return pathlib.Path(exp_path, _serialize.TELMON_SUBDIR)
return pathlib.Path(exp_path, CONFIG.telemetry_subdir)


def _format_run_telemetry_path(
Expand Down
8 changes: 5 additions & 3 deletions smartsim/_core/entrypoints/telemetrymonitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher
from smartsim._core.launcher.stepInfo import StepInfo
from smartsim._core.utils.helpers import get_ts
from smartsim._core.utils.serialize import MANIFEST_FILENAME, TELMON_SUBDIR
from smartsim._core.utils.serialize import MANIFEST_FILENAME
from smartsim.error.errors import SmartSimError
from smartsim.status import STATUS_COMPLETED, TERMINAL_STATUSES

Expand Down Expand Up @@ -582,7 +582,7 @@ def main(
poll for new jobs before attempting to shutdown
:type cooldown_duration: int
"""
manifest_relpath = pathlib.Path(TELMON_SUBDIR) / MANIFEST_FILENAME
manifest_relpath = pathlib.Path(CONFIG.telemetry_subdir) / MANIFEST_FILENAME
manifest_path = experiment_dir / manifest_relpath
monitor_pattern = str(manifest_relpath)

Expand Down Expand Up @@ -667,7 +667,9 @@ def get_parser() -> argparse.ArgumentParser:
log.setLevel(logging.DEBUG)
log.propagate = False

log_path = os.path.join(args.exp_dir, TELMON_SUBDIR, "telemetrymonitor.log")
log_path = os.path.join(
args.exp_dir, CONFIG.telemetry_subdir, "telemetrymonitor.log"
)
fh = logging.FileHandler(log_path, "a")
log.addHandler(fh)

Expand Down
6 changes: 3 additions & 3 deletions smartsim/_core/launcher/taskManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@
import time
import typing as t
from subprocess import PIPE
from threading import RLock, Thread
from threading import RLock

import psutil

from ...error import LauncherError
from ...log import get_logger
from ...log import ContextThread, get_logger
from ..utils.helpers import check_dev_log_level
from .util.shell import execute_async_cmd, execute_cmd

Expand Down Expand Up @@ -74,7 +74,7 @@ def start(self) -> None:
The TaskManager is run as a daemon thread meaning
that it will die when the main thread dies.
"""
monitor = Thread(name="TaskManager", daemon=True, target=self.run)
monitor = ContextThread(name="TaskManager", daemon=True, target=self.run)
monitor.start()

def run(self) -> None:
Expand Down
5 changes: 4 additions & 1 deletion smartsim/_core/utils/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
TStepLaunchMetaData = t.Tuple[
t.Optional[str], t.Optional[str], t.Optional[bool], str, str, Path
]
TELMON_SUBDIR: t.Final[str] = ".smartsim/telemetry"

MANIFEST_FILENAME: t.Final[str] = "manifest.json"

_LOGGER = smartsim.log.get_logger(__name__)
Expand All @@ -58,6 +58,7 @@
return

manifest.metadata.run_telemetry_subdirectory.mkdir(parents=True, exist_ok=True)
exp_out, exp_err = smartsim.log.get_exp_log_paths()

Check warning on line 61 in smartsim/_core/utils/serialize.py

View check run for this annotation

Codecov / codecov/patch

smartsim/_core/utils/serialize.py#L61

Added line #L61 was not covered by tests

new_run = {
"run_id": manifest.metadata.run_id,
Expand Down Expand Up @@ -87,6 +88,8 @@
"name": manifest.metadata.exp_name,
"path": manifest.metadata.exp_path,
"launcher": manifest.metadata.launcher_name,
"out_file": str(exp_out),
"err_file": str(exp_err),
},
"runs": [new_run],
}
Expand Down
32 changes: 27 additions & 5 deletions smartsim/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,23 @@
from .database import Orchestrator
from .entity import Ensemble, Model, SmartSimEntity
from .error import SmartSimError
from .log import get_logger
from .log import ctx_exp_path, get_logger, method_contextualizer
from .settings import Container, base, settings
from .wlm import detect_launcher

logger = get_logger(__name__)


def _exp_path_map(exp: "Experiment") -> str:
"""Mapping function for use by method contextualizer to place the path of
the currently-executing experiment into context for log enrichment"""
return exp.exp_path


_contextualize = method_contextualizer(ctx_exp_path, _exp_path_map)


# pylint: disable=no-self-use
class Experiment:
"""Experiments are the Python user interface for SmartSim.

Expand Down Expand Up @@ -123,7 +133,7 @@ def __init__(
if not osp.isdir(osp.abspath(exp_path)):
raise NotADirectoryError("Experiment path provided does not exist")
exp_path = osp.abspath(exp_path)
self.exp_path = init_default(osp.join(getcwd(), name), exp_path, str)
self.exp_path: str = init_default(osp.join(getcwd(), name), exp_path, str)
ankona marked this conversation as resolved.
Show resolved Hide resolved

if launcher == "auto":
launcher = detect_launcher()
Expand All @@ -132,6 +142,7 @@ def __init__(
self._launcher = launcher.lower()
self.db_identifiers: t.Set[str] = set()

@_contextualize
def start(
self,
*args: t.Any,
Expand Down Expand Up @@ -205,6 +216,7 @@ def start(
logger.error(e)
raise

@_contextualize
def stop(self, *args: t.Any) -> None:
"""Stop specific instances launched by this ``Experiment``

Expand Down Expand Up @@ -241,6 +253,7 @@ def stop(self, *args: t.Any) -> None:
logger.error(e)
raise

@_contextualize
def generate(
self,
*args: t.Any,
Expand Down Expand Up @@ -278,6 +291,7 @@ def generate(
logger.error(e)
raise

@_contextualize
def poll(
self, interval: int = 10, verbose: bool = True, kill_on_interrupt: bool = True
) -> None:
Expand Down Expand Up @@ -321,6 +335,7 @@ def poll(
logger.error(e)
raise

@_contextualize
def finished(self, entity: SmartSimEntity) -> bool:
"""Query if a job has completed.

Expand All @@ -344,6 +359,7 @@ def finished(self, entity: SmartSimEntity) -> bool:
logger.error(e)
raise

@_contextualize
def get_status(self, *args: t.Any) -> t.List[str]:
"""Query the status of launched instances

Expand Down Expand Up @@ -382,8 +398,9 @@ def get_status(self, *args: t.Any) -> t.List[str]:
logger.error(e)
raise

@staticmethod
@_contextualize
def create_ensemble(
self,
name: str,
params: t.Optional[t.Dict[str, t.Any]] = None,
batch_settings: t.Optional[base.BatchSettings] = None,
Expand Down Expand Up @@ -456,8 +473,9 @@ def create_ensemble(
logger.error(e)
raise

@staticmethod
@_contextualize
def create_model(
self,
name: str,
run_settings: base.RunSettings,
params: t.Optional[t.Dict[str, t.Any]] = None,
Expand Down Expand Up @@ -553,7 +571,6 @@ def create_model(
"""
path = init_default(getcwd(), path, str)

# mcb
if path is None:
path = getcwd()
if params is None:
Expand All @@ -570,6 +587,7 @@ def create_model(
logger.error(e)
raise

@_contextualize
def create_run_settings(
self,
exe: str,
Expand Down Expand Up @@ -634,6 +652,7 @@ class in SmartSim. If found, the class corresponding
logger.error(e)
raise

@_contextualize
def create_batch_settings(
self,
nodes: int = 1,
Expand Down Expand Up @@ -694,6 +713,7 @@ def create_batch_settings(
logger.error(e)
raise

@_contextualize
def create_database(
self,
port: int = 6379,
Expand Down Expand Up @@ -777,6 +797,7 @@ def create_database(
**kwargs,
)

@_contextualize
def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator:
"""Reconnect to a running ``Orchestrator``

Expand All @@ -797,6 +818,7 @@ def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator:
logger.error(e)
raise

@_contextualize
def summary(self, style: str = "github") -> str:
"""Return a summary of the ``Experiment``

Expand Down
Loading
Loading