Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
98 commits
Select commit Hold shift + click to select a range
53ba389
updated
nnshah1 Jun 6, 2025
8115584
updating basic tests with mock
nnshah1 Jun 9, 2025
eb9afbc
incremental to avoid losing work - still wip
nnshah1 Jun 11, 2025
0e74f7b
removing some
nnshah1 Jun 12, 2025
e9a765f
updated
nnshah1 Jun 12, 2025
e5e29cb
updated
Jun 13, 2025
a37735a
updating
Jun 13, 2025
5a27d49
updated
Jun 13, 2025
87ac3d3
Merge branch 'main' into neelays/fault_tolerance_tests
Jun 13, 2025
890a57f
update
Jun 13, 2025
dd60609
remove seed - causing crash
Jun 13, 2025
0fffe73
Merge branch 'main' into neelays/fault_tolerance_tests
Jun 13, 2025
d9fa9c7
deleting unneeded
Jun 13, 2025
65751ea
deleted unneeded
Jun 13, 2025
683b8ce
updating
Jun 13, 2025
61298bb
updated
Jun 13, 2025
679de27
updating
nnshah1 Jun 13, 2025
cf56b02
Merge branch 'neelays/fault_tolerance_tests' of https://github.com/ai…
nnshah1 Jun 13, 2025
ce7c25a
updated
Jun 13, 2025
35ad706
adding additional configs as examples
Jun 14, 2025
2c663be
update with metrics
Jun 14, 2025
3404b70
updated
Jun 16, 2025
24eabbe
updated
Jun 16, 2025
08b7599
updated
Jun 16, 2025
7786186
fixing names
nnshah1 Jun 16, 2025
776f2e6
updating test
nnshah1 Jun 16, 2025
f6a682f
updated
nnshah1 Jun 16, 2025
c65ede8
updated
nnshah1 Jun 16, 2025
33e2ba3
updated
Jun 16, 2025
29b835a
updated
nnshah1 Jun 18, 2025
aca8021
Merge remote-tracking branch 'origin/neelays/fault_tolerance_tests' i…
Jun 18, 2025
00d3435
Merge branch 'main' into neelays/fault_tolerance_tests
Jun 18, 2025
2f7b8ed
updating based on precommit
nnshah1 Jun 18, 2025
2735cdd
Merge branch 'neelays/fault_tolerance_tests' of https://github.com/ai…
nnshah1 Jun 18, 2025
814d54f
Merge remote-tracking branch 'origin/neelays/fault_tolerance_tests' i…
Jun 18, 2025
9b8975e
updating
Jun 19, 2025
05207e2
fix for issue with un recognized failure process name
Jun 20, 2025
08705d4
updating
Jun 20, 2025
8ce8260
updates
nnshah1 Jun 20, 2025
66e8378
updated
nnshah1 Jun 24, 2025
e9e5c8d
Merge branch 'neelays/fault_tolerance_tests' of https://github.com/ai…
nnshah1 Jun 24, 2025
fe99543
updated
nnshah1 Jun 24, 2025
fbcb813
updated
nnshah1 Jun 24, 2025
1f89855
updating
Jun 24, 2025
fd21454
updated
Jun 24, 2025
65a3eef
fix typo
Jun 24, 2025
98c03b2
updated
Jun 25, 2025
b17cd64
updated
Jun 25, 2025
2bfe11f
updates
Jun 25, 2025
bd3bc03
updated
Jun 25, 2025
c34c6f2
updates
Jun 25, 2025
a89eff9
updated
Jun 25, 2025
2f11e43
fixes
Jun 26, 2025
affb10d
fixes and add sla violations
Jun 26, 2025
b029acf
updated
nnshah1 Jun 29, 2025
3eec254
updated
nnshah1 Jun 29, 2025
eb5c3e0
updated
nnshah1 Jun 29, 2025
389150b
updated
nnshah1 Jun 29, 2025
ed67921
updatd
nnshah1 Jun 29, 2025
12d826e
updated
nnshah1 Jun 29, 2025
c24549c
updated
nnshah1 Jun 29, 2025
072ecd3
updated
nnshah1 Jun 29, 2025
63895b2
updated
nnshah1 Jun 29, 2025
c1999e2
updated
nnshah1 Jun 29, 2025
16c1bd9
updated
nnshah1 Jun 29, 2025
4573452
updated
nnshah1 Jun 29, 2025
1b8dcb5
updated
nnshah1 Jun 29, 2025
46de42f
updated
nnshah1 Jun 30, 2025
426c98b
updated
nnshah1 Jun 30, 2025
01d7aed
updated
nnshah1 Jun 30, 2025
8eeb8d0
updated
nnshah1 Jun 30, 2025
e8c572f
updated
nnshah1 Jun 30, 2025
85b26c8
updated
nnshah1 Jun 30, 2025
caf3868
updated
nnshah1 Jun 30, 2025
94246e5
updates
nnshah1 Jun 30, 2025
3db4498
Merge remote-tracking branch 'origin/main' into neelays/fault_toleran…
nnshah1 Jun 30, 2025
775fe92
updates for pre commit
nnshah1 Jun 30, 2025
6f651cd
revert
nnshah1 Jun 30, 2025
25723e5
revert
nnshah1 Jul 1, 2025
145d636
updated
nnshah1 Jul 1, 2025
3c6c0fb
updated
nnshah1 Jul 1, 2025
03bd63a
Update deploy/sdk/src/dynamo/sdk/cli/circus.py
nnshah1 Jul 1, 2025
3a87ec6
Update deploy/sdk/src/dynamo/sdk/cli/circus.py
nnshah1 Jul 1, 2025
f8e99e8
refactoring
nnshah1 Jul 1, 2025
29c7787
Merge branch 'neelays/fault_tolerance_tests' of https://github.com/ai…
nnshah1 Jul 1, 2025
ac9ffba
updating with linting exceptions for fixtures
nnshah1 Jul 1, 2025
6f8595b
updating
nnshah1 Jul 1, 2025
1f93177
log failures
nnshah1 Jul 1, 2025
238f195
updated
nnshah1 Jul 1, 2025
b74a65f
updating
nnshah1 Jul 1, 2025
ab52b90
updating copyrights
nnshah1 Jul 1, 2025
3fec6d8
updated
nnshah1 Jul 1, 2025
98516d5
updating for mypy
nnshah1 Jul 1, 2025
386770f
moved list initialization
nnshah1 Jul 1, 2025
b96d683
updated for code rabbit
nnshah1 Jul 2, 2025
800918a
updated
nnshah1 Jul 2, 2025
67844e8
updating table
nnshah1 Jul 3, 2025
33edcab
updated
nnshah1 Jul 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion deploy/sdk/src/dynamo/sdk/cli/circus.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,23 @@ def create_circus_watcher(
use_sockets: bool = True,
**kwargs: Any,
) -> Watcher:
log_dir = os.environ.get("DYN_CIRCUS_LOG_DIR", None)
if log_dir is not None:
prefix = f"{log_dir}/{name}"
os.makedirs(prefix, exist_ok=True)
stdout_stream = {
"class": "FileStream",
"filename": f"{prefix}/output.log",
"backup_count": 10,
}
stderr_stream = {
"class": "FileStream",
"filename": f"{prefix}/error.log",
"backup_count": 10,
}
else:
stdout_stream = None
stderr_stream = None
return Watcher(
name=name,
cmd=shlex.quote(cmd) if psutil.POSIX else cmd,
Expand All @@ -94,7 +111,10 @@ def create_circus_watcher(
stop_children=True,
use_sockets=use_sockets,
graceful_timeout=86400,
respawn=False, # TODO
respawn=os.environ.get("DYN_CIRCUS_RESPAWN", "false").lower()
in ("true", "1", "yes"),
stdout_stream=stdout_stream,
stderr_stream=stderr_stream,
**kwargs,
)

Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@ markers = [
"weekly: marks tests to run weekly",
"gpu_1: marks tests to run on GPU",
"gpu_2: marks tests to run on 2GPUs",
"gpu_4: marks tests to run on 4GPUs",
"gpu_8: marks tests to run on 8GPUs",
"e2e: marks tests as end-to-end tests",
"integration: marks tests as integration tests",
"unit: marks tests as unit tests",
Expand Down
21 changes: 19 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import logging
import os
import shutil
import tempfile

import pytest
Expand All @@ -23,15 +24,31 @@

# Layout of every log line emitted during a test run; the "[TEST]" prefix
# distinguishes harness output from the output of the processes under test.
LOG_FORMAT = "[TEST] %(asctime)s %(levelname)s %(name)s: %(message)s"
# ISO 8601-style timestamp without a zone suffix. The logging module renders
# it in local time unless the formatter's converter is overridden, so it is
# not guaranteed to be UTC.
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"

# Configure the root logger once at import time so all test modules share the
# same level and line format.
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=DATE_FORMAT,
)


@pytest.fixture(autouse=True)
def logger(request):
    """Capture root-logger output of each test in ``<test name>/test.log.txt``.

    Runs automatically for every test. Before the test starts, any existing
    directory named after the test node is deleted and recreated (the path is
    resolved against the current working directory when the name is
    relative), and a file handler using ``LOG_FORMAT`` / ``DATE_FORMAT`` is
    attached to the root logger. After the test, the handler is detached and
    closed.

    Args:
        request: The pytest ``request`` fixture; only ``request.node.name``
            is read, and it doubles as the log directory name.

    Yields:
        None. The fixture exists purely for its logging side effect.
    """
    log_dir = request.node.name

    # Start from an empty directory so files left by a previous run of the
    # same test are not mixed with this run's output.
    shutil.rmtree(log_dir, ignore_errors=True)
    os.makedirs(log_dir, exist_ok=True)

    handler = logging.FileHandler(os.path.join(log_dir, "test.log.txt"), mode="w")
    handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT))

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    try:
        yield
    finally:
        # Detach first, then close: a record logged between the two steps
        # must not be routed to a handler whose stream is already closed.
        root_logger.removeHandler(handler)
        handler.close()


def pytest_collection_modifyitems(config, items):
"""
This function is called to modify the list of tests to run.
Expand Down Expand Up @@ -69,7 +86,7 @@ def __init__(self, request, port=2379, timeout=300):
timeout=timeout,
display_output=False,
health_check_ports=[port],
data_dir=tempfile.mkdtemp(prefix="etcd_"),
data_dir=data_dir,
log_dir=request.node.name,
)

Expand Down
Loading
Loading