Changes from all commits

Commits

181 commits
7f876e8
using session instead of catalog in udfstep
ilongin Oct 20, 2025
a5c4572
refactoring job creation in datachain
ilongin Oct 20, 2025
70a44a6
implementing first phase of UDF checkpoints
ilongin Oct 22, 2025
f4c848b
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Oct 22, 2025
d8a337a
refactoring
ilongin Oct 22, 2025
d7b5ed9
changing udf table names
ilongin Oct 23, 2025
8752a9a
adding checkpoint tests and fixing cleaning udf tables in test
ilongin Oct 23, 2025
862fe28
added udf checkpoint continue from partial results
ilongin Oct 26, 2025
b599429
added udf generator logic and tests
ilongin Oct 27, 2025
3804b0c
merging with main
ilongin Oct 27, 2025
7c05e0d
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Oct 27, 2025
20346e7
fixing logic
ilongin Oct 27, 2025
e0242c9
merging with main
ilongin Oct 28, 2025
8fd41af
fixing issues and tests
ilongin Oct 28, 2025
630e37b
refactoring tests
ilongin Oct 28, 2025
b31d44a
refactoring
ilongin Oct 28, 2025
b5bb8cd
refactoring
ilongin Oct 29, 2025
a5f0fcd
refactoring
ilongin Oct 29, 2025
92590f7
refactoring udf table ownership logic
ilongin Oct 30, 2025
3c2211d
refactoring
ilongin Oct 30, 2025
181ea2e
refactoring tests
ilongin Oct 30, 2025
88c2648
fixing cast of recursive sql
ilongin Oct 30, 2025
c0f46cb
using has_table instead checking metadata
ilongin Oct 31, 2025
14e473b
fixing tests
ilongin Nov 4, 2025
d68d746
fixing cleaning table and partition by
ilongin Nov 6, 2025
8e0339f
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Nov 6, 2025
08c9ec4
fixing test
ilongin Nov 6, 2025
bb50da7
fixing aggregator
ilongin Nov 6, 2025
bd7d978
fixing hash collision
ilongin Nov 7, 2025
4ddee8f
merging with main
ilongin Nov 7, 2025
5b80a87
refactoring and removing processed table
ilongin Nov 7, 2025
9a6c71f
fixing tests
ilongin Nov 7, 2025
a2d6b34
fixing tests
ilongin Nov 7, 2025
3621cde
returning
ilongin Nov 7, 2025
437b63c
updated coverage
ilongin Nov 7, 2025
76125d0
removed coverate sysmon
ilongin Nov 7, 2025
c644aa1
refactoring checkpoint cleaning
ilongin Nov 7, 2025
5f6f183
Remove cleanup_checkpoints functionality for separate PR
ilongin Nov 9, 2025
709873c
fixing tests
ilongin Nov 9, 2025
e180338
fixing tests
ilongin Nov 9, 2025
aaf43f9
added udf checkpoint docs
ilongin Nov 10, 2025
1488fab
refactoring
ilongin Nov 10, 2025
d7f3a50
fixing tests
ilongin Nov 10, 2025
8a2ec11
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Nov 10, 2025
4379d11
fix creating processed table even in reset mode
ilongin Nov 10, 2025
27033e5
added tests
ilongin Nov 11, 2025
d73d55d
refactoring processed tracking for generators
ilongin Nov 12, 2025
b15f1c9
refactoring tests
ilongin Nov 12, 2025
fd5019e
refactoring create_table method
ilongin Nov 12, 2025
e4e6de9
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Nov 13, 2025
1e7f941
fix re-run when UDF output changes
ilongin Nov 13, 2025
6a5c140
Update src/datachain/cli/commands/misc.py
ilongin Nov 13, 2025
f914b7f
fixing docs and some code parts
ilongin Nov 13, 2025
91aa89c
refactoring
ilongin Nov 13, 2025
452ae72
returning sysmon
ilongin Nov 14, 2025
2375237
renaming create_checkpoint method
ilongin Nov 14, 2025
55b3846
simplified logic
ilongin Nov 14, 2025
87a51f3
removing batch_callback
ilongin Nov 16, 2025
43835f5
merging with main
ilongin Nov 17, 2025
911a3dc
refactoring
ilongin Nov 19, 2025
ad2907d
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Nov 19, 2025
0b3e092
removing tracking_fiedl
ilongin Nov 19, 2025
7bbe619
fixing ancestor job id find
ilongin Nov 20, 2025
fa5053b
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Nov 20, 2025
140e56b
refactor remove_checkpoint to accept only id
ilongin Nov 21, 2025
54c4493
removed comment
ilongin Nov 21, 2025
e9f48f5
refactoring creating table
ilongin Nov 21, 2025
38fa81d
refactoring
ilongin Nov 21, 2025
8ab52ae
updated docs by removing parent verb
ilongin Nov 21, 2025
ced14b4
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Nov 23, 2025
0f88905
adding staging sufix for table atomicity when doing copy
ilongin Nov 23, 2025
6f7e06f
break parent connection when reset flag is present
ilongin Nov 24, 2025
af784f2
fixing docs
ilongin Nov 24, 2025
7703eb5
fixing docs and other small fixes
ilongin Nov 24, 2025
4a7b4ca
fixing docs and other small fixes
ilongin Nov 24, 2025
a95a764
fixing comments
ilongin Nov 24, 2025
8876a3b
discarding changes with garabage collecting method of cli
ilongin Nov 24, 2025
13f3552
moving list_tables function to tests util
ilongin Nov 24, 2025
157a437
unifying prepare_row functions
ilongin Nov 24, 2025
c9d3bb0
adding hash_input and hash_output as default args in apply method of …
ilongin Nov 24, 2025
2227fe1
renaming sys_id to sys__processed_id
ilongin Nov 24, 2025
f459d60
removed not needed quote_schema from sqlite in removing tables for test
ilongin Nov 24, 2025
f93b49d
fixing issue with incomplete inputs in generator
ilongin Dec 10, 2025
a2a9a98
merging with main
ilongin Dec 10, 2025
631cdb3
merging with main
ilongin Dec 10, 2025
49e7641
added docs
ilongin Dec 10, 2025
f50058f
reorganizing tests
ilongin Dec 10, 2025
d11fb5d
var renaming
ilongin Dec 10, 2025
96f9de9
added regression test for subtract
ilongin Dec 10, 2025
ec98372
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Dec 13, 2025
c77858a
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Dec 17, 2025
9a51f9c
make hash_callable not fail if unexpected callalbe is input
ilongin Dec 17, 2025
298bcf3
disable checkpoints in threading / multiprocess
Dec 17, 2025
aa11f80
added custom migration function for checkpoints
ilongin Dec 18, 2025
ee464fb
Merge branch 'ilongin/1392-udf-checkpoints' of github.com:datachain-a…
ilongin Dec 18, 2025
e2ab50b
renaming checkpointstable and removing not needed migration function
Dec 19, 2025
9cb16c7
Merge branch 'main' into ilongin/1392-udf-checkpoints
Dec 19, 2025
3685dca
fixing non determinisitc tests for CH
ilongin Dec 19, 2025
eba46b5
fixing bug with continuing udf processing
ilongin Dec 20, 2025
e58d742
fixing test
ilongin Dec 20, 2025
1feac6d
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Dec 21, 2025
13c6aa0
fixing docs
ilongin Dec 21, 2025
a878df6
removed not needde comments
ilongin Dec 22, 2025
c61a13b
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Dec 29, 2025
d88d68a
removed not needed flag
ilongin Dec 29, 2025
abccfbc
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Dec 31, 2025
809e9a3
removed not needed env var
ilongin Dec 31, 2025
ab6799f
renamed env var
ilongin Dec 31, 2025
fa00047
reduced number of parallel
ilongin Dec 31, 2025
da8fd5b
added envs to env docs
ilongin Dec 31, 2025
115ea69
moved function to check concurrency for checkpoints from session to u…
ilongin Dec 31, 2025
334f5fb
removed comment
ilongin Dec 31, 2025
cb18ee4
fixing correct parent id
ilongin Jan 1, 2026
d1f83f1
moving check if checkpoint is enabled because of concurency from meta…
ilongin Jan 2, 2026
79655e7
removed partial constraint
ilongin Jan 2, 2026
b93f328
removing test
ilongin Jan 2, 2026
4352423
refactoring test
ilongin Jan 2, 2026
26f1eef
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Jan 7, 2026
985415d
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Jan 8, 2026
31fe6dd
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Jan 8, 2026
f117751
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Jan 12, 2026
15751ba
returning old checkpoints table name
ilongin Jan 12, 2026
85305b0
refactoring input table name hash
ilongin Jan 13, 2026
cafeadf
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Jan 14, 2026
a022bc1
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Jan 15, 2026
27781df
using group id for input table name in udf
ilongin Jan 15, 2026
05cf600
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Jan 19, 2026
af0dc7f
using pid and thread ownership to determine if checkpoints are enable…
ilongin Jan 19, 2026
b108dc8
fixing test
ilongin Jan 19, 2026
24c3894
refactoring tests
ilongin Jan 19, 2026
eb0e03a
refactoring tests
ilongin Jan 19, 2026
9d93aad
removing not needed conditions
ilongin Jan 21, 2026
e8ec502
refactoring
ilongin Jan 21, 2026
3bcfa18
fixing comment
ilongin Jan 21, 2026
4dd9cd4
refactoring
ilongin Jan 21, 2026
8fdfea2
merging with main
ilongin Jan 22, 2026
34402f4
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Jan 22, 2026
8ea8b12
fixing race condition
ilongin Jan 22, 2026
acf79c8
adde safe_copy_table
ilongin Jan 22, 2026
585d685
refactoring copy_table methods
ilongin Jan 23, 2026
adb828e
continuing UDF if parent partial table is not found
ilongin Jan 23, 2026
797b6cd
added try/catch of missing table
ilongin Jan 23, 2026
505f304
refactor transaction context usage
ilongin Jan 23, 2026
d702b21
optimized query
ilongin Jan 23, 2026
8df4905
added thread lock
ilongin Jan 23, 2026
659cc1c
updated docs with hashing limitations
ilongin Jan 23, 2026
b90a9d6
renaming function
ilongin Jan 23, 2026
3431c10
removed unrelated lint exception
ilongin Jan 26, 2026
19093a3
refactoring checkpoint tests
ilongin Jan 26, 2026
6803345
fixing env vars and verbose comments
ilongin Jan 26, 2026
8c57340
ading runtime error
ilongin Jan 26, 2026
d25b5af
refactoring
ilongin Jan 26, 2026
7a44193
removing name and job_aware to hash method of DataChain
ilongin Jan 26, 2026
e267deb
refactoring
ilongin Jan 26, 2026
bee6e0f
merging with main
ilongin Jan 26, 2026
0144c1a
refactoring
ilongin Jan 27, 2026
c457bf9
refactoring
ilongin Jan 27, 2026
15650eb
added logs
ilongin Jan 27, 2026
495f189
fixing env vars
ilongin Jan 27, 2026
8e6b2e5
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Jan 28, 2026
56c6b78
refactoring tests
ilongin Jan 28, 2026
8b16be4
removing not neededd monkeypatch
ilongin Jan 28, 2026
88048de
merging with main
ilongin Jan 30, 2026
e903715
added more tests
ilongin Jan 30, 2026
79c94f2
closing sqlite connections in test
ilongin Jan 30, 2026
a83fafe
moving get_table to db specific implementation
ilongin Jan 30, 2026
1d77ac2
return get_table to db_engine
ilongin Jan 30, 2026
0de0d3f
added job_id to hash
ilongin Feb 2, 2026
8b8a8d3
improved logging
ilongin Feb 3, 2026
0f61ea7
Added `CheckpointEvent` model to track checkpoint events (#1575)
ilongin Feb 4, 2026
25def9c
added prints
ilongin Feb 6, 2026
5a70e41
added print only when it is second job
ilongin Feb 9, 2026
12a771c
removed not used var
ilongin Feb 9, 2026
41847c9
removed print
ilongin Feb 9, 2026
85e0c42
Merge branch 'main' into ilongin/1392-udf-checkpoints
ilongin Feb 10, 2026
3e6601c
fixing reading files on udf continue
ilongin Feb 10, 2026
a9358d1
UDF checkpoint visibility (#1576)
ilongin Feb 12, 2026
22ebd7d
merged with main
ilongin Feb 12, 2026
1a04a7c
refactoring checkpoint events
ilongin Feb 12, 2026
5998b8e
fixing lint
ilongin Feb 12, 2026
e83884b
adding missing tests and fixing issues
ilongin Feb 14, 2026
183 changes: 150 additions & 33 deletions docs/guide/checkpoints.md
@@ -11,10 +11,10 @@ Checkpoints are available for both local script runs and Studio executions.
When you run a Python script locally (e.g., `python my_script.py`), DataChain automatically:

1. **Creates a job** for the script execution, using the script's absolute path as the job name
2. **Tracks previous runs** by finding the last job with the same script name
3. **Calculates hashes** for each dataset save operation based on the DataChain operations chain
4. **Creates checkpoints** after each successful `.save()` call, storing the hash
5. **Checks for existing checkpoints** on subsequent runs - if a matching checkpoint exists from the previous run, DataChain skips the save and reuses the existing dataset

This means that if your script creates multiple datasets and fails partway through, the next run will skip recreating the datasets that were already successfully saved.

@@ -37,7 +37,7 @@ When triggering jobs through the Studio interface:
2. **Checkpoint control** is explicit - you choose between:
- **Run from scratch**: Ignores any existing checkpoints and recreates all datasets
- **Continue from last checkpoint**: Resumes from the last successful checkpoint, skipping already-completed stages
3. **Job linking between runs** is handled automatically by the system
4. **Checkpoint behavior** during execution is the same as local runs: datasets are saved at each `.save()` call and can be reused on retry


@@ -75,7 +75,7 @@ result = (

**First run:** The script executes all three stages and creates three datasets: `filtered_data`, `transformed_data`, and `final_results`. If the script fails during Stage 3, only `filtered_data` and `transformed_data` are saved.

**Second run:** DataChain detects that `filtered_data` and `transformed_data` were already created in the previous run with matching hashes. It skips recreating them and proceeds directly to Stage 3, creating only `final_results`.

## When Checkpoints Are Used

@@ -84,27 +84,20 @@ Checkpoints are automatically used when:
- Running a Python script locally (e.g., `python my_script.py`)
- The script has been run before
- A dataset with the same name is being saved
- The chain hash matches a checkpoint from the previous run

Checkpoints are **not** used when:

- Running code interactively (Python REPL, Jupyter notebooks)
- Running code as a module (e.g., `python -m mymodule`)
- The `DATACHAIN_IGNORE_CHECKPOINTS` environment variable is set (see below)

## Resetting Checkpoints

To ignore existing checkpoints and run your script from scratch, set the `DATACHAIN_IGNORE_CHECKPOINTS` environment variable:

```bash
DATACHAIN_IGNORE_CHECKPOINTS=1 python my_script.py
```

This forces DataChain to recreate all datasets, regardless of existing checkpoints.
@@ -121,7 +114,7 @@ When running `python my_script.py`, DataChain uses the **absolute path** to the
/home/user/projects/my_script.py
```

This allows DataChain to link runs of the same script together, enabling checkpoint lookup across runs.

### Interactive or Module Execution (Checkpoints Disabled)

@@ -140,7 +133,7 @@ For each `.save()` operation, DataChain calculates a hash based on:
1. The hash of the previous checkpoint in the current job (if any)
2. The hash of the current DataChain operations chain

This creates a chain of hashes that uniquely identifies each stage of data processing. On subsequent runs, DataChain matches these hashes against checkpoints from the previous run and skips recreating datasets where the hashes match.
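
The chaining can be pictured with a toy sketch. This is not DataChain's actual hashing, just the idea of folding each stage's hash into the next; the chain representations are illustrative strings:

```python
import hashlib


def chain_hash(prev_hash: str | None, chain_repr: str) -> str:
    # Combine the previous checkpoint's hash (if any) with a
    # representation of the current operations chain.
    h = hashlib.sha256()
    if prev_hash:
        h.update(prev_hash.encode())
    h.update(chain_repr.encode())
    return h.hexdigest()


# Stage hashes form a chain: changing stage 1 changes every later hash too.
h1 = chain_hash(None, "read_csv('data.csv') | filter(score > 0.5) | save('filtered_data')")
h2 = chain_hash(h1, "map(transform) | save('transformed_data')")
```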

### Hash Invalidation

@@ -198,29 +191,153 @@ for ds in dc.datasets():
print(ds.name)
```

## UDF-Level Checkpoints

In addition to dataset-level checkpointing via `.save()`, DataChain automatically creates checkpoints for individual UDFs (`.map()`, `.gen()`, `.agg()`) during execution.

**Two levels of checkpointing:**
- **Dataset checkpoints** (via `.save()`): When you explicitly save a dataset, it's persisted and can be used in other scripts. If you re-run the same chain with unchanged code, DataChain skips recreation and reuses the saved dataset.
- **UDF checkpoints** (automatic): Each UDF execution is automatically checkpointed. If a UDF completes successfully, it's skipped entirely on re-run (if code unchanged). If a UDF fails mid-execution, only the unprocessed rows are recomputed on re-run.

**Key differences:**
- `.save()` creates a named dataset that persists even if your script fails later, and can be used in other scripts
- UDF checkpoints are automatic and internal - they optimize execution within a single script by skipping or resuming UDFs
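
A minimal sketch contrasting the two levels (the dataset names, function, and signal name are illustrative):

```python
import datachain as dc


def score(text: str) -> float:
    # stand-in for an expensive per-row computation
    return float(len(text))


(
    dc.read_dataset("docs")
    .map(score=score)     # UDF checkpoint: automatic, resumable if the run fails
    .save("docs_scored")  # dataset checkpoint: explicit, reusable across scripts
)
```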

For `.map()` and `.gen()`, **DataChain saves processed rows continuously during UDF execution**. This means:
- If a UDF **completes successfully**, a checkpoint is created and the entire UDF is skipped on re-run (unless code changes)
- If a UDF **fails mid-execution**, the next run continues from where it left off, skipping already-processed rows - even if you've modified the UDF code to fix a bug

**Note:** For `.agg()`, checkpoints are created when the aggregation completes successfully, but partial results are not tracked. If an aggregation fails partway through, it will restart from scratch on the next run.

### How It Works

When executing `.map()` or `.gen()`, DataChain:

1. **Saves processed rows incrementally** as the UDF processes your dataset
2. **Creates a checkpoint** when the UDF completes successfully
3. **Allows you to fix bugs and continue** - if the UDF fails, you can modify the code and re-run, skipping already-processed rows
4. **Invalidates the checkpoint if you change the UDF after successful completion** - completed UDFs are recomputed from scratch if the code changes

For `.agg()`, checkpoints are only created upon successful completion, without incremental progress tracking.

### Example: Fixing a Bug Mid-Execution

```python
# Future behavior with UDF-level checkpoints
result = (
dc.read_csv("data.csv")
.map(heavy_computation_1) # Checkpoint created after this UDF
.map(heavy_computation_2) # Checkpoint created after this UDF
.map(heavy_computation_3) # Checkpoint created after this UDF
.save("result")

def process_image(file: File) -> int:
# Bug: this will fail on some images
img = Image.open(file.get_local_path())
return img.size[0]

(
dc.read_dataset("images")
.map(width=process_image)
.save("image_dimensions")
)
```

**First run:** Script processes 50% of images successfully, then fails on a corrupted image.

**After fixing the bug:**

```python
from datachain import File
from PIL import Image


def process_image(file: File) -> int:
    # Fixed: handle corrupted images gracefully
    try:
        img = Image.open(file.get_local_path())
        return img.size[0]
    except Exception:
        return 0
```

**Second run:** DataChain automatically skips the 50% of images that were already processed successfully, and continues processing the remaining images using the fixed code. You don't lose any progress from the first run.

### When UDF Checkpoints Are Invalidated

DataChain distinguishes between two types of UDF changes:

#### 1. Code-Only Changes (Bug Fixes) - Continues from Partial Results

When you fix a bug in your UDF code **without changing the output type**, DataChain allows you to continue from where the UDF failed. This is the key benefit of UDF-level checkpoints - you don't lose progress when fixing bugs.

**Example: Bug fix without output change**
```python
# First run - fails partway through
def process(num: int) -> int:
    if num > 100:
        raise Exception("Bug!")  # Oops, a bug!
    return num * 10


# Second run - continues from where it failed
def process(num: int) -> int:
    return num * 10  # Bug fixed! ✓ Continues from partial results
```

In this case, DataChain will skip already-processed rows and continue processing the remaining rows with your fixed code.

#### 2. Output Schema Changes - Forces Re-run from Scratch

When you change the **output type** of your UDF, DataChain automatically detects this and reruns the entire UDF from scratch. This prevents schema mismatches that would cause errors or corrupt data.

**Example: Output change**
```python
# First run - fails partway through
def process(num: int) -> int:
    if num > 100:
        raise Exception("Bug!")
    return num * 10


# Second run - output type changed
def process(num: int) -> str:
    return f"value_{num * 10}"  # Output type changed! ✗ Reruns from scratch
```

In this case, DataChain detects that the output type changed from `int` to `str` and discards partial results to avoid schema incompatibility. All rows will be reprocessed with the new output.

#### Changes That Invalidate In-Progress UDF Checkpoints

Partial results are automatically discarded when you change:

- **Output type** - Changes to the `output` parameter or return type annotations
- **Operations before the UDF** - Any changes to the data processing chain before the UDF

#### Changes That Invalidate Completed UDF Checkpoints

Once a UDF completes successfully, its checkpoint is tied to the UDF function code. If you modify the function and re-run the script, DataChain will detect the change and recompute the entire UDF from scratch.

Changes that invalidate completed UDF checkpoints:

- **Modifying the UDF function logic** - Any code changes inside the function
- **Changing function parameters or output types** - Changes to input/output specifications
- **Altering any operations before the UDF in the chain** - Changes to upstream data processing

**Key takeaway:** For in-progress (partial) UDFs, you can fix bugs freely as long as the output stays the same. For completed UDFs, any code change triggers a full recomputation.

## Limitations

When running locally:

- **Script-based:** Code must be run as a script (not interactively or as a module).
- **Same script path:** The script must be run from the same absolute path for linking to previous runs to work.
- **Threading/Multiprocessing:** Checkpoints are automatically disabled when Python threading or multiprocessing is detected, to prevent race conditions. Any checkpoints created before threading starts remain valid for future runs. DataChain's built-in `parallel` setting for UDF execution is not affected by this limitation (see the sketch below).

These limitations don't apply when running on Studio, where job linking between runs is handled automatically by the platform.
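
To illustrate the threading point, a minimal sketch (assuming a `process_image` UDF like the one earlier in this guide; `run_my_pipeline` is a hypothetical wrapper, for illustration only):

```python
import datachain as dc

# Parallelism managed by DataChain itself - checkpoints stay enabled:
(
    dc.read_dataset("images")
    .settings(parallel=4)  # DataChain's built-in parallel UDF execution
    .map(width=process_image)
    .save("image_dimensions")
)

# User-managed threading/multiprocessing disables checkpoints for any
# work started afterwards:
# from concurrent.futures import ThreadPoolExecutor
# with ThreadPoolExecutor() as pool:
#     pool.submit(run_my_pipeline)  # checkpoints are off inside this call
```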

### UDF Hashing Limitations

DataChain computes checkpoint hashes by inspecting UDF code and metadata. Certain types of callables cannot be reliably hashed:

- **Built-in functions** (`len`, `str`, `int`, etc.): Cannot access bytecode, so a random hash is generated on each run. Checkpoints using these functions will not be reused.
- **C extensions**: Same limitation as built-ins - no accessible bytecode means a new hash each run.
- **Mock objects**: `Mock(side_effect=...)` cannot be reliably hashed because the side effect is not discoverable via inspection. Use regular functions instead.
- **Dynamically generated callables**: If a callable is created via `exec`/`eval` or its behavior depends on runtime state, the hash reflects only the method's code, not captured state.

To ensure checkpoints work correctly, use regular Python functions defined with `def` or lambda expressions for your UDFs.
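
For instance, a built-in passed directly as a UDF will not produce a stable hash, while a thin wrapper will (a minimal sketch; the dataset and column names are illustrative):

```python
import datachain as dc

# Not reliably hashable - `len` has no inspectable bytecode, so a fresh
# hash is generated each run and the checkpoint is never reused:
# dc.read_dataset("texts").map(length=len, output=int).save("text_lengths")


def text_length(text: str) -> int:
    # Regular function: its bytecode can be inspected, so the hash is
    # stable and the checkpoint can be reused across runs.
    return len(text)


dc.read_dataset("texts").map(length=text_length).save("text_lengths")
```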

## Future Plans

### Partial Result Tracking for Aggregations

Currently, `.agg()` creates checkpoints only upon successful completion, without tracking partial progress. Future versions will extend the same incremental progress tracking that `.map()` and `.gen()` have to aggregations, allowing them to resume from where they failed rather than restarting from scratch.
Comment on lines +339 to +343

**Contributor:** Do we need "Future Plans" in docs? 🤔

**Contributor Author:** Not sure, I thought it was a good idea, but maybe you are right... it could be confusing. @shcheklein, what is your take?

**Contributor:** Both are fine; we need partial progress though (just these days, brainspace is running a 7h-long agg job with ~400 inputs, each taking 30 mins).

**Contributor:** Just double-check we have a checkbox to follow up.

**Contributor:** ping

**Contributor Author:** We already have a draft PR for this, which I will finish when this branch gets merged.

3 changes: 3 additions & 0 deletions docs/guide/env.md
@@ -19,4 +19,7 @@ List of environment variables used to configure DataChain behavior.
- `DATACHAIN_NAMESPACE` – Namespace name to use as default.
- `DATACHAIN_PROJECT` – Project name or combination of namespace name and project name separated by `.` to use as default, example: `DATACHAIN_PROJECT=dev.analytics`

### Checkpoints
- `DATACHAIN_IGNORE_CHECKPOINTS` – When set to `1` or `true`, ignores all existing checkpoints and runs the script from scratch, forcing DataChain to recreate all datasets.

Note: Some environment variables are used internally and may not be documented here. For the most up-to-date list, refer to the source code.
101 changes: 101 additions & 0 deletions src/datachain/checkpoint_event.py
@@ -0,0 +1,101 @@
import uuid
from dataclasses import dataclass
from datetime import datetime
from enum import Enum


class CheckpointEventType(str, Enum):
    """Types of checkpoint events."""

    # UDF events
    UDF_SKIPPED = "UDF_SKIPPED"
    UDF_CONTINUED = "UDF_CONTINUED"
    UDF_FROM_SCRATCH = "UDF_FROM_SCRATCH"

    # Dataset save events
    DATASET_SAVE_SKIPPED = "DATASET_SAVE_SKIPPED"
    DATASET_SAVE_COMPLETED = "DATASET_SAVE_COMPLETED"


class CheckpointStepType(str, Enum):
    """Types of checkpoint steps."""

    UDF_MAP = "UDF_MAP"
    UDF_GEN = "UDF_GEN"
    DATASET_SAVE = "DATASET_SAVE"


@dataclass
class CheckpointEvent:
    """
    Represents a checkpoint event for debugging and visibility.

    Checkpoint events are logged during job execution to track checkpoint
    decisions (skip, continue, run from scratch) and provide visibility
    into what happened during script execution.
    """

    id: str
    job_id: str
    run_group_id: str | None
    timestamp: datetime
    event_type: CheckpointEventType
    step_type: CheckpointStepType
    udf_name: str | None = None
    dataset_name: str | None = None
    checkpoint_hash: str | None = None
    hash_partial: str | None = None
    hash_input: str | None = None
    hash_output: str | None = None
    rows_input: int | None = None
    rows_processed: int | None = None
    rows_output: int | None = None
    rows_input_reused: int | None = None
    rows_output_reused: int | None = None
    rerun_from_job_id: str | None = None
    details: dict | None = None

    @classmethod
    def parse(  # noqa: PLR0913
        cls,
        id: str | uuid.UUID,
        job_id: str,
        run_group_id: str | None,
        timestamp: datetime,
        event_type: str,
        step_type: str,
        udf_name: str | None,
        dataset_name: str | None,
        checkpoint_hash: str | None,
        hash_partial: str | None,
        hash_input: str | None,
        hash_output: str | None,
        rows_input: int | None,
        rows_processed: int | None,
        rows_output: int | None,
        rows_input_reused: int | None,
        rows_output_reused: int | None,
        rerun_from_job_id: str | None,
        details: dict | None,
    ) -> "CheckpointEvent":
        return cls(
            id=str(id),
            job_id=job_id,
            run_group_id=run_group_id,
            timestamp=timestamp,
            event_type=CheckpointEventType(event_type),
            step_type=CheckpointStepType(step_type),
            udf_name=udf_name,
            dataset_name=dataset_name,
            checkpoint_hash=checkpoint_hash,
            hash_partial=hash_partial,
            hash_input=hash_input,
            hash_output=hash_output,
            rows_input=rows_input,
            rows_processed=rows_processed,
            rows_output=rows_output,
            rows_input_reused=rows_input_reused,
            rows_output_reused=rows_output_reused,
            rerun_from_job_id=rerun_from_job_id,
            details=details,
        )
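
A hedged sketch of constructing an event via `parse` (every field value below is illustrative, not taken from a real run):

```python
import uuid
from datetime import datetime, timezone

event = CheckpointEvent.parse(
    id=uuid.uuid4(),
    job_id="job-2",
    run_group_id=None,
    timestamp=datetime.now(timezone.utc),
    event_type="UDF_SKIPPED",
    step_type="UDF_MAP",
    udf_name="process_image",
    dataset_name=None,
    checkpoint_hash="abc123",
    hash_partial=None,
    hash_input=None,
    hash_output=None,
    rows_input=1000,
    rows_processed=None,
    rows_output=None,
    rows_input_reused=1000,
    rows_output_reused=1000,
    rerun_from_job_id="job-1",
    details=None,
)
assert event.event_type is CheckpointEventType.UDF_SKIPPED
```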