From df72a6651b9610254a3d04f8e098b156c1ca82a1 Mon Sep 17 00:00:00 2001
From: Tilman Krokotsch <t.krokotsch@gmail.com>
Date: Wed, 22 May 2024 15:34:49 +0200
Subject: [PATCH] fix: entity splitting bug

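Splitting entities was implemented under the assumption that entity ids
appear in ascending order: the split points were the cumulative sum of
the counts returned by `np.unique`, which are sorted by id value rather
than by order of appearance. This assumption holds for DS01 but not for
most other subdatasets. The split points are now derived from the index
of the first occurrence of each id instead.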
---
 rul_datasets/reader/ncmapss.py |  5 +++--
 tests/reader/test_ncmapss.py   | 19 +++++++++++++++++--
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/rul_datasets/reader/ncmapss.py b/rul_datasets/reader/ncmapss.py
index 696cf04..183779a 100644
--- a/rul_datasets/reader/ncmapss.py
+++ b/rul_datasets/reader/ncmapss.py
@@ -358,8 +358,9 @@ def _window_by_cycle(
 
     @staticmethod
     def _get_end_idx(identifiers):
-        _, split_idx = np.unique(identifiers, return_counts=True)
-        split_idx = np.cumsum(split_idx)
+        _, split_idx = np.unique(identifiers, return_index=True)
+        split_idx = np.sort(split_idx)
+        split_idx = np.concatenate([split_idx[1:], [len(identifiers)]])
 
         return split_idx
 
diff --git a/tests/reader/test_ncmapss.py b/tests/reader/test_ncmapss.py
index 333ffd0..1081972 100644
--- a/tests/reader/test_ncmapss.py
+++ b/tests/reader/test_ncmapss.py
@@ -103,8 +103,9 @@ def test_max_rul(max_rul, prepared_ncmapss):
 
 
 @pytest.mark.needs_data
-def test__split_by_unit(prepared_ncmapss):
-    reader = NCmapssReader(1)
+@pytest.mark.parametrize("fd", range(1, 8))
+def test__split_by_unit(fd, prepared_ncmapss):
+    reader = NCmapssReader(fd)
     features, targets, auxiliary = reader._load_raw_data()
     features, targets, auxiliary = reader._split_by_unit(features, targets, auxiliary)
 
@@ -113,6 +114,20 @@ def test__split_by_unit(prepared_ncmapss):
         assert np.unique(auxiliary[i][:, 0]).size == 1  # only one unit id present
 
 
+@pytest.mark.needs_data
+@pytest.mark.parametrize("fd", range(1, 8))
+def test__get_end_idx_for_cycles(fd, prepared_ncmapss):
+    reader = NCmapssReader(fd)
+    features, targets, auxiliary = reader._load_raw_data()
+    features, targets, auxiliary = reader._split_by_unit(features, targets, auxiliary)
+
+    for aux in auxiliary:
+        cycle_end_idx = reader._get_end_idx(aux[:, 1])
+        split_aux = np.split(aux, cycle_end_idx[:-1])
+        for cycle in split_aux:
+            assert np.unique(cycle[:, 1]).size == 1  # only one cycle id present
+
+
 @pytest.mark.needs_data
 @pytest.mark.parametrize("window_size", [10, 100])
 def test_padding_and_window_size(window_size, prepared_ncmapss):