fixed types and device in new nplets dataset

Laouen · Laouen · commit f7d1366618d7 · 2024-10-16T11:56:05.000+02:00
diff --git a/README.md b/README.md
@@ -60,28 +60,28 @@ import numpy as np
 
 X = np.random.normal(0,1, (1000, 10))
 
-# Computation of O information for the entire system
+# Computation of O information for the nplet that considers all the variables of X
 measures = nplets_measures(X)
 
-# Computation of O info for the sub-system composed by 0, 1 and 3
-measures = nplets_measures(X, [0,1,3])
+# Computation of O info for a single nplet (it must be a list of nplets even if it is a single nplet)
+measures = nplets_measures(X, [[0,1,3]])
 
-# Computation of O info for the sub-system composed by 0, 1 and 3
+# Computation of O info for multiple nplets
 measures = nplets_measures(X, [[0,1,3],[3,7,4],[2,6,3]])
 
-# Extensive computation of O information measures over all combinations of X
+# Extensive computation of O information measures over all combinations of features in X
 measures = multi_order_measures(X)
 
-# compute the best 10 combinations using greedy, starting by exaustive search in 
+# Compute the best 10 combinations of features (nplet) using greedy, starting by exhaustive search in 
 # lower order and building from there. Result shows best O information for 
 # each built optimal orders
-best_partitions, best_scores = greedy(X, 3, 5, repeat=10)
+best_nplets, best_scores = greedy(X, 3, 5, repeat=10)
 
-# compute the best 10 combinations using simulated annealing: There are two initialization options
-# 1. Starting by exaustive search in lower order, then building with gready.
-# 2. Selection random sample of initial solutions.
+# Compute the best 10 combinations of features (nplet) using simulated annealing: There are two initialization options
+# 1. Starting by a custom initial solution with shape (repeat, order) explicitly provided by the user.
+# 2. Selecting random samples from the order.
 # Result shows best O information for each built optimal orders
-best_partitions, best_scores = simulated_annealing(X, 5, repeat=10)
+best_nplets, best_scores = simulated_annealing(X, 5, repeat=10)
 ```
 
 For detailed usage and examples, please refer to the [documentation](https://github.com/Laouen/THOI).
diff --git a/tests/test_multiorder_measures.py b/tests/test_multiorder_measures.py
@@ -103,7 +103,7 @@ def test_multiorder_measures_precomputed_hot_encoded(self):
         T, N = self.X.shape
         covmat = gaussian_copula_covmat(self.X)
         
-        df_res = multi_order_measures_hot_encoded(covmat, batch_size=10000, use_cpu=True)
+        df_res = multi_order_measures_hot_encoded(covmat, batch_size=200000, use_cpu=True)
 
         dfs = []
         for order in sorted(df_res['order'].unique()):
@@ -122,7 +122,7 @@ def test_multiorder_measures_precomputed_hot_encoded(self):
                 df_desc_order = df_desc_order.sort_index()
                 df_stats_order = df_stats_order.sort_index()
 
-                self.assertTrue(np.allclose(df_desc_order.values, df_stats_order.values, atol=1e-6, equal_nan=True))
+                self.assertTrue(np.allclose(df_desc_order.values, df_stats_order.values, atol=1e-4, equal_nan=True))
 
     def test_multiple_times_same_datasets(self):
         # TODO: implement
diff --git a/thoi/commons.py b/thoi/commons.py
@@ -3,6 +3,8 @@
 import scipy as sp
 import torch
 
+from thoi.typing import TensorLikeArray
+
 
 def _get_string_metric(batched_res: np.ndarray, metric:str):
     '''
@@ -60,16 +62,15 @@ def _to_numpy(X):
         return X.detach().cpu().numpy()
     elif isinstance(X, np.ndarray):
         return X
-    else:
-        raise TypeError(f"Unsupported type: {type(X)}")
+    return np.array(X)
 
 def _get_device(use_cpu:bool=False):
     """Set the use of GPU if available"""
     using_GPU = torch.cuda.is_available() and not use_cpu
     device = torch.device('cuda' if using_GPU else 'cpu')
     return device
 
-def _normalize_input_data(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+def _normalize_input_data(X: TensorLikeArray,
                          covmat_precomputed: bool=False,
                          T: Optional[Union[int, List[int]]]=None,
                          use_cpu: bool=False):
@@ -88,30 +89,21 @@ def _normalize_input_data(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], L
 
     # Handle different options for X parameter. Accept multivariate data or covariance matrix
     if covmat_precomputed:
-        
-        if isinstance(X, (np.ndarray, torch.Tensor)):
-            assert X.shape[-2] == X.shape[-1], 'Covariance matrix should be square'
-            assert len(X.shape) in [2, 3], 'Covariance matrix should have dimensions (N, N) or (D, N, N)'
-            covmats = torch.as_tensor(X)
-            covmats = covmats.unsqueeze(0) if len(covmats.shape) == 2 else covmats
-        else:
-            assert all([len(x.shape) == 2 for x in X]), 'All covariance matrices should have dimensions (N, N)'
-            assert all([x.shape[0] == x.shape[1] == X[0].shape[0] for x in X]), 'All covariance matrices should have same dimensions (N, N)'
-            covmats = torch.stack([torch.as_tensor(x) for x in X])
+        covmats = torch.as_tensor(X)
+        covmats = covmats.unsqueeze(0) if len(covmats.shape) == 2 else covmats
+        assert X.shape[-2] == X.shape[-1], 'Covariance matrix should be square'
+        assert len(X.shape) == 3, 'Covariance matrix should have dimensions (N, N) or (D, N, N)'
     else:
         
-        if isinstance(X, (np.ndarray, torch.Tensor)):
+        try:
             X = _to_numpy(X)
             assert len(X.shape) in [2, 3], 'Covariance matrix should have dimensions (T, N) or (D, T, N)'
-            if len(X.shape) == 2:
-                X = [X]
-            else:
-                X = [X[i] for i in range(X.shape[0])]
-        else:
+            X = [X] if len(X.shape) == 2 else [X[i] for i in range(X.shape[0])]
+        except:
+            X = [_to_numpy(x) for x in X]
             assert all([len(x.shape) == 2 for x in X]), 'All multivariate series should have dimensions (T, N) where T my vary and N be constant across all series'
             assert all([x.shape[1] == X[0].shape[1] for x in X]), 'All multivariate series should have dimensions (T, N) where T my vary and N be constant across all series'
-            X = [_to_numpy(x) for x in X]
-        
+
         covmats = torch.stack([torch.from_numpy(gaussian_copula_covmat(x)) for x in X])
         T = [x.shape[0] for x in X]
 
diff --git a/thoi/heuristics/greedy.py b/thoi/heuristics/greedy.py
@@ -12,10 +12,11 @@
 
 @torch.no_grad()
 def greedy(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
-           covmat_precomputed: bool=False,
-           T: Optional[Union[int, List[int]]]=None,
            initial_order: int=3,
            order: Optional[int]=None,
+           *,
+           covmat_precomputed: bool=False,
+           T: Optional[Union[int, List[int]]]=None,
            repeat: int=10,
            use_cpu: bool=False,
            batch_size: int=1000000,
diff --git a/thoi/heuristics/simulated_annealing.py b/thoi/heuristics/simulated_annealing.py
@@ -18,10 +18,11 @@ def random_sampler(N:int, order:int, repeat:int, device:Optional[torch.device]=N
 
 @torch.no_grad()
 def simulated_annealing(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+                        order: Optional[int]=None,
+                        *,
                         covmat_precomputed: bool=False,
                         T: Optional[Union[int, List[int]]]=None,
                         initial_solution: Optional[torch.Tensor] = None,
-                        order: Optional[int]=None,
                         repeat: int = 10,
                         use_cpu: bool = False,
                         max_iterations: int = 1000,
diff --git a/thoi/heuristics/simulated_annealing_multi_order.py b/thoi/heuristics/simulated_annealing_multi_order.py
@@ -28,6 +28,7 @@ def hot_encode_to_indexes(nplets):
 
 @torch.no_grad()
 def simulated_annealing_multi_order(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+                                    *,
                                     covmat_precomputed: bool=False,
                                     T: Optional[Union[int, List[int]]]=None,
                                     initial_solution: Optional[torch.Tensor] = None,
diff --git a/thoi/measures/gaussian_copula.py b/thoi/measures/gaussian_copula.py
@@ -7,6 +7,7 @@
 import torch
 from torch.utils.data import DataLoader
 
+from thoi.typing import TensorLikeArray
 from thoi.commons import _normalize_input_data, _get_device
 from thoi.dataset import CovarianceDataset
 from thoi.collectors import batch_to_csv, concat_and_sort_csv
@@ -98,8 +99,8 @@ def _get_tc_dtc_from_batched_covmat(covmats: torch.Tensor, allmin1: torch.Tensor
     return nplet_tc, nplet_dtc, nplet_o, nplet_s
 
 @torch.no_grad()
-def nplets_measures(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
-                    nplets: Optional[Union[np.ndarray,torch.Tensor]] = None,
+def nplets_measures(X: Union[TensorLikeArray],
+                    nplets: Optional[TensorLikeArray] = None,
                     covmat_precomputed: bool = False,
                     T: Optional[Union[int, List[int]]] = None,
                     use_cpu: bool = False):
@@ -164,7 +165,7 @@ def nplets_measures(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[to
                         nplets_s.view(batch_size, D)], dim=-1)
 
 @torch.no_grad()
-def multi_order_measures(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+def multi_order_measures(X: TensorLikeArray,
                          covmat_precomputed: bool=False,
                          T: Optional[Union[int, List[int]]]=None,
                          min_order: int=3,
@@ -235,11 +236,14 @@ def multi_order_measures(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], Li
         # calculate measurments for each batch
         for bn, nplets in enumerate(tqdm(dataloader, total=len(dataloader), leave=False, desc='Batch')):
             curr_batch_size = nplets.shape[0]
-            
+
+            # Send nplets to the device in case it is not there
+            nplets = nplets.to(device)
+
             # Create the covariance matrices for each nplet in the batch
             # |curr_batch_size| x |D| x |N| x |N|
             nplets_covmats = _generate_nplets_covmants(covmats, nplets)
-            
+
             # Pack covmats in a single batch
             # |curr_batch_size x D| x |N| x |N|
             nplets_covmats = nplets_covmats.view(curr_batch_size*D, order, order)
diff --git a/thoi/measures/gaussian_copula_hot_encoded.py b/thoi/measures/gaussian_copula_hot_encoded.py
@@ -7,6 +7,7 @@
 import torch
 from torch.utils.data import DataLoader
 
+from thoi.typing import TensorLikeArray
 from thoi.dataset import HotEncodedMultiOrderDataset
 from thoi.collectors import batch_to_csv, concat_and_sort_csv
 from thoi.measures.utils import _all_min_1_ids, _gaussian_entropy_bias_correction, _gaussian_entropy_estimation, _get_single_exclusion_covmats
@@ -177,8 +178,8 @@ def _compute_nplets_measures_hot_encoded(covmats: torch.Tensor,
     )
 
 @torch.no_grad()
-def nplets_measures_hot_encoded(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
-                                nplets: Optional[Union[np.ndarray,torch.Tensor]] = None,
+def nplets_measures_hot_encoded(X: TensorLikeArray,
+                                nplets: Optional[TensorLikeArray] = None,
                                 covmat_precomputed: bool = False,
                                 T: Optional[int] = None,
                                 use_cpu: bool = False):
@@ -204,7 +205,7 @@ def nplets_measures_hot_encoded(X: Union[np.ndarray, torch.Tensor, List[np.ndarr
 
 
 @torch.no_grad()
-def multi_order_measures_hot_encoded(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+def multi_order_measures_hot_encoded(X: TensorLikeArray,
                                      covmat_precomputed: bool=False,
                                      T: Optional[int]=None,
                                      min_order: int=3,
diff --git a/thoi/measures/utils.py b/thoi/measures/utils.py
@@ -5,7 +5,7 @@
 from thoi.measures.constants import TWOPIE
 
 
-def _all_min_1_ids(N, device=torch.device('cpu')):
+def _all_min_1_ids(N: torch.device, device: torch.device=torch.device('cpu')):
     base_tensor = torch.arange(N, device=device).unsqueeze(0).repeat(N, 1)  # Shape: (N, N)
     mask = base_tensor != torch.arange(N, device=device).unsqueeze(1)  # Shape: (N, N)
     result = base_tensor[mask].view(N, N - 1)  # Shape: (N, N-1)
@@ -28,20 +28,21 @@ def _get_single_exclusion_covmats(covmats: torch.Tensor, allmin1: torch.Tensor):
     batch_size, N, _ = covmats.shape
 
     # Step 1: Expand allmin1 to match the batch size
-    # Shape: (batch_size, N, N-1)
+    # |batch_size| |N| |N-1|
     allmin1_expanded = allmin1.unsqueeze(0).expand(batch_size, -1, -1)
 
     # Step 2: Expand covmats to include the N dimension for variable exclusion
-    # Shape: (batch_size, N, N, N)
+    # |batch_size| |N| |N| |N|
     covmats_expanded = covmats.unsqueeze(1).expand(-1, N, -1, -1)
 
     # Step 3: Gather the rows corresponding to the indices in allmin1
-    # Shape of indices_row: (batch_size, N, N-1, N)
+    # |batch_size| |N| |N-1| |N|
     indices_row = allmin1_expanded.unsqueeze(-1).expand(-1, -1, -1, N)
     gathered_rows = torch.gather(covmats_expanded, 2, indices_row)
 
     # Step 4: Gather the columns corresponding to the indices in allmin1
     # Shape of indices_col: (batch_size, N, N-1, N-1)
+    # |batch_size| |N| |N-1| |N-1|
     indices_col = allmin1_expanded.unsqueeze(-2).expand(-1, -1, N-1, -1)
     covmats_sub = torch.gather(gathered_rows, 3, indices_col)
 
diff --git a/thoi/typing.py b/thoi/typing.py
@@ -0,0 +1,9 @@
+from typing import Union, Sequence, Any
+import torch
+import numpy as np
+
+TensorLikeArray = Union[
+    torch.Tensor,
+    np.ndarray,
+    Sequence[Union[np.ndarray, Sequence[Any]]],
+]