Commit 16ad17c

implemented batch_size on nplet_measures/_hot_encoded (#18)
* implemented batch_size on nplet_measures/_hot_encoded
* implemented batch_size in greedy
* changed batch_size parameter order
* added batch_size to simulated annealing
* implemented batch size in simulated annealing
1 parent ca4e180 commit 16ad17c

7 files changed: +182 −69 lines
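For context, this is how the new parameter is meant to be used from the caller's side. A minimal sketch, assuming nplets_measures is importable from thoi.measures.gaussian_copula as laid out in this commit; the random data and tolerance are illustrative, not from the repo:

import torch
from itertools import combinations
from thoi.measures.gaussian_copula import nplets_measures

# illustrative data: T=200 samples of N=8 variables
X = torch.randn(200, 8)
# all C(8,3) = 56 triplets
nplets = torch.tensor(list(combinations(range(8), 3)))

res_single = nplets_measures(X, nplets)                  # one batch (default batch_size=1000000)
res_chunked = nplets_measures(X, nplets, batch_size=16)  # same values, lower peak memory
assert torch.allclose(res_single, res_chunked, atol=1e-12)

batch_size only bounds how many n-plets are materialized at once; the returned tensor of (tc, dtc, o, s) values is identical, which is exactly what the new tests below assert.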

tests/test_nplet_measures.py

+46
@@ -93,6 +93,28 @@ def test_multiple_times_same_datasets_precomputed(self):
         nplets = torch.tensor(list(combinations(full_nplet, order)))
         res = nplets_measures([self.covmat, self.covmat], nplets, covmat_precomputed=True, T=self.X.shape[0])
         self._validate_same_results_for_repeated_datasets(res, nplets, rtol=1e-16, atol=1e-7)
+
+    def test_batch_size_does_not_change_result(self):
+        full_nplet = range(self.X.shape[1])
+
+        nplets = torch.tensor([list(c) for i, c in enumerate(combinations(full_nplet, 3)) if i < 100000])
+
+        # test for different batch sizes
+        res = nplets_measures(self.X, nplets)
+        res2 = nplets_measures(self.X, nplets, batch_size=10)
+        res3 = nplets_measures(self.X, nplets, batch_size=100)
+        res4 = nplets_measures(self.X, nplets, batch_size=1000)
+        res5 = nplets_measures(self.X, nplets, batch_size=10000)
+        res6 = nplets_measures(self.X, nplets, batch_size=100000)
+        res7 = nplets_measures(self.X, nplets, batch_size=1000000) # this should do a single batch
+
+        # check that the results are the same
+        self.assertTrue(torch.allclose(res, res2, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res3, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res4, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res5, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res6, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res7, rtol=1e-16, atol=1e-12))
 
     def test_nplets_measures_timeseries_hot_encoded(self):
         N = self.X.shape[1]
@@ -142,6 +164,30 @@ def test_multiple_times_same_dataset_precomputed_hot_encoded(self):
         res = nplets_measures_hot_encoded([self.covmat, self.covmat], nplets_hot_encoded, covmat_precomputed=True, T=self.X.shape[0])
         self._validate_same_results_for_repeated_datasets(res, nplets, rtol=1e-8, atol=1e-4)
 
+    def test_batch_size_does_not_change_result_hot_encoded(self):
+        full_nplet = range(self.X.shape[1])
+
+        nplets = torch.tensor([list(c) for i, c in enumerate(combinations(full_nplet, 3)) if i < 100000])
+        nplets_hot_encoded = torch.zeros((nplets.shape[0], self.X.shape[1]), dtype=torch.int)
+        nplets_hot_encoded[torch.arange(0,nplets.shape[0], dtype=int).view(-1,1), nplets] = 1
+
+        # test for different batch sizes
+        res = nplets_measures_hot_encoded(self.X, nplets_hot_encoded)
+        res2 = nplets_measures_hot_encoded(self.X, nplets_hot_encoded, batch_size=10)
+        res3 = nplets_measures_hot_encoded(self.X, nplets_hot_encoded, batch_size=100)
+        res4 = nplets_measures_hot_encoded(self.X, nplets_hot_encoded, batch_size=1000)
+        res5 = nplets_measures_hot_encoded(self.X, nplets_hot_encoded, batch_size=10000)
+        res6 = nplets_measures_hot_encoded(self.X, nplets_hot_encoded, batch_size=100000)
+        res7 = nplets_measures_hot_encoded(self.X, nplets_hot_encoded, batch_size=1000000) # this should do a single batch
+
+        # check that the results are the same
+        self.assertTrue(torch.allclose(res, res2, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res3, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res4, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res5, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res6, rtol=1e-16, atol=1e-12))
+        self.assertTrue(torch.allclose(res, res7, rtol=1e-16, atol=1e-12))
 
+
 if __name__ == '__main__':
     unittest.main()

thoi/heuristics/greedy.py

+48 −31
@@ -1,7 +1,6 @@
 from typing import Union, Callable, List, Optional
 from tqdm import trange
 
-import numpy as np
 import torch
 from functools import partial
 

@@ -20,8 +19,8 @@ def greedy(X: TensorLikeArray,
            covmat_precomputed: bool=False,
            T: Optional[Union[int, List[int]]]=None,
            repeat: int=10,
-           device: torch.device=torch.device('cpu'),
            batch_size: int=1000000,
+           device: torch.device=torch.device('cpu'),
            metric: Union[str,Callable]='o',
            largest: bool=False):
 
@@ -74,6 +73,7 @@ def greedy(X: TensorLikeArray,
         best_candidate, best_score = _next_order_greedy(covmats, T, current_solution,
                                                         metric=metric,
                                                         largest=largest,
+                                                        batch_size=batch_size,
                                                         device=device)
         best_scores.append(best_score)
 
@@ -115,6 +115,7 @@ def _next_order_greedy(covmats: torch.Tensor,
                        initial_solution: torch.Tensor,
                        metric: Union[str,Callable],
                        largest: bool,
+                       batch_size: int=1000000,
                        device: torch.device=torch.device('cpu')):
 
     '''
@@ -126,6 +127,7 @@ def _next_order_greedy(covmats: torch.Tensor,
     - initial_solution (torch.Tensor): The initial solution with shape (batch_size, order)
     - metric (Union[str,Callable]): The metric to evaluate. One of tc, dtc, o, s or a callable function
     - largest (bool): A flag to indicate if the metric is to be maximized or minimized
+    - batch_size (int): The batch size to use for the computation. Default is 1000000.
     - device (torch.device): The device to use for the computation. Default is 'cpu'
 
     Returns:
@@ -135,36 +137,51 @@ def _next_order_greedy(covmats: torch.Tensor,
 
     # Get parameters attributes
     N = covmats.shape[1]
-    batch_size, order = initial_solution.shape
+    total_size, order = initial_solution.shape
 
     # Initial valid candidates to iterate one by one
-    # |batch_size| x |N-order|
+    # |total_size| x |N-order|
     valid_candidates = _get_valid_candidates(initial_solution, N, device)
-
-    # |batch_size| x |N-order| x |order+1|
-    all_solutions = _create_all_solutions(initial_solution, valid_candidates)
-
-    # |batch_size x N-order| x |order+1|
-    all_solutions = all_solutions.view(batch_size*(N-order), order+1)
-
-    # |batch_size x N-order|
-    best_score = _evaluate_nplets(covmats, T, all_solutions, metric, device=device)
-
-    # |batch_size| x |N-order|
-    best_score = best_score.view(batch_size, N-order)
-
-    if not largest:
-        best_score = -best_score
-
-    # get for each batch item the best score over the second dimention
-
-    # |batch_size|
-    max_idxs = torch.argmax(best_score, dim=1)
-    best_candidates = valid_candidates[torch.arange(batch_size), max_idxs]
-    best_score = best_score[torch.arange(batch_size), max_idxs]
-
-    # If minimizing, then return score to its original sign
-    if not largest:
-        best_score = -best_score
 
-    return best_candidates, best_score
+    best_candidates = []
+    best_scores = []
+
+    for start in range(0, total_size, batch_size):
+        end = min(start + batch_size, total_size)
+        batch_initial_solution = initial_solution[start:end]
+        batch_valid_candidates = valid_candidates[start:end]
+
+        # |batch_size| x |N-order| x |order+1|
+        all_solutions = _create_all_solutions(batch_initial_solution, batch_valid_candidates)
+
+        # |batch_size x N-order| x |order+1|
+        all_solutions = all_solutions.view(-1, order+1)
+
+        # |batch_size x N-order|
+        batch_best_score = _evaluate_nplets(covmats, T,
+                                            all_solutions,
+                                            metric,
+                                            batch_size=batch_size,
+                                            device=device)
+
+        # |batch_size| x |N-order|
+        batch_best_score = batch_best_score.view(end - start, N - order)
+
+        if not largest:
+            batch_best_score = -batch_best_score
+
+        # get for each batch item the best score over the second dimension
+
+        # |batch_size|
+        max_idxs = torch.argmax(batch_best_score, dim=1)
+        batch_best_candidates = batch_valid_candidates[torch.arange(end - start), max_idxs]
+        batch_best_score = batch_best_score[torch.arange(end - start), max_idxs]
+
+        # If minimizing, then return score to its original sign
+        if not largest:
+            batch_best_score = -batch_best_score
+
+        best_candidates.append(batch_best_candidates)
+        best_scores.append(batch_best_score)
+
+    return torch.cat(best_candidates), torch.cat(best_scores)
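The rewritten loop above follows a chunk-evaluate-concatenate pattern: slice the candidate solutions, score each slice, pick the per-row argmax, and stitch the picks back together. A self-contained sketch of that pattern, detached from the THOI internals (scores_fn and the shapes are placeholders, not library API):

import torch
from typing import Callable

def chunked_best(scores_fn: Callable[[torch.Tensor], torch.Tensor],
                 items: torch.Tensor,
                 batch_size: int,
                 largest: bool) -> torch.Tensor:
    """Pick the best candidate column per row, scoring rows in chunks."""
    picks = []
    for start in range(0, len(items), batch_size):
        # |<=batch_size| x |candidates|
        chunk_scores = scores_fn(items[start:start + batch_size])
        if not largest:  # negate so a single argmax path also handles minimization
            chunk_scores = -chunk_scores
        picks.append(torch.argmax(chunk_scores, dim=1))
    # |len(items)|
    return torch.cat(picks)

The sign flip mirrors the paired "if not largest" blocks in the diff: scores are negated before the argmax and negated back before being returned, so one code path serves both maximization and minimization.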

thoi/heuristics/scoring.py

+6 −1
@@ -12,12 +12,14 @@ def _evaluate_nplets(covmats: torch.Tensor,
                      T: Optional[List[int]],
                      batched_nplets: torch.Tensor,
                      metric: Union[str, Callable],
+                     batch_size: int,
                      device: torch.device):
     """
     - covmats (torch.Tensor): The covariance matrix or matrixes with shape (N, N) or (D, N, N)
     - T (Optional[List[int]]): The number of samples for each multivariate series or None
-    - batched_nplets (torch.Tensor): The nplets to calculate the inverse of the oinformation with shape (batch_size, order)
+    - batched_nplets (torch.Tensor): The nplets to calculate the inverse of the oinformation with shape (total_size, order)
     - metric (str): The metric to evaluate. One of tc, dtc, o, s or Callable
+    - batch_size (int): The batch size to use for the computation
     - device (torch.device): The device to use
     """
 
@@ -31,6 +33,7 @@ def _evaluate_nplets(covmats: torch.Tensor,
                                nplets=batched_nplets,
                                T=T,
                                covmat_precomputed=True,
+                               batch_size=batch_size,
                                device=device)
 
     # |batch_size|
@@ -41,6 +44,7 @@ def _evaluate_nplet_hot_encoded(covmats: torch.Tensor,
                                 T: int,
                                 batched_nplets: torch.Tensor,
                                 metric: str,
+                                batch_size: int,
                                 device: torch.device):
 
     """
@@ -60,6 +64,7 @@ def _evaluate_nplet_hot_encoded(covmats: torch.Tensor,
                                nplets=batched_nplets,
                                T=T,
                                covmat_precomputed=True,
+                               batch_size=batch_size,
                                device=device)
 
     # |batch_size|

thoi/heuristics/simulated_annealing.py

+11 −2
@@ -26,6 +26,7 @@ def simulated_annealing(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], Lis
                         T: Optional[Union[int, List[int]]]=None,
                         initial_solution: Optional[torch.Tensor] = None,
                         repeat: int = 10,
+                        batch_size: int = 1000000,
                         device: torch.device = torch.device('cpu'),
                         max_iterations: int = 1000,
                         early_stop: int = 100,
@@ -50,7 +51,11 @@ def simulated_annealing(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], Lis
     current_solution = initial_solution.to(device).contiguous()
 
     # |batch_size|
-    current_energy = _evaluate_nplets(covmats, T, current_solution, metric, device=device)
+    current_energy = _evaluate_nplets(covmats, T,
+                                      current_solution,
+                                      metric,
+                                      batch_size=batch_size,
+                                      device=device)
 
     if not largest:
         current_energy = -current_energy
@@ -95,7 +100,11 @@ def simulated_annealing(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], Lis
 
         # Calculate energy of new solution
         # |batch_size|
-        new_energy = _evaluate_nplets(covmats, T, current_solution, metric, device=device)
+        new_energy = _evaluate_nplets(covmats, T,
+                                      current_solution,
+                                      metric,
+                                      batch_size=batch_size,
+                                      device=device)
 
         if not largest:
             new_energy = -new_energy

thoi/heuristics/simulated_annealing_multi_order.py

+11 −2
@@ -34,6 +34,7 @@ def simulated_annealing_multi_order(X: Union[np.ndarray, torch.Tensor, List[np.n
                                    T: Optional[Union[int, List[int]]]=None,
                                    initial_solution: Optional[torch.Tensor] = None,
                                    repeat: int = 10,
+                                   batch_size: int = 1000000,
                                    device: torch.device = torch.device('cpu'),
                                    max_iterations: int = 1000,
                                    early_stop: int = 100,
@@ -59,7 +60,11 @@ def simulated_annealing_multi_order(X: Union[np.ndarray, torch.Tensor, List[np.n
     current_solution = initial_solution.to(device).contiguous()
 
     # |batch_size|
-    current_energy = _evaluate_nplet_hot_encoded(covmats, T, current_solution, metric, device=device)
+    current_energy = _evaluate_nplet_hot_encoded(covmats, T,
+                                                 current_solution,
+                                                 metric,
+                                                 batch_size=batch_size,
+                                                 device=device)
 
     if not largest:
         current_energy = -current_energy
@@ -97,7 +102,11 @@ def simulated_annealing_multi_order(X: Union[np.ndarray, torch.Tensor, List[np.n
 
         # Calculate energy of new solution
         # |batch_size|
-        new_energy = _evaluate_nplet_hot_encoded(covmats, T, current_solution, metric, device=device)
+        new_energy = _evaluate_nplet_hot_encoded(covmats, T,
+                                                 current_solution,
+                                                 metric,
+                                                 batch_size=batch_size,
+                                                 device=device)
 
         if not largest:
             new_energy = -new_energy

thoi/measures/gaussian_copula.py

+41 −26
@@ -163,7 +163,8 @@ def nplets_measures(X: Union[TensorLikeArray],
                     covmat_precomputed: bool = False,
                     T: Optional[Union[int, List[int]]] = None,
                     device: torch.device = torch.device('cpu'),
-                    verbose: int = logging.INFO):
+                    verbose: int = logging.INFO,
+                    batch_size: int = 1000000):
 
     """
     Compute higher-order measures (TC, DTC, O, S) for specified n-plets in the given data matrices X.
@@ -202,6 +203,9 @@ def nplets_measures(X: Union[TensorLikeArray],
     verbose : int, optional
         Logging verbosity level. Default is `logging.INFO`.
 
+    batch_size : int, optional
+        Batch size for processing n-plets. Default is 1,000,000.
+
     Returns
     -------
     torch.Tensor
@@ -310,7 +314,8 @@ def nplets_measures(X: Union[TensorLikeArray],
 
     # nplets must be a batched tensor
     assert len(nplets.shape) == 2, 'nplets must be a batched tensor with shape (batch_size, order)'
-    batch_size, order = nplets.shape
+    batch_size = min(batch_size, len(nplets))
+    order = nplets.shape[1]
 
     # Create marginal indexes
     # |N| x |N-1|
@@ -320,30 +325,40 @@ def nplets_measures(X: Union[TensorLikeArray],
     # |batch_size x D|, |batch_size x D|, |batch_size x D|
     bc1, bcN, bcNmin1 = _get_bias_correctors(T, order, batch_size, D, device)
 
-    # Create the covariance matrices for each nplet in the batch
-    # |batch_size| x |D| x |N| x |N|
-    nplets_covmats = _generate_nplets_covmants(covmats, nplets)
-
-    # Pack covmat in a single batch
-    # |batch_size x D| x |order| x |order|
-    nplets_covmats = nplets_covmats.view(batch_size*D, order, order)
-
-    # Batch process all nplets at once
-    measures = _get_tc_dtc_from_batched_covmat(nplets_covmats,
-                                               allmin1,
-                                               bc1,
-                                               bcN,
-                                               bcNmin1)
-
-    # Unpack results
-    # |batch_size x D|, |batch_size x D|, |batch_size x D|, |batch_size x D|
-    nplets_tc, nplets_dtc, nplets_o, nplets_s = measures
-
-    # |batch_size| x |D| x |4 = (tc, dtc, o, s)|
-    return torch.stack([nplets_tc.view(batch_size, D),
-                        nplets_dtc.view(batch_size, D),
-                        nplets_o.view(batch_size, D),
-                        nplets_s.view(batch_size, D)], dim=-1)
+    # Create DataLoader for nplets
+    dataloader = DataLoader(nplets, batch_size=batch_size, shuffle=False)
+
+    results = []
+    for nplet_batch in tqdm(dataloader, desc='Processing n-plets', leave=False):
+        curr_batch_size = nplet_batch.shape[0]
+
+        # Create the covariance matrices for each nplet in the batch
+        # |curr_batch_size| x |D| x |order| x |order|
+        nplets_covmats = _generate_nplets_covmants(covmats, nplet_batch)
+
+        # Pack covmats in a single batch
+        # |curr_batch_size x D| x |order| x |order|
+        nplets_covmats = nplets_covmats.view(curr_batch_size * D, order, order)
+
+        # Batch process all nplets at once
+        measures = _get_tc_dtc_from_batched_covmat(nplets_covmats,
+                                                   allmin1,
+                                                   bc1[:curr_batch_size * D],
+                                                   bcN[:curr_batch_size * D],
+                                                   bcNmin1[:curr_batch_size * D])
+
+        # Unpack results
+        # |curr_batch_size x D|, |curr_batch_size x D|, |curr_batch_size x D|, |curr_batch_size x D|
+        nplets_tc, nplets_dtc, nplets_o, nplets_s = measures
+
+        # Collect results
+        results.append(torch.stack([nplets_tc.view(curr_batch_size, D),
+                                    nplets_dtc.view(curr_batch_size, D),
+                                    nplets_o.view(curr_batch_size, D),
+                                    nplets_s.view(curr_batch_size, D)], dim=-1))
+
+    # Concatenate all results
+    return torch.cat(results, dim=0)
 
 @torch.no_grad()
 def multi_order_measures(X: TensorLikeArray,
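A note on the DataLoader introduced above: wrapping a 2-D tensor in torch.utils.data.DataLoader with shuffle=False yields contiguous row batches in order, with a final short batch, which is why the bias correctors can simply be sliced to curr_batch_size * D per iteration. A small sketch (shapes illustrative):

import torch
from torch.utils.data import DataLoader

nplets = torch.arange(12).view(6, 2)  # 6 n-plets of order 2
for batch in DataLoader(nplets, batch_size=4, shuffle=False):
    print(batch.shape)  # torch.Size([4, 2]) then torch.Size([2, 2])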
