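Some cleaning: remove unused polars imports, rename `ts_chunks` to `ts_pth`, make the unmaintained preprocessing methods raise, plus minor dtype and formatting tweaks.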

USM-CHU-FGuyon · USM-CHU-FGuyon · commit c40f4541c587 · 2024-06-26T15:46:40.000+04:00
diff --git a/2_harmonize_amsterdam.py b/2_harmonize_amsterdam.py
@@ -7,10 +7,9 @@
 """
 from amsterdam_preprocessing.timeseries import amsterdamTSP
 from amsterdam_preprocessing.flat_and_labels import Ams_FLProcessor
-import polars as pl
 
 tsp = amsterdamTSP(
-    ts_chunks='numericitems.parquet',
+    ts_pth='numericitems.parquet',
     listitems_pth='listitems.parquet',
     gcs_scores_pth='glasgow_coma_scores.parquet')
 
diff --git a/2_harmonize_eicu.py b/2_harmonize_eicu.py
@@ -7,7 +7,7 @@
 """
 from eicu_preprocessing.flat_and_labels import eicu_FLProcessor
 from eicu_preprocessing.timeseries import eicuTSP
-import polars as pl
+
 tsp = eicuTSP(
     lab_pth='lab.parquet',
     resp_pth='tsresp.parquet',
diff --git a/2_harmonize_hirid.py b/2_harmonize_hirid.py
@@ -7,7 +7,7 @@
 """
 from hirid_preprocessing.flat_and_labels import Hir_FLProcessing
 from hirid_preprocessing.timeseries import hiridTSP
-import polars as pl
+
 tsp = hiridTSP(ts='timeseries.parquet',
                 pharma='medication.parquet')
 
diff --git a/amsterdam_preprocessing/timeseries.py b/amsterdam_preprocessing/timeseries.py
@@ -8,9 +8,9 @@ class amsterdamTSP(TimeseriesProcessor):
     * 1 wide table: the gcs_score tables that was computed in 1_amsterdam.py
     * 1 medication table that was computed in 1_amsterdam.py
     """
-    def __init__(self, ts_chunks, listitems_pth, gcs_scores_pth):
+    def __init__(self, ts_pth, listitems_pth, gcs_scores_pth):
         super().__init__(dataset='amsterdam')
-        self.lf_ts = self.scan(self.savepath+ts_chunks)
+        self.lf_ts = self.scan(self.savepath+ts_pth)
         self.lf_listitems = self.scan(self.savepath+listitems_pth)
         self.lf_medication = self.scan(self.med_savepath)
 
@@ -56,6 +56,11 @@ def run_harmonization(self):
         self.medication_to_long(lf_med)
     
     def run_for_preprocessed(self, reset_dir=None):
+        raise UserWarning("This function is not maintained. It should be replaced "
+                          "by a cleaner/faster alternative in the future.\n"
+                          "Contributions welcome.")  # unconditional: code below never runs
+        
+        
         self.reset_dir(reset_dir)
 
         lf_ts = self.harmonize_columns(self.lf_ts, **self.colnames_ts)
diff --git a/database_processing/timeseriesprocessor.py b/database_processing/timeseriesprocessor.py
@@ -224,13 +224,19 @@ def timeseries_to_long(self,
                            lf_long=None,
                            lf_wide=None,
                            sink=True):
-        cols_index = {self.idx_col: pl.Int64, self.time_col: pl.Int64}
+        cols_index = {self.idx_col: pl.Int64, self.time_col: pl.Duration}
         if lf_wide is None:
             lf_wide = pl.LazyFrame(schema=cols_index|{'dummy': pl.Float32})
         if lf_long is None:
-            lf_long = pl.LazyFrame(schema=cols_index | {'variable':pl.String, 'value': pl.Float32})
-
-        lf_wide_melted = lf_wide.melt(['patient', 'time']).with_columns(pl.col('value').cast(pl.Float32, strict=False))
+            lf_long = pl.LazyFrame(schema=cols_index | {'variable':pl.String,
+                                                        'value': pl.Float32})
+
+        lf_wide_melted = (lf_wide
+                          .melt(['patient', 'time'])
+                          .with_columns(
+                              pl.col('value').cast(pl.Float32, strict=False)
+                              )
+                          )
         
         lf = (pl.concat([df.select(sorted(df.columns)) for df in [lf_wide_melted, lf_long]], how='vertical_relaxed')
               .with_columns(
diff --git a/eicu_preprocessing/timeseries.py b/eicu_preprocessing/timeseries.py
@@ -102,11 +102,13 @@ def run_harmonization(self):
             lf_tsinout,
             ])
 
+        print('Collecting tsperiodic and tsaperiodic', end='', flush=True)
         #not collecting here causes errors in the following methods. polars bug ?
         lf_ts_hor = (pl.concat([lf_tsperiodic, lf_tsaperiodic],
                               how='diagonal',
                               rechunk=True)
                      .collect(streaming=True))
+        print('  -> Done')
         
         lf_ts_ver = self.filter_tables(lf_ts_ver,
                                     kept_variables=self.kept_ts)
@@ -116,7 +118,9 @@ def run_harmonization(self):
         
     
     def run_preprocessing(self, reset_dir=None):
-
+        raise UserWarning("This function is not maintained. It should be replaced "
+                          "by a cleaner/faster alternative in the future.\n"
+                          "Contributions welcome.")  # unconditional: code below never runs
         self.reset_dir(reset_dir)
 
         
diff --git a/hirid_preprocessing/timeseries.py b/hirid_preprocessing/timeseries.py
@@ -45,7 +45,9 @@ def run_harmonization(self):
         
         
     def run_preprocessing(self, reset_dir=None):
-
+        raise UserWarning("This function is not maintained. It should be replaced "
+                          "by a cleaner/faster alternative in the future.\n"
+                          "Contributions welcome.")  # unconditional: code below never runs
         self.reset_dir(reset_dir)
 
         kept_variables = (self.kept_ts+['Body weight', 'Body height measure'])
diff --git a/mimic3_preprocessing/mimic3preparator.py b/mimic3_preprocessing/mimic3preparator.py
@@ -384,7 +384,7 @@ def gen_timeseries(self):
               .drop_nulls()
               .with_columns(
                   pl.col('CHARTTIME').str.to_datetime("%Y-%m-%d %H:%M:%S"),
-                  pl.col('ICUSTAY_ID').cast(pl.Int64)
+                  pl.col('ICUSTAY_ID').cast(pl.Int32)
                   )
               .pipe(self.pl_prepare_tstable,
                     col_measuretime='CHARTTIME',
@@ -394,6 +394,7 @@ def gen_timeseries(self):
                     unit_los='day')
               .join(ditems.select('ITEMID', 'LABEL'), on='ITEMID')
               .drop('ITEMID')
-              .collect(streaming=True))
+              #.collect(streaming=True)
+              )
 
         self.save(ts, self.ts_savepath)
diff --git a/mimic3_preprocessing/timeseries.py b/mimic3_preprocessing/timeseries.py
@@ -82,13 +82,14 @@ def run_harmonization(self):
         lf_ts = self.filter_tables(lf_timeser,
                                    kept_variables=self.kept_ts)
         
-    
         self.timeseries_to_long(lf_ts)
         self.medication_to_long(lf_med)
         
     
     def run_preprocessing(self):
-
+        raise UserWarning("This function is not maintained. It should be replaced "
+                          "by a cleaner/faster alternative in the future.\n"
+                          "Contributions welcome.")  # unconditional: code below never runs
         
         lf_medication = self.harmonize_columns(self.lf_medication,
                                                     **self.colnames_med)
diff --git a/mimic4_preprocessing/timeseries.py b/mimic4_preprocessing/timeseries.py
@@ -88,6 +88,9 @@ def run_harmonization(self):
         self.medication_to_long(lf_med)
 
     def run_preprocessing(self, reset_dir=None):
+        raise UserWarning("This function is not maintained. It should be replaced "
+                          "by a cleaner/faster alternative in the future.\n"
+                          "Contributions welcome.")  # unconditional: code below never runs
         self.reset_dir(reset_dir)
         
         lf_outputevents = self.harmonize_columns(self.lf_outputevents,