Merge pull request #32 from ivadomed/jv/27-analyze_lesions_using_t2w_sc_seg

valosekj · web-flow · commit c5e5027e25f1 · 2023-07-04T11:41:00.000-04:00
`scripts-t2w_csa/create_T2w_csa_figure.py` script improvements
diff --git a/scripts-t2w_csa/create_T2w_csa_figure.py b/scripts-t2w_csa/create_T2w_csa_figure.py
@@ -80,31 +80,42 @@
     "tor": "Siemens",
 }
 
+variable_to_label = {
+    'MEAN(area)': 'CSA [$mm^2$]',
+    'edss_M0': 'EDSS',
+    'lesion_volume': 'Lesion volume [$mm^3$]',
+}
+
 
 def get_parser():
     parser = argparse.ArgumentParser(
         description="Generate figure for T2w C2-C3 CSA. The figure is saved to the same folder as the input .csv file."
     )
     parser.add_argument(
-        '-i-canproco',
+        '-csa-canproco',
         required=True,
         metavar='<file_path>',
-        help="input .csv file with canproco CSA values")
+        help="Path to the input .csv file with canproco CSA values.")
     parser.add_argument(
-        '-i-spinegeneric',
+        '-csa-spinegeneric',
         required=True,
         metavar='<file_path>',
-        help="input .csv file with spine-generic CSA values")
+        help="Path to the input .csv file with spine-generic CSA values.")
     parser.add_argument(
-        '-participants-file-canproco',
+        '-participants-canproco',
         required=True,
         metavar='<file_path>',
-        help="canproco participants.tsv file (includes pathology and phenotype columns)")
+        help="Path to the canproco participants.tsv file (includes pathology and phenotype columns).")
     parser.add_argument(
-        '-participants-file-spinegeneric',
+        '-participants-spinegeneric',
         required=True,
         metavar='<file_path>',
-        help="spine-generic participants.tsv file (includes vendor column)")
+        help="Path to the spine-generic participants.tsv file (includes vendor column).")
+    parser.add_argument(
+        '-lesion-folder',
+        required=False,
+        metavar='<file_path>',
+        help="Path to the folder with .xls files with lesion volumes generated by sct_analyze_lesion.")
 
     return parser
 
@@ -305,10 +316,7 @@ def compute_partial_correlation(canproco_pd, site):
     :return:
     """
     # Work only with MS patients
-    if site == 'all':
-        ms_pd = canproco_pd[canproco_pd['pathology'] == 'MS']
-    else:
-        ms_pd = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]
+    ms_pd = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]
     # Convert str to int (to be compatible with partial correlation)
     ms_pd = ms_pd.replace({'phenotype': {'RRMS': 0, 'PPMS': 1, 'RIS': 2}})
     stats = pg.partial_corr(data=ms_pd, x='MEAN(area)', y='edss_M0', covar='phenotype', method='spearman')
@@ -356,33 +364,32 @@ def compute_regression(x, y):
     return x_vals, y_vals
 
 
-def create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig):
+def create_correlation_figures_persite(canproco_pd, pair, fname_fig):
     """
-    Plot the relationship between EDSS score and CSA per-site and per-phenotype. Also, plot linear fit per-phenotype and
+    Plot the relationship between pairs of variables per-site and per-phenotype. Also, plot linear fit per-phenotype and
     for the whole cohort.
-    :param canproco_pd:
-    :param fname_fig:
+    :param canproco_pd: pandas dataframe: canproco data
+    :param pair: tuple: pair of variables for which correlation will be computed, for example ('MEAN(area)', 'edss_M0')
+    :param fname_fig: str: figure name
     :return:
     """
+    # Drop rows with NaN values for the pair of variables
+    canproco_pd = canproco_pd.dropna(subset=list(pair))
     # Create main figure
-    fig, axes = plt.subplots(2, 3, figsize=(20, 14), sharey=True)
+    fig, axes = plt.subplots(2, 3, figsize=(20, 14), sharex=True, sharey=True)
     # Flatten 2D array into 1D to allow iteration by loop
     ax = axes.ravel()
     # Loop across sites (all means all sites together)
     for index, site in enumerate(site_to_vendor_title.keys()):
         # Compute partial correlation (with phenotype as a covariate)
         r, p_val = compute_partial_correlation(canproco_pd, site)
-        print(f'{site}: Partial correlation EDSS vs CSA: r={r}, p-value{format_pvalue(p_val, alpha=0.05)}')
+        print(f'{site}: Partial correlation {pair[0]} vs {pair[1]}: r={r}, p-value{format_pvalue(p_val, alpha=0.05)}')
         # Compute linear regression for all MS patients together (i.e., across all phenotypes) --> ['pathology'] == 'MS'
-        if site == 'all':
-            csa = canproco_pd[canproco_pd['pathology'] == 'MS']['MEAN(area)']
-            edss = canproco_pd[canproco_pd['pathology'] == 'MS']['edss_M0']
-            #phen = canproco_pd[canproco_pd['pathology'] == 'MS']['phenotype']
-        else:
-            csa = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['MEAN(area)']
-            edss = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['edss_M0']
-            #phen = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['phenotype']
-        x_vals, y_vals = compute_regression(csa, edss)
+        var1 = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)][pair[0]]
+        var2 = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)][pair[1]]
+        #phen = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['phenotype']
+        x_vals, y_vals = compute_regression(var1, var2)
+        # TODO - Consider replacing by sns.regplot
         ax[index].plot(x_vals, y_vals, '--', color='black', alpha=.5, linewidth=3)
 
         # Insert text with corr coef and pval into every subplot/axis
@@ -392,25 +399,20 @@ def create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig):
 
         for color, phenotype in enumerate(['RRMS', 'PPMS', 'RIS']):
             # Prepare variables for plotting
-            if site == 'all':
-                csa = canproco_pd[canproco_pd['phenotype'] == phenotype]['MEAN(area)']
-                edss = canproco_pd[canproco_pd['phenotype'] == phenotype]['edss_M0']
-                r, p_val = compute_correlation(csa, edss)
-            else:
-                csa = canproco_pd[(canproco_pd['phenotype'] == phenotype) & (canproco_pd['site'] == site)]['MEAN(area)']
-                edss = canproco_pd[(canproco_pd['phenotype'] == phenotype) & (canproco_pd['site'] == site)]['edss_M0']
-                r, p_val = compute_correlation(csa, edss)
-            print(f'{site}, {phenotype}: Correlation EDSS vs CSA: r={r}, p-value{format_pvalue(p_val, alpha=0.05)}')
+            var1 = canproco_pd[(canproco_pd['phenotype'] == phenotype) & (canproco_pd['site'] == site)][pair[0]]
+            var2 = canproco_pd[(canproco_pd['phenotype'] == phenotype) & (canproco_pd['site'] == site)][pair[1]]
+            r, p_val = compute_correlation(var1, var2)
+            print(f'{site}, {phenotype}: Correlation {pair[0]} vs {pair[1]}: r={r}, p-value{format_pvalue(p_val, alpha=0.05)}')
             # Plot individual scatter plots
-            ax[index].scatter(csa, edss, color=color_pallete[color], alpha=.8, label=phenotype, s=100)
-            x_vals, y_vals = compute_regression(csa, edss)
+            ax[index].scatter(var1, var2, color=color_pallete[color], alpha=.8, label=phenotype, s=100)
+            x_vals, y_vals = compute_regression(var1, var2)
             ax[index].plot(x_vals, y_vals, '--', color=color_pallete[color], alpha=.8, linewidth=3)
             if site == 'all':
                 ax[index].set_title(site_to_vendor_title[site], fontsize=FONTSIZE_CORR, fontweight='bold')
             else:
                 ax[index].set_title(site_to_vendor_title[site], fontsize=FONTSIZE_CORR)
             if index > 2:
-                ax[index].set_xlabel('CSA [$mm^2$]', fontsize=FONTSIZE_CORR)
+                ax[index].set_xlabel(variable_to_label[pair[0]], fontsize=FONTSIZE_CORR)
 
             # # Set fixed number of y-ticks
             # xmin, xmax = ax[index].get_xlim()
@@ -419,7 +421,7 @@ def create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig):
             # ax[index].set_xticklabels(custom_ticks)
 
             if index == 0 or index == 3:
-                ax[index].set_ylabel('EDSS', fontsize=FONTSIZE_CORR)
+                ax[index].set_ylabel(variable_to_label[pair[1]], fontsize=FONTSIZE_CORR)
             # Increase size of xticks and yticks
             plt.setp(ax[index].xaxis.get_majorticklabels(), fontsize=FONTSIZE_CORR)
             plt.setp(ax[index].yaxis.get_majorticklabels(), fontsize=FONTSIZE_CORR)
@@ -552,20 +554,63 @@ def read_participants_file(file_path):
         raise FileNotFoundError(f'{file_path} not found')
 
 
+def read_lesion_files(lesion_folder):
+    """
+    Read xls files containing lesion volume generated by sct_analyze_lesion and aggregate them into one pandas DF
+    """
+    list_of_files = os.listdir(lesion_folder)
+    # Ignore .DS_Store
+    if '.DS_Store' in list_of_files:
+        list_of_files.remove('.DS_Store')
+    list_of_files.sort()
+
+    # Initialize pandas dataFrame where lesion volume across all subjects will be stored
+    lesion_df = pd.DataFrame(columns=['subject_id', 'lesion_volume', 'number_of_lesions'])
+
+    # Loop across subjects
+    for file in list_of_files:
+        # Get subject ID
+        subject_id = file.split('_')[0]
+        lesion_dict = {'subject_id': [], 'lesion_volume': [], 'number_of_lesions': []}
+        # Construct path to xls file
+        file_path = os.path.join(lesion_folder, file)
+        # Read xls file as pandas dataFrame
+        # run 'pip install xlrd' if you get an error
+        df = pd.read_excel(file_path)
+        lesion_dict['subject_id'] = subject_id
+        # Sum lesion volume across all lesions
+        lesion_dict['lesion_volume'] = df['volume [mm3]'].sum()
+        # Get number of lesions (number of rows in the dataFrame)
+        lesion_dict['number_of_lesions'] = df.shape[0]
+        # Insert lesion_dict into lesion_df as a new row
+        lesion_df.loc[subject_id] = lesion_dict
+
+    return lesion_df
+
+
 def main():
     parser = get_parser()
     args = parser.parse_args()
 
+    # ------------------------------------------------------
+    # Read input files as pandas DataFrames
+    # ------------------------------------------------------
     # Read .csv file for canproco subjects
-    canproco_pd = read_csv_file(args.i_canproco, subjects_to_exclude_canproco)
+    canproco_pd = read_csv_file(args.csa_canproco, subjects_to_exclude_canproco)
     # Read canproco participants.tsv file (includes pathology and phenotype columns)
-    canproco_participants_pd = read_participants_file(args.participants_file_canproco)
+    canproco_participants_pd = read_participants_file(args.participants_canproco)
 
     # Read .csv file for spine-generic subjects
-    spinegeneric_pd = read_csv_file(args.i_spinegeneric, subjects_to_exclude_spinegeneric)
+    spinegeneric_pd = read_csv_file(args.csa_spinegeneric, subjects_to_exclude_spinegeneric)
     # Read spine-generic participants.tsv file (includes manufacturer column)
-    spinegeneric_participants_pd = read_participants_file(args.participants_file_spinegeneric)
+    spinegeneric_participants_pd = read_participants_file(args.participants_spinegeneric)
 
+    if args.lesion_folder:
+        lesion_df = read_lesion_files(args.lesion_folder)
+
+    # ------------------------------------------------------
+    # Merge and prepare DataFrames for further analysis
+    # ------------------------------------------------------
     # Merge pathology and phenotype columns to the canproco dataframe with CSA values
     canproco_pd = pd.merge(canproco_pd, canproco_participants_pd[['participant_id', 'pathology', 'phenotype', 'edss_M0']],
                            how='left', left_on='subject_id', right_on='participant_id')
@@ -576,10 +621,18 @@ def main():
     # Replace n/a in phenotype by HC to allow sorting in violinplot
     canproco_pd['phenotype'].fillna(canproco_pd['pathology'], inplace=True)
 
+    # Merge lesion_df to the canproco dataframe with CSA values
+    if args.lesion_folder:
+        canproco_pd = pd.merge(canproco_pd, lesion_df[['subject_id', 'lesion_volume', 'number_of_lesions']],
+                               how='left', left_on='subject_id', right_on='subject_id')
+
     # Merge manufacturer column to the spine-generic dataframe with CSA values
     spinegeneric_pd = pd.merge(spinegeneric_pd, spinegeneric_participants_pd[['participant_id', 'manufacturer']],
                                how='left', left_on='subject_id', right_on='participant_id')
 
+    # ------------------------------------------------------
+    # Compute descriptive statistics
+    # ------------------------------------------------------
     # Compute median, mean, std, cov persite and phenotype
     statistic = canproco_pd.groupby(['site', 'phenotype']).agg([np.median, np.mean, np.std, stats.variation])
     print(f'\nDescriptive statistics:\n{statistic}')
@@ -593,8 +646,11 @@ def main():
     temp_pd['site'] = 'all'
     canproco_pd = pd.concat([canproco_pd, temp_pd])
 
+    # ------------------------------------------------------
+    # Create plots and compute between sites statistics
+    # ------------------------------------------------------
     # Create rain plot
-    fname_fig = args.i_canproco.replace('.csv', '_rainplot.png')
+    fname_fig = args.csa_canproco.replace('.csv', '_rainplot.png')
     create_rainplot(canproco_pd, spinegeneric_pd, fname_fig)
 
     # Compute ANOVA among phenotypes
@@ -611,9 +667,16 @@ def main():
     # Compare CSA values between canproco healthy controls and spine-generic per manufacturer
     compare_healthy_controls(canproco_pd, spinegeneric_pd)
 
-    # Compute and plot correlation between EDSS and CSA persite
-    fname_fig = args.i_canproco.replace('.csv', '_correlation_persite.png')
-    create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig)
+    # Define pairs of variables for which correlation will be computed
+    variable_pairs = [('MEAN(area)', 'edss_M0'),]
+    if args.lesion_folder:
+        variable_pairs.append(('MEAN(area)', 'lesion_volume'))
+        variable_pairs.append(('edss_M0', 'lesion_volume'))
+    # Compute and plot correlation between variables persite (including also the whole cohort).
+    # For example between EDSS and CSA, or between EDSS and lesion volume
+    for pair in variable_pairs:
+        fname_fig = args.csa_canproco.replace('.csv', '_correlation_' + pair[0] + '_vs_' + pair[1] + '_persite.png')
+        create_correlation_figures_persite(canproco_pd, pair, fname_fig)
 
 
 if __name__ == "__main__":