Skip to content

Commit c5e5027

Browse files
authored
Merge pull request #32 from ivadomed/jv/27-analyze_lesions_using_t2w_sc_seg
`scripts-t2w_csa/create_T2w_csa_figure.py` script improvements
2 parents 100e68a + 7a20e63 commit c5e5027

File tree

1 file changed

+111
-48
lines changed

1 file changed

+111
-48
lines changed

scripts-t2w_csa/create_T2w_csa_figure.py

+111-48
Original file line numberDiff line numberDiff line change
@@ -80,31 +80,42 @@
8080
"tor": "Siemens",
8181
}
8282

83+
variable_to_label = {
84+
'MEAN(area)': 'CSA [$mm^2$]',
85+
'edss_M0': 'EDSS',
86+
'lesion_volume': 'Lesion volume [$mm^3$]',
87+
}
88+
8389

8490
def get_parser():
8591
parser = argparse.ArgumentParser(
8692
description="Generate figure for T2w C2-C3 CSA. The figure is saved to the same folder as the input .csv file."
8793
)
8894
parser.add_argument(
89-
'-i-canproco',
95+
'-csa-canproco',
9096
required=True,
9197
metavar='<file_path>',
92-
help="input .csv file with canproco CSA values")
98+
help="Path to the input .csv file with canproco CSA values.")
9399
parser.add_argument(
94-
'-i-spinegeneric',
100+
'-csa-spinegeneric',
95101
required=True,
96102
metavar='<file_path>',
97-
help="input .csv file with spine-generic CSA values")
103+
help="Path to the input .csv file with spine-generic CSA values.")
98104
parser.add_argument(
99-
'-participants-file-canproco',
105+
'-participants-canproco',
100106
required=True,
101107
metavar='<file_path>',
102-
help="canproco participants.tsv file (includes pathology and phenotype columns)")
108+
help="Path to the canproco participants.tsv file (includes pathology and phenotype columns).")
103109
parser.add_argument(
104-
'-participants-file-spinegeneric',
110+
'-participants-spinegeneric',
105111
required=True,
106112
metavar='<file_path>',
107-
help="spine-generic participants.tsv file (includes vendor column)")
113+
help="Path to the spine-generic participants.tsv file (includes vendor column).")
114+
parser.add_argument(
115+
'-lesion-folder',
116+
required=False,
117+
metavar='<file_path>',
118+
help="Path to the folder with .xls files with lesion volumes generated by sct_analyze_lesion.")
108119

109120
return parser
110121

@@ -305,10 +316,7 @@ def compute_partial_correlation(canproco_pd, site):
305316
:return:
306317
"""
307318
# Work only with MS patients
308-
if site == 'all':
309-
ms_pd = canproco_pd[canproco_pd['pathology'] == 'MS']
310-
else:
311-
ms_pd = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]
319+
ms_pd = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]
312320
# Convert str to int (to be compatible with partial correlation)
313321
ms_pd = ms_pd.replace({'phenotype': {'RRMS': 0, 'PPMS': 1, 'RIS': 2}})
314322
stats = pg.partial_corr(data=ms_pd, x='MEAN(area)', y='edss_M0', covar='phenotype', method='spearman')
@@ -356,33 +364,32 @@ def compute_regression(x, y):
356364
return x_vals, y_vals
357365

358366

359-
def create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig):
367+
def create_correlation_figures_persite(canproco_pd, pair, fname_fig):
360368
"""
361-
Plot the relationship between EDSS score and CSA per-site and per-phenotype. Also, plot linear fit per-phenotype and
369+
Plot the relationship between pairs of variables per-site and per-phenotype. Also, plot linear fit per-phenotype and
362370
for the whole cohort.
363-
:param canproco_pd:
364-
:param fname_fig:
371+
:param canproco_pd: pandas dataframe: canproco data
372+
:param pair: tuple: pairs of variables for which correlation will be computed, for example ('MEAN(area)', 'edss_M0')
373+
:param fname_fig: str: figure name
365374
:return:
366375
"""
376+
# Drop rows with NaN values for the pair of variables
377+
canproco_pd = canproco_pd.dropna(subset=list(pair))
367378
# Create main figure
368-
fig, axes = plt.subplots(2, 3, figsize=(20, 14), sharey=True)
379+
fig, axes = plt.subplots(2, 3, figsize=(20, 14), sharex=True, sharey=True)
369380
# Flatten 2D array into 1D to allow iteration by loop
370381
ax = axes.ravel()
371382
# Loop across sites (all means all sites together)
372383
for index, site in enumerate(site_to_vendor_title.keys()):
373384
# Compute partial correlation (with phenotype as a covariate)
374385
r, p_val = compute_partial_correlation(canproco_pd, site)
375-
print(f'{site}: Partial correlation EDSS vs CSA: r={r}, p-value{format_pvalue(p_val, alpha=0.05)}')
386+
print(f'{site}: Partial correlation {pair[0]} vs {pair[1]}: r={r}, p-value{format_pvalue(p_val, alpha=0.05)}')
376387
# Compute linear regression for all MS patients together (i.e., across all phenotypes) --> ['pathology'] == 'MS'
377-
if site == 'all':
378-
csa = canproco_pd[canproco_pd['pathology'] == 'MS']['MEAN(area)']
379-
edss = canproco_pd[canproco_pd['pathology'] == 'MS']['edss_M0']
380-
#phen = canproco_pd[canproco_pd['pathology'] == 'MS']['phenotype']
381-
else:
382-
csa = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['MEAN(area)']
383-
edss = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['edss_M0']
384-
#phen = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['phenotype']
385-
x_vals, y_vals = compute_regression(csa, edss)
388+
var1 = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)][pair[0]]
389+
var2 = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)][pair[1]]
390+
#phen = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['phenotype']
391+
x_vals, y_vals = compute_regression(var1, var2)
392+
# TODO - Consider replacing by sns.regplot
386393
ax[index].plot(x_vals, y_vals, '--', color='black', alpha=.5, linewidth=3)
387394

388395
# Insert text with corr coef and pval into every subplot/axis
@@ -392,25 +399,20 @@ def create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig):
392399

393400
for color, phenotype in enumerate(['RRMS', 'PPMS', 'RIS']):
394401
# Prepare variables for plotting
395-
if site == 'all':
396-
csa = canproco_pd[canproco_pd['phenotype'] == phenotype]['MEAN(area)']
397-
edss = canproco_pd[canproco_pd['phenotype'] == phenotype]['edss_M0']
398-
r, p_val = compute_correlation(csa, edss)
399-
else:
400-
csa = canproco_pd[(canproco_pd['phenotype'] == phenotype) & (canproco_pd['site'] == site)]['MEAN(area)']
401-
edss = canproco_pd[(canproco_pd['phenotype'] == phenotype) & (canproco_pd['site'] == site)]['edss_M0']
402-
r, p_val = compute_correlation(csa, edss)
403-
print(f'{site}, {phenotype}: Correlation EDSS vs CSA: r={r}, p-value{format_pvalue(p_val, alpha=0.05)}')
402+
var1 = canproco_pd[(canproco_pd['phenotype'] == phenotype) & (canproco_pd['site'] == site)][pair[0]]
403+
var2 = canproco_pd[(canproco_pd['phenotype'] == phenotype) & (canproco_pd['site'] == site)][pair[1]]
404+
r, p_val = compute_correlation(var1, var2)
405+
print(f'{site}, {phenotype}: Correlation {pair[0]} vs {pair[1]}: r={r}, p-value{format_pvalue(p_val, alpha=0.05)}')
404406
# Plot individual scatter plots
405-
ax[index].scatter(csa, edss, color=color_pallete[color], alpha=.8, label=phenotype, s=100)
406-
x_vals, y_vals = compute_regression(csa, edss)
407+
ax[index].scatter(var1, var2, color=color_pallete[color], alpha=.8, label=phenotype, s=100)
408+
x_vals, y_vals = compute_regression(var1, var2)
407409
ax[index].plot(x_vals, y_vals, '--', color=color_pallete[color], alpha=.8, linewidth=3)
408410
if site == 'all':
409411
ax[index].set_title(site_to_vendor_title[site], fontsize=FONTSIZE_CORR, fontweight='bold')
410412
else:
411413
ax[index].set_title(site_to_vendor_title[site], fontsize=FONTSIZE_CORR)
412414
if index > 2:
413-
ax[index].set_xlabel('CSA [$mm^2$]', fontsize=FONTSIZE_CORR)
415+
ax[index].set_xlabel(variable_to_label[pair[0]], fontsize=FONTSIZE_CORR)
414416

415417
# # Set fixed number of y-ticks
416418
# xmin, xmax = ax[index].get_xlim()
@@ -419,7 +421,7 @@ def create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig):
419421
# ax[index].set_xticklabels(custom_ticks)
420422

421423
if index == 0 or index == 3:
422-
ax[index].set_ylabel('EDSS', fontsize=FONTSIZE_CORR)
424+
ax[index].set_ylabel(variable_to_label[pair[1]], fontsize=FONTSIZE_CORR)
423425
# Increase size of xticks and yticks
424426
plt.setp(ax[index].xaxis.get_majorticklabels(), fontsize=FONTSIZE_CORR)
425427
plt.setp(ax[index].yaxis.get_majorticklabels(), fontsize=FONTSIZE_CORR)
@@ -552,20 +554,63 @@ def read_participants_file(file_path):
552554
raise FileNotFoundError(f'{file_path} not found')
553555

554556

557+
def read_lesion_files(lesion_folder):
558+
"""
559+
Read xls files containing lesion volume generated by sct_analyze_lesion and aggregate them into one pandas DF
560+
"""
561+
list_of_files = os.listdir(lesion_folder)
562+
# Ignore .DS_Store
563+
if '.DS_Store' in list_of_files:
564+
list_of_files.remove('.DS_Store')
565+
list_of_files.sort()
566+
567+
# Initialize pandas dataFrame where lesion volume across all subjects will be stored
568+
lesion_df = pd.DataFrame(columns=['subject_id', 'lesion_volume', 'number_of_lesions'])
569+
570+
# Loop across subjects
571+
for file in list_of_files:
572+
# Get subject ID
573+
subject_id = file.split('_')[0]
574+
lesion_dict = {'subject_id': [], 'lesion_volume': [], 'number_of_lesions': []}
575+
# Construct path to xls file
576+
file_path = os.path.join(lesion_folder, file)
577+
# Read xls file as pandas dataFrame
578+
# run 'pip install xlrd' if you get an error
579+
df = pd.read_excel(file_path)
580+
lesion_dict['subject_id'] = subject_id
581+
# Sum lesion volume across all lesions
582+
lesion_dict['lesion_volume'] = df['volume [mm3]'].sum()
583+
# Get number of lesions (number of rows in the dataFrame)
584+
lesion_dict['number_of_lesions'] = df.shape[0]
585+
# Insert lesion_dict into lesion_df as a new row
586+
lesion_df.loc[subject_id] = lesion_dict
587+
588+
return lesion_df
589+
590+
555591
def main():
556592
parser = get_parser()
557593
args = parser.parse_args()
558594

595+
# ------------------------------------------------------
596+
# Read input files as pandas DataFrames
597+
# ------------------------------------------------------
559598
# Read .csv file for canproco subjects
560-
canproco_pd = read_csv_file(args.i_canproco, subjects_to_exclude_canproco)
599+
canproco_pd = read_csv_file(args.csa_canproco, subjects_to_exclude_canproco)
561600
# Read canproco participants.tsv file (includes pathology and phenotype columns)
562-
canproco_participants_pd = read_participants_file(args.participants_file_canproco)
601+
canproco_participants_pd = read_participants_file(args.participants_canproco)
563602

564603
# Read .csv file for spine-generic subjects
565-
spinegeneric_pd = read_csv_file(args.i_spinegeneric, subjects_to_exclude_spinegeneric)
604+
spinegeneric_pd = read_csv_file(args.csa_spinegeneric, subjects_to_exclude_spinegeneric)
566605
# Read spine-generic participants.tsv file (includes manufacturer column)
567-
spinegeneric_participants_pd = read_participants_file(args.participants_file_spinegeneric)
606+
spinegeneric_participants_pd = read_participants_file(args.participants_spinegeneric)
568607

608+
if args.lesion_folder:
609+
lesion_df = read_lesion_files(args.lesion_folder)
610+
611+
# ------------------------------------------------------
612+
# Merge and prepare DataFrames for further analysis
613+
# ------------------------------------------------------
569614
# Merge pathology and phenotype columns to the canproco dataframe with CSA values
570615
canproco_pd = pd.merge(canproco_pd, canproco_participants_pd[['participant_id', 'pathology', 'phenotype', 'edss_M0']],
571616
how='left', left_on='subject_id', right_on='participant_id')
@@ -576,10 +621,18 @@ def main():
576621
# Replace n/a in phenotype by HC to allow sorting in violinplot
577622
canproco_pd['phenotype'].fillna(canproco_pd['pathology'], inplace=True)
578623

624+
# Merge lesion_df to the canproco dataframe with CSA values
625+
if args.lesion_folder:
626+
canproco_pd = pd.merge(canproco_pd, lesion_df[['subject_id', 'lesion_volume', 'number_of_lesions']],
627+
how='left', left_on='subject_id', right_on='subject_id')
628+
579629
# Merge manufacturer column to the spine-generic dataframe with CSA values
580630
spinegeneric_pd = pd.merge(spinegeneric_pd, spinegeneric_participants_pd[['participant_id', 'manufacturer']],
581631
how='left', left_on='subject_id', right_on='participant_id')
582632

633+
# ------------------------------------------------------
634+
# Compute descriptive statistics
635+
# ------------------------------------------------------
583636
# Compute median, mean, std, cov persite and phenotype
584637
statistic = canproco_pd.groupby(['site', 'phenotype']).agg([np.median, np.mean, np.std, stats.variation])
585638
print(f'\nDescriptive statistics:\n{statistic}')
@@ -593,8 +646,11 @@ def main():
593646
temp_pd['site'] = 'all'
594647
canproco_pd = pd.concat([canproco_pd, temp_pd])
595648

649+
# ------------------------------------------------------
650+
# Create plots and compute between sites statistics
651+
# ------------------------------------------------------
596652
# Create rain plot
597-
fname_fig = args.i_canproco.replace('.csv', '_rainplot.png')
653+
fname_fig = args.csa_canproco.replace('.csv', '_rainplot.png')
598654
create_rainplot(canproco_pd, spinegeneric_pd, fname_fig)
599655

600656
# Compute ANOVA among phenotypes
@@ -611,9 +667,16 @@ def main():
611667
# Compare CSA values between canproco healthy controls and spine-generic per manufacturer
612668
compare_healthy_controls(canproco_pd, spinegeneric_pd)
613669

614-
# Compute and plot correlation between EDSS and CSA persite
615-
fname_fig = args.i_canproco.replace('.csv', '_correlation_persite.png')
616-
create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig)
670+
# Define pairs of variables for which correlation will be computed
671+
variable_pairs = [('MEAN(area)', 'edss_M0'),]
672+
if args.lesion_folder:
673+
variable_pairs.append(('MEAN(area)', 'lesion_volume'))
674+
variable_pairs.append(('edss_M0', 'lesion_volume'))
675+
# Compute and plot correlation between variables persite (including also the whole cohort).
676+
# For example between EDSS and CSA, or between EDSS and lesion volume
677+
for pair in variable_pairs:
678+
fname_fig = args.csa_canproco.replace('.csv', '_correlation_' + pair[0] + '_vs_' + pair[1] + '_persite.png')
679+
create_correlation_figures_persite(canproco_pd, pair, fname_fig)
617680

618681

619682
if __name__ == "__main__":

0 commit comments

Comments
 (0)