LECA Proteomics

PPI ML pipeline

# if setting up a new project directory:
# proj_dir must already be set. The ${proj_dir:?} form makes the cd fail
# loudly when it is empty/unset, instead of a bare `cd` landing in $HOME and
# the directory tree being created there.
cd "${proj_dir:?proj_dir must be set to the project directory}"
# -p keeps this step safe to re-run when some directories already exist
mkdir -p {data,results,logs,cmds}
mkdir -p data/{feature_files,feature_subsets,feature_matrix}
mkdir -p results/{tpot,feature_selection,ppi_predict,walktrap}
# NOTE(review): later steps also reference data/features, data/featmats,
# data/gold_stds and results/cross_val, none of which are created here — confirm
tree -d * # confirm directory structure

# ---------------------------------------------------
# build feature matrix
# ---------------------------------------------------

# pickle feats (run time ~5min)
# Writes one make_pkl.py command per entry of data/features/ that does not
# already end in .pkl, then runs the command list 24 at a time.
# A shell glob replaces the parsed `ls --ignore` output so file names with
# whitespace stay intact and no GNU-only ls option is needed.
# The `>` redirect recreates the command file from scratch on every run.
for feat_file in "${proj_dir}"/data/features/*; do
  # skip existing pickles, and the literal pattern when nothing matches
  [[ -e "${feat_file}" && "${feat_file}" != *.pkl ]] || continue
  printf 'python3 %s/make_pkl.py -i %s\n' "${script_dir}" "${feat_file}"
done > cmds/pickle_feat_files.sh
parallel -j24 < cmds/pickle_feat_files.sh

# combine subsets (run time ~45min)
# one build_featmat.py command per subset directory is written to
# cmds/join_subsets.sh; all six then run concurrently, with combined
# stdout/stderr mirrored to the log
for subset in all chicken dolphin mouse pig rabbit; do
  printf '%s\n' "python3 ${script_dir}/build_featmat.py --directory ${proj_dir}/data/feature_files/${subset}/ --pickle --outfile_name ${proj_dir}/data/feature_subsets/${subset}_features"
done > cmds/join_subsets.sh
parallel -j6 < cmds/join_subsets.sh 2>&1 \
  | tee ${proj_dir}/logs/join_subsets.log

# combine all (run time ~20min)
# same script, pointed at the directory holding the per-subset outputs
python3 ${script_dir}/build_featmat.py \
  --directory ${proj_dir}/data/feature_subsets/ \
  --pickle \
  --outfile_name ${proj_dir}/data/feature_matrix/feature_matrix \
  2>&1 | tee ${proj_dir}/logs/build_featmat.log

# ---------------------------------------------------
# label feature matrix
# ---------------------------------------------------

# label positive/negative PPIs (run time ~2 hours)
# the fixed seed keeps the negative PPI labels reproducible
# NOTE(review): the "combine all" step writes data/feature_matrix/feature_matrix,
# while this step reads data/featmats/featmat.pkl — confirm the intended location
python3 ${script_dir}/label_featmat.py \
  --featmat ${proj_dir}/data/featmats/featmat.pkl \
  --gold_std_file ${proj_dir}/data/gold_stds/all.gold.cmplx.noRibos.merged.txt \
  --outfile_name ${proj_dir}/data/featmats/featmat_labeled \
  --seed 13 \
  2>&1 | tee ${proj_dir}/logs/label_featmat.log

# ---------------------------------------------------
# get optimized ML pipeline
# ---------------------------------------------------

# tpot "generations" and "pop_size" set here to default cfmsflow params
# seed makes group splits, train/test splits, & model reproducible
# extremely slow (will probably run overnight)
# to speed things up reduce num_splits, generations, & pop_size
#
# one run per group split method, executed in sequence;
# each entry is <group_split_method>:<output subdir / log tag>:<seed>
# NOTE(review): the StratifiedGroupKFold run uses seed 13 while the other two
# use 17 — confirm this is intentional
for tpot_run in GroupShuffleSplit:gss:17 GroupKFold:gkfold:17 StratifiedGroupKFold:sgkfold:13; do
  IFS=: read -r split_method run_tag run_seed <<< "${tpot_run}"
  python3 ${script_dir}/run_tpot.py \
    --featmat ${proj_dir}/data/featmats/featmat_labeled_traintest.pkl \
    --outdir ${proj_dir}/results/tpot/${run_tag}/ \
    --group_split_method ${split_method} --num_splits 5 --train_size 0.75 \
    --generations 10 --pop_size 20 --seed ${run_seed} \
    2>&1 | tee ${proj_dir}/logs/run_tpot_${run_tag}.log
done

# get all test scores
# (the relative paths in this step look like they assume the current
# directory is results/tpot — TODO confirm)
grep -H 'Test set score' ../../logs/run_tpot*log > all_test_scores.txt

# get all pipelines
# preview of the extracted region for a single pipeline file
sed -n '/Average/,/Fix random/p' gss/tpot_pipeline_1

# all_pipelines.txt is rebuilt from scratch: per split directory, a three-line
# banner followed by the Average..Fix random region of every tpot_pipeline*
# file, minus the two "Fix random state" comment lines, with a blank line
# inserted before each "# Average"
for split_dir in gkfold gss sgkfold; do
  printf '%s\n' \
    "-------------------------------------------------------" \
    "-------------------- ${split_dir} ---------------------------" \
    "-------------------------------------------------------"
  sed -n '/Average/,/Fix random/p' ${split_dir}/tpot_pipeline* |
    sed -e '/# Fix random state for all the steps in exported pipeline/d' \
        -e '/# Fix random state in exported estimator/d' \
        -e 's/# Average/\n# Average/'
done > all_pipelines.txt

# best models:
#   gkfold/tpot_model_3  -> Linear SVC (3 steps)
#   gkfold/tpot_model_5  -> SGD (3 steps)
#   gss/tpot_model_1     -> KNN (3 steps)
#   gss/tpot_model_4     -> ExtraTrees (1 step)

# ---------------------------------------------------
# run recursive feature elimination
# ---------------------------------------------------

# choose best tpot models as input
# note: KNN does not provide feature selection logic

# seed makes group splits, train/test splits, & model reproducible
# highly variable run time (will take 0.25-6+ hours depending on # of features)
# to speed things up increase remove_per_step value

# one select_features.py run per chosen model, executed in sequence;
# each entry is <model path under results/tpot, without .pkl>:<log tag>
#   [[ LinearSVC ]]      gkfold/tpot_model_3
#   [[ SGDClassifier ]]  gkfold/tpot_model_5
#   [[ ExtraTrees ]]     gss/tpot_model_4
for rfe_run in gkfold/tpot_model_3:SVC gkfold/tpot_model_5:SGD gss/tpot_model_4:ExtraTrees; do
  model_stem=${rfe_run%%:*}
  log_tag=${rfe_run##*:}
  python3 ${script_dir}/select_features.py \
    --featmat ${proj_dir}/data/featmats/featmat_labeled_traintest.pkl \
    --model ${proj_dir}/results/tpot/${model_stem}.pkl \
    --group_split_method GroupKFold \
    --outdir ${proj_dir}/results/feature_selection/ \
    --threads 12 --remove_per_step 3 --num_splits 5 --seed 17 \
    2>&1 | tee ${proj_dir}/logs/select_features_${log_tag}.log
done

# ---------------------------------------------------
# predict PPIs
# ---------------------------------------------------

# seed makes group splits, train/test splits, & model reproducible
# fairly quick (run time ~5-15min)

# script_dir: directory holding the pipeline's python scripts
# proj_dir:   project working directory (data/, results/, logs/, cmds/)
# NOTE: every section of this file expands these two variables, including the
# sections before this point, so set them before running any step
script_dir='/stor/work/Marcotte/project/rmcox/leca/scripts'
proj_dir='/stor/work/Marcotte/project/rmcox/leca/ppi_ml'

# with seed=777 & train_size=0.5, actual train/test split is 75.6/24.4

# [[ LinearSVC ]]
# generate predictions using all features
python3 ${script_dir}/predict_ppis.py \
  --featmat ${proj_dir}/data/featmats/featmat_labeled.pkl \
  --model ${proj_dir}/results/tpot/gkfold/tpot_model_3.pkl \
  --outdir ${proj_dir}/results/ppi_predict/feature_sweep/all/ \
  --fdr_cutoff 0.1 --seed 777 --train_size 0.5 \
  2>&1 | tee ${proj_dir}/logs/predict_ppis_LinearSVC_all_feats.log

# generate predictions across a sweep of features
# each generated line is a complete command that tees to its own log;
# the six commands are then run up to five at a time
for n_feats in 5 10 25 50 100 250; do
  sweep_cmd="python3 ${script_dir}/predict_ppis.py"
  sweep_cmd+=" --featmat ${proj_dir}/data/featmats/featmat_labeled.pkl"
  sweep_cmd+=" --model ${proj_dir}/results/tpot/gkfold/tpot_model_3.pkl"
  sweep_cmd+=" --outdir ${proj_dir}/results/ppi_predict/feature_sweep/${n_feats}"
  sweep_cmd+=" --feature_selection ${proj_dir}/results/feature_selection/top_feats_LinearSVC_RFECV_all.csv"
  sweep_cmd+=" --num_feats ${n_feats} --fdr_cutoff 0.1 --seed 777 --train_size 0.5"
  sweep_cmd+=" 2>&1 | tee ${proj_dir}/logs/predict_ppis_LinearSVC_${n_feats}_feats.log"
  echo "${sweep_cmd}"
done > cmds/sweep_feats_LinearSVC.sh
parallel -j5 < cmds/sweep_feats_LinearSVC.sh


# [[ SGDClassifier ]]
# generate predictions using all features
python3 ${script_dir}/predict_ppis.py \
  --featmat ${proj_dir}/data/featmats/featmat_labeled.pkl \
  --model ${proj_dir}/results/tpot/gkfold/tpot_model_5.pkl \
  --outdir ${proj_dir}/results/ppi_predict/feature_sweep/all/ \
  --fdr_cutoff 0.1 --seed 777 --train_size 0.5 \
  2>&1 | tee ${proj_dir}/logs/predict_ppis_SGDClassifier_all_feats.log

# generate predictions across a sweep of features
# each generated line is a complete command that tees to its own log;
# the six commands are then run up to five at a time
for n_feats in 5 10 25 50 100 250; do
  sweep_cmd="python3 ${script_dir}/predict_ppis.py"
  sweep_cmd+=" --featmat ${proj_dir}/data/featmats/featmat_labeled.pkl"
  sweep_cmd+=" --model ${proj_dir}/results/tpot/gkfold/tpot_model_5.pkl"
  sweep_cmd+=" --outdir ${proj_dir}/results/ppi_predict/feature_sweep/${n_feats}"
  sweep_cmd+=" --feature_selection ${proj_dir}/results/feature_selection/top_feats_SGDClassifier_RFECV_all.csv"
  sweep_cmd+=" --num_feats ${n_feats} --fdr_cutoff 0.1 --seed 777 --train_size 0.5"
  sweep_cmd+=" 2>&1 | tee ${proj_dir}/logs/predict_ppis_SGDClassifier_${n_feats}_feats.log"
  echo "${sweep_cmd}"
done > cmds/sweep_feats_SGDClassifier.sh
parallel -j5 < cmds/sweep_feats_SGDClassifier.sh


# [[ ExtraTrees ]]
# generate predictions using all features
python3 ${script_dir}/predict_ppis.py \
  --featmat ${proj_dir}/data/featmats/featmat_labeled.pkl \
  --model ${proj_dir}/results/tpot/gss/tpot_model_4.pkl \
  --outdir ${proj_dir}/results/ppi_predict/feature_sweep/all/ \
  --fdr_cutoff 0.1 --seed 777 --train_size 0.5 \
  2>&1 | tee ${proj_dir}/logs/predict_ppis_ExtraTreesClassifier_all_feats.log

# generate predictions across a sweep of features
# each generated line is a complete command that tees to its own log;
# the six commands are then run up to five at a time
for n_feats in 5 10 25 50 100 250; do
  sweep_cmd="python3 ${script_dir}/predict_ppis.py"
  sweep_cmd+=" --featmat ${proj_dir}/data/featmats/featmat_labeled.pkl"
  sweep_cmd+=" --model ${proj_dir}/results/tpot/gss/tpot_model_4.pkl"
  sweep_cmd+=" --outdir ${proj_dir}/results/ppi_predict/feature_sweep/${n_feats}"
  sweep_cmd+=" --feature_selection ${proj_dir}/results/feature_selection/top_feats_ExtraTreesClassifier_RFECV_all.csv"
  sweep_cmd+=" --num_feats ${n_feats} --fdr_cutoff 0.1 --seed 777 --train_size 0.5"
  sweep_cmd+=" 2>&1 | tee ${proj_dir}/logs/predict_ppis_ExtraTreesClassifier_${n_feats}_feats.log"
  echo "${sweep_cmd}"
done > cmds/sweep_feats_ExtraTreesClassifier.sh
parallel -j5 < cmds/sweep_feats_ExtraTreesClassifier.sh

# ---------------------------------------------------
# run cross-validation
# ---------------------------------------------------

# one predict_ppis.py cross-validation run (GroupKFold, 5 splits) per model,
# executed in sequence;
# each entry is <classifier name>:<model path under results/tpot, without .pkl>:<num_feats>
#   [[ ExtraTrees ]]     gss/tpot_model_4     25 feats
#   [[ SGDClassifier ]]  gkfold/tpot_model_5  25 feats
#   [[ LinearSVC ]]      gkfold/tpot_model_3  100 feats
for cv_run in ExtraTreesClassifier:gss/tpot_model_4:25 SGDClassifier:gkfold/tpot_model_5:25 LinearSVC:gkfold/tpot_model_3:100; do
  IFS=: read -r clf_name model_stem n_feats <<< "${cv_run}"
  python3 ${script_dir}/predict_ppis.py \
    --featmat ${proj_dir}/data/featmats/featmat_labeled.pkl \
    --model ${proj_dir}/results/tpot/${model_stem}.pkl \
    --outdir ${proj_dir}/results/cross_val \
    --feature_selection ${proj_dir}/results/feature_selection/top_feats_${clf_name}_RFECV_all.csv \
    --group_split_method GroupKFold --num_splits 5 \
    --num_feats ${n_feats} --fdr_cutoff 0.1 --seed 777 --train_size 0.5 \
    2>&1 | tee ${proj_dir}/logs/crossval_${clf_name}_${n_feats}_feats.log
done

# ---------------------------------------------------
# cluster PPIs
# ---------------------------------------------------

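# best models:
# ExtraTrees (100 feats) -> ppi_predict/feature_sweep/100/scored_interactions_fdr10_ExtraTreesClassifier.csv
# SGD (100 feats) -> ppi_predict/feature_sweep/100/scored_interactions_fdr10_SGDClassifier.csv
# NOTE(review): the clustering commands in this section read feature_sweep/25 for ExtraTrees and SGD, and feature_sweep/100 for LinearSVC — confirm which feature counts are intended.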

# seed makes results reproducible
# very fast (run time ~1min)

# [[ LinearSVC ]]
# run detect_communities.py on the 100-feature, FDR 10% scores for each
# --steps value; one command per line, each teeing to its own log
for n_steps in 3 4 5 6 7; do
  run_name="LinearSVC_100feats_fdr10_${n_steps}steps"
  wt_cmd="python3 ${script_dir}/detect_communities.py"
  wt_cmd+=" --scores ${proj_dir}/results/ppi_predict/feature_sweep/100/scored_interactions_fdr10_LinearSVC.csv"
  wt_cmd+=" --steps ${n_steps}"
  wt_cmd+=" --outfile ${proj_dir}/results/walktrap/step_sweep/${run_name}"
  wt_cmd+=" --annotations ${proj_dir}/annotations/leca_eunog_annots_complete.030721.csv"
  wt_cmd+=" --seed 777 2>&1 | tee ${proj_dir}/logs/walktrap_${run_name}.log"
  echo "${wt_cmd}"
done > cmds/sweep_walktrap_steps_LinearSVC_100feats_fdr10.sh
parallel -j5 < cmds/sweep_walktrap_steps_LinearSVC_100feats_fdr10.sh

# calculate precision-recall per cut
# reads the <run_name>.csv files named after the --outfile values used above
for n_steps in 3 4 5 6 7; do
  run_name="LinearSVC_100feats_fdr10_${n_steps}steps"
  pr_cmd="python3 ${script_dir}/calc_cluster_pr.py"
  pr_cmd+=" --cluster_file ${proj_dir}/results/walktrap/step_sweep/${run_name}.csv"
  pr_cmd+=" --positive_ppi_dict ${proj_dir}/data/featmats/positive_ppi_dict.pkl"
  pr_cmd+=" --negative_ppi_dict ${proj_dir}/data/featmats/negative_ppi_dict.pkl"
  pr_cmd+=" --outfile ${proj_dir}/results/walktrap/step_sweep/${run_name}_precision_recall.csv"
  pr_cmd+=" 2>&1 | tee ${proj_dir}/logs/walktrap_${run_name}_pr.log"
  echo "${pr_cmd}"
done > cmds/walktrap_pr_LinearSVC_100feats_fdr10.sh
parallel -j5 < cmds/walktrap_pr_LinearSVC_100feats_fdr10.sh

# [[ ExtraTrees ]]
# run detect_communities.py on the 25-feature, FDR 10% scores for each
# --steps value; one command per line, each teeing to its own log
for n_steps in 3 4 5 6 7; do
  run_name="ExtraTreesClassifier_25feats_fdr10_${n_steps}steps"
  wt_cmd="python3 ${script_dir}/detect_communities.py"
  wt_cmd+=" --scores ${proj_dir}/results/ppi_predict/feature_sweep/25/scored_interactions_fdr10_ExtraTreesClassifier.csv"
  wt_cmd+=" --steps ${n_steps}"
  wt_cmd+=" --outfile ${proj_dir}/results/walktrap/step_sweep/${run_name}"
  wt_cmd+=" --annotations ${proj_dir}/annotations/leca_eunog_annots_complete.030721.csv"
  wt_cmd+=" --seed 777 2>&1 | tee ${proj_dir}/logs/walktrap_${run_name}.log"
  echo "${wt_cmd}"
done > cmds/sweep_walktrap_steps_ExtraTreesClassifier_25feats_fdr10.sh
parallel -j5 < cmds/sweep_walktrap_steps_ExtraTreesClassifier_25feats_fdr10.sh

# calculate precision-recall per cut
# reads the <run_name>.csv files named after the --outfile values used above
for n_steps in 3 4 5 6 7; do
  run_name="ExtraTreesClassifier_25feats_fdr10_${n_steps}steps"
  pr_cmd="python3 ${script_dir}/calc_cluster_pr.py"
  pr_cmd+=" --cluster_file ${proj_dir}/results/walktrap/step_sweep/${run_name}.csv"
  pr_cmd+=" --positive_ppi_dict ${proj_dir}/data/featmats/positive_ppi_dict.pkl"
  pr_cmd+=" --negative_ppi_dict ${proj_dir}/data/featmats/negative_ppi_dict.pkl"
  pr_cmd+=" --outfile ${proj_dir}/results/walktrap/step_sweep/${run_name}_precision_recall.csv"
  pr_cmd+=" 2>&1 | tee ${proj_dir}/logs/walktrap_${run_name}_pr.log"
  echo "${pr_cmd}"
done > cmds/walktrap_pr_ExtraTreesClassifier_25feats_fdr10.sh
parallel -j5 < cmds/walktrap_pr_ExtraTreesClassifier_25feats_fdr10.sh

# [[ SGDClassifier ]]
# run detect_communities.py on the 25-feature, FDR 10% scores for each
# --steps value; one command per line, each teeing to its own log
for n_steps in 3 4 5 6 7; do
  run_name="SGDClassifier_25feats_fdr10_${n_steps}steps"
  wt_cmd="python3 ${script_dir}/detect_communities.py"
  wt_cmd+=" --scores ${proj_dir}/results/ppi_predict/feature_sweep/25/scored_interactions_fdr10_SGDClassifier.csv"
  wt_cmd+=" --steps ${n_steps}"
  wt_cmd+=" --outfile ${proj_dir}/results/walktrap/step_sweep/${run_name}"
  wt_cmd+=" --annotations ${proj_dir}/annotations/leca_eunog_annots_complete.030721.csv"
  wt_cmd+=" --seed 777 2>&1 | tee ${proj_dir}/logs/walktrap_${run_name}.log"
  echo "${wt_cmd}"
done > cmds/sweep_walktrap_steps_SGDClassifier_25feats_fdr10.sh
parallel -j5 < cmds/sweep_walktrap_steps_SGDClassifier_25feats_fdr10.sh


# calculate precision-recall per cut
# reads the <run_name>.csv files named after the --outfile values used above
for n_steps in 3 4 5 6 7; do
  run_name="SGDClassifier_25feats_fdr10_${n_steps}steps"
  pr_cmd="python3 ${script_dir}/calc_cluster_pr.py"
  pr_cmd+=" --cluster_file ${proj_dir}/results/walktrap/step_sweep/${run_name}.csv"
  # two leading spaces: keeps the generated command text unchanged
  pr_cmd+="  --positive_ppi_dict ${proj_dir}/data/featmats/positive_ppi_dict.pkl"
  pr_cmd+=" --negative_ppi_dict ${proj_dir}/data/featmats/negative_ppi_dict.pkl"
  pr_cmd+=" --outfile ${proj_dir}/results/walktrap/step_sweep/${run_name}_precision_recall.csv"
  pr_cmd+=" 2>&1 | tee ${proj_dir}/logs/walktrap_${run_name}_pr.log"
  echo "${pr_cmd}"
done > cmds/walktrap_pr_SGDClassifier_25feats_fdr10.sh
parallel -j5 < cmds/walktrap_pr_SGDClassifier_25feats_fdr10.sh

# ---------------------------------------------------
# sanity check the pipeline
# ---------------------------------------------------

# pretty quick (run time ~5min)
# will make a nice table in the log file
# The log is written under ${proj_dir}/logs like every other step in this
# file (it previously pointed at a hard-coded path in a different project).
# NOTE(review): the inputs in this section use data/feature_matrix/... and
# results/ppi_predict/all_feats/..., whereas the labeling and prediction steps
# use data/featmats/... and results/ppi_predict/feature_sweep/all/ — confirm
# which paths are current
python3 ${script_dir}/sanity_checks.py \
  --featmat_file ${proj_dir}/data/feature_matrix/feature_matrix_labeled.pkl \
  --results_file ${proj_dir}/results/ppi_predict/all_feats/scored_interactions_all_ExtraTreesClassifier.csv \
  2>&1 | tee ${proj_dir}/logs/sanity_checks.log

# can also check group merge stats
# re-labels the feature matrix with --keep_cmplx_overlap under a separate
# output name, then runs the same sanity checks on that output
# NOTE(review): the gold standard below is an absolute path outside
# ${proj_dir}; the main labeling step uses
# ${proj_dir}/data/gold_stds/all.gold.cmplx.noRibos.merged.txt — confirm this
# is the intended file
python3 ${script_dir}/label_featmat.py \
  --featmat ${proj_dir}/data/feature_matrix/feature_matrix.pkl \
  --gold_std_file /project/vyqtdang/MS_brain/nextflow_brain/cfmsflow/gold_std/verNOG/CORUM4.1_core_merged.txt \
  --outfile_name ${proj_dir}/data/feature_matrix/feature_matrix_labeled_allgroups \
  --keep_cmplx_overlap --seed 13 \
  2>&1 | tee ${proj_dir}/logs/label_featmat_allgroups.log

python3 ${script_dir}/sanity_checks.py \
  --featmat_file ${proj_dir}/data/feature_matrix/feature_matrix_labeled_allgroups_traintest.pkl \
  --results_file ${proj_dir}/results/ppi_predict/all_feats/scored_interactions_all_ExtraTreesClassifier.csv \
  2>&1 | tee ${proj_dir}/logs/sanity_checks_groups.log

Name		Name	Last commit message	Last commit date
Latest commit History 94 Commits
figures		figures
notebooks		notebooks
scripts		scripts
.gitignore		.gitignore
README.md		README.md
environment.yml		environment.yml

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

LECA Proteomics

PPI ML pipeline

About

Releases

Packages

Languages

marcottelab/leca-proteomics

Folders and files

Latest commit

History

Repository files navigation

LECA Proteomics

PPI ML pipeline

About

Resources

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages