codes & figures
rintukutum committed Aug 31, 2021
1 parent 8debb44 commit 3020820
Showing 7,354 changed files with 622,787 additions and 0 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
6 changes: 6 additions & 0 deletions .gitignore
@@ -27,6 +27,12 @@ share/python-wheels/
*.egg
MANIFEST

# Custom Paths
data/
covseq/
models/
main/

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
43 changes: 43 additions & 0 deletions 01-A1-train-preprocessing.py
@@ -0,0 +1,43 @@
"""
Script to use Covseq (https://github.com/boxiangliu/covseq) to read genome sequences from FASTA files
and identify the different genomic regions (the annotation step generates a TSV).
"""
import os
import glob


covseq_script_path = "covseq/"
unprocessed_path = "data/fasta_preprocessing/unprocessed/"
processed_path = "data/fasta_preprocessing/processed/"
segmented_path = "data/fasta_preprocessing/segmented/"


def clean_fasta_files():
raw_fasta_files = glob.glob(os.path.join(unprocessed_path, "**/*.fasta"), recursive=True)
for file in raw_fasta_files:
filename = os.path.basename(file)
out_file = os.path.join(processed_path, filename)
print(out_file)
try:
cmd = f"sed -e 's/ /-/g' -e 's/|/_/g' '{file}' > '{out_file}'"
os.system(cmd)
except Exception as e:
            print(f'FAILED for {filename}. Reason: {e}')

def get_tsv_with_covseq():
processed_fasta_files = glob.glob(f'{processed_path}/*.fasta')
for file in processed_fasta_files:
filename = os.path.basename(file)
print(filename)
out_folder = os.path.join(segmented_path, filename)
os.system(f'mkdir -p {out_folder}')
cmd = f"cd {covseq_script_path} && python annotation/annotation.py --fasta '{os.path.abspath(file)}' --out_dir '{os.path.abspath(out_folder)}'"
os.system(cmd)

def main():
clean_fasta_files()
get_tsv_with_covseq()


if __name__ == '__main__':
main()
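
For clarity, the sed call in clean_fasta_files only rewrites two characters in the raw files; a minimal pure-Python sketch of the same cleaning step (illustrative, not part of the commit) is:

def clean_fasta_text(in_path, out_path):
    # Replace spaces with '-' and '|' with '_' so the FASTA description becomes a single
    # underscore-delimited token that the later scripts can split on.
    with open(in_path) as src, open(out_path, "w") as dst:
        for line in src:
            dst.write(line.replace(" ", "-").replace("|", "_"))

Assuming the raw headers use '|' separators, a GISAID-style description such as "hCoV-19/Mayotte/IPP02391/2021|EPI_ISL_1167000|2021-01-21" would come out as "hCoV-19/Mayotte/IPP02391/2021_EPI_ISL_1167000_2021-01-21", which is the format parsed by 01-A2-train-preprocessing.py.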
186 changes: 186 additions & 0 deletions 01-A2-train-preprocessing.py
@@ -0,0 +1,186 @@
"""
Script to read the GISAID preprocessed fasta files and generate country-wise CSVs.
Pipeline:
Read each fasta file.
Get metadata & genome sequence for each strain from fasta file.
Find relevant folder (generated by covseq script) that contains the TSV corresponding to the strain in the fasta file.
Get the start and end of each peptide from the TSV and append the sequence metadata to a per-fasta-file CSV.
Split that CSV by country and write a separate CSV per country, appending if the country CSV already exists.
"""
import os
import glob
import logging
import traceback
import pandas as pd
from Bio import SeqIO


PATH_FASTA_DIR = "data/fasta_preprocessing/processed/" # Contains the preprocessed fasta files
PATH_TSV_FOLDERS = "data/fasta_preprocessing/segmented/" # Contains the folders containing the TSVs
PATH_OUTPUT_DIR = "data/fasta_preprocessing/CSVs/countrywise/"

SEQ_METADATA_COLS = ['Peptide_Name', 'Sequence', 'Sequence_Length', 'DividesBy3', 'Triples_Count', 'Full_Sequence_Length'] # 3mers removed
OTHER_METADATA_COLS = ['Accession_ID', 'Virus_Name', 'Country', 'Collection_Date']
OUTPUT_COLS = ['Accession_ID', 'Virus_Name', 'Country', 'Peptide_Name', 'Sequence', 'Sequence_Length',
'DividesBy3', 'Triples_Count', 'Full_Sequence_Length', 'Collection_Date']

logging.basicConfig(filename='app.log', filemode='w', format='%(asctime)s - %(levelname)s - %(message)s')


def validate_sequence(sequence, full_sequence=False):
if full_sequence:
bases_check = (len(set(sequence) - set("ATGC")) == 0)
if bases_check:
return True
else:
start_codon_check = (sequence[:3] == 'ATG')
end_codon_check = (sequence[-3:] in ['TAA', 'TAG', 'TGA'])
if (start_codon_check and end_codon_check):
return True
return False
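
# Illustrative behaviour (not part of the original script):
#   validate_sequence("ATGAAATTTGGGTAA")               -> True   (starts with ATG, ends with a stop codon)
#   validate_sequence("ATGAAATTTGGGCCC")               -> False  (no stop codon at the end)
#   validate_sequence("ATGNNNTAA", full_sequence=True) -> False  (N is not one of A/T/G/C)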


def get_fasta_strains(filepath):
strains_for_file = []
fasta_sequences = SeqIO.parse(open(filepath),'fasta')
for fasta in fasta_sequences:
description, sequence = fasta.description, str(fasta.seq)
if not validate_sequence(sequence, full_sequence=True):
continue
splitted_descr = str(description).split('_') # hCoV-19/Mayotte/IPP02391/2021_EPI_ISL_1167000_2021-01-21
virus_name = splitted_descr[0]
country = virus_name.split('/')[1]
accession_id = '_'.join(splitted_descr[1:-1])
collection_date = splitted_descr[-1]
strains_for_file.append([accession_id, virus_name, country, sequence, collection_date])
return strains_for_file
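
# Illustrative parse of the header format shown above (assumes the GISAID-style description
# "hCoV-19/Mayotte/IPP02391/2021_EPI_ISL_1167000_2021-01-21"):
#   virus_name      -> "hCoV-19/Mayotte/IPP02391/2021"
#   country         -> "Mayotte"
#   accession_id    -> "EPI_ISL_1167000"
#   collection_date -> "2021-01-21"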


def handle_orf1ab(start, end, full_sequence):
# print(val['Start'], val['End'])
if ',' in start:
vals = start.split(',')
start_a = int(vals[0].strip())
start_b = int(vals[1].strip())
else:
start_a = int(start[:3])
start_b = int(start[3:])
if ',' in end:
vals = end.split(',')
end_a = int(vals[0].strip())
end_b = int(vals[1].strip())
else:
end_a = int(end[:5])
end_b = int(end[5:])
if end_a == start_b:
sequence = full_sequence[start_a-1: end_b]
else:
sequence = full_sequence[start_a-1: end_a] + full_sequence[start_b-1: end_b]
return sequence
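
# Note (added for clarity): orf1ab is annotated as two joined ranges because of the -1
# ribosomal frameshift between ORF1a and ORF1b, so the TSV's Start/End fields can hold two
# comma-separated values (e.g. "266,13468" and "13483,21555" in the reference genome).
# When the first range ends exactly where the second starts, a single slice suffices;
# otherwise the two (possibly overlapping) slices are concatenated.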


def get_amino_sequences(tsv, full_sequence, n_for_nmers=3):
seq_df = pd.DataFrame(columns = SEQ_METADATA_COLS)
if len(tsv) != 12:
raise Exception('TSV length not 12 for the Strain')
for _, val in tsv.iterrows():
peptide_name = val['Product']
if peptide_name == 'orf1ab polyprotein': # Special handling reqd.
sequence = handle_orf1ab(val['Start'], val['End'], full_sequence)
else:
sequence = full_sequence[int(val['Start'])-1: int(val['End'])]
if not validate_sequence(sequence):
raise Exception('Problem in start/end codon')
seq_length = len(sequence) # Ambiguity in TSV lengths
dividesbythree = (seq_length/3).is_integer()
triples_count = seq_length//3
row = pd.Series([peptide_name, sequence, seq_length, dividesbythree, triples_count, len(full_sequence)],
index=SEQ_METADATA_COLS)
seq_df = seq_df.append(row, ignore_index=True)
return seq_df


def find_covseq_tsv(country, accession_id):
tsv_dir_path = glob.glob(f'{PATH_TSV_FOLDERS}/*/*{accession_id}*')
if tsv_dir_path:
tsvs = os.listdir(tsv_dir_path[0])
tsv = [f for f in tsvs if 'orf' in f]
if tsv:
tsv_name = tsv[0]
return f'{tsv_dir_path[0]}/{tsv_name}'


def get_df_for_strain(strain_metadata):
accession_id, virus_name, country, collection_date = strain_metadata[0], strain_metadata[1], strain_metadata[2], strain_metadata[4]
sequence = strain_metadata[3]
tsv_path = find_covseq_tsv(country, accession_id)
print(tsv_path)
if tsv_path:
tsv = pd.read_csv(tsv_path, delimiter="\t")
try:
seq_df = get_amino_sequences(tsv, sequence)
except Exception as e:
raise Exception(f"{str(e)}. ID:{accession_id}; Country:{country}")
other_df = pd.DataFrame([[accession_id, virus_name, country, collection_date]], columns=OTHER_METADATA_COLS)
other_df = pd.concat([other_df]*len(seq_df), ignore_index=True) # Replicate rows for each protein
strain_df = pd.concat([other_df, seq_df], axis=1)
return strain_df
else:
raise Exception(f"TSV does not exist for strain. ID:{accession_id}; Country:{country}")


def export_countrywise_csvs(df_for_fasta):
countries = list(set(df_for_fasta['Country'].values))
for country in countries:
country_df = df_for_fasta[df_for_fasta['Country']==country]
output_path = f'{PATH_OUTPUT_DIR}/{country}.csv'
if os.path.exists(output_path):
df = pd.read_csv(output_path)
out_df = df.append(country_df, ignore_index=True)
else:
out_df = country_df
out_df = out_df.drop_duplicates(subset=['Accession_ID', 'Peptide_Name'])
out_df.to_csv(output_path, index=False)


def clean_output_dir():
result_files = glob.glob(f'{PATH_OUTPUT_DIR}/*')
for file in result_files:
try:
os.remove(file)
except:
pass
print("Cleaned output directory")


def main(cleanup=False):
if cleanup:
clean_output_dir()

fasta_files = os.listdir(PATH_FASTA_DIR)

for file in fasta_files:
print(f"PROCESSING: {file}")
filepath = f'{PATH_FASTA_DIR}/{file}'
try:
strains_for_file = get_fasta_strains(filepath) # [[accession_id, virus_name, country, sequence, collection_date]]
except Exception as e:
print(f"ERROR getting strains from fasta file. Reason: {e}")
logging.error(f"ERROR getting strains from fasta file. Reason: {e}")
# traceback.print_exc(file=sys.stdout)
continue

df_for_fasta = pd.DataFrame(columns = OUTPUT_COLS)
for strain_metadata in strains_for_file:
try:
strain_df = get_df_for_strain(strain_metadata)
df_for_fasta = df_for_fasta.append(strain_df, ignore_index=True)
except Exception as e:
print(f"ERROR processing strain. Reason: {e}")
logging.error(f"ERROR processing strain. Reason: {e}")
# traceback.print_exc(file=sys.stdout)
if not df_for_fasta.empty:
export_countrywise_csvs(df_for_fasta)

if __name__ == "__main__":
main(cleanup=False)
57 changes: 57 additions & 0 deletions 02-A-train-w2v.py
@@ -0,0 +1,57 @@
"""
Script to train a word2vec model and export embeddings for spike gene sequences.
"""
import glob
import pandas as pd
from gensim.models import Word2Vec

sequences_dir = "data/fasta_preprocessing/CSVs/countrywise"
model_outpath = "models/word2vec-spike-all-countries-36-VS.model"
output_filename = "data/w2v_output/word2vec-spike-all-countries-36-VS.csv"


def get_kmers(sequence, n_for_nmers=3):
n_mers = []
seq_length = len(sequence)
for i in range(0, seq_length, n_for_nmers):
n_mers.append(sequence[i: i+n_for_nmers])
return n_mers
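
# Illustrative example (not part of the original script):
#   get_kmers("ATGGCTTCA") -> ["ATG", "GCT", "TCA"]
#   A trailing fragment shorter than n is kept, e.g. get_kmers("ATGGC") -> ["ATG", "GC"].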


def word2vec_model(tokens, vector_size, window_size):
    model = Word2Vec(tokens, size = vector_size, window = window_size, min_count = 1, sg = 1)  # gensim < 4.0 API ('size' was renamed to 'vector_size' in gensim 4.0)
return model


def create_vec(df, size, col, model):
vector = []
for _, row in df.iterrows():
vec = [0 for i in range(size)]
seq = row[col]
for trip in seq:
wv = model.wv[trip]
for i in range(size):
vec[i] += wv[i]
vec = [i/len(seq) for i in vec]
vector.append(vec)
return vector
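
# create_vec mean-pools the word2vec vectors of a sequence's k-mers. An equivalent numpy
# sketch (illustrative only):
#   import numpy as np
#   vec = np.mean([model.wv[kmer] for kmer in row[col]], axis=0)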


comb_data = pd.DataFrame()
for file in glob.glob(f'{sequences_dir}/*.csv'):
    comb_data = pd.concat([comb_data, pd.read_csv(file)], axis=0, ignore_index=True)

comb_data['3-mers'] = comb_data['Sequence'].apply(get_kmers)

tokens3 = []
max3 = 0
for seq in comb_data['3-mers']:
tokens3.append(seq)
max3 = max(max3, len(seq))

model = word2vec_model(tokens3, 36, 20)
vector = create_vec(comb_data, 36, '3-mers', model)
data1 = pd.concat([comb_data['Accession_ID'], comb_data['Collection_Date'], comb_data['Country'], pd.DataFrame(vector)], axis=1)
data1.to_csv(output_filename, index = False)
model.save(model_outpath)
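
A quick sanity check of the exported artifacts (a minimal sketch, assuming the gensim < 4.0 API used above and the paths defined in this script):

from gensim.models import Word2Vec
import pandas as pd

m = Word2Vec.load("models/word2vec-spike-all-countries-36-VS.model")
print(m.wv["ATG"].shape)   # one 36-dimensional vector per 3-mer

emb = pd.read_csv("data/w2v_output/word2vec-spike-all-countries-36-VS.csv")
print(emb.shape)           # Accession_ID, Collection_Date, Country + 36 embedding columns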
88 changes: 88 additions & 0 deletions 02-B-pip-loss.py
@@ -0,0 +1,88 @@
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from gensim.models import Word2Vec
from sklearn.linear_model import LinearRegression

models_path = "models/test/"


def create_df(model):
df = pd.DataFrame()
for vocab in model.wv.vocab.keys():
df = pd.concat([df, pd.DataFrame(model.wv[vocab]).T])
df.index = model.wv.vocab.keys()
return df


def calculate_loss(df1, df2):
loss = np.sqrt(((df1.dot(df1.T) - df2.dot(df2.T))**2).sum().sum())
return loss
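
# Note (added for clarity): calculate_loss is the PIP (Pairwise Inner Product) loss between
# two embedding matrices X and Y, i.e. the Frobenius norm || X X^T - Y Y^T ||_F; the
# dot/sum/sqrt chain above evaluates exactly that.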


dim = [3,6,9,10,12,15,18,20,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,64]
keys = []
values = []
errors = {}
for ind in range(len(dim)-1):
model1 = Word2Vec.load(models_path + "word2vec-spike-aus-uk-3mers-"+str(dim[ind])+"-VS.model")
model2 = Word2Vec.load(models_path + "word2vec-spike-aus-uk-3mers-"+str(dim[ind+1])+"-VS.model")

df1 = create_df(model1)
df2 = create_df(model2)
    df2 = df2.reindex(df1.index)  # reindex returns a new frame; align df2's vocabulary order with df1

pip_loss = calculate_loss(df1, df2)
errors[str(dim[ind])+"_"+str(dim[ind+1])] = pip_loss
keys.append(str(dim[ind+1]) +"_"+ str(dim[ind]))
values.append(pip_loss)

values = values[::-1]
values = np.cumsum(values)
keys = keys[::-1]
key_index = [i for i in range(len(keys))]

k = np.array(key_index).reshape(-1, 1)
v = np.array(values).reshape(-1, 1)

lgr1 = LinearRegression().fit(k[:10], v[:10])
lgr2 = LinearRegression().fit(k[9:13], v[9:13])
lgr3 = LinearRegression().fit(k[12:17], v[12:17])
lgr4 = LinearRegression().fit(k[16:], v[16:])

k1 = []
for i in lgr1.predict(k[:10]):
k1.append(i[0])

k2 = []
for i in lgr2.predict(k[9:13]):
k2.append(i[0])

k3 = []
for i in lgr3.predict(k[12:17]):
k3.append(i[0])

k4 = []
for i in lgr4.predict(k[16:]):
k4.append(i[0])



# Plot Cumulative PIP Loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=keys[:10], y=k1,line=dict(color="#087FF5")))
fig.add_trace(go.Scatter(x=keys[9:13], y=k2, line=dict(color="#087FF5")))
fig.add_trace(go.Scatter(x=keys[12:17], y=k3, line=dict(color="#087FF5")))
fig.add_trace(go.Scatter(x=keys[16:], y=k4, line=dict(color="#087FF5")))
fig.update_layout(
title={
'text' : "Cumulative PIP Loss for Word2Vec 3-mer Embeddings",
'x':0.5,
},
yaxis_title = 'PIP-Loss',
template = 'plotly_white',
width=800,
height=500,
showlegend=False,
)
fig.show()
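
# Optional (not in the original script): persist the figure next to the other outputs.
# The output filename below is only an example.
fig.write_html("pip_loss_3mers.html")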