codes & figures
rintukutum committed Aug 31, 2021
1 parent 8debb44 commit 3020820
Showing 7,354 changed files with 622,787 additions and 0 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
6 changes: 6 additions & 0 deletions .gitignore
@@ -27,6 +27,12 @@ share/python-wheels/
*.egg
MANIFEST

# Custom Paths
data/
covseq/
models/
main/

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
43 changes: 43 additions & 0 deletions 01-A1-train-preprocessing.py
@@ -0,0 +1,43 @@
"""
Script to use Covseq (https://github.com/boxiangliu/covseq) to read genome sequences from FASTA files
and identify the different genomic regions (the annotation step generates a TSV).
"""
import os
import glob


covseq_script_path = "covseq/"
unprocessed_path = "data/fasta_preprocessing/unprocessed/"
processed_path = "data/fasta_preprocessing/processed/"
segmented_path = "data/fasta_preprocessing/segmented/"


def clean_fasta_files():
raw_fasta_files = glob.glob(os.path.join(unprocessed_path, "**/*.fasta"), recursive=True)
for file in raw_fasta_files:
filename = os.path.basename(file)
out_file = os.path.join(processed_path, filename)
print(out_file)
try:
cmd = f"sed -e 's/ /-/g' -e 's/|/_/g' '{file}' > '{out_file}'"
os.system(cmd)
except Exception as e:
            print(f'FAILED for {filename}. Reason: {e}')

def get_tsv_with_covseq():
processed_fasta_files = glob.glob(f'{processed_path}/*.fasta')
for file in processed_fasta_files:
filename = os.path.basename(file)
print(filename)
out_folder = os.path.join(segmented_path, filename)
os.system(f'mkdir -p {out_folder}')
cmd = f"cd {covseq_script_path} && python annotation/annotation.py --fasta '{os.path.abspath(file)}' --out_dir '{os.path.abspath(out_folder)}'"
os.system(cmd)

def main():
clean_fasta_files()
get_tsv_with_covseq()


if __name__ == '__main__':
main()
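
For clarity, the sed call in clean_fasta_files only rewrites two characters in the raw files; a minimal pure-Python sketch of the same cleaning step (illustrative, not part of the commit) is:

def clean_fasta_text(in_path, out_path):
    # Replace spaces with '-' and '|' with '_' so the FASTA description becomes a single
    # underscore-delimited token that the later scripts can split on.
    with open(in_path) as src, open(out_path, "w") as dst:
        for line in src:
            dst.write(line.replace(" ", "-").replace("|", "_"))

Assuming the raw headers use '|' separators, a GISAID-style description such as "hCoV-19/Mayotte/IPP02391/2021|EPI_ISL_1167000|2021-01-21" would come out as "hCoV-19/Mayotte/IPP02391/2021_EPI_ISL_1167000_2021-01-21", which is the format parsed by 01-A2-train-preprocessing.py.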
186 changes: 186 additions & 0 deletions 01-A2-train-preprocessing.py
@@ -0,0 +1,186 @@
"""
Script to read the GISAID preprocessed fasta files and generate country-wise CSVs.
Pipeline:
Read each fasta file.
Get metadata & genome sequence for each strain from fasta file.
Find relevant folder (generated by covseq script) that contains the TSV corresponding to the strain in the fasta file.
Get the start and end of each peptide from the TSV and append the sequence metadata to a per-fasta-file CSV.
Split that CSV by country and write a separate CSV per country, appending if the country CSV already exists.
"""
import os
import glob
import logging
import traceback
import pandas as pd
from Bio import SeqIO


PATH_FASTA_DIR = "data/fasta_preprocessing/processed/" # Contains the preprocessed fasta files
PATH_TSV_FOLDERS = "data/fasta_preprocessing/segmented/" # Contains the folders containing the TSVs
PATH_OUTPUT_DIR = "data/fasta_preprocessing/CSVs/countrywise/"

SEQ_METADATA_COLS = ['Peptide_Name', 'Sequence', 'Sequence_Length', 'DividesBy3', 'Triples_Count', 'Full_Sequence_Length'] # 3mers removed
OTHER_METADATA_COLS = ['Accession_ID', 'Virus_Name', 'Country', 'Collection_Date']
OUTPUT_COLS = ['Accession_ID', 'Virus_Name', 'Country', 'Peptide_Name', 'Sequence', 'Sequence_Length',
'DividesBy3', 'Triples_Count', 'Full_Sequence_Length', 'Collection_Date']

logging.basicConfig(filename='app.log', filemode='w', format='%(asctime)s - %(levelname)s - %(message)s')


def validate_sequence(sequence, full_sequence=False):
if full_sequence:
bases_check = (len(set(sequence) - set("ATGC")) == 0)
if bases_check:
return True
else:
start_codon_check = (sequence[:3] == 'ATG')
end_codon_check = (sequence[-3:] in ['TAA', 'TAG', 'TGA'])
if (start_codon_check and end_codon_check):
return True
return False
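
# Illustrative behaviour (not part of the original script):
#   validate_sequence("ATGAAATTTGGGTAA")               -> True   (starts with ATG, ends with a stop codon)
#   validate_sequence("ATGAAATTTGGGCCC")               -> False  (no stop codon at the end)
#   validate_sequence("ATGNNNTAA", full_sequence=True) -> False  (N is not one of A/T/G/C)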


def get_fasta_strains(filepath):
strains_for_file = []
fasta_sequences = SeqIO.parse(open(filepath),'fasta')
for fasta in fasta_sequences:
description, sequence = fasta.description, str(fasta.seq)
if not validate_sequence(sequence, full_sequence=True):
continue
splitted_descr = str(description).split('_') # hCoV-19/Mayotte/IPP02391/2021_EPI_ISL_1167000_2021-01-21
virus_name = splitted_descr[0]
country = virus_name.split('/')[1]
accession_id = '_'.join(splitted_descr[1:-1])
collection_date = splitted_descr[-1]
strains_for_file.append([accession_id, virus_name, country, sequence, collection_date])
return strains_for_file
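
# Illustrative parse of the header format shown above (assumes the GISAID-style description
# "hCoV-19/Mayotte/IPP02391/2021_EPI_ISL_1167000_2021-01-21"):
#   virus_name      -> "hCoV-19/Mayotte/IPP02391/2021"
#   country         -> "Mayotte"
#   accession_id    -> "EPI_ISL_1167000"
#   collection_date -> "2021-01-21"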


def handle_orf1ab(start, end, full_sequence):
# print(val['Start'], val['End'])
if ',' in start:
vals = start.split(',')
start_a = int(vals[0].strip())
start_b = int(vals[1].strip())
else:
start_a = int(start[:3])
start_b = int(start[3:])
if ',' in end:
vals = end.split(',')
end_a = int(vals[0].strip())
end_b = int(vals[1].strip())
else:
end_a = int(end[:5])
end_b = int(end[5:])
if end_a == start_b:
sequence = full_sequence[start_a-1: end_b]
else:
sequence = full_sequence[start_a-1: end_a] + full_sequence[start_b-1: end_b]
return sequence
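
# Note (added for clarity): orf1ab is annotated as two joined ranges because of the -1
# ribosomal frameshift between ORF1a and ORF1b, so the TSV's Start/End fields can hold two
# comma-separated values (e.g. "266,13468" and "13483,21555" in the reference genome).
# When the first range ends exactly where the second starts, a single slice suffices;
# otherwise the two (possibly overlapping) slices are concatenated.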


def get_amino_sequences(tsv, full_sequence, n_for_nmers=3):
seq_df = pd.DataFrame(columns = SEQ_METADATA_COLS)
if len(tsv) != 12:
raise Exception('TSV length not 12 for the Strain')
for _, val in tsv.iterrows():
peptide_name = val['Product']
if peptide_name == 'orf1ab polyprotein': # Special handling reqd.
sequence = handle_orf1ab(val['Start'], val['End'], full_sequence)
else:
sequence = full_sequence[int(val['Start'])-1: int(val['End'])]
if not validate_sequence(sequence):
raise Exception('Problem in start/end codon')
seq_length = len(sequence) # Ambiguity in TSV lengths
dividesbythree = (seq_length/3).is_integer()
triples_count = seq_length//3
row = pd.Series([peptide_name, sequence, seq_length, dividesbythree, triples_count, len(full_sequence)],
index=SEQ_METADATA_COLS)
seq_df = seq_df.append(row, ignore_index=True)
return seq_df


def find_covseq_tsv(country, accession_id):
tsv_dir_path = glob.glob(f'{PATH_TSV_FOLDERS}/*/*{accession_id}*')
if tsv_dir_path:
tsvs = os.listdir(tsv_dir_path[0])
tsv = [f for f in tsvs if 'orf' in f]
if tsv:
tsv_name = tsv[0]
return f'{tsv_dir_path[0]}/{tsv_name}'


def get_df_for_strain(strain_metadata):
accession_id, virus_name, country, collection_date = strain_metadata[0], strain_metadata[1], strain_metadata[2], strain_metadata[4]
sequence = strain_metadata[3]
tsv_path = find_covseq_tsv(country, accession_id)
print(tsv_path)
if tsv_path:
tsv = pd.read_csv(tsv_path, delimiter="\t")
try:
seq_df = get_amino_sequences(tsv, sequence)
except Exception as e:
raise Exception(f"{str(e)}. ID:{accession_id}; Country:{country}")
other_df = pd.DataFrame([[accession_id, virus_name, country, collection_date]], columns=OTHER_METADATA_COLS)
other_df = pd.concat([other_df]*len(seq_df), ignore_index=True) # Replicate rows for each protein
strain_df = pd.concat([other_df, seq_df], axis=1)
return strain_df
else:
raise Exception(f"TSV does not exist for strain. ID:{accession_id}; Country:{country}")


def export_countrywise_csvs(df_for_fasta):
countries = list(set(df_for_fasta['Country'].values))
for country in countries:
country_df = df_for_fasta[df_for_fasta['Country']==country]
output_path = f'{PATH_OUTPUT_DIR}/{country}.csv'
if os.path.exists(output_path):
df = pd.read_csv(output_path)
out_df = df.append(country_df, ignore_index=True)
else:
out_df = country_df
out_df = out_df.drop_duplicates(subset=['Accession_ID', 'Peptide_Name'])
out_df.to_csv(output_path, index=False)


def clean_output_dir():
result_files = glob.glob(f'{PATH_OUTPUT_DIR}/*')
for file in result_files:
try:
os.remove(file)
except:
pass
print("Cleaned output directory")


def main(cleanup=False):
if cleanup:
clean_output_dir()

fasta_files = os.listdir(PATH_FASTA_DIR)

for file in fasta_files:
print(f"PROCESSING: {file}")
filepath = f'{PATH_FASTA_DIR}/{file}'
try:
strains_for_file = get_fasta_strains(filepath) # [[accession_id, virus_name, country, sequence, collection_date]]
except Exception as e:
print(f"ERROR getting strains from fasta file. Reason: {e}")
logging.error(f"ERROR getting strains from fasta file. Reason: {e}")
# traceback.print_exc(file=sys.stdout)
continue

df_for_fasta = pd.DataFrame(columns = OUTPUT_COLS)
for strain_metadata in strains_for_file:
try:
strain_df = get_df_for_strain(strain_metadata)
df_for_fasta = df_for_fasta.append(strain_df, ignore_index=True)
except Exception as e:
print(f"ERROR processing strain. Reason: {e}")
logging.error(f"ERROR processing strain. Reason: {e}")
# traceback.print_exc(file=sys.stdout)
if not df_for_fasta.empty:
export_countrywise_csvs(df_for_fasta)

if __name__ == "__main__":
main(cleanup=False)
57 changes: 57 additions & 0 deletions 02-A-train-w2v.py
@@ -0,0 +1,57 @@
"""
Script to train a word2vec model and export embeddings for spike gene sequences.
"""
import glob
import pandas as pd
from gensim.models import Word2Vec

sequences_dir = "data/fasta_preprocessing/CSVs/countrywise"
model_outpath = "models/word2vec-spike-all-countries-36-VS.model"
output_filename = "data/w2v_output/word2vec-spike-all-countries-36-VS.csv"


def get_kmers(sequence, n_for_nmers=3):
n_mers = []
seq_length = len(sequence)
for i in range(0, seq_length, n_for_nmers):
n_mers.append(sequence[i: i+n_for_nmers])
return n_mers
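
# Illustrative example (not part of the original script):
#   get_kmers("ATGGCTTCA") -> ["ATG", "GCT", "TCA"]
#   A trailing fragment shorter than n is kept, e.g. get_kmers("ATGGC") -> ["ATG", "GC"].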


def word2vec_model(tokens, vector_size, window_size):
    model = Word2Vec(tokens, size = vector_size, window = window_size, min_count = 1, sg = 1)  # gensim < 4.0 API ('size' was renamed to 'vector_size' in gensim 4.0)
return model


def create_vec(df, size, col, model):
vector = []
for _, row in df.iterrows():
vec = [0 for i in range(size)]
seq = row[col]
for trip in seq:
wv = model.wv[trip]
for i in range(size):
vec[i] += wv[i]
vec = [i/len(seq) for i in vec]
vector.append(vec)
return vector
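
# create_vec mean-pools the word2vec vectors of a sequence's k-mers. An equivalent numpy
# sketch (illustrative only):
#   import numpy as np
#   vec = np.mean([model.wv[kmer] for kmer in row[col]], axis=0)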


comb_data = pd.DataFrame()
for file in glob.glob(f'{sequences_dir}/*.csv'):
    comb_data = pd.concat([comb_data, pd.read_csv(file)], axis=0, ignore_index=True)

comb_data['3-mers'] = comb_data['Sequence'].apply(get_kmers)

tokens3 = []
max3 = 0
for seq in comb_data['3-mers']:
tokens3.append(seq)
max3 = max(max3, len(seq))

model = word2vec_model(tokens3, 36, 20)
vector = create_vec(comb_data, 36, '3-mers', model)
data1 = pd.concat([comb_data['Accession_ID'], comb_data['Collection_Date'], comb_data['Country'], pd.DataFrame(vector)], axis=1)
data1.to_csv(output_filename, index = False)
model.save(model_outpath)
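
A quick sanity check of the exported artifacts (a minimal sketch, assuming the gensim < 4.0 API used above and the paths defined in this script):

from gensim.models import Word2Vec
import pandas as pd

m = Word2Vec.load("models/word2vec-spike-all-countries-36-VS.model")
print(m.wv["ATG"].shape)   # one 36-dimensional vector per 3-mer

emb = pd.read_csv("data/w2v_output/word2vec-spike-all-countries-36-VS.csv")
print(emb.shape)           # Accession_ID, Collection_Date, Country + 36 embedding columns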
88 changes: 88 additions & 0 deletions 02-B-pip-loss.py
@@ -0,0 +1,88 @@
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from gensim.models import Word2Vec
from sklearn.linear_model import LinearRegression

models_path = "models/test/"


def create_df(model):
df = pd.DataFrame()
for vocab in model.wv.vocab.keys():
df = pd.concat([df, pd.DataFrame(model.wv[vocab]).T])
df.index = model.wv.vocab.keys()
return df


def calculate_loss(df1, df2):
loss = np.sqrt(((df1.dot(df1.T) - df2.dot(df2.T))**2).sum().sum())
return loss
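
# Note (added for clarity): calculate_loss is the PIP (Pairwise Inner Product) loss between
# two embedding matrices X and Y, i.e. the Frobenius norm || X X^T - Y Y^T ||_F; the
# dot/sum/sqrt chain above evaluates exactly that.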


dim = [3,6,9,10,12,15,18,20,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,64]
keys = []
values = []
errors = {}
for ind in range(len(dim)-1):
model1 = Word2Vec.load(models_path + "word2vec-spike-aus-uk-3mers-"+str(dim[ind])+"-VS.model")
model2 = Word2Vec.load(models_path + "word2vec-spike-aus-uk-3mers-"+str(dim[ind+1])+"-VS.model")

df1 = create_df(model1)
df2 = create_df(model2)
    df2 = df2.reindex(df1.index)  # reindex returns a new frame; align df2's vocabulary order with df1

pip_loss = calculate_loss(df1, df2)
errors[str(dim[ind])+"_"+str(dim[ind+1])] = pip_loss
keys.append(str(dim[ind+1]) +"_"+ str(dim[ind]))
values.append(pip_loss)

values = values[::-1]
values = np.cumsum(values)
keys = keys[::-1]
key_index = [i for i in range(len(keys))]

k = np.array(key_index).reshape(-1, 1)
v = np.array(values).reshape(-1, 1)

lgr1 = LinearRegression().fit(k[:10], v[:10])
lgr2 = LinearRegression().fit(k[9:13], v[9:13])
lgr3 = LinearRegression().fit(k[12:17], v[12:17])
lgr4 = LinearRegression().fit(k[16:], v[16:])

k1 = []
for i in lgr1.predict(k[:10]):
k1.append(i[0])

k2 = []
for i in lgr2.predict(k[9:13]):
k2.append(i[0])

k3 = []
for i in lgr3.predict(k[12:17]):
k3.append(i[0])

k4 = []
for i in lgr4.predict(k[16:]):
k4.append(i[0])



# Plot Cumulative PIP Loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=keys[:10], y=k1,line=dict(color="#087FF5")))
fig.add_trace(go.Scatter(x=keys[9:13], y=k2, line=dict(color="#087FF5")))
fig.add_trace(go.Scatter(x=keys[12:17], y=k3, line=dict(color="#087FF5")))
fig.add_trace(go.Scatter(x=keys[16:], y=k4, line=dict(color="#087FF5")))
fig.update_layout(
title={
'text' : "Cumulative PIP Loss for Word2Vec 3-mer Embeddings",
'x':0.5,
},
yaxis_title = 'PIP-Loss',
template = 'plotly_white',
width=800,
height=500,
showlegend=False,
)
fig.show()
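
# Optional (not in the original script): persist the figure next to the other outputs.
# The output filename below is only an example.
fig.write_html("pip_loss_3mers.html")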