Commit 3020820 (1 parent: 8debb44) — 7,354 changed files with 622,787 additions and 0 deletions.
@@ -0,0 +1,43 @@
""" | ||
Script to use Covseq (https://github.com/boxiangliu/covseq) to read genome sequences from FASTA files | ||
and identify different regions (generates a TSV). | ||
""" | ||
import os | ||
import glob | ||
|
||
|
||
covseq_scipt_path = "covseq/" | ||
unprocessed_path = "data/fasta_preprocessing/unprocessed/" | ||
processed_path = "data/fasta_preprocessing/processed/" | ||
segmented_path = "data/fasta_preprocessing/segmented/" | ||
|
||
|
||
def clean_fasta_files():
    raw_fasta_files = glob.glob(os.path.join(unprocessed_path, "**/*.fasta"), recursive=True)
    for file in raw_fasta_files:
        filename = os.path.basename(file)
        out_file = os.path.join(processed_path, filename)
        print(out_file)
        # Replace spaces with '-' and '|' with '_' so the FASTA headers become single
        # underscore-delimited tokens for the downstream parsing.
        cmd = f"sed -e 's/ /-/g' -e 's/|/_/g' '{file}' > '{out_file}'"
        status = os.system(cmd)
        if status != 0:  # os.system does not raise on a failed command, so check the exit status
            print(f'FAILED for file {file}. Exit status: {status}')

def get_tsv_with_covseq():
    processed_fasta_files = glob.glob(f'{processed_path}/*.fasta')
    for file in processed_fasta_files:
        filename = os.path.basename(file)
        print(filename)
        out_folder = os.path.join(segmented_path, filename)
        os.system(f'mkdir -p {out_folder}')
        cmd = f"cd {covseq_script_path} && python annotation/annotation.py --fasta '{os.path.abspath(file)}' --out_dir '{os.path.abspath(out_folder)}'"
        os.system(cmd)

def main():
    clean_fasta_files()
    get_tsv_with_covseq()


if __name__ == '__main__':
    main()
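For context, a minimal sketch of the header rewrite performed by the sed command in clean_fasta_files, assuming a raw GISAID-style header with '|' separators; the cleaned form is what the country-wise CSV script later splits on '_'. The snippet is an illustration only, not part of the committed file:

header = "hCoV-19/Mayotte/IPP02391/2021|EPI_ISL_1167000|2021-01-21"
cleaned = header.replace(" ", "-").replace("|", "_")
print(cleaned)  # hCoV-19/Mayotte/IPP02391/2021_EPI_ISL_1167000_2021-01-21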
@@ -0,0 +1,186 @@
""" | ||
Script to read the GISAID preprocessed fasta files and generate country-wise CSVs. | ||
Pipeline: | ||
Read each fasta file. | ||
Get metadata & genome sequence for each strain from fasta file. | ||
Find relevant folder (generated by covseq script) that contains the TSV corresponding to the strain in the fasta file. | ||
Get start and end of each peptide from the TSV and append sequence metadata in CSV for fasta file. | ||
Filter fasta file CSV by countries and generate separate CSV for each country. Append if country CSV already exists. | ||
""" | ||
import os | ||
import glob | ||
import logging | ||
import traceback | ||
import pandas as pd | ||
from Bio import SeqIO | ||
|
||
|
||
PATH_FASTA_DIR = "data/fasta_preprocessing/processed/" # Contains the preprocessed fasta files | ||
PATH_TSV_FOLDERS = "data/fasta_preprocessing/segmented/" # Contains the folders containing the TSVs | ||
PATH_OUTPUT_DIR = "data/fasta_preprocessing/CSVs/countrywise/" | ||
|
||
SEQ_METADATA_COLS = ['Peptide_Name', 'Sequence', 'Sequence_Length', 'DividesBy3', 'Triples_Count', 'Full_Sequence_Length'] # 3mers removed | ||
OTHER_METADATA_COLS = ['Accession_ID', 'Virus_Name', 'Country', 'Collection_Date'] | ||
OUTPUT_COLS = ['Accession_ID', 'Virus_Name', 'Country', 'Peptide_Name', 'Sequence', 'Sequence_Length', | ||
'DividesBy3', 'Triples_Count', 'Full_Sequence_Length', 'Collection_Date'] | ||
|
||
logging.basicConfig(filename='app.log', filemode='w', format='%(asctime)s - %(levelname)s - %(message)s') | ||
|
||
|
||
def validate_sequence(sequence, full_sequence=False):
    if full_sequence:
        # Full genomes: accept only unambiguous A/T/G/C bases.
        bases_check = (len(set(sequence) - set("ATGC")) == 0)
        if bases_check:
            return True
    else:
        # Coding regions: must begin with the start codon and end with a stop codon.
        start_codon_check = (sequence[:3] == 'ATG')
        end_codon_check = (sequence[-3:] in ['TAA', 'TAG', 'TGA'])
        if (start_codon_check and end_codon_check):
            return True
    return False

def get_fasta_strains(filepath):
    strains_for_file = []
    fasta_sequences = SeqIO.parse(open(filepath), 'fasta')
    for fasta in fasta_sequences:
        description, sequence = fasta.description, str(fasta.seq)
        if not validate_sequence(sequence, full_sequence=True):
            continue
        splitted_descr = str(description).split('_')  # e.g. hCoV-19/Mayotte/IPP02391/2021_EPI_ISL_1167000_2021-01-21
        virus_name = splitted_descr[0]
        country = virus_name.split('/')[1]
        accession_id = '_'.join(splitted_descr[1:-1])
        collection_date = splitted_descr[-1]
        strains_for_file.append([accession_id, virus_name, country, sequence, collection_date])
    return strains_for_file

def handle_orf1ab(start, end, full_sequence):
    # orf1ab is annotated as two segments (ribosomal frameshift), so the TSV may report
    # Start/End either as comma-separated pairs or as two concatenated numbers.
    # print(val['Start'], val['End'])
    if ',' in start:
        vals = start.split(',')
        start_a = int(vals[0].strip())
        start_b = int(vals[1].strip())
    else:
        # Heuristic split when no comma is present: assume a 3-digit first start coordinate.
        start_a = int(start[:3])
        start_b = int(start[3:])
    if ',' in end:
        vals = end.split(',')
        end_a = int(vals[0].strip())
        end_b = int(vals[1].strip())
    else:
        # Heuristic split when no comma is present: assume a 5-digit first end coordinate.
        end_a = int(end[:5])
        end_b = int(end[5:])
    if end_a == start_b:
        # The two segments meet at the frameshift position, so a single slice covers both.
        sequence = full_sequence[start_a-1: end_b]
    else:
        sequence = full_sequence[start_a-1: end_a] + full_sequence[start_b-1: end_b]
    return sequence

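# Worked example (illustrative; for the Wuhan-Hu-1 reference, orf1ab is typically
# annotated as join(266..13468,13468..21555)):
#   handle_orf1ab("266,13468", "13468,21555", full_sequence)
#   -> start_a=266, start_b=13468, end_a=13468, end_b=21555
#   -> end_a == start_b, so the result is the single slice full_sequence[265:21555];
#      otherwise the two segment slices are concatenated.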
def get_amino_sequences(tsv, full_sequence, n_for_nmers=3):
    seq_df = pd.DataFrame(columns=SEQ_METADATA_COLS)
    if len(tsv) != 12:
        raise Exception('TSV length not 12 for the Strain')
    for _, val in tsv.iterrows():
        peptide_name = val['Product']
        if peptide_name == 'orf1ab polyprotein':  # Special handling reqd. (two segments)
            sequence = handle_orf1ab(val['Start'], val['End'], full_sequence)
        else:
            sequence = full_sequence[int(val['Start'])-1: int(val['End'])]
        if not validate_sequence(sequence):
            raise Exception('Problem in start/end codon')
        seq_length = len(sequence)  # Ambiguity in TSV lengths
        dividesbythree = (seq_length / 3).is_integer()
        triples_count = seq_length // 3
        row = pd.Series([peptide_name, sequence, seq_length, dividesbythree, triples_count, len(full_sequence)],
                        index=SEQ_METADATA_COLS)
        seq_df = seq_df.append(row, ignore_index=True)  # DataFrame.append is the pandas < 2.0 API
    return seq_df

def find_covseq_tsv(country, accession_id):
    tsv_dir_path = glob.glob(f'{PATH_TSV_FOLDERS}/*/*{accession_id}*')
    if tsv_dir_path:
        tsvs = os.listdir(tsv_dir_path[0])
        tsv = [f for f in tsvs if 'orf' in f]
        if tsv:
            tsv_name = tsv[0]
            return f'{tsv_dir_path[0]}/{tsv_name}'

def get_df_for_strain(strain_metadata):
    accession_id, virus_name, country, collection_date = strain_metadata[0], strain_metadata[1], strain_metadata[2], strain_metadata[4]
    sequence = strain_metadata[3]
    tsv_path = find_covseq_tsv(country, accession_id)
    print(tsv_path)
    if tsv_path:
        tsv = pd.read_csv(tsv_path, delimiter="\t")
        try:
            seq_df = get_amino_sequences(tsv, sequence)
        except Exception as e:
            raise Exception(f"{str(e)}. ID:{accession_id}; Country:{country}")
        other_df = pd.DataFrame([[accession_id, virus_name, country, collection_date]], columns=OTHER_METADATA_COLS)
        other_df = pd.concat([other_df]*len(seq_df), ignore_index=True)  # Replicate the metadata row for each protein
        strain_df = pd.concat([other_df, seq_df], axis=1)
        return strain_df
    else:
        raise Exception(f"TSV does not exist for strain. ID:{accession_id}; Country:{country}")

def export_countrywise_csvs(df_for_fasta):
    countries = list(set(df_for_fasta['Country'].values))
    for country in countries:
        country_df = df_for_fasta[df_for_fasta['Country'] == country]
        output_path = f'{PATH_OUTPUT_DIR}/{country}.csv'
        if os.path.exists(output_path):
            df = pd.read_csv(output_path)
            out_df = df.append(country_df, ignore_index=True)
        else:
            out_df = country_df
        out_df = out_df.drop_duplicates(subset=['Accession_ID', 'Peptide_Name'])
        out_df.to_csv(output_path, index=False)

def clean_output_dir():
    result_files = glob.glob(f'{PATH_OUTPUT_DIR}/*')
    for file in result_files:
        try:
            os.remove(file)
        except OSError:
            pass
    print("Cleaned output directory")

def main(cleanup=False):
    if cleanup:
        clean_output_dir()

    fasta_files = os.listdir(PATH_FASTA_DIR)

    for file in fasta_files:
        print(f"PROCESSING: {file}")
        filepath = f'{PATH_FASTA_DIR}/{file}'
        try:
            strains_for_file = get_fasta_strains(filepath)  # [[accession_id, virus_name, country, sequence, collection_date]]
        except Exception as e:
            print(f"ERROR getting strains from fasta file. Reason: {e}")
            logging.error(f"ERROR getting strains from fasta file. Reason: {e}")
            # traceback.print_exc(file=sys.stdout)
            continue

        df_for_fasta = pd.DataFrame(columns=OUTPUT_COLS)
        for strain_metadata in strains_for_file:
            try:
                strain_df = get_df_for_strain(strain_metadata)
                df_for_fasta = df_for_fasta.append(strain_df, ignore_index=True)
            except Exception as e:
                print(f"ERROR processing strain. Reason: {e}")
                logging.error(f"ERROR processing strain. Reason: {e}")
                # traceback.print_exc(file=sys.stdout)
        if not df_for_fasta.empty:
            export_countrywise_csvs(df_for_fasta)


if __name__ == "__main__":
    main(cleanup=False)
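A quick sanity check of the exported CSVs, as a sketch: the column layout comes from OUTPUT_COLS above, while the file name Australia.csv is only a hypothetical example of one country output produced by export_countrywise_csvs.

import pandas as pd

df = pd.read_csv("data/fasta_preprocessing/CSVs/countrywise/Australia.csv")
print(df.columns.tolist())  # should match OUTPUT_COLS
# Each complete genome contributes one row per annotated peptide (the TSV check expects 12 products):
print(df.groupby('Accession_ID')['Peptide_Name'].count().head())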
@@ -0,0 +1,57 @@
""" | ||
Script to train w2v model and export embeddings for spike gene sequences | ||
""" | ||
import glob | ||
import pandas as pd | ||
from gensim.models import Word2Vec | ||
|
||
sequences_dir = "data/fasta_preprocessing/CSVs/countrywise" | ||
model_outpath = "models/word2vec-spike-all-countries-36-VS.model" | ||
output_filename = "data/w2v_output/word2vec-spike-all-countries-36-VS.csv" | ||
|
||
|
||
def get_kmers(sequence, n_for_nmers=3):
    # Non-overlapping k-mers (stride == k), i.e. codon-style 3-mers by default.
    n_mers = []
    seq_length = len(sequence)
    for i in range(0, seq_length, n_for_nmers):
        n_mers.append(sequence[i: i+n_for_nmers])
    return n_mers

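# Example (illustrative): get_kmers("ATGGTTGCA") -> ['ATG', 'GTT', 'GCA'];
# a trailing partial k-mer is kept, e.g. get_kmers("ATGGT") -> ['ATG', 'GT'].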
def word2vec_model(tokens, vector_size, window_size):
    # sg=1 selects the skip-gram architecture; 'size' is the gensim 3.x keyword
    # (renamed to 'vector_size' in gensim >= 4.0).
    model = Word2Vec(tokens, size=vector_size, window=window_size, min_count=1, sg=1)
    return model

def create_vec(df, size, col, model):
    # Mean-pool the Word2Vec vectors of all k-mers in a sequence to get one fixed-length
    # embedding per row.
    vector = []
    for _, row in df.iterrows():
        vec = [0 for i in range(size)]
        seq = row[col]
        for trip in seq:
            wv = model.wv[trip]
            for i in range(size):
                vec[i] += wv[i]
        vec = [i/len(seq) for i in vec]
        vector.append(vec)
    return vector

comb_data = pd.DataFrame()
for file in glob.glob(f'{sequences_dir}/*.csv'):
    # Read each country-wise CSV and append it to the combined DataFrame.
    comb_data = pd.concat([comb_data, pd.read_csv(file)], axis=0, ignore_index=True)

# Note: the country-wise CSVs contain every annotated peptide; if only spike rows are
# intended, filter comb_data on Peptide_Name here.
comb_data['3-mers'] = comb_data['Sequence'].apply(get_kmers)

tokens3 = []
max3 = 0
for seq in comb_data['3-mers']:
    tokens3.append(seq)
    max3 = max(max3, len(seq))

model = word2vec_model(tokens3, 36, 20)
vector = create_vec(comb_data, 36, '3-mers', model)
data1 = pd.concat([comb_data['Accession_ID'], comb_data['Collection_Date'], comb_data['Country'], pd.DataFrame(vector)], axis=1)
data1.to_csv(output_filename, index=False)
model.save(model_outpath)
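A quick sketch of reloading the exported artifacts, using the paths defined above and assuming the gensim 3.x API:

from gensim.models import Word2Vec
import pandas as pd

model = Word2Vec.load("models/word2vec-spike-all-countries-36-VS.model")
print(model.wv["ATG"].shape)  # one 36-dimensional vector per 3-mer in the vocabulary

emb = pd.read_csv("data/w2v_output/word2vec-spike-all-countries-36-VS.csv")
print(emb.shape)  # one row per sequence: Accession_ID, Collection_Date, Country + 36 embedding columns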
@@ -0,0 +1,88 @@
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from gensim.models import Word2Vec
from sklearn.linear_model import LinearRegression


models_path = "models/test/"

def create_df(model):
    # Stack the embedding vectors into a (vocab_size x dim) DataFrame, one row per k-mer
    # (model.wv.vocab is the gensim 3.x vocabulary API).
    df = pd.DataFrame()
    for vocab in model.wv.vocab.keys():
        df = pd.concat([df, pd.DataFrame(model.wv[vocab]).T])
    df.index = model.wv.vocab.keys()
    return df


def calculate_loss(df1, df2):
    # PIP (Pairwise Inner Product) loss: Frobenius norm of the difference between the
    # two embeddings' Gram matrices, ||E1.E1^T - E2.E2^T||_F.
    loss = np.sqrt(((df1.dot(df1.T) - df2.dot(df2.T))**2).sum().sum())
    return loss

# Candidate embedding dimensions; PIP loss is computed between each consecutive pair of models.
dim = [3, 6, 9, 10, 12, 15, 18, 20, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 64]
keys = []
values = []
errors = {}
for ind in range(len(dim)-1):
    model1 = Word2Vec.load(models_path + "word2vec-spike-aus-uk-3mers-" + str(dim[ind]) + "-VS.model")
    model2 = Word2Vec.load(models_path + "word2vec-spike-aus-uk-3mers-" + str(dim[ind+1]) + "-VS.model")

    df1 = create_df(model1)
    df2 = create_df(model2)
    df2 = df2.reindex(df1.index)  # reindex returns a copy, so the result must be assigned back

    pip_loss = calculate_loss(df1, df2)
    errors[str(dim[ind]) + "_" + str(dim[ind+1])] = pip_loss
    keys.append(str(dim[ind+1]) + "_" + str(dim[ind]))
    values.append(pip_loss)

# Reverse and accumulate the losses (largest dimension pair first) and index them for plotting.
values = values[::-1]
values = np.cumsum(values)
keys = keys[::-1]
key_index = [i for i in range(len(keys))]

k = np.array(key_index).reshape(-1, 1)
v = np.array(values).reshape(-1, 1)

# Piecewise linear fits over four manually chosen segments of the cumulative-loss curve.
lgr1 = LinearRegression().fit(k[:10], v[:10])
lgr2 = LinearRegression().fit(k[9:13], v[9:13])
lgr3 = LinearRegression().fit(k[12:17], v[12:17])
lgr4 = LinearRegression().fit(k[16:], v[16:])

# Flatten the per-segment predictions into plain lists for plotting.
k1 = [pred[0] for pred in lgr1.predict(k[:10])]
k2 = [pred[0] for pred in lgr2.predict(k[9:13])]
k3 = [pred[0] for pred in lgr3.predict(k[12:17])]
k4 = [pred[0] for pred in lgr4.predict(k[16:])]

# Plot Cumulative PIP Loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=keys[:10], y=k1, line=dict(color="#087FF5")))
fig.add_trace(go.Scatter(x=keys[9:13], y=k2, line=dict(color="#087FF5")))
fig.add_trace(go.Scatter(x=keys[12:17], y=k3, line=dict(color="#087FF5")))
fig.add_trace(go.Scatter(x=keys[16:], y=k4, line=dict(color="#087FF5")))
fig.update_layout(
    title={
        'text': "Cumulative PIP Loss for Word2Vec 3-mer Embeddings",
        'x': 0.5,
    },
    yaxis_title='PIP-Loss',
    template='plotly_white',
    width=800,
    height=500,
    showlegend=False,
)
fig.show()
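As a sketch of reading the elbow numerically rather than only from the piecewise-linear plot, the per-pair (non-cumulative) PIP losses collected in the errors dict above can simply be printed:

for pair, loss in errors.items():
    print(f"{pair}: {loss:.4f}")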