Skip to content

Commit 27e2fd5

Browse files
committed
add notebook on ANOVA test
1 parent 91fba86 commit 27e2fd5

File tree

1 file changed

+178
-0
lines changed

1 file changed

+178
-0
lines changed

ANOVA-genotype-phenotype.ipynb

+178
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# ANOVA test with genotype -> phenotype data"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": null,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import pandas as pd\n",
17+
"import numpy as np\n",
18+
"import os"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {},
25+
"outputs": [],
26+
"source": [
27+
"import scipy.stats as stats\n",
28+
"import matplotlib.pyplot as plt\n",
29+
"import seaborn as sns"
30+
]
31+
},
32+
{
33+
"cell_type": "markdown",
34+
"metadata": {},
35+
"source": [
36+
"# Importing the data"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": null,
42+
"metadata": {},
43+
"outputs": [],
44+
"source": [
45+
"# Config for accessing the data on the s3 storage\n",
46+
"storage_options = {'anon':True, 'client_kwargs':{'endpoint_url':'https://os.unil.cloud.switch.ch'}}\n",
47+
"s3_path = 's3://lts2-graphnex/BXDmice/'"
48+
]
49+
},
50+
{
51+
"cell_type": "code",
52+
"execution_count": null,
53+
"metadata": {},
54+
"outputs": [],
55+
"source": [
56+
"# Load the data\n",
57+
"genotype_path = os.path.join(s3_path, 'geno_reduced.csv.gz')\n",
58+
"#genotype_path = os.path.join(s3_path, 'genotype_BXD.csv.gz')\n",
59+
"genotype = pd.read_csv(genotype_path, storage_options=storage_options)\n",
60+
"print('File {} Opened.'.format(genotype_path))\n",
61+
"phenotype_path = os.path.join(s3_path, 'Phenotype.txt.gz')\n",
62+
"phenotype = pd.read_csv(phenotype_path, sep='\\t', storage_options=storage_options)\n",
63+
"print('File {} Opened.'.format(phenotype_path))\n",
64+
"# Phenotype description\n",
65+
"phenotypeinfo_path = os.path.join(s3_path, 'phenotypes_id_aligner.txt.gz')\n",
66+
"phenotypeinfo = pd.read_csv(phenotypeinfo_path, sep='\\t', storage_options=storage_options)\n",
67+
"print('File {} Opened.'.format(phenotypeinfo_path))"
68+
]
69+
},
70+
{
71+
"cell_type": "markdown",
72+
"metadata": {},
73+
"source": [
74+
"## Example on one phenotype\n",
75+
"We choose the phenotype with id 'X122'. This phenotype is highly dependent on a small set of SNPs. This dependence is clearly visible with an ANOVA test."
76+
]
77+
},
78+
{
79+
"cell_type": "code",
80+
"execution_count": null,
81+
"metadata": {},
82+
"outputs": [],
83+
"source": [
84+
"pheno_id = 'X122'\n",
85+
"\n",
86+
"print('Phenotype description:')\n",
87+
"description = phenotypeinfo[phenotypeinfo['PhenoID']==pheno_id]['Phenotype'].values\n",
88+
"print(description)\n",
89+
"print('----------')\n",
90+
"pheno_BXD = phenotype[phenotype['PhenoID']==pheno_id].dropna(axis=1).drop('PhenoID', axis=1)\n",
91+
"mouse_list = list(pheno_BXD.columns)\n",
92+
"print('Phenotype values:')\n",
93+
"pheno_BXD"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": null,
99+
"metadata": {},
100+
"outputs": [],
101+
"source": [
102+
"# For each SNP, we separate the mice in two groups:\n",
103+
"# the one with -1 and the one with +1\n",
104+
"# and we compute the p-value\n",
105+
"geno_BXD = genotype[mouse_list]\n",
106+
"fvalues = []\n",
107+
"pvalues = []\n",
108+
"for SNP,row in geno_BXD.iterrows():\n",
109+
" population1 = row[row==-1]\n",
110+
" population2 = row[row==1]\n",
111+
" x = pheno_BXD[population1.keys()].values\n",
112+
" y = pheno_BXD[population2.keys()].values\n",
113+
" fvalue, pvalue = stats.f_oneway(x.T, y.T)\n",
114+
" fvalues += [fvalue[0]]\n",
115+
" pvalues += [pvalue[0]]"
116+
]
117+
},
118+
{
119+
"cell_type": "code",
120+
"execution_count": null,
121+
"metadata": {},
122+
"outputs": [],
123+
"source": [
124+
"# We create a dataframe with the results\n",
125+
"df = pd.DataFrame()\n",
126+
"df['fvalues'] = fvalues\n",
127+
"df['pvalues'] = pvalues\n",
128+
"df['Chr'] = genotype['Chr'].values\n",
129+
"df['Pos'] = genotype['Pos'].values\n",
130+
"# Turn the index as a column with a name\n",
131+
"df.reset_index(inplace=True)\n",
132+
"df.rename(columns={'index' : 'SNP index'}, inplace=True)\n",
133+
"df.head()"
134+
]
135+
},
136+
{
137+
"cell_type": "code",
138+
"execution_count": null,
139+
"metadata": {},
140+
"outputs": [],
141+
"source": [
142+
"# Plot the results of the ANOVA test\n",
143+
"f, ax = plt.subplots(figsize=(10, 10))\n",
144+
"ax.set(yscale=\"log\")\n",
145+
"sns.scatterplot(x=\"SNP index\", y=\"pvalues\", data=df.reset_index(), hue=\"Chr\").invert_yaxis()\n",
146+
"ax.axhline(0.05, ls='--', c='red')"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": null,
152+
"metadata": {},
153+
"outputs": [],
154+
"source": []
155+
}
156+
],
157+
"metadata": {
158+
"kernelspec": {
159+
"display_name": "venv",
160+
"language": "python",
161+
"name": "venv"
162+
},
163+
"language_info": {
164+
"codemirror_mode": {
165+
"name": "ipython",
166+
"version": 3
167+
},
168+
"file_extension": ".py",
169+
"mimetype": "text/x-python",
170+
"name": "python",
171+
"nbconvert_exporter": "python",
172+
"pygments_lexer": "ipython3",
173+
"version": "3.8.0"
174+
}
175+
},
176+
"nbformat": 4,
177+
"nbformat_minor": 4
178+
}

0 commit comments

Comments
 (0)