add notebook on ANOVA test

bricaud · bricaud · commit 27e2fd505e97 · 2021-07-13T22:06:34.000+02:00
diff --git a/ANOVA-genotype-phenotype.ipynb b/ANOVA-genotype-phenotype.ipynb
@@ -0,0 +1,178 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ANOVA test with genotype -> phenotype data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import scipy.stats as stats\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Importing the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Settings used to read the data from the S3 object store.\n",
+    "# `storage_options` is handed to `pd.read_csv` when the files are loaded;\n",
+    "# 'anon': True asks for anonymous access (no credentials).\n",
+    "storage_options = {\n",
+    "    'anon': True,\n",
+    "    'client_kwargs': {'endpoint_url': 'https://os.unil.cloud.switch.ch'},\n",
+    "}\n",
+    "# Common prefix (bucket and folder) of all the data files\n",
+    "s3_path = 's3://lts2-graphnex/BXDmice/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the data.\n",
+    "# The locations are S3 URLs, so they are joined with an explicit '/' instead of\n",
+    "# os.path.join, which joins with the separator of the local operating system.\n",
+    "s3_root = s3_path.rstrip('/')\n",
+    "\n",
+    "# Genotype table\n",
+    "genotype_path = f'{s3_root}/geno_reduced.csv.gz'\n",
+    "# Alternative genotype file (uncomment to use it instead of the reduced one):\n",
+    "#genotype_path = f'{s3_root}/genotype_BXD.csv.gz'\n",
+    "genotype = pd.read_csv(genotype_path, storage_options=storage_options)\n",
+    "print('File {} Opened.'.format(genotype_path))\n",
+    "\n",
+    "# Phenotype table (tab-separated)\n",
+    "phenotype_path = f'{s3_root}/Phenotype.txt.gz'\n",
+    "phenotype = pd.read_csv(phenotype_path, sep='\\t', storage_options=storage_options)\n",
+    "print('File {} Opened.'.format(phenotype_path))\n",
+    "\n",
+    "# Phenotype description (tab-separated)\n",
+    "phenotypeinfo_path = f'{s3_root}/phenotypes_id_aligner.txt.gz'\n",
+    "phenotypeinfo = pd.read_csv(phenotypeinfo_path, sep='\\t', storage_options=storage_options)\n",
+    "print('File {} Opened.'.format(phenotypeinfo_path))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example on one phenotype\n",
+    "We choose the phenotype with ID 'X122'. This phenotype depends strongly on a small set of SNPs, and this dependence is clearly visible with an ANOVA test."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Identifier of the phenotype to analyse\n",
+    "pheno_id = 'X122'\n",
+    "\n",
+    "# Look up the description of the selected phenotype\n",
+    "print('Phenotype description:')\n",
+    "is_selected_info = phenotypeinfo['PhenoID'] == pheno_id\n",
+    "description = phenotypeinfo.loc[is_selected_info, 'Phenotype'].values\n",
+    "print(description)\n",
+    "print('----------')\n",
+    "\n",
+    "# Keep the row(s) of this phenotype, discard the columns that contain a missing\n",
+    "# value and remove the id column; the remaining columns form the list of mice\n",
+    "is_selected = phenotype['PhenoID'] == pheno_id\n",
+    "pheno_BXD = phenotype.loc[is_selected].dropna(axis=1).drop(columns='PhenoID')\n",
+    "mouse_list = pheno_BXD.columns.tolist()\n",
+    "print('Phenotype values:')\n",
+    "pheno_BXD"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For every SNP (one row of the genotype table) the mice are split into two\n",
+    "# groups, those with genotype -1 and those with genotype +1, and a one-way\n",
+    "# ANOVA compares the phenotype values of the two groups.\n",
+    "geno_BXD = genotype[mouse_list]\n",
+    "fvalues = []\n",
+    "pvalues = []\n",
+    "for _, snp_genotypes in geno_BXD.iterrows():\n",
+    "    mice_minus = snp_genotypes[snp_genotypes == -1].index\n",
+    "    mice_plus = snp_genotypes[snp_genotypes == 1].index\n",
+    "    # f_oneway works along axis 0, hence the transposes (one mouse per row)\n",
+    "    group_minus = pheno_BXD[mice_minus].to_numpy().T\n",
+    "    group_plus = pheno_BXD[mice_plus].to_numpy().T\n",
+    "    fvalue, pvalue = stats.f_oneway(group_minus, group_plus)\n",
+    "    # The result has one entry per row of pheno_BXD; only the first one is kept\n",
+    "    fvalues.append(fvalue[0])\n",
+    "    pvalues.append(pvalue[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Gather the ANOVA results and the SNP coordinates in a single dataframe.\n",
+    "# The positional index is named and then exposed as the regular column 'SNP index'.\n",
+    "df = (\n",
+    "    pd.DataFrame({\n",
+    "        'fvalues': fvalues,\n",
+    "        'pvalues': pvalues,\n",
+    "        'Chr': genotype['Chr'].values,\n",
+    "        'Pos': genotype['Pos'].values,\n",
+    "    })\n",
+    "    .rename_axis('SNP index')\n",
+    "    .reset_index()\n",
+    ")\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot the p-value of the ANOVA test for each SNP, coloured by chromosome.\n",
+    "# The y axis is logarithmic and inverted, so the smallest p-values are at the top.\n",
+    "fig, ax = plt.subplots(figsize=(10, 10))\n",
+    "ax.set(yscale=\"log\")\n",
+    "# `df` already contains the 'SNP index' column, so it is passed as is; the target\n",
+    "# axes are given explicitly instead of relying on the current pyplot axes.\n",
+    "sns.scatterplot(x=\"SNP index\", y=\"pvalues\", data=df, hue=\"Chr\", ax=ax)\n",
+    "ax.invert_yaxis()\n",
+    "ax.set(xlabel=\"SNP index\", ylabel=\"p-value\", title=\"ANOVA p-value per SNP\")\n",
+    "# Reference line at p = 0.05; the trailing ';' hides the Line2D repr\n",
+    "ax.axhline(0.05, ls='--', c='red');"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "venv"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}