Skip to content

Commit

Permalink
Merge pull request #15 from AI-sandbox/feature/PSW-86-develop-snp-lev…
Browse files Browse the repository at this point in the history
…el-local-ancestry-object

Modify the SNPObject to store LAI data at the SNP level, and add a method to go from window-level to SNP-level in the WindowLevelAncestryObject
  • Loading branch information
salcc authored Dec 28, 2024
2 parents 1b208a9 + fc9a5da commit 5462ae3
Show file tree
Hide file tree
Showing 9 changed files with 483 additions and 100 deletions.
48 changes: 43 additions & 5 deletions demos/LAI_visualization.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "50752deb-9a50-4725-90dc-0d10005e1bbb",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -50,26 +50,64 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "84b99b1d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.ancestry.io.local.read.msp:Reading '/home/miriam/Documents/snputils/data/lai.msp'...\n"
"INFO:snputils.ancestry.io.local.read.msp:Reading '../data/easComp_6_samples_chr1.msp'...\n"
]
}
],
"source": [
"# Specify the path to the LAI data file\n",
"filename = '/home/miriam/Documents/snputils/data/lai.msp'\n",
"filename = '../data/easComp_6_samples_chr1.msp'\n",
"\n",
"# Load LAI data using MSPReader, which returns a LocalAncestryObject\n",
"laiobj = MSPReader(filename).read()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "681b9362",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['GA000856_GA000856.0',\n",
" 'GA000856_GA000856.1',\n",
" 'GA000857_GA000857.0',\n",
" 'GA000857_GA000857.1',\n",
" 'GA000858_GA000858.0',\n",
" 'GA000858_GA000858.1']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"laiobj.haplotypes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e4aa1fdb",
"metadata": {},
"outputs": [],
"source": [
"# Construct haplotypes and LAI data\n",
"samples = list(range(10))\n",
"haplotypes = [f\"{sample}_{sample}.0\" for sample in samples] + [f\"{sample}_{sample}.1\" for sample in samples]"
]
},
{
"cell_type": "markdown",
"id": "da3a5b0f-dee8-4e1d-ae74-ab2a0fa1c4b0",
Expand Down Expand Up @@ -148,7 +186,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "galaxybio",
"display_name": "snputils",
"language": "python",
"name": "python3"
},
Expand Down
124 changes: 63 additions & 61 deletions demos/SNPObj.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "2b25818b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n",
"Attributes of the SNPObject: ['calldata_gt', 'samples', 'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 'variants_id', 'variants_pos', 'variants_qual']\n",
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n"
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n",
"Attributes of the SNPObject: ['calldata_gt', 'samples', 'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 'variants_id', 'variants_pos', 'variants_qual', 'calldata_lai', 'ancestry_map']\n",
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n"
]
}
],
Expand Down Expand Up @@ -92,8 +92,8 @@
"output_type": "stream",
"text": [
"Original sample IDs: ['HG00096' 'HG00097' 'HG00099' 'HG00100']\n",
"Updated sample IDs (attribute access): ['sample_1', 'sample_2', 'sample_3', 'sample_4']\n",
"Updated sample IDs (dictionary access): ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n"
"Updated sample IDs (attribute access): ['sample_1' 'sample_2' 'sample_3' 'sample_4']\n",
"Updated sample IDs (dictionary access): ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n"
]
}
],
Expand Down Expand Up @@ -256,8 +256,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n",
"Chromosomes after matching format: ['21']\n"
]
}
Expand Down Expand Up @@ -319,13 +319,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Unique genotype values before renaming missings: [0 1]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique genotype values before renaming missings: [0 1]\n",
"Unique genotype values after renaming missings: [0 1]\n"
]
}
Expand Down Expand Up @@ -360,13 +354,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering: 976599\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering: 976599\n",
"Number of SNPs after filtering: 0\n"
]
}
Expand Down Expand Up @@ -398,7 +386,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Unique chromosomes before filtering: ['21']\n",
"Unique chromosomes before filtering: ['21']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique chromosomes after filtering: []\n"
]
}
Expand Down Expand Up @@ -466,14 +460,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples after filtering: ['sample_A', 'sample_B']\n"
"Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n",
"Samples after filtering: ['sample_A' 'sample_B']\n"
]
}
],
Expand Down Expand Up @@ -504,8 +492,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n",
"Samples after filtering: ['sample_B', 'sample_C']\n"
"Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n",
"Samples after filtering: ['sample_B' 'sample_C']\n"
]
}
],
Expand Down Expand Up @@ -540,8 +528,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n",
"Number of SNPs after subsetting to common variants: 976599\n"
]
}
Expand Down Expand Up @@ -721,14 +709,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"First 5 variant positions before shuffling: [5033871 5033884 5033887 5035658 5038298]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"First 5 variant positions after shuffling: [23057061 26864842 29640589 30592114 33893258]\n"
"First 5 variant positions before shuffling: [5033871 5033884 5033887 5035658 5038298]\n",
"First 5 variant positions after shuffling: [40064132 19950752 21721370 46148286 36956322]\n"
]
}
],
Expand Down Expand Up @@ -760,7 +742,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Variants_ref before handling empty entries: ['' 'G' 'G' 'C' 'A']\n",
"Variants_ref before handling empty entries: ['' 'G' 'G' 'C' 'A']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Variants_ref after handling empty entries: ['.' 'G' 'G' 'C' 'A']\n"
]
}
Expand Down Expand Up @@ -796,16 +784,16 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"id": "3ebf53e5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n"
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n"
]
}
],
Expand All @@ -820,10 +808,30 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 24,
"id": "544b2955",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/miriam/.local/lib/python3.10/site-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning: Using fork() can cause Polars to deadlock in the child process.\n",
"In addition, using fork() with Python in general is a recipe for mysterious\n",
"deadlocks and crashes.\n",
"\n",
"The most likely reason you are seeing this error is because you are using the\n",
"multiprocessing module on Linux, which uses fork() by default. This will be\n",
"fixed in Python 3.14. Until then, you want to use the \"spawn\" context instead.\n",
"\n",
"See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.\n",
"\n",
"If you really know what your doing, you can silence this warning with the warning module\n",
"or by setting POLARS_ALLOW_FORKING_THREAD=1.\n",
"\n",
" pid = os.fork()\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down Expand Up @@ -857,21 +865,15 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 25,
"id": "08eff0b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n",
"INFO:snputils.snp.io.write.pgen:Writing ../data/output.psam\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pgen\n",
"SNPObject saved to ../data/output.pgen\n",
Expand Down Expand Up @@ -906,7 +908,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 26,
"id": "fbca1fd8",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -957,7 +959,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 27,
"id": "cea856ad",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -995,7 +997,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "galaxybio",
"display_name": "snputils",
"language": "python",
"name": "python3"
},
Expand Down
2 changes: 1 addition & 1 deletion demos/admixture_mapping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down
Loading

0 comments on commit 5462ae3

Please sign in to comment.