Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modify the SNPObject to store LAI data at the SNP-level + add method to go from window-level to SNP-level in the WindowLevelAncestryObject #15

Merged
merged 12 commits into from
Dec 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 43 additions & 5 deletions demos/LAI_visualization.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "50752deb-9a50-4725-90dc-0d10005e1bbb",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -50,26 +50,64 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "84b99b1d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.ancestry.io.local.read.msp:Reading '/home/miriam/Documents/snputils/data/lai.msp'...\n"
"INFO:snputils.ancestry.io.local.read.msp:Reading '../data/easComp_6_samples_chr1.msp'...\n"
]
}
],
"source": [
"# Specify the path to the LAI data file\n",
"filename = '/home/miriam/Documents/snputils/data/lai.msp'\n",
"filename = '../data/easComp_6_samples_chr1.msp'\n",
"\n",
"# Load LAI data using MSPReader, which returns a LocalAncestryObject\n",
"laiobj = MSPReader(filename).read()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "681b9362",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['GA000856_GA000856.0',\n",
" 'GA000856_GA000856.1',\n",
" 'GA000857_GA000857.0',\n",
" 'GA000857_GA000857.1',\n",
" 'GA000858_GA000858.0',\n",
" 'GA000858_GA000858.1']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"laiobj.haplotypes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e4aa1fdb",
"metadata": {},
"outputs": [],
"source": [
"# Construct haplotypes and LAI data\n",
"samples = list(range(10))\n",
"haplotypes = [f\"{sample}_{sample}.0\" for sample in samples] + [f\"{sample}_{sample}.1\" for sample in samples]"
]
},
{
"cell_type": "markdown",
"id": "da3a5b0f-dee8-4e1d-ae74-ab2a0fa1c4b0",
Expand Down Expand Up @@ -148,7 +186,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "galaxybio",
"display_name": "snputils",
"language": "python",
"name": "python3"
},
Expand Down
124 changes: 63 additions & 61 deletions demos/SNPObj.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "2b25818b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n",
"Attributes of the SNPObject: ['calldata_gt', 'samples', 'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 'variants_id', 'variants_pos', 'variants_qual']\n",
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n"
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n",
"Attributes of the SNPObject: ['calldata_gt', 'samples', 'variants_ref', 'variants_alt', 'variants_chrom', 'variants_filter_pass', 'variants_id', 'variants_pos', 'variants_qual', 'calldata_lai', 'ancestry_map']\n",
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n"
]
}
],
Expand Down Expand Up @@ -92,8 +92,8 @@
"output_type": "stream",
"text": [
"Original sample IDs: ['HG00096' 'HG00097' 'HG00099' 'HG00100']\n",
"Updated sample IDs (attribute access): ['sample_1', 'sample_2', 'sample_3', 'sample_4']\n",
"Updated sample IDs (dictionary access): ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n"
"Updated sample IDs (attribute access): ['sample_1' 'sample_2' 'sample_3' 'sample_4']\n",
"Updated sample IDs (dictionary access): ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n"
]
}
],
Expand Down Expand Up @@ -256,8 +256,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n",
"Chromosomes after matching format: ['21']\n"
]
}
Expand Down Expand Up @@ -319,13 +319,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Unique genotype values before renaming missings: [0 1]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique genotype values before renaming missings: [0 1]\n",
"Unique genotype values after renaming missings: [0 1]\n"
]
}
Expand Down Expand Up @@ -360,13 +354,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering: 976599\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of SNPs before filtering: 976599\n",
"Number of SNPs after filtering: 0\n"
]
}
Expand Down Expand Up @@ -398,7 +386,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Unique chromosomes before filtering: ['21']\n",
"Unique chromosomes before filtering: ['21']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique chromosomes after filtering: []\n"
]
}
Expand Down Expand Up @@ -466,14 +460,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Samples after filtering: ['sample_A', 'sample_B']\n"
"Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n",
"Samples after filtering: ['sample_A' 'sample_B']\n"
]
}
],
Expand Down Expand Up @@ -504,8 +492,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Samples before filtering: ['sample_A', 'sample_B', 'sample_C', 'sample_D']\n",
"Samples after filtering: ['sample_B', 'sample_C']\n"
"Samples before filtering: ['sample_A' 'sample_B' 'sample_C' 'sample_D']\n",
"Samples after filtering: ['sample_B' 'sample_C']\n"
]
}
],
Expand Down Expand Up @@ -540,8 +528,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n",
"Number of SNPs after subsetting to common variants: 976599\n"
]
}
Expand Down Expand Up @@ -721,14 +709,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"First 5 variant positions before shuffling: [5033871 5033884 5033887 5035658 5038298]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"First 5 variant positions after shuffling: [23057061 26864842 29640589 30592114 33893258]\n"
"First 5 variant positions before shuffling: [5033871 5033884 5033887 5035658 5038298]\n",
"First 5 variant positions after shuffling: [40064132 19950752 21721370 46148286 36956322]\n"
]
}
],
Expand Down Expand Up @@ -760,7 +742,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Variants_ref before handling empty entries: ['' 'G' 'G' 'C' 'A']\n",
"Variants_ref before handling empty entries: ['' 'G' 'G' 'C' 'A']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Variants_ref after handling empty entries: ['.' 'G' 'G' 'C' 'A']\n"
]
}
Expand Down Expand Up @@ -796,16 +784,16 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"id": "3ebf53e5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.read.vcf:Reading ../data/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/subset.vcf\n"
"INFO:snputils.snp.io.read.vcf:Reading ../data/vcf/subset.vcf\n",
"INFO:snputils.snp.io.read.vcf:Finished reading ../data/vcf/subset.vcf\n"
]
}
],
Expand All @@ -820,10 +808,30 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 24,
"id": "544b2955",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/miriam/.local/lib/python3.10/site-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning: Using fork() can cause Polars to deadlock in the child process.\n",
"In addition, using fork() with Python in general is a recipe for mysterious\n",
"deadlocks and crashes.\n",
"\n",
"The most likely reason you are seeing this error is because you are using the\n",
"multiprocessing module on Linux, which uses fork() by default. This will be\n",
"fixed in Python 3.14. Until then, you want to use the \"spawn\" context instead.\n",
"\n",
"See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.\n",
"\n",
"If you really know what your doing, you can silence this warning with the warning module\n",
"or by setting POLARS_ALLOW_FORKING_THREAD=1.\n",
"\n",
" pid = os.fork()\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down Expand Up @@ -857,21 +865,15 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 25,
"id": "08eff0b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pvar\n",
"INFO:snputils.snp.io.write.pgen:Writing ../data/output.psam\n",
"INFO:snputils.snp.io.write.pgen:Writing to ../data/output.pgen\n",
"SNPObject saved to ../data/output.pgen\n",
Expand Down Expand Up @@ -906,7 +908,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 26,
"id": "fbca1fd8",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -957,7 +959,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 27,
"id": "cea856ad",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -995,7 +997,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "galaxybio",
"display_name": "snputils",
"language": "python",
"name": "python3"
},
Expand Down
2 changes: 1 addition & 1 deletion demos/admixture_mapping.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down
Loading
Loading