Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sort by confidence to web app #75

Merged
merged 8 commits into from
Jan 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 26 additions & 19 deletions notebooks/chemicals-unbiased-evaluation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@
"metadata": {},
"outputs": [],
"source": [
"import biomappings\n",
"import random\n",
"import pandas as pd\n",
"import itertools as itt\n",
"import random\n",
"from collections import Counter\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"from tqdm.auto import tqdm\n",
"from pathlib import Path"
"\n",
"import biomappings"
]
},
{
Expand Down Expand Up @@ -302,7 +304,7 @@
"\n",
"if path.is_file():\n",
" print(f\"loading from {path}\")\n",
" df = pd.read_csv(path, sep='\\t')\n",
" df = pd.read_csv(path, sep=\"\\t\")\n",
"\n",
"else:\n",
" mappings = []\n",
Expand All @@ -317,20 +319,23 @@
" if mapping[\"source prefix\"] == \"chebi\" and mapping[\"target prefix\"] == \"mesh\":\n",
" mapping[\"curation_status\"] = label\n",
" mappings.append(mapping)\n",
" \n",
" mappings = sorted(mappings, key=lambda m: (\n",
" m[\"source identifier\"],\n",
" m[\"relation\"],\n",
" m[\"target identifier\"],\n",
" ))\n",
"\n",
" mappings = sorted(\n",
" mappings,\n",
" key=lambda m: (\n",
" m[\"source identifier\"],\n",
" m[\"relation\"],\n",
" m[\"target identifier\"],\n",
" ),\n",
" )\n",
"\n",
" print(f\"There are {len(mappings):,} total mappings\")\n",
"\n",
" random.seed(0)\n",
" subset = random.choices(mappings, k=100)\n",
" df = pd.DataFrame(subset).sort_values(\"curation_status\")\n",
" df.to_csv(path, sep='\\t', index=False)\n",
" \n",
" df.to_csv(path, sep=\"\\t\", index=False)\n",
"\n",
"df"
]
},
Expand All @@ -356,10 +361,10 @@
"pairs = set(map(tuple, df[[\"source identifier\", \"target identifier\"]].values))\n",
"\n",
"for label, xxx in [\n",
" (\"prediction\", biomappings.load_predictions()),\n",
" (\"positive\", biomappings.load_mappings()),\n",
" (\"negative\", biomappings.load_false_mappings()),\n",
" (\"unsure\", biomappings.load_unsure()),\n",
" (\"prediction\", biomappings.load_predictions()),\n",
" (\"positive\", biomappings.load_mappings()),\n",
" (\"negative\", biomappings.load_false_mappings()),\n",
" (\"unsure\", biomappings.load_unsure()),\n",
"]:\n",
" for mapping in xxx:\n",
" if (mapping[\"source identifier\"], mapping[\"target identifier\"]) in pairs:\n",
Expand Down Expand Up @@ -398,10 +403,12 @@
"precision_mi = (counter[\"positive\"] + counter[\"unsure\"] / 2) / total\n",
"precision_error = counter[\"unsure\"] / 2 / total\n",
"\n",
"print(f\"\"\"\\\n",
"print(\n",
" f\"\"\"\\\n",
"With {total:,} random curations, we estimate a precision \\\n",
"of {precision_mi:.1%} ± {precision_error:.1%}\n",
"\"\"\")"
"\"\"\"\n",
")"
]
}
],
Expand Down
4 changes: 2 additions & 2 deletions src/biomappings/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ def update(ctx: click.Context):
ctx.invoke(sssom)
click.secho("Generating charts", fg="green")
ctx.invoke(charts)
click.secho("Uploading to NDEx", fg="green")
ctx.invoke(ndex)
# click.secho("Uploading to NDEx", fg="green")
# ctx.invoke(ndex)


@main.command()
Expand Down
6 changes: 6 additions & 0 deletions src/biomappings/resources/incorrect.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ mesh C023514 2,6-dinitrotoluene skos:exactMatch doid DOID:2679 dysembryoplastic
mesh C025665 glyoxylate reductase (NADP+) skos:exactMatch go GO:0030267 glyoxylate reductase (NADP) activity manually_reviewed orcid:0000-0001-9439-5346
mesh C026044 deoxyribonuclease V skos:exactMatch go GO:0043737 deoxyribonuclease V activity manually_reviewed orcid:0000-0001-9439-5346
mesh C026677 9 alpha,11 alpha,15 alpha-trihydroxy-16-phenoxy-17,18,19,20-tetranorprosta-4,5,13-trienoic acid skos:exactMatch go GO:0009670 triose-phosphate:phosphate antiporter activity manually_reviewed orcid:0000-0001-9439-5346
mesh C028136 4-S-(propionic acid)sulfidocyclophosphamide skos:exactMatch efo 0010025 PASP manually_reviewed orcid:0000-0003-4423-4370
mesh C029757 phosphogluconate dehydrogenase (decarboxylating) skos:exactMatch go GO:0004616 phosphogluconate dehydrogenase (decarboxylating) activity manually_reviewed orcid:0000-0001-9439-5346
mesh C031570 2-hydroxybutyric acid skos:exactMatch efo 0010458 alpha-hydroxybutyric acid measurement manually_reviewed orcid:0000-0003-4423-4370
mesh C031616 high density lipoprotein receptors skos:exactMatch go GO:0070506 high-density lipoprotein particle receptor activity manually_reviewed orcid:0000-0001-9439-5346
Expand All @@ -368,6 +369,7 @@ mesh C055755 2'-(2-hydroxyphenyl)-2'-thiazoline-4'-carboxylic acid skos:exactMat
mesh C056957 1-(4-carbethoxyphenyl)-3-hydroxymethyl-3-methyltriazene skos:exactMatch go GO:0033802 isoliquiritigenin 2'-O-methyltransferase activity manually_reviewed orcid:0000-0001-9439-5346
mesh C063522 NAD+ synthase (glutamine-hydrolysing) skos:exactMatch go GO:0003952 NAD+ synthase (glutamine-hydrolyzing) activity manually_reviewed orcid:0000-0001-9439-5346
mesh C066234 enterobactin synthetase skos:exactMatch go GO:0009239 enterobactin biosynthetic process manually_reviewed orcid:0000-0001-9439-5346
mesh C068195 methylarginyl-lysyl-prolyl-tryptophyl-tert-leucyl-leucyl-ethyl ester skos:exactMatch efo 0002707 NT-1 manually_reviewed orcid:0000-0003-4423-4370
mesh C071543 cytochrome P450 CYP3A10 (hamster) skos:exactMatch go GO:0033777 lithocholate 6beta-hydroxylase activity manually_reviewed orcid:0000-0001-9439-5346
mesh C072825 4-hexyloxyaniline skos:exactMatch go GO:0008701 4-hydroxy-2-oxovalerate aldolase activity manually_reviewed orcid:0000-0001-9439-5346
mesh C076131 DNA-3-methyladenine glycosidase II skos:exactMatch go GO:0003905 alkylbase DNA N-glycosylase activity manually_reviewed orcid:0000-0001-9439-5346
Expand All @@ -388,9 +390,11 @@ mesh C110804 mycophenolic adenine dinucleotide skos:exactMatch go GO:0072671 mit
mesh C115412 cytochrome P-450 CYP81E1 (Glycyrrhiza echinata) skos:exactMatch go GO:0033773 isoflavone 2'-hydroxylase activity manually_reviewed orcid:0000-0001-9439-5346
mesh C116435 lipoic acid synthase skos:exactMatch go GO:0016992 lipoate synthase activity manually_reviewed orcid:0000-0001-9439-5346
mesh C120640 DNA ligase (NAD) skos:exactMatch go GO:0003911 DNA ligase (NAD+) activity manually_reviewed orcid:0000-0001-9439-5346
mesh C415368 interleukin-22 skos:exactMatch efo 0003810 interleukin-22 (Homo sapiens) manually_reviewed orcid:0000-0003-4423-4370
mesh C415369 interleukin-22 receptor skos:exactMatch go GO:0042018 interleukin-22 receptor activity manually_reviewed orcid:0000-0001-9439-5346
mesh C416670 N-acetylgalactosamine 4-sulfate 6-O-sulfotransferase skos:exactMatch go GO:0050659 N-acetylgalactosamine 4-sulfate 6-O-sulfotransferase activity manually_reviewed orcid:0000-0001-9439-5346
mesh C419642 NRH - quinone oxidoreductase2 skos:exactMatch go GO:0001512 dihydronicotinamide riboside quinone reductase activity manually_reviewed orcid:0000-0001-9439-5346
mesh C422042 interleukin 20 skos:exactMatch efo 0003803 interleukin-20 (Homo sapiens) manually_reviewed orcid:0000-0003-4423-4370
mesh C422952 telomerase RNA skos:exactMatch go GO:0000332 template for synthesis of G-rich strand of telomere DNA activity manually_reviewed orcid:0000-0001-9439-5346
mesh C427385 4-hydroxyacetophenone monooxygenase skos:exactMatch go GO:0033767 4-hydroxyacetophenone monooxygenase activity manually_reviewed orcid:0000-0001-9439-5346
mesh C428384 MLK-like mitogen-activated protein triple kinase skos:exactMatch go GO:0004709 MAP kinase kinase kinase activity manually_reviewed orcid:0000-0001-9439-5346
Expand Down Expand Up @@ -621,6 +625,7 @@ mesh D044169 Receptors, Calcium-Sensing skos:exactMatch hgnc 1514 CASR manually_
mesh D044542 LEOPARD Syndrome skos:exactMatch doid DOID:0080548 LEOPARD syndrome 1 manually_reviewed orcid:0000-0003-1307-2508
mesh D044764 Ubiquitin-Activating Enzymes skos:exactMatch hgnc 12469 UBA1 manually_reviewed orcid:0000-0001-9439-5346
mesh D045584 Chromosome Positioning skos:exactMatch go GO:0051303 establishment of chromosome localization manually_reviewed orcid:0000-0003-4423-4370
mesh D045683 Furin skos:exactMatch hgnc 8568 FURIN manually_reviewed orcid:0000-0003-4423-4370
mesh D047868 Pulmonary Sclerosing Hemangioma skos:exactMatch doid DOID:495 sclerosing hemangioma manually_reviewed orcid:0000-0003-1307-2508
mesh D048209 Echinococcus granulosus skos:exactMatch doid DOID:1495 cystic echinococcosis manually_reviewed orcid:0000-0003-1307-2508
mesh D049310 Distal Myopathies skos:exactMatch doid DOID:0070198 Miyoshi muscular dystrophy manually_reviewed orcid:0000-0003-1307-2508
Expand Down Expand Up @@ -652,6 +657,7 @@ mesh D066167 Slit Lamp skos:exactMatch ncit C75583 Slit-lamp Examination manuall
mesh D066246 ErbB Receptors skos:exactMatch ncit C17068 Epidermal Growth Factor Receptor manually_reviewed orcid:0000-0001-9439-5346
mondo 0005187 human herpesvirus 8 infection skos:exactMatch mesh D019288 Herpesvirus 8, Human manually_reviewed orcid:0000-0001-9439-5346
mondo 0007028 rotator cuff syndrome skos:exactMatch mesh D000070636 Rotator Cuff Injuries manually_reviewed orcid:0000-0003-4423-4370
mondo 0007323 Chondronectin skos:exactMatch mesh C029172 chondronectin protein, human manually_reviewed orcid:0000-0003-4423-4370
mondo 0015053 hereditary angioedema type 1 skos:exactMatch mesh D056829 Hereditary Angioedema Types I and II manually_reviewed orcid:0000-0001-9439-5346
mondo 0020320 acute myeloblastic leukemia with maturation skos:exactMatch mesh D000650 Amnion manually_reviewed orcid:0000-0001-9439-5346
mondo 0021661 coronary atherosclerosis skos:exactMatch mesh D003324 Coronary Artery Disease manually_reviewed orcid:0000-0001-9439-5346
Expand Down
Loading