
Commit 1e0b2ff

Reviewing DFW/knetminer with new arrangements.
1 parent 5160e11 commit 1e0b2ff

17 files changed, +100 -99 lines changed

dfw-dataset/brandizi-env.sh  (+11 -9)

@@ -1,8 +1,9 @@
 # Use this prolog to go to the script's directory
-mydir="`pwd`"
 cd "`dirname ${BASH_SOURCE[0]}`"
 
 export DFW_ETL="`pwd`"
+cd ..
+export AG_DIR="`pwd`"
 
 export ETL_OUT="$DFW_ETL/output" # Overwrites the value set by the etl-tools script.
 export ETL_TMP="$ETL_OUT/tmp" # temp stuff produced by the pipeline
@@ -13,19 +14,20 @@ export JENA_HOME=/Applications/local/dev/semantic_web/jena
 export BIOPORTAL_APIKEY='a9f8528b-4db9-4f35-995f-14e81106615f'
 export AGROPORTAL_APIKEY='c5a0f99c-a061-4175-8d7e-e49c47b6337d'
 
-export NAMESPACES_PATH="$mydir/namespaces.ttl"
+export NAMESPACES_PATH="$DFW_ETL/namespaces.ttl"
 export JAVA_TOOL_OPTIONS="-Xmx8G"
 
-cd ..
-. lib/default-env.sh
+export ETL_LOG_CONF="$AG_DIR/lib/etltools/logging-test.yaml" # or logging.yaml for production
+
+. "$AG_DIR/lib/default-env.sh"
 
-for mod in gxa
+for mod in dfw-dataset/knetminer # dfw-dataset/gxa TODO
 do
-  . "$mydir/$mod/brandizi-env.sh"
-  cd "$mydir"
+  . "$AG_DIR/$mod/brandizi-env.sh"
+  cd "$AG_DIR"
 done
 
-~/bin/conda-init.sh
+. ~/bin/conda-init.sh
 conda activate snakemake
 
-cd "$mydir"
+cd "$DFW_ETL"
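Note: the variables exported here are what the Snakemake workflows below read via os.getenv. A quick sanity check before launching a workflow can catch a missing variable early; the sketch below is illustrative only (the variable names come from this commit's env scripts, the check itself is not part of the repository):

import os

# Variables that the snakefiles in this commit read via os.getenv; they are set across
# dfw-dataset/brandizi-env.sh, dfw-dataset/knetminer/brandizi-env.sh and lib/default-env.sh.
REQUIRED_VARS = [ "DFW_ETL", "AG_DIR", "AG_LIB", "ETL_OUT", "KNET_RDF_DIR", "JENA_HOME" ]

missing = [ v for v in REQUIRED_VARS if not os.getenv ( v ) ]
if missing:
    raise EnvironmentError ( "Environment not initialised, missing: " + ", ".join ( missing ) )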

dfw-dataset/gxa/snake-config.yaml  (+3)

@@ -0,0 +1,3 @@
+gxa_organisms:
+  - "arabidopsis thaliana"
+  - "triticum aestivum"
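A downstream step can pick this per-organism list up through standard YAML loading; the GXA workflow itself is not shown in this commit, so the snippet below is only an illustrative sketch:

# Illustrative only: reading gxa_organisms from dfw-dataset/gxa/snake-config.yaml.
import yaml  # PyYAML, assumed to be available in the 'snakemake' conda environment

with open ( "dfw-dataset/gxa/snake-config.yaml" ) as fh:
    cfg = yaml.safe_load ( fh )

for organism in cfg [ "gxa_organisms" ]:  # "arabidopsis thaliana", "triticum aestivum"
    print ( "Would fetch GXA experiments for:", organism )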

dfw-dataset/knetminer/brandizi-env.sh  (+2 -2)

@@ -1,3 +1,3 @@
 # These are needed for tests
-export ODX2RDF="$HOME/Documents/Work/RRes/ondex_git/ondex-knet-builder/ondex-knet-builder/modules/rdf-export-2-cli/target/rdf-export-2-cli_4.0-SNAPSHOT"
-export KNET_RDF_DIR="/tmp/knet-wheat-rdf"
+export OXL2NEO_HOME="$HOME/Documents/Work/RRes/ondex_git/ondex-knet-builder/ondex-knet-builder/modules/neo4j-export/target/neo4j-exporter"
+export KNET_RDF_DIR="/tmp/knet-poaceae-rdf"

dfw-dataset/knetminer/data-build.snakefile  (+8 -5)

@@ -6,11 +6,11 @@ log = logger_config ( __name__ )
 
 KNET_RDF_DIR = os.getenv ( "KNET_RDF_DIR" )
 ETL_OUT = os.getenv ( "ETL_OUT" )
-ETL_TOOLS = os.getenv ( "ETL_TOOLS" )
-etl_lib_path = ETL_TOOLS + "/lib/etltools/"
+AG_LIB = os.getenv ( "AG_LIB" )
+etl_lib_path = AG_LIB + "/etltools/"
 JENA_HOME = os.getenv ( "JENA_HOME" )
 
-TDB_DIR = ETL_OUT + "tmp/tdb"
+TDB_DIR = ETL_OUT + "/tmp/tdb"
 MAPPING_OUT = ETL_OUT + "/rdf/knetminer-mapping.nt"
 
 
@@ -35,6 +35,7 @@ rdf_inputs = [ "../agri-schema.ttl" ] \
   + glob.glob ( KNET_RDF_DIR + "/ontologies/ext/*.*" ) \
   + glob.glob ( KNET_RDF_DIR + "/*.*" )
 
+
 rule generate_tdb:
   input:
     rdf_inputs
@@ -43,7 +44,9 @@ rule generate_tdb:
   message:
     "Generating Working TDB '%s'" % TDB_DIR
   run:
-    print ( "Re-downloading BioKNO mappings" )
-    shell ( "wget 'https://raw.githubusercontent.com/Rothamsted/bioknet-onto/master/bk_mappings.ttl' -O '" + KNET_RDF_DIR + "/ontologies/bk_mappings.ttl'" )
+    #print ( "Re-downloading BioKNO mappings" )
+    #shell ( "wget 'https://raw.githubusercontent.com/Rothamsted/bioknet-onto/master/bk_mappings.ttl' -O '" + KNET_RDF_DIR + "/ontologies/bk_mappings.ttl'" )
+
+    print ( "Running AgriSchemas mappings" )
     shell ( "'" + JENA_HOME + "/bin/tdbloader' --loc={output} {input}" )
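For clarity, once Snakemake substitutes {output} and {input}, the shell() call in generate_tdb boils down to a single tdbloader command line. The expansion below is an illustrative sketch with hypothetical paths; only the JENA_HOME value and the string-building logic come from this commit:

# Illustrative expansion of the shell() call in rule generate_tdb; the paths are hypothetical samples.
JENA_HOME = "/Applications/local/dev/semantic_web/jena"   # value set in brandizi-env.sh
TDB_DIR = "/path/to/output/tmp/tdb"                       # what {output} could expand to
rdf_inputs = [ "../agri-schema.ttl", "/tmp/knet-poaceae-rdf/knowledge-graph.ttl" ]  # {input}, hypothetical

cmd = "'" + JENA_HOME + "/bin/tdbloader' --loc=" + TDB_DIR + " " + " ".join ( rdf_inputs )
print ( cmd )
# '/Applications/local/dev/semantic_web/jena/bin/tdbloader' --loc=/path/to/output/tmp/tdb ../agri-schema.ttl /tmp/knet-poaceae-rdf/knowledge-graph.ttl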

dfw-dataset/knetminer/test/sample-data-build.snakefile  (+29 -22)

@@ -1,25 +1,22 @@
 import os, sys
+import glob
 from etltools import sparqlmap
-import etltools.getfilescfg as onto_cfg
+from etltools.utils import download_files
 
 ETL_OUT = os.getenv ( "ETL_OUT" )
-ETL_TOOLS = os.getenv ( "ETL_TOOLS" )
-
-etl_lib_path = ETL_TOOLS + "/lib/etltools/"
-
-#configfile: "../../snake-config.yaml"
-include: etl_lib_path + "/getfiles.snakefile"
-
-ODX2RDF = os.getenv ( "ODX2RDF" )
+AG_LIB = os.getenv ( "AG_LIB" )
+etl_lib_path = AG_LIB + "/etltools"
 JENA_HOME = os.getenv ( "JENA_HOME" )
+OXL2NEO_HOME = os.getenv ( "OXL2NEO_HOME" )
 
-onto_cfg.init_config ( config )
-
-TEST_OXL = ODX2RDF + "/examples/text_mining.oxl"
-TEST_RDF = ETL_OUT + "/test/knetminer-sample.ttl"
+TEST_RDF_URL = "https://github.com/Rothamsted/knetminer-backend/blob/master/test-data-server/src/main/resources/poaceae-sample.ttl.bz2?raw=true"
+TEST_RDF = ETL_OUT + "/test/knetminer-sample.ttl.bz2"
 TEST_TDB = ETL_OUT + "/test/test-tdb"
 MAPPING_OUT = ETL_OUT + "/test/knetminer-mapping-test-out.nt"
 
+ontos_dir = ETL_OUT + "/test/ontologies"
+
+
 rule all:
   input:
     TEST_TDB
@@ -36,23 +33,33 @@ rule all:
     )
 
 rule generate_tdb:
-  input: onto_cfg.OUT_FILES + [ TEST_RDF, "../../../agri-schema.ttl" ]
-  output: directory ( TEST_TDB )
+  input:
+    ontos_dir,
+    "../../../agri-schema.ttl",
+    TEST_RDF
+  output:
+    directory ( TEST_TDB )
   message:
     "Generating Test TDB"
   shell:
-    f"'{JENA_HOME}/bin/tdbloader'" + " --loc={output} {input}"
-
+    f"'{JENA_HOME}/bin/tdbloader' --loc={{output}} {ontos_dir}/*.* {ontos_dir}/ext/*.* ../../../agri-schema.ttl {TEST_RDF}"
 
-rule generate_rdf:
-  input:
-    TEST_OXL
+rule download_rdf:
   output:
     TEST_RDF
   message:
-    "Generating RDF from the test OXL"
+    "Getting Poaceae Sample Dataset RDF"
   shell:
-    f"'{ODX2RDF}/odx2rdf.sh'" + " {input} {output}"
+    f'wget -O "{{output}}" "{TEST_RDF_URL}"'
+
+rule download_ontos:
+  output:
+    directory ( ontos_dir )
+  message:
+    "Getting Ontologies"
+  run:
+    shell ( f'mkdir -p "{ontos_dir}"' )
+    shell ( f'"{OXL2NEO_HOME}/get_ontologies.sh" "{{output}}"' )
 
 
 rule clean:
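The unit test further down drives this workflow with a plain snakemake call; the snippet below mirrors run_mappings() in sample_data_test.py and assumes the environment from brandizi-env.sh has already been sourced:

import os

# Same invocation used by run_mappings() in sample_data_test.py (next file).
if os.system ( "snakemake --verbose --snakefile sample-data-build.snakefile --cores all" ) != 0:
    raise ChildProcessError ( "Mapping workflow execution failed" )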

dfw-dataset/knetminer/test/sample_data_test.py  (+31 -23)

@@ -1,9 +1,9 @@
-import os, sys
+import os
 from os.path import dirname, abspath
-from etltools import sparqlmap
-from etltools.utils import get_jena_home, sparql_ask
+from etltools.utils import sparql_ask, DEFAULT_NAMESPACES
 import rdflib
 import unittest
+from zipfile import bz2
 
 graph = None
 
@@ -16,23 +16,33 @@ def run_mappings ():
   os.chdir ( mydir )
 
   print ( "Running mapping workflow" )
-  if os.system ( "snakemake --snakefile sample-data-build.snakefile --configfile ../../snake-config.yaml --cores all" ) != 0:
+  if os.system ( "snakemake --verbose --snakefile sample-data-build.snakefile --cores all" ) != 0:
     raise ChildProcessError ( "Mapping workflow execution failed" )
 
   ETL_OUT = os.getenv ( "ETL_OUT" )
   DFW_ETL = os.getenv ( "DFW_ETL" )
 
+  """
+  the bz2 can be used for verifications that need the original data, however, the tests
+  below are written against the output only, since these loadings are slow.
+  """
   out_names = [
-    "knetminer-mapping-test-out.nt",
-    "knetminer-sample.ttl"
+    # "knetminer-sample.ttl.bz2",
+    "knetminer-mapping-test-out.nt"
   ]
   for oname in out_names:
     out_path = ETL_OUT + "/test/" + oname
     print ( "--- Loading result from '%s'" % out_path )
+    if oname.endswith ( ".bz2" ):
+      with bz2.open ( out_path, mode = "rt" ) as fh:
+        graph.parse ( source = fh )
+      continue
     graph.parse ( out_path, format = "turtle" )
 
   graph.parse ( DFW_ETL + "/../agri-schema.ttl", format = "turtle" )
 
+
+
   print ( "----- Test Initialised -----\n\n" )
 
 
@@ -43,18 +53,18 @@ def __init__ ( self, methodName ):
     super().__init__ ( methodName )
 
   def assert_sparql ( self, ask_query, msg ):
-    self.assertTrue ( sparql_ask ( graph, ask_query ), msg )
+    self.assertTrue ( sparql_ask ( graph, ask_query, DEFAULT_NAMESPACES ), msg )
 
   def test_pref_name ( self ):
     for p in [ "rdfs:label", "schema:name", "skos:prefLabel" ]:
       self.assert_sparql (
-        "ASK { bkr:to_0002682 %s 'plant cell shape'}" % p,
+        "ASK { bkr:trait_to_0006001 %s 'salt tolerance'}" % p,
         "%s not inferred!" % p
       )
 
   def test_name ( self ):
-    s = "bkr:to_0000387"
-    l = "plant phenotype"
+    s = "bkr:gene_at1g71100_locus_2026296"
+    l = "ribose 5-phosphate isomerase"
     for p in [ "rdfs:label", "skos:altLabel" ]:
       self.assert_sparql (
         "ASK { %s %s '%s'}" % (s, p, l),
@@ -68,19 +78,19 @@ def test_name ( self ):
 
   def test_bioschema_Protein ( self ):
     self.assert_sparql (
-      "ASK { bkr:protein_q0d6f4 a bioschema:Protein }",
+      "ASK { bkr:protein_p07519 a bioschema:Protein }",
       "bioschema:Protein not inferred!"
     )
 
   def test_bioschema_Publication ( self ):
-    pmid = "18089549"
+    pmid = "3558409"
 
     for po in [
       "a agri:ScholarlyPublication",
-      "dcterms:title 'The Rice Annotation Project Database (RAP-DB): 2008 update.'",
-      "dcterms:identifier '%s'" % pmid,
-      "dcterms:issued 2008",
-      "schema:datePublished 2008"
+      "dcterms:title ?title",
+      "dcterms:identifier '%s'" % pmid, # TODO not in the sample dataset
+      "dcterms:issued 1987",
+      "schema:datePublished 1987"
     ]:
       self.assert_sparql (
        "ASK { bkr:publication_%s %s }" % (pmid, po),
@@ -89,15 +99,13 @@ def test_bioschema_Publication ( self ):
 
     self.assert_sparql (
       """ASK { bkr:publication_%s
-        bka:Abstract ?abs;
         schema:abstract ?abs;
         dcterms:description ?abs
       }""" % pmid,
       "abstract properties not inferred!"
     )
     self.assert_sparql (
       """ASK { bkr:publication_%s
-        bka:AUTHORS ?authors;
         dcterms:creator ?authors;
         agri:authorsList ?authors
       }""" % pmid,
@@ -116,26 +124,26 @@ def test_name_is_not_title ( self ):
 
   def test_bioschema_isPartOf ( self ):
     self.assert_sparql (
-      "ASK { bkr:to_0000804 schema:isPartOf bkr:to_0006031 }",
+      "ASK { bkr:gene_6652998 schema:isPartOf bkr:path_6645240 }",
       "schema:isPartOf not inferred!"
     )
 
   def test_agri_evidence ( self ):
     self.assert_sparql (
-      "ASK { bkr:publication_16240171 agri:evidence bk:IMPD }",
+      "ASK { bkr:publication_28380544 agri:evidence bk:IMPD }",
       "agri:evidence not inferred!"
     )
 
   def test_dc_source ( self ):
     self.assert_sparql (
-      "ASK { bkr:publication_16240171 dc:source bk:NLM_UNIPROTKB }",
+      "ASK { bkr:publication_12472693 dc:source bk:NLM_UNIPROTKB }",
       "dc:source not inferred!"
     )
 
   def test_mentions ( self ):
     self.assert_sparql (
-      """ASK { bkr:publication_26473199
-        schema:mentions bkr:protein_q6h5l4, bkr:protein_q0ddi1
+      """ASK { bkr:publication_22399647
+        schema:mentions bkr:gene_bradi_3g39910v3, bkr:gene_horvu6hr1g085710
       }""",
       "schema:mentions not inferred!"
     )
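The assertions above go through sparql_ask ( graph, query, DEFAULT_NAMESPACES ) from etltools.utils. A minimal sketch of what such a helper could look like with rdflib, assuming DEFAULT_NAMESPACES is a string of SPARQL PREFIX declarations built from dfw-dataset/namespaces.ttl (illustration only, not the library's actual code):

import rdflib

# Hypothetical stand-in for etltools.utils.DEFAULT_NAMESPACES: a SPARQL prologue
# with a few of the prefixes declared in dfw-dataset/namespaces.ttl.
DEFAULT_NAMESPACES = """
PREFIX bkr: <http://knetminer.org/data/rdf/resources/>
PREFIX bk: <http://knetminer.org/data/rdf/terms/biokno/>
PREFIX agri: <http://agrischemas.org/>
PREFIX schema: <http://schema.org/>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
"""

def sparql_ask ( graph, ask_query, namespaces = "" ):
    """Runs an ASK query with the given namespace prologue and returns its boolean result."""
    return bool ( graph.query ( namespaces + "\n" + ask_query ).askAnswer )

# Usage, in the same style as the tests above:
graph = rdflib.Graph ()
# graph.parse ( "output/test/knetminer-mapping-test-out.nt" )
print ( sparql_ask ( graph, "ASK { bkr:publication_12472693 dc:source bk:NLM_UNIPROTKB }", DEFAULT_NAMESPACES ) )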

dfw-dataset/namespaces.ttl  (+12 -1)

@@ -1,3 +1,14 @@
-prefix bk: <http://knetminer.org/data/rdf/terms/biokno/>
 prefix agres: <http://agrischemas.org/resources/>
 prefix agGraph: <http://agrischemas.org/graphs/>
+prefix bk: <http://knetminer.org/data/rdf/terms/biokno/>
+prefix bkr: <http://knetminer.org/data/rdf/resources/>
+prefix bka: <http://knetminer.org/data/rdf/terms/biokno/attributes/>
+prefix bkg: <http://knetminer.org/data/rdf/resources/graphs/>
+prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+prefix owl: <http://www.w3.org/2002/07/owl#>
+prefix dc: <http://purl.org/dc/elements/1.1/>
+prefix dcterms: <http://purl.org/dc/terms/>
+prefix agri: <http://agrischemas.org/>
+prefix bioschema: <http://bioschemas.org/>
+prefix schema: <http://schema.org/>
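These prefixes back the short names used throughout the mapping queries and tests (bkr:, agri:, schema:, ...). If the same shorthand is wanted on an rdflib graph, for example to get readable serialisations, the bindings can be mirrored like this (illustrative sketch, not part of the commit):

import rdflib

# A few of the prefixes from dfw-dataset/namespaces.ttl, mirrored as rdflib bindings.
PREFIXES = {
    "bkr": "http://knetminer.org/data/rdf/resources/",
    "bk": "http://knetminer.org/data/rdf/terms/biokno/",
    "agri": "http://agrischemas.org/",
    "bioschema": "http://bioschemas.org/",
    "schema": "http://schema.org/"
}

graph = rdflib.Graph ()
for prefix, uri in PREFIXES.items ():
    graph.bind ( prefix, rdflib.Namespace ( uri ) )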

dfw-dataset/snake-config.yaml  (-34)
This file was deleted.
File renamed without changes.

lib/default-env.sh  (+1)

@@ -7,3 +7,4 @@ if [[ ! "$PYTHONPATH" =~ "$AG_LIB" ]]; then
 fi
 
 . etltools/default-env.sh
+cd "$AG_LIB"

lib/etltools/__init__.py  (+1 -1)

@@ -4,5 +4,5 @@
 A set of Python utilities and tools to build data extraction,
 transformation and loading (ETL) pipelines.
 
-@author Marco Brandizi
+:author: Marco Brandizi
 """

lib/etltools/sparqlmap.py  (+1 -1)

@@ -3,7 +3,7 @@
 
 These employ SPARQL CONSTRUCT to build RDF-to-RDF mapping.
 
-@author Marco Brandizi
+:author: Marco Brandizi
 """
 

lib/etltools/sparqlmap_test.py  (+1 -1)

@@ -56,7 +56,7 @@ def __init__ ( self, methodName ):
     super().__init__ ( methodName )
 
   def assert_sparql ( self, ask_query, msg ):
-    self.assertTrue ( sparql_ask ( graph, ask_query ), msg )
+    self.assertTrue ( sparql_ask ( graph, ask_query, DEFAULT_NAMESPACES ), msg )
 
   def test_subclass ( self ):
     self.assert_sparql ( "ASK { ex:b a ex1:SuperB }", "super-class not inferred!" )
File renamed without changes.
File renamed without changes.
File renamed without changes.
