
Commit 1e0b2ff

Reviewing DFW/knetminer with new arrangements.
1 parent 5160e11 commit 1e0b2ff

17 files changed, +100 -99 lines changed

dfw-dataset/brandizi-env.sh  (+11 -9)

@@ -1,8 +1,9 @@
 # Use this prolog to go to the script's directory
-mydir="`pwd`"
 cd "`dirname ${BASH_SOURCE[0]}`"
 
 export DFW_ETL="`pwd`"
+cd ..
+export AG_DIR="`pwd`"
 
 export ETL_OUT="$DFW_ETL/output" # Overwrites the value set by the etl-tools script.
 export ETL_TMP="$ETL_OUT/tmp" # temp stuff produced by the pipeline
@@ -13,19 +14,20 @@ export JENA_HOME=/Applications/local/dev/semantic_web/jena
 export BIOPORTAL_APIKEY='a9f8528b-4db9-4f35-995f-14e81106615f'
 export AGROPORTAL_APIKEY='c5a0f99c-a061-4175-8d7e-e49c47b6337d'
 
-export NAMESPACES_PATH="$mydir/namespaces.ttl"
+export NAMESPACES_PATH="$DFW_ETL/namespaces.ttl"
 export JAVA_TOOL_OPTIONS="-Xmx8G"
 
-cd ..
-. lib/default-env.sh
+export ETL_LOG_CONF="$AG_DIR/lib/etltools/logging-test.yaml" # or logging.yaml for production
+
+. "$AG_DIR/lib/default-env.sh"
 
-for mod in gxa
+for mod in dfw-dataset/knetminer # dfw-dataset/gxa TODO
 do
-  . "$mydir/$mod/brandizi-env.sh"
-  cd "$mydir"
+  . "$AG_DIR/$mod/brandizi-env.sh"
+  cd "$AG_DIR"
 done
 
-~/bin/conda-init.sh
+. ~/bin/conda-init.sh
 conda activate snakemake
 
-cd "$mydir"
+cd "$DFW_ETL"
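Note: the variables exported here are what the Snakemake workflows below read via os.getenv. A quick sanity check before launching a workflow can catch a missing variable early; the sketch below is illustrative only (the variable names come from this commit's env scripts, the check itself is not part of the repository):

import os

# Variables that the snakefiles in this commit read via os.getenv; they are set across
# dfw-dataset/brandizi-env.sh, dfw-dataset/knetminer/brandizi-env.sh and lib/default-env.sh.
REQUIRED_VARS = [ "DFW_ETL", "AG_DIR", "AG_LIB", "ETL_OUT", "KNET_RDF_DIR", "JENA_HOME" ]

missing = [ v for v in REQUIRED_VARS if not os.getenv ( v ) ]
if missing:
    raise EnvironmentError ( "Environment not initialised, missing: " + ", ".join ( missing ) )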

dfw-dataset/gxa/snake-config.yaml  (+3)

@@ -0,0 +1,3 @@
+gxa_organisms:
+  - "arabidopsis thaliana"
+  - "triticum aestivum"
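A downstream step can pick this per-organism list up through standard YAML loading; the GXA workflow itself is not shown in this commit, so the snippet below is only an illustrative sketch:

# Illustrative only: reading gxa_organisms from dfw-dataset/gxa/snake-config.yaml.
import yaml  # PyYAML, assumed to be available in the 'snakemake' conda environment

with open ( "dfw-dataset/gxa/snake-config.yaml" ) as fh:
    cfg = yaml.safe_load ( fh )

for organism in cfg [ "gxa_organisms" ]:  # "arabidopsis thaliana", "triticum aestivum"
    print ( "Would fetch GXA experiments for:", organism )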

dfw-dataset/knetminer/brandizi-env.sh  (+2 -2)

@@ -1,3 +1,3 @@
 # These are needed for tests
-export ODX2RDF="$HOME/Documents/Work/RRes/ondex_git/ondex-knet-builder/ondex-knet-builder/modules/rdf-export-2-cli/target/rdf-export-2-cli_4.0-SNAPSHOT"
-export KNET_RDF_DIR="/tmp/knet-wheat-rdf"
+export OXL2NEO_HOME="$HOME/Documents/Work/RRes/ondex_git/ondex-knet-builder/ondex-knet-builder/modules/neo4j-export/target/neo4j-exporter"
+export KNET_RDF_DIR="/tmp/knet-poaceae-rdf"

dfw-dataset/knetminer/data-build.snakefile  (+8 -5)

@@ -6,11 +6,11 @@ log = logger_config ( __name__ )
 
 KNET_RDF_DIR = os.getenv ( "KNET_RDF_DIR" )
 ETL_OUT = os.getenv ( "ETL_OUT" )
-ETL_TOOLS = os.getenv ( "ETL_TOOLS" )
-etl_lib_path = ETL_TOOLS + "/lib/etltools/"
+AG_LIB = os.getenv ( "AG_LIB" )
+etl_lib_path = AG_LIB + "/etltools/"
 JENA_HOME = os.getenv ( "JENA_HOME" )
 
-TDB_DIR = ETL_OUT + "tmp/tdb"
+TDB_DIR = ETL_OUT + "/tmp/tdb"
 MAPPING_OUT = ETL_OUT + "/rdf/knetminer-mapping.nt"
 
 
@@ -35,6 +35,7 @@ rdf_inputs = [ "../agri-schema.ttl" ] \
   + glob.glob ( KNET_RDF_DIR + "/ontologies/ext/*.*" ) \
   + glob.glob ( KNET_RDF_DIR + "/*.*" )
 
+
 rule generate_tdb:
   input:
     rdf_inputs
@@ -43,7 +44,9 @@ rule generate_tdb:
   message:
     "Generating Working TDB '%s'" % TDB_DIR
   run:
-    print ( "Re-downloading BioKNO mappings" )
-    shell ( "wget 'https://raw.githubusercontent.com/Rothamsted/bioknet-onto/master/bk_mappings.ttl' -O '" + KNET_RDF_DIR + "/ontologies/bk_mappings.ttl'" )
+    #print ( "Re-downloading BioKNO mappings" )
+    #shell ( "wget 'https://raw.githubusercontent.com/Rothamsted/bioknet-onto/master/bk_mappings.ttl' -O '" + KNET_RDF_DIR + "/ontologies/bk_mappings.ttl'" )
+
+    print ( "Running AgriSchemas mappings" )
     shell ( "'" + JENA_HOME + "/bin/tdbloader' --loc={output} {input}" )
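For clarity, once Snakemake substitutes {output} and {input}, the shell() call in generate_tdb boils down to a single tdbloader command line. The expansion below is an illustrative sketch with hypothetical paths; only the JENA_HOME value and the string-building logic come from this commit:

# Illustrative expansion of the shell() call in rule generate_tdb; the paths are hypothetical samples.
JENA_HOME = "/Applications/local/dev/semantic_web/jena"   # value set in brandizi-env.sh
TDB_DIR = "/path/to/output/tmp/tdb"                       # what {output} could expand to
rdf_inputs = [ "../agri-schema.ttl", "/tmp/knet-poaceae-rdf/knowledge-graph.ttl" ]  # {input}, hypothetical

cmd = "'" + JENA_HOME + "/bin/tdbloader' --loc=" + TDB_DIR + " " + " ".join ( rdf_inputs )
print ( cmd )
# '/Applications/local/dev/semantic_web/jena/bin/tdbloader' --loc=/path/to/output/tmp/tdb ../agri-schema.ttl /tmp/knet-poaceae-rdf/knowledge-graph.ttl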

dfw-dataset/knetminer/test/sample-data-build.snakefile  (+29 -22)

@@ -1,25 +1,22 @@
 import os, sys
+import glob
 from etltools import sparqlmap
-import etltools.getfilescfg as onto_cfg
+from etltools.utils import download_files
 
 ETL_OUT = os.getenv ( "ETL_OUT" )
-ETL_TOOLS = os.getenv ( "ETL_TOOLS" )
-
-etl_lib_path = ETL_TOOLS + "/lib/etltools/"
-
-#configfile: "../../snake-config.yaml"
-include: etl_lib_path + "/getfiles.snakefile"
-
-ODX2RDF = os.getenv ( "ODX2RDF" )
+AG_LIB = os.getenv ( "AG_LIB" )
+etl_lib_path = AG_LIB + "/etltools"
 JENA_HOME = os.getenv ( "JENA_HOME" )
+OXL2NEO_HOME = os.getenv ( "OXL2NEO_HOME" )
 
-onto_cfg.init_config ( config )
-
-TEST_OXL = ODX2RDF + "/examples/text_mining.oxl"
-TEST_RDF = ETL_OUT + "/test/knetminer-sample.ttl"
+TEST_RDF_URL = "https://github.com/Rothamsted/knetminer-backend/blob/master/test-data-server/src/main/resources/poaceae-sample.ttl.bz2?raw=true"
+TEST_RDF = ETL_OUT + "/test/knetminer-sample.ttl.bz2"
 TEST_TDB = ETL_OUT + "/test/test-tdb"
 MAPPING_OUT = ETL_OUT + "/test/knetminer-mapping-test-out.nt"
 
+ontos_dir = ETL_OUT + "/test/ontologies"
+
+
 rule all:
   input:
     TEST_TDB
@@ -36,23 +33,33 @@ rule all:
     )
 
 rule generate_tdb:
-  input: onto_cfg.OUT_FILES + [ TEST_RDF, "../../../agri-schema.ttl" ]
-  output: directory ( TEST_TDB )
+  input:
+    ontos_dir,
+    "../../../agri-schema.ttl",
+    TEST_RDF
+  output:
+    directory ( TEST_TDB )
   message:
     "Generating Test TDB"
   shell:
-    f"'{JENA_HOME}/bin/tdbloader'" + " --loc={output} {input}"
-
+    f"'{JENA_HOME}/bin/tdbloader' --loc={{output}} {ontos_dir}/*.* {ontos_dir}/ext/*.* ../../../agri-schema.ttl {TEST_RDF}"
 
-rule generate_rdf:
-  input:
-    TEST_OXL
+rule download_rdf:
   output:
     TEST_RDF
   message:
-    "Generating RDF from the test OXL"
+    "Getting Poaceae Sample Dataset RDF"
   shell:
-    f"'{ODX2RDF}/odx2rdf.sh'" + " {input} {output}"
+    f'wget -O "{{output}}" "{TEST_RDF_URL}"'
+
+rule download_ontos:
+  output:
+    directory ( ontos_dir )
+  message:
+    "Getting Ontologies"
+  run:
+    shell ( f'mkdir -p "{ontos_dir}"' )
+    shell ( f'"{OXL2NEO_HOME}/get_ontologies.sh" "{{output}}"' )
 
 
 rule clean:
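The unit test further down drives this workflow with a plain snakemake call; the snippet below mirrors run_mappings() in sample_data_test.py and assumes the environment from brandizi-env.sh has already been sourced:

import os

# Same invocation used by run_mappings() in sample_data_test.py (next file).
if os.system ( "snakemake --verbose --snakefile sample-data-build.snakefile --cores all" ) != 0:
    raise ChildProcessError ( "Mapping workflow execution failed" )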

dfw-dataset/knetminer/test/sample_data_test.py  (+31 -23)

@@ -1,9 +1,9 @@
-import os, sys
+import os
 from os.path import dirname, abspath
-from etltools import sparqlmap
-from etltools.utils import get_jena_home, sparql_ask
+from etltools.utils import sparql_ask, DEFAULT_NAMESPACES
 import rdflib
 import unittest
+from zipfile import bz2
 
 graph = None
 
@@ -16,23 +16,33 @@ def run_mappings ():
   os.chdir ( mydir )
 
   print ( "Running mapping workflow" )
-  if os.system ( "snakemake --snakefile sample-data-build.snakefile --configfile ../../snake-config.yaml --cores all" ) != 0:
+  if os.system ( "snakemake --verbose --snakefile sample-data-build.snakefile --cores all" ) != 0:
     raise ChildProcessError ( "Mapping workflow execution failed" )
 
   ETL_OUT = os.getenv ( "ETL_OUT" )
   DFW_ETL = os.getenv ( "DFW_ETL" )
 
+  """
+  the bz2 can be used for verifications that need the original data, however, the tests
+  below are written against the output only, since these loadings are slow.
+  """
   out_names = [
-    "knetminer-mapping-test-out.nt",
-    "knetminer-sample.ttl"
+    # "knetminer-sample.ttl.bz2",
+    "knetminer-mapping-test-out.nt"
   ]
   for oname in out_names:
     out_path = ETL_OUT + "/test/" + oname
     print ( "--- Loading result from '%s'" % out_path )
+    if oname.endswith ( ".bz2" ):
+      with bz2.open ( out_path, mode = "rt" ) as fh:
+        graph.parse ( source = fh )
+      continue
     graph.parse ( out_path, format = "turtle" )
 
   graph.parse ( DFW_ETL + "/../agri-schema.ttl", format = "turtle" )
 
+
+
   print ( "----- Test Initialised -----\n\n" )
 
 
@@ -43,18 +53,18 @@ def __init__ ( self, methodName ):
     super().__init__ ( methodName )
 
   def assert_sparql ( self, ask_query, msg ):
-    self.assertTrue ( sparql_ask ( graph, ask_query ), msg )
+    self.assertTrue ( sparql_ask ( graph, ask_query, DEFAULT_NAMESPACES ), msg )
 
   def test_pref_name ( self ):
     for p in [ "rdfs:label", "schema:name", "skos:prefLabel" ]:
       self.assert_sparql (
-        "ASK { bkr:to_0002682 %s 'plant cell shape'}" % p,
+        "ASK { bkr:trait_to_0006001 %s 'salt tolerance'}" % p,
         "%s not inferred!" % p
       )
 
   def test_name ( self ):
-    s = "bkr:to_0000387"
-    l = "plant phenotype"
+    s = "bkr:gene_at1g71100_locus_2026296"
+    l = "ribose 5-phosphate isomerase"
     for p in [ "rdfs:label", "skos:altLabel" ]:
       self.assert_sparql (
         "ASK { %s %s '%s'}" % (s, p, l),
@@ -68,19 +78,19 @@ def test_name ( self ):
 
   def test_bioschema_Protein ( self ):
     self.assert_sparql (
-      "ASK { bkr:protein_q0d6f4 a bioschema:Protein }",
+      "ASK { bkr:protein_p07519 a bioschema:Protein }",
       "bioschema:Protein not inferred!"
     )
 
   def test_bioschema_Publication ( self ):
-    pmid = "18089549"
+    pmid = "3558409"
 
     for po in [
       "a agri:ScholarlyPublication",
-      "dcterms:title 'The Rice Annotation Project Database (RAP-DB): 2008 update.'",
-      "dcterms:identifier '%s'" % pmid,
-      "dcterms:issued 2008",
-      "schema:datePublished 2008"
+      "dcterms:title ?title",
+      "dcterms:identifier '%s'" % pmid, # TODO not in the sample dataset
+      "dcterms:issued 1987",
+      "schema:datePublished 1987"
     ]:
       self.assert_sparql (
        "ASK { bkr:publication_%s %s }" % (pmid, po),
@@ -89,15 +99,13 @@ def test_bioschema_Publication ( self ):
 
     self.assert_sparql (
       """ASK { bkr:publication_%s
-        bka:Abstract ?abs;
         schema:abstract ?abs;
         dcterms:description ?abs
       }""" % pmid,
       "abstract properties not inferred!"
     )
     self.assert_sparql (
       """ASK { bkr:publication_%s
-        bka:AUTHORS ?authors;
         dcterms:creator ?authors;
         agri:authorsList ?authors
       }""" % pmid,
@@ -116,26 +124,26 @@ def test_name_is_not_title ( self ):
 
   def test_bioschema_isPartOf ( self ):
     self.assert_sparql (
-      "ASK { bkr:to_0000804 schema:isPartOf bkr:to_0006031 }",
+      "ASK { bkr:gene_6652998 schema:isPartOf bkr:path_6645240 }",
       "schema:isPartOf not inferred!"
     )
 
   def test_agri_evidence ( self ):
     self.assert_sparql (
-      "ASK { bkr:publication_16240171 agri:evidence bk:IMPD }",
+      "ASK { bkr:publication_28380544 agri:evidence bk:IMPD }",
       "agri:evidence not inferred!"
     )
 
   def test_dc_source ( self ):
     self.assert_sparql (
-      "ASK { bkr:publication_16240171 dc:source bk:NLM_UNIPROTKB }",
+      "ASK { bkr:publication_12472693 dc:source bk:NLM_UNIPROTKB }",
       "dc:source not inferred!"
     )
 
   def test_mentions ( self ):
     self.assert_sparql (
-      """ASK { bkr:publication_26473199
-        schema:mentions bkr:protein_q6h5l4, bkr:protein_q0ddi1
+      """ASK { bkr:publication_22399647
+        schema:mentions bkr:gene_bradi_3g39910v3, bkr:gene_horvu6hr1g085710
       }""",
       "schema:mentions not inferred!"
     )
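The assertions above go through sparql_ask ( graph, query, DEFAULT_NAMESPACES ) from etltools.utils. A minimal sketch of what such a helper could look like with rdflib, assuming DEFAULT_NAMESPACES is a string of SPARQL PREFIX declarations built from dfw-dataset/namespaces.ttl (illustration only, not the library's actual code):

import rdflib

# Hypothetical stand-in for etltools.utils.DEFAULT_NAMESPACES: a SPARQL prologue
# with a few of the prefixes declared in dfw-dataset/namespaces.ttl.
DEFAULT_NAMESPACES = """
PREFIX bkr: <http://knetminer.org/data/rdf/resources/>
PREFIX bk: <http://knetminer.org/data/rdf/terms/biokno/>
PREFIX agri: <http://agrischemas.org/>
PREFIX schema: <http://schema.org/>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
"""

def sparql_ask ( graph, ask_query, namespaces = "" ):
    """Runs an ASK query with the given namespace prologue and returns its boolean result."""
    return bool ( graph.query ( namespaces + "\n" + ask_query ).askAnswer )

# Usage, in the same style as the tests above:
graph = rdflib.Graph ()
# graph.parse ( "output/test/knetminer-mapping-test-out.nt" )
print ( sparql_ask ( graph, "ASK { bkr:publication_12472693 dc:source bk:NLM_UNIPROTKB }", DEFAULT_NAMESPACES ) )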

dfw-dataset/namespaces.ttl  (+12 -1)

@@ -1,3 +1,14 @@
-prefix bk: <http://knetminer.org/data/rdf/terms/biokno/>
 prefix agres: <http://agrischemas.org/resources/>
 prefix agGraph: <http://agrischemas.org/graphs/>
+prefix bk: <http://knetminer.org/data/rdf/terms/biokno/>
+prefix bkr: <http://knetminer.org/data/rdf/resources/>
+prefix bka: <http://knetminer.org/data/rdf/terms/biokno/attributes/>
+prefix bkg: <http://knetminer.org/data/rdf/resources/graphs/>
+prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+prefix owl: <http://www.w3.org/2002/07/owl#>
+prefix dc: <http://purl.org/dc/elements/1.1/>
+prefix dcterms: <http://purl.org/dc/terms/>
+prefix agri: <http://agrischemas.org/>
+prefix bioschema: <http://bioschemas.org/>
+prefix schema: <http://schema.org/>
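These prefixes back the short names used throughout the mapping queries and tests (bkr:, agri:, schema:, ...). If the same shorthand is wanted on an rdflib graph, for example to get readable serialisations, the bindings can be mirrored like this (illustrative sketch, not part of the commit):

import rdflib

# A few of the prefixes from dfw-dataset/namespaces.ttl, mirrored as rdflib bindings.
PREFIXES = {
    "bkr": "http://knetminer.org/data/rdf/resources/",
    "bk": "http://knetminer.org/data/rdf/terms/biokno/",
    "agri": "http://agrischemas.org/",
    "bioschema": "http://bioschemas.org/",
    "schema": "http://schema.org/"
}

graph = rdflib.Graph ()
for prefix, uri in PREFIXES.items ():
    graph.bind ( prefix, rdflib.Namespace ( uri ) )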

dfw-dataset/snake-config.yaml  (-34)
This file was deleted.
File renamed without changes.

lib/default-env.sh  (+1)

@@ -7,3 +7,4 @@ if [[ ! "$PYTHONPATH" =~ "$AG_LIB" ]]; then
 fi
 
 . etltools/default-env.sh
+cd "$AG_LIB"

lib/etltools/__init__.py  (+1 -1)

@@ -4,5 +4,5 @@
 A set of Python utilities and tools to build data extraction,
 transformation and loading (ETL) pipelines.
 
-@author Marco Brandizi
+:author: Marco Brandizi
 """

lib/etltools/sparqlmap.py  (+1 -1)

@@ -3,7 +3,7 @@
 
 These employ SPARQL CONSTRUCT to build RDF-to-RDF mapping.
 
-@author Marco Brandizi
+:author: Marco Brandizi
 """
 

lib/etltools/sparqlmap_test.py  (+1 -1)

@@ -56,7 +56,7 @@ def __init__ ( self, methodName ):
     super().__init__ ( methodName )
 
   def assert_sparql ( self, ask_query, msg ):
-    self.assertTrue ( sparql_ask ( graph, ask_query ), msg )
+    self.assertTrue ( sparql_ask ( graph, ask_query, DEFAULT_NAMESPACES ), msg )
 
   def test_subclass ( self ):
     self.assert_sparql ( "ASK { ex:b a ex1:SuperB }", "super-class not inferred!" )
File renamed without changes.
File renamed without changes.
File renamed without changes.
