-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmpn_RDF.pl
executable file
·231 lines (191 loc) · 7.65 KB
/
mpn_RDF.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/usr/bin/env perl
# Text::TogoAnnotatorを利用したバージョン
# yayamamo 2014/06/12
# 注意点: 従来プログラムではretrieve内でqueryが空のときは'hypothetical protein'を与えていたが、削除。
# : 呼出し側で行うことを想定する。
# Yasunori Yamamoto / Database Center for Life Science
# 2013.05.20
# Takatomo Fujisawa / National Institute of Genetics
# 2013.12.25
# sample:
# perl match_protein_names.pl -f data/test/annotation_default.ttl
# perl match_protein_names.pl -f data/test/genes.txt
# -d: dictionary (default: nite_ALL_130517.txt)
# -f: ta_query file
# -m: cs_max (default: 5)
# -n: column No. of id and entity in tsv (default: 1,6 for genes.txt)
# -o: output format (tsv or ttl)
# -m: cs_max (default: 5) # TogoAnnotator
# -t: cos_threshold (default: 3) # TogoAnnotator
# -v: verbose mode
use warnings;
use strict;
use Fatal qw/open/;
# use simstring;
use Getopt::Std;
use File::Basename;
use File::Spec;
use JSON;
use DateTime;
use File::Path 'mkpath';
#use lib qw(/opt/services2/togoannot/togoannotator);
use FindBin qw($Bin);
use lib "$Bin";
use Text::TogoAnnotator;
### setting for genome refine #################
my $base_uri = 'http://genome.microbedb.jp/';
# /var/genome_conf/staging/tconf/236/data_sources/GR16/genome.ttl
#my $genome_conf = '/var/genome_conf/staging/tconf';
################################################
use Data::UUID;
use Date::Format;
our $ug = new Data::UUID;
our ($opt_t, $opt_m, $opt_d, $opt_f, $opt_c, $opt_n, $opt_o, $opt_v);
getopt('tmdfon'); # -tm take arg. Sets $opt_t, $opt_m as a side effect.
my $ta_dictionary = $opt_d || "dictionary/nite_ALL_130517.txt"; # d: dictionary
my $ta_query = $opt_f || '/var/genome_conf/staging/tconf/235/data_sources/GR15/annotation_default.ttl';
my ($filename, $path, $suffix) = fileparse($ta_query, qw/tsv csv txt ttl/);
grep(/^$suffix$/, qw/tsv csv txt ttl/) or die "use csv, tsv or ttl format";
$opt_o ||= $suffix;
$opt_n ||= "1,6";
my($id_n, $entity_n) = split /,/, $opt_n;
my $sysroot = File::Spec->rel2abs(dirname(dirname(__FILE__)));
my $cos_threshold = 0.6; # cosine距離で類似度を測る際に用いる閾値。この値以上類似している場合は変換対象の候補とする。
my $e_threashold = 30; # E列での表現から候補を探す場合、辞書中での最大出現頻度がここで指定する数未満の場合のもののみを対象とする。
my $cs_max = 5; # 複数表示する候補が在る場合の最大表示数
my $n_gram = 3; # 3: trigram
$cos_threshold = $opt_t if $opt_t;
$cs_max = $opt_m if $opt_m;
#print "#th:", $cos_threshold, ", dm:", $cs_max, "\n";
print '@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix oa: <http://www.w3.org/ns/oa#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix cnt: <http://www.w3.org/2011/content#> .
@prefix gref: <http://genome.microbedb.jp/ontologies/genomerefine#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
'."\n";
Text::TogoAnnotator->init($cos_threshold, $e_threashold, $cs_max, $n_gram, $sysroot, $ta_dictionary, "", 1, "nite_ALL_130517");
Text::TogoAnnotator->openDicts;
### input ###
if($suffix eq 'ttl'){
use warnings;
use RDF::Trine;
use RDF::Query;
my $model = RDF::Trine::Model->temporary_model;
my $parser = RDF::Trine::Parser->new( 'turtle' );
$parser->parse_file_into_model( $base_uri, $ta_query, $model );
my $sparql =<<EOQ;
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix oa: <http://www.w3.org/ns/oa#>
prefix cnt: <http://www.w3.org/2011/content#>
prefix prov: <http://www.w3.org/ns/prov#>
prefix foaf: <http://xmlns.com/foaf/0.1/>
prefix gref: <http://genome.microbedb.jp/ontologies/genomerefine#>
SELECT
?target ?content
FROM
<http://genome.microbedb.jp/graph/annotation>
WHERE
{
?annotation oa:hasTarget ?target.
?annotation oa:hasBody ?body.
?body cnt:chars ?content.
}
EOQ
my $query = RDF::Query->new( $sparql );
my $iterator = $query->execute( $model );
while (my $row = $iterator->next) {
#$row->{ 'product' }->as_string ||= 'hypothetical protein';
my $content = $row->{ 'content' }->as_string;
my $target = $row->{ 'target' }->as_string;
$content =~s/(^"|"$)//g; #Todo
$content =~s/\n/ /g; #Todo
&output(Text::TogoAnnotator->retrieve($content), $target);
#print $row->{ 'id' }->as_string ."\t" if ($opt_v);
}
}else{
use IO::File;
use Text::CSV_XS;
use Data::Dumper;
my $fh = IO::File->new($ta_query) or die 'cannot open file: '.Text::CSV->error_diag ();
my $csv = Text::CSV_XS->new({binary => 1});
my $i =0;
until ($fh->eof) {
my $line = <$fh>;
chomp($line);
my @vals = split /\t/, $line;
my $columns = \@vals;
next if @$columns == 0;
print join("\t",(@$columns))."\t" if ($opt_v and $opt_o ne 'ttl');
my $query = $columns->[$entity_n];
&output(Text::TogoAnnotator->retrieve($query), $columns->[$id_n]);
}
$fh->close;
}
Text::TogoAnnotator->closeDicts;
sub output {
my $h = shift;
$h->{'id'} = shift;
if($opt_o eq 'json'){
print encode_json($h) ."\n";
}elsif($opt_o eq 'ttl'){
#default
if ($opt_v and $opt_o eq 'ttl'){
print '<'.$base_uri.'236/GR16/genes/'.$h->{'id'}. '#product mdb:annotation [ rdfs:label "'.$h->{'query'}.'" ;
rdfs:comment "generated by MiGAP" ;
rdf:type mdb:AnnotationDeafualt ].'."\n";
}
#if(0){
#togoannotator
# print $h->{'id'}. ' mdb:definition [ rdfs:label "'.$h->{'result'}.'" ;
# rdfs:comment "'.$h->{'match'}.': generated by TogoAnnotator" ;
# rdfs:comment "'. $h->{'info'}.'" ;
# rdf:type mdb:AnnotationAuto ;
# skos:prefLabel "'.$h->{'result'}.'" ]'."\n";
my $uuid1 = $ug->create_str();
my $uuid2 = $ug->create_str();
my $dt = DateTime->now();
my $date = $dt->datetime().'Z';
#print Dumper $date;
print <<EOF;
<urn:uuid:$uuid1> a oa:Annotation;
oa:hasTarget $h->{'id'};
oa:hasBody <urn:uuid:$uuid2> ;
oa:annotatedBy <http://dbcls.jp/togoannotator>.
oa:annotatedAt "$date"^^xsd:dateTime ;
oa:serializedBy <http://genome.annotation.jp/genomerefine>.
oa:serializedAt "$date"^^xsd:dateTime ;
<urn:uuid:$uuid2> a cnt:ContentAsText ;
a gref:Annotated_gene_porduct ;
cnt:chars "$h->{'result'}" ;
skos:prefLabel "$h->{'result'}" ;
skos:hiddenLabel "$h->{'query'}" ;
rdfs:comment "$h->{'match'}" ;
rdfs:comment "$h->{'info'}" ;
cnt:characterEncoding "utf-8" .
<http://dbcls.jp/togoannotator> a foaf:Agent, prov:SoftwareAgent ;
foaf:name "TogoAnnotator".
<http://genome.annotation.jp/genomerefine> a foaf:Agent, prov:SoftwareAgent ;
foaf:name "GenomeRefine".
EOF
#ToDo: match ex,cs,bcs,no_hit とskos mapping
#ToDo: togoannotatorの他の候補のaltLabel処理
#}
}else{
#print join("\t",($h->{'id'}, $h->{'query'},$h->{'result'},$h->{'match'},$h->{'info'}))."\n";
print join("\t",($h->{'result'},$h->{'match'},$h->{'info'}))."\n";
}
}
__END__
=head1 NAME
Protein Definition Normalizer
=head1 SYNOPSIS
normProt.pl -t0.7
=head1 ABSTRACT
配列相同性に基いて複数のプログラムにより自動的に命名されたタンパク質名の表記を、既に人手で正規化されている表記を利用して正規形に変換する。
=head1 COPYRIGHT AND LICENSE
Copyright by Yasunori Yamamoto / Database Center for Life Science
このプログラムはフリーであり、また、目的を問わず自由に再配布および修正可能です。
=cut