Skip to content

Commit 7dea509

Browse files
authored
Merge pull request wtsi-npg#242 from ces/devel
Add support for loading PacBio ccs secondary analysis BAM files
2 parents d5b5189 + 8531523 commit 7dea509

18 files changed

+332
-48
lines changed

Changes

+4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11

2+
3+
- Add support for loading PacBio ccs BAM files and setting target = 1
4+
on relevant PacBio sequence files.
5+
26
Release 2.13.0
37

48
- WTSI::NPG::HTS::Illumina::ResultSet - added geno to genotype_regex.

MANIFEST

+4
Original file line numberDiff line numberDiff line change
@@ -16510,6 +16510,10 @@ t/data/pacbio/sequel_analysis/001612/tasks/barcoding.tasks.lima-0/lima_output.lb
1651016510
t/data/pacbio/sequel_analysis/001612/tasks/barcoding.tasks.lima-0/lima_output.removed.bam
1651116511
t/data/pacbio/sequel_analysis/001612/tasks/barcoding.tasks.lima-0/lima_output.removed.bam.pbi
1651216512
t/data/pacbio/sequel_analysis/001612/tasks/barcoding.tasks.lima-0/lima_output.removed.subreadset.xml
16513+
t/data/pacbio/sequel_analysis/000226/entry-points/acf46f00-12b8-45e6-bc10-b0790f8d6758.subreadset.xml
16514+
t/data/pacbio/sequel_analysis/000226/tasks/pbcoretools.tasks.auto_ccs_outputs-0
16515+
t/data/pacbio/sequel_analysis/000226/tasks/pbcoretools.tasks.auto_ccs_outputs-0/m64016_190608_025655.ccs.bam
16516+
t/data/pacbio/sequel_analysis/000226/tasks/pbcoretools.tasks.auto_ccs_outputs-0/m64016_190608_025655.ccs.bam.pbi
1651316517
t/data/pacbio/superfoo/24862_627/A01_1/Analysis_Results/m131209_183112_00127_c100579142550000001823092301191430_s1_p0.1.bax.h5
1651416518
t/data/pacbio/superfoo/24862_627/A01_1/Analysis_Results/m131209_183112_00127_c100579142550000001823092301191430_s1_p0.1.log
1651516519
t/data/pacbio/superfoo/24862_627/A01_1/Analysis_Results/m131209_183112_00127_c100579142550000001823092301191430_s1_p0.2.bax.h5

bin/npg_pacbio_analysis_monitor.pl

100644100755
File mode changed.

lib/WTSI/NPG/HTS/PacBio/Annotator.pm

+54-30
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package WTSI::NPG::HTS::PacBio::Annotator;
33
use List::AllUtils qw[uniq];
44
use Moose::Role;
55
use WTSI::NPG::iRODS::Metadata;
6+
use WTSI::DNAP::Utilities::Params qw[function_params];
67

78
our $VERSION = '';
89

@@ -13,7 +14,16 @@ with qw[
1314
=head2 make_primary_metadata
1415
1516
Arg [1] PacBio run metadata, WTSI::NPG::HTS::PacBio::Metadata.
16-
Arg [2] Is data R & D? Boolean. Optional, defaults to false.
17+
18+
Named args : data_level Processing level of data being archived
19+
e.g. primary - off instrument, secondary
20+
- subsequently post processed. Optional.
21+
is_target Is target? If false then target flag
22+
is not set. Data is not target where it
23+
is not deplexed or where data at a different
24+
data level is the default for the customer.
25+
Boolean. Defaults to true.
26+
is_r_and_d Is data R & D? Boolean. Defaults to false.
1727
1828
Example : my @avus = $ann->make_primary_metadata($metadata);
1929
Description: Return instrument, run, cell index, collection number, set
@@ -23,35 +33,49 @@ with qw[
2333
2434
=cut
2535

26-
sub make_primary_metadata {
27-
my ($self, $metadata, $is_r_and_d) = @_;
28-
29-
defined $metadata or
30-
$self->logconfess('A defined metadata argument is required');
31-
32-
my @avus;
33-
push @avus, $self->make_avu($PACBIO_CELL_INDEX, $metadata->cell_index);
34-
push @avus, $self->make_avu($PACBIO_COLLECTION_NUMBER, $metadata->collection_number);
35-
push @avus, $self->make_avu($PACBIO_INSTRUMENT_NAME, $metadata->instrument_name);
36-
push @avus, $self->make_avu($PACBIO_RUN, $metadata->run_name);
37-
push @avus, $self->make_avu($PACBIO_WELL, $metadata->well_name);
38-
push @avus, $self->make_avu($PACBIO_SAMPLE_LOAD_NAME, $metadata->sample_name);
39-
40-
# Deprecated field, used in early version of RS
41-
if ($metadata->has_set_number){
42-
push @avus, $self->make_avu($PACBIO_SET_NUMBER, $metadata->set_number);
43-
}
44-
45-
if ($is_r_and_d) {
46-
# R & D data
47-
push @avus, $self->make_avu($SAMPLE_NAME, $metadata->sample_name);
48-
}
49-
else {
50-
# Production data
51-
push @avus, $self->make_avu($PACBIO_SOURCE, $PACBIO_PRODUCTION);
52-
}
53-
54-
return @avus;
36+
{
37+
my $positional = 2;
38+
my @named = qw[data_level is_target is_r_and_d ];
39+
my $params = function_params($positional, @named);
40+
41+
sub make_primary_metadata {
42+
my ($self, $metadata) = $params->parse(@_);
43+
44+
defined $metadata or
45+
$self->logconfess('A defined meta argument is required');
46+
47+
my @avus;
48+
push @avus, $self->make_avu($PACBIO_CELL_INDEX, $metadata->cell_index);
49+
push @avus, $self->make_avu($PACBIO_COLLECTION_NUMBER, $metadata->collection_number);
50+
push @avus, $self->make_avu($PACBIO_INSTRUMENT_NAME, $metadata->instrument_name);
51+
push @avus, $self->make_avu($PACBIO_RUN, $metadata->run_name);
52+
push @avus, $self->make_avu($PACBIO_WELL, $metadata->well_name);
53+
push @avus, $self->make_avu($PACBIO_SAMPLE_LOAD_NAME, $metadata->sample_name);
54+
55+
if ($params->data_level) {
56+
push @avus, $self->make_avu($PACBIO_DATA_LEVEL, $params->data_level);
57+
}
58+
59+
# Deprecated field, used in early version of RS
60+
if ($metadata->has_set_number){
61+
push @avus, $self->make_avu($PACBIO_SET_NUMBER, $metadata->set_number);
62+
}
63+
64+
if ($params->is_r_and_d) {
65+
# R & D data
66+
push @avus, $self->make_avu($SAMPLE_NAME, $metadata->sample_name);
67+
}
68+
else {
69+
# Production data
70+
push @avus, $self->make_avu($PACBIO_SOURCE, $PACBIO_PRODUCTION);
71+
}
72+
73+
if ($params->is_target || !defined $params->is_target) {
74+
push @avus, $self->make_avu($TARGET, 1);
75+
}
76+
77+
return @avus;
78+
}
5579
}
5680

5781
=head2 make_secondary_metadata

lib/WTSI/NPG/HTS/PacBio/Metadata.pm

+6-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,12 @@ has 'results_folder' =>
7676
predicate => 'has_results_folder',
7777
documentation => 'The results folder');
7878

79-
79+
has 'is_ccs' =>
80+
(isa => 'Str',
81+
is => 'ro',
82+
required => 0,
83+
predicate => 'has_is_ccs',
84+
documentation => 'Is the PacBio data ccs');
8085

8186

8287
around BUILDARGS => sub {

lib/WTSI/NPG/HTS/PacBio/RunPublisher.pm

+5-1
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,11 @@ sub publish_basx_files {
442442
$num_files = $num_processed = $num_errors = scalar @{$files};
443443
}
444444
else {
445-
my @primary_avus = $self->make_primary_metadata($metadata, $is_r_and_d);
445+
my $is_target = $is_r_and_d ? 0 : 1;
446+
my @primary_avus = $self->make_primary_metadata
447+
($metadata,
448+
is_target => $is_target,
449+
is_r_and_d => $is_r_and_d);
446450
my @secondary_avus = $self->make_secondary_metadata(@run_records);
447451
my @extra_avus = $self->make_avu($FILE_TYPE, 'bas');
448452

lib/WTSI/NPG/HTS/PacBio/Sequel/APIClient.pm

-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ use Moose;
88
use MooseX::StrictConstructor;
99
use URI;
1010
use URI::Split qw(uri_join);
11-
use Readonly;
1211
use JSON;
1312

1413
with qw[

lib/WTSI/NPG/HTS/PacBio/Sequel/AnalysisPublisher.pm

+24-9
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,14 @@ our $METADATA_SET = 'subreadset';
2525
# Location of source metadata file
2626
our $ENTRY_DIR = 'entry-points';
2727

28-
our $NOT_DEPLEXED = '\.removed\.';
29-
3028
# Well directory pattern
3129
our $WELL_DIRECTORY_PATTERN = '\d+_[A-Z]\d+$';
3230

31+
# Additional sequence filenames permitted for loading
32+
our @FNAME_PERMITTED = qw[removed ccs];
33+
34+
# Data processing level
35+
our $DATA_LEVEL = 'secondary';
3336

3437
has 'analysis_path' =>
3538
(isa => 'Str',
@@ -106,16 +109,23 @@ sub publish_sequence_files {
106109
if ($tag_id) {
107110
@tag_records = $self->find_pacbio_runs
108111
($self->_metadata->run_name, $self->_metadata->well_name, $tag_id);
112+
} else {
113+
$self->_is_allowed_fname($file) or
114+
$self->logcroak("Unexpected file name for $file");
109115
}
110116

111117
my @records =
112118
(@tag_records == 1) ?
113119
@tag_records :
114120
$self->find_pacbio_runs($self->_metadata->run_name,
115121
$self->_metadata->well_name);
116-
117122
if (@records >= 1) {
118-
my @primary_avus = $self->make_primary_metadata($self->_metadata);
123+
my $is_target = @records > 1 ? 0 : 1;
124+
125+
my @primary_avus = $self->make_primary_metadata
126+
($self->_metadata,
127+
data_level => $DATA_LEVEL,
128+
is_target => $is_target);
119129
my @secondary_avus = $self->make_secondary_metadata(@records);
120130

121131
my ($a_files, $a_processed, $a_errors) =
@@ -246,14 +256,16 @@ sub _get_tag_from_fname {
246256
my ($bc1, $bc2) = ($1, $2);
247257
$tag_id = ($bc1 == $bc2) ? $bc1 : undef;
248258
}
249-
250-
defined ($tag_id || $file =~ /$NOT_DEPLEXED/smx) or
251-
$self->logcroak("Unexpected deplexed file name : $file");
252-
253259
return $tag_id;
254260
}
255261

256-
sub _dest_path{
262+
sub _is_allowed_fname {
263+
my ($self, $file) = @_;
264+
my @exists = grep { $file =~ m{[.] $_ [.]}smx } @FNAME_PERMITTED;
265+
return @exists == 1 ? 1 : 0;
266+
}
267+
268+
sub _dest_path {
257269
my ($self) = @_;
258270

259271
@{$self->smrt_names} == 1 or
@@ -278,6 +290,9 @@ WTSI::NPG::HTS::PacBio::Sequel::AnalysisPublisher
278290
279291
Publishes relevant files to iRODS, adds metadata and sets permissions.
280292
293+
This module is suitable for loading auto secondary analysis output from
294+
demultiplex jobs, ccs analysis and combined demultiplex+ccs analysis.
295+
281296
Since SMRT Link v7 deplexing jobs have produced BAM files for identified
282297
barcode tags and also files named removed.bam (equivalent to tag zero
283298
in Illumina) which contain the reads not assigned to any tag. Expected

lib/WTSI/NPG/HTS/PacBio/Sequel/MetaXMLParser.pm

+7
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ our $CELL_INDEX_TAG = 'CellIndex';
2929
our $OUTPUT_TAG = 'OutputOptions';
3030
our $RFOLDER_TAG = 'ResultsFolder';
3131

32+
our $IS_CCS_TAG = 'IsCCS';
3233

3334

3435
=head2 parse_file
@@ -76,6 +77,9 @@ sub parse_file {
7677
my $results_folder =
7778
$output->getElementsByTagName($prefix . $RFOLDER_TAG)->[0]->string_value;
7879

80+
my $is_ccs = $dom->getElementsByTagName($prefix . $IS_CCS_TAG) ?
81+
$dom->getElementsByTagName($prefix . $IS_CCS_TAG)->[0]->string_value : 0;
82+
7983
return WTSI::NPG::HTS::PacBio::Metadata->new
8084
(file_path => $file_path,
8185
instrument_name => $instrument_name,
@@ -86,6 +90,7 @@ sub parse_file {
8690
collection_number => $collection_number,
8791
cell_index => $cell_index,
8892
results_folder => $results_folder,
93+
is_ccs => $is_ccs,
8994
);
9095
}
9196

@@ -106,6 +111,8 @@ WTSI::NPG::HTS::PacBio::Sequel::MetaXMLParser
106111
Parser for the Sequel PacBio metadata XML file(s) found in each SMRT
107112
cell subdirectory of completed run data.
108113
114+
Some fields, e.g. IsCCS, are only found in XMLs from runs using ICS version 7+.
115+
109116
=head1 AUTHOR
110117
111118
Guoying Qi E<lt>[email protected]E<gt>

lib/WTSI/NPG/HTS/PacBio/Sequel/RunPublisher.pm

+11-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ our $FILE_PREFIX_PATTERN = 'm\d+_\d+_\d+';
2626
# Well directory pattern
2727
our $WELL_DIRECTORY_PATTERN = '\d+_[A-Z]\d+$';
2828

29+
# Data processing level
30+
our $DATA_LEVEL = 'primary';
31+
2932
override '_build_directory_pattern' => sub {
3033
my ($self) = @_;
3134

@@ -150,7 +153,14 @@ sub publish_sequence_files {
150153
": publishing '$smrt_name' as R and D data");
151154
}
152155

153-
my @primary_avus = $self->make_primary_metadata($metadata, $is_r_and_d);
156+
my $is_target =
157+
($metadata->is_ccs eq 'true' || @run_records > 1 || $is_r_and_d) ? 0 : 1;
158+
159+
my @primary_avus = $self->make_primary_metadata
160+
($metadata,
161+
data_level => $DATA_LEVEL,
162+
is_target => $is_target,
163+
is_r_and_d => $is_r_and_d);
154164
my @secondary_avus = $self->make_secondary_metadata(@run_records);
155165

156166
my $files = $self->list_sequence_files($smrt_name);

t/data/pacbio/sequel/r54097_20170727_165601/1_A02/m54097_170727_170646.subreadset.xml

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
<WellName>A02</WellName>
3535
<Concentration>0</Concentration>
3636
<InsertSize>20000</InsertSize>
37+
<IsCCS>false</IsCCS>
3738
<SampleReuseEnabled>false</SampleReuseEnabled>
3839
<StageHotstartEnabled>false</StageHotstartEnabled>
3940
<SizeSelectionEnabled>false</SizeSelectionEnabled>

0 commit comments

Comments
 (0)