Skip to content

Commit c9827fb

Browse files
committed
v1.2.7 add epiRSV to seqsender and metadata template
1 parent cb0cfe8 commit c9827fb

10 files changed

+366
-24
lines changed

README.Rmd

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ github_pages_url <- description$GITHUB_PAGES
2626

2727
<p style="font-size: 16px;"><em>Public Database Submission Pipeline</em></p>
2828

29-
**Beta Version**: v1.2.6. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome!
29+
**Beta Version**: v1.2.7. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome!
3030

3131
**General Disclaimer**: This repository was created for use by CDC programs to collaborate on public health related projects in support of the [CDC mission](https://www.cdc.gov/about/organization/mission.htm). GitHub is not hosted by the CDC, but is a third party website used by CDC and its partners to share information and collaborate on software. CDC use of GitHub does not imply an endorsement of any one particular service, product, or enterprise.
3232

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
<!-- ![GitHub last commit](https://img.shields.io/github/last-commit/montilab/cadra) -->
1111

12-
**Beta Version**: 1.2.6. This pipeline is currently in Beta testing, and
12+
**Beta Version**: 1.2.7. This pipeline is currently in Beta testing, and
1313
issues could appear during submission. Please use it at your own risk.
1414
Feedback and suggestions are welcome\!
1515

config/gisaid/gisaid_RSV_schema.py

+316
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
from pandera import DataFrameSchema, Column, Check, Index, MultiIndex
2+
3+
# Pandera schema for the GISAID RSV metadata columns.
#
# Column keys carry the "gs-" prefix (except "sequence_name"). Every column is
# validated as dtype "object" without coercion.
#
# Two check patterns recur below:
#   * r"^(?!\s*$).+"  - rejects empty or whitespace-only values.
#   * checks=None     - no content validation; used with nullable=True.
schema = DataFrameSchema(
    columns={
        # --- Sample identifiers (both must be unique across rows) ---
        "sequence_name": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=True,
            coerce=False,
            required=True,
            description="Sequence identifier used in fasta file. This is used to create the fasta file for Genbank or GISAID by updating the sequence name in your fasta file to reflect the sample name for the specified database.",
            title="sequence name",
        ),
        "gs-sample_name": Column(
            dtype="object",
            checks=[
                # Length limit stated in the column description (1-50 chars).
                Check.str_length(min_value=1, max_value=50),
            ],
            nullable=False,
            unique=True,
            coerce=False,
            required=True,
            description="Identifier name used for GISAID. Max length is 50 characters. This field is the same as \"rsv_sequence_name\" in GISAID's metadata template.",
            title="sample name",
        ),
        # --- Virus and sampling information ---
        "gs-rsv_subtype": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="For RSV, there are two subtypes, \"RSV-A\" or \"RSV-B\".",
            title="virus subtype",
        ),
        "gs-rsv_passage": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="\"Original\" if the sample was sequenced directly from swabs, otherwise add the name of the cell line (e.g., \"Vero\") used to culture the specimen.",
            title="passage",
        ),
        "gs-rsv_location": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="Format as \"Continent / Country / Region / Sub-region\".",
            title="location",
        ),
        "gs-rsv_add_location": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="Additional location information (e.g. Cruise Ship, Convention, Live animal market).",
            title="additional location information",
        ),
        # --- Host / patient information ---
        "gs-rsv_host": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="Host species name. For Wastewater use \"Environment\".",
            title="host",
        ),
        "gs-rsv_add_host_info": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="Additional information regarding patient (e.g. Patient infected while interacting with animal).",
            title="Additional host information",
        ),
        "gs-rsv_sampling_strategy": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="Sampling strategy for sequence (e.g. Sentinel surveillance (ILI), Sentinel surveillance (ARI), Sentinel surveillance (SARI), Non-sentinel-surveillance (hospital), Non-sentinel-surveillance (GP network), Longitudinal sampling on same patient(s), S gene dropout).",
            title="sampling strategy",
        ),
        "gs-rsv_gender": Column(
            dtype="object",
            checks=[
                # Case-insensitive match on one of the listed tokens,
                # delimited by a non-word character or the string boundary.
                Check.str_matches(r"(?i)(\W|^)(male|m|female|f|unknown|missing)(\W|$)"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="Synonym for \"Biological sex\". Should be \"Female\", \"Male\", or \"Unknown\".",
            title="gender",
        ),
        "gs-rsv_patient_age": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="Age in years of the person from whom the specimen was collected. May take format other than numeric years, for example, \"0.5\" (i.e., 6 months), \"5 days\", \"7 months\". If units are not given, they are assumed in years. If missing, use \"Unknown\".",
            title="patient age",
        ),
        "gs-rsv_patient_status": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="E.g., \"Hospitalized\", \"Released\", \"Live\", \"Deceased\", \"Unknown\".",
            title="patient status",
        ),
        "gs-rsv_specimen": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="Specimen source. For wastewater it must be \"Wastewater surveillance\".",
            title="specimen source",
        ),
        "gs-rsv_outbreak": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="Outbreak information (Date, Location e.g. type of gathering, Family cluster, etc.).",
            title="outbreak information",
        ),
        "gs-rsv_last_vaccinated": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="Provide details if applicable.",
            title="last vaccinated",
        ),
        "gs-rsv_treatment": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="Provide details if applicable (e.g. Drug name, dosage).",
            title="treatment",
        ),
        # --- Sequencing information ---
        "gs-rsv_seq_technology": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="Add the sequencer brand and model (e.g. Illumina MiSeq, Sanger, Nanopore MinION).",
            title="sequencing technology",
        ),
        "gs-rsv_assembly_method": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="Genome assembly algorithm (e.g. CLC Genomics Workbench 12, Geneious 10.2.4, SPAdes/MEGAHIT v1.2.9, UGENE v. 33).",
            title="assembly method",
        ),
        "gs-rsv_coverage": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="Average genome coverage (e.g. 50x, 100x, 1,000x).",
            title="average coverage",
        ),
        # --- Originating and submitting laboratories ---
        "gs-rsv_orig_lab": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="Full name of laboratory from where sample originated.",
            title="originating lab",
        ),
        "gs-rsv_orig_lab_addr": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="Complete building address of laboratory from where sample originated.",
            title="originating lab address",
        ),
        "gs-rsv_provider_sample_id": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="ID used by originating lab.",
            title="provider sample id",
        ),
        "gs-rsv_subm_lab": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="Full name of laboratory submitting this record to GISAID.",
            title="submitting lab",
        ),
        "gs-rsv_subm_lab_addr": Column(
            dtype="object",
            checks=[
                Check.str_matches(r"^(?!\s*$).+"),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            description="Complete building address of the submitting laboratory.",
            title="submitting lab address",
        ),
        "gs-rsv_subm_sample_id": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=False,
            description="ID used by submitting lab.",
            title="submitter sample id",
        ),
        # --- Comment columns: must be present, but values may be null ---
        "gs-rsv_comment": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=True,
            description="Leave blank.",
            title="comment",
        ),
        "gs-comment_type": Column(
            dtype="object",
            checks=None,
            nullable=True,
            unique=False,
            coerce=False,
            required=True,
            description="Leave blank.",
            title="comment type",
        ),
    },
    checks=None,
    index=None,
    coerce=False,
    # NOTE(review): per the pandera docs, strict="filter" drops columns that
    # are not declared above from the validated frame - confirm this is the
    # intended behavior for callers.
    strict="filter",
    # Schema-level metadata identifies this as the RSV schema (previously
    # carried the COV/SARS-COV2 labels copied from the COV schema).
    name="gisaid_rsv_schema",
    ordered=False,
    unique=None,
    report_duplicates="all",
    unique_column_names=True,
    add_missing_columns=False,
    title="seqsender GISAID RSV schema",
    description="Schema validation for GISAID RSV database.",
)

docs/app.json

+1-1
Large diffs are not rendered by default.

gisaid_handler.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def create_gisaid_files(organism: str, database: str, submission_name: str, subm
3030
gisaid_df.columns = gisaid_df.columns.str.replace("gs-","").str.strip()
3131
# Add required GISAID fields
3232
# covCLI returns an error when authors or collection_date are capitalized
33-
if organism in ["COV", "POX", "ARBO"]:
33+
if organism in ["COV", "POX", "ARBO", "RSV"]:
3434
if organism == "COV":
3535
sample_name_column = "covv_virus_name"
3636
else:

settings.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
VERSION: str = "1.2.3 (Beta)"
1616

1717
# Organism options with unique submission options
18-
ORGANISM_CHOICES: List[str] = ["FLU", "COV", "POX", "ARBO", "OTHER"]
18+
ORGANISM_CHOICES: List[str] = ["FLU", "COV", "POX", "ARBO", "RSV", "OTHER"]
1919

2020
# Database submission options
2121
DATABASE_CHOICES: List[str] = ["BIOSAMPLE", "SRA", "GENBANK", "GISAID"]

0 commit comments

Comments
 (0)