CDCgov
diff --git a/‎README.Rmd
+1-1 b/‎README.Rmd
+1-1
diff --git a/‎README.md
+1-1 b/‎README.md
+1-1
diff --git a/‎config/gisaid/gisaid_RSV_schema.py
+316 b/‎config/gisaid/gisaid_RSV_schema.py
+316
diff --git a/‎docs/app.json
+1-1 b/‎docs/app.json
+1-1
diff --git a/‎gisaid_handler.py
+1-1 b/‎gisaid_handler.py
+1-1
diff --git a/‎settings.py
+1-1 b/‎settings.py
+1-1
@@ -26,7 +26,7 @@ github_pages_url <- description$GITHUB_PAGES
 
 <p style="font-size: 16px;"><em>Public Database Submission Pipeline</em></p>
 
-**Beta Version**: v1.2.6. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome! 
+**Beta Version**: v1.2.7. This pipeline is currently in Beta testing, and issues could appear during submission. Please use it at your own risk. Feedback and suggestions are welcome! 
 
 **General Disclaimer**: This repository was created for use by CDC programs to collaborate on public health related projects in support of the [CDC mission](https://www.cdc.gov/about/organization/mission.htm).  GitHub is not hosted by the CDC, but is a third party website used by CDC and its partners to share information and collaborate on software. CDC use of GitHub does not imply an endorsement of any one particular service, product, or enterprise.
 
 
@@ -9,7 +9,7 @@
 
 <!-- ![GitHub last commit](https://img.shields.io/github/last-commit/montilab/cadra) -->
 
-**Beta Version**: 1.2.6. This pipeline is currently in Beta testing, and
+**Beta Version**: 1.2.7. This pipeline is currently in Beta testing, and
 issues could appear during submission. Please use it at your own risk.
 Feedback and suggestions are welcome\!
 
 
@@ -0,0 +1,316 @@
+from pandera import DataFrameSchema, Column, Check, Index, MultiIndex
+
+schema = DataFrameSchema(
+	columns={
+		"sequence_name": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=True,
+			coerce=False,
+			required=True,
+			description="Sequence identifier used in fasta file. This is used to create the fasta file for Genbank or GISAID by updating the sequence name in your fasta file to reflect the sample name for the specified database.",
+			title="sequence name",
+		),
+		"gs-sample_name": Column(
+			dtype="object",
+			checks=[
+				Check.str_length(min_value=1,max_value=50),
+			],
+			nullable=False,
+			unique=True,
+			coerce=False,
+			required=True,
+			description="Identifier name used for GISAID. Max length is 50 characters. This field is the same as \"rsv_sequence_name\" in GISAID's metadata template.",
+			title="sample name",
+		),
+		"gs-rsv_subtype": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="For RSV, there are two subtypes, \"RSV-A\" or \"RSV-B\".",
+			title="virus subtype",
+		),
+		"gs-rsv_passage": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="\"Original\" if the sample was sequenced directly from swabs, otherwise add the name of the cell line (e.g., \"Vero\") used to culture the specimen.",
+			title="passage",
+		),
+		"gs-rsv_location": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Format as \"Continent / Country / Region / Sub-region\".",
+			title="location",
+		),
+		"gs-rsv_add_location": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Additional location information (e.g. Cruise Ship, Convention, Live animal market).",
+			title="additional location information",
+		),
+		"gs-rsv_host": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Host species name. For Wastewater use \"Environment\".",
+			title="host",
+		),
+		"gs-rsv_add_host_info": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Additional information regarding patient (e.g. Patient infected while interacting with animal).",
+			title="Additional host information",
+		),
+		"gs-rsv_sampling_strategy": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Sampling strategy for sequence (e.g. Sentinel surveillance (ILI), Sentinel surveillance (ARI), Sentinel surveillance (SARI), Non-sentinel-surveillance (hospital), Non-sentinel-surveillance (GP network), Longitudinal sampling on same patient(s), S gene dropout).",
+			title="sampling strategy",
+		),
+		"gs-rsv_gender": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"(?i)(\W|^)(male|m|female|f|unknown|missing)(\W|$)"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Synonym for \"Biological sex\". Should be \"Female\", \"Male\", or \"Unknown\".",
+			title="gender",
+		),
+		"gs-rsv_patient_age": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Age in years of the person from whom the specimen was collected. May take format other than numeric years, for example, \"0.5\" (i.e., 6 months), \"5 days\", \"7 months\". If units are not given, they are assumed in years. If missing, use \"Unknown\".",
+			title="patient age",
+		),
+		"gs-rsv_patient_status": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="E.g., \"Hospitalized\", \"Released\", \"Live\", \"Deceased\", \"Unknown\".",
+			title="patient status",
+		),
+		"gs-rsv_specimen": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Specimen source. For wastewater it must be \"Wastewater surveillance\".",
+			title="specimen source",
+		),
+		"gs-rsv_outbreak": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Outbreak information (Date, Location e.g. type of gathering, Family cluster, etc.).",
+			title="outbreak information",
+		),
+		"gs-rsv_last_vaccinated": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Provide details if applicable.",
+			title="last vaccinated",
+		),
+		"gs-rsv_treatment": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Provide details if applicable (e.g. Drug name, dosage).",
+			title="treatment",
+		),
+		"gs-rsv_seq_technology": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Add the sequencer brand and model (e.g. Illumina MiSeq, Sanger, Nanopore MinION).",
+			title="sequencing technology",
+		),
+		"gs-rsv_assembly_method": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Genome assembly algorithm (e.g. CLC Genomics Workbench 12, Geneious 10.2.4, SPAdes/MEGAHIT v1.2.9, UGENE v. 33).",
+			title="assembly method",
+		),
+		"gs-rsv_coverage": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="Average genome coverage (e.g. 50x, 100x, 1,000x).",
+			title="average coverage",
+		),
+		"gs-rsv_orig_lab": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Full name of laboratory from where sample originated.",
+			title="originating lab",
+		),
+		"gs-rsv_orig_lab_addr": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Complete building address of laboratory from where sample originated.",
+			title="originating lab address",
+		),
+		"gs-rsv_provider_sample_id": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="ID used by originating lab.",
+			title="provider sample id",
+		),
+		"gs-rsv_subm_lab": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Full name of laboratory submitting this record to GISAID.",
+			title="submitting lab",
+		),
+		"gs-rsv_subm_lab_addr": Column(
+			dtype="object",
+			checks=[
+				Check.str_matches(r"^(?!\s*$).+"),
+			],
+			nullable=False,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Complete building address of the submitting laboratory.",
+			title="submitting lab address",
+		),
+		"gs-rsv_subm_sample_id": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=False,
+			description="ID used by submitting lab.",
+			title="submitter sample id",
+		),
+		"gs-rsv_comment": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Leave blank.",
+			title="comment",
+		),
+		"gs-comment_type": Column(
+			dtype="object",
+			checks=None,
+			nullable=True,
+			unique=False,
+			coerce=False,
+			required=True,
+			description="Leave blank.",
+			title="comment type",
+		),
+	},
+     checks=None,
+     index=None,
+     coerce=False,
+     strict="filter",
+     name="gisaid_cov_schema",
+     ordered=False,
+     unique=None,
+     report_duplicates="all",
+     unique_column_names=True,
+     add_missing_columns=False,
+     title="seqsender GISAID COV schema",
+     description="Schema validation for GISAID SARS-COV2 database.",
+)
@@ -30,7 +30,7 @@ def create_gisaid_files(organism: str, database: str, submission_name: str, subm
 	gisaid_df.columns = gisaid_df.columns.str.replace("gs-","").str.strip()
 	# Add required GISAID fields
 	# covCLI returns an error when authors or collection_date are capitalized
-	if organism in ["COV", "POX", "ARBO"]:
+	if organism in ["COV", "POX", "ARBO", "RSV"]:
 		if organism == "COV":
 			sample_name_column = "covv_virus_name"
 		else:
 
@@ -15,7 +15,7 @@
 VERSION: str = "1.2.3 (Beta)"
 
 # Organism options with unique submission options
-ORGANISM_CHOICES: List[str] = ["FLU", "COV", "POX", "ARBO", "OTHER"]
+ORGANISM_CHOICES: List[str] = ["FLU", "COV", "POX", "ARBO", "RSV", "OTHER"]
 
 # Database submission options
 DATABASE_CHOICES: List[str] = ["BIOSAMPLE", "SRA", "GENBANK", "GISAID"]