|
from pandera import DataFrameSchema, Column, Check, Index, MultiIndex

# Pandera schema describing the metadata columns validated for a GISAID RSV
# submission. Every column is validated as dtype "object" with coercion off.

# Pattern used by most mandatory columns: rejects values that are empty or
# consist only of whitespace.
_NOT_BLANK = r"^(?!\s*$).+"


def _column(
    description: str,
    title: str,
    *,
    nullable: bool,
    required: bool,
    checks: "list | None" = None,
    unique: bool = False,
) -> Column:
    """Build an ``object``-dtype column with coercion disabled.

    Args:
        description: Human-readable explanation of the field.
        title: Short display title of the field.
        nullable: Whether null values are allowed in the column.
        required: Whether the column must be present in the dataframe.
        checks: Pandera checks applied to the values, or ``None`` for none.
        unique: Whether values must be unique across rows.

    Returns:
        The configured pandera ``Column``.
    """
    return Column(
        dtype="object",
        checks=checks,
        nullable=nullable,
        unique=unique,
        coerce=False,
        required=required,
        description=description,
        title=title,
    )


def _mandatory(
    description: str,
    title: str,
    *,
    check: "Check | None" = None,
    unique: bool = False,
) -> Column:
    """Build a column that must be present and non-null.

    When ``check`` is omitted, a fresh not-blank check is attached so every
    column gets its own ``Check`` instance.
    """
    if check is None:
        check = Check.str_matches(_NOT_BLANK)
    return _column(
        description,
        title,
        nullable=False,
        required=True,
        checks=[check],
        unique=unique,
    )


def _optional(description: str, title: str) -> Column:
    """Build a column that may be absent, may be null, and has no checks."""
    return _column(description, title, nullable=True, required=False)


def _placeholder(description: str, title: str) -> Column:
    """Build a column that must be present but may be null and has no checks."""
    return _column(description, title, nullable=True, required=True)


schema = DataFrameSchema(
    columns={
        "sequence_name": _mandatory(
            description="Sequence identifier used in fasta file. This is used to create the fasta file for Genbank or GISAID by updating the sequence name in your fasta file to reflect the sample name for the specified database.",
            title="sequence name",
            unique=True,
        ),
        "gs-sample_name": _mandatory(
            description="Identifier name used for GISAID. Max length is 50 characters. This field is the same as \"rsv_sequence_name\" in GISAID's metadata template.",
            title="sample name",
            check=Check.str_length(min_value=1, max_value=50),
            unique=True,
        ),
        "gs-rsv_subtype": _mandatory(
            description="For RSV, there are two subtypes, \"RSV-A\" or \"RSV-B\".",
            title="virus subtype",
        ),
        "gs-rsv_passage": _mandatory(
            description="\"Original\" if the sample was sequenced directly from swabs, otherwise add the name of the cell line (e.g., \"Vero\") used to culture the specimen.",
            title="passage",
        ),
        "gs-rsv_location": _mandatory(
            description="Format as \"Continent / Country / Region / Sub-region\".",
            title="location",
        ),
        "gs-rsv_add_location": _optional(
            description="Additional location information (e.g. Cruise Ship, Convention, Live animal market).",
            title="additional location information",
        ),
        "gs-rsv_host": _mandatory(
            description="Host species name. For Wastewater use \"Environment\".",
            title="host",
        ),
        "gs-rsv_add_host_info": _optional(
            description="Additional information regarding patient (e.g. Patient infected while interacting with animal).",
            title="Additional host information",
        ),
        "gs-rsv_sampling_strategy": _optional(
            description="Sampling strategy for sequence (e.g. Sentinel surveillance (ILI), Sentinel surveillance (ARI), Sentinel surveillance (SARI), Non-sentinel-surveillance (hospital), Non-sentinel-surveillance (GP network), Longitudinal sampling on same patient(s), S gene dropout).",
            title="sampling strategy",
        ),
        "gs-rsv_gender": _mandatory(
            description="Synonym for \"Biological sex\". Should be \"Female\", \"Male\", or \"Unknown\".",
            title="gender",
            # NOTE(review): the pattern has no end anchor, so it looks like a
            # value such as "male/other" would also pass, and it accepts
            # "m", "f" and "missing" beyond what the description lists.
            # Left unchanged to avoid rejecting data that validates today;
            # confirm whether a stricter pattern is wanted.
            check=Check.str_matches(
                r"(?i)(\W|^)(male|m|female|f|unknown|missing)(\W|$)"
            ),
        ),
        "gs-rsv_patient_age": _mandatory(
            description="Age in years of the person from whom the specimen was collected. May take format other than numeric years, for example, \"0.5\" (i.e., 6 months), \"5 days\", \"7 months\". If units are not given, they are assumed in years. If missing, use \"Unknown\".",
            title="patient age",
        ),
        "gs-rsv_patient_status": _mandatory(
            description="E.g., \"Hospitalized\", \"Released\", \"Live\", \"Deceased\", \"Unknown\".",
            title="patient status",
        ),
        "gs-rsv_specimen": _optional(
            description="Specimen source. For wastewater it must be \"Wastewater surveillance\".",
            title="specimen source",
        ),
        "gs-rsv_outbreak": _optional(
            description="Outbreak information (Date, Location e.g. type of gathering, Family cluster, etc.).",
            title="outbreak information",
        ),
        "gs-rsv_last_vaccinated": _optional(
            description="Provide details if applicable.",
            title="last vaccinated",
        ),
        "gs-rsv_treatment": _optional(
            description="Provide details if applicable (e.g. Drug name, dosage).",
            title="treatment",
        ),
        "gs-rsv_seq_technology": _mandatory(
            description="Add the sequencer brand and model (e.g. Illumina MiSeq, Sanger, Nanopore MinION).",
            title="sequencing technology",
        ),
        "gs-rsv_assembly_method": _optional(
            description="Genome assembly algorithm (e.g. CLC Genomics Workbench 12, Geneious 10.2.4, SPAdes/MEGAHIT v1.2.9, UGENE v. 33).",
            title="assembly method",
        ),
        "gs-rsv_coverage": _optional(
            description="Average genome coverage (e.g. 50x, 100x, 1,000x).",
            title="average coverage",
        ),
        "gs-rsv_orig_lab": _mandatory(
            description="Full name of laboratory from where sample originated.",
            title="originating lab",
        ),
        "gs-rsv_orig_lab_addr": _mandatory(
            description="Complete building address of laboratory from where sample originated.",
            title="originating lab address",
        ),
        "gs-rsv_provider_sample_id": _optional(
            description="ID used by originating lab.",
            title="provider sample id",
        ),
        "gs-rsv_subm_lab": _mandatory(
            description="Full name of laboratory submitting this record to GISAID.",
            title="submitting lab",
        ),
        "gs-rsv_subm_lab_addr": _mandatory(
            description="Complete building address of the submitting laboratory.",
            title="submitting lab address",
        ),
        "gs-rsv_subm_sample_id": _optional(
            description="ID used by submitting lab.",
            title="submitter sample id",
        ),
        # The two comment columns must exist in the dataframe but are expected
        # to stay empty, hence required yet nullable and unchecked.
        "gs-rsv_comment": _placeholder(
            description="Leave blank.",
            title="comment",
        ),
        "gs-comment_type": _placeholder(
            description="Leave blank.",
            title="comment type",
        ),
    },
    checks=None,
    index=None,
    coerce=False,
    # NOTE(review): per the pandera docs, "filter" drops columns that are not
    # declared above instead of raising — confirm against the pinned version.
    strict="filter",
    # Metadata previously said COV / SARS-COV2 although every GISAID column in
    # this schema is an RSV field; corrected to describe the RSV database.
    name="gisaid_rsv_schema",
    ordered=False,
    unique=None,
    report_duplicates="all",
    unique_column_names=True,
    add_missing_columns=False,
    title="seqsender GISAID RSV schema",
    description="Schema validation for GISAID RSV database.",
)
0 commit comments