Skip to content

Commit 875ebd2

Browse files
committed
changes for rlen calculation as per vcf 4.4, 4.5
1 parent 5ba41fe commit 875ebd2

File tree

3 files changed

+411
-32
lines changed

3 files changed

+411
-32
lines changed

htslib/vcf.h

+5-4
Original file line numberDiff line numberDiff line change
@@ -950,14 +950,13 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
950950
* The @p string in bcf_update_info_flag() is optional,
951951
* @p n indicates whether the flag is set or removed.
952952
*
953-
* Note that updating an END info tag will cause line->rlen to be
953+
* Note that updating an END,SVLEN info tags will cause line->rlen to be
954954
* updated as a side-effect (removing the tag will set it to the
955955
* string length of the REF allele). If line->pos is being changed as
956956
* well, it is important that this is done before calling
957-
* bcf_update_info_int32() to update the END tag, otherwise rlen will be
957+
* bcf_update_info_int32() to update the END/SVLEN tag, otherwise rlen will be
958958
* set incorrectly. If the new END value is less than or equal to
959-
* line->pos, a warning will be printed and line->rlen will be set to
960-
* the length of the REF allele.
959+
* the length of the REF allele for versions upto 4.3.
961960
*/
962961
#define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT)
963962
#define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL)
@@ -1002,6 +1001,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
10021001
* of fixed-length strings. In case of strings with variable length, shorter strings
10031002
* can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
10041003
* are not \0-terminated.
1004+
* With vcf4.5, rlen depends on format field LEN and rlen calculation uses v->pos as well.
1005+
* So any change to pos be done before updating LEN that rlen calculated is correct.
10051006
*
10061007
* Returns 0 on success or negative value on error.
10071008
*/

test/test-vcf-api.c

+131-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* test/test-vcf-api.c -- VCF test harness.
22
3-
Copyright (C) 2013, 2014, 2017-2021, 2023 Genome Research Ltd.
3+
Copyright (C) 2013, 2014, 2017-2021, 2023, 2025 Genome Research Ltd.
44
55
Author: Petr Danecek <[email protected]>
66
@@ -658,6 +658,135 @@ void test_open_format(void) {
658658
error("Expected failure for wrong extension 'vcf.bvcf.bgz'");
659659
}
660660

661+
//tests on rlen calculation
662+
void test_rlen_values(void)
663+
{
664+
bcf_hdr_t *hdr;
665+
bcf1_t *rec, *rec2;
666+
int i, j;
667+
int32_t tmpi;
668+
//data common for all versions, interpreted differently based on version
669+
#define data "##reference=file://tmp\n" \
670+
"##FILTER=<ID=PASS,Description=\"All filters passed\">\n" \
671+
"##INFO=<ID=END,Number=1,Type=Integer,Description=\"end\">\n" \
672+
"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"gt\">\n" \
673+
"##INFO=<ID=SVLEN,Number=A,Type=Integer,Description=\"svlen\">\n" \
674+
"##INFO=<ID=SVCLAIM,Number=A,Type=String,Description=\"svclaim\">\n" \
675+
"##FORMAT=<ID=LEN,Number=1,Type=Integer,Description=\"fmt len\">\n" \
676+
"##contig=<ID=1,Length=40>\n" \
677+
"##ALT=<ID=INS,Description=\"INS\">\n" \
678+
"##ALT=<ID=DEL,Description=\"DEL\">\n" \
679+
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample1\tSample2\n" \
680+
"1\t4310\t.\tG\tA\t.\t.\t.\tGT\t0/0\t0|1\n" \
681+
"1\t4311\t.\tC\tCT\t.\t.\t.\tGT\t0/0\t1/0\n" \
682+
"1\t4312\t.\tTC\tT\t213.73\t.\t.\tGT\t0/1\t0|0\n" \
683+
"1\t4314\t.\tG\t<INS>\t213.73\t.\tSVLEN=10;SVCLAIM=J\tGT\t0/1\t0|0\n" \
684+
"1\t4315\t.\tG\t<DEL>\t213.73\t.\tSVLEN=-10;SVCLAIM=D\tGT\t0/1\t0|0\n" \
685+
"1\t4326\t.\tG\t<INS>\t213.73\t.\tEND=4326;SVLEN=10;SVCLAIM=J\tGT\t0/1\t0|0\n" \
686+
"1\t4327\t.\tG\t<DEL>\t213.73\t.\tEND=4337;SVLEN=-10;SVCLAIM=J\tGT\t0/1\t0|0\n" \
687+
"1\t4338\t.\tG\t<*>\t213.73\t.\tEND=4342;SVLEN=.;SVCLAIM=.\tGT:LEN\t0/1:7\t0|0:8\n" \
688+
"1\t4353\t.\tG\t<*>\t213.73\t.\tEND=4357;SVLEN=.;SVCLAIM=.\tGT:LEN\t0/1:7\t0|0:.\n" \
689+
"1\t4363\t.\tG\t<*>\t213.73\t.\tEND=4367;SVLEN=.;SVCLAIM=.\tGT:LEN\t0/1:7\t0|0:.\n" \
690+
"1\t4373\t.\tG\tT\t213.73\t.\tEND=4375;SVLEN=8;SVCLAIM=.\tGT:LEN\t0/1:.\t0|0:10\n"
691+
//this last one is invalid one, to showcase how SVLEN/LEN are considered even when ALT is no SV/gvcf
692+
//this is because data are not cross checked with allele types, to make it simple
693+
694+
//test vcf with different versions
695+
const int vcfsz = 11, testsz = 3;
696+
static char d43[] = "data:,"
697+
"##fileformat=VCFv4.3\n" data;
698+
static char d44[] = "data:,"
699+
"##fileformat=VCFv4.4\n" data;
700+
static char d45[] = "data:,"
701+
"##fileformat=VCFv4.5\n" data;
702+
//expected rlen for tests
703+
int rlen43[] = {1, 1, 2, 1, 1, 1, 11, 5, 5, 5, 3};
704+
int rlen44[] = {1, 1, 2, 1, 11, 1, 11, 5, 5, 5, 9};
705+
int rlen45[] = {1, 1, 2, 1, 11, 1, 11, 8, 7, 7, 10};
706+
707+
char *darr[] = {&d43[0], &d44[0], &d45[0]}; //data array
708+
int *rarr[] = {&rlen43[0], &rlen44[0], &rlen45[0]}; //result array
709+
710+
enum htsLogLevel logging = hts_get_log_level();
711+
712+
// Silence warning messages
713+
hts_set_log_level(HTS_LOG_ERROR);
714+
715+
rec = bcf_init1(); rec2 = bcf_init1();
716+
if (!rec || !rec2) error("Failed to allocate BCF record : %s", strerror(errno));
717+
//calculating rlen with different vcf versions
718+
for (i = 0; i < testsz; ++i) {
719+
htsFile *fp = hts_open(darr[i], "r");
720+
if (!fp) error("Failed to open vcf data with %d : %s", i + 1, strerror(errno));
721+
bcf_clear1(rec);
722+
hdr = bcf_hdr_read(fp);
723+
if (!hdr) error("Failed to read BCF header with %d : %s", i + 1, strerror(errno));
724+
for (j = 0; j < vcfsz; ++j) {
725+
check0(bcf_read(fp, hdr, rec));
726+
if (rec->rlen != rarr[i][j]) {
727+
error("Incorrect rlen @ vcf %d on test %d - expected %d got %"PRIhts_pos"\n", j + 1, i + 1, rarr[i][j], rec->rlen);
728+
}
729+
}
730+
bcf_hdr_destroy(hdr);
731+
hts_close(fp);
732+
}
733+
734+
//calculating rlen with update to fields
735+
htsFile *fp = hts_open(d45, "r");
736+
int id = 1, val[]= { 1, 15 };
737+
bcf_clear1(rec);
738+
hdr = bcf_hdr_read(fp);
739+
if (!hdr) error("Failed to read BCF header : %s", strerror(errno));
740+
check0(bcf_read(fp, hdr, rec));
741+
//"1\t4310\t.\tG\tA\t.\t.\t.\tGT\t0/0\t0|1\n"
742+
if (rec->rlen != 1)
743+
error("Incorrect rlen set, expected 1 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
744+
//alt change A->AT
745+
++id; check0(bcf_update_alleles_str(hdr, rec, "G,AT"));
746+
if (rec->rlen != 1)
747+
error("Incorrect rlen set, expected 1 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
748+
//ref change G->GC, alt AT -> A, "1\t4310\t.\tGC\tA\t.\t.\t.\tGT\t0/0\t0|1\n"
749+
id++; check0(bcf_update_alleles_str(hdr, rec, "GC,A"));
750+
if (rec->rlen != 2)
751+
error("Incorrect rlen set, expected 2 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
752+
++id; check0(bcf_update_alleles_str(hdr, rec, "G,<*>"));
753+
if (rec->rlen != 1)
754+
error("Incorrect rlen set, expected 1 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
755+
tmpi = 4323;
756+
++id; check0(bcf_update_info_int32(hdr, rec, "END", &tmpi, 1)); //SVLEN to infer from END and use
757+
if (rec->rlen != 14)
758+
error("Incorrect rlen set, expected 14 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
759+
++id; check0(bcf_update_format_int32(hdr, rec, "LEN", &val, 2));
760+
if (rec->rlen != 15)
761+
error("Incorrect rlen set, expected 15 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
762+
++id; check0(bcf_update_info_int32(hdr, rec, "END", &tmpi, 0)); //removes END
763+
if (rec->rlen != 15)
764+
error("Incorrect rlen set, expected 15 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
765+
++id; check0(bcf_update_format_int32(hdr, rec, "LEN", &tmpi, 0)); //removes LEN
766+
if (rec->rlen != 1)
767+
error("Incorrect rlen set, expected 1 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
768+
//ALT -> DEL, "1\t4310\t.\tG\tT,<DEL>\t.\t.\t.\tGT\t0/0\t0|1\n"
769+
++id; check0(bcf_update_alleles_str(hdr, rec, "G,T,<DEL>"));
770+
if (rec->rlen != 1)
771+
error("Incorrect rlen set, expected 1 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
772+
val[0] = 0; val[1] = -5;
773+
++id; check0(bcf_update_info_int32(hdr, rec, "SVLEN", &val, 2)); //Add svlen
774+
if (rec->rlen != 6)
775+
error("Incorrect rlen set, expected 6 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
776+
++id; bcf_copy(rec2, rec);
777+
if (rec2->rlen != 6)
778+
error("Incorrect rlen set, expected 6 got %"PRIhts_pos" @ %d\n", rec->rlen, id);
779+
780+
//needs update when header version change is handled
781+
bcf_destroy1(rec);
782+
bcf_destroy1(rec2);
783+
bcf_hdr_destroy(hdr);
784+
785+
hts_close(fp);
786+
787+
hts_set_log_level(logging);
788+
}
789+
661790
int main(int argc, char **argv)
662791
{
663792
char *fname = argc>1 ? argv[1] : "rmme.bcf";
@@ -674,5 +803,6 @@ int main(int argc, char **argv)
674803
test_get_info_values(fname);
675804
test_invalid_end_tag();
676805
test_open_format();
806+
test_rlen_values();
677807
return 0;
678808
}

0 commit comments

Comments
 (0)