Skip to content

Commit b14fffb

Browse files
authored
update to work with vcf 4.4 prefixed phasing info (#1861)
1 parent cdb3245 commit b14fffb

File tree

7 files changed

+477
-34
lines changed

7 files changed

+477
-34
lines changed

htslib/kstring.h

+52-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* The MIT License
22
33
Copyright (C) 2011 by Attractive Chaos <[email protected]>
4-
Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024 Genome Research Ltd.
4+
Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024-2025 Genome Research Ltd.
55
66
Permission is hereby granted, free of charge, to any person obtaining
77
a copy of this software and associated documentation files (the
@@ -449,6 +449,57 @@ static inline int *ksplit(kstring_t *s, int delimiter, int *n)
449449
return offsets;
450450
}
451451

452+
/**
453+
* kinsert_char - inserts a char to kstring
454+
* @param c - char to insert
455+
* @param pos - position at which to insert, starting from 0
456+
* @param s - pointer to output string
457+
* Returns 0 on success and -1 on failure
458+
* 0 for pos inserts at start and length of current string as pos appends at
459+
* the end.
460+
*/
461+
static inline int kinsert_char(char c, size_t pos, kstring_t *s)
462+
{
463+
if (!s || pos > s->l) {
464+
return EOF;
465+
}
466+
if (ks_resize(s, s->l + 2) < 0) {
467+
return EOF;
468+
}
469+
memmove(s->s + pos + 1, s->s + pos, s->l - pos);
470+
s->s[pos] = c;
471+
s->s[++s->l] = 0;
472+
return 0;
473+
}
474+
475+
/**
476+
* kinsert_str - inserts a null terminated string to kstring
477+
* @param str - string to insert
478+
* @param pos - position at which to insert, starting from 0
479+
* @param s - pointer to output string
480+
* Returns 0 on success and -1 on failure
481+
* 0 for pos inserts at start and length of current string as pos appends at
482+
* the end. empty string makes no update.
483+
*/
484+
static inline int kinsert_str(const char *str, size_t pos, kstring_t *s)
485+
{
486+
size_t len = 0;
487+
if (!s || pos > s->l || !str) {
488+
return EOF;
489+
}
490+
if (!(len = strlen(str))) {
491+
return 0;
492+
}
493+
if (ks_resize(s, s->l + len + 1) < 0) {
494+
return EOF;
495+
}
496+
memmove(s->s + pos + len, s->s + pos, s->l - pos);
497+
memcpy(s->s + pos, str, len);
498+
s->l += len;
499+
s->s[s->l] = '\0';
500+
return 0;
501+
}
502+
452503
#ifdef HTSLIB_SSIZE_T
453504
#undef HTSLIB_SSIZE_T
454505
#undef ssize_t

htslib/vcf.h

+18-24
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
/// High-level VCF/BCF variant calling file operations.
33
/*
44
Copyright (C) 2012, 2013 Broad Institute.
5-
Copyright (C) 2012-2020, 2022-2023 Genome Research Ltd.
5+
Copyright (C) 2012-2020, 2022-2025 Genome Research Ltd.
66
77
Author: Heng Li <[email protected]>
88
@@ -1501,31 +1501,25 @@ static inline int bcf_float_is_vector_end(float f)
15011501
return u.i==bcf_float_vector_end ? 1 : 0;
15021502
}
15031503

1504+
1505+
/**
1506+
* bcf_format_gt_v2 - formats GT information on a string
1507+
* @param hdr - bcf header, to get version
1508+
* @param fmt - pointer to bcf format data
1509+
* @param isample - position of interested sample in data
1510+
* @param str - pointer to output string
1511+
* Returns 0 on success and -1 on failure
1512+
* This method is preferred over bcf_format_gt as this supports vcf4.4 and
1513+
* prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
1514+
* when it is a must to correctly express phasing.
1515+
*/
1516+
HTSLIB_EXPORT
1517+
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample,
1518+
kstring_t *str) HTS_RESULT_USED;
1519+
15041520
static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
15051521
{
1506-
uint32_t e = 0;
1507-
#define BRANCH(type_t, convert, missing, vector_end) { \
1508-
uint8_t *ptr = fmt->p + isample*fmt->size; \
1509-
int i; \
1510-
for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
1511-
{ \
1512-
type_t val = convert(ptr); \
1513-
if ( val == vector_end ) break; \
1514-
if ( i ) e |= kputc("/|"[val&1], str) < 0; \
1515-
if ( !(val>>1) ) e |= kputc('.', str) < 0; \
1516-
else e |= kputw((val>>1) - 1, str) < 0; \
1517-
} \
1518-
if (i == 0) e |= kputc('.', str) < 0; \
1519-
}
1520-
switch (fmt->type) {
1521-
case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break;
1522-
case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break;
1523-
case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break;
1524-
case BCF_BT_NULL: e |= kputc('.', str) < 0; break;
1525-
default: hts_log_error("Unexpected type %d", fmt->type); return -2;
1526-
}
1527-
#undef BRANCH
1528-
return e == 0 ? 0 : -1;
1522+
return bcf_format_gt_v2(NULL, fmt, isample, str);
15291523
}
15301524

15311525
static inline int bcf_enc_size(kstring_t *s, int size, int type)

test/test.pl

+10
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
run_test('test_bcf2vcf',$opts);
5454
run_test('test_vcf_sweep',$opts,out=>'test-vcf-sweep.out');
5555
run_test('test_vcf_various',$opts);
56+
run_test('test_vcf_44', $opts);
5657
run_test('test_bcf_sr_sort',$opts);
5758
run_test('test_bcf_sr_no_index',$opts);
5859
run_test('test_bcf_sr_range', $opts);
@@ -1160,6 +1161,15 @@ sub test_vcf_various
11601161
cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2");
11611162
}
11621163

1164+
sub test_vcf_44
1165+
{
1166+
my ($opts, %args) = @_;
1167+
1168+
# vcf4.4 with implicit and explicit phasing info combinations
1169+
test_cmd($opts, %args, out => "vcf44_1.expected",
1170+
cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf44_1.vcf");
1171+
}
1172+
11631173
sub write_multiblock_bgzf {
11641174
my ($name, $frags) = @_;
11651175

test/test_kstring.c

+135-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* test_kstring.c -- kstring unit tests
22
3-
Copyright (C) 2018, 2020, 2024 Genome Research Ltd.
3+
Copyright (C) 2018, 2020, 2024-2025 Genome Research Ltd.
44
55
Author: Rob Davies <[email protected]>
66
@@ -451,6 +451,134 @@ static int test_kgetline2(void) {
451451
return EXIT_SUCCESS;
452452
}
453453

454+
static int test_kinsertchar(void) {
455+
kstring_t t = KS_INITIALIZE, res = KS_INITIALIZE;
456+
int i = 0;
457+
struct data {
458+
int pos;
459+
const char *val;
460+
};
461+
462+
struct data tdata[] = { { -1, ""}, {0, "X0123"}, {1, "0X123"}, {2, "01X23"},
463+
{3, "012X3"}, {4, "0123X"}, {5, ""} };
464+
465+
for (i = -1; i < 6; ++i) {
466+
kstring_t s = KS_INITIALIZE;
467+
kputs("0123", &s);
468+
if (kinsert_char('X', i, &s) < 0) {
469+
if ( i < 0 || i > 4) { ks_free(&s); continue; } //expected failures
470+
fprintf(stderr, "kinsert_char failed\n");
471+
ks_free(&s);
472+
return -1;
473+
}
474+
if (s.s[s.l] != '\0') {
475+
fprintf(stderr, "No NUL termination on string from kinsert_char\n");
476+
ks_free(&s);
477+
return -1;
478+
}
479+
if (memcmp(s.s, tdata[i + 1].val, s.l + 1)) {
480+
fprintf(stderr, "kinsert_char comparison failed\n");
481+
ks_free(&s);
482+
return -1;
483+
}
484+
ks_free(&s);
485+
}
486+
//realloc checks
487+
for (i = 0; i < 7; ++i) {
488+
kputc('A' + i, &res);
489+
if (kinsert_char('A' + i, t.l, &t) < 0) {
490+
fprintf(stderr, "kinsert_char failed in realloc\n");
491+
ks_free(&res); ks_free(&t);
492+
return -1;
493+
}
494+
if (t.s[t.l] != '\0') {
495+
fprintf(stderr, "No NUL termination on string from kinsert_char in realloc\n");
496+
ks_free(&res); ks_free(&t);
497+
return -1;
498+
}
499+
if (memcmp(t.s, res.s, res.l+1)) {
500+
fprintf(stderr, "kinsert_char realloc comparison failed in realloc\n");
501+
ks_free(&res); ks_free(&t);
502+
return -1;
503+
}
504+
}
505+
ks_free(&t);
506+
ks_free(&res);
507+
return 0;
508+
}
509+
510+
static int test_kinsertstr(void) {
511+
kstring_t t = KS_INITIALIZE, res = KS_INITIALIZE;
512+
int i = 0;
513+
struct data {
514+
int pos;
515+
const char *val;
516+
};
517+
518+
struct data tdata[] = { { -1, ""}, {0, "XYZ0123"}, {1, "0XYZ123"},
519+
{2, "01XYZ23"}, {3, "012XYZ3"}, {4, "0123XYZ"}, {5, ""} };
520+
521+
for (i = -1; i < 6; ++i) {
522+
kstring_t s = KS_INITIALIZE;
523+
kputs("0123", &s);
524+
if (kinsert_str("XYZ", i, &s) < 0) {
525+
if ( i < 0 || i > 4) { ks_free(&s); continue; } //expected failures
526+
fprintf(stderr, "kinsert_str failed\n");
527+
return -1;
528+
}
529+
if (s.s[s.l] != '\0') {
530+
fprintf(stderr, "No NUL termination on string from kinsert_str\n");
531+
return -1;
532+
}
533+
if (memcmp(s.s, tdata[i + 1].val, s.l + 1)) {
534+
fprintf(stderr, "kinsert_str comparison failed\n");
535+
return -1;
536+
}
537+
ks_free(&s);
538+
}
539+
//realloc checks
540+
for (i = 0; i < 15; ++i) {
541+
kstring_t val = KS_INITIALIZE;
542+
ksprintf(&val, "%c", 'A' + i);
543+
kputs(val.s, &res);
544+
if (kinsert_str(val.s, t.l, &t) < 0) {
545+
ks_free(&val);
546+
fprintf(stderr, "kinsert_str failed in realloc\n");
547+
return -1;
548+
}
549+
if (t.s[t.l] != '\0') {
550+
ks_free(&val); ks_free(&res);
551+
fprintf(stderr, "No NUL termination on string from kinsert_str in realloc\n");
552+
return -1;
553+
}
554+
if (memcmp(t.s, res.s, res.l+1)) {
555+
ks_free(&val); ks_free(&res);
556+
fprintf(stderr, "kinsert_str realloc comparison failed in realloc\n");
557+
return -1;
558+
}
559+
ks_free(&val);
560+
}
561+
//empty strings
562+
ks_free(&t);
563+
if (kinsert_str("", 1, &t)) { //expected
564+
if (kinsert_str("", 0, &t) || t.l != 0) {
565+
fprintf(stderr, "kinsert_str empty insertion failed\n");
566+
return -1;
567+
}
568+
} else {
569+
fprintf(stderr, "kinsert_str empty ins to invalid pos succeeded\n");
570+
return -1;
571+
}
572+
i = res.l;
573+
if (kinsert_str("", 1, &res) || i != res.l) {
574+
fprintf(stderr, "kinsert_str empty ins to valid pos failed\n");
575+
ks_free(&res);
576+
return -1;
577+
}
578+
ks_free(&res);
579+
return 0;
580+
}
581+
454582
int main(int argc, char **argv) {
455583
int opt, res = EXIT_SUCCESS;
456584
int64_t start = 0;
@@ -500,5 +628,11 @@ int main(int argc, char **argv) {
500628
if (!test || strcmp(test, "kgetline2") == 0)
501629
if (test_kgetline2() != 0) res = EXIT_FAILURE;
502630

631+
if (!test || strcmp(test, "kinsertchar") == 0)
632+
if (test_kinsertchar() != 0) res = EXIT_FAILURE;
633+
634+
if (!test || strcmp(test, "kinsertstr") == 0)
635+
if (test_kinsertstr() != 0) res = EXIT_FAILURE;
636+
503637
return res;
504638
}

test/vcf44_1.expected

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
##fileformat=VCFv4.4
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=1,length=1000>
4+
##reference=file://test
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##failue="test file on explicit and implicit phasing markers in 4.4"
7+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097
8+
1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1
9+
1 61480 rs56992751 T A 100 PASS . GT 0 /0|1
10+
1 61481 rs56992752 T A 100 PASS . GT /0 |0/1
11+
1 61482 rs56992752 T A 100 PASS . GT /0 /1
12+
1 61483 rs56992752 T A 100 PASS . GT 0 1
13+
1 61484 rs56992752 T A 100 PASS . GT 0 /1
14+
1 61485 rs56992752 T A 100 PASS . GT 0 1
15+
1 61486 rs56992752 T A 100 PASS . GT 0 1
16+
1 61487 rs56992752 T A 100 PASS . GT 0 1
17+
1 61488 rs56992752 T A 100 PASS . GT 0 /1
18+
1 61489 rs56992752 T A 100 PASS . GT /0 1
19+
1 61490 rs56992752 T A 100 PASS . GT /0 1
20+
1 61491 rs56992752 T A 100 PASS . GT /0 /1
21+
1 61492 rs56992752 T A 100 PASS . GT /0|0 1/0
22+
1 61493 rs56992752 T A 100 PASS . GT 0|0 |1/0
23+
1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0
24+
1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0
25+
1 61496 rs56992752 T A 100 PASS . GT . .
26+
1 61497 rs56992752 T A 100 PASS . GT . |.
27+
1 61498 rs56992752 T A 100 PASS . GT ./1 .|1
28+
1 61499 rs56992752 T A 100 PASS . GT ./1 .|1
29+
1 61500 rs56992752 T A 100 PASS . GT |./1 /.|1
30+
1 61501 rs56992752 T A 100 PASS . GT 1/. 1|.
31+
1 61502 rs56992752 T A 100 PASS . GT 1/. /1|.
32+
1 61503 rs56992752 T A 100 PASS . GT |1/. 1|.
33+
1 61504 rs56992752 T A 100 PASS . GT ./. .|.
34+
1 61505 rs56992752 T A 100 PASS . GT ./. .|.
35+
1 61506 rs56992752 T A 100 PASS . GT |./. /.|.

test/vcf44_1.vcf

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
##fileformat=VCFv4.4
2+
##contig=<ID=1,length=1000>
3+
##reference=file://test
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##failue="test file on explicit and implicit phasing markers in 4.4"
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097
7+
1 61462 rs56992750 T A 100 PASS . GT 0|0|1 0/1
8+
1 61480 rs56992751 T A 100 PASS . GT 0 /0|1
9+
1 61481 rs56992752 T A 100 PASS . GT /0 |0/1
10+
1 61482 rs56992752 T A 100 PASS . GT /0 /1
11+
1 61483 rs56992752 T A 100 PASS . GT 0 1
12+
1 61484 rs56992752 T A 100 PASS . GT 0 /1
13+
1 61485 rs56992752 T A 100 PASS . GT 0 |1
14+
1 61486 rs56992752 T A 100 PASS . GT |0 1
15+
1 61487 rs56992752 T A 100 PASS . GT |0 |1
16+
1 61488 rs56992752 T A 100 PASS . GT |0 /1
17+
1 61489 rs56992752 T A 100 PASS . GT /0 1
18+
1 61490 rs56992752 T A 100 PASS . GT /0 |1
19+
1 61491 rs56992752 T A 100 PASS . GT /0 /1
20+
1 61492 rs56992752 T A 100 PASS . GT /0|0 /1/0
21+
1 61493 rs56992752 T A 100 PASS . GT |0|0 |1/0
22+
1 61494 rs56992752 T A 100 PASS . GT /0|0 1/0
23+
1 61495 rs56992752 T A 100 PASS . GT 0|0 |1/0
24+
1 61496 rs56992752 T A 100 PASS . GT . .
25+
1 61497 rs56992752 T A 100 PASS . GT /. |.
26+
1 61498 rs56992752 T A 100 PASS . GT ./1 .|1
27+
1 61499 rs56992752 T A 100 PASS . GT /./1 |.|1
28+
1 61500 rs56992752 T A 100 PASS . GT |./1 /.|1
29+
1 61501 rs56992752 T A 100 PASS . GT 1/. 1|.
30+
1 61502 rs56992752 T A 100 PASS . GT /1/. /1|.
31+
1 61503 rs56992752 T A 100 PASS . GT |1/. |1|.
32+
1 61504 rs56992752 T A 100 PASS . GT ./. .|.
33+
1 61505 rs56992752 T A 100 PASS . GT /./. |.|.
34+
1 61506 rs56992752 T A 100 PASS . GT |./. /.|.

0 commit comments

Comments
 (0)