Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bcftools affy2vcf version 1.9 plugin #11382

Merged
merged 1 commit into from
Nov 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions recipes/bcftools-gtc2vcf-plugin/1.9/affy2vcf.c.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
--- gtc2vcf-b890909/affy2vcf.c.orig 2018-10-11 10:20:20.163766095 -0400
+++ gtc2vcf-b890909/affy2vcf.c 2018-10-11 10:22:47.924639141 -0400
@@ -192,7 +192,8 @@

int moff = 0, *off = NULL;
int ncols = ksplit_core(str.s, ',', &moff, &off);
- for (int i=0; i<ncols; i++)
+ int i;
+ for (i=0; i<ncols; i++)
{
if ( strcmp(&str.s[off[i]], "\"Probe Set ID\"")==0 ) probeset_id = i;
else if ( strcmp(&str.s[off[i]], "\"dbSNP RS ID\"")==0 ) dbsnp_id = i;
@@ -247,7 +248,8 @@
static void annot_destroy(annot_t *annot)
{
khash_str2int_destroy(annot->probeset_id);
- for (int i=0; i<annot->n_records; i++)
+ int i;
+ for (i=0; i<annot->n_records; i++)
{
free(annot->records[i].probe_set);
free(annot->records[i].dbsnp);
@@ -297,7 +299,8 @@

static void report_destroy(report_t *report)
{
- for (int i=0; i<report->n_samples; i++) free(report->cel_files[i]);
+ int i;
+ for (i=0; i<report->n_samples; i++) free(report->cel_files[i]);
free(report->genders);
}

@@ -309,7 +312,8 @@
{
bcf_hdr_t *hdr = bcf_hdr_init("w");
int n = faidx_nseq(fai);
- for (int i=0; i<n; i++)
+ int i;
+ for (i=0; i<n; i++)
{
const char *seq = faidx_iseq(fai, i);
int len = faidx_seq_len(fai, seq);
@@ -335,7 +339,8 @@
float *baf,
float *lrr)
{
- for (int i=0; i<n; i++) // compute THETA and R
+ int i;
+ for (i=0; i<n; i++) // compute THETA and R
{
float log2x = logf(norm_x[i]) * (float)M_LOG2E;
float log2y = logf(norm_y[i]) * (float)M_LOG2E;
@@ -343,7 +348,7 @@
size[i] = ( log2x + log2y ) * 0.5f;
}

- for (int i=0; i<n; i++) // compute LRR and BAF
+ for (i=0; i<n; i++) // compute LRR and BAF
{
if ( delta[i] == cluster->ab_delta )
{
@@ -393,7 +398,8 @@
ncols = ksplit_core(str.s, '\t', &moff, &off);
if ( strcmp(&str.s[off[0]], "probeset_id") ) error("Malformed first line from summary file: %s\n%s\n", summary_fn, str.s);

- for (int i=1; i<ncols; i++) bcf_hdr_add_sample(hdr, &str.s[off[i]]);
+ int i;
+ for (i=1; i<ncols; i++) bcf_hdr_add_sample(hdr, &str.s[off[i]]);
if ( bcf_hdr_write(out_fh, hdr) < 0 ) error("Unable to write to output VCF file\n");
bcf_hdr_sync( hdr ); // updates the number of samples

@@ -442,7 +448,8 @@
}
char *tmp;
// read genotypes
- for (int i=1; i<ncols; i++)
+ int i;
+ for (i=1; i<ncols; i++)
{
int gt = strtol( &str.s[off[i]], &tmp, 10 );
switch ( gt )
@@ -472,17 +479,17 @@
if ( hts_getline(confidences_fp, KS_SEP_LINE, &str) <=0 ) error("Confidences file ended prematurely\n");
ncols = ksplit_core(str.s, '\t', &moff, &off);
if ( ncols != bcf_hdr_nsamples(hdr) + 1 ) error("Expected %d columns but %d columns found in the confidences file\n", bcf_hdr_nsamples(hdr) + 1, ncols);
- for (int i=1; i<ncols; i++) conf_arr[i-1] = strtof(&str.s[off[i]], &tmp);
+ for (i=1; i<ncols; i++) conf_arr[i-1] = strtof(&str.s[off[i]], &tmp);

// read intensities
if ( hts_getline(summary_fp, KS_SEP_LINE, &str) <=0 ) error("Summary file ended prematurely\n");
ncols = ksplit_core(str.s, '\t', &moff, &off);
if ( ncols != bcf_hdr_nsamples(hdr) + 1 ) error("Expected %d columns but %d columns found in the confidences file\n", bcf_hdr_nsamples(hdr) + 1, ncols);
- for (int i=1; i<ncols; i++) norm_x_arr[i-1] = strtof(&str.s[off[i]], &tmp);
+ for (i=1; i<ncols; i++) norm_x_arr[i-1] = strtof(&str.s[off[i]], &tmp);
if ( hts_getline(summary_fp, KS_SEP_LINE, &str) <=0 ) error("Summary file ended prematurely\n");
ncols = ksplit_core(str.s, '\t', &moff, &off);
if ( ncols != bcf_hdr_nsamples(hdr) + 1 ) error("Expected %d columns but %d columns found in the confidences file\n", bcf_hdr_nsamples(hdr) + 1, ncols);
- for (int i=1; i<ncols; i++) norm_y_arr[i-1] = strtof(&str.s[off[i]], &tmp);
+ for (i=1; i<ncols; i++) norm_y_arr[i-1] = strtof(&str.s[off[i]], &tmp);

// compute LRR and BAF
get_lrr_baf(norm_x_arr, norm_y_arr, bcf_hdr_nsamples(hdr), cluster, delta_arr, size_arr, baf_arr, lrr_arr);
@@ -638,7 +645,8 @@
report_t *report = report_init(report_fname);
FILE *sex_fh = fopen(sex_fname, "w");
if ( !sex_fh ) error("Failed to open %s: %s\n", sex_fname, strerror(errno));
- for (int i=0; i<report->n_samples; i++) fprintf(sex_fh, "%s\t%d\n", report->cel_files[i], report->genders[i]);
+ int i;
+ for (i=0; i<report->n_samples; i++) fprintf(sex_fh, "%s\t%d\n", report->cel_files[i], report->genders[i]);
fclose(sex_fh);
report_destroy(report);
}
@@ -650,4 +658,4 @@
bcf_hdr_destroy(hdr);
hts_close(out_fh);
return 0;
-}
\ No newline at end of file
+}
35 changes: 35 additions & 0 deletions recipes/bcftools-gtc2vcf-plugin/1.9/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/sh

export VERSION=1.9
export C_INCLUDE_PATH=${PREFIX}/include
export CPP_INCLUDE_PATH=${PREFIX}/include
export CXX_INCLUDE_PATH=${PREFIX}/include
export CPLUS_INCLUDE_PATH=${PREFIX}/include
export LIBRARY_PATH=${PREFIX}/lib
export CPPFLAGS="$CPPFLAGS -I$PREFIX/include"
export LDFLAGS="$LDFLAGS -L$PREFIX/lib"
export CFLAGS="$CFLAGS -I$PREFIX/include"

mkdir -p $PREFIX/bin
mkdir -p $PREFIX/libexec/bcftools

# Copy plugin source to the bcftools plugin directory.
cp gtc2vcf-b890909/affy2vcf.c bcftools-$VERSION/plugins/
cp gtc2vcf-b890909/gtc2vcf.c bcftools-$VERSION/plugins/

# Compile htslib and bcftools with affy2vcf and gtc2vcf plugins.
pushd htslib-$VERSION
autoheader
(autoconf || autoconf)
./configure --disable-bz2 --disable-lzma --prefix=$PREFIX
make
popd

pushd bcftools-$VERSION
make
popd

# Move custom bcftools plugins to the ~/libexec/bcftools directory.
mv bcftools-$VERSION/plugins/affy2vcf.so $PREFIX/libexec/bcftools/
mv bcftools-$VERSION/plugins/fixref.so $PREFIX/libexec/bcftools/
mv bcftools-$VERSION/plugins/gtc2vcf.so $PREFIX/libexec/bcftools/
107 changes: 107 additions & 0 deletions recipes/bcftools-gtc2vcf-plugin/1.9/fixref.c.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
--- bcftools-1.9/plugins/fixref.c.orig 2018-06-30 03:48:02.000000000 -0400
+++ bcftools-1.9/plugins/fixref.c 2018-10-03 09:03:07.270160080 -0400
@@ -90,6 +90,7 @@
#define MODE_TOP2FWD 2
#define MODE_FLIP2FWD 3
#define MODE_USE_ID 4
+#define MODE_SWAP 6

typedef struct
{
@@ -104,12 +105,14 @@
typedef struct
{
char *dbsnp_fname;
- int mode, discard;
+ int mode, discard, flip_baf;
bcf_hdr_t *hdr;
faidx_t *fai;
int rid, skip_rid;
i2m_t *i2m;
int32_t *gts, ngts, pos;
+ float *bafs;
+ int32_t nbafs;
uint32_t nsite,nok,nflip,nunresolved,nswap,nflip_swap,nonSNP,nonACGT,nonbiallelic;
uint32_t count[4][4], npos_err, unsorted;
}
@@ -130,6 +133,7 @@
" Currently the following modes are recognised:\n"
" flip .. flips non-ambiguous SNPs and ignores the rest\n"
" id .. swap REF/ALT and GTs using the ID column to determine the REF allele\n"
+ " swap .. swap REF/ALT columns and GTs and ignore the rest\n"
" stats .. collect and print stats\n"
" top .. converts from Illumina TOP strand to fwd\n"
"\n"
@@ -148,7 +152,8 @@
" -i, --use-id <file.vcf> Swap REF/ALT using the ID column to determine the REF allele, implies -m id.\n"
" Download the dbSNP file from\n"
" https://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf\n"
- " -m, --mode <string> Collect stats (\"stats\") or convert (\"flip\", \"id\", \"top\") [stats]\n"
+ " -m, --mode <string> Collect stats (\"stats\") or convert (\"flip\", \"id\", \"swap\", \"top\") [stats]\n"
+ " -b --flip-baf Flip BAF when swapping genotypes\n"
"\n"
"Examples:\n"
" # run stats\n"
@@ -178,10 +183,11 @@
{"discard",no_argument,NULL,'d'},
{"fasta-ref",required_argument,NULL,'f'},
{"use-id",required_argument,NULL,'i'},
+ {"flip-baf",no_argument,NULL,'b'},
{NULL,0,NULL,0}
};
int c;
- while ((c = getopt_long(argc, argv, "?hf:m:di:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "?hf:m:di:b:",loptions,NULL)) >= 0)
{
switch (c)
{
@@ -190,11 +196,13 @@
else if ( !strcasecmp(optarg,"flip") ) args.mode = MODE_FLIP2FWD;
else if ( !strcasecmp(optarg,"id") ) args.mode = MODE_USE_ID;
else if ( !strcasecmp(optarg,"stats") ) args.mode = MODE_STATS;
+ else if ( !strcasecmp(optarg,"swap") ) args.mode = MODE_SWAP;
else error("The source strand convention not recognised: %s\n", optarg);
break;
case 'i': args.dbsnp_fname = optarg; args.mode = MODE_USE_ID; break;
case 'd': args.discard = 1; break;
case 'f': ref_fname = optarg; break;
+ case 'b': args.flip_baf = 1; break;
case 'h':
case '?':
default: error("%s", usage()); break;
@@ -231,6 +239,13 @@
}
}
bcf_update_genotypes(args->hdr,rec,args->gts,args->ngts);
+
+ if (args->flip_baf && bcf_get_format_float(args->hdr, rec, "BAF", &args->bafs, &args->nbafs) >= 0)
+ {
+ float *ptr = args->bafs;
+ for (i=0; i<args->nbafs; i++, ptr++) *ptr = 1 - *ptr;
+ bcf_update_format_float(args->hdr, rec, "BAF", args->bafs, args->nbafs);
+ }

return rec;
}
@@ -430,6 +445,13 @@
if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),1); }
error("FIXME: this should not happen %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);
}
+ else if ( args.mode==MODE_SWAP )
+ {
+ if ( ir==ia ) return ret;
+ if ( ir==ib ) { args.nswap++; return set_ref_alt(&args,rec,int2nt(ib),int2nt(ia),1); }
+ args.nunresolved++;
+ return args.discard ? NULL : ret;
+ }
else if ( args.mode==MODE_TOP2FWD )
{
int pair = 1 << ia | 1 << ib;
@@ -567,6 +589,7 @@
fprintf(stderr,"NS\tnon-biallelic\t%u\n", args.nonbiallelic);

free(args.gts);
+ free(args.bafs);
if ( args.fai ) fai_destroy(args.fai);
if ( args.i2m ) kh_destroy(i2m, args.i2m);
}
Loading