ppvcf  1.0.0
Library for the parallel parsing of VCF files
ppvcf.hpp
1 #ifndef _PPVCF_HPP_
2 #define _PPVCF_HPP_
3 
4 #include <iostream>
5 #include <map>
6 #include <numeric>
7 #include <string>
8 #include <tuple>
9 #include <vector>
10 
11 #include <omp.h>
12 
13 #include "htslib/vcf.h"
14 
15 using namespace std;
16 
17 /*---------- Auxiliary structs ----------*/
19 
22 typedef struct {
24  size_t size;
26  char *seq;
27 } str_w_l;
28 
29 /*---------- Genotype struct ----------*/
31 
34 struct GT {
36  uint8_t a1;
38  uint8_t a2;
40  bool phased;
41 
49  GT(uint8_t _a1 = 0, uint8_t _a2 = 0, bool _phased = true) : a1(_a1), a2(_a2) {
50  phased = _phased || a1 == a2;
51  }
55  ~GT() {}
56 };
57 
58 /*---------- Variant class ----------*/
60 
63 class Variant {
64 private:
66  string seq_name;
68  int pos;
70  string idx;
72  string ref;
74  vector<string> alts;
76  string quality;
78  string filter;
80  map<string, string> info;
81 
83  uint32_t nsamples;
85  uint32_t gti;
87  vector<GT> genotypes;
88 
95  void store_filter(bcf_hdr_t *header, bcf1_t *record) {
96  // TODO: store filter differently (not as a single string) if we want to
97  // allow filtering
98  filter = "";
99  if (record->d.n_flt) {
100  for (int i = 0; i < record->d.n_flt; ++i) {
101  if (i)
102  filter += ";";
103  filter += header->id[BCF_DT_ID][record->d.flt[i]].key;
104  }
105  } else
106  filter = ".";
107  }
108 
116  void store_info(bcf_hdr_t *header, bcf1_t *record) {
117  for (int i = 0; i < record->n_info; ++i) {
118  bcf_info_t *info_field = &record->d.info[i];
119  int key_idx = info_field->key;
120  int type = info_field->type;
121  const char *key = header->id[BCF_DT_ID][key_idx].key;
122 
123  int ndst = 0;
124  string value = "";
125  if (type == BCF_BT_NULL) {
126  bool v = bcf_get_info_flag(header, record, key, &v, &ndst);
127  value = to_string(v);
128  // FIXME: remember to manage booleans (eg when producing output)
129  } else if (type == BCF_BT_INT8 || type == BCF_BT_INT16 ||
130  type == BCF_BT_INT32) {
131  int *v = NULL;
132  bcf_get_info_int32(header, record, key, &v, &ndst);
133  value = to_string(*v);
134  free(v);
135  } else if (type == BCF_BT_FLOAT) {
136  float *v = NULL;
137  bcf_get_info_float(header, record, key, &v, &ndst);
138  value = to_string(*v);
139  free(v);
140  } else if (type == BCF_BT_CHAR) {
141  char *v = NULL;
142  bcf_get_info_string(header, record, key, &v, &ndst);
143  value = string(v);
144  free(v);
145  } else {
146  cerr << "Unknown type " << type << " (field " << key << ")" << endl;
147  exit(1);
148  }
149 
150  info[key] = value;
151 
152  /* // Maybe we can adapt this
153  #define BRANCH(type_t, bcf_ht_t) { \
154  type_t *value = NULL; \
155  bcf_get_info_values(header, record, key, (void**)(&value), &ndst,
156  bcf_ht_t); \
157  cout << *value << endl; \
158  }
159  switch(type) {
160  case BCF_BT_INT8: BRANCH(int, BCF_HT_INT); break;
161  case BCF_BT_INT16: BRANCH(int, BCF_HT_INT); break;
162  case BCF_BT_INT32: BRANCH(int, BCF_HT_INT); break;
163  case BCF_BT_FLOAT: BRANCH(float, BCF_HT_REAL); break;
164  case BCF_BT_CHAR: BRANCH(string, BCF_HT_STR); break;
165  default: cerr << "Unknown type " << type << endl; exit(1);
166  }
167  #undef BRANCH
168  */
169  }
170  }
171 
172 public:
178  Variant(const uint32_t _nsamples)
179  : nsamples(_nsamples), gti(0), genotypes(_nsamples) {}
183  ~Variant() {}
184 
191  void update_till_info(bcf_hdr_t *header, bcf1_t *record) {
192  seq_name = bcf_hdr_id2name(header, record->rid);
193  pos = record->pos;
194  idx = record->d.id;
195  ref = record->d.allele[0];
196 
197  for (int i = 1; i < record->n_allele; ++i) {
198  char *curr_alt = record->d.allele[i];
199  if (curr_alt[0] != '<')
200  alts.push_back(string(curr_alt));
201  }
202 
203  quality = to_string(record->qual);
204  quality = quality == "nan" ? "." : quality;
205  store_filter(header, record);
206  store_info(header, record);
207  }
208 
216  void add_genotype(const uint8_t a1, const uint8_t a2, const bool phased) {
217  assert(gti < nsamples);
218  genotypes[gti].a1 = a1;
219  genotypes[gti].a2 = a2;
220  genotypes[gti].phased = phased;
221  ++gti;
222  }
223 
227  string get_seqname() const { return seq_name; }
228 
232  int get_pos() const { return pos - 1; }
233 
237  string get_idx() const { return idx; }
238 
242  string get_ref() const { return ref; }
243 
247  vector<string> get_alts() const { return alts; }
248 
252  string get_alt(const int i) const { return alts[i - 1]; }
253 
257  string get_quality() const { return quality; }
258 
265  string get_info(const string &k) const { return info.at(k); }
266 };
267 
268 /*---------- VCF class ----------*/
270 
273 class VCF {
274 private:
276  htsFile *vcf;
278  bcf_hdr_t *vcf_header;
280  bcf1_t *vcf_record;
281 
285  vector<Variant> variants;
287  vector<str_w_l> fmt_lines;
289  size_t block_size;
291  size_t to_parse;
292 
295 
296 public:
304  VCF(char *vcf_path, const int n_threads_, const int block_size_) {
305  vcf = bcf_open(vcf_path, "r");
306  vcf_header = bcf_hdr_read(vcf);
307  vcf_record = bcf_init();
308  vcf_record->max_unpack = BCF_UN_INFO;
309 
310  n_samples = bcf_hdr_nsamples(vcf_header);
311 
312  n_threads = n_threads_;
313 
314  block_size = block_size_;
315  to_parse = 0;
316 
317  // Allocate space for fmt_lines
318  for (size_t i = 0; i < block_size; ++i) {
319  str_w_l s;
320  s.size = 1024 * 10;
321  s.seq = (char *)calloc(s.size, sizeof(char));
322 
323  fmt_lines.push_back(s);
324  }
325  }
329  ~VCF() {
330  for (size_t i = 0; i < block_size; ++i) {
331  free(fmt_lines[i].seq);
332  }
333  bcf_hdr_destroy(vcf_header);
334  bcf_destroy(vcf_record);
335  bcf_close(vcf);
336  }
337 
344  bool parse() {
345  variants.clear();
346  uint32_t i = 0;
347  uint32_t n = block_size;
348  while (i < block_size && bcf_read(vcf, vcf_header, vcf_record) == 0) {
349  // 1. we unpack till INFO field
350  bcf_unpack(vcf_record, BCF_UN_INFO);
351  variants.push_back(Variant(n_samples));
352  variants.back().update_till_info(vcf_header, vcf_record);
353 
354  // 2. we store the format line
355  char *samples = get_samples(vcf->line.s, vcf->line.l + 1);
356  size_t samples_s = vcf->line.l - (samples - vcf->line.s);
357  while (samples_s > fmt_lines[i].size) {
358  fmt_lines[i].size *= 1.6;
359  fmt_lines[i].seq = (char *)realloc(fmt_lines[i].seq, fmt_lines[i].size);
360  }
361  strncpy(fmt_lines[i].seq, samples, samples_s);
362  ++i;
363  }
364  to_parse = i;
365 
366  // 3. we parse and store the genotypes in parallel
367  fill_genotypes();
368 
369  return i == n;
370  }
371 
375  vector<Variant> get_variants() const { return variants; }
376 
377 private:
385  char *get_samples(char *const fmt_line, const size_t len) {
386  /*
387  * We need len since fmt_line contains \0 to separate the first 8
388  * fields (htslib replaces \t with \0)
389  */
390  char *gt_start = fmt_line;
391  char *const gt_end = fmt_line + len;
392 
393  /*
394  * From VCF format specification: "The first sub-field must always
395  * be the genotype (GT) if it is present. There are no required
396  * sub-fields."
397  */
398  for (; gt_start != gt_end && (*gt_start != 'G' || *(gt_start + 1) != 'T' ||
399  *(gt_start + 2) != '\t');
400  ++gt_start)
401  ;
402  return gt_start;
403  }
404 
412  tuple<uint8_t, uint8_t, bool> extract_genotype(char *start) {
413  if (*start == '.')
414  return tuple<uint8_t, uint8_t, bool>(-1, -1, false);
415 
416  uint8_t all1, all2;
417  char *start2 = start;
418 
419  for (; *start2 != '|' && *start2 != '/'; ++start2)
420  ;
421  bool phased = *start2 == '|';
422  *start2 = '\0';
423 
424  all1 = atoi(start);
425  all2 = atoi(start2 + 1);
426 
427  *start2 = '|'; // don't care, just fix
428 
429  return std::move(tuple<uint8_t, uint8_t, bool>(all1, all2, phased));
430  }
431 
438  void fill_variant(const uint32_t var_idx) {
439  assert(var_idx < to_parse);
440  char *samples = fmt_lines[var_idx].seq;
441 
442  // Drop "GT" at the beginning of the line
443  for (; *samples != '\t'; ++samples)
444  ;
445  ++samples;
446 
447  char *start = samples;
448  char *end = start;
449  bool last = *end == '\0';
450  tuple<uint8_t, uint8_t, bool> gt;
451  while (!last) {
452  for (; *end != '\t' && *end != '\0'; ++end)
453  ;
454  last = *end == '\0';
455  *end = '\0';
456  gt = extract_genotype(start);
457  variants[var_idx].add_genotype(get<0>(gt), get<1>(gt), get<2>(gt));
458  *end = last ? '\0' : '\t';
459  start = end + 1;
460  end = start;
461  }
462  // TODO: manage ":"
463  }
464 
469  void fill_genotypes() {
470 #pragma omp parallel for num_threads(n_threads) shared(fmt_lines, variants)
471  for (uint i = 0; i < to_parse; ++i) {
472  fill_variant(i);
473  }
474  }
475 };
476 
477 #endif
void update_till_info(bcf_hdr_t *header, bcf1_t *record)
Definition: ppvcf.hpp:191
GT(uint8_t _a1=0, uint8_t _a2=0, bool _phased=true)
Definition: ppvcf.hpp:49
int n_threads
Number of threads to use.
Definition: ppvcf.hpp:294
string get_alt(const int i) const
Definition: ppvcf.hpp:252
vector< Variant > variants
Variants read.
Definition: ppvcf.hpp:285
~GT()
Definition: ppvcf.hpp:55
htsFile * vcf
htslib file
Definition: ppvcf.hpp:276
size_t block_size
Number of variants to read at each iteration.
Definition: ppvcf.hpp:289
vector< string > get_alts() const
Definition: ppvcf.hpp:247
string idx
Identifier.
Definition: ppvcf.hpp:70
string quality
Phred-scaled quality score.
Definition: ppvcf.hpp:76
void store_info(bcf_hdr_t *header, bcf1_t *record)
Definition: ppvcf.hpp:116
size_t size
cstring size
Definition: ppvcf.hpp:24
vector< GT > genotypes
Genotype fields.
Definition: ppvcf.hpp:87
void add_genotype(const uint8_t a1, const uint8_t a2, const bool phased)
Definition: ppvcf.hpp:216
vector< Variant > get_variants() const
Definition: ppvcf.hpp:375
Class for vcf file.
Definition: ppvcf.hpp:273
bcf_hdr_t * vcf_header
htslib header
Definition: ppvcf.hpp:278
void fill_variant(const uint32_t var_idx)
Definition: ppvcf.hpp:438
char * seq
cstring
Definition: ppvcf.hpp:26
string ref
Reference base(s)
Definition: ppvcf.hpp:72
~Variant()
Definition: ppvcf.hpp:183
string get_info(const string &k) const
Definition: ppvcf.hpp:265
char * get_samples(char *const fmt_line, const size_t len)
Definition: ppvcf.hpp:385
uint32_t nsamples
Total number of samples.
Definition: ppvcf.hpp:83
Class for a single variant.
Definition: ppvcf.hpp:63
bcf1_t * vcf_record
htslib record
Definition: ppvcf.hpp:280
bool phased
Phased flag.
Definition: ppvcf.hpp:40
string filter
Filter status.
Definition: ppvcf.hpp:78
Auxiliary struct.
Definition: ppvcf.hpp:22
uint8_t a1
First allele.
Definition: ppvcf.hpp:36
tuple< uint8_t, uint8_t, bool > extract_genotype(char *start)
Definition: ppvcf.hpp:412
size_t to_parse
Number of variants read in the last iteration.
Definition: ppvcf.hpp:291
void fill_genotypes()
Definition: ppvcf.hpp:469
~VCF()
Definition: ppvcf.hpp:329
uint32_t gti
Genotype counter represeting how many GTs have been stored.
Definition: ppvcf.hpp:85
int get_pos() const
Definition: ppvcf.hpp:232
string get_ref() const
Definition: ppvcf.hpp:242
int pos
Position (0-based)
Definition: ppvcf.hpp:68
bool parse()
Definition: ppvcf.hpp:344
vector< string > alts
Alternate base(s)
Definition: ppvcf.hpp:74
Variant(const uint32_t _nsamples)
Definition: ppvcf.hpp:178
uint8_t a2
Second allele.
Definition: ppvcf.hpp:38
string get_idx() const
Definition: ppvcf.hpp:237
int n_samples
Total number of samples per variant.
Definition: ppvcf.hpp:283
VCF(char *vcf_path, const int n_threads_, const int block_size_)
Definition: ppvcf.hpp:304
void store_filter(bcf_hdr_t *header, bcf1_t *record)
Definition: ppvcf.hpp:95
Struct for a single genotype.
Definition: ppvcf.hpp:34
string get_seqname() const
Definition: ppvcf.hpp:227
vector< str_w_l > fmt_lines
Genotype fields as cstring (from "GT" to the end of each line)
Definition: ppvcf.hpp:287
map< string, string > info
Additional information.
Definition: ppvcf.hpp:80
string seq_name
Chromosome.
Definition: ppvcf.hpp:66
string get_quality() const
Definition: ppvcf.hpp:257