13 #include "htslib/vcf.h" 49 GT(uint8_t _a1 = 0, uint8_t _a2 = 0,
bool _phased =
true) : a1(_a1), a2(_a2) {
50 phased = _phased || a1 == a2;
99 if (record->d.n_flt) {
100 for (
int i = 0; i < record->d.n_flt; ++i) {
103 filter += header->id[BCF_DT_ID][record->d.flt[i]].key;
117 for (
int i = 0; i < record->n_info; ++i) {
118 bcf_info_t *info_field = &record->d.info[i];
119 int key_idx = info_field->key;
120 int type = info_field->type;
121 const char *key = header->id[BCF_DT_ID][key_idx].key;
125 if (type == BCF_BT_NULL) {
126 bool v = bcf_get_info_flag(header, record, key, &v, &ndst);
127 value = to_string(v);
129 }
else if (type == BCF_BT_INT8 || type == BCF_BT_INT16 ||
130 type == BCF_BT_INT32) {
132 bcf_get_info_int32(header, record, key, &v, &ndst);
133 value = to_string(*v);
135 }
else if (type == BCF_BT_FLOAT) {
137 bcf_get_info_float(header, record, key, &v, &ndst);
138 value = to_string(*v);
140 }
else if (type == BCF_BT_CHAR) {
142 bcf_get_info_string(header, record, key, &v, &ndst);
146 cerr <<
"Unknown type " << type <<
" (field " << key <<
")" << endl;
179 : nsamples(_nsamples), gti(0), genotypes(_nsamples) {}
192 seq_name = bcf_hdr_id2name(header, record->rid);
195 ref = record->d.allele[0];
197 for (
int i = 1; i < record->n_allele; ++i) {
198 char *curr_alt = record->d.allele[i];
199 if (curr_alt[0] !=
'<')
200 alts.push_back(
string(curr_alt));
203 quality = to_string(record->qual);
204 quality = quality ==
"nan" ?
"." : quality;
205 store_filter(header, record);
206 store_info(header, record);
216 void add_genotype(
const uint8_t a1,
const uint8_t a2,
const bool phased) {
217 assert(gti < nsamples);
218 genotypes[gti].a1 = a1;
219 genotypes[gti].a2 = a2;
220 genotypes[gti].phased = phased;
252 string get_alt(
const int i)
const {
return alts[i - 1]; }
265 string get_info(
const string &k)
const {
return info.at(k); }
304 VCF(
char *vcf_path,
const int n_threads_,
const int block_size_) {
305 vcf = bcf_open(vcf_path,
"r");
306 vcf_header = bcf_hdr_read(vcf);
307 vcf_record = bcf_init();
308 vcf_record->max_unpack = BCF_UN_INFO;
310 n_samples = bcf_hdr_nsamples(vcf_header);
312 n_threads = n_threads_;
314 block_size = block_size_;
318 for (
size_t i = 0; i < block_size; ++i) {
321 s.
seq = (
char *)calloc(s.
size,
sizeof(
char));
323 fmt_lines.push_back(s);
330 for (
size_t i = 0; i < block_size; ++i) {
331 free(fmt_lines[i].seq);
333 bcf_hdr_destroy(vcf_header);
334 bcf_destroy(vcf_record);
347 uint32_t n = block_size;
348 while (i < block_size && bcf_read(vcf, vcf_header, vcf_record) == 0) {
350 bcf_unpack(vcf_record, BCF_UN_INFO);
351 variants.push_back(
Variant(n_samples));
352 variants.back().update_till_info(vcf_header, vcf_record);
355 char *samples = get_samples(vcf->line.s, vcf->line.l + 1);
356 size_t samples_s = vcf->line.l - (samples - vcf->line.s);
357 while (samples_s > fmt_lines[i].size) {
358 fmt_lines[i].size *= 1.6;
359 fmt_lines[i].seq = (
char *)realloc(fmt_lines[i].seq, fmt_lines[i].size);
361 strncpy(fmt_lines[i].seq, samples, samples_s);
390 char *gt_start = fmt_line;
391 char *
const gt_end = fmt_line + len;
398 for (; gt_start != gt_end && (*gt_start !=
'G' || *(gt_start + 1) !=
'T' ||
399 *(gt_start + 2) !=
'\t');
414 return tuple<uint8_t, uint8_t, bool>(-1, -1,
false);
417 char *start2 = start;
419 for (; *start2 !=
'|' && *start2 !=
'/'; ++start2)
421 bool phased = *start2 ==
'|';
425 all2 = atoi(start2 + 1);
429 return std::move(tuple<uint8_t, uint8_t, bool>(all1, all2, phased));
439 assert(var_idx < to_parse);
440 char *samples = fmt_lines[var_idx].seq;
443 for (; *samples !=
'\t'; ++samples)
447 char *start = samples;
449 bool last = *end ==
'\0';
450 tuple<uint8_t, uint8_t, bool> gt;
452 for (; *end !=
'\t' && *end !=
'\0'; ++end)
456 gt = extract_genotype(start);
457 variants[var_idx].add_genotype(get<0>(gt), get<1>(gt), get<2>(gt));
458 *end = last ?
'\0' :
'\t';
470 #pragma omp parallel for num_threads(n_threads) shared(fmt_lines, variants) 471 for (uint i = 0; i < to_parse; ++i) {
void update_till_info(bcf_hdr_t *header, bcf1_t *record)
Definition: ppvcf.hpp:191
GT(uint8_t _a1=0, uint8_t _a2=0, bool _phased=true)
Definition: ppvcf.hpp:49
int n_threads
Number of threads to use.
Definition: ppvcf.hpp:294
string get_alt(const int i) const
Definition: ppvcf.hpp:252
vector< Variant > variants
Variants read.
Definition: ppvcf.hpp:285
~GT()
Definition: ppvcf.hpp:55
htsFile * vcf
htslib file
Definition: ppvcf.hpp:276
size_t block_size
Number of variants to read at each iteration.
Definition: ppvcf.hpp:289
vector< string > get_alts() const
Definition: ppvcf.hpp:247
string idx
Identifier.
Definition: ppvcf.hpp:70
string quality
Phred-scaled quality score.
Definition: ppvcf.hpp:76
void store_info(bcf_hdr_t *header, bcf1_t *record)
Definition: ppvcf.hpp:116
size_t size
cstring size
Definition: ppvcf.hpp:24
vector< GT > genotypes
Genotype fields.
Definition: ppvcf.hpp:87
void add_genotype(const uint8_t a1, const uint8_t a2, const bool phased)
Definition: ppvcf.hpp:216
vector< Variant > get_variants() const
Definition: ppvcf.hpp:375
Class for vcf file.
Definition: ppvcf.hpp:273
bcf_hdr_t * vcf_header
htslib header
Definition: ppvcf.hpp:278
void fill_variant(const uint32_t var_idx)
Definition: ppvcf.hpp:438
char * seq
cstring
Definition: ppvcf.hpp:26
string ref
Reference base(s)
Definition: ppvcf.hpp:72
~Variant()
Definition: ppvcf.hpp:183
string get_info(const string &k) const
Definition: ppvcf.hpp:265
char * get_samples(char *const fmt_line, const size_t len)
Definition: ppvcf.hpp:385
uint32_t nsamples
Total number of samples.
Definition: ppvcf.hpp:83
Class for a single variant.
Definition: ppvcf.hpp:63
bcf1_t * vcf_record
htslib record
Definition: ppvcf.hpp:280
bool phased
Phased flag.
Definition: ppvcf.hpp:40
string filter
Filter status.
Definition: ppvcf.hpp:78
Auxiliary struct.
Definition: ppvcf.hpp:22
uint8_t a1
First allele.
Definition: ppvcf.hpp:36
tuple< uint8_t, uint8_t, bool > extract_genotype(char *start)
Definition: ppvcf.hpp:412
size_t to_parse
Number of variants read in the last iteration.
Definition: ppvcf.hpp:291
void fill_genotypes()
Definition: ppvcf.hpp:469
~VCF()
Definition: ppvcf.hpp:329
uint32_t gti
Genotype counter represeting how many GTs have been stored.
Definition: ppvcf.hpp:85
int get_pos() const
Definition: ppvcf.hpp:232
string get_ref() const
Definition: ppvcf.hpp:242
int pos
Position (0-based)
Definition: ppvcf.hpp:68
bool parse()
Definition: ppvcf.hpp:344
vector< string > alts
Alternate base(s)
Definition: ppvcf.hpp:74
Variant(const uint32_t _nsamples)
Definition: ppvcf.hpp:178
uint8_t a2
Second allele.
Definition: ppvcf.hpp:38
string get_idx() const
Definition: ppvcf.hpp:237
int n_samples
Total number of samples per variant.
Definition: ppvcf.hpp:283
VCF(char *vcf_path, const int n_threads_, const int block_size_)
Definition: ppvcf.hpp:304
void store_filter(bcf_hdr_t *header, bcf1_t *record)
Definition: ppvcf.hpp:95
Struct for a single genotype.
Definition: ppvcf.hpp:34
string get_seqname() const
Definition: ppvcf.hpp:227
vector< str_w_l > fmt_lines
Genotype fields as cstring (from "GT" to the end of each line)
Definition: ppvcf.hpp:287
map< string, string > info
Additional information.
Definition: ppvcf.hpp:80
string seq_name
Chromosome.
Definition: ppvcf.hpp:66
string get_quality() const
Definition: ppvcf.hpp:257