libStatGen Software 1
bgzf.c
1/* The MIT License
2
3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
4 2011 Attractive Chaos <attractor@live.co.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23*/
24
25#ifdef __ZLIB_AVAILABLE__
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29#include <unistd.h>
30#include <assert.h>
31#include <sys/types.h>
32#include "bgzf.h"
33
34#ifdef _USE_KNETFILE
35#include "knetfile.h"
36typedef knetFile *_bgzf_file_t;
37#define _bgzf_open(fn, mode) knet_open(fn, mode)
38#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
39#define _bgzf_close(fp) knet_close(fp)
40#define _bgzf_fileno(fp) ((fp)->fd)
41#define _bgzf_tell(fp) knet_tell(fp)
42#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
43#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
44#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
45#else // ~defined(_USE_KNETFILE)
46#if defined(_WIN32) || defined(_MSC_VER)
47#define ftello(fp) ftell(fp)
48#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
49#else // ~defined(_WIN32)
50extern off_t ftello(FILE *stream);
51extern int fseeko(FILE *stream, off_t offset, int whence);
52#endif // ~defined(_WIN32)
53typedef FILE *_bgzf_file_t;
54#define _bgzf_open(fn, mode) fopen(fn, mode)
55#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
56#define _bgzf_close(fp) fclose(fp)
57#define _bgzf_fileno(fp) fileno(fp)
58#define _bgzf_tell(fp) ftello(fp)
59#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
60#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
61#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
62#endif // ~define(_USE_KNETFILE)
63
64#define BLOCK_HEADER_LENGTH 18
65#define BLOCK_FOOTER_LENGTH 8
66
67/* BGZF/GZIP header (speciallized from RFC 1952; little endian):
68 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
69 | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
70 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
71*/
72static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
73
74#ifdef BGZF_CACHE
75typedef struct {
76 int size;
77 uint8_t *block;
78 int64_t end_offset;
79} cache_t;
80#include "khash.h"
81KHASH_MAP_INIT_INT64(cache, cache_t)
82#endif
83
84static inline void packInt16(uint8_t *buffer, uint16_t value)
85{
86 buffer[0] = value;
87 buffer[1] = value >> 8;
88}
89
90static inline int unpackInt16(const uint8_t *buffer)
91{
92 return buffer[0] | buffer[1] << 8;
93}
94
95static inline void packInt32(uint8_t *buffer, uint32_t value)
96{
97 buffer[0] = value;
98 buffer[1] = value >> 8;
99 buffer[2] = value >> 16;
100 buffer[3] = value >> 24;
101}
102
103static BGZF *bgzf_read_init()
104{
105 BGZF *fp;
106 fp = calloc(1, sizeof(BGZF));
107 fp->open_mode = 'r';
108 fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
109 fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
110#ifdef BGZF_CACHE
111 fp->cache = kh_init(cache);
112#endif
113 return fp;
114}
115
116static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
117{
118 BGZF *fp;
119 fp = calloc(1, sizeof(BGZF));
120 fp->open_mode = 'w';
121 fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
122 fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
123 fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
124 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
125 return fp;
126}
127// get the compress level from the mode string
128static int mode2level(const char *__restrict mode)
129{
130 int i, compress_level = -1;
131 for (i = 0; mode[i]; ++i)
132 if (mode[i] >= '0' && mode[i] <= '9') break;
133 if (mode[i]) compress_level = (int)mode[i] - '0';
134 if (strchr(mode, 'u')) compress_level = 0;
135 return compress_level;
136}
137
138BGZF *bgzf_open(const char *path, const char *mode)
139{
140 BGZF *fp = 0;
141 if (strchr(mode, 'r') || strchr(mode, 'R')) {
142 _bgzf_file_t fpr;
143 if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
144 fp = bgzf_read_init();
145 fp->fp = fpr;
146 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
147 FILE *fpw;
148 if ((fpw = fopen(path, "w")) == 0) return 0;
149 fp = bgzf_write_init(mode2level(mode));
150 fp->fp = fpw;
151 } else if (strchr(mode, 'a') || strchr(mode, 'A')) {
152 FILE *fpw;
153 if ((fpw = fopen(path, "r+")) == 0) return 0;
154 fp = bgzf_write_init(mode2level(mode));
155 fp->fp = fpw;
156 // Check for trailing EOF block.
157 if(bgzf_check_EOF(fp))
158 {
159 // Overwrite the trailing EOF.
160 _bgzf_seek(fp->fp, -28, SEEK_END);
161 }
162 else
163 {
164 // No trailing EOF block, so go to the end
165 _bgzf_seek(fp->fp, 0, SEEK_END);
166 }
167 }
168 return fp;
169}
170
171BGZF *bgzf_dopen(int fd, const char *mode)
172{
173 BGZF *fp = 0;
174 if (strchr(mode, 'r') || strchr(mode, 'R')) {
175 _bgzf_file_t fpr;
176 if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
177 fp = bgzf_read_init();
178 fp->fp = fpr;
179 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
180 FILE *fpw;
181 if ((fpw = fdopen(fd, "w")) == 0) return 0;
182 fp = bgzf_write_init(mode2level(mode));
183 fp->fp = fpw;
184 }
185 return fp;
186}
187
188// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
189static int deflate_block(BGZF *fp, int block_length)
190{
191 uint8_t *buffer = fp->compressed_block;
192 int buffer_size = BGZF_BLOCK_SIZE;
193 int input_length = block_length;
194 int compressed_length = 0;
195 int remaining;
196 uint32_t crc;
197
198 assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller
199 memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
200 while (1) { // loop to retry for blocks that do not compress enough
201 int status;
202 z_stream zs;
203 zs.zalloc = NULL;
204 zs.zfree = NULL;
205 zs.next_in = fp->uncompressed_block;
206 zs.avail_in = input_length;
207 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
208 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
209 status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer
210 if (status != Z_OK) {
211 fp->errcode |= BGZF_ERR_ZLIB;
212 return -1;
213 }
214 status = deflate(&zs, Z_FINISH);
215 if (status != Z_STREAM_END) { // not compressed enough
216 deflateEnd(&zs); // reset the stream
217 if (status == Z_OK) { // reduce the size and recompress
218 input_length -= 1024;
219 assert(input_length > 0); // logically, this should not happen
220 continue;
221 }
222 fp->errcode |= BGZF_ERR_ZLIB;
223 return -1;
224 }
225 if (deflateEnd(&zs) != Z_OK) {
226 fp->errcode |= BGZF_ERR_ZLIB;
227 return -1;
228 }
229 compressed_length = zs.total_out;
230 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
231 assert(compressed_length <= BGZF_BLOCK_SIZE);
232 break;
233 }
234
235 assert(compressed_length > 0);
236 packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes
237 crc = crc32(0L, NULL, 0L);
238 crc = crc32(crc, fp->uncompressed_block, input_length);
239 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
240 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
241
242 remaining = block_length - input_length;
243 if (remaining > 0) {
244 assert(remaining <= input_length);
245 memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining);
246 }
247 fp->block_offset = remaining;
248 return compressed_length;
249}
250
251// Inflate the block in fp->compressed_block into fp->uncompressed_block
252static int inflate_block(BGZF* fp, int block_length)
253{
254 z_stream zs;
255 zs.zalloc = NULL;
256 zs.zfree = NULL;
257 zs.next_in = fp->compressed_block + 18;
258 zs.avail_in = block_length - 16;
259 zs.next_out = fp->uncompressed_block;
260 zs.avail_out = BGZF_BLOCK_SIZE;
261
262 if (inflateInit2(&zs, -15) != Z_OK) {
263 fp->errcode |= BGZF_ERR_ZLIB;
264 return -1;
265 }
266 if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
267 inflateEnd(&zs);
268 fp->errcode |= BGZF_ERR_ZLIB;
269 return -1;
270 }
271 if (inflateEnd(&zs) != Z_OK) {
272 fp->errcode |= BGZF_ERR_ZLIB;
273 return -1;
274 }
275 return zs.total_out;
276}
277
278static int check_header(const uint8_t *header)
279{
280 return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0
281 && unpackInt16((uint8_t*)&header[10]) == 6
282 && header[12] == 'B' && header[13] == 'C'
283 && unpackInt16((uint8_t*)&header[14]) == 2);
284}
285
286#ifdef BGZF_CACHE
287static void free_cache(BGZF *fp)
288{
289 khint_t k;
290 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
291 if (fp->open_mode != 'r') return;
292 for (k = kh_begin(h); k < kh_end(h); ++k)
293 if (kh_exist(h, k)) free(kh_val(h, k).block);
294 kh_destroy(cache, h);
295}
296
297static int load_block_from_cache(BGZF *fp, int64_t block_address)
298{
299 khint_t k;
300 cache_t *p;
301 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
302 k = kh_get(cache, h, block_address);
303 if (k == kh_end(h)) return 0;
304 p = &kh_val(h, k);
305 if (fp->block_length != 0) fp->block_offset = 0;
306 fp->block_address = block_address;
307 fp->block_length = p->size;
308 memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE);
309 _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET);
310 return p->size;
311}
312
313static void cache_block(BGZF *fp, int size)
314{
315 int ret;
316 khint_t k;
317 cache_t *p;
318 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
319 if (BGZF_BLOCK_SIZE >= fp->cache_size) return;
320 if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) {
321 /* A better way would be to remove the oldest block in the
322 * cache, but here we remove a random one for simplicity. This
323 * should not have a big impact on performance. */
324 for (k = kh_begin(h); k < kh_end(h); ++k)
325 if (kh_exist(h, k)) break;
326 if (k < kh_end(h)) {
327 free(kh_val(h, k).block);
328 kh_del(cache, h, k);
329 }
330 }
331 k = kh_put(cache, h, fp->block_address, &ret);
332 if (ret == 0) return; // if this happens, a bug!
333 p = &kh_val(h, k);
334 p->size = fp->block_length;
335 p->end_offset = fp->block_address + size;
336 p->block = malloc(BGZF_BLOCK_SIZE);
337 memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE);
338}
339#else
340static void free_cache(BGZF *fp) {}
341static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
342static void cache_block(BGZF *fp, int size) {}
343#endif
344
345int bgzf_read_block(BGZF *fp)
346{
347 uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
348 int count, size = 0, block_length, remaining;
349 int64_t block_address;
350 block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
351 if (load_block_from_cache(fp, block_address)) return 0;
352 count = _bgzf_read(fp->fp, header, sizeof(header));
353 if (count == 0) { // no data read
354 fp->block_length = 0;
355 return 0;
356 }
357 if (count != sizeof(header) || !check_header(header)) {
358 fp->errcode |= BGZF_ERR_HEADER;
359 return -1;
360 }
361 size = count;
362 block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
363 compressed_block = (uint8_t*)fp->compressed_block;
364 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
365 remaining = block_length - BLOCK_HEADER_LENGTH;
366 count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
367 if (count != remaining) {
368 fp->errcode |= BGZF_ERR_IO;
369 return -1;
370 }
371 size += count;
372 if ((count = inflate_block(fp, block_length)) < 0) return -1;
373 if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
374 fp->block_address = block_address;
375 fp->block_length = count;
376 cache_block(fp, size);
377 return 0;
378}
379
380ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
381{
382 ssize_t bytes_read = 0;
383 uint8_t *output = data;
384 if (length <= 0) return 0;
385 assert(fp->open_mode == 'r');
386 while (bytes_read < length) {
387 int copy_length, available = fp->block_length - fp->block_offset;
388 uint8_t *buffer;
389 if (available <= 0) {
390 if (bgzf_read_block(fp) != 0) return -1;
391 available = fp->block_length - fp->block_offset;
392 if (available <= 0) break;
393 }
394 copy_length = length - bytes_read < available? length - bytes_read : available;
395 buffer = fp->uncompressed_block;
396 memcpy(output, buffer + fp->block_offset, copy_length);
397 fp->block_offset += copy_length;
398 output += copy_length;
399 bytes_read += copy_length;
400 }
401 if (fp->block_offset == fp->block_length) {
402 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
403 fp->block_offset = fp->block_length = 0;
404 }
405 return bytes_read;
406}
407
408int bgzf_flush(BGZF *fp)
409{
410 assert(fp->open_mode == 'w');
411 while (fp->block_offset > 0) {
412 int block_length;
413 block_length = deflate_block(fp, fp->block_offset);
414 if (block_length < 0) return -1;
415 if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
416 fp->errcode |= BGZF_ERR_IO; // possibly truncated file
417 return -1;
418 }
419 fp->block_address += block_length;
420 }
421 return 0;
422}
423
424int bgzf_flush_try(BGZF *fp, ssize_t size)
425{
426 if (fp->block_offset + size > BGZF_BLOCK_SIZE)
427 return bgzf_flush(fp);
428 return -1;
429}
430
431ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
432{
433 const uint8_t *input = data;
434 int block_length = BGZF_BLOCK_SIZE, bytes_written;
435 assert(fp->open_mode == 'w');
436 input = data;
437 bytes_written = 0;
438 while (bytes_written < length) {
439 uint8_t* buffer = fp->uncompressed_block;
440 int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
441 memcpy(buffer + fp->block_offset, input, copy_length);
442 fp->block_offset += copy_length;
443 input += copy_length;
444 bytes_written += copy_length;
445 if (fp->block_offset == block_length && bgzf_flush(fp)) break;
446 }
447 return bytes_written;
448}
449
450int bgzf_close(BGZF* fp)
451{
452 int ret, count, block_length;
453 if (fp == 0) return -1;
454 if (fp->open_mode == 'w') {
455 if (bgzf_flush(fp) != 0) return -1;
456 block_length = deflate_block(fp, 0); // write an empty block
457 count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
458 if(count != 0)
459 {
460 // Something was written
461 }
462 if (fflush(fp->fp) != 0) {
463 fp->errcode |= BGZF_ERR_IO;
464 return -1;
465 }
466 }
467 ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp);
468 if (ret != 0) return -1;
469 free(fp->uncompressed_block);
470 free(fp->compressed_block);
471 free_cache(fp);
472 free(fp);
473 return 0;
474}
475
476void bgzf_set_cache_size(BGZF *fp, int cache_size)
477{
478 if (fp) fp->cache_size = cache_size;
479}
480
481int bgzf_check_EOF(BGZF *fp)
482{
483 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
484 // Last 28 bytes of an uncompressed bgzf file which are different
485 // from the last 28 bytes of compressed bgzf files.
486 static uint8_t magic2[28] = "\4\0\0\0\0\0\377\6\0\102\103\2\0\036\0\1\0\0\377\377\0\0\0\0\0\0\0\0";
487 uint8_t buf[28];
488 off_t offset;
489 offset = _bgzf_tell((_bgzf_file_t)fp->fp);
490 if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
491 int count = _bgzf_read(fp->fp, buf, 28);
492 if(count != 28)
493 {
494 fp->errcode |= BGZF_ERR_IO; // possibly truncated file
495 return(0);
496 }
497 _bgzf_seek(fp->fp, offset, SEEK_SET);
498 if((memcmp(magic, buf, 28) == 0) || (memcmp(magic2, buf, 28) == 0))
499 {
500 return(1);
501 }
502 return(0);
503}
504
505int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
506{
507 int block_offset;
508 int64_t block_address;
509
510 if (fp->open_mode != 'r' || where != SEEK_SET) {
511 fp->errcode |= BGZF_ERR_MISUSE;
512 return -1;
513 }
514 block_offset = pos & 0xFFFF;
515 block_address = pos >> 16;
516 if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
517 fp->errcode |= BGZF_ERR_IO;
518 return -1;
519 }
520 fp->block_length = 0; // indicates current block has not been loaded
521 fp->block_address = block_address;
522 fp->block_offset = block_offset;
523 return 0;
524}
525
526int bgzf_is_bgzf(const char *fn)
527{
528 uint8_t buf[16];
529 int n;
530 _bgzf_file_t fp;
531 if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
532 n = _bgzf_read(fp, buf, 16);
533 _bgzf_close(fp);
534 if (n != 16) return 0;
535 return memcmp(g_magic, buf, 16) == 0? 1 : 0;
536}
537
538int bgzf_getc(BGZF *fp)
539{
540 int c;
541 if (fp->block_offset >= fp->block_length) {
542 if (bgzf_read_block(fp) != 0) return -2; /* error */
543 if (fp->block_length == 0) return -1; /* end-of-file */
544 }
545 c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
546 if (fp->block_offset == fp->block_length) {
547 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
548 fp->block_offset = 0;
549 fp->block_length = 0;
550 }
551 return c;
552}
553
554#ifndef kroundup32
555#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
556#endif
557
558int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
559{
560 int l, state = 0;
561 unsigned char *buf = (unsigned char*)fp->uncompressed_block;
562 str->l = 0;
563 do {
564 if (fp->block_offset >= fp->block_length) {
565 if (bgzf_read_block(fp) != 0) { state = -2; break; }
566 if (fp->block_length == 0) { state = -1; break; }
567 }
568 for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
569 if (l < fp->block_length) state = 1;
570 l -= fp->block_offset;
571 if (str->l + l + 1 >= str->m) {
572 str->m = str->l + l + 2;
573 kroundup32(str->m);
574 str->s = (char*)realloc(str->s, str->m);
575 }
576 memcpy(str->s + str->l, buf + fp->block_offset, l);
577 str->l += l;
578 fp->block_offset += l + 1;
579 if (fp->block_offset >= fp->block_length) {
580 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
581 fp->block_offset = 0;
582 fp->block_length = 0;
583 }
584 } while (state == 0);
585 if (str->l == 0 && state < 0) return state;
586 str->s[str->l] = 0;
587 return str->l;
588}
589#endif
Definition: bgzf.h:44