libStatGen Software 1
GenomeSequence.cpp
1/*
2 * Copyright (C) 2010-2012 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include "assert.h"
19#include "ctype.h"
20#include "stdio.h"
21#include "Error.h"
22
23
24#include "Generic.h"
25#include "GenomeSequence.h"
26
27#include <algorithm>
28#include <istream>
29#include <fstream>
30#include <sstream>
31#include <stdexcept>
32
33#if defined(_WIN32)
34#include <io.h>
35#ifndef R_OK
36#define R_OK 4
37#endif
38#endif
39
40// not general use:
41#include "CSG_MD5.h"
42
43//
44// given a read in a string, pack it into the vector of
45// bytes coded as two bases per byte.
46//
47// The goal is to allow us to more rapidly compare against
48// the genome, which is itself packed 2 bases per byte.
49//
50// Unfortunately, the match position may be odd or even,
51// so as a result, we also need to be able to prepad
52// with 'N' bases to get the byte alignment the same, so
53// padWithNCount may be a positive number indicating how
54// many N bases to prepend.
55//
56void PackedRead::set(const char *rhs, int padWithNCount)
57{
58 clear();
59
60 // pad this packed read with 'N' bases two at a time
61 while (padWithNCount>1)
62 {
63 packedBases.push_back(
64 BaseAsciiMap::base2int[(int) 'N'] << 4 |
65 BaseAsciiMap::base2int[(int) 'N']
66 );
67 padWithNCount -= 2;
68 length+=2;
69 }
70
71 // when we have only one base, pack one 'N' base with
72 // the first base in rhs if there is one.
73 if (padWithNCount)
74 {
75 // NB: *rhs could be NUL, which is ok here - just keep
76 // the length straight.
77 packedBases.push_back(
78 BaseAsciiMap::base2int[(int) *rhs] << 4 |
79 BaseAsciiMap::base2int[(int) 'N']
80 );
81 // two cases - have characters in rhs or we don't:
82 if (*rhs)
83 {
84 length+=2; // pad byte plus one byte from rhs
85 rhs++;
86 }
87 else
88 {
89 length++;
90 }
91 padWithNCount--; // should now be zero, so superfluous.
92 assert(padWithNCount==0);
93 }
94
95 // pad pairs of bases from rhs, two at a time:
96 while (*rhs && *(rhs+1))
97 {
98 packedBases.push_back(
99 BaseAsciiMap::base2int[(int) *(rhs+1)] << 4 |
100 BaseAsciiMap::base2int[(int) *(rhs+0)]
101 );
102 rhs+=2;
103 length+=2;
104 }
105
106 // if there is an odd base left at the end, put it
107 // in a byte all its own (low 4 bits == 0):
108 if (*rhs)
109 {
110 packedBases.push_back(
111 BaseAsciiMap::base2int[(int) *(rhs+0)]
112 );
113 length++;
114 }
115 return;
116}
117
118std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const
119{
120 std::string sequence("");
121 for (unsigned int i = 0; i < wordsize; i ++)
122 sequence += "N";
123
124 unsigned int clearHigherBits = ~(3U << (wordsize<<1)); // XXX refactor - this appears several places
125
126 if (n > clearHigherBits)
127 error("%d needs to be a non-negative integer < clearHigherBits\n", n);
128
129 for (unsigned int i = 0; i < wordsize; i++)
130 {
131 sequence[wordsize-1-i] = BaseAsciiMap::int2base[n & 3];
132 n >>= 2;
133 }
134 return sequence;
135}
136
137
138
140{
141 constructorClear();
142}
143
144void GenomeSequence::constructorClear()
145{
146 _debugFlag = 0;
147 _progressStream = NULL;
148 _colorSpace = false;
149 _createOverwrite = false;
150}
151
152void GenomeSequence::setup(const char *referenceFilename)
153{
154 setReferenceName(referenceFilename);
155
156 if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
157
158 if (open(false))
159 {
160 std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
161 std::cerr << errorStr << std::endl;
162 exit(1);
163 }
164
165 prefetch();
166 if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
167}
168
170{
171 // free up resources:
172 _umfaFile.close();
173}
174
175//
176// mapped open.
177//
178// if the file exists, map in into memory, and fill in a few useful
179// fields.
180//
181
182bool GenomeSequence::open(bool isColorSpace, int flags)
183{
184 bool rc;
185
186 if (isColorSpace)
187 {
188 _umfaFilename = _baseFilename + "-cs.umfa";
189 }
190 else
191 {
192 _umfaFilename = _baseFilename + "-bs.umfa";
193 }
194
195 if(access(_umfaFilename.c_str(), R_OK) != 0)
196 {
197 // umfa file doesn't exist, so try to create it.
198 if(create(isColorSpace))
199 {
200 // Couldon't access or create the umfa.
201 std::cerr << "GenomeSequence::open: failed to open file "
202 << _umfaFilename
203 << " also failed creating it."
204 << std::endl;
205 return true;
206 }
207 }
208
209 rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
210 if (rc)
211 {
212 std::cerr << "GenomeSequence::open: failed to open file "
213 << _umfaFilename
214 << std::endl;
215 return true;
216 }
217
218 _colorSpace = header->_colorSpace;
219
220 return false;
221}
222
223void GenomeSequence::sanityCheck(MemoryMap &fasta) const
224{
225 unsigned int i;
226
227 unsigned int genomeIndex = 0;
228 for (i=0; i<fasta.length(); i++)
229 {
230 switch (fasta[i])
231 {
232 case '>':
233 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
234 break;
235 case '\n':
236 case '\r':
237 break;
238 default:
239 assert(BaseAsciiMap::base2int[(int)(*this)[genomeIndex]] ==
240 BaseAsciiMap::base2int[(int) fasta[i]]);
241 genomeIndex++;
242 break;
243 }
244 }
245}
246
247#define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
248//
249// referenceFilename is either a fasta or a UM fasta (.fa or .umfa)
250// filename. In both cases, the suffix gets removed and the
251// base name is kept for later use depending on context.
252// @return always return false
253//
254bool GenomeSequence::setReferenceName(std::string referenceFilename)
255{
256
257 if (HAS_SUFFIX(referenceFilename, ".fa"))
258 {
259 _referenceFilename = referenceFilename;
260 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
261 }
262 else if (HAS_SUFFIX(referenceFilename, ".umfa"))
263 {
264 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
265 }
266 else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
267 {
268 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
269 }
270 else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
271 {
272 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
273 }
274 else
275 {
276 _baseFilename = referenceFilename;
277 }
278 _fastaFilename = _baseFilename + ".fa";
279
280 if (HAS_SUFFIX(referenceFilename, ".fasta"))
281 {
282 _referenceFilename = referenceFilename;
283 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6);
284 _fastaFilename = _baseFilename + ".fasta";
285 }
286
287 return false;
288}
289
290//
291// this works in lockstep with ::create to populate
292// the per chromosome header fields size and md5
293// checksum.
294//
295// It relies on header->elementCount being set to
296// the data length loaded so far ... not the ultimate
297// reference length.
298//
299bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
300{
301 if (whichChromosome>=header->_chromosomeCount) return true;
302
303 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
304 c->size = header->elementCount - c->start;
305
306 MD5_CTX md5Context;
307 uint8_t md5Signature[MD5_DIGEST_LENGTH];
308
309 //
310 // it's easier to ensure we do this right if we just do it
311 // in one big chunk:
312 //
313 char *md5Buffer = (char *) malloc(c->size);
314
315 MD5Init(&md5Context);
316
317 for (genomeIndex_t i = 0; i < c->size; i ++)
318 {
319 md5Buffer[i] = (*this)[c->start + i];
320 }
321 MD5Update(&md5Context, (unsigned char *) md5Buffer, c->size);
322 MD5Final((unsigned char *) &md5Signature, &md5Context);
323 free(md5Buffer);
324 for (int i=0; i<MD5_DIGEST_LENGTH; i++)
325 {
326 // cheesy but works:
327 sprintf(c->md5+2*i, "%02x", md5Signature[i]);
328 }
329 // redundant, strictly speaking due to sprintf NUL terminating
330 // it's output strings, but put it here anyway.
331 c->md5[2*MD5_DIGEST_LENGTH] = '\0';
332
333 return false;
334}
335
336//
337// Given a buffer with a fasta format contents, count
338// the number of chromosomes in it and return that value.
339//
340static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
341{
342 chromosomeCount = 0;
343 baseCount = 0;
344 bool atLineStart = true;
345
346 //
347 // loop over the fasta file, essentially matching for the
348 // pattern '^>.*$' and counting them.
349 //
350 for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
351 {
352 switch (fastaData[fastaIndex])
353 {
354 case '\n':
355 case '\r':
356 atLineStart = true;
357 break;
358 case '>':
359 {
360 if (!atLineStart) break;
361 chromosomeCount++;
362 //
363 // eat the rest of the line
364 //
365 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
366 {
367 fastaIndex++;
368 }
369 break;
370 }
371 default:
372 baseCount++;
373 atLineStart = false;
374 break;
375 }
376
377 }
378 return false;
379}
380
381class PackedSequenceData : public std::vector<uint8_t>
382{
383 std::vector<uint8_t> m_packedBases;
384
385 size_t m_baseCount;
386
387 void set(size_t index, uint8_t value) {
388 m_packedBases[index>>1] =
389 (m_packedBases[index>>1] // original value
390 & ~(7<<((index&0x01)<<2))) // logical AND off the original value
391 | ((value&0x0f)<<((index&0x1)<<2)); // logical OR in the new value
392 }
393
394public:
395
396 void reserve(size_t baseCount) {m_packedBases.reserve(baseCount/2);}
397 size_t size() {return m_baseCount;}
398 void clear() {m_packedBases.clear(); m_baseCount = 0;}
399 uint8_t operator [](size_t index)
400 {
401 return (m_packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
402 }
403 void push_back(uint8_t base);
404};
405
406//
407// Load a fasta format file from filename into the buffer
408// provided by the caller.
409// While parsing the fasta file, record each chromosome name,
410// its start location, and its size.
411//
412// NB: the caller must implement the logic to determine how
413// large the sequence data is. There is no correct way to do
414// this, because we can't reliably estimate here how much sequence
415// data is contained in a compressed file.
416//
417// To safely pre-allocate space in sequenceData, use the reserve() method
418// before calling this function.
419//
420bool loadFastaFile(const char *filename,
421 std::vector<PackedSequenceData> &sequenceData,
422 std::vector<std::string> &chromosomeNames)
423{
424 InputFile inputStream(filename, "r", InputFile::DEFAULT);
425
426 if(!inputStream.isOpen()) {
427 std::cerr << "Failed to open file " << filename << "\n";
428 return true;
429 }
430
431 int whichChromosome = -1;
432 chromosomeNames.clear();
433
434 char ch;
435 while((ch = inputStream.ifgetc()) != EOF) {
436 switch (ch)
437 {
438 case '\n':
439 case '\r':
440 break;
441 case '>':
442 {
443 std::string chromosomeName = "";
444 //
445 // pull out the chromosome new name
446 //
447 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
448 {
449 chromosomeName += ch; // slow, but who cares
450 }
451 //
452 // eat the rest of the line
453 //
454 do {
455 ch = inputStream.ifgetc();
456 } while(ch != EOF && ch != '\n' && ch != '\r');
457 //
458 // save the Chromosome name and index into our
459 // header so we can use them later.
460 //
461 chromosomeNames.push_back(chromosomeName);
462
463 whichChromosome++;
464
465 break;
466 }
467 default:
468 // we get here for sequence data.
469 //
470 // save the base value
471 // Note: invalid characters come here as well, but we
472 // let ::set deal with mapping them.
473 break;
474 }
475 }
476 return false;
477}
478
479//
480// recreate the umfa file from a reference fasta format file
481//
482// The general format of a FASTA file is best described
483// on wikipedia at http://en.wikipedia.org/wiki/FASTA_format
484//
485// The format parsed here is a simpler subset, and is
486// described here http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
487//
488bool GenomeSequence::create(bool isColor)
489{
490 setColorSpace(isColor);
491
492 if (_baseFilename=="")
493 {
494 std::cerr << "Base reference filename is empty." << std::endl;
495 return true;
496 }
497
498 if (isColorSpace())
499 {
500 _umfaFilename = _baseFilename + "-cs.umfa";
501 }
502 else
503 {
504 _umfaFilename = _baseFilename + "-bs.umfa";
505 }
506
507 if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0)
508 {
509 std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
510 return true;
511 }
512
513 MemoryMap fastaFile;
514
515 if (fastaFile.open(_fastaFilename.c_str()))
516 {
517 std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
518 return true;
519 }
520
521 std::cerr << "Creating FASTA "
522 << (isColorSpace() ? "color space " : "")
523 << "binary cache file '"
524 << _umfaFilename
525 << "'."
526 << std::endl;
527
528 std::cerr << std::flush;
529
530 //
531 // simple ptr to fasta data -- just treat the memory map
532 // as an array of fastaDataSize characters...
533 //
534 const char *fasta = (const char *) fastaFile.data;
535 size_t fastaDataSize = fastaFile.length();
536
537 uint32_t chromosomeCount = 0;
538 uint64_t baseCount = 0;
539 getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
540
541 if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
542 {
543 std::cerr << "failed to create '"
544 << _umfaFilename
545 << "'."
546 << std::endl;
547 perror("");
548 return true;
549 }
550 header->elementCount = 0;
551 header->_colorSpace = isColorSpace();
552 header->setApplication(_application.c_str());
553 header->_chromosomeCount = chromosomeCount;
554 //
555 // clear out the variable length chromosome info array
556 //
557 for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
558
559 std::string chromosomeName;
560
561 //
562 // for converting the reference to colorspace, the first base is always 5 (in base space it is 'N')
563 signed char lastBase = BaseAsciiMap::base2int[(int) 'N'];
564 bool terminateLoad = false;
565 int percent = -1, newPercent;
566 uint32_t whichChromosome = 0;
567 for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
568 {
569 if (_progressStream)
570 {
571 newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
572 if (newPercent>percent)
573 {
574 *_progressStream << "\r" << newPercent << "% ";
575 *_progressStream << std::flush;
576 percent = newPercent;
577 }
578 }
579 switch (fasta[fastaIndex])
580 {
581 case '\n':
582 case '\r':
583 break;
584 case '>':
585 {
586 chromosomeName = "";
587 fastaIndex++; // skip the > char
588 //
589 // pull out the chromosome new name
590 //
591 while (!isspace(fasta[fastaIndex]))
592 {
593 chromosomeName += fasta[fastaIndex++]; // slow, but who cares
594 }
595 //
596 // eat the rest of the line
597 //
598 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
599 {
600 fastaIndex++;
601 }
602 //
603 // save the Chromosome name and index into our
604 // header so we can use them later.
605 //
606 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
607 c->setChromosomeName(chromosomeName.c_str());
608 c->start = header->elementCount;
609 // c->size gets computed at the next '>' line or at the EOF
610
611 if (whichChromosome>0)
612 {
613 //
614 // compute md5 checksum for the chromosome that we just
615 // loaded (if there was one) - note that on the last
616 // chromosome, we have to duplicate this code after
617 // the end of this loop
618 //
619 setChromosomeMD5andLength(whichChromosome - 1);
620 }
621 whichChromosome++;
622 if (whichChromosome > header->_chromosomeCount)
623 {
624 std::cerr << "BUG: Exceeded computed chromosome count ("
625 << header->_chromosomeCount
626 << ") - genome is now truncated at chromosome "
627 << header->_chromosomes[header->_chromosomeCount-1].name
628 << " (index "
629 << header->_chromosomeCount
630 << ")."
631 << std::endl;
632 terminateLoad = true;
633 }
634 break;
635 }
636 default:
637 // save the base pair value
638 // Note: invalid characters come here as well, but we
639 // let ::set deal with mapping them.
640 if (isColorSpace())
641 {
642//
643// anything outside these values represents an invalid base
644// base codes: 0-> A, 1-> C, 2-> G, 3-> T
645// colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
646//
647 const char fromBase2CS[] =
648 {
649 /* 0000 */ 0, // A->A
650 /* 0001 */ 1, // A->C
651 /* 0010 */ 2, // A->G
652 /* 0011 */ 3, // A->T
653 /* 0100 */ 1, // C->A
654 /* 0101 */ 0, // C->C
655 /* 0110 */ 3, // C->G
656 /* 0111 */ 2, // C->T
657 /* 1000 */ 2, // G->A
658 /* 1001 */ 3, // G->C
659 /* 1010 */ 0, // G->G
660 /* 1011 */ 1, // G->T
661 /* 1100 */ 3, // T->A
662 /* 1101 */ 2, // T->C
663 /* 1110 */ 1, // T->G
664 /* 1111 */ 0, // T->T
665 };
666 //
667 // we are writing color space values on transitions,
668 // so we don't write a colorspace value when we
669 // get the first base value.
670 //
671 // On second and subsequent bases, write based on
672 // the index table above
673 //
674 char thisBase =
675 BaseAsciiMap::base2int[(int)(fasta[fastaIndex])];
676 if (lastBase>=0)
677 {
678 char color;
679 if (lastBase>3 || thisBase>3) color=4;
680 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
681 // re-use the int to base, because ::set expects a base char (ATCG), not
682 // a color code (0123). It should only matter on final output.
683 set(header->elementCount++,
684 BaseAsciiMap::int2base[(int) color]);
685 }
686 lastBase = thisBase;
687 }
688 else
689 {
690 set(header->elementCount++, toupper(fasta[fastaIndex]));
691 }
692 break;
693 }
694
695 //
696 // slightly awkward exit handling when we exceed the fixed
697 // number of chromosomes
698 //
699 if (terminateLoad) break;
700 }
701
702 //
703 // also slightly awkward code to handle the last dangling chromosome...
704 // all we should need to do is compute the md5 checksum
705 //
706 if (whichChromosome==0)
707 {
708 fastaFile.close();
709 throw std::runtime_error("No chromosomes found - aborting!");
710 }
711 else
712 {
713 setChromosomeMD5andLength(whichChromosome-1);
714 }
715
716 fastaFile.close();
717
718 if (_progressStream) *_progressStream << "\r";
719
720 std::cerr << "FASTA binary cache file '"
721 << _umfaFilename
722 << "' created."
723 << std::endl;
724
725 //
726 // leave the umfastaFile open in case caller wants to use it
727 //
728 return false;
729}
730
732{
733 return header->_chromosomeCount;
734}
735
736//return chromosome index: 0, 1, ... 24;
737int GenomeSequence::getChromosome(genomeIndex_t position) const
738{
739 if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
740
741 if (header->_chromosomeCount == 0)
742 return INVALID_CHROMOSOME_INDEX;
743
744 int start = 0;
745 int stop = header->_chromosomeCount - 1;
746
747 // eliminate case where position is in the last chromosome, since the loop
748 // below falls off the end of the list if it in the last one.
749
750 if (position > header->_chromosomes[stop].start)
751 return (stop);
752
753 while (start <= stop)
754 {
755 int middle = (start + stop) / 2;
756
757 if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
758 return middle;
759
760 if (position == header->_chromosomes[middle + 1].start)
761 return (middle + 1);
762
763 if (position > header->_chromosomes[middle + 1].start)
764 start = middle + 1;
765
766 if (position < header->_chromosomes[middle].start)
767 stop = middle - 1;
768 }
769
770 return -1;
771}
772
773//
774// Given a chromosome name and 1-based chromosome index, return the
775// genome index (0 based) into sequence for it.
776//
777// NB: the header->chromosomes array contains zero based genome positions
778//
780 const char *chromosomeName,
781 unsigned int chromosomeIndex) const
782{
783 genomeIndex_t i = getGenomePosition(chromosomeName);
784 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
785 return i + chromosomeIndex - 1;
786}
787
789 int chromosome,
790 unsigned int chromosomeIndex) const
791{
792 if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
793
794 genomeIndex_t i = header->_chromosomes[chromosome].start;
795 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
796 return i + chromosomeIndex - 1;
797}
798
799//
800// return the genome index (0 based) of the start of the named
801// chromosome. If none is found, INVALID_GENOME_INDEX is returned.
802//
803// XXX may need to speed this up - and smarten it up with some
804// modest chromosome name parsing.... e.g. '%d/X/Y' or 'chr%d/chrX/chrY' or
805// other schemes.
806//
807genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const
808{
809 int chromosome = getChromosome(chromosomeName);
810 if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
811 return header->_chromosomes[chromosome].start;
812}
813
814int GenomeSequence::getChromosome(const char *chromosomeName) const
815{
816 unsigned int i;
817 for (i=0; i<header->_chromosomeCount; i++)
818 {
819 if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
820 {
821 return i;
822 }
823 }
824 return INVALID_CHROMOSOME_INDEX;
825}
826
827//
828// Given a read, reverse the string and swap the base
829// pairs for the reverse strand equivalents.
830//
831void GenomeSequence::getReverseRead(std::string &read)
832{
833 std::string newRead;
834 if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
835 {
836 newRead.push_back(BasePair(read[i]));
837 }
838 read = newRead;
839}
840
841void GenomeSequence::getReverseRead(String& read)
842{
843 int i = 0;
844 int j = read.Length()-1;
845 char temp;
846 while (i < j)
847 {
848 temp = read[j];
849 read[j] = read[i];
850 read[i] = temp;
851 }
852}
853
854#define ABS(x) ( (x) > 0 ? (x) : -(x) )
855int GenomeSequence::debugPrintReadValidation(
856 std::string &read,
857 std::string &quality,
858 char direction,
859 genomeIndex_t readLocation,
860 int sumQuality,
861 int mismatchCount,
862 bool recurse
863)
864{
865 int validateSumQ = 0;
866 int validateMismatchCount = 0;
867 int rc = 0;
868 std::string genomeData;
869
870 for (uint32_t i=0; i<read.size(); i++)
871 {
872 if (tolower(read[i]) != tolower((*this)[readLocation + i]))
873 {
874 validateSumQ += quality[i] - '!';
875 // XXX no longer valid:
876 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
877 genomeData.push_back(tolower((*this)[readLocation + i]));
878 }
879 else
880 {
881 genomeData.push_back(toupper((*this)[readLocation + i]));
882 }
883 }
884 assert(validateSumQ>=0);
885 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
886 {
887 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
888 genomeData.c_str(),
889 read.c_str(),
890 validateSumQ,
891 sumQuality
892 );
893 rc++;
894 }
895 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
896 {
897 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
898 genomeData.c_str(),
899 read.c_str(),
900 validateMismatchCount,
901 mismatchCount
902 );
903 rc++;
904 }
905 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
906 {
907 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
908 genomeData.c_str(),
909 read.c_str(),
910 validateSumQ,
911 sumQuality,
912 validateMismatchCount,
913 mismatchCount
914 );
915 rc++;
916 }
917
918 if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
919 {
920 printf("large mismatch difference, trying reverse strand: ");
921 std::string reverseRead = read;
922 std::string reverseQuality = quality;
923 getReverseRead(reverseRead);
924 reverse(reverseQuality.begin(), reverseQuality.end());
925 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
926 }
927 return rc;
928}
929#undef ABS
930
931
932bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const
933{
934 for (uint32_t i = 0; i<word.size(); i++)
935 {
936 if ((*this)[index + i] != word[i]) return false;
937 }
938 return true;
939}
940
941bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word) const
942{
943 for (unsigned int i = index - deviation; i < index + deviation; i++)
944 {
945 if (wordMatch(i, word))
946 {
947 std::cerr << "word '"
948 << word
949 << "' found "
950 << i - index
951 << " away from position "
952 << index
953 << "."
954 << std::endl;
955 }
956 }
957 return false;
958}
959
960void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file) const
961{
962 for (unsigned int i=0; i<header->_chromosomeCount; i++)
963 {
964 file
965 << "@SQ"
966 << "\tSN:" << header->_chromosomes[i].name // name
967 << "\tLN:" << header->_chromosomes[i].size // number of bases
968 << "\tAS:" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
969 << "\tM5:" << header->_chromosomes[i].md5
970 << "\tUR:" << header->_chromosomes[i].uri
971 << "\tSP:" << header->_chromosomes[i].species // e.g. Homo_sapiens
972 << std::endl;
973 }
974}
975
976void GenomeSequence::dumpHeaderTSV(std::ostream &file) const
977{
978 file << "# Reference: " << _baseFilename << std::endl;
979 file << "# SN: sample name - must be unique" << std::endl;
980 file << "# AS: assembly name" << std::endl;
981 file << "# SP: species" << std::endl;
982 file << "# LN: chromosome/contig length" << std::endl;
983 file << "# M5: chromosome/contig MD5 checksum" << std::endl;
984 file << "# LN and M5 are only printed for informational purposes." << std::endl;
985 file << "# Karma will only set those values when creating the index." << std::endl;
986 file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
987 for (unsigned int i=0; i<header->_chromosomeCount; i++)
988 {
989 file
990 << header->_chromosomes[i].name // name
991 << "\t" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
992 << "\t" << header->_chromosomes[i].uri
993 << "\t" << header->_chromosomes[i].species // e.g. Homo_sapiens
994 << "\t" << header->_chromosomes[i].size // number of bases
995 << "\t" << header->_chromosomes[i].md5
996 << std::endl;
997 }
998}
999
1000
1001void GenomeSequence::getString(std::string &str, int chromosome, uint32_t index, int baseCount) const
1002{
1003 //
1004 // calculate the genome index for the lazy caller...
1005 //
1006 genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
1007
1008 getString(str, genomeIndex, baseCount);
1009}
1010
1011void GenomeSequence::getString(String &str, int chromosome, uint32_t index, int baseCount) const
1012{
1013 std::string string;
1014 this-> getString(string, chromosome, index, baseCount);
1015 str = string.c_str();
1016}
1017
1018void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const
1019{
1020 str.clear();
1021 if (baseCount > 0)
1022 {
1023 for (int i=0; i<baseCount; i++)
1024 {
1025 str.push_back((*this)[index + i]);
1026 }
1027 }
1028 else
1029 {
1030 // if caller passed negative basecount, give them
1031 // the read for the 3' end
1032 for (int i=0; i< -baseCount; i++)
1033 {
1034 str.push_back(BaseAsciiMap::base2complement[(int)(*this)[index + i]]);
1035 }
1036 }
1037}
1038
1039void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount) const
1040{
1041 std::string string;
1042 getString(string, index, baseCount);
1043 str = string.c_str();
1044}
1045
1046void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
1047{
1048 str.clear();
1049 if (baseCount > 0)
1050 {
1051 for (int i=0; i<baseCount; i++)
1052 {
1053 char base = (*this)[index + i];
1054 if (in(index+i, highLightStart, highLightEnd))
1055 base = tolower(base);
1056 str.push_back(base);
1057 }
1058 }
1059 else
1060 {
1061 // if caller passed negative basecount, give them
1062 // the read for the 3' end
1063 for (int i=0; i< -baseCount; i++)
1064 {
1065 char base = BaseAsciiMap::base2complement[(int)(*this)[index + i]];
1066 if (in(index+i, highLightStart, highLightEnd))
1067 base = tolower(base);
1068 str.push_back(base);
1069 }
1070 }
1071}
1072
1073void GenomeSequence::print30(genomeIndex_t index) const
1074{
1075 std::cout << "index: " << index << "\n";
1076 for (genomeIndex_t i=index-30; i<index+30; i++)
1077 std::cout << (*this)[i];
1078 std::cout << "\n";
1079 for (genomeIndex_t i=index-30; i<index; i++)
1080 std::cout << " ";
1081 std::cout << "^";
1082 std::cout << std::endl;
1083}
1084
1085void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const
1086{
1087 result = "";
1088 for (uint32_t i=0; i < read.size(); i++)
1089 {
1090 if (read[i] == (*this)[location+i])
1091 result.push_back(' ');
1092 else
1093 result.push_back('^');
1094 }
1095}
1096
1097void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const
1098{
1099 result = "";
1100 for (uint32_t i=0; i < read.size(); i++)
1101 {
1102 if (read[i] == (*this)[location+i])
1103 result.push_back(toupper(read[i]));
1104 else
1105 result.push_back(tolower(read[i]));
1106 }
1107}
1108
1109genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
1110{
1111 int bestScore = 1000000; // either mismatch count or sumQ
1112 genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
1113 for (int i=-windowSize; i<windowSize; i++)
1114 {
1115 int newScore;
1116
1117 if (i<0 && ((uint32_t) -i) > index) continue;
1118 if (index + i + read.size() >= getNumberBases()) continue;
1119
1120 if (quality=="")
1121 {
1122 newScore = this->getMismatchCount(read, index + i);
1123 }
1124 else
1125 {
1126 newScore = this->getSumQ(read, quality, index + i);
1127 }
1128 if (newScore < bestScore)
1129 {
1130 bestScore = newScore;
1131 bestMatchLocation = index + i;
1132 }
1133 }
1134 return bestMatchLocation;
1135}
1136
1137std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
1138{
1139 stream << (MemoryMapArrayHeader &) h;
1140 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
1141 stream << "isColorSpace: " << h._colorSpace << std::endl;
1142 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
1143 uint64_t totalSize = 0;
1144 for (uint32_t i=0; i < h._chromosomeCount; i++)
1145 {
1146 totalSize += h._chromosomes[i].size;
1147 stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
1148 stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
1149 stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
1150 stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
1151 stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
1152 stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
1153 stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
1154 }
1155 stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
1156 if (totalSize != h.elementCount)
1157 {
1158 stream << "Total Genome Size: does not match elementCount!\n";
1159 }
1160
1161 stream << std::endl;
1162 return stream;
1163}
1164
1165void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const
1166{
1167 int whichChromosome = 0;
1168
1169 whichChromosome = getChromosome(i);
1170
1171 if (whichChromosome == INVALID_CHROMOSOME_INDEX)
1172 {
1173 s = "invalid genome index"; // TODO include the index in error
1174 }
1175 else
1176 {
1177 std::ostringstream buf;
1178 genomeIndex_t chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
1179 buf << header->_chromosomes[whichChromosome].name << ":" << chromosomeIndex;
1180#if 0
1181 buf << " (GenomeIndex " << i << ")";
1182#endif
1183 s = buf.str();
1184 }
1185 return;
1186}
1187
1188
1189void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const
1190{
1191 std::string ss;
1192 getChromosomeAndIndex(ss, i);
1193 s = ss.c_str();
1194 return;
1195}
1196
1197
1198//
1199// This is intended to be a helper routine to get dbSNP files
1200// loaded. In some cases, we will load into an mmap() file (ie
1201// when we are creating it), in others, we will simply be loading
1202// an existing dbSNP file into RAM (when the binary file does not
1203// exist or when we are running with useMemoryMapFlag == false.
1204//
1205// Assume that dbSNP exists, is writable, and is the right size.
1206//
1207// Using the dbSNPFilename given, mark each dbSNP position
1208// with a bool true.
1209//
1210// Return value:
1211// True: if populateDBSNP() succeed
1212// False: if not succeed
1213bool GenomeSequence::populateDBSNP(
1214 mmapArrayBool_t &dbSNP,
1215 IFILE inputFile) const
1216{
1217 assert(dbSNP.getElementCount() == getNumberBases());
1218
1219 if(inputFile == NULL)
1220 {
1221 // FAIL, file not opened.
1222 return(false);
1223 }
1224
1225 std::string chromosomeName;
1226 std::string position;
1227 genomeIndex_t chromosomePosition1; // 1-based
1228 uint64_t ignoredLineCount = 0;
1229
1230 // Read til the end of the file.
1231 char* postPosPtr = NULL;
1232 while(!inputFile->ifeof())
1233 {
1234 chromosomeName.clear();
1235 position.clear();
1236 // Read the chromosome
1237 if(inputFile->readTilTab(chromosomeName) <= 0)
1238 {
1239 // hit either eof or end of line, check if
1240 // it is a header.
1241 if(chromosomeName.size()>0 && chromosomeName[0]=='#')
1242 {
1243 // header, so just continue.
1244 continue;
1245 }
1246 // Not the header, so this line is poorly formatted.
1247 ++ignoredLineCount;
1248 // Continue to the next line.
1249 continue;
1250 }
1251
1252 // Check if it is a header line.
1253 if(chromosomeName.size()>0 && chromosomeName[0]=='#')
1254 {
1255 // did not hit eof or end of line,
1256 // so discard the rest of the line.
1257 inputFile->discardLine();
1258 continue;
1259 }
1260
1261 // Not a header, so read the position.
1262 if(inputFile->readTilTab(position) > 0)
1263 {
1264 // Additional data on the line, so discard it.
1265 inputFile->discardLine();
1266 }
1267
1268 // Convert the position to a string.
1269 chromosomePosition1 = strtoul(position.c_str(), &postPosPtr, 0);
1270
1271
1272 if(postPosPtr == position.c_str())
1273 {
1274 ++ignoredLineCount;
1275 continue;
1276 }
1277
1278 // 1-based genome index.
1279 genomeIndex_t genomeIndex =
1280 getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
1281
1282 // if the genome index is invalid, ignore it
1283 if((genomeIndex == INVALID_GENOME_INDEX) ||
1284 (genomeIndex > getNumberBases()))
1285 {
1286 ignoredLineCount++;
1287 continue;
1288 }
1289
1290 dbSNP.set(genomeIndex, true);
1291 }
1292
1293 if (ignoredLineCount > 0)
1294 {
1295 std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
1296 return false;
1297 }
1298 return true;
1299}
1300
1302 mmapArrayBool_t &dbSNP,
1303 const char *inputFileName) const
1304{
1305 //
1306 // the goal in this section of code is to allow the user
1307 // to either specify a valid binary version of the SNP file,
1308 // or the original text file that it gets created from.
1309 //
1310 // To do this, we basically open, sniff the error message,
1311 // and if it claims it is not a binary version of the file,
1312 // we go ahead and treat it as the text file and use the
1313 // GenomeSequence::populateDBSNP method to load it.
1314 //
1315 // Further checking is really needed to ensure users don't
1316 // mix a dbSNP file for a different reference, since it is really
1317 // easy to do.
1318 //
1319 if (strlen(inputFileName)!=0)
1320 {
1321 std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
1322
1323 if (dbSNP.open(inputFileName, O_RDONLY))
1324 {
1325 //
1326 // failed to open, possibly due to bad magic.
1327 //
1328 // this is really awful ... need to have a return
1329 // code that is smart enough to avoid this ugliness:
1330 //
1331 if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
1332 {
1333 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
1334 exit(1);
1335 }
1336 //
1337 // we have a file, assume we can load it as a text file
1338 //
1339 IFILE inputFile = ifopen(inputFileName, "r");
1340 if(inputFile == NULL)
1341 {
1342 std::cerr << "Error: failed to open " << inputFileName << std::endl;
1343 exit(1);
1344 }
1345
1346 std::cerr << "(as text file) ";
1347
1348 // anonymously (RAM resident only) create:
1349 dbSNP.create(getNumberBases());
1350
1351 // now load it into RAM
1352 populateDBSNP(dbSNP, inputFile);
1353 ifclose(inputFile);
1354
1355 }
1356 else
1357 {
1358 std::cerr << "(as binary mapped file) ";
1359 }
1360
1361 std::cerr << "DONE!" << std::endl;
1362 return false;
1363 }
1364 else
1365 {
1366 return true;
1367 }
1368}
1369
1370
1371#if defined(TEST)
1372
1373void simplestExample(void)
1374{
1375 GenomeSequence reference;
1376 genomeIndex_t index;
1377
1378 // a particular reference is set by:
1379 // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
1380 //
1381 // In the above example, the suffix .fa is stripped and replaced with .umfa,
1382 // which contains the actual file being opened.
1383 //
1384 if (reference.open())
1385 {
1386 perror("GenomeSequence::open");
1387 exit(1);
1388 }
1389
1390
1391 index = 1000000000; // 10^9
1392 //
1393 // Write the base at the given index. Here, index is 0 based,
1394 // and is across the whole genome, as all chromosomes are sequentially
1395 // concatenated, so the allowed range is
1396 //
1397 // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
1398 //
1399 // (where int last = reference.getChromosomeCount() - 1;)
1400 //
1401 std::cout << "base[" << index << "] = " << reference[index] << std::endl;
1402
1403 //
1404 // Example for finding chromosome and one based chromosome position given
1405 // and absolute position on the genome in 'index':
1406 //
1407 int chr = reference.getChromosome(index);
1408 genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1; // 1-based
1409
1410 std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
1411
1412 //
1413 // Example for finding an absolute genome index position when the
1414 // chromosome name and one based position are known:
1415 //
1416 const char *chromosomeName = "5";
1417 chr = reference.getChromosome(chromosomeName); // 0-based
1418 chrIndex = 100000; // 1-based
1419
1420 index = reference.getChromosomeStart(chr) + chrIndex - 1;
1421
1422 std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
1423
1424 reference.close();
1425}
1426
1427void testGenomeSequence(void)
1428{
1429 GenomeSequence reference;
1430
1431#if 0
1432 std::string referenceName = "someotherreference";
1433 if (reference.setFastaName(referenceName))
1434 {
1435 std::cerr << "failed to open reference file "
1436 << referenceName
1437 << std::endl;
1438 exit(1);
1439 }
1440#endif
1441
1442 std::cerr << "open and prefetch the reference genome: ";
1443
1444 // open it
1445 if (reference.open())
1446 {
1447 exit(1);
1448 }
1449 std::cerr << "done!" << std::endl;
1450
1451 //
1452 // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
1453 //
1454 genomeIndex_t genomeIndex; // 0 based
1455 unsigned int chromosomeIndex; // 1 based
1456 unsigned int chromosome; // 0..23 or so
1457 std::string chromosomeName;
1458
1459 //
1460 // Here we'll start with a chromosome name, then obtain the genome
1461 // index, and use it to find the base we want:
1462 //
1463 chromosomeName = "2";
1464 chromosomeIndex = 1234567;
1465 // this call is slow (string search for chromsomeName):
1466 genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
1467 assert(genomeIndex!=INVALID_GENOME_INDEX);
1468 std::cout << "Chromosome " << chromosomeName << ", index ";
1469 std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
1470 std::cout << " at genome index position " << genomeIndex << std::endl;
1471
1472 //
1473 // now reverse it - given a genomeIndex from above, find the chromosome
1474 // name and index:
1475 //
1476
1477 // slow (binary search on genomeIndex):
1478 chromosome = reference.getChromosome(genomeIndex);
1479 unsigned int newChromosomeIndex;
1480 // not slow:
1481 newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
1482
1483 assert(chromosomeIndex == newChromosomeIndex);
1484
1485 // more testing... at least test and use PackedRead:
1486 //
1487 PackedRead pr;
1488
1489 pr.set("ATCGATCG", 0);
1490 assert(pr.size()==8);
1491 assert(pr[0]==BaseAsciiMap::base2int[(int) 'A']);
1492 assert(pr[1]==BaseAsciiMap::base2int[(int) 'T']);
1493 assert(pr[2]==BaseAsciiMap::base2int[(int) 'C']);
1494 assert(pr[3]==BaseAsciiMap::base2int[(int) 'G']);
1495 pr.set("ATCGATCG", 1);
1496 assert(pr.size()==9);
1497 pr.set("", 0);
1498 assert(pr.size()==0);
1499 pr.set("", 1);
1500 assert(pr.size()==1);
1501 pr.set("", 2);
1502 assert(pr.size()==2);
1503 pr.set("", 3);
1504 assert(pr.size()==3);
1505 assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
1506 assert(pr[1]==BaseAsciiMap::base2int[(int) 'N']);
1507 assert(pr[2]==BaseAsciiMap::base2int[(int) 'N']);
1508 pr.set("C", 1);
1509 assert(pr.size()==2);
1510 assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
1511 assert(pr[1]==BaseAsciiMap::base2int[(int) 'C']);
1512
1513}
1514
1515//
1516// After I build libcsg, I compile and run this test code using:
1517//
1518// g++ -DTEST -o try GenomeSequence.cpp -L. -lcsg -lm -lz -lssl
1519// you also may need -fno-rtti
1520// ./try
1521//
1522int main(int argc, const char **argv)
1523{
1524#if 1
1525 simplestExample();
1526#else
1527 testGenomeSequence();
1528#endif
1529 exit(0);
1530}
1531
1532#endif
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
InputFile & operator<<(InputFile &stream, const std::string &str)
Write to a file using streaming.
Definition: InputFile.h:736
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
static unsigned char base2complement[]
This table maps 5' base space to the 3' complement base space values, as well as 5' color space value...
Definition: BaseAsciiMap.h:41
static unsigned char base2int[256+1]
Map ASCII values to a 2 (or 3) bit encoding for the base pair value for just base space (ACTGNactgn).
Definition: BaseAsciiMap.h:61
static const char int2base[]
Convert from int representation to the base.
Definition: BaseAsciiMap.h:38
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const
user friendly dbSNP loader.
int getChromosomeCount() const
Return the number of chromosomes in the genome.
int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
brute force sumQ - no sanity checking
int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
Return the mismatch count, disregarding CIGAR strings.
genomeIndex_t getChromosomeStart(int chromosomeIndex) const
given a chromosome, return the genome base it starts in
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
~GenomeSequence()
Close the file if open and destroy the object.
bool isColorSpace() const
tell us if we are a color space reference or not
int getChromosome(genomeIndex_t position) const
given a whole genome index, get the chromosome it is located in
bool setReferenceName(std::string referenceFilename)
set the reference name that will be used in open()
genomeIndex_t getNumberBases() const
return the number of bases represented in this reference
GenomeSequence()
Simple constructor - no implicit file open.
bool open(bool isColorSpace=false, int flags=O_RDONLY)
open the reference specified using GenomeSequence::setReferenceName
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
int ifeof() const
Check to see if we have reached the EOF.
Definition: InputFile.h:386
int discardLine()
Read until the end of the line, discarding the characters, returning -1 returned for EOF and returnin...
Definition: InputFile.cpp:95
@ DEFAULT
Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED.
Definition: InputFile.h:45
int readTilTab(std::string &field)
Read, appending the characters into the specified string until tab, new line, or EOF is found,...
Definition: InputFile.cpp:133
bool open(const char *file, int flags=O_RDONLY)
open a previously created mapped vector
int create(const char *file, indexT elementCount, int optionalHeaderCount=0)
Create a vector with elementCount memebers.
There are a pair of related data structures in the operating system, and also a few simple algorithms...
Definition: MemoryMap.h:156
virtual bool open(const char *file, int flags=O_RDONLY)
open a previously created mapped vector
Definition: MemoryMap.cpp:156