libStatGen Software 1
ModifyVar.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include "SamFile.h"
19#include "ModifyVar.h"
20#include "SamValidation.h"
21
22void modifyFirstBase()
23{
24 SamFile samIn;
25 // Open the file for reading.
26 assert(samIn.OpenForRead("testFiles/testVar.bam"));
27
28 SamFile samOut;
29 // Open the file for writing.
30 assert(samOut.OpenForWrite("results/updateVar.bam"));
31
32 // Read the sam header.
33 SamFileHeader samHeader;
34 assert(samIn.ReadHeader(samHeader));
35
36 assert(samOut.WriteHeader(samHeader));
37
38 // Read the sam records.
39 SamRecord samRecord;
40
41 // Keep reading records until the end of the file is reached.
42 while(samIn.ReadRecord(samHeader, samRecord))
43 {
44 // Successfully read a record from the file, so check to see
45 // if it is valid.
46 SamValidationErrors samValidationErrors;
47 assert(SamValidator::isValid(samHeader, samRecord,
48 samValidationErrors));
49
50 // Get the sequence.
51 const char* sequence = samRecord.getSequence();
52 assert(strcmp(sequence, "") != 0);
53 std::string upSeq = sequence;
54 upSeq[0] = 'N';
55 assert(samRecord.setSequence(upSeq.c_str()));
56
57 // write the sequence.
58 assert(samOut.WriteRecord(samHeader, samRecord));
59 }
60
61 // Should have exited only when done reading.
62 assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS);
63}
64
65
66void modifyFirstBaseLong()
67{
68 SamFile samIn;
69 // Open the file for reading.
70 assert(samIn.OpenForRead("statusTests/HalfG.bam"));
71
72 SamFile samOut;
73 // Open the file for writing.
74 assert(samOut.OpenForWrite("results/updateSeq.bam"));
75
76 // Read the sam header.
77 SamFileHeader samHeader;
78 assert(samIn.ReadHeader(samHeader));
79
80 assert(samOut.WriteHeader(samHeader));
81
82 // Read the sam records.
83 SamRecord samRecord;
84
85 // Keep reading records until the end of the file is reached.
86 while(samIn.ReadRecord(samHeader, samRecord))
87 {
88 // Successfully read a record from the file, so check to see
89 // if it is valid.
90 SamValidationErrors samValidationErrors;
91 assert(SamValidator::isValid(samHeader, samRecord,
92 samValidationErrors));
93
94 // Get the sequence.
95 const char* sequence = samRecord.getSequence();
96 assert(strcmp(sequence, "") != 0);
97 std::string upSeq = sequence;
98 upSeq[0] = 'N';
99 assert(samRecord.setSequence(upSeq.c_str()));
100
101 // write the sequence.
102 assert(samOut.WriteRecord(samHeader, samRecord));
103 }
104
105 // Should have exited only when done reading.
106 assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS);
107}
108
109
110void testModifyVar()
111{
112#ifdef __ZLIB_AVAILABLE__
113 modifyFirstBase();
114#endif
115 modifyVar modTest;
116 modTest.testModifyVar("testFiles/testSam.sam", true);
117 modTest.testModifyVar("testFiles/testSam.sam", false);
118#ifdef __ZLIB_AVAILABLE__
119 modTest.testModifyVar("testFiles/testBam.bam", true);
120 modTest.testModifyVar("testFiles/testBam.bam", false);
121#endif
122}
123
124
125void modifyVar::testModifyVar(const char* filename, bool valBufFirst)
126{
127 myFilename = filename;
128 myValBufFirst = valBufFirst;
129
130 testModifyReadNameOnlySameLength();
131 testModifyCigarOnlySameLength();
132 testModifySequenceOnlySameLength();
133 testModifyQualityOnlySameLength();
134 testRemoveQuality();
135 testShortenQuality();
136 testLengthenQuality();
137
138 testShortenReadName();
139 testShortenCigar();
140 testShortenSequence();
141
142 testLengthenReadName();
143 testLengthenCigar();
144 testLengthenSequence();
145
146 testRemoveCigar();
147 testRemoveSequence();
148 testLengthenSequenceAndQuality();
149}
150
151
152void modifyVar::testModifyReadNameOnlySameLength()
153{
154 resetExpected();
155 openAndRead1Rec();
156
157 // Set the Read Name - same length, just different name.
158 expectedReadNameString = "1:1011:G:255+17M15D20M";
159 samRecord.setReadName(expectedReadNameString.c_str());
160
161 validate();
162}
163
164
165void modifyVar::testModifyCigarOnlySameLength()
166{
167 resetExpected();
168 openAndRead1Rec();
169
170 // Set the Cigar - same length, just different values.
171 expectedCigarString = "3M2I";
172 samRecord.setCigar(expectedCigarString.c_str());
173
174 // The new Cigar for record 1 is 3M2I
175 // 3M = 3 << 4 | 0 = 0x30
176 // 2I = 2 << 4 | 1 = 0x21
177 expectedCigarBufLen = 2;
178 expectedCigarBuffer[0] = 0x30;
179 expectedCigarBuffer[1] = 0x21;
180
181 validate();
182}
183
184
185void modifyVar::testModifySequenceOnlySameLength()
186{
187 resetExpected();
188 openAndRead1Rec();
189
190 // Set the Sequence - same length, just different values.
191 expectedSequenceString = "NCGAN";
192 samRecord.setSequence(expectedSequenceString.c_str());
193
194 // NCGAN = NC GA N = 0xF2 0x41 0xF0
195 expectedSequenceBuffer[0] = 0xF2;
196 expectedSequenceBuffer[1] = 0x41;
197 expectedSequenceBuffer[2] = 0xF0;
198
199 validate();
200}
201
202
203void modifyVar::testModifyQualityOnlySameLength()
204{
205 resetExpected();
206 openAndRead1Rec();
207
208 // Set the Quality - same length, just different values.
209 expectedQualityString = "!>6+!";
210 samRecord.setQuality(expectedQualityString.c_str());
211
212 validate();
213}
214
215
216void modifyVar::testRemoveQuality()
217{
218 resetExpected();
219 openAndRead1Rec();
220
221 // Set the Quality - to "*" - does not affect the length since the
222 // sequence field drives the length.
223 expectedQualityString = "*";
224 samRecord.setQuality(expectedQualityString.c_str());
225
226 validate();
227}
228
229
230void modifyVar::testShortenQuality()
231{
232 resetExpected();
233 openAndRead1Rec();
234
235 // Set the Quality - shorten, but doesn't affect the length since
236 // the sequence field drives the length.
237 expectedQualityString = "!!";
238 samRecord.setQuality(expectedQualityString.c_str());
239
240 validate();
241}
242
243
244void modifyVar::testLengthenQuality()
245{
246 resetExpected();
247 openAndRead1Rec();
248
249 // Set the Quality - lengthen, but doesn't affect the length since
250 // the sequence field drives the length.
251 expectedQualityString = "!!@@##";
252 samRecord.setQuality(expectedQualityString.c_str());
253
254 validate();
255}
256
257
258void modifyVar::testShortenReadName()
259{
260 resetExpected();
261 openAndRead1Rec();
262
263 // Set the Read Name - shorter length
264 expectedReadNameString = "1:1011:G:255";
265 samRecord.setReadName(expectedReadNameString.c_str());
266
267 validate();
268}
269
270
271void modifyVar::testShortenCigar()
272{
273 resetExpected();
274 openAndRead1Rec();
275
276 // Set the Cigar - shorter length
277 expectedCigarString = "5M";
278 samRecord.setCigar(expectedCigarString.c_str());
279
280 // The new Cigar for record 1 is 5M
281 // 5M = 5 << 4 | 0 = 0x50
282 expectedCigarBufLen = 1;
283 expectedCigarBuffer[0] = 0x50;
284
285 validate();
286}
287
288
289void modifyVar::testShortenSequence()
290{
291 resetExpected();
292 openAndRead1Rec();
293
294 // Set the Sequence - shorter length.
295 expectedSequenceString = "CCGA";
296 samRecord.setSequence(expectedSequenceString.c_str());
297
298 // CCGA = CC GA = 0x22 0x41
299 expectedSequenceBuffer[0] = 0x22;
300 expectedSequenceBuffer[1] = 0x41;
301
302 validate();
303}
304
305
306void modifyVar::testLengthenReadName()
307{
308 resetExpected();
309 openAndRead1Rec();
310
311 // Set the Read Name - longer.
312 expectedReadNameString = "1:1011:G:255+17M15D20M:1111111";
313 samRecord.setReadName(expectedReadNameString.c_str());
314
315 validate();
316}
317
318
319void modifyVar::testLengthenCigar()
320{
321 resetExpected();
322 openAndRead1Rec();
323
324 // Set the Cigar - longer length.
325 expectedCigarString = "3M2D2I";
326 samRecord.setCigar(expectedCigarString.c_str());
327
328 // The new Cigar for record 1 is 3M2I
329 // 3M = 3 << 4 | 0 = 0x30
330 // 2D = 2 << 2 | 1 = 0x22
331 // 2I = 2 << 4 | 1 = 0x21
332 expectedCigarBufLen = 3;
333 expectedCigarBuffer[0] = 0x30;
334 expectedCigarBuffer[1] = 0x22;
335 expectedCigarBuffer[2] = 0x21;
336
337 validate();
338}
339
340
341void modifyVar::testLengthenSequence()
342{
343 resetExpected();
344 openAndRead1Rec();
345
346 // Set the Sequence - longer length.
347 expectedSequenceString = "CCGAATT";
348 samRecord.setSequence(expectedSequenceString.c_str());
349
350 // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80
351 expectedSequenceBuffer[0] = 0x22;
352 expectedSequenceBuffer[1] = 0x41;
353 expectedSequenceBuffer[2] = 0x18;
354 expectedSequenceBuffer[3] = 0x80;
355
356 validate();
357}
358
359
360void modifyVar::testRemoveCigar()
361{
362 resetExpected();
363 openAndRead1Rec();
364
365 // Set the Cigar - same length, just different values.
366 expectedCigarString = "*";
367 expectedCigarBufLen = 0;
368 samRecord.setCigar(expectedCigarString.c_str());
369
370 validate();
371}
372
373
374void modifyVar::testRemoveSequence()
375{
376 resetExpected();
377 openAndRead1Rec();
378
379 // Set the Sequence - shorter length.
380 expectedSequenceString = "*";
381 samRecord.setSequence(expectedSequenceString.c_str());
382
383 validate();
384}
385
386
387void modifyVar::testLengthenSequenceAndQuality()
388{
389 resetExpected();
390 openAndRead1Rec();
391
392 // Set the Sequence & quality - longer.
393 expectedSequenceString = "CCGAATT";
394 expectedQualityString = "!@#$%^&";
395 samRecord.setSequence(expectedSequenceString.c_str());
396 samRecord.setQuality(expectedQualityString.c_str());
397
398 // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80
399 expectedSequenceBuffer[0] = 0x22;
400 expectedSequenceBuffer[1] = 0x41;
401 expectedSequenceBuffer[2] = 0x18;
402 expectedSequenceBuffer[3] = 0x80;
403
404 validate();
405}
406
407
408void modifyVar::validate()
409{
410 if(myValBufFirst)
411 {
412 // get the record.
413 const bamRecordStruct* recordBuffer =
414 (const bamRecordStruct*)samRecord.getRecordBuffer();
415
416 // Validate the buffer.
417 validateReadName(recordBuffer);
418 validateCigar(recordBuffer);
419 validateSequence(recordBuffer);
420 validateQuality(recordBuffer);
421 validateTags(recordBuffer);
422
423 // Validate the strings.
424 validateReadNameString();
425 validateCigarString();
426 validateSequenceString();
427 validateQualityString();
428 validateTagsString();
429 }
430 else
431 {
432 // get the record.
433 const bamRecordStruct* recordBuffer =
434 (const bamRecordStruct*)samRecord.getRecordBuffer();
435
436 // Validate the buffer.
437 validateReadName(recordBuffer);
438 validateCigar(recordBuffer);
439 validateSequence(recordBuffer);
440 validateQuality(recordBuffer);
441 validateTags(recordBuffer);
442
443 // Validate the strings.
444 validateReadNameString();
445 validateCigarString();
446 validateSequenceString();
447 validateQualityString();
448 validateTagsString();
449 }
450}
451
452void modifyVar::validateReadName(const bamRecordStruct* recordBuffer)
453{
454 const char* varPtr = (const char*)&(recordBuffer->myData);
455
456 unsigned int len = expectedReadNameString.length();
457 for(unsigned int i = 0; i < len; i++)
458 {
459 assert(varPtr[i] == expectedReadNameString[i]);
460 }
461
462 // Verify ending null.
463 assert(varPtr[len] == 0);
464
465 // verify the length - add one for the terminating null.
466 assert(recordBuffer->myReadNameLength ==
467 expectedReadNameString.length() + 1);
468}
469
470
471void modifyVar::validateCigar(const bamRecordStruct* recordBuffer)
472{
473 const unsigned char* cigarStart =
474 (const unsigned char*)&(recordBuffer->myData)
475 + recordBuffer->myReadNameLength;
476
477 unsigned int* varPtr = (unsigned int*)cigarStart;
478
479 for(int i = 0; i < expectedCigarBufLen; i++)
480 {
481 assert(varPtr[i] == expectedCigarBuffer[i]);
482 }
483
484 assert(recordBuffer->myCigarLength == expectedCigarBufLen);
485}
486
487
488void modifyVar::validateSequence(const bamRecordStruct* recordBuffer)
489{
490 // Calculate the sequence length.
491 int expectedReadLen = expectedSequenceString.length();
492 int seqLen = (expectedReadLen + 1)/2;
493 if(expectedSequenceString == "*")
494 {
495 expectedReadLen = 0;
496 seqLen = 0;
497 }
498
499 const unsigned char* sequencePtr =
500 (const unsigned char*)&(recordBuffer->myData)
501 + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4);
502
503 for(int i = 0; i < seqLen; i++)
504 {
505 assert(sequencePtr[i] == expectedSequenceBuffer[i]);
506 }
507
508 assert(recordBuffer->myReadLength == expectedReadLen);
509}
510
511
512void modifyVar::validateQuality(const bamRecordStruct* recordBuffer)
513{
514 int expectedReadLen = expectedSequenceString.length();
515 int seqLen = (expectedReadLen + 1)/2;
516 if(expectedSequenceString == "*")
517 {
518 expectedReadLen = 0;
519 seqLen = 0;
520 }
521
522 const uint8_t* qualityPtr =
523 (const unsigned char*)&(recordBuffer->myData)
524 + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4)
525 + seqLen;
526
527 int qualityLen = expectedQualityString.length();
528
529 for(int i = 0; i < expectedReadLen; i++)
530 {
531 if(expectedQualityString == "*")
532 {
533 // no quality, so check for 0xFF.
534 assert(qualityPtr[i] == 0xFF);
535 }
536 else if(i >= qualityLen)
537 {
538 // Quality is shorter than the sequence, so should be padded with
539 // 0xFF.
540 assert(qualityPtr[i] == 0xFF);
541 }
542 else
543 {
544 assert(qualityPtr[i] == (expectedQualityString[i] - 33));
545 }
546 }
547
548 assert(recordBuffer->myReadLength == expectedReadLen);
549}
550
551
552void modifyVar::validateTags(const bamRecordStruct* recordBuffer)
553{
554 const unsigned char* tagsPtr =
555 (const unsigned char*)&(recordBuffer->myData)
556 + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4)
557 + (recordBuffer->myReadLength + 1)/2 + recordBuffer->myReadLength;
558
559 for(int i = 0; i < expectedTagsLen; i++)
560 {
561 assert(tagsPtr[i] == expectedTagsBuffer[i]);
562 }
563
564 // Calculate expected block size - from the start of the buffer to the
565 // start of the tags plus the tags length - minus the size of the blocksize
566 // field.
567 int32_t expectedBlockSize = tagsPtr - (const unsigned char*)(recordBuffer)
568 + expectedTagsLen - 4;
569 assert(recordBuffer->myBlockSize == expectedBlockSize);
570}
571
572
573void modifyVar::validateTagsString()
574{
575 char tag[3];
576 char type;
577 void* value;
578 assert(samRecord.getNextSamTag(tag, type, &value) == true);
579 assert(tag[0] == 'A');
580 assert(tag[1] == 'M');
581 assert(type == 'i');
582 assert(*(char*)value == 0);
583 assert(samRecord.getNextSamTag(tag, type, &value) == true);
584 assert(tag[0] == 'M');
585 assert(tag[1] == 'D');
586 assert(type == 'Z');
587 assert(*(String*)value == "37");
588 assert(samRecord.getNextSamTag(tag, type, &value) == true);
589 assert(tag[0] == 'N');
590 assert(tag[1] == 'M');
591 assert(type == 'i');
592 assert(*(char*)value == 0);
593 assert(samRecord.getNextSamTag(tag, type, &value) == true);
594 assert(tag[0] == 'X');
595 assert(tag[1] == 'T');
596 assert(type == 'A');
597 assert(*(char*)value == 'R');
598 // No more tags, should return false.
599 assert(samRecord.getNextSamTag(tag, type, &value) == false);
600 assert(samRecord.getNextSamTag(tag, type, &value) == false);
601}
602
603void modifyVar::validateReadNameString()
604{
605 assert(samRecord.getReadName() == expectedReadNameString);
606}
607
608void modifyVar::validateCigarString()
609{
610 assert(samRecord.getCigar() == expectedCigarString);
611}
612
613void modifyVar::validateSequenceString()
614{
615 assert(samRecord.getSequence() == expectedSequenceString);
616}
617
618void modifyVar::validateQualityString()
619{
620 assert(samRecord.getQuality() == expectedQualityString);
621}
622
623
624void modifyVar::resetExpected()
625{
626 expectedReadNameString = "1:1011:F:255+17M15D20M";
627 expectedCigarString = "5M2D";
628 expectedSequenceString = "CCGAA";
629 expectedQualityString = "6>6+4";
630
631 // The default Cigar for record 1 is 5M2D
632 // 5M = 5 << 4 | 0 = 0x50
633 // 2D = 2 << 4 | 2 = 0x22
634 expectedCigarBufLen = 2;
635 expectedCigarBuffer[0] = 0x50;
636 expectedCigarBuffer[1] = 0x22;
637
638 // CCGAA = CC GA A = 0x22 0x41 0x10
639 expectedSequenceBuffer[0] = 0x22;
640 expectedSequenceBuffer[1] = 0x41;
641 expectedSequenceBuffer[2] = 0x10;
642
643 expectedTagsLen = 18;
644 expectedTagsBuffer[0] = 'A';
645 expectedTagsBuffer[1] = 'M';
646 expectedTagsBuffer[2] = 'C';
647 expectedTagsBuffer[3] = 0;
648 expectedTagsBuffer[4] = 'M';
649 expectedTagsBuffer[5] = 'D';
650 expectedTagsBuffer[6] = 'Z';
651 expectedTagsBuffer[7] = '3';
652 expectedTagsBuffer[8] = '7';
653 expectedTagsBuffer[9] = 0;
654 expectedTagsBuffer[10] = 'N';
655 expectedTagsBuffer[11] = 'M';
656 expectedTagsBuffer[12] = 'C';
657 expectedTagsBuffer[13] = 0;
658 expectedTagsBuffer[14] = 'X';
659 expectedTagsBuffer[15] = 'T';
660 expectedTagsBuffer[16] = 'A';
661 expectedTagsBuffer[17] = 'R';
662}
663
664
665void modifyVar::openAndRead1Rec()
666{
667 // Open the file for reading.
668 assert(samIn.OpenForRead(myFilename));
669
670 // Read the sam header.
671 assert(samIn.ReadHeader(samHeader));
672
673 // Read the first record.
674 assert(samIn.ReadRecord(samHeader, samRecord));
675}
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
Allows the user to easily read/write a SAM/BAM file.
Definition: SamFile.h:36
bool ReadHeader(SamFileHeader &header)
Reads the header section from the file and stores it in the passed in header.
Definition: SamFile.cpp:450
bool ReadRecord(SamFileHeader &header, SamRecord &record)
Reads the next record from the file & stores it in the passed in record.
Definition: SamFile.cpp:514
bool OpenForRead(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for reading with the specified filename, determing the type of file and SAM/BAM b...
Definition: SamFile.cpp:93
SamStatus::Status GetStatus()
Get the Status of the last call that sets status.
Definition: SamFile.h:212
bool OpenForWrite(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for writing with the specified filename, determining SAM/BAM from the extension (...
Definition: SamFile.cpp:223
bool WriteHeader(SamFileHeader &header)
Writes the specified header into the file.
Definition: SamFile.cpp:480
bool WriteRecord(SamFileHeader &header, SamRecord &record)
Writes the specified record into the file.
Definition: SamFile.cpp:632
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
bool setReadName(const char *readName)
Set QNAME to the passed in name.
Definition: SamRecord.cpp:193
const void * getRecordBuffer()
Get a const pointer to the buffer that contains the BAM representation of the record.
Definition: SamRecord.cpp:1204
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
Definition: SamRecord.cpp:259
bool getNextSamTag(char *tag, char &vtype, void **value)
Get the next tag from the record.
Definition: SamRecord.cpp:1962
bool setSequence(const char *seq)
Sets the sequence (SEQ) to the specified SAM formatted sequence string.
Definition: SamRecord.cpp:344
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1555
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1542
bool setQuality(const char *quality)
Sets the quality (QUAL) to the specified SAM formatted quality string.
Definition: SamRecord.cpp:357
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1638
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1568
The SamValidationErrors class is a container class that holds SamValidationError Objects,...
static bool isValid(SamFileHeader &samHeader, SamRecord &samRecord, SamValidationErrors &validationErrors)
Validates whether or not the specified SamRecord is valid, calling all of the other validations.
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
Definition: StatGenStatus.h:36
Structure of a BAM record.
Definition: SamRecord.h:34