libStatGen Software 1
SamValidation.cpp
1/*
2 * Copyright (C) 2010-2015 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <iostream>
19
20#include "SamValidation.h"
21#include "CigarRoller.h"
22#include "SamTags.h"
23
24const char* SamValidationError::enumSeverityString[] = {
25 "WARNING", "ERROR"};
26
27const char* SamValidationError::enumTypeString[] = {
28 "INVALID_QNAME",
29 "INVALID_REF_ID",
30 "INVALID_RNAME",
31 "INVALID_POS",
32 "INVALID_MAPQ",
33 "INVALID_CIGAR",
34 "INVALID_MRNM",
35 "INVALID_QUAL",
36 "INVALID_TAG"
37};
38
40{
41 return(enumTypeString[type]);
42}
43
44
46 std::string message)
47{
48 myType = type;
49 mySeverity = severity;
50 myMessage = message;
51}
52
53
55{
56 return(myType);
57}
58
59
61{
62 return(mySeverity);
63}
64
65
67{
68 return(myMessage.c_str());
69}
70
71
73{
74 return(enumTypeString[myType]);
75}
76
77
79{
80 return(enumSeverityString[mySeverity]);
81}
82
83
84void SamValidationError::getErrorString(std::string& errorString) const
85{
86 errorString = getTypeString();
87 errorString += " (";
88 errorString += getSeverityString();
89 errorString += ") : ";
90 errorString += getMessage();
91 errorString += "\n";
92}
93
94
96{
97 std::cerr << this;
98}
99
100
101
102// Constructor.
104 : myValidationErrors()
105{
106 myErrorIter = myValidationErrors.begin();
107}
108
109
110// Destructor
112{
113 clear();
114}
115
116
118{
119 // Clear the errors.
120 std::list<const SamValidationError*>::iterator errorIter;
121 for(errorIter = myValidationErrors.begin();
122 errorIter != myValidationErrors.end(); ++errorIter)
123 {
124 delete *errorIter;
125 *errorIter = NULL;
126 }
127 myValidationErrors.clear();
128 myErrorIter = myValidationErrors.end();
129}
130
131
133 SamValidationError::Severity newSeverity,
134 const char* newMessage)
135{
136 myValidationErrors.push_back(new SamValidationError(newType,
137 newSeverity,
138 newMessage));
139
140 // If this is the first element in the list, set the iterator.
141 if(myValidationErrors.size() == 1)
142 {
143 // set the iterator to the first element.
144 myErrorIter = myValidationErrors.begin();
145 }
146}
147
148
149
150// Return the number of validation errors that are contained in this object.
152{
153 return(myValidationErrors.size());
154}
155
156
157// Return a pointer to the next error. It does not remove it from the list.
158// Returns null once all errors have been retrieved until resetErrorIter
159// is called.
161{
162 if(myErrorIter == myValidationErrors.end())
163 {
164 // at the end of the list, return null.
165 return(NULL);
166 }
167 // Not at the end of the list, return the last element and increment.
168 return(*myErrorIter++);
169}
170
171
172// Resets the iterator to the begining of the errors.
174{
175 myErrorIter = myValidationErrors.begin();
176}
177
178
179// Appends the error messages to the passed in string.
180void SamValidationErrors::getErrorString(std::string& errorString) const
181{
182 for(std::list<const SamValidationError*>::
183 const_iterator validationErrorIter =
184 myValidationErrors.begin();
185 validationErrorIter != myValidationErrors.end();
186 validationErrorIter++)
187 {
188 std::string error = "";
189 (*validationErrorIter)->getErrorString(error);
190 errorString += error;
191 }
192}
193
194
195bool SamValidator::isValid(SamFileHeader& samHeader, SamRecord& samRecord,
196 SamValidationErrors& validationErrors)
197{
198 bool status = true;
199 status &= isValidQname(samRecord.getReadName(),
200 samRecord.getReadNameLength(),
201 validationErrors);
202
203 status &= isValidFlag(samRecord.getFlag(),
204 validationErrors);
205
206 // Validate the RName including validating it against the header.
207 status &= isValidRname(samHeader,
208 samRecord.getReferenceName(),
209 validationErrors);
210
211 status &= isValidRefID(samRecord.getReferenceID(),
212 samHeader.getReferenceInfo(),
213 validationErrors);
214
215 status &= isValid1BasedPos(samRecord.get1BasedPosition(),
216 validationErrors);
217
218 status &= isValidMapQuality(samRecord.getMapQuality(), validationErrors);
219
220 status &= isValidSequence(samRecord, validationErrors);
221
222 status &= isValidCigar(samRecord, validationErrors);
223
224 status &= isValidQuality(samRecord, validationErrors);
225
226 status &= isValidTags(samRecord, validationErrors);
227
228 return(status);
229}
230
231
232// qname is the query (read) name - result of SamRecord::getReadName().
233// readNameLen is the length of the read name including the null (the result
234// of SamRecord::getReadNameLength()).
235// For some invalid records, the getReadNameLength may be different than the
236// length of qname.
237// NOTE: Query Name and Read Name both refer to the same field.
238bool SamValidator::isValidQname(const char* qname, uint8_t readNameLen,
239 SamValidationErrors& validationErrors)
240{
241 // Validation for QNAME is:
242 // a) length of the qname string is the same as the read name length
243 // b) length is between 1 and 254.
244 // c) [ \t\n\r] are not allowed in the name.
245
246 bool status = true;
247
248 // Get the length of the qname string.
249 int32_t qnameLenNull = strlen(qname) + 1;
250
251 ////////////////////////////////////
252 // a) length of the qname string is the same as the read name length
253 if(qnameLenNull != readNameLen)
254 {
255 // This results from a poorly formatted bam file, where the null
256 // terminated read_name field is not the same length as specified by
257 // read_name_len.
258 String message = "Invalid Query Name - the string length (";
259 message += qnameLenNull;
260 message += ") does not match the specified query name length (";
261 message += readNameLen;
262 message += ").";
263
266 message.c_str());
267 status = false;
268 }
269
270 ////////////////////////////////////
271 // b) length is between 1 and 254
272 // The length with the terminating null must be between 2 & 255,
273 if((qnameLenNull < 2) || (qnameLenNull > 255))
274 {
275 String message = "Invalid Query Name (QNAME) length: ";
276 message += qnameLenNull;
277 message += ". Length with the terminating null must be between 2 & 255.";
278
281 message.c_str());
282 status = false;
283 }
284
285 ////////////////////////////////////
286 // Loop through and validate they all characters are valid.
287 // c) [ \t\n\r] are not allowed in the name.
288 String message;
289 for(int i = 0; i < qnameLenNull; ++i)
290 {
291 switch(qname[i])
292 {
293 case ' ':
294 // Invalid character.
295 message = "Invalid character in the Query Name (QNAME): ' ' at position ";
296 message += i;
297 message += ".";
300 message.c_str());
301 status = false;
302 break;
303 case '\t':
304 // Invalid character.
305 message = "Invalid character in the Query Name (QNAME): '\t' at position ";
306 message += i;
307 message += ".";
310 message.c_str());
311 status = false;
312 break;
313 case '\n':
314 // Invalid character.
315 message = "Invalid character in the Query Name (QNAME): '\n' at position ";
316 message += i;
317 message += ".";
320 message.c_str());
321 status = false;
322 break;
323 case '\r':
324 // Invalid character.
325 message = "Invalid character in the Query Name (QNAME): '\r' at position ";
326 message += i;
327 message += ".";
330 message.c_str());
331 status = false;
332 break;
333 }
334 }
335
336 return(status);
337}
338
339
340bool SamValidator::isValidFlag(uint16_t flag,
341 SamValidationErrors& validationErrors)
342{
343 // All values in a uint16_t are valid, so return true.
344 return(true);
345}
346
347
349 const char* rname,
350 SamValidationErrors& validationErrors)
351{
352 bool status = true;
353
354 // Cross validate the rname and the header.
355 // If the rname is not '*'
356 // AND there are any SQ records in the header,
357 // Then the rname must be in one of them.
358 if((strcmp(rname, "*") != 0) &&
359 (samHeader.getNumSQs() != 0) &&
360 (samHeader.getSQ(rname) == NULL))
361 {
362 // There are SQ fields, but the ref name is not in it.
363 status = false;
364 std::string message = "RNAME, ";
365 message += rname;
366 message += ", was not found in a SAM Header SQ record";
369 message.c_str());
370 }
371 status &= isValidRname(rname, validationErrors);
372 return(status);
373}
374
375
376bool SamValidator::isValidRname(const char* rname,
377 SamValidationErrors& validationErrors)
378{
379 // Validation for RNAME is:
380 // a) cannot be 0 length.
381 // b) [ \t\n\r@=] are not allowed in the name.
382
383 bool status = true;
384
385 // Get the length of the rname string.
386 int32_t rnameLen = strlen(rname);
387
388 String message;
389
390 if(rnameLen == 0)
391 {
394 "Reference Sequence Name (RNAME) cannot have 0 length.");
395 status = false;
396 }
397
398 ////////////////////////////////////
399 ////////////////////////////////////
400 // Loop through and validate they all characters are valid.
401 // b) [ \t\n\r] are not allowed in the name.
402 for(int i = 0; i < rnameLen; ++i)
403 {
404 switch(rname[i])
405 {
406 case ' ':
407 // Invalid character.
408 message = "Invalid character in the Reference Sequence Name (RNAME): ' ' at position ";
409 message += i;
410 message += ".";
413 message.c_str());
414 status = false;
415 break;
416 case '\t':
417 // Invalid character.
418 message = "Invalid character in the Reference Sequence Name (RNAME): '\t' at position ";
419 message += i;
420 message += ".";
423 message.c_str());
424 status = false;
425 break;
426 case '\n':
427 // Invalid character.
428 message = "Invalid character in the Reference Sequence Name (RNAME): '\n' at position ";
429 message += i;
430 message += ".";
433 message.c_str());
434 status = false;
435 break;
436 case '\r':
437 // Invalid character.
438 message = "Invalid character in the Reference Sequence Name (RNAME): '\r' at position ";
439 message += i;
440 message += ".";
443 message.c_str());
444 status = false;
445 break;
446 case '@':
447 // Invalid character.
448 message = "Invalid character in the Reference Sequence Name (RNAME): '@' at position ";
449 message += i;
450 message += ".";
453 message.c_str());
454 status = false;
455 break;
456 case '=':
457 // Invalid character.
458 message = "Invalid character in the Reference Sequence Name (RNAME): '=' at position ";
459 message += i;
460 message += ".";
463 message.c_str());
464 status = false;
465 break;
466 default:
467 // Allowed character.
468 break;
469 }
470 }
471
472 return(status);
473}
474
475
476bool SamValidator::isValidRefID(int32_t refID,
477 const SamReferenceInfo& refInfo,
478 SamValidationErrors& validationErrors)
479{
480 // Validation for rID is:
481 // a) must be between -1 and the number of refInfo.
482 // -1 is allowed, and otherwise it must properly index into the array.
483
484 bool status = true;
485 if((refID < -1) || (refID >= refInfo.getNumEntries()))
486 {
487 // Reference ID is too large or too small.
488 String message = "Invalid Reference ID, out of range (";
489 message += refID;
490 message += ") must be between -1 and ";
491 message += refInfo.getNumEntries() - 1;
492 message += ".";
493
496 message.c_str());
497 status = false;
498 }
499
500 return(status);
501}
502
503
505 SamValidationErrors& validationErrors)
506{
507 // Validation for pos is:
508 // a) must be between 0 and (2^29)-1.
509
510 bool status = true;
511
512 if((pos < 0) || (pos > 536870911))
513 {
514 String message = "POS out of range (";
515 message += pos;
516 message += ") must be between 0 and (2^29)-1.";
517
520 message.c_str());
521 status = false;
522 }
523
524 return(status);
525}
526
527
528bool SamValidator::isValidMapQuality(uint8_t mapQuality,
529 SamValidationErrors& validationErrors)
530{
531 // All values in a uint8_t are valid, so return true.
532 return(true);
533}
534
535
537 SamValidationErrors& validationErrors)
538{
539 return(true);
540}
541
542
544 SamValidationErrors& validationErrors)
545{
546 return(isValidCigar(samRecord.getCigar(),
547 samRecord.getReadLength(),
548 validationErrors));
549}
550
551bool SamValidator::isValidCigar(const char* cigar,
552 const char* sequence,
553 SamValidationErrors& validationErrors)
554{
555 if(strcmp(sequence, "*") != 0)
556 {
557 return(isValidCigar(cigar, strlen(sequence), validationErrors));
558 }
559 // Sequence is '*', so the length is 0.
560 return(isValidCigar(cigar, 0, validationErrors));
561}
562
563bool SamValidator::isValidCigar(const char* cigar,
564 int seqLen,
565 SamValidationErrors& validationErrors)
566{
567 // Validation for CIGAR is:
568 // a) cannot be 0 length.
569 // if not "*", validate the following:
570 // b) must have an integer length for each operator (if not "*"). TODO
571 // c) all operators must be valid (if not "*"). TODO
572 // d) evaluates to the same read length as the sequence string if not '*'.
573 bool status = true;
574 String message;
575
576 int32_t cigarLen = strlen(cigar);
577
578 // a) cannot be 0 length.
579 if(cigarLen == 0)
580 {
583 "Cigar must not be blank.");
584 status = false;
585 }
586
587 if(strcmp(cigar, "*") != 0)
588 {
589 // The cigar is not "*", so validate it.
590 CigarRoller cigarRoller(cigar);
591
592 // b) must have an integer length for each operator.
593 // TODO
594 // c) all operators must be valid.
595 // TODO
596
597 // d) is the same length as the sequence string.
598 int cigarSeqLen = cigarRoller.getExpectedQueryBaseCount();
599 if((cigarSeqLen != seqLen) && (seqLen != 0))
600 {
601 message = "CIGAR does not evaluate to the same length as SEQ, (";
602 message += cigarSeqLen;
603 message += " != ";
604 message += seqLen;
605 message += ").";
608 message.c_str());
609 status = false;
610 }
611 }
612 return(status);
613}
614
615
617 SamValidationErrors& validationErrors)
618{
619 return(isValidQuality(samRecord.getQuality(),
620 samRecord.getReadLength(),
621 validationErrors));
622}
623
624
625bool SamValidator::isValidQuality(const char* quality,
626 const char* sequence,
627 SamValidationErrors& validationErrors)
628{
629 // Determine the length of the sequence.
630 int seqLen = strlen(sequence);
631
632 // Check if the sequence is '*' since then the seqLength is 0.
633 if(strcmp(sequence, "*") == 0)
634 {
635 seqLen = 0;
636 }
637 return(isValidQuality(quality, seqLen, validationErrors));
638}
639
640
641bool SamValidator::isValidQuality(const char* quality,
642 int seqLength,
643 SamValidationErrors& validationErrors)
644{
645 bool status = true;
646
647 // If the quality or the sequence are non-"*", validate that the quality
648 // and sequence have the same length.
649 if((seqLength != 0) && (strcmp(quality, "*") != 0))
650 {
651 int qualLen = strlen(quality);
652 // Both the sequence and the quality are not "*", so validate
653 // that they are the same length.
654 if(seqLength != qualLen)
655 {
656 // Both fields are specified but are different lengths.
657
658 String message = "QUAL is not the same length as SEQ, (";
659 message += qualLen;
660 message += " != ";
661 message += seqLength;
662 message += ").";
663
666 message.c_str());
667 status = false;
668 }
669 }
670 return(status);
671}
672
673
675 SamValidationErrors& validationErrors)
676{
677 bool status = true;
678
679 GenomeSequence* reference = samRecord.getReference();
680 // If the reference is not null, check the MD tag.
681 if(reference != NULL)
682 {
683 const String* recordMD = samRecord.getStringTag(SamTags::MD_TAG);
684 if(recordMD != NULL)
685 {
686 // The record has an MD tag so check to see if it is correct.
687 if(!SamTags::isMDTagCorrect(samRecord, *reference))
688 {
689 // Invalid MD tags.
690 String correctMD;
691 if(!SamTags::createMDTag(correctMD, samRecord, *reference))
692 {
693 // Failed to get the MD tag, so indicate that it is unknown.
694 correctMD = "UNKNOWN";
695 }
696 String message = "Incorrect MD Tag, ";
697 message += *recordMD;
698 message += ", should be ";
699 message += correctMD;
700 message += ".";
701
704 message.c_str());
705
706 status = false;
707 }
708 }
709 }
710
711 return(status);
712}
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition: CigarRoller.h:67
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition: Cigar.cpp:95
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
SamHeaderSQ * getSQ(const char *name)
Get the SQ object with the specified sequence name, returning NULL if there is no SQ object with that...
int getNumSQs()
Get the number of SQ objects.
const SamReferenceInfo & getReferenceInfo() const
Get the Reference Information.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1298
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
Definition: SamRecord.cpp:1312
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
Definition: SamRecord.cpp:1305
GenomeSequence * getReference()
Returns a pointer to the genome sequence object associated with this record if it was set (NULL if it...
Definition: SamRecord.cpp:1923
uint8_t getReadNameLength()
Get the length of the readname (QNAME) including the null.
Definition: SamRecord.cpp:1326
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1384
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1391
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2180
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1555
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
Definition: SamRecord.cpp:1340
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1542
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1638
Class for tracking the reference information mapping between the reference ids and the reference name...
int32_t getNumEntries() const
Get the number of entries contained here.
static bool isMDTagCorrect(SamRecord &inputRec, GenomeSequence &genome)
Check to see if the MD tag in the record is accurate.
Definition: SamTags.cpp:126
static bool createMDTag(String &outputMDtag, SamRecord &inputRec, GenomeSequence &genome)
Create the MD tag for the specified input record and the genome.
Definition: SamTags.cpp:34
The SamValidationError class describes a validation error that occured, containing the error type,...
Definition: SamValidation.h:35
Type getType() const
Return the type enum of this validation error object.
const char * getSeverityString() const
Return the string representing this object's severity of validation error.
void printError() const
Print a formatted output of the error to cerr.
void getErrorString(std::string &errorString) const
Get the error string representing this object's error.
Severity
Severity of the error.
Definition: SamValidation.h:39
@ WARNING
Warning is used if it is just an invalid value.
Definition: SamValidation.h:40
@ ERROR
Error is used if parsing could not succeed.
Definition: SamValidation.h:41
const char * getMessage() const
Return the error message of this validation error object.
SamValidationError(Type type, Severity severity, std::string Message)
Constructor that sets the type, severity, and message for the validation error.
Type
Type of the error.
Definition: SamValidation.h:48
@ INVALID_REF_ID
Invalid reference id.
Definition: SamValidation.h:50
@ INVALID_TAG
Invalid tag.
Definition: SamValidation.h:57
@ INVALID_QNAME
Invalid read/query name.
Definition: SamValidation.h:49
@ INVALID_CIGAR
Invalid CIGAR.
Definition: SamValidation.h:54
@ INVALID_POS
Invalid position.
Definition: SamValidation.h:52
@ INVALID_RNAME
Invalid reference name.
Definition: SamValidation.h:51
@ INVALID_QUAL
Invalid base quality.
Definition: SamValidation.h:56
Severity getSeverity() const
Return the severity enum of this validation error object.
const char * getTypeString() const
Return the string representing this object's type of validation error.
The SamValidationErrors class is a container class that holds SamValidationError Objects,...
void getErrorString(std::string &errorString) const
Append the error messages contained in this container to the passed in string.
const SamValidationError * getNextError()
Return a pointer to the next error without removing it from the container, and returning null once al...
SamValidationErrors()
Constructor.
void resetErrorIter()
Reset the iterator to the begining of the errors.
void clear()
Remove all the errors from the container.
unsigned int numErrors()
Return the number of validation errors contained in this object.
void addError(SamValidationError::Type newType, SamValidationError::Severity newSeverity, const char *newMessage)
Add the specified error to this container.
~SamValidationErrors()
Destructor.
static bool isValidQname(const char *qname, uint8_t qnameLen, SamValidationErrors &validationErrors)
Determines whether or not the specified qname is valid.
static bool isValidTags(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the tags.
static bool isValidFlag(uint16_t flag, SamValidationErrors &validationErrors)
Determines whether or not the flag is valid.
static bool isValidQuality(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the base quality.
static bool isValid(SamFileHeader &samHeader, SamRecord &samRecord, SamValidationErrors &validationErrors)
Validates whether or not the specified SamRecord is valid, calling all of the other validations.
static bool isValid1BasedPos(int32_t pos, SamValidationErrors &validationErrors)
Validate the refeference position.
static bool isValidRname(SamFileHeader &samHeader, const char *rname, SamValidationErrors &validationErrors)
Validate the reference name including validating against the header.
static bool isValidRefID(int32_t refID, const SamReferenceInfo &refInfo, SamValidationErrors &validationErrors)
Validate whether or not the specified reference id is valid.
static bool isValidCigar(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the cigar.
static bool isValidMapQuality(uint8_t mapQuality, SamValidationErrors &validationErrors)
Validate the mapping quality.
static bool isValidSequence(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the sequence, but not against the cigar or quality string.