libStatGen Software 1
FastQFile.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <iostream>
19
20#include "InputFile.h"
21#include "FastQFile.h"
22#include "BaseUtilities.h"
23
24// Constructor.
25// minReadLength - The minimum length that a base sequence must be for
26// it to be valid.
27// numPrintableErrors - The maximum number of errors that should be reported
28// in detail before suppressing the errors.
29//
30FastQFile::FastQFile(int minReadLength, int numPrintableErrors)
31 : myFile(NULL),
32 myBaseComposition(),
33 myQualPerCycle(),
34 myCountPerCycle(),
35 myCheckSeqID(true),
36 myInterleaved(false),
37 myPrevSeqID(""),
38 myMinReadLength(minReadLength),
39 myNumPrintableErrors(numPrintableErrors),
40 myMaxErrors(-1),
41 myDisableMessages(false),
42 myFileProblem(false)
43{
44 // Reset the member data.
45 reset();
46}
47
48
50{
51 myDisableMessages = true;
52}
53
54
56{
57 myDisableMessages = false;
58}
59
60
61// Disable Unique Sequence ID checking.
62// Unique Sequence ID checking is enabled by default.
64{
65 myCheckSeqID = false;
66}
67
68
69// Enable Unique Sequence ID checking.
70// Unique Sequence ID checking is enabled by default.
72{
73 myCheckSeqID = true;
74}
75
76
77/// Interleaved.
79{
80 myInterleaved = true;
81}
82
83
84// Set the number of errors after which to quit reading/validating a file.
85void FastQFile::setMaxErrors(int maxErrors)
86{
87 myMaxErrors = maxErrors;
88}
89
90
91// Open a FastQFile.
94{
95 // reset the member data.
96 reset();
97
98 myBaseComposition.resetBaseMapType();
99 myBaseComposition.setBaseMapType(spaceType);
100 myQualPerCycle.clear();
101 myCountPerCycle.clear();
102
104
105 // Close the file if there is already one open - checked by close.
106 status = closeFile();
107 if(status == FastQStatus::FASTQ_SUCCESS)
108 {
109 // Successfully closed a previously opened file if there was one.
110
111 // Open the file
112 myFile = ifopen(fileName, "rt");
113 myFileName = fileName;
114
115 if(myFile == NULL)
116 {
117 // Failed to open the file.
119 }
120 }
121
122 if(status != FastQStatus::FASTQ_SUCCESS)
123 {
124 // Failed to open the file.
125 std::string errorMessage = "ERROR: Failed to open file: ";
126 errorMessage += fileName;
127 logMessage(errorMessage.c_str());
128 }
129 return(status);
130}
131
132
133// Close a FastQFile.
135{
136 int closeStatus = 0; // Success.
137
138 // If a file has been opened, close it.
139 if(myFile != NULL)
140 {
141 // Close the file.
142 closeStatus = ifclose(myFile);
143 myFile = NULL;
144 }
145 if(closeStatus == 0)
146 {
147 // Success - either there wasn't a file to close or it was closed
148 // successfully.
150 }
151 else
152 {
153 std::string errorMessage = "Failed to close file: ";
154 errorMessage += myFileName.c_str();
155 logMessage(errorMessage.c_str());
157 }
158}
159
160
161// Check to see if the file is open.
163{
164 // Check to see if the file is open.
165 if((myFile != NULL) && (myFile->isOpen()))
166 {
167 // File pointer exists and the file is open.
168 return true;
169 }
170
171 // File is not open.
172 return false;
173}
174
175
176// Check to see if the file is at the end of the file.
178{
179 // Check to see if the file is open.
180 if((myFile != NULL) && (ifeof(myFile)))
181 {
182 // At EOF.
183 return true;
184 }
185
186 // Not at EOF.
187 return false;
188}
189
190
191// Returns whether or not to keep reading the file.
192// Stop reading (false) if eof or there is a problem reading the file.
194{
195 if(isEof() || myFileProblem)
196 {
197 return(false);
198 }
199 return(true);
200}
201
202
203// Validate the specified fastq file
205 bool printBaseComp,
206 BaseAsciiMap::SPACE_TYPE spaceType,
207 bool printQualAvg)
208{
209 // Open the fastqfile.
210 if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS)
211 {
212 // Failed to open the specified file.
214 }
215
216 // Track the total number of sequences that were validated.
217 int numSequences = 0;
218
219 // Keep reading the file until there are no more fastq sequences to process
220 // and not configured to quit after a certain number of errors or there
221 // has not yet been that many errors.
222 // Or exit if there is a problem reading the file.
224 while (keepReadingFile() &&
225 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
226 {
227 // Validate one sequence. This call will read all the lines for
228 // one sequence.
229 status = readFastQSequence();
230 if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID))
231 {
232 // Read a sequence and it is either valid or invalid, but
233 // either way, a sequence was read, so increment the sequence count.
234 ++numSequences;
235 }
236 else
237 {
238 // Other error, so break out of processing.
239 break;
240 }
241 }
242
243 // Report Base Composition Statistics.
244 if(printBaseComp)
245 {
246 myBaseComposition.print();
247 }
248
249 if(printQualAvg)
250 {
251 printAvgQual();
252 }
253
254 std::string finishMessage = "Finished processing ";
255 finishMessage += myFileName.c_str();
256 char buffer[100];
257 if(sprintf(buffer,
258 " with %u lines containing %d sequences.",
259 myLineNum, numSequences) > 0)
260 {
261 finishMessage += buffer;
262 logMessage(finishMessage.c_str());
263 }
264 if(sprintf(buffer,
265 "There were a total of %d errors.",
266 myNumErrors) > 0)
267 {
268 logMessage(buffer);
269 }
270
271 // Close the input file.
272 FastQStatus::Status closeStatus = closeFile();
273
274 if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID) &&
276 {
277 // Stopped validating due to some error other than invalid, so
278 // return that error.
279 return(status);
280 }
281 else if(myNumErrors == 0)
282 {
283 // No errors, check to see if there were any sequences.
284 // Finished processing all of the sequences in the file.
285 // If there are no sequences, report an error.
286 if(numSequences == 0)
287 {
288 // Empty file, return error.
289 logMessage("ERROR: No FastQSequences in the file.");
291 }
293 }
294 else
295 {
296 // The file is invalid. But check the close status. If the close
297 // failed, it means there is a problem with the file itself not just
298 // with validation, so the close failure should be returned.
299 if(closeStatus != FastQStatus::FASTQ_SUCCESS)
300 {
301 return(closeStatus);
302 }
304 }
305}
306
307
308// Reads and validates a single fastq sequence from myFile.
310{
311 // First verify that a file is open, if not, return failure.
312 if(!isOpen())
313 {
314 std::string message =
315 "ERROR: Trying to read a fastq file but no file is open.";
316 logMessage(message.c_str());
318 }
319
320 // Reset variables for each sequence.
321 resetForEachSequence();
322
323 bool valid = true;
324
325 // No sequence was read.
326 if(isTimeToQuit())
327 {
329 }
330
331 // The first line is the sequence identifier, so validate that.
332 valid = validateSequenceIdentifierLine();
333
334 if(myFileProblem)
335 {
337 }
338
339 // If we are at the end of the file, check to see if it is a partial
340 // sequence or just an empty line at the end.
341 if(ifeof(myFile))
342 {
343 // If the sequence identifier line was empty and we are at the
344 // end of the file, there is nothing more to validate.
345 if(mySequenceIdLine.Length() != 0)
346 {
347 // There was a sequence identifier line, so this is an incomplete
348 // sequence.
349 myErrorString = "Incomplete Sequence.\n";
350 reportErrorOnLine();
351
352 valid = false;
353 }
354 if(valid)
355 {
356 // Return failure - no sequences were left to read. At the end
357 // of the file. It wasn't invalid and it wasn't really an error.
359 }
360 else
361 {
363 }
364 }
365
366 // If enough errors, quit before reading any more.
367 if(isTimeToQuit())
368 {
369 // Means there was an error, so mark it as invalid.
371 }
372
373 // Validate the Raw Sequence Line(s) and the "+" line.
374 valid &= validateRawSequenceAndPlusLines();
375
376 if(myFileProblem)
377 {
379 }
380
381 // If enough errors, quit before reading any more.
382 if(isTimeToQuit())
383 {
385 }
386
387 // If it is the end of a file, it is missing the quality string.
388 if(ifeof(myFile))
389 {
390 // There was a sequence identifier line, so this is an incomplete
391 // sequence.
392 myErrorString = "Incomplete Sequence, missing Quality String.";
393 reportErrorOnLine();
394 valid = false;
396 }
397
398 // All that is left is to validate the quality string line(s).
399 valid &= validateQualityStringLines();
400
401 if(myFileProblem)
402 {
404 }
405
406 if(valid)
407 {
409 }
411}
412
413
414// Reads and validates the sequence identifier line of a fastq sequence.
415bool FastQFile::validateSequenceIdentifierLine()
416{
417 // Read the first line of the sequence.
418 int readStatus = mySequenceIdLine.ReadLine(myFile);
419
420 // Check to see if the read was successful.
421 if(readStatus <= 0)
422 {
423 // If EOF, not an error.
424 if(ifeof(myFile))
425 {
426 return true;
427 }
428 myFileProblem = true;
429 myErrorString = "Failure trying to read sequence identifier line";
430 reportErrorOnLine();
431 return false;
432 }
433
434 // If the line is 0 length and it is the end of the file, just
435 // return since this is the eof - no error.
436 if((mySequenceIdLine.Length() == 0) && (ifeof(myFile)))
437 {
438 // Not an error, just a new line at the end of the file.
439 return true;
440 }
441
442 // Increment the line number.
443 myLineNum++;
444
445 // Verify that the line has at least 2 characters: '@' and at least
446 // one character for the sequence identifier.
447 if(mySequenceIdLine.Length() < 2)
448 {
449 // Error. Sequence Identifier line not long enough.
450 myErrorString = "The sequence identifier line was too short.";
451 reportErrorOnLine();
452 return false;
453 }
454
455 // The sequence identifier line must start wtih a '@'
456 if(mySequenceIdLine[0] != '@')
457 {
458 // Error - sequence identifier line does not begin with an '@'.
459 myErrorString = "First line of a sequence does not begin with @";
460 reportErrorOnLine();
461 return false;
462 }
463
464 // Valid Sequence Identifier Line.
465
466 // The sequence identifier ends at the first space or at the end of the
467 // line if there is no space.
468 // Use fast find since this is a case insensitive search.
469 // Start at 1 since we know that 0 is '@'
470 int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1);
471
472 // Check if a " " was found.
473 if(endSequenceIdentifier == -1)
474 {
475 // Did not find a ' ', so the identifier is the rest of the line.
476 // It starts at 1 since @ is at offset 0.
477 mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
478 }
479 else
480 {
481 // Found a ' ', so the identifier ends just before that.
482 // The sequence identifier must be at least 1 character long,
483 // therefore the endSequenceIdentifier must be greater than 1.
484 if(endSequenceIdentifier <= 1)
485 {
486 myErrorString =
487 "No Sequence Identifier specified before the comment.";
488 reportErrorOnLine();
489 return false;
490 }
491
492 mySequenceIdentifier =
493 (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
494 }
495
496 // If myInterleaved, validate matches the previous seqID.
497 if(myInterleaved && (myPrevSeqID != ""))
498 {
499 // Valid if the sequence identifiers are identical or if
500 // the only difference is a trailing 1 or 2.
501 if(myPrevSeqID.compare(mySequenceIdentifier) != 0)
502 {
503 // Compare all but the last characters, then check the last characters for 1 or 2.
504 if((myPrevSeqID.compare(0, myPrevSeqID.length()-1, mySequenceIdentifier.c_str(), mySequenceIdentifier.Length()-1) != 0) ||
505 (((myPrevSeqID[myPrevSeqID.length()-1] != '1') || (mySequenceIdentifier[mySequenceIdentifier.Length()-1] != '2')) &&
506 (myPrevSeqID[myPrevSeqID.length()-1] != mySequenceIdentifier[mySequenceIdentifier.Length()-1])))
507 {
508 myErrorString = "Interleaved: consecutive reads do not have matching sequence identifiers: ";
509 myErrorString += mySequenceIdentifier.c_str();
510 myErrorString += " and ";
511 myErrorString += myPrevSeqID.c_str();
512 reportErrorOnLine();
513 myPrevSeqID.clear();
514 return(false);
515 }
516 }
517 myPrevSeqID.clear();
518 }
519 else
520 {
521 if(myInterleaved)
522 {
523 myPrevSeqID = mySequenceIdentifier.c_str();
524 }
525
526 // Check if sequence identifier should be validated for uniqueness if it is
527 // not the 2nd in an interleaved pair.
528 if(myCheckSeqID)
529 {
530 // Check to see if the sequenceIdentifier is a repeat by adding
531 // it to the set and seeing if it already existed.
532 std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
533 insertResult =
534 myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
535 myLineNum));
536
537 if(insertResult.second == false)
538 {
539 // Sequence Identifier is a repeat.
540 myErrorString = "Repeated Sequence Identifier: ";
541 myErrorString += mySequenceIdentifier.c_str();
542 myErrorString += " at Lines ";
543 myErrorString += insertResult.first->second;
544 myErrorString += " and ";
545 myErrorString += myLineNum;
546 reportErrorOnLine();
547 return(false);
548 }
549 }
550 }
551
552 // Valid, return true.
553 return(true);
554}
555
556
557// Reads and validates the raw sequence line(s) and the plus line. Both are
558// included in one method since it is unknown when the raw sequence line
559// ends until you find the plus line that divides it from the quality
560// string. Since this method will read the plus line to know when the
561// raw sequence ends, it also validates that line.
562bool FastQFile::validateRawSequenceAndPlusLines()
563{
564 // Read the raw sequence.
565 int readStatus = myRawSequence.ReadLine(myFile);
566
567 myLineNum++;
568
569 if(readStatus <= 0)
570 {
571 myFileProblem = true;
572 myErrorString = "Failure trying to read sequence line";
573 reportErrorOnLine();
574 return false;
575 }
576
577 // Offset into the raw sequence to be validated.
578 int offset = 0;
579
580 // Validate the raw sequence.
581 bool valid = validateRawSequence(offset);
582
583 // Increment the offset for what was just read.
584 offset = myRawSequence.Length();
585
586 // The next line is either a continuation of the raw sequence or it starts
587 // with a '+'
588 // Keep validating til the '+' line or the end of file is found.
589 bool stillRawLine = true;
590 while(stillRawLine &&
591 !ifeof(myFile))
592 {
593 // If enough errors, quit before reading any more.
594 if(isTimeToQuit())
595 {
596 return(false);
597 }
598
599 // Read the next line.
600 // Read into the plus line, but if it isn't a plus line, then
601 // it will be copied into the raw sequence line.
602 readStatus = myPlusLine.ReadLine(myFile);
603 myLineNum++;
604
605 if(readStatus <= 0)
606 {
607 myFileProblem = true;
608 myErrorString = "Failure trying to read sequence/plus line";
609 reportErrorOnLine();
610 return false;
611 }
612
613 // Check if the next line is blank
614 if(myPlusLine.Length() == 0)
615 {
616 // The next line is blank. Assume it is part of the raw sequence and
617 // report an error since there are no valid characters on the line.
618 myErrorString =
619 "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
620 reportErrorOnLine();
621 }
622 // Check for the plus line.
623 else if(myPlusLine[0] == '+')
624 {
625 // This is the + line.
626 valid &= validateSequencePlus();
627 stillRawLine = false;
628 }
629 else
630 {
631 // Not a plus line, so assume this is a continuation of the Raw
632 // Sequence.
633 // Copy from the plus line to the raw sequence line.
634 myRawSequence += myPlusLine;
635 myPlusLine.SetLength(0);
636 valid &= validateRawSequence(offset);
637
638 // Increment the offset.
639 offset = myRawSequence.Length();
640 }
641 }
642
643 // If enough errors, quit before reading any more.
644 if(isTimeToQuit())
645 {
646 return(false);
647 }
648
649 // Now that the entire raw sequence has been obtained, check its length
650 // against the minimum allowed length.
651 if(myRawSequence.Length() < myMinReadLength)
652 {
653 // Raw sequence is not long enough - error.
654 myErrorString = "Raw Sequence is shorter than the min read length: ";
655 myErrorString += myRawSequence.Length();
656 myErrorString += " < ";
657 myErrorString += myMinReadLength;
658 reportErrorOnLine();
659 valid = false;
660 }
661
662 // If enough errors, quit before reading any more.
663 if(isTimeToQuit())
664 {
665 return(false);
666 }
667
668 // if the flag still indicates it is processing the raw sequence that means
669 // we reached the end of the file without a '+' line. So report that error.
670 if(stillRawLine)
671 {
672 myErrorString = "Reached the end of the file without a '+' line.";
673 reportErrorOnLine();
674 valid = false;
675 }
676
677 return(valid);
678}
679
680
681// Reads and validates the quality string line(s).
682bool FastQFile::validateQualityStringLines()
683{
684 // Read the quality string.
685 int readStatus = myQualityString.ReadLine(myFile);
686 myLineNum++;
687
688 if(readStatus <= 0)
689 {
690 myFileProblem = true;
691 myErrorString = "Failure trying to read quality line";
692 reportErrorOnLine();
693 return false;
694 }
695
696 // track the offset into the quality string to validate.
697 int offset = 0;
698
699 // Validate this line of the quality string.
700 bool valid = validateQualityString(offset);
701
702 offset = myQualityString.Length();
703
704 // Keep reading quality string lines until the length of the
705 // raw sequence has been hit or the end of the file is reached.
706 while((myQualityString.Length() < myRawSequence.Length()) &&
707 (!ifeof(myFile)))
708 {
709 // If enough errors, quit before reading any more.
710 if(isTimeToQuit())
711 {
712 return(false);
713 }
714
715 // Read another line of the quality string.
716 readStatus = myTempPartialQuality.ReadLine(myFile);
717 myLineNum++;
718
719 if(readStatus <= 0)
720 {
721 myFileProblem = true;
722 myErrorString = "Failure trying to read quality line";
723 reportErrorOnLine();
724 return false;
725 }
726
727 myQualityString += myTempPartialQuality;
728 myTempPartialQuality.Clear();
729
730 // Validate this line of the quality string.
731 valid &= validateQualityString(offset);
732 offset = myQualityString.Length();
733 }
734
735 // If enough errors, quit before reading any more.
736 if(isTimeToQuit())
737 {
738 return(false);
739 }
740
741 // Validate that the quality string length is the same as the
742 // raw sequence length.
743 if(myQualityString.Length() != myRawSequence.Length())
744 {
745 myErrorString = "Quality string length (";
746 myErrorString += myQualityString.Length();
747 myErrorString += ") does not equal raw sequence length (";
748 myErrorString += myRawSequence.Length();
749 myErrorString += ")";
750 reportErrorOnLine();
751 valid = false;
752 }
753 return(valid);
754}
755
756
757// Method to validate a line that contains part of the raw sequence.
758bool FastQFile::validateRawSequence(int offset)
759{
760 bool validBase = false;
761 bool valid = true;
762
763 // Loop through validating each character is valid for the raw sequence.
764 for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
765 sequenceIndex++)
766 {
767 // Update the composition for this position. Returns false if the
768 // character was not a valid base.
769 validBase =
770 myBaseComposition.updateComposition(sequenceIndex,
771 myRawSequence[sequenceIndex]);
772 // Check the return
773 if(!validBase)
774 {
775 // Error, found a value that is not a valid base character.
776 myErrorString = "Invalid character ('";
777 myErrorString += myRawSequence[sequenceIndex];
778 myErrorString += "') in base sequence.";
779 reportErrorOnLine();
780 valid = false;
781 // If enough errors, quit before reading any more.
782 if(isTimeToQuit())
783 {
784 return(false);
785 }
786 }
787 }
788 return(valid);
789}
790
791
792// Method to validate the "+" line that seperates the raw sequence and the
793// quality string.
794bool FastQFile::validateSequencePlus()
795{
796 // Validate that optional sequence identifier is the same
797 // as the one on the @ line.
798
799 // Check to see if there is more to the line than just the plus
800 int lineLength = myPlusLine.Length();
801
802 // If the line is only 1 character or the second character is a space,
803 // then there is no sequence identifier on this line and there is nothing
804 // further to validate.
805 if((lineLength == 1) || (myPlusLine[1] == ' '))
806 {
807 // No sequence identifier, so just return valid.
808 return true;
809 }
810
811 // There is a sequence identifier on this line, so validate that
812 // it matches the one from the associated @ line.
813 // The read in line must be at least 1 more character ('+') than the
814 // sequence identifier read from the '@' line.
815 // If it is not longer than the sequence identifier, then we know that it
816 // cannot be the same.
817 int sequenceIdentifierLength = mySequenceIdentifier.Length();
818 if(lineLength <= sequenceIdentifierLength)
819 {
820 myErrorString =
821 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
822 reportErrorOnLine();
823 return false;
824 }
825
826 bool same = true;
827 int seqIndex = 0;
828 int lineIndex = 1; // Start at 1 since offset 0 has '+'
829
830 // Loop through the sequence index and the line buffer verifying they
831 // are the same until a difference is found or the end of the sequence
832 // identifier is found.
833 while((same == true) && (seqIndex < sequenceIdentifierLength))
834 {
835 if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
836 {
837 myErrorString =
838 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
839 reportErrorOnLine();
840 same = false;
841 }
842 lineIndex++;
843 seqIndex++;
844 }
845 return(same);
846}
847
848
849// Method to validate the quality string.
850bool FastQFile::validateQualityString(int offset)
851{
852 bool valid = true;
853 if(myQualityString.Length() > (int)(myQualPerCycle.size()))
854 {
855 myQualPerCycle.resize(myQualityString.Length());
856 myCountPerCycle.resize(myQualityString.Length());
857 }
858 // For each character in the line, verify that it is ascii > 32.
859 for(int i=offset; i < myQualityString.Length(); i++)
860 {
861 if(myQualityString[i] <= 32)
862 {
863 myErrorString = "Invalid character ('";
864 myErrorString += myQualityString[i];
865 myErrorString += "') in quality string.";
866 reportErrorOnLine();
867 valid = false;
868 // If enough errors, quit before reading any more.
869 if(isTimeToQuit())
870 {
871 return(false);
872 }
873 }
874 else
875 {
876 myQualPerCycle[i] += BaseUtilities::getPhredBaseQuality(myQualityString[i]);
877 myCountPerCycle[i] += 1;
878 }
879 }
880 return(valid);
881}
882
883
884// Helper method for printing the contents of myErrorString. It will
885// only print the errors until the maximum number of reportable errors is
886// reached.
887void FastQFile::reportErrorOnLine()
888{
889 // Increment the total number of errors.
890 myNumErrors++;
891
892 // Only display the first X number of errors.
893 if(myNumErrors <= myNumPrintableErrors)
894 {
895 // Write the error with the line number.
896 char buffer[100];
897 sprintf(buffer, "ERROR on Line %u: ", myLineNum);
898 std::string message = buffer;
899 message += myErrorString.c_str();
900 logMessage(message.c_str());
901 }
902}
903
904
905// Reset member data that is unique for each fastQFile.
906void FastQFile::reset()
907{
908 // Each fastq file processing needs to also reset the member data for each
909 // sequence.
910 resetForEachSequence();
911 myNumErrors = 0; // per fastqfile
912 myLineNum = 0; // per fastqfile
913 myFileName.SetLength(0); // reset the filename string.
914 myIdentifierMap.clear(); // per fastqfile
915 myBaseComposition.clear(); // clear the base composition.
916 myQualPerCycle.clear();
917 myCountPerCycle.clear();
918 myFileProblem = false;
919}
920
921
922// Reset the member data that is unique for each sequence.
923void FastQFile::resetForEachSequence()
924{
925 myLineBuffer.SetLength(0);
926 myErrorString.SetLength(0);
927 myRawSequence.SetLength(0);
928 mySequenceIdLine.SetLength(0);
929 mySequenceIdentifier.SetLength(0);
930 myPlusLine.SetLength(0);
931 myQualityString.SetLength(0);
932 myTempPartialQuality.SetLength(0);
933}
934
935
936void FastQFile::logMessage(const char* logMessage)
937{
938 // Write the message if they are not disabled.
939 if(!myDisableMessages)
940 {
941 std::cout << logMessage << std::endl;
942 }
943}
944
945
946// Determine if it is time to quit by checking if we are to quit after a
947// certain number of errors and that many errors have been encountered.
948bool FastQFile::isTimeToQuit()
949{
950 // It is time to quit if we are to quit after a certain number of errors
951 // and that many errors have been encountered.
952 if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
953 {
954 return(true);
955 }
956 return(false);
957}
958
959
960void FastQFile::printAvgQual()
961{
962 std::cout << std::endl << "Average Phred Quality by Read Index (starts at 0):" << std::endl;
963 std::cout.precision(2);
964 std::cout << std::fixed << "Read Index\tAverage Quality"
965 << std::endl;
966 if(myQualPerCycle.size() != myCountPerCycle.size())
967 {
968 // This is a code error and should NEVER happen.
969 std::cerr << "ERROR calculating the average Qualities per cycle\n";
970 }
971
972 double sumQual = 0;
973 double count = 0;
974 double avgQual = 0;
975 for(unsigned int i = 0; i < myQualPerCycle.size(); i++)
976 {
977 avgQual = 0;
978 if(myCountPerCycle[i] != 0)
979 {
980 avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
981 }
982 std::cout << i << "\t" << avgQual << "\n";
983 sumQual += myQualPerCycle[i];
984 count += myCountPerCycle[i];
985 }
986 std::cout << std::endl;
987 avgQual = 0;
988 if(count != 0)
989 {
990 avgQual = sumQual / count;
991 }
992 std::cout << "Overall Average Phred Quality = " << avgQual << std::endl;
993}
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
SPACE_TYPE
The type of space (color or base) to use in the mapping.
Definition: BaseAsciiMap.h:44
bool updateComposition(unsigned int rawSequenceCharIndex, char baseChar)
Update the composition for the specified index with the specified character.
void clear()
Clear the composition stored in the base count vector.
void setBaseMapType(BaseAsciiMap::SPACE_TYPE spaceType)
Set the base map type for this composition.
void print()
Print the composition.
void resetBaseMapType()
Reset the base map type for this composition.
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
void interleaved()
Interleaved.
Definition: FastQFile.cpp:78
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
Definition: FastQFile.cpp:92
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
Definition: FastQFile.cpp:71
void disableMessages()
Disable messages - do not write to cout.
Definition: FastQFile.cpp:49
bool isOpen()
Check to see if the file is open.
Definition: FastQFile.cpp:162
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
Definition: FastQFile.cpp:63
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
Definition: FastQFile.cpp:309
FastQStatus::Status closeFile()
Close a FastQFile.
Definition: FastQFile.cpp:134
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
Definition: FastQFile.cpp:204
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a proble...
Definition: FastQFile.cpp:193
void enableMessages()
Enable messages - write to cout.
Definition: FastQFile.cpp:55
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
Definition: FastQFile.cpp:85
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
Definition: FastQFile.cpp:30
bool isEof()
Check to see if the file is at the end of the file.
Definition: FastQFile.cpp:177
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
Definition: FastQStatus.h:31
@ FASTQ_ORDER_ERROR
means the methods are called out of order, like trying to read a file before opening it.
Definition: FastQStatus.h:34
@ FASTQ_READ_ERROR
means that a problem occurred on a read.
Definition: FastQStatus.h:37
@ FASTQ_SUCCESS
indicates method finished successfully.
Definition: FastQStatus.h:32
@ FASTQ_INVALID
means that the sequence was invalid.
Definition: FastQStatus.h:33
@ FASTQ_OPEN_ERROR
means the file could not be opened.
Definition: FastQStatus.h:35
@ FASTQ_NO_SEQUENCE_ERROR
means there were no errors, but no sequences read.
Definition: FastQStatus.h:38
@ FASTQ_CLOSE_ERROR
means the file could not be closed.
Definition: FastQStatus.h:36
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition: InputFile.h:423