libStatGen Software 1
FastQFile.h
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef __FASTQ_VALIDATOR_H__
19#define __FASTQ_VALIDATOR_H__
20
21#include <iostream>
22#include <map>
23#include "StringBasics.h"
24#include "InputFile.h"
25#include "BaseComposition.h"
26#include "FastQStatus.h"
27
28/// Class for reading/validating a fastq file.
29class FastQFile
30{
31 public:
32 /// Constructor.
33 /// \param minReadLength The minimum length that a base sequence must be for
34 /// it to be valid.
35 /// \param numPrintableErrors The maximum number of errors that should be reported
36 /// in detail before suppressing the errors.
37 FastQFile(int minReadLength = 10, int numPrintableErrors = 20);
38
39 /// Disable messages - do not write to cout.
40 void disableMessages();
41
42 /// Enable messages - write to cout.
43 void enableMessages();
44
45 /// Disable Unique Sequence ID checking
46 /// (Unique Sequence ID checking is enabled by default).
47 void disableSeqIDCheck();
48
49 /// Enable Unique Sequence ID checking.
50 /// (Unique Sequence ID checking is enabled by default).
51 void enableSeqIDCheck();
52
53 /// Interleaved.
54 void interleaved();
55
56 /// Set the number of errors after which to quit reading/validating a file,
57 /// defaults to -1.
58 /// \param maxErrors # of errors before quitting,
59 /// -1 indicates to not quit until the entire file has been read/validated (default),
60 /// 0 indicates to quit without reading/validating anything.
61 void setMaxErrors(int maxErrors);
62
63 /// Open a FastQFile.
64 /// Use the specified SPACE_TYPE to determine BASE, COLOR, or UNKNOWN.
65 FastQStatus::Status openFile(const char* fileName,
66 BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN);
67
68 /// Close a FastQFile.
69 FastQStatus::Status closeFile();
70
71 /// Check to see if the file is open.
72 bool isOpen();
73
74 /// Check to see if the file is at the end of the file.
75 bool isEof();
76
77 /// Returns whether or not to keep reading the file,
78 /// it stops reading (false) if eof or there is a problem reading the file.
79 bool keepReadingFile();
80
81 /// Validate the specified fastq file
82 /// \param filename fastq file to be validated.
83 /// \param printBaseComp whether or not to print the base composition for the file.
84 /// true means print it, false means do not.
85 /// \param spaceType the spaceType to use for validation - BASE_SPACE, COLOR_SPACE,
86 /// or UNKNOWN (UNKNOWN means to determine the spaceType to
87 /// validate against from the first character of the first
88 /// sequence).
89 /// \param printQualAvg whether or not to print the quality averages for the file.
90 /// true means to print it, false (default) means do not.
91 /// \return the fastq validation status, SUCCESS on a successfully
92 /// validated fastq file.
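93 FastQStatus::Status validateFastQFile(const String& filename,
94 bool printBaseComp,
95 BaseAsciiMap::SPACE_TYPE spaceType,
96 bool printQualAvg = false);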
97
98 /// Read 1 FastQSequence, validating it.
99 FastQStatus::Status readFastQSequence();
100
101 ///////////////////////
102 /// @name Public Sequence Line variables.
103 /// Keep public variables for a sequence's line so they can be accessed
104 /// without having to do string copies.
105 //@{
106 String myRawSequence;
107 String mySequenceIdLine;
108 String mySequenceIdentifier;
109 String myPlusLine;
110 String myQualityString;
111 //@}
112
113 /// Get the space type used for this file.
114 BaseAsciiMap::SPACE_TYPE getSpaceType()
115 {
116 return(myBaseComposition.getSpaceType());
117 }
118
119private:
120 // Validates a single fastq sequence from myFile.
121 bool validateFastQSequence();
122
123 // Reads and validates the sequence identifier line of a fastq sequence.
124 bool validateSequenceIdentifierLine();
125
126 // Reads and validates the raw sequence line(s) and the plus line. Both are
127 // included in one method since it is unknown when the raw sequence line
128 // ends until you find the plus line that divides it from the quality
129 // string. Since this method will read the plus line to know when the
130 // raw sequence ends, it also validates that line.
131 bool validateRawSequenceAndPlusLines();
132
133 // Reads and validates the quality string line(s).
134 bool validateQualityStringLines();
135
136 // Method to validate a line that contains part of the raw sequence.
137 // offset specifies where in the sequence to start validating.
138 bool validateRawSequence(int offset);
139
140 // Method to validate the "+" line that separates the raw sequence and the
141 // quality string.
142 bool validateSequencePlus();
143
144 // Method to validate the quality string.
145 // offset specifies where in the quality string to start validating.
146 bool validateQualityString(int offset);
147
148 // Helper method to read a line from the input file into a string.
149 // It also tracks the line number.
150 void readLine();
151
152 // Helper method for printing the contents of myErrorString. It will
153 // only print the errors until the maximum number of reportable errors is
154 // reached.
155 void reportErrorOnLine();
156
157 // Reset the member data for each fastq file.
158 void reset();
159
160 // Reset the member data for each sequence.
161 void resetForEachSequence();
162
163 // Log the specified message if enabled.
164 void logMessage(const char* message);
165
166 // Determine if it is time to quit by checking if we are to quit after a
167 // certain number of errors and that many errors have been encountered.
168 bool isTimeToQuit();
169
170 void printAvgQual();
171
172 //////////////////////////////////////////////////////////////////////
173 // Following member data elements are reset for each validated sequence.
174 //
175
176 // Buffer for storing the contents of the line read.
177 // Stored as member data so memory allocation is only done once.
178 String myLineBuffer;
179
180 // Buffer for storing the error string. This prevents the reallocation of
181 // the string buffer for each error.
182 String myErrorString;
183
184 String myTempPartialQuality;
185
186 //////////////////////////////////////////////////////////////////////
187 // Following member data elements are reset for each validated file.
188 //
189 IFILE myFile; // Input file to be read.
190 String myFileName; // Name of file being processed.
191 int myNumErrors; // Tracks the number of errors.
192 unsigned int myLineNum; // Track the line number - used for reporting errors.
193 BaseComposition myBaseComposition; // Tracks the base composition.
194 std::vector<int> myQualPerCycle; // Tracks the quality by cycle.
195 std::vector<int> myCountPerCycle; // Tracks the number of entries by cycle.
196
197 // Whether or not to check the sequence identifier for uniqueness.
198 // Checking may use up a lot of memory.
199 bool myCheckSeqID;
200
201 // Whether or not to check that the file is interleaved.
202 // Disabled by myCheckSeqID
203 bool myInterleaved;
204
205 // Previous sequence id for checking interleaved.
206 std::string myPrevSeqID;
207
208 // Map to track which identifiers have appeared in the file.
209 std::map<std::string, unsigned int> myIdentifierMap;
210
211 //////////////////////////////////////////////////////////////////////
212 // Following member data do not change for each call to the validator.
213 //
214 int myMinReadLength; // Min Length for a read.
215 int myNumPrintableErrors; // Max number of errors to print the details of.
216
217 // Number of errors after which to quit reading/validating a file.
218 // Defaults to -1.
219 // -1 indicates to not quit until the entire file has been read/validated.
220 // 0 indicates to quit without reading/validating anything.
221 int myMaxErrors;
222
223 // Whether or not messages should be printed.
224 // Defaults to false, meaning messages are printed.
225 bool myDisableMessages;
226
227 // Track if there is a problem reading the file. If there are read
228 // problems, stop reading the file.
229 bool myFileProblem;
230};
231
232#endif
SPACE_TYPE
The type of space (color or base) to use in the mapping.
Definition: BaseAsciiMap.h:44
@ UNKNOWN
Base decision on the first raw seq character/type has yet to be determined.
Definition: BaseAsciiMap.h:47
Class that tracks the composition of base by read location.
BaseAsciiMap::SPACE_TYPE getSpaceType()
Get the space type for this composition.
Class for reading/validating a fastq file.
Definition: FastQFile.h:30
void interleaved()
Interleaved.
Definition: FastQFile.cpp:78
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
Definition: FastQFile.cpp:92
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
Definition: FastQFile.cpp:71
void disableMessages()
Disable messages - do not write to cout.
Definition: FastQFile.cpp:49
bool isOpen()
Check to see if the file is open.
Definition: FastQFile.cpp:162
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
Definition: FastQFile.cpp:63
BaseAsciiMap::SPACE_TYPE getSpaceType()
Get the space type used for this file.
Definition: FastQFile.h:114
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
Definition: FastQFile.cpp:309
FastQStatus::Status closeFile()
Close a FastQFile.
Definition: FastQFile.cpp:134
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
Definition: FastQFile.cpp:204
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a problem reading the file.
Definition: FastQFile.cpp:193
void enableMessages()
Enable messages - write to cout.
Definition: FastQFile.cpp:55
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
Definition: FastQFile.cpp:85
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
Definition: FastQFile.cpp:30
bool isEof()
Check to see if the file is at the end of the file.
Definition: FastQFile.cpp:177
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
Definition: FastQStatus.h:31
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37