libStatGen Software 1
PedigreeAlleleFreq.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include "PedigreeAlleleFreq.h"
19#include "QuickIndex.h"
20#include "Error.h"
21
22#include <math.h>
23
24int CountAlleles(Pedigree & /* ped */, int marker)
25{
26 // With automatic recoding in the pedigree loader there
27 // is no need to iterate through the pedigree ...
28 MarkerInfo * info = Pedigree::GetMarkerInfo(marker);
29
30 return info->CountAlleles();
31}
32
33void LumpAlleles(Pedigree & ped, int marker, double threshold, bool reorder)
34{
35 // find out how many alleles there are
36 int alleles = ped.CountAlleles(marker);
37
38 if (alleles < 2) return;
39
40 MarkerInfo * info = PedigreeGlobals::GetMarkerInfo(marker);
41
42 if (alleles < info->freq.Length())
43 alleles = info->freq.Length() - 1;
44
45 IntArray counts(alleles + 1);
46 counts.Zero();
47
48 // Count number of occurrences for each allele
49 for (int i = 0; i < ped.count; i++)
50 {
51 counts[int(ped[i].markers[marker][0])]++;
52 counts[int(ped[i].markers[marker][1])]++;
53 }
54
55 // Calculate treshold for lumping alleles
56 int total = 0;
57 for (int i = 1; i <= alleles; i++)
58 total += counts[i];
59 int thresh = int(total * threshold);
60
61 // If threshold is set at zero, we artificially increase
62 // counts for alleles that do not appear in the pedigree
63 // but whose frequencies are set > 0.0. This ensures that
64 // allele frequency data does not get discarded when simply
65 // recoding alleles (vs. lumping)
66 if (thresh == 0)
67 for (int i = 1; i < info->freq.Length(); i++)
68 if (counts[i] == 0 && info->freq[i] > 0.0)
69 counts[i] = 1, total++;
70
71 // If allele reordering is disabled, put in dummy allele
72 // counts so as to ensure that allele have desired ordering
73 if (!reorder)
74 {
75 QuickIndex index(info->alleleLabels);
76 index.Reverse();
77
78 for (int i = 0; i < index.Length(); i++)
79 counts[index[i]] = i + 1;
80
81 total = counts.Sum(1, counts.Length() - 1);
82 }
83
84 // Order all alleles according to their frequency
85 // Zero, which corresponds to missing values, stays put!
86 counts[0] = total + 1;
87 QuickIndex index(counts);
88 index.Reverse();
89
90 // recode alleles
91 // all alleles where frequency < thresh are labelled N
92 // use counts array to keep track of labels
93 int N = 0;
94 bool rare = false;
95 for (int i = 0; i <= alleles; i++)
96 if (counts[index[i]] > thresh)
97 {
98 counts[index[i]] = i;
99 N++;
100 }
101 else
102 {
103 if (counts[index[i]] > 0)
104 rare = true;
105 counts[index[i]] = N;
106 }
107
108 // This loop does the recoding
109 for (int i = 0; i < ped.count; i++)
110 {
111 Alleles & current = ped[i].markers[marker];
112 current[0] = counts[current[0]];
113 current[1] = counts[current[1]];
114 }
115
116 StringArray oldLabels(info->alleleLabels);
117 String label;
118
119 info->alleleLabels.Clear();
120 info->alleleNumbers.Clear();
121
122 for (int i = 0; i < N; i++)
123 {
124 if (oldLabels.Length() <= index[i])
125 info->alleleLabels.Push(label = index[i]);
126 else
127 info->alleleLabels.Push(oldLabels[index[i]]);
128
129 if (i) info->alleleNumbers.SetInteger(info->alleleLabels.Last(), i);
130 }
131
132 // Reorder allele frequencies if necessary
133 if (info->freq.Length())
134 {
135 Vector freq(info->freq);
136
137 info->freq.Dimension(N);
138 info->freq[0] = 0.0;
139
140 for (int i = 1; i < N; i++)
141 {
142 info->freq[i] = freq[index[i]];
143 freq[index[i]] = 0;
144 }
145
146 if ((1.0 - info->freq.Sum()) > 1e-10)
147 rare = true;
148
149 if (rare)
150 {
151 info->freq.Dimension(N + 1);
152 info->freq[N] = 1.0 - info->freq.Sum();
153 }
154 }
155
156 if (rare)
157 {
158 info->alleleLabels.Push("OTHER");
159 info->alleleNumbers.SetInteger("OTHER", info->alleleLabels.Length());
160 }
161}
162
163bool EstimateFrequencies(Pedigree & ped, int marker, int estimator)
164{
165 int alleleCount = CountAlleles(ped, marker);
166
167 IntArray founder(alleleCount + 1);
168 IntArray all(alleleCount + 1);
169
170 founder.Zero();
171 all.Zero();
172
173 for (int i = 0; i < ped.count; i++)
174 {
175 // When counting alleles, note that males only carry one X chromosome
176 // and are arbitrarily scored as homozygous.
177 all[ped[i].markers[marker][0]]++;
178 if (!ped.chromosomeX || ped[i].sex != SEX_MALE)
179 all[ped[i].markers[marker][1]]++;
180 if (!ped[i].isFounder()) continue;
181 founder[ped[i].markers[marker][0]]++;
182 if (!ped.chromosomeX || ped[i].sex != SEX_MALE)
183 founder[ped[i].markers[marker][1]]++;
184 }
185
186 MarkerInfo * info = ped.GetMarkerInfo(marker);
187
188 if (info->freq.dim > 0)
189 {
190 // previous allele frequency information is available
191 if (alleleCount >= info->freq.dim)
192 error("For marker %s, input files define %d alleles, but at least\n"
193 "one other allele (named '%s') occurs in the pedigree\n",
194 (const char *) info->name, info->freq.dim - 1,
195 (const char *) info->GetAlleleLabel(alleleCount));
196
197 for (int i = 1; i <= alleleCount; i++)
198 if (all[i] > 0 && info->freq[i] <= 0.0)
199 error("Although allele %s for marker %s has frequency zero,\n"
200 "it occurs %d times in the pedigree",
201 (const char *) info->GetAlleleLabel(i), (const char *) info->name, all[i]);
202
203 return false;
204 }
205 else
206 {
207 if (alleleCount < 1)
208 {
209 // If no one is genotyped, default to two equifrequent allele
210 // since some programs do not like monomorphic markers
211 info->freq.Dimension(3);
212 info->freq[0] = 0.0;
213 info->freq[1] = 0.99999;
214 info->freq[2] = 0.00001;
215 return true;
216 }
217
218 info->freq.Dimension(alleleCount + 1);
219 info->freq.Zero();
220
221 if (estimator == FREQ_FOUNDERS && founder.Sum() > founder[0])
222 {
223 // Make sure the frequency of alleles occuring in the pedigree
224 // is never zero
225 for (int i = 1; i <= alleleCount; i++)
226 if (founder[i] == 0 && all[i] > 0)
227 founder[i] = 1;
228
229 // To get frequencies, just multiply counts by 1 / total_counts
230 double factor = 1.0 / (founder.Sum() - founder[0]);
231
232 for (int i = 1; i <= alleleCount; i++)
233 info->freq[i] = founder[i] * factor;
234 }
235 else if (estimator == FREQ_ALL || estimator == FREQ_FOUNDERS)
236 {
237 // To get frequencies, just multiply counts by 1 / total_counts
238 double factor = 1.0 / (all.Sum() - all[0]);
239
240 for (int i = 1; i <= alleleCount; i++)
241 info->freq[i] = all[i] * factor;
242 }
243 else if (estimator == FREQ_EQUAL)
244 // Assume all alleles have equal frequency
245 {
246 // Count the number of observed alleles
247 all[0] = 0;
248 int alleles = all.CountIfGreater(0);
249 double freq = 1.0 / alleles;
250
251 // Set equal frequencies for all occuring alleles
252 for (int i = 0; i <= alleleCount; i++)
253 info->freq[i] = all[i] ? freq : 0.0;
254 }
255 }
256
257 return true;
258}
259