0
|
1 /* The MIT License
|
|
2
|
|
3 Copyright (c) 2009, by Sohrab Shah <sshah@bccrc.ca> and Rodrigo Goya <rgoya@bcgsc.ca>
|
|
4
|
|
5 Permission is hereby granted, free of charge, to any person obtaining
|
|
6 a copy of this software and associated documentation files (the
|
|
7 "Software"), to deal in the Software without restriction, including
|
|
8 without limitation the rights to use, copy, modify, merge, publish,
|
|
9 distribute, sublicense, and/or sell copies of the Software, and to
|
|
10 permit persons to whom the Software is furnished to do so, subject to
|
|
11 the following conditions:
|
|
12
|
|
13 The above copyright notice and this permission notice shall be
|
|
14 included in all copies or substantial portions of the Software.
|
|
15
|
|
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23 SOFTWARE.
|
|
24 */
|
|
25 #include <stdio.h>
|
|
26 #include <stdlib.h>
|
|
27 #include <string.h>
|
|
28 #include <math.h>
|
|
29 #include <unistd.h>
|
|
30 #include <stdbool.h>
|
|
31 #include "sam.h"
|
|
32 #include "faidx.h"
|
|
33 #include "khash.h"
|
|
34
|
|
35 #define TYPE_mb 0
|
|
36 #define TYPE_m 1
|
|
37 #define TYPE_b 2
|
|
38 #define TYPE_M 3
|
|
39 #define TYPE_Mb 4
|
|
40 #define TYPE_MB 5
|
|
41 #define TYPE_SNVMix1 6
|
|
42
|
|
43 #define Q_PILEUP 0
|
|
44 #define M_PILEUP 1
|
|
45 #define S_PILEUP 2
|
|
46 #define BAM_FILE 3
|
|
47
|
|
48 #define PHRED_MAX 200
|
|
49
|
|
50 #define N 0
|
|
51 #define A 1
|
|
52 #define G 2
|
|
53 #define C 3
|
|
54 #define T 4
|
|
55
|
|
56 char base_code[] = {'N','A','G','C','T'};
|
|
57
|
|
58 struct params_train {
|
|
59 long double *alpha;
|
|
60 long double *beta;
|
|
61 long double *delta;
|
|
62 char *param_file;
|
|
63 int max_iter;
|
|
64 unsigned char **bQ;
|
|
65 unsigned char **mQ;
|
|
66 signed char **calls;
|
|
67 char *ref;
|
|
68 int *pos;
|
|
69 int *depth;
|
|
70 int len;
|
|
71 };
|
|
72
|
|
73 typedef struct {
|
|
74 FILE *input;
|
|
75 FILE *output;
|
|
76 char *inputfile;
|
|
77 char *outputfile;
|
|
78 char *modelfile;
|
|
79 char *bamfile;
|
|
80 char *fastafile;
|
|
81 char *listfile;
|
|
82 int filter_type;
|
|
83 int filter_chast;
|
|
84 int filter_dup;
|
|
85 int train;
|
|
86 int classify;
|
|
87 int filter;
|
|
88 int full;
|
|
89 int input_type; // 0 = processed, 1 = maq pileup, 2 = sam pileup
|
|
90 long double *mu;
|
|
91 long double *pi;
|
|
92 int max_iter;
|
|
93 int bQ;
|
|
94 int mQ;
|
|
95 int debug;
|
|
96 //struct {
|
|
97 // int alpha[3];
|
|
98 // int beta[3];
|
|
99 //} train;
|
|
100 struct params_train trainP;
|
|
101 int states;
|
|
102 int window;
|
|
103 } param_struct;
|
|
104
|
|
105 typedef int *indel_list_t;
|
|
106 KHASH_MAP_INIT_INT64(64, indel_list_t)
|
|
107
|
|
108 typedef struct{
|
|
109 bool in_use;
|
|
110 uint32_t tid; // get with snvmix_pl->in->header->target_name[tid],
|
|
111 uint32_t pos;
|
|
112 char ref;
|
|
113 char nref;
|
|
114 int ref_count;
|
|
115 int nref_count;
|
|
116 long double *p;
|
|
117 int maxP;
|
|
118 // 0 = REF , 1 = NREF
|
|
119 int forward[2];
|
|
120 int reverse[2];
|
|
121 int good_pair[2];
|
|
122 int bad_pair[2];
|
|
123 int c_clean[2];
|
|
124 int c_ins[2];
|
|
125 int c_del[2];
|
|
126 int c_junc[2];
|
|
127 int nref_edges; // FIXME: fixed 5 bp edges
|
|
128 int nref_center; //
|
|
129 int indel_pos; // How many reads have a deletion at that site
|
|
130 int indel_near; // How many reasd have a deletion overall
|
|
131 int aln_unique_pos;
|
|
132 int full_depth; // Full depth at this position
|
|
133 int raw_cvg[5];
|
|
134 int thr_cvg[5];
|
|
135 } snvmix_call_t;
|
|
136
|
|
137 typedef struct {
|
|
138 param_struct *params;
|
|
139 int begin, end;
|
|
140 samfile_t *in;
|
|
141 faidx_t *fai;
|
|
142 int tid, len;
|
|
143 char *ref;
|
|
144 /* might want to move this elsewhere */
|
|
145 khash_t(64) *hash;
|
|
146 long double *notmu, *log_pi, *b;
|
|
147 long double phred[PHRED_MAX + 1];
|
|
148 int *calls, depth_allocated;
|
|
149 snvmix_call_t *buffer;
|
|
150 int n_buffer, b_start, b_end;
|
|
151 // Extra flags
|
|
152 } snvmix_pl_t;
|
|
153
|
|
154
|
|
155 void updatePhred(long double *phredTable);
|
|
156 void initPhred(long double *phredTable, int elem);
|
|
157
|
|
158 void resetParams(param_struct *params);
|
|
159 void initSNVMix(int argc , char **argv, param_struct *params);
|
|
160 void usage(char *selfname);
|
|
161
|
|
162 void allocateParameters(param_struct *params);
|
|
163 void setTrainingParameters(param_struct *params);
|
|
164 void setClassificationParameters(param_struct *params);
|
|
165 void readParamFile(param_struct *params, char type);
|
|
166
|
|
167
|
|
168 void snvmixClassify_qualities(param_struct *params);
|
|
169 void snvmixClassify_pileup(param_struct *params);
|
|
170
|
|
171 int snvmixClassify_bamfile(param_struct *params);
|
|
172 static int snvmixClassify_pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
|
|
173
|
|
174 inline void snvmixGetCallString(char *col, int *calls, int depth, char *nref);
|
|
175
|
|
176 inline int snvmixFilterCalls(int *calls, int depth, char *bQ, char *mQ, param_struct *params);
|
|
177 inline int snvmixSkipCall(int *calls, int qual_num, param_struct *params, char *bQ, char *mQ);
|
|
178 inline int snvmixSkipCall_alt(param_struct *params, int call, char bQ, char mQ);
|
|
179
|
|
180 void snvmixTrain_qualities(param_struct *params);
|
|
181 void snvmixGetTrainSet_pileup(param_struct *params);
|
|
182 void snvmixTrain_pileup(param_struct *params);
|
|
183
|
|
184 long double normalise(long double *values, int len);
|
|
185
|
|
186 char **__bam_get_lines(const char *fn, int *_n);
|
|
187 static khash_t(64) *load_pos(const char *fn, bam_header_t *h)
|
|
188 {
|
|
189 char **list;
|
|
190 int i, j, n, *fields, max_fields;
|
|
191 khash_t(64) *hash;
|
|
192 bam_init_header_hash(h);
|
|
193 list = __bam_get_lines(fn, &n);
|
|
194 fprintf(stderr,"got %d lines\n", n);
|
|
195 hash = kh_init(64);
|
|
196 max_fields = 0; fields = 0;
|
|
197 for (i = 0; i < n; ++i) {
|
|
198 char *str = list[i];
|
|
199 int chr, n_fields, ret;
|
|
200 khint_t k;
|
|
201 uint64_t x;
|
|
202 n_fields = ksplit_core(str, 0, &max_fields, &fields);
|
|
203 if (n_fields < 2) continue;
|
|
204 chr = bam_get_tid(h, str + fields[0]);
|
|
205 if (chr < 0) {
|
|
206 fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + fields[0]);
|
|
207 continue;
|
|
208 }
|
|
209 x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1);
|
|
210 k = kh_put(64, hash, x, &ret);
|
|
211 if (ret == 0) {
|
|
212 fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], str+fields[1]);
|
|
213 continue;
|
|
214 }
|
|
215 kh_val(hash, k) = 0;
|
|
216 if (n_fields > 2) {
|
|
217 // count
|
|
218 for (j = 2; j < n_fields; ++j) {
|
|
219 char *s = str + fields[j];
|
|
220 if ((*s != '+' && *s != '-') || !isdigit(s[1])) break;
|
|
221 }
|
|
222 if (j > 2) { // update kh_val()
|
|
223 int *q, y, z;
|
|
224 q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int));
|
|
225 q[0] = j - 2; z = j; y = 1;
|
|
226 for (j = 2; j < z; ++j)
|
|
227 q[y++] = atoi(str + fields[j]);
|
|
228 }
|
|
229 }
|
|
230 free(str);
|
|
231 }
|
|
232 free(list); free(fields);
|
|
233 return hash;
|
|
234 }
|