annotate pyPRADA_1.2/tools/samtools-0.1.16/misc/wgsim.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 /* The MIT License
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 Copyright (c) 2008 Genome Research Ltd (GRL).
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 Permission is hereby granted, free of charge, to any person obtaining
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 a copy of this software and associated documentation files (the
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 "Software"), to deal in the Software without restriction, including
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 without limitation the rights to use, copy, modify, merge, publish,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 distribute, sublicense, and/or sell copies of the Software, and to
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 permit persons to whom the Software is furnished to do so, subject to
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 the following conditions:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 The above copyright notice and this permission notice shall be
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 included in all copies or substantial portions of the Software.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 SOFTWARE.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 /* This program is separated from maq's read simulator with Colin
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 * Hercus' modification to allow longer indels. Colin is the chief
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 * developer of novoalign. */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 #include <stdlib.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 #include <math.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 #include <time.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 #include <assert.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 #include <stdio.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 #include <unistd.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 #include <stdint.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 #include <ctype.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 #include <string.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 #define PACKAGE_VERSION "0.2.3"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
/*
 * ASCII -> nucleotide code lookup.
 * 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3, '-' -> 5,
 * every other byte value -> 4.
 */
const uint8_t nst_nt4_table[256] = {
	/* 0x00 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /* '-' */, 4, 4,
	/* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x40 */ 4, 0 /* A */, 4, 1 /* C */,  4, 4, 4, 2 /* G */,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x50 */ 4, 4, 4, 4,  3 /* T */, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x60 */ 4, 0 /* a */, 4, 1 /* c */,  4, 4, 4, 2 /* g */,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x70 */ 4, 4, 4, 4,  3 /* t */, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xa0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xb0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xc0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xd0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xe0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xf0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
/*
 * Color lookup indexed by the bit mask (1 << b1) | (1 << b2) of two base
 * codes in 0..3. A mask with a single bit set (same base twice) gives color
 * 0; a mask with no bit or more than two bits set gives 4.
 */
const int nst_color_space_table[] = {
	/* 0x0 - 0x3 */ 4, 0, 0, 1,
	/* 0x4 - 0x7 */ 0, 2, 3, 4,
	/* 0x8 - 0xb */ 0, 3, 2, 4,
	/* 0xc - 0xf */ 1, 4, 4, 4
};
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 /* Simple normal random number generator, copied from genran.c */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 double ran_normal()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 static int iset = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 static double gset;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 double fac, rsq, v1, v2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 if (iset == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 do {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 v1 = 2.0 * drand48() - 1.0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 v2 = 2.0 * drand48() - 1.0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 rsq = v1 * v1 + v2 * v2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 } while (rsq >= 1.0 || rsq == 0.0);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 fac = sqrt(-2.0 * log(rsq) / rsq);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 gset = v1 * fac;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 iset = 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 return v2 * fac;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 iset = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 return gset;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 /* FASTA parser, copied from seq.c */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
/* Sequence of one FASTA record, held in a growable NUL-terminated buffer. */
typedef struct {
	int l, m; /* length and maximum buffer size */
	unsigned char *s; /* sequence */
} seq_t;

/*
 * Reset `seq` to the empty state (no buffer, zero length and capacity).
 * Wrapped in do/while(0) so that it behaves as one statement, e.g. as the
 * body of an unbraced `if`.
 */
#define INIT_SEQ(seq) do { (seq).s = 0; (seq).l = (seq).m = 0; } while (0)

/* Number of bytes by which seq_read_fasta() enlarges the sequence buffer. */
static int SEQ_BLOCK_SIZE = 512;

/*
 * Set the growth increment used by seq_read_fasta().
 * Non-positive sizes are ignored: with such an increment the buffer could
 * never grow.
 */
void seq_set_block_size(int size)
{
	if (size > 0) SEQ_BLOCK_SIZE = size;
}

/*
 * Resize seq->s to `size` bytes. On allocation failure print a message and
 * abort(), the same fatal-error convention err_xopen_core() uses.
 */
static void seq_resize_or_abort(seq_t *seq, int size)
{
	unsigned char *grown = (unsigned char*)realloc(seq->s, (size_t)size);
	if (grown == 0) {
		fprintf(stderr, "[seq_read_fasta] fail to allocate %d bytes. Abort!\n", size);
		abort();
	}
	seq->s = grown;
}

/*
 * Read the next FASTA record from `fp`.
 *
 *   seq     - receives the sequence; its buffer is reused and grown as
 *             needed, and is always NUL-terminated on a successful return.
 *             Only letters, '-' and '.' are kept.
 *   locus   - receives the record name (text after '>' up to the first
 *             space, tab or newline). The caller must supply a buffer large
 *             enough for the longest name; no bound is checked here.
 *   comment - if non-NULL, receives the rest of the header line with leading
 *             blanks removed (same caveat on size); if NULL the rest of the
 *             header line is skipped.
 *
 * Carriage returns are dropped from both header fields.
 *
 * Returns the sequence length (0 for a record without sequence), or -1 when
 * no further record can be read (end of file or read error).
 */
int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
{
	int c, l, max;
	char *p;

	/* Advance to the '>' that opens the next record. Testing the value
	 * returned by fgetc() (instead of feof()) also stops on a read error. */
	do {
		c = fgetc(fp);
	} while (c != EOF && c != '>');
	if (c == EOF) return -1;

	/* record name */
	p = locus;
	while ((c = fgetc(fp)) != EOF && c != ' ' && c != '\t' && c != '\n')
		if (c != '\r') *p++ = (char)c;
	*p = '\0';

	if (comment) {
		p = comment;
		if (c != '\n' && c != EOF) {
			/* skip the blanks between name and comment */
			do {
				c = fgetc(fp);
			} while (c == ' ' || c == '\t');
			for (; c != EOF && c != '\n'; c = fgetc(fp))
				if (c != '\r') *p++ = (char)c;
		}
		*p = '\0';
	} else if (c != '\n' && c != EOF) {
		/* nobody wants the comment: discard the rest of the header line */
		do {
			c = fgetc(fp);
		} while (c != EOF && c != '\n');
	}

	/* sequence lines, up to the next '>' or the end of input */
	l = 0; max = seq->m;
	while ((c = fgetc(fp)) != EOF && c != '>') {
		if (isalpha(c) || c == '-' || c == '.') {
			if (l + 1 >= max) { /* keep one spare byte for the terminator */
				do {
					max += SEQ_BLOCK_SIZE;
				} while (l + 1 >= max);
				seq_resize_or_abort(seq, max);
			}
			seq->s[l++] = (unsigned char)c;
		}
	}
	if (c == '>') ungetc(c, fp); /* leave the next header for the next call */

	/* A record without sequence read into a fresh seq_t has no buffer yet. */
	if (seq->s == 0) {
		if (max < 1) max = SEQ_BLOCK_SIZE;
		seq_resize_or_abort(seq, max);
	}
	seq->s[l] = 0;
	seq->m = max; seq->l = l;
	return l;
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 /* Error-checking open, copied from utils.c */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
/* fopen() wrapper that labels a failure message with the calling function. */
#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)

/*
 * Open file `fn` with fopen() mode `mode`.
 *
 * The name "-" selects a standard stream instead of a file: stdin when
 * `mode` contains an 'r', stdout otherwise. If the file cannot be opened a
 * message tagged with `func` is written to stderr and the process aborts,
 * so the function never returns NULL.
 */
FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
{
	FILE *stream;

	if (strcmp(fn, "-") == 0) {
		if (strstr(mode, "r"))
			return stdin;
		return stdout;
	}

	stream = fopen(fn, mode);
	if (stream == 0) {
		fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
		abort();
	}
	return stream;
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 /* wgsim */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
/*
 * Mutation codes. One mut_t describes one reference position:
 *   bits 12-15 (mutmsk)  mutation type; maq_mut_diref() stores the number of
 *                        inserted bases here for an insertion
 *   bits  4-11           inserted bases for an insertion, two bits per base
 *   bits  0-3            base code of the position
 */
enum muttype_t {
	NOCHANGE   = 0,
	INSERT     = 0x1000,
	SUBSTITUTE = 0xe000,
	DELETE     = 0xf000
};

typedef unsigned short mut_t;

/* selects the mutation-type bits of a mut_t */
static mut_t mutmsk = (mut_t)0xf000;

/* One mutated haplotype: a mut_t per reference position. */
typedef struct {
	int l, m;  /* length and maximum buffer size */
	mut_t *s;  /* sequence */
} mutseq_t;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
/* Simulation parameters (file-scope defaults). */
static double
	ERR_RATE     = 0.02,   /* base error rate; wgsim_core() derives the quality character from it */
	MUT_RATE     = 0.001,  /* per-base probability that maq_mut_diref() mutates a position */
	INDEL_FRAC   = 0.1,    /* share of mutations that are indels rather than substitutions */
	INDEL_EXTEND = 0.3;    /* probability that an indel is extended by one more base */
static int
	IS_SOLID     = 0,      /* non-zero: wgsim_core() takes its color-space branch */
	SHOW_MM_INFO = 1;      /* NOTE(review): not referenced in the visible part of the file */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178 void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180 int i, deleting = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 mutseq_t *ret[2];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 ret[0] = hap1; ret[1] = hap2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184 ret[0]->l = seq->l; ret[1]->l = seq->l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185 ret[0]->m = seq->m; ret[1]->m = seq->m;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186 ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187 ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188 for (i = 0; i != seq->l; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189 int c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
191 if (deleting) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
192 if (drand48() < INDEL_EXTEND) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
193 if (deleting & 1) ret[0]->s[i] |= DELETE;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
194 if (deleting & 2) ret[1]->s[i] |= DELETE;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
195 continue;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
196 } else deleting = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
197 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
198 if (c < 4 && drand48() < MUT_RATE) { // mutation
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
199 if (drand48() >= INDEL_FRAC) { // substitution
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
200 double r = drand48();
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
201 c = (c + (int)(r * 3.0 + 1)) & 3;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
202 if (is_hap || drand48() < 0.333333) { // hom
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
203 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
204 } else { // het
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
205 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
206 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
207 } else { // indel
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
208 if (drand48() < 0.5) { // deletion
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
209 if (is_hap || drand48() < 0.333333) { // hom-del
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
210 ret[0]->s[i] = ret[1]->s[i] = DELETE;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
211 deleting = 3;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
212 } else { // het-del
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
213 deleting = drand48()<0.5?1:2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
214 ret[deleting-1]->s[i] = DELETE;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
215 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
216 } else { // insertion
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
217 int num_ins = 0, ins = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
218 do {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
219 num_ins++;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
220 ins = (ins << 2) | (int)(drand48() * 4.0);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
221 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
222
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
223 if (is_hap || drand48() < 0.333333) { // hom-ins
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
224 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
225 } else { // het-ins
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
226 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
227 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
228 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
229 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
230 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
231 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
232 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
233 void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
234 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
235 int i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
236 for (i = 0; i != seq->l; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
237 int c[3];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
238 c[0] = nst_nt4_table[(int)seq->s[i]];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
239 c[1] = hap1->s[i]; c[2] = hap2->s[i];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
240 if (c[0] >= 4) continue;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
241 if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
242 printf("%s\t%d\t", name, i+1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
243 if (c[1] == c[2]) { // hom
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
244 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
245 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
246 } else if ((c[1]&mutmsk) == DELETE) { // del
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
247 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
248 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
249 printf("-\t");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
250 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
251 while(n > 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
252 putchar("ACGTN"[ins & 0x3]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
253 n--;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
254 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
255 printf("\t-\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
256 } else assert(0);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
257 } else { // het
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
258 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
259 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
260 } else if ((c[1]&mutmsk) == DELETE) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
261 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
262 } else if ((c[2]&mutmsk) == DELETE) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
263 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
264 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
265 printf("-\t");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
266 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
267 while (n > 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
268 putchar("ACGTN"[ins & 0x3]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
269 n--;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
270 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
271 printf("\t+\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
272 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
273 printf("-\t");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
274 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
275 while (n > 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
276 putchar("ACGTN"[ins & 0x3]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
277 ins >>= 2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
278 n--;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
279 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
280 printf("\t+\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
281 } else assert(0);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
282 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
283 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
284 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
285 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
286
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
287 void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
288 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
289 seq_t seq;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
290 mutseq_t rseq[2];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
291 uint64_t tot_len, ii;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
292 int i, l, n_ref;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
293 char name[256], *qstr;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
294 int size[2], Q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
295 uint8_t *tmp_seq[2];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
296 mut_t *target;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
297
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
298 INIT_SEQ(seq);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
299 srand48(time(0));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
300 seq_set_block_size(0x1000000);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
301 l = size_l > size_r? size_l : size_r;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
302 qstr = (char*)calloc(l+1, 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
303 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
304 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
305 size[0] = size_l; size[1] = size_r;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
306
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
307 Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
308
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
309 tot_len = n_ref = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
310 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
311 tot_len += l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
312 ++n_ref;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
313 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
314 fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
315 rewind(fp_fa);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
316
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
317 while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
318 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
319 if (l < dist + 3 * std_dev) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
320 fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
321 continue;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
322 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
323
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
324 // generate mutations and print them out
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
325 maq_mut_diref(&seq, is_hap, rseq, rseq+1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
326 maq_print_mutref(name, &seq, rseq, rseq+1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
327
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
328 for (ii = 0; ii != n_pairs; ++ii) { // the core loop
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
329 double ran;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
330 int d, pos, s[2], is_flip = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
331 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
332 FILE *fpo[2];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
333
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
334 do { // avoid boundary failure
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
335 ran = ran_normal();
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
336 ran = ran * std_dev + dist;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
337 d = (int)(ran + 0.5);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
338 pos = (int)((l - d + 1) * drand48());
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
339 } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
340
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
341 // flip or not
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
342 if (drand48() < 0.5) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
343 fpo[0] = fpout1; fpo[1] = fpout2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
344 s[0] = size[0]; s[1] = size[1];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
345 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
346 fpo[1] = fpout1; fpo[0] = fpout2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
347 s[1] = size[0]; s[0] = size[1];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
348 is_flip = 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
349 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
350
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
351 // generate the read sequences
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
352 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
353 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
354
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
355 #define __gen_read(x, start, iter) do { \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
356 for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
357 int c = target[i], mut_type = c & mutmsk; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
358 if (ext_coor[x] < 0) { \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
359 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
360 ext_coor[x] = i; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
361 } \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
362 if (mut_type == DELETE) ++n_indel[x]; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
363 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
364 tmp_seq[x][k++] = c & 0xf; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
365 if (mut_type == SUBSTITUTE) ++n_sub[x]; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
366 } else { \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
367 int n, ins; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
368 ++n_indel[x]; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
369 tmp_seq[x][k++] = c & 0xf; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
370 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
371 tmp_seq[x][k++] = ins & 0x3; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
372 } \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
373 } \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
374 if (k != s[x]) ext_coor[x] = -10; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
375 } while (0)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
376
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
377 if (!IS_SOLID) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
378 __gen_read(0, pos, ++i);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
379 __gen_read(1, pos + d - 1, --i);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
380 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
381 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
382 int c1, c2, c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
383 ++s[0]; ++s[1]; // temporarily increase read length by 1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
384 if (is_flip) { // RR pair
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
385 __gen_read(0, pos + s[0], --i);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
386 __gen_read(1, pos + d - 1, --i);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
387 } else { // FF pair
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
388 __gen_read(0, pos, ++i);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
389 __gen_read(1, pos + d - 1 - s[1], ++i);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
390 ++ext_coor[0]; ++ext_coor[1];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
391 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
392 // change to color sequence: (0,1,2,3) -> (A,C,G,T)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
393 for (j = 0; j < 2; ++j) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
394 c1 = tmp_seq[j][0];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
395 for (i = 1; i < s[j]; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
396 c2 = tmp_seq[j][i];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
397 c = (c1 >= 4 || c2 >= 4)? 4 : nst_color_space_table[(1<<c1)|(1<<c2)];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
398 tmp_seq[j][i-1] = c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
399 c1 = c2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
400 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
401 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
402 --s[0]; --s[1]; // change back
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
403 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
404 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
405 --ii;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
406 continue;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
407 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
408
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
409 // generate sequencing errors
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
410 for (j = 0; j < 2; ++j) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
411 for (i = 0; i < s[j]; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
412 int c = tmp_seq[j][i];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
413 if (c >= 4) c = 4; // actually c should be never larger than 4 if everything is correct
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
414 else if (drand48() < ERR_RATE) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
415 c = (c + (int)(drand48() * 3.0 + 1)) & 3;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
416 ++n_err[j];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
417 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
418 tmp_seq[j][i] = c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
419 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
420 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
421
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
422 // print
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
423 for (j = 0; j < 2; ++j) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
424 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
425 qstr[i] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
426 if (SHOW_MM_INFO) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
427 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
428 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
429 (long long)ii, j==0? is_flip+1 : 2-is_flip);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
430 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
431 fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
432 (long long)ii, j==0? is_flip+1 : 2-is_flip,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
433 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
434 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
435 for (i = 0; i < s[j]; ++i)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
436 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
437 fprintf(fpo[j], "\n+\n%s\n", qstr);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
438 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
439 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
440 free(rseq[0].s); free(rseq[1].s);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
441 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
442 free(seq.s); free(qstr);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
443 free(tmp_seq[0]); free(tmp_seq[1]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
444 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
445
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
446 static int simu_usage()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
447 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
448 fprintf(stderr, "\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
449 fprintf(stderr, "Program: wgsim (short read simulator)\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
450 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
451 fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
452 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
453 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
454 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
455 fprintf(stderr, " -s INT standard deviation [50]\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
456 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
457 fprintf(stderr, " -1 INT length of the first read [70]\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
458 fprintf(stderr, " -2 INT length of the second read [70]\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
459 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
460 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
461 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
462 fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
463 fprintf(stderr, " -C show mismatch info in comment rather than read name\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
464 fprintf(stderr, " -h haplotype mode\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
465 fprintf(stderr, "\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
466 fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
467 return 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
468 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
469
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
470 int main(int argc, char *argv[])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
471 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
472 int64_t N;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
473 int dist, std_dev, c, size_l, size_r, is_hap = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
474 FILE *fpout1, *fpout2, *fp_fa;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
475
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
476 N = 1000000; dist = 500; std_dev = 50;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
477 size_l = size_r = 70;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
478 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
479 switch (c) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
480 case 'd': dist = atoi(optarg); break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
481 case 's': std_dev = atoi(optarg); break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
482 case 'N': N = atoi(optarg); break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
483 case '1': size_l = atoi(optarg); break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
484 case '2': size_r = atoi(optarg); break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
485 case 'e': ERR_RATE = atof(optarg); break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
486 case 'r': MUT_RATE = atof(optarg); break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
487 case 'R': INDEL_FRAC = atof(optarg); break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
488 case 'X': INDEL_EXTEND = atof(optarg); break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
489 case 'c': IS_SOLID = 1; break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
490 case 'C': SHOW_MM_INFO = 0; break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
491 case 'h': is_hap = 1; break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
492 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
493 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
494 if (argc - optind < 3) return simu_usage();
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
495 fp_fa = (strcmp(argv[optind+0], "-") == 0)? stdin : xopen(argv[optind+0], "r");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
496 fpout1 = xopen(argv[optind+1], "w");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
497 fpout2 = xopen(argv[optind+2], "w");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
498 wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
499
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
500 fclose(fpout1); fclose(fpout2); fclose(fp_fa);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
501 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
502 }