Mercurial > repos > portiahollyoak > fastuniq
comparison source/fastq_uniq.c @ 0:816cb55b5a2d draft default tip
planemo upload for repository https://github.com/portiahollyoak/Tools commit c4769fd68ad9583d4b9dbdf212e4ecb5968cef1c-dirty
author | portiahollyoak |
---|---|
date | Thu, 02 Jun 2016 11:34:51 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:816cb55b5a2d |
---|---|
1 /* This program removes duplicates in paired FASTQ sequences, | |
2 * which usually appear in mate pair libraries. | |
3 * | |
4 * This file and its partner were written by Haibin Xu, December 2011. | |
5 */ | |
6 | |
7 #ifndef MAX_FILE_NUMBER | |
8 #define MAX_FILE_NUMBER 1000 | |
9 #endif | |
10 | |
11 #include <unistd.h> | |
12 #include "fastq_pair_array.h" | |
13 | |
/* Print the command-line help of FastUniq to stderr.
 *
 * The help text is stored as a table with one entry per emitted line and
 * written out in order. Takes no arguments and returns nothing.
 */
void fastq_uniq_usage()
{
    static const char *const help_lines[] =
    {
        /* -i : input list */
        "-i : The input file list of paired FSATQ sequence files [FILE IN]\n",
        " Maximum 1000 pairs\n",
        "\n",
        " This parameter is used to specify a list of paired sequence files in\n",
        " FASTQ format as input, in which two adjacent files with reads in the\n",
        " same order belong to a pair.\n",
        "\n",
        /* -t : output format */
        "-t : Output sequence format [q/f/p]\n",
        " q : FASTQ format into TWO output files\n",
        " f : FASTA format into TWO output files\n",
        " p : FASTA format into ONE output file\n",
        " default = q\n",
        "\n",
        " This parameter is used to specify sequence format in output file(s).\n",
        " FastUniq could output read pairs into two files in either FASTQ [q]\n",
        " or FASTA [f] format, in which reads in the same order belonging to a\n",
        " pair. FastUniq could also output read pairs into a single file in\n",
        " FASTA format [p], in which adjacent reads belonging to a pair.\n",
        "\n",
        /* -o : first output file */
        "-o : The first output file [FILE OUT]\n",
        "\n",
        /* -p : second output file */
        "-p : The second output file [FILE OUT]\n",
        " Optional. ONLY required when output sequence format(-t) is specify as\n",
        " [q] or [f].\n",
        "\n",
        /* -c : description type */
        "-c : Types of sequence descriptions for output [0/1]\n",
        " 0 : The raw descriptions\n",
        " 1 : New serial numbers assigned by FastUniq\n",
        " default = 0\n",
        "\n"
    };
    const size_t line_count=sizeof(help_lines)/sizeof(help_lines[0]);
    size_t k;

    for(k=0;k<line_count;k++)
        fprintf(stderr, "%s", help_lines[k]);
}
48 | |
49 int main (int argc, const char * argv[]) | |
50 { | |
51 FILE *fp_in_list, *fp_in_left, *fp_in_right, *fp_out_left, *fp_out_right; | |
52 char str_in_left[MAX_FILE_NUMBER][1000], str_in_right[MAX_FILE_NUMBER][1000]; | |
53 char str_in_list[1000], str_out_left[1000], str_out_right[1000]; | |
54 char s_left[1000], s_right[1000]; | |
55 char output_format; | |
56 int description_type; | |
57 int flag_i=0, flag_o=0, flag_t=0, flag_p=0, flag_c=0; | |
58 char ch; | |
59 FASTQ_PAIR *fq_pair; | |
60 FASTQ_PAIR_ARRAY *fq_pair_array, *temp_fq_pair_array; | |
61 long i, seq_pair_count; | |
62 | |
63 if(argc==1) | |
64 { | |
65 fastq_uniq_usage(); | |
66 return 1; | |
67 } | |
68 | |
69 /* initializing */ | |
70 for(i=0;i<MAX_FILE_NUMBER;i++) | |
71 { | |
72 str_in_left[i][0]='\0'; | |
73 str_in_right[i][0]='\0'; | |
74 } | |
75 str_in_list[0]='\0'; | |
76 str_out_left[0]='\0'; | |
77 str_out_right[0]='\0'; | |
78 output_format='\0'; | |
79 | |
80 /* obtain inputted arguments */ | |
81 while((ch=getopt(argc, argv, "i:t:o:p:c:"))!=-1) | |
82 { | |
83 switch(ch) | |
84 { | |
85 case 'i': | |
86 strcpy(str_in_list,optarg); | |
87 if(strcmp(str_in_list,"")!=0) | |
88 flag_i=1; | |
89 else | |
90 { | |
91 fastq_uniq_usage(); | |
92 return 1; | |
93 } | |
94 break; | |
95 case 't': | |
96 if(strlen(optarg)==1) | |
97 { | |
98 if(optarg[0]=='q') | |
99 { | |
100 output_format='q'; | |
101 flag_t=1; | |
102 break; | |
103 } | |
104 else if(optarg[0]=='f') | |
105 { | |
106 output_format='f'; | |
107 flag_t=1; | |
108 break; | |
109 } | |
110 else if(optarg[0]=='p') | |
111 { | |
112 output_format='p'; | |
113 flag_t=1; | |
114 break; | |
115 } | |
116 else | |
117 { | |
118 fastq_uniq_usage(); | |
119 return 1; | |
120 } | |
121 } | |
122 fastq_uniq_usage(); | |
123 return 1; | |
124 case 'o': | |
125 strcpy(str_out_left,optarg); | |
126 if(strcmp(str_out_left,"")!=0) | |
127 flag_o=1; | |
128 else | |
129 { | |
130 fastq_uniq_usage(); | |
131 return 1; | |
132 } | |
133 break; | |
134 case 'p': | |
135 strcpy(str_out_right,optarg); | |
136 if(strcmp(str_out_right,"")!=0) | |
137 flag_p=1; | |
138 else | |
139 { | |
140 fastq_uniq_usage(); | |
141 return 1; | |
142 } | |
143 break; | |
144 case 'c': | |
145 if(strlen(optarg)==1) | |
146 { | |
147 if(optarg[0]=='0') | |
148 { | |
149 description_type=0; | |
150 flag_c=1; | |
151 break; | |
152 } | |
153 else if(optarg[0]=='1') | |
154 { | |
155 description_type=1; | |
156 flag_c=1; | |
157 break; | |
158 } | |
159 else | |
160 { | |
161 fastq_uniq_usage(); | |
162 return 1; | |
163 } | |
164 } | |
165 fastq_uniq_usage(); | |
166 return 1; | |
167 default: | |
168 fastq_uniq_usage(); | |
169 break; | |
170 } | |
171 } | |
172 | |
173 /* check inputted arguments */ | |
174 if(flag_i==0) | |
175 { | |
176 fprintf(stderr, "Error in input the name of FASTQ file list!\n"); | |
177 return 1; | |
178 } | |
179 if(flag_t==0) | |
180 output_format='q'; | |
181 if(flag_o==0 || (output_format!='p' && flag_p==0)) | |
182 { | |
183 fprintf(stderr, "Error in output sequence file name!\n"); | |
184 return 1; | |
185 } | |
186 if(flag_c==0) | |
187 description_type=0; | |
188 | |
189 /* get pair-end FASTQ file list */ | |
190 if((fp_in_list=fopen(str_in_list, "r"))==NULL) | |
191 { | |
192 fprintf(stderr, "Error in open FASTQ file list %s for read!\n", | |
193 str_in_list); | |
194 return 1; | |
195 } | |
196 for(i=0; !feof(fp_in_list) && i<MAX_FILE_NUMBER;) | |
197 { | |
198 /* get the file store left FASTQ sequences */ | |
199 s_left[0]='\0'; | |
200 fgets(s_left, 1000, fp_in_list); | |
201 if(s_left[0]=='\0') | |
202 continue; | |
203 else if(strlen(s_left)>=2 && s_left[strlen(s_left)-1]=='\n') | |
204 s_left[strlen(s_left)-1]='\0'; | |
205 else | |
206 { | |
207 fprintf(stderr, "Error in read from FASTQ file list!\n"); | |
208 return 1; | |
209 } | |
210 | |
211 /* get the file store right FASTQ sequences */ | |
212 s_right[0]='\0'; | |
213 fgets(s_right, 1000, fp_in_list); | |
214 if(strlen(s_right)>=2) | |
215 { | |
216 if(s_right[strlen(s_right)-1]=='\n') | |
217 s_right[strlen(s_right)-1]='\0'; | |
218 } | |
219 else | |
220 { | |
221 fprintf(stderr, "Error in read from FASTQ file list!\n"); | |
222 return 1; | |
223 } | |
224 | |
225 /* append the fiel name to list array */ | |
226 strcpy(str_in_left[i], s_left); | |
227 strcpy(str_in_right[i++], s_right); | |
228 } | |
229 fclose(fp_in_list); | |
230 | |
231 /* check the status of pair-end FASTQ files */ | |
232 for(i=0;i<MAX_FILE_NUMBER;i++) | |
233 { | |
234 /* check whether list reached the end */ | |
235 if(str_in_left[i][0]=='\0') | |
236 break; | |
237 | |
238 /* check file status */ | |
239 if((fp_in_left=fopen(str_in_left[i], "r"))==NULL) | |
240 { | |
241 fprintf(stderr, "Error in open left fastq file %s for read!\n", | |
242 str_in_left[i]); | |
243 return 1; | |
244 } | |
245 fclose(fp_in_left); | |
246 | |
247 if((fp_in_right=fopen(str_in_right[i], "r"))==NULL) | |
248 { | |
249 fprintf(stderr, "Error in open right fastq file %s for read!\n", | |
250 str_in_right[i]); | |
251 return 1; | |
252 } | |
253 fclose(fp_in_right); | |
254 } | |
255 | |
256 | |
257 /* read all pair-end FASTQ sequences into memory */ | |
258 seq_pair_count=0; | |
259 if((fq_pair_array=fastq_pair_array_create())==NULL) | |
260 { | |
261 fprintf(stderr, "Error in allocate enough memory!\n"); | |
262 return 1; | |
263 } | |
264 if((temp_fq_pair_array=fastq_pair_array_create())==NULL) | |
265 { | |
266 fprintf(stderr, "Error in allocate enough memory!\n"); | |
267 return 1; | |
268 } | |
269 for(i=0;i<MAX_FILE_NUMBER;i++) | |
270 { | |
271 /* check whether list reached the end */ | |
272 if(str_in_left[i][0]=='\0') | |
273 break; | |
274 | |
275 /* open inputted pair-end FASTQ file */ | |
276 if((fp_in_left=fopen(str_in_left[i], "r"))==NULL) | |
277 { | |
278 fprintf(stderr, "Error in open left fastq file %s for read!\n", | |
279 str_in_left[i]); | |
280 return 1; | |
281 } | |
282 if((fp_in_right=fopen(str_in_right[i], "r"))==NULL) | |
283 { | |
284 fprintf(stderr, "Error in open right fastq file %s for read!\n", | |
285 str_in_right[i]); | |
286 return 1; | |
287 } | |
288 | |
289 /* read sequences */ | |
290 for(;!feof(fp_in_left) && !feof(fp_in_right);) | |
291 { | |
292 fq_pair=NULL; | |
293 if((fq_pair=fastq_pair_create())==NULL) | |
294 { | |
295 fprintf(stderr, "Error in allocate enough memory!\n"); | |
296 return 1; | |
297 } | |
298 | |
299 if(output_format=='f' || output_format=='p') | |
300 { | |
301 /* NOT require quality */ | |
302 if(fastq_pair_scanf(fq_pair, fp_in_left, fp_in_right, description_type==0?1:0, 0)!=0) | |
303 { | |
304 fastq_pair_remove(fq_pair); | |
305 break; | |
306 } | |
307 } | |
308 else | |
309 { | |
310 /* require quality */ | |
311 if(fastq_pair_scanf(fq_pair, fp_in_left, fp_in_right, description_type==0?1:0, 1)!=0) | |
312 { | |
313 fastq_pair_remove(fq_pair); | |
314 break; | |
315 } | |
316 } | |
317 | |
318 fastq_pair_array_append(fq_pair, fq_pair_array); | |
319 fastq_pair_array_append(fq_pair, temp_fq_pair_array); | |
320 seq_pair_count++; | |
321 } | |
322 | |
323 if(!feof(fp_in_left) && !feof(fp_in_right)) | |
324 { | |
325 fprintf(stderr, "Error in Reading pair-end FASTQ sequence!\n"); | |
326 return 1; | |
327 } | |
328 } | |
329 | |
330 /* create memory address index for each BLOCK in a FASTQ_PAIR_ARRAY */ | |
331 fastq_pair_array_generate_index(fq_pair_array); | |
332 fastq_pair_array_generate_index(temp_fq_pair_array); | |
333 | |
334 /* sort the pair-end FASTQ sequences */ | |
335 fastq_pair_array_sort(fq_pair_array, temp_fq_pair_array, 1, seq_pair_count); | |
336 | |
337 /* open output fastq file */ | |
338 if((fp_out_left=fopen(str_out_left, "w"))==NULL) | |
339 { | |
340 fprintf(stderr, "Error in open left fastq file %s for write!\n", | |
341 str_out_left); | |
342 return 1; | |
343 } | |
344 | |
345 if(str_out_right[0]!='\0') | |
346 { | |
347 if((fp_out_right=fopen(str_out_right, "w"))==NULL) | |
348 { | |
349 fprintf(stderr, "Error in open right fastq file %s for write!\n", | |
350 str_out_right); | |
351 return 1; | |
352 } | |
353 } | |
354 | |
355 /* output the sequence in specific format */ | |
356 if(output_format=='f') | |
357 fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fa", description_type, 1); | |
358 else if(output_format=='p') | |
359 fastq_pair_array_printf(fq_pair_array, fp_out_left, NULL, "fa", description_type, 1); | |
360 else | |
361 fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fq", description_type, 1); | |
362 | |
363 /* close output files */ | |
364 fclose(fp_out_left); | |
365 if(str_out_right[0]!='\0') | |
366 fclose(fp_out_right); | |
367 | |
368 // /* free memory */ | |
369 // fastq_pair_array_remove(fq_pair_array); | |
370 | |
371 return 0; | |
372 } | |
373 |