comparison source/fastq_uniq.c @ 0:816cb55b5a2d draft default tip

planemo upload for repository https://github.com/portiahollyoak/Tools commit c4769fd68ad9583d4b9dbdf212e4ecb5968cef1c-dirty
author portiahollyoak
date Thu, 02 Jun 2016 11:34:51 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:816cb55b5a2d
1 /* This program removes duplicate read pairs from paired FASTQ sequences;
2 * such duplicates commonly appear in mate-pair libraries.
3 *
4 * This file and its partner were written by Haibin Xu, December 2011.
5 */
6
7 #ifndef MAX_FILE_NUMBER
8 #define MAX_FILE_NUMBER 1000
9 #endif
10
11 #include <unistd.h>
12 #include "fastq_pair_array.h"
13
/*
 * Print the command-line help text for FastUniq to stderr.
 *
 * Takes no arguments and returns nothing.  The text describes the
 * -i, -t, -o, -p and -c options accepted by main().
 */
void fastq_uniq_usage(void)
{
    /* One entry per output line; each entry carries its own newline. */
    static const char *const help_lines[] = {
        "-i : The input file list of paired FASTQ sequence files [FILE IN]\n",
        " Maximum 1000 pairs\n",
        "\n",
        " This parameter is used to specify a list of paired sequence files in\n",
        " FASTQ format as input, in which two adjacent files with reads in the\n",
        " same order belong to a pair.\n",
        "\n",
        "-t : Output sequence format [q/f/p]\n",
        " q : FASTQ format into TWO output files\n",
        " f : FASTA format into TWO output files\n",
        " p : FASTA format into ONE output file\n",
        " default = q\n",
        "\n",
        " This parameter is used to specify sequence format in output file(s).\n",
        " FastUniq could output read pairs into two files in either FASTQ [q]\n",
        " or FASTA [f] format, in which reads in the same order belonging to a\n",
        " pair. FastUniq could also output read pairs into a single file in\n",
        " FASTA format [p], in which adjacent reads belonging to a pair.\n",
        "\n",
        "-o : The first output file [FILE OUT]\n",
        "\n",
        "-p : The second output file [FILE OUT]\n",
        " Optional. ONLY required when output sequence format(-t) is specified as\n",
        " [q] or [f].\n",
        "\n",
        "-c : Types of sequence descriptions for output [0/1]\n",
        " 0 : The raw descriptions\n",
        " 1 : New serial numbers assigned by FastUniq\n",
        " default = 0\n",
        "\n",
    };
    size_t line_no;

    for (line_no = 0; line_no < sizeof help_lines / sizeof help_lines[0]; line_no++)
    {
        fputs(help_lines[line_no], stderr);
    }
}
48
49 int main (int argc, const char * argv[])
50 {
51 FILE *fp_in_list, *fp_in_left, *fp_in_right, *fp_out_left, *fp_out_right;
52 char str_in_left[MAX_FILE_NUMBER][1000], str_in_right[MAX_FILE_NUMBER][1000];
53 char str_in_list[1000], str_out_left[1000], str_out_right[1000];
54 char s_left[1000], s_right[1000];
55 char output_format;
56 int description_type;
57 int flag_i=0, flag_o=0, flag_t=0, flag_p=0, flag_c=0;
58 char ch;
59 FASTQ_PAIR *fq_pair;
60 FASTQ_PAIR_ARRAY *fq_pair_array, *temp_fq_pair_array;
61 long i, seq_pair_count;
62
63 if(argc==1)
64 {
65 fastq_uniq_usage();
66 return 1;
67 }
68
69 /* initializing */
70 for(i=0;i<MAX_FILE_NUMBER;i++)
71 {
72 str_in_left[i][0]='\0';
73 str_in_right[i][0]='\0';
74 }
75 str_in_list[0]='\0';
76 str_out_left[0]='\0';
77 str_out_right[0]='\0';
78 output_format='\0';
79
80 /* obtain inputted arguments */
81 while((ch=getopt(argc, argv, "i:t:o:p:c:"))!=-1)
82 {
83 switch(ch)
84 {
85 case 'i':
86 strcpy(str_in_list,optarg);
87 if(strcmp(str_in_list,"")!=0)
88 flag_i=1;
89 else
90 {
91 fastq_uniq_usage();
92 return 1;
93 }
94 break;
95 case 't':
96 if(strlen(optarg)==1)
97 {
98 if(optarg[0]=='q')
99 {
100 output_format='q';
101 flag_t=1;
102 break;
103 }
104 else if(optarg[0]=='f')
105 {
106 output_format='f';
107 flag_t=1;
108 break;
109 }
110 else if(optarg[0]=='p')
111 {
112 output_format='p';
113 flag_t=1;
114 break;
115 }
116 else
117 {
118 fastq_uniq_usage();
119 return 1;
120 }
121 }
122 fastq_uniq_usage();
123 return 1;
124 case 'o':
125 strcpy(str_out_left,optarg);
126 if(strcmp(str_out_left,"")!=0)
127 flag_o=1;
128 else
129 {
130 fastq_uniq_usage();
131 return 1;
132 }
133 break;
134 case 'p':
135 strcpy(str_out_right,optarg);
136 if(strcmp(str_out_right,"")!=0)
137 flag_p=1;
138 else
139 {
140 fastq_uniq_usage();
141 return 1;
142 }
143 break;
144 case 'c':
145 if(strlen(optarg)==1)
146 {
147 if(optarg[0]=='0')
148 {
149 description_type=0;
150 flag_c=1;
151 break;
152 }
153 else if(optarg[0]=='1')
154 {
155 description_type=1;
156 flag_c=1;
157 break;
158 }
159 else
160 {
161 fastq_uniq_usage();
162 return 1;
163 }
164 }
165 fastq_uniq_usage();
166 return 1;
167 default:
168 fastq_uniq_usage();
169 break;
170 }
171 }
172
173 /* check inputted arguments */
174 if(flag_i==0)
175 {
176 fprintf(stderr, "Error in input the name of FASTQ file list!\n");
177 return 1;
178 }
179 if(flag_t==0)
180 output_format='q';
181 if(flag_o==0 || (output_format!='p' && flag_p==0))
182 {
183 fprintf(stderr, "Error in output sequence file name!\n");
184 return 1;
185 }
186 if(flag_c==0)
187 description_type=0;
188
189 /* get pair-end FASTQ file list */
190 if((fp_in_list=fopen(str_in_list, "r"))==NULL)
191 {
192 fprintf(stderr, "Error in open FASTQ file list %s for read!\n",
193 str_in_list);
194 return 1;
195 }
196 for(i=0; !feof(fp_in_list) && i<MAX_FILE_NUMBER;)
197 {
198 /* get the file store left FASTQ sequences */
199 s_left[0]='\0';
200 fgets(s_left, 1000, fp_in_list);
201 if(s_left[0]=='\0')
202 continue;
203 else if(strlen(s_left)>=2 && s_left[strlen(s_left)-1]=='\n')
204 s_left[strlen(s_left)-1]='\0';
205 else
206 {
207 fprintf(stderr, "Error in read from FASTQ file list!\n");
208 return 1;
209 }
210
211 /* get the file store right FASTQ sequences */
212 s_right[0]='\0';
213 fgets(s_right, 1000, fp_in_list);
214 if(strlen(s_right)>=2)
215 {
216 if(s_right[strlen(s_right)-1]=='\n')
217 s_right[strlen(s_right)-1]='\0';
218 }
219 else
220 {
221 fprintf(stderr, "Error in read from FASTQ file list!\n");
222 return 1;
223 }
224
225 /* append the fiel name to list array */
226 strcpy(str_in_left[i], s_left);
227 strcpy(str_in_right[i++], s_right);
228 }
229 fclose(fp_in_list);
230
231 /* check the status of pair-end FASTQ files */
232 for(i=0;i<MAX_FILE_NUMBER;i++)
233 {
234 /* check whether list reached the end */
235 if(str_in_left[i][0]=='\0')
236 break;
237
238 /* check file status */
239 if((fp_in_left=fopen(str_in_left[i], "r"))==NULL)
240 {
241 fprintf(stderr, "Error in open left fastq file %s for read!\n",
242 str_in_left[i]);
243 return 1;
244 }
245 fclose(fp_in_left);
246
247 if((fp_in_right=fopen(str_in_right[i], "r"))==NULL)
248 {
249 fprintf(stderr, "Error in open right fastq file %s for read!\n",
250 str_in_right[i]);
251 return 1;
252 }
253 fclose(fp_in_right);
254 }
255
256
257 /* read all pair-end FASTQ sequences into memory */
258 seq_pair_count=0;
259 if((fq_pair_array=fastq_pair_array_create())==NULL)
260 {
261 fprintf(stderr, "Error in allocate enough memory!\n");
262 return 1;
263 }
264 if((temp_fq_pair_array=fastq_pair_array_create())==NULL)
265 {
266 fprintf(stderr, "Error in allocate enough memory!\n");
267 return 1;
268 }
269 for(i=0;i<MAX_FILE_NUMBER;i++)
270 {
271 /* check whether list reached the end */
272 if(str_in_left[i][0]=='\0')
273 break;
274
275 /* open inputted pair-end FASTQ file */
276 if((fp_in_left=fopen(str_in_left[i], "r"))==NULL)
277 {
278 fprintf(stderr, "Error in open left fastq file %s for read!\n",
279 str_in_left[i]);
280 return 1;
281 }
282 if((fp_in_right=fopen(str_in_right[i], "r"))==NULL)
283 {
284 fprintf(stderr, "Error in open right fastq file %s for read!\n",
285 str_in_right[i]);
286 return 1;
287 }
288
289 /* read sequences */
290 for(;!feof(fp_in_left) && !feof(fp_in_right);)
291 {
292 fq_pair=NULL;
293 if((fq_pair=fastq_pair_create())==NULL)
294 {
295 fprintf(stderr, "Error in allocate enough memory!\n");
296 return 1;
297 }
298
299 if(output_format=='f' || output_format=='p')
300 {
301 /* NOT require quality */
302 if(fastq_pair_scanf(fq_pair, fp_in_left, fp_in_right, description_type==0?1:0, 0)!=0)
303 {
304 fastq_pair_remove(fq_pair);
305 break;
306 }
307 }
308 else
309 {
310 /* require quality */
311 if(fastq_pair_scanf(fq_pair, fp_in_left, fp_in_right, description_type==0?1:0, 1)!=0)
312 {
313 fastq_pair_remove(fq_pair);
314 break;
315 }
316 }
317
318 fastq_pair_array_append(fq_pair, fq_pair_array);
319 fastq_pair_array_append(fq_pair, temp_fq_pair_array);
320 seq_pair_count++;
321 }
322
323 if(!feof(fp_in_left) && !feof(fp_in_right))
324 {
325 fprintf(stderr, "Error in Reading pair-end FASTQ sequence!\n");
326 return 1;
327 }
328 }
329
330 /* create memory address index for each BLOCK in a FASTQ_PAIR_ARRAY */
331 fastq_pair_array_generate_index(fq_pair_array);
332 fastq_pair_array_generate_index(temp_fq_pair_array);
333
334 /* sort the pair-end FASTQ sequences */
335 fastq_pair_array_sort(fq_pair_array, temp_fq_pair_array, 1, seq_pair_count);
336
337 /* open output fastq file */
338 if((fp_out_left=fopen(str_out_left, "w"))==NULL)
339 {
340 fprintf(stderr, "Error in open left fastq file %s for write!\n",
341 str_out_left);
342 return 1;
343 }
344
345 if(str_out_right[0]!='\0')
346 {
347 if((fp_out_right=fopen(str_out_right, "w"))==NULL)
348 {
349 fprintf(stderr, "Error in open right fastq file %s for write!\n",
350 str_out_right);
351 return 1;
352 }
353 }
354
355 /* output the sequence in specific format */
356 if(output_format=='f')
357 fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fa", description_type, 1);
358 else if(output_format=='p')
359 fastq_pair_array_printf(fq_pair_array, fp_out_left, NULL, "fa", description_type, 1);
360 else
361 fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fq", description_type, 1);
362
363 /* close output files */
364 fclose(fp_out_left);
365 if(str_out_right[0]!='\0')
366 fclose(fp_out_right);
367
368 // /* free memory */
369 // fastq_pair_array_remove(fq_pair_array);
370
371 return 0;
372 }
373