Mercurial > repos > nikhil-joshi > sickle
comparison src/trim_paired.c @ 4:c70137414dcd draft
sickle v1.33
author | nikhil-joshi |
---|---|
date | Wed, 23 Jul 2014 18:35:10 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:f6ebdaca9925 | 4:c70137414dcd |
---|---|
1 #include <assert.h> | |
2 #include <ctype.h> | |
3 #include <stdlib.h> | |
4 #include <zlib.h> | |
5 #include <stdio.h> | |
6 #include <getopt.h> | |
7 #include <unistd.h> | |
8 #include "sickle.h" | |
9 #include "kseq.h" | |
10 #include "print_record.h" | |
11 | |
12 __KS_GETC(gzread, BUFFER_SIZE) | |
13 __KS_GETUNTIL(gzread, BUFFER_SIZE) | |
14 __KSEQ_READ | |
15 | |
16 int paired_qual_threshold = 20; | |
17 int paired_length_threshold = 20; | |
18 | |
19 static struct option paired_long_options[] = { | |
20 {"qual-type", required_argument, 0, 't'}, | |
21 {"pe-file1", optional_argument, 0, 'f'}, | |
22 {"pe-file2", optional_argument, 0, 'r'}, | |
23 {"pe-combo", optional_argument, 0, 'c'}, | |
24 {"output-pe1", optional_argument, 0, 'o'}, | |
25 {"output-pe2", optional_argument, 0, 'p'}, | |
26 {"output-single", optional_argument, 0, 's'}, | |
27 {"output-combo", optional_argument, 0, 'm'}, | |
28 {"qual-threshold", optional_argument, 0, 'q'}, | |
29 {"length-threshold", optional_argument, 0, 'l'}, | |
30 {"no-fiveprime", optional_argument, 0, 'x'}, | |
31 {"truncate-n", optional_argument, 0, 'n'}, | |
32 {"gzip-output", optional_argument, 0, 'g'}, | |
33 {"output-combo-all", optional_argument, 0, 'M'}, | |
34 {"quiet", optional_argument, 0, 'z'}, | |
35 {GETOPT_HELP_OPTION_DECL}, | |
36 {GETOPT_VERSION_OPTION_DECL}, | |
37 {NULL, 0, NULL, 0} | |
38 }; | |
39 | |
40 void paired_usage (int status, char *msg) { | |
41 | |
42 fprintf(stderr, "\nIf you have separate files for forward and reverse reads:\n"); | |
43 fprintf(stderr, "Usage: %s pe [options] -f <paired-end forward fastq file> -r <paired-end reverse fastq file> -t <quality type> -o <trimmed PE forward file> -p <trimmed PE reverse file> -s <trimmed singles file>\n\n", PROGRAM_NAME); | |
44 fprintf(stderr, "If you have one file with interleaved forward and reverse reads:\n"); | |
45 fprintf(stderr, "Usage: %s pe [options] -c <interleaved input file> -t <quality type> -m <interleaved trimmed paired-end output> -s <trimmed singles file>\n\n\ | |
46 If you have one file with interleaved reads as input and you want ONLY one interleaved file as output:\n\ | |
47 Usage: %s pe [options] -c <interleaved input file> -t <quality type> -M <interleaved trimmed output>\n\n", PROGRAM_NAME, PROGRAM_NAME); | |
48 fprintf(stderr, "Options:\n\ | |
49 Paired-end separated reads\n\ | |
50 --------------------------\n\ | |
51 -f, --pe-file1, Input paired-end forward fastq file (Input files must have same number of records)\n\ | |
52 -r, --pe-file2, Input paired-end reverse fastq file\n\ | |
53 -o, --output-pe1, Output trimmed forward fastq file\n\ | |
54 -p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\n\ | |
55 Paired-end interleaved reads\n\ | |
56 ----------------------------\n"); | |
57 fprintf(stderr,"-c, --pe-combo, Combined (interleaved) input paired-end fastq\n\ | |
58 -m, --output-combo, Output combined (interleaved) paired-end fastq file. Must use -s option.\n\ | |
59 -M, --output-combo-all, Output combined (interleaved) paired-end fastq file with any discarded read written to output file as a single N. Cannot be used with the -s option.\n\n\ | |
60 Global options\n\ | |
61 --------------\n\ | |
62 -t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n"); | |
63 fprintf(stderr, "-s, --output-single, Output trimmed singles fastq file\n\ | |
64 -q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ | |
65 -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ | |
66 -x, --no-fiveprime, Don't do five prime trimming.\n\ | |
67 -n, --truncate-n, Truncate sequences at position of first N.\n"); | |
68 | |
69 | |
70 fprintf(stderr, "-g, --gzip-output, Output gzipped files.\n--quiet, do not output trimming info\n\ | |
71 --help, display this help and exit\n\ | |
72 --version, output version information and exit\n\n"); | |
73 | |
74 if (msg) fprintf(stderr, "%s\n\n", msg); | |
75 exit(status); | |
76 } | |
77 | |
78 | |
79 int paired_main(int argc, char *argv[]) { | |
80 | |
81 gzFile pe1 = NULL; /* forward input file handle */ | |
82 gzFile pe2 = NULL; /* reverse input file handle */ | |
83 gzFile pec = NULL; /* combined input file handle */ | |
84 kseq_t *fqrec1 = NULL; | |
85 kseq_t *fqrec2 = NULL; | |
86 int l1, l2; | |
87 FILE *outfile1 = NULL; /* forward output file handle */ | |
88 FILE *outfile2 = NULL; /* reverse output file handle */ | |
89 FILE *combo = NULL; /* combined output file handle */ | |
90 FILE *single = NULL; /* single output file handle */ | |
91 gzFile outfile1_gzip = NULL; | |
92 gzFile outfile2_gzip = NULL; | |
93 gzFile combo_gzip = NULL; | |
94 gzFile single_gzip = NULL; | |
95 int debug = 0; | |
96 int optc; | |
97 extern char *optarg; | |
98 int qualtype = -1; | |
99 cutsites *p1cut; | |
100 cutsites *p2cut; | |
101 char *outfn1 = NULL; /* forward file out name */ | |
102 char *outfn2 = NULL; /* reverse file out name */ | |
103 char *outfnc = NULL; /* combined file out name */ | |
104 char *sfn = NULL; /* single/combined file out name */ | |
105 char *infn1 = NULL; /* forward input filename */ | |
106 char *infn2 = NULL; /* reverse input filename */ | |
107 char *infnc = NULL; /* combined input filename */ | |
108 int kept_p = 0; | |
109 int discard_p = 0; | |
110 int kept_s1 = 0; | |
111 int kept_s2 = 0; | |
112 int discard_s1 = 0; | |
113 int discard_s2 = 0; | |
114 int quiet = 0; | |
115 int no_fiveprime = 0; | |
116 int trunc_n = 0; | |
117 int gzip_output = 0; | |
118 int combo_all=0; | |
119 int combo_s=0; | |
120 | |
121 while (1) { | |
122 int option_index = 0; | |
123 optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:xng", paired_long_options, &option_index); | |
124 | |
125 if (optc == -1) | |
126 break; | |
127 | |
128 switch (optc) { | |
129 if (paired_long_options[option_index].flag != 0) | |
130 break; | |
131 | |
132 case 'f': | |
133 infn1 = (char *) malloc(strlen(optarg) + 1); | |
134 strcpy(infn1, optarg); | |
135 break; | |
136 | |
137 case 'r': | |
138 infn2 = (char *) malloc(strlen(optarg) + 1); | |
139 strcpy(infn2, optarg); | |
140 break; | |
141 | |
142 case 'c': | |
143 infnc = (char *) malloc(strlen(optarg) + 1); | |
144 strcpy(infnc, optarg); | |
145 break; | |
146 | |
147 case 't': | |
148 if (!strcmp(optarg, "illumina")) qualtype = ILLUMINA; | |
149 else if (!strcmp(optarg, "solexa")) qualtype = SOLEXA; | |
150 else if (!strcmp(optarg, "sanger")) qualtype = SANGER; | |
151 else { | |
152 fprintf(stderr, "Error: Quality type '%s' is not a valid type.\n", optarg); | |
153 return EXIT_FAILURE; | |
154 } | |
155 break; | |
156 | |
157 case 'o': | |
158 outfn1 = (char *) malloc(strlen(optarg) + 1); | |
159 strcpy(outfn1, optarg); | |
160 break; | |
161 | |
162 case 'p': | |
163 outfn2 = (char *) malloc(strlen(optarg) + 1); | |
164 strcpy(outfn2, optarg); | |
165 break; | |
166 | |
167 case 'm': | |
168 outfnc = (char *) malloc(strlen(optarg) + 1); | |
169 strcpy(outfnc, optarg); | |
170 combo_s = 1; | |
171 break; | |
172 | |
173 case 'M': | |
174 outfnc = (char *) malloc(strlen(optarg) + 1); | |
175 strcpy(outfnc, optarg); | |
176 combo_all = 1; | |
177 break; | |
178 | |
179 case 's': | |
180 sfn = (char *) malloc(strlen(optarg) + 1); | |
181 strcpy(sfn, optarg); | |
182 break; | |
183 | |
184 case 'q': | |
185 paired_qual_threshold = atoi(optarg); | |
186 if (paired_qual_threshold < 0) { | |
187 fprintf(stderr, "Quality threshold must be >= 0\n"); | |
188 return EXIT_FAILURE; | |
189 } | |
190 break; | |
191 | |
192 case 'l': | |
193 paired_length_threshold = atoi(optarg); | |
194 if (paired_length_threshold < 0) { | |
195 fprintf(stderr, "Length threshold must be >= 0\n"); | |
196 return EXIT_FAILURE; | |
197 } | |
198 break; | |
199 | |
200 case 'x': | |
201 no_fiveprime = 1; | |
202 break; | |
203 | |
204 case 'n': | |
205 trunc_n = 1; | |
206 break; | |
207 | |
208 case 'g': | |
209 gzip_output = 1; | |
210 break; | |
211 | |
212 case 'z': | |
213 quiet = 1; | |
214 break; | |
215 | |
216 case 'd': | |
217 debug = 1; | |
218 break; | |
219 | |
220 case_GETOPT_HELP_CHAR(paired_usage); | |
221 case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); | |
222 | |
223 case '?': | |
224 paired_usage(EXIT_FAILURE, NULL); | |
225 break; | |
226 | |
227 default: | |
228 paired_usage(EXIT_FAILURE, NULL); | |
229 break; | |
230 } | |
231 } | |
232 | |
233 /* required: qualtype */ | |
234 if (qualtype == -1) { | |
235 paired_usage(EXIT_FAILURE, "****Error: Quality type is required."); | |
236 } | |
237 | |
238 /* make sure minimum input filenames are specified */ | |
239 if (!infn1 && !infnc) { | |
240 paired_usage(EXIT_FAILURE, "****Error: Must have either -f OR -c argument."); | |
241 } | |
242 | |
243 if (infnc) { /* using combined input file */ | |
244 | |
245 if (infn1 || infn2 || outfn1 || outfn2) { | |
246 paired_usage(EXIT_FAILURE, "****Error: Cannot have -f, -r, -o, or -p options with -c."); | |
247 } | |
248 | |
249 if ((combo_all && combo_s) || (!combo_all && !combo_s)) { | |
250 paired_usage(EXIT_FAILURE, "****Error: Must have only one of either -m or -M options with -c."); | |
251 } | |
252 | |
253 if ((combo_s && !sfn) || (combo_all && sfn)) { | |
254 paired_usage(EXIT_FAILURE, "****Error: -m option must have -s option, and -M option cannot have -s option."); | |
255 } | |
256 | |
257 /* check for duplicate file names */ | |
258 if (!strcmp(infnc, outfnc) || (combo_s && (!strcmp(infnc, sfn) || !strcmp(outfnc, sfn)))) { | |
259 fprintf(stderr, "****Error: Duplicate filename between combo input, combo output, and/or single output file names.\n\n"); | |
260 return EXIT_FAILURE; | |
261 } | |
262 | |
263 /* get combined output file */ | |
264 if (!gzip_output) { | |
265 combo = fopen(outfnc, "w"); | |
266 if (!combo) { | |
267 fprintf(stderr, "****Error: Could not open combo output file '%s'.\n\n", outfnc); | |
268 return EXIT_FAILURE; | |
269 } | |
270 } else { | |
271 combo_gzip = gzopen(outfnc, "w"); | |
272 if (!combo_gzip) { | |
273 fprintf(stderr, "****Error: Could not open combo output file '%s'.\n\n", outfnc); | |
274 return EXIT_FAILURE; | |
275 } | |
276 } | |
277 | |
278 pec = gzopen(infnc, "r"); | |
279 if (!pec) { | |
280 fprintf(stderr, "****Error: Could not open combined input file '%s'.\n\n", infnc); | |
281 return EXIT_FAILURE; | |
282 } | |
283 | |
284 } else { /* using forward and reverse input files */ | |
285 | |
286 if (infn1 && (!infn2 || !outfn1 || !outfn2 || !sfn)) { | |
287 paired_usage(EXIT_FAILURE, "****Error: Using the -f option means you must have the -r, -o, -p, and -s options."); | |
288 } | |
289 | |
290 if (infn1 && (infnc || combo_all || combo_s)) { | |
291 paired_usage(EXIT_FAILURE, "****Error: The -f option cannot be used in combination with -c, -m, or -M."); | |
292 } | |
293 | |
294 if (!strcmp(infn1, infn2) || !strcmp(infn1, outfn1) || !strcmp(infn1, outfn2) || | |
295 !strcmp(infn1, sfn) || !strcmp(infn2, outfn1) || !strcmp(infn2, outfn2) || | |
296 !strcmp(infn2, sfn) || !strcmp(outfn1, outfn2) || !strcmp(outfn1, sfn) || !strcmp(outfn2, sfn)) { | |
297 | |
298 fprintf(stderr, "****Error: Duplicate input and/or output file names.\n\n"); | |
299 return EXIT_FAILURE; | |
300 } | |
301 | |
302 pe1 = gzopen(infn1, "r"); | |
303 if (!pe1) { | |
304 fprintf(stderr, "****Error: Could not open input file '%s'.\n\n", infn1); | |
305 return EXIT_FAILURE; | |
306 } | |
307 | |
308 pe2 = gzopen(infn2, "r"); | |
309 if (!pe2) { | |
310 fprintf(stderr, "****Error: Could not open input file '%s'.\n\n", infn2); | |
311 return EXIT_FAILURE; | |
312 } | |
313 | |
314 if (!gzip_output) { | |
315 outfile1 = fopen(outfn1, "w"); | |
316 if (!outfile1) { | |
317 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn1); | |
318 return EXIT_FAILURE; | |
319 } | |
320 | |
321 outfile2 = fopen(outfn2, "w"); | |
322 if (!outfile2) { | |
323 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn2); | |
324 return EXIT_FAILURE; | |
325 } | |
326 } else { | |
327 outfile1_gzip = gzopen(outfn1, "w"); | |
328 if (!outfile1_gzip) { | |
329 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn1); | |
330 return EXIT_FAILURE; | |
331 } | |
332 | |
333 outfile2_gzip = gzopen(outfn2, "w"); | |
334 if (!outfile2_gzip) { | |
335 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn2); | |
336 return EXIT_FAILURE; | |
337 } | |
338 | |
339 } | |
340 } | |
341 | |
342 /* get singles output file handle */ | |
343 if (sfn && !combo_all) { | |
344 if (!gzip_output) { | |
345 single = fopen(sfn, "w"); | |
346 if (!single) { | |
347 fprintf(stderr, "****Error: Could not open single output file '%s'.\n\n", sfn); | |
348 return EXIT_FAILURE; | |
349 } | |
350 } else { | |
351 single_gzip = gzopen(sfn, "w"); | |
352 if (!single_gzip) { | |
353 fprintf(stderr, "****Error: Could not open single output file '%s'.\n\n", sfn); | |
354 return EXIT_FAILURE; | |
355 } | |
356 } | |
357 } | |
358 | |
359 if (pec) { | |
360 fqrec1 = kseq_init(pec); | |
361 fqrec2 = (kseq_t *) malloc(sizeof(kseq_t)); | |
362 fqrec2->f = fqrec1->f; | |
363 } else { | |
364 fqrec1 = kseq_init(pe1); | |
365 fqrec2 = kseq_init(pe2); | |
366 } | |
367 | |
368 while ((l1 = kseq_read(fqrec1)) >= 0) { | |
369 | |
370 l2 = kseq_read(fqrec2); | |
371 if (l2 < 0) { | |
372 fprintf(stderr, "Warning: PE file 2 is shorter than PE file 1. Disregarding rest of PE file 1.\n"); | |
373 break; | |
374 } | |
375 | |
376 p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); | |
377 p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug); | |
378 | |
379 if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); | |
380 if (debug) printf("p2cut: %d,%d\n", p2cut->five_prime_cut, p2cut->three_prime_cut); | |
381 | |
382 /* The sequence and quality print statements below print out the sequence string starting from the 5' cut */ | |
383 /* and then only print out to the 3' cut, however, we need to adjust the 3' cut */ | |
384 /* by subtracting the 5' cut because the 3' cut was calculated on the original sequence */ | |
385 | |
386 /* if both sequences passed quality and length filters, then output both records */ | |
387 if (p1cut->three_prime_cut >= 0 && p2cut->three_prime_cut >= 0) { | |
388 if (!gzip_output) { | |
389 if (pec) { | |
390 print_record (combo, fqrec1, p1cut); | |
391 print_record (combo, fqrec2, p2cut); | |
392 } else { | |
393 print_record (outfile1, fqrec1, p1cut); | |
394 print_record (outfile2, fqrec2, p2cut); | |
395 } | |
396 } else { | |
397 if (pec) { | |
398 print_record_gzip (combo_gzip, fqrec1, p1cut); | |
399 print_record_gzip (combo_gzip, fqrec2, p2cut); | |
400 } else { | |
401 print_record_gzip (outfile1_gzip, fqrec1, p1cut); | |
402 print_record_gzip (outfile2_gzip, fqrec2, p2cut); | |
403 } | |
404 } | |
405 | |
406 kept_p += 2; | |
407 } | |
408 | |
409 /* if only one sequence passed filter, then put its record in singles and discard the other */ | |
410 /* or put an "N" record in if that option was chosen. */ | |
411 else if (p1cut->three_prime_cut >= 0 && p2cut->three_prime_cut < 0) { | |
412 if (!gzip_output) { | |
413 if (combo_all) { | |
414 print_record (combo, fqrec1, p1cut); | |
415 print_record_N (combo, fqrec2, qualtype); | |
416 } else { | |
417 print_record (single, fqrec1, p1cut); | |
418 } | |
419 } else { | |
420 if (combo_all) { | |
421 print_record_gzip (combo_gzip, fqrec1, p1cut); | |
422 print_record_N_gzip (combo_gzip, fqrec2, qualtype); | |
423 } else { | |
424 print_record_gzip (single_gzip, fqrec1, p1cut); | |
425 } | |
426 } | |
427 | |
428 kept_s1++; | |
429 discard_s2++; | |
430 } | |
431 | |
432 else if (p1cut->three_prime_cut < 0 && p2cut->three_prime_cut >= 0) { | |
433 if (!gzip_output) { | |
434 if (combo_all) { | |
435 print_record_N (combo, fqrec1, qualtype); | |
436 print_record (combo, fqrec2, p2cut); | |
437 } else { | |
438 print_record (single, fqrec2, p2cut); | |
439 } | |
440 } else { | |
441 if (combo_all) { | |
442 print_record_N_gzip (combo_gzip, fqrec1, qualtype); | |
443 print_record_gzip (combo_gzip, fqrec2, p2cut); | |
444 } else { | |
445 print_record_gzip (single_gzip, fqrec2, p2cut); | |
446 } | |
447 } | |
448 | |
449 kept_s2++; | |
450 discard_s1++; | |
451 | |
452 } else { | |
453 | |
454 /* If both records are to be discarded, but the -M option */ | |
455 /* is being used, then output two "N" records */ | |
456 if (combo_all) { | |
457 if (!gzip_output) { | |
458 print_record_N (combo, fqrec1, qualtype); | |
459 print_record_N (combo, fqrec2, qualtype); | |
460 } else { | |
461 print_record_N_gzip (combo_gzip, fqrec1, qualtype); | |
462 print_record_N_gzip (combo_gzip, fqrec2, qualtype); | |
463 } | |
464 } | |
465 | |
466 discard_p += 2; | |
467 } | |
468 | |
469 free(p1cut); | |
470 free(p2cut); | |
471 } /* end of while ((l1 = kseq_read (fqrec1)) >= 0) */ | |
472 | |
473 if (l1 < 0) { | |
474 l2 = kseq_read(fqrec2); | |
475 if (l2 >= 0) { | |
476 fprintf(stderr, "Warning: PE file 1 is shorter than PE file 2. Disregarding rest of PE file 2.\n"); | |
477 } | |
478 } | |
479 | |
480 if (!quiet) { | |
481 fprintf(stdout, "\nFastQ paired records kept: %d (%d pairs)\n", kept_p, (kept_p / 2)); | |
482 if (pec) fprintf(stdout, "FastQ single records kept: %d\n", (kept_s1 + kept_s2)); | |
483 else fprintf(stdout, "FastQ single records kept: %d (from PE1: %d, from PE2: %d)\n", (kept_s1 + kept_s2), kept_s1, kept_s2); | |
484 | |
485 fprintf(stdout, "FastQ paired records discarded: %d (%d pairs)\n", discard_p, (discard_p / 2)); | |
486 | |
487 if (pec) fprintf(stdout, "FastQ single records discarded: %d\n\n", (discard_s1 + discard_s2)); | |
488 else fprintf(stdout, "FastQ single records discarded: %d (from PE1: %d, from PE2: %d)\n\n", (discard_s1 + discard_s2), discard_s1, discard_s2); | |
489 } | |
490 | |
491 kseq_destroy(fqrec1); | |
492 if (pec) free(fqrec2); | |
493 else kseq_destroy(fqrec2); | |
494 | |
495 if (sfn && !combo_all) { | |
496 if (!gzip_output) fclose(single); | |
497 else gzclose(single_gzip); | |
498 } | |
499 | |
500 if (pec) { | |
501 gzclose(pec); | |
502 if (!gzip_output) fclose(combo); | |
503 else gzclose(combo_gzip); | |
504 } else { | |
505 gzclose(pe1); | |
506 gzclose(pe2); | |
507 if (!gzip_output) { | |
508 fclose(outfile1); | |
509 fclose(outfile2); | |
510 } else { | |
511 gzclose(outfile1_gzip); | |
512 gzclose(outfile2_gzip); | |
513 } | |
514 } | |
515 | |
516 return EXIT_SUCCESS; | |
517 } /* end of paired_main() */ |