comparison src/trim_paired.c @ 4:c70137414dcd draft

sickle v1.33
author nikhil-joshi
date Wed, 23 Jul 2014 18:35:10 -0400
parents
children
comparison
equal deleted inserted replaced
3:f6ebdaca9925 4:c70137414dcd
1 #include <assert.h>
2 #include <ctype.h>
3 #include <stdlib.h>
4 #include <zlib.h>
5 #include <stdio.h>
6 #include <getopt.h>
7 #include <unistd.h>
8 #include "sickle.h"
9 #include "kseq.h"
10 #include "print_record.h"
11
12 __KS_GETC(gzread, BUFFER_SIZE)
13 __KS_GETUNTIL(gzread, BUFFER_SIZE)
14 __KSEQ_READ
15
16 int paired_qual_threshold = 20;
17 int paired_length_threshold = 20;
18
19 static struct option paired_long_options[] = {
20 {"qual-type", required_argument, 0, 't'},
21 {"pe-file1", optional_argument, 0, 'f'},
22 {"pe-file2", optional_argument, 0, 'r'},
23 {"pe-combo", optional_argument, 0, 'c'},
24 {"output-pe1", optional_argument, 0, 'o'},
25 {"output-pe2", optional_argument, 0, 'p'},
26 {"output-single", optional_argument, 0, 's'},
27 {"output-combo", optional_argument, 0, 'm'},
28 {"qual-threshold", optional_argument, 0, 'q'},
29 {"length-threshold", optional_argument, 0, 'l'},
30 {"no-fiveprime", optional_argument, 0, 'x'},
31 {"truncate-n", optional_argument, 0, 'n'},
32 {"gzip-output", optional_argument, 0, 'g'},
33 {"output-combo-all", optional_argument, 0, 'M'},
34 {"quiet", optional_argument, 0, 'z'},
35 {GETOPT_HELP_OPTION_DECL},
36 {GETOPT_VERSION_OPTION_DECL},
37 {NULL, 0, NULL, 0}
38 };
39
40 void paired_usage (int status, char *msg) {
41
42 fprintf(stderr, "\nIf you have separate files for forward and reverse reads:\n");
43 fprintf(stderr, "Usage: %s pe [options] -f <paired-end forward fastq file> -r <paired-end reverse fastq file> -t <quality type> -o <trimmed PE forward file> -p <trimmed PE reverse file> -s <trimmed singles file>\n\n", PROGRAM_NAME);
44 fprintf(stderr, "If you have one file with interleaved forward and reverse reads:\n");
45 fprintf(stderr, "Usage: %s pe [options] -c <interleaved input file> -t <quality type> -m <interleaved trimmed paired-end output> -s <trimmed singles file>\n\n\
46 If you have one file with interleaved reads as input and you want ONLY one interleaved file as output:\n\
47 Usage: %s pe [options] -c <interleaved input file> -t <quality type> -M <interleaved trimmed output>\n\n", PROGRAM_NAME, PROGRAM_NAME);
48 fprintf(stderr, "Options:\n\
49 Paired-end separated reads\n\
50 --------------------------\n\
51 -f, --pe-file1, Input paired-end forward fastq file (Input files must have same number of records)\n\
52 -r, --pe-file2, Input paired-end reverse fastq file\n\
53 -o, --output-pe1, Output trimmed forward fastq file\n\
54 -p, --output-pe2, Output trimmed reverse fastq file. Must use -s option.\n\n\
55 Paired-end interleaved reads\n\
56 ----------------------------\n");
57 fprintf(stderr,"-c, --pe-combo, Combined (interleaved) input paired-end fastq\n\
58 -m, --output-combo, Output combined (interleaved) paired-end fastq file. Must use -s option.\n\
59 -M, --output-combo-all, Output combined (interleaved) paired-end fastq file with any discarded read written to output file as a single N. Cannot be used with the -s option.\n\n\
60 Global options\n\
61 --------------\n\
62 -t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n");
63 fprintf(stderr, "-s, --output-single, Output trimmed singles fastq file\n\
64 -q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\
65 -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\
66 -x, --no-fiveprime, Don't do five prime trimming.\n\
67 -n, --truncate-n, Truncate sequences at position of first N.\n");
68
69
70 fprintf(stderr, "-g, --gzip-output, Output gzipped files.\n--quiet, do not output trimming info\n\
71 --help, display this help and exit\n\
72 --version, output version information and exit\n\n");
73
74 if (msg) fprintf(stderr, "%s\n\n", msg);
75 exit(status);
76 }
77
78
79 int paired_main(int argc, char *argv[]) {
80
81 gzFile pe1 = NULL; /* forward input file handle */
82 gzFile pe2 = NULL; /* reverse input file handle */
83 gzFile pec = NULL; /* combined input file handle */
84 kseq_t *fqrec1 = NULL;
85 kseq_t *fqrec2 = NULL;
86 int l1, l2;
87 FILE *outfile1 = NULL; /* forward output file handle */
88 FILE *outfile2 = NULL; /* reverse output file handle */
89 FILE *combo = NULL; /* combined output file handle */
90 FILE *single = NULL; /* single output file handle */
91 gzFile outfile1_gzip = NULL;
92 gzFile outfile2_gzip = NULL;
93 gzFile combo_gzip = NULL;
94 gzFile single_gzip = NULL;
95 int debug = 0;
96 int optc;
97 extern char *optarg;
98 int qualtype = -1;
99 cutsites *p1cut;
100 cutsites *p2cut;
101 char *outfn1 = NULL; /* forward file out name */
102 char *outfn2 = NULL; /* reverse file out name */
103 char *outfnc = NULL; /* combined file out name */
104 char *sfn = NULL; /* single/combined file out name */
105 char *infn1 = NULL; /* forward input filename */
106 char *infn2 = NULL; /* reverse input filename */
107 char *infnc = NULL; /* combined input filename */
108 int kept_p = 0;
109 int discard_p = 0;
110 int kept_s1 = 0;
111 int kept_s2 = 0;
112 int discard_s1 = 0;
113 int discard_s2 = 0;
114 int quiet = 0;
115 int no_fiveprime = 0;
116 int trunc_n = 0;
117 int gzip_output = 0;
118 int combo_all=0;
119 int combo_s=0;
120
121 while (1) {
122 int option_index = 0;
123 optc = getopt_long(argc, argv, "df:r:c:t:o:p:m:M:s:q:l:xng", paired_long_options, &option_index);
124
125 if (optc == -1)
126 break;
127
128 switch (optc) {
129 if (paired_long_options[option_index].flag != 0)
130 break;
131
132 case 'f':
133 infn1 = (char *) malloc(strlen(optarg) + 1);
134 strcpy(infn1, optarg);
135 break;
136
137 case 'r':
138 infn2 = (char *) malloc(strlen(optarg) + 1);
139 strcpy(infn2, optarg);
140 break;
141
142 case 'c':
143 infnc = (char *) malloc(strlen(optarg) + 1);
144 strcpy(infnc, optarg);
145 break;
146
147 case 't':
148 if (!strcmp(optarg, "illumina")) qualtype = ILLUMINA;
149 else if (!strcmp(optarg, "solexa")) qualtype = SOLEXA;
150 else if (!strcmp(optarg, "sanger")) qualtype = SANGER;
151 else {
152 fprintf(stderr, "Error: Quality type '%s' is not a valid type.\n", optarg);
153 return EXIT_FAILURE;
154 }
155 break;
156
157 case 'o':
158 outfn1 = (char *) malloc(strlen(optarg) + 1);
159 strcpy(outfn1, optarg);
160 break;
161
162 case 'p':
163 outfn2 = (char *) malloc(strlen(optarg) + 1);
164 strcpy(outfn2, optarg);
165 break;
166
167 case 'm':
168 outfnc = (char *) malloc(strlen(optarg) + 1);
169 strcpy(outfnc, optarg);
170 combo_s = 1;
171 break;
172
173 case 'M':
174 outfnc = (char *) malloc(strlen(optarg) + 1);
175 strcpy(outfnc, optarg);
176 combo_all = 1;
177 break;
178
179 case 's':
180 sfn = (char *) malloc(strlen(optarg) + 1);
181 strcpy(sfn, optarg);
182 break;
183
184 case 'q':
185 paired_qual_threshold = atoi(optarg);
186 if (paired_qual_threshold < 0) {
187 fprintf(stderr, "Quality threshold must be >= 0\n");
188 return EXIT_FAILURE;
189 }
190 break;
191
192 case 'l':
193 paired_length_threshold = atoi(optarg);
194 if (paired_length_threshold < 0) {
195 fprintf(stderr, "Length threshold must be >= 0\n");
196 return EXIT_FAILURE;
197 }
198 break;
199
200 case 'x':
201 no_fiveprime = 1;
202 break;
203
204 case 'n':
205 trunc_n = 1;
206 break;
207
208 case 'g':
209 gzip_output = 1;
210 break;
211
212 case 'z':
213 quiet = 1;
214 break;
215
216 case 'd':
217 debug = 1;
218 break;
219
220 case_GETOPT_HELP_CHAR(paired_usage);
221 case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS);
222
223 case '?':
224 paired_usage(EXIT_FAILURE, NULL);
225 break;
226
227 default:
228 paired_usage(EXIT_FAILURE, NULL);
229 break;
230 }
231 }
232
233 /* required: qualtype */
234 if (qualtype == -1) {
235 paired_usage(EXIT_FAILURE, "****Error: Quality type is required.");
236 }
237
238 /* make sure minimum input filenames are specified */
239 if (!infn1 && !infnc) {
240 paired_usage(EXIT_FAILURE, "****Error: Must have either -f OR -c argument.");
241 }
242
243 if (infnc) { /* using combined input file */
244
245 if (infn1 || infn2 || outfn1 || outfn2) {
246 paired_usage(EXIT_FAILURE, "****Error: Cannot have -f, -r, -o, or -p options with -c.");
247 }
248
249 if ((combo_all && combo_s) || (!combo_all && !combo_s)) {
250 paired_usage(EXIT_FAILURE, "****Error: Must have only one of either -m or -M options with -c.");
251 }
252
253 if ((combo_s && !sfn) || (combo_all && sfn)) {
254 paired_usage(EXIT_FAILURE, "****Error: -m option must have -s option, and -M option cannot have -s option.");
255 }
256
257 /* check for duplicate file names */
258 if (!strcmp(infnc, outfnc) || (combo_s && (!strcmp(infnc, sfn) || !strcmp(outfnc, sfn)))) {
259 fprintf(stderr, "****Error: Duplicate filename between combo input, combo output, and/or single output file names.\n\n");
260 return EXIT_FAILURE;
261 }
262
263 /* get combined output file */
264 if (!gzip_output) {
265 combo = fopen(outfnc, "w");
266 if (!combo) {
267 fprintf(stderr, "****Error: Could not open combo output file '%s'.\n\n", outfnc);
268 return EXIT_FAILURE;
269 }
270 } else {
271 combo_gzip = gzopen(outfnc, "w");
272 if (!combo_gzip) {
273 fprintf(stderr, "****Error: Could not open combo output file '%s'.\n\n", outfnc);
274 return EXIT_FAILURE;
275 }
276 }
277
278 pec = gzopen(infnc, "r");
279 if (!pec) {
280 fprintf(stderr, "****Error: Could not open combined input file '%s'.\n\n", infnc);
281 return EXIT_FAILURE;
282 }
283
284 } else { /* using forward and reverse input files */
285
286 if (infn1 && (!infn2 || !outfn1 || !outfn2 || !sfn)) {
287 paired_usage(EXIT_FAILURE, "****Error: Using the -f option means you must have the -r, -o, -p, and -s options.");
288 }
289
290 if (infn1 && (infnc || combo_all || combo_s)) {
291 paired_usage(EXIT_FAILURE, "****Error: The -f option cannot be used in combination with -c, -m, or -M.");
292 }
293
294 if (!strcmp(infn1, infn2) || !strcmp(infn1, outfn1) || !strcmp(infn1, outfn2) ||
295 !strcmp(infn1, sfn) || !strcmp(infn2, outfn1) || !strcmp(infn2, outfn2) ||
296 !strcmp(infn2, sfn) || !strcmp(outfn1, outfn2) || !strcmp(outfn1, sfn) || !strcmp(outfn2, sfn)) {
297
298 fprintf(stderr, "****Error: Duplicate input and/or output file names.\n\n");
299 return EXIT_FAILURE;
300 }
301
302 pe1 = gzopen(infn1, "r");
303 if (!pe1) {
304 fprintf(stderr, "****Error: Could not open input file '%s'.\n\n", infn1);
305 return EXIT_FAILURE;
306 }
307
308 pe2 = gzopen(infn2, "r");
309 if (!pe2) {
310 fprintf(stderr, "****Error: Could not open input file '%s'.\n\n", infn2);
311 return EXIT_FAILURE;
312 }
313
314 if (!gzip_output) {
315 outfile1 = fopen(outfn1, "w");
316 if (!outfile1) {
317 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn1);
318 return EXIT_FAILURE;
319 }
320
321 outfile2 = fopen(outfn2, "w");
322 if (!outfile2) {
323 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn2);
324 return EXIT_FAILURE;
325 }
326 } else {
327 outfile1_gzip = gzopen(outfn1, "w");
328 if (!outfile1_gzip) {
329 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn1);
330 return EXIT_FAILURE;
331 }
332
333 outfile2_gzip = gzopen(outfn2, "w");
334 if (!outfile2_gzip) {
335 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn2);
336 return EXIT_FAILURE;
337 }
338
339 }
340 }
341
342 /* get singles output file handle */
343 if (sfn && !combo_all) {
344 if (!gzip_output) {
345 single = fopen(sfn, "w");
346 if (!single) {
347 fprintf(stderr, "****Error: Could not open single output file '%s'.\n\n", sfn);
348 return EXIT_FAILURE;
349 }
350 } else {
351 single_gzip = gzopen(sfn, "w");
352 if (!single_gzip) {
353 fprintf(stderr, "****Error: Could not open single output file '%s'.\n\n", sfn);
354 return EXIT_FAILURE;
355 }
356 }
357 }
358
359 if (pec) {
360 fqrec1 = kseq_init(pec);
361 fqrec2 = (kseq_t *) malloc(sizeof(kseq_t));
362 fqrec2->f = fqrec1->f;
363 } else {
364 fqrec1 = kseq_init(pe1);
365 fqrec2 = kseq_init(pe2);
366 }
367
368 while ((l1 = kseq_read(fqrec1)) >= 0) {
369
370 l2 = kseq_read(fqrec2);
371 if (l2 < 0) {
372 fprintf(stderr, "Warning: PE file 2 is shorter than PE file 1. Disregarding rest of PE file 1.\n");
373 break;
374 }
375
376 p1cut = sliding_window(fqrec1, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug);
377 p2cut = sliding_window(fqrec2, qualtype, paired_length_threshold, paired_qual_threshold, no_fiveprime, trunc_n, debug);
378
379 if (debug) printf("p1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);
380 if (debug) printf("p2cut: %d,%d\n", p2cut->five_prime_cut, p2cut->three_prime_cut);
381
382 /* The sequence and quality print statements below print out the sequence string starting from the 5' cut */
383 /* and then only print out to the 3' cut, however, we need to adjust the 3' cut */
384 /* by subtracting the 5' cut because the 3' cut was calculated on the original sequence */
385
386 /* if both sequences passed quality and length filters, then output both records */
387 if (p1cut->three_prime_cut >= 0 && p2cut->three_prime_cut >= 0) {
388 if (!gzip_output) {
389 if (pec) {
390 print_record (combo, fqrec1, p1cut);
391 print_record (combo, fqrec2, p2cut);
392 } else {
393 print_record (outfile1, fqrec1, p1cut);
394 print_record (outfile2, fqrec2, p2cut);
395 }
396 } else {
397 if (pec) {
398 print_record_gzip (combo_gzip, fqrec1, p1cut);
399 print_record_gzip (combo_gzip, fqrec2, p2cut);
400 } else {
401 print_record_gzip (outfile1_gzip, fqrec1, p1cut);
402 print_record_gzip (outfile2_gzip, fqrec2, p2cut);
403 }
404 }
405
406 kept_p += 2;
407 }
408
409 /* if only one sequence passed filter, then put its record in singles and discard the other */
410 /* or put an "N" record in if that option was chosen. */
411 else if (p1cut->three_prime_cut >= 0 && p2cut->three_prime_cut < 0) {
412 if (!gzip_output) {
413 if (combo_all) {
414 print_record (combo, fqrec1, p1cut);
415 print_record_N (combo, fqrec2, qualtype);
416 } else {
417 print_record (single, fqrec1, p1cut);
418 }
419 } else {
420 if (combo_all) {
421 print_record_gzip (combo_gzip, fqrec1, p1cut);
422 print_record_N_gzip (combo_gzip, fqrec2, qualtype);
423 } else {
424 print_record_gzip (single_gzip, fqrec1, p1cut);
425 }
426 }
427
428 kept_s1++;
429 discard_s2++;
430 }
431
432 else if (p1cut->three_prime_cut < 0 && p2cut->three_prime_cut >= 0) {
433 if (!gzip_output) {
434 if (combo_all) {
435 print_record_N (combo, fqrec1, qualtype);
436 print_record (combo, fqrec2, p2cut);
437 } else {
438 print_record (single, fqrec2, p2cut);
439 }
440 } else {
441 if (combo_all) {
442 print_record_N_gzip (combo_gzip, fqrec1, qualtype);
443 print_record_gzip (combo_gzip, fqrec2, p2cut);
444 } else {
445 print_record_gzip (single_gzip, fqrec2, p2cut);
446 }
447 }
448
449 kept_s2++;
450 discard_s1++;
451
452 } else {
453
454 /* If both records are to be discarded, but the -M option */
455 /* is being used, then output two "N" records */
456 if (combo_all) {
457 if (!gzip_output) {
458 print_record_N (combo, fqrec1, qualtype);
459 print_record_N (combo, fqrec2, qualtype);
460 } else {
461 print_record_N_gzip (combo_gzip, fqrec1, qualtype);
462 print_record_N_gzip (combo_gzip, fqrec2, qualtype);
463 }
464 }
465
466 discard_p += 2;
467 }
468
469 free(p1cut);
470 free(p2cut);
471 } /* end of while ((l1 = kseq_read (fqrec1)) >= 0) */
472
473 if (l1 < 0) {
474 l2 = kseq_read(fqrec2);
475 if (l2 >= 0) {
476 fprintf(stderr, "Warning: PE file 1 is shorter than PE file 2. Disregarding rest of PE file 2.\n");
477 }
478 }
479
480 if (!quiet) {
481 fprintf(stdout, "\nFastQ paired records kept: %d (%d pairs)\n", kept_p, (kept_p / 2));
482 if (pec) fprintf(stdout, "FastQ single records kept: %d\n", (kept_s1 + kept_s2));
483 else fprintf(stdout, "FastQ single records kept: %d (from PE1: %d, from PE2: %d)\n", (kept_s1 + kept_s2), kept_s1, kept_s2);
484
485 fprintf(stdout, "FastQ paired records discarded: %d (%d pairs)\n", discard_p, (discard_p / 2));
486
487 if (pec) fprintf(stdout, "FastQ single records discarded: %d\n\n", (discard_s1 + discard_s2));
488 else fprintf(stdout, "FastQ single records discarded: %d (from PE1: %d, from PE2: %d)\n\n", (discard_s1 + discard_s2), discard_s1, discard_s2);
489 }
490
491 kseq_destroy(fqrec1);
492 if (pec) free(fqrec2);
493 else kseq_destroy(fqrec2);
494
495 if (sfn && !combo_all) {
496 if (!gzip_output) fclose(single);
497 else gzclose(single_gzip);
498 }
499
500 if (pec) {
501 gzclose(pec);
502 if (!gzip_output) fclose(combo);
503 else gzclose(combo_gzip);
504 } else {
505 gzclose(pe1);
506 gzclose(pe2);
507 if (!gzip_output) {
508 fclose(outfile1);
509 fclose(outfile2);
510 } else {
511 gzclose(outfile1_gzip);
512 gzclose(outfile2_gzip);
513 }
514 }
515
516 return EXIT_SUCCESS;
517 } /* end of paired_main() */