comparison src/trim_single.c @ 4:c70137414dcd draft

sickle v1.33
author nikhil-joshi
date Wed, 23 Jul 2014 18:35:10 -0400
parents
children
comparison
equal deleted inserted replaced
3:f6ebdaca9925 4:c70137414dcd
1 #include <assert.h>
2 #include <ctype.h>
3 #include <stdlib.h>
4 #include <zlib.h>
5 #include <stdio.h>
6 #include <getopt.h>
7 #include "sickle.h"
8 #include "kseq.h"
9 #include "print_record.h"
10
11 __KS_GETC(gzread, BUFFER_SIZE)
12 __KS_GETUNTIL(gzread, BUFFER_SIZE)
13 __KSEQ_READ
14
15 int single_qual_threshold = 20;
16 int single_length_threshold = 20;
17
18 static struct option single_long_options[] = {
19 {"fastq-file", required_argument, 0, 'f'},
20 {"output-file", required_argument, 0, 'o'},
21 {"qual-type", required_argument, 0, 't'},
22 {"qual-threshold", optional_argument, 0, 'q'},
23 {"length-threshold", optional_argument, 0, 'l'},
24 {"no-fiveprime", optional_argument, 0, 'x'},
25 {"discard-n", optional_argument, 0, 'n'},
26 {"gzip-output", optional_argument, 0, 'g'},
27 {"quiet", optional_argument, 0, 'z'},
28 {GETOPT_HELP_OPTION_DECL},
29 {GETOPT_VERSION_OPTION_DECL},
30 {NULL, 0, NULL, 0}
31 };
32
33 void single_usage(int status, char *msg) {
34
35 fprintf(stderr, "\nUsage: %s se [options] -f <fastq sequence file> -t <quality type> -o <trimmed fastq file>\n\
36 \n\
37 Options:\n\
38 -f, --fastq-file, Input fastq file (required)\n\
39 -t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n\
40 -o, --output-file, Output trimmed fastq file (required)\n", PROGRAM_NAME);
41
42 fprintf(stderr, "-q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\
43 -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\
44 -x, --no-fiveprime, Don't do five prime trimming.\n\
45 -n, --trunc-n, Truncate sequences at position of first N.\n\
46 -g, --gzip-output, Output gzipped files.\n\
47 --quiet, Don't print out any trimming information\n\
48 --help, display this help and exit\n\
49 --version, output version information and exit\n\n");
50
51 if (msg) fprintf(stderr, "%s\n\n", msg);
52 exit(status);
53 }
54
55 int single_main(int argc, char *argv[]) {
56
57 gzFile se = NULL;
58 kseq_t *fqrec;
59 int l;
60 FILE *outfile = NULL;
61 gzFile outfile_gzip = NULL;
62 int debug = 0;
63 int optc;
64 extern char *optarg;
65 int qualtype = -1;
66 cutsites *p1cut;
67 char *outfn = NULL;
68 char *infn = NULL;
69 int kept = 0;
70 int discard = 0;
71 int quiet = 0;
72 int no_fiveprime = 0;
73 int trunc_n = 0;
74 int gzip_output = 0;
75
76 while (1) {
77 int option_index = 0;
78 optc = getopt_long(argc, argv, "df:t:o:q:l:zxng", single_long_options, &option_index);
79
80 if (optc == -1)
81 break;
82
83 switch (optc) {
84 if (single_long_options[option_index].flag != 0)
85 break;
86
87 case 'f':
88 infn = (char *) malloc(strlen(optarg) + 1);
89 strcpy(infn, optarg);
90 break;
91
92 case 't':
93 if (!strcmp(optarg, "illumina"))
94 qualtype = ILLUMINA;
95 else if (!strcmp(optarg, "solexa"))
96 qualtype = SOLEXA;
97 else if (!strcmp(optarg, "sanger"))
98 qualtype = SANGER;
99 else {
100 fprintf(stderr, "Error: Quality type '%s' is not a valid type.\n", optarg);
101 return EXIT_FAILURE;
102 }
103 break;
104
105 case 'o':
106 outfn = (char *) malloc(strlen(optarg) + 1);
107 strcpy(outfn, optarg);
108 break;
109
110 case 'q':
111 single_qual_threshold = atoi(optarg);
112 if (single_qual_threshold < 0) {
113 fprintf(stderr, "Quality threshold must be >= 0\n");
114 return EXIT_FAILURE;
115 }
116 break;
117
118 case 'l':
119 single_length_threshold = atoi(optarg);
120 if (single_length_threshold < 0) {
121 fprintf(stderr, "Length threshold must be >= 0\n");
122 return EXIT_FAILURE;
123 }
124 break;
125
126 case 'x':
127 no_fiveprime = 1;
128 break;
129
130 case 'n':
131 trunc_n = 1;
132 break;
133
134 case 'g':
135 gzip_output = 1;
136 break;
137
138 case 'z':
139 quiet = 1;
140 break;
141
142 case 'd':
143 debug = 1;
144 break;
145
146 case_GETOPT_HELP_CHAR(single_usage)
147 case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS);
148
149 case '?':
150 single_usage(EXIT_FAILURE, NULL);
151 break;
152
153 default:
154 single_usage(EXIT_FAILURE, NULL);
155 break;
156 }
157 }
158
159
160 if (qualtype == -1 || !infn || !outfn) {
161 single_usage(EXIT_FAILURE, "****Error: Must have quality type, input file, and output file.");
162 }
163
164 if (!strcmp(infn, outfn)) {
165 fprintf(stderr, "****Error: Input file is same as output file.\n\n");
166 return EXIT_FAILURE;
167 }
168
169 se = gzopen(infn, "r");
170 if (!se) {
171 fprintf(stderr, "****Error: Could not open input file '%s'.\n\n", infn);
172 return EXIT_FAILURE;
173 }
174
175 if (!gzip_output) {
176 outfile = fopen(outfn, "w");
177 if (!outfile) {
178 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn);
179 return EXIT_FAILURE;
180 }
181 } else {
182 outfile_gzip = gzopen(outfn, "w");
183 if (!outfile_gzip) {
184 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn);
185 return EXIT_FAILURE;
186 }
187 }
188
189
190 fqrec = kseq_init(se);
191
192 while ((l = kseq_read(fqrec)) >= 0) {
193
194 p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug);
195
196 if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);
197
198 /* if sequence quality and length pass filter then output record, else discard */
199 if (p1cut->three_prime_cut >= 0) {
200 if (!gzip_output) {
201 /* This print statement prints out the sequence string starting from the 5' cut */
202 /* and then only prints out to the 3' cut, however, we need to adjust the 3' cut */
203 /* by subtracting the 5' cut because the 3' cut was calculated on the original sequence */
204
205 print_record (outfile, fqrec, p1cut);
206 } else {
207 print_record_gzip (outfile_gzip, fqrec, p1cut);
208 }
209
210 kept++;
211 }
212
213 else discard++;
214
215 free(p1cut);
216 }
217
218 if (!quiet) fprintf(stdout, "\nFastQ records kept: %d\nFastQ records discarded: %d\n\n", kept, discard);
219
220 kseq_destroy(fqrec);
221 gzclose(se);
222 if (!gzip_output) fclose(outfile);
223 else gzclose(outfile_gzip);
224
225 return EXIT_SUCCESS;
226 }