9
|
1 #include <assert.h>
|
|
2 #include <ctype.h>
|
|
3 #include <stdlib.h>
|
|
4 #include <zlib.h>
|
|
5 #include <stdio.h>
|
|
6 #include <getopt.h>
|
|
7 #include "sickle.h"
|
|
8 #include "kseq.h"
|
|
9 #include "print_record.h"
|
|
10
|
|
11 __KS_GETC(gzread, BUFFER_SIZE)
|
|
12 __KS_GETUNTIL(gzread, BUFFER_SIZE)
|
|
13 __KSEQ_READ
|
|
14
|
|
15 int single_qual_threshold = 20;
|
|
16 int single_length_threshold = 20;
|
|
17
|
|
18 static struct option single_long_options[] = {
|
|
19 {"fastq-file", required_argument, 0, 'f'},
|
|
20 {"output-file", required_argument, 0, 'o'},
|
|
21 {"qual-type", required_argument, 0, 't'},
|
|
22 {"qual-threshold", required_argument, 0, 'q'},
|
|
23 {"length-threshold", required_argument, 0, 'l'},
|
|
24 {"no-fiveprime", no_argument, 0, 'x'},
|
|
25 {"discard-n", no_argument, 0, 'n'},
|
|
26 {"gzip-output", no_argument, 0, 'g'},
|
|
27 {"quiet", no_argument, 0, 'z'},
|
|
28 {GETOPT_HELP_OPTION_DECL},
|
|
29 {GETOPT_VERSION_OPTION_DECL},
|
|
30 {NULL, 0, NULL, 0}
|
|
31 };
|
|
32
|
|
33 void single_usage(int status, char *msg) {
|
|
34
|
|
35 fprintf(stderr, "\nUsage: %s se [options] -f <fastq sequence file> -t <quality type> -o <trimmed fastq file>\n\
|
|
36 \n\
|
|
37 Options:\n\
|
|
38 -f, --fastq-file, Input fastq file (required)\n\
|
|
39 -t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n\
|
|
40 -o, --output-file, Output trimmed fastq file (required)\n", PROGRAM_NAME);
|
|
41
|
|
42 fprintf(stderr, "-q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\
|
|
43 -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\
|
|
44 -x, --no-fiveprime, Don't do five prime trimming.\n\
|
|
45 -n, --trunc-n, Truncate sequences at position of first N.\n\
|
|
46 -g, --gzip-output, Output gzipped files.\n\
|
|
47 --quiet, Don't print out any trimming information\n\
|
|
48 --help, display this help and exit\n\
|
|
49 --version, output version information and exit\n\n");
|
|
50
|
|
51 if (msg) fprintf(stderr, "%s\n\n", msg);
|
|
52 exit(status);
|
|
53 }
|
|
54
|
|
55 int single_main(int argc, char *argv[]) {
|
|
56
|
|
57 gzFile se = NULL;
|
|
58 kseq_t *fqrec;
|
|
59 int l;
|
|
60 FILE *outfile = NULL;
|
|
61 gzFile outfile_gzip = NULL;
|
|
62 int debug = 0;
|
|
63 int optc;
|
|
64 extern char *optarg;
|
|
65 int qualtype = -1;
|
|
66 cutsites *p1cut;
|
|
67 char *outfn = NULL;
|
|
68 char *infn = NULL;
|
|
69 int kept = 0;
|
|
70 int discard = 0;
|
|
71 int quiet = 0;
|
|
72 int no_fiveprime = 0;
|
|
73 int trunc_n = 0;
|
|
74 int gzip_output = 0;
|
|
75 int total=0;
|
|
76
|
|
77 while (1) {
|
|
78 int option_index = 0;
|
|
79 optc = getopt_long(argc, argv, "df:t:o:q:l:zxng", single_long_options, &option_index);
|
|
80
|
|
81 if (optc == -1)
|
|
82 break;
|
|
83
|
|
84 switch (optc) {
|
|
85 if (single_long_options[option_index].flag != 0)
|
|
86 break;
|
|
87
|
|
88 case 'f':
|
|
89 infn = (char *) malloc(strlen(optarg) + 1);
|
|
90 strcpy(infn, optarg);
|
|
91 break;
|
|
92
|
|
93 case 't':
|
|
94 if (!strcmp(optarg, "illumina"))
|
|
95 qualtype = ILLUMINA;
|
|
96 else if (!strcmp(optarg, "solexa"))
|
|
97 qualtype = SOLEXA;
|
|
98 else if (!strcmp(optarg, "sanger"))
|
|
99 qualtype = SANGER;
|
|
100 else {
|
|
101 fprintf(stderr, "Error: Quality type '%s' is not a valid type.\n", optarg);
|
|
102 return EXIT_FAILURE;
|
|
103 }
|
|
104 break;
|
|
105
|
|
106 case 'o':
|
|
107 outfn = (char *) malloc(strlen(optarg) + 1);
|
|
108 strcpy(outfn, optarg);
|
|
109 break;
|
|
110
|
|
111 case 'q':
|
|
112 single_qual_threshold = atoi(optarg);
|
|
113 if (single_qual_threshold < 0) {
|
|
114 fprintf(stderr, "Quality threshold must be >= 0\n");
|
|
115 return EXIT_FAILURE;
|
|
116 }
|
|
117 break;
|
|
118
|
|
119 case 'l':
|
|
120 single_length_threshold = atoi(optarg);
|
|
121 if (single_length_threshold < 0) {
|
|
122 fprintf(stderr, "Length threshold must be >= 0\n");
|
|
123 return EXIT_FAILURE;
|
|
124 }
|
|
125 break;
|
|
126
|
|
127 case 'x':
|
|
128 no_fiveprime = 1;
|
|
129 break;
|
|
130
|
|
131 case 'n':
|
|
132 trunc_n = 1;
|
|
133 break;
|
|
134
|
|
135 case 'g':
|
|
136 gzip_output = 1;
|
|
137 break;
|
|
138
|
|
139 case 'z':
|
|
140 quiet = 1;
|
|
141 break;
|
|
142
|
|
143 case 'd':
|
|
144 debug = 1;
|
|
145 break;
|
|
146
|
|
147 case_GETOPT_HELP_CHAR(single_usage)
|
|
148 case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS);
|
|
149
|
|
150 case '?':
|
|
151 single_usage(EXIT_FAILURE, NULL);
|
|
152 break;
|
|
153
|
|
154 default:
|
|
155 single_usage(EXIT_FAILURE, NULL);
|
|
156 break;
|
|
157 }
|
|
158 }
|
|
159
|
|
160
|
|
161 if (qualtype == -1 || !infn || !outfn) {
|
|
162 single_usage(EXIT_FAILURE, "****Error: Must have quality type, input file, and output file.");
|
|
163 }
|
|
164
|
|
165 if (!strcmp(infn, outfn)) {
|
|
166 fprintf(stderr, "****Error: Input file is same as output file.\n\n");
|
|
167 return EXIT_FAILURE;
|
|
168 }
|
|
169
|
|
170 se = gzopen(infn, "r");
|
|
171 if (!se) {
|
|
172 fprintf(stderr, "****Error: Could not open input file '%s'.\n\n", infn);
|
|
173 return EXIT_FAILURE;
|
|
174 }
|
|
175
|
|
176 if (!gzip_output) {
|
|
177 outfile = fopen(outfn, "w");
|
|
178 if (!outfile) {
|
|
179 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn);
|
|
180 return EXIT_FAILURE;
|
|
181 }
|
|
182 } else {
|
|
183 outfile_gzip = gzopen(outfn, "w");
|
|
184 if (!outfile_gzip) {
|
|
185 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn);
|
|
186 return EXIT_FAILURE;
|
|
187 }
|
|
188 }
|
|
189
|
|
190
|
|
191 fqrec = kseq_init(se);
|
|
192
|
|
193 while ((l = kseq_read(fqrec)) >= 0) {
|
|
194
|
|
195 p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug);
|
|
196 total++;
|
|
197
|
|
198 if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut);
|
|
199
|
|
200 /* if sequence quality and length pass filter then output record, else discard */
|
|
201 if (p1cut->three_prime_cut >= 0) {
|
|
202 if (!gzip_output) {
|
|
203 /* This print statement prints out the sequence string starting from the 5' cut */
|
|
204 /* and then only prints out to the 3' cut, however, we need to adjust the 3' cut */
|
|
205 /* by subtracting the 5' cut because the 3' cut was calculated on the original sequence */
|
|
206
|
|
207 print_record (outfile, fqrec, p1cut);
|
|
208 } else {
|
|
209 print_record_gzip (outfile_gzip, fqrec, p1cut);
|
|
210 }
|
|
211
|
|
212 kept++;
|
|
213 }
|
|
214
|
|
215 else discard++;
|
|
216
|
|
217 free(p1cut);
|
|
218 }
|
|
219
|
|
220 if (!quiet) fprintf(stdout, "\nSE input file: %s\n\nTotal FastQ records: %d\nFastQ records kept: %d\nFastQ records discarded: %d\n\n", infn, total, kept, discard);
|
|
221
|
|
222 kseq_destroy(fqrec);
|
|
223 gzclose(se);
|
|
224 if (!gzip_output) fclose(outfile);
|
|
225 else gzclose(outfile_gzip);
|
|
226
|
|
227 return EXIT_SUCCESS;
|
|
228 }
|