Mercurial > repos > nikhil-joshi > sickle
comparison src/trim_single.c @ 4:c70137414dcd draft
sickle v1.33
author | nikhil-joshi |
---|---|
date | Wed, 23 Jul 2014 18:35:10 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:f6ebdaca9925 | 4:c70137414dcd |
---|---|
1 #include <assert.h> | |
2 #include <ctype.h> | |
3 #include <stdlib.h> | |
4 #include <zlib.h> | |
5 #include <stdio.h> | |
6 #include <getopt.h> | |
7 #include "sickle.h" | |
8 #include "kseq.h" | |
9 #include "print_record.h" | |
10 | |
11 __KS_GETC(gzread, BUFFER_SIZE) | |
12 __KS_GETUNTIL(gzread, BUFFER_SIZE) | |
13 __KSEQ_READ | |
14 | |
15 int single_qual_threshold = 20; | |
16 int single_length_threshold = 20; | |
17 | |
18 static struct option single_long_options[] = { | |
19 {"fastq-file", required_argument, 0, 'f'}, | |
20 {"output-file", required_argument, 0, 'o'}, | |
21 {"qual-type", required_argument, 0, 't'}, | |
22 {"qual-threshold", optional_argument, 0, 'q'}, | |
23 {"length-threshold", optional_argument, 0, 'l'}, | |
24 {"no-fiveprime", optional_argument, 0, 'x'}, | |
25 {"discard-n", optional_argument, 0, 'n'}, | |
26 {"gzip-output", optional_argument, 0, 'g'}, | |
27 {"quiet", optional_argument, 0, 'z'}, | |
28 {GETOPT_HELP_OPTION_DECL}, | |
29 {GETOPT_VERSION_OPTION_DECL}, | |
30 {NULL, 0, NULL, 0} | |
31 }; | |
32 | |
33 void single_usage(int status, char *msg) { | |
34 | |
35 fprintf(stderr, "\nUsage: %s se [options] -f <fastq sequence file> -t <quality type> -o <trimmed fastq file>\n\ | |
36 \n\ | |
37 Options:\n\ | |
38 -f, --fastq-file, Input fastq file (required)\n\ | |
39 -t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n\ | |
40 -o, --output-file, Output trimmed fastq file (required)\n", PROGRAM_NAME); | |
41 | |
42 fprintf(stderr, "-q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ | |
43 -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ | |
44 -x, --no-fiveprime, Don't do five prime trimming.\n\ | |
45 -n, --trunc-n, Truncate sequences at position of first N.\n\ | |
46 -g, --gzip-output, Output gzipped files.\n\ | |
47 --quiet, Don't print out any trimming information\n\ | |
48 --help, display this help and exit\n\ | |
49 --version, output version information and exit\n\n"); | |
50 | |
51 if (msg) fprintf(stderr, "%s\n\n", msg); | |
52 exit(status); | |
53 } | |
54 | |
55 int single_main(int argc, char *argv[]) { | |
56 | |
57 gzFile se = NULL; | |
58 kseq_t *fqrec; | |
59 int l; | |
60 FILE *outfile = NULL; | |
61 gzFile outfile_gzip = NULL; | |
62 int debug = 0; | |
63 int optc; | |
64 extern char *optarg; | |
65 int qualtype = -1; | |
66 cutsites *p1cut; | |
67 char *outfn = NULL; | |
68 char *infn = NULL; | |
69 int kept = 0; | |
70 int discard = 0; | |
71 int quiet = 0; | |
72 int no_fiveprime = 0; | |
73 int trunc_n = 0; | |
74 int gzip_output = 0; | |
75 | |
76 while (1) { | |
77 int option_index = 0; | |
78 optc = getopt_long(argc, argv, "df:t:o:q:l:zxng", single_long_options, &option_index); | |
79 | |
80 if (optc == -1) | |
81 break; | |
82 | |
83 switch (optc) { | |
84 if (single_long_options[option_index].flag != 0) | |
85 break; | |
86 | |
87 case 'f': | |
88 infn = (char *) malloc(strlen(optarg) + 1); | |
89 strcpy(infn, optarg); | |
90 break; | |
91 | |
92 case 't': | |
93 if (!strcmp(optarg, "illumina")) | |
94 qualtype = ILLUMINA; | |
95 else if (!strcmp(optarg, "solexa")) | |
96 qualtype = SOLEXA; | |
97 else if (!strcmp(optarg, "sanger")) | |
98 qualtype = SANGER; | |
99 else { | |
100 fprintf(stderr, "Error: Quality type '%s' is not a valid type.\n", optarg); | |
101 return EXIT_FAILURE; | |
102 } | |
103 break; | |
104 | |
105 case 'o': | |
106 outfn = (char *) malloc(strlen(optarg) + 1); | |
107 strcpy(outfn, optarg); | |
108 break; | |
109 | |
110 case 'q': | |
111 single_qual_threshold = atoi(optarg); | |
112 if (single_qual_threshold < 0) { | |
113 fprintf(stderr, "Quality threshold must be >= 0\n"); | |
114 return EXIT_FAILURE; | |
115 } | |
116 break; | |
117 | |
118 case 'l': | |
119 single_length_threshold = atoi(optarg); | |
120 if (single_length_threshold < 0) { | |
121 fprintf(stderr, "Length threshold must be >= 0\n"); | |
122 return EXIT_FAILURE; | |
123 } | |
124 break; | |
125 | |
126 case 'x': | |
127 no_fiveprime = 1; | |
128 break; | |
129 | |
130 case 'n': | |
131 trunc_n = 1; | |
132 break; | |
133 | |
134 case 'g': | |
135 gzip_output = 1; | |
136 break; | |
137 | |
138 case 'z': | |
139 quiet = 1; | |
140 break; | |
141 | |
142 case 'd': | |
143 debug = 1; | |
144 break; | |
145 | |
146 case_GETOPT_HELP_CHAR(single_usage) | |
147 case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); | |
148 | |
149 case '?': | |
150 single_usage(EXIT_FAILURE, NULL); | |
151 break; | |
152 | |
153 default: | |
154 single_usage(EXIT_FAILURE, NULL); | |
155 break; | |
156 } | |
157 } | |
158 | |
159 | |
160 if (qualtype == -1 || !infn || !outfn) { | |
161 single_usage(EXIT_FAILURE, "****Error: Must have quality type, input file, and output file."); | |
162 } | |
163 | |
164 if (!strcmp(infn, outfn)) { | |
165 fprintf(stderr, "****Error: Input file is same as output file.\n\n"); | |
166 return EXIT_FAILURE; | |
167 } | |
168 | |
169 se = gzopen(infn, "r"); | |
170 if (!se) { | |
171 fprintf(stderr, "****Error: Could not open input file '%s'.\n\n", infn); | |
172 return EXIT_FAILURE; | |
173 } | |
174 | |
175 if (!gzip_output) { | |
176 outfile = fopen(outfn, "w"); | |
177 if (!outfile) { | |
178 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn); | |
179 return EXIT_FAILURE; | |
180 } | |
181 } else { | |
182 outfile_gzip = gzopen(outfn, "w"); | |
183 if (!outfile_gzip) { | |
184 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn); | |
185 return EXIT_FAILURE; | |
186 } | |
187 } | |
188 | |
189 | |
190 fqrec = kseq_init(se); | |
191 | |
192 while ((l = kseq_read(fqrec)) >= 0) { | |
193 | |
194 p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug); | |
195 | |
196 if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); | |
197 | |
198 /* if sequence quality and length pass filter then output record, else discard */ | |
199 if (p1cut->three_prime_cut >= 0) { | |
200 if (!gzip_output) { | |
201 /* This print statement prints out the sequence string starting from the 5' cut */ | |
202 /* and then only prints out to the 3' cut, however, we need to adjust the 3' cut */ | |
203 /* by subtracting the 5' cut because the 3' cut was calculated on the original sequence */ | |
204 | |
205 print_record (outfile, fqrec, p1cut); | |
206 } else { | |
207 print_record_gzip (outfile_gzip, fqrec, p1cut); | |
208 } | |
209 | |
210 kept++; | |
211 } | |
212 | |
213 else discard++; | |
214 | |
215 free(p1cut); | |
216 } | |
217 | |
218 if (!quiet) fprintf(stdout, "\nFastQ records kept: %d\nFastQ records discarded: %d\n\n", kept, discard); | |
219 | |
220 kseq_destroy(fqrec); | |
221 gzclose(se); | |
222 if (!gzip_output) fclose(outfile); | |
223 else gzclose(outfile_gzip); | |
224 | |
225 return EXIT_SUCCESS; | |
226 } |