0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools.
|
|
4 # Copyright (C) 2009-2013 A. Gordon (assafgordon@gmail.com)
|
|
5 #
|
|
6 # This program is free software: you can redistribute it and/or modify
|
|
7 # it under the terms of the GNU Affero General Public License as
|
|
8 # published by the Free Software Foundation, either version 3 of the
|
|
9 # License, or (at your option) any later version.
|
|
10 #
|
|
11 # This program is distributed in the hope that it will be useful,
|
|
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 # GNU Affero General Public License for more details.
|
|
15 #
|
|
16 # You should have received a copy of the GNU Affero General Public License
|
|
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
18
|
|
19 use strict;
|
|
20 use warnings;
|
|
21 use IO::Handle;
|
|
22 use Data::Dumper;
|
|
23 use Getopt::Long;
|
|
24 use Carp;
|
|
25
|
|
26 ##
|
|
27 ## This program splits a FASTQ/FASTA file into several smaller files,
|
|
28 ## Based on barcode matching.
|
|
29 ##
|
|
30 ## run with "--help" for usage information
|
|
31 ##
|
|
32 ## Assaf Gordon <assafgordon@gmail.com> , 11sep2008
|
|
33
|
|
34 # Forward declarations
|
|
35 sub load_barcode_file ($);
|
|
36 sub parse_command_line ;
|
|
37 sub match_sequences ;
|
|
38 sub mismatch_count($$) ;
|
|
39 sub print_results;
|
|
40 sub open_and_detect_input_format;
|
|
41 sub read_record;
|
|
42 sub write_record($);
|
|
43 sub usage();
|
|
44
|
|
45 # Global flags and arguments,
|
|
46 # Set by command line argumens
|
|
47 my $barcode_file ;
|
|
48 my $barcodes_at_eol = 0 ;
|
|
49 my $barcodes_at_bol = 0 ;
|
|
50 my $exact_match = 0 ;
|
|
51 my $allow_partial_overlap = 0;
|
|
52 my $allowed_mismatches = 1;
|
|
53 my $newfile_suffix = '';
|
|
54 my $newfile_prefix ;
|
|
55 my $quiet = 0 ;
|
|
56 my $debug = 0 ;
|
|
57 my $fastq_format = 1;
|
|
58
|
|
59 # Global variables
|
|
60 # Populated by 'create_output_files'
|
|
61 my %filenames;
|
|
62 my %files;
|
|
63 my %counts = ( 'unmatched' => 0 );
|
|
64 my $barcodes_length;
|
|
65 my @barcodes;
|
|
66 my $input_file_io;
|
|
67
|
|
68
|
|
69 # The Four lines per record in FASTQ format.
|
|
70 # (when using FASTA format, only the first two are used)
|
|
71 my $seq_name;
|
|
72 my $seq_bases;
|
|
73 my $seq_name2;
|
|
74 my $seq_qualities;
|
|
75
|
|
76
|
|
77 #
|
|
78 # Start of Program
|
|
79 #
|
|
80 parse_command_line ;
|
|
81
|
|
82 load_barcode_file ( $barcode_file ) ;
|
|
83
|
|
84 open_and_detect_input_format;
|
|
85
|
|
86 match_sequences ;
|
|
87
|
|
88 print_results unless $quiet;
|
|
89
|
|
90 #
|
|
91 # End of program
|
|
92 #
|
|
93
|
|
94
|
|
95
|
|
96
|
|
97
|
|
98
|
|
99
|
|
100
|
|
101 sub parse_command_line {
|
|
102 my $help;
|
|
103
|
|
104 usage() if (scalar @ARGV==0);
|
|
105
|
|
106 my $result = GetOptions ( "bcfile=s" => \$barcode_file,
|
|
107 "eol" => \$barcodes_at_eol,
|
|
108 "bol" => \$barcodes_at_bol,
|
|
109 "exact" => \$exact_match,
|
|
110 "prefix=s" => \$newfile_prefix,
|
|
111 "suffix=s" => \$newfile_suffix,
|
|
112 "quiet" => \$quiet,
|
|
113 "partial=i" => \$allow_partial_overlap,
|
|
114 "debug" => \$debug,
|
|
115 "mismatches=i" => \$allowed_mismatches,
|
|
116 "help" => \$help
|
|
117 ) ;
|
|
118
|
|
119 usage() if ($help);
|
|
120
|
|
121 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
|
|
122 die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix;
|
|
123
|
|
124 if ($barcodes_at_bol == $barcodes_at_eol) {
|
|
125 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
|
|
126 die "Error: must specify either --eol or --bol\n" ;
|
|
127 }
|
|
128
|
|
129 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
|
|
130
|
|
131 $allowed_mismatches = 0 if $exact_match;
|
|
132
|
|
133 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
|
|
134
|
|
135 die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
|
|
136 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
|
|
137
|
|
138
|
|
139 exit unless $result;
|
|
140 }
|
|
141
|
|
142
|
|
143
|
|
144 #
|
|
145 # Read the barcode file
|
|
146 #
|
|
147 sub load_barcode_file ($) {
|
|
148 my $filename = shift or croak "Missing barcode file name";
|
|
149
|
|
150 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
|
|
151 while (<BCFILE>) {
|
|
152 next if m/^#/;
|
|
153 chomp;
|
|
154 my ($ident, $barcode) = split ;
|
|
155
|
|
156 $barcode = uc($barcode);
|
|
157
|
|
158 # Sanity checks on the barcodes
|
|
159 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
|
|
160 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
|
|
161 unless $barcode =~ m/^[AGCT]+$/;
|
|
162
|
|
163 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
|
|
164 unless $ident =~ m/^\w+$/;
|
|
165
|
|
166 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
|
|
167 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n"
|
|
168 if length($barcode)<=$allowed_mismatches;
|
|
169
|
|
170 $barcodes_length = length($barcode) unless defined $barcodes_length;
|
|
171 die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
|
|
172 unless $barcodes_length == length($barcode);
|
|
173
|
|
174 push @barcodes, [$ident, $barcode];
|
|
175
|
|
176 if ($allow_partial_overlap>0) {
|
|
177 foreach my $i (1 .. $allow_partial_overlap) {
|
|
178 substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
|
|
179 push @barcodes, [$ident, $barcode];
|
|
180 }
|
|
181 }
|
|
182 }
|
|
183 close BCFILE;
|
|
184
|
|
185 if ($debug) {
|
|
186 print STDERR "barcode\tsequence\n";
|
|
187 foreach my $barcoderef (@barcodes) {
|
|
188 my ($ident, $seq) = @{$barcoderef};
|
|
189 print STDERR $ident,"\t", $seq ,"\n";
|
|
190 }
|
|
191 }
|
|
192 }
|
|
193
|
|
194 # Create one output file for each barcode.
|
|
195 # (Also create a file for the dummy 'unmatched' barcode)
|
|
196 sub create_output_files {
|
|
197 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
|
|
198 $barcodes{'unmatched'} = 1 ;
|
|
199
|
|
200 foreach my $ident (keys %barcodes) {
|
|
201 my $new_filename = $newfile_prefix . $ident . $newfile_suffix;
|
|
202 $filenames{$ident} = $new_filename;
|
|
203 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
|
|
204 $files{$ident} = $file ;
|
|
205 }
|
|
206 }
|
|
207
|
|
208 sub match_sequences {
|
|
209
|
|
210 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
|
|
211 $barcodes{'unmatched'} = 1 ;
|
|
212
|
|
213 #reset counters
|
|
214 foreach my $ident ( keys %barcodes ) {
|
|
215 $counts{$ident} = 0;
|
|
216 }
|
|
217
|
|
218 create_output_files;
|
|
219
|
|
220 # Read file FASTQ file
|
|
221 # split accotding to barcodes
|
|
222 while ( read_record ) {
|
|
223 chomp $seq_bases;
|
|
224
|
|
225 print STDERR "sequence $seq_bases: \n" if $debug;
|
|
226
|
|
227 my $best_barcode_mismatches_count = $barcodes_length;
|
|
228 my $best_barcode_ident = undef;
|
|
229
|
|
230 #Try all barcodes, find the one with the lowest mismatch count
|
|
231 foreach my $barcoderef (@barcodes) {
|
|
232 my ($ident, $barcode) = @{$barcoderef};
|
|
233
|
|
234 # Get DNA fragment (in the length of the barcodes)
|
|
235 # The barcode will be tested only against this fragment
|
|
236 # (no point in testing the barcode against the whole sequence)
|
|
237 my $sequence_fragment;
|
|
238 if ($barcodes_at_bol) {
|
|
239 $sequence_fragment = substr $seq_bases, 0, $barcodes_length;
|
|
240 } else {
|
|
241 $sequence_fragment = substr $seq_bases, - $barcodes_length;
|
|
242 }
|
|
243
|
|
244 my $mm = mismatch_count($sequence_fragment, $barcode) ;
|
|
245
|
|
246 # if this is a partial match, add the non-overlap as a mismatch
|
|
247 # (partial barcodes are shorter than the length of the original barcodes)
|
|
248 $mm += ($barcodes_length - length($barcode));
|
|
249
|
|
250 if ( $mm < $best_barcode_mismatches_count ) {
|
|
251 $best_barcode_mismatches_count = $mm ;
|
|
252 $best_barcode_ident = $ident ;
|
|
253 }
|
|
254 }
|
|
255
|
|
256 $best_barcode_ident = 'unmatched'
|
|
257 if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ;
|
|
258
|
|
259 print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug;
|
|
260
|
|
261 $counts{$best_barcode_ident}++;
|
|
262
|
|
263 #get the file associated with the matched barcode.
|
|
264 #(note: there's also a file associated with 'unmatched' barcode)
|
|
265 my $file = $files{$best_barcode_ident};
|
|
266
|
|
267 write_record($file);
|
|
268 }
|
|
269 }
|
|
270
|
|
271 #Quickly calculate hamming distance between two strings
|
|
272 #
|
|
273 #NOTE: Strings must be same length.
|
|
274 # returns number of different characters.
|
|
275 #see http://www.perlmonks.org/?node_id=500235
|
|
276 sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) }
|
|
277
|
|
278
|
|
279
|
|
280 sub print_results
|
|
281 {
|
|
282 print "Barcode\tCount\tLocation\n";
|
|
283 my $total = 0 ;
|
|
284 foreach my $ident (sort keys %counts) {
|
|
285 print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n";
|
|
286 $total += $counts{$ident};
|
|
287 }
|
|
288 print "total\t",$total,"\n";
|
|
289 }
|
|
290
|
|
291
|
|
292 sub read_record
|
|
293 {
|
|
294 $seq_name = $input_file_io->getline();
|
|
295
|
|
296 return undef unless defined $seq_name; # End of file?
|
|
297
|
|
298 $seq_bases = $input_file_io->getline();
|
|
299 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
|
|
300
|
|
301 # If using FASTQ format, read two more lines
|
|
302 if ($fastq_format) {
|
|
303 $seq_name2 = $input_file_io->getline();
|
|
304 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
|
|
305
|
|
306 $seq_qualities = $input_file_io->getline();
|
|
307 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
|
|
308 }
|
|
309 return 1;
|
|
310 }
|
|
311
|
|
312 sub write_record($)
|
|
313 {
|
|
314 my $file = shift;
|
|
315
|
|
316 croak "Bad file handle" unless defined $file;
|
|
317
|
|
318 print $file $seq_name;
|
|
319 print $file $seq_bases,"\n";
|
|
320
|
|
321 #if using FASTQ format, write two more lines
|
|
322 if ($fastq_format) {
|
|
323 print $file $seq_name2;
|
|
324 print $file $seq_qualities;
|
|
325 }
|
|
326 }
|
|
327
|
|
328 sub open_and_detect_input_format
|
|
329 {
|
|
330 $input_file_io = new IO::Handle;
|
|
331 die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
|
|
332
|
|
333 # Get the first characeter, and push it back
|
|
334 my $first_char = $input_file_io->getc();
|
|
335 $input_file_io->ungetc(ord $first_char);
|
|
336
|
|
337 if ($first_char eq '>') {
|
|
338 # FASTA format
|
|
339 $fastq_format = 0 ;
|
|
340 print STDERR "Detected FASTA format\n" if $debug;
|
|
341 } elsif ($first_char eq '@') {
|
|
342 # FASTQ format
|
|
343 $fastq_format = 1;
|
|
344 print STDERR "Detected FASTQ format\n" if $debug;
|
|
345 } else {
|
|
346 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
|
|
347 }
|
|
348 }
|
|
349
|
|
350 sub usage()
|
|
351 {
|
|
352 print<<EOF;
|
|
353 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008
|
|
354
|
|
355 This program reads FASTA/FASTQ file and splits it into several smaller files,
|
|
356 Based on barcode matching.
|
|
357 FASTA/FASTQ data is read from STDIN (format is auto-detected.)
|
|
358 Output files will be writen to disk.
|
|
359 Summary will be printed to STDOUT.
|
|
360
|
|
361 usage: $0 --bcfile FILE --prefix PREFIX [--suffix SUFFIX] [--bol|--eol]
|
|
362 [--mismatches N] [--exact] [--partial N] [--help] [--quiet] [--debug]
|
|
363
|
|
364 Arguments:
|
|
365
|
|
366 --bcfile FILE - Barcodes file name. (see explanation below.)
|
|
367 --prefix PREFIX - File prefix. will be added to the output files. Can be used
|
|
368 to specify output directories.
|
|
369 --suffix SUFFIX - File suffix (optional). Can be used to specify file
|
|
370 extensions.
|
|
371 --bol - Try to match barcodes at the BEGINNING of sequences.
|
|
372 (What biologists would call the 5' end, and programmers
|
|
373 would call index 0.)
|
|
374 --eol - Try to match barcodes at the END of sequences.
|
|
375 (What biologists would call the 3' end, and programmers
|
|
376 would call the end of the string.)
|
|
377 NOTE: one of --bol, --eol must be specified, but not both.
|
|
378 --mismatches N - Max. number of mismatches allowed. default is 1.
|
|
379 --exact - Same as '--mismatches 0'. If both --exact and --mismatches
|
|
380 are specified, '--exact' takes precedence.
|
|
381 --partial N - Allow partial overlap of barcodes. (see explanation below.)
|
|
382 (Default is not partial matching)
|
|
383 --quiet - Don't print counts and summary at the end of the run.
|
|
384 (Default is to print.)
|
|
385 --debug - Print lots of useless debug information to STDERR.
|
|
386 --help - This helpful help screen.
|
|
387
|
|
388 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is
|
|
389 the barcodes file):
|
|
390
|
|
391 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
|
|
392 --prefix /tmp/bla_ --suffix ".txt"
|
|
393
|
|
394 Barcode file format
|
|
395 -------------------
|
|
396 Barcode files are simple text files. Each line should contain an identifier
|
|
397 (descriptive name for the barcode), and the barcode itself (A/C/G/T),
|
|
398 separated by a TAB character. Example:
|
|
399
|
|
400 #This line is a comment (starts with a 'number' sign)
|
|
401 BC1 GATCT
|
|
402 BC2 ATCGT
|
|
403 BC3 GTGAT
|
|
404 BC4 TGTCT
|
|
405
|
|
406 For each barcode, a new FASTQ file will be created (with the barcode's
|
|
407 identifier as part of the file name). Sequences matching the barcode
|
|
408 will be stored in the appropriate file.
|
|
409
|
|
410 Running the above example (assuming "mybarcodes.txt" contains the above
|
|
411 barcodes), will create the following files:
|
|
412 /tmp/bla_BC1.txt
|
|
413 /tmp/bla_BC2.txt
|
|
414 /tmp/bla_BC3.txt
|
|
415 /tmp/bla_BC4.txt
|
|
416 /tmp/bla_unmatched.txt
|
|
417 The 'unmatched' file will contain all sequences that didn't match any barcode.
|
|
418
|
|
419 Barcode matching
|
|
420 ----------------
|
|
421
|
|
422 ** Without partial matching:
|
|
423
|
|
424 Count mismatches between the FASTA/Q sequences and the barcodes.
|
|
425 The barcode which matched with the lowest mismatches count (providing the
|
|
426 count is small or equal to '--mismatches N') 'gets' the sequences.
|
|
427
|
|
428 Example (using the above barcodes):
|
|
429 Input Sequence:
|
|
430 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
431
|
|
432 Matching with '--bol --mismatches 1':
|
|
433 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
434 GATCT (1 mismatch, BC1)
|
|
435 ATCGT (4 mismatches, BC2)
|
|
436 GTGAT (3 mismatches, BC3)
|
|
437 TGTCT (3 mismatches, BC4)
|
|
438
|
|
439 This sequence will be classified as 'BC1' (it has the lowest mismatch count).
|
|
440 If '--exact' or '--mismatches 0' were specified, this sequence would be
|
|
441 classified as 'unmatched' (because, although BC1 had the lowest mismatch count,
|
|
442 it is above the maximum allowed mismatches).
|
|
443
|
|
444 Matching with '--eol' (end of line) does the same, but from the other side
|
|
445 of the sequence.
|
|
446
|
|
447 ** With partial matching (very similar to indels):
|
|
448
|
|
449 Same as above, with the following addition: barcodes are also checked for
|
|
450 partial overlap (number of allowed non-overlapping bases is '--partial N').
|
|
451
|
|
452 Example:
|
|
453 Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
454 (Same as above, but note the missing 'G' at the beginning.)
|
|
455
|
|
456 Matching (without partial overlapping) against BC1 yields 4 mismatches:
|
|
457 ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
458 GATCT (4 mismatches)
|
|
459
|
|
460 Partial overlapping would also try the following match:
|
|
461 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
462 GATCT (1 mismatch)
|
|
463
|
|
464 Note: scoring counts a missing base as a mismatch, so the final
|
|
465 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch).
|
|
466 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this
|
|
467 seqeunce will be classified as BC1.
|
|
468
|
|
469 EOF
|
|
470
|
|
471 exit 1;
|
|
472 }
|