comparison GenerateDegenerateFasta.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:163892325845
1 #!/usr/bin/perl -w
2 #
3 # This script generates a multi-sequence FASTA file
4 # with all possible combinations of sequences based
5 # on degenerate input sequences.
6 #
7 # =====================================================
8 # $Id: GenerateDegenerateFasta.pl 67 2010-11-09 15:20:01Z pieter.neerincx@gmail.com $
9 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/GenerateDegenerateFasta.pl $
10 # $LastChangedDate: 2010-11-09 09:20:01 -0600 (Tue, 09 Nov 2010) $
11 # $LastChangedRevision: 67 $
12 # $LastChangedBy: pieter.neerincx@gmail.com $
13 # =====================================================
14
15 #
16 # initialise environment
17 #
18 use strict;
19 use Getopt::Std;
20 use Log::Log4perl qw(:easy);
21
22 my %log_levels = (
23 'ALL' => $ALL,
24 'TRACE' => $TRACE,
25 'DEBUG' => $DEBUG,
26 'INFO' => $INFO,
27 'WARN' => $WARN,
28 'ERROR' => $ERROR,
29 'FATAL' => $FATAL,
30 'OFF' => $OFF,
31 );
32
33 my %amino_acids = (
34 'A' => ['A'],
35 'B' => ['D','N'],
36 'C' => ['C'],
37 'D' => ['D'],
38 'E' => ['E'],
39 'F' => ['F'],
40 'G' => ['G'],
41 'H' => ['H'],
42 'I' => ['I'],
43 'J' => ['I','L'],
44 'K' => ['K'],
45 'L' => ['L'],
46 'M' => ['M'],
47 'N' => ['N'],
48 'O' => ['O'],
49 'P' => ['P'],
50 'Q' => ['Q'],
51 'R' => ['R'],
52 'S' => ['S'],
53 'T' => ['T'],
54 'U' => ['U'],
55 'V' => ['V'],
56 'W' => ['W'],
57 'X' => ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'], # 20 regular Amino Acids.
58 'X22' => ['A','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','Y'], # 22 Amino Acids including the rare Selenocysteine and Pyrrolysine.
59 'Y' => ['Y'],
60 'Z' => ['E','Q'],
61 );
62
63 my %dna = (
64 'A' => ['A'],
65 'B' => ['C','G','T'],
66 'C' => ['C'],
67 'D' => ['A','G','T'],
68 'G' => ['G'],
69 'H' => ['A','C','T'],
70 'K' => ['G','T'],
71 'M' => ['A','C'],
72 'N' => ['A','C','G','T'],
73 'R' => ['A','G'],
74 'S' => ['C','G'],
75 'T' => ['T'],
76 'V' => ['A','C','G'],
77 'W' => ['A','T'],
78 'Y' => ['C','T'],
79 );
80
81 my %rna = (
82 'A' => ['A'],
83 'B' => ['C','G','U'],
84 'C' => ['C'],
85 'D' => ['A','G','U'],
86 'G' => ['G'],
87 'H' => ['A','C','U'],
88 'K' => ['G','U'],
89 'M' => ['A','C'],
90 'N' => ['A','C','G','U'],
91 'R' => ['A','G'],
92 'S' => ['C','G'],
93 'U' => ['U'],
94 'V' => ['A','C','G'],
95 'W' => ['A','U'],
96 'Y' => ['C','U'],
97 );
98
99 #
100 # Get options.
101 #
102 my %opts;
103 Getopt::Std::getopts('i:p:s:t:o:x:l:', \%opts);
104
105 my $input = $opts{'i'};
106 my $prefix_column = $opts{'p'};
107 my $sequence_column = $opts{'s'};
108 my $acid_type = $opts{'t'};
109 my $output = $opts{'o'};
110 my $aa_x = $opts{'x'};
111 my $log_level = $opts{'l'};
112
113 #
114 # Configure logging.
115 #
116 # Provides default if user did not specify log level:
117 $log_level = (defined($log_level) ? $log_level : 'INFO');
118
119 # Reset log level to default if user specified illegal log level.
120 $log_level = (
121 defined($log_levels{$log_level})
122 ? $log_levels{$log_level}
123 : $log_levels{'INFO'});
124
125 #Log::Log4perl->init('log4perl.properties');
126 Log::Log4perl->easy_init(
127
128 #{ level => $log_level,
129 # file => ">>GenerateDegenrateFasta.log",
130 # layout => '%F{1}-%L-%M: %m%n' },
131 {
132 level => $log_level,
133 file => "STDOUT",
134 layout => '%d L:%L %p> %m%n'
135 },
136 );
137 my $logger = Log::Log4perl::get_logger();
138
139 #
140 # Check user input.
141 #
142 unless (defined($input) && defined($output)) {
143 _Usage();
144 exit;
145 }
146 if ($input =~ /^$/ || $output =~ /^$/) {
147 _Usage();
148 exit;
149 }
150 if ($input eq $output) {
151 $logger->fatal('Output file is the same as the input file. Select a different file for the output.');
152 exit;
153 }
154
155 #
156 # Check input file.
157 #
158 unless (-e $input && -f $input && -r $input) {
159 $logger->fatal('Cannot read from input file ' . $input . ': ' . $!);
160 exit;
161 }
162
163 $prefix_column = (defined($prefix_column) ? $prefix_column : 1);
164 unless ($prefix_column =~ m/^\d+$/ && $prefix_column > 0) {
165 $logger->fatal('Prefix column must be a positive integer.');
166 exit;
167 } else {
168 $logger->debug('Prefix column = ' . $prefix_column);
169 }
170
171 $sequence_column = (defined($sequence_column) ? $sequence_column : 2);
172 unless ($sequence_column =~ m/^\d+$/ && $sequence_column > 0) {
173 $logger->fatal('Sequence column must be a positive integer.');
174 exit;
175 } else {
176 $logger->debug('Sequence column = ' . $sequence_column);
177 }
178
179 $acid_type = (defined($acid_type) ? $acid_type : 'aa');
180 unless ($acid_type eq 'aa' || $acid_type eq 'dna' || $acid_type eq 'rna') {
181 $logger->fatal('Illegal value for option -t. Value for acid type must be \'aa\' for amino acids, \'dna\' for deoxyribonucleic acids or \'rna\' for ribonucleic acids.');
182 exit;
183 }
184
185 my %acids;
186 if ($acid_type eq 'aa') {
187 $aa_x = (defined($aa_x) ? $aa_x : 20);
188 unless ($aa_x == 20 || $aa_x == 22) {
189 $logger->fatal('Illegal value for option -x. Value for amino acid \'X\' expansion must be 20 or 22.');
190 exit;
191 }
192 %acids = %amino_acids;
193 } elsif ($acid_type eq 'dna') {
194 %acids = %dna;
195 } elsif ($acid_type eq 'rna') {
196 %acids = %rna;
197 }
198
199 #
200 # Read degenerate sequences and their IDs from a tab delimited file
201 #
202 my $degenerate_sequences = _ParseInput($input, $prefix_column, $sequence_column);
203 my $degenerate_sequence_count = scalar(keys(%{$degenerate_sequences}));
204 $logger->info('Number of degenerate sequences: ' . $degenerate_sequence_count);
205
206 #
207 # Generate FASTA DB with all possible permutations.
208 #
209 my ($counter) = _GenerateSeqs($degenerate_sequences, $output, $acid_type);
210
211 $logger->info('Generated FASTA DB with ' . $counter . ' sequences.');
212 $logger->info('Finished!');
213
214 #
215 ##
216 ### Internal subs.
217 ##
218 #
219
220 sub _ParseInput {
221
222 my ($file_path, $prefix_column, $sequence_column) = @_;
223
224 $logger->info('Parsing ' . $file_path . '...');
225
226 my %degenerate_sequences;
227 my $file_path_fh;
228 my $prefix_column_offset = $prefix_column - 1;
229 my $sequence_column_offset = $sequence_column - 1;
230 my $line_counter = 0;
231
232 eval {
233 open($file_path_fh, "<$file_path");
234 };
235 if ($@) {
236 $logger->fatal('Cannot read input file: ' . $@);
237 exit;
238 }
239
240 LINE: while (my $line = <$file_path_fh>) {
241
242 $line =~ s/[\r\n]+//g;
243 $line_counter++;
244
245 my @values = split("\t", $line);
246
247 unless (defined($values[$prefix_column_offset])) {
248 $logger->fatal('Prefix missing on line ' . $line_counter . ' of the input file.');
249 exit;
250 }
251
252 unless (defined($values[$sequence_column_offset])) {
253 $logger->fatal('Sequence missing on line ' . $line_counter . ' of the input file.');
254 exit;
255 }
256
257 my $prefix = $values[$prefix_column_offset];
258 my $degenerate_sequence = $values[$sequence_column_offset];
259
260
261 unless ($prefix =~ m/[a-zA-Z0-9_:\.\-]/i) {
262 $logger->fatal('Prefix countains illegal characters on line ' . $line_counter . '. Prefix should contain only a-z A-Z 0-9 _ : . or -.');
263 exit;
264 }
265
266 unless ($degenerate_sequence =~ m/[a-zA-Z0-9_:\.\-]/i) {
267 $logger->fatal('Degenerate sequence countains illegal characters on line ' . $line_counter . '. Sequence should contain only a-z A-Z.');
268 exit;
269 }
270
271 $degenerate_sequences{$prefix} = $degenerate_sequence;
272 $logger->debug('Found degenerate sequence ' . $degenerate_sequence . ' with prefix ' . $prefix);
273
274 }
275
276 close($file_path_fh);
277 $logger->info('Parsed input.');
278
279 return (\%degenerate_sequences);
280
281 }
282
283 sub _GenerateSeqs {
284
285 my ($degenerate_sequences, $path_to, $acid_type) = @_;
286
287 $logger->info('Generating sequences...');
288
289 my $generated_seqs = 0;
290 my $path_to_fh;
291
292 eval {
293 open($path_to_fh, ">$path_to");
294 };
295 if ($@) {
296 $logger->fatal('Cannot write FASTA file: ' . $@);
297 exit;
298 }
299
300 DS:foreach my $prefix(sort(keys(%{$degenerate_sequences}))) {
301
302 my $degenerate_sequence = ${$degenerate_sequences}{$prefix};
303 $degenerate_sequence = uc($degenerate_sequence);
304
305 $logger->debug("\t" . 'For ' . $prefix);
306
307 my $suffix = 1;
308 my @degenerate_sequence_acids = split('', $degenerate_sequence);
309 my $new_sequences = [];
310 my $next_acid_count = 0;
311
312 foreach my $next_acid (@degenerate_sequence_acids) {
313
314 $next_acid_count++;
315
316 unless (defined($acids{$next_acid})) {
317 $logger->error('Unknown (degenerate) acid ' . $next_acid . ' in input sequence ' . $prefix . ' (' . $degenerate_sequence . ').');
318 $logger->warn("\t" . 'Skipping sequence ' . $prefix . '.');
319 next DS;
320 }
321
322 if ($acid_type eq 'aa') {
323
324 if ($next_acid eq 'X') {
325
326 #
327 # By default X will expand to the 20 most common amino acids,
328 # but if -x 22 was specified X will expand to all 22 amino acids
329 # including the very rare ones.
330 #
331 if ($aa_x == 22) {
332
333 $next_acid = 'X22';
334
335 }
336 }
337 }
338
339 $new_sequences = _GrowSequence ($new_sequences, $next_acid);
340
341 my $sequence_count = scalar(@{$new_sequences});
342 $logger->trace("\t\t" . $next_acid_count . ' Acid ' . $next_acid . ': ' . $sequence_count . ' sequences.');
343
344 }
345
346 foreach my $new_sequence (@{$new_sequences}) {
347
348 $generated_seqs++;
349 my $id = $prefix . '_' . $suffix;
350
351 $logger->trace("\t\t\t" . $id . ' :: ' . $new_sequence);
352
353 print($path_to_fh '>' . $id . "\n");
354 # TODO: wrap long sequences over multiple lines.
355 print($path_to_fh $new_sequence . "\n");
356
357 $suffix++;
358
359 }
360 }
361
362 close($path_to_fh);
363
364 return ($generated_seqs);
365
366 }
367
368 #
369 # Usage
370 #
371
372 sub _GrowSequence {
373
374 my ($sequences, $next_acid) = @_;
375
376 my @larger_sequences;
377
378 if (scalar(@{$sequences}) < 1) {
379
380 foreach my $acid (@{$acids{$next_acid}}) {
381
382 my $larger_sequence = $acid;
383
384 push(@larger_sequences, $larger_sequence);
385
386 }
387
388 } else {
389
390 foreach my $sequence (@{$sequences}) {
391
392 foreach my $acid (@{$acids{$next_acid}}) {
393
394 my $larger_sequence = $sequence . $acid;
395
396 push(@larger_sequences, $larger_sequence);
397
398 }
399 }
400 }
401
402 return (\@larger_sequences);
403
404 }
405
406 sub _Usage {
407
408 print STDERR "\n"
409 . 'GenerateDegenerateFasta.pl:' . "\n\n"
410 . ' Generates a multi-sequence FASTA file with all possible combinations of sequences ' . "\n"
411 . ' based on degenerate input sequences.' . "\n\n"
412 . 'Usage:' . "\n\n"
413 . ' GenerateDegenerateFasta.pl options' . "\n\n"
414 . 'Available options are:' . "\n\n"
415 . ' -i [file] Input file in tab delimited format.' . "\n"
416 . ' -p [number] Prefix column.' . "\n"
417 . ' Column from the input file containg strings that will be used as prefixes ' . "\n"
418 . ' to generate unique IDs for the FASTA sequences.' . "\n"
419 . ' -s [number] Sequence column.' . "\n"
420 . ' Column from the input file containg degenerate amino or nucleic acid sequences ' . "\n"
421 . ' that will be used to generate the FASTA sequences.' . "\n"
422 . ' For example RXX can be used to generate the amino acids sequences RAA, RAC, RAD ... RYY.' . "\n"
423 . ' -t [type] Acid type of the degenerate input sequences.' . "\n"
424 . ' Must be one of:' . "\n"
425 . ' aa - for amino acids' . "\n"
426 . ' dna - for deoxyribonucleic acids' . "\n"
427 . ' rna - for ribonucleic acids' . "\n"
428 . ' -o [file] Output file in FASTA format.' . "\n"
429 . ' -x [20|22] Indicates whether the degenerate amino acid X represents only the 20 most common amino acids (default).' . "\n"
430 . ' or all 22 amino acids including the rare Selenocysteine and Pyrrolysine.' . "\n"
431 . ' -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.' . "\n"
432 . "\n";
433 exit;
434
435 }