annotate ConvertFastaHeaders.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
163892325845 Initial commit.
galaxyp
parents:
diff changeset
1 #!/usr/bin/perl
163892325845 Initial commit.
galaxyp
parents:
diff changeset
2
163892325845 Initial commit.
galaxyp
parents:
diff changeset
3 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
4 # convertFastaHeaders.pl
163892325845 Initial commit.
galaxyp
parents:
diff changeset
5 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
6 # $Id: ConvertFastaHeaders.pl 44 2010-10-18 12:58:41Z pieter.neerincx@gmail.com $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
7 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ConvertFastaHeaders.pl $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
8 # $LastChangedDate: 2010-10-18 07:58:41 -0500 (Mon, 18 Oct 2010) $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
9 # $LastChangedRevision: 44 $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
10 # $LastChangedBy: pieter.neerincx@gmail.com $
163892325845 Initial commit.
galaxyp
parents:
diff changeset
11 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
12 # Converts sequence header of FASTA files (in various customisable ways).
163892325845 Initial commit.
galaxyp
parents:
diff changeset
13 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
14
163892325845 Initial commit.
galaxyp
parents:
diff changeset
15 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
16 # Initialize environment
163892325845 Initial commit.
galaxyp
parents:
diff changeset
17 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
18 use strict;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
19 use Getopt::Std;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
20 use Log::Log4perl qw(:easy);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
21
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# Log level names accepted by the -l option, mapped to the Log::Log4perl
# level constants exported by the ':easy' import tag.
#
my %log_levels = (
    'ALL'   => $ALL,
    'TRACE' => $TRACE,
    'DEBUG' => $DEBUG,
    'INFO'  => $INFO,
    'WARN'  => $WARN,
    'ERROR' => $ERROR,
    'FATAL' => $FATAL,
    'OFF'   => $OFF,
);

#
# Get options.
#
#   -i  input file or directory
#   -o  output file or directory
#   -l  log level name (see %log_levels)
#   -e  FASTA filename extension (used when the input is a directory)
#   -f  whitespace separated list of prefixes/suffixes
#   -n  new prefix/suffix (used by the 'replace' action)
#   -a  action
#   -p  position
#
my %opts;
Getopt::Std::getopts('i:o:l:e:f:n:a:p:', \%opts);
my $input     = $opts{'i'};
my $output    = $opts{'o'};
my $log_level = $opts{'l'};
my $extension = $opts{'e'};
# Split in awk mode (' ') instead of on /\s+/: with /\s+/ a value that starts
# with whitespace produces an empty first element, which would then be treated
# as a (bogus) prefix/suffix. A missing -f yields an empty list.
my @x_fixes_array = split(' ', defined($opts{'f'}) ? $opts{'f'} : '');
my $new_x_fix     = $opts{'n'};
my $action        = $opts{'a'};
my $position      = $opts{'p'};
my %ids_to_delete;
my @new_id_order;

#
# Configure logging.
#
# Provides default if user did not specify log level:
$log_level = (defined($log_level) ? $log_level : 'WARN');
# Reset log level to default if user specified illegal log level.
# From here on $log_level holds a Log4perl level constant, not a level name.
$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'WARN'});
#Log::Log4perl->init('log4perl.properties');
Log::Log4perl->easy_init(
    #{ level  => $log_level,
    #  file   => ">>ConvertFastaHeaders.log",
    #  layout => '%F{1}-%L-%M: %m%n' },
    { level  => $log_level,
      file   => "STDERR",
      layout => '%d L:%L %p> %m%n' },
);
my $logger = Log::Log4perl::get_logger();

#
# Start the conversion process.
#
$logger->info("Starting...");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
71
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# Check user input.
#
# NOTE(review): _Usage() is defined outside this part of the file; the checks
# below presumably rely on it terminating the script - confirm.
#

# Provides default if user did not specify action:
$action = (defined($action) ? $action : 'add');

# Check for valid action and action specific options.
if ($action eq 'add' || $action eq 'strip' || $action eq 'replace') {

    # These actions all need at least one prefix/suffix to work with.
    unless (scalar(@x_fixes_array) > 0) {
        $logger->fatal('No prefixes or suffixes specified.');
        _Usage();
    }

    # 'replace' additionally needs the replacement value.
    if ($action eq 'replace') {
        unless (defined($new_x_fix) && $new_x_fix ne '') {
            $logger->fatal('No new prefix or suffix specified to replace the existing ones.');
            _Usage();
        }
    }

    # Provides default if user did not specify position:
    $position = (defined($position) ? $position : 'prefix');
    # Check for valid position.
    if ($action eq 'add' || $action eq 'strip') {
        unless ($position eq 'prefix' || $position eq 'suffix') {
            $logger->fatal('Illegal position specified. Must be \'prefix\' or \'suffix\'.');
            _Usage();
        }
    } elsif ($action eq 'replace') {
        unless ($position eq 'prefix' || $position eq 'suffix' || $position eq 'pre2suf' || $position eq 'suf2pre') {
            $logger->fatal('Illegal position specified. Must be \'prefix\', \'suffix\', \'pre2suf\' or \'suf2pre\'.');
            _Usage();
        }
    }

} elsif ($action eq 'delete' || $action eq 'shuffle') {

    # For these actions the position is a comma separated list of 1-based
    # ID indices and there is no default.
    unless (defined($position) && $position ne '') {
        $logger->fatal('No position specified.');
        _Usage();
    }

    my @id_indices = split(/,/, $position);

    foreach my $index_number (@id_indices) {

        # Check if the value is a number.
        unless ($index_number =~ m/^[1-9][0-9]*$/) {
            $logger->fatal('Illegal character in position list. Must be a single positive integer or comma separated list of positive integers.');
            _Usage();
        }

        if ($action eq 'delete') {
            # Lookup table of the indices to drop.
            $ids_to_delete{$index_number} = 'del';
        } elsif ($action eq 'shuffle') {
            # Order of the list is the new order of the IDs.
            push(@new_id_order, $index_number);
        }
    }

} else {
    $logger->fatal('Illegal action specified. Must be add, strip, replace, delete or shuffle.');
    _Usage();
}

# The log level was already validated and converted to a Log4perl level
# constant before the logger was initialised. Normalising it a second time
# here would look up that constant as a level *name*, fail, and silently
# reset it to WARN, so that step is intentionally not repeated.

# Provide default if user did not specify fasta filename extension:
$extension = (defined($extension) ? $extension : 'fa');

# Input and output cannot be empty.
unless (defined($input) && $input ne '' && defined($output) && $output ne '') {
    $logger->fatal('No input and/or output specified.');
    _Usage();
}
if ($input eq $output) {
    $logger->fatal("Output dir/file is the same as the input dir/file. Please choose a different one.");
    # Non-zero exit status so callers can detect the failure.
    exit(1);
}
163892325845 Initial commit.
galaxyp
parents:
diff changeset
161
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# Check if input is a single file or a directory and convert accordingly.
# All fatal errors terminate the script with a non-zero exit status.
#
unless (-e $input && -r $input) {
    $logger->fatal("Input $input does not exist or is not readable: $!");
    exit(1);
}

if (-f $input) {

    #
    # We've got an input file.
    # Only the file name (without leading directories) is used for logging.
    #
    my $file = ($input =~ m/.+\/([^\/]+)$/) ? $1 : $input;

    $logger->info('Parsing ' . $file . "...\n");

    _ConvertFastaHeaders($input, $output, $action, \@x_fixes_array, $new_x_fix, $position, \%ids_to_delete, \@new_id_order);

    $logger->info('Converted ' . $file);

} else {

    #
    # We've got an input directory.
    # Assume the output is also a directory.
    # Append trailing path separators if they were missing.
    #
    $input  = $input  . '/' unless ($input  =~ m/\/$/);
    $output = $output . '/' unless ($output =~ m/\/$/);

    #
    # Make sure the input dir is a directory.
    #
    unless (-d $input) {
        $logger->fatal("Input $input is not a file nor a directory: $!");
        exit(1);
    }
    my $indir  = $input;
    my $outdir = $output;

    #
    # Get all FASTA files from the input dir.
    #
    my $files = _GetFiles($indir, $outdir, $extension);

    #
    # Create the output directory if it did not exist yet.
    #
    if (-d $outdir) {
        unless (-w $outdir) {
            $logger->fatal("Cannot write to output directory $outdir. Check for permission errors, read-only file systems, etc.");
            exit(1);
        }
    } else {
        $logger->info("Creating output directory $outdir...");
        # mkdir() does not die on failure; it returns false and sets $!,
        # so the return value must be checked (an eval block catches nothing).
        unless (mkdir($outdir)) {
            $logger->fatal("Cannot create output directory $outdir: $!");
            exit(1);
        }
    }

    #
    # Convert FASTA files.
    #
    foreach my $file (@{$files}) {

        $logger->info('Parsing ' . $file . "...\n");

        my $pathfrom = $indir  . $file;
        my $pathto   = $outdir . $file;

        # Convert the individual file; passing the directories themselves
        # here would make the converter try to open a directory as a file.
        _ConvertFastaHeaders($pathfrom, $pathto, $action, \@x_fixes_array, $new_x_fix, $position, \%ids_to_delete, \@new_id_order);

        $logger->info('Converted ' . $file);

    }
}

$logger->info('Finished!');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
257
163892325845 Initial commit.
galaxyp
parents:
diff changeset
258 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
259 ##
163892325845 Initial commit.
galaxyp
parents:
diff changeset
260 ### Internal subs.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
261 ##
163892325845 Initial commit.
galaxyp
parents:
diff changeset
262 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
263
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# _GetFiles($indir, $outdir, $extension)
#
# Lists the entries of $indir whose name matches ".$extension"
# (case-insensitive), skipping hidden entries (names starting with a dot)
# and the entry named like the last path component of $outdir, so an output
# directory located inside the input directory is not processed.
#
# Returns a reference to an array of entry names (without directory part).
# Logs a fatal message and exits with status 1 if $indir cannot be read.
#
sub _GetFiles {

    my ($indir, $outdir, $extension) = @_;

    #
    # Get the relative name (last path component) of the outdir.
    # The match result is checked: relying on $1 after a failed match, or
    # interpolating an empty/undefined value into a pattern, makes Perl
    # re-use the last successful pattern and would reject every file.
    #
    my $outdir_rel;
    if (defined($outdir) && $outdir =~ m/([^\/]+)\/*$/) {
        $outdir_rel = $1;
    }

    #
    # Get and filter all entries from the input dir.
    # opendir() does not die on failure, so check its return value.
    #
    my $dir_handle;
    unless (opendir($dir_handle, $indir)) {
        $logger->fatal("Cannot read files from input directory $indir: $!");
        exit(1);
    }
    # NOTE(review): $extension is interpolated as an unanchored regex, so
    # 'fa' also matches e.g. 'x.fasta' and 'x.fa.bak'. Kept as is because
    # callers may rely on it - confirm whether it should be anchored/quoted.
    my @files = grep {
        /.+\.$extension/i
            and not /^\./
            # Exact name comparison: only the outdir itself is excluded,
            # not every entry that merely contains its name.
            and not (defined($outdir_rel) and $_ eq $outdir_rel)
    } readdir($dir_handle);
    closedir($dir_handle);

    return(\@files);
}
163892325845 Initial commit.
galaxyp
parents:
diff changeset
292
163892325845 Initial commit.
galaxyp
parents:
diff changeset
293 sub _ConvertFastaHeaders {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
294
163892325845 Initial commit.
galaxyp
parents:
diff changeset
295 $logger->debug('_ConvertFastaHeaders sub');
163892325845 Initial commit.
galaxyp
parents:
diff changeset
296
163892325845 Initial commit.
galaxyp
parents:
diff changeset
297 my ($pathfrom, $pathto, $action, $x_fixes_array, $new_x_fix, $position, $ids_to_delete, $new_id_order) = @_;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
298
163892325845 Initial commit.
galaxyp
parents:
diff changeset
299 my $header_count = 0;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
300
163892325845 Initial commit.
galaxyp
parents:
diff changeset
301 #local($/) = "\n\n"; # set line separator to a blank line
163892325845 Initial commit.
galaxyp
parents:
diff changeset
302 open(READ,"<$pathfrom") or die "\tcan't open input file $pathfrom: $!";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
303 open(SAVE,">$pathto") or die "\tcan't open output file $pathto: $!";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
304 while (my $line = <READ>) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
305
163892325845 Initial commit.
galaxyp
parents:
diff changeset
306 my $new_line;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
307
163892325845 Initial commit.
galaxyp
parents:
diff changeset
308 if ($line =~ /^>/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
309
163892325845 Initial commit.
galaxyp
parents:
diff changeset
310 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
311 # It's a header line.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
312 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
313 $header_count++;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
314 my $ids_string;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
315 my $description;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
316 my $line_end;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
317
163892325845 Initial commit.
galaxyp
parents:
diff changeset
318 if ($line =~ /^>([^\s]+)\s+(.+)([\n\r\f]+)/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
319
163892325845 Initial commit.
galaxyp
parents:
diff changeset
320 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
321 # Header with description
163892325845 Initial commit.
galaxyp
parents:
diff changeset
322 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
323 $ids_string = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
324 $description = $2;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
325 $line_end = $3;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
326
163892325845 Initial commit.
galaxyp
parents:
diff changeset
327 } elsif ($line =~ /^>([^\s]+)\s*([\n\r\f]+)/i) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
328
163892325845 Initial commit.
galaxyp
parents:
diff changeset
329 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
330 # Header without description
163892325845 Initial commit.
galaxyp
parents:
diff changeset
331 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
332 $ids_string = $1;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
333 $line_end = $2;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
334
163892325845 Initial commit.
galaxyp
parents:
diff changeset
335 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
336
163892325845 Initial commit.
galaxyp
parents:
diff changeset
337 $logger->fatal("Malformed header line. Cannot find ID.");
163892325845 Initial commit.
galaxyp
parents:
diff changeset
338 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
339
163892325845 Initial commit.
galaxyp
parents:
diff changeset
340 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
341
163892325845 Initial commit.
galaxyp
parents:
diff changeset
342 my @ids = split(/\|/, $ids_string);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
343
163892325845 Initial commit.
galaxyp
parents:
diff changeset
344 if ($action eq 'strip') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
345
163892325845 Initial commit.
galaxyp
parents:
diff changeset
346 $new_line = _StripFix($x_fixes_array, $ids_string, $description);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
347
163892325845 Initial commit.
galaxyp
parents:
diff changeset
348 } elsif ($action eq 'replace') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
349
163892325845 Initial commit.
galaxyp
parents:
diff changeset
350 $new_line = _ReplaceFix($x_fixes_array, $new_x_fix, $position, \@ids, $description);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
351
163892325845 Initial commit.
galaxyp
parents:
diff changeset
352 } elsif ($action eq 'add') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
353
163892325845 Initial commit.
galaxyp
parents:
diff changeset
354 $new_line = _AddFix($x_fixes_array, $position, \@ids, $description);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
355
163892325845 Initial commit.
galaxyp
parents:
diff changeset
356 } elsif ($action eq 'delete') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
357
163892325845 Initial commit.
galaxyp
parents:
diff changeset
358 $new_line = _DeleteID($ids_to_delete, \@ids, $description);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
359
163892325845 Initial commit.
galaxyp
parents:
diff changeset
360 } elsif ($action eq 'shuffle') {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
361
163892325845 Initial commit.
galaxyp
parents:
diff changeset
362 $new_line = _ShuffleID($new_id_order, \@ids, $description);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
363
163892325845 Initial commit.
galaxyp
parents:
diff changeset
364 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
365
163892325845 Initial commit.
galaxyp
parents:
diff changeset
366 unless (defined($new_line)) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
367
163892325845 Initial commit.
galaxyp
parents:
diff changeset
368 $logger->fatal('Cannot convert header number: ' . $header_count);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
369 $logger->fatal('Offending header line was: ' . $line);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
370 exit;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
371
163892325845 Initial commit.
galaxyp
parents:
diff changeset
372 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
373
163892325845 Initial commit.
galaxyp
parents:
diff changeset
374 $new_line .= $line_end;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
375
163892325845 Initial commit.
galaxyp
parents:
diff changeset
376 } elsif ($line =~ /^[\n\r\f]+$/) {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
377
163892325845 Initial commit.
galaxyp
parents:
diff changeset
378 # Skip blank line.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
379
163892325845 Initial commit.
galaxyp
parents:
diff changeset
380 } else {
163892325845 Initial commit.
galaxyp
parents:
diff changeset
381
163892325845 Initial commit.
galaxyp
parents:
diff changeset
382 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
383 # It must be a sequence line.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
384 #
163892325845 Initial commit.
galaxyp
parents:
diff changeset
385 $new_line = $line;
163892325845 Initial commit.
galaxyp
parents:
diff changeset
386
163892325845 Initial commit.
galaxyp
parents:
diff changeset
387 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
388
163892325845 Initial commit.
galaxyp
parents:
diff changeset
389 # Save (modified) line.
163892325845 Initial commit.
galaxyp
parents:
diff changeset
390 print SAVE $new_line or die "\tcan't save output to file $pathto: $!";
163892325845 Initial commit.
galaxyp
parents:
diff changeset
391
163892325845 Initial commit.
galaxyp
parents:
diff changeset
392 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
393
163892325845 Initial commit.
galaxyp
parents:
diff changeset
394 close(READ);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
395 close(SAVE);
163892325845 Initial commit.
galaxyp
parents:
diff changeset
396
163892325845 Initial commit.
galaxyp
parents:
diff changeset
397 }
163892325845 Initial commit.
galaxyp
parents:
diff changeset
398
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# Removes prefixes or suffixes from the ID part of a FASTA header.
#
# Arguments:
#   $x_fixes_array - reference to the list of *fixes to remove.
#   $ids_string    - the (pipe separated) IDs of the header as a single string.
#   $description   - optional description; appended after a single space when defined.
#
# Returns the new header line (starting with '>', without line end).
#
sub _StripFix {

    my ($x_fixes_array, $ids_string, $description) = @_;

    # Every occurrence of a *fix is removed, wherever it is located in the IDs string.
    # NOTE(review): the *fix is interpolated into the pattern unescaped, so regex meta
    # characters in a *fix act as regex syntax - confirm the caller relies on / guards this.
    for my $pattern (@{$x_fixes_array}) {
        $ids_string =~ s/$pattern//g;
    }

    my $header = '>' . $ids_string;
    $header .= ' ' . $description if (defined($description));

    return($header);

}
163892325845 Initial commit.
galaxyp
parents:
diff changeset
419
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# Replaces known prefixes or suffixes of the IDs in a header with a new *fix.
#
# Arguments:
#   $x_fixes_array - reference to the list of *fixes to look for.
#   $new_x_fix     - the *fix attached to every ID from which an old *fix was removed.
#   $position      - 'prefix' or 'suffix' to replace in place,
#                    'pre2suf' to remove a prefix and attach a suffix,
#                    'suf2pre' to remove a suffix and attach a prefix.
#   $ids           - reference to the list of IDs of the header.
#   $description   - optional description; appended after a single space when defined.
#
# Returns the new header line (starting with '>', IDs joined with pipes, without line end).
# Logs a fatal message and exits when an ID has to be processed with any other position.
#
sub _ReplaceFix {

    my ($x_fixes_array, $new_x_fix, $position, $ids, $description) = @_;

    # Per position: [side the old *fix is removed from, side the new *fix is attached to].
    my %sides_for = (
        'prefix'  => ['front', 'front'],
        'pre2suf' => ['front', 'back'],
        'suffix'  => ['back',  'back'],
        'suf2pre' => ['back',  'front'],
    );

    my @converted_ids;

    foreach my $original_id (@{$ids}) {

        # The position is only checked once there is an ID to process.
        my $sides = $sides_for{$position};
        unless (defined($sides)) {
            $logger->fatal("Illegal or no position $position specified.");
            exit;
        }
        my ($remove_from, $attach_to) = @{$sides};

        my $remainder = $original_id;
        my $replaced  = 0;

        # All *fixes are tried in order; each one that matches is removed from what is
        # left of the ID after the previous removals.
        # NOTE(review): the *fix is interpolated into the pattern unescaped, so regex meta
        # characters in a *fix act as regex syntax - confirm the caller relies on / guards this.
        foreach my $x_fix (@{$x_fixes_array}) {

            my @captured = ($remove_from eq 'front')
                ? ($remainder =~ m/^$x_fix(.+)/)
                : ($remainder =~ m/(.+)$x_fix$/);
            next unless (@captured);

            # First capture group, same value as $1 of the match.
            $remainder = $captured[0];
            $replaced  = 1;

        }

        if (!$replaced) {
            # Copy the ID unmodified to the result.
            push(@converted_ids, $original_id);
        } elsif ($attach_to eq 'front') {
            push(@converted_ids, $new_x_fix . $remainder);
        } else {
            push(@converted_ids, $remainder . $new_x_fix);
        }

    }

    my $new_line = '>' . join('|', @converted_ids);
    $new_line .= ' ' . $description if (defined($description));

    return($new_line);

}
163892325845 Initial commit.
galaxyp
parents:
diff changeset
497
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# Adds a prefix or suffix to each ID of a header.
#
# The *fixes are paired with the IDs by rank: the first *fix goes with the first ID, etc.
#
# Arguments:
#   $x_fixes_array - reference to the list of *fixes to add; must contain as many items as $ids.
#   $position      - 'prefix' to put the *fix in front of the ID or 'suffix' to put it behind the ID.
#   $ids           - reference to the list of IDs of the header.
#   $description   - optional description; appended after a single space when defined.
#
# Returns the new header line (starting with '>', IDs joined with pipes, without line end).
# Returns undef after logging a fatal message when the amount of *fixes does not match
# the amount of IDs or when the position is neither 'prefix' nor 'suffix'.
#
sub _AddFix {

    my ($x_fixes_array, $position, $ids, $description) = @_;

    my $id_count    = scalar(@{$ids});
    my $x_fix_count = scalar(@{$x_fixes_array});

    unless ($id_count == $x_fix_count) {
        $logger->fatal('Amount of pre- or suffixes specified (' . $x_fix_count . ') does not match with amount of IDs found (' . $id_count . ').');
        return(undef);
    }

    # Any other position used to produce a header from which all IDs were silently dropped;
    # report it instead, so the caller can abort on the undefined result.
    unless (defined($position) && ($position eq 'prefix' || $position eq 'suffix')) {
        my $shown_position = defined($position) ? $position : '';
        $logger->fatal('Illegal or no position ' . $shown_position . ' specified.');
        return(undef);
    }

    my @fixed_ids;

    for my $count (0 .. $#{$ids}) {

        if ($position eq 'prefix') {
            push(@fixed_ids, ${$x_fixes_array}[$count] . ${$ids}[$count]);
        } else {
            push(@fixed_ids, ${$ids}[$count] . ${$x_fixes_array}[$count]);
        }

    }

    my $new_line = '>' . join('|', @fixed_ids);
    if (defined($description)) {
        $new_line .= ' ' . $description;
    }

    return($new_line);

}
163892325845 Initial commit.
galaxyp
parents:
diff changeset
532
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# Removes IDs from a header based on their rank.
#
# Arguments:
#   $ids_to_delete - reference to a hash; an ID is dropped when its rank (1 for the
#                    first ID, 2 for the second, etc.) is a key with a defined value.
#   $ids           - reference to the list of IDs of the header.
#   $description   - optional description; appended after a single space when defined.
#
# Returns the new header line (starting with '>', remaining IDs joined with pipes, without line end).
#
sub _DeleteID {

    my ($ids_to_delete, $ids, $description) = @_;

    my @kept_ids;
    my $rank = 0;

    foreach my $id (@{$ids}) {

        # Ranks are 1-based whereas the list of IDs is 0-based.
        $rank++;

        if (defined($ids_to_delete->{$rank})) {
            # Skip (drop) this ID.
            $logger->debug('Dropping ' . $id . ' as it is ID number ' . $rank . '.');
            next;
        }

        push(@kept_ids, $id);

    }

    my $new_line = '>' . join('|', @kept_ids);
    $new_line .= ' ' . $description if (defined($description));

    return($new_line);

}
163892325845 Initial commit.
galaxyp
parents:
diff changeset
564
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# Changes the order of the IDs in a header.
#
# Arguments:
#   $new_id_order - reference to the list of ranks (1 for the first ID, 2 for the second, etc.)
#                   in the order in which the IDs must appear in the new header;
#                   must contain as many items as $ids.
#   $ids          - reference to the list of IDs of the header.
#   $description  - optional description; appended after a single space when defined.
#
# Returns the new header line (starting with '>', IDs joined with pipes, without line end).
# Returns undef after logging a fatal message when the amount of ranks does not match the
# amount of IDs or when a rank is not a whole number from 1 up to the amount of IDs.
#
sub _ShuffleID {

    my ($new_id_order, $ids, $description) = @_;

    my $id_count                = scalar(@{$ids});
    my $new_id_order_item_count = scalar(@{$new_id_order});

    unless ($id_count == $new_id_order_item_count) {
        $logger->fatal('Amount of IDs specified to re-order (' . $new_id_order_item_count . ') does not match with amount of IDs found (' . $id_count . ').');
        return(undef);
    }

    my $new_line = '>';

    foreach my $rank (@{$new_id_order}) {

        # Without this check rank 0 would silently select the last ID (array index -1)
        # and ranks beyond the last ID would add undefined IDs to the header.
        my $rank_number;
        if (defined($rank) && $rank =~ m/\A\s*\+?([0-9]+)\s*\z/) {
            $rank_number = $1;
        }
        unless (defined($rank_number) && $rank_number >= 1 && $rank_number <= $id_count) {
            my $shown_rank = defined($rank) ? $rank : '';
            $logger->fatal('Illegal ID rank ' . $shown_rank . ' specified: ranks must be whole numbers from 1 to ' . $id_count . '.');
            return(undef);
        }

        # Ranks are 1-based whereas the list of IDs is 0-based.
        my $offset = $rank_number - 1;
        $logger->debug('ID rank ' . $rank . ' = ' . ${$ids}[$offset] . '.');
        $new_line .= ${$ids}[$offset] . '|';
        $logger->debug('New header line now contains ' . $new_line . '.');

    }

    # Remove the pipe that was appended after the last ID.
    $new_line =~ s/\|$//;
    if (defined($description)) {
        $new_line .= ' ' . $description;
    }

    return($new_line);

}
163892325845 Initial commit.
galaxyp
parents:
diff changeset
597
163892325845 Initial commit.
galaxyp
parents:
diff changeset
#
# Prints the help text (available options and examples) to the currently
# selected output handle and then exits the program.
#
sub _Usage {

    # One item per output line; every item carries its own line end.
    my @usage_lines = (
        "\n",
        "ConvertFastaHeaders.pl - Converts sequence headers of FASTA files.\n",
        "\n",
        "Usage:\n",
        "\n",
        " ConvertFastaHeaders.pl options\n",
        "\n",
        "Available options are:\n",
        "\n",
        " -i [dir/file] Input can be a single FASTA file or a directory containing FASTA files.\n",
        " -e [ext] File name extension for the FASTA files in case the input is a directory. (default = fa)\n",
        " -o [dir/file] Output file or directory where the result(s) will be saved.\n",
        " -a [action] Action must be one of 'add', 'strip', 'replace', 'delete' or 'shuffle'.\n",
        " The actions 'delete' and 'shuffle' operate on complete sequence IDs with or without (database namespace) prefixes or suffixes.\n",
        " The actions 'add', 'strip' and 'replace' operate on sequence ID prefixes or suffixes.\n",
        " Note in case *fixes are added the order of the *fixes is important! (See below for examples.)\n",
        " -p [position] Positon must be a comma separated list of numbers in case the action is 'delete' or 'shuffle'.\n",
        " Position must be one of 'prefix' or 'suffix' when the action is 'add' or 'strip'.\n",
        " In case the action is 'replace' the position can also be one of pre2suf or suf2pre \n",
        " to replace a prefix with a suffix or vice versa.\n",
        " -f '[*fix1 *fix2 *fixN]' Space separated list of prefixes or suffixes, which will be replaced in, added to or removed from pipe separated identifiers.\n",
        " Note that in case of database namespace prefixes you must specify both the database name space and \n",
        " the character to separate the namespace from the accession number as the prefix. (See below for examples.) \n",
        " -n '[*fix]' A single new prefix or suffix to replace the *fixes specified with -f.\n",
        " (Only required in case the action is 'replace'.)\n",
        " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n",
        "\n",
        "Examples:\n",
        "\n",
        " Adding prefixes\n",
        " In this case the order of the *fixes specified with -f is important!\n",
        " With -a add -p prefix -f 'UniProtAcc: UniProtID:', this header:\n",
        " >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " will be converted into:\n",
        " >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " Stripping prefixes\n",
        " In this case the order of the *fixes specified with -f is not relevant.\n",
        " With both -a strip -p prefix -f 'UniProtAcc: UniProtID:' or \n",
        " with -a strip -p prefix -f 'UniProtID: UniProtAcc:', this header:\n",
        " >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " will be converted into:\n",
        " >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " Replacing prefixes with a suffix\n",
        " In this case the order of the *fixes specified with -f is not relevant.\n",
        " With -a replace -p pre2suf -f 'REV_' -n '_REV', this header:\n",
        " >REV_P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " will be converted into:\n",
        " >P32234_REV|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " Deleting sequence identifiers\n",
        " Supply a comma separated list of numbers for the ranks of the identifiers / accession numbers you want to remove.\n",
        " Multiple identifiers must be separated with a pipe symbol.\n",
        " With -a delete -p '1,3', this header:\n",
        " >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " will be converted into:\n",
        " >UniProtAcc:P32234 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " Changing the order of sequence identifiers\n",
        " Supply a comma separated list of numbers for the new order of all the identifiers / accession numbers in a header.\n",
        " Multiple identifiers must be separated with a pipe symbol.\n",
        " Hence if your headers contain 4 pipe separated IDs and you only want to swap the order of the first and the second, \n",
        " you will still need to specify the new (unchanged) order for number 3 and 4 too.\n",
        " With -a shuffle -p '2,1,3', this header:\n",
        " >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " will be converted into:\n",
        " >UniProtAcc:P32234|UniProtID:128UP_DROME|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n",
        " Specifying only *2,1* as the New order for the IDs will not work, because this header contains 3 IDs, \n",
        " so you'll have to include the (new) position for the third one as well.\n",
        "\n",
    );

    # One print call per line, each with a single string argument.
    foreach my $usage_line (@usage_lines) {
        print $usage_line;
    }

    exit;

}