6
|
1 #! /usr/bin/env perl
|
|
2
|
|
3 # reformat.pl
|
|
4 # Reformat a multiple alignment file
|
|
5 #
|
|
6 # HHsuite version 2.0.16 (January 2013)
|
|
7 #
|
|
8 # Reference:
|
|
9 # Remmert M., Biegert A., Hauser A., and Soding J.
|
|
10 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
|
|
11 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
|
|
12
|
|
13 # (C) Johannes Soeding, 2012
|
|
14
|
|
15 # This program is free software: you can redistribute it and/or modify
|
|
16 # it under the terms of the GNU General Public License as published by
|
|
17 # the Free Software Foundation, either version 3 of the License, or
|
|
18 # (at your option) any later version.
|
|
19
|
|
20 # This program is distributed in the hope that it will be useful,
|
|
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
23 # GNU General Public License for more details.
|
|
24
|
|
25 # You should have received a copy of the GNU General Public License
|
|
26 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
27
|
|
28 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
|
|
29
|
|
30 use lib $ENV{"HHLIB"}."/scripts";
|
|
31 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
|
|
32 use strict;
|
|
33 use warnings;
|
|
34 my $numres=100; # number of residues per line
|
|
35 my $desclen=1000; # maximum number of characters in nameline
|
|
36 my $ARGC=scalar(@ARGV);
|
|
37 if ($ARGC<2) {
|
|
38 die("
|
|
39 reformat.pl from HHsuite $VERSION
|
|
40 Read a multiple alignment in one format and write it in another format
|
|
41 Usage: reformat.pl [informat] [outformat] infile outfile [options]
|
|
42 or reformat.pl [informat] [outformat] 'fileglob' .ext [options]
|
|
43
|
|
44 Available input formats:
|
|
45 fas: aligned fasta; lower and upper case equivalent, '.' and '-' equivalent
|
|
46 a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-',
|
|
47 gaps aligned to inserts: '.'
|
|
48 a3m: like a2m, but gaps aligned to inserts MAY be omitted
|
|
49 sto: Stockholm format; sequences in several blocks with sequence name at
|
|
50 beginning of line (hmmer output)
|
|
51 psi: format as read by PSI-BLAST using the -B option (like sto with -M first -r)
|
|
52 clu: Clustal format; sequences in several blocks with sequence name at beginning
|
|
53 of line
|
|
54 Available output formats:
|
|
55 fas: aligned fasta; all gaps '-'
|
|
56 a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps
|
|
57 aligned to inserts: '.'
|
|
58 a3m: like a2m, but gaps aligned to inserts are omitted
|
|
59 sto: Stockholm format; sequences in just one block, one line per sequence
|
|
60 psi: format as read by PSI-BLAST using the -B option
|
|
61 clu: Clustal format
|
|
62 If no input or output format is given the file extension is interpreted as format
|
|
63 specification ('aln' as 'clu')
|
|
64
|
|
65 Options:
|
|
66 -v int verbose mode (0:off, 1:on)
|
|
67 -num add number prefix to sequence names: 'name', '1:name' '2:name' etc
|
|
68 -noss remove secondary structure sequences (beginning with >ss_)
|
|
69 -sa do not remove solvent accessibility sequences (beginning with >sa_)
|
|
70 -M first make all columns with residue in first seuqence match columns
|
|
71 (default for output format a2m or a3m)
|
|
72 -M int make all columns with less than X% gaps match columns
|
|
73 (for output format a2m or a3m)
|
|
74 -r remove all lower case residues (insert states)
|
|
75 (AFTER -M option has been processed)
|
|
76 -r int remove all lower case columns with more than X% gaps
|
|
77 -g '' suppress all gaps
|
|
78 -g '-' write all gaps as '-'
|
|
79 -uc write all residues in upper case (AFTER all other options have been processed)
|
|
80 -lc write all residues in lower case (AFTER all other options have been processed)
|
|
81 -l number of residues per line (for Clustal, FASTA, A2M, A3M formats)
|
|
82 (default=$numres)
|
|
83 -d maximum number of characers in nameline (default=$desclen)
|
|
84
|
|
85 Examples: reformat.pl 1hjra.a3m 1hjra.a2m
|
|
86 (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m)
|
|
87 reformat.pl test.a3m test.fas -num -r 90
|
|
88 reformat.pl fas sto '*.fasta' .stockholm
|
|
89 \n");
|
|
90 # clu: clustal format (hmmer output)
|
|
91 # sel: Selex format
|
|
92 # phy: Phylip format
|
|
93 }
|
|
94
|
|
95 my $informat="";
|
|
96 my $outformat="";
|
|
97 my $infile="";
|
|
98 my $outfile="";
|
|
99 my $num=0; # don't use sequence number as prefix: '>n|name'
|
|
100 my $noss=0; # don't remove secondary structure
|
|
101 my $nosa=1; # remove solvent accessibility sequences
|
|
102 my $line;
|
|
103 my $options="";
|
|
104 my @names; # names of sequences read in
|
|
105 my @seqs; # residues of sequences read in
|
|
106 my $n; # number of sequences read in
|
|
107 my $k; # counts sequences for output
|
|
108 my $remove_inserts=0;
|
|
109 my $remove_gapped=0;
|
|
110 my $matchmode=""; # do not change capitalization
|
|
111 my $match_gaprule=0; # columns with less than this percentage of gaps will be match columns
|
|
112 my $v=2;
|
|
113 my $update=0;
|
|
114 my $nss=-1; # index of secondary structure sequence
|
|
115 my $lname; # length of sequence name in clustal, stockholm, psi format
|
|
116 my $titleline; # first line beginning with "#" in A3M, A2M, or FASTA files
|
|
117
|
|
118 my @informats= ("fas","a2m","a3m","sto","psi","clu");
|
|
119 my @outformats= ("fas","a2m","a3m","sto","psi","clu","ufas");
|
|
120 my $found;
|
|
121 my $element;
|
|
122 my $gap="default";
|
|
123 my $case="default";
|
|
124
|
|
125 # Process optionsz
|
|
126 for (my $i=0; $i<$ARGC; $i++) {$options.=" $ARGV[$i] ";}
|
|
127 if ($options=~s/ -i\s+(\S+) / /) {$infile=$1;}
|
|
128 if ($options=~s/ -o\s+(\S+) / /) {$outfile=$1;}
|
|
129 if ($options=~s/ -num / /) {$num=1; $desclen=505;}
|
|
130 if ($options=~s/ -noss / /) {$noss=1;}
|
|
131 if ($options=~s/ -sa / /) {$nosa=0;}
|
|
132 if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;}
|
|
133 if ($options=~s/ -r\s+(\d+) / /) {$remove_gapped=$1;}
|
|
134 if ($options=~s/ -r / /) {$remove_inserts=1;}
|
|
135 if ($options=~s/ -lc / /) {$case="lc";}
|
|
136 if ($options=~s/ -uc / /) {$case="uc";}
|
|
137 if ($options=~s/ -v\s*(\d+) / /) {$v=$1;}
|
|
138 if ($options=~s/ -v / /) {$v=2;}
|
|
139 if ($options=~s/ -M\s+(\d+) / /) {$matchmode="gaprule"; $match_gaprule=$1;}
|
|
140 if ($options=~s/ -M\s+first / /) {$matchmode="first"; $match_gaprule=$1;}
|
|
141 if ($options=~s/ -u / /) {$update=1;}
|
|
142 if ($options=~s/ -l\s+(\S+) / /) {$numres=$1;}
|
|
143 if ($options=~s/ -lname\s+(\S+) / /) {$lname=$1;}
|
|
144 if ($options=~s/ -d\s+(\S+) / /) {$desclen=$1;}
|
|
145
|
|
146 # Assign informat, outformat, infile, and outfile
|
|
147 if ($outfile eq "") {
|
|
148 if ($options=~s/(\S+)\s*$//) {
|
|
149 $outfile=$1;
|
|
150 } else {
|
|
151 die("Error: no output file given: '$options'\n");
|
|
152 }
|
|
153 }
|
|
154 if ($infile eq "") {
|
|
155 if ($options=~s/(\S+)\s*$//) {
|
|
156 $infile=$1;
|
|
157 } else {
|
|
158 die("Error: no input file given: '$options'\n");
|
|
159 }
|
|
160 }
|
|
161 if ($options=~s/(\S+)\s*$//) {
|
|
162 $outformat=$1;
|
|
163 } else {
|
|
164 if ($outfile=~/\S*\.(\S+?)$/) {
|
|
165 $outformat=lc($1);
|
|
166 if ($outformat eq "aln") {$outformat="clu";}
|
|
167 elsif ($outformat eq "fa") {$outformat="fas";}
|
|
168 elsif ($outformat eq "fasta") {$outformat="fas";}
|
|
169 elsif ($outformat eq "afa") {$outformat="fas";}
|
|
170 elsif ($outformat eq "afas") {$outformat="fas";}
|
|
171 elsif ($outformat eq "afasta") {$outformat="fas";}
|
|
172 } else {
|
|
173 print ("Using FASTA output format: '$options'\n"); $outformat="fas";
|
|
174 }
|
|
175 }
|
|
176 if ($options=~s/(\S+)\s*$//) {
|
|
177 $informat=$1;
|
|
178 } else {
|
|
179 if ($infile=~/\S*\.(\S+?)$/) {
|
|
180 $informat=lc($1);
|
|
181 if ($informat eq "aln") {$informat="clu";}
|
|
182 elsif ($informat eq "fa") {$informat="fas";}
|
|
183 elsif ($informat eq "fasta") {$informat="fas";}
|
|
184 } else {
|
|
185 print ("Using FASTA input format: '$options'\n"); $informat="fas";
|
|
186 }
|
|
187 }
|
|
188
|
|
189
|
|
190 # Warn if unknown options found
|
|
191 if ($options!~/^\s*$/) {
|
|
192 $options=~s/^\s*(.*?)\s*$/$1/g;
|
|
193 print("\nWARNING: unknown options '$options'\n");
|
|
194 }
|
|
195
|
|
196 # Check if input and output formats are valid
|
|
197 $found=0;
|
|
198 foreach $element (@informats) {if ($informat eq $element) {$found=1; last;}}
|
|
199 if(!$found) {die("\nError: $informat is not a valid input format option\n");}
|
|
200 $found=0;
|
|
201 foreach $element (@outformats) {if ($outformat eq $element) {$found=1; last;}}
|
|
202 if(!$found) {die("\nError: $outformat is not a valid output format option\n");}
|
|
203
|
|
204 #if($outformat eq "psi") {
|
|
205 # $remove_inserts=1;
|
|
206 #}
|
|
207 if($outformat eq "ufas") {$gap="";}
|
|
208
|
|
209
|
|
210 if ($infile=~/\*/ || $outfile=~/^\./) # if infile contains '*' or outfile is just an extension
|
|
211 {
|
|
212 $outfile=~/.*\.(\S*)$/;
|
|
213 my $outext=$1;
|
|
214 my @infiles=glob($infile);
|
|
215 printf("%i files to reformat\n",scalar(@infiles));
|
|
216 foreach $infile (@infiles)
|
|
217 {
|
|
218 if ($infile!~/(\S+)\.\S+/) {$infile=~/(\S+)/}
|
|
219 $outfile="$1.$outext";
|
|
220 if ($update && -e $outfile) {next;}
|
|
221 if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
|
|
222 &reformat($infile,$outfile);
|
|
223 }
|
|
224 exit 0;
|
|
225 }
|
|
226 else
|
|
227 {
|
|
228 if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
|
|
229 &reformat($infile,$outfile);
|
|
230 exit 0;
|
|
231 }
|
|
232
|
|
233
|
|
234 ################################################################################################
|
|
235 # Reformat a single file
|
|
236 ################################################################################################
|
|
237 sub reformat()
|
|
238 {
|
|
239 $infile=$_[0];
|
|
240 $nss=-1;
|
|
241 $titleline="";
|
|
242
|
|
243 ################################################################################################
|
|
244 # Input part
|
|
245 ################################################################################################
|
|
246
|
|
247 my $skip=0;
|
|
248 open (INFILE,"<$infile") or die ("ERROR: cannot open $infile: $!\n");
|
|
249
|
|
250 # Read a2m, a3m, fas format
|
|
251 if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m")
|
|
252 {
|
|
253 $/=">"; # set input field seperator
|
|
254 $n=0;
|
|
255 my $seq=<INFILE>;
|
|
256 if ($seq=~s/^(\#.*)//) {$titleline=$1;} # remove commentary lines at beginning of file
|
|
257 $seq=~s/(\n\#.*)*\n//; # remove commentary lines at beginning of file
|
|
258
|
|
259 # If first line has no sequence name, use >root_name
|
|
260 if ($seq ne ">") {
|
|
261 $infile="/$infile."; # determine root name 1
|
|
262 $infile=~/^.*\/(.*?)\..*/; # determine root name 2
|
|
263 $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//;
|
|
264 $seq=~tr/\n //d; # remove newlines and blanks
|
|
265 $seqs[$n]=$seq;
|
|
266 $n++;
|
|
267 }
|
|
268
|
|
269 while ($seq=<INFILE>) {
|
|
270 $seq=~s/\n\#.*//g; # remove commentary lines
|
|
271 while ($seq=~s/(.)>/$1/) {$seq.=<INFILE>;} # in the case that nameline contains a '>'; '.' matches anything except '\n'
|
|
272 if ($seq=~/^aa_/) {next;}
|
|
273 if ($seq=~/^sa_/ && $nosa) {next;}
|
|
274 if ($seq=~/^ss_/) {
|
|
275 if ($noss) {next;} # do not read in >ss_ and >sa_ sequences
|
|
276 # if ($seq=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
|
|
277 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
|
|
278 $nss=$n;
|
|
279 }
|
|
280 $seq=~s/^\s*(.*)//; # divide into nameline and residues; '.' matches anything except '\n'
|
|
281 if (defined $1 && $1 ne "") {
|
|
282 $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//;
|
|
283 } else {
|
|
284 $names[$n]=$n;
|
|
285 }
|
|
286 $seqs[$n]=$seq;
|
|
287 $n++;
|
|
288 }
|
|
289
|
|
290 $/="\n"; # reset input field seperator
|
|
291 }
|
|
292
|
|
293 # Read Stockholm format
|
|
294 elsif ($informat eq "sto")
|
|
295 {
|
|
296 my %seqhash;
|
|
297 my $name;
|
|
298 my $first_block=1;
|
|
299
|
|
300 $n=0;
|
|
301 while ($line = <INFILE>)
|
|
302 {
|
|
303 $line=~tr/\r//d;
|
|
304 $line=~s/\s+/ /g;
|
|
305 if ($line=~s/^\#=GC SS_cons/ss_dssp/) {} # skip commentary and blank line
|
|
306 if ($line=~/^\#/) {next;} # skip commentary and blank line
|
|
307 if ($line=~/^\/\//) {last;} # reached end of file
|
|
308 if ($line=~/^\s*$/){$first_block=0; next;} # new sequence block starts
|
|
309 if ($line!~/^\s*(\S+)\s+(\S+)/) {
|
|
310 die ("\nERROR found in stockholm format: $!");
|
|
311 }
|
|
312 if (!(exists $seqhash{$1}))
|
|
313 {
|
|
314 if ($line=~/^aa_/) {next;} # do not read in >aa_ sequences
|
|
315 if ($line=~/^sa_/ && $nosa) {next;} # do not read in >sa_ sequences
|
|
316 if ($line=~/^ss_/) {
|
|
317 if ($noss) {next;} # do not read in >ss_ and >aa_ sequences
|
|
318 # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
|
|
319 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
|
|
320 $nss=$n;
|
|
321 }
|
|
322 $line=~/^\s*(\S+)\s+(\S+)/;
|
|
323 $names[$n]=$1;
|
|
324 $seqs[$n]=$2;
|
|
325 $seqhash{$1}=$n++;
|
|
326 $first_block=1;
|
|
327 }
|
|
328 else
|
|
329 {
|
|
330 if ($first_block) {die ("\nERROR: sequence $1 appears more than once per block\n");}
|
|
331 $seqs[$seqhash{$1}].=$2;
|
|
332 }
|
|
333 # printf("%3i %s\n",$n-1,$seqs[$n-1]);
|
|
334 }
|
|
335 }
|
|
336
|
|
337 elsif ($informat eq "clu")
|
|
338 {
|
|
339 my $residues_per_line=50; # default number of characters for namefield residues
|
|
340 # (only needed if no gap between name and residues in first sequence -> SMART)
|
|
341 my $block=1; # number of block currently read
|
|
342 my $name;
|
|
343 my $residues;
|
|
344 $n=0; # sequence number of first block
|
|
345 $k=0; # sequence index to zero for first block
|
|
346
|
|
347 while ($line = <INFILE>)
|
|
348 {
|
|
349 # print($namefieldlen.$line);
|
|
350 $line=~tr/\r//d;
|
|
351 if ($line=~/CLUSTAL/i) {next;}
|
|
352 if ($line=~/^\#/) {next;} # skip commentary and blank line
|
|
353 if ($line=~/^\/\//) {last;} # reached end of file
|
|
354 if ($line=~/^\s*$/){ # new sequence block starts
|
|
355 if ($k) {
|
|
356 if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
|
|
357 $block++;
|
|
358 $n=$k;
|
|
359 $k=0; # set sequence index to zero for next block to read
|
|
360 }
|
|
361 next;
|
|
362 }
|
|
363 if ($line!~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/) {
|
|
364 if ($line=~/^[*.: ]*$/) {next;}
|
|
365 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
|
|
366 chomp($line);
|
|
367 if ($line!~/^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(\s+\d+)?$/) {
|
|
368 die ("\nError found in Clustal format in $infile, line $.: '$line'\n");
|
|
369 }
|
|
370 $name=$1;
|
|
371 $residues=$2;
|
|
372 print("WARNING: Found no space between name and residues in $infile, line $.: '$line'\n");
|
|
373 } else {
|
|
374 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
|
|
375 if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences
|
|
376 if ($line=~/^ss_/) {
|
|
377 # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
|
|
378 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
|
|
379 $nss=$n;
|
|
380 }
|
|
381 $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
|
|
382 $name=$1;
|
|
383 $residues=$2;
|
|
384 $residues=~tr/ //d;
|
|
385 $residues_per_line=length($residues);
|
|
386 }
|
|
387 if ($block==1) {
|
|
388 $names[$k]=$name;
|
|
389 $seqs[$k]=$residues;
|
|
390 } else {
|
|
391 $seqs[$k].=$residues;
|
|
392 if ($names[$k] ne $name) {
|
|
393 print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
|
|
394 }
|
|
395 }
|
|
396 # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
|
|
397 $k++;
|
|
398 }
|
|
399 if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
|
|
400 if (!$n) {$n=$k;}
|
|
401 }
|
|
402
|
|
403 # Read psi format
|
|
404 elsif ($informat eq "psi")
|
|
405 {
|
|
406 my $block=1; # number of block currently read
|
|
407 my $name;
|
|
408 my $residues;
|
|
409 $n=0; # sequence number of first block
|
|
410 $k=0; # sequence index to zero for first block
|
|
411
|
|
412 while ($line = <INFILE>)
|
|
413 {
|
|
414 # print($namefieldlen.$line);
|
|
415 $line=~tr/\r//d;
|
|
416 if ($line=~/^\s*$/){ # new sequence block starts
|
|
417 if ($k) {
|
|
418 if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
|
|
419 $block++;
|
|
420 $n=$k;
|
|
421 $k=0; # set sequence index to zero for next block to read
|
|
422 }
|
|
423 next;
|
|
424 }
|
|
425
|
|
426 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
|
|
427 if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences
|
|
428 if ($line=~/^ss_/) {
|
|
429 # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
|
|
430 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
|
|
431 $nss=$n;
|
|
432 }
|
|
433 $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
|
|
434 $name=$1;
|
|
435 $residues=$2;
|
|
436 $residues=~tr/ //d;
|
|
437
|
|
438 if ($block==1) {
|
|
439 $names[$k]=$name;
|
|
440 $seqs[$k]=$residues;
|
|
441 } else {
|
|
442 $seqs[$k].=$residues;
|
|
443 if ($names[$k] ne $name) {
|
|
444 print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
|
|
445 }
|
|
446 }
|
|
447 # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
|
|
448 $k++;
|
|
449 }
|
|
450 if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
|
|
451 if (!$n) {$n=$k;}
|
|
452 }
|
|
453
|
|
454 close INFILE;
|
|
455
|
|
456
|
|
457 # Empty input file?
|
|
458 if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");}
|
|
459
|
|
460
|
|
461
|
|
462 ################################################################################################
|
|
463 # Transforming to upper-case
|
|
464 ################################################################################################
|
|
465 if ($informat ne "a3m" && $informat ne "a2m") { # Transform to upper case if input format is not A3M or A2M
|
|
466 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
|
|
467 }
|
|
468
|
|
469 ################################################################################################
|
|
470 # Removing non-alphanumeric symbols
|
|
471 ################################################################################################
|
|
472 for ($k=0; $k<$n; $k++) {
|
|
473 $seqs[$k]=~tr/A-Za-z0-9.~-//cd;
|
|
474 $seqs[$k]=~tr/~/-/;
|
|
475 }
|
|
476
|
|
477
|
|
478 ################################################################################################
|
|
479 # Filling up with gaps '.' or deleting gaps
|
|
480 ################################################################################################
|
|
481
|
|
482 # Insertion of '.' only needed for a3m if: NOT -r option OR -M first OR -M <int> option
|
|
483 if ($informat eq "a3m" && (!$remove_inserts || $matchmode))
|
|
484 {
|
|
485 print("inserting gaps...\n");
|
|
486 my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $i.
|
|
487 my $j; # counts match states
|
|
488 my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $i after the $j'th match state
|
|
489 my $insert;
|
|
490
|
|
491 # Determine length of longest insert
|
|
492 for ($k=0; $k<$n; $k++)
|
|
493 {
|
|
494 # split into list of single match states and variable-length inserts
|
|
495 # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements
|
|
496 # The '#' symbol is prepended to get rid of a perl bug in split
|
|
497 @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
|
|
498 $j=0;
|
|
499 # printf("Sequence $k: $seqs[$k]\n");
|
|
500 # printf("Sequence $k: @inserts\n");
|
|
501
|
|
502 # Does sequence $k contain insert after match state j that is longer than $ngap[$j]?
|
|
503 foreach $insert (@inserts)
|
|
504 {
|
|
505 if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {$len_ins[$j]=length($insert);}
|
|
506 $j++;
|
|
507 # printf("$insert|");
|
|
508 }
|
|
509 # printf("\n");
|
|
510 }
|
|
511 my $ngap;
|
|
512
|
|
513 # Fill up inserts with gaps '.' up to length $len_ins[$j]
|
|
514 for ($k=0; $k<$n; $k++)
|
|
515 {
|
|
516 # split into list of single match states and variable-length inserts
|
|
517 @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
|
|
518 $j=0;
|
|
519
|
|
520 # append the missing number of gaps after each match state
|
|
521 foreach $insert (@inserts)
|
|
522 {
|
|
523 for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";}
|
|
524 $j++;
|
|
525 }
|
|
526 $seqs[$k] = join("",@inserts);
|
|
527 $seqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
|
|
528 }
|
|
529 }
|
|
530
|
|
531
|
|
532 ################################################################################################
|
|
533 # Match state assignment
|
|
534 ################################################################################################
|
|
535
|
|
536 # Default match state rule for a3m is -M first
|
|
537 if ($matchmode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$matchmode="first";}
|
|
538
|
|
539 # Use gap rule for match state assignment?
|
|
540 if ($matchmode eq "gaprule") {
|
|
541
|
|
542 my @gaps=();
|
|
543 my $residues;
|
|
544 my @residues;
|
|
545
|
|
546 # Determine number of gaps per column
|
|
547 for ($k=0; $k<$n; $k++) {
|
|
548 @residues=unpack("C*",$seqs[$k]);
|
|
549 for (my $l=0; $l<@residues; $l++) {
|
|
550 if ($residues[$l]==46 || $residues[$l]==45) {
|
|
551 if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
|
|
552 }
|
|
553 }
|
|
554 }
|
|
555
|
|
556 # Set columns with more gaps than $match_gaprule to lower case,
|
|
557 for ($k=0; $k<$n; $k++) {
|
|
558 @residues=unpack("C*",$seqs[$k]);
|
|
559 $residues="";
|
|
560 for (my $l=0; $l<@residues; $l++) {
|
|
561 if (!defined $gaps[$l] || $gaps[$l]<0.01*$match_gaprule*$n) {
|
|
562 if ($residues[$l]==46) {
|
|
563 $residues .= "-";
|
|
564 } else {
|
|
565 $residues .= uc(chr($residues[$l]));
|
|
566 }
|
|
567 } else {
|
|
568 if ($residues[$l]==45) {
|
|
569 $residues .= ".";
|
|
570 } else {
|
|
571 $residues .= lc(chr($residues[$l]));
|
|
572 }
|
|
573 }
|
|
574 $seqs[$k]=$residues;
|
|
575 }
|
|
576 }
|
|
577 }
|
|
578
|
|
579 # Use first sequence for match state assignment?
|
|
580 if ($matchmode eq "first") {
|
|
581
|
|
582 my @match=();
|
|
583 my $residues;
|
|
584 my @residues;
|
|
585
|
|
586
|
|
587 # Determine which columns have a gap in first sequence
|
|
588 for ($k=0; $k<scalar(@names); $k++) { #find seed sequence
|
|
589 if ($names[$k]!~/^(ss_|aa_|sa_)/) {last;}
|
|
590 }
|
|
591 @residues=unpack("C*",$seqs[$k]);
|
|
592 for (my $l=0; $l<@residues; $l++) {
|
|
593 if ($residues[$l]==46 || $residues[$l]==45) {$match[$l]=0;} else {$match[$l]=1;}
|
|
594 }
|
|
595
|
|
596 # Set columns without residue in first sequence to upper case,
|
|
597 for ($k=0; $k<$n; $k++) {
|
|
598 @residues=unpack("C*",$seqs[$k]);
|
|
599 $residues="";
|
|
600 for (my $l=0; $l<@residues; $l++) {
|
|
601 if ($match[$l]) {
|
|
602 if ($residues[$l]==46) {
|
|
603 $residues .= "-";
|
|
604 } else {
|
|
605 $residues .= uc(chr($residues[$l]));
|
|
606 }
|
|
607 } else {
|
|
608 if ($residues[$l]==45) {
|
|
609 $residues .= ".";
|
|
610 } else {
|
|
611 $residues .= lc(chr($residues[$l]));
|
|
612 }
|
|
613 }
|
|
614 $seqs[$k]=$residues;
|
|
615 }
|
|
616 }
|
|
617 }
|
|
618
|
|
619
|
|
620 ################################################################################################
|
|
621 # Remove gaps etc.
|
|
622 ################################################################################################
|
|
623
|
|
624 # Remove columns with too many gaps?
|
|
625 if ($remove_gapped) {
|
|
626
|
|
627 my @gaps=();
|
|
628 my $residues;
|
|
629 my @residues;
|
|
630
|
|
631 # Determine number of gaps '.' or '-' per column
|
|
632 for ($k=0; $k<$n; $k++) {
|
|
633 @residues=unpack("C*",$seqs[$k]);
|
|
634 for (my $l=0; $l<@residues; $l++) {
|
|
635 if ($residues[$l]==45 || $residues[$l]==46) {
|
|
636 if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
|
|
637 }
|
|
638 }
|
|
639 }
|
|
640
|
|
641 # Remove columns with too many gaps
|
|
642 for ($k=0; $k<$n; $k++) {
|
|
643 @residues=unpack("C*",$seqs[$k]);
|
|
644 $residues="";
|
|
645 for (my $l=0; $l<@residues; $l++) {
|
|
646 if (!defined $gaps[$l] || $gaps[$l]<0.01*$remove_gapped*$n) {
|
|
647 $residues .= chr($residues[$l])
|
|
648 }
|
|
649 $seqs[$k]=$residues;
|
|
650 }
|
|
651 }
|
|
652 }
|
|
653
|
|
654 # Remove/transliterate gaps?
|
|
655 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/ //d;}
|
|
656 if ($remove_inserts) {
|
|
657 for ($k=0; $k<$n; $k++) {
|
|
658 $seqs[$k]=~tr/a-z.//d;
|
|
659 # printf("%s\n",$seqs[$k]);
|
|
660 }
|
|
661 }
|
|
662
|
|
663
|
|
664 # Remove sequences which contain only gaps
|
|
665 my $nin=$n;
|
|
666 for ($k=0; $k<$n; $k++) {
|
|
667 if (($seqs[$k]=~tr/a-zA-Z0-9/a-zA-Z0-9/==0)) {
|
|
668 if ($v>=2) {print("Sequence contains only gaps and is removed: $names[$k]\n");}
|
|
669 splice(@seqs,$k,1);
|
|
670 splice(@names,$k,1);
|
|
671 $k--; $n--;
|
|
672 }
|
|
673 }
|
|
674
|
|
675
|
|
676 # Cut down sequence names to $desclen characters maximum
|
|
677 for ($k=0; $k<$n; $k++) {$names[$k]=substr($names[$k],0,$desclen);}
|
|
678
|
|
679 if ($outformat eq "a3m") {
|
|
680 # Remove all '.' gaps
|
|
681 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/.//d;}
|
|
682 } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) {
|
|
683 # Substitute all '.' by '-'
|
|
684 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/./-/;}
|
|
685 }
|
|
686 if ($gap ne "default") {
|
|
687 for ($k=0; $k<$n; $k++) {$seqs[$k]=~s/\./$gap/g; $seqs[$k]=~s/-/$gap/g;}
|
|
688 }
|
|
689 if ($case eq "uc") {
|
|
690 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
|
|
691 } elsif ($case eq "lc") {
|
|
692 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/A-Z/a-z/;}
|
|
693 }
|
|
694
|
|
695
|
|
696
|
|
697 ####################################################
|
|
698 # Check that sequences have same length
|
|
699 ####################################################
|
|
700 if ($outformat ne "a3m" && $outformat ne "ufas") {
|
|
701 my $len=length($seqs[0]);
|
|
702 for($k=1; $k<$n; $k++) {
|
|
703 if (length($seqs[$k])!=$len) {
|
|
704 printf("\nError: Sequences in $infile do not all have same length, e.g. >%-.20s (len=%i) and >%-.20s (len=%i)\n",
|
|
705 $names[0],$len,$names[$k],length($seqs[$k]));
|
|
706 if ($v>=3) {
|
|
707 printf("%.20s %s\n%.20s %s\n\n",$names[0],$seqs[0],$names[$k],$seqs[$k]);
|
|
708 }
|
|
709 exit 1;
|
|
710 }
|
|
711 }
|
|
712 }
|
|
713
|
|
714
|
|
715 ####################################################
|
|
716 # Remove html tags
|
|
717 ####################################################
|
|
718 for($k=0; $k<$n; $k++) {
|
|
719 $names[$k]=~s/<[A-Za-z\/].*?>//g; # remove html tags
|
|
720 }
|
|
721
|
|
722
|
|
723
|
|
724 ################################################################################################
|
|
725 # Output part
|
|
726 ################################################################################################
|
|
727
|
|
728 my $ndssp=-1;
|
|
729 my $nsa=-1;
|
|
730 my $npred=-1;
|
|
731 my $nconf=-1;
|
|
732 my $nquery=-1;
|
|
733 for ($k=0; $k<$n; $k++) {
|
|
734 if ($names[$k]=~/^ss_dssp/){$ndssp=$k; }
|
|
735 elsif ($names[$k]=~/^sa_dssp/){$nsa=$k; }
|
|
736 elsif ($names[$k]=~/^ss_pred/){$npred=$k; }
|
|
737 elsif ($names[$k]=~/^ss_conf/){$nconf=$k; }
|
|
738 elsif ($nquery==-1 && $names[$k]!~/^aa_/) {$nquery=$k;}
|
|
739 }
|
|
740
|
|
741 #Write sequences into output file
|
|
742 open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n");
|
|
743 if ($outformat eq "sto" || $outformat eq "psi") {
|
|
744 my $refline;
|
|
745 if ($outformat eq "sto") {
|
|
746 print(OUTFILE "# STOCKHOLM 1.0\n\n");
|
|
747
|
|
748 # Write reference annotation line for match states (-M first)
|
|
749 if (!$lname) {$lname=32;}
|
|
750 $names[$nquery]=~/^\S+\s+(.*)/;
|
|
751 printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GF DE",$1);
|
|
752 $refline=$seqs[$nquery];
|
|
753 $refline=~s/[a-z]/-/g;
|
|
754 printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GC RF",$refline);
|
|
755 if ($ndssp>=0) {
|
|
756 printf(OUTFILE "%-32.32s %s\n","#=GC SS_cons",$seqs[$ndssp]);
|
|
757 }
|
|
758 }
|
|
759 if ($num) {
|
|
760 my $num=2;
|
|
761 for ($k=0; $k<$n; $k++) {
|
|
762 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
|
|
763 $names[$k]=~s/^(\S+)\#\d+/$1/;
|
|
764 $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
|
|
765 $num++;
|
|
766 }
|
|
767 }
|
|
768 for ($k=0; $k<$n; $k++) {
|
|
769 if ($k==$ndssp || $k==$npred || $k==$nconf) {next;}
|
|
770 $names[$k] =~ /\s*(\S+)/;
|
|
771 if (!$lname) {$lname=32;}
|
|
772 printf(OUTFILE "%-$lname.$lname"."s %s\n",$1,$seqs[$k]);
|
|
773 }
|
|
774 if ($outformat eq "sto") {print(OUTFILE "//\n");}
|
|
775 } elsif ($outformat eq "clu") {
|
|
776 printf(OUTFILE "CLUSTAL\n\n\n");
|
|
777 if ($num) {
|
|
778 my $num=2;
|
|
779 for ($k=0; $k<$n; $k++) {
|
|
780 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
|
|
781 $names[$k]=~s/^(\S+)\#\d+/$1/;
|
|
782 $names[$k]=~s/^(\S{1,10})\S*/$1\#$num/;
|
|
783 $num++;
|
|
784 }
|
|
785 }
|
|
786 while($seqs[0] ne "") { # While there are still residues left
|
|
787 for ($k=0; $k<scalar(@names); $k++) { # go through all sequences
|
|
788 $names[$k] =~ s/\s*(\S+).*/$1/;
|
|
789 $seqs[$k]=~s/(\S{1,$numres})//; # remove the leftmost (up to) 60 residues from sequence $nseq
|
|
790 if (!$lname) {$lname=18;}
|
|
791 printf(OUTFILE "%-$lname.$lname"."s %s\n",$names[$k],$1); # and print them after the sequence name
|
|
792 }
|
|
793 print(OUTFILE "\n"); # leave a blank line after each block of 60 residues
|
|
794 }
|
|
795 } else {
|
|
796 if ($num) {
|
|
797 my $num=2;
|
|
798 for ($k=0; $k<$n; $k++) {
|
|
799 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
|
|
800 $names[$k]=~s/^(\S+)\#\d+/$1/;
|
|
801 $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
|
|
802 # $names[$k]=~s/^(\S+)/$1\#$num/;
|
|
803 $num++;
|
|
804 }
|
|
805 }
|
|
806 if ($titleline ne "" && $outformat eq "a3m") {
|
|
807 printf(OUTFILE "%s\n",$titleline);
|
|
808 }
|
|
809 for ($k=0; $k<$n; $k++) {
|
|
810 $seqs[$k]=~s/(\S{$numres})/$1\n/g;
|
|
811 printf(OUTFILE ">%s\n%s\n",$names[$k],$seqs[$k]);
|
|
812 }
|
|
813 }
|
|
814
|
|
815 close OUTFILE;
|
|
816 if ($v>=2) {
|
|
817 if ($nin==1) {print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");}
|
|
818 else {
|
|
819 if (!$nin==$n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); }
|
|
820 print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n");
|
|
821 }
|
|
822 }
|
|
823
|
|
824 return;
|
|
825 }
|