Mercurial > repos > hammock > hammock
comparison external_tools/darwin/lib/hh/scripts/reformat.pl @ 6:2277dd59b9f9 draft
Uploaded
author | hammock |
---|---|
date | Wed, 01 Nov 2017 05:54:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:b7652b7c97bd | 6:2277dd59b9f9 |
---|---|
1 #! /usr/bin/env perl | |
2 | |
3 # reformat.pl | |
4 # Reformat a multiple alignment file | |
5 # | |
6 # HHsuite version 2.0.16 (January 2013) | |
7 # | |
8 # Reference: | |
9 # Remmert M., Biegert A., Hauser A., and Soding J. | |
10 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. | |
11 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). | |
12 | |
13 # (C) Johannes Soeding, 2012 | |
14 | |
15 # This program is free software: you can redistribute it and/or modify | |
16 # it under the terms of the GNU General Public License as published by | |
17 # the Free Software Foundation, either version 3 of the License, or | |
18 # (at your option) any later version. | |
19 | |
20 # This program is distributed in the hope that it will be useful, | |
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
23 # GNU General Public License for more details. | |
24 | |
25 # You should have received a copy of the GNU General Public License | |
26 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
27 | |
28 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de | |
29 | |
30 use lib $ENV{"HHLIB"}."/scripts"; | |
31 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc. | |
32 use strict; | |
33 use warnings; | |
34 my $numres=100; # number of residues per line | |
35 my $desclen=1000; # maximum number of characters in nameline | |
36 my $ARGC=scalar(@ARGV); | |
37 if ($ARGC<2) { | |
38 die(" | |
39 reformat.pl from HHsuite $VERSION | |
40 Read a multiple alignment in one format and write it in another format | |
41 Usage: reformat.pl [informat] [outformat] infile outfile [options] | |
42 or reformat.pl [informat] [outformat] 'fileglob' .ext [options] | |
43 | |
44 Available input formats: | |
45 fas: aligned fasta; lower and upper case equivalent, '.' and '-' equivalent | |
46 a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', | |
47 gaps aligned to inserts: '.' | |
48 a3m: like a2m, but gaps aligned to inserts MAY be omitted | |
49 sto: Stockholm format; sequences in several blocks with sequence name at | |
50 beginning of line (hmmer output) | |
51 psi: format as read by PSI-BLAST using the -B option (like sto with -M first -r) | |
52 clu: Clustal format; sequences in several blocks with sequence name at beginning | |
53 of line | |
54 Available output formats: | |
55 fas: aligned fasta; all gaps '-' | |
56 a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps | |
57 aligned to inserts: '.' | |
58 a3m: like a2m, but gaps aligned to inserts are omitted | |
59 sto: Stockholm format; sequences in just one block, one line per sequence | |
60 psi: format as read by PSI-BLAST using the -B option | |
61 clu: Clustal format | |
62 If no input or output format is given the file extension is interpreted as format | |
63 specification ('aln' as 'clu') | |
64 | |
65 Options: | |
66 -v int verbose mode (0:off, 1:on) | |
67 -num add number prefix to sequence names: 'name', '1:name' '2:name' etc | |
68 -noss remove secondary structure sequences (beginning with >ss_) | |
69 -sa do not remove solvent accessibility sequences (beginning with >sa_) | |
70 -M first make all columns with residue in first seuqence match columns | |
71 (default for output format a2m or a3m) | |
72 -M int make all columns with less than X% gaps match columns | |
73 (for output format a2m or a3m) | |
74 -r remove all lower case residues (insert states) | |
75 (AFTER -M option has been processed) | |
76 -r int remove all lower case columns with more than X% gaps | |
77 -g '' suppress all gaps | |
78 -g '-' write all gaps as '-' | |
79 -uc write all residues in upper case (AFTER all other options have been processed) | |
80 -lc write all residues in lower case (AFTER all other options have been processed) | |
81 -l number of residues per line (for Clustal, FASTA, A2M, A3M formats) | |
82 (default=$numres) | |
83 -d maximum number of characers in nameline (default=$desclen) | |
84 | |
85 Examples: reformat.pl 1hjra.a3m 1hjra.a2m | |
86 (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m) | |
87 reformat.pl test.a3m test.fas -num -r 90 | |
88 reformat.pl fas sto '*.fasta' .stockholm | |
89 \n"); | |
90 # clu: clustal format (hmmer output) | |
91 # sel: Selex format | |
92 # phy: Phylip format | |
93 } | |
94 | |
95 my $informat=""; | |
96 my $outformat=""; | |
97 my $infile=""; | |
98 my $outfile=""; | |
99 my $num=0; # don't use sequence number as prefix: '>n|name' | |
100 my $noss=0; # don't remove secondary structure | |
101 my $nosa=1; # remove solvent accessibility sequences | |
102 my $line; | |
103 my $options=""; | |
104 my @names; # names of sequences read in | |
105 my @seqs; # residues of sequences read in | |
106 my $n; # number of sequences read in | |
107 my $k; # counts sequences for output | |
108 my $remove_inserts=0; | |
109 my $remove_gapped=0; | |
110 my $matchmode=""; # do not change capitalization | |
111 my $match_gaprule=0; # columns with less than this percentage of gaps will be match columns | |
112 my $v=2; | |
113 my $update=0; | |
114 my $nss=-1; # index of secondary structure sequence | |
115 my $lname; # length of sequence name in clustal, stockholm, psi format | |
116 my $titleline; # first line beginning with "#" in A3M, A2M, or FASTA files | |
117 | |
118 my @informats= ("fas","a2m","a3m","sto","psi","clu"); | |
119 my @outformats= ("fas","a2m","a3m","sto","psi","clu","ufas"); | |
120 my $found; | |
121 my $element; | |
122 my $gap="default"; | |
123 my $case="default"; | |
124 | |
125 # Process optionsz | |
126 for (my $i=0; $i<$ARGC; $i++) {$options.=" $ARGV[$i] ";} | |
127 if ($options=~s/ -i\s+(\S+) / /) {$infile=$1;} | |
128 if ($options=~s/ -o\s+(\S+) / /) {$outfile=$1;} | |
129 if ($options=~s/ -num / /) {$num=1; $desclen=505;} | |
130 if ($options=~s/ -noss / /) {$noss=1;} | |
131 if ($options=~s/ -sa / /) {$nosa=0;} | |
132 if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;} | |
133 if ($options=~s/ -r\s+(\d+) / /) {$remove_gapped=$1;} | |
134 if ($options=~s/ -r / /) {$remove_inserts=1;} | |
135 if ($options=~s/ -lc / /) {$case="lc";} | |
136 if ($options=~s/ -uc / /) {$case="uc";} | |
137 if ($options=~s/ -v\s*(\d+) / /) {$v=$1;} | |
138 if ($options=~s/ -v / /) {$v=2;} | |
139 if ($options=~s/ -M\s+(\d+) / /) {$matchmode="gaprule"; $match_gaprule=$1;} | |
140 if ($options=~s/ -M\s+first / /) {$matchmode="first"; $match_gaprule=$1;} | |
141 if ($options=~s/ -u / /) {$update=1;} | |
142 if ($options=~s/ -l\s+(\S+) / /) {$numres=$1;} | |
143 if ($options=~s/ -lname\s+(\S+) / /) {$lname=$1;} | |
144 if ($options=~s/ -d\s+(\S+) / /) {$desclen=$1;} | |
145 | |
146 # Assign informat, outformat, infile, and outfile | |
147 if ($outfile eq "") { | |
148 if ($options=~s/(\S+)\s*$//) { | |
149 $outfile=$1; | |
150 } else { | |
151 die("Error: no output file given: '$options'\n"); | |
152 } | |
153 } | |
154 if ($infile eq "") { | |
155 if ($options=~s/(\S+)\s*$//) { | |
156 $infile=$1; | |
157 } else { | |
158 die("Error: no input file given: '$options'\n"); | |
159 } | |
160 } | |
161 if ($options=~s/(\S+)\s*$//) { | |
162 $outformat=$1; | |
163 } else { | |
164 if ($outfile=~/\S*\.(\S+?)$/) { | |
165 $outformat=lc($1); | |
166 if ($outformat eq "aln") {$outformat="clu";} | |
167 elsif ($outformat eq "fa") {$outformat="fas";} | |
168 elsif ($outformat eq "fasta") {$outformat="fas";} | |
169 elsif ($outformat eq "afa") {$outformat="fas";} | |
170 elsif ($outformat eq "afas") {$outformat="fas";} | |
171 elsif ($outformat eq "afasta") {$outformat="fas";} | |
172 } else { | |
173 print ("Using FASTA output format: '$options'\n"); $outformat="fas"; | |
174 } | |
175 } | |
176 if ($options=~s/(\S+)\s*$//) { | |
177 $informat=$1; | |
178 } else { | |
179 if ($infile=~/\S*\.(\S+?)$/) { | |
180 $informat=lc($1); | |
181 if ($informat eq "aln") {$informat="clu";} | |
182 elsif ($informat eq "fa") {$informat="fas";} | |
183 elsif ($informat eq "fasta") {$informat="fas";} | |
184 } else { | |
185 print ("Using FASTA input format: '$options'\n"); $informat="fas"; | |
186 } | |
187 } | |
188 | |
189 | |
190 # Warn if unknown options found | |
191 if ($options!~/^\s*$/) { | |
192 $options=~s/^\s*(.*?)\s*$/$1/g; | |
193 print("\nWARNING: unknown options '$options'\n"); | |
194 } | |
195 | |
196 # Check if input and output formats are valid | |
197 $found=0; | |
198 foreach $element (@informats) {if ($informat eq $element) {$found=1; last;}} | |
199 if(!$found) {die("\nError: $informat is not a valid input format option\n");} | |
200 $found=0; | |
201 foreach $element (@outformats) {if ($outformat eq $element) {$found=1; last;}} | |
202 if(!$found) {die("\nError: $outformat is not a valid output format option\n");} | |
203 | |
204 #if($outformat eq "psi") { | |
205 # $remove_inserts=1; | |
206 #} | |
207 if($outformat eq "ufas") {$gap="";} | |
208 | |
209 | |
210 if ($infile=~/\*/ || $outfile=~/^\./) # if infile contains '*' or outfile is just an extension | |
211 { | |
212 $outfile=~/.*\.(\S*)$/; | |
213 my $outext=$1; | |
214 my @infiles=glob($infile); | |
215 printf("%i files to reformat\n",scalar(@infiles)); | |
216 foreach $infile (@infiles) | |
217 { | |
218 if ($infile!~/(\S+)\.\S+/) {$infile=~/(\S+)/} | |
219 $outfile="$1.$outext"; | |
220 if ($update && -e $outfile) {next;} | |
221 if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");} | |
222 &reformat($infile,$outfile); | |
223 } | |
224 exit 0; | |
225 } | |
226 else | |
227 { | |
228 if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");} | |
229 &reformat($infile,$outfile); | |
230 exit 0; | |
231 } | |
232 | |
233 | |
234 ################################################################################################ | |
235 # Reformat a single file | |
236 ################################################################################################ | |
237 sub reformat() | |
238 { | |
239 $infile=$_[0]; | |
240 $nss=-1; | |
241 $titleline=""; | |
242 | |
243 ################################################################################################ | |
244 # Input part | |
245 ################################################################################################ | |
246 | |
247 my $skip=0; | |
248 open (INFILE,"<$infile") or die ("ERROR: cannot open $infile: $!\n"); | |
249 | |
250 # Read a2m, a3m, fas format | |
251 if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m") | |
252 { | |
253 $/=">"; # set input field seperator | |
254 $n=0; | |
255 my $seq=<INFILE>; | |
256 if ($seq=~s/^(\#.*)//) {$titleline=$1;} # remove commentary lines at beginning of file | |
257 $seq=~s/(\n\#.*)*\n//; # remove commentary lines at beginning of file | |
258 | |
259 # If first line has no sequence name, use >root_name | |
260 if ($seq ne ">") { | |
261 $infile="/$infile."; # determine root name 1 | |
262 $infile=~/^.*\/(.*?)\..*/; # determine root name 2 | |
263 $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//; | |
264 $seq=~tr/\n //d; # remove newlines and blanks | |
265 $seqs[$n]=$seq; | |
266 $n++; | |
267 } | |
268 | |
269 while ($seq=<INFILE>) { | |
270 $seq=~s/\n\#.*//g; # remove commentary lines | |
271 while ($seq=~s/(.)>/$1/) {$seq.=<INFILE>;} # in the case that nameline contains a '>'; '.' matches anything except '\n' | |
272 if ($seq=~/^aa_/) {next;} | |
273 if ($seq=~/^sa_/ && $nosa) {next;} | |
274 if ($seq=~/^ss_/) { | |
275 if ($noss) {next;} # do not read in >ss_ and >sa_ sequences | |
276 # if ($seq=~/^ss_conf/) {next;} # comment out to read sequence with confidence values | |
277 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines | |
278 $nss=$n; | |
279 } | |
280 $seq=~s/^\s*(.*)//; # divide into nameline and residues; '.' matches anything except '\n' | |
281 if (defined $1 && $1 ne "") { | |
282 $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//; | |
283 } else { | |
284 $names[$n]=$n; | |
285 } | |
286 $seqs[$n]=$seq; | |
287 $n++; | |
288 } | |
289 | |
290 $/="\n"; # reset input field seperator | |
291 } | |
292 | |
293 # Read Stockholm format | |
294 elsif ($informat eq "sto") | |
295 { | |
296 my %seqhash; | |
297 my $name; | |
298 my $first_block=1; | |
299 | |
300 $n=0; | |
301 while ($line = <INFILE>) | |
302 { | |
303 $line=~tr/\r//d; | |
304 $line=~s/\s+/ /g; | |
305 if ($line=~s/^\#=GC SS_cons/ss_dssp/) {} # skip commentary and blank line | |
306 if ($line=~/^\#/) {next;} # skip commentary and blank line | |
307 if ($line=~/^\/\//) {last;} # reached end of file | |
308 if ($line=~/^\s*$/){$first_block=0; next;} # new sequence block starts | |
309 if ($line!~/^\s*(\S+)\s+(\S+)/) { | |
310 die ("\nERROR found in stockholm format: $!"); | |
311 } | |
312 if (!(exists $seqhash{$1})) | |
313 { | |
314 if ($line=~/^aa_/) {next;} # do not read in >aa_ sequences | |
315 if ($line=~/^sa_/ && $nosa) {next;} # do not read in >sa_ sequences | |
316 if ($line=~/^ss_/) { | |
317 if ($noss) {next;} # do not read in >ss_ and >aa_ sequences | |
318 # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values | |
319 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines | |
320 $nss=$n; | |
321 } | |
322 $line=~/^\s*(\S+)\s+(\S+)/; | |
323 $names[$n]=$1; | |
324 $seqs[$n]=$2; | |
325 $seqhash{$1}=$n++; | |
326 $first_block=1; | |
327 } | |
328 else | |
329 { | |
330 if ($first_block) {die ("\nERROR: sequence $1 appears more than once per block\n");} | |
331 $seqs[$seqhash{$1}].=$2; | |
332 } | |
333 # printf("%3i %s\n",$n-1,$seqs[$n-1]); | |
334 } | |
335 } | |
336 | |
337 elsif ($informat eq "clu") | |
338 { | |
339 my $residues_per_line=50; # default number of characters for namefield residues | |
340 # (only needed if no gap between name and residues in first sequence -> SMART) | |
341 my $block=1; # number of block currently read | |
342 my $name; | |
343 my $residues; | |
344 $n=0; # sequence number of first block | |
345 $k=0; # sequence index to zero for first block | |
346 | |
347 while ($line = <INFILE>) | |
348 { | |
349 # print($namefieldlen.$line); | |
350 $line=~tr/\r//d; | |
351 if ($line=~/CLUSTAL/i) {next;} | |
352 if ($line=~/^\#/) {next;} # skip commentary and blank line | |
353 if ($line=~/^\/\//) {last;} # reached end of file | |
354 if ($line=~/^\s*$/){ # new sequence block starts | |
355 if ($k) { | |
356 if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} | |
357 $block++; | |
358 $n=$k; | |
359 $k=0; # set sequence index to zero for next block to read | |
360 } | |
361 next; | |
362 } | |
363 if ($line!~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/) { | |
364 if ($line=~/^[*.: ]*$/) {next;} | |
365 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences | |
366 chomp($line); | |
367 if ($line!~/^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(\s+\d+)?$/) { | |
368 die ("\nError found in Clustal format in $infile, line $.: '$line'\n"); | |
369 } | |
370 $name=$1; | |
371 $residues=$2; | |
372 print("WARNING: Found no space between name and residues in $infile, line $.: '$line'\n"); | |
373 } else { | |
374 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences | |
375 if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences | |
376 if ($line=~/^ss_/) { | |
377 # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values | |
378 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines | |
379 $nss=$n; | |
380 } | |
381 $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/; | |
382 $name=$1; | |
383 $residues=$2; | |
384 $residues=~tr/ //d; | |
385 $residues_per_line=length($residues); | |
386 } | |
387 if ($block==1) { | |
388 $names[$k]=$name; | |
389 $seqs[$k]=$residues; | |
390 } else { | |
391 $seqs[$k].=$residues; | |
392 if ($names[$k] ne $name) { | |
393 print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n"); | |
394 } | |
395 } | |
396 # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]); | |
397 $k++; | |
398 } | |
399 if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} | |
400 if (!$n) {$n=$k;} | |
401 } | |
402 | |
403 # Read psi format | |
404 elsif ($informat eq "psi") | |
405 { | |
406 my $block=1; # number of block currently read | |
407 my $name; | |
408 my $residues; | |
409 $n=0; # sequence number of first block | |
410 $k=0; # sequence index to zero for first block | |
411 | |
412 while ($line = <INFILE>) | |
413 { | |
414 # print($namefieldlen.$line); | |
415 $line=~tr/\r//d; | |
416 if ($line=~/^\s*$/){ # new sequence block starts | |
417 if ($k) { | |
418 if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} | |
419 $block++; | |
420 $n=$k; | |
421 $k=0; # set sequence index to zero for next block to read | |
422 } | |
423 next; | |
424 } | |
425 | |
426 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences | |
427 if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences | |
428 if ($line=~/^ss_/) { | |
429 # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values | |
430 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines | |
431 $nss=$n; | |
432 } | |
433 $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/; | |
434 $name=$1; | |
435 $residues=$2; | |
436 $residues=~tr/ //d; | |
437 | |
438 if ($block==1) { | |
439 $names[$k]=$name; | |
440 $seqs[$k]=$residues; | |
441 } else { | |
442 $seqs[$k].=$residues; | |
443 if ($names[$k] ne $name) { | |
444 print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n"); | |
445 } | |
446 } | |
447 # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]); | |
448 $k++; | |
449 } | |
450 if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} | |
451 if (!$n) {$n=$k;} | |
452 } | |
453 | |
454 close INFILE; | |
455 | |
456 | |
457 # Empty input file? | |
458 if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");} | |
459 | |
460 | |
461 | |
462 ################################################################################################ | |
463 # Transforming to upper-case | |
464 ################################################################################################ | |
465 if ($informat ne "a3m" && $informat ne "a2m") { # Transform to upper case if input format is not A3M or A2M | |
466 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;} | |
467 } | |
468 | |
469 ################################################################################################ | |
470 # Removing non-alphanumeric symbols | |
471 ################################################################################################ | |
472 for ($k=0; $k<$n; $k++) { | |
473 $seqs[$k]=~tr/A-Za-z0-9.~-//cd; | |
474 $seqs[$k]=~tr/~/-/; | |
475 } | |
476 | |
477 | |
478 ################################################################################################ | |
479 # Filling up with gaps '.' or deleting gaps | |
480 ################################################################################################ | |
481 | |
482 # Insertion of '.' only needed for a3m if: NOT -r option OR -M first OR -M <int> option | |
483 if ($informat eq "a3m" && (!$remove_inserts || $matchmode)) | |
484 { | |
485 print("inserting gaps...\n"); | |
486 my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $i. | |
487 my $j; # counts match states | |
488 my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $i after the $j'th match state | |
489 my $insert; | |
490 | |
491 # Determine length of longest insert | |
492 for ($k=0; $k<$n; $k++) | |
493 { | |
494 # split into list of single match states and variable-length inserts | |
495 # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements | |
496 # The '#' symbol is prepended to get rid of a perl bug in split | |
497 @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#"); | |
498 $j=0; | |
499 # printf("Sequence $k: $seqs[$k]\n"); | |
500 # printf("Sequence $k: @inserts\n"); | |
501 | |
502 # Does sequence $k contain insert after match state j that is longer than $ngap[$j]? | |
503 foreach $insert (@inserts) | |
504 { | |
505 if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {$len_ins[$j]=length($insert);} | |
506 $j++; | |
507 # printf("$insert|"); | |
508 } | |
509 # printf("\n"); | |
510 } | |
511 my $ngap; | |
512 | |
513 # Fill up inserts with gaps '.' up to length $len_ins[$j] | |
514 for ($k=0; $k<$n; $k++) | |
515 { | |
516 # split into list of single match states and variable-length inserts | |
517 @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#"); | |
518 $j=0; | |
519 | |
520 # append the missing number of gaps after each match state | |
521 foreach $insert (@inserts) | |
522 { | |
523 for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";} | |
524 $j++; | |
525 } | |
526 $seqs[$k] = join("",@inserts); | |
527 $seqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end | |
528 } | |
529 } | |
530 | |
531 | |
532 ################################################################################################ | |
533 # Match state assignment | |
534 ################################################################################################ | |
535 | |
536 # Default match state rule for a3m is -M first | |
537 if ($matchmode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$matchmode="first";} | |
538 | |
539 # Use gap rule for match state assignment? | |
540 if ($matchmode eq "gaprule") { | |
541 | |
542 my @gaps=(); | |
543 my $residues; | |
544 my @residues; | |
545 | |
546 # Determine number of gaps per column | |
547 for ($k=0; $k<$n; $k++) { | |
548 @residues=unpack("C*",$seqs[$k]); | |
549 for (my $l=0; $l<@residues; $l++) { | |
550 if ($residues[$l]==46 || $residues[$l]==45) { | |
551 if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;} | |
552 } | |
553 } | |
554 } | |
555 | |
556 # Set columns with more gaps than $match_gaprule to lower case, | |
557 for ($k=0; $k<$n; $k++) { | |
558 @residues=unpack("C*",$seqs[$k]); | |
559 $residues=""; | |
560 for (my $l=0; $l<@residues; $l++) { | |
561 if (!defined $gaps[$l] || $gaps[$l]<0.01*$match_gaprule*$n) { | |
562 if ($residues[$l]==46) { | |
563 $residues .= "-"; | |
564 } else { | |
565 $residues .= uc(chr($residues[$l])); | |
566 } | |
567 } else { | |
568 if ($residues[$l]==45) { | |
569 $residues .= "."; | |
570 } else { | |
571 $residues .= lc(chr($residues[$l])); | |
572 } | |
573 } | |
574 $seqs[$k]=$residues; | |
575 } | |
576 } | |
577 } | |
578 | |
579 # Use first sequence for match state assignment? | |
580 if ($matchmode eq "first") { | |
581 | |
582 my @match=(); | |
583 my $residues; | |
584 my @residues; | |
585 | |
586 | |
587 # Determine which columns have a gap in first sequence | |
588 for ($k=0; $k<scalar(@names); $k++) { #find seed sequence | |
589 if ($names[$k]!~/^(ss_|aa_|sa_)/) {last;} | |
590 } | |
591 @residues=unpack("C*",$seqs[$k]); | |
592 for (my $l=0; $l<@residues; $l++) { | |
593 if ($residues[$l]==46 || $residues[$l]==45) {$match[$l]=0;} else {$match[$l]=1;} | |
594 } | |
595 | |
596 # Set columns without residue in first sequence to upper case, | |
597 for ($k=0; $k<$n; $k++) { | |
598 @residues=unpack("C*",$seqs[$k]); | |
599 $residues=""; | |
600 for (my $l=0; $l<@residues; $l++) { | |
601 if ($match[$l]) { | |
602 if ($residues[$l]==46) { | |
603 $residues .= "-"; | |
604 } else { | |
605 $residues .= uc(chr($residues[$l])); | |
606 } | |
607 } else { | |
608 if ($residues[$l]==45) { | |
609 $residues .= "."; | |
610 } else { | |
611 $residues .= lc(chr($residues[$l])); | |
612 } | |
613 } | |
614 $seqs[$k]=$residues; | |
615 } | |
616 } | |
617 } | |
618 | |
619 | |
620 ################################################################################################ | |
621 # Remove gaps etc. | |
622 ################################################################################################ | |
623 | |
624 # Remove columns with too many gaps? | |
625 if ($remove_gapped) { | |
626 | |
627 my @gaps=(); | |
628 my $residues; | |
629 my @residues; | |
630 | |
631 # Determine number of gaps '.' or '-' per column | |
632 for ($k=0; $k<$n; $k++) { | |
633 @residues=unpack("C*",$seqs[$k]); | |
634 for (my $l=0; $l<@residues; $l++) { | |
635 if ($residues[$l]==45 || $residues[$l]==46) { | |
636 if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;} | |
637 } | |
638 } | |
639 } | |
640 | |
641 # Remove columns with too many gaps | |
642 for ($k=0; $k<$n; $k++) { | |
643 @residues=unpack("C*",$seqs[$k]); | |
644 $residues=""; | |
645 for (my $l=0; $l<@residues; $l++) { | |
646 if (!defined $gaps[$l] || $gaps[$l]<0.01*$remove_gapped*$n) { | |
647 $residues .= chr($residues[$l]) | |
648 } | |
649 $seqs[$k]=$residues; | |
650 } | |
651 } | |
652 } | |
653 | |
654 # Remove/transliterate gaps? | |
655 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/ //d;} | |
656 if ($remove_inserts) { | |
657 for ($k=0; $k<$n; $k++) { | |
658 $seqs[$k]=~tr/a-z.//d; | |
659 # printf("%s\n",$seqs[$k]); | |
660 } | |
661 } | |
662 | |
663 | |
664 # Remove sequences which contain only gaps | |
665 my $nin=$n; | |
666 for ($k=0; $k<$n; $k++) { | |
667 if (($seqs[$k]=~tr/a-zA-Z0-9/a-zA-Z0-9/==0)) { | |
668 if ($v>=2) {print("Sequence contains only gaps and is removed: $names[$k]\n");} | |
669 splice(@seqs,$k,1); | |
670 splice(@names,$k,1); | |
671 $k--; $n--; | |
672 } | |
673 } | |
674 | |
675 | |
676 # Cut down sequence names to $desclen characters maximum | |
677 for ($k=0; $k<$n; $k++) {$names[$k]=substr($names[$k],0,$desclen);} | |
678 | |
679 if ($outformat eq "a3m") { | |
680 # Remove all '.' gaps | |
681 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/.//d;} | |
682 } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) { | |
683 # Substitute all '.' by '-' | |
684 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/./-/;} | |
685 } | |
686 if ($gap ne "default") { | |
687 for ($k=0; $k<$n; $k++) {$seqs[$k]=~s/\./$gap/g; $seqs[$k]=~s/-/$gap/g;} | |
688 } | |
689 if ($case eq "uc") { | |
690 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;} | |
691 } elsif ($case eq "lc") { | |
692 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/A-Z/a-z/;} | |
693 } | |
694 | |
695 | |
696 | |
697 #################################################### | |
698 # Check that sequences have same length | |
699 #################################################### | |
700 if ($outformat ne "a3m" && $outformat ne "ufas") { | |
701 my $len=length($seqs[0]); | |
702 for($k=1; $k<$n; $k++) { | |
703 if (length($seqs[$k])!=$len) { | |
704 printf("\nError: Sequences in $infile do not all have same length, e.g. >%-.20s (len=%i) and >%-.20s (len=%i)\n", | |
705 $names[0],$len,$names[$k],length($seqs[$k])); | |
706 if ($v>=3) { | |
707 printf("%.20s %s\n%.20s %s\n\n",$names[0],$seqs[0],$names[$k],$seqs[$k]); | |
708 } | |
709 exit 1; | |
710 } | |
711 } | |
712 } | |
713 | |
714 | |
715 #################################################### | |
716 # Remove html tags | |
717 #################################################### | |
718 for($k=0; $k<$n; $k++) { | |
719 $names[$k]=~s/<[A-Za-z\/].*?>//g; # remove html tags | |
720 } | |
721 | |
722 | |
723 | |
724 ################################################################################################ | |
725 # Output part | |
726 ################################################################################################ | |
727 | |
728 my $ndssp=-1; | |
729 my $nsa=-1; | |
730 my $npred=-1; | |
731 my $nconf=-1; | |
732 my $nquery=-1; | |
733 for ($k=0; $k<$n; $k++) { | |
734 if ($names[$k]=~/^ss_dssp/){$ndssp=$k; } | |
735 elsif ($names[$k]=~/^sa_dssp/){$nsa=$k; } | |
736 elsif ($names[$k]=~/^ss_pred/){$npred=$k; } | |
737 elsif ($names[$k]=~/^ss_conf/){$nconf=$k; } | |
738 elsif ($nquery==-1 && $names[$k]!~/^aa_/) {$nquery=$k;} | |
739 } | |
740 | |
741 #Write sequences into output file | |
742 open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n"); | |
743 if ($outformat eq "sto" || $outformat eq "psi") { | |
744 my $refline; | |
745 if ($outformat eq "sto") { | |
746 print(OUTFILE "# STOCKHOLM 1.0\n\n"); | |
747 | |
748 # Write reference annotation line for match states (-M first) | |
749 if (!$lname) {$lname=32;} | |
750 $names[$nquery]=~/^\S+\s+(.*)/; | |
751 printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GF DE",$1); | |
752 $refline=$seqs[$nquery]; | |
753 $refline=~s/[a-z]/-/g; | |
754 printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GC RF",$refline); | |
755 if ($ndssp>=0) { | |
756 printf(OUTFILE "%-32.32s %s\n","#=GC SS_cons",$seqs[$ndssp]); | |
757 } | |
758 } | |
759 if ($num) { | |
760 my $num=2; | |
761 for ($k=0; $k<$n; $k++) { | |
762 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;} | |
763 $names[$k]=~s/^(\S+)\#\d+/$1/; | |
764 $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/; | |
765 $num++; | |
766 } | |
767 } | |
768 for ($k=0; $k<$n; $k++) { | |
769 if ($k==$ndssp || $k==$npred || $k==$nconf) {next;} | |
770 $names[$k] =~ /\s*(\S+)/; | |
771 if (!$lname) {$lname=32;} | |
772 printf(OUTFILE "%-$lname.$lname"."s %s\n",$1,$seqs[$k]); | |
773 } | |
774 if ($outformat eq "sto") {print(OUTFILE "//\n");} | |
775 } elsif ($outformat eq "clu") { | |
776 printf(OUTFILE "CLUSTAL\n\n\n"); | |
777 if ($num) { | |
778 my $num=2; | |
779 for ($k=0; $k<$n; $k++) { | |
780 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;} | |
781 $names[$k]=~s/^(\S+)\#\d+/$1/; | |
782 $names[$k]=~s/^(\S{1,10})\S*/$1\#$num/; | |
783 $num++; | |
784 } | |
785 } | |
786 while($seqs[0] ne "") { # While there are still residues left | |
787 for ($k=0; $k<scalar(@names); $k++) { # go through all sequences | |
788 $names[$k] =~ s/\s*(\S+).*/$1/; | |
789 $seqs[$k]=~s/(\S{1,$numres})//; # remove the leftmost (up to) 60 residues from sequence $nseq | |
790 if (!$lname) {$lname=18;} | |
791 printf(OUTFILE "%-$lname.$lname"."s %s\n",$names[$k],$1); # and print them after the sequence name | |
792 } | |
793 print(OUTFILE "\n"); # leave a blank line after each block of 60 residues | |
794 } | |
795 } else { | |
796 if ($num) { | |
797 my $num=2; | |
798 for ($k=0; $k<$n; $k++) { | |
799 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;} | |
800 $names[$k]=~s/^(\S+)\#\d+/$1/; | |
801 $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/; | |
802 # $names[$k]=~s/^(\S+)/$1\#$num/; | |
803 $num++; | |
804 } | |
805 } | |
806 if ($titleline ne "" && $outformat eq "a3m") { | |
807 printf(OUTFILE "%s\n",$titleline); | |
808 } | |
809 for ($k=0; $k<$n; $k++) { | |
810 $seqs[$k]=~s/(\S{$numres})/$1\n/g; | |
811 printf(OUTFILE ">%s\n%s\n",$names[$k],$seqs[$k]); | |
812 } | |
813 } | |
814 | |
815 close OUTFILE; | |
816 if ($v>=2) { | |
817 if ($nin==1) {print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");} | |
818 else { | |
819 if (!$nin==$n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); } | |
820 print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n"); | |
821 } | |
822 } | |
823 | |
824 return; | |
825 } |