annotate external_tools/darwin/lib/hh/scripts/reformat.pl @ 6:2277dd59b9f9 draft

Uploaded
author hammock
date Wed, 01 Nov 2017 05:54:28 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
1 #! /usr/bin/env perl
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
2
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
3 # reformat.pl
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
4 # Reformat a multiple alignment file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
5 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
6 # HHsuite version 2.0.16 (January 2013)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
7 #
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
8 # Reference:
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
9 # Remmert M., Biegert A., Hauser A., and Soding J.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
10 # HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
11 # Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
12
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
13 # (C) Johannes Soeding, 2012
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
14
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
15 # This program is free software: you can redistribute it and/or modify
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
16 # it under the terms of the GNU General Public License as published by
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
17 # the Free Software Foundation, either version 3 of the License, or
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
18 # (at your option) any later version.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
19
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
20 # This program is distributed in the hope that it will be useful,
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
23 # GNU General Public License for more details.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
24
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
25 # You should have received a copy of the GNU General Public License
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
26 # along with this program. If not, see <http://www.gnu.org/licenses/>.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
27
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
28 # We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
29
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
30 use lib $ENV{"HHLIB"}."/scripts";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
31 use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
32 use strict;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
33 use warnings;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
34 my $numres=100; # number of residues per line
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
35 my $desclen=1000; # maximum number of characters in nameline
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
36 my $ARGC=scalar(@ARGV);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
37 if ($ARGC<2) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
38 die("
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
39 reformat.pl from HHsuite $VERSION
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
40 Read a multiple alignment in one format and write it in another format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
41 Usage: reformat.pl [informat] [outformat] infile outfile [options]
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
42 or reformat.pl [informat] [outformat] 'fileglob' .ext [options]
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
43
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
44 Available input formats:
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
45 fas: aligned fasta; lower and upper case equivalent, '.' and '-' equivalent
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
46 a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-',
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
47 gaps aligned to inserts: '.'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
48 a3m: like a2m, but gaps aligned to inserts MAY be omitted
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
49 sto: Stockholm format; sequences in several blocks with sequence name at
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
50 beginning of line (hmmer output)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
51 psi: format as read by PSI-BLAST using the -B option (like sto with -M first -r)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
52 clu: Clustal format; sequences in several blocks with sequence name at beginning
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
53 of line
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
54 Available output formats:
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
55 fas: aligned fasta; all gaps '-'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
56 a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
57 aligned to inserts: '.'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
58 a3m: like a2m, but gaps aligned to inserts are omitted
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
59 sto: Stockholm format; sequences in just one block, one line per sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
60 psi: format as read by PSI-BLAST using the -B option
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
61 clu: Clustal format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
62 If no input or output format is given the file extension is interpreted as format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
63 specification ('aln' as 'clu')
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
64
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
65 Options:
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
66 -v int verbose mode (0:off, 1:on)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
67 -num add number prefix to sequence names: 'name', '1:name' '2:name' etc
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
68 -noss remove secondary structure sequences (beginning with >ss_)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
69 -sa do not remove solvent accessibility sequences (beginning with >sa_)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
70 -M first make all columns with residue in first seuqence match columns
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
71 (default for output format a2m or a3m)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
72 -M int make all columns with less than X% gaps match columns
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
73 (for output format a2m or a3m)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
74 -r remove all lower case residues (insert states)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
75 (AFTER -M option has been processed)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
76 -r int remove all lower case columns with more than X% gaps
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
77 -g '' suppress all gaps
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
78 -g '-' write all gaps as '-'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
79 -uc write all residues in upper case (AFTER all other options have been processed)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
80 -lc write all residues in lower case (AFTER all other options have been processed)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
81 -l number of residues per line (for Clustal, FASTA, A2M, A3M formats)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
82 (default=$numres)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
83 -d maximum number of characers in nameline (default=$desclen)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
84
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
85 Examples: reformat.pl 1hjra.a3m 1hjra.a2m
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
86 (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
87 reformat.pl test.a3m test.fas -num -r 90
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
88 reformat.pl fas sto '*.fasta' .stockholm
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
89 \n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
90 # clu: clustal format (hmmer output)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
91 # sel: Selex format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
92 # phy: Phylip format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
93 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
94
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
95 my $informat="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
96 my $outformat="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
97 my $infile="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
98 my $outfile="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
99 my $num=0; # don't use sequence number as prefix: '>n|name'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
100 my $noss=0; # don't remove secondary structure
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
101 my $nosa=1; # remove solvent accessibility sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
102 my $line;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
103 my $options="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
104 my @names; # names of sequences read in
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
105 my @seqs; # residues of sequences read in
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
106 my $n; # number of sequences read in
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
107 my $k; # counts sequences for output
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
108 my $remove_inserts=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
109 my $remove_gapped=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
110 my $matchmode=""; # do not change capitalization
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
111 my $match_gaprule=0; # columns with less than this percentage of gaps will be match columns
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
112 my $v=2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
113 my $update=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
114 my $nss=-1; # index of secondary structure sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
115 my $lname; # length of sequence name in clustal, stockholm, psi format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
116 my $titleline; # first line beginning with "#" in A3M, A2M, or FASTA files
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
117
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
118 my @informats= ("fas","a2m","a3m","sto","psi","clu");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
119 my @outformats= ("fas","a2m","a3m","sto","psi","clu","ufas");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
120 my $found;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
121 my $element;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
122 my $gap="default";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
123 my $case="default";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
124
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
125 # Process optionsz
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
126 for (my $i=0; $i<$ARGC; $i++) {$options.=" $ARGV[$i] ";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
127 if ($options=~s/ -i\s+(\S+) / /) {$infile=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
128 if ($options=~s/ -o\s+(\S+) / /) {$outfile=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
129 if ($options=~s/ -num / /) {$num=1; $desclen=505;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
130 if ($options=~s/ -noss / /) {$noss=1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
131 if ($options=~s/ -sa / /) {$nosa=0;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
132 if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
133 if ($options=~s/ -r\s+(\d+) / /) {$remove_gapped=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
134 if ($options=~s/ -r / /) {$remove_inserts=1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
135 if ($options=~s/ -lc / /) {$case="lc";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
136 if ($options=~s/ -uc / /) {$case="uc";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
137 if ($options=~s/ -v\s*(\d+) / /) {$v=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
138 if ($options=~s/ -v / /) {$v=2;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
139 if ($options=~s/ -M\s+(\d+) / /) {$matchmode="gaprule"; $match_gaprule=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
140 if ($options=~s/ -M\s+first / /) {$matchmode="first"; $match_gaprule=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
141 if ($options=~s/ -u / /) {$update=1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
142 if ($options=~s/ -l\s+(\S+) / /) {$numres=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
143 if ($options=~s/ -lname\s+(\S+) / /) {$lname=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
144 if ($options=~s/ -d\s+(\S+) / /) {$desclen=$1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
145
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
146 # Assign informat, outformat, infile, and outfile
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
147 if ($outfile eq "") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
148 if ($options=~s/(\S+)\s*$//) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
149 $outfile=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
150 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
151 die("Error: no output file given: '$options'\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
152 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
153 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
154 if ($infile eq "") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
155 if ($options=~s/(\S+)\s*$//) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
156 $infile=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
157 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
158 die("Error: no input file given: '$options'\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
159 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
160 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
161 if ($options=~s/(\S+)\s*$//) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
162 $outformat=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
163 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
164 if ($outfile=~/\S*\.(\S+?)$/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
165 $outformat=lc($1);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
166 if ($outformat eq "aln") {$outformat="clu";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
167 elsif ($outformat eq "fa") {$outformat="fas";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
168 elsif ($outformat eq "fasta") {$outformat="fas";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
169 elsif ($outformat eq "afa") {$outformat="fas";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
170 elsif ($outformat eq "afas") {$outformat="fas";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
171 elsif ($outformat eq "afasta") {$outformat="fas";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
172 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
173 print ("Using FASTA output format: '$options'\n"); $outformat="fas";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
174 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
175 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
176 if ($options=~s/(\S+)\s*$//) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
177 $informat=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
178 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
179 if ($infile=~/\S*\.(\S+?)$/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
180 $informat=lc($1);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
181 if ($informat eq "aln") {$informat="clu";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
182 elsif ($informat eq "fa") {$informat="fas";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
183 elsif ($informat eq "fasta") {$informat="fas";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
184 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
185 print ("Using FASTA input format: '$options'\n"); $informat="fas";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
186 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
187 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
188
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
189
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
190 # Warn if unknown options found
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
191 if ($options!~/^\s*$/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
192 $options=~s/^\s*(.*?)\s*$/$1/g;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
193 print("\nWARNING: unknown options '$options'\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
194 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
195
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
196 # Check if input and output formats are valid
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
197 $found=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
198 foreach $element (@informats) {if ($informat eq $element) {$found=1; last;}}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
199 if(!$found) {die("\nError: $informat is not a valid input format option\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
200 $found=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
201 foreach $element (@outformats) {if ($outformat eq $element) {$found=1; last;}}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
202 if(!$found) {die("\nError: $outformat is not a valid output format option\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
203
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
204 #if($outformat eq "psi") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
205 # $remove_inserts=1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
206 #}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
207 if($outformat eq "ufas") {$gap="";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
208
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
209
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
210 if ($infile=~/\*/ || $outfile=~/^\./) # if infile contains '*' or outfile is just an extension
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
211 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
212 $outfile=~/.*\.(\S*)$/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
213 my $outext=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
214 my @infiles=glob($infile);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
215 printf("%i files to reformat\n",scalar(@infiles));
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
216 foreach $infile (@infiles)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
217 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
218 if ($infile!~/(\S+)\.\S+/) {$infile=~/(\S+)/}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
219 $outfile="$1.$outext";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
220 if ($update && -e $outfile) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
221 if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
222 &reformat($infile,$outfile);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
223 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
224 exit 0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
225 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
226 else
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
227 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
228 if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
229 &reformat($infile,$outfile);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
230 exit 0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
231 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
232
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
233
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
234 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
235 # Reformat a single file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
236 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
237 sub reformat()
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
238 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
239 $infile=$_[0];
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
240 $nss=-1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
241 $titleline="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
242
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
243 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
244 # Input part
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
245 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
246
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
247 my $skip=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
248 open (INFILE,"<$infile") or die ("ERROR: cannot open $infile: $!\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
249
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
250 # Read a2m, a3m, fas format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
251 if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m")
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
252 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
253 $/=">"; # set input field seperator
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
254 $n=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
255 my $seq=<INFILE>;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
256 if ($seq=~s/^(\#.*)//) {$titleline=$1;} # remove commentary lines at beginning of file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
257 $seq=~s/(\n\#.*)*\n//; # remove commentary lines at beginning of file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
258
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
259 # If first line has no sequence name, use >root_name
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
260 if ($seq ne ">") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
261 $infile="/$infile."; # determine root name 1
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
262 $infile=~/^.*\/(.*?)\..*/; # determine root name 2
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
263 $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
264 $seq=~tr/\n //d; # remove newlines and blanks
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
265 $seqs[$n]=$seq;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
266 $n++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
267 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
268
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
269 while ($seq=<INFILE>) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
270 $seq=~s/\n\#.*//g; # remove commentary lines
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
271 while ($seq=~s/(.)>/$1/) {$seq.=<INFILE>;} # in the case that nameline contains a '>'; '.' matches anything except '\n'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
272 if ($seq=~/^aa_/) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
273 if ($seq=~/^sa_/ && $nosa) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
274 if ($seq=~/^ss_/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
275 if ($noss) {next;} # do not read in >ss_ and >sa_ sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
276 # if ($seq=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
277 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
278 $nss=$n;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
279 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
280 $seq=~s/^\s*(.*)//; # divide into nameline and residues; '.' matches anything except '\n'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
281 if (defined $1 && $1 ne "") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
282 $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
283 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
284 $names[$n]=$n;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
285 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
286 $seqs[$n]=$seq;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
287 $n++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
288 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
289
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
290 $/="\n"; # reset input field seperator
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
291 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
292
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
293 # Read Stockholm format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
294 elsif ($informat eq "sto")
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
295 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
296 my %seqhash;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
297 my $name;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
298 my $first_block=1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
299
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
300 $n=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
301 while ($line = <INFILE>)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
302 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
303 $line=~tr/\r//d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
304 $line=~s/\s+/ /g;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
305 if ($line=~s/^\#=GC SS_cons/ss_dssp/) {} # skip commentary and blank line
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
306 if ($line=~/^\#/) {next;} # skip commentary and blank line
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
307 if ($line=~/^\/\//) {last;} # reached end of file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
308 if ($line=~/^\s*$/){$first_block=0; next;} # new sequence block starts
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
309 if ($line!~/^\s*(\S+)\s+(\S+)/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
310 die ("\nERROR found in stockholm format: $!");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
311 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
312 if (!(exists $seqhash{$1}))
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
313 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
314 if ($line=~/^aa_/) {next;} # do not read in >aa_ sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
315 if ($line=~/^sa_/ && $nosa) {next;} # do not read in >sa_ sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
316 if ($line=~/^ss_/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
317 if ($noss) {next;} # do not read in >ss_ and >aa_ sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
318 # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
319 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
320 $nss=$n;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
321 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
322 $line=~/^\s*(\S+)\s+(\S+)/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
323 $names[$n]=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
324 $seqs[$n]=$2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
325 $seqhash{$1}=$n++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
326 $first_block=1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
327 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
328 else
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
329 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
330 if ($first_block) {die ("\nERROR: sequence $1 appears more than once per block\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
331 $seqs[$seqhash{$1}].=$2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
332 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
333 # printf("%3i %s\n",$n-1,$seqs[$n-1]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
334 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
335 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
336
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
337 elsif ($informat eq "clu")
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
338 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
339 my $residues_per_line=50; # default number of characters for namefield residues
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
340 # (only needed if no gap between name and residues in first sequence -> SMART)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
341 my $block=1; # number of block currently read
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
342 my $name;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
343 my $residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
344 $n=0; # sequence number of first block
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
345 $k=0; # sequence index to zero for first block
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
346
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
347 while ($line = <INFILE>)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
348 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
349 # print($namefieldlen.$line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
350 $line=~tr/\r//d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
351 if ($line=~/CLUSTAL/i) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
352 if ($line=~/^\#/) {next;} # skip commentary and blank line
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
353 if ($line=~/^\/\//) {last;} # reached end of file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
354 if ($line=~/^\s*$/){ # new sequence block starts
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
355 if ($k) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
356 if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
357 $block++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
358 $n=$k;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
359 $k=0; # set sequence index to zero for next block to read
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
360 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
361 next;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
362 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
363 if ($line!~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
364 if ($line=~/^[*.: ]*$/) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
365 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
366 chomp($line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
367 if ($line!~/^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(\s+\d+)?$/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
368 die ("\nError found in Clustal format in $infile, line $.: '$line'\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
369 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
370 $name=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
371 $residues=$2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
372 print("WARNING: Found no space between name and residues in $infile, line $.: '$line'\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
373 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
374 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
375 if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
376 if ($line=~/^ss_/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
377 # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
378 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
379 $nss=$n;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
380 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
381 $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
382 $name=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
383 $residues=$2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
384 $residues=~tr/ //d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
385 $residues_per_line=length($residues);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
386 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
387 if ($block==1) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
388 $names[$k]=$name;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
389 $seqs[$k]=$residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
390 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
391 $seqs[$k].=$residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
392 if ($names[$k] ne $name) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
393 print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
394 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
395 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
396 # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
397 $k++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
398 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
399 if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
400 if (!$n) {$n=$k;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
401 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
402
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
403 # Read psi format
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
404 elsif ($informat eq "psi")
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
405 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
406 my $block=1; # number of block currently read
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
407 my $name;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
408 my $residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
409 $n=0; # sequence number of first block
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
410 $k=0; # sequence index to zero for first block
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
411
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
412 while ($line = <INFILE>)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
413 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
414 # print($namefieldlen.$line);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
415 $line=~tr/\r//d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
416 if ($line=~/^\s*$/){ # new sequence block starts
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
417 if ($k) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
418 if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
419 $block++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
420 $n=$k;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
421 $k=0; # set sequence index to zero for next block to read
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
422 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
423 next;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
424 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
425
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
426 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
427 if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
428 if ($line=~/^ss_/) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
429 # if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
430 # if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
431 $nss=$n;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
432 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
433 $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
434 $name=$1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
435 $residues=$2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
436 $residues=~tr/ //d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
437
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
438 if ($block==1) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
439 $names[$k]=$name;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
440 $seqs[$k]=$residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
441 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
442 $seqs[$k].=$residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
443 if ($names[$k] ne $name) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
444 print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
445 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
446 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
447 # printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
448 $k++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
449 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
450 if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
451 if (!$n) {$n=$k;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
452 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
453
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
454 close INFILE;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
455
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
456
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
457 # Empty input file?
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
458 if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
459
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
460
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
461
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
462 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
463 # Transforming to upper-case
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
464 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
465 if ($informat ne "a3m" && $informat ne "a2m") { # Transform to upper case if input format is not A3M or A2M
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
466 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
467 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
468
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
469 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
470 # Removing non-alphanumeric symbols
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
471 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
472 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
473 $seqs[$k]=~tr/A-Za-z0-9.~-//cd;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
474 $seqs[$k]=~tr/~/-/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
475 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
476
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
477
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
478 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
479 # Filling up with gaps '.' or deleting gaps
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
480 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
481
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
482 # Insertion of '.' only needed for a3m if: NOT -r option OR -M first OR -M <int> option
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
483 if ($informat eq "a3m" && (!$remove_inserts || $matchmode))
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
484 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
485 print("inserting gaps...\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
486 my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $i.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
487 my $j; # counts match states
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
488 my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $i after the $j'th match state
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
489 my $insert;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
490
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
491 # Determine length of longest insert
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
492 for ($k=0; $k<$n; $k++)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
493 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
494 # split into list of single match states and variable-length inserts
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
495 # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
496 # The '#' symbol is prepended to get rid of a perl bug in split
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
497 @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
498 $j=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
499 # printf("Sequence $k: $seqs[$k]\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
500 # printf("Sequence $k: @inserts\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
501
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
502 # Does sequence $k contain insert after match state j that is longer than $ngap[$j]?
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
503 foreach $insert (@inserts)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
504 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
505 if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {$len_ins[$j]=length($insert);}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
506 $j++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
507 # printf("$insert|");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
508 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
509 # printf("\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
510 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
511 my $ngap;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
512
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
513 # Fill up inserts with gaps '.' up to length $len_ins[$j]
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
514 for ($k=0; $k<$n; $k++)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
515 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
516 # split into list of single match states and variable-length inserts
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
517 @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
518 $j=0;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
519
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
520 # append the missing number of gaps after each match state
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
521 foreach $insert (@inserts)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
522 {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
523 for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
524 $j++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
525 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
526 $seqs[$k] = join("",@inserts);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
527 $seqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
528 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
529 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
530
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
531
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
532 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
533 # Match state assignment
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
534 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
535
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
536 # Default match state rule for a3m is -M first
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
537 if ($matchmode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$matchmode="first";}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
538
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
539 # Use gap rule for match state assignment?
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
540 if ($matchmode eq "gaprule") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
541
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
542 my @gaps=();
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
543 my $residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
544 my @residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
545
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
546 # Determine number of gaps per column
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
547 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
548 @residues=unpack("C*",$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
549 for (my $l=0; $l<@residues; $l++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
550 if ($residues[$l]==46 || $residues[$l]==45) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
551 if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
552 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
553 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
554 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
555
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
556 # Set columns with more gaps than $match_gaprule to lower case,
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
557 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
558 @residues=unpack("C*",$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
559 $residues="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
560 for (my $l=0; $l<@residues; $l++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
561 if (!defined $gaps[$l] || $gaps[$l]<0.01*$match_gaprule*$n) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
562 if ($residues[$l]==46) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
563 $residues .= "-";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
564 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
565 $residues .= uc(chr($residues[$l]));
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
566 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
567 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
568 if ($residues[$l]==45) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
569 $residues .= ".";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
570 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
571 $residues .= lc(chr($residues[$l]));
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
572 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
573 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
574 $seqs[$k]=$residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
575 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
576 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
577 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
578
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
579 # Use first sequence for match state assignment?
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
580 if ($matchmode eq "first") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
581
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
582 my @match=();
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
583 my $residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
584 my @residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
585
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
586
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
587 # Determine which columns have a gap in first sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
588 for ($k=0; $k<scalar(@names); $k++) { #find seed sequence
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
589 if ($names[$k]!~/^(ss_|aa_|sa_)/) {last;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
590 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
591 @residues=unpack("C*",$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
592 for (my $l=0; $l<@residues; $l++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
593 if ($residues[$l]==46 || $residues[$l]==45) {$match[$l]=0;} else {$match[$l]=1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
594 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
595
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
596 # Set columns without residue in first sequence to upper case,
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
597 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
598 @residues=unpack("C*",$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
599 $residues="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
600 for (my $l=0; $l<@residues; $l++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
601 if ($match[$l]) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
602 if ($residues[$l]==46) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
603 $residues .= "-";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
604 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
605 $residues .= uc(chr($residues[$l]));
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
606 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
607 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
608 if ($residues[$l]==45) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
609 $residues .= ".";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
610 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
611 $residues .= lc(chr($residues[$l]));
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
612 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
613 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
614 $seqs[$k]=$residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
615 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
616 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
617 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
618
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
619
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
620 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
621 # Remove gaps etc.
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
622 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
623
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
624 # Remove columns with too many gaps?
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
625 if ($remove_gapped) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
626
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
627 my @gaps=();
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
628 my $residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
629 my @residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
630
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
631 # Determine number of gaps '.' or '-' per column
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
632 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
633 @residues=unpack("C*",$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
634 for (my $l=0; $l<@residues; $l++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
635 if ($residues[$l]==45 || $residues[$l]==46) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
636 if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
637 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
638 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
639 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
640
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
641 # Remove columns with too many gaps
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
642 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
643 @residues=unpack("C*",$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
644 $residues="";
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
645 for (my $l=0; $l<@residues; $l++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
646 if (!defined $gaps[$l] || $gaps[$l]<0.01*$remove_gapped*$n) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
647 $residues .= chr($residues[$l])
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
648 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
649 $seqs[$k]=$residues;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
650 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
651 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
652 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
653
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
654 # Remove/transliterate gaps?
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
655 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/ //d;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
656 if ($remove_inserts) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
657 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
658 $seqs[$k]=~tr/a-z.//d;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
659 # printf("%s\n",$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
660 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
661 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
662
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
663
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
664 # Remove sequences which contain only gaps
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
665 my $nin=$n;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
666 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
667 if (($seqs[$k]=~tr/a-zA-Z0-9/a-zA-Z0-9/==0)) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
668 if ($v>=2) {print("Sequence contains only gaps and is removed: $names[$k]\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
669 splice(@seqs,$k,1);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
670 splice(@names,$k,1);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
671 $k--; $n--;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
672 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
673 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
674
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
675
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
676 # Cut down sequence names to $desclen characters maximum
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
677 for ($k=0; $k<$n; $k++) {$names[$k]=substr($names[$k],0,$desclen);}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
678
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
679 if ($outformat eq "a3m") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
680 # Remove all '.' gaps
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
681 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/.//d;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
682 } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
683 # Substitute all '.' by '-'
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
684 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/./-/;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
685 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
686 if ($gap ne "default") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
687 for ($k=0; $k<$n; $k++) {$seqs[$k]=~s/\./$gap/g; $seqs[$k]=~s/-/$gap/g;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
688 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
689 if ($case eq "uc") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
690 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
691 } elsif ($case eq "lc") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
692 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/A-Z/a-z/;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
693 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
694
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
695
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
696
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
697 ####################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
698 # Check that sequences have same length
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
699 ####################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
700 if ($outformat ne "a3m" && $outformat ne "ufas") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
701 my $len=length($seqs[0]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
702 for($k=1; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
703 if (length($seqs[$k])!=$len) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
704 printf("\nError: Sequences in $infile do not all have same length, e.g. >%-.20s (len=%i) and >%-.20s (len=%i)\n",
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
705 $names[0],$len,$names[$k],length($seqs[$k]));
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
706 if ($v>=3) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
707 printf("%.20s %s\n%.20s %s\n\n",$names[0],$seqs[0],$names[$k],$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
708 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
709 exit 1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
710 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
711 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
712 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
713
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
714
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
715 ####################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
716 # Remove html tags
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
717 ####################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
718 for($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
719 $names[$k]=~s/<[A-Za-z\/].*?>//g; # remove html tags
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
720 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
721
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
722
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
723
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
724 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
725 # Output part
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
726 ################################################################################################
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
727
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
728 my $ndssp=-1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
729 my $nsa=-1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
730 my $npred=-1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
731 my $nconf=-1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
732 my $nquery=-1;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
733 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
734 if ($names[$k]=~/^ss_dssp/){$ndssp=$k; }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
735 elsif ($names[$k]=~/^sa_dssp/){$nsa=$k; }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
736 elsif ($names[$k]=~/^ss_pred/){$npred=$k; }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
737 elsif ($names[$k]=~/^ss_conf/){$nconf=$k; }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
738 elsif ($nquery==-1 && $names[$k]!~/^aa_/) {$nquery=$k;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
739 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
740
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
741 #Write sequences into output file
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
742 open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
743 if ($outformat eq "sto" || $outformat eq "psi") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
744 my $refline;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
745 if ($outformat eq "sto") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
746 print(OUTFILE "# STOCKHOLM 1.0\n\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
747
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
748 # Write reference annotation line for match states (-M first)
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
749 if (!$lname) {$lname=32;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
750 $names[$nquery]=~/^\S+\s+(.*)/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
751 printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GF DE",$1);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
752 $refline=$seqs[$nquery];
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
753 $refline=~s/[a-z]/-/g;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
754 printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GC RF",$refline);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
755 if ($ndssp>=0) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
756 printf(OUTFILE "%-32.32s %s\n","#=GC SS_cons",$seqs[$ndssp]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
757 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
758 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
759 if ($num) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
760 my $num=2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
761 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
762 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
763 $names[$k]=~s/^(\S+)\#\d+/$1/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
764 $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
765 $num++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
766 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
767 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
768 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
769 if ($k==$ndssp || $k==$npred || $k==$nconf) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
770 $names[$k] =~ /\s*(\S+)/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
771 if (!$lname) {$lname=32;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
772 printf(OUTFILE "%-$lname.$lname"."s %s\n",$1,$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
773 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
774 if ($outformat eq "sto") {print(OUTFILE "//\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
775 } elsif ($outformat eq "clu") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
776 printf(OUTFILE "CLUSTAL\n\n\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
777 if ($num) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
778 my $num=2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
779 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
780 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
781 $names[$k]=~s/^(\S+)\#\d+/$1/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
782 $names[$k]=~s/^(\S{1,10})\S*/$1\#$num/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
783 $num++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
784 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
785 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
786 while($seqs[0] ne "") { # While there are still residues left
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
787 for ($k=0; $k<scalar(@names); $k++) { # go through all sequences
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
788 $names[$k] =~ s/\s*(\S+).*/$1/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
789 $seqs[$k]=~s/(\S{1,$numres})//; # remove the leftmost (up to) 60 residues from sequence $nseq
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
790 if (!$lname) {$lname=18;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
791 printf(OUTFILE "%-$lname.$lname"."s %s\n",$names[$k],$1); # and print them after the sequence name
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
792 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
793 print(OUTFILE "\n"); # leave a blank line after each block of 60 residues
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
794 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
795 } else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
796 if ($num) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
797 my $num=2;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
798 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
799 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
800 $names[$k]=~s/^(\S+)\#\d+/$1/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
801 $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
802 # $names[$k]=~s/^(\S+)/$1\#$num/;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
803 $num++;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
804 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
805 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
806 if ($titleline ne "" && $outformat eq "a3m") {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
807 printf(OUTFILE "%s\n",$titleline);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
808 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
809 for ($k=0; $k<$n; $k++) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
810 $seqs[$k]=~s/(\S{$numres})/$1\n/g;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
811 printf(OUTFILE ">%s\n%s\n",$names[$k],$seqs[$k]);
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
812 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
813 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
814
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
815 close OUTFILE;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
816 if ($v>=2) {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
817 if ($nin==1) {print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");}
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
818 else {
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
819 if (!$nin==$n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
820 print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n");
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
821 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
822 }
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
823
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
824 return;
2277dd59b9f9 Uploaded
hammock
parents:
diff changeset
825 }