Mercurial > repos > hammock > hammock

#! /usr/bin/env perl

# reformat.pl
# Reformat a multiple alignment file
#
#     HHsuite version 2.0.16 (January 2013)
#
#     Reference:
#     Remmert M., Biegert A., Hauser A., and Soding J.
#     HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
#     Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).

#     (C) Johannes Soeding, 2012

#     This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.

#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.

#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.

#     We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de

use lib $ENV{"HHLIB"}."/scripts";
use HHPaths;   # config file with path variables for nr, blast, psipred, pdb, dssp etc.
use strict;
use warnings;
my $numres=100;             # number of residues per line
my $desclen=1000;           # maximum number of characters in nameline
my $ARGC=scalar(@ARGV);
if ($ARGC<2) {
    die("
reformat.pl from HHsuite $VERSION
Read a multiple alignment in one format and write it in another format
Usage: reformat.pl [informat] [outformat] infile outfile [options]
  or   reformat.pl [informat] [outformat] 'fileglob' .ext [options]

Available input formats:
   fas:     aligned fasta; lower and upper case equivalent, '.' and '-' equivalent
   a2m:     aligned fasta; inserts: lower case, matches: upper case, deletes: '-',
            gaps aligned to inserts: '.'
   a3m:     like a2m, but gaps aligned to inserts MAY be omitted
   sto:     Stockholm format; sequences in several blocks with sequence name at
            beginning of line (hmmer output)
   psi:     format as read by PSI-BLAST using the -B option (like sto with -M first -r)
   clu:     Clustal format; sequences in several blocks with sequence name at beginning
            of line
Available output formats:
   fas:     aligned fasta; all gaps '-'
   a2m:     aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps
            aligned to inserts: '.'
   a3m:     like a2m, but gaps aligned to inserts are omitted
   sto:     Stockholm format; sequences in just one block, one line per sequence
   psi:     format as read by PSI-BLAST using the -B option
   clu:     Clustal format
If no input or output format is given the file extension is interpreted as format
specification ('aln' as 'clu')

Options:
  -v int    verbose mode (0:off, 1:on)
  -num      add number prefix to sequence names: 'name', '1:name' '2:name' etc
  -noss     remove secondary structure sequences (beginning with >ss_)
  -sa       do not remove solvent accessibility sequences (beginning with >sa_)
  -M first  make all columns with residue in first seuqence match columns
            (default for output format a2m or a3m)
  -M int    make all columns with less than X% gaps match columns
            (for output format a2m or a3m)
  -r        remove all lower case residues (insert states)
            (AFTER -M option has been processed)
  -r int    remove all lower case columns with more than X% gaps
  -g ''     suppress all gaps
  -g '-'    write all gaps as '-'
  -uc       write all residues in upper case (AFTER all other options have been processed)
  -lc       write all residues in lower case (AFTER all other options have been processed)
  -l        number of residues per line (for Clustal, FASTA, A2M, A3M formats)
            (default=$numres)
  -d        maximum number of characers in nameline (default=$desclen)

Examples: reformat.pl 1hjra.a3m 1hjra.a2m
          (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m)
          reformat.pl test.a3m test.fas -num -r 90
          reformat.pl fas sto '*.fasta' .stockholm
\n");
#  clu:  clustal format (hmmer output)
#  sel:  Selex format
#  phy:  Phylip format
}

my $informat="";
my $outformat="";
my $infile="";
my $outfile="";
my $num=0;                    # don't use sequence number as prefix: '>n|name'
my $noss=0;                   # don't remove secondary structure
my $nosa=1;                   # remove solvent accessibility sequences
my $line;
my $options="";
my @names;   # names of sequences read in
my @seqs;    # residues of sequences read in
my $n;       # number of sequences read in
my $k;       # counts sequences for output
my $remove_inserts=0;
my $remove_gapped=0;
my $matchmode=""; # do not change capitalization
my $match_gaprule=0;   # columns with less than this percentage of gaps will be match columns
my $v=2;
my $update=0;
my $nss=-1;  # index of secondary structure sequence
my $lname;   # length of sequence name in clustal, stockholm, psi format
my $titleline; # first line beginning with "#" in A3M, A2M, or FASTA files

my @informats=  ("fas","a2m","a3m","sto","psi","clu");
my @outformats= ("fas","a2m","a3m","sto","psi","clu","ufas");
my $found;
my $element;
my $gap="default";
my $case="default";

# Process optionsz
for (my $i=0; $i<$ARGC; $i++) {$options.=" $ARGV[$i] ";}
if ($options=~s/ -i\s+(\S+) / /) {$infile=$1;}
if ($options=~s/ -o\s+(\S+) / /) {$outfile=$1;}
if ($options=~s/ -num / /)    {$num=1; $desclen=505;}
if ($options=~s/ -noss / /)   {$noss=1;}
if ($options=~s/ -sa / /)     {$nosa=0;}
if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;}
if ($options=~s/ -r\s+(\d+) / /) {$remove_gapped=$1;}
if ($options=~s/ -r / /)     {$remove_inserts=1;}
if ($options=~s/ -lc / /)    {$case="lc";}
if ($options=~s/ -uc / /)    {$case="uc";}
if ($options=~s/ -v\s*(\d+) / /) {$v=$1;}
if ($options=~s/ -v / /) {$v=2;}
if ($options=~s/ -M\s+(\d+) / /) {$matchmode="gaprule"; $match_gaprule=$1;}
if ($options=~s/ -M\s+first / /) {$matchmode="first";   $match_gaprule=$1;}
if ($options=~s/ -u / /) {$update=1;}
if ($options=~s/ -l\s+(\S+) / /) {$numres=$1;}
if ($options=~s/ -lname\s+(\S+) / /) {$lname=$1;}
if ($options=~s/ -d\s+(\S+) / /) {$desclen=$1;}

# Assign informat, outformat, infile, and outfile
if ($outfile eq "") {
    if ($options=~s/(\S+)\s*$//) {
	$outfile=$1;
    } else {
	die("Error: no output file given: '$options'\n");
    }
}
if ($infile eq "") {
    if ($options=~s/(\S+)\s*$//) {
	$infile=$1;
    } else {
	die("Error: no input file given: '$options'\n");
    }
}
if ($options=~s/(\S+)\s*$//) {
    $outformat=$1;
} else {
    if ($outfile=~/\S*\.(\S+?)$/) {
	$outformat=lc($1);
	if    ($outformat eq "aln")   {$outformat="clu";}
	elsif ($outformat eq "fa")    {$outformat="fas";}
	elsif ($outformat eq "fasta") {$outformat="fas";}
	elsif ($outformat eq "afa")   {$outformat="fas";}
	elsif ($outformat eq "afas")  {$outformat="fas";}
	elsif ($outformat eq "afasta") {$outformat="fas";}
    } else {
	print ("Using FASTA output format: '$options'\n"); $outformat="fas";
    }
}
if ($options=~s/(\S+)\s*$//) {
    $informat=$1;
} else {
    if ($infile=~/\S*\.(\S+?)$/) {
	$informat=lc($1);
	if    ($informat eq "aln")   {$informat="clu";}
	elsif ($informat eq "fa")    {$informat="fas";}
	elsif ($informat eq "fasta") {$informat="fas";}
    } else {
	print ("Using FASTA input format: '$options'\n"); $informat="fas";
    }
}


# Warn if unknown options found
if ($options!~/^\s*$/) {
    $options=~s/^\s*(.*?)\s*$/$1/g;
    print("\nWARNING: unknown options '$options'\n");
}

# Check if input and output formats are valid
$found=0;
foreach $element (@informats) {if ($informat eq $element) {$found=1; last;}}
if(!$found) {die("\nError: $informat is not a valid input format option\n");}
$found=0;
foreach $element (@outformats) {if ($outformat eq $element) {$found=1; last;}}
if(!$found) {die("\nError: $outformat is not a valid output format option\n");}

#if($outformat eq "psi") {
#   $remove_inserts=1;
#}
if($outformat eq "ufas") {$gap="";}


if ($infile=~/\*/ || $outfile=~/^\./) # if infile contains '*' or outfile is just an extension
{
    $outfile=~/.*\.(\S*)$/;
    my $outext=$1;
    my @infiles=glob($infile);
    printf("%i files to reformat\n",scalar(@infiles));
    foreach $infile (@infiles)
    {
	if ($infile!~/(\S+)\.\S+/) {$infile=~/(\S+)/}
	$outfile="$1.$outext";
	if ($update && -e $outfile) {next;}
	if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
	&reformat($infile,$outfile);
    }
    exit 0;
}
else
{
    if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
    &reformat($infile,$outfile);
    exit 0;
}


################################################################################################
# Reformat a single file
################################################################################################
sub reformat()
{
    $infile=$_[0];
    $nss=-1;
    $titleline="";

################################################################################################
# Input part
################################################################################################

    my $skip=0;
    open (INFILE,"<$infile") or die ("ERROR: cannot open $infile: $!\n");

    # Read a2m, a3m, fas format
    if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m")
    {
	$/=">"; # set input field seperator
	$n=0;
	my $seq=<INFILE>;
	if ($seq=~s/^(\#.*)//) {$titleline=$1;}          # remove commentary lines at beginning of file
	$seq=~s/(\n\#.*)*\n//;    # remove commentary lines at beginning of file

	# If first line has no sequence name, use >root_name
	if ($seq ne ">") {
	    $infile="/$infile.";        # determine root name 1
	    $infile=~/^.*\/(.*?)\..*/;  # determine root name 2
	    $names[$n]=$1;        # don't move this line away from previous line $seq=~s/([^\n]*)//;
	    $seq=~tr/\n //d;      # remove newlines and blanks
	    $seqs[$n]=$seq;
	    $n++;
	}

	while ($seq=<INFILE>) {
	    $seq=~s/\n\#.*//g;    # remove commentary lines
	    while ($seq=~s/(.)>/$1/) {$seq.=<INFILE>;}  # in the case that nameline contains a '>'; '.' matches anything except '\n'
	    if ($seq=~/^aa_/) {next;}
	    if ($seq=~/^sa_/ && $nosa) {next;}
	    if ($seq=~/^ss_/) {
		if ($noss) {next;}             # do not read in >ss_ and >sa_ sequences
#		if ($seq=~/^ss_conf/)  {next;} # comment out to read sequence with confidence values
#		if ($nss>=0) {next;}           # comment out: allow for two or more >ss_dssp and >ss_pred lines
 		$nss=$n;
	    }
	    $seq=~s/^\s*(.*)//;        # divide into nameline and residues; '.' matches anything except '\n'
	    if (defined $1 && $1 ne "") {
		$names[$n]=$1;        # don't move this line away from previous line $seq=~s/([^\n]*)//;
	    } else {
		$names[$n]=$n;
	    }
	    $seqs[$n]=$seq;
	    $n++;
	}

	$/="\n"; # reset input field seperator
    }

    # Read Stockholm format
    elsif ($informat eq "sto")
    {
	my %seqhash;
	my $name;
	my $first_block=1;

	$n=0;
	while ($line = <INFILE>)
	{
	    $line=~tr/\r//d;
	    $line=~s/\s+/ /g;
	    if ($line=~s/^\#=GC SS_cons/ss_dssp/) {}        # skip commentary and blank line
	    if ($line=~/^\#/) {next;}                       # skip commentary and blank line
	    if ($line=~/^\/\//) {last;}                     # reached end of file
	    if ($line=~/^\s*$/){$first_block=0; next;}      # new sequence block starts
	    if ($line!~/^\s*(\S+)\s+(\S+)/) {
		die ("\nERROR found in stockholm format: $!");
	    }
	    if (!(exists $seqhash{$1}))
	    {
		if ($line=~/^aa_/) {next;}          # do not read in >aa_ sequences
		if ($line=~/^sa_/ && $nosa) {next;} # do not read in >sa_ sequences
		if ($line=~/^ss_/) {
		    if ($noss) {next;} # do not read in >ss_ and >aa_ sequences
#	  	    if ($line=~/^ss_conf/)  {next;} # comment out to read sequence with confidence values
#		    if ($nss>=0) {next;}  # comment out: allow for two or more >ss_dssp and >ss_pred lines
		    $nss=$n;
		}
		$line=~/^\s*(\S+)\s+(\S+)/;
		$names[$n]=$1;
		$seqs[$n]=$2;
		$seqhash{$1}=$n++;
		$first_block=1;
	    }
	    else
	    {
		if ($first_block) {die ("\nERROR: sequence $1 appears more than once per block\n");}
		$seqs[$seqhash{$1}].=$2;
	    }
#	    printf("%3i %s\n",$n-1,$seqs[$n-1]);
	}
    }

    elsif ($informat eq "clu")
    {
	my $residues_per_line=50; # default number of characters for namefield residues
                             # (only needed if no gap between name and residues in first sequence -> SMART)
	my $block=1;         # number of block currently read
	my $name;
	my $residues;
	$n=0;                # sequence number of first block
	$k=0;                # sequence index to zero for first block

	while ($line = <INFILE>)
	{
#	    print($namefieldlen.$line);
	    $line=~tr/\r//d;
	    if ($line=~/CLUSTAL/i) {next;}
	    if ($line=~/^\#/) {next;}                     # skip commentary and blank line
	    if ($line=~/^\/\//) {last;}                   # reached end of file
	    if ($line=~/^\s*$/){                          # new sequence block starts
		if ($k) {
		    if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
		    $block++;
		    $n=$k;
		    $k=0;  # set sequence index to zero for next block to read
		}
		next;
	    }
	    if ($line!~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/) {
		if ($line=~/^[*.: ]*$/) {next;}
		if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
		chomp($line);
		if ($line!~/^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(\s+\d+)?$/) {
		    die ("\nError found in Clustal format in $infile, line $.: '$line'\n");
		}
		$name=$1;
		$residues=$2;
		print("WARNING: Found no space between name and residues in $infile, line $.: '$line'\n");
	    } else {
		if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
		if ($line=~/^aa_/ || $line=~/^sa_/) {next;}     # do not read in >ss_ and >aa_ sequences
		if ($line=~/^ss_/) {
#		    if ($line=~/^ss_conf/)  {next;} # comment out to read sequence with confidence values
#		    if ($nss>=0) {next;}  # comment out: allow for two or more >ss_dssp and >ss_pred lines
		    $nss=$n;
		}
		$line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
		$name=$1;
		$residues=$2;
		$residues=~tr/ //d;
		$residues_per_line=length($residues);
	    }
	    if ($block==1) {
		$names[$k]=$name;
		$seqs[$k]=$residues;
	    } else {
		$seqs[$k].=$residues;
		if ($names[$k] ne $name) {
		    print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
		}
	    }
#	    printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
	    $k++;
	}
	if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
	if (!$n) {$n=$k;}
    }

   # Read psi format
    elsif ($informat eq "psi")
    {
	my $block=1;         # number of block currently read
	my $name;
	my $residues;
	$n=0;                # sequence number of first block
	$k=0;                # sequence index to zero for first block

	while ($line = <INFILE>)
	{
#	    print($namefieldlen.$line);
	    $line=~tr/\r//d;
	    if ($line=~/^\s*$/){                          # new sequence block starts
		if ($k) {
		    if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
		    $block++;
		    $n=$k;
		    $k=0;  # set sequence index to zero for next block to read
		}
		next;
	    }

	    if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
	    if ($line=~/^aa_/ || $line=~/^sa_/) {next;}     # do not read in >ss_ and >aa_ sequences
	    if ($line=~/^ss_/) {
#		    if ($line=~/^ss_conf/)  {next;} # comment out to read sequence with confidence values
#		    if ($nss>=0) {next;}  # comment out: allow for two or more >ss_dssp and >ss_pred lines
		$nss=$n;
	    }
	    $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
	    $name=$1;
	    $residues=$2;
	    $residues=~tr/ //d;

	    if ($block==1) {
		$names[$k]=$name;
		$seqs[$k]=$residues;
	    } else {
		$seqs[$k].=$residues;
		if ($names[$k] ne $name) {
		    print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
		}
	    }
#	    printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
	    $k++;
	}
	if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
	if (!$n) {$n=$k;}
    }

    close INFILE;


    # Empty input file?
    if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");}


################################################################################################
# Transforming to upper-case
################################################################################################
    if ($informat ne "a3m" && $informat ne "a2m") {	# Transform to upper case if input format is not A3M or A2M
	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
    }

################################################################################################
# Removing non-alphanumeric symbols
################################################################################################
    for ($k=0; $k<$n; $k++) {
	$seqs[$k]=~tr/A-Za-z0-9.~-//cd;
	$seqs[$k]=~tr/~/-/;
    }


################################################################################################
# Filling up with gaps '.' or deleting gaps
################################################################################################

    # Insertion of '.' only needed for a3m if: NOT -r option  OR  -M first  OR  -M <int> option
    if ($informat eq "a3m" && (!$remove_inserts || $matchmode))
    {
	print("inserting gaps...\n");
	my @len_ins;   # $len_ins[$j] will count the maximum number of inserted residues after match state $i.
	my $j;       # counts match states
	my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $i after the $j'th match state
	my $insert;

	# Determine length of longest insert
	for ($k=0; $k<$n; $k++)
	{
	    # split into list of single match states and variable-length inserts
	    # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements
	    # The '#' symbol is prepended to get rid of a perl bug in split
	    @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
	    $j=0;
#	printf("Sequence $k: $seqs[$k]\n");
#	printf("Sequence $k: @inserts\n");

	    # Does sequence $k contain insert after match state j that is longer than $ngap[$j]?
	    foreach $insert (@inserts)
	    {
		if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {$len_ins[$j]=length($insert);}
		$j++;
#	    printf("$insert|");
	    }
#	printf("\n");
	}
	my $ngap;

	# Fill up inserts with gaps '.' up to length $len_ins[$j]
	for ($k=0; $k<$n; $k++)
	{
	    # split into list of single match states and variable-length inserts
	    @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
	    $j=0;

	    # append the missing number of gaps after each match state
	    foreach $insert (@inserts)
	    {
		for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";}
		$j++;
	    }
	    $seqs[$k] = join("",@inserts);
	    $seqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
	}
    }


################################################################################################
# Match state assignment
################################################################################################

    # Default match state rule for a3m is -M first
    if ($matchmode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$matchmode="first";}

    # Use gap rule for match state assignment?
    if ($matchmode eq "gaprule") {

	my @gaps=();
	my $residues;
	my @residues;

	# Determine number of gaps per column
	for ($k=0; $k<$n; $k++) {
	    @residues=unpack("C*",$seqs[$k]);
	    for (my $l=0; $l<@residues; $l++) {
		if ($residues[$l]==46 || $residues[$l]==45) {
		    if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
		}
	    }
	}

	# Set columns with more gaps than $match_gaprule to lower case,
	for ($k=0; $k<$n; $k++) {
	    @residues=unpack("C*",$seqs[$k]);
	    $residues="";
	    for (my $l=0; $l<@residues; $l++) {
		if (!defined $gaps[$l] || $gaps[$l]<0.01*$match_gaprule*$n) {
		    if ($residues[$l]==46) {
			$residues .= "-";
		    } else {
			$residues .= uc(chr($residues[$l]));
		    }
		} else {
		    if ($residues[$l]==45) {
			$residues .= ".";
		    } else {
			$residues .= lc(chr($residues[$l]));
		    }
		}
		$seqs[$k]=$residues;
	    }
	}
    }

    # Use first sequence for match state assignment?
    if ($matchmode eq "first") {

	my @match=();
	my $residues;
	my @residues;


	# Determine which columns have a gap in first sequence
	for ($k=0; $k<scalar(@names); $k++) {  #find seed sequence
	    if ($names[$k]!~/^(ss_|aa_|sa_)/) {last;}
	}
	@residues=unpack("C*",$seqs[$k]);
	for (my $l=0; $l<@residues; $l++) {
	    if ($residues[$l]==46 || $residues[$l]==45) {$match[$l]=0;} else {$match[$l]=1;}
	}

	# Set columns without residue in first sequence to upper case,
	for ($k=0; $k<$n; $k++) {
	    @residues=unpack("C*",$seqs[$k]);
	    $residues="";
	    for (my $l=0; $l<@residues; $l++) {
		if ($match[$l]) {
		    if ($residues[$l]==46) {
			$residues .= "-";
		    } else {
			$residues .= uc(chr($residues[$l]));
		    }
		} else {
		    if ($residues[$l]==45) {
			$residues .= ".";
		    } else {
			$residues .= lc(chr($residues[$l]));
		    }
		}
		$seqs[$k]=$residues;
	    }
	}
    }


################################################################################################
# Remove gaps etc.
################################################################################################

    # Remove columns with too many gaps?
    if ($remove_gapped) {

	my @gaps=();
	my $residues;
	my @residues;

	# Determine number of gaps '.' or '-' per column
	for ($k=0; $k<$n; $k++) {
	    @residues=unpack("C*",$seqs[$k]);
	    for (my $l=0; $l<@residues; $l++) {
		if ($residues[$l]==45 || $residues[$l]==46) {
		    if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
		}
	    }
	}

	# Remove columns with too many gaps
	for ($k=0; $k<$n; $k++) {
	    @residues=unpack("C*",$seqs[$k]);
	    $residues="";
	    for (my $l=0; $l<@residues; $l++) {
		if (!defined $gaps[$l] || $gaps[$l]<0.01*$remove_gapped*$n) {
		    $residues .= chr($residues[$l])
		}
		$seqs[$k]=$residues;
	    }
	}
    }

    # Remove/transliterate gaps?
    for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/ //d;}
    if ($remove_inserts) {
	for ($k=0; $k<$n; $k++) {
	    $seqs[$k]=~tr/a-z.//d;
#	    printf("%s\n",$seqs[$k]);
	}
    }


    # Remove sequences which contain only gaps
    my $nin=$n;
    for ($k=0; $k<$n; $k++) {
	if (($seqs[$k]=~tr/a-zA-Z0-9/a-zA-Z0-9/==0)) {
	    if ($v>=2) {print("Sequence contains only gaps and is removed: $names[$k]\n");}
	    splice(@seqs,$k,1);
	    splice(@names,$k,1);
	    $k--; $n--;
	}
    }


    # Cut down sequence names to $desclen characters maximum
    for ($k=0; $k<$n; $k++) {$names[$k]=substr($names[$k],0,$desclen);}

    if ($outformat eq "a3m") {
	# Remove all '.' gaps
	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/.//d;}
    } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) {
	# Substitute all '.' by '-'
	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/./-/;}
    }
    if ($gap ne "default") {
	for ($k=0; $k<$n; $k++) {$seqs[$k]=~s/\./$gap/g; $seqs[$k]=~s/-/$gap/g;}
    }
    if ($case eq "uc") {
	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
    } elsif ($case eq "lc") {
	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/A-Z/a-z/;}
    }


    ####################################################
    # Check that sequences have same length
    ####################################################
    if ($outformat ne "a3m" && $outformat ne "ufas") {
	my $len=length($seqs[0]);
	for($k=1; $k<$n; $k++) {
  	    if (length($seqs[$k])!=$len) {
		printf("\nError: Sequences in $infile do not all have same length, e.g. >%-.20s  (len=%i)  and  >%-.20s  (len=%i)\n",
		       $names[0],$len,$names[$k],length($seqs[$k]));
		if ($v>=3) {
		    printf("%.20s %s\n%.20s %s\n\n",$names[0],$seqs[0],$names[$k],$seqs[$k]);
		}
		exit 1;
	    }
	}
    }


    ####################################################
    # Remove html tags
    ####################################################
    for($k=0; $k<$n; $k++) {
	$names[$k]=~s/<[A-Za-z\/].*?>//g; # remove html tags
    }


################################################################################################
# Output part
################################################################################################

    my $ndssp=-1;
    my $nsa=-1;
    my $npred=-1;
    my $nconf=-1;
    my $nquery=-1;
    for ($k=0; $k<$n; $k++) {
	if ($names[$k]=~/^ss_dssp/){$ndssp=$k; }
	elsif ($names[$k]=~/^sa_dssp/){$nsa=$k; }
	elsif ($names[$k]=~/^ss_pred/){$npred=$k; }
	elsif ($names[$k]=~/^ss_conf/){$nconf=$k; }
	elsif ($nquery==-1 && $names[$k]!~/^aa_/) {$nquery=$k;}
    }

    #Write sequences into output file
    open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n");
    if ($outformat eq "sto" || $outformat eq "psi") {
	my $refline;
	if ($outformat eq "sto") {
	    print(OUTFILE "# STOCKHOLM 1.0\n\n");

	    # Write reference annotation line for match states (-M first)
	    if (!$lname) {$lname=32;}
	    $names[$nquery]=~/^\S+\s+(.*)/;
	    printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GF DE",$1);
	    $refline=$seqs[$nquery];
	    $refline=~s/[a-z]/-/g;
	    printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GC RF",$refline);
	    if ($ndssp>=0) {
		printf(OUTFILE "%-32.32s %s\n","#=GC SS_cons",$seqs[$ndssp]);
	    }
	}
	if ($num) {
	    my $num=2;
	    for ($k=0; $k<$n; $k++) {
		if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
		$names[$k]=~s/^(\S+)\#\d+/$1/;
		$names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
		$num++;
	    }
	}
	for ($k=0; $k<$n; $k++) {
	    if ($k==$ndssp || $k==$npred || $k==$nconf) {next;}
	    $names[$k] =~ /\s*(\S+)/;
	    if (!$lname) {$lname=32;}
	    printf(OUTFILE "%-$lname.$lname"."s %s\n",$1,$seqs[$k]);
	}
	if ($outformat eq "sto") {print(OUTFILE "//\n");}
    } elsif ($outformat eq "clu") {
	printf(OUTFILE "CLUSTAL\n\n\n");
	if ($num) {
	    my $num=2;
	    for ($k=0; $k<$n; $k++) {
		if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
		$names[$k]=~s/^(\S+)\#\d+/$1/;
		$names[$k]=~s/^(\S{1,10})\S*/$1\#$num/;
		$num++;
	    }
	}
	while($seqs[0] ne "") {                         # While there are still residues left
	    for ($k=0; $k<scalar(@names); $k++) {       # go through all sequences
		$names[$k] =~ s/\s*(\S+).*/$1/;
		$seqs[$k]=~s/(\S{1,$numres})//;           # remove the leftmost (up to) 60 residues from sequence $nseq
		if (!$lname) {$lname=18;}
		printf(OUTFILE "%-$lname.$lname"."s %s\n",$names[$k],$1); # and print them after the sequence name
	    }
	    print(OUTFILE "\n");                            # leave a blank line after each block of 60 residues
	}
    } else {
	if ($num) {
	    my $num=2;
	    for ($k=0; $k<$n; $k++) {
		if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
		$names[$k]=~s/^(\S+)\#\d+/$1/;
		$names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
#		$names[$k]=~s/^(\S+)/$1\#$num/;
		$num++;
	    }
	}
	if ($titleline ne "" && $outformat eq "a3m") {
	    printf(OUTFILE "%s\n",$titleline);
	}
	for ($k=0; $k<$n; $k++) {
	    $seqs[$k]=~s/(\S{$numres})/$1\n/g;
	    printf(OUTFILE ">%s\n%s\n",$names[$k],$seqs[$k]);
	}
    }

    close OUTFILE;
    if ($v>=2) {
	if ($nin==1) {print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");}
	else {
	    if (!$nin==$n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); }
	    print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n");
	}
    }

    return;
}
author	hammock
date	Wed, 01 Nov 2017 05:54:28 -0400
parents
children