view breakMAF.pl @ 3:d49b9759e294 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/rnacode commit 3abc213e109bb564de7dee75d71d90eb1bf78c7e
author rnateam
date Thu, 15 Nov 2018 01:24:06 -0500
parents 434332033e82
children
line wrap: on
line source

#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use POSIX qw(ceil floor);

my $maxLength     = 400;
my $desiredLength = 200;
my $help          = 0;

GetOptions(
  "maxLength:i"     => \$maxLength,
  "desiredLength:i" => \$desiredLength,
  "help"            => \$help,
  "h"               => \$help
);

if ($help) {
  print STDERR "\nbreakMAF [options] < input.maf > output.maf\n\n";
  print STDERR "Options:\n";
  print STDERR "  maxLength:     Break all blocks longer than that (default: 400 columns)\n";
  print STDERR "  desiredLength: Try to create blocks of this size (default: 200 columns)\n";
  exit(1);
}

$/ = 'a score=';

while ( my $block = <> ) {

  $block = 'a score=' . $block;

  my $currAln = readMAF($block)->[0];

  next if not defined($currAln);

  my $length = length( $currAln->[0]->{seq} );

  if ( $length > $maxLength ) {

    my $breakN      = int( $length / $desiredLength );
    my $chunkLength = ceil( $length / $breakN );

    my $from = 0;

    while (1) {
      my $to = $from + $chunkLength;

      if ( $to > $length ) {
        $to = $length;
      }

      my $slice = sliceAlnByColumn( $currAln, $from, $to );

      print formatAln( $slice, 'maf' ), "\n";

      $from = $to;

      last if $from == $length;
    }

  } else {
    print formatAln( $currAln, 'maf' ), "\n";
  }
}

######################################################################
#
# readMAF($string)
#
# Converts the MAF in string to internal alignment format. Returns
# list of usual array references.
#
######################################################################

sub readMAF {

  my $string = shift;

  return [] if $string eq '';

  my @input = split( "\n", $string );

  #open(FILE,"<$file") || die("Could not read $file ($!)");

  my @outAlns = ();
  my @aln     = ();

  foreach my $i ( 0 .. $#input ) {

    $_ = $input[$i];

    next if (/^\s?\#/);
    next if (/^\s?a/);

    if (/^\s?s/) {
      ( my $dummy, my $name, my $start, my $length, my $strand, my $fullLength, my $seq ) = split;

      my $end = $start + $length;

      ( my $org, my $chrom ) = split( /\./, $name );

      push @aln, {
        name   => $name,
        org    => $org,
        chrom  => $chrom,
        start  => $start,
        end    => $end,
        seq    => $seq,
        strand => $strand
        };
    }

    if ( /^\s?$/ and @aln ) {
      push @outAlns, [@aln];
      @aln = ();
    }
    if ( ( not defined $input[ $i + 1 ] ) and (@aln) ) {
      push @outAlns, [@aln];
      @aln = ();
    }
  }
  return \@outAlns;
}

sub formatAln {

  my @aln = @{ $_[0] };

  my $format = $_[1];

  $format = lc($format);

  my @alnSeqs  = ();
  my @alnNames = ();

  my $counter = 1;

  foreach my $row (@aln) {

    my $name = "seq$counter";
    $counter++;
    if ( $row->{name} ) {
      $name = ( $row->{name} );
    } elsif ( $row->{org} and $row->{chrom} ) {
      $name = "$row->{org}.$row->{chrom}";
    }

    my $start  = $row->{start};
    my $end    = $row->{end};
    my $strand = $row->{strand};

    my $pos = '';

    if (  defined $start
      and defined $end
      and defined $strand
      and ( $format ne 'phylip' ) ) {
      ( $start, $end ) = ( $end, $start ) if ( $strand eq '-' );
      $pos = "/$start-$end";
    }

    push @alnNames, "$name$pos";
    push @alnSeqs,  $row->{seq};

  }

  my $output = '';

  if ( $format eq 'clustal' ) {

    $output = "CLUSTAL W(1.81) multiple sequence alignment\n\n\n";
    my $maxName = 0;

    foreach my $name (@alnNames) {
      $maxName = ( $maxName < length($name) ) ? length($name) : $maxName;
    }

    for my $i ( 0 .. $#alnNames ) {
      my $buffer = " " x ( ( $maxName + 6 ) - length( $alnNames[$i] ) );
      $alnNames[$i] .= $buffer;
    }
    my $columnWidth = 60;
    my $currPos     = 0;
    my $length      = length( $alnSeqs[0] );

    while ( $currPos < $length ) {
      for my $i ( 0 .. $#alnNames ) {
        $output .= $alnNames[$i];
        $output .= substr( $alnSeqs[$i], $currPos, $columnWidth );
        $output .= "\n";
      }
      $output .= "\n\n";
      $currPos += $columnWidth;
    }
  } elsif ( $format eq 'fasta' ) {
    foreach my $i ( 0 .. $#alnNames ) {
      my $name = $alnNames[$i];
      my $seq  = $alnSeqs[$i];
      $seq =~ s/(.{60})/$1\n/g;
      $output .= ">$name\n$seq\n";
    }
  } elsif ( $format eq 'phylip' ) {
    my $length = length( $alnSeqs[0] );
    my $N      = @alnSeqs;
    $output .= "  $N $length\n";
    foreach my $i ( 0 .. $#alnNames ) {
      my $name = $alnNames[$i];
      my $seq  = $alnSeqs[$i];
      $seq =~ s/(.{60})/$1\n/g;
      $output .= "$name\n$seq\n";
    }
  } elsif ( $format eq 'maf' ) {
    $output .= "a score=0\n";
    foreach my $row (@aln) {
      my $length = $row->{end} - $row->{start};
      $output .= "s $row->{org}.$row->{chrom} $row->{start} $length $row->{strand} 0 $row->{seq}\n";
    }
  }
  return $output;

}

######################################################################
#
# sliceAlnByColumn(\@aln ref-to-alignment, $start int, $end int)
#
# Returns slice of alignment specified by alignment column.
#
# \@aln ... alignment in list of hash format
# $start, $end ... slice to cut
#
# Returns reference to alignment in list of hash format. This is a new
# alignment, i.e. the input is not sliced in place
#
######################################################################

sub sliceAlnByColumn {

  my @aln = @{ $_[0] };
  shift;
  ( my $start, my $end ) = @_;

  # correct ends without warning if outside of valid range
  $start = 0 if ( $start < 0 );
  $end = length( $aln[0]->{seq} ) if ( $end > length( $aln[0]->{seq} ) );

  #my @newAln=@aln;

  # make deep copy of list of hash
  my @newAln = ();
  foreach (@aln) {
    push @newAln, { %{$_} };
  }

  foreach my $i ( 0 .. $#newAln ) {

    my $oldStart = $newAln[$i]->{start};
    my $oldEnd   = $newAln[$i]->{end};

    $newAln[$i]->{start} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $start );
    $newAln[$i]->{end}   = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $end - 1 ) + 1;
    $newAln[$i]->{seq} = substr( $newAln[$i]->{seq}, $start, $end - $start );

  }

  return ( [@newAln] );

}

######################################################################
#
# alnCol2genomePos($seq string, $start int, $col int)
#
# Calculates the genomic position corresponding to a column in an
# alignment.
#
# $seq ... sequence from alignment (i.e. letters with gaps)
# $start ... Genomic position of first letter in $seq
# $col ... column in the alignment that is to be mapped
#
# Returns genomic position. No error handling, so $col must be a valid
# column of the string $seq.
#
#######################################################################

sub alnCol2genomePos {

  ( my $seq, my $start, my $col ) = @_;

  $seq =~ s/\./-/g;    #Convert all gaps to "-"

  my $newPos = $start;

  # if gap only...
  return $start if ( $seq =~ /^-+$/ );

  ( my $tmp ) = $seq =~ /(-*)[^-]/;

  my $leadingGaps = length($tmp);

  # if column is in gap before first letter,
  # return position of the first letter
  return $start if ( $col < $leadingGaps );

  $newPos = $start - 1;

  for my $i ( $leadingGaps .. $col ) {
    $newPos++ if ( ( my $t = substr( $seq, $i, 1 ) ) ne '-' );
  }
  return $newPos;
}