Mercurial > repos > earlhaminst > t_coffee_to_cigar
diff t_coffee_to_cigar.pl @ 0:304d1a82708f draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee_to_cigar commit e24b91d3051c422ee2a20caf8bb12f0896e2c84a-dirty
author | earlhaminst |
---|---|
date | Fri, 11 Nov 2016 06:57:26 -0500 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/perl
#
use strict;
use warnings;

# Convert a FASTA sequence alignment (e.g. T-Coffee output) into 2-column,
# tab-separated output: the first column is the FASTA id (the header line
# without its leading '>'), the second is the CIGAR line of that sequence.
#
# Usage: t_coffee_to_cigar.pl <file>
#
# In the CIGAR line every non-gap character is a match (M) and every gap
# character '-' is a deletion (D). Runs longer than one character are
# prefixed with their length; a run of exactly one character is written as
# the bare letter (e.g. "AC-G" becomes "2MDM").

# Build the CIGAR string for one aligned (gapped) sequence.
#   $sequence - alignment row, with '-' marking gaps
# Returns the CIGAR string; an empty sequence gives an empty string.
sub sequence_to_cigar {
    my ($sequence) = @_;

    my $cigar = '';

    # Each match is one maximal run of gaps or of non-gap characters. Every
    # character belongs to exactly one of the two classes, so consecutive
    # matches cover the whole sequence with nothing skipped in between.
    while ($sequence =~ /(-+|[^-]+)/g) {
        my $run        = $1;
        my $run_length = length $run;

        # A count of 1 is left implicit, matching the original output format.
        $cigar .= $run_length if $run_length > 1;
        $cigar .= substr($run, 0, 1) eq '-' ? 'D' : 'M';
    }

    return $cigar;
}

# Print one output row, "<header>\t<cigar>\n", to the currently selected
# output handle (STDOUT unless changed).
#   $header   - FASTA id (header line without the leading '>')
#   $sequence - the aligned sequence belonging to that id
sub convert_and_print {
    my ($header, $sequence) = @_;

    my $cigar = sequence_to_cigar($sequence);
    print "$header\t$cigar\n";
    return;
}

# Read the FASTA alignment named by the first argument and print one row per
# record. Dies if no file name is given or the file cannot be opened.
sub main {
    my ($file) = @_;

    die "Usage: $0 <file>\n" unless defined $file;

    open my $fh, '<', $file
        or die "Cannot open '$file' for reading: $!\n";

    # $header stays undef until the first '>' line, so "no record yet" is
    # distinguishable from a record whose id is empty or "0".
    my $header;
    my $sequence = '';

    while (my $line = <$fh>) {
        # Strip the line terminator, including the "\r" of CRLF files, which
        # would otherwise be counted as an aligned residue.
        $line =~ s/\r?\n\z//;

        if (substr($line, 0, 1) eq '>') {
            # A new record starts: flush the previous one, if any.
            convert_and_print($header, $sequence) if defined $header;
            $header   = substr($line, 1);
            $sequence = '';
        }
        else {
            # Sequences may be wrapped over several lines.
            $sequence .= $line;
        }
    }

    close $fh
        or die "Error while closing '$file': $!\n";

    # Flush the last record. Nothing is printed for input without any header.
    convert_and_print($header, $sequence) if defined $header;

    return;
}

# Run only when executed as a script, so the subs can be loaded and tested.
main(@ARGV) unless caller;

1;