Mercurial > repos > earlhaminst > t_coffee_to_cigar

diff t_coffee_to_cigar.pl @ 0:304d1a82708f draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee_to_cigar commit e24b91d3051c422ee2a20caf8bb12f0896e2c84a-dirty
author: earlhaminst
date: Fri, 11 Nov 2016 06:57:26 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/t_coffee_to_cigar.pl	Fri Nov 11 06:57:26 2016 -0500
@@ -0,0 +1,48 @@
+#!/usr/bin/perl
+#
+use strict;
+use warnings;
+
+# A simple Perl script to convert FASTA sequence alignments into tab-separated 2-column output where the first column is the FASTA id and the second is the CIGAR line
+# Usage: t_coffee_to_cigar.pl <file>
+
+# Print "<header>\t<CIGAR>" for one aligned sequence: gap runs ('-') become D, everything else M.
+#   $header   - FASTA id (header line without the leading '>')
+#   $sequence - aligned sequence; whitespace (e.g. a stray "\r") is ignored
+# A run of length 1 is written without a count (e.g. "AC-GT" gives "2MD2M").
+sub convert_and_print {
+    my ($header, $sequence) = @_;
+
+    # Drop whitespace so CRLF line endings are not counted as match columns
+    $sequence =~ s/\s+//g;
+
+    # Walk the alignment one run at a time: each run is all gaps or all residues
+    my $cigar = '';
+    while ($sequence =~ /(-+|[^-]+)/g) {
+        my $run = $1;
+        my $op = substr($run, 0, 1) eq '-' ? 'D' : 'M';
+        $cigar .= length($run) if length($run) > 1;
+        $cigar .= $op;
+    }
+    print "$header\t$cigar\n";
+    return;
+}
+
+# Main: read the FASTA alignment named on the command line, print one id/CIGAR line per record
+my $file1 = $ARGV[0];
+defined $file1 or die "Usage: $0 <file>\n";
+open(my $fh1, '<', $file1) or die "Cannot open '$file1': $!\n";
+my $header;    # undef until the first '>' line, so an id of "0" is still a valid record
+my $sequence = '';
+while (my $line = <$fh1>) {
+    $line =~ s/\r?\n?\z//;    # like chomp, but also removes the "\r" of CRLF files
+    if (substr($line, 0, 1) eq '>') {
+        convert_and_print($header, $sequence) if defined $header;
+        $header = substr($line, 1);
+        $sequence = '';
+    } else {
+        $sequence .= $line;
+    }
+}
+close $fh1;
+convert_and_print($header, $sequence) if defined $header;    # input without any record prints nothing
author	earlhaminst
date	Fri, 11 Nov 2016 06:57:26 -0500
parents
children