diff pfam_annot/annota.pl @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_annot/annota.pl	Thu Dec 22 04:45:31 2016 -0500
@@ -0,0 +1,221 @@
+#!/usr/bin/perl -w
+
+use strict;
+# Tab-separated Pfam family table used to describe each domain hit.
+# NOTE(review): absolute, installation-specific path -- confirm it exists on
+# the machine this tool is deployed to.
+my $d_file="/home/inmare/galaxy/tools/pfam_annot/pfamA.txt";
+my %decode=();      # field 1 of a pfamA.txt row => HTML description text
+my %clan_decode;    # filled from clans.txt further down
+my $id="";          # identifier of the FASTA record currently being read
+my %c=();           # sequence identifier => concatenated sequence
+
+
+# Command line: <protein FASTA> <Pfam hits table> <output HTML file>
+my $prot_file=shift;
+my $pfam_file=shift;
+my $prefix=shift;
+die "Usage: $0 <proteins.fasta> <pfam_hits.txt> <report.html>\n"
+    unless defined $prot_file && defined $pfam_file && defined $prefix;
+
+# Three-argument open with an explicit check: a missing table used to be
+# ignored silently and produced a report without any family description.
+open(my $pfam_table,'<',$d_file) or die "Cannot open $d_file: $!\n";
+
+# Names that mark a field as an author list rather than descriptive text.
+my $author_re=qr/anon|Bates|Cogis|Coggis|Bateman|Sonnhammer|Finn|Studholme|Kerrison/;
+
+while(my $row=<$pfam_table>)
+{
+    # Only rows that begin with a digit are data rows.
+    next unless $row=~/^\d/;
+    my @vl=split(/\t+/,$row);
+    # Rows with fewer than four fields carry no key/description pair.
+    next unless defined $vl[3];
+
+    my $key=$vl[1];
+    $decode{$key}="$vl[3]<br>";
+    my $cc=0;           # number of extra text fields appended so far
+    my %repeated=();    # fields already appended for this row
+    # $v aliases the elements of @vl, so the clean-up below also rewrites
+    # $vl[3] before later fields are compared against it.
+    foreach my $v (@vl)
+    {
+        $v=~s/\[\d+\]/ /g;              # blank out "[12]"-style markers
+        last if $v=~/hmmbuild/;         # nothing is used from this field on
+        last if $cc>=10;                # at most ten extra fields per family
+        next if $v=~$author_re;
+        next if $repeated{$v};
+        next if $v eq $vl[3];           # the description itself is already stored
+        next unless length($v)>=20;     # short fields are not free text
+        $decode{$key}.="$v ";
+        $repeated{$v}++;
+        $cc++;
+    }
+}
+close($pfam_table);
+
+# Tab-separated clan table; %clan_decode maps the second field of each row
+# to the concatenation of that row's long text fields.
+# NOTE(review): absolute, installation-specific path -- confirm it exists on
+# the machine this tool is deployed to.
+my $clan_file="/home/inmare/galaxy/tools/pfam_annot/clans.txt";
+# Checked three-argument open: a missing table previously went unnoticed.
+open(my $clan_table,'<',$clan_file) or die "Cannot open $clan_file: $!\n";
+while(my $row=<$clan_table>)
+{
+    my @vl=split(/\t/,$row);
+    # Without a second field there is no key to file the text under.
+    next unless defined $vl[1];
+    my $cc=0;
+    # $v aliases the elements of @vl (including the key field itself).
+    foreach my $v (@vl)
+    {
+        $cc++;
+        last if $cc>10;                 # only the first ten fields are considered
+        $v=~s/\[\d+\]/ /g;              # blank out "[12]"-style markers
+        $clan_decode{$vl[1]}.="$v " if length($v)>=30;
+    }
+}
+close($clan_table);
+# Load the protein FASTA file.
+#   %c     : sequence identifier (first whitespace-delimited word of the
+#            header) => sequence with line breaks removed
+#   %plasm : text before the first '#' of an identifier => number of records
+my %plasm=();
+die "No protein FASTA file given\n" unless defined $prot_file;
+# Three-argument open: the path comes from the command line, so it must not
+# be able to smuggle in an open mode or a pipe; failures are reported.
+open(my $fasta,'<',$prot_file) or die "Cannot open protein file $prot_file: $!\n";
+while(my $line=<$fasta>)
+{
+    # Strip LF as well as CRLF line endings so no CR ends up in a sequence.
+    $line=~s/[\r\n]+\z//;
+    if ($line=~/^>(.*)/)
+    {
+        $id=(split(/\s+/,$1))[0];
+        # A bare ">" header yields no word at all; fall back to the empty id.
+        $id="" unless defined $id;
+        if ($id=~/#/)
+        {
+            my $pid=(split(/\#/,$id))[0];
+            $plasm{$pid}++ if defined $pid;
+        }
+    }
+    else
+    {
+        $c{$id}.=$line;
+    }
+}
+close($fasta);
+
+# Start the HTML report: document head, plasmid index and the state shared
+# with the hits loop that follows.
+# NOTE(review): identifiers taken from the input files are written into the
+# page without HTML/JavaScript escaping -- confirm the inputs are trusted.
+die "No output file given\n" unless defined $prefix;
+open(OUT,'>',$prefix) or die "Cannot write report $prefix: $!\n";
+print OUT "<html>\n<head>\n";
+print OUT "<style type=\"text/css\">\nspan {\n\ttext-decoration:underline;\n\tcolor:blue;\n\tcursor:pointer;\n}\n</style>\n";
+# show(id): hide every element of class "page", then display the one asked for.
+print OUT "<script>\nfunction show(elementID) {\n\tvar ele = document.getElementById(elementID);\n\tif (!ele) {\n\t\talert(\"no such element\");\t\treturn;\n\t}\n\tvar pages = document.getElementsByClassName('page');\n\tfor(var i = 0; i < pages.length; i++) {\n\t\tpages[i].style.display = 'none';\n\t}\n\tele.style.display = 'block';\n}\n</script>\n";
+#print OUT "<script src=\"script.js\"></script>\n";
+#print OUT "<script>\n\tshow(elementID)\n</script>\n";
+print OUT "</head>\n";
+# The closing </body> is written at the end of the report, so open it here.
+print OUT "<body>\n";
+# NOTE(review): "#czb9dz" is not a valid hexadecimal colour ('z' is not a
+# hex digit); kept unchanged -- confirm the intended shade.
+my $color="\"#czb9dz\"";
+my %printed;    # protein name => 1 once its header and sequence are written
+die "No Pfam hits file given\n" unless defined $pfam_file;
+open(IN,'<',$pfam_file) or die "Cannot open Pfam hits file $pfam_file: $!\n";
+print OUT "Proteins with PFAM domains:\n<br><br>\n";
+# Sorted so that the index order does not depend on hash ordering.
+my @dd=sort keys %plasm;
+if (@dd>1)
+{
+    # Several plasmids: one clickable entry per plasmid; the hits loop
+    # writes a matching <div id=...> for each of them.
+    my @links=map { "<span onclick=\"show('$_');\">$_</span>" } @dd;
+    print OUT "<p>Show results ".join(",\n",@links).".</p>\n";
+}
+elsif (!@dd)
+{
+    # No plasmid prefixes at all: a single anonymous container holds every
+    # hit.  With exactly one plasmid nothing is opened here, because the hits
+    # loop opens that plasmid's own div/table; opening a wrapper as well left
+    # one div/table pair nested and unclosed.
+    print OUT "<div>\n<table cellpadding=\"0\" width=650>\n";
+}
+my $ntokens=0;          # alternates the background of annotation cells
+my $prev_plasmid="";    # plasmid of the previous hit
+my $curr_plasmid="";    # plasmid of the hit being processed
+my $np=0;               # number of per-plasmid containers opened so far
+# Walk the Pfam hits table and write, for every hit, the protein header and
+# sequence (once per protein) followed by the family and clan annotations.
+# Fields used per line: 0 = protein name, 5 = family accession, last = clan.
+while(<IN>)
+{
+    next if $_=~/^\#/;
+    my ($name,$domain,$clan)=(split(/\s+/))[0,5,-1];
+    next unless $name;
+    # Lines with fewer than six fields carry no family accession.
+    next unless defined $domain;
+    if ($name=~/#/)
+    {
+        $curr_plasmid=(split(/\#/,$name))[0];
+        $curr_plasmid="" unless defined $curr_plasmid;
+        if ($curr_plasmid ne $prev_plasmid)
+        {
+            # Entering a new plasmid: close the previous container first.
+            if ($np>0)
+            {
+                print OUT "</table>\n";
+                print OUT "</div>\n";
+            }
+            print OUT "<div id=\"$curr_plasmid\" class=\"page\" style=\"\">\n";
+            print OUT "<table cellpadding=\"0\" width=650>\n";
+            $np++;
+        }
+        $prev_plasmid=$curr_plasmid;
+    }
+
+    $domain=~s/\.\d+//;     # drop the version suffix of the accession
+    unless ($printed{$name})
+    {
+        # A hit may name a protein that is absent from the FASTA file.
+        my $seq=defined $c{$name} ? $c{$name} : "";
+        $seq=~s/\*//g;
+        $seq=form($seq,90);
+        print OUT "<td>\n";
+        print OUT "<HR SIZE=3 WIDTH=80%></HR>\n";
+        print OUT "<center><b>$name</b><br>\n</center>\n";
+        print OUT "</td>\n<tr></tr>\n";
+        print OUT "<td bgcolor=$color>\n";
+        print OUT "<pre> \n$seq\n </pre>\n";
+        print OUT "</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
+        $ntokens=2;
+        # Recorded immediately: the 'next' in the clan section below used to
+        # skip the end-of-loop bookkeeping, so the header could be repeated.
+        $printed{$name}=1;
+    }
+    my $hd=uc $domain;
+    if ($decode{$domain})
+    {
+        my $ddes=$decode{$domain};
+        $ddes=~s/\s+/ /g;
+        my $cell=($ntokens % 2==0) ? "<td>\n" : "<td bgcolor=$color>\n";
+        print OUT $cell;
+        print OUT "<p align=\"left\">\n";
+        print OUT "<a href=http://pfam.xfam.org/family/$hd> $domain</a>\n<p align=\"justify\">$ddes</p>\n\n";
+        print OUT "</p>\n</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
+        $ntokens++;
+    }
+    if ($clan_decode{$clan})
+    {
+        my $clanD=$clan_decode{$clan};
+        $clanD=~s/\s+/ /g;
+        # Skip the clan text when it merely repeats the family text.
+        next if defined $decode{$domain} && $decode{$domain} eq $clan_decode{$clan};
+        my $cell=($ntokens % 2==0) ? "<td>\n" : "<td bgcolor=$color>\n";
+        print OUT $cell;
+        print OUT "<p align=\"left\">\n";
+        print OUT "<a href=http://pfam.xfam.org/clan/$clan> $clan</a>\n <p align=\"justify\">$clanD</p>\n\n";
+        print OUT "</p>\n</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
+        $ntokens++;
+    }
+}
+
+# Finish the report: close the container that is still open, end the
+# document and flush it to disk.
+#if ($curr_plasmid ne "")
+#{
+print OUT "</table>\n";
+print OUT "</div>\n";
+#}
+# Disabled listing of the proteins that had no Pfam hit, kept for reference.
+#print OUT "<br><br>Proteins without PFAM domains:\n<br>\n";
+#foreach my $seq (keys %c)
+#{
+#        next if $printed{$seq};
+#        print OUT "<>$seq</pre>\n\n<br><br><left>\n$c{$seq}</left><br>\n";
+#	print OUT "<HR SIZE=3 WIDTH=80%>\n";
+#}
+print OUT "</body>\n";
+# The document was opened with <html> but never terminated.
+print OUT "</html>\n";
+close(IN);
+# Buffered write errors (full disk, quota) only surface when closing.
+close(OUT) or die "Error while writing the report: $!\n";
+
+# form(STRING, LEN)
+#   Break STRING into consecutive chunks of at most LEN characters and
+#   return them as one string in which every chunk is followed by "\n".
+#   An empty or undefined STRING gives "".  A LEN below 1 cannot be used as
+#   a step width, so STRING is then returned as a single line.
+sub form
+{
+    my ($string,$len)=@_;
+    $string="" unless defined $string;
+    $len=defined $len ? int($len) : 0;
+    if ($len<1)
+    {
+        # A zero step would never advance the loop below.
+        return length($string) ? "$string\n" : "";
+    }
+    my $outS="";
+    # Strictly '<': with '<=' a string whose length is an exact multiple of
+    # LEN (or an empty string) received one extra, empty line.
+    for (my $i=0;$i<length($string);$i+=$len)
+    {
+        $outS.=substr($string,$i,$len)."\n";
+    }
+    return $outS;
+}