view pfam_search/annota.Filter.pl @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
line wrap: on
line source

#!/usr/bin/perl -w

use strict;
my $d_file="/home/inmare/galaxy/tools/pfam_search/pfamA.txt";
open(IN,$d_file);
my %decode=();
my %clan_decode;
my $id="";
my %c=();


my ($prot_file,$pfam_file,$prefix,@search_T)=@ARGV;
my $searchP="";
while(<IN>)
{
        if ($_=~/^\d/)
        {
                my @vl=(split(/\t+/));
		$decode{$vl[1]}="$vl[3]<br>";#$vl[8] $vl[9]";
                my $cc=0;
		my %repeated=();
		foreach my $v (@vl)
                {
			$v=~s/\[\d+\]/ /g;
			$cc++;
			last if $v=~/hmmbuild/;
                        last if $cc>10;
                        next if $v=~/anon/;
                        next if $v=~/Bates/;
                        next if $v=~/Cogis/;
                        next if $v=~/Bateman/;
                        next if $v=~/Sonnhammer/;
                        next if $v=~/Finn/;
			next if $v=~/Studholme/;
			next if $v eq $vl[3];
			next if $v=~/Kerrison/;
                        next if $repeated{$v};
                        #next if length($v)>=30 && $cc<=10;
			$decode{$vl[1]}.="$v " if length($v)>=20 && $cc<=10;
                	$repeated{$v}++;
		}
        }
}
close(IN);

my $clan_file="/home/inmare/galaxy/tools/pfam_search/clans.txt";
open(IN,$clan_file);
while(<IN>)
{
        my @vl=(split(/\t/));
       	#$clan_decode{$vl[1]}="$vl[3]";
	my $cc=0;
        foreach my $v (@vl)
	{
		$cc++;
		$v=~s/\[\d+\]/ /g;
		$clan_decode{$vl[1]}.="$v " if length($v) >=30 && $cc<=10;
	}	

}
close(IN);
open(IN,"$prot_file");
while(<IN>)
{
        if ($_=~/^>(.*)/)
        {
                $id=$1;
		$id=(split(/\s+/,$id))[0];
        }else{
		chomp;
                $c{$id}.=$_;
        }
}
close(IN);

foreach my $s (@search_T)
{
	$searchP.="$s ";
}

open(OUT,">$prefix");
print OUT "<html>\n<head></head>\n<body>\n";
my $color="\"#czb9dz\"";
my %printed;
open(IN,$pfam_file);
print OUT "Proteins with PFAM domains matching the keywords:\n<br><br>\n";
print OUT "<div>\n<table cellpadding=\"0\" width=650>\n";
my $ntokens=0;
while(<IN>)
{
        next if $_=~/^\#/;
        my ($name,$domain,$clan)=(split(/\s+/))[0,5,-1];
	next unless $name;
	next unless $domain;
	$domain=~s/\.\d+//;
	my $sd=$decode{$domain} ? $decode{$domain} : "MagnottaPantaleo§§";
	my $sc=$clan_decode{$clan} ? $clan_decode{$clan} : "SciarrattaCalogero@@";
	my $continue=match($searchP,$sd,$sc);
	#print "$name $domain $clan $sd $sc\n";
	next unless $continue;
        unless ($printed{$name})
	{
		my $seq=$c{$name};
		
		$seq=~s/\*//g;
		$seq=form($seq,90);
		print OUT "<td>\n";
		print OUT "<HR SIZE=3 WIDTH=80%>\n";
		print OUT "<center><b>$name</b><br>\n</center>\n";
		print OUT "</td\n<tr></tr>\n";
                print OUT "<td bgcolor=$color>\n";
		print OUT "<pre> \n$seq\n </pre>\n";
		print OUT "</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
		$ntokens=2;
	}
	my $hd=uc $domain;
	#<a href="http://www.canoro.altervista.org/" class="nav" target="_blank">www.canoro.altervista.org</a>
	if ($decode{$domain})
	{
		my $ddes=$decode{$domain};
		if ($ntokens % 2==0)
                {
                        print OUT "<td>\n";
                }else{
                        print OUT "<td bgcolor=$color>\n";
                }
	
		print OUT "<p align=\"left\">\n";
        	print OUT "<a href=http://pfam.xfam.org/family/$hd> $domain</a>\n<p align=\"justify\">$ddes</p>\n\n";
		print OUT "</p>\n</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
		$ntokens++;
	}
        if ($clan_decode{$clan})
        {
                my $clanD=$clan_decode{$clan};
		my $ddes=$decode{$domain};
                if ($ntokens % 2==0)
                {
                        print OUT "<td>\n";
                }else{
                        print OUT "<td bgcolor=$color>\n";
                }

                print OUT "<p align=\"left\">\n";
		print OUT "<a href=http://pfam.xfam.org/clan/$clan> $clan</a>\n <p align=\"justify\">$clanD</p>\n\n";
		print OUT "</p>\n</td>\n<tr></tr>\n<td></td>\n<tr></tr>\n";
		$ntokens++;
        }
        $printed{$name}=1;
}

#print OUT "<br><br>Proteins without PFAM domains:\n<br>\n";
#foreach my $seq (keys %c)
#{
#        next if $printed{$seq};
#        print OUT "<>$seq</pre>\n\n<br><br><left>\n$c{$seq}</left><br>\n";
#	print OUT "<HR SIZE=3 WIDTH=80%>\n";
#}
print OUT "</table>\n</div>\n</body>\n</html>\n";
close(OUT);

sub form
{
	my $string=$_[0];
	my $len=$_[1];
	my $outS="";
	my @vl=split('',$string);
	for (my $i=1;$i<=$#vl;$i++)
	{
		if ($i % $len==0 && $i>0)
		{
			$outS.="$vl[$i-1]\n";
		}else{
			$outS.=$vl[$i-1];
		} 
	}
	$outS.="\n";
	$outS=~s/ //g;
	return $outS;
}

sub match
{
	my $terms=$_[0];
	my $d1=$_[1];
	my $d2=$_[2];
	$terms=~s/AND/ /g;
	my @t1s=(split(/OR/,$terms));
	my $print_out=0;
	foreach my $t (@t1s)
	{
		#print "$t\n";
		last if $print_out==1; #OR non serve se 1 è verificata;
		#permute
		my @vl=(split(/\s+/,$t));
		my $nm1=0;
		my $nm2=0;
		foreach my $v (@vl)
		{
			$nm1++ if ($d1=~/$v/i);
			$nm2++ if ($d1=~/$v/i);		
		}
		$print_out=1 if ($nm1==($#vl+1)) || ($nm2==($#vl+1));
	}
	return $print_out;
}