view ClassAnnotate.pl @ 0:87fe81de0931 draft default tip

Uploaded
author bigrna
date Sun, 04 Jan 2015 02:47:25 -0500
parents
children
line wrap: on
line source

#!/usr/bin/perl -w
#Filename:
#Author: Chen Tingting
#Email: chentt@big.ac.cn
#Date: 2014/5/13
#Modified:
#Description: cluster annotate
my $version=1.00;

use strict;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h");
if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t}  and defined $opts{l}) || defined $opts{h}) { #necessary arguments
&usage;
}

#my %gene;
my %gene1;
open IN,"<$opts{g}";
open OUT ,">$opts{l}";
print OUT "#ID\tchr\tstart\tend\tstrand\n";
my $n=1;
while (my $aline=<IN>) {
	chomp $aline;
	next if($aline=~/^\#/);
	my @tmp=split/\t/,$aline;
	my $ID;
	if ($tmp[2] eq "gene") {
		$tmp[0]=~s/Chr\./Chr/;
		#$tmp[0]=~s/Chr/chr/;
		my @infor=split/;/,$tmp[8];
		for (my $i=0;$i<@infor ;$i++) {
			if ($infor[$i]=~/Alias\=(\S+)$/) {
				$ID=$1;
				last;
			}
			else {
				$ID="unknown$n";
				$n++;
			}
		}
		#$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
		push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]];
		print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
	}
}
#while (my $aline=<IN>) {
#	chomp $aline;
#	next if($aline=~/^\#/);
#	my @tmp=split/\t/,$aline;#ID chr start end strand
#	push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
#	#$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]];
#}
close IN;
close OUT;

my %nat;
open TMP,">$opts{t}";
print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n";

open IN,"<$opts{n}";
my $nat_n=1;
while (my $aline=<IN>) {
	next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt -
	chomp $aline;
	my @arr=split /\t/,$aline;
	my ($ns,$ne,$ns2,$ne2)=(0,0,0,0);
	if ($arr[11]=~/Nearby/) {
		($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]);
		push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n];
		print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n";
		$nat_n++;
	}else{
		$ns=$gene1{$arr[1]}[0][0]+$arr[6]-1;
		$ne=$gene1{$arr[1]}[0][0]+$arr[7]-1;
		$ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1;
		$ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1;
		push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2
		push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"];
		print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n";
		$nat_n++;
	}
}
close IN;
close TMP;

my %repeat;
open IN,"<$opts{'r'}";
my $first=<IN>;
$first=<IN>;
$first=<IN>;
while (my $aline=<IN>) {
	chomp $aline;
	$aline=~s/^\s+//;
	my @tmp=split/\s+/,$aline;
	$tmp[4]=~s/chr0/Chr/;
	$tmp[4]=~s/chr/Chr/;
	push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class
	#print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n";
}
close IN;

my %phase;
open IN,"<$opts{'p'}";
while (my $aline=<IN>) {
	chomp $aline;
	next if($aline=~/^\#/);
	my @tmp=split/\t/,$aline;
	if ($tmp[5]>=25) {
		$phase{$tmp[0]}=$tmp[5];
	}
}
close IN;

my $filein=$opts{'i'};
my $fileout=$opts{'o'};
open IN,"<$filein"; #input file  
open OUT,">$fileout"; #output file  
while (my $aline=<IN>) {
	chomp $aline;
	if($aline=~/^\#/){
		print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n";
		next;
	}
	my @tmp=split/\t/,$aline;
	my @inf=split/\:/,$tmp[0];
	my @pos=split/\-/,$inf[1];
	my $chr=$inf[0];
	$chr=~s/Chr0/Chr/;
	my $start=$pos[0];
	my $end=$pos[1];
	#=========Repeat============
	my @repeat;
	if (defined(@{$repeat{$chr}})) {
		my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}};
		for (my $i=0;$i<@r_array ;$i++) {
			if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) {
				push @repeat,$r_array[$i][2];
			}
			elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){
				push @repeat,$r_array[$i][2];

			}
			elsif($end<$r_array[$i][0]){
				last;
			}
			else{next;}
		}
	}
	my $repeat;
	if (@repeat==0) {
		$repeat="\/";
	}
	else{
		$repeat=join ";",@repeat;
	}
	#=========nat===============
	my @nat1;#class 1
	my @nat2;#class 2
	my @nat;#nat ID
	#foreach my $chr (keys %nat) {
		my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}};
		for (my $i=0;$i<@n_array ;$i++) {
			if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) {
				push @nat1,$n_array[$i][2];
				push @nat2,$n_array[$i][3];
				push @nat,$n_array[$i][4];
			}
			elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){
				push @nat1,$n_array[$i][2];
				push @nat2,$n_array[$i][3];
				push @nat,$n_array[$i][4];
			}
			elsif($end<$n_array[$i][0]){
				last;
			}
			else{next;}
		}
	#}

	my $nat1;
	my $nat2;
	my $nat;
	if (@nat1==0) {
		$nat1="\/";
	}
	else{
		$nat1=join ";",@nat1;
	}
	if (@nat2==0) {
		$nat2="\/";
	}
	else{
		$nat2=join ";",@nat2;
	}
	if (@nat==0) {
		$nat="\/";
	}
	else{
		$nat=join ";",@nat;
	}
	#========phase==============
	my $phase="\/";
	if (defined($phase{$tmp[0]})) {
		$phase="phase";
	}
	#=========long===============
	my $long="\/";
	if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) {
		$long="long";
	}
	print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n";
}

close IN;
close OUT;

sub nearby{
	my @p=@_;
	my ($s,$e)=(0,0);
	if ($p[1]<$p[2]) {
		$s=$p[1];
		$e=$p[2];
	}else{
		$s=$p[3];
		$e=$p[0];
		}
	return ($s,$e);
}

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o -g -n -r -p -t -l
options:
-i input file
	-g gff file
	-n NATs file
	-r repeat file
	-p phase file
-o output file
-t nat output file
-l genelist output file
-h help
USAGE
exit(1);
}