0
+ − 1 #!/usr/bin/perl -w
+ − 2 #Filename:
+ − 3 #Author: Chen Tingting
+ − 4 #Email: chentt@big.ac.cn
+ − 5 #Date: 2014/5/13
+ − 6 #Modified:
+ − 7 #Description: cluster annotate
+ − 8 my $version=1.00;
+ − 9
+ − 10 use strict;
+ − 11 use Getopt::Long;
+ − 12
+ − 13 my %opts;
+ − 14 GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h");
+ − 15 if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t} and defined $opts{l}) || defined $opts{h}) { #necessary arguments
+ − 16 &usage;
+ − 17 }
+ − 18
+ − 19 #my %gene;
+ − 20 my %gene1;
+ − 21 open IN,"<$opts{g}";
+ − 22 open OUT ,">$opts{l}";
+ − 23 print OUT "#ID\tchr\tstart\tend\tstrand\n";
+ − 24 my $n=1;
+ − 25 while (my $aline=<IN>) {
+ − 26 chomp $aline;
+ − 27 next if($aline=~/^\#/);
+ − 28 my @tmp=split/\t/,$aline;
+ − 29 my $ID;
+ − 30 if ($tmp[2] eq "gene") {
+ − 31 $tmp[0]=~s/Chr\./Chr/;
+ − 32 #$tmp[0]=~s/Chr/chr/;
+ − 33 my @infor=split/;/,$tmp[8];
+ − 34 for (my $i=0;$i<@infor ;$i++) {
+ − 35 if ($infor[$i]=~/Alias\=(\S+)$/) {
+ − 36 $ID=$1;
+ − 37 last;
+ − 38 }
+ − 39 else {
+ − 40 $ID="unknown$n";
+ − 41 $n++;
+ − 42 }
+ − 43 }
+ − 44 #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
+ − 45 push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]];
+ − 46 print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
+ − 47 }
+ − 48 }
+ − 49 #while (my $aline=<IN>) {
+ − 50 # chomp $aline;
+ − 51 # next if($aline=~/^\#/);
+ − 52 # my @tmp=split/\t/,$aline;#ID chr start end strand
+ − 53 # push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
+ − 54 # #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]];
+ − 55 #}
+ − 56 close IN;
+ − 57 close OUT;
+ − 58
+ − 59 my %nat;
+ − 60 open TMP,">$opts{t}";
+ − 61 print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n";
+ − 62
+ − 63 open IN,"<$opts{n}";
+ − 64 my $nat_n=1;
+ − 65 while (my $aline=<IN>) {
+ − 66 next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt -
+ − 67 chomp $aline;
+ − 68 my @arr=split /\t/,$aline;
+ − 69 my ($ns,$ne,$ns2,$ne2)=(0,0,0,0);
+ − 70 if ($arr[11]=~/Nearby/) {
+ − 71 ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]);
+ − 72 push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n];
+ − 73 print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n";
+ − 74 $nat_n++;
+ − 75 }else{
+ − 76 $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1;
+ − 77 $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1;
+ − 78 $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1;
+ − 79 $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1;
+ − 80 push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2
+ − 81 push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"];
+ − 82 print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n";
+ − 83 $nat_n++;
+ − 84 }
+ − 85 }
+ − 86 close IN;
+ − 87 close TMP;
+ − 88
+ − 89 my %repeat;
+ − 90 open IN,"<$opts{'r'}";
+ − 91 my $first=<IN>;
+ − 92 $first=<IN>;
+ − 93 $first=<IN>;
+ − 94 while (my $aline=<IN>) {
+ − 95 chomp $aline;
+ − 96 $aline=~s/^\s+//;
+ − 97 my @tmp=split/\s+/,$aline;
+ − 98 $tmp[4]=~s/chr0/Chr/;
+ − 99 $tmp[4]=~s/chr/Chr/;
+ − 100 push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class
+ − 101 #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n";
+ − 102 }
+ − 103 close IN;
+ − 104
+ − 105 my %phase;
+ − 106 open IN,"<$opts{'p'}";
+ − 107 while (my $aline=<IN>) {
+ − 108 chomp $aline;
+ − 109 next if($aline=~/^\#/);
+ − 110 my @tmp=split/\t/,$aline;
+ − 111 if ($tmp[5]>=25) {
+ − 112 $phase{$tmp[0]}=$tmp[5];
+ − 113 }
+ − 114 }
+ − 115 close IN;
+ − 116
+ − 117 my $filein=$opts{'i'};
+ − 118 my $fileout=$opts{'o'};
+ − 119 open IN,"<$filein"; #input file
+ − 120 open OUT,">$fileout"; #output file
+ − 121 while (my $aline=<IN>) {
+ − 122 chomp $aline;
+ − 123 if($aline=~/^\#/){
+ − 124 print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n";
+ − 125 next;
+ − 126 }
+ − 127 my @tmp=split/\t/,$aline;
+ − 128 my @inf=split/\:/,$tmp[0];
+ − 129 my @pos=split/\-/,$inf[1];
+ − 130 my $chr=$inf[0];
+ − 131 $chr=~s/Chr0/Chr/;
+ − 132 my $start=$pos[0];
+ − 133 my $end=$pos[1];
+ − 134 #=========Repeat============
+ − 135 my @repeat;
+ − 136 if (defined(@{$repeat{$chr}})) {
+ − 137 my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}};
+ − 138 for (my $i=0;$i<@r_array ;$i++) {
+ − 139 if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) {
+ − 140 push @repeat,$r_array[$i][2];
+ − 141 }
+ − 142 elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){
+ − 143 push @repeat,$r_array[$i][2];
+ − 144
+ − 145 }
+ − 146 elsif($end<$r_array[$i][0]){
+ − 147 last;
+ − 148 }
+ − 149 else{next;}
+ − 150 }
+ − 151 }
+ − 152 my $repeat;
+ − 153 if (@repeat==0) {
+ − 154 $repeat="\/";
+ − 155 }
+ − 156 else{
+ − 157 $repeat=join ";",@repeat;
+ − 158 }
+ − 159 #=========nat===============
+ − 160 my @nat1;#class 1
+ − 161 my @nat2;#class 2
+ − 162 my @nat;#nat ID
+ − 163 #foreach my $chr (keys %nat) {
+ − 164 my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}};
+ − 165 for (my $i=0;$i<@n_array ;$i++) {
+ − 166 if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) {
+ − 167 push @nat1,$n_array[$i][2];
+ − 168 push @nat2,$n_array[$i][3];
+ − 169 push @nat,$n_array[$i][4];
+ − 170 }
+ − 171 elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){
+ − 172 push @nat1,$n_array[$i][2];
+ − 173 push @nat2,$n_array[$i][3];
+ − 174 push @nat,$n_array[$i][4];
+ − 175 }
+ − 176 elsif($end<$n_array[$i][0]){
+ − 177 last;
+ − 178 }
+ − 179 else{next;}
+ − 180 }
+ − 181 #}
+ − 182
+ − 183 my $nat1;
+ − 184 my $nat2;
+ − 185 my $nat;
+ − 186 if (@nat1==0) {
+ − 187 $nat1="\/";
+ − 188 }
+ − 189 else{
+ − 190 $nat1=join ";",@nat1;
+ − 191 }
+ − 192 if (@nat2==0) {
+ − 193 $nat2="\/";
+ − 194 }
+ − 195 else{
+ − 196 $nat2=join ";",@nat2;
+ − 197 }
+ − 198 if (@nat==0) {
+ − 199 $nat="\/";
+ − 200 }
+ − 201 else{
+ − 202 $nat=join ";",@nat;
+ − 203 }
+ − 204 #========phase==============
+ − 205 my $phase="\/";
+ − 206 if (defined($phase{$tmp[0]})) {
+ − 207 $phase="phase";
+ − 208 }
+ − 209 #=========long===============
+ − 210 my $long="\/";
+ − 211 if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) {
+ − 212 $long="long";
+ − 213 }
+ − 214 print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n";
+ − 215 }
+ − 216
+ − 217 close IN;
+ − 218 close OUT;
+ − 219
+ − 220 sub nearby{
+ − 221 my @p=@_;
+ − 222 my ($s,$e)=(0,0);
+ − 223 if ($p[1]<$p[2]) {
+ − 224 $s=$p[1];
+ − 225 $e=$p[2];
+ − 226 }else{
+ − 227 $s=$p[3];
+ − 228 $e=$p[0];
+ − 229 }
+ − 230 return ($s,$e);
+ − 231 }
+ − 232
+ − 233 sub usage{
+ − 234 print <<"USAGE";
+ − 235 Version $version
+ − 236 Usage:
+ − 237 $0 -i -o -g -n -r -p -t -l
+ − 238 options:
+ − 239 -i input file
+ − 240 -g gff file
+ − 241 -n NATs file
+ − 242 -r repeat file
+ − 243 -p phase file
+ − 244 -o output file
+ − 245 -t nat output file
+ − 246 -l genelist output file
+ − 247 -h help
+ − 248 USAGE
+ − 249 exit(1);
+ − 250 }
+ − 251