comparison ClassAnnotate.pl @ 23:cad6a1dfb174 draft

Uploaded
author big-tiandm
date Wed, 05 Nov 2014 21:11:49 -0500
parents 07745c0958dd
children
comparison
equal deleted inserted replaced
22:b6686462d0cb 23:cad6a1dfb174
1 #!/usr/bin/perl -w
2 #Filename:
3 #Author: Chen Tingting
4 #Email: chentt@big.ac.cn
5 #Date: 2014/5/13
6 #Modified:
7 #Description: cluster annotate
8 my $version=1.00;
9
10 use strict;
11 use Getopt::Long;
12
13 my %opts;
14 GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h");
15 if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t} and defined $opts{l}) || defined $opts{h}) { #necessary arguments
16 &usage;
17 }
18
19 #my %gene;
20 my %gene1;
21 open IN,"<$opts{g}";
22 open OUT ,">$opts{l}";
23 print OUT "#ID\tchr\tstart\tend\tstrand\n";
24 my $n=1;
25 while (my $aline=<IN>) {
26 chomp $aline;
27 next if($aline=~/^\#/);
28 my @tmp=split/\t/,$aline;
29 my $ID;
30 if ($tmp[2] eq "gene") {
31 $tmp[0]=~s/Chr\./Chr/;
32 #$tmp[0]=~s/Chr/chr/;
33 my @infor=split/;/,$tmp[8];
34 for (my $i=0;$i<@infor ;$i++) {
35 if ($infor[$i]=~/Alias\=(\S+)$/) {
36 $ID=$1;
37 last;
38 }
39 else {
40 $ID="unknown$n";
41 $n++;
42 }
43 }
44 #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand]
45 push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]];
46 print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n";
47 }
48 }
49 #while (my $aline=<IN>) {
50 # chomp $aline;
51 # next if($aline=~/^\#/);
52 # my @tmp=split/\t/,$aline;#ID chr start end strand
53 # push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]];
54 # #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]];
55 #}
56 close IN;
57 close OUT;
58
59 my %nat;
60 open TMP,">$opts{t}";
61 print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n";
62
63 open IN,"<$opts{n}";
64 my $nat_n=1;
65 while (my $aline=<IN>) {
66 next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt -
67 chomp $aline;
68 my @arr=split /\t/,$aline;
69 my ($ns,$ne,$ns2,$ne2)=(0,0,0,0);
70 if ($arr[11]=~/Nearby/) {
71 ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]);
72 push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n];
73 print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n";
74 $nat_n++;
75 }else{
76 $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1;
77 $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1;
78 $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1;
79 $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1;
80 push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2
81 push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"];
82 print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n";
83 $nat_n++;
84 }
85 }
86 close IN;
87 close TMP;
88
89 my %repeat;
90 open IN,"<$opts{'r'}";
91 my $first=<IN>;
92 $first=<IN>;
93 $first=<IN>;
94 while (my $aline=<IN>) {
95 chomp $aline;
96 $aline=~s/^\s+//;
97 my @tmp=split/\s+/,$aline;
98 $tmp[4]=~s/chr0/Chr/;
99 $tmp[4]=~s/chr/Chr/;
100 push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class
101 #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n";
102 }
103 close IN;
104
105 my %phase;
106 open IN,"<$opts{'p'}";
107 while (my $aline=<IN>) {
108 chomp $aline;
109 next if($aline=~/^\#/);
110 my @tmp=split/\t/,$aline;
111 if ($tmp[5]>=25) {
112 $phase{$tmp[0]}=$tmp[5];
113 }
114 }
115 close IN;
116
117 my $filein=$opts{'i'};
118 my $fileout=$opts{'o'};
119 open IN,"<$filein"; #input file
120 open OUT,">$fileout"; #output file
121 while (my $aline=<IN>) {
122 chomp $aline;
123 if($aline=~/^\#/){
124 print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n";
125 next;
126 }
127 my @tmp=split/\t/,$aline;
128 my @inf=split/\:/,$tmp[0];
129 my @pos=split/\-/,$inf[1];
130 my $chr=$inf[0];
131 $chr=~s/Chr0/Chr/;
132 my $start=$pos[0];
133 my $end=$pos[1];
134 #=========Repeat============
135 my @repeat;
136 if (defined(@{$repeat{$chr}})) {
137 my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}};
138 for (my $i=0;$i<@r_array ;$i++) {
139 if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) {
140 push @repeat,$r_array[$i][2];
141 }
142 elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){
143 push @repeat,$r_array[$i][2];
144
145 }
146 elsif($end<$r_array[$i][0]){
147 last;
148 }
149 else{next;}
150 }
151 }
152 my $repeat;
153 if (@repeat==0) {
154 $repeat="\/";
155 }
156 else{
157 $repeat=join ";",@repeat;
158 }
159 #=========nat===============
160 my @nat1;#class 1
161 my @nat2;#class 2
162 my @nat;#nat ID
163 #foreach my $chr (keys %nat) {
164 my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}};
165 for (my $i=0;$i<@n_array ;$i++) {
166 if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) {
167 push @nat1,$n_array[$i][2];
168 push @nat2,$n_array[$i][3];
169 push @nat,$n_array[$i][4];
170 }
171 elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){
172 push @nat1,$n_array[$i][2];
173 push @nat2,$n_array[$i][3];
174 push @nat,$n_array[$i][4];
175 }
176 elsif($end<$n_array[$i][0]){
177 last;
178 }
179 else{next;}
180 }
181 #}
182
183 my $nat1;
184 my $nat2;
185 my $nat;
186 if (@nat1==0) {
187 $nat1="\/";
188 }
189 else{
190 $nat1=join ";",@nat1;
191 }
192 if (@nat2==0) {
193 $nat2="\/";
194 }
195 else{
196 $nat2=join ";",@nat2;
197 }
198 if (@nat==0) {
199 $nat="\/";
200 }
201 else{
202 $nat=join ";",@nat;
203 }
204 #========phase==============
205 my $phase="\/";
206 if (defined($phase{$tmp[0]})) {
207 $phase="phase";
208 }
209 #=========long===============
210 my $long="\/";
211 if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) {
212 $long="long";
213 }
214 print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n";
215 }
216
217 close IN;
218 close OUT;
219
220 sub nearby{
221 my @p=@_;
222 my ($s,$e)=(0,0);
223 if ($p[1]<$p[2]) {
224 $s=$p[1];
225 $e=$p[2];
226 }else{
227 $s=$p[3];
228 $e=$p[0];
229 }
230 return ($s,$e);
231 }
232
233 sub usage{
234 print <<"USAGE";
235 Version $version
236 Usage:
237 $0 -i -o -g -n -r -p -t -l
238 options:
239 -i input file
240 -g gff file
241 -n NATs file
242 -r repeat file
243 -p phase file
244 -o output file
245 -t nat output file
246 -l genelist output file
247 -h help
248 USAGE
249 exit(1);
250 }
251