comparison precursors.pl @ 17:1131b4008650 draft

Uploaded
author big-tiandm
date Fri, 25 Jul 2014 05:22:09 -0400
parents
children 4c0b1a94b882
comparison
equal deleted inserted replaced
16:6a878837c24a 17:1131b4008650
1 #!/usr/bin/perl -w
2 #Filename:
3 #Author: Tian Dongmei
4 #Email: tiandm@big.ac.cn
5 #Date: 2013/7/19
6 #Modified:
7 #Description:
8 my $version=1.00;
9
10 use strict;
11 use Getopt::Long;
12 use RNA;
13
14 my %opts;
15 GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h");
16 if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments
17 &usage;
18 }
19
20 my $filein=$opts{'map'};
21 my $faout=$opts{'o'};
22 my $strout=$opts{'s'};
23 my $genome= $opts{'g'};
24
25 my $maxd=defined $opts{'d'} ? $opts{'d'} : 200;
26 my $flank=defined $opts{'f'}? $opts{'f'} : 10;
27
28 my $MAX_ENERGY=-18;
29 if (defined $opts{'e'}) {$MAX_ENERGY=$opts{'e'};}
30 my $MAX_UNPAIR=5;
31 my $MIN_PAIR=15;
32 my $MAX_SIZEDIFF=4;
33 my $MAX_BULGE=2;
34 my $ASYMMETRY=5;
35 my $MIN_UNPAIR=0;
36 my $MIN_SPACE=5;
37 my $MAX_SPACE=$maxd;
38 my $FLANK=$flank;
39
40 ######### load in genome sequences start ########
41 my %genome;
42 my %lng;
43 my $name;
44 open IN,"<$genome";
45 while (my $aline=<IN>) {
46 chomp $aline;
47 next if($aline=~/^\#/);
48 if ($aline=~/^>(\S+)/) {
49 $name=$1;
50 next;
51 }
52 $genome{$name} .=$aline;
53 }
54 close IN;
55 foreach my $key (keys %genome) {
56 $lng{$key}=length($genome{$key});
57 }
58 ####### load in genome sequences end ##########
59
60 my %breaks; ### reads number bigger than 3
61 open IN,"<$filein"; #input file
62 while (my $aline=<IN>) {
63 chomp $aline;
64 my @tmp=split/\t/,$aline;
65 $tmp[0]=~/_x(\d+)$/;
66 my $no=$1;
67 next if($no<3);
68 #my $trand=&find_strand($tmp[9]);
69 #my @pos=split/\.\./,$tmp[5];
70 my $end=$tmp[3]+length($tmp[4])-1;
71 if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);}
72 push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base
73 }
74 close IN;
75
76 my %cites; ### peaks
77 foreach my $chr (keys %breaks) {
78 foreach my $strand (keys %{$breaks{$chr}}) {
79 my @array=@{$breaks{$chr}{$strand}};
80 @array=sort{$a->[0]<=>$b->[0]} @array;
81 for (my $i=0;$i<@array;$i++) {
82 my $start=$array[$i][0];my $end=$array[$i][1];
83 my @subarray=();
84 push @subarray,$array[$i];
85
86 for (my $j=$i+1;$j<@array;$j++) {
87 if ($start<$array[$j][1] && $end>$array[$j][0]) {
88 push @subarray,$array[$j];
89 ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]);
90 }
91 else{
92 $i=$j;
93 &find_cites(\@subarray,$chr,$strand);
94 last;
95 }
96 }
97 }
98 }
99 }
100
101 open FA,">$faout"; #output file
102 open STR,">$strout";
103 foreach my $chr (keys %cites) {
104 foreach my $strand (keys %{$cites{$chr}}) {
105
106 my @array2=@{$cites{$chr}{$strand}};
107 @array2=sort{$a->[0]<=>$b->[0]} @array2;
108 &excise(\@array2,$chr,$strand);
109 }
110 }
111 close FA;
112 close STR;
113 sub oneCiteDn{
114 my ($array,$a,$chr,$strand)=@_;
115
116 my $ss=$$array[$a][0]-$flank;
117 $ss=0 if($ss<0);
118 my $ee=$$array[$a][1]+$maxd+$flank;
119 $ee=$lng{$chr} if($ee>$lng{$chr});
120
121 my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
122 if($strand eq "-"){$seq=revcom($seq);}
123
124 my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);
125 return $val;
126 }
127 sub oneCiteUp{
128 my ($array,$a,$chr,$strand)=@_;
129
130 my $ss=$$array[$a][0]-$maxd-$flank;
131 $ss=0 if($ss<0);
132 my $ee=$$array[$a][1]+$flank;
133 $ee=$lng{$chr} if($ee>$lng{$chr});
134
135 my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
136 if($strand eq "-"){$seq=revcom($seq);}
137
138 my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);
139 return $val;
140
141 }
142
143 sub twoCites{
144 my ($array,$a,$b,$chr,$strand)=@_;
145
146 my $ss=$$array[$a][0]-$flank;
147 $ss=0 if($ss<0);
148 my $ee=$$array[$b][1]+$flank;
149 $ee=$lng{$chr} if($ee>$lng{$chr});
150
151 my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
152 if($strand eq "-"){$seq=revcom($seq);}
153
154 # my( $str,$mfe)=RNA::fold($seq);
155 # return 0 if($mfe>$MAX_ENERGY); ### minimum mfe
156 my $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee);
157
158 return $val;
159
160 }
161 sub excise{
162 my ($cluster,$chr,$strand)=@_;
163
164 my $last_pos=0;
165 for (my $i=0;$i<@{$cluster};$i++) {
166 next if($$cluster[$i][0]<$last_pos);
167 my $ok=0;
168 for (my $j=$i+1;$j<@{$cluster} ;$j++) {
169 if($$cluster[$j][0]-$$cluster[$i][1]>$maxd){
170 $i=$j;
171 last;
172 }else{
173 $ok=&twoCites($cluster,$i,$j,$chr,$strand);
174 if($ok){ $last_pos=$$cluster[$j][1]+$flank; $i=$j; last;}
175 }
176 }
177 next if($ok);
178
179 $ok=&oneCiteDn($cluster,$i,$chr,$strand);
180 if($ok){$last_pos=$$cluster[$i][1]+$maxd+$flank; next;}
181 $ok=&oneCiteUp($cluster,$i,$chr,$strand);
182 if($ok){$last_pos=$$cluster[$i][1]+$flank;next;}
183 }
184
185
186 }
187
188 sub ffw2{
189 my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_;
190
191 my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns
192 if ($N_count > 5) {
193 return 0;
194 }
195
196 my $seq_length=length $seq;
197 # position tag1 and tag2
198 my $tag1_beg=index($seq,$tag1,0)+1;
199 if ($tag1_beg < 1) {
200 warn "[ffw2] coordinate error.\n";
201 # $fold->{reason}="coordinate error";
202 return 0;
203 }
204 my $tag2_beg=index($seq,$tag2,0)+1;
205 if ($tag2_beg < 1) {
206 warn "[ffw2] coordinate error.\n";
207 # $fold->{reason}="coordinate error";
208 return 0;
209 }
210 if ($tag2_beg < $tag1_beg) {
211 # swap tag1 and tag2
212 ($tag1,$tag2)=($tag2,$tag1);
213 ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg);
214 }
215 my $tag1_end=$tag1_beg+length($tag1)-1;
216 my $tag2_end=$tag2_beg+length($tag2)-1;
217 # re-clipping
218 my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1;
219 my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length;
220 $seq=substr($seq,$beg-1,$end-$beg+1);
221 $seq_length=length $seq;
222 # re-reposition
223 $tag1_beg=index($seq,$tag1,0)+1;
224 if ($tag1_beg < 1) {
225 warn "[ffw2] coordinate error.\n";
226 # $fold->{reason}="coordinate error";
227 return 0;
228 }
229
230 $tag2_beg=index($seq,$tag2,0)+1;
231 if ($tag2_beg < 1) {
232 warn "[ffw2] coordinate error.\n";
233 # $fold->{reason}="coordinate error";
234 return 0;
235 }
236 $tag1_end=$tag1_beg+length($tag1)-1;
237 $tag2_end=$tag2_beg+length($tag2)-1;
238
239 # fold
240 my ($struct,$mfe)=RNA::fold($seq);
241 $mfe=sprintf "%.2f", $mfe;
242 if ($mfe > $MAX_ENERGY) {return 0;}
243
244 # tag1
245 my $tag1_length=$tag1_end-$tag1_beg+1;
246 my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length);
247 my $tag1_arm=which_arm($tag1_struct);
248 my $tag1_unpair=$tag1_struct=~tr/.//;
249 my $tag1_pair=$tag1_length-$tag1_unpair;
250 my $tag1_max_bulge=biggest_bulge($tag1_struct);
251 if ($tag1_arm ne "5p") { return 0;} # tag not in stem
252 # if ($tag1_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag1_unpair ($MAX_UNPAIR)"; return $pass}
253 if ($tag1_pair < $MIN_PAIR) {return 0;}
254 if ($tag1_max_bulge > $MAX_BULGE) {return 0;}
255
256 # tag2
257 my $tag2_length=$tag2_end-$tag2_beg+1;
258 my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length);
259 my $tag2_arm=which_arm($tag2_struct);
260 my $tag2_unpair=$tag2_struct=~tr/.//;
261 my $tag2_pair=$tag2_length-$tag2_unpair;
262 my $tag2_max_bulge=biggest_bulge($tag2_struct);
263 if ($tag2_arm ne "3p") {return 0;} # star not in stem
264 # if ($tag2_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag2_unpair ($MAX_UNPAIR)"; return $pass}
265 if ($tag2_pair < $MIN_PAIR) {return 0;}
266 if ($tag2_max_bulge > $MAX_BULGE) {return 0;}
267
268 # space size between miR and miR*
269 my $space=$tag2_beg-$tag1_end-1;
270 if ($space < $MIN_SPACE) {return 0;}
271 if ($space > $MAX_SPACE) {return 0;}
272
273 # size diff of miR and miR*
274 my $size_diff=abs($tag1_length-$tag2_length);
275 if ($size_diff > $MAX_SIZEDIFF) {return 0;}
276
277 # build base pairing table
278 my %pairtable;
279 &parse_struct($struct,\%pairtable); # coords count from 1
280
281 my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end);
282 my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end);
283 my $asy=($asy1 < $asy2) ? $asy1 : $asy2;
284 if ($asy > $ASYMMETRY) {return 0}
285
286 # duplex fold, determine whether two matures like a miR/miR* ike duplex
287 my ($like_mir_duplex1,$duplex_pair,$overhang1,$overhang2)=likeMirDuplex1($tag1,$tag2);
288 # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context
289 my ($like_mir_duplex2,$duplex_pair2,$overhang_b,$overhang_t)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end);
290 if ($like_mir_duplex1==0 && $like_mir_duplex2==0) {
291 return 0;
292 }
293
294 print FA ">$chr:$strand:$ss..$ee\n$seq\n";
295 print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n";
296
297 return 1;
298 }
299
300 sub ffw1{
301 my ($seq,$tag,$chr,$strand,$ss,$ee)=@_;
302 my $pass=0;
303
304 my $N_count=$seq=~tr/N//;
305 if ($N_count > 5) {
306 return 0;
307 }
308
309 my $seq_length=length $seq;
310 my $tag_length=length $tag;
311
312 # position
313 my $tag_beg=index($seq,$tag,0)+1;
314 if ($tag_beg < 1) {
315 warn "[ffw1] coordinate error.\n";
316 return $pass;
317 }
318 my $tag_end=$tag_beg+length($tag)-1;
319
320
321 # define candidate precursor by hybrid short arm to long arm, not solid enough
322 my($beg,$end)=define_precursor($seq,$tag);
323 if (not defined $beg) {
324 return $pass;
325 }
326 if (not defined $end) {
327 return $pass;
328 }
329 $seq=substr($seq,$beg-1,$end-$beg+1);
330 $seq_length=length $seq;
331
332
333 # fold
334 my ($struct,$mfe)=RNA::fold($seq);
335 $mfe=sprintf "%.2f",$mfe;
336 if ($mfe > $MAX_ENERGY) {
337 $pass=0;
338 return $pass;
339 }
340
341 # reposition
342 $tag_beg=index($seq,$tag,0)+1;
343 if ($tag_beg < 1) {
344 warn "[ffw1] coordinate error.\n";
345 return 0;
346 }
347 $tag_end=$tag_beg+length($tag)-1;
348
349 my $tag_struct=substr($struct,$tag_beg-1,$tag_length);
350 my $tag_arm=which_arm($tag_struct);
351 my $tag_unpair=$tag_struct=~tr/.//;
352 my $tag_pair=$tag_length-$tag_unpair;
353 my $tag_max_bulge=biggest_bulge($tag_struct);
354 if ($tag_arm eq "-") { return $pass;}
355 # if ($tag_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag_unpair ($MAX_UNPAIR)"; return $pass}
356 if ($tag_pair < $MIN_PAIR) { return $pass;}
357 if ($tag_max_bulge > $MAX_BULGE) {return $pass;}
358
359 # build base pairing table
360 my %pairtable;
361 &parse_struct($struct,\%pairtable); # coords count from 1
362
363 # get star
364 my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end);
365 my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1);
366 my $star_length=$star_end-$star_beg+1;
367 my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1);
368 my $star_arm=which_arm($star_struct);
369 my $star_unpair=$star_struct=~tr/.//;
370 my $star_pair=$star_length-$star_unpair;
371 my $star_max_bulge=biggest_bulge($star_struct);
372 if ($star_arm eq "-") { return $pass;}
373 # if ($star_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$star_unpair ($MAX_UNPAIR)"; return $pass}
374 if ($star_pair < $MIN_PAIR) {return $pass;}
375 if ($star_max_bulge > $MAX_BULGE) {return $pass;}
376
377 if ($tag_arm eq $star_arm) {return $pass;}
378
379 # space size between miR and miR*
380 my $space;
381 if ($tag_beg < $star_beg) {
382 $space=$star_beg-$tag_end-1;
383 }
384 else {
385 $space=$tag_beg-$star_end-1;
386 }
387 if ($space < $MIN_SPACE) { return $pass;}
388 if ($space > $MAX_SPACE) { return $pass;}
389
390 # size diff
391 my $size_diff=abs($tag_length-$star_length);
392 if ($size_diff > $MAX_SIZEDIFF) { return $pass;}
393
394 # asymmetry
395 my $asy=get_asy(\%pairtable,$tag_beg,$tag_end);
396 if ($asy > $ASYMMETRY) {return $pass;}
397
398 $pass=1;
399 print FA ">$chr:$strand:$ss..$ee\n$seq\n";
400 print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n";
401 return $pass;
402
403 }
404 sub get_star {
405 my($table,$beg,$end)=@_;
406
407 my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2
408 foreach my $i ($beg..$end) {
409 if (defined $table->{$i}) {
410 my $j=$table->{$i};
411 $s1=$i;
412 $s2=$j;
413 last;
414 }
415 }
416 foreach my $i (reverse ($beg..$end)) {
417 if (defined $table->{$i}) {
418 my $j=$table->{$i};
419 $e1=$i;
420 $e2=$j;
421 last;
422 }
423 }
424 # print "$s1,$e1 $s2,$e2\n";
425
426 # correct terminus
427 my $off1=$s1-$beg;
428 my $off2=$end-$e1;
429 $s2+=$off1;
430 $s2+=2; # 081009
431 $e2-=$off2; $e2=1 if $e2 < 1;
432 $e2+=2; $e2=1 if $e2 < 1; # 081009
433 ($s2,$e2)=($e2,$s2) if ($s2 > $e2);
434 return ($s2,$e2);
435 }
436
437 sub define_precursor {
438 my $seq=shift;
439 my $tag=shift;
440
441 my $seq_length=length $seq;
442 my $tag_length=length $tag;
443 my $tag_beg=index($seq,$tag,0)+1;
444 my $tag_end=$tag_beg+$tag_length-1;
445
446 # split the candidate region into short arm and long arm
447 my $tag_arm;
448 my ($larm,$larm_beg,$larm_end);
449 my ($sarm,$sarm_beg,$sarm_end);
450 if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm
451 $sarm=substr($seq,0,$tag_end);
452 $larm=substr($seq,$tag_end);
453 $sarm_beg=1;
454 $sarm_end=$tag_end;
455 $larm_beg=$tag_end+1;
456 $larm_end=$seq_length;
457 $tag_arm="5p";
458 }
459 else {
460 $larm=substr($seq,0,$tag_beg-1); # on 3' arm
461 $sarm=substr($seq,$tag_beg-1);
462 $larm_beg=1;
463 $larm_end=$tag_beg-1;
464 $sarm_beg=$tag_beg;
465 $sarm_end=$seq_length;
466 $tag_arm="3p";
467 }
468
469 # print "$sarm_beg,$sarm_end $sarm\n";
470 # print "$larm_beg,$larm_end $larm\n";
471
472 # clipping short arm
473 if ($tag_arm eq "5p") {
474 $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1;
475 $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1);
476 }
477 else {
478 $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length;
479 $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1);
480 }
481 # print "$sarm_beg,$sarm_end $sarm\n";
482 # print "$larm_beg,$larm_end $larm\n";
483
484 # define the precursor by hybriding short arm to long arm
485 my $duplex=RNA::duplexfold($sarm,$larm);
486 my $struct=$duplex->{structure};
487 my $energy=sprintf "%.2f", $duplex->{energy};
488 my ($str1,$str2)=split(/&/,$struct);
489 my $pair=$str1=~tr/(//;
490 # print "pair=$pair\n";
491 my $beg1=$duplex->{i}+1-length($str1);
492 my $end1=$duplex->{i};
493 my $beg2=$duplex->{j};
494 my $end2=$duplex->{j}+length($str2)-1;
495 # print "$beg1:$end1 $beg2:$end2\n";
496 # transform coordinates
497 $beg1=$beg1+$sarm_beg-1;
498 $end1=$end1+$sarm_beg-1;
499 $beg2=$beg2+$larm_beg-1;
500 $end2=$end2+$larm_beg-1;
501 # print "$beg1:$end1 $beg2:$end2\n";
502
503 my $off5p=$beg1-$sarm_beg;
504 my $off3p=$sarm_end-$end1;
505 $beg2-=$off3p; $beg2=1 if $beg2 < 1;
506 $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length;
507
508 # print "$beg1:$end1 $beg2:$end2\n";
509
510 my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2;
511 my $end=$sarm_end > $end2 ? $sarm_end : $end2;
512
513 return if $pair < $MIN_PAIR;
514 # print "$beg,$end\n";
515 return ($beg,$end);
516 }
517
518
519 # duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex
520 sub likeMirDuplex1 {
521 my $seq1=shift;
522 my $seq2=shift;
523 my $like_mir_duplex=1;
524
525 my $length1=length $seq1;
526 my $length2=length $seq2;
527 my $duplex=RNA::duplexfold($seq1, $seq2);
528 my $duplex_struct=$duplex->{structure};
529 my $duplex_energy=sprintf "%.2f", $duplex->{energy};
530 my ($str1,$str2)=split(/&/,$duplex_struct);
531 my $beg1=$duplex->{i}+1-length($str1);
532 my $end1=$duplex->{i};
533 my $beg2=$duplex->{j};
534 my $end2=$duplex->{j}+length($str2)-1;
535
536 # revise beg1, end1, beg2, end2
537 $str1=~/^(\.*)/;
538 $beg1+=length($1);
539 $str1=~/(\.*)$/;
540 $end1-=length($1);
541 $str2=~/^(\.*)/;
542 $beg2+=length($1);
543 $str2=~/(\.*)$/;
544 $end2-=length($1);
545
546 my $pair_num=$str1=~tr/(//;
547 my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom
548 my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck
549 # print $pair_num,"\n";
550 # print $overhang1,"\n";
551 # print $overhang2,"\n";
552 if ($pair_num < 13) {
553 $like_mir_duplex=0;
554 }
555 if ($overhang1 < 0 || $overhang2 < 0 ) {
556 $like_mir_duplex=0;
557 }
558 if ($overhang1 > 4 || $overhang2 > 4) {
559 $like_mir_duplex=0;
560 }
561 return ($like_mir_duplex,$pair_num,$overhang1,$overhang2);
562 }
563
564 # judge whether two matures form miR/miR* duplex, in hairpin context
565 sub likeMirDuplex2 {
566 my ($table,$beg1,$end1,$beg2,$end2)=@_;
567 my $like_mir_duplex=1;
568
569 # s1 e1
570 # 5 ----------------------------3
571 # | | |||| ||| |
572 #3 -------------------------------5
573 # e2 s2
574
575 my $pair_num=0;
576 my $overhang1=0;
577 my $overhang2=0;
578 my ($s1,$e1,$s2,$e2);
579 foreach my $i ($beg1..$end1) {
580 if (defined $table->{$i}) {
581 my $j=$table->{$i};
582 if ($j <= $end2 && $j >= $beg2) {
583 $s1=$i;
584 $e2=$j;
585 last;
586 }
587 }
588 }
589 foreach my $i (reverse ($beg1..$end1)) {
590 if (defined $table->{$i}) {
591 my $j=$table->{$i};
592 if ($j <= $end2 && $j >= $beg2) {
593 $e1=$i;
594 $s2=$j;
595 last;
596 }
597 }
598 }
599
600 # print "$beg1,$end1 $s1,$e1\n";
601 # print "$beg2,$end2 $s2,$e2\n";
602
603 foreach my $i ($beg1..$end1) {
604 if (defined $table->{$i}) {
605 my $j=$table->{$i};
606 if ($j <= $end2 && $j >= $beg2) {
607 ++$pair_num;
608 }
609 }
610 }
611 if (defined $s1 && defined $e2) {
612 $overhang1=($end2-$e2)-($s1-$beg1);
613 }
614 if (defined $e1 && defined $s2) {
615 $overhang2=($end1-$e1)-($s2-$beg2);
616 }
617
618 if ($pair_num < 13) {
619 $like_mir_duplex=0;
620 }
621 if ($overhang1 < 0 && $overhang2 < 0) {
622 $like_mir_duplex=0;
623 }
624 return ($like_mir_duplex,$pair_num,$overhang1,$overhang2);
625 }
626 sub parse_struct {
627 my $struct=shift;
628 my $table=shift;
629
630 my @t=split('',$struct);
631 my @lbs; # left brackets
632 foreach my $k (0..$#t) {
633 if ($t[$k] eq "(") {
634 push @lbs, $k+1;
635 }
636 elsif ($t[$k] eq ")") {
637 my $lb=pop @lbs;
638 my $rb=$k+1;
639 $table->{$lb}=$rb;
640 $table->{$rb}=$lb;
641 }
642 }
643 if (@lbs) {
644 warn "unbalanced RNA struct.\n";
645 }
646 }
647 sub which_arm {
648 my $substruct=shift;
649 my $arm;
650 if ($substruct=~/\(/ && $substruct=~/\)/) {
651 $arm="-";
652 }
653 elsif ($substruct=~/\(/) {
654 $arm="5p";
655 }
656 else {
657 $arm="3p";
658 }
659 return $arm;
660 }
661 sub biggest_bulge {
662 my $struct=shift;
663 my $bulge_size=0;
664 my $max_bulge=0;
665 while ($struct=~/(\.+)/g) {
666 $bulge_size=length $1;
667 if ($bulge_size > $max_bulge) {
668 $max_bulge=$bulge_size;
669 }
670 }
671 return $max_bulge;
672 }
673 sub get_asy {
674 my($table,$a1,$a2)=@_;
675 my ($pre_i,$pre_j);
676 my $asymmetry=0;
677 foreach my $i ($a1..$a2) {
678 if (defined $table->{$i}) {
679 my $j=$table->{$i};
680 if (defined $pre_i && defined $pre_j) {
681 my $diff=($i-$pre_i)+($j-$pre_j);
682 $asymmetry += abs($diff);
683 }
684 $pre_i=$i;
685 $pre_j=$j;
686 }
687 }
688 return $asymmetry;
689 }
690
691 sub peaks{
692 my @cluster=@{$_[0]};
693
694 return if(@cluster<1);
695
696 my $max=0; my $index=-1;
697 for (my $i=0;$i<@cluster;$i++) {
698 if($cluster[$i][2]>$max){
699 $max=$cluster[$i][2];
700 $index=$i;
701 }
702 }
703 # &excise(\@cluster,$index,$_[1],$_[2]);
704 return($index);
705 }
706
707 sub find_cites{
708 my @tmp=@{$_[0]};
709 my $i=&peaks(\@tmp);
710
711 my $start=$tmp[$i][0];
712 my $total=0; my $node5=0;
713 for (my $j=0;$j<@tmp ;$j++) {
714 $total+=$tmp[$j][2];
715 $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2);
716 }
717 push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5);
718 }
719
720 sub newpos{
721 my ($a,$b,$c,$d)=@_;
722 my $s= $a>$c ? $c : $a;
723 my $e=$b>$d ? $b : $d;
724 return($s,$e);
725 }
726
727 sub rev{
728
729 my($sequence)=@_;
730
731 my $rev=reverse $sequence;
732
733 return $rev;
734 }
735
736 sub com{
737
738 my($sequence)=@_;
739
740 $sequence=~tr/acgtuACGTU/TGCAATGCAA/;
741
742 return $sequence;
743 }
744
745 sub revcom{
746
747 my($sequence)=@_;
748
749 my $revcom=rev(com($sequence));
750
751 return $revcom;
752 }
753
754 sub find_strand{
755
756 #A subroutine to find the strand, parsing different blast formats
757 my($other)=@_;
758
759 my $strand="+";
760
761 if($other=~/-/){
762 $strand="-";
763 }
764
765 if($other=~/minus/i){
766 $strand="-";
767 }
768
769 return($strand);
770 }
771 sub usage{
772 print <<"USAGE";
773 Version $version
774 Usage:
775 $0 -map -g -d -f -o -s -e
776 options:
777 -map input file# align result # bst. format
778 -g input file # genome sequence fasta format
779 -d <int> Maximal space between miRNA and miRNA* (200)
780 -f <int> Flank sequence length of miRNA precursor (10)
781 -o output file# percursor fasta file
782 -s output file# precursor structure file
783 -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol)
784
785 -h help
786 USAGE
787 exit(1);
788 }
789