comparison precursors.pl @ 50:7b5a48b972e9 draft

Uploaded
author big-tiandm
date Fri, 05 Dec 2014 00:11:02 -0500
parents
children
comparison
equal deleted inserted replaced
49:f008ab2cadc6 50:7b5a48b972e9
1 #!/usr/bin/perl -w
2 #Filename:
3 #Author: Tian Dongmei
4 #Email: tiandm@big.ac.cn
5 #Date: 2013/7/19
6 #Modified:
7 #Description:
8 my $version=1.00;
9
10 use strict;
11 use Getopt::Long;
12 #use RNA;
13
14 my %opts;
15 GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h");
16 if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments
17 &usage;
18 }
19
20 my $checkno=1;
21 my $filein=$opts{'map'};
22 my $faout=$opts{'o'};
23 my $strout=$opts{'s'};
24 my $genome= $opts{'g'};
25
26 my $maxd=defined $opts{'d'} ? $opts{'d'} : 200;
27 my $flank=defined $opts{'f'}? $opts{'f'} : 10;
28
29 my $MAX_ENERGY=-18;
30 if (defined $opts{'e'}) {$MAX_ENERGY=$opts{'e'};}
31 my $MAX_UNPAIR=5;
32 my $MIN_PAIR=15;
33 my $MAX_SIZEDIFF=4;
34 my $MAX_BULGE=2;
35 my $ASYMMETRY=5;
36 my $MIN_UNPAIR=0;
37 my $MIN_SPACE=5;
38 my $MAX_SPACE=$maxd;
39 my $FLANK=$flank;
40
41 ######### load in genome sequences start ########
42 my %genome;
43 my %lng;
44 my $name;
45 open IN,"<$genome";
46 while (my $aline=<IN>) {
47 chomp $aline;
48 next if($aline=~/^\#/);
49 if ($aline=~/^>(\S+)/) {
50 $name=$1;
51 next;
52 }
53 $genome{$name} .=$aline;
54 }
55 close IN;
56 foreach my $key (keys %genome) {
57 $lng{$key}=length($genome{$key});
58 }
59 ####### load in genome sequences end ##########
60
61 my %breaks; ### reads number bigger than 3
62 open IN,"<$filein"; #input file
63 while (my $aline=<IN>) {
64 chomp $aline;
65 my @tmp=split/\t/,$aline;
66 $tmp[0]=~/_x(\d+)$/;
67 my $no=$1;
68 next if($no<3);
69 #my $trand=&find_strand($tmp[9]);
70 #my @pos=split/\.\./,$tmp[5];
71 my $end=$tmp[3]+length($tmp[4])-1;
72 if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);}
73 push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base
74 }
75 close IN;
76
77 my %cites; ### peaks
78 foreach my $chr (keys %breaks) {
79 foreach my $strand (keys %{$breaks{$chr}}) {
80 my @array=@{$breaks{$chr}{$strand}};
81 @array=sort{$a->[0]<=>$b->[0]} @array;
82 for (my $i=0;$i<@array;$i++) {
83 my $start=$array[$i][0];my $end=$array[$i][1];
84 my @subarray=();
85 push @subarray,$array[$i];
86
87 for (my $j=$i+1;$j<@array;$j++) {
88 if ($start<$array[$j][1] && $end>$array[$j][0]) {
89 push @subarray,$array[$j];
90 ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]);
91 }
92 else{
93 $i=$j;
94 &find_cites(\@subarray,$chr,$strand);
95 last;
96 }
97 }
98 }
99 }
100 }
101
102 open FA,">$faout"; #output file
103 open STR,">$strout";
104 foreach my $chr (keys %cites) {
105 foreach my $strand (keys %{$cites{$chr}}) {
106
107 my @array2=@{$cites{$chr}{$strand}};
108 @array2=sort{$a->[0]<=>$b->[0]} @array2;
109 &excise(\@array2,$chr,$strand);
110 }
111 }
112 close FA;
113 close STR;
114 sub oneCiteDn{
115 my ($array,$a,$chr,$strand)=@_;
116
117 my $ss=$$array[$a][0]-$flank;
118 $ss=0 if($ss<0);
119 my $ee=$$array[$a][1]+$maxd+$flank;
120 $ee=$lng{$chr} if($ee>$lng{$chr});
121
122 my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
123 if($strand eq "-"){$seq=revcom($seq);}
124
125 my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);
126 return $val;
127 }
128 sub oneCiteUp{
129 my ($array,$a,$chr,$strand)=@_;
130
131 my $ss=$$array[$a][0]-$maxd-$flank;
132 $ss=0 if($ss<0);
133 my $ee=$$array[$a][1]+$flank;
134 $ee=$lng{$chr} if($ee>$lng{$chr});
135
136 my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
137 if($strand eq "-"){$seq=revcom($seq);}
138
139 my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);
140 return $val;
141
142 }
143
144 sub twoCites{
145 my ($array,$a,$b,$chr,$strand)=@_;
146
147 my $ss=$$array[$a][0]-$flank;
148 $ss=0 if($ss<0);
149 my $ee=$$array[$b][1]+$flank;
150 $ee=$lng{$chr} if($ee>$lng{$chr});
151
152 my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
153 if($strand eq "-"){$seq=revcom($seq);}
154
155 # my( $str,$mfe)=RNA::fold($seq);
156 # return 0 if($mfe>$MAX_ENERGY); ### minimum mfe
157 my $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee);
158
159 return $val;
160
161 }
162 sub excise{
163 my ($cluster,$chr,$strand)=@_;
164
165 my $last_pos=0;
166 for (my $i=0;$i<@{$cluster};$i++) {
167 next if($$cluster[$i][0]<$last_pos);
168 my $ok=0;
169 for (my $j=$i+1;$j<@{$cluster} ;$j++) {
170 if($$cluster[$j][0]-$$cluster[$i][1]>$maxd){
171 $i=$j;
172 last;
173 }else{
174 $ok=&twoCites($cluster,$i,$j,$chr,$strand);
175 if($ok){ $last_pos=$$cluster[$j][1]+$flank; $i=$j; last;}
176 }
177 }
178 next if($ok);
179
180 $ok=&oneCiteDn($cluster,$i,$chr,$strand);
181 if($ok){$last_pos=$$cluster[$i][1]+$maxd+$flank; next;}
182 $ok=&oneCiteUp($cluster,$i,$chr,$strand);
183 if($ok){$last_pos=$$cluster[$i][1]+$flank;next;}
184 }
185
186
187 }
188
189 sub ffw2{
190 my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_;
191
192 my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns
193 if ($N_count > 5) {
194 return 0;
195 }
196
197 my $seq_length=length $seq;
198 # position tag1 and tag2
199 my $tag1_beg=index($seq,$tag1,0)+1;
200 if ($tag1_beg < 1) {
201 warn "[ffw2] coordinate error.\n";
202 # $fold->{reason}="coordinate error";
203 return 0;
204 }
205 my $tag2_beg=index($seq,$tag2,0)+1;
206 if ($tag2_beg < 1) {
207 warn "[ffw2] coordinate error.\n";
208 # $fold->{reason}="coordinate error";
209 return 0;
210 }
211 if ($tag2_beg < $tag1_beg) {
212 # swap tag1 and tag2
213 ($tag1,$tag2)=($tag2,$tag1);
214 ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg);
215 }
216 my $tag1_end=$tag1_beg+length($tag1)-1;
217 my $tag2_end=$tag2_beg+length($tag2)-1;
218 # re-clipping
219 my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1;
220 my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length;
221 $seq=substr($seq,$beg-1,$end-$beg+1);
222 $seq_length=length $seq;
223 # re-reposition
224 $tag1_beg=index($seq,$tag1,0)+1;
225 if ($tag1_beg < 1) {
226 warn "[ffw2] coordinate error.\n";
227 # $fold->{reason}="coordinate error";
228 return 0;
229 }
230
231 $tag2_beg=index($seq,$tag2,0)+1;
232 if ($tag2_beg < 1) {
233 warn "[ffw2] coordinate error.\n";
234 # $fold->{reason}="coordinate error";
235 return 0;
236 }
237 $tag1_end=$tag1_beg+length($tag1)-1;
238 $tag2_end=$tag2_beg+length($tag2)-1;
239
240 # fold
241 #my ($struct,$mfe)=RNA::fold($seq);
242 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
243 my @rawfolds=split/\s+/,$rnafold;
244 my $struct=$rawfolds[1];
245 my $mfe=$rawfolds[-1];
246 $mfe=~s/\(//;
247 $mfe=~s/\)//;
248 #$mfe=sprintf "%.2f", $mfe;
249 if ($mfe > $MAX_ENERGY) {return 0;}
250
251 # tag1
252 my $tag1_length=$tag1_end-$tag1_beg+1;
253 my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length);
254 my $tag1_arm=which_arm($tag1_struct);
255 my $tag1_unpair=$tag1_struct=~tr/.//;
256 my $tag1_pair=$tag1_length-$tag1_unpair;
257 my $tag1_max_bulge=biggest_bulge($tag1_struct);
258 if ($tag1_arm ne "5p") { return 0;} # tag not in stem
259 # if ($tag1_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag1_unpair ($MAX_UNPAIR)"; return $pass}
260 if ($tag1_pair < $MIN_PAIR) {return 0;}
261 if ($tag1_max_bulge > $MAX_BULGE) {return 0;}
262
263 # tag2
264 my $tag2_length=$tag2_end-$tag2_beg+1;
265 my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length);
266 my $tag2_arm=which_arm($tag2_struct);
267 my $tag2_unpair=$tag2_struct=~tr/.//;
268 my $tag2_pair=$tag2_length-$tag2_unpair;
269 my $tag2_max_bulge=biggest_bulge($tag2_struct);
270 if ($tag2_arm ne "3p") {return 0;} # star not in stem
271 # if ($tag2_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag2_unpair ($MAX_UNPAIR)"; return $pass}
272 if ($tag2_pair < $MIN_PAIR) {return 0;}
273 if ($tag2_max_bulge > $MAX_BULGE) {return 0;}
274
275 # space size between miR and miR*
276 my $space=$tag2_beg-$tag1_end-1;
277 if ($space < $MIN_SPACE) {return 0;}
278 if ($space > $MAX_SPACE) {return 0;}
279
280 # size diff of miR and miR*
281 my $size_diff=abs($tag1_length-$tag2_length);
282 if ($size_diff > $MAX_SIZEDIFF) {return 0;}
283
284 # build base pairing table
285 my %pairtable;
286 &parse_struct($struct,\%pairtable); # coords count from 1
287
288 my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end);
289 my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end);
290 my $asy=($asy1 < $asy2) ? $asy1 : $asy2;
291 if ($asy > $ASYMMETRY) {return 0}
292
293 # duplex fold, determine whether two matures like a miR/miR* ike duplex
294 my ($like_mir_duplex1,$duplex_pair,$overhang1,$overhang2)=likeMirDuplex1($tag1,$tag2);
295 # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context
296 my ($like_mir_duplex2,$duplex_pair2,$overhang_b,$overhang_t)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end);
297 if ($like_mir_duplex1==0 && $like_mir_duplex2==0) {
298 return 0;
299 }
300
301 print FA ">$chr:$strand:$ss..$ee\n$seq\n";
302 print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n";
303
304 return 1;
305 }
306
307 sub ffw1{
308 my ($seq,$tag,$chr,$strand,$ss,$ee)=@_;
309 my $pass=0;
310
311 my $N_count=$seq=~tr/N//;
312 if ($N_count > 5) {
313 return 0;
314 }
315
316 my $seq_length=length $seq;
317 my $tag_length=length $tag;
318
319 # position
320 my $tag_beg=index($seq,$tag,0)+1;
321 if ($tag_beg < 1) {
322 warn "[ffw1] coordinate error.\n";
323 return $pass;
324 }
325 my $tag_end=$tag_beg+length($tag)-1;
326
327
328 # define candidate precursor by hybrid short arm to long arm, not solid enough
329 my($beg,$end)=define_precursor($seq,$tag);
330 if (not defined $beg) {
331 return $pass;
332 }
333 if (not defined $end) {
334 return $pass;
335 }
336 $seq=substr($seq,$beg-1,$end-$beg+1);
337 $seq_length=length $seq;
338
339
340 # fold
341 #my ($struct,$mfe)=RNA::fold($seq);
342 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
343 my @rawfolds=split/\s+/,$rnafold;
344 my $struct=$rawfolds[1];
345 my $mfe=$rawfolds[-1];
346 $mfe=~s/\(//;
347 $mfe=~s/\)//;
348
349 if ($mfe > $MAX_ENERGY) {
350 $pass=0;
351 return $pass;
352 }
353
354 # reposition
355 $tag_beg=index($seq,$tag,0)+1;
356 if ($tag_beg < 1) {
357 warn "[ffw1] coordinate error.\n";
358 return 0;
359 }
360 $tag_end=$tag_beg+length($tag)-1;
361
362 my $tag_struct=substr($struct,$tag_beg-1,$tag_length);
363 my $tag_arm=which_arm($tag_struct);
364 my $tag_unpair=$tag_struct=~tr/.//;
365 my $tag_pair=$tag_length-$tag_unpair;
366 my $tag_max_bulge=biggest_bulge($tag_struct);
367 if ($tag_arm eq "-") { return $pass;}
368 # if ($tag_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag_unpair ($MAX_UNPAIR)"; return $pass}
369 if ($tag_pair < $MIN_PAIR) { return $pass;}
370 if ($tag_max_bulge > $MAX_BULGE) {return $pass;}
371
372 # build base pairing table
373 my %pairtable;
374 &parse_struct($struct,\%pairtable); # coords count from 1
375
376 # get star
377 my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end);
378 my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1);
379 my $star_length=$star_end-$star_beg+1;
380 my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1);
381 my $star_arm=which_arm($star_struct);
382 my $star_unpair=$star_struct=~tr/.//;
383 my $star_pair=$star_length-$star_unpair;
384 my $star_max_bulge=biggest_bulge($star_struct);
385 if ($star_arm eq "-") { return $pass;}
386 # if ($star_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$star_unpair ($MAX_UNPAIR)"; return $pass}
387 if ($star_pair < $MIN_PAIR) {return $pass;}
388 if ($star_max_bulge > $MAX_BULGE) {return $pass;}
389
390 if ($tag_arm eq $star_arm) {return $pass;}
391
392 # space size between miR and miR*
393 my $space;
394 if ($tag_beg < $star_beg) {
395 $space=$star_beg-$tag_end-1;
396 }
397 else {
398 $space=$tag_beg-$star_end-1;
399 }
400 if ($space < $MIN_SPACE) { return $pass;}
401 if ($space > $MAX_SPACE) { return $pass;}
402
403 # size diff
404 my $size_diff=abs($tag_length-$star_length);
405 if ($size_diff > $MAX_SIZEDIFF) { return $pass;}
406
407 # asymmetry
408 my $asy=get_asy(\%pairtable,$tag_beg,$tag_end);
409 if ($asy > $ASYMMETRY) {return $pass;}
410
411 $pass=1;
412 print FA ">$chr:$strand:$ss..$ee\n$seq\n";
413 print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n";
414 return $pass;
415
416 }
417 sub get_star {
418 my($table,$beg,$end)=@_;
419
420 my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2
421 foreach my $i ($beg..$end) {
422 if (defined $table->{$i}) {
423 my $j=$table->{$i};
424 $s1=$i;
425 $s2=$j;
426 last;
427 }
428 }
429 foreach my $i (reverse ($beg..$end)) {
430 if (defined $table->{$i}) {
431 my $j=$table->{$i};
432 $e1=$i;
433 $e2=$j;
434 last;
435 }
436 }
437 # print "$s1,$e1 $s2,$e2\n";
438
439 # correct terminus
440 my $off1=$s1-$beg;
441 my $off2=$end-$e1;
442 $s2+=$off1;
443 $s2+=2; # 081009
444 $e2-=$off2; $e2=1 if $e2 < 1;
445 $e2+=2; $e2=1 if $e2 < 1; # 081009
446 ($s2,$e2)=($e2,$s2) if ($s2 > $e2);
447 return ($s2,$e2);
448 }
449
450 sub define_precursor {
451 my $seq=shift;
452 my $tag=shift;
453
454 my $seq_length=length $seq;
455 my $tag_length=length $tag;
456 my $tag_beg=index($seq,$tag,0)+1;
457 my $tag_end=$tag_beg+$tag_length-1;
458
459 # split the candidate region into short arm and long arm
460 my $tag_arm;
461 my ($larm,$larm_beg,$larm_end);
462 my ($sarm,$sarm_beg,$sarm_end);
463 if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm
464 $sarm=substr($seq,0,$tag_end);
465 $larm=substr($seq,$tag_end);
466 $sarm_beg=1;
467 $sarm_end=$tag_end;
468 $larm_beg=$tag_end+1;
469 $larm_end=$seq_length;
470 $tag_arm="5p";
471 }
472 else {
473 $larm=substr($seq,0,$tag_beg-1); # on 3' arm
474 $sarm=substr($seq,$tag_beg-1);
475 $larm_beg=1;
476 $larm_end=$tag_beg-1;
477 $sarm_beg=$tag_beg;
478 $sarm_end=$seq_length;
479 $tag_arm="3p";
480 }
481
482 # print "$sarm_beg,$sarm_end $sarm\n";
483 # print "$larm_beg,$larm_end $larm\n";
484
485 # clipping short arm
486 if ($tag_arm eq "5p") {
487 $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1;
488 $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1);
489 }
490 else {
491 $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length;
492 $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1);
493 }
494 # print "$sarm_beg,$sarm_end $sarm\n";
495 # print "$larm_beg,$larm_end $larm\n";
496
497 # define the precursor by hybriding short arm to long arm
498 =cut #modify in 2014-10-28
499 my $duplex=RNA::duplexfold($sarm,$larm);
500 my $struct=$duplex->{structure};
501 my $energy=sprintf "%.2f", $duplex->{energy};
502 my ($str1,$str2)=split(/&/,$struct);
503 my $pair=$str1=~tr/(//;
504 # print "pair=$pair\n";
505 my $beg1=$duplex->{i}+1-length($str1);
506 my $end1=$duplex->{i};
507 my $beg2=$duplex->{j};
508 my $end2=$duplex->{j}+length($str2)-1;
509 =cut
510 ###### new codes begin
511 my $duplex=`perl -e 'print "$sarm\n$larm"' | RNAduplex`;
512 #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20)
513 my @tmpduplex=split/\s+/,$duplex;
514 my $struct=$tmpduplex[0];
515 $tmpduplex[-1]=~s/[(|)]//g;
516 my $energy=$tmpduplex[-1];
517 my ($str1,$str2)=split(/&/,$struct);
518 my $pair=$str1=~tr/(//;
519 my ($beg1,$end1)=split/,/,$tmpduplex[1];
520 my ($beg2,$end2)=split/,/,$tmpduplex[3];
521 ######## new codes end
522
523 # print "$beg1:$end1 $beg2:$end2\n";
524 # transform coordinates
525 $beg1=$beg1+$sarm_beg-1;
526 $end1=$end1+$sarm_beg-1;
527 $beg2=$beg2+$larm_beg-1;
528 $end2=$end2+$larm_beg-1;
529 # print "$beg1:$end1 $beg2:$end2\n";
530
531 my $off5p=$beg1-$sarm_beg;
532 my $off3p=$sarm_end-$end1;
533 $beg2-=$off3p; $beg2=1 if $beg2 < 1;
534 $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length;
535
536 # print "$beg1:$end1 $beg2:$end2\n";
537
538 my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2;
539 my $end=$sarm_end > $end2 ? $sarm_end : $end2;
540
541 return if $pair < $MIN_PAIR;
542 # print "$beg,$end\n";
543 return ($beg,$end);
544 }
545
546
547 # duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex
548 sub likeMirDuplex1 {
549 my $seq1=shift;
550 my $seq2=shift;
551 my $like_mir_duplex=1;
552
553 my $length1=length $seq1;
554 my $length2=length $seq2;
555 =cut
556 my $duplex=RNA::duplexfold($seq1, $seq2);
557 my $duplex_struct=$duplex->{structure};
558 my $duplex_energy=sprintf "%.2f", $duplex->{energy};
559 my ($str1,$str2)=split(/&/,$duplex_struct);
560 my $beg1=$duplex->{i}+1-length($str1);
561 my $end1=$duplex->{i};
562 my $beg2=$duplex->{j};
563 my $end2=$duplex->{j}+length($str2)-1;
564 =cut
565 my $duplex=`perl -e 'print "$seq1\n$seq2"' | RNAduplex`;
566 #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20)
567 my @tmpduplex=split/\s+/,$duplex;
568 my $duplex_struct=$tmpduplex[0];
569 $tmpduplex[-1]=~s/[(|)]//g;
570 my $duplex_energy=$tmpduplex[-1];
571 my ($str1,$str2)=split(/&/,$duplex_struct);
572 #my $pair=$str1=~tr/(//;
573 my ($beg1,$end1)=split/,/,$tmpduplex[1];
574 my ($beg2,$end2)=split/,/,$tmpduplex[3];
575
576 # revise beg1, end1, beg2, end2
577 $str1=~/^(\.*)/;
578 $beg1+=length($1);
579 $str1=~/(\.*)$/;
580 $end1-=length($1);
581 $str2=~/^(\.*)/;
582 $beg2+=length($1);
583 $str2=~/(\.*)$/;
584 $end2-=length($1);
585
586 my $pair_num=$str1=~tr/(//;
587 my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom
588 my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck
589 # print $pair_num,"\n";
590 # print $overhang1,"\n";
591 # print $overhang2,"\n";
592 if ($pair_num < 13) {
593 $like_mir_duplex=0;
594 }
595 if ($overhang1 < 0 || $overhang2 < 0 ) {
596 $like_mir_duplex=0;
597 }
598 if ($overhang1 > 4 || $overhang2 > 4) {
599 $like_mir_duplex=0;
600 }
601 return ($like_mir_duplex,$pair_num,$overhang1,$overhang2);
602 }
603
604 # judge whether two matures form miR/miR* duplex, in hairpin context
605 sub likeMirDuplex2 {
606 my ($table,$beg1,$end1,$beg2,$end2)=@_;
607 my $like_mir_duplex=1;
608
609 # s1 e1
610 # 5 ----------------------------3
611 # | | |||| ||| |
612 #3 -------------------------------5
613 # e2 s2
614
615 my $pair_num=0;
616 my $overhang1=0;
617 my $overhang2=0;
618 my ($s1,$e1,$s2,$e2);
619 foreach my $i ($beg1..$end1) {
620 if (defined $table->{$i}) {
621 my $j=$table->{$i};
622 if ($j <= $end2 && $j >= $beg2) {
623 $s1=$i;
624 $e2=$j;
625 last;
626 }
627 }
628 }
629 foreach my $i (reverse ($beg1..$end1)) {
630 if (defined $table->{$i}) {
631 my $j=$table->{$i};
632 if ($j <= $end2 && $j >= $beg2) {
633 $e1=$i;
634 $s2=$j;
635 last;
636 }
637 }
638 }
639
640 # print "$beg1,$end1 $s1,$e1\n";
641 # print "$beg2,$end2 $s2,$e2\n";
642
643 foreach my $i ($beg1..$end1) {
644 if (defined $table->{$i}) {
645 my $j=$table->{$i};
646 if ($j <= $end2 && $j >= $beg2) {
647 ++$pair_num;
648 }
649 }
650 }
651 if (defined $s1 && defined $e2) {
652 $overhang1=($end2-$e2)-($s1-$beg1);
653 }
654 if (defined $e1 && defined $s2) {
655 $overhang2=($end1-$e1)-($s2-$beg2);
656 }
657
658 if ($pair_num < 13) {
659 $like_mir_duplex=0;
660 }
661 if ($overhang1 < 0 && $overhang2 < 0) {
662 $like_mir_duplex=0;
663 }
664 return ($like_mir_duplex,$pair_num,$overhang1,$overhang2);
665 }
666 sub parse_struct {
667 my $struct=shift;
668 my $table=shift;
669
670 my @t=split('',$struct);
671 my @lbs; # left brackets
672 foreach my $k (0..$#t) {
673 if ($t[$k] eq "(") {
674 push @lbs, $k+1;
675 }
676 elsif ($t[$k] eq ")") {
677 my $lb=pop @lbs;
678 my $rb=$k+1;
679 $table->{$lb}=$rb;
680 $table->{$rb}=$lb;
681 }
682 }
683 if (@lbs) {
684 warn "unbalanced RNA struct.\n";
685 }
686 }
687 sub which_arm {
688 my $substruct=shift;
689 my $arm;
690 if ($substruct=~/\(/ && $substruct=~/\)/) {
691 $arm="-";
692 }
693 elsif ($substruct=~/\(/) {
694 $arm="5p";
695 }
696 else {
697 $arm="3p";
698 }
699 return $arm;
700 }
701 sub biggest_bulge {
702 my $struct=shift;
703 my $bulge_size=0;
704 my $max_bulge=0;
705 while ($struct=~/(\.+)/g) {
706 $bulge_size=length $1;
707 if ($bulge_size > $max_bulge) {
708 $max_bulge=$bulge_size;
709 }
710 }
711 return $max_bulge;
712 }
713 sub get_asy {
714 my($table,$a1,$a2)=@_;
715 my ($pre_i,$pre_j);
716 my $asymmetry=0;
717 foreach my $i ($a1..$a2) {
718 if (defined $table->{$i}) {
719 my $j=$table->{$i};
720 if (defined $pre_i && defined $pre_j) {
721 my $diff=($i-$pre_i)+($j-$pre_j);
722 $asymmetry += abs($diff);
723 }
724 $pre_i=$i;
725 $pre_j=$j;
726 }
727 }
728 return $asymmetry;
729 }
730
731 sub peaks{
732 my @cluster=@{$_[0]};
733
734 return if(@cluster<1);
735
736 my $max=0; my $index=-1;
737 for (my $i=0;$i<@cluster;$i++) {
738 if($cluster[$i][2]>$max){
739 $max=$cluster[$i][2];
740 $index=$i;
741 }
742 }
743 # &excise(\@cluster,$index,$_[1],$_[2]);
744 return($index);
745 }
746
747 sub find_cites{
748 my @tmp=@{$_[0]};
749 my $i=&peaks(\@tmp);
750
751 my $start=$tmp[$i][0];
752 my $total=0; my $node5=0;
753 for (my $j=0;$j<@tmp ;$j++) {
754 $total+=$tmp[$j][2];
755 $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2);
756 }
757 push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5);
758 }
759
760 sub newpos{
761 my ($a,$b,$c,$d)=@_;
762 my $s= $a>$c ? $c : $a;
763 my $e=$b>$d ? $b : $d;
764 return($s,$e);
765 }
766
767 sub rev{
768
769 my($sequence)=@_;
770
771 my $rev=reverse $sequence;
772
773 return $rev;
774 }
775
776 sub com{
777
778 my($sequence)=@_;
779
780 $sequence=~tr/acgtuACGTU/TGCAATGCAA/;
781
782 return $sequence;
783 }
784
785 sub revcom{
786
787 my($sequence)=@_;
788
789 my $revcom=rev(com($sequence));
790
791 return $revcom;
792 }
793
794 sub find_strand{
795
796 #A subroutine to find the strand, parsing different blast formats
797 my($other)=@_;
798
799 my $strand="+";
800
801 if($other=~/-/){
802 $strand="-";
803 }
804
805 if($other=~/minus/i){
806 $strand="-";
807 }
808
809 return($strand);
810 }
811 sub usage{
812 print <<"USAGE";
813 Version $version
814 Usage:
815 $0 -map -g -d -f -o -s -e
816 options:
817 -map input file# align result # bst. format
818 -g input file # genome sequence fasta format
819 -d <int> Maximal space between miRNA and miRNA* (200)
820 -f <int> Flank sequence length of miRNA precursor (10)
821 -o output file# percursor fasta file
822 -s output file# precursor structure file
823 -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol)
824
825 -h help
826 USAGE
827 exit(1);
828 }
829