Mercurial > repos > bioitcore > splicetrap
diff bin/get_bed_fa_j.pl @ 1:adc0f7765d85 draft
planemo upload
author | bioitcore |
---|---|
date | Thu, 07 Sep 2017 15:06:58 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/get_bed_fa_j.pl Thu Sep 07 15:06:58 2017 -0400 @@ -0,0 +1,330 @@ +# Adapted from Chenghai Xue's script + +$starttime=time(); + +$input_file_1 = $ARGV[0]; # exon junction file +$input_file_2 = $ARGV[1]; # genome file list +$output_file_1 = $ARGV[2]; # exon junction bed (might be less than input_file_1 +$output_file_2 = $ARGV[3]; # exon junction fa +#$leftLen = $ARGV[4]; +#$rightLen = $ARGV[5]; + +open(IN_1, "$input_file_1") or die "can't open the input file : $!"; +open(IN_2, "$input_file_2") or die "can't open the input file : $!"; +open OUT_1, ">$output_file_1" or die "Can not open output_file : $!"; +open OUT_2, ">$output_file_2" or die "Can not open output_file : $!"; + +@chromList = (<IN_2>); +chomp(@chromList); +$len_chromList = @chromList; +print "BED2FA: in $input_file_2, found $len_chromList chromosomes\n"; +foreach $one (@chromList){ + if($one =~ /\/(chr.[^\/]*?)\.*fa$/i){ + $chr_hash{$1} = $one; + #print $1,"\n"; + } +} +@key_chr_hash = keys(%chr_hash); +$len_key_chr_hash = @key_chr_hash; +@sort_key_chr_hash = sort_chromNo(@key_chr_hash); +$len_sort_key_chr_hash = @sort_key_chr_hash; +#for($i=0; $i<$len_sort_key_chr_hash; $i++){ +# print "$sort_key_chr_hash[$i] $chr_hash{$sort_key_chr_hash[$i]}\n"; +#} + +$num_1=0; +$num_2=0; +$num_count_chrom=0; +my ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts); +$current_chrom = ""; +while(<IN_1>){ + $num_1++; + $line = $_; + chomp $line; + #print $line,"\n"; + @cols = split ("\t", $line); + if(scalar(@cols)==12) + { + ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts) = @cols; + } + if(scalar(@cols)!=12) + { + ($chrom, $chromStart, $chromEnd, $name, $score, $strand)=@cols; + $thickStart=$chromStart; + # print $thickStart,"\n"; + $thickEnd = $chromEnd; + $blockCount=1; + $blockSizes=$chromEnd-$chromStart; + $blockStarts = 0; + } + $strand="+" if !$strand; + @a_blockSizes = split (/\,/, $blockSizes); + @a_blockStarts = split (/\,/, $blockStarts); + if($chrom ne $current_chrom){ + if($num_1 != 1){ + print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n"; + } + print "BED2FA: $chrom: "; + + $num_chr_1=0; + $num_chr_2=0; + + if(exists $chr_hash{$chrom}){ + $num_count_chrom++; + $current_chrom = $chrom; + #print $current_chrom,"\n"; +#=pod + $chromFastaFile = $chr_hash{$chrom}; + #print $chromFastaFile,"\n"; + open($fin, "<$chromFastaFile") or die "can't open the chrom file : $!"; + local ($/) = undef; + $contigSeqStr = <$fin>; + close ($fin); + #print $contigSeqStr,"mark\t"; + $contigSeqStr =~s/^\>.*?\n//g; + #print $contigSeqStr,"mark2\t"; + + $contigSeqStr =~s/\s|\n//g; + #print $contigSeqStr,"mark3\n"; + + $len_contigSeqStr = length $contigSeqStr; +#=cut + } + else{ + $num_chr_1++; + next; + } + } + $num_chr_1++; + +# modify from here................................ + my @Starts; + my @Ends; + my @JuncSeq; + my $ssStrTag=1; + for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++) + { + $Starts[$i_wuj] = $chromStart + $a_blockStarts[$i_wuj]; + $Ends[$i_wuj] = $Starts[$i_wuj] + $a_blockSizes[$i_wuj]; + $JuncSeq[$i_wuj] = uc substr ($contigSeqStr,$Starts[$i_wuj], $a_blockSizes[$i_wuj]); + if($strand eq "-"){ + $JuncSeq[$i_wuj] = uc string_reverse_complement(lc $JuncSeq[$i_wuj]); + } + } + # for($i_wuj=0;$i_wuj<$blockCount-1;$i_wuj++) +# { +# $ssStr = uc substr ($contigSeqStr, $Ends[$i_wuj], 2) . substr ($contigSeqStr, $Starts[$i_wuj+1] - 2, 2); +# if($strand eq "-"){ + # $ssStr = uc string_reverse_complement(lc $ssStr); + #$ssStr = $rc_ssStr; +# } +# $ssStrTag = 0 if ($ssStr ne "GTAG"); + + # } +# if($ssStrTag ==1){ + if(1){ + $num_2++; + $num_chr_2++; + print OUT_1 "$line\n"; + #print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$ssStr\|$num_2\n$junctionSeqStrLeft$junctionSeqStrRight\n"; +# print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|GTAG\|$num_2\|$blockCount\n"; + print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$num_2\|$blockCount\n"; + if($strand eq "+") + { + for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++) + { + print OUT_2 $JuncSeq[$i_wuj]; + } + } + else + { + for($i_wuj=$blockCount-1;$i_wuj>-1;$i_wuj--) + { + print OUT_2 $JuncSeq[$i_wuj]; + } + + } + print OUT_2 "\n"; + } + +} +print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n"; +print "BED2FA: in file1, $num_count_chrom chroms, $num_1 beds, $num_2 saved.\n"; + +close IN_1 or die "can't close the input file : $!"; +close IN_2 or die "can't close the input file : $!"; +close OUT_1 or die "can't close the output file : $!"; +close OUT_2 or die "can't close the output file : $!"; + +####################################### +$complete_time = time()-$starttime; +print "BED2FA: Run $complete_time seconds...Done!\n"; + +####################################### +# sub fuctions +sub string_reverse_complement{ + local($string) = @_; + local($len_str, $ret, $i, $char); + + $len_str = length $string; + $ret = ""; + for($i=0; $i<$len_str; $i++){ + $char = substr($string, $i, 1); + if($char eq 'a'){ + $char = 't'; + } + elsif($char eq 't'){ + $char = 'a'; + } + elsif($char eq 'c'){ + $char = 'g'; + } + elsif($char eq 'g'){ + $char = 'c'; + } + else{ + $char = 'n'; + } + $ret = $char.$ret; + } + + return $ret; +} + +sub sort_chromNo{ + local(@chrom) = @_; + local($len_key_chr_hash, $i, @sort_chr_hash); + local(@digit_random, @words_random, @digit_other_1, @digit_other_2, @words_other_1, @words_other_2, @digit, @words); + local(@sort_digit, @sort_words, @sort_digit_random, @sort_words_random, @sort_digit_other, @sort_words_other); + local($len_digit, $len_words, $len_digit_random, $len_words_random, $len_digit_other, $len_words_other, $term); + + $len_key_chr_hash = @chrom; + # sort via chr number for printing result + for($i=0; $i<$len_key_chr_hash; $i++){ + if($key_chr_hash[$i] =~ /chr(\d+)\_random/){ + push(@digit_random, $1); + } + elsif($key_chr_hash[$i] =~ /chr(\w+)\_random/){ + push(@words_random, $1); + } + elsif($key_chr_hash[$i] =~ /chr(\d+)\_([\w\d\_]+)/){ + push(@digit_other_1, $1); + push(@digit_other_2, $2); + } + elsif($key_chr_hash[$i] =~ /chr(\w+)\_([\w\d\_]+)/){ + push(@words_other_1, $1); + push(@words_other_2, $2); + } + elsif($key_chr_hash[$i] =~ /chr(\d+)/){ + push(@digit, $1); + } + elsif($key_chr_hash[$i] =~ /chr(\w+)/){ + push(@words, $1); + } + else{ + print "BED2FA: There is unknown type of chromosomes: $key_chr_hash[$i]\n"; + } + } + @sort_digit = sort by_mostly_numeric @digit; + @sort_words = sort by_mostly_string @words; + @sort_digit_random = sort by_mostly_numeric @digit_random; + @sort_words_random = sort by_mostly_string @words_random; + @sort_digit_other = sort_2_array_number_string(\@digit_other_1, \@digit_other_2); + @sort_words_other = sort_2_array_string_string(\@words_other_1, \@words_other_2); + + $len_digit = @sort_digit; + for($i=0; $i<$len_digit; $i++){ + $term = "chr".$sort_digit[$i]; + push(@sort_chr_hash, $term); + } + $len_words = @sort_words; + for($i=0; $i<$len_words; $i++){ + $term = "chr".$sort_words[$i]; + push(@sort_chr_hash, $term); + } + $len_digit_random = @sort_digit_random; + for($i=0; $i<$len_digit_random; $i++){ + $term = "chr".$sort_digit_random[$i]."_random"; + push(@sort_chr_hash, $term); + } + $len_words_random = @sort_words_random; + for($i=0; $i<$len_words_random; $i++){ + $term = "chr".$sort_words_random[$i]."_random"; + push(@sort_chr_hash, $term); + } + $len_digit_other = @sort_digit_other; + for($i=0; $i<$len_digit_other; $i=$i+2){ + $term = "chr".$sort_digit_other[$i]."_".$sort_digit_other[$i+1]; + push(@sort_chr_hash, $term); + } + $len_words_other = @sort_words_other; + for($i=0; $i<$len_words_other; $i=$i+2){ + $term = "chr".$sort_words_other[$i]."_".$sort_words_other[$i+1]; + push(@sort_chr_hash, $term); + } + + return @sort_chr_hash; +} + +sub sort_2_array_number_string{ + local($a, $b) = @_; + local($len_a, $len_b, $i, %family, $one, $two); + local(@ret); + + $len_a = @$a; + $len_b = @$b; + if($len_a == $len_b){ + for($i=0; $i<$len_a; $i++){ + $family{$$a[$i]}{$$b[$i]} = 0; + } + for $one (sort by_mostly_numeric keys %family) { + for $two (sort by_mostly_string keys %{ $family{$one} }) { + push(@ret, $one); + push(@ret, $two); + } + } + } + else{ + print "ERROR: Sort array is not same size\n"; + print "a $len_a, b $len_b\n"; + } + + return @ret; +} + +sub sort_2_array_string_string{ + local($a, $b) = @_; + local($len_a, $len_b, $i, %family, $one, $two); + local(@ret); + + $len_a = @$a; + $len_b = @$b; + if($len_a == $len_b){ + for($i=0; $i<$len_a; $i++){ + $family{$$a[$i]}{$$b[$i]} = 0; + } + for $one (sort by_mostly_string keys %family) { + for $two (sort by_mostly_string keys %{ $family{$one} }) { + push(@ret, $one); + push(@ret, $two); + } + } + } + else{ + print "ERROR: Sort array is not same size\n"; + print "a $len_a, b $len_b\n"; + } + + return @ret; +} + +sub by_mostly_numeric{ +# ( $a <=> $b ) || ( $a cmp $b ); + ( $a <=> $b ); +} + +sub by_mostly_string{ +# ( $a <=> $b ) || ( $a cmp $b ); + ( $a cmp $b ); +} +