Mercurial > repos > bioitcore > splicetrap
view bin/get_bed_fa_j.pl @ 5:2ebca9da5e42 draft default tip
planemo upload
author | bioitcore |
---|---|
date | Thu, 07 Sep 2017 17:39:24 -0400 |
parents | adc0f7765d85 |
children |
line wrap: on
line source
# Adapted from Chenghai Xue's script $starttime=time(); $input_file_1 = $ARGV[0]; # exon junction file $input_file_2 = $ARGV[1]; # genome file list $output_file_1 = $ARGV[2]; # exon junction bed (might be less than input_file_1 $output_file_2 = $ARGV[3]; # exon junction fa #$leftLen = $ARGV[4]; #$rightLen = $ARGV[5]; open(IN_1, "$input_file_1") or die "can't open the input file : $!"; open(IN_2, "$input_file_2") or die "can't open the input file : $!"; open OUT_1, ">$output_file_1" or die "Can not open output_file : $!"; open OUT_2, ">$output_file_2" or die "Can not open output_file : $!"; @chromList = (<IN_2>); chomp(@chromList); $len_chromList = @chromList; print "BED2FA: in $input_file_2, found $len_chromList chromosomes\n"; foreach $one (@chromList){ if($one =~ /\/(chr.[^\/]*?)\.*fa$/i){ $chr_hash{$1} = $one; #print $1,"\n"; } } @key_chr_hash = keys(%chr_hash); $len_key_chr_hash = @key_chr_hash; @sort_key_chr_hash = sort_chromNo(@key_chr_hash); $len_sort_key_chr_hash = @sort_key_chr_hash; #for($i=0; $i<$len_sort_key_chr_hash; $i++){ # print "$sort_key_chr_hash[$i] $chr_hash{$sort_key_chr_hash[$i]}\n"; #} $num_1=0; $num_2=0; $num_count_chrom=0; my ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts); $current_chrom = ""; while(<IN_1>){ $num_1++; $line = $_; chomp $line; #print $line,"\n"; @cols = split ("\t", $line); if(scalar(@cols)==12) { ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts) = @cols; } if(scalar(@cols)!=12) { ($chrom, $chromStart, $chromEnd, $name, $score, $strand)=@cols; $thickStart=$chromStart; # print $thickStart,"\n"; $thickEnd = $chromEnd; $blockCount=1; $blockSizes=$chromEnd-$chromStart; $blockStarts = 0; } $strand="+" if !$strand; @a_blockSizes = split (/\,/, $blockSizes); @a_blockStarts = split (/\,/, $blockStarts); if($chrom ne $current_chrom){ if($num_1 != 1){ print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n"; } print "BED2FA: $chrom: "; $num_chr_1=0; $num_chr_2=0; if(exists $chr_hash{$chrom}){ $num_count_chrom++; $current_chrom = $chrom; #print $current_chrom,"\n"; #=pod $chromFastaFile = $chr_hash{$chrom}; #print $chromFastaFile,"\n"; open($fin, "<$chromFastaFile") or die "can't open the chrom file : $!"; local ($/) = undef; $contigSeqStr = <$fin>; close ($fin); #print $contigSeqStr,"mark\t"; $contigSeqStr =~s/^\>.*?\n//g; #print $contigSeqStr,"mark2\t"; $contigSeqStr =~s/\s|\n//g; #print $contigSeqStr,"mark3\n"; $len_contigSeqStr = length $contigSeqStr; #=cut } else{ $num_chr_1++; next; } } $num_chr_1++; # modify from here................................ my @Starts; my @Ends; my @JuncSeq; my $ssStrTag=1; for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++) { $Starts[$i_wuj] = $chromStart + $a_blockStarts[$i_wuj]; $Ends[$i_wuj] = $Starts[$i_wuj] + $a_blockSizes[$i_wuj]; $JuncSeq[$i_wuj] = uc substr ($contigSeqStr,$Starts[$i_wuj], $a_blockSizes[$i_wuj]); if($strand eq "-"){ $JuncSeq[$i_wuj] = uc string_reverse_complement(lc $JuncSeq[$i_wuj]); } } # for($i_wuj=0;$i_wuj<$blockCount-1;$i_wuj++) # { # $ssStr = uc substr ($contigSeqStr, $Ends[$i_wuj], 2) . substr ($contigSeqStr, $Starts[$i_wuj+1] - 2, 2); # if($strand eq "-"){ # $ssStr = uc string_reverse_complement(lc $ssStr); #$ssStr = $rc_ssStr; # } # $ssStrTag = 0 if ($ssStr ne "GTAG"); # } # if($ssStrTag ==1){ if(1){ $num_2++; $num_chr_2++; print OUT_1 "$line\n"; #print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$ssStr\|$num_2\n$junctionSeqStrLeft$junctionSeqStrRight\n"; # print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|GTAG\|$num_2\|$blockCount\n"; print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$num_2\|$blockCount\n"; if($strand eq "+") { for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++) { print OUT_2 $JuncSeq[$i_wuj]; } } else { for($i_wuj=$blockCount-1;$i_wuj>-1;$i_wuj--) { print OUT_2 $JuncSeq[$i_wuj]; } } print OUT_2 "\n"; } } print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n"; print "BED2FA: in file1, $num_count_chrom chroms, $num_1 beds, $num_2 saved.\n"; close IN_1 or die "can't close the input file : $!"; close IN_2 or die "can't close the input file : $!"; close OUT_1 or die "can't close the output file : $!"; close OUT_2 or die "can't close the output file : $!"; ####################################### $complete_time = time()-$starttime; print "BED2FA: Run $complete_time seconds...Done!\n"; ####################################### # sub fuctions sub string_reverse_complement{ local($string) = @_; local($len_str, $ret, $i, $char); $len_str = length $string; $ret = ""; for($i=0; $i<$len_str; $i++){ $char = substr($string, $i, 1); if($char eq 'a'){ $char = 't'; } elsif($char eq 't'){ $char = 'a'; } elsif($char eq 'c'){ $char = 'g'; } elsif($char eq 'g'){ $char = 'c'; } else{ $char = 'n'; } $ret = $char.$ret; } return $ret; } sub sort_chromNo{ local(@chrom) = @_; local($len_key_chr_hash, $i, @sort_chr_hash); local(@digit_random, @words_random, @digit_other_1, @digit_other_2, @words_other_1, @words_other_2, @digit, @words); local(@sort_digit, @sort_words, @sort_digit_random, @sort_words_random, @sort_digit_other, @sort_words_other); local($len_digit, $len_words, $len_digit_random, $len_words_random, $len_digit_other, $len_words_other, $term); $len_key_chr_hash = @chrom; # sort via chr number for printing result for($i=0; $i<$len_key_chr_hash; $i++){ if($key_chr_hash[$i] =~ /chr(\d+)\_random/){ push(@digit_random, $1); } elsif($key_chr_hash[$i] =~ /chr(\w+)\_random/){ push(@words_random, $1); } elsif($key_chr_hash[$i] =~ /chr(\d+)\_([\w\d\_]+)/){ push(@digit_other_1, $1); push(@digit_other_2, $2); } elsif($key_chr_hash[$i] =~ /chr(\w+)\_([\w\d\_]+)/){ push(@words_other_1, $1); push(@words_other_2, $2); } elsif($key_chr_hash[$i] =~ /chr(\d+)/){ push(@digit, $1); } elsif($key_chr_hash[$i] =~ /chr(\w+)/){ push(@words, $1); } else{ print "BED2FA: There is unknown type of chromosomes: $key_chr_hash[$i]\n"; } } @sort_digit = sort by_mostly_numeric @digit; @sort_words = sort by_mostly_string @words; @sort_digit_random = sort by_mostly_numeric @digit_random; @sort_words_random = sort by_mostly_string @words_random; @sort_digit_other = sort_2_array_number_string(\@digit_other_1, \@digit_other_2); @sort_words_other = sort_2_array_string_string(\@words_other_1, \@words_other_2); $len_digit = @sort_digit; for($i=0; $i<$len_digit; $i++){ $term = "chr".$sort_digit[$i]; push(@sort_chr_hash, $term); } $len_words = @sort_words; for($i=0; $i<$len_words; $i++){ $term = "chr".$sort_words[$i]; push(@sort_chr_hash, $term); } $len_digit_random = @sort_digit_random; for($i=0; $i<$len_digit_random; $i++){ $term = "chr".$sort_digit_random[$i]."_random"; push(@sort_chr_hash, $term); } $len_words_random = @sort_words_random; for($i=0; $i<$len_words_random; $i++){ $term = "chr".$sort_words_random[$i]."_random"; push(@sort_chr_hash, $term); } $len_digit_other = @sort_digit_other; for($i=0; $i<$len_digit_other; $i=$i+2){ $term = "chr".$sort_digit_other[$i]."_".$sort_digit_other[$i+1]; push(@sort_chr_hash, $term); } $len_words_other = @sort_words_other; for($i=0; $i<$len_words_other; $i=$i+2){ $term = "chr".$sort_words_other[$i]."_".$sort_words_other[$i+1]; push(@sort_chr_hash, $term); } return @sort_chr_hash; } sub sort_2_array_number_string{ local($a, $b) = @_; local($len_a, $len_b, $i, %family, $one, $two); local(@ret); $len_a = @$a; $len_b = @$b; if($len_a == $len_b){ for($i=0; $i<$len_a; $i++){ $family{$$a[$i]}{$$b[$i]} = 0; } for $one (sort by_mostly_numeric keys %family) { for $two (sort by_mostly_string keys %{ $family{$one} }) { push(@ret, $one); push(@ret, $two); } } } else{ print "ERROR: Sort array is not same size\n"; print "a $len_a, b $len_b\n"; } return @ret; } sub sort_2_array_string_string{ local($a, $b) = @_; local($len_a, $len_b, $i, %family, $one, $two); local(@ret); $len_a = @$a; $len_b = @$b; if($len_a == $len_b){ for($i=0; $i<$len_a; $i++){ $family{$$a[$i]}{$$b[$i]} = 0; } for $one (sort by_mostly_string keys %family) { for $two (sort by_mostly_string keys %{ $family{$one} }) { push(@ret, $one); push(@ret, $two); } } } else{ print "ERROR: Sort array is not same size\n"; print "a $len_a, b $len_b\n"; } return @ret; } sub by_mostly_numeric{ # ( $a <=> $b ) || ( $a cmp $b ); ( $a <=> $b ); } sub by_mostly_string{ # ( $a <=> $b ) || ( $a cmp $b ); ( $a cmp $b ); }