1
|
1 # Adapted from Chenghai Xue's script
|
|
2
|
|
3 $starttime=time();
|
|
4
|
|
5 $input_file_1 = $ARGV[0]; # exon junction file
|
|
6 $input_file_2 = $ARGV[1]; # genome file list
|
|
7 $output_file_1 = $ARGV[2]; # exon junction bed (might be less than input_file_1
|
|
8 $output_file_2 = $ARGV[3]; # exon junction fa
|
|
9 #$leftLen = $ARGV[4];
|
|
10 #$rightLen = $ARGV[5];
|
|
11
|
|
12 open(IN_1, "$input_file_1") or die "can't open the input file : $!";
|
|
13 open(IN_2, "$input_file_2") or die "can't open the input file : $!";
|
|
14 open OUT_1, ">$output_file_1" or die "Can not open output_file : $!";
|
|
15 open OUT_2, ">$output_file_2" or die "Can not open output_file : $!";
|
|
16
|
|
17 @chromList = (<IN_2>);
|
|
18 chomp(@chromList);
|
|
19 $len_chromList = @chromList;
|
|
20 print "BED2FA: in $input_file_2, found $len_chromList chromosomes\n";
|
|
21 foreach $one (@chromList){
|
|
22 if($one =~ /\/(chr.[^\/]*?)\.*fa$/i){
|
|
23 $chr_hash{$1} = $one;
|
|
24 #print $1,"\n";
|
|
25 }
|
|
26 }
|
|
27 @key_chr_hash = keys(%chr_hash);
|
|
28 $len_key_chr_hash = @key_chr_hash;
|
|
29 @sort_key_chr_hash = sort_chromNo(@key_chr_hash);
|
|
30 $len_sort_key_chr_hash = @sort_key_chr_hash;
|
|
31 #for($i=0; $i<$len_sort_key_chr_hash; $i++){
|
|
32 # print "$sort_key_chr_hash[$i] $chr_hash{$sort_key_chr_hash[$i]}\n";
|
|
33 #}
|
|
34
|
|
35 $num_1=0;
|
|
36 $num_2=0;
|
|
37 $num_count_chrom=0;
|
|
38 my ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts);
|
|
39 $current_chrom = "";
|
|
40 while(<IN_1>){
|
|
41 $num_1++;
|
|
42 $line = $_;
|
|
43 chomp $line;
|
|
44 #print $line,"\n";
|
|
45 @cols = split ("\t", $line);
|
|
46 if(scalar(@cols)==12)
|
|
47 {
|
|
48 ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts) = @cols;
|
|
49 }
|
|
50 if(scalar(@cols)!=12)
|
|
51 {
|
|
52 ($chrom, $chromStart, $chromEnd, $name, $score, $strand)=@cols;
|
|
53 $thickStart=$chromStart;
|
|
54 # print $thickStart,"\n";
|
|
55 $thickEnd = $chromEnd;
|
|
56 $blockCount=1;
|
|
57 $blockSizes=$chromEnd-$chromStart;
|
|
58 $blockStarts = 0;
|
|
59 }
|
|
60 $strand="+" if !$strand;
|
|
61 @a_blockSizes = split (/\,/, $blockSizes);
|
|
62 @a_blockStarts = split (/\,/, $blockStarts);
|
|
63 if($chrom ne $current_chrom){
|
|
64 if($num_1 != 1){
|
|
65 print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n";
|
|
66 }
|
|
67 print "BED2FA: $chrom: ";
|
|
68
|
|
69 $num_chr_1=0;
|
|
70 $num_chr_2=0;
|
|
71
|
|
72 if(exists $chr_hash{$chrom}){
|
|
73 $num_count_chrom++;
|
|
74 $current_chrom = $chrom;
|
|
75 #print $current_chrom,"\n";
|
|
76 #=pod
|
|
77 $chromFastaFile = $chr_hash{$chrom};
|
|
78 #print $chromFastaFile,"\n";
|
|
79 open($fin, "<$chromFastaFile") or die "can't open the chrom file : $!";
|
|
80 local ($/) = undef;
|
|
81 $contigSeqStr = <$fin>;
|
|
82 close ($fin);
|
|
83 #print $contigSeqStr,"mark\t";
|
|
84 $contigSeqStr =~s/^\>.*?\n//g;
|
|
85 #print $contigSeqStr,"mark2\t";
|
|
86
|
|
87 $contigSeqStr =~s/\s|\n//g;
|
|
88 #print $contigSeqStr,"mark3\n";
|
|
89
|
|
90 $len_contigSeqStr = length $contigSeqStr;
|
|
91 #=cut
|
|
92 }
|
|
93 else{
|
|
94 $num_chr_1++;
|
|
95 next;
|
|
96 }
|
|
97 }
|
|
98 $num_chr_1++;
|
|
99
|
|
100 # modify from here................................
|
|
101 my @Starts;
|
|
102 my @Ends;
|
|
103 my @JuncSeq;
|
|
104 my $ssStrTag=1;
|
|
105 for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++)
|
|
106 {
|
|
107 $Starts[$i_wuj] = $chromStart + $a_blockStarts[$i_wuj];
|
|
108 $Ends[$i_wuj] = $Starts[$i_wuj] + $a_blockSizes[$i_wuj];
|
|
109 $JuncSeq[$i_wuj] = uc substr ($contigSeqStr,$Starts[$i_wuj], $a_blockSizes[$i_wuj]);
|
|
110 if($strand eq "-"){
|
|
111 $JuncSeq[$i_wuj] = uc string_reverse_complement(lc $JuncSeq[$i_wuj]);
|
|
112 }
|
|
113 }
|
|
114 # for($i_wuj=0;$i_wuj<$blockCount-1;$i_wuj++)
|
|
115 # {
|
|
116 # $ssStr = uc substr ($contigSeqStr, $Ends[$i_wuj], 2) . substr ($contigSeqStr, $Starts[$i_wuj+1] - 2, 2);
|
|
117 # if($strand eq "-"){
|
|
118 # $ssStr = uc string_reverse_complement(lc $ssStr);
|
|
119 #$ssStr = $rc_ssStr;
|
|
120 # }
|
|
121 # $ssStrTag = 0 if ($ssStr ne "GTAG");
|
|
122
|
|
123 # }
|
|
124 # if($ssStrTag ==1){
|
|
125 if(1){
|
|
126 $num_2++;
|
|
127 $num_chr_2++;
|
|
128 print OUT_1 "$line\n";
|
|
129 #print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$ssStr\|$num_2\n$junctionSeqStrLeft$junctionSeqStrRight\n";
|
|
130 # print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|GTAG\|$num_2\|$blockCount\n";
|
|
131 print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$num_2\|$blockCount\n";
|
|
132 if($strand eq "+")
|
|
133 {
|
|
134 for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++)
|
|
135 {
|
|
136 print OUT_2 $JuncSeq[$i_wuj];
|
|
137 }
|
|
138 }
|
|
139 else
|
|
140 {
|
|
141 for($i_wuj=$blockCount-1;$i_wuj>-1;$i_wuj--)
|
|
142 {
|
|
143 print OUT_2 $JuncSeq[$i_wuj];
|
|
144 }
|
|
145
|
|
146 }
|
|
147 print OUT_2 "\n";
|
|
148 }
|
|
149
|
|
150 }
|
|
151 print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n";
|
|
152 print "BED2FA: in file1, $num_count_chrom chroms, $num_1 beds, $num_2 saved.\n";
|
|
153
|
|
154 close IN_1 or die "can't close the input file : $!";
|
|
155 close IN_2 or die "can't close the input file : $!";
|
|
156 close OUT_1 or die "can't close the output file : $!";
|
|
157 close OUT_2 or die "can't close the output file : $!";
|
|
158
|
|
159 #######################################
|
|
160 $complete_time = time()-$starttime;
|
|
161 print "BED2FA: Run $complete_time seconds...Done!\n";
|
|
162
|
|
163 #######################################
|
|
164 # sub fuctions
|
|
165 sub string_reverse_complement{
|
|
166 local($string) = @_;
|
|
167 local($len_str, $ret, $i, $char);
|
|
168
|
|
169 $len_str = length $string;
|
|
170 $ret = "";
|
|
171 for($i=0; $i<$len_str; $i++){
|
|
172 $char = substr($string, $i, 1);
|
|
173 if($char eq 'a'){
|
|
174 $char = 't';
|
|
175 }
|
|
176 elsif($char eq 't'){
|
|
177 $char = 'a';
|
|
178 }
|
|
179 elsif($char eq 'c'){
|
|
180 $char = 'g';
|
|
181 }
|
|
182 elsif($char eq 'g'){
|
|
183 $char = 'c';
|
|
184 }
|
|
185 else{
|
|
186 $char = 'n';
|
|
187 }
|
|
188 $ret = $char.$ret;
|
|
189 }
|
|
190
|
|
191 return $ret;
|
|
192 }
|
|
193
|
|
194 sub sort_chromNo{
|
|
195 local(@chrom) = @_;
|
|
196 local($len_key_chr_hash, $i, @sort_chr_hash);
|
|
197 local(@digit_random, @words_random, @digit_other_1, @digit_other_2, @words_other_1, @words_other_2, @digit, @words);
|
|
198 local(@sort_digit, @sort_words, @sort_digit_random, @sort_words_random, @sort_digit_other, @sort_words_other);
|
|
199 local($len_digit, $len_words, $len_digit_random, $len_words_random, $len_digit_other, $len_words_other, $term);
|
|
200
|
|
201 $len_key_chr_hash = @chrom;
|
|
202 # sort via chr number for printing result
|
|
203 for($i=0; $i<$len_key_chr_hash; $i++){
|
|
204 if($key_chr_hash[$i] =~ /chr(\d+)\_random/){
|
|
205 push(@digit_random, $1);
|
|
206 }
|
|
207 elsif($key_chr_hash[$i] =~ /chr(\w+)\_random/){
|
|
208 push(@words_random, $1);
|
|
209 }
|
|
210 elsif($key_chr_hash[$i] =~ /chr(\d+)\_([\w\d\_]+)/){
|
|
211 push(@digit_other_1, $1);
|
|
212 push(@digit_other_2, $2);
|
|
213 }
|
|
214 elsif($key_chr_hash[$i] =~ /chr(\w+)\_([\w\d\_]+)/){
|
|
215 push(@words_other_1, $1);
|
|
216 push(@words_other_2, $2);
|
|
217 }
|
|
218 elsif($key_chr_hash[$i] =~ /chr(\d+)/){
|
|
219 push(@digit, $1);
|
|
220 }
|
|
221 elsif($key_chr_hash[$i] =~ /chr(\w+)/){
|
|
222 push(@words, $1);
|
|
223 }
|
|
224 else{
|
|
225 print "BED2FA: There is unknown type of chromosomes: $key_chr_hash[$i]\n";
|
|
226 }
|
|
227 }
|
|
228 @sort_digit = sort by_mostly_numeric @digit;
|
|
229 @sort_words = sort by_mostly_string @words;
|
|
230 @sort_digit_random = sort by_mostly_numeric @digit_random;
|
|
231 @sort_words_random = sort by_mostly_string @words_random;
|
|
232 @sort_digit_other = sort_2_array_number_string(\@digit_other_1, \@digit_other_2);
|
|
233 @sort_words_other = sort_2_array_string_string(\@words_other_1, \@words_other_2);
|
|
234
|
|
235 $len_digit = @sort_digit;
|
|
236 for($i=0; $i<$len_digit; $i++){
|
|
237 $term = "chr".$sort_digit[$i];
|
|
238 push(@sort_chr_hash, $term);
|
|
239 }
|
|
240 $len_words = @sort_words;
|
|
241 for($i=0; $i<$len_words; $i++){
|
|
242 $term = "chr".$sort_words[$i];
|
|
243 push(@sort_chr_hash, $term);
|
|
244 }
|
|
245 $len_digit_random = @sort_digit_random;
|
|
246 for($i=0; $i<$len_digit_random; $i++){
|
|
247 $term = "chr".$sort_digit_random[$i]."_random";
|
|
248 push(@sort_chr_hash, $term);
|
|
249 }
|
|
250 $len_words_random = @sort_words_random;
|
|
251 for($i=0; $i<$len_words_random; $i++){
|
|
252 $term = "chr".$sort_words_random[$i]."_random";
|
|
253 push(@sort_chr_hash, $term);
|
|
254 }
|
|
255 $len_digit_other = @sort_digit_other;
|
|
256 for($i=0; $i<$len_digit_other; $i=$i+2){
|
|
257 $term = "chr".$sort_digit_other[$i]."_".$sort_digit_other[$i+1];
|
|
258 push(@sort_chr_hash, $term);
|
|
259 }
|
|
260 $len_words_other = @sort_words_other;
|
|
261 for($i=0; $i<$len_words_other; $i=$i+2){
|
|
262 $term = "chr".$sort_words_other[$i]."_".$sort_words_other[$i+1];
|
|
263 push(@sort_chr_hash, $term);
|
|
264 }
|
|
265
|
|
266 return @sort_chr_hash;
|
|
267 }
|
|
268
|
|
269 sub sort_2_array_number_string{
|
|
270 local($a, $b) = @_;
|
|
271 local($len_a, $len_b, $i, %family, $one, $two);
|
|
272 local(@ret);
|
|
273
|
|
274 $len_a = @$a;
|
|
275 $len_b = @$b;
|
|
276 if($len_a == $len_b){
|
|
277 for($i=0; $i<$len_a; $i++){
|
|
278 $family{$$a[$i]}{$$b[$i]} = 0;
|
|
279 }
|
|
280 for $one (sort by_mostly_numeric keys %family) {
|
|
281 for $two (sort by_mostly_string keys %{ $family{$one} }) {
|
|
282 push(@ret, $one);
|
|
283 push(@ret, $two);
|
|
284 }
|
|
285 }
|
|
286 }
|
|
287 else{
|
|
288 print "ERROR: Sort array is not same size\n";
|
|
289 print "a $len_a, b $len_b\n";
|
|
290 }
|
|
291
|
|
292 return @ret;
|
|
293 }
|
|
294
|
|
295 sub sort_2_array_string_string{
|
|
296 local($a, $b) = @_;
|
|
297 local($len_a, $len_b, $i, %family, $one, $two);
|
|
298 local(@ret);
|
|
299
|
|
300 $len_a = @$a;
|
|
301 $len_b = @$b;
|
|
302 if($len_a == $len_b){
|
|
303 for($i=0; $i<$len_a; $i++){
|
|
304 $family{$$a[$i]}{$$b[$i]} = 0;
|
|
305 }
|
|
306 for $one (sort by_mostly_string keys %family) {
|
|
307 for $two (sort by_mostly_string keys %{ $family{$one} }) {
|
|
308 push(@ret, $one);
|
|
309 push(@ret, $two);
|
|
310 }
|
|
311 }
|
|
312 }
|
|
313 else{
|
|
314 print "ERROR: Sort array is not same size\n";
|
|
315 print "a $len_a, b $len_b\n";
|
|
316 }
|
|
317
|
|
318 return @ret;
|
|
319 }
|
|
320
|
|
321 sub by_mostly_numeric{
|
|
322 # ( $a <=> $b ) || ( $a cmp $b );
|
|
323 ( $a <=> $b );
|
|
324 }
|
|
325
|
|
326 sub by_mostly_string{
|
|
327 # ( $a <=> $b ) || ( $a cmp $b );
|
|
328 ( $a cmp $b );
|
|
329 }
|
|
330
|