3
|
1 #!/usr/bin/perl
|
|
2
|
|
3
|
|
4 use warnings;
|
|
5 use strict;
|
|
6 use Getopt::Std;
|
|
7
|
|
8 ######################################### USAGE ################################
|
|
9
|
|
10 my $usage=
|
|
11 "$0 file_bowtie_result file_solexa_seq file_chromosome
|
|
12
|
|
13 This is a converter which changes Bowtie output into Blast format.
|
|
14 The input includes three files: a Bowtie result file (default Bowtie
|
|
15 output file), a fasta file consisting of small Reads and a chromosome
|
|
16 fasta file. It outputs the alignments in blast_parsed format.
|
|
17
|
|
18 file_bowtie_result likes:
|
|
19
|
|
20 AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0
|
|
21 AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0
|
|
22
|
|
23 file_solexa_seq likes:
|
|
24
|
|
25 >AtFlower100010_x2
|
|
26 AAGGAGATTCTTTCAGTCCAG
|
|
27
|
|
28 file_chromosome contains chromosome seq in fasta format
|
|
29
|
|
30 ";
|
|
31
|
|
32
|
|
33 ####################################### INPUT FILES ############################
|
|
34
|
|
35 my $file_bowtie_result=shift or die $usage;
|
|
36 my $file_short_seq=shift or die $usage;
|
|
37 my $file_chromosome_seq=shift or die $usage;
|
|
38
|
|
39
|
|
40 ##################################### GLOBAL VARIBALES #########################
|
|
41
|
|
42 my %short_seq_length=();
|
|
43 my %chromosome_length=();
|
|
44
|
|
45
|
|
46 ######################################### MAIN #################################
|
|
47
|
|
48 #get the short sequence id and its length
|
|
49 sequence_length($file_short_seq,\%short_seq_length);
|
|
50
|
|
51 #get the chromosome sequence id and its length
|
|
52 sequence_length($file_chromosome_seq,\%chromosome_length);
|
|
53
|
|
54 #convert bowtie result format to blast format;
|
|
55 change_format($file_bowtie_result);
|
|
56
|
|
57 exit;
|
|
58
|
|
59
|
|
60 ##################################### SUBROUTINES ##############################
|
|
61
|
|
62 sub sequence_length{
|
|
63 my ($file,$hash) = @_;
|
|
64 my ($id, $desc, $sequence, $seq_length) = ();
|
|
65
|
|
66 open (FASTA, "<$file") or die "can not open $$file\n";
|
|
67 while (<FASTA>)
|
|
68 {
|
|
69 chomp;
|
|
70 if (/^>(\S+)(.*)/)
|
|
71 {
|
|
72 $id = $1;
|
|
73 $desc = $2;
|
|
74 $sequence = "";
|
|
75 while (<FASTA>){
|
|
76 chomp;
|
|
77 if (/^>(\S+)(.*)/){
|
|
78 $$hash{$id} = length $sequence;
|
|
79 $id = $1;
|
|
80 $desc = $2;
|
|
81 $sequence = "";
|
|
82 next;
|
|
83 }
|
|
84 $sequence .= $_;
|
|
85 }
|
|
86 }
|
|
87 }
|
|
88 $seq_length=length($sequence);
|
|
89 $$hash{$id} = $seq_length;
|
|
90 close FASTA;
|
|
91 }
|
|
92
|
|
93
|
|
94
|
|
95
|
|
96
|
|
97 sub change_format{
|
|
98 #Change Bowtie format into blast format
|
|
99 my $file=shift @_;
|
|
100 open(FILE,"<$file")||die"can not open the bowtie result file:$!\n";
|
|
101 #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n";
|
|
102
|
|
103 while(<FILE>){
|
|
104 chomp;
|
|
105 my @tmp=split("\t",$_);
|
|
106 #Clean the reads ID
|
|
107 my @tmp1=split(" ",$tmp[0]);
|
|
108 print "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t";
|
|
109 if($tmp[1] eq "+"){
|
|
110 my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]};
|
|
111 my $seq_bg=$tmp[3] + 1;
|
|
112 print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n";
|
|
113 }
|
|
114 if($tmp[1] eq "-"){
|
|
115 my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3];
|
|
116 my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1;
|
|
117 print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n";
|
|
118 }
|
|
119 }
|
|
120
|
|
121 # close BLASTOUT;
|
|
122
|
|
123 }
|
|
124
|
|
125
|
|
126
|