0
|
1 #!/usr/bin/perl -w
|
|
2 #Filename:
|
|
3 #Author: Tian Dongmei
|
|
4 #Email: tiandm@big.ac.cn
|
|
5 #Date: 2014-4-22
|
|
6 #Modified:
|
|
7 #Description: plant microRNA prediction
|
|
8 my $version=1.00;
|
|
9
|
|
10 use strict;
|
|
11 use Getopt::Long;
|
|
12 use threads;
|
|
13 #use threads::shared;
|
|
14 use File::Path;
|
|
15 use File::Basename;
|
|
16 #use RNA;
|
|
17 #use Term::ANSIColor;
|
|
18
|
|
# ---------------------------------------------------------------------------
# Option parsing and run-time settings.
#
# Everything declared here is a file-scoped lexical that the sub programs
# further down read directly (%opts, $dir, $scipt_path, $t, $mis, $hit, ...).
# ---------------------------------------------------------------------------

# GetOptions() removes every option it recognises from @ARGV, so keep a copy
# of the original arguments for the "Command line" log message below.
my @command_line_args = @ARGV;

my %opts;
GetOptions(
    \%opts,
    "i=s",     "fa=s",  "gfa=s", "pre:s",  "mat:s", "dis:i",
    "flank:i", "mfe:f", "idx:s", "mis:i",  "r:i",   "e:i",
    "f:i",     "t:i",   "o:s",   "path:s", "D",     "h",
);

# -i, -fa and -gfa are all consumed unconditionally below (the config file is
# read, the read file is handed to quantify()/genome(), the genome file is
# checked and mapped against), so a missing one is reported here instead of
# producing a malformed command line later on.
if (defined $opts{h} or grep { !defined $opts{$_} } qw(i fa gfa)) {
    usage();
}

my $time = Time();
print "miPlant program start:\n The time is $time!\n";
print "Command line:\n $0 @command_line_args\n";

# Directory the program was started from.  Relative paths are anchored here
# because the script changes its working directory later on.
my $mypath = `pwd`;
chomp $mypath;

# Output directory: always absolute and always terminated by "/", since the
# rest of the script builds paths by plain concatenation ($dir."genome_match").
my $dir = defined $opts{'o'} ? $opts{'o'} : "$mypath/miRNA_out/";
$dir = "$mypath/$dir" unless $dir =~ m{^/};
$dir .= "/"           unless $dir =~ m{/$};

# mkpath() creates missing parent directories as well and croaks on failure.
mkpath($dir) unless -d $dir;

my $config = $opts{'i'};     # tab-separated list of input files and their marks
my $data   = $opts{'fa'};    # read sequences (fasta)

# Location of the helper scripts.  Without -path the historical default is
# used when it exists on this machine; otherwise fall back to the directory
# this script itself lives in.
my $scipt_path;
if (defined $opts{'path'}) {
    $scipt_path = $opts{'path'};
}
else {
    $scipt_path = "/Users/big/galaxy-dist/tools/myTools/";
    unless (-d $scipt_path) {
        my $own_dir = dirname($0);
        $own_dir = "$mypath/$own_dir" unless $own_dir =~ m{^/};
        $scipt_path = "$own_dir/";
    }
}

# Number of threads.
my $t = defined $opts{'t'} ? $opts{'t'} : 1;

# Number of mismatches allowed when mapping reads.
my $mis = defined $opts{'mis'} ? $opts{'mis'} : 0;

# Maximum number of genome positions a read may map to.
my $hit = defined $opts{'r'} ? $opts{'r'} : 25;

# microRNA 5' extension.
my $upstream = defined $opts{'e'} ? $opts{'e'} : 2;

# microRNA 3' extension.
my $downstream = defined $opts{'f'} ? $opts{'f'} : 5;

# Settings forwarded to precursors.pl (-d, -f and -e respectively).
my $maxd  = defined $opts{'dis'}   ? $opts{'dis'}   : 200;
my $flank = defined $opts{'flank'} ? $opts{'flank'} : 10;
my $mfe   = defined $opts{'mfe'}   ? $opts{'mfe'}   : -20;
|
|
62
|
|
# ---------------------------------------------------------------------------
# Main pipeline: validate inputs, optionally quantify known microRNAs, map
# the remaining reads to the genome and predict novel microRNAs.
# ---------------------------------------------------------------------------
$time = Time();
print "$time, Checking input file!\n";

# Filled by read_config(): input file names and the mark given to each one.
my (@filein, @mark);
read_config();

checkfa($opts{pre}) if defined $opts{pre};
checkfa($opts{mat}) if defined $opts{mat};
checkfa($opts{gfa});

# All later steps write into the output directory; stop right away if it is
# not reachable rather than scattering results over the start directory.
# NOTE(review): the input paths in %opts are used again after this chdir, so
# relative input paths may no longer resolve - confirm callers pass absolute
# paths.
chdir $dir or die "Can not change to output directory $dir: $!\n";

# Reads handed to the genome mapping step: the full read file, or only the
# reads left over by the known-microRNA quantification when -pre and -mat
# were both supplied.
my $data2        = $data;
my $known_result = $dir . "known_miRNA_Express";
if (defined $opts{pre} and defined $opts{mat}) {
    quantify();    # known microRNA quantification
    $data2 = $known_result . "/mirbase_not_mapped.fa";
}

my $genome_map = $dir . "genome_match";
genome($data2);

# File names expected inside the genome mapping directory.
my $mapfile = $genome_map . "/genome_mapped.bwt";
my $mapfa   = $genome_map . "/genome_mapped.fa";
my $unmap   = $genome_map . "/genome_not_mapped.fa";

predict($mapfa);

$time = Time();
print "$time: Program end!!\n";
|
|
96
|
|
97 ############################## sub programs ###################################
|
|
# predict($file)
#
# Novel microRNA prediction.  Works inside "${dir}Novel_miRNA_predict":
# excises candidate precursors from the genome mapping ($mapfile), builds a
# bowtie index from them, maps the reads in $file against that index,
# converts and sorts the alignments, then runs miRDeep_plant.pl and the two
# reporting scripts from the output directory $dir.
#
# $file - fasta file with the reads to map against the excised precursors.
#
# Reads the file-scoped settings $dir, $scipt_path, $mapfile, %opts, $maxd,
# $flank, $mfe, $mis, $t, $hit and @mark, and updates $time.
# Dies when the working directory cannot be created or entered.  A step that
# exits with a non-zero status is reported on STDERR and the remaining steps
# are still attempted, as they were before the status was checked.
sub predict{
    my ($file) = @_;

    $time = Time();
    print "$time: Novel microRNA prediction!\n\n";

    my $predict = $dir . "Novel_miRNA_predict";
    unless (-d $predict) {
        mkdir $predict or die "Can not create directory $predict: $!\n";
    }
    chdir $predict or die "Can not change to directory $predict: $!\n";

    # Run one shell command (several of them need shell redirection) and
    # report a failure instead of ignoring it silently.
    my $run_step = sub {
        my ($command) = @_;
        return if system($command) == 0;
        my $reason = $? == -1
            ? "could not be started: $!"
            : "exited with status " . ($? >> 8);
        warn "Command $reason: $command\n";
        return;
    };

    $run_step->("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe");

    $run_step->("bowtie-build -f excised_precursor.fa excised_precursor");

    # bowtie's diagnostics go to run.log rather than to this script's STDERR.
    $run_step->("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log");

    $run_step->("perl $scipt_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst");

    $run_step->("sort -k 4 precursor_mapped.bst > signatures.bst");

    # The prediction results themselves are written to the output directory.
    chdir $dir or die "Can not change to directory $dir: $!\n";

    $run_step->("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd");

    my $tag = join ",", @mark;
    $run_step->("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag");

    $run_step->("perl $scipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa");

    return;
}
|
|
130
|
|
# genome($file)
#
# Maps the reads in $file to the genome by running matching.pl, passing the
# genome fasta ($opts{gfa}), mismatch count ($mis), thread count ($t), hit
# limit ($hit) and output directory ($dir).  When -idx was given on the
# command line it is forwarded as -index.
#
# $file - fasta file with the reads to map.
#
# The command is started without a shell, so paths containing spaces or
# shell metacharacters are passed through unchanged.  A non-zero exit status
# is reported on STDERR; the caller continues as before.
sub genome{
    my ($file) = @_;

    my @command = (
        'perl', "$scipt_path/matching.pl",
        '-i', $file,
        '-g', $opts{gfa},
        '-v', $mis,
        '-p', $t,
        '-r', $hit,
        '-o', $dir,
    );

    if (defined $opts{'idx'}) {
        push @command, '-index';
        # -idx takes an optional value; an empty one used to vanish on the
        # shell command line, so do not pass an empty argument here either.
        push @command, $opts{idx} if length $opts{idx};
    }

    if (system(@command) != 0) {
        my $reason = $? == -1
            ? "could not be started: $!"
            : "exited with status " . ($? >> 8);
        warn "matching.pl $reason\n";
    }
    return;
}
|
|
141
|
|
# quantify()
#
# Quantifies known microRNAs by running quantify.pl with the precursor and
# mature sequence files ($opts{pre}, $opts{mat}), the read file ($data), the
# output directory ($dir), the mapping settings ($mis, $t, $upstream,
# $downstream) and the sample marks from @mark joined with ";".
#
# The command is started without a shell, so the ";" separators need no
# escaping and paths containing spaces are passed through unchanged.  The
# equivalent shell command line is printed afterwards for the log.  A
# non-zero exit status is reported on STDERR; the caller continues as before.
sub quantify{
    my @command = (
        'perl', "$scipt_path/quantify.pl",
        '-p',   $opts{pre},
        '-m',   $opts{mat},
        '-r',   $data,
        '-o',   $dir,
        '-mis', $mis,
        '-t',   $t,
        '-e',   $upstream,
        '-f',   $downstream,
        '-tag', join(';', @mark),
    );

    if (system(@command) != 0) {
        my $reason = $? == -1
            ? "could not be started: $!"
            : "exited with status " . ($? >> 8);
        warn "quantify.pl $reason\n";
    }

    # Shell-escaped form of the tag list, used only for the log line.
    my $tag = join "\\;", @mark;
    print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n";
    return;
}
|
|
147
|
|
# read_config()
#
# Reads the tab-separated config file named by $config.  The first column of
# every line is appended to @filein and the second column to @mark.
#
# Blank lines are skipped and a trailing carriage return is removed together
# with the newline.  Dies when the file cannot be opened, when a line does
# not provide both columns, or when the file contains no entries at all.
# (The former "@filein != @mark" test could never fail because both arrays
# were extended on every line, so malformed lines went unnoticed.)
sub read_config{
    open my $config_fh, '<', $config
        or die "Can not open config file $config: $!\n";

    while (my $aline = <$config_fh>) {
        $aline =~ s/[\r\n]+$//;
        next if $aline =~ /^\s*$/;

        my @tmp = split /\t/, $aline;
        if (@tmp < 2 or $tmp[0] eq '' or $tmp[1] eq '') {
            die "Maybe config file have some wrong!!! (line $. of $config: expected file<TAB>mark)\n";
        }

        push @filein, $tmp[0];
        push @mark,   $tmp[1];
    }
    close $config_fh;

    if (!@filein) {
        die "Maybe config file have some wrong!!! ($config contains no entries)\n";
    }
    return;
}
|
|
# checkfa($file_reads)
#
# Quick sanity check that $file_reads looks like a fasta file.  Only the
# first two lines are inspected: the first must start with ">" followed by a
# non-blank character, the second may contain nothing but A, C, G, T or N
# (either case).  A file without a second line passes the sequence check.
#
# Dies with an explanatory message when the file cannot be opened (this used
# to be reported as an invalid fasta file), when it is empty, or when one of
# the two checks fails.  Returns nothing on success.
sub checkfa{
    my ($file_reads) = @_;

    open my $fasta_fh, '<', $file_reads
        or die "Can not open file $file_reads: $!\n";
    my $header   = <$fasta_fh>;
    my $sequence = <$fasta_fh>;
    close $fasta_fh;

    if (!defined $header or $header !~ /^>\S+/) {
        die "The first line of file $file_reads does not start with '>identifier'\n"
          . "Reads file $file_reads is not a valid fasta file\n\n";
    }

    # A missing second line is treated like an empty sequence line.
    $sequence = '' unless defined $sequence;
    if ($sequence !~ /^[ACGTNacgtn]*$/) {
        die "File $file_reads contains not allowed characters in sequences\n"
          . "Allowed characters are ACGTN\n"
          . "Reads file $file_reads is not a fasta file\n\n";
    }
    return;
}
|
|
# search($dir, $str)
#
# Looks through the entries of directory $dir for names matching $str, which
# is used as a regular expression, and returns the name of the single
# matching entry (the bare entry name, not a full path).
#
# Dies when the directory cannot be read, and also when the number of
# matching entries is anything other than exactly one.
sub search{
    my ($dir, $str) = @_;

    # Report an unreadable directory as such instead of as "no match".
    opendir my $dir_handle, $dir
        or die "Can not open directory $dir: $!\n";
    my @ret = grep { /$str/ } readdir $dir_handle;
    closedir $dir_handle;

    if (@ret != 1) {
        die "Can not find directory or file which name has string: $str !!!\n";
    }
    return $ret[0];
}
|
|
198
|
|
199
|
|
# Time()
#
# Returns the current local time as "YYYY-MM-DD hh:mm:ss".  Month, day,
# hour, minute and second are zero-padded to two digits.
sub Time{
    my @now = localtime(time());

    # localtime() counts months from 0 and years from 1900.
    my $year  = $now[5] + 1900;
    my $month = $now[4] + 1;

    return sprintf(
        "%d-%02d-%02d %02d:%02d:%02d",
        $year, $month, $now[3], $now[2], $now[1], $now[0],
    );
}
|
|
213
|
|
214
|
|
# usage()
#
# Prints the program version ($version) and the command-line help to STDOUT,
# then terminates the program with exit status 1.  Never returns.
sub usage{
    print <<"USAGE";
Version $version
Usage:

$0 -i -fa -gfa -idx -pre -mat -mis -e -f -r -dis -flank -mfe -t -o -path
options:
-i      input files, # config

-fa     ,#fasta sequence file

-path   script path

-gfa    string, input file # genome fasta. sequence file
-idx    string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
        string must be the prefix of the bowtie index. For instance, if
        the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
        the prefix is 'h_sapiens_37_asm'.##can be null

-pre    string, input file #species specific microRNA precursor sequences
-mat    string, input file #species specific microRNA mature sequences

-mis    [int] number of allowed mismatches when mapping reads to precursors, default 0
-e      [int] number of nucleotides upstream of the mature sequence to consider, default 2
-f      [int] number of nucleotides downstream of the mature sequence to consider, default 5
-r      int a read is allowed to map up to this number of positions in the genome,default is 25

-dis    <int> Maximal space between miRNA and miRNA* (200)
-flank  <int> Flank sequence length of miRNA precursor (10)
-mfe    <float> Maximal free energy allowed for a miRNA precursor (-20)

-t      int, number of threads [1]

-o      output directory# absolute path
-h      help
USAGE
    exit(1);
}
|
|
253
|