comparison mirplant2/preProcess.pl @ 0:6006e58458ae draft

Uploaded
author adefelicibus
date Tue, 15 Mar 2016 15:10:44 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6006e58458ae
1 #!/usr/bin/perl -w
2 #Filename:
3 #Author: Tian Dongmei
4 #Email: tiandm@big.ac.cn
5 #Date: 2014-12-2
6 #Modified:
7 #Description: RNA-seq data pre-process
8 my $version=1.00;
9
10 use strict;
11 use Getopt::Long;
12 use threads;
13 #use threads::shared;
14 use File::Path;
15 use File::Basename;
16 #use RNA;
17 #use Term::ANSIColor;
18
############################## options and run configuration ##################
# Parse the command line into %opts.  -i, -format and -gfa are mandatory;
# -h (or a missing mandatory option) prints the usage text and exits.
my %opts;
GetOptions(\%opts,"i:s@","tag:s@","format=s","phred:i","gfa=s","rfam:s","idx:s","idx2:s","mis:i","v:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","h");
if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} ) || defined $opts{h}) { #necessary arguments
    usage();
}

my $time=Time();
print "miPlant program start:\n The time is $time!\n";
print "Command line:\n $0 @ARGV\n";

# Only FASTQ and FASTA input is supported.
my $format=$opts{'format'};
if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
    #&printErr();
    die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
}

# Quality offset handed to fastx_clipper (-Q) for FASTQ input.
my $phred_qv=64;
if (defined $opts{'phred'}) {$phred_qv=$opts{'phred'};}

my @inputfiles=@{$opts{'i'}};
# -tag is not enforced by the mandatory-option check above, so guard the
# dereference: under "use strict" @{undef} dies with a cryptic message.
my @inputtags=defined $opts{'tag'} ? @{$opts{'tag'}} : ();
# Every input file needs its own tag: the tag names the per-sample output
# file, so a missing tag would make samples overwrite each other.
if (@inputtags != @inputfiles) {
    die "Parameter \"-tag\" is error! Each input file (-i) needs exactly one -tag, got "
        .scalar(@inputfiles)." input file(s) and ".scalar(@inputtags)." tag(s)\n";
}

my $mypath=`pwd`;
chomp $mypath;

my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/preProcess/";

# The script later does "chdir $dir" and keeps using paths built from $dir,
# so a relative -o must be anchored to the start directory first.
unless ($dir=~/^\//) {$dir="$mypath/$dir";}
unless ($dir=~/\/$/) {$dir.="/";}
if (not -d $dir) {
    # mkpath (File::Path) creates missing parent directories and croaks on failure.
    mkpath($dir);
}

# Record the input file / tag pairs, one tab-separated pair per line.
my $config=$dir."/input_config";
open(my $config_fh,'>',$config) or die "Can not write config file $config: $!\n";
for (my $i=0;$i<@inputfiles;$i++) {
    print {$config_fh} $inputfiles[$i],"\t",$inputtags[$i],"\n";
}
close($config_fh) or die "Can not close config file $config: $!\n";

# Directory holding the helper scripts (matching.pl, rfam.pl, ...).
my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";

my $a="ATCTCGTATG"; #adapter
if (defined $opts{'a'}) {$a=$opts{'a'};}

my $m=6; #adapter minimum mapped nt
if (defined $opts{'M'}) {$m=$opts{'M'};}

my $t=1; #threads number
if (defined $opts{'t'}) {$t=$opts{'t'};}

my $min_nt=19; # minimum reads length
if (defined $opts{'min'}) {$min_nt=$opts{'min'};}

my $max_nt=28; #maximum reads length
if (defined $opts{'max'}) {$max_nt=$opts{'max'};}

my $mis=0; #mismatch number for microRNA
if (defined $opts{'mis'}) {$mis=$opts{'mis'};}

my $mis_rfam=0;# mismatch number for rfam
if (defined $opts{'v'}) {$mis_rfam=$opts{'v'};}

# @filein / @mark are the working copies of the input files and their tags;
# @clean collects the adapter-clipped output files.
my (@filein,@mark,@clean);
#&read_config();
@filein=@inputfiles;
@mark=@inputtags;

# Sanity-check the first record of the genome FASTA before doing any work.
checkfa($opts{gfa});
87
88
##### clip adapter --> clean data start
# Each input file is clipped by clips() into
# "<preprocess dir>/<tag>_clips_adapter.fq"; the output names are collected in
# @clean for the collapse step.
my $preprocess=$dir."preProcess_clean/";
if (not -d $preprocess) {
    mkdir $preprocess or die "Can not create directory $preprocess: $!\n";
}
my $can_use_threads = eval 'use threads; 1';
if ($can_use_threads) {
    # Do processing using threads
    print "Do processing using threads\n";
    my @filein1=@filein; my @mark1=@mark;
    # Number of files clipped concurrently.  "-t 0" (or "-t" with no value,
    # which Getopt::Long turns into 0) must still make progress, otherwise the
    # while loop below would never consume a file and spin forever.
    my $batch_size=($t && $t>0) ? $t : 1;
    while (@filein1>0) {
        my @thrs;
        for (my $i=0;$i<$batch_size ;$i++) {
            last if(@filein1==0);
            my $in=shift @filein1;
            my $out=shift @mark1;
            push @clean,$preprocess.$out."_clips_adapter.fq";
            my $thr=threads->create(\&clips,$in,$out);
            if (defined $thr) {
                push @thrs,$thr;
            } else {
                # Thread creation failed: clip this file in the main thread instead.
                warn "Can not create a thread for $in, processing it without threads\n";
                clips($in,$out);
            }
        }
        # Wait for the whole batch before starting the next one.
        foreach my $thr (@thrs) {
            $thr->join();
        }
    }
} else {
    # Do not processing using threads
    print "Do not processing using threads\n";
    for (my $i=0;$i<@filein ;$i++) {
        my $in=$filein[$i];
        my $out=$mark[$i];
        push @clean,$preprocess.$out."_clips_adapter.fq";
        clips($in,$out);
    }
}

##### clip adapter --> clean data end
122
# Collapse the clipped reads of all samples into tags, then keep only the
# tags whose length lies between $min_nt and $max_nt.
my $collapsed=$preprocess."collapse_reads.fa";
my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data
collapse(\@clean,$collapsed); #collapse reads to tags

filterbylength(); # filter <$min_nt && >$max_nt

print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n";


$time=Time();
print "$time: Genome alignment!\n\n";
my $genome_map=$dir."genome_match";
genome($data);
#my $genome_map=&search($dir,"genome_match_");
my $mapfile=$genome_map."/genome_mapped.bwt";
my $mapfa=$genome_map."/genome_mapped.fa";
my $unmap=$genome_map."/genome_not_mapped.fa";

# The rfam expression table below is written with a relative name, so the
# working directory must really be the output directory.
chdir $dir or die "Can not change to directory $dir: $!\n";

# path.txt lists the result locations, one per line, for html_preprocess.pl.
my $pathfile="$dir/path.txt";
open(my $path_fh,'>',$pathfile) or die "Can not write $pathfile: $!\n";
print {$path_fh} "$config\n";
print {$path_fh} "$preprocess\n";
print {$path_fh} "$genome_map\n";

if (defined $opts{'rfam'}) { #rfam mapping and analysis
    $time=Time();
    print "$time: RNA annotate!\n\n";
    $time=~s/:/-/g;
    $time=~s/ /-/g;
    my $rfam_exp_dir=$dir."rfam_match";
    rfam();
    #my $rfam_exp_dir=&search($dir,"rfam_match_");
    print {$path_fh} "$rfam_exp_dir\n";

    # Tags are passed as one ';'-separated argument.  The list form of
    # system() bypasses the shell, so no escaping of ';' is needed and paths
    # containing spaces stay intact.
    my $tag=join ";" ,@mark;
    my @count_cmd=("perl","$scipt_path/count_rfam_express.pl","-i","$rfam_exp_dir/rfam_mapped.bwt","-tag",$tag,"-o","rfam_non-miRNA_annotation.txt");
    system(@count_cmd)==0 or warn "count_rfam_express.pl failed (exit status $?)\n";
}


close($path_fh) or die "Can not close $pathfile: $!\n";
my @html_cmd=("perl","$scipt_path/html_preprocess.pl","-i",$pathfile,"-format",$format,"-min",$min_nt,"-max",$max_nt,"-o","$dir/preprocessResult.html");
system(@html_cmd)==0 or warn "html_preprocess.pl failed (exit status $?)\n";

$time=Time();
print "$time: Program end!!\n";
168
169 ############################## sub programs ###################################
# genome($file)
# Align the reads in $file to the genome by running matching.pl from
# $scipt_path with -r 1000, the -mis mismatch count, $t threads and the
# output directory $dir.  When -idx was given (non-empty) it is forwarded as
# -index.
# The command is run without a shell (list form of system), so file names
# containing spaces or shell metacharacters are passed through unchanged.
# Returns the raw system() status; a non-zero status is reported with warn.
sub genome{
    my ($file)=@_;
    my @cmd=("perl","$scipt_path/matching.pl","-i",$file,"-g",$opts{gfa},"-r",1000,"-v",$mis,"-p",$t,"-o",$dir);
    if(defined $opts{'idx'} && $opts{'idx'} ne ""){
        push @cmd,"-index",$opts{idx};
    }
    my $status=system(@cmd);
    if ($status!=0) {
        warn "matching.pl failed (exit status $status)\n";
    }
    return $status;
}
# rfam()
# Align the genome-mapped reads ($mapfa) to the Rfam reference given with
# -rfam by running rfam.pl from $scipt_path, allowing $mis_rfam mismatches and
# using $t threads; results go below $dir.  When -idx2 was given (non-empty)
# it is forwarded as -index.
# The command is run without a shell (list form of system).
# Returns the raw system() status; a non-zero status is reported with warn.
sub rfam{
    my @cmd=("perl","$scipt_path/rfam.pl","-i",$mapfa,"-ref",$opts{rfam},"-v",$mis_rfam,"-p",$t,"-o",$dir);
    if (defined $opts{'idx2'} && $opts{'idx2'} ne "") {
        push @cmd,"-index",$opts{idx2};
    }
    my $status=system(@cmd);
    if ($status!=0) {
        warn "rfam.pl failed (exit status $status)\n";
    }
    return $status;
}
# filterbylength()
# Run filterReadsByLength.pl on the collapsed reads ($collapsed), writing the
# reads whose length is within $min_nt..$max_nt to $data, then run
# Length_Distibution.pl to turn reads_length_distribution.txt into
# length.html inside $preprocess.  The sample tags are passed as one
# comma-separated -mark argument.
# Both commands are run without a shell (list form of system); a non-zero
# exit status is reported with warn.  Returns the status of the last command.
sub filterbylength{
    my $tmpmark=join ",", @mark;
    my @filter_cmd=("perl","$scipt_path/filterReadsByLength.pl","-i",$collapsed,"-o",$data,"-min",$min_nt,"-max",$max_nt,"-mark",$tmpmark);
    my $filter_status=system(@filter_cmd);
    if ($filter_status!=0) {
        warn "filterReadsByLength.pl failed (exit status $filter_status)\n";
    }
    my @dist_cmd=("perl","$scipt_path/Length_Distibution.pl","-i","$preprocess/reads_length_distribution.txt","-o","$preprocess/length.html");
    my $dist_status=system(@dist_cmd);
    if ($dist_status!=0) {
        warn "Length_Distibution.pl failed (exit status $dist_status)\n";
    }
    return $dist_status;
}
# collapse(\@files, $outfile)
# Run collapseReads2Tags.pl from $scipt_path over all files in @files (each
# passed as its own -i argument), writing the collapsed tags to $outfile.
# The input format is taken from the file-level $format.
# The command is run without a shell (list form of system), so file names
# containing spaces are passed through unchanged.
# Returns the raw system() status; a non-zero status is reported with warn.
sub collapse{
    my ($ins,$data)=@_;
    my @cmd=("perl","$scipt_path/collapseReads2Tags.pl");
    foreach my $in (@{$ins}) {
        push @cmd,"-i",$in;
    }
    push @cmd,"-mark","seq","-o",$data,"-format",$format;
    my $status=system(@cmd);
    if ($status!=0) {
        warn "collapseReads2Tags.pl failed (exit status $status)\n";
    }
    return $status;
}
205
# clips($in, $out)
# Clip the adapter $a from the reads in $in with fastx_clipper, requiring at
# least $m aligned adapter bases (-M), and write the result to
# "<$preprocess><$out>_clips_adapter.fq".  For FASTQ input the quality
# offset $phred_qv is passed with -Q; FASTA input gets no -Q.
# The command is run without a shell (list form of system); a non-zero exit
# status is reported with warn.  Always returns nothing, which keeps it
# usable as a thread entry point.
sub clips{
    my ($in,$out)=@_;
    my $adapter=$preprocess.$out."_clips_adapter.fq";
    my @cmd=("fastx_clipper","-a",$a,"-M",$m);
    if($format eq "fq" || $format eq "fastq"){
        push @cmd,"-Q",$phred_qv;
    }
    elsif($format ne "fa" && $format ne "fasta"){
        # Unknown format: nothing to run.
        return;
    }
    push @cmd,"-i",$in,"-o",$adapter;
    if (system(@cmd)!=0) {
        warn "fastx_clipper failed for $in (exit status $?)\n";
    }
    #my $clean=$preprocess.$out."_clean.fq";
    #system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt ");

    return;
}
222
# read_config()
# Read the tab-separated "<input file>\t<tag>" pairs from $config, append
# them to @filein / @mark and validate every input file with check_rawdata().
# Blank lines are skipped.  Dies when the config file cannot be opened or
# when a line lacks either the file name or the tag.
sub read_config{
    open(my $con,'<',$config) or die "Can not open config file $config: $!\n";
    while (my $aline=<$con>) {
        chomp $aline;
        next if $aline=~/^\s*$/;
        my @tmp=split/\t/,$aline;
        # Both columns are required; without this a missing tag would be
        # stored as undef and the count check below could never fail.
        if (@tmp<2 || $tmp[0] eq "" || $tmp[1] eq "") {
            die "Maybe config file have some wrong!!!\n";
        }
        push @filein,$tmp[0];
        push @mark,$tmp[1];
        check_rawdata($tmp[0]);
    }
    close $con;
    if (@filein != @mark) {
        #&printErr();
        die "Maybe config file have some wrong!!!\n";
    }
}
# check_rawdata($file)
# Die unless $file exists and is non-empty, then run the format check that
# matches the file-level $format: checkfa() for fasta/fa, checkfq() for
# fastq/fq.
sub check_rawdata{
    my ($candidate)=@_;
    unless (-s $candidate) {
        #&printErr();
        die "Can not find $candidate, or file is empty!!!\n";
    }
    my $is_fasta=($format eq "fasta" || $format eq "fa");
    my $is_fastq=($format eq "fastq" || $format eq "fq");
    checkfa($candidate) if $is_fasta;
    checkfq($candidate) if $is_fastq;
}
# checkfa($file_reads)
# Check that $file_reads looks like a FASTA file by inspecting its first two
# lines only: line 1 must start with '>' followed by an identifier, line 2
# may contain only the characters ACGTN (either case).
# Dies with an explanatory message when the file cannot be opened or either
# check fails; returns 1 otherwise.
sub checkfa{
    my ($file_reads)=@_;
    open(my $fh,'<',$file_reads) or die "Can not open file $file_reads: $!\n";
    my $line=<$fh>;
    my $seq=<$fh>;
    close $fh;
    # An empty file has no header line; treat it as an invalid header
    # instead of operating on undef.
    $line="" unless defined $line;
    chomp $line;
    if($line !~ /^>\S+/){
        #printErr();
        die "The first line of file $file_reads does not start with '>identifier'
Reads file $file_reads is not a valid fasta file\n\n";
    }
    # A header-only file has no sequence line; an empty sequence is accepted.
    $seq="" unless defined $seq;
    if($seq !~ /^[ACGTNacgtn]*$/){
        #printErr();
        die "File $file_reads contains not allowed characters in sequences
Allowed characters are ACGTN
Reads file $file_reads is not a fasta file\n\n";
    }
    return 1;
}
# checkfq($file_reads)
# Check that $file_reads looks like a FASTQ file by inspecting up to its
# first 10 four-line records:
#   line 1 must start with '@',
#   line 2 may contain only ACGTN (either case),
#   line 3 must start with '+' or '@',
#   line 4 must be as long as line 2.
# Files with fewer than 10 records are accepted as long as every record
# present is complete.  Side effect: if any inspected quality character has
# a code below 64, the file-level $phred_qv is set to 33.
# Dies when the file cannot be opened, holds no record, or a record fails a
# check; returns 1 otherwise.
sub checkfq{
    my ($file_reads)=@_;

    open(my $fh,'<',$file_reads) or die "Can not open file $file_reads: $!\n";
    my $records=0;
    for (my $i=0;$i<10;$i++) {
        my $head=<$fh>;
        # End of file reached: fewer than 10 records is fine.
        last unless defined $head;
        my $seq=<$fh>;
        my $plus=<$fh>;
        my $qual=<$fh>;
        # A record cut off in the middle means the file is not valid.
        unless (defined $seq && defined $plus && defined $qual) {
            die "$file_reads is not a fastq file\n\n";
        }
        chomp($head,$seq,$plus,$qual);
        if($head!~/^\@/){
            #&printErr();
            die "$file_reads is not a fastq file\n\n";
        }
        if($seq!~ /^[ACGTNacgtn]*$/){
            #&printErr();
            die "File $file_reads contains not allowed characters in sequences
Allowed characters are ACGTN
Reads file $file_reads is not a fastq file\n\n";
        }
        if ($plus!~/^\@/ && $plus!~/^\+/) {
            #&printErr();
            die "$file_reads is not a fastq file\n\n";
        }
        if ((length $seq) != (length $qual)) {
            #&printErr();
            die "$file_reads is not a fastq file\n\n";
        }
        # A quality character below code 64 cannot belong to an offset-64
        # encoding, so switch the quality offset to 33.
        foreach my $qchar (split //,$qual) {
            if (ord($qchar)<64) {
                $phred_qv=33;
                last;
            }
        }
        $records++;
    }
    close $fh;
    if ($records==0) {
        die "$file_reads is not a fastq file\n\n";
    }
    return 1;
}
308
# search($dir, $str)
# Return the name of the single entry in directory $dir whose name matches
# $str (used as a regular expression).  Dies unless exactly one entry
# matches.
sub search{
    my ($where,$pattern)=@_;
    opendir I,$where;
    my @hits=grep { $_=~/$pattern/ } readdir I;
    closedir I;
    unless (@hits == 1) {
        #&printErr();

        die "Can not find directory or file which name has string: $pattern !!!\n";
    }
    return $hits[0];
}
326
# Time()
# Return the current local time formatted as "YYYY-MM-DD hh:mm:ss", with
# every field except the year zero-padded to two digits.
sub Time{
    my @fields=localtime(time());
    my ($sec,$min,$hour,$day,$month,$year)=@fields[0..5];
    # localtime counts months from 0 and years from 1900.
    #print "$year-$month-$day $hour:$min:$sec\n";
    return sprintf("%d-%02d-%02d %02d:%02d:%02d",$year+1900,$month+1,$day,$hour,$min,$sec);
}
340
341
# usage()
# Print the version and the command-line help to STDOUT, then terminate the
# program with exit status 1 (also when help was requested with -h).
# The synopsis lists the option names exactly as GetOptions accepts them.
sub usage{
    print <<"USAGE";
Version $version
Usage:
$0 -i -tag -format -phred -gfa -idx -rfam -idx2 -a -M -min -max -mis -v -t -o -path
options:
-i input files, # raw data file, can be multiple eg. -i xxx.fq -i xxx.fq ...
-tag string # raw data file names, one per input file, -tag xxx -tag xxx

-format string,#specific input rawdata file format : fastq|fq|fasta|fa
-phred int # phred quality number, default is 64

-path script path

-gfa string, input file # genome fasta. sequence file
-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
string must be the prefix of the bowtie index. For instance, if
the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
the prefix is 'h_sapiens_37_asm'.##can be null

-rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not defined, rfam small RNA will not be counted.
-idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
string must be the prefix of the bowtie index. For instance, if
the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
the prefix is 'h_sapiens_37_asm'.##can be null

-a string, ADAPTER string. default is ATCTCGTATG.
-M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it. default is 6.
-min int, reads min length,default is 19.
-max int, reads max length,default is 28.

-mis [int] number of allowed mismatches when mapping reads to genome, default 0
-v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment

-t int, number of threads [1]

-o output directory# absolute path
-h help
USAGE
    exit(1);
}
383