Mercurial > repos > romaingred > pirna_pipeline
comparison bin/piPipe.pl @ 0:198009598544 draft
Uploaded
author | romaingred |
---|---|
date | Wed, 11 Oct 2017 09:57:58 -0400 |
parents | |
children | 2bd100775c36 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:198009598544 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Long; | |
5 use Parallel::ForkManager; | |
6 use File::Basename; | |
7 use File::Copy::Recursive qw( dircopy ); | |
8 use POSIX; | |
9 use FindBin; | |
10 use lib $FindBin::Bin; | |
11 use resize qw ( size_distribution ); | |
12 use subgroups qw (subgroups ); | |
13 use ppp qw ( ping_pong_partners ); | |
14 use Rcall qw (pie_chart bg_to_png ); | |
15 use align qw ( to_build get_unique sam_count sam_count_mis sam_sorted_bam rpms_rpkm BWA_call get_fastq_seq extract_sam sam_to_bam_bg ); | |
16 use html qw ( main_page details_pages menu_page ppp_page ); | |
17 use File::Copy; | |
18 | |
19 my ( @fastq, @fastq_n, $dir, $min, $max, $mis, $misTE, $help, $Pcheck, $Dcheck, $mapnumf, $html_out); | |
20 my ( $ref, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $exons, $TE ); | |
21 my ( $si_min, $si_max, $pi_min, $pi_max ); | |
22 my ( $build_index, $build_tRNAs, $build_rRNAs, $build_snRNAs, $build_miRNAs, $build_exons, $build_TE ); | |
23 my $max_procs = 4; | |
24 | |
25 ( $build_index, $build_tRNAs, $build_rRNAs, $build_snRNAs, $build_miRNAs, $build_exons, $build_TE ) = (0,0,0,0,0,0,0); | |
26 ( $min, $max, $mis, $misTE, $si_min, $si_max, $pi_min, $pi_max, $dir ) = ( 18, 29, 0, 3, 21, 21, 23, 29 ); | |
27 $Pcheck ='true'; | |
28 | |
29 GetOptions ( | |
30 "fastq=s" => \@fastq, | |
31 "fastq_n=s" => \@fastq_n, | |
32 "dir=s" => \$dir, | |
33 "min:i" => \$min, | |
34 "max:i" => \$max, | |
35 "si_min:i" => \$si_min, | |
36 "si_max:i" => \$si_max, | |
37 "pi_min:i" => \$pi_min, | |
38 "pi_max:i" => \$pi_max, | |
39 "mis:i" => \$mis, | |
40 "misTE:i" => \$misTE, | |
41 "html:s" => \$html_out, | |
42 "PPPon:s" => \$Pcheck, | |
43 "Dison:s" => \$Dcheck, | |
44 "help" => \$help, | |
45 "ref:s" => \$ref, | |
46 "tRNAs:s" => \$tRNAs, | |
47 "rRNAs:s" => \$rRNAs, | |
48 "snRNAs:s" => \$snRNAs, | |
49 "miRNAs:s" => \$miRNAs, | |
50 "exons:s" => \$exons, | |
51 "TE:s" => \$TE, | |
52 "build_index" => \$build_index, | |
53 "build_tRNAs" => \$build_tRNAs, | |
54 "build_snRNAs" => \$build_snRNAs, | |
55 "build_miRNAs" => \$build_miRNAs, | |
56 "build_exons" => \$build_exons, | |
57 "build_rRNAs" => \$build_rRNAs, | |
58 "build_TE" => \$build_TE | |
59 ); | |
60 | |
61 my $fq_collection = 'fastq_dir/'; | |
62 mkdir $dir; mkdir $fq_collection; | |
63 $dir = $dir.'/' unless $dir =~ /\/$/; | |
64 mkdir $dir.'/css';mkdir $dir.'/js'; | |
65 dircopy( $FindBin::Bin.'/css', $dir.'/css' ); | |
66 dircopy( $FindBin::Bin.'/js', $dir.'/js' ); | |
67 | |
68 my $file = $dir.'report.txt'; | |
69 open my $report, '>', $file or die "Cannot open $file $!\n"; | |
70 | |
71 my %toBuild; | |
72 @toBuild{ ( $ref, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $exons, $TE ) } = ( $build_index, $build_tRNAs, $build_rRNAs, $build_snRNAs, $build_miRNAs, $build_exons, $build_TE ) ; | |
73 to_build ( \%toBuild, $report ); | |
74 | |
75 my $proc_child = ceil($max_procs / scalar(@fastq)); | |
76 my $proc_grand_child = ceil($proc_child/5); | |
77 my $pm = Parallel::ForkManager->new($max_procs); | |
78 my $pm3 = Parallel::ForkManager->new($proc_grand_child); | |
79 | |
80 $pm->run_on_finish( sub { | |
81 my ($pid, $exit_code, $ident) = @_; | |
82 print $report "Fastq fork $ident just finished ". | |
83 "with PID $pid and exit code: $exit_code\n"; | |
84 die "Something went wrong!\n" if $exit_code != 0; | |
85 }); | |
86 $pm->run_on_start( sub { | |
87 my ($pid,$ident)=@_; | |
88 print $report "Fastq fork : $ident started, pid: $pid\n"; | |
89 }); | |
90 $pm3->run_on_finish( sub { | |
91 my ($pid, $exit_code, $ident) = @_; | |
92 print $report "** Subgroup fork $ident just finished ". | |
93 "with PID $pid and exit code: $exit_code\n"; | |
94 die "Something went wrong!\n" if $exit_code != 0; | |
95 }); | |
96 $pm3->run_on_start( sub { | |
97 my ($pid,$ident)=@_; | |
98 print $report "** Subgroup fork $ident started, pid: $pid\n"; | |
99 }); | |
100 | |
101 | |
102 foreach my $child ( 0 .. $#fastq ) | |
103 { | |
104 my @suffix = ('.fastq', '.fastq.gz,', '.fq', '.fq.gz', 'ref', '.dat', '.fa','.fas','.fasta', '.txt'); | |
105 my ( $name, $path, $suffix ) = fileparse( $fastq[$child], @suffix ); | |
106 my ( $ref_name, $ref_path, $ref_suffix ) = fileparse( $ref, @suffix ); | |
107 my ( $TE_name, $TE_path, $TE_suffix ) = fileparse( $TE, @suffix ); | |
108 my ( $ex_name, $ex_path, $ex_suffix ) = fileparse( $exons,@suffix ); | |
109 | |
110 $pm->start($fastq[$child]) and next; | |
111 | |
112 my $dir_fq = $dir.$name.'/'; | |
113 mkdir $dir_fq; | |
114 | |
115 my $gen_dir = $dir_fq.'genome/'; | |
116 mkdir $gen_dir; | |
117 | |
118 my $size_dir = $dir_fq.'size/'; | |
119 mkdir $size_dir; | |
120 | |
121 my $fastq_resized = $dir_fq.$name.'_'.$min.'-'.$max.'.fastq'; | |
122 size_distribution ( $fastq[$child], $fastq_resized, $size_dir, $min, $max ); | |
123 | |
124 my $sam_genome = $gen_dir.$name.'_'.$min.'-'.$max.'_'.$ref_name.'.sam'; | |
125 my $sam_genome_unique = $gen_dir.$name.'_'.$min.'-'.$max.'_'.$ref_name.'_unique.sam'; | |
126 my $fastq_prefix = $gen_dir.$name.'_'.$min.'-'.$max.'_'.$ref_name; | |
127 | |
128 BWA_call ( $ref, $fastq_resized, $sam_genome, $mis, $proc_child, $report ); | |
129 my ( $fai_ref_hashP, $ma, $ma_uni ) = get_unique ( $sam_genome, $sam_genome_unique, $gen_dir, 1, $report ); | |
130 | |
131 my $scale = 1000000 / $ma; | |
132 sam_to_bam_bg ( $sam_genome_unique, $scale, $proc_child ); | |
133 sam_to_bam_bg ( $sam_genome, $scale, $proc_child ); | |
134 | |
135 my $Gviz_dir = $gen_dir.'Gviz/'; | |
136 my $fai_file = $gen_dir.'fai'; | |
137 mkdir $Gviz_dir; | |
138 my $Gviz_dir_rand = $Gviz_dir.'rand/'; | |
139 mkdir $Gviz_dir_rand; | |
140 my $Gviz_dir_uni = $Gviz_dir.'unique/'; | |
141 mkdir $Gviz_dir_uni; | |
142 | |
143 open my $gfai, '>', $fai_file; | |
144 foreach my $k ( sort keys %{$fai_ref_hashP} ) | |
145 { | |
146 print $gfai "$k\t$fai_ref_hashP->{$k}\n"; | |
147 } | |
148 close $gfai; | |
149 bg_to_png ( $fai_file, $fastq_prefix.'_unique_plus.bedgraph', $fastq_prefix.'_unique_minus.bedgraph', $Gviz_dir_uni, 'Mb' ); | |
150 bg_to_png ( $fai_file, $fastq_prefix.'_plus.bedgraph', $fastq_prefix.'_minus.bedgraph', $Gviz_dir_rand, 'Mb' ); | |
151 | |
152 my $group_dir = $dir_fq.'subgroups/'; | |
153 my $fastq_uni = $gen_dir.'unique.fastq'; | |
154 my $fastq_all = $gen_dir.'all.fastq'; | |
155 my ($bo, $mi, $pi) = subgroups ( $fastq_all, $group_dir, $mis, $misTE, $proc_child, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $exons, $TE, $si_min, $si_max, $pi_min, $pi_max, $report); | |
156 | |
157 pie_chart($group_dir); | |
158 | |
159 open (my $dupnum, $gen_dir.'dup_mapnum.txt') || die "cannot open dup_mapnum.txt $!"; | |
160 my %dupnum_genome; | |
161 my $header = <$dupnum>; | |
162 while (<$dupnum>) | |
163 { | |
164 chomp $_; | |
165 my @dupline = split /\t/, $_; | |
166 $dupnum_genome{$dupline[0]} = [$dupline[1], $dupline[2]]; | |
167 } | |
168 close $dupnum; | |
169 | |
170 my $mi_sam = $group_dir.'miRNAs.sam'; | |
171 mkdir $group_dir.'miRNAs/'; | |
172 my $mi_count_file = $group_dir.'miRNAs/miRNAs_reads_counts.txt'; | |
173 my ( $mi_count, $mi_ref_size ) = sam_count ( $mi_sam ); | |
174 | |
175 rpms_rpkm( $mi_count, $mi_ref_size, $ma, $mi_count_file, $pi, $mi, $bo ); | |
176 | |
177 my ( $sam_exons, $sam_TEs ) = ( $group_dir.'exons.sam', $group_dir.'TEs.sam' ); | |
178 my @types = ($group_dir.'bonafide_reads.fastq', $group_dir.'miRNAs.fastq', $group_dir.'siRNAs.fastq', $group_dir.'piRNAs.fastq' ); | |
179 my @types_names = ('bonafide_reads', 'miRNAs', 'siRNAs', 'piRNAs'); | |
180 foreach my $grand_child ( 0 .. $#types ) | |
181 { | |
182 my $type_dir = $group_dir.$types_names[$grand_child].'/'; | |
183 my $type_prefix = $types_names[$grand_child].'-'; | |
184 mkdir $type_dir; | |
185 | |
186 my ( $type_sam_genome, $type_sam_TEs, $type_sam_exons ) = ( $type_dir.$type_prefix.'genome.sam', $type_dir.$type_prefix.'TEs.sam', $type_dir.$type_prefix.'exons.sam' ); | |
187 my ( $type_sam_uni_genome, $type_sam_uni_TEs, $type_sam_uni_exons ) = ( $type_dir.$type_prefix.'genome_unique.sam', $type_dir.$type_prefix.'TEs_unique.sam', $type_dir.$type_prefix.'exons_unique.sam' ); | |
188 my ( $type_uni_genome_fastq, $type_uni_TEs_fastq, $type_uni_exons_fastq ) = ( $fq_collection.$type_prefix.'genome_uni.fastq', $fq_collection.$type_prefix.'TEs_uni.fastq', $fq_collection.$type_prefix.'exons_uni.fastq'); | |
189 my ( $type_genome_fastq, $type_TEs_fastq, $type_exons_fastq ) = ( $fq_collection.$type_prefix.'genome.fastq', $fq_collection.$type_prefix.'TEs.fastq', $fq_collection.$type_prefix.'exons.fastq'); | |
190 my $type_sequence_hashP = get_fastq_seq ( $types[$grand_child] ); | |
191 | |
192 if ( $grand_child == 1 ) | |
193 { | |
194 BWA_call ( $TE, $types[$grand_child], $type_sam_TEs, $misTE, $proc_child, $report ); | |
195 BWA_call ( $exons, $types[$grand_child], $type_sam_exons, $mis, $proc_child, $report ); | |
196 BWA_call ( $ref, $types[$grand_child], $type_sam_genome, $mis, $proc_child, $report ); | |
197 extract_sam ( undef, $type_sam_TEs, $type_sam_TEs, $type_sam_uni_TEs, $type_uni_TEs_fastq, $type_uni_TEs_fastq ); | |
198 extract_sam ( undef, $type_sam_exons, $type_sam_exons, $type_sam_uni_exons, $type_exons_fastq, $type_uni_exons_fastq ); | |
199 extract_sam ( undef, $type_sam_genome, $type_sam_genome, $type_sam_uni_genome, $type_genome_fastq, $type_uni_genome_fastq ); | |
200 } | |
201 else | |
202 { | |
203 extract_sam ( $type_sequence_hashP, $sam_TEs, $type_sam_TEs, $type_sam_uni_TEs, $type_TEs_fastq, $type_uni_TEs_fastq ); | |
204 extract_sam ( $type_sequence_hashP, $sam_exons, $type_sam_exons, $type_sam_uni_exons, $type_exons_fastq, $type_uni_exons_fastq ); | |
205 extract_sam ( $type_sequence_hashP, $sam_genome, $type_sam_genome, $type_sam_uni_genome, $type_genome_fastq, $type_uni_genome_fastq ); | |
206 } | |
207 | |
208 my $ex_count_file = $type_dir.'exons_reads_counts.txt'; | |
209 my ( $ex_count, $ex_ref_size ) = sam_count ( $type_sam_exons ); | |
210 rpms_rpkm( $ex_count, $ex_ref_size, $ma, $ex_count_file, $pi, $mi, $bo ); | |
211 | |
212 my ( $TEs_count, $TEs_ref_size, $TEs_count_NoM, $TEs_count_M ) = sam_count_mis ( $type_sam_TEs ); | |
213 my $TEs_count_file = $type_dir.$type_prefix.'TEs_reads_counts.txt'; | |
214 my $TEs_count_file_M = $type_dir.$type_prefix.'TEs_reads_counts_mismatches.txt'; | |
215 my $TEs_count_file_noM = $type_dir.$type_prefix.'TEs_reads_counts_nomismatches.txt'; | |
216 rpms_rpkm( $TEs_count, $TEs_ref_size, $ma, $TEs_count_file, $pi, $mi, $bo ); | |
217 rpms_rpkm( $TEs_count_NoM, $TEs_ref_size, $ma, $TEs_count_file_M, $pi, $mi, $bo ); | |
218 rpms_rpkm( $TEs_count_M, $TEs_ref_size, $ma, $TEs_count_file_noM, $pi, $mi, $bo ); | |
219 | |
220 sam_to_bam_bg ( $type_sam_TEs, $scale, $grand_child ); | |
221 sam_sorted_bam ( $type_sam_exons, $grand_child ); sam_sorted_bam ( $type_sam_uni_exons, $grand_child ); | |
222 sam_sorted_bam ( $type_sam_uni_TEs, $grand_child ); | |
223 | |
224 my $Gviz_TEs = $type_dir.'Gviz_TEs/'; | |
225 mkdir $Gviz_TEs; | |
226 bg_to_png ( $group_dir.'TEs.fai', $type_dir.$type_prefix.'TEs_plus.bedgraph', $type_dir.$type_prefix.'TEs_minus.bedgraph', $Gviz_TEs, 'Kb' ); | |
227 | |
228 my $Gviz_genome= $type_dir.'Gviz_genome/'; | |
229 my $Gviz_genome_rand = $Gviz_genome.'rand/'; | |
230 my $Gviz_genome_uni = $Gviz_genome.'unique/'; | |
231 mkdir $Gviz_genome; mkdir $Gviz_genome_uni; mkdir $Gviz_genome_rand; | |
232 | |
233 sam_to_bam_bg ( $type_sam_genome, $scale, $grand_child ); | |
234 sam_to_bam_bg ( $type_sam_uni_genome, $scale, $grand_child ); | |
235 | |
236 bg_to_png ( $fai_file, $type_dir.$type_prefix.'genome_unique_plus.bedgraph', $type_dir.$type_prefix.'genome_unique_minus.bedgraph', $Gviz_genome_uni, 'Mb' ); | |
237 bg_to_png ( $fai_file, $type_dir.$type_prefix.'genome_plus.bedgraph', $type_dir.$type_prefix.'genome_minus.bedgraph', $Gviz_genome_rand, 'Mb' ); | |
238 | |
239 #HTML Details | |
240 my $prefix_details_pages = $dir.$fastq_n[$child].'-'.$types_names[$grand_child]; | |
241 details_pages ( $type_dir, $prefix_details_pages, \@fastq_n, $fastq_n[$child], $misTE, $dir ); | |
242 | |
243 $pm3->finish($grand_child); | |
244 } | |
245 $pm3->wait_all_children; | |
246 | |
247 if ( $Pcheck eq 'true' ) | |
248 { | |
249 my $ppp = $group_dir.'PPPartners/'; mkdir $ppp; | |
250 print $report "ping_pong_partners $group_dir/bonafide_reads/TEs.sam $ppp\n"; | |
251 ping_pong_partners ( $group_dir.'TEs.fai', $group_dir.'bonafide_reads/bonafide_reads-TEs_sorted.bam', $ppp, $min ); | |
252 my $ppp_page = $dir.$fastq_n[$child].'-bonafide_reads-PPP.html'; | |
253 ppp_page ( $group_dir, $ppp_page, \@fastq_n, $fastq_n[$child], $ppp, $dir ); | |
254 } | |
255 | |
256 #HTML Main Webpage | |
257 my $index_page = $dir.$fastq_n[$child].'.html'; | |
258 main_page ( $gen_dir, $index_page, \@fastq_n, $fastq_n[$child], $ma, $ma_uni, $dir ); | |
259 copy ($index_page, $html_out) if $child == 0; | |
260 #HTML Menu | |
261 my $menu_page = $dir.$fastq_n[$child].'-sub.html'; | |
262 menu_page ( $group_dir, $menu_page, \@fastq_n, $fastq_n[$child], $min, $max, $si_min, $si_max, $pi_min, $pi_max, $dir ); | |
263 | |
264 $pm->finish($child); # pass an exit code to finish | |
265 } | |
266 $pm->wait_all_children; | |
267 | |
268 print $report "Job done!\n"; | |
269 close $report; |