Mercurial > repos > mvdbeek > damidseq_findpeaks
view find_peaks @ 9:8fd3191302d0 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damidseq_findpeaks commit 5bf4f4dbd82988d1612d951d69125bc5f5b439be
author | mvdbeek |
---|---|
date | Wed, 13 Jun 2018 12:41:00 -0400 |
parents | be830200e987 |
children | 7b0ca503bb95 |
line wrap: on
line source
#!/usr/bin/perl -w # Copyright © 2012-2015, Owen Marshall # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or (at # your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 # USA use strict; use File::Basename; use 5.010; $|++; my $version = "1.0.1"; print STDERR "\nfind_peaks v$version\nCopyright © 2012-15, Owen Marshall\n\n"; my %vars = ( 'fdr' => 0.01, 'min_count' => 2, 'n' => 100, 'frac' => 0, 'min_quant' => 0.95, 'step' => 0.01, 'unified_peaks' => 'max', ); my %vars_details = ( 'fdr' => 'False discovery rate value', 'min_count' => 'Minimum number of fragments to consider as a peak', 'n' => 'Number of iterations', 'frac' => 'Number of random fragments to consider per iteration', 'min_quant' => 'Minimum quantile for considering peaks', 'step' => 'Stepping for quantiles', 'unified_peaks' => "Method for calling peak overlaps (two options):\n\r'min': call minimum overlapping peak area\n\r'max': call maximum overlap as peak", ); my @in_files; process_cli(); # Time and date my ($sec,$min,$hour,$mday,$mon,$year) = localtime(); my $date = sprintf("%04d-%02d-%02d.%02d-%02d-%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec); help() unless @in_files; foreach my $fn (@in_files) { my @in; my @unified_peaks; my @sig_peaks; my @peakmins; my %peaks; my %peak_count; my %peak_count_real; my %log_scores; my %regression; my %peak_fdr_cutoff; my %fdr; # Output file names my ($name,$dir,$ext) = fileparse($fn, qr/\.[^.]*/); # filenames my $fn_base_date = "peak_analysis.".$name.".$date"; my $base_dir = "$fn_base_date/"; my $out = "$base_dir"."$name-FDR$vars{'fdr'}"; my $out_peak_unified_track = $out.".peaks.gff"; my $out_peaks = $out."_FDR-data"; # Load gff data files load_gff(FILE=>$fn, IN_REF=>\@in); my $probes = @in; find_quants(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins); find_randomised_peaks(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks); # Make directory mkdir($base_dir); # Open peaks file for writing open(OUTP, ">$out_peaks")|| die "Cannot open peak file for writing: $!\n"; print OUTP "FDR peak call v$version\n\n"; print OUTP "Input file: $fn\n"; calculate_regressions(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression); call_peaks_unified_redux(ITER=>1, REAL=>1, AREF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAKS=>\%peaks); calculate_fdr(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression); find_significant_peaks(PEAKMINS=>\@peakmins, SIG_PEAKS=>\@sig_peaks, PEAKS=>\%peaks, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr); make_unified_peaks(SIG_PEAKS=>\@sig_peaks, UNIFIED_PEAKS=>\@unified_peaks, OUT=>$out_peak_unified_track, TYPE=>$vars{'unified_peaks'}); print STDERR "$#unified_peaks peaks found.\n\n"; close OUTP; } print STDERR "All done.\n\n"; exit 0; ###################### # Subroutines start here # sub find_significant_peaks { my (%opts) = @_; my $peakmins = $opts{PEAKMINS}; my $peaks = $opts{PEAKS}; my $sig_peaks = $opts{SIG_PEAKS}; my $fdr = $opts{FDR}; my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF}; # Generate significant peaks and unify peaks print STDERR "Selecting significant peaks ... \n"; foreach my $pm (@$peakmins) { for my $i (0 .. $#{$$peaks{$pm}}) { my ($chr, $pstart, $pend, $mean_pscore, $pscore, $count, $size) = @{ $$peaks{$pm}[$i] }; if ($count >= $$peak_fdr_cutoff{$pm}) { push (@$sig_peaks, [ @{$$peaks{$pm}[$i]}, $$fdr{$pm}[$count] ]); } } } print OUTP "\nNumber of peaks: $#$sig_peaks\n"; } sub calculate_fdr { my (%opts) = @_; my $in_ref = $opts{IN_REF}; my $peakmins = $opts{PEAKMINS_REF}; my $peak_count = $opts{PEAK_COUNT}; my $log_scores = $opts{LOG_SCORES}; my $regression = $opts{REGRESSION}; my $fdr = $opts{FDR}; my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF}; my $peak_count_real = $opts{PEAK_COUNT_REAL}; foreach my $pm (@$peakmins) { # get regression variables my ($m,$b) = @{$$regression{$pm}} if $$regression{$pm}[0]; for my $i (0 .. $#{$$peak_count_real{$pm}}) { next unless $$peak_count_real{$pm}[$i]; my $expect = 10**($m*$i + $b); my $real_count = $$peak_count_real{$pm}[$i]; my $fdr_conservative = $expect/$real_count; $$fdr{$pm}[$i]= $fdr_conservative; } } # print FDR rates print OUTP "\n"; foreach my $pm (@$peakmins) { print OUTP "Peak min = $pm\n"; for my $c (0 .. $#{$$fdr{$pm}}) { next unless defined($$fdr{$pm}[$c]); print OUTP "Peak size: $c\tCount: $$peak_count_real{$pm}[$c]\tFDR: $$fdr{$pm}[$c]\n"; $$peak_fdr_cutoff{$pm} = $c if (($$fdr{$pm}[$c]<$vars{'fdr'}) && (!$$peak_fdr_cutoff{$pm})); } $$peak_fdr_cutoff{$pm}||= 10**10; # clumsy hack to prevent errors print OUTP "\n"; } foreach my $pm (@$peakmins) { print OUTP "Peak min $pm: peak cutoff size for alpha = $vars{'fdr'} was $$peak_fdr_cutoff{$pm}\n\n"; } } sub calculate_regressions { my (%opts) = @_; my $in_ref = $opts{IN_REF}; my $peakmins = $opts{PEAKMINS_REF}; my $peaks = $opts{PEAKS}; my $peak_count = $opts{PEAK_COUNT}; my $log_scores = $opts{LOG_SCORES}; my $regression = $opts{REGRESSION}; my $in_num = @$in_ref; foreach my $pm (@$peakmins) { print OUTP "Peak min = $pm\n"; for my $c (0 .. $#{$$peak_count{$pm}}) { my $peak_count_avg = $$peak_count{$pm}[$c]/$vars{'n'} if $$peak_count{$pm}[$c]; next unless $peak_count_avg; if ($vars{'frac'}) { $peak_count_avg = $peak_count_avg * $in_num/$vars{'frac'}; } $$log_scores{$pm}[$c] = log($peak_count_avg)/log(10); print OUTP "Peak size: $c\tCount:$peak_count_avg\n"; } # calculate exponential decay rates # y= a+bx for log(y) my ($sumx, $sumy, $sumxsq, $sumxy); my $n=0; for my $i (0 .. $#{$$peak_count{$pm}}) { next unless $$peak_count{$pm}[$i]; $n++; $sumx += $i; $sumy += $$log_scores{$pm}[$i]; $sumxsq += $i ** 2; $sumxy += $i * $$log_scores{$pm}[$i]; } next unless $n > 1; my $mean_x = $sumx/$n; my $mean_y = $sumy/$n; my $mean_xsq = $sumxsq/$n; my $mean_xy = $sumxy/$n; my $b = ($mean_xy - ($mean_x * $mean_y)) / ($mean_xsq - ($mean_x **2)); my $a = $mean_y - ($b * $mean_x); # store values $$regression{$pm}=[$b, $a]; print OUTP "regression: log(y) = $b(x) + $a\n"; for my $i (0 .. $#{$$peak_count{$pm}}) { next unless $$peak_count{$pm}[$i]; my ($b,$a) = @{$$regression{$pm}}; my $logval = ($b*$i + $a); my $val = 10**$logval; print OUTP "lin regress: $i\t$$log_scores{$pm}[$i]\t$logval\t$val\n"; } print OUTP "\n"; } } sub find_randomised_peaks { my (%opts) = @_; my $in_ref = $opts{IN_REF}; my $peakmins_ref = $opts{PEAKMINS_REF}; my $peaks = $opts{PEAKS}; my $peak_count = $opts{PEAK_COUNT}; print STDERR "Duplicating ... \n"; my @inr = @$in_ref; # Call peaks on input file print STDERR "Calling peaks on input file ...\n"; for my $iter (1 .. $vars{'n'}) { #print STDERR "Iteration $iter ... \r"; print STDERR "Iteration $iter: [shuffling] \r"; if ($vars{'frac'}) { # only use a fraction of the array per rep my @a = inside_out(\@inr, $vars{'frac'}); call_peaks_unified_redux(ITER=>$iter, AREF=>\@a, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count); } else { shuffle(\@inr); call_peaks_unified_redux(ITER=>$iter, AREF=>\@inr, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count); } } } sub call_peaks_unified_redux { my (%opts) = @_; my $iter = $opts{ITER}; my $real = $opts{REAL}; my $a = $opts{AREF}; my $peakmins_ref = $opts{PEAKMINS_REF}; my $peak_count_real = $opts{PEAK_COUNT_REAL}; my $peaks = $opts{PEAKS}; my $peak_count = $opts{PEAK_COUNT}; my ($pstart, $pend, $inpeak, $pscore, $count); $pstart=$pend=$pscore=$inpeak=$count=0; my @tmp_peak; my $total = $#$a; if ($real) { print STDERR "Calling real peaks ... \r"; } else { print STDERR "Iteration $iter: [processing ...] \r"; } my $old_chr=""; foreach my $pm (@$peakmins_ref) { for my $i (0 .. $total) { my ($chr, $start, $end, $score) = @{ @$a[$i] }; next unless $score; if ($real) { unless ($chr eq $old_chr) { # Next chromosome # (Peaks can't carry over chromosomes, but we don't use this shortcut when randomly shuffling) $pstart=$pend=$pscore=$inpeak=$count=0; @tmp_peak = () if $real; } } $old_chr = $chr if $real; unless ($inpeak) { next unless $score >= $pm; # record new peak $pstart = $start; $pend = $end; $pscore = $score * ($end-$start)/1000; $count++; push @tmp_peak, $score if $real; $inpeak = 1; } else { if ($score >= $pm) { # still in peak $count++; # Fragment score to deal with scoring peaks made from uneven sized fragments my $fragment_score = $score * ($end-$start)/1000; push @tmp_peak, $score if $real; $pscore += $fragment_score; $pend = $end; } else { # out of a peak if ($count >= $vars{'min_count'}) { # record peak if ($real) { $$peak_count_real{$pm}[$count]++; my $mean_pscore = sprintf('%0.2f',($pscore/(($pend-$pstart)/1000))); push (@{$$peaks{$pm}},[($chr, $pstart, $pend, $mean_pscore, $pscore, $count, ($pend-$pstart))]); } else { $$peak_count{$pm}[$count]++; } } # reset $pstart=$pend=$pscore=$inpeak=$count=0; @tmp_peak = () if $real; } } } } } sub shuffle { # Fisher-Yates shuffle (Knuth shuffle) my ($array) = @_; my $i = @$array; while ( --$i ) { my $j = int rand( $i+1 ); @$array[$i,$j] = @$array[$j,$i]; } } sub inside_out { my ($array, $frac) = @_; my @a; for my $i (0 .. $frac-1) { my $j = int rand( $i+1 ); if ($j != $i) { $a[$i] = $a[$j] } $a[$j] = @$array[$i] } return @a; } sub load_gff { my (%opts) = @_; my $fn = $opts{FILE}; my $in_ref = $opts{IN_REF}; print STDERR "Reading input file: $fn ...\n"; open (IN, "<$fn") || die "Unable to read $fn: $!\n"; my $i; while (<IN>) { $i++; print STDERR "Read $i lines ...\r" if $i%10000 == 0; chomp; my @line = split('\t'); my ($chr, $start, $end, $score); if ($#line == 3) { # bedgraph ($chr, $start, $end, $score) = @line; } else { # GFF ($chr, $start, $end, $score) = @line[0,3,4,5]; } next unless $start; $score = 0 if $score eq "NA"; push (@$in_ref, [$chr, $start, $end, $score]); } close (IN); print STDERR "Sorting ... \n"; @$in_ref = sort { $a->[1] <=> $b->[1] } @$in_ref; @$in_ref = sort { $a->[0] cmp $b->[0] } @$in_ref; } sub find_quants { my (%opts) = @_; my $in_ref = $opts{IN_REF}; my $peakmins_ref = $opts{PEAKMINS_REF}; my %seg; my @frags; my $total_coverage; foreach my $l (@$in_ref) { my ($chr, $start, $end, $score) = @$l; next unless $score; $score = 0 if $score eq "NA"; $total_coverage += $end-$start; push @frags, $score; } @frags = sort {$a <=> $b} @frags; print STDERR "Total coverage was $total_coverage bp\n"; my @quants; for (my $q=0;$q<=1;$q+=$vars{'step'}) { push @quants, [$q, int($q * @frags)] if $q > $vars{'min_quant'}; } foreach (@quants) { my $cut_off = @{$_}[0]; my $score = $frags[@{$_}[1]]; printf(" Quantile %0.2f: %0.2f\n",$cut_off,$score); $seg{$cut_off} = $score; } foreach my $c (sort {$a <=> $b} keys %seg) { push (@$peakmins_ref, $seg{$c}); } } sub make_unified_peaks { my (%opts) = @_; my $ref = $opts{SIG_PEAKS}; my $out_peak_unified_track = $opts{OUT}; my $unified_peaks = $opts{UNIFIED_PEAKS}; my $type = $opts{TYPE}; my $total = @$ref; # Unify overlapping peaks, and make significant peaks file my $skipped_peaks; print STDERR "Combining significant peaks ...\n"; # unroll chromosomes for speed foreach my $chr (uniq( map @{$_}[0], @$ref )) { my @c = grep {@{$_}[0] eq $chr} @$ref; my @unified_peaks_chr; foreach my $ar (@c) { if (state $i++ % 100 == 0) { my $pc = sprintf("%0.2f",($i*100)/$total); print STDERR "$pc\% processed ...\r"; } my ($chra, $start, $end, $score, $total_score, $count, $peaklen, $fdr) = @$ar; # next if @unified_peaks_chr already overlaps next if grep { @{$_}[3] < $end && @{$_}[4] > $start } @unified_peaks_chr; # Grab all elements that overlap my @test = grep { @{$_}[1] < $end && @{$_}[2] > $start } @c; for my $j (0 .. $#test) { my ($chr1, $start1, $end1, $score1, $total_score1, $count1, $peaklen1, $fdr1) = @{$test[$j]}; next unless $start1 < $end; next unless $end1 > $start; if ($type eq 'min') { $start = max($start, $start1); $end = min($end, $end1); } else { $start = min($start, $start1); $end = max($end, $end1); } $score = max($score, $score1); $fdr = min($fdr, $fdr1); } push @unified_peaks_chr, [($chr, '.', '.', $start, $end, $score, '.', '.', "FDR=$fdr")]; } @$unified_peaks = (@$unified_peaks, @unified_peaks_chr); } $total = $#$unified_peaks; print STDERR "Sorting unified peaks ...\n"; @$unified_peaks = sort { $a->[3] <=> $b->[3] } @$unified_peaks; @$unified_peaks = sort { $a->[0] cmp $b->[0] } @$unified_peaks; print STDERR "Writing unified peaks file ...\n"; open(PEAKOUTUNI, ">$out_peak_unified_track") || die "Unable to open peak output track for writing: $!\n\n"; for my $j (0 .. $#$unified_peaks) { print PEAKOUTUNI join("\t", @{$$unified_peaks[$j]}), "\n"; } } sub max { my ($max, @vars) = @_; my $index=0; $max||=0; for my $i (0..$#vars) { ($max, $index) = ($vars[$i], $i+1) if $vars[$i] > $max; } return $max; } sub min { my ($min, @vars) = @_; my $index=0; $min||=0; for my $i (0..$#vars) { ($min, $index) = ($vars[$i],$i+1) if $vars[$i] < $min; } return $min; } sub uniq { my %seen; return grep { !$seen{$_}++ } @_; } sub process_cli { foreach (@ARGV) { if (/--(.*)=(.*)/) { unless (defined($vars{$1})) { print STDERR "Did not understand $_ ...\n"; help(); } my ($v, $opt) = ($1,$2); $vars{$v} = $opt; next; } elsif (/--h[elp]*/) { help(); } elsif (/--(.*)/) { print STDERR "Please add a parameter to $_ ...\n\n"; exit 1; } push @in_files, $_; } } sub help { print STDOUT <<EOT; Simple FDR random permutation peak caller Usage: [options] [files in bedgraph or GFF format] Options: EOT my $opt_len = 0; foreach (keys %vars) { my $l = length($_); $opt_len = $l if $l > $opt_len; } $opt_len+=2; my $cols= `tput cols` || 80; my ($v, $val, $def, $def_format); my $help_format = "format STDOUT =\n" .' '.'^'.'<'x$opt_len . ' '. '^' . '<'x($cols-$opt_len-4) . "\n" .'$v, $def_format'."\n" .' '.'^'.'<'x$opt_len . ' '. '^' . '<'x($cols-$opt_len-6) . "~~\n" .'$v, $def_format'."\n" .".\n"; eval $help_format; die $@ if $@; foreach my $k (sort (keys %vars)) { ($v, $val, $def) = ($k, $vars{$k}, $vars_details{$k}); $def||=""; $def_format = $val ? "$def\n\r[Current value: $val]" : $def; $v = "--$v"; # format = # ^<<<<<<<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #$v, $def_format # ^<<<<<<<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ~~ #$v, $def_format #. write(); } print STDOUT "\n"; exit 1; }