Mercurial > repos > mvdbeek > damidseq_findpeaks

diff find_peaks @ 0:be830200e987 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damidseq_findpeaks commit 87c99a97cb2ce55640afdd2e55c8b3ae5ad99324
author: mvdbeek
date: Fri, 20 Apr 2018 04:34:17 -0400
children: 7b0ca503bb95
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find_peaks	Fri Apr 20 04:34:17 2018 -0400
@@ -0,0 +1,633 @@
+#!/usr/bin/perl -w
+
+# Copyright © 2012-2015, Owen Marshall
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at
+# your option) any later version. 
+# 
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details. 
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 
+# USA
+
+use strict;
+use File::Basename;
+use 5.010;
+$|++;
+
+my $version = "1.0.1";
+
+print STDERR "\nfind_peaks v$version\nCopyright © 2012-15, Owen Marshall\n\n";
+
+my %vars = (
+	'fdr' => 0.01,
+	'min_count' => 2,
+	'n' => 100,
+	'frac' => 0,
+	'min_quant' => 0.95,
+	'step' => 0.01,
+	'unified_peaks' => 'max',
+);
+
+my %vars_details = (
+	'fdr' => 'False discovery rate value',
+	'min_count' => 'Minimum number of fragments to consider as a peak',
+	'n' => 'Number of iterations',
+	'frac' => 'Number of random fragments to consider per iteration',
+	'min_quant' => 'Minimum quantile for considering peaks',
+	'step' => 'Stepping for quantiles',
+	'unified_peaks' => "Method for calling peak overlaps (two options):\n\r'min': call minimum overlapping peak area\n\r'max': call maximum overlap as peak",
+);
+
+
+my @in_files;
+process_cli();
+
+# Time and date
+my ($sec,$min,$hour,$mday,$mon,$year) = localtime();
+my $date = sprintf("%04d-%02d-%02d.%02d-%02d-%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec);
+
+help() unless @in_files;
+
+foreach my $fn (@in_files) {
+	my @in;
+	my @unified_peaks;
+	my @sig_peaks;
+	my @peakmins;
+	
+	my %peaks;
+	my %peak_count;
+	my %peak_count_real;
+	my %log_scores;
+	my %regression;
+	my %peak_fdr_cutoff;
+	my %fdr;
+	
+	# Output file names
+	my ($name,$dir,$ext) = fileparse($fn, qr/\.[^.]*/);
+	
+	# filenames
+	my $fn_base_date = "peak_analysis.".$name.".$date";
+	my $base_dir = "$fn_base_date/";
+	my $out = "$base_dir"."$name-FDR$vars{'fdr'}";
+	my $out_peak_unified_track = $out.".peaks.gff";
+	my $out_peaks = $out."_FDR-data";
+	
+	# Load gff data files
+	load_gff(FILE=>$fn, IN_REF=>\@in);
+	
+	my $probes = @in;
+	
+	find_quants(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins);
+	find_randomised_peaks(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks);
+	
+	# Make directory
+	mkdir($base_dir);
+	
+	# Open peaks file for writing
+	open(OUTP, ">$out_peaks")|| die "Cannot open peak file for writing: $!\n";
+	print OUTP "FDR peak call v$version\n\n";
+	print OUTP "Input file: $fn\n";
+	
+	calculate_regressions(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression);
+	
+	call_peaks_unified_redux(ITER=>1, REAL=>1, AREF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAKS=>\%peaks);
+	
+	calculate_fdr(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression);
+	
+	find_significant_peaks(PEAKMINS=>\@peakmins, SIG_PEAKS=>\@sig_peaks, PEAKS=>\%peaks, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr);
+	
+	make_unified_peaks(SIG_PEAKS=>\@sig_peaks, UNIFIED_PEAKS=>\@unified_peaks, OUT=>$out_peak_unified_track, TYPE=>$vars{'unified_peaks'});
+	
+	print STDERR "$#unified_peaks peaks found.\n\n";
+	
+	close OUTP;
+}
+
+print STDERR "All done.\n\n";
+exit 0;
+
+
+######################
+# Subroutines start here
+#
+
+sub find_significant_peaks {
+	my (%opts) = @_;
+	
+	my $peakmins = $opts{PEAKMINS};
+	my $peaks = $opts{PEAKS};
+	my $sig_peaks = $opts{SIG_PEAKS};
+	my $fdr = $opts{FDR};
+	my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF};
+	
+	# Generate significant peaks and unify peaks
+	print STDERR "Selecting significant peaks ...     \n";
+
+	foreach my $pm (@$peakmins) {
+		for my $i (0 .. $#{$$peaks{$pm}}) {
+			my ($chr, $pstart, $pend, $mean_pscore, $pscore, $count, $size) = @{ $$peaks{$pm}[$i] };
+			if ($count >= $$peak_fdr_cutoff{$pm}) {
+				push (@$sig_peaks, [ @{$$peaks{$pm}[$i]}, $$fdr{$pm}[$count] ]);
+			}
+		}
+	}
+	
+	print OUTP "\nNumber of peaks: $#$sig_peaks\n";
+}
+
+sub calculate_fdr {
+	my (%opts) = @_;
+	my $in_ref = $opts{IN_REF};
+	my $peakmins = $opts{PEAKMINS_REF};
+	my $peak_count = $opts{PEAK_COUNT};
+	my $log_scores = $opts{LOG_SCORES};
+	my $regression = $opts{REGRESSION};
+	my $fdr = $opts{FDR};
+	my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF};
+	my $peak_count_real = $opts{PEAK_COUNT_REAL};
+		
+	foreach my $pm (@$peakmins) {
+		# get regression variables
+		my ($m,$b) = @{$$regression{$pm}} if $$regression{$pm}[0];
+		
+		for my $i (0 .. $#{$$peak_count_real{$pm}}) {
+			next unless $$peak_count_real{$pm}[$i];
+			my $expect = 10**($m*$i + $b);
+		
+			my $real_count = $$peak_count_real{$pm}[$i];
+			my $fdr_conservative = $expect/$real_count;
+			$$fdr{$pm}[$i]= $fdr_conservative;
+		}
+	}
+	
+	# print FDR rates
+	print OUTP "\n";
+
+	foreach my $pm (@$peakmins) {
+		print OUTP "Peak min = $pm\n";
+		for my $c (0 .. $#{$$fdr{$pm}}) {
+			next unless defined($$fdr{$pm}[$c]);
+			print OUTP "Peak size: $c\tCount: $$peak_count_real{$pm}[$c]\tFDR: $$fdr{$pm}[$c]\n";
+			$$peak_fdr_cutoff{$pm} = $c if (($$fdr{$pm}[$c]<$vars{'fdr'}) && (!$$peak_fdr_cutoff{$pm}));
+		}
+		$$peak_fdr_cutoff{$pm}||= 10**10; # clumsy hack to prevent errors 
+		print OUTP "\n";
+	}
+	
+	foreach my $pm (@$peakmins) {
+		print OUTP "Peak min $pm: peak cutoff size for alpha = $vars{'fdr'} was $$peak_fdr_cutoff{$pm}\n\n";
+	}
+}
+
+sub calculate_regressions {
+	my (%opts) = @_;
+	my $in_ref = $opts{IN_REF};
+	my $peakmins = $opts{PEAKMINS_REF};
+	my $peaks = $opts{PEAKS};
+	my $peak_count = $opts{PEAK_COUNT};
+	my $log_scores = $opts{LOG_SCORES};
+	my $regression = $opts{REGRESSION};
+		
+	my $in_num = @$in_ref;
+
+	foreach my $pm (@$peakmins) {
+		print OUTP "Peak min = $pm\n";
+		for my $c (0 .. $#{$$peak_count{$pm}}) {
+			my $peak_count_avg = $$peak_count{$pm}[$c]/$vars{'n'} if $$peak_count{$pm}[$c];
+			next unless $peak_count_avg;
+	
+			if ($vars{'frac'}) {
+				$peak_count_avg = $peak_count_avg * $in_num/$vars{'frac'};
+			}
+			$$log_scores{$pm}[$c] = log($peak_count_avg)/log(10);
+			print OUTP "Peak size: $c\tCount:$peak_count_avg\n";
+		}
+		
+		# calculate exponential decay rates
+		# y= a+bx for log(y)	
+		my ($sumx, $sumy, $sumxsq, $sumxy);
+		my $n=0;
+		for my $i (0 .. $#{$$peak_count{$pm}}) {
+			next unless $$peak_count{$pm}[$i];
+			$n++;
+			$sumx   += $i;
+			$sumy   += $$log_scores{$pm}[$i];
+			$sumxsq += $i ** 2;
+			$sumxy  += $i * $$log_scores{$pm}[$i];
+		}
+		
+		next unless $n > 1;
+		
+		my $mean_x = $sumx/$n;
+		my $mean_y = $sumy/$n;
+		my $mean_xsq = $sumxsq/$n;
+		my $mean_xy = $sumxy/$n;
+		
+		my $b = ($mean_xy - ($mean_x * $mean_y)) / ($mean_xsq - ($mean_x **2));
+		my $a = $mean_y - ($b * $mean_x);
+				
+		# store values
+		$$regression{$pm}=[$b, $a];
+		print OUTP "regression: log(y) = $b(x) + $a\n";
+		
+		for my $i (0 .. $#{$$peak_count{$pm}}) {
+			next unless $$peak_count{$pm}[$i];
+			my ($b,$a) = @{$$regression{$pm}};
+			my $logval = ($b*$i + $a);
+			my $val = 10**$logval;
+			print OUTP "lin regress: $i\t$$log_scores{$pm}[$i]\t$logval\t$val\n";
+		}
+		
+		print OUTP "\n";
+	}
+}
+
+sub find_randomised_peaks {
+	my (%opts) = @_;
+	my $in_ref = $opts{IN_REF};
+	my $peakmins_ref = $opts{PEAKMINS_REF};
+	my $peaks = $opts{PEAKS};
+	my $peak_count = $opts{PEAK_COUNT};
+	
+	print STDERR "Duplicating ...               \n";
+	my @inr = @$in_ref;
+	
+	# Call peaks on input file
+	print STDERR "Calling peaks on input file ...\n";
+
+	for my $iter (1 .. $vars{'n'}) {
+		#print STDERR "Iteration $iter ...       \r";
+		print STDERR "Iteration $iter: [shuffling]            \r";
+		
+		if ($vars{'frac'}) {
+			# only use a fraction of the array per rep
+			my @a = inside_out(\@inr, $vars{'frac'});
+			call_peaks_unified_redux(ITER=>$iter, AREF=>\@a, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count);
+		} else {
+			shuffle(\@inr);
+			call_peaks_unified_redux(ITER=>$iter, AREF=>\@inr, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count);
+		}
+	}
+	
+}
+
+sub call_peaks_unified_redux {
+	my (%opts) = @_;
+	my $iter = $opts{ITER};
+	my $real = $opts{REAL};
+	my $a = $opts{AREF};
+	my $peakmins_ref = $opts{PEAKMINS_REF};
+	my $peak_count_real = $opts{PEAK_COUNT_REAL};
+	my $peaks = $opts{PEAKS};
+	my $peak_count = $opts{PEAK_COUNT};
+
+	my ($pstart, $pend, $inpeak, $pscore, $count);
+	$pstart=$pend=$pscore=$inpeak=$count=0;
+	
+	my @tmp_peak;
+	my $total = $#$a;
+	
+	if ($real) {
+		print STDERR "Calling real peaks ...            \r";
+	} else {
+		print STDERR "Iteration $iter: [processing ...]        \r";
+	}
+	
+	my $old_chr="";
+	foreach my $pm (@$peakmins_ref) {
+		for my $i (0 .. $total) {
+			my ($chr, $start, $end, $score) = @{ @$a[$i] };
+			next unless $score;
+			
+			if ($real) {
+				unless ($chr eq $old_chr) {
+					# Next chromosome
+					# (Peaks can't carry over chromosomes, but we don't use this shortcut when randomly shuffling)
+					$pstart=$pend=$pscore=$inpeak=$count=0;
+					@tmp_peak = () if $real;
+				}
+			}
+			$old_chr = $chr if $real;
+			
+			unless ($inpeak) {
+				next unless $score >= $pm;
+				# record new peak
+				$pstart = $start;
+				$pend = $end;
+				$pscore = $score * ($end-$start)/1000;
+				$count++;
+				push @tmp_peak, $score if $real;
+				$inpeak = 1;
+			} else {
+				if ($score >= $pm) {
+					# still in peak
+					$count++;
+					
+					# Fragment score to deal with scoring peaks made from uneven sized fragments
+					my $fragment_score = $score * ($end-$start)/1000;
+					
+					push @tmp_peak, $score if $real;
+					$pscore += $fragment_score;
+					$pend = $end;
+				} else {
+					# out of a peak
+					if ($count >= $vars{'min_count'}) {
+						# record peak
+						if ($real) {
+							$$peak_count_real{$pm}[$count]++;
+							my $mean_pscore = sprintf('%0.2f',($pscore/(($pend-$pstart)/1000)));
+							push (@{$$peaks{$pm}},[($chr, $pstart, $pend, $mean_pscore, $pscore, $count, ($pend-$pstart))]);
+						} else {
+							$$peak_count{$pm}[$count]++;
+						}
+					} 
+					
+					# reset
+					$pstart=$pend=$pscore=$inpeak=$count=0;
+					@tmp_peak = () if $real;
+				}
+			}
+		}
+	}
+}
+
+
+sub shuffle {
+	# Fisher-Yates shuffle (Knuth shuffle)
+	my ($array) = @_;
+	my $i = @$array;
+	while ( --$i ) {
+		my $j = int rand( $i+1 );
+		@$array[$i,$j] = @$array[$j,$i];
+	}
+}
+
+sub inside_out {
+	my ($array, $frac) = @_;
+	my @a;
+	for my $i (0 .. $frac-1) {
+		my $j = int rand( $i+1 );
+		if ($j != $i) {
+			$a[$i] = $a[$j]
+		}
+		$a[$j] = @$array[$i]
+	}
+	return @a;
+}
+
+sub load_gff {
+	my (%opts) = @_;
+	my $fn = $opts{FILE};
+	my $in_ref = $opts{IN_REF};
+
+	print STDERR "Reading input file: $fn ...\n";
+	open (IN, "<$fn") || die "Unable to read $fn: $!\n";
+	
+	my $i;
+	while (<IN>) {
+		$i++;
+		print STDERR "Read $i lines ...\r" if $i%10000 == 0;
+		chomp;
+		
+		my @line = split('\t');
+		
+		my ($chr, $start, $end, $score);
+		if ($#line == 3) {
+			# bedgraph
+			($chr, $start, $end, $score) = @line;
+		} else {
+			# GFF
+			($chr, $start, $end, $score) = @line[0,3,4,5];
+		}
+		
+		next unless $start;
+		
+		$score = 0 if $score eq "NA";
+		
+		push (@$in_ref, [$chr, $start, $end, $score]);
+	}
+	
+	close (IN);
+
+	print STDERR "Sorting ...                 \n";
+	@$in_ref = sort { $a->[1] <=> $b->[1] } @$in_ref;
+	@$in_ref = sort { $a->[0] cmp $b->[0] } @$in_ref;
+}
+
+sub find_quants {
+	my (%opts) = @_;
+	
+	my $in_ref = $opts{IN_REF};
+	my $peakmins_ref = $opts{PEAKMINS_REF};
+	
+	my %seg;
+	my @frags;
+
+	my $total_coverage;
+
+	foreach my $l (@$in_ref) {	
+		my ($chr, $start, $end, $score) = @$l;
+		next unless $score;
+		$score = 0 if $score eq "NA";
+		$total_coverage += $end-$start;
+		push @frags, $score;
+	}
+	
+	@frags = sort {$a <=> $b} @frags;
+	
+	print STDERR "Total coverage was $total_coverage bp\n";
+	
+	my @quants;
+	for (my $q=0;$q<=1;$q+=$vars{'step'}) {
+		push @quants, [$q, int($q * @frags)] if $q > $vars{'min_quant'};
+	}
+
+	foreach  (@quants) {
+		my $cut_off = @{$_}[0];
+		my $score = $frags[@{$_}[1]];
+		printf("   Quantile %0.2f: %0.2f\n",$cut_off,$score);
+		$seg{$cut_off} = $score;
+	}
+	
+	foreach my $c (sort {$a <=> $b} keys %seg) {
+		push (@$peakmins_ref, $seg{$c});
+	}
+}
+
+sub make_unified_peaks {
+	my (%opts) = @_;
+	my $ref = $opts{SIG_PEAKS};
+	my $out_peak_unified_track = $opts{OUT};
+	my $unified_peaks = $opts{UNIFIED_PEAKS};
+	my $type = $opts{TYPE};
+	my $total = @$ref;
+		
+	# Unify overlapping peaks, and make significant peaks file
+	my $skipped_peaks;
+	print STDERR "Combining significant peaks ...\n";
+	
+	# unroll chromosomes for speed
+	foreach my $chr (uniq( map @{$_}[0], @$ref )) {
+		my @c = grep {@{$_}[0] eq $chr} @$ref;
+		
+		my @unified_peaks_chr;
+		foreach my $ar (@c) {
+			if (state $i++ % 100 == 0) {
+					my $pc = sprintf("%0.2f",($i*100)/$total);
+					print STDERR "$pc\% processed ...\r";
+			}
+			
+			my ($chra, $start, $end, $score, $total_score, $count, $peaklen, $fdr) = @$ar;
+			
+			# next if @unified_peaks_chr already overlaps
+			next if grep {
+						@{$_}[3] < $end
+						&& @{$_}[4] > $start
+					} @unified_peaks_chr;
+			
+			# Grab all elements that overlap
+			my @test = grep {
+						@{$_}[1] < $end
+						&& @{$_}[2] > $start
+					} @c;
+			
+			for my $j (0 .. $#test) {
+				my ($chr1, $start1, $end1, $score1, $total_score1, $count1, $peaklen1, $fdr1) = @{$test[$j]};
+				
+				next unless $start1 < $end;
+				next unless $end1 > $start;
+				
+				if ($type eq 'min') {
+					$start = max($start, $start1);
+					$end = min($end, $end1);
+				} else {
+					$start = min($start, $start1);
+					$end = max($end, $end1);
+				}
+				
+				$score = max($score, $score1);
+				$fdr = min($fdr, $fdr1);
+			}
+			
+			push @unified_peaks_chr, [($chr, '.', '.', $start, $end, $score, '.', '.', "FDR=$fdr")];
+		}
+		
+		@$unified_peaks = (@$unified_peaks, @unified_peaks_chr);
+	}
+	
+	$total = $#$unified_peaks;
+	
+	print STDERR "Sorting unified peaks ...\n";
+	@$unified_peaks = sort { $a->[3] <=> $b->[3] } @$unified_peaks;
+	@$unified_peaks = sort { $a->[0] cmp $b->[0] } @$unified_peaks;
+	
+	print STDERR "Writing unified peaks file ...\n";
+	open(PEAKOUTUNI, ">$out_peak_unified_track") || die "Unable to open peak output track for writing: $!\n\n"; 
+	for my $j (0 .. $#$unified_peaks) {
+		print PEAKOUTUNI join("\t", @{$$unified_peaks[$j]}), "\n";
+	}
+}
+
+sub max {
+    my ($max, @vars) = @_;
+	my $index=0;
+	$max||=0;
+    for my $i (0..$#vars) {
+        ($max, $index) = ($vars[$i], $i+1) if $vars[$i] > $max;
+    }
+    return $max;
+}
+
+sub min {
+    my ($min, @vars) = @_;
+	my $index=0;
+	$min||=0;
+    for my $i (0..$#vars) {
+        ($min, $index) = ($vars[$i],$i+1) if $vars[$i] < $min;
+    }
+    return $min;
+}
+
+sub uniq {
+	my %seen;
+	return grep { !$seen{$_}++ } @_;
+}
+
+
+
+sub process_cli {
+	foreach (@ARGV) {
+		if (/--(.*)=(.*)/) {
+			unless (defined($vars{$1})) {
+				print STDERR "Did not understand $_ ...\n";
+				help();
+			}
+			my ($v, $opt) = ($1,$2);
+			$vars{$v} = $opt;
+			next;
+		} elsif (/--h[elp]*/) {
+			help();
+		} elsif (/--(.*)/) {
+			print STDERR "Please add a parameter to $_ ...\n\n";
+			exit 1;
+		}
+		push @in_files, $_;
+	}
+}
+
+
+sub help {
+	print STDOUT <<EOT;
+Simple FDR random permutation peak caller
+Usage: [options] [files in bedgraph or GFF format]
+	
+Options:
+EOT
+	
+	my $opt_len = 0;
+	foreach (keys %vars) {
+		my $l = length($_);
+		$opt_len = $l if $l > $opt_len;
+	}
+	
+	$opt_len+=2;
+	
+	my $cols= `tput cols` || 80;
+	
+	my ($v, $val, $def, $def_format);
+	my $help_format = "format STDOUT =\n"
+		.' '.'^'.'<'x$opt_len . ' '. '^' . '<'x($cols-$opt_len-4) . "\n"
+		.'$v, $def_format'."\n"
+		.' '.'^'.'<'x$opt_len . '   '. '^' . '<'x($cols-$opt_len-6) . "~~\n"
+		.'$v, $def_format'."\n"
+		.".\n";
+		
+	eval $help_format;
+	die $@ if $@;
+	
+	foreach my $k (sort (keys %vars)) {
+		($v, $val, $def) = ($k, $vars{$k}, $vars_details{$k});
+		$def||="";
+		$def_format = $val ? "$def\n\r[Current value: $val]" : $def;
+		$v = "--$v";
+#		format =
+# ^<<<<<<<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+#$v, $def_format
+# ^<<<<<<<<<<<<<<<<<<<<   ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ~~
+#$v, $def_format
+#.
+
+		write();
+		
+	}
+	print STDOUT "\n";
+	exit 1;
+}
author	mvdbeek
date	Fri, 20 Apr 2018 04:34:17 -0400
parents
children	7b0ca503bb95