Mercurial > repos > portiahollyoak > temp
diff scripts/mergeTagsWithGap.pl @ 0:28d1a6f8143f draft
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author | portiahollyoak |
---|---|
date | Mon, 25 Apr 2016 13:08:56 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/mergeTagsWithGap.pl Mon Apr 25 13:08:56 2016 -0400 @@ -0,0 +1,196 @@ +#!/share/bin/perl +#chr2L 114333 114409 FBgn0003055_P-element harwich,1; + antisense +#chr2L 114443 114567 FBgn0003055_P-element harwich,3; + antisense +#chr2L 114636 114712 FBgn0003055_P-element harwich,1; - antisense +#chr2L 131640 131929 FBgn0003055_P-element harwich,42; + sense +#chr2L 131948 132274 FBgn0003055_P-element harwich,18; - sense +#chr2L 132027 132103 FBgn0003055_P-element harwich,1; - antisense + +use warnings; +use strict; +use List::Util qw(max min); + +if(scalar(@ARGV)<2 || grep {/^-h/} @ARGV) +{ + die " +usage: mergeOverlapBed4.pl inputFile +Expects BED input with at least 4 fields. For each {chr,name} pair, +merges overlapping ranges and prints out sorted BED4 to stdout. +inputFile can be - or stdin to read from stdin. +"; +} + +my $input=shift @ARGV; +my $maxgap=shift @ARGV; +grep {s/^stdin$/-/i} $input; + +my %item2coords; +open IN,$input; +while (<IN>) +{ + chomp; + my ($chrom,$start,$end,$transposonName,$count,$strand,$transposonStrand)=split/\t/; + push @{$item2coords{"$chrom;$transposonName;$transposonStrand"}},[$start,$end,$count,$strand]; +} +close IN; + +my @results; +foreach my $item (keys %item2coords) +{ + my @sortedCoords=sort{ $a->[0]<=>$b->[0] } @{$item2coords{$item}}; + my ($chrom,$tName,$tStrand)=split(/;/,$item); + my ($mergeStart,$mergeEnd,$mergeCounts,$mergeStrand)=@{shift @sortedCoords}; + my %sampleCounts=(); + my ($breakStart,$breakEnd)=0; + foreach my $sa (split/;/,$mergeCounts) + { + my ($s,$c)=split/,/,$sa; + $sampleCounts{$s}{$mergeStrand}=$c; + } + foreach my $rangeRef (@sortedCoords) + { + my ($rangeStart,$rangeEnd,$rangeCounts,$rangeStrand)=@{$rangeRef}; + if($mergeStrand=~/\Q$rangeStrand\E$/) + { + if($rangeStart>=$mergeEnd+$maxgap) + { + $mergeCounts=""; + foreach my $s (keys %sampleCounts) + { + $mergeCounts.=$s; + $mergeCounts.=",".$_.$sampleCounts{$s}{$_} foreach keys %{$sampleCounts{$s}}; + $mergeCounts.=";"; + } + if($mergeStrand eq "+") + { + $breakStart=$mergeEnd; + $breakEnd=$mergeEnd+$maxgap; + } + if($mergeStrand eq "-") + { + $breakStart=$mergeStart-$maxgap; + $breakEnd=$mergeStart; + } + push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$mergeCounts,$mergeStrand,$tStrand,"$chrom:$breakStart.$breakEnd"]; + ($mergeStart,$mergeEnd,$mergeStrand)=($rangeStart,$rangeEnd,$rangeStrand); + %sampleCounts=(); + foreach my $sa (split/;/,$rangeCounts) + { + my ($s,$c)=split/,/,$sa; + $sampleCounts{$s}{$rangeStrand}=$c; + } + } + else + { + $mergeEnd=max($rangeEnd,$mergeEnd); + foreach my $sa (split/;/,$rangeCounts) + { + my ($s,$c)=split/,/,$sa; + $sampleCounts{$s}{$rangeStrand}+=$c; + } + } + } + elsif($rangeStrand eq "+") + { + $mergeCounts=""; + foreach my $s (keys %sampleCounts) + { + $mergeCounts.=$s; + $mergeCounts.=",".$_.$sampleCounts{$s}{$_} foreach keys %{$sampleCounts{$s}}; + $mergeCounts.=";"; + } + if($mergeStrand eq "+") + { + $breakStart=$mergeEnd; + $breakEnd=$mergeEnd+$maxgap; + } + if($mergeStrand eq "-") + { + $breakStart=$mergeStart-$maxgap; + $breakEnd=$mergeStart; + } + push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$mergeCounts,$mergeStrand,$tStrand,"$chrom:$breakStart.$breakEnd"]; + ($mergeStart,$mergeEnd,$mergeStrand)=($rangeStart,$rangeEnd,$rangeStrand); + %sampleCounts=(); + foreach my $sa (split/;/,$rangeCounts) + { + my ($s,$c)=split/,/,$sa; + $sampleCounts{$s}{$rangeStrand}=$c; + } + } + else + { + if($rangeStart>=$mergeEnd+$maxgap*2) + { + $mergeCounts=""; + foreach my $s (keys %sampleCounts) + { + $mergeCounts.=$s; + $mergeCounts.=",".$_.$sampleCounts{$s}{$_} foreach keys %{$sampleCounts{$s}}; + $mergeCounts.=";"; + } + if($mergeStrand eq "+") + { + $breakStart=$mergeEnd; + $breakEnd=$mergeEnd+$maxgap; + } + if($mergeStrand eq "-") + { + $breakStart=$mergeStart-$maxgap; + $breakEnd=$mergeStart; + } + push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$mergeCounts,$mergeStrand,$tStrand,"$chrom:$breakStart.$breakEnd"]; + ($mergeStart,$mergeEnd,$mergeStrand)=($rangeStart,$rangeEnd,$rangeStrand); + %sampleCounts=(); + foreach my $sa (split/;/,$rangeCounts) + { + my ($s,$c)=split/,/,$sa; + $sampleCounts{$s}{$rangeStrand}=$c; + } + } + else + { + $breakStart=$mergeEnd; + $mergeEnd=max($rangeEnd,$mergeEnd); + $breakEnd=$rangeStart; + foreach my $sa (split/;/,$rangeCounts) + { + my ($s,$c)=split/,/,$sa; + $sampleCounts{$s}{$rangeStrand}+=$c; + } + $mergeStrand.=$rangeStrand; + } + } + } + $mergeCounts=""; + foreach my $s (keys %sampleCounts) + { + $mergeCounts.=$s; + $mergeCounts.=",".$_.$sampleCounts{$s}{$_} foreach keys %{$sampleCounts{$s}}; + $mergeCounts.=";"; + } + if($mergeStrand eq "+") + { + $breakStart=$mergeEnd; + $breakEnd=$mergeEnd+$maxgap; + } + if($mergeStrand eq "-") + { + $breakStart=$mergeStart-$maxgap; + $breakEnd=$mergeStart; + } + push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$mergeCounts,$mergeStrand,$tStrand,"$chrom:$breakStart.$breakEnd"] if $mergeEnd; +} + +sub bed4Cmp +{ + # For sorting by chrom, chromStart, and names -- reverse order for names + return $a->[0] cmp $b->[0] || + $a->[1] <=> $b->[1] || + $b->[3] cmp $a->[3]; +} + +foreach my $r (sort bed4Cmp @results) +{ + print join("\t",@{$r}),"\n"; +}