diff scripts/mergeTagsWithoutGap.pl @ 0:28d1a6f8143f draft

planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author portiahollyoak
date Mon, 25 Apr 2016 13:08:56 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/mergeTagsWithoutGap.pl	Mon Apr 25 13:08:56 2016 -0400
@@ -0,0 +1,91 @@
+#!/share/bin/perl
+#chr2L   735929  736005  HWUSI-EAS1533_0002:1:73:4665:12371#0/2  FBgn0000155_roo,-58;FBgn0000155_roo,-8722;      -
+use warnings;
+use strict;
+
+if(scalar(@ARGV)<1 || grep {/^-h/} @ARGV)
+{
+	die "
+usage: mergeOverlapBed4.pl inputFile
+Expects BED input with at least 4 fields.  For each {chr,name} pair,
+merges overlapping ranges and prints out sorted BED4 to stdout.
+inputFile can be - or stdin to read from stdin.
+";
+}
+
+my $input=shift @ARGV;
+grep {s/^stdin$/-/i} $input;
+
+my %item2coords;
+open IN,$input;
+while (<IN>)
+{
+	chomp;
+	my ($chrom,$start,$end,$sample,$class,$strand)=split/\t/;
+  	die "Sorry, input must have at least 4 fields of BED.\n" if ! $class;
+	# random choose one
+#	my @loc=$class=~/(.*?),(\+|-)(.*)/;
+#	my $transposonStrand=($strand eq $loc[1])?"antisense":"sense";
+#	push @{$item2coords{"$chrom;$strand;$loc[0];$transposonStrand"}},[$start,$end,$sample] 
+
+	# norm by class
+	my @loc=map { [/(.*?),(\+|-)(.*)/] } split/;/,$class;
+	my %transposonName;
+	foreach my $l (@loc)
+	{
+		my $transposonStrand=($strand eq $$l[1])?"antisense":"sense";
+		$transposonName{$$l[0]}=$transposonStrand;
+	}
+	my $c=1/scalar(keys %transposonName);
+	push @{$item2coords{"$chrom;$strand;$_;$transposonName{$_}"}},[$start,$end,$sample,$c] foreach keys %transposonName; 
+}
+close IN;
+
+my @results;
+foreach my $item (keys %item2coords)
+{
+	my @sortedCoords=sort{ $a->[0]<=>$b->[0] } @{$item2coords{$item}};
+	my ($chrom,$strand,$tName,$tStrand)=split(/;/,$item);
+	my ($mergeStart,$mergeEnd,$mergeSample,$mergeCounts)=@{shift @sortedCoords};
+	my %sampleCounts;
+	$sampleCounts{$mergeSample}=$mergeCounts;
+	foreach my $rangeRef (@sortedCoords) 
+	{
+    		my ($rangeStart,$rangeEnd,$rangeSample,$rangeCounts)=@{$rangeRef};
+    		if($rangeEnd<=$mergeEnd)
+		{
+			$sampleCounts{$rangeSample}+=$rangeCounts;
+			next;
+		}
+		if($rangeStart>=$mergeEnd)
+		{
+			my $count="";
+			$count.=$_.",".$sampleCounts{$_}.";" foreach keys %sampleCounts;
+			push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$count,$strand,$tStrand];
+			($mergeStart,$mergeEnd,$mergeSample,$mergeCounts)=($rangeStart,$rangeEnd,$rangeSample,$rangeCounts);
+			%sampleCounts=();
+			$sampleCounts{$mergeSample}=$mergeCounts;
+		}
+		else
+		{
+			$mergeEnd=$rangeEnd;
+			$sampleCounts{$rangeSample}+=$rangeCounts;
+		}
+	}
+	my $count="";
+	$count.=$_.",".$sampleCounts{$_}.";" foreach keys %sampleCounts;
+	push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$count,$strand,$tStrand] if $mergeEnd;
+}
+
+sub bed4Cmp
+{
+  # For sorting by chrom, chromStart, and names -- reverse order for names
+	return $a->[0] cmp $b->[0] ||
+	$a->[1] <=> $b->[1] ||
+	$b->[3] cmp $a->[3];
+}
+
+foreach my $r (sort bed4Cmp @results)
+{
+	print join("\t",@{$r}),"\n";
+}