comparison scripts/mergeTagsWithGap.pl @ 0:28d1a6f8143f draft

planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author portiahollyoak
date Mon, 25 Apr 2016 13:08:56 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:28d1a6f8143f
1 #!/share/bin/perl
2 #chr2L 114333 114409 FBgn0003055_P-element harwich,1; + antisense
3 #chr2L 114443 114567 FBgn0003055_P-element harwich,3; + antisense
4 #chr2L 114636 114712 FBgn0003055_P-element harwich,1; - antisense
5 #chr2L 131640 131929 FBgn0003055_P-element harwich,42; + sense
6 #chr2L 131948 132274 FBgn0003055_P-element harwich,18; - sense
7 #chr2L 132027 132103 FBgn0003055_P-element harwich,1; - antisense
8
9 use warnings;
10 use strict;
11 use List::Util qw(max min);
12
13 if(scalar(@ARGV)<2 || grep {/^-h/} @ARGV)
14 {
15 die "
16 usage: mergeOverlapBed4.pl inputFile
17 Expects BED input with at least 4 fields. For each {chr,name} pair,
18 merges overlapping ranges and prints out sorted BED4 to stdout.
19 inputFile can be - or stdin to read from stdin.
20 ";
21 }
22
23 my $input=shift @ARGV;
24 my $maxgap=shift @ARGV;
25 grep {s/^stdin$/-/i} $input;
26
27 my %item2coords;
28 open IN,$input;
29 while (<IN>)
30 {
31 chomp;
32 my ($chrom,$start,$end,$transposonName,$count,$strand,$transposonStrand)=split/\t/;
33 push @{$item2coords{"$chrom;$transposonName;$transposonStrand"}},[$start,$end,$count,$strand];
34 }
35 close IN;
36
37 my @results;
38 foreach my $item (keys %item2coords)
39 {
40 my @sortedCoords=sort{ $a->[0]<=>$b->[0] } @{$item2coords{$item}};
41 my ($chrom,$tName,$tStrand)=split(/;/,$item);
42 my ($mergeStart,$mergeEnd,$mergeCounts,$mergeStrand)=@{shift @sortedCoords};
43 my %sampleCounts=();
44 my ($breakStart,$breakEnd)=0;
45 foreach my $sa (split/;/,$mergeCounts)
46 {
47 my ($s,$c)=split/,/,$sa;
48 $sampleCounts{$s}{$mergeStrand}=$c;
49 }
50 foreach my $rangeRef (@sortedCoords)
51 {
52 my ($rangeStart,$rangeEnd,$rangeCounts,$rangeStrand)=@{$rangeRef};
53 if($mergeStrand=~/\Q$rangeStrand\E$/)
54 {
55 if($rangeStart>=$mergeEnd+$maxgap)
56 {
57 $mergeCounts="";
58 foreach my $s (keys %sampleCounts)
59 {
60 $mergeCounts.=$s;
61 $mergeCounts.=",".$_.$sampleCounts{$s}{$_} foreach keys %{$sampleCounts{$s}};
62 $mergeCounts.=";";
63 }
64 if($mergeStrand eq "+")
65 {
66 $breakStart=$mergeEnd;
67 $breakEnd=$mergeEnd+$maxgap;
68 }
69 if($mergeStrand eq "-")
70 {
71 $breakStart=$mergeStart-$maxgap;
72 $breakEnd=$mergeStart;
73 }
74 push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$mergeCounts,$mergeStrand,$tStrand,"$chrom:$breakStart.$breakEnd"];
75 ($mergeStart,$mergeEnd,$mergeStrand)=($rangeStart,$rangeEnd,$rangeStrand);
76 %sampleCounts=();
77 foreach my $sa (split/;/,$rangeCounts)
78 {
79 my ($s,$c)=split/,/,$sa;
80 $sampleCounts{$s}{$rangeStrand}=$c;
81 }
82 }
83 else
84 {
85 $mergeEnd=max($rangeEnd,$mergeEnd);
86 foreach my $sa (split/;/,$rangeCounts)
87 {
88 my ($s,$c)=split/,/,$sa;
89 $sampleCounts{$s}{$rangeStrand}+=$c;
90 }
91 }
92 }
93 elsif($rangeStrand eq "+")
94 {
95 $mergeCounts="";
96 foreach my $s (keys %sampleCounts)
97 {
98 $mergeCounts.=$s;
99 $mergeCounts.=",".$_.$sampleCounts{$s}{$_} foreach keys %{$sampleCounts{$s}};
100 $mergeCounts.=";";
101 }
102 if($mergeStrand eq "+")
103 {
104 $breakStart=$mergeEnd;
105 $breakEnd=$mergeEnd+$maxgap;
106 }
107 if($mergeStrand eq "-")
108 {
109 $breakStart=$mergeStart-$maxgap;
110 $breakEnd=$mergeStart;
111 }
112 push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$mergeCounts,$mergeStrand,$tStrand,"$chrom:$breakStart.$breakEnd"];
113 ($mergeStart,$mergeEnd,$mergeStrand)=($rangeStart,$rangeEnd,$rangeStrand);
114 %sampleCounts=();
115 foreach my $sa (split/;/,$rangeCounts)
116 {
117 my ($s,$c)=split/,/,$sa;
118 $sampleCounts{$s}{$rangeStrand}=$c;
119 }
120 }
121 else
122 {
123 if($rangeStart>=$mergeEnd+$maxgap*2)
124 {
125 $mergeCounts="";
126 foreach my $s (keys %sampleCounts)
127 {
128 $mergeCounts.=$s;
129 $mergeCounts.=",".$_.$sampleCounts{$s}{$_} foreach keys %{$sampleCounts{$s}};
130 $mergeCounts.=";";
131 }
132 if($mergeStrand eq "+")
133 {
134 $breakStart=$mergeEnd;
135 $breakEnd=$mergeEnd+$maxgap;
136 }
137 if($mergeStrand eq "-")
138 {
139 $breakStart=$mergeStart-$maxgap;
140 $breakEnd=$mergeStart;
141 }
142 push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$mergeCounts,$mergeStrand,$tStrand,"$chrom:$breakStart.$breakEnd"];
143 ($mergeStart,$mergeEnd,$mergeStrand)=($rangeStart,$rangeEnd,$rangeStrand);
144 %sampleCounts=();
145 foreach my $sa (split/;/,$rangeCounts)
146 {
147 my ($s,$c)=split/,/,$sa;
148 $sampleCounts{$s}{$rangeStrand}=$c;
149 }
150 }
151 else
152 {
153 $breakStart=$mergeEnd;
154 $mergeEnd=max($rangeEnd,$mergeEnd);
155 $breakEnd=$rangeStart;
156 foreach my $sa (split/;/,$rangeCounts)
157 {
158 my ($s,$c)=split/,/,$sa;
159 $sampleCounts{$s}{$rangeStrand}+=$c;
160 }
161 $mergeStrand.=$rangeStrand;
162 }
163 }
164 }
165 $mergeCounts="";
166 foreach my $s (keys %sampleCounts)
167 {
168 $mergeCounts.=$s;
169 $mergeCounts.=",".$_.$sampleCounts{$s}{$_} foreach keys %{$sampleCounts{$s}};
170 $mergeCounts.=";";
171 }
172 if($mergeStrand eq "+")
173 {
174 $breakStart=$mergeEnd;
175 $breakEnd=$mergeEnd+$maxgap;
176 }
177 if($mergeStrand eq "-")
178 {
179 $breakStart=$mergeStart-$maxgap;
180 $breakEnd=$mergeStart;
181 }
182 push @results,[$chrom,$mergeStart,$mergeEnd,$tName,$mergeCounts,$mergeStrand,$tStrand,"$chrom:$breakStart.$breakEnd"] if $mergeEnd;
183 }
184
185 sub bed4Cmp
186 {
187 # For sorting by chrom, chromStart, and names -- reverse order for names
188 return $a->[0] cmp $b->[0] ||
189 $a->[1] <=> $b->[1] ||
190 $b->[3] cmp $a->[3];
191 }
192
193 foreach my $r (sort bed4Cmp @results)
194 {
195 print join("\t",@{$r}),"\n";
196 }