comparison nibls.pl @ 0:07745c0958dd draft

Uploaded
author big-tiandm
date Thu, 18 Sep 2014 21:40:25 -0400
parents
children e0884a4b996b
comparison
equal deleted inserted replaced
-1:000000000000 0:07745c0958dd
1 #!/usr/bin/perl
2 #####################################################################################################
3 #LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be greatful if you would keep these acknowledgements with it.
4 #
5 # Dan MacLean
6 # dan.maclean@sainsbury-laboratory.ac.uk
7 #
8 # This program is free academic software; academic and non-profit
9 # users may redistribute it freely.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14 #
15 # This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007
16 # see included file GPL3.txt
17 #
18 #
19
20
21 ###Dont forget you will need ...
22 #####################################################################################################
23 # Boost::Graph
24 #Copyright 2005 by David Burdick
25 # Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm
26 #Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
27 #####################################################################################################
28
29
30
31 use strict;
32 use warnings;
33 use Boost::Graph;
34 use Getopt::Long;
35
36
37 my $usage = "usage: $0 -f GFF_FILE [options]\n\n -m minimum inclusion distance (default 5)\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\n";
38
39 my $gff_file ;
40 my $min_inc = 5;
41 my $clus = 0.6;
42 my $buff = 0;
43 my $output_file;
44 my $temp;
45 my $mark;
46
47 GetOptions(
48
49 'c=f' => \$clus,
50 'm=i' => \$min_inc,
51 'f|file=s' => \$gff_file,
52 'b=i' => \$buff,
53 'o=s' => \$output_file,
54 't=s' => \$temp,
55 'k=s' => \$mark
56 ) ;
57
58
59 die $usage unless $gff_file;
60
61
62 my $starttime = time;
63 warn "started $starttime\n";
64
65 ## load in data
66 my %molecules; # stores starts and ends of srnas
67 open GFF, "<$gff_file";
68
69 while (my $entry = <GFF>){
70
71 chomp $entry;
72 next if($entry=~/^\#/);
73 my @data = split(/\t/,$entry);
74 my $chr=shift @data;
75 my $strand=shift @data;
76 my $start=shift @data;
77 my $end=shift @data;
78 # my $length1=$end-$start+1;
79 # if ($length1>30) {
80 # $length1=40;
81 # }
82 my $total;
83 foreach (0..$#data) {
84 $total+=$_;
85 }
86 push @data,$total;
87 # push @data,$length1;
88 # if (defined $molecules{$chr}{$start}{$end}{$strand}) {
89 # my @old_data=split(/;/,$molecules{$chr}{$start}{$end}{$strand});
90 # for (my $i=0;$i<$#old_data ;$i++) {
91 # $data[$i]+=$old_data[$i];
92 # }
93 # }
94 my $data=join ";",@data;
95 $molecules{$chr}{$start}{$end}{$strand} = $data;#chr#start#end#strand#add Tags information
96 #print "$chr\t$start\t$end\n";
97 }
98
99 close GFF;
100
101 warn "Data loaded...\nBuilding graphs and finding loci\nPlease be patient, this can take a while...\n";
102
103 my @sample=split/\#/,$mark;
104 $mark=join"\"\t\"",@sample;
105 open OUT, ">$output_file";
106 print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
107 open CLUSTER,">$temp";
108 print CLUSTER "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";
109 foreach my $chromosome (keys %molecules){
110 my $g = new Boost::Graph(directed=>0);
111 my @starts = keys(%{$molecules{$chromosome}} );
112 @starts = sort {$a <=> $b} @starts;
113
114 while (my $srna_start = shift @starts){ ## work from left most sRNA to right most, add to graph if they close enough
115
116
117 foreach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){
118
119
120 ###use new graph if the next srna is too far away from this one..
121 if(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){
122
123
124 ##dump the info from the old graph
125 if (scalar(@{$g->get_nodes()}) > 2){
126
127 my $cluster_coeff = get_cc($g);
128 if ($cluster_coeff >= $clus){
129 dump_locus($g, $cluster_coeff);
130 }
131 }
132
133
134 $g = new Boost::Graph(directed=>0);
135
136 }
137
138 foreach my $e (keys %{$molecules{$chromosome}{$srna_start}}){ ### extra bit because all loci with same start and different end overlap by definition. but are not collected by main search below
139
140 unless ($e eq $srna_end){
141 my $sn = $chromosome. ':' . $srna_start . ':' . $srna_end; ## turn coordinate of sRNA inro a node name
142 my $en = $chromosome. ':' . $srna_start . ':' . $e;
143 $g->add_edge(node1=>"$sn", node2=>"$en", weight=>'1');
144 }
145
146 }
147
148 foreach my $start (@starts){ ##build graph of overlaps
149 my $new = 0;
150 last if $start - $min_inc > $srna_end;
151 if ($start - $min_inc <= $srna_end){
152
153 my $start_node = $chromosome . ':' . $srna_start . ':' . $srna_end;
154 foreach my $end (keys %{$molecules{$chromosome}{$start}}){
155
156 my $end_node = $chromosome . ':' . $start . ':' . $end;
157 $g->add_edge(node1=>"$start_node", node2=>"$end_node", weight=>'1');
158 }
159
160 }
161 }
162 }
163
164 if (!(defined $starts[0])) {
165 ##dump the info from the last graph
166 if (scalar(@{$g->get_nodes()}) > 2){
167
168 my $cluster_coeff = get_cc($g);
169 if ($cluster_coeff >= $clus){
170 dump_locus($g, $cluster_coeff);
171 }
172 }
173 }
174 }
175 }
176
177 warn "Loci printed\nFinished\n";
178
179 my $endtime = time;
180
181 my $elapsed = $endtime - $starttime;
182
183 warn "Time elapsed = $elapsed s\n";
184 close OUT;
185 close CLUSTER;
186 #########################################################################################
187 sub get_cc{ ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2
188
189 my $graph = shift;
190
191 my @component = @{$graph->get_nodes()}; #number of nodes
192 my @clustering_coefficients;
193
194 foreach my $vertex (@component)
195 {
196
197 my @neighbours = @{$graph->neighbors($vertex)};
198
199 my %edges_in_graph;
200
201 my $n = @neighbours; #n = the number of neighbours
202 my $k = ($n * ($n - 1))/2; #k = total number of possible connections
203
204 my $e= 0; #actual number of connections within sub-graph
205
206 foreach my $neighbour (@neighbours)
207 {
208 foreach my $neighbour_2 (@neighbours)
209 {
210 my $edge1 = "$neighbour\t$neighbour_2";
211 my $edge2 = "$neighbour_2\t$neighbour";
212 unless (exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2})
213 {
214 if ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour))
215 {
216 ++$e;
217 $edges_in_graph{$edge1}=1;
218 $edges_in_graph{$edge2}=1;
219 }
220 }
221 }
222 }
223
224 if ($k >= 1)
225 {
226 my $c = $e / $k;
227 push @clustering_coefficients, $c;
228 }
229 else {push @clustering_coefficients, '0';}
230 }
231
232 my $graph_n = scalar(@clustering_coefficients);
233 my $graph_cc = 0;
234 foreach my $cc (@clustering_coefficients){
235
236 $graph_cc = $graph_cc + $cc;
237
238 }
239 $graph_cc = $graph_cc / $graph_n;
240
241 return $graph_cc;
242 }
243
244 ############################################################################################################
245
246 sub dump_locus{
247
248 my $g = shift;
249 my $cc = shift;
250 my $chr;
251 my $start = 1000000000000000000000000000000000000000000000;
252 my $end = -1;
253 my @sample;
254 my @tag;
255 foreach my $node (@{$g->get_nodes()}){
256
257 $node =~ m/^(\S+):(\d+):(\d+)$/;
258 my $c=$1;
259 my $s=$2;
260 my $e=$3;
261 my @data;
262 foreach my $str (keys %{$molecules{$c}{$s}{$e}}) {
263 push @tag,($s.",".$e.",".$str);
264 @data=split(/;/,$molecules{$c}{$s}{$e}{$str});
265 # for (my $i=0;$i<$#old_data ;$i++) {
266 # $data[$i]+=$old_data[$i];
267 # }
268 my $length=$e-$s+1;
269 if ($length>30) {
270 $length=40;
271 }
272 push @data,$length;
273 my $data=join ";",@data;#sample_exp/total_exp/length;
274 push @sample,$data;
275 }
276
277 $chr = $c;
278 $start = $s if $s < $start;
279 $end = $e if $e > $end;
280 }
281 my $tag=join";",@tag;
282
283 my ($max_length,$max_p,@cluster_exp)=Max_length(\@sample);
284 if ($max_length==40) {
285 $max_length="\>30";
286 }
287 my $cluster_exp=join"\t",@cluster_exp;
288 my $gff = $chr."\:$start\-$end\t".$max_length."nt\t".$max_p."\t" . $cluster_exp;
289 print CLUSTER "$chr\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
290 print OUT $gff, "\n";
291 }
292
293 sub Max_length{
294 my @exp=@{$_[0]};
295 my %sample_length;
296 my $total_exp;
297 my @each;
298 for (my $i=0;$i<=$#exp ;$i++) {
299 my @tag=split/;/,$exp[$i];
300 my $length=pop(@tag);
301 my $exp=pop(@tag);
302 $sample_length{$length}+=$exp;
303 $total_exp+=$exp;
304 for (my $j=0;$j<=$#tag ;$j++) {
305 $each[$j]+=$tag[$j];
306 }
307 }
308 my $max=0;
309 my $max_key;
310 foreach my $key (sort keys %sample_length) {
311 my $p=$sample_length{$key}/$total_exp;
312 if ($p>$max) {
313 $max=$p;
314 $max_key=$key;
315 }
316 $sample_length{$key}=sprintf("%.2f",$p);
317 }
318 return($max_key,$sample_length{$max_key},@each);
319 }