diff nibls.pl @ 21:9dcffd531c76 draft

Uploaded
author big-tiandm
date Wed, 05 Nov 2014 21:09:35 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nibls.pl	Wed Nov 05 21:09:35 2014 -0500
@@ -0,0 +1,319 @@
+#!/usr/bin/perl
+#####################################################################################################
+#LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be greatful if you would keep these acknowledgements with it. 
+#
+# Dan MacLean
+# dan.maclean@sainsbury-laboratory.ac.uk
+#
+# This program is free academic software; academic and non-profit
+# users may redistribute it freely.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+#
+# This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007
+# see included file GPL3.txt
+#
+#
+
+
+###Dont forget you will need ... 
+#####################################################################################################
+# Boost::Graph 
+#Copyright 2005 by David Burdick
+# Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm
+#Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
+#####################################################################################################
+
+
+
+use strict;
+use warnings;
+use Boost::Graph;
+use Getopt::Long;
+
+
+my $usage = "usage: $0 -f GFF_FILE [options]\n\n -m minimum inclusion distance (default 5)\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\n";
+
+my $gff_file ;
+my $min_inc = 5;
+my $clus = 0.6;
+my $buff = 0;
+my $output_file;
+my $temp;
+my $mark;
+
+GetOptions(
+
+	'c=f'          => \$clus,
+	'm=i' => \$min_inc,
+	'f|file=s'     => \$gff_file,
+	'b=i' => \$buff,
+	'o=s' => \$output_file,
+	't=s' => \$temp,
+	'k=s' => \$mark
+) ;
+
+
+die $usage unless $gff_file;
+
+
+my $starttime = time;
+warn "started $starttime\n";
+
+## load in data
+my %molecules; # stores starts and ends of srnas
+open GFF, "<$gff_file";
+
+while (my $entry = <GFF>){
+
+	chomp $entry;
+	next if($entry=~/^\#/);
+	my @data = split(/\t/,$entry);
+	my $chr=shift @data;
+	my $strand=shift @data;
+	my $start=shift @data;
+	my $end=shift @data;
+#	my $length1=$end-$start+1;
+#	if ($length1>30) {
+#		$length1=40;
+#	}
+	my $total;
+	for (my $s=0;$s<@data ;$s++) {
+		$total+=$data[$s];
+	}
+	push @data,$total;
+#	push @data,$length1;
+#	if (defined $molecules{$chr}{$start}{$end}{$strand}) {
+#		my @old_data=split(/;/,$molecules{$chr}{$start}{$end}{$strand});
+#		for (my $i=0;$i<$#old_data ;$i++) {
+#			$data[$i]+=$old_data[$i];
+#		}
+#	}
+	my $data=join ";",@data;
+	$molecules{$chr}{$start}{$end}{$strand} = $data;#chr#start#end#strand#add Tags information
+	#print "$chr\t$start\t$end\n";
+}
+
+close GFF;
+
+warn "Data loaded...\nBuilding graphs and finding loci\nPlease be patient, this can take a while...\n";
+
+my @sample=split/\#/,$mark;
+$mark=join"\"\t\"",@sample;
+open OUT, ">$output_file";
+print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
+open CLUSTER,">$temp";
+print CLUSTER "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";
+foreach my $chromosome (keys %molecules){
+	my $g = new Boost::Graph(directed=>0);
+	my @starts = keys(%{$molecules{$chromosome}} );
+	@starts = sort {$a <=> $b} @starts;  
+
+	while (my $srna_start = shift @starts){   ## work from left most sRNA to right most, add to graph if they close enough
+
+
+		foreach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){
+
+
+			###use new graph if the next srna is too far away from this one.. 
+			if(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){
+
+
+				##dump the info from the old graph
+				if (scalar(@{$g->get_nodes()}) > 2){
+
+					my $cluster_coeff = get_cc($g);
+					if ($cluster_coeff >= $clus){
+					 dump_locus($g, $cluster_coeff);
+					}
+				}
+
+
+				$g = new Boost::Graph(directed=>0);
+
+			}
+
+			foreach my $e (keys %{$molecules{$chromosome}{$srna_start}}){   ### extra bit because all loci with same start and different end overlap by definition. but are not collected by main search below
+
+				unless ($e eq $srna_end){
+					my $sn = $chromosome. ':' . $srna_start . ':' . $srna_end;   ## turn coordinate of sRNA inro a node name
+					my $en = $chromosome. ':' . $srna_start . ':' . $e;
+					$g->add_edge(node1=>"$sn", node2=>"$en", weight=>'1');
+				} 
+
+			}
+
+			foreach my $start (@starts){  ##build graph of overlaps
+				my $new = 0;
+				last if $start - $min_inc > $srna_end;
+				if ($start - $min_inc <= $srna_end){
+
+					my $start_node = $chromosome . ':' . $srna_start . ':' . $srna_end;
+					foreach my $end (keys %{$molecules{$chromosome}{$start}}){
+
+						my $end_node = $chromosome . ':' . $start . ':' . $end;
+						$g->add_edge(node1=>"$start_node", node2=>"$end_node", weight=>'1');
+					}
+
+				}
+			}
+		}
+
+		if (!(defined $starts[0])) {
+			##dump the info from the last graph
+			if (scalar(@{$g->get_nodes()}) > 2){
+
+				my $cluster_coeff = get_cc($g);
+				if ($cluster_coeff >= $clus){
+				 dump_locus($g, $cluster_coeff);
+				}
+			}
+		}
+	}
+}
+
+warn "Loci printed\nFinished\n";
+
+my $endtime = time;
+
+my $elapsed = $endtime - $starttime;
+
+warn "Time elapsed = $elapsed s\n";	
+close OUT;
+close CLUSTER;
+#########################################################################################
+sub get_cc{   ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2
+
+	my $graph = shift;
+
+	my @component = @{$graph->get_nodes()}; #number of nodes
+	my @clustering_coefficients;
+
+	foreach my $vertex (@component)
+	{
+
+		my @neighbours = @{$graph->neighbors($vertex)};
+
+		my %edges_in_graph;
+
+		my $n = @neighbours; #n = the number of neighbours
+		my $k = ($n * ($n - 1))/2;	#k = total number of possible connections
+
+		my $e= 0; #actual number of connections within sub-graph
+
+		foreach my $neighbour (@neighbours)
+		{
+			foreach my $neighbour_2 (@neighbours)
+			{
+			my $edge1 = "$neighbour\t$neighbour_2";
+			my $edge2 = "$neighbour_2\t$neighbour";
+				unless (exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2})
+				{
+					if ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour))
+					{
+						++$e;
+						$edges_in_graph{$edge1}=1;
+						$edges_in_graph{$edge2}=1;
+					}
+				}
+			}
+		}
+
+		if ($k >= 1)
+		{	
+		my $c = $e / $k;
+		push @clustering_coefficients, $c;
+		}
+		else {push @clustering_coefficients, '0';}
+	}
+
+	my $graph_n = scalar(@clustering_coefficients);
+	my $graph_cc = 0;
+	foreach my $cc (@clustering_coefficients){ 
+
+		$graph_cc = $graph_cc + $cc;
+
+	}
+	$graph_cc = $graph_cc / $graph_n;
+
+	return $graph_cc;
+}
+
+############################################################################################################
+
+sub dump_locus{
+
+	my $g = shift;
+	my $cc = shift;
+	my $chr;
+	my $start = 1000000000000000000000000000000000000000000000; 
+	my $end = -1;
+	my @sample;
+	my @tag;
+	foreach my $node (@{$g->get_nodes()}){
+
+		$node =~ m/^(\S+):(\d+):(\d+)$/;
+		my $c=$1;
+		my $s=$2;
+		my $e=$3;
+	#	my @data;
+		foreach my $str (keys %{$molecules{$c}{$s}{$e}}) {
+			my @data=split(/;/,$molecules{$c}{$s}{$e}{$str});
+			push @tag,($s.",".$e.",".$str.",".$data[-1]);
+#			for (my $i=0;$i<$#old_data ;$i++) {
+#				$data[$i]+=$old_data[$i];
+#			}
+			my $length=$e-$s+1;
+			if ($length>30) {
+				$length=40;
+			}
+			push @data,$length;
+			my $data=join ";",@data;#sample_exp/total_exp/length;
+			push @sample,$data;
+		}
+
+		$chr = $c;
+		$start = $s if $s < $start;
+		$end = $e if $e > $end;
+	}
+	my $tag=join";",@tag;
+	my $tag_number=@tag;
+	my ($max_length,$max_p,@cluster_exp)=Max_length(\@sample);
+	if ($max_length==40) {
+		$max_length="\>30";
+	}
+	my $cluster_exp=join"\t",@cluster_exp;
+	my $gff = $chr."\:$start\-$end\t".$max_length."nt\t".$max_p."\t" . $cluster_exp;
+	print CLUSTER "$chr\:$start\-$end\t$max_length"."nt\t$max_p\t$tag_number\t$tag\n";
+	print OUT $gff, "\n";  
+}
+
+sub Max_length{
+	my @exp=@{$_[0]};
+	my %sample_length;
+	my $total_exp;
+	my @each;
+	for (my $i=0;$i<=$#exp ;$i++) {
+		my @tag=split/;/,$exp[$i];
+		my $length=pop(@tag);
+		my $exp=pop(@tag);
+		$sample_length{$length}+=$exp;
+		$total_exp+=$exp;
+		for (my $j=0;$j<=$#tag ;$j++) {
+			$each[$j]+=$tag[$j];
+		}
+	}
+	my $max=0;
+	my $max_key;
+	foreach my $key (sort keys %sample_length) {
+		my $p=$sample_length{$key}/$total_exp;
+		if ($p>$max) {
+			$max=$p;
+			$max_key=$key;
+		}
+		$sample_length{$key}=sprintf("%.2f",$p);
+	}
+	return($max_key,$sample_length{$max_key},@each);
+}