diff conventional.pl @ 23:cad6a1dfb174 draft

Uploaded
author big-tiandm
date Wed, 05 Nov 2014 21:11:49 -0500
parents 07745c0958dd
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/conventional.pl	Wed Nov 05 21:11:49 2014 -0500
@@ -0,0 +1,156 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Chentt
+#Email: chentt@big.ac.cn
+#Date: 2014/04/09
+#Modified:
+#Description: islands merged of merged samples 
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i=s","d=i","o=s","N=i","t=s","mark=s","h");
+if (!(defined $opts{i} and defined $opts{d} and defined $opts{N} and defined $opts{mark} and defined $opts{t} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $filein=$opts{'i'};
+my $fileout=$opts{'o'};
+my $distance=$opts{'d'};
+my $tempout=$opts{'t'};
+my $mark=$opts{'mark'};
+my @sample=split/\#/,$mark;
+$mark=join"\"\t\"",@sample;
+
+open IN,"<$filein"; #input file  
+open OUT,">$fileout"; #output file  
+print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
+open TMP,">$tempout";
+print TMP "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";
+my %hash;
+while (my $aline=<IN>) {
+	chomp $aline;
+	if($aline=~/^\#/){
+		#print OUT "$aline\n";
+		next;
+	}
+	my @tmp=split/\t/,$aline;
+	my $chr=shift @tmp;
+	#shift @tmp;
+	push @{$hash{$chr}},[@tmp];
+}
+
+close IN;
+
+foreach my $key (keys %hash) {
+	my @tag=sort{$a->[1] <=> $b->[1]} @{$hash{$key}};
+	my @sample;
+	my $start=$tag[0][1];
+	my $end=$tag[0][2];
+	push @sample,[@{$tag[0]}];
+	for (my $i=1;$i<@tag-1;$i++) {
+		if ($tag[$i][1]-$end<=$distance) {
+			if ($tag[$i][2]>$end) {
+				$end=$tag[$i][2];
+			}
+			push @sample,[@{$tag[$i]}];
+		}
+		else{
+			my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
+			my $cluster_exp=join"\t",@cluster_exp;
+			if ($max_length>30) {
+				print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
+				$max_length="\>30";
+			}
+			else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
+			print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";			
+			$start=$tag[$i][1];
+			$end=$tag[$i][2];
+
+			@sample=();
+			push @sample,[@{$tag[$i]}];
+		}
+	}
+	if ($tag[$#tag][1]-$end<=$distance) {
+			if ($tag[$#tag][2]>$end) {
+				$end=$tag[$#tag][2];
+			}
+			push @sample,[@{$tag[$#tag]}];
+			my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
+			my $cluster_exp=join"\t",@cluster_exp;
+			if ($max_length>30) {
+				$max_length="\>30";
+				print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
+			}
+			else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
+			print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";
+	}
+	else{
+		my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
+		my $cluster_exp=join"\t",@cluster_exp;
+		if ($max_length>30) {
+				$max_length="\>30";
+				print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
+		}
+		else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
+		print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";
+
+	}
+}
+close OUT;
+close TMP;
+sub Max_length{
+	my @exp=@{$_[0]};
+	my %sample_length;
+	my $total_exp;
+	my @each;
+	my @tag;
+	for (my $i=0;$i<=$#exp ;$i++) {
+		my $length=$exp[$i][2]-$exp[$i][1]+1;
+		#if ($length>30) {
+		#	$length=40;
+		#}
+		my $exp=0;
+		foreach  (1..$opts{'N'}) {
+			$exp+=$exp[$i][$_+2];
+			$each[$_-1]+=$exp[$i][$_+2];
+		}	
+		$sample_length{$length}+=$exp;
+		$total_exp+=$exp;
+		push @tag,($exp[$i][1].",".$exp[$i][2].",".$exp[$i][0].",".$exp);
+	}
+	my $max=0;
+	my $max_key;
+	foreach my $key (sort keys %sample_length) {
+		my $p=$sample_length{$key}/$total_exp;
+		if ($p>$max) {
+			$max=$p;
+			$max_key=$key;
+		}
+		$sample_length{$key}=sprintf("%.2f",$p);
+	}
+	my $tag_n=@tag;
+	my $tag=join";",@tag;
+	$tag=$tag_n."\t".$tag;
+	return($max_key,$sample_length{$max_key},$tag,@each);
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -o -d -N -t -mark
+options:
+-i input file
+-d distance of two islands
+-mark sample name;
+-o output file
+-N sample number
+-t temp output file
+-h help
+USAGE
+exit(1);
+}
+