diff fosm_cluster/compute.stats.pl @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fosm_cluster/compute.stats.pl	Thu Dec 22 04:45:31 2016 -0500
@@ -0,0 +1,81 @@
+#!/usr/bin/perl -w
+$file=shift;
+$ncluster=shift;
+$ofile=shift;
+$fasfile=shift;
+$flist="4mers.list";
+open(IN,$flist);
+while(<IN>)
+{
+        chomp;
+        push (@let,$_);
+}
+close(IN);
+open(OUT,">$file.stats");
+open(IN,$file);
+%c=();
+$name=(split(/\./,$file))[0];
+while(<IN>)
+{
+	chomp;
+	if ($_=~/^>(.*)/)
+	{
+		$id=$1;
+		$M{$id}=$name;
+	}else{
+		$c{$id}.=$_;
+	}
+}
+close(IN);
+foreach $s (keys %c)
+{
+	($len,$cov)=(split(/\_/,$s))[3,5];
+	$seq=$c{$s};
+        $at=0;
+        $gc=0;
+        $le=length($seq);
+        $tt=0;
+        %DD=();
+	@seq=(split('',$c{$s}));
+
+        foreach $l (@seq)
+        {
+                if ($l eq "A" || $l eq "T")
+                {
+                        $at++;
+                }else{
+                        $gc++
+                }
+        }
+        for ($i=0;$i<=length($seq)-4;$i++)
+        {
+                $subs=substr($seq,$i,4);
+                $rc=reverse($subs);
+                $rc=~tr/ACGT/TGCA/;
+                $tt+=2;
+                $DD{$subs}++;
+                $DD{$rc}++;
+        }
+        $gc=$gc/$le;
+        next unless $len > 1000;
+        print OUT "$s\t$gc\t$cov\t";
+	foreach $L (sort @let)
+        {
+                $val=$DD{$L} ? $DD{$L}/$tt : 0;
+                print OUT "$val\t";
+        }
+        print OUT "$M{$s}\n";
+}
+
+system("./kmeans.R $file.stats $ncluster $ofile ")==0||die("no kmeans");
+
+open(OF,$ofile);
+$l=<OF>;
+open(FAS,">$fasfile");
+while(<OF>)
+{
+	($id,$cluster)=(split())[0,1];
+	$NC{$cluster}++;
+	$id=~s/\"//g;
+	print FAS ">$cluster\_$NC{$cluster}\#$id\n$c{$id}\n";		
+}