Mercurial > repos > matteoc > agame_custom_tools
diff fosm_cluster/compute.stats.pl @ 0:68a3648c7d91 draft default tip
Uploaded
author | matteoc |
---|---|
date | Thu, 22 Dec 2016 04:45:31 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fosm_cluster/compute.stats.pl Thu Dec 22 04:45:31 2016 -0500 @@ -0,0 +1,81 @@ +#!/usr/bin/perl -w +$file=shift; +$ncluster=shift; +$ofile=shift; +$fasfile=shift; +$flist="4mers.list"; +open(IN,$flist); +while(<IN>) +{ + chomp; + push (@let,$_); +} +close(IN); +open(OUT,">$file.stats"); +open(IN,$file); +%c=(); +$name=(split(/\./,$file))[0]; +while(<IN>) +{ + chomp; + if ($_=~/^>(.*)/) + { + $id=$1; + $M{$id}=$name; + }else{ + $c{$id}.=$_; + } +} +close(IN); +foreach $s (keys %c) +{ + ($len,$cov)=(split(/\_/,$s))[3,5]; + $seq=$c{$s}; + $at=0; + $gc=0; + $le=length($seq); + $tt=0; + %DD=(); + @seq=(split('',$c{$s})); + + foreach $l (@seq) + { + if ($l eq "A" || $l eq "T") + { + $at++; + }else{ + $gc++ + } + } + for ($i=0;$i<=length($seq)-4;$i++) + { + $subs=substr($seq,$i,4); + $rc=reverse($subs); + $rc=~tr/ACGT/TGCA/; + $tt+=2; + $DD{$subs}++; + $DD{$rc}++; + } + $gc=$gc/$le; + next unless $len > 1000; + print OUT "$s\t$gc\t$cov\t"; + foreach $L (sort @let) + { + $val=$DD{$L} ? $DD{$L}/$tt : 0; + print OUT "$val\t"; + } + print OUT "$M{$s}\n"; +} + +system("./kmeans.R $file.stats $ncluster $ofile ")==0||die("no kmeans"); + +open(OF,$ofile); +$l=<OF>; +open(FAS,">$fasfile"); +while(<OF>) +{ + ($id,$cluster)=(split())[0,1]; + $NC{$cluster}++; + $id=~s/\"//g; + print FAS ">$cluster\_$NC{$cluster}\#$id\n$c{$id}\n"; +}