annotate fosm_cluster/compute.stats.pl @ 0:68a3648c7d91 draft default tip

Uploaded
author matteoc
date Thu, 22 Dec 2016 04:45:31 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
1 #!/usr/bin/perl -w
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
2 $file=shift;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
3 $ncluster=shift;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
4 $ofile=shift;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
5 $fasfile=shift;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
6 $flist="4mers.list";
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
7 open(IN,$flist);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
8 while(<IN>)
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
9 {
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
10 chomp;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
11 push (@let,$_);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
12 }
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
13 close(IN);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
14 open(OUT,">$file.stats");
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
15 open(IN,$file);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
16 %c=();
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
17 $name=(split(/\./,$file))[0];
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
18 while(<IN>)
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
19 {
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
20 chomp;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
21 if ($_=~/^>(.*)/)
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
22 {
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
23 $id=$1;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
24 $M{$id}=$name;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
25 }else{
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
26 $c{$id}.=$_;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
27 }
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
28 }
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
29 close(IN);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
30 foreach $s (keys %c)
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
31 {
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
32 ($len,$cov)=(split(/\_/,$s))[3,5];
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
33 $seq=$c{$s};
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
34 $at=0;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
35 $gc=0;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
36 $le=length($seq);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
37 $tt=0;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
38 %DD=();
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
39 @seq=(split('',$c{$s}));
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
40
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
41 foreach $l (@seq)
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
42 {
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
43 if ($l eq "A" || $l eq "T")
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
44 {
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
45 $at++;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
46 }else{
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
47 $gc++
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
48 }
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
49 }
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
50 for ($i=0;$i<=length($seq)-4;$i++)
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
51 {
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
52 $subs=substr($seq,$i,4);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
53 $rc=reverse($subs);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
54 $rc=~tr/ACGT/TGCA/;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
55 $tt+=2;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
56 $DD{$subs}++;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
57 $DD{$rc}++;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
58 }
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
59 $gc=$gc/$le;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
60 next unless $len > 1000;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
61 print OUT "$s\t$gc\t$cov\t";
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
62 foreach $L (sort @let)
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
63 {
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
64 $val=$DD{$L} ? $DD{$L}/$tt : 0;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
65 print OUT "$val\t";
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
66 }
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
67 print OUT "$M{$s}\n";
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
68 }
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
69
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
70 system("./kmeans.R $file.stats $ncluster $ofile ")==0||die("no kmeans");
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
71
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
72 open(OF,$ofile);
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
73 $l=<OF>;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
74 open(FAS,">$fasfile");
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
75 while(<OF>)
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
76 {
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
77 ($id,$cluster)=(split())[0,1];
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
78 $NC{$cluster}++;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
79 $id=~s/\"//g;
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
80 print FAS ">$cluster\_$NC{$cluster}\#$id\n$c{$id}\n";
68a3648c7d91 Uploaded
matteoc
parents:
diff changeset
81 }