0
|
1 #!/usr/bin/perl -w
|
|
2 $file=shift;
|
|
3 $ncluster=shift;
|
|
4 $ofile=shift;
|
|
5 $fasfile=shift;
|
|
6 $flist="4mers.list";
|
|
7 open(IN,$flist);
|
|
8 while(<IN>)
|
|
9 {
|
|
10 chomp;
|
|
11 push (@let,$_);
|
|
12 }
|
|
13 close(IN);
|
|
14 open(OUT,">$file.stats");
|
|
15 open(IN,$file);
|
|
16 %c=();
|
|
17 $name=(split(/\./,$file))[0];
|
|
18 while(<IN>)
|
|
19 {
|
|
20 chomp;
|
|
21 if ($_=~/^>(.*)/)
|
|
22 {
|
|
23 $id=$1;
|
|
24 $M{$id}=$name;
|
|
25 }else{
|
|
26 $c{$id}.=$_;
|
|
27 }
|
|
28 }
|
|
29 close(IN);
|
|
30 foreach $s (keys %c)
|
|
31 {
|
|
32 ($len,$cov)=(split(/\_/,$s))[3,5];
|
|
33 $seq=$c{$s};
|
|
34 $at=0;
|
|
35 $gc=0;
|
|
36 $le=length($seq);
|
|
37 $tt=0;
|
|
38 %DD=();
|
|
39 @seq=(split('',$c{$s}));
|
|
40
|
|
41 foreach $l (@seq)
|
|
42 {
|
|
43 if ($l eq "A" || $l eq "T")
|
|
44 {
|
|
45 $at++;
|
|
46 }else{
|
|
47 $gc++
|
|
48 }
|
|
49 }
|
|
50 for ($i=0;$i<=length($seq)-4;$i++)
|
|
51 {
|
|
52 $subs=substr($seq,$i,4);
|
|
53 $rc=reverse($subs);
|
|
54 $rc=~tr/ACGT/TGCA/;
|
|
55 $tt+=2;
|
|
56 $DD{$subs}++;
|
|
57 $DD{$rc}++;
|
|
58 }
|
|
59 $gc=$gc/$le;
|
|
60 next unless $len > 1000;
|
|
61 print OUT "$s\t$gc\t$cov\t";
|
|
62 foreach $L (sort @let)
|
|
63 {
|
|
64 $val=$DD{$L} ? $DD{$L}/$tt : 0;
|
|
65 print OUT "$val\t";
|
|
66 }
|
|
67 print OUT "$M{$s}\n";
|
|
68 }
|
|
69
|
|
70 system("./kmeans.R $file.stats $ncluster $ofile ")==0||die("no kmeans");
|
|
71
|
|
72 open(OF,$ofile);
|
|
73 $l=<OF>;
|
|
74 open(FAS,">$fasfile");
|
|
75 while(<OF>)
|
|
76 {
|
|
77 ($id,$cluster)=(split())[0,1];
|
|
78 $NC{$cluster}++;
|
|
79 $id=~s/\"//g;
|
|
80 print FAS ">$cluster\_$NC{$cluster}\#$id\n$c{$id}\n";
|
|
81 }
|