| 0 | 1 #!/usr/bin/perl -w | 
|  | 2 $file=shift; | 
|  | 3 $ncluster=shift; | 
|  | 4 $ofile=shift; | 
|  | 5 $fasfile=shift; | 
|  | 6 $flist="4mers.list"; | 
|  | 7 open(IN,$flist); | 
|  | 8 while(<IN>) | 
|  | 9 { | 
|  | 10         chomp; | 
|  | 11         push (@let,$_); | 
|  | 12 } | 
|  | 13 close(IN); | 
|  | 14 open(OUT,">$file.stats"); | 
|  | 15 open(IN,$file); | 
|  | 16 %c=(); | 
|  | 17 $name=(split(/\./,$file))[0]; | 
|  | 18 while(<IN>) | 
|  | 19 { | 
|  | 20 	chomp; | 
|  | 21 	if ($_=~/^>(.*)/) | 
|  | 22 	{ | 
|  | 23 		$id=$1; | 
|  | 24 		$M{$id}=$name; | 
|  | 25 	}else{ | 
|  | 26 		$c{$id}.=$_; | 
|  | 27 	} | 
|  | 28 } | 
|  | 29 close(IN); | 
|  | 30 foreach $s (keys %c) | 
|  | 31 { | 
|  | 32 	($len,$cov)=(split(/\_/,$s))[3,5]; | 
|  | 33 	$seq=$c{$s}; | 
|  | 34         $at=0; | 
|  | 35         $gc=0; | 
|  | 36         $le=length($seq); | 
|  | 37         $tt=0; | 
|  | 38         %DD=(); | 
|  | 39 	@seq=(split('',$c{$s})); | 
|  | 40 | 
|  | 41         foreach $l (@seq) | 
|  | 42         { | 
|  | 43                 if ($l eq "A" || $l eq "T") | 
|  | 44                 { | 
|  | 45                         $at++; | 
|  | 46                 }else{ | 
|  | 47                         $gc++ | 
|  | 48                 } | 
|  | 49         } | 
|  | 50         for ($i=0;$i<=length($seq)-4;$i++) | 
|  | 51         { | 
|  | 52                 $subs=substr($seq,$i,4); | 
|  | 53                 $rc=reverse($subs); | 
|  | 54                 $rc=~tr/ACGT/TGCA/; | 
|  | 55                 $tt+=2; | 
|  | 56                 $DD{$subs}++; | 
|  | 57                 $DD{$rc}++; | 
|  | 58         } | 
|  | 59         $gc=$gc/$le; | 
|  | 60         next unless $len > 1000; | 
|  | 61         print OUT "$s\t$gc\t$cov\t"; | 
|  | 62 	foreach $L (sort @let) | 
|  | 63         { | 
|  | 64                 $val=$DD{$L} ? $DD{$L}/$tt : 0; | 
|  | 65                 print OUT "$val\t"; | 
|  | 66         } | 
|  | 67         print OUT "$M{$s}\n"; | 
|  | 68 } | 
|  | 69 | 
|  | 70 system("./kmeans.R $file.stats $ncluster $ofile ")==0||die("no kmeans"); | 
|  | 71 | 
|  | 72 open(OF,$ofile); | 
|  | 73 $l=<OF>; | 
|  | 74 open(FAS,">$fasfile"); | 
|  | 75 while(<OF>) | 
|  | 76 { | 
|  | 77 	($id,$cluster)=(split())[0,1]; | 
|  | 78 	$NC{$cluster}++; | 
|  | 79 	$id=~s/\"//g; | 
|  | 80 	print FAS ">$cluster\_$NC{$cluster}\#$id\n$c{$id}\n"; | 
|  | 81 } |