2
|
1 #!/usr/bin/perl -w
|
|
2 #Filename:
|
|
3 #Author: Tian Dongmei
|
|
4 #Email: tiandm@big.ac.cn
|
|
5 #Date: 2014-3-20
|
|
6 #Modified:
|
|
7 #Description: fastq file form reads cluster(the same sequence in the same cluster)
|
|
8 my $version=1.00;
|
|
9
|
|
10 use strict;
|
|
11 use Getopt::Long;
|
|
12
|
|
13 my %opts;
|
|
14 GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h");
|
|
15 if (!(defined $opts{o} and defined $opts{'format'}) || defined $opts{h}) { #necessary arguments
|
|
16 &usage;
|
|
17 }
|
|
18 my @filein=@{$opts{i}} if(defined $opts{i});
|
|
19 my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq";
|
|
20 my $fileout=$opts{'o'};
|
|
21 my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33;
|
|
22 my %hash;##分块存放原始序列
|
|
23
|
|
24 my $format=$opts{'format'};
|
|
25 if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
|
|
26 die "Parameter -format is error!\n";
|
|
27 }
|
|
28
|
|
29 my ($qualT,$qualV);
|
|
30 if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) { #quality filter
|
|
31 my @temp=split /:/,$opts{'qual'};
|
|
32 $qualT=$temp[0];
|
|
33 $qualV=$temp[1];
|
|
34
|
|
35 for (my $i=0;$i<@filein;$i++) {
|
|
36 open IN,"<$filein[$i]";
|
|
37 while (my $aline=<IN>) {
|
|
38 my $seq=<IN>;
|
|
39 my $n=<IN>;
|
|
40 my $qv=<IN>;
|
|
41 my $tag=&qvcheck($qv,$qualT,$qualV);
|
|
42 next if(!$tag);
|
|
43 my $str=substr($seq,0,6);
|
|
44 $hash{$str}[$i].=$seq;
|
|
45 }
|
|
46 close IN;
|
|
47 }
|
|
48 }
|
|
49 elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads
|
|
50 for (my $i=0;$i<@filein;$i++) {
|
|
51 open IN,"<$filein[$i]";
|
|
52 while (my $aline=<IN>) {
|
|
53 my $seq=<IN>;
|
|
54 my $n=<IN>;
|
|
55 my $qv=<IN>;
|
|
56 my $str=substr($seq,0,6);
|
|
57 $hash{$str}[$i].=$seq;
|
|
58 }
|
|
59 close IN;
|
|
60 }
|
|
61
|
|
62 }
|
|
63 elsif($format eq "fasta" || $format eq "fa"){
|
|
64 for (my $i=0;$i<@filein;$i++) {
|
|
65 open IN,"<$filein[$i]";
|
|
66 while (my $aline=<IN>) {
|
|
67 my $seq=<IN>;
|
|
68 my $str=substr($seq,0,6);
|
|
69 $hash{$str}[$i].=$seq;
|
|
70 }
|
|
71 close IN;
|
|
72 }
|
|
73 }
|
|
74
|
|
75 open OUT,">$fileout"; #output file
|
|
76 my $count=0;
|
|
77 foreach my $key (keys %hash) {
|
|
78 my %cluster;
|
|
79 for (my $i=0;$i<@filein;$i++) {
|
|
80 next if(!(defined $hash{$key}[$i]));
|
|
81 my @tmp=split/\n/,$hash{$key}[$i];
|
|
82 foreach (@tmp) {
|
|
83 $cluster{$_}[$i]++;
|
|
84 }
|
|
85 }
|
|
86
|
|
87 foreach my $seq (keys %cluster) {
|
|
88 my $exp=""; my $ee=0;
|
|
89 for (my $i=0;$i<@filein;$i++) {
|
|
90 if (defined $cluster{$seq}[$i]) {
|
|
91 $exp.="_$cluster{$seq}[$i]";
|
|
92 $ee+=$cluster{$seq}[$i];
|
|
93 }else{
|
|
94 $exp.="_0";
|
|
95 }
|
|
96 }
|
|
97 $count+=$ee;
|
|
98 $exp=~s/^_//;
|
|
99 print OUT ">$name","_$count:$exp","_x$ee\n$seq\n";
|
|
100 }
|
|
101 }
|
|
102 close OUT;
|
|
103
|
|
104
|
|
105 sub qvcheck{
|
|
106 my ($str,$t,$v)=@_;
|
|
107 my $qv=0;
|
|
108 if($t eq "mean"){
|
|
109 $qv=&getMeanQuality($str);
|
|
110 }
|
|
111 elsif($t eq "min"){
|
|
112 $qv=&getMinQuality($str);
|
|
113 }
|
|
114 if ($qv<$v) {
|
|
115 return 0;
|
|
116 }
|
|
117 return 1;
|
|
118 }
|
|
119
|
|
120 sub getMeanQuality(){
|
|
121 chomp $_[0];
|
|
122 my @bases = split(//,$_[0]);
|
|
123 my $sum = 0;
|
|
124 for(my $i = 0; $i <= $#bases; $i++){
|
|
125 my $num = ord($bases[$i]) - $pq;
|
|
126 $sum += $num;
|
|
127 }
|
|
128
|
|
129 return $sum/($#bases+1);
|
|
130
|
|
131 }
|
|
132
|
|
133 ###
|
|
134 ### This function gives back the Q-value of the worst base
|
|
135 sub getMinQuality(){
|
|
136 chomp $_[0];
|
|
137 my @bases = split(//,$_[0]);
|
|
138 my $worst = 1000;
|
|
139 for(my $i = 0; $i <= $#bases; $i++){
|
|
140 # printf ("base: $bases[$i] --> %d\n",ord($bases[$i]));
|
|
141 my $num = ord($bases[$i]) - $pq;
|
|
142 if($num < $worst){
|
|
143 $worst = $num;
|
|
144 }
|
|
145 }
|
|
146 return $worst;
|
|
147 }
|
|
148
|
|
149 sub usage{
|
|
150 print <<"USAGE";
|
|
151 Version $version
|
|
152 Usage:
|
|
153 $0 -i -format -mark -qual -qv -o
|
|
154 options:
|
|
155 -i input file#fastq file ##can be multiple -i file1 -i file2 ...
|
|
156 -mark string#quary name,default is "seq"
|
|
157 -o output file
|
|
158 -format string # fastq|fasta|fq|fa
|
|
159
|
|
160 -qual #reads filter
|
|
161 eg:(min:value/mean:value)
|
|
162 This parameter just for solexa reads.
|
|
163 If the input files are solid and needs filter,please do filter first .
|
|
164
|
|
165 -qv integer #Phred quality64/33,default 33
|
|
166 -h help
|
|
167 USAGE
|
|
168 exit(1);
|
|
169 }
|
|
170
|