# HG changeset patch # User big-tiandm # Date 1406279975 14400 # Node ID e08814f01490e4457ba750c5cb467cfa47f8f35f # Parent 4ad3cbe96ded9b96381a0bed73f506aabcf3cb20 Uploaded diff -r 4ad3cbe96ded -r e08814f01490 collapseReads2Tags.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/collapseReads2Tags.pl Fri Jul 25 05:19:35 2014 -0400 @@ -0,0 +1,170 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-3-20 +#Modified: +#Description: fastq file form reads cluster(the same sequence in the same cluster) +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h"); +if (!(defined $opts{o} and defined $opts{'format'}) || defined $opts{h}) { #necessary arguments +&usage; +} +my @filein=@{$opts{i}} if(defined $opts{i}); +my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq"; +my $fileout=$opts{'o'}; +my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33; +my %hash;##分块存放原始序列 + +my $format=$opts{'format'}; +if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { + die "Parameter -format is error!\n"; +} + +my ($qualT,$qualV); +if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) { #quality filter + my @temp=split /:/,$opts{'qual'}; + $qualT=$temp[0]; + $qualV=$temp[1]; + + for (my $i=0;$i<@filein;$i++) { + open IN,"<$filein[$i]"; + while (my $aline=) { + my $seq=; + my $n=; + my $qv=; + my $tag=&qvcheck($qv,$qualT,$qualV); + next if(!$tag); + my $str=substr($seq,0,6); + $hash{$str}[$i].=$seq; + } + close IN; + } +} +elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads + for (my $i=0;$i<@filein;$i++) { + open IN,"<$filein[$i]"; + while (my $aline=) { + my $seq=; + my $n=; + my $qv=; + my $str=substr($seq,0,6); + $hash{$str}[$i].=$seq; + } + close IN; + } + +} +elsif($format eq "fasta" || $format eq "fa"){ + for (my $i=0;$i<@filein;$i++) { + open IN,"<$filein[$i]"; + while (my $aline=) { + my $seq=; + my $str=substr($seq,0,6); + $hash{$str}[$i].=$seq; + } + close IN; + } +} + +open OUT,">$fileout"; #output file +my $count=0; +foreach my $key (keys %hash) { + my %cluster; + for (my $i=0;$i<@filein;$i++) { + next if(!(defined $hash{$key}[$i])); + my @tmp=split/\n/,$hash{$key}[$i]; + foreach (@tmp) { + $cluster{$_}[$i]++; + } + } + + foreach my $seq (keys %cluster) { + my $exp=""; my $ee=0; + for (my $i=0;$i<@filein;$i++) { + if (defined $cluster{$seq}[$i]) { + $exp.="_$cluster{$seq}[$i]"; + $ee+=$cluster{$seq}[$i]; + }else{ + $exp.="_0"; + } + } + $count+=$ee; + $exp=~s/^_//; + print OUT ">$name","_$count:$exp","_x$ee\n$seq\n"; + } +} +close OUT; + + +sub qvcheck{ + my ($str,$t,$v)=@_; + my $qv=0; + if($t eq "mean"){ + $qv=&getMeanQuality($str); + } + elsif($t eq "min"){ + $qv=&getMinQuality($str); + } + if ($qv<$v) { + return 0; + } + return 1; +} + +sub getMeanQuality(){ + chomp $_[0]; + my @bases = split(//,$_[0]); + my $sum = 0; + for(my $i = 0; $i <= $#bases; $i++){ + my $num = ord($bases[$i]) - $pq; + $sum += $num; + } + + return $sum/($#bases+1); + +} + +### +### This function gives back the Q-value of the worst base +sub getMinQuality(){ + chomp $_[0]; + my @bases = split(//,$_[0]); + my $worst = 1000; + for(my $i = 0; $i <= $#bases; $i++){ +# printf ("base: $bases[$i] --> %d\n",ord($bases[$i])); + my $num = ord($bases[$i]) - $pq; + if($num < $worst){ + $worst = $num; + } + } + return $worst; +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -format -mark -qual -qv -o +options: +-i input file#fastq file ##can be multiple -i file1 -i file2 ... +-mark string#quary name,default is "seq" +-o output file +-format string # fastq|fasta|fq|fa + +-qual #reads filter + eg:(min:value/mean:value) + This parameter just for solexa reads. + If the input files are solid and needs filter,please do filter first . + +-qv integer #Phred quality64/33,default 33 +-h help +USAGE +exit(1); +} +