diff collapseReads2Tags.pl @ 6:e08814f01490 draft

Uploaded
author big-tiandm
date Fri, 25 Jul 2014 05:19:35 -0400
parents ea2fdf667620
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/collapseReads2Tags.pl	Fri Jul 25 05:19:35 2014 -0400
@@ -0,0 +1,170 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2014-3-20
+#Modified:
+#Description: fastq file form reads cluster(the same sequence in the same cluster)
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+
+my %opts;
+GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h");
+if (!(defined $opts{o} and defined $opts{'format'})  || defined $opts{h}) { #necessary arguments
+&usage;
+}
+my @filein=@{$opts{i}} if(defined $opts{i});
+my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq";
+my $fileout=$opts{'o'};
+my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33;
+my %hash;##分块存放原始序列
+
+my $format=$opts{'format'};
+if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
+	die "Parameter -format is error!\n";
+}
+
+my ($qualT,$qualV);
+if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) {  #quality filter
+	my @temp=split /:/,$opts{'qual'};
+	$qualT=$temp[0];
+	$qualV=$temp[1];
+
+	for (my $i=0;$i<@filein;$i++) {
+		open IN,"<$filein[$i]";
+		while (my $aline=<IN>) {
+			my $seq=<IN>;
+			my $n=<IN>;
+			my $qv=<IN>;
+			my $tag=&qvcheck($qv,$qualT,$qualV);
+			next if(!$tag);
+			my $str=substr($seq,0,6);
+			$hash{$str}[$i].=$seq;
+		}
+		close IN;
+	}
+}
+elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads
+	for (my $i=0;$i<@filein;$i++) {
+		open IN,"<$filein[$i]";
+		while (my $aline=<IN>) {
+			my $seq=<IN>;
+			my $n=<IN>;
+			my $qv=<IN>;
+			my $str=substr($seq,0,6);
+			$hash{$str}[$i].=$seq;
+		}
+		close IN;
+	}
+
+}
+elsif($format eq "fasta" || $format eq "fa"){
+	for (my $i=0;$i<@filein;$i++) {
+		open IN,"<$filein[$i]";
+		while (my $aline=<IN>) {
+			my $seq=<IN>;
+			my $str=substr($seq,0,6);
+			$hash{$str}[$i].=$seq;
+		}
+		close IN;
+	}
+}
+
+open OUT,">$fileout"; #output file  
+my $count=0;
+foreach my $key (keys %hash) {
+	my %cluster;
+	for (my $i=0;$i<@filein;$i++) {
+		next if(!(defined $hash{$key}[$i]));
+		my @tmp=split/\n/,$hash{$key}[$i];
+		foreach  (@tmp) {
+			$cluster{$_}[$i]++;
+		}
+	}
+	
+	foreach my $seq (keys %cluster) {
+		my $exp=""; my $ee=0;
+		for (my $i=0;$i<@filein;$i++) {
+			if (defined $cluster{$seq}[$i]) {
+				$exp.="_$cluster{$seq}[$i]";
+				$ee+=$cluster{$seq}[$i];
+			}else{
+				$exp.="_0";
+			}
+		}
+		$count+=$ee;
+		$exp=~s/^_//;
+		print OUT ">$name","_$count:$exp","_x$ee\n$seq\n";
+	}
+}
+close OUT;
+
+
+sub qvcheck{
+	my ($str,$t,$v)=@_;
+	my $qv=0;
+	if($t eq "mean"){ 
+		$qv=&getMeanQuality($str);
+	}
+	elsif($t eq "min"){
+		$qv=&getMinQuality($str);
+	}
+	if ($qv<$v) {
+		return 0;
+	}
+	return 1;
+}
+
+sub getMeanQuality(){
+	chomp $_[0];
+	my @bases = split(//,$_[0]);
+	my $sum = 0;
+	for(my $i = 0; $i <= $#bases; $i++){
+		my $num = ord($bases[$i]) - $pq;
+		$sum += $num;
+	}
+	
+	return $sum/($#bases+1);
+	
+}
+
+###
+### This function gives back the Q-value of the worst base
+sub getMinQuality(){
+	chomp $_[0];
+	my @bases = split(//,$_[0]);
+	my $worst = 1000;
+	for(my $i = 0; $i <= $#bases; $i++){
+#		printf ("base: $bases[$i]  --> %d\n",ord($bases[$i]));
+		my $num = ord($bases[$i]) - $pq;
+		if($num < $worst){
+			$worst = $num;
+		}
+	}
+	return $worst;
+}
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+$0 -i -format -mark -qual -qv -o 
+options:
+-i input file#fastq file ##can be multiple -i file1 -i file2 ...
+-mark string#quary name,default is "seq"
+-o output file
+-format string # fastq|fasta|fq|fa
+
+-qual #reads filter
+   eg:(min:value/mean:value)
+   This parameter just for solexa reads.
+   If the input files are solid and needs filter,please do filter first .
+   
+-qv  integer #Phred quality64/33,default 33
+-h help
+USAGE
+exit(1);
+}
+