Mercurial > repos > big-tiandm > mirplant2

--- a/collapseReads2Tags.pl	Fri Jul 25 05:17:20 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,170 +0,0 @@
-#!/usr/bin/perl -w
-#Filename:
-#Author: Tian Dongmei
-#Email: tiandm@big.ac.cn
-#Date: 2014-3-20
-#Modified:
-#Description: fastq file form reads cluster(the same sequence in the same cluster)
-my $version=1.00;
-
-use strict;
-use Getopt::Long;
-
-my %opts;
-GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h");
-if (!(defined $opts{o} and defined $opts{'format'})  || defined $opts{h}) { #necessary arguments
-&usage;
-}
-my @filein=@{$opts{i}} if(defined $opts{i});
-my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq";
-my $fileout=$opts{'o'};
-my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33;
-my %hash;##分块存放原始序列
-
-my $format=$opts{'format'};
-if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
-	die "Parameter -format is error!\n";
-}
-
-my ($qualT,$qualV);
-if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) {  #quality filter
-	my @temp=split /:/,$opts{'qual'};
-	$qualT=$temp[0];
-	$qualV=$temp[1];
-
-	for (my $i=0;$i<@filein;$i++) {
-		open IN,"<$filein[$i]";
-		while (my $aline=<IN>) {
-			my $seq=<IN>;
-			my $n=<IN>;
-			my $qv=<IN>;
-			my $tag=&qvcheck($qv,$qualT,$qualV);
-			next if(!$tag);
-			my $str=substr($seq,0,6);
-			$hash{$str}[$i].=$seq;
-		}
-		close IN;
-	}
-}
-elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads
-	for (my $i=0;$i<@filein;$i++) {
-		open IN,"<$filein[$i]";
-		while (my $aline=<IN>) {
-			my $seq=<IN>;
-			my $n=<IN>;
-			my $qv=<IN>;
-			my $str=substr($seq,0,6);
-			$hash{$str}[$i].=$seq;
-		}
-		close IN;
-	}
-
-}
-elsif($format eq "fasta" || $format eq "fa"){
-	for (my $i=0;$i<@filein;$i++) {
-		open IN,"<$filein[$i]";
-		while (my $aline=<IN>) {
-			my $seq=<IN>;
-			my $str=substr($seq,0,6);
-			$hash{$str}[$i].=$seq;
-		}
-		close IN;
-	}
-}
-
-open OUT,">$fileout"; #output file
-my $count=0;
-foreach my $key (keys %hash) {
-	my %cluster;
-	for (my $i=0;$i<@filein;$i++) {
-		next if(!(defined $hash{$key}[$i]));
-		my @tmp=split/\n/,$hash{$key}[$i];
-		foreach  (@tmp) {
-			$cluster{$_}[$i]++;
-		}
-	}
-
-	foreach my $seq (keys %cluster) {
-		my $exp=""; my $ee=0;
-		for (my $i=0;$i<@filein;$i++) {
-			if (defined $cluster{$seq}[$i]) {
-				$exp.="_$cluster{$seq}[$i]";
-				$ee+=$cluster{$seq}[$i];
-			}else{
-				$exp.="_0";
-			}
-		}
-		$count+=$ee;
-		$exp=~s/^_//;
-		print OUT ">$name","_$count:$exp","_x$ee\n$seq\n";
-	}
-}
-close OUT;
-
-
-sub qvcheck{
-	my ($str,$t,$v)=@_;
-	my $qv=0;
-	if($t eq "mean"){
-		$qv=&getMeanQuality($str);
-	}
-	elsif($t eq "min"){
-		$qv=&getMinQuality($str);
-	}
-	if ($qv<$v) {
-		return 0;
-	}
-	return 1;
-}
-
-sub getMeanQuality(){
-	chomp $_[0];
-	my @bases = split(//,$_[0]);
-	my $sum = 0;
-	for(my $i = 0; $i <= $#bases; $i++){
-		my $num = ord($bases[$i]) - $pq;
-		$sum += $num;
-	}
-
-	return $sum/($#bases+1);
-
-}
-
-###
-### This function gives back the Q-value of the worst base
-sub getMinQuality(){
-	chomp $_[0];
-	my @bases = split(//,$_[0]);
-	my $worst = 1000;
-	for(my $i = 0; $i <= $#bases; $i++){
-#		printf ("base: $bases[$i]  --> %d\n",ord($bases[$i]));
-		my $num = ord($bases[$i]) - $pq;
-		if($num < $worst){
-			$worst = $num;
-		}
-	}
-	return $worst;
-}
-
-sub usage{
-print <<"USAGE";
-Version $version
-Usage:
-$0 -i -format -mark -qual -qv -o
-options:
--i input file#fastq file ##can be multiple -i file1 -i file2 ...
--mark string#quary name,default is "seq"
--o output file
--format string # fastq|fasta|fq|fa
-
--qual #reads filter
-   eg:(min:value/mean:value)
-   This parameter just for solexa reads.
-   If the input files are solid and needs filter,please do filter first .
-
--qv  integer #Phred quality64/33,default 33
--h help
-USAGE
-exit(1);
-}
-
--- a/convert_bowtie_to_blast.pl	Fri Jul 25 05:17:20 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,126 +0,0 @@
-#!/usr/bin/perl
-
-
-use warnings;
-use strict;
-use Getopt::Std;
-
-######################################### USAGE ################################
-
-my $usage=
-"$0 file_bowtie_result file_solexa_seq file_chromosome
-
-This is a converter which changes Bowtie output into Blast format.
-The input includes three files: a Bowtie result file (default Bowtie
-output file), a fasta file consisting of small Reads and a chromosome
-fasta file. It outputs the alignments in blast_parsed format.
-
-file_bowtie_result likes:
-
-AtFlower100010_x2	+	MIR319c	508	AAGGAGATTCTTTCAGTCCAG	IIIIIIIIIIIIIIIIIIIII	0
-AtFlower1000188_x1	+	MIR2933a	421	TCGGAGAGGAAATTCGTCGGCG	IIIIIIIIIIIIIIIIIIIIII	0
-
-file_solexa_seq likes:
-
->AtFlower100010_x2
-AAGGAGATTCTTTCAGTCCAG
-
-file_chromosome contains chromosome seq in fasta format
-
-";
-
-
-####################################### INPUT FILES ############################
-
-my $file_bowtie_result=shift or die $usage;
-my $file_short_seq=shift or die $usage;
-my $file_chromosome_seq=shift or die $usage;
-
-
-##################################### GLOBAL VARIBALES #########################
-
-my %short_seq_length=();
-my %chromosome_length=();
-
-
-######################################### MAIN #################################
-
-#get the short sequence id and its length
-sequence_length($file_short_seq,\%short_seq_length);
-
-#get the chromosome sequence id and its length
-sequence_length($file_chromosome_seq,\%chromosome_length);
-
-#convert bowtie result format to blast format;
-change_format($file_bowtie_result);
-
-exit;
-
-
-##################################### SUBROUTINES ##############################
-
-sub sequence_length{
-    my ($file,$hash) = @_;
-    my ($id, $desc, $sequence, $seq_length) = ();
-
-    open (FASTA, "<$file") or die "can not open $$file\n";
-    while (<FASTA>)
-    {
-        chomp;
-        if (/^>(\S+)(.*)/)
-	{
-	    $id       = $1;
-	    $desc     = $2;
-	    $sequence = "";
-	    while (<FASTA>){
-                chomp;
-                if (/^>(\S+)(.*)/){
-		    $$hash{$id}  = length $sequence;
-		    $id         = $1;
-		    $desc       = $2;
-		    $sequence   = "";
-		    next;
-                }
-		$sequence .= $_;
-            }
-        }
-    }
-    $seq_length=length($sequence);
-    $$hash{$id} = $seq_length;
-    close FASTA;
-}
-
-
-
-
-
-sub change_format{
-	#Change Bowtie format into blast format
-	my $file=shift @_;
-	open(FILE,"<$file")||die"can not open the bowtie result file:$!\n";
-  #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n";
-
-	while(<FILE>){
-		chomp;
-		my @tmp=split("\t",$_);
-	#Clean the reads ID
-		my @tmp1=split(" ",$tmp[0]);
-		print  "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t";
-		if($tmp[1] eq "+"){
-			my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]};
-			my $seq_bg=$tmp[3] + 1;
-			print  "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n";
-			}
-		if($tmp[1] eq "-"){
-			my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3];
-			my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1;
-			print  "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n";
-			}
-		}
-
-#	close BLASTOUT;
-
-}
-
-
-