diff perl/scripts/readPerContig.pl @ 1:3203097d0a70 draft

Uploaded
author stheil
date Thu, 15 Oct 2015 09:53:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/perl/scripts/readPerContig.pl	Thu Oct 15 09:53:48 2015 -0400
@@ -0,0 +1,179 @@
+#!/usr/bin/perl
+use strict;
+use Data::Dumper;
+use Logger::Logger;
+use File::Basename;
+use Getopt::Long;
+use Cwd 'abs_path' ;
+use Tools::Fasta ;
+use Tools::Fastq ;
+
+
+my $ref='';
+my @p1='';
+my @p2='';
+my @read='';
+my $identity = 95;
+my $coverage = 60;
+my $verbosity=1;
+my $format;
+my $stat;
+my $blatOutput='';
+my $statOutput='';
+
+GetOptions(
+	"r=s"              => \$ref,
+	"id=i"             => \$identity,
+	"cov=i"            => \$coverage,
+	"b|blat"	         => \$blatOutput,
+	"o|output=s"       =>	\$statOutput,
+	"f=s"		           => \$format,
+	"s=s"		           => \$stat,
+	"v|verbosity=i"    => \$verbosity
+);
+
+Logger::Logger->changeMode($verbosity);
+
+&main;
+
+sub main {
+	my $self = {};
+	bless $self;
+	$self->_setOptions();
+
+	$self->_checkFormat($format);
+
+	$self->{ref_tool} = Tools::Fasta->new(file => $self->{reference});
+
+	$self->_parseBlat();
+
+	$self->_print($self->{_statistics}, $self->{statOutput});
+}
+
+
+sub _parseBlat {
+	my ($self) = @_;
+	open(PSL,$self->{blatOutput}) || $logger->logdie('Cant read psl file ' . $self->{blatOutput});
+	while(<PSL>){
+		chomp;
+		my @line = split(/\t/,$_);
+		my $tSize = $line[14];
+		my $tStart = $line[15];
+		my $tEnd = $line[16];
+		my $coverage = (($tEnd-$tStart)/$tSize) * 100;
+
+		if($coverage >= $self->{_covThreshold}){
+			$self->{readNumber}->{$line[9]}++;
+			$self->{totalReadNumber}++;
+			$self->{size}->{$line[9]} = $line[10];
+			$self->{reads_size_sum}->{$line[9]} += $tSize;
+		}
+		else{
+			next;
+		}
+	}
+	close PSL;
+}
+
+
+sub _print {
+	my ($self, $mode, $statOutput) = @_;
+	$logger->info('Printing results ...');
+
+	if(!defined $mode){
+		$mode = 'nb_reads';
+	}
+
+	if(!defined $statOutput){
+		$statOutput = $self->{reference} . '.rn';
+	}
+
+	open(OUT,">$statOutput");
+	foreach my $ref (sort(keys(%{$self->{ref_tool}->{index}}))){
+		my $value = 0;
+		if(defined($self->{readNumber}->{$ref})){
+			if($mode eq 'nb_reads'){
+				$value = $self->{readNumber}->{$ref};
+			}
+			elsif($mode eq 'rpkm'){
+				$value = $self->{readNumber}->{$ref} / ($self->{size}->{$ref}/1000 * $self->{totalReadNumber}/1000000);
+			}
+			elsif($mode eq 'mean_coverage'){
+				$value =  $self->{reads_size_sum}->{$ref} / $self->{size}->{$ref};
+			}
+		}
+		print OUT $ref . "\t" . $value . "\n";
+	}
+	close OUT;
+}
+
+
+sub _setOptions {
+	my ($self) = @_;
+	if(-e $ref){
+		$self->{reference} = abs_path($ref);
+	}
+	else{
+		$logger->error('You must provide a reference.');
+		&help;
+	}
+	if($identity >= 0 && $identity <= 100){
+		$self->{_idThreshold} = $identity;
+	}
+	else{
+		$logger->logdie('ERROR: identity must be between 0 and 100.');
+	}
+	if($coverage >=0 && $coverage <= 100){
+		$self->{_covThreshold} = $coverage;
+	}
+	else{
+		$logger->logdie('ERROR: coverage must be between 0 and 100.');
+	}
+	if($stat =~ /nb_reads|rpkm|mean_coverage/){
+		$self->{_statistics} = $stat;
+	}
+	else{
+		$logdie('');
+	}
+	if($blatOutput eq ''){
+		$self->{blatOutput} = 'blat.psl';
+	}
+	else{
+		$self->{blatOutput} = $blatOutput;
+	}
+	if($statOutput eq ''){
+		my($filename, $dirs, $suffix) = fileparse($ref, qr/\.[^.]*/);
+		$statOutput = $filename . '.psl';
+	}
+	else{
+		$self->{statOutput} = $statOutput;
+	}
+}
+
+
+sub help {
+my $prog = basename($0) ;
+print STDERR <<EOF ;
+#### $prog ####
+#
+# AUTHOR:     Sebastien THEIL
+# LAST MODIF: 16/07/2015
+# PURPOSE:    This script is basically used to map reads (using blat) on contigs (or scaffolds) and count the
+              number of read per contig.
+              Input can be multiple fasta or fastq.
+              Output is a 2 column tabular file.
+
+USAGE:
+      $prog  -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq ....
+
+      ### OPTIONS ###
+      -r          <string>   Rererence sequences file.
+      -id         <int>      Identity threshold. (Default: $identity)
+      -cov        <int>      Coverage threshold. (Default: $coverage)
+      -b          <string>   Blat output file path
+			-o          <string>   Output file path
+      -s          <string>   Statistics to print : nb_reads, rpkm or mean_coverage.
+      -v          <int>      Verbosity level. (0 -> 4).
+EOF
+exit(1);
+}