view perl/scripts/readPerContig.pl @ 1:3203097d0a70 draft

Uploaded
author stheil
date Thu, 15 Oct 2015 09:53:48 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
use strict;
use Data::Dumper;
use Logger::Logger;
use File::Basename;
use Getopt::Long;
use Cwd 'abs_path' ;
use Tools::Fasta ;
use Tools::Fastq ;


my $ref='';
my @p1='';
my @p2='';
my @read='';
my $identity = 95;
my $coverage = 60;
my $verbosity=1;
my $format;
my $stat;
my $blatOutput='';
my $statOutput='';

GetOptions(
	"r=s"              => \$ref,
	"id=i"             => \$identity,
	"cov=i"            => \$coverage,
	"b|blat"	         => \$blatOutput,
	"o|output=s"       =>	\$statOutput,
	"f=s"		           => \$format,
	"s=s"		           => \$stat,
	"v|verbosity=i"    => \$verbosity
);

Logger::Logger->changeMode($verbosity);

&main;

sub main {
	my $self = {};
	bless $self;
	$self->_setOptions();

	$self->_checkFormat($format);

	$self->{ref_tool} = Tools::Fasta->new(file => $self->{reference});

	$self->_parseBlat();

	$self->_print($self->{_statistics}, $self->{statOutput});
}


sub _parseBlat {
	my ($self) = @_;
	open(PSL,$self->{blatOutput}) || $logger->logdie('Cant read psl file ' . $self->{blatOutput});
	while(<PSL>){
		chomp;
		my @line = split(/\t/,$_);
		my $tSize = $line[14];
		my $tStart = $line[15];
		my $tEnd = $line[16];
		my $coverage = (($tEnd-$tStart)/$tSize) * 100;

		if($coverage >= $self->{_covThreshold}){
			$self->{readNumber}->{$line[9]}++;
			$self->{totalReadNumber}++;
			$self->{size}->{$line[9]} = $line[10];
			$self->{reads_size_sum}->{$line[9]} += $tSize;
		}
		else{
			next;
		}
	}
	close PSL;
}


sub _print {
	my ($self, $mode, $statOutput) = @_;
	$logger->info('Printing results ...');

	if(!defined $mode){
		$mode = 'nb_reads';
	}

	if(!defined $statOutput){
		$statOutput = $self->{reference} . '.rn';
	}

	open(OUT,">$statOutput");
	foreach my $ref (sort(keys(%{$self->{ref_tool}->{index}}))){
		my $value = 0;
		if(defined($self->{readNumber}->{$ref})){
			if($mode eq 'nb_reads'){
				$value = $self->{readNumber}->{$ref};
			}
			elsif($mode eq 'rpkm'){
				$value = $self->{readNumber}->{$ref} / ($self->{size}->{$ref}/1000 * $self->{totalReadNumber}/1000000);
			}
			elsif($mode eq 'mean_coverage'){
				$value =  $self->{reads_size_sum}->{$ref} / $self->{size}->{$ref};
			}
		}
		print OUT $ref . "\t" . $value . "\n";
	}
	close OUT;
}


sub _setOptions {
	my ($self) = @_;
	if(-e $ref){
		$self->{reference} = abs_path($ref);
	}
	else{
		$logger->error('You must provide a reference.');
		&help;
	}
	if($identity >= 0 && $identity <= 100){
		$self->{_idThreshold} = $identity;
	}
	else{
		$logger->logdie('ERROR: identity must be between 0 and 100.');
	}
	if($coverage >=0 && $coverage <= 100){
		$self->{_covThreshold} = $coverage;
	}
	else{
		$logger->logdie('ERROR: coverage must be between 0 and 100.');
	}
	if($stat =~ /nb_reads|rpkm|mean_coverage/){
		$self->{_statistics} = $stat;
	}
	else{
		$logdie('');
	}
	if($blatOutput eq ''){
		$self->{blatOutput} = 'blat.psl';
	}
	else{
		$self->{blatOutput} = $blatOutput;
	}
	if($statOutput eq ''){
		my($filename, $dirs, $suffix) = fileparse($ref, qr/\.[^.]*/);
		$statOutput = $filename . '.psl';
	}
	else{
		$self->{statOutput} = $statOutput;
	}
}


sub help {
my $prog = basename($0) ;
print STDERR <<EOF ;
#### $prog ####
#
# AUTHOR:     Sebastien THEIL
# LAST MODIF: 16/07/2015
# PURPOSE:    This script is basically used to map reads (using blat) on contigs (or scaffolds) and count the
              number of read per contig.
              Input can be multiple fasta or fastq.
              Output is a 2 column tabular file.

USAGE:
      $prog  -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq ....

      ### OPTIONS ###
      -r          <string>   Rererence sequences file.
      -id         <int>      Identity threshold. (Default: $identity)
      -cov        <int>      Coverage threshold. (Default: $coverage)
      -b          <string>   Blat output file path
			-o          <string>   Output file path
      -s          <string>   Statistics to print : nb_reads, rpkm or mean_coverage.
      -v          <int>      Verbosity level. (0 -> 4).
EOF
exit(1);
}