Mercurial > repos > stheil > readpercontig_blat
diff perl/scripts/readPerContig.pl @ 1:3203097d0a70 draft
Uploaded
author | stheil |
---|---|
date | Thu, 15 Oct 2015 09:53:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perl/scripts/readPerContig.pl Thu Oct 15 09:53:48 2015 -0400 @@ -0,0 +1,179 @@ +#!/usr/bin/perl +use strict; +use Data::Dumper; +use Logger::Logger; +use File::Basename; +use Getopt::Long; +use Cwd 'abs_path' ; +use Tools::Fasta ; +use Tools::Fastq ; + + +my $ref=''; +my @p1=''; +my @p2=''; +my @read=''; +my $identity = 95; +my $coverage = 60; +my $verbosity=1; +my $format; +my $stat; +my $blatOutput=''; +my $statOutput=''; + +GetOptions( + "r=s" => \$ref, + "id=i" => \$identity, + "cov=i" => \$coverage, + "b|blat" => \$blatOutput, + "o|output=s" => \$statOutput, + "f=s" => \$format, + "s=s" => \$stat, + "v|verbosity=i" => \$verbosity +); + +Logger::Logger->changeMode($verbosity); + +&main; + +sub main { + my $self = {}; + bless $self; + $self->_setOptions(); + + $self->_checkFormat($format); + + $self->{ref_tool} = Tools::Fasta->new(file => $self->{reference}); + + $self->_parseBlat(); + + $self->_print($self->{_statistics}, $self->{statOutput}); +} + + +sub _parseBlat { + my ($self) = @_; + open(PSL,$self->{blatOutput}) || $logger->logdie('Cant read psl file ' . $self->{blatOutput}); + while(<PSL>){ + chomp; + my @line = split(/\t/,$_); + my $tSize = $line[14]; + my $tStart = $line[15]; + my $tEnd = $line[16]; + my $coverage = (($tEnd-$tStart)/$tSize) * 100; + + if($coverage >= $self->{_covThreshold}){ + $self->{readNumber}->{$line[9]}++; + $self->{totalReadNumber}++; + $self->{size}->{$line[9]} = $line[10]; + $self->{reads_size_sum}->{$line[9]} += $tSize; + } + else{ + next; + } + } + close PSL; +} + + +sub _print { + my ($self, $mode, $statOutput) = @_; + $logger->info('Printing results ...'); + + if(!defined $mode){ + $mode = 'nb_reads'; + } + + if(!defined $statOutput){ + $statOutput = $self->{reference} . '.rn'; + } + + open(OUT,">$statOutput"); + foreach my $ref (sort(keys(%{$self->{ref_tool}->{index}}))){ + my $value = 0; + if(defined($self->{readNumber}->{$ref})){ + if($mode eq 'nb_reads'){ + $value = $self->{readNumber}->{$ref}; + } + elsif($mode eq 'rpkm'){ + $value = $self->{readNumber}->{$ref} / ($self->{size}->{$ref}/1000 * $self->{totalReadNumber}/1000000); + } + elsif($mode eq 'mean_coverage'){ + $value = $self->{reads_size_sum}->{$ref} / $self->{size}->{$ref}; + } + } + print OUT $ref . "\t" . $value . "\n"; + } + close OUT; +} + + +sub _setOptions { + my ($self) = @_; + if(-e $ref){ + $self->{reference} = abs_path($ref); + } + else{ + $logger->error('You must provide a reference.'); + &help; + } + if($identity >= 0 && $identity <= 100){ + $self->{_idThreshold} = $identity; + } + else{ + $logger->logdie('ERROR: identity must be between 0 and 100.'); + } + if($coverage >=0 && $coverage <= 100){ + $self->{_covThreshold} = $coverage; + } + else{ + $logger->logdie('ERROR: coverage must be between 0 and 100.'); + } + if($stat =~ /nb_reads|rpkm|mean_coverage/){ + $self->{_statistics} = $stat; + } + else{ + $logdie(''); + } + if($blatOutput eq ''){ + $self->{blatOutput} = 'blat.psl'; + } + else{ + $self->{blatOutput} = $blatOutput; + } + if($statOutput eq ''){ + my($filename, $dirs, $suffix) = fileparse($ref, qr/\.[^.]*/); + $statOutput = $filename . '.psl'; + } + else{ + $self->{statOutput} = $statOutput; + } +} + + +sub help { +my $prog = basename($0) ; +print STDERR <<EOF ; +#### $prog #### +# +# AUTHOR: Sebastien THEIL +# LAST MODIF: 16/07/2015 +# PURPOSE: This script is basically used to map reads (using blat) on contigs (or scaffolds) and count the + number of read per contig. + Input can be multiple fasta or fastq. + Output is a 2 column tabular file. + +USAGE: + $prog -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq .... + + ### OPTIONS ### + -r <string> Rererence sequences file. + -id <int> Identity threshold. (Default: $identity) + -cov <int> Coverage threshold. (Default: $coverage) + -b <string> Blat output file path + -o <string> Output file path + -s <string> Statistics to print : nb_reads, rpkm or mean_coverage. + -v <int> Verbosity level. (0 -> 4). +EOF +exit(1); +}