comparison perl/scripts/readPerContig.pl @ 1:3203097d0a70 draft

Uploaded
author stheil
date Thu, 15 Oct 2015 09:53:48 -0400
parents
children
comparison
equal deleted inserted replaced
0:9730db7c9ad3 1:3203097d0a70
1 #!/usr/bin/perl
2 use strict;
3 use Data::Dumper;
4 use Logger::Logger;
5 use File::Basename;
6 use Getopt::Long;
7 use Cwd 'abs_path' ;
8 use Tools::Fasta ;
9 use Tools::Fastq ;
10
11
12 my $ref='';
13 my @p1='';
14 my @p2='';
15 my @read='';
16 my $identity = 95;
17 my $coverage = 60;
18 my $verbosity=1;
19 my $format;
20 my $stat;
21 my $blatOutput='';
22 my $statOutput='';
23
24 GetOptions(
25 "r=s" => \$ref,
26 "id=i" => \$identity,
27 "cov=i" => \$coverage,
28 "b|blat" => \$blatOutput,
29 "o|output=s" => \$statOutput,
30 "f=s" => \$format,
31 "s=s" => \$stat,
32 "v|verbosity=i" => \$verbosity
33 );
34
35 Logger::Logger->changeMode($verbosity);
36
37 &main;
38
39 sub main {
40 my $self = {};
41 bless $self;
42 $self->_setOptions();
43
44 $self->_checkFormat($format);
45
46 $self->{ref_tool} = Tools::Fasta->new(file => $self->{reference});
47
48 $self->_parseBlat();
49
50 $self->_print($self->{_statistics}, $self->{statOutput});
51 }
52
53
54 sub _parseBlat {
55 my ($self) = @_;
56 open(PSL,$self->{blatOutput}) || $logger->logdie('Cant read psl file ' . $self->{blatOutput});
57 while(<PSL>){
58 chomp;
59 my @line = split(/\t/,$_);
60 my $tSize = $line[14];
61 my $tStart = $line[15];
62 my $tEnd = $line[16];
63 my $coverage = (($tEnd-$tStart)/$tSize) * 100;
64
65 if($coverage >= $self->{_covThreshold}){
66 $self->{readNumber}->{$line[9]}++;
67 $self->{totalReadNumber}++;
68 $self->{size}->{$line[9]} = $line[10];
69 $self->{reads_size_sum}->{$line[9]} += $tSize;
70 }
71 else{
72 next;
73 }
74 }
75 close PSL;
76 }
77
78
79 sub _print {
80 my ($self, $mode, $statOutput) = @_;
81 $logger->info('Printing results ...');
82
83 if(!defined $mode){
84 $mode = 'nb_reads';
85 }
86
87 if(!defined $statOutput){
88 $statOutput = $self->{reference} . '.rn';
89 }
90
91 open(OUT,">$statOutput");
92 foreach my $ref (sort(keys(%{$self->{ref_tool}->{index}}))){
93 my $value = 0;
94 if(defined($self->{readNumber}->{$ref})){
95 if($mode eq 'nb_reads'){
96 $value = $self->{readNumber}->{$ref};
97 }
98 elsif($mode eq 'rpkm'){
99 $value = $self->{readNumber}->{$ref} / ($self->{size}->{$ref}/1000 * $self->{totalReadNumber}/1000000);
100 }
101 elsif($mode eq 'mean_coverage'){
102 $value = $self->{reads_size_sum}->{$ref} / $self->{size}->{$ref};
103 }
104 }
105 print OUT $ref . "\t" . $value . "\n";
106 }
107 close OUT;
108 }
109
110
111 sub _setOptions {
112 my ($self) = @_;
113 if(-e $ref){
114 $self->{reference} = abs_path($ref);
115 }
116 else{
117 $logger->error('You must provide a reference.');
118 &help;
119 }
120 if($identity >= 0 && $identity <= 100){
121 $self->{_idThreshold} = $identity;
122 }
123 else{
124 $logger->logdie('ERROR: identity must be between 0 and 100.');
125 }
126 if($coverage >=0 && $coverage <= 100){
127 $self->{_covThreshold} = $coverage;
128 }
129 else{
130 $logger->logdie('ERROR: coverage must be between 0 and 100.');
131 }
132 if($stat =~ /nb_reads|rpkm|mean_coverage/){
133 $self->{_statistics} = $stat;
134 }
135 else{
136 $logdie('');
137 }
138 if($blatOutput eq ''){
139 $self->{blatOutput} = 'blat.psl';
140 }
141 else{
142 $self->{blatOutput} = $blatOutput;
143 }
144 if($statOutput eq ''){
145 my($filename, $dirs, $suffix) = fileparse($ref, qr/\.[^.]*/);
146 $statOutput = $filename . '.psl';
147 }
148 else{
149 $self->{statOutput} = $statOutput;
150 }
151 }
152
153
154 sub help {
155 my $prog = basename($0) ;
156 print STDERR <<EOF ;
157 #### $prog ####
158 #
159 # AUTHOR: Sebastien THEIL
160 # LAST MODIF: 16/07/2015
161 # PURPOSE: This script is basically used to map reads (using blat) on contigs (or scaffolds) and count the
162 number of read per contig.
163 Input can be multiple fasta or fastq.
164 Output is a 2 column tabular file.
165
166 USAGE:
167 $prog -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq ....
168
169 ### OPTIONS ###
170 -r <string> Rererence sequences file.
171 -id <int> Identity threshold. (Default: $identity)
172 -cov <int> Coverage threshold. (Default: $coverage)
173 -b <string> Blat output file path
174 -o <string> Output file path
175 -s <string> Statistics to print : nb_reads, rpkm or mean_coverage.
176 -v <int> Verbosity level. (0 -> 4).
177 EOF
178 exit(1);
179 }