1
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use Data::Dumper;
|
|
4 use Logger::Logger;
|
|
5 use File::Basename;
|
|
6 use Getopt::Long;
|
|
7 use Cwd 'abs_path' ;
|
|
8 use Tools::Fasta ;
|
|
9 use Tools::Fastq ;
|
|
10
|
|
11
|
|
12 my $ref='';
|
|
13 my @p1='';
|
|
14 my @p2='';
|
|
15 my @read='';
|
|
16 my $identity = 95;
|
|
17 my $coverage = 60;
|
|
18 my $verbosity=1;
|
|
19 my $format;
|
|
20 my $stat;
|
|
21 my $blatOutput='';
|
|
22 my $statOutput='';
|
|
23
|
|
24 GetOptions(
|
|
25 "r=s" => \$ref,
|
|
26 "id=i" => \$identity,
|
|
27 "cov=i" => \$coverage,
|
|
28 "b|blat" => \$blatOutput,
|
|
29 "o|output=s" => \$statOutput,
|
|
30 "f=s" => \$format,
|
|
31 "s=s" => \$stat,
|
|
32 "v|verbosity=i" => \$verbosity
|
|
33 );
|
|
34
|
|
35 Logger::Logger->changeMode($verbosity);
|
|
36
|
|
37 &main;
|
|
38
|
|
39 sub main {
|
|
40 my $self = {};
|
|
41 bless $self;
|
|
42 $self->_setOptions();
|
|
43
|
|
44 $self->_checkFormat($format);
|
|
45
|
|
46 $self->{ref_tool} = Tools::Fasta->new(file => $self->{reference});
|
|
47
|
|
48 $self->_parseBlat();
|
|
49
|
|
50 $self->_print($self->{_statistics}, $self->{statOutput});
|
|
51 }
|
|
52
|
|
53
|
|
54 sub _parseBlat {
|
|
55 my ($self) = @_;
|
|
56 open(PSL,$self->{blatOutput}) || $logger->logdie('Cant read psl file ' . $self->{blatOutput});
|
|
57 while(<PSL>){
|
|
58 chomp;
|
|
59 my @line = split(/\t/,$_);
|
|
60 my $tSize = $line[14];
|
|
61 my $tStart = $line[15];
|
|
62 my $tEnd = $line[16];
|
|
63 my $coverage = (($tEnd-$tStart)/$tSize) * 100;
|
|
64
|
|
65 if($coverage >= $self->{_covThreshold}){
|
|
66 $self->{readNumber}->{$line[9]}++;
|
|
67 $self->{totalReadNumber}++;
|
|
68 $self->{size}->{$line[9]} = $line[10];
|
|
69 $self->{reads_size_sum}->{$line[9]} += $tSize;
|
|
70 }
|
|
71 else{
|
|
72 next;
|
|
73 }
|
|
74 }
|
|
75 close PSL;
|
|
76 }
|
|
77
|
|
78
|
|
79 sub _print {
|
|
80 my ($self, $mode, $statOutput) = @_;
|
|
81 $logger->info('Printing results ...');
|
|
82
|
|
83 if(!defined $mode){
|
|
84 $mode = 'nb_reads';
|
|
85 }
|
|
86
|
|
87 if(!defined $statOutput){
|
|
88 $statOutput = $self->{reference} . '.rn';
|
|
89 }
|
|
90
|
|
91 open(OUT,">$statOutput");
|
|
92 foreach my $ref (sort(keys(%{$self->{ref_tool}->{index}}))){
|
|
93 my $value = 0;
|
|
94 if(defined($self->{readNumber}->{$ref})){
|
|
95 if($mode eq 'nb_reads'){
|
|
96 $value = $self->{readNumber}->{$ref};
|
|
97 }
|
|
98 elsif($mode eq 'rpkm'){
|
|
99 $value = $self->{readNumber}->{$ref} / ($self->{size}->{$ref}/1000 * $self->{totalReadNumber}/1000000);
|
|
100 }
|
|
101 elsif($mode eq 'mean_coverage'){
|
|
102 $value = $self->{reads_size_sum}->{$ref} / $self->{size}->{$ref};
|
|
103 }
|
|
104 }
|
|
105 print OUT $ref . "\t" . $value . "\n";
|
|
106 }
|
|
107 close OUT;
|
|
108 }
|
|
109
|
|
110
|
|
111 sub _setOptions {
|
|
112 my ($self) = @_;
|
|
113 if(-e $ref){
|
|
114 $self->{reference} = abs_path($ref);
|
|
115 }
|
|
116 else{
|
|
117 $logger->error('You must provide a reference.');
|
|
118 &help;
|
|
119 }
|
|
120 if($identity >= 0 && $identity <= 100){
|
|
121 $self->{_idThreshold} = $identity;
|
|
122 }
|
|
123 else{
|
|
124 $logger->logdie('ERROR: identity must be between 0 and 100.');
|
|
125 }
|
|
126 if($coverage >=0 && $coverage <= 100){
|
|
127 $self->{_covThreshold} = $coverage;
|
|
128 }
|
|
129 else{
|
|
130 $logger->logdie('ERROR: coverage must be between 0 and 100.');
|
|
131 }
|
|
132 if($stat =~ /nb_reads|rpkm|mean_coverage/){
|
|
133 $self->{_statistics} = $stat;
|
|
134 }
|
|
135 else{
|
|
136 $logdie('');
|
|
137 }
|
|
138 if($blatOutput eq ''){
|
|
139 $self->{blatOutput} = 'blat.psl';
|
|
140 }
|
|
141 else{
|
|
142 $self->{blatOutput} = $blatOutput;
|
|
143 }
|
|
144 if($statOutput eq ''){
|
|
145 my($filename, $dirs, $suffix) = fileparse($ref, qr/\.[^.]*/);
|
|
146 $statOutput = $filename . '.psl';
|
|
147 }
|
|
148 else{
|
|
149 $self->{statOutput} = $statOutput;
|
|
150 }
|
|
151 }
|
|
152
|
|
153
|
|
154 sub help {
|
|
155 my $prog = basename($0) ;
|
|
156 print STDERR <<EOF ;
|
|
157 #### $prog ####
|
|
158 #
|
|
159 # AUTHOR: Sebastien THEIL
|
|
160 # LAST MODIF: 16/07/2015
|
|
161 # PURPOSE: This script is basically used to map reads (using blat) on contigs (or scaffolds) and count the
|
|
162 number of read per contig.
|
|
163 Input can be multiple fasta or fastq.
|
|
164 Output is a 2 column tabular file.
|
|
165
|
|
166 USAGE:
|
|
167 $prog -i singl.fastq -i singl.fasta -1 R1.fastq -2 R2.fastq ....
|
|
168
|
|
169 ### OPTIONS ###
|
|
170 -r <string> Rererence sequences file.
|
|
171 -id <int> Identity threshold. (Default: $identity)
|
|
172 -cov <int> Coverage threshold. (Default: $coverage)
|
|
173 -b <string> Blat output file path
|
|
174 -o <string> Output file path
|
|
175 -s <string> Statistics to print : nb_reads, rpkm or mean_coverage.
|
|
176 -v <int> Verbosity level. (0 -> 4).
|
|
177 EOF
|
|
178 exit(1);
|
|
179 }
|