Mercurial > repos > antmarge > dataoverview
comparison dataOverview.pl @ 1:b66f4a551e25 draft
Uploaded
author | antmarge |
---|---|
date | Tue, 28 Mar 2017 21:56:04 -0400 |
parents | |
children | 80205e898861 |
comparison
equal
deleted
inserted
replaced
0:bb1dbc0a1763 | 1:b66f4a551e25 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 #Margaret Antonio 16.08.29 | |
4 | |
5 #use strict; | |
6 use Getopt::Long; | |
7 use Bio::SeqIO; | |
8 use autodie; | |
9 no warnings; | |
10 | |
11 | |
12 | |
13 #AVAILABLE OPTIONS. WILL print OUT UPON ERROR | |
14 sub print_usage() { | |
15 | |
16 print "\n###############################################################\n"; | |
17 print "dataOverview: outputs basic statistics for tn-seq library files \n\n"; | |
18 | |
19 print "USAGE:\n"; | |
20 print "perl dataOverview.pl -i inputs/ -f genome.fasta -r genome.gbk\n"; | |
21 | |
22 print "\nREQUIRED:\n"; | |
23 print " -d\tDirectory containing all input files (results files from\n"; | |
24 print " \tcalc fitness script)\n"; | |
25 print " \t OR\n"; | |
26 print " \tIn the command line (without a flag), input the name(s) of \n"; | |
27 print " \tthe files containing fitness values for individual \n\tinsertion mutants\n"; | |
28 print " -f\tFilename for genome sequence, in fasta format\n"; | |
29 print " -r\tFilename for genome annotation, in GenBank format\n"; | |
30 | |
31 print "\nOPTIONAL:\n"; | |
32 print " -h\tprint OUT usage\n"; | |
33 print " -c\tCutoff average(c1+c2)>c. Default: 15\n"; | |
34 print " -o\tFilename for output. Default: standard output\n"; | |
35 print " \n~~~~Always check that file paths are correctly specified~~~~\n"; | |
36 print " \n###############################################################\n"; | |
37 | |
38 } | |
39 | |
40 # print "What's on the commandline: ", $ARGV; | |
41 | |
42 sub get_time() { | |
43 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time); | |
44 return "$hour:$min:$sec"; | |
45 } | |
46 sub mean { | |
47 my $sum=0; | |
48 foreach my $n(@_){ | |
49 $sum+=$n; | |
50 } | |
51 my $total=scalar @_; | |
52 my $mean=$sum/$total; | |
53 return $mean; | |
54 } | |
55 sub minmax{ | |
56 my @unsorted=@_; | |
57 my @sorted = sort { $a <=> $b } @unsorted; | |
58 my $min = $sorted[0]; | |
59 my $max = $sorted[scalar @sorted -1]; | |
60 return ($min, $max); | |
61 } | |
62 sub uniq{ | |
63 my @input=@_; | |
64 my @unique = do { my %seen; grep { !$seen{$_}++ } @input }; | |
65 } | |
66 | |
67 #ASSIGN INPUTS TO VARIABLES | |
68 our ($cutoff,$fastaFile, $outfile,$help,$ref,$weight_ceiling); | |
69 GetOptions( | |
70 'r:s' => \$ref, | |
71 'f:s' => \$fastaFile, | |
72 'c:i'=>\$cutoff, | |
73 'o:s' => \$outfile, | |
74 'h'=> \$help, | |
75 'w:i' => \$weight_ceiling, | |
76 ); | |
77 | |
78 # Set defaults | |
79 #if (!$weight_ceiling){$weight_ceiling=50;} | |
80 #if (!$cutoff){$cutoff=10;} | |
81 | |
82 # If help option is specified or required files are not specified: | |
83 | |
84 if ($help) { | |
85 print print_usage(); | |
86 print "\n"; | |
87 exit; | |
88 } | |
89 | |
90 if (!$fastaFile or !$ref){ | |
91 print "\nERROR: Please correctly specify reference genome fasta and genbank files\n"; | |
92 print "Most genomes (in fasta and gbk format) are available at NCBI\n"; | |
93 print print_usage(); | |
94 print "\n"; | |
95 exit; | |
96 } | |
97 # Redirect STDOUT to log.txt. Anything print OUTed to the terminal will go into the log file | |
98 if (! $outfile){ | |
99 $outfile="summary.txt"; | |
100 } | |
101 | |
102 open OUT, ">",$outfile; | |
103 | |
104 #Not sure if I'll need this but sometimes funky data inputs have hidden characters | |
105 sub cleaner{ | |
106 my $line=$_[0]; | |
107 chomp($line); | |
108 $line =~ s/\x0d{0,1}\x0a{0,1}\Z//s; | |
109 return $line; | |
110 } | |
111 | |
112 | |
113 #Get the input files out of the input directory, or take off of command line | |
114 | |
115 my @files=@ARGV; | |
116 foreach my $f(@files){ | |
117 #print $f; | |
118 } | |
119 my $num=(scalar @files); | |
120 | |
121 #print OUT "Gathering data overview for Tn-Seq experiment\n\n"; | |
122 #print OUT "Begin time: ",get_time(),"\n\n"; | |
123 | |
124 #CREATE AN ARRAY OF DATA FROM INPUT CSV FILE(S). | |
125 #These are the "results" files from calc_fitness.pl. Insertion location, fitness, etc. | |
126 #Go through each file from the commandline (ARGV array) and read each line as an array | |
127 #into select array if values satisfy the cutoff | |
128 | |
129 | |
130 #Store ALL insertion locations in this array. Later, get unique insertions | |
131 my @insertions_all; | |
132 #Store all genes with valid insertions here | |
133 my @genes_insertions; | |
134 #all lines that satisfied cutoff | |
135 my @unsorted; | |
136 #array to hold all positions of insertions. Going to use this later to match up with TA sites | |
137 my @insertPos; | |
138 | |
139 #Markers | |
140 my $rows=-1; | |
141 my $last=0; | |
142 | |
143 print OUT "Library description\n\n"; | |
144 my @header=("library","file_path","ins","ins.f","genes.ins"); | |
145 print OUT join ("\t",@header),"\n"; | |
146 | |
147 for (my $i=0; $i<$num; $i++){ | |
148 #Temp arrays for library | |
149 my(@insertions_all_lib,@genes_insertions_lib,@insertPos_lib); | |
150 my $file=$files[$i]; | |
151 open(DATA, '<', $file) or die "Could not open '$file' Make sure input .csv files are entered in the command line\n"; | |
152 my $dummy=<DATA>; #read and store column names in dummy variable | |
153 while (my $entry = <DATA>) { | |
154 chomp $entry; | |
155 my @line=split(",",$entry); | |
156 my $locus = $line[9]; #gene id (SP_0000) | |
157 my $w = $line[12]; #nW | |
158 if (!$w){ $w=0 } # For blanks | |
159 my $c1 = $line[2]; | |
160 my $c2 = $line[3]; | |
161 my $coord= $line[0]; | |
162 push (@insertions_all_lib,$coord); | |
163 #Average counts must be greater than cutoff (minimum allowed) | |
164 my $avg = ($c1+$c2)/2; | |
165 if ($avg > $cutoff) { | |
166 my @select=($coord,$w,$avg,$locus); | |
167 my $select=\@select; | |
168 push(@unsorted,$select); | |
169 push(@insertPos_lib,$line[0]); #keep track of actual insertion site position | |
170 push (@genes_insertions_lib,$locus); | |
171 $last=$select[0]; | |
172 $rows++; | |
173 } | |
174 if ($avg >= $weight_ceiling) { $avg = $weight_ceiling } # Maximum weight | |
175 } | |
176 close DATA; | |
177 push (@insertions_all,@insertions_all_lib); | |
178 @genes_insertions_lib= uniq @genes_insertions_lib; | |
179 push (@genes_insertions,@genes_insertions_lib); | |
180 push (@insertPos,@insertPos_lib); | |
181 my @stat=($i+1,$file,scalar @insertions_all_lib,scalar @insertPos_lib,scalar @genes_insertions_lib); | |
182 print OUT join("\t",@stat),"\n"; | |
183 } | |
184 | |
185 @insertPos = sort { $a <=> $b } @insertPos; | |
186 @insertPos= uniq @insertPos; | |
187 @genes_insertions= uniq @genes_insertions; | |
188 @insertions_all=uniq @insertions_all; | |
189 my $totalAll=scalar @insertions_all; | |
190 my $total=scalar @insertPos; | |
191 my $temp="1-".$num; | |
192 my @all_stat=($temp,"NA",$totalAll,$total,scalar @genes_insertions); | |
193 print OUT join("\t",@all_stat),"\n"; | |
194 | |
195 #Genome description: #TA sites, distance between TA sites, #TA sites in ORFS | |
196 print OUT "\n-------------------------\n"; | |
197 print OUT "\nGenome description\n\n"; | |
198 print OUT "File for genome: ", $fastaFile,"\n"; | |
199 | |
200 my @sites; | |
201 #First read fasta file into a string | |
202 my $seqio = Bio::SeqIO->new(-file => $fastaFile, '-format' => 'Fasta'); | |
203 my $fasta; | |
204 while(my $seq = $seqio->next_seq) { | |
205 $fasta = $seq->seq; | |
206 } | |
207 #Just in case $fasta file is in lowercase, change it to uppercase | |
208 $fasta=uc $fasta; | |
209 | |
210 #Get genomic coordinate for TA sites: | |
211 my $x="TA"; | |
212 my $offset=0; | |
213 my @indices; | |
214 my $result=index($fasta,$x,$offset); | |
215 while ($result !=-1){ | |
216 push (@indices,$result); | |
217 $offset=$result+1; | |
218 $result=index($fasta,$x,$offset); | |
219 } | |
220 my $countTA=scalar @indices; | |
221 | |
222 #Get longest stretch with no TA sites | |
223 my @tempta=@indices; | |
224 my $prev=shift @tempta; | |
225 my $current=shift @tempta; | |
226 my $lg_dist_ta=$current-$prev; | |
227 foreach my $site(@tempta){ | |
228 $prev=$current; | |
229 $current=$site; | |
230 my $d=$current-$prev; | |
231 if ($d>$lg_dist_ta){ | |
232 $lg_dist_ta=$d; | |
233 } | |
234 } | |
235 | |
236 #Get longest stretch of with no insertions | |
237 my @tempins=@insertPos; | |
238 $prev=shift @tempins; | |
239 $current=shift @tempins; | |
240 my $lg_dist_ins=$current-$prev; | |
241 foreach my $site(@tempins){ | |
242 $prev=$current; | |
243 $current=$site; | |
244 my $d=$current-$prev; | |
245 if ($d>$lg_dist_ins){ | |
246 $lg_dist_ins=$d; | |
247 } | |
248 } | |
249 | |
250 | |
251 my $genSize=length $fasta; | |
252 print OUT "$genSize\tGenome size\n"; | |
253 print OUT "$countTA\tTotal number of TA sites\n\n"; | |
254 | |
255 my $sat=sprintf("%.2f", ($total/$countTA)*100); | |
256 my $satAll=sprintf("%.2f", ($totalAll/$countTA)*100); | |
257 my $inscov=sprintf("%.2f", ($total/$genSize)*100); | |
258 my $tacov=sprintf("%.2f", ($countTA/$genSize)*100); | |
259 | |
260 #Get GC content of genome | |
261 | |
262 my $sequence = ' '; | |
263 my $Ccount = 0; | |
264 my $Gcount = 0; | |
265 my $identifier = ' '; | |
266 | |
267 my @nucleotides = split('', $fasta); | |
268 | |
269 foreach my $nuc (@nucleotides) { | |
270 if ($nuc eq 'G') {$Gcount++} | |
271 elsif ($nuc eq 'C') {$Ccount++} | |
272 } | |
273 my $sequencelength=length $fasta; | |
274 | |
275 my $GCcontent = sprintf("%.2f",((($Gcount + $Ccount) / $sequencelength) * 100)); | |
276 my $ATcontent =100-$GCcontent; | |
277 | |
278 print OUT "$GCcontent%\tGC content of this genome\n"; | |
279 print OUT "$ATcontent%\tAT content of this genome\n"; | |
280 | |
281 print OUT "$satAll%\tSaturation of TA sites before cutoff filter (allInsertions/TAsites)\n"; | |
282 print OUT "$sat%\tSaturation of TA sites after cutoff filter (validInsertions/TAsites)\n"; | |
283 print OUT "$inscov%\tGenome coverage by insertions (validInsertions/genomeSize)\n"; | |
284 print OUT "$tacov%\tGenome coverage by TA sites (TAsites/genomeSize)\n"; | |
285 print OUT "$lg_dist_ta\tLargest distance between TA sites\n"; | |
286 print OUT "$lg_dist_ins\tLargest distance between insertions\n"; | |
287 print OUT "\n\nOpen Reading Frames\n\n"; | |
288 | |
289 #Store everything to be print OUTed in array | |
290 my @table; | |
291 | |
292 #Find open reading frames from fasta file | |
293 local $_ = $fasta; | |
294 my @orfSize; | |
295 my @allc; #numbers of TAs in the ORFS here. | |
296 my $blank=0; #ORFS that don't have any TA sites. | |
297 my $orfCount=0; #keep track of the number of ORFs found. | |
298 my $minSize=0; | |
299 #Read somewhere that 99 is a good min but there is an annotated 86 bp gene for 19F | |
300 while ( /ATG/g ) { | |
301 my $start = pos() - 3; | |
302 if ( /T(?:AA|AG|GA)/g ) { | |
303 my $stop = pos; | |
304 my $size=$stop - $start; | |
305 if ($size>=$minSize){ | |
306 push (@orfSize,$size); | |
307 my $seq=substr ($_, $start, $stop - $start); | |
308 my @ctemp = $seq =~ /$x/g; | |
309 my $countTA = @ctemp; | |
310 if ($countTA==0){$blank++} | |
311 push (@allc,$countTA); | |
312 $orfCount++; | |
313 } | |
314 } | |
315 } | |
316 | |
317 print OUT "\nORFs based on Fasta sequence and start (ATG) and end (TAA,TAG,TGA) codons\n"; | |
318 push (@table,["Set minimum size for an ORF",$minSize]); | |
319 print OUT "$orfCount\tTotal number of ORFs found\n"; | |
320 my ($minORF, $maxORF) = minmax(@orfSize); | |
321 print OUT "$minORF\tSmallest ORF\n"; | |
322 print OUT "$maxORF\tLargest ORF\n"; | |
323 my ($mintaORF,$maxtaORF) = minmax(@allc); | |
324 print OUT "$mintaORF\tFewest # TA sites in an ORF\n"; | |
325 print OUT "$maxtaORF\tGreatest # TA sites in an ORF\n"; | |
326 print OUT "$blank\tNumber of ORFs that don't have any TA sites\n"; | |
327 | |
328 | |
329 print OUT "\nGenes using the genbank annotation file\n\n"; | |
330 ###Get genbank file. Find all start and stop for genes | |
331 #See how many insertions fall into genes vs intergenic regions | |
332 #Get array of coordinates for all insertions then remove insertion if it is | |
333 #within a gene region | |
334 my $gb = Bio::SeqIO->new(-file => $ref, -format => 'genbank'); | |
335 my $refseq = $gb->next_seq; | |
336 | |
337 #store number of insertions in a gene here | |
338 my @geneIns; | |
339 my @allLengths; | |
340 my $blankGene=0; #Number of genes that don't have any insertions in them | |
341 my @genomeSeq=split('',$fasta); | |
342 | |
343 | |
344 #keep a copy to remove insertions that are in genes | |
345 my @insertPosCopy=@insertPos; | |
346 | |
347 my @features = $refseq->get_SeqFeatures(); # just top level | |
348 foreach my $feature ( @features ) { | |
349 if ($feature->primary_tag eq "gene"){ | |
350 my $start=$feature->start; | |
351 my $end=$feature->end; | |
352 my $length=$end-$start; | |
353 push (@allLengths,$length); | |
354 #turn this into a for loop | |
355 my $i=0; | |
356 my $ins=0; | |
357 my $current=$insertPos[$i];; | |
358 while ($current<=$end && $i<scalar @insertPos){ | |
359 if ($current>=$start){ | |
360 splice(@insertPosCopy, $i, 1); | |
361 $ins++; | |
362 } | |
363 $current=$insertPos[$i++]; | |
364 } | |
365 if ($ins==0){$blankGene++} | |
366 push (@geneIns,$ins); | |
367 } | |
368 } | |
369 my $avgLength=sprintf("%.2f",mean(@allLengths)); | |
370 | |
371 my ($minLength, $maxLength) = minmax @allLengths; | |
372 my $avgInsGene=sprintf("%.2f",mean(@geneIns)); | |
373 | |
374 | |
375 | |
376 | |
377 | |
378 my ($minInsGene, $maxInsGene) = minmax @geneIns; | |
379 my $nonGeneIns=scalar @insertPosCopy; | |
380 my $totalIns=scalar @insertPos; | |
381 my $percNon=sprintf("%.2f",($nonGeneIns/$totalIns)*100); | |
382 | |
383 print OUT "Length of a gene\n"; | |
384 print OUT "$avgLength\tAverage","\n$minLength\tMininum","\n$maxLength\tMaximum\n"; | |
385 print OUT "\nFor insertions in a gene:\n"; | |
386 print OUT "$avgInsGene\tAverage","\n$minInsGene\tMininum","\n$maxInsGene\tMaximum\n"; | |
387 print OUT "Number of genes that do not have any insertions: ",$blankGene,"\n"; | |
388 print OUT "\n$nonGeneIns\tInsertions that are not in genes","\n$percNon% of all insertions\n"; | |
389 #How many insertions are in genes and how many are in non-gene regions? | |
390 | |
391 | |
392 |