Mercurial > repos > cristian > notos
comparison CpGoe.pl @ 0:1535ffddeff4 draft
planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
author | cristian |
---|---|
date | Thu, 07 Sep 2017 08:51:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1535ffddeff4 |
---|---|
1 #! /usr/bin/perl | |
2 | |
3 #################################### | |
4 # | |
5 # CpGoe.pl -f fasta-file -o output_file -m minlength | |
6 # | |
7 # reads concatanated fasta files, writes name, length, CpGs and GpCs, CpGo/e ratios and other quantities into TAB file for each sequence | |
8 ##################################### | |
9 | |
10 use diagnostics; | |
11 use strict; | |
12 use Carp; | |
13 use FileHandle; | |
14 use File::Path; | |
15 use File::Basename; | |
16 use Data::Dumper; | |
17 use Getopt::Std; | |
18 | |
19 my $VERSION = "1.0"; | |
20 $Getopt::Std::STANDARD_HELP_VERSION = 1; | |
21 | |
22 | |
23 # Called when --help set as flag | |
24 sub HELP_MESSAGE | |
25 { | |
26 print "Description: CpGoe processes a FASTA file and outputs the CpGo/e ratios and - if specified - further quantities\n\n" . | |
27 "Usage: CpGoe.pl [OPTION] -f FASTA_FILE \n\n" . | |
28 "Options:\n" . | |
29 " -f FASTA_FILE Name of FASTA file containing the input sequences. REQUIRED.\n" . | |
30 " -o OUT_FILE name of output file containing the results\n" . | |
31 " -m MIN_LEN minimum length of sequences, shorter sequences are discarded\n" . | |
32 " -c CONTEXT Context to calculate the ratio (CpA, CpC, CpG, CpT) default CpG.\n" . | |
33 " -a ALGORITHM Algorithm used to calculate CpGo/e ratio. Default: 1\n" . | |
34 " 1 - (CpG / (C * G)) * (L^2 / L-1)\n" . | |
35 " 2 - (CpG / (C * G)) * (L^2 / L)\n" . | |
36 " 3 - (CpG / L) / (G + C)^2\n" . | |
37 " 4 - CpG / ( (C + G)/2 )^2\n" . | |
38 " -d detailed output, providing other quantities additional to the CpGo/e ratios\n" . | |
39 " -h output header line\n". | |
40 " -v verbose messages\n" ; | |
41 exit 0; | |
42 } | |
43 | |
44 | |
45 | |
46 # Called when --version set as flag | |
47 sub VERSION_MESSAGE | |
48 { | |
49 print "CpGoe $VERSION\n"; | |
50 } | |
51 | |
52 | |
53 | |
54 # Command line parsing | |
55 # ... read argument | |
56 my %opts; | |
57 getopts('f:o:m:c:a:dvh', \%opts); | |
58 #if ($#ARGV != 0) { | |
59 # print STDERR "Exactly one argument has to be provided, the name of the input FASTA file.\n" . | |
60 # "Moreover, the options must be listed first, then the name of the input FASTA file.\n"; | |
61 # exit 1; | |
62 #} | |
63 my $fasta_fname; | |
64 if (exists($opts{'f'})) { | |
65 $fasta_fname = $opts{'f'}; | |
66 } else { | |
67 HELP_MESSAGE | |
68 } | |
69 | |
70 # ... read options | |
71 my $out_fname; | |
72 my $has_file_output; | |
73 if (exists($opts{'o'})) { | |
74 $out_fname = $opts{'o'}; | |
75 $has_file_output = 1; | |
76 } | |
77 else { | |
78 $has_file_output = 0; | |
79 } | |
80 | |
81 my $min_len; | |
82 if (exists($opts{'m'})) { | |
83 $min_len = $opts{'m'}; | |
84 } | |
85 else { | |
86 $min_len = 1; | |
87 } | |
88 | |
89 my $algo = 1; | |
90 if (exists($opts{'a'})) { | |
91 $algo = $opts{'a'}; | |
92 } | |
93 | |
94 my $context = 'CpG'; | |
95 if (exists($opts{'c'})) { | |
96 $context = $opts{'c'}; | |
97 } | |
98 | |
99 my $is_verbose = exists($opts{'v'}); | |
100 my $has_header = exists($opts{'h'}); | |
101 my $is_detailed = exists($opts{'d'}); | |
102 | |
103 | |
104 | |
105 # read input file and split into fasta sequences on the fly | |
106 # ... check whether input FASTA file exists | |
107 my $FASTA; | |
108 if (-e $fasta_fname) { | |
109 if (-f $fasta_fname) { | |
110 my $res = open($FASTA, $fasta_fname); | |
111 if (!$res) { | |
112 print STDERR "could not open $fasta_fname\n"; | |
113 exit 1; | |
114 } | |
115 } | |
116 else { | |
117 print STDERR "$fasta_fname is not a file\n"; | |
118 exit 1; | |
119 } | |
120 } | |
121 else { | |
122 print STDERR "cannot open file $fasta_fname\n"; | |
123 exit 1; | |
124 } | |
125 | |
126 | |
127 # ... determine which linebreak is used (linux / windows / mac) | |
128 my $found_n = 0; | |
129 my $found_r = 0; | |
130 my $found_rn = 0; | |
131 while ( defined( my $ch = getc($FASTA) ) ) { | |
132 if ($ch eq "\n") { | |
133 if ($found_r) { | |
134 $found_rn = 1; | |
135 $found_r = 0; | |
136 } else { | |
137 $found_n = 1; | |
138 } | |
139 last; | |
140 } elsif ($ch eq "\r") { | |
141 $found_r = 1; | |
142 } else { | |
143 if ($found_r) { | |
144 last; | |
145 } | |
146 } | |
147 } | |
148 close($FASTA); | |
149 if ($found_r + $found_n + $found_rn != 1) { | |
150 print STDERR "something went wrong determining the linebreaks used in $fasta_fname\n"; | |
151 } | |
152 | |
153 | |
154 # ... read in sequences | |
155 my $old_linebreak = $/; | |
156 if ($found_n) { | |
157 $/ = "\n"; | |
158 } elsif ($found_r) { | |
159 $/ = "\r"; | |
160 } else { | |
161 $/ = "\r\n"; | |
162 } | |
163 my $res = open($FASTA, $fasta_fname); | |
164 if (!$res) { | |
165 print STDERR "could not open $fasta_fname\n"; | |
166 exit 1; | |
167 } | |
168 | |
169 my @names = (); # names of the sequences | |
170 my @seqs = (); # sequences | |
171 my $is_first = 1; | |
172 while ( <$FASTA> ) { | |
173 chomp; | |
174 if (/^[^#]/) { | |
175 if ( /^>(\S+)/) { | |
176 #s/^>|\s+$//g; # remove leading '>' and trailing whitespaces | |
177 #s/\s/_/g; # replace spaces by underscores | |
178 push(@names, $1); | |
179 push(@seqs, ""); | |
180 $is_first = 0; | |
181 } | |
182 else { | |
183 if ($is_first) { | |
184 print STDERR "first non-comment line of FASTA file " . $fasta_fname . " does not start with \'>\'\n"; | |
185 exit 1; | |
186 } | |
187 else { | |
188 s/[\-\.]*//g; # remove dashes and dots | |
189 $seqs[-1] .= $_; | |
190 } | |
191 } | |
192 } | |
193 } | |
194 $res = close($FASTA); | |
195 if (!$res) { | |
196 print STDERR "could not close $fasta_fname\n"; | |
197 exit 1; | |
198 } | |
199 $/ = $old_linebreak; | |
200 | |
201 # print Dumper(@names) . "\n"; | |
202 # print Dumper(@seqs) . "\n"; | |
203 | |
204 | |
205 # ... check sequences | |
206 # ... ... are there any sequence names? | |
207 if ($#names < 0) { | |
208 print STDERR "FASTA file $fasta_fname is empty\n"; | |
209 exit 1; | |
210 } | |
211 | |
212 # ... ... are there empty sequences?} | |
213 my $str = ""; | |
214 my $err = 0; | |
215 my $num = 0; | |
216 my $MAX = 50; # maximum number of notifications about an empty sequence | |
217 for (my $i = 0; $i <= $#names; ++$i) { | |
218 if ($seqs[$i] eq "") { | |
219 if ($num < $MAX) { | |
220 $str .= "Sequence " . $names[$i] . " in FASTA file $fasta_fname is empty\n"; | |
221 } | |
222 $err = 1; | |
223 ++$num; | |
224 } | |
225 } | |
226 if ($err) { | |
227 print STDERR "$str"; | |
228 if ($num > $MAX) { | |
229 print STDERR "$num empty sequences in total in FASTA file $fasta_fname \n"; | |
230 } | |
231 exit 1; | |
232 } | |
233 | |
234 | |
235 # ... ... check for illegal characters in sequences | |
236 for (my $i = 0; $i <= $#names; ++$i) { | |
237 my $str = $seqs[$i]; | |
238 $str =~ s/[abcdghkmnrstuvwy]//gi; | |
239 if (length($str) > 0) { | |
240 $str = join '', sort { $a cmp $b } split(//, $str); | |
241 print STDERR "Sequence " . $names[$i] . " of FASTA file " . $fasta_fname . " contains illegal characters: "; | |
242 for (my $j = 0; $j <= length($str); ++$j) { | |
243 my $out = ($j == 0); | |
244 my $curr = substr($str, $j, 1); | |
245 if (!$out) { | |
246 my $prev = substr($str, $j - 1, 1) ; | |
247 $out = ($curr ne $prev); | |
248 } | |
249 if ($out) { | |
250 print STDERR $curr; | |
251 } | |
252 } | |
253 print STDERR "\n"; | |
254 exit 1; | |
255 } | |
256 } | |
257 | |
258 | |
259 # ... output | |
260 if ($is_verbose) { | |
261 print $#names + 1 . " sequence(s) read.\n"; | |
262 } | |
263 | |
264 | |
265 | |
266 # output quantities | |
267 # ... open output file | |
268 my $OUT; | |
269 if ($has_file_output) { | |
270 if ((-e $out_fname) && !(-f $out_fname)) { | |
271 print STDERR "$out_fname exists and is not a file\n"; | |
272 exit 1; | |
273 } | |
274 if (!open($OUT, ">$out_fname")) { | |
275 print STDERR "cannot create file $out_fname\n"; | |
276 exit 1; | |
277 } | |
278 } else { | |
279 $OUT = *STDOUT; | |
280 } | |
281 | |
282 # ... print header | |
283 if ($has_header) { | |
284 print $OUT "#name\tlength\tCpGs\tGpCs\tCs\tGs\tNs\tCpG o\/e\n"; | |
285 } | |
286 | |
287 | |
288 | |
289 # ... for each sequence calculate CpGo/e ratios and related quantities: | |
290 # - length of the sequence | |
291 # - CpGs present in the sequence | |
292 # - GpCs present in the sequence | |
293 # - Cs the number of C present in the sequence | |
294 # - Gs the number of G present in the sequence | |
295 # - CpG o/e ratio of the sequence | |
296 my $num_short = 0; # number of sequences which are too short | |
297 for my $i (0 .. $#names) { | |
298 my @ar = (); | |
299 @ar = split('\|', $names[$i]); | |
300 my $seqname = $ar[1]; | |
301 my $num_N = () = ( $seqs[$i] =~ m/N/gi ); | |
302 my $len = length($seqs[$i]); | |
303 my $l = $len - $num_N; | |
304 if ($l >= $min_len) { | |
305 my ($num_G, $num_CG); | |
306 if ($context eq 'CpG') { | |
307 $num_G = () = ( $seqs[$i] =~ m/G/gi ); | |
308 $num_CG = () = ( $seqs[$i] =~ m/CG/gi ); | |
309 } elsif ($context eq 'CpA') { | |
310 $num_G = () = ( $seqs[$i] =~ m/A/gi ); | |
311 $num_CG = () = ( $seqs[$i] =~ m/CA/gi ); | |
312 } elsif ($context eq 'CpC') { | |
313 $num_G = () = ( $seqs[$i] =~ m/C/gi ); | |
314 $num_CG = () = ( $seqs[$i] =~ m/CC/gi ); | |
315 } elsif ($context eq 'CpT') { | |
316 $num_G = () = ( $seqs[$i] =~ m/T/gi ); | |
317 $num_CG = () = ( $seqs[$i] =~ m/CT/gi ); | |
318 } else { | |
319 $num_G = 0; | |
320 $num_CG = 0; | |
321 } | |
322 my $num_C = () = ( $seqs[$i] =~ m/C/gi ); | |
323 my $num_TG = () = ( $seqs[$i] =~ m/TG/gi ); | |
324 my $CpGoe; | |
325 if ( ($num_G == 0) || ($num_C == 0) || ($l == 1) || ($num_CG == 0) ) { | |
326 $CpGoe = 0; | |
327 } | |
328 else { | |
329 if ($algo == 1) { | |
330 my $x = $num_CG / ($num_C * $num_G); | |
331 my $y = $l**2 / ($l - 1); | |
332 $CpGoe = $x * $y; | |
333 } elsif ($algo == 2) { | |
334 # cf.Gardiner-Garden and Frommer | |
335 $CpGoe = ($num_CG/($num_C * $num_G))*$l; | |
336 } elsif ($algo == 3) { | |
337 # cf. Zeng and Yi | |
338 $CpGoe = ($num_CG / $l)/(($num_C + $num_G)/$l)**2; | |
339 } elsif ($algo == 4) { | |
340 # cf. Saxonov, Berg and Brutlag | |
341 $CpGoe = $num_CG / (($num_C + $num_G)/2)**2; | |
342 } | |
343 } | |
344 print $OUT $names[$i] . "\t"; | |
345 if ($is_detailed) { | |
346 if ($algo == 3) { | |
347 print $OUT $len . "\t" . $num_CG . "\t" . $num_TG . "\t" .$num_C. "\t" .$num_G. "\t" .$num_N. "\t"; | |
348 } else { | |
349 print $OUT $len . "\t" . $num_CG . "\t" . $num_C . "\t" .$num_G. "\t" .$num_N. "\t"; | |
350 } | |
351 } | |
352 print $OUT $CpGoe . "\n"; | |
353 } else { | |
354 ++$num_short; | |
355 } | |
356 } | |
357 if ($is_verbose) { | |
358 print $num_short . " sequence(s) discarded due to short length.\n"; | |
359 } | |
360 | |
361 if ($has_file_output) { | |
362 my $res = close($OUT); | |
363 if (!$res) { | |
364 print STDERR "could not close $out_fname\n"; | |
365 exit 1; | |
366 } | |
367 } | |
368 |