Mercurial > repos > cristian > notos
diff CpGoe.pl @ 0:1535ffddeff4 draft
planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
author | cristian |
---|---|
date | Thu, 07 Sep 2017 08:51:57 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CpGoe.pl Thu Sep 07 08:51:57 2017 -0400 @@ -0,0 +1,368 @@ +#! /usr/bin/perl + +#################################### +# +# CpGoe.pl -f fasta-file -o output_file -m minlength +# +# reads concatanated fasta files, writes name, length, CpGs and GpCs, CpGo/e ratios and other quantities into TAB file for each sequence +##################################### + +use diagnostics; +use strict; +use Carp; +use FileHandle; +use File::Path; +use File::Basename; +use Data::Dumper; +use Getopt::Std; + +my $VERSION = "1.0"; +$Getopt::Std::STANDARD_HELP_VERSION = 1; + + +# Called when --help set as flag +sub HELP_MESSAGE +{ + print "Description: CpGoe processes a FASTA file and outputs the CpGo/e ratios and - if specified - further quantities\n\n" . + "Usage: CpGoe.pl [OPTION] -f FASTA_FILE \n\n" . + "Options:\n" . + " -f FASTA_FILE Name of FASTA file containing the input sequences. REQUIRED.\n" . + " -o OUT_FILE name of output file containing the results\n" . + " -m MIN_LEN minimum length of sequences, shorter sequences are discarded\n" . + " -c CONTEXT Context to calculate the ratio (CpA, CpC, CpG, CpT) default CpG.\n" . + " -a ALGORITHM Algorithm used to calculate CpGo/e ratio. Default: 1\n" . + " 1 - (CpG / (C * G)) * (L^2 / L-1)\n" . + " 2 - (CpG / (C * G)) * (L^2 / L)\n" . + " 3 - (CpG / L) / (G + C)^2\n" . + " 4 - CpG / ( (C + G)/2 )^2\n" . + " -d detailed output, providing other quantities additional to the CpGo/e ratios\n" . + " -h output header line\n". + " -v verbose messages\n" ; + exit 0; +} + + + +# Called when --version set as flag +sub VERSION_MESSAGE +{ + print "CpGoe $VERSION\n"; +} + + + +# Command line parsing +# ... read argument +my %opts; +getopts('f:o:m:c:a:dvh', \%opts); +#if ($#ARGV != 0) { +# print STDERR "Exactly one argument has to be provided, the name of the input FASTA file.\n" . +# "Moreover, the options must be listed first, then the name of the input FASTA file.\n"; +# exit 1; +#} +my $fasta_fname; +if (exists($opts{'f'})) { + $fasta_fname = $opts{'f'}; +} else { + HELP_MESSAGE +} + +# ... read options +my $out_fname; +my $has_file_output; +if (exists($opts{'o'})) { + $out_fname = $opts{'o'}; + $has_file_output = 1; +} +else { + $has_file_output = 0; +} + +my $min_len; +if (exists($opts{'m'})) { + $min_len = $opts{'m'}; +} +else { + $min_len = 1; +} + +my $algo = 1; +if (exists($opts{'a'})) { + $algo = $opts{'a'}; +} + +my $context = 'CpG'; +if (exists($opts{'c'})) { + $context = $opts{'c'}; +} + +my $is_verbose = exists($opts{'v'}); +my $has_header = exists($opts{'h'}); +my $is_detailed = exists($opts{'d'}); + + + +# read input file and split into fasta sequences on the fly +# ... check whether input FASTA file exists +my $FASTA; +if (-e $fasta_fname) { + if (-f $fasta_fname) { + my $res = open($FASTA, $fasta_fname); + if (!$res) { + print STDERR "could not open $fasta_fname\n"; + exit 1; + } + } + else { + print STDERR "$fasta_fname is not a file\n"; + exit 1; + } +} +else { + print STDERR "cannot open file $fasta_fname\n"; + exit 1; +} + + +# ... determine which linebreak is used (linux / windows / mac) +my $found_n = 0; +my $found_r = 0; +my $found_rn = 0; +while ( defined( my $ch = getc($FASTA) ) ) { + if ($ch eq "\n") { + if ($found_r) { + $found_rn = 1; + $found_r = 0; + } else { + $found_n = 1; + } + last; + } elsif ($ch eq "\r") { + $found_r = 1; + } else { + if ($found_r) { + last; + } + } +} +close($FASTA); +if ($found_r + $found_n + $found_rn != 1) { + print STDERR "something went wrong determining the linebreaks used in $fasta_fname\n"; +} + + +# ... read in sequences +my $old_linebreak = $/; +if ($found_n) { + $/ = "\n"; +} elsif ($found_r) { + $/ = "\r"; +} else { + $/ = "\r\n"; +} +my $res = open($FASTA, $fasta_fname); +if (!$res) { + print STDERR "could not open $fasta_fname\n"; + exit 1; +} + +my @names = (); # names of the sequences +my @seqs = (); # sequences +my $is_first = 1; +while ( <$FASTA> ) { + chomp; + if (/^[^#]/) { + if ( /^>(\S+)/) { + #s/^>|\s+$//g; # remove leading '>' and trailing whitespaces + #s/\s/_/g; # replace spaces by underscores + push(@names, $1); + push(@seqs, ""); + $is_first = 0; + } + else { + if ($is_first) { + print STDERR "first non-comment line of FASTA file " . $fasta_fname . " does not start with \'>\'\n"; + exit 1; + } + else { + s/[\-\.]*//g; # remove dashes and dots + $seqs[-1] .= $_; + } + } + } +} +$res = close($FASTA); +if (!$res) { + print STDERR "could not close $fasta_fname\n"; + exit 1; +} +$/ = $old_linebreak; + +# print Dumper(@names) . "\n"; +# print Dumper(@seqs) . "\n"; + + +# ... check sequences +# ... ... are there any sequence names? +if ($#names < 0) { + print STDERR "FASTA file $fasta_fname is empty\n"; + exit 1; +} + +# ... ... are there empty sequences?} +my $str = ""; +my $err = 0; +my $num = 0; +my $MAX = 50; # maximum number of notifications about an empty sequence +for (my $i = 0; $i <= $#names; ++$i) { + if ($seqs[$i] eq "") { + if ($num < $MAX) { + $str .= "Sequence " . $names[$i] . " in FASTA file $fasta_fname is empty\n"; + } + $err = 1; + ++$num; + } +} +if ($err) { + print STDERR "$str"; + if ($num > $MAX) { + print STDERR "$num empty sequences in total in FASTA file $fasta_fname \n"; + } + exit 1; +} + + +# ... ... check for illegal characters in sequences +for (my $i = 0; $i <= $#names; ++$i) { + my $str = $seqs[$i]; + $str =~ s/[abcdghkmnrstuvwy]//gi; + if (length($str) > 0) { + $str = join '', sort { $a cmp $b } split(//, $str); + print STDERR "Sequence " . $names[$i] . " of FASTA file " . $fasta_fname . " contains illegal characters: "; + for (my $j = 0; $j <= length($str); ++$j) { + my $out = ($j == 0); + my $curr = substr($str, $j, 1); + if (!$out) { + my $prev = substr($str, $j - 1, 1) ; + $out = ($curr ne $prev); + } + if ($out) { + print STDERR $curr; + } + } + print STDERR "\n"; + exit 1; + } +} + + +# ... output +if ($is_verbose) { + print $#names + 1 . " sequence(s) read.\n"; +} + + + +# output quantities +# ... open output file +my $OUT; +if ($has_file_output) { + if ((-e $out_fname) && !(-f $out_fname)) { + print STDERR "$out_fname exists and is not a file\n"; + exit 1; + } + if (!open($OUT, ">$out_fname")) { + print STDERR "cannot create file $out_fname\n"; + exit 1; + } +} else { + $OUT = *STDOUT; +} + +# ... print header +if ($has_header) { + print $OUT "#name\tlength\tCpGs\tGpCs\tCs\tGs\tNs\tCpG o\/e\n"; +} + + + +# ... for each sequence calculate CpGo/e ratios and related quantities: +# - length of the sequence +# - CpGs present in the sequence +# - GpCs present in the sequence +# - Cs the number of C present in the sequence +# - Gs the number of G present in the sequence +# - CpG o/e ratio of the sequence +my $num_short = 0; # number of sequences which are too short +for my $i (0 .. $#names) { + my @ar = (); + @ar = split('\|', $names[$i]); + my $seqname = $ar[1]; + my $num_N = () = ( $seqs[$i] =~ m/N/gi ); + my $len = length($seqs[$i]); + my $l = $len - $num_N; + if ($l >= $min_len) { + my ($num_G, $num_CG); + if ($context eq 'CpG') { + $num_G = () = ( $seqs[$i] =~ m/G/gi ); + $num_CG = () = ( $seqs[$i] =~ m/CG/gi ); + } elsif ($context eq 'CpA') { + $num_G = () = ( $seqs[$i] =~ m/A/gi ); + $num_CG = () = ( $seqs[$i] =~ m/CA/gi ); + } elsif ($context eq 'CpC') { + $num_G = () = ( $seqs[$i] =~ m/C/gi ); + $num_CG = () = ( $seqs[$i] =~ m/CC/gi ); + } elsif ($context eq 'CpT') { + $num_G = () = ( $seqs[$i] =~ m/T/gi ); + $num_CG = () = ( $seqs[$i] =~ m/CT/gi ); + } else { + $num_G = 0; + $num_CG = 0; + } + my $num_C = () = ( $seqs[$i] =~ m/C/gi ); + my $num_TG = () = ( $seqs[$i] =~ m/TG/gi ); + my $CpGoe; + if ( ($num_G == 0) || ($num_C == 0) || ($l == 1) || ($num_CG == 0) ) { + $CpGoe = 0; + } + else { + if ($algo == 1) { + my $x = $num_CG / ($num_C * $num_G); + my $y = $l**2 / ($l - 1); + $CpGoe = $x * $y; + } elsif ($algo == 2) { + # cf.Gardiner-Garden and Frommer + $CpGoe = ($num_CG/($num_C * $num_G))*$l; + } elsif ($algo == 3) { + # cf. Zeng and Yi + $CpGoe = ($num_CG / $l)/(($num_C + $num_G)/$l)**2; + } elsif ($algo == 4) { + # cf. Saxonov, Berg and Brutlag + $CpGoe = $num_CG / (($num_C + $num_G)/2)**2; + } + } + print $OUT $names[$i] . "\t"; + if ($is_detailed) { + if ($algo == 3) { + print $OUT $len . "\t" . $num_CG . "\t" . $num_TG . "\t" .$num_C. "\t" .$num_G. "\t" .$num_N. "\t"; + } else { + print $OUT $len . "\t" . $num_CG . "\t" . $num_C . "\t" .$num_G. "\t" .$num_N. "\t"; + } + } + print $OUT $CpGoe . "\n"; + } else { + ++$num_short; + } +} +if ($is_verbose) { + print $num_short . " sequence(s) discarded due to short length.\n"; +} + +if ($has_file_output) { + my $res = close($OUT); + if (!$res) { + print STDERR "could not close $out_fname\n"; + exit 1; + } +} +