annotate CpGoe.pl @ 1:cb8bac9d0d37 draft default tip

planemo upload commit 4ec21642211c1fb7427e4c98fdf0f4b9a3f8a185-dirty
author cristian
date Thu, 07 Sep 2017 10:21:45 -0400
parents 1535ffddeff4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
1 #! /usr/bin/perl
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
2
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
3 ####################################
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
4 #
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
5 # CpGoe.pl -f fasta-file -o output_file -m minlength
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
6 #
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
7 # reads concatanated fasta files, writes name, length, CpGs and GpCs, CpGo/e ratios and other quantities into TAB file for each sequence
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
8 #####################################
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
9
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
10 use diagnostics;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
11 use strict;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
12 use Carp;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
13 use FileHandle;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
14 use File::Path;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
15 use File::Basename;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
16 use Data::Dumper;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
17 use Getopt::Std;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
18
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
19 my $VERSION = "1.0";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
20 $Getopt::Std::STANDARD_HELP_VERSION = 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
21
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
22
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
23 # Called when --help set as flag
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
24 sub HELP_MESSAGE
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
25 {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
26 print "Description: CpGoe processes a FASTA file and outputs the CpGo/e ratios and - if specified - further quantities\n\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
27 "Usage: CpGoe.pl [OPTION] -f FASTA_FILE \n\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
28 "Options:\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
29 " -f FASTA_FILE Name of FASTA file containing the input sequences. REQUIRED.\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
30 " -o OUT_FILE name of output file containing the results\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
31 " -m MIN_LEN minimum length of sequences, shorter sequences are discarded\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
32 " -c CONTEXT Context to calculate the ratio (CpA, CpC, CpG, CpT) default CpG.\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
33 " -a ALGORITHM Algorithm used to calculate CpGo/e ratio. Default: 1\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
34 " 1 - (CpG / (C * G)) * (L^2 / L-1)\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
35 " 2 - (CpG / (C * G)) * (L^2 / L)\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
36 " 3 - (CpG / L) / (G + C)^2\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
37 " 4 - CpG / ( (C + G)/2 )^2\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
38 " -d detailed output, providing other quantities additional to the CpGo/e ratios\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
39 " -h output header line\n".
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
40 " -v verbose messages\n" ;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
41 exit 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
42 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
43
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
44
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
45
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
46 # Called when --version set as flag
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
47 sub VERSION_MESSAGE
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
48 {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
49 print "CpGoe $VERSION\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
50 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
51
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
52
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
53
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
54 # Command line parsing
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
55 # ... read argument
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
56 my %opts;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
57 getopts('f:o:m:c:a:dvh', \%opts);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
58 #if ($#ARGV != 0) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
59 # print STDERR "Exactly one argument has to be provided, the name of the input FASTA file.\n" .
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
60 # "Moreover, the options must be listed first, then the name of the input FASTA file.\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
61 # exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
62 #}
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
63 my $fasta_fname;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
64 if (exists($opts{'f'})) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
65 $fasta_fname = $opts{'f'};
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
66 } else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
67 HELP_MESSAGE
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
68 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
69
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
70 # ... read options
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
71 my $out_fname;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
72 my $has_file_output;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
73 if (exists($opts{'o'})) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
74 $out_fname = $opts{'o'};
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
75 $has_file_output = 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
76 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
77 else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
78 $has_file_output = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
79 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
80
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
81 my $min_len;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
82 if (exists($opts{'m'})) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
83 $min_len = $opts{'m'};
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
84 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
85 else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
86 $min_len = 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
87 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
88
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
89 my $algo = 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
90 if (exists($opts{'a'})) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
91 $algo = $opts{'a'};
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
92 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
93
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
94 my $context = 'CpG';
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
95 if (exists($opts{'c'})) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
96 $context = $opts{'c'};
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
97 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
98
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
99 my $is_verbose = exists($opts{'v'});
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
100 my $has_header = exists($opts{'h'});
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
101 my $is_detailed = exists($opts{'d'});
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
102
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
103
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
104
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
105 # read input file and split into fasta sequences on the fly
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
106 # ... check whether input FASTA file exists
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
107 my $FASTA;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
108 if (-e $fasta_fname) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
109 if (-f $fasta_fname) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
110 my $res = open($FASTA, $fasta_fname);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
111 if (!$res) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
112 print STDERR "could not open $fasta_fname\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
113 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
114 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
115 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
116 else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
117 print STDERR "$fasta_fname is not a file\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
118 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
119 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
120 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
121 else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
122 print STDERR "cannot open file $fasta_fname\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
123 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
124 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
125
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
126
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
127 # ... determine which linebreak is used (linux / windows / mac)
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
128 my $found_n = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
129 my $found_r = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
130 my $found_rn = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
131 while ( defined( my $ch = getc($FASTA) ) ) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
132 if ($ch eq "\n") {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
133 if ($found_r) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
134 $found_rn = 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
135 $found_r = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
136 } else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
137 $found_n = 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
138 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
139 last;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
140 } elsif ($ch eq "\r") {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
141 $found_r = 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
142 } else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
143 if ($found_r) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
144 last;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
145 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
146 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
147 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
148 close($FASTA);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
149 if ($found_r + $found_n + $found_rn != 1) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
150 print STDERR "something went wrong determining the linebreaks used in $fasta_fname\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
151 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
152
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
153
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
154 # ... read in sequences
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
155 my $old_linebreak = $/;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
156 if ($found_n) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
157 $/ = "\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
158 } elsif ($found_r) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
159 $/ = "\r";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
160 } else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
161 $/ = "\r\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
162 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
163 my $res = open($FASTA, $fasta_fname);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
164 if (!$res) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
165 print STDERR "could not open $fasta_fname\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
166 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
167 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
168
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
169 my @names = (); # names of the sequences
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
170 my @seqs = (); # sequences
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
171 my $is_first = 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
172 while ( <$FASTA> ) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
173 chomp;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
174 if (/^[^#]/) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
175 if ( /^>(\S+)/) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
176 #s/^>|\s+$//g; # remove leading '>' and trailing whitespaces
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
177 #s/\s/_/g; # replace spaces by underscores
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
178 push(@names, $1);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
179 push(@seqs, "");
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
180 $is_first = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
181 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
182 else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
183 if ($is_first) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
184 print STDERR "first non-comment line of FASTA file " . $fasta_fname . " does not start with \'>\'\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
185 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
186 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
187 else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
188 s/[\-\.]*//g; # remove dashes and dots
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
189 $seqs[-1] .= $_;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
190 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
191 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
192 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
193 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
194 $res = close($FASTA);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
195 if (!$res) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
196 print STDERR "could not close $fasta_fname\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
197 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
198 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
199 $/ = $old_linebreak;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
200
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
201 # print Dumper(@names) . "\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
202 # print Dumper(@seqs) . "\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
203
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
204
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
205 # ... check sequences
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
206 # ... ... are there any sequence names?
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
207 if ($#names < 0) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
208 print STDERR "FASTA file $fasta_fname is empty\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
209 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
210 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
211
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
212 # ... ... are there empty sequences?}
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
213 my $str = "";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
214 my $err = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
215 my $num = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
216 my $MAX = 50; # maximum number of notifications about an empty sequence
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
217 for (my $i = 0; $i <= $#names; ++$i) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
218 if ($seqs[$i] eq "") {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
219 if ($num < $MAX) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
220 $str .= "Sequence " . $names[$i] . " in FASTA file $fasta_fname is empty\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
221 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
222 $err = 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
223 ++$num;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
224 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
225 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
226 if ($err) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
227 print STDERR "$str";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
228 if ($num > $MAX) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
229 print STDERR "$num empty sequences in total in FASTA file $fasta_fname \n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
230 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
231 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
232 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
233
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
234
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
235 # ... ... check for illegal characters in sequences
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
236 for (my $i = 0; $i <= $#names; ++$i) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
237 my $str = $seqs[$i];
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
238 $str =~ s/[abcdghkmnrstuvwy]//gi;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
239 if (length($str) > 0) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
240 $str = join '', sort { $a cmp $b } split(//, $str);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
241 print STDERR "Sequence " . $names[$i] . " of FASTA file " . $fasta_fname . " contains illegal characters: ";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
242 for (my $j = 0; $j <= length($str); ++$j) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
243 my $out = ($j == 0);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
244 my $curr = substr($str, $j, 1);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
245 if (!$out) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
246 my $prev = substr($str, $j - 1, 1) ;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
247 $out = ($curr ne $prev);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
248 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
249 if ($out) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
250 print STDERR $curr;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
251 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
252 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
253 print STDERR "\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
254 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
255 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
256 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
257
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
258
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
259 # ... output
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
260 if ($is_verbose) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
261 print $#names + 1 . " sequence(s) read.\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
262 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
263
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
264
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
265
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
266 # output quantities
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
267 # ... open output file
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
268 my $OUT;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
269 if ($has_file_output) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
270 if ((-e $out_fname) && !(-f $out_fname)) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
271 print STDERR "$out_fname exists and is not a file\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
272 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
273 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
274 if (!open($OUT, ">$out_fname")) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
275 print STDERR "cannot create file $out_fname\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
276 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
277 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
278 } else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
279 $OUT = *STDOUT;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
280 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
281
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
282 # ... print header
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
283 if ($has_header) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
284 print $OUT "#name\tlength\tCpGs\tGpCs\tCs\tGs\tNs\tCpG o\/e\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
285 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
286
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
287
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
288
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
289 # ... for each sequence calculate CpGo/e ratios and related quantities:
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
290 # - length of the sequence
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
291 # - CpGs present in the sequence
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
292 # - GpCs present in the sequence
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
293 # - Cs the number of C present in the sequence
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
294 # - Gs the number of G present in the sequence
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
295 # - CpG o/e ratio of the sequence
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
296 my $num_short = 0; # number of sequences which are too short
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
297 for my $i (0 .. $#names) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
298 my @ar = ();
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
299 @ar = split('\|', $names[$i]);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
300 my $seqname = $ar[1];
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
301 my $num_N = () = ( $seqs[$i] =~ m/N/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
302 my $len = length($seqs[$i]);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
303 my $l = $len - $num_N;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
304 if ($l >= $min_len) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
305 my ($num_G, $num_CG);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
306 if ($context eq 'CpG') {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
307 $num_G = () = ( $seqs[$i] =~ m/G/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
308 $num_CG = () = ( $seqs[$i] =~ m/CG/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
309 } elsif ($context eq 'CpA') {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
310 $num_G = () = ( $seqs[$i] =~ m/A/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
311 $num_CG = () = ( $seqs[$i] =~ m/CA/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
312 } elsif ($context eq 'CpC') {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
313 $num_G = () = ( $seqs[$i] =~ m/C/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
314 $num_CG = () = ( $seqs[$i] =~ m/CC/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
315 } elsif ($context eq 'CpT') {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
316 $num_G = () = ( $seqs[$i] =~ m/T/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
317 $num_CG = () = ( $seqs[$i] =~ m/CT/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
318 } else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
319 $num_G = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
320 $num_CG = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
321 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
322 my $num_C = () = ( $seqs[$i] =~ m/C/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
323 my $num_TG = () = ( $seqs[$i] =~ m/TG/gi );
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
324 my $CpGoe;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
325 if ( ($num_G == 0) || ($num_C == 0) || ($l == 1) || ($num_CG == 0) ) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
326 $CpGoe = 0;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
327 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
328 else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
329 if ($algo == 1) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
330 my $x = $num_CG / ($num_C * $num_G);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
331 my $y = $l**2 / ($l - 1);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
332 $CpGoe = $x * $y;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
333 } elsif ($algo == 2) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
334 # cf.Gardiner-Garden and Frommer
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
335 $CpGoe = ($num_CG/($num_C * $num_G))*$l;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
336 } elsif ($algo == 3) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
337 # cf. Zeng and Yi
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
338 $CpGoe = ($num_CG / $l)/(($num_C + $num_G)/$l)**2;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
339 } elsif ($algo == 4) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
340 # cf. Saxonov, Berg and Brutlag
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
341 $CpGoe = $num_CG / (($num_C + $num_G)/2)**2;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
342 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
343 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
344 print $OUT $names[$i] . "\t";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
345 if ($is_detailed) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
346 if ($algo == 3) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
347 print $OUT $len . "\t" . $num_CG . "\t" . $num_TG . "\t" .$num_C. "\t" .$num_G. "\t" .$num_N. "\t";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
348 } else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
349 print $OUT $len . "\t" . $num_CG . "\t" . $num_C . "\t" .$num_G. "\t" .$num_N. "\t";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
350 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
351 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
352 print $OUT $CpGoe . "\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
353 } else {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
354 ++$num_short;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
355 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
356 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
357 if ($is_verbose) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
358 print $num_short . " sequence(s) discarded due to short length.\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
359 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
360
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
361 if ($has_file_output) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
362 my $res = close($OUT);
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
363 if (!$res) {
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
364 print STDERR "could not close $out_fname\n";
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
365 exit 1;
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
366 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
367 }
1535ffddeff4 planemo upload commit a7ac27de550a07fd6a3e3ea3fb0de65f3a10a0e6-dirty
cristian
parents:
diff changeset
368