Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison orthologs/ucsb_hamster/lib/run_genewise.pm @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
| author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
|---|---|
| date | Tue, 11 Mar 2014 12:19:13 -0700 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:5b9a38ec4a39 |
|---|---|
| 1 package run_genewise; | |
| 2 use strict; | |
| 3 $ENV{'WISECONFIGDIR'} = "/home/osiris/galaxy-dist/tools/osiris/orthologs/ucsb_hamster/lib/wisecfg"; | |
| 4 # this module runs genewise on a DNA sequence and a protein sequence | |
| 5 # and then allows to parse this result. | |
| 6 # the constructor creates an object containing a reference to an array | |
| 7 # containing the file content | |
| 8 1; | |
| 9 sub new { | |
| 10 my $self_tmp = []; | |
| 11 my $self; | |
| 12 my ($class, $dna, $prot, $path) = @_; | |
| 13 if (!defined $path) { | |
| 14 $path = '/tmp'; | |
| 15 } | |
| 16 $dna =~ s/R/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
| 17 $dna =~ s/S/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
| 18 $dna =~ s/W/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
| 19 $dna =~ s/D/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
| 20 $dna =~ s/K/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
| 21 $dna =~ s/Y/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
| 22 $dna =~ s/B/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
| 23 $dna =~ s/V/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
| 24 $dna =~ s/M/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
| 25 | |
| 26 # the file names | |
| 27 my $protname = 'protein'; | |
| 28 my $dnaname = 'dna'; | |
| 29 | |
| 30 ## print the two sequences to default path /tmp/ | |
| 31 open (DNA, ">$path/dna.fa") or die "could not open $path/dna.fa for writing\n"; | |
| 32 print DNA ">$dnaname\n$dna"; | |
| 33 close DNA; | |
| 34 open (PROTEIN, ">$path/prot.fa") or die "could not open $path/prot.fa for writing\n"; | |
| 35 print PROTEIN ">$protname\n$prot"; | |
| 36 close PROTEIN; | |
| 37 | |
| 38 ## run genewise on the two sequences | |
| 39 `echo \$WISECONFIGDIR`; | |
| 40 | |
| 41 # $self_tmp = [`.\/genewise -trans -cdna -pep -sum $path/prot.fa $path/dna.fa`]; | |
| 42 #THO--For Galaxy run Genewise in the path | |
| 43 $self_tmp = [`genewise -trans -cdna -pep -sum $path/prot.fa $path/dna.fa`]; | |
| 44 for (my $i = 0; $i < @$self_tmp; $i++) { | |
| 45 $self_tmp->[$i] =~ s/\s{1,}$//; | |
| 46 } | |
| 47 $self->{gw} = $self_tmp; | |
| 48 $self->{nt_seq} = $dna; | |
| 49 $self->{prot_seq} = $prot; | |
| 50 $self->{protname} = $protname; | |
| 51 $self->{dnaname} = $dnaname; | |
| 52 $self->{gw_count} = @$self_tmp; | |
| 53 $self->{get_indel} = 1; ## per default the indel-part is recovererd, rather than masked by 'N'. See code for details | |
| 54 $self->{indels} = _GetIndels($self_tmp); | |
| 55 bless ($self, $class); | |
| 56 return $self;} | |
| 57 ################# | |
| 58 ## sub score extract the score for the alignment | |
| 59 sub score { | |
| 60 my $self = shift; | |
| 61 my $score; | |
| 62 for (my $i = 0; $i < $self->{gw_count}; $i ++) { | |
| 63 if ($self->{gw}->[$i] =~ /^(\d{1,}\.{0,1}\d{0,}).*/) { | |
| 64 $score = $1; | |
| 65 last; | |
| 66 } | |
| 67 } | |
| 68 return ($score); | |
| 69 } | |
| 70 ################## | |
| 71 sub protein { | |
| 72 my $self = shift; | |
| 73 my $gw = $self->{gw}; | |
| 74 my $prot = ''; | |
| 75 for (my $i = 0; $i < @$gw; $i++) { | |
| 76 if ($gw->[$i] =~ />.*\.pep/) { #the protein seq starts | |
| 77 my $count = 1; | |
| 78 while ($gw->[$i+$count] ne '//') { | |
| 79 my $protpart = $gw->[$i+$count]; | |
| 80 chomp $protpart; | |
| 81 $prot .= $protpart; | |
| 82 $count ++; | |
| 83 } | |
| 84 } | |
| 85 elsif (length $prot > 0) { | |
| 86 last; | |
| 87 } | |
| 88 } | |
| 89 return($prot); | |
| 90 } | |
| 91 ################## | |
| 92 sub translation { | |
| 93 my $self = shift; | |
| 94 my $finish = 0; | |
| 95 my $translated_seq = ''; | |
| 96 my @transtmp; | |
| 97 | |
| 98 ## step 1: extract the relevant info from the genewise output | |
| 99 | |
| 100 for (my $i = 0; $i < $self->{gw_count}; $i++) { | |
| 101 if ($self->{gw}->[$i] =~ />.*.tr/) {# a translated bit starts | |
| 102 while ($self->{gw}->[$i] !~ '//') { | |
| 103 push @transtmp, $self->{gw}->[$i]; | |
| 104 $i++; | |
| 105 } | |
| 106 last; # end the for loop since nothing left to be done | |
| 107 } | |
| 108 } | |
| 109 | |
| 110 ## step two: get the sequences | |
| 111 my $count = -1; | |
| 112 my $trans; | |
| 113 for (my $i = 0; $i < @transtmp; $i++) { | |
| 114 if ($transtmp[$i] =~ />/) { | |
| 115 $count++; | |
| 116 $trans->[$count]->{seq} = ''; # initialize | |
| 117 if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { | |
| 118 $trans->[$count]->{start} = $1; | |
| 119 $trans->[$count]->{end} = $2; | |
| 120 } | |
| 121 } | |
| 122 else { | |
| 123 $trans->[$count]->{seq} .= $transtmp[$i]; | |
| 124 } | |
| 125 } | |
| 126 | |
| 127 ## step 3: connect the fragments | |
| 128 if (@$trans == 1) { | |
| 129 $translated_seq = $trans->[0]->{seq}; | |
| 130 } | |
| 131 else { | |
| 132 for (my $i = 0; $i < @$trans; $i++) { | |
| 133 $translated_seq .= $trans->[$i]->{seq}; | |
| 134 if ($i < (@$trans - 1)) { | |
| 135 my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; | |
| 136 $translated_seq .= 'X'; | |
| 137 } | |
| 138 } | |
| 139 } | |
| 140 return($translated_seq); | |
| 141 } | |
| 142 | |
| 143 ################## | |
| 144 sub codons { | |
| 145 my $self = shift; | |
| 146 my $finish = 0; | |
| 147 my $codon_seq = ''; | |
| 148 my @transtmp; | |
| 149 | |
| 150 ## step 1: extract the relevant info from the genewise output | |
| 151 for (my $i = 0; $i < $self->{gw_count}; $i++) { | |
| 152 if ($self->{gw}->[$i] =~ />.*sp$/) {# the codons set starts | |
| 153 while ($self->{gw}->[$i] !~ '//') { | |
| 154 push @transtmp, $self->{gw}->[$i]; | |
| 155 $i++; | |
| 156 } | |
| 157 last; # end the for loop since nothing left to be done | |
| 158 } | |
| 159 } | |
| 160 | |
| 161 ## step two: get the sequences | |
| 162 my $count = -1; | |
| 163 my $trans; | |
| 164 for (my $i = 0; $i < @transtmp; $i++) { | |
| 165 if ($transtmp[$i] =~ />/) { | |
| 166 $count++; | |
| 167 $trans->[$count]->{seq} = ''; # initialize | |
| 168 if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { | |
| 169 $trans->[$count]->{start} = $1; | |
| 170 $trans->[$count]->{end} = $2; | |
| 171 } | |
| 172 } | |
| 173 else { | |
| 174 $transtmp[$i] =~ tr/a-z/A-Z/; | |
| 175 $trans->[$count]->{seq} .= $transtmp[$i]; | |
| 176 } | |
| 177 } | |
| 178 | |
| 179 ## step 3: connect the fragments | |
| 180 if ( @$trans == 1) { | |
| 181 $codon_seq = $trans->[0]->{seq}; | |
| 182 } | |
| 183 else { | |
| 184 for (my $i = 0; $i < @$trans; $i++) { | |
| 185 $codon_seq .= $trans->[$i]->{seq}; | |
| 186 if ($i < (@$trans - 1)) { | |
| 187 my $indel = ''; | |
| 188 my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; | |
| 189 | |
| 190 ## now decide whether the nts that did not got translated are masked by | |
| 191 ## 'N' or whether they will be represented as lower case letters | |
| 192 if ($self->{get_indel}) { | |
| 193 $indel = substr($self->{nt_seq}, $trans->[$i]->{end}, $missing); | |
| 194 $indel =~ tr/A-Z/a-z/; | |
| 195 } | |
| 196 else { | |
| 197 $indel = 'N' x $missing; | |
| 198 } | |
| 199 ## now append gap characters until the frame is recovered. Not that the gap | |
| 200 ## characters are added to the end of the indel-part. Thus, the codons are | |
| 201 ## not considered. | |
| 202 while (length($indel)%3 != 0) { | |
| 203 $indel .= '-'; | |
| 204 } | |
| 205 | |
| 206 $codon_seq .= $indel; | |
| 207 } | |
| 208 } | |
| 209 } | |
| 210 return ($codon_seq); | |
| 211 } | |
| 212 ########################### | |
| 213 sub protein_borders { | |
| 214 my $self = shift; | |
| 215 my $gw = $self->{gw}; | |
| 216 for (my $i = 0; $i < @$gw; $i++) { | |
| 217 if ($gw->[$i] =~ /Bits.*introns$/) { | |
| 218 my ($start, $end) = $gw->[$i+1] =~ /.*$self->{protname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; | |
| 219 return($start, $end); | |
| 220 } | |
| 221 else { | |
| 222 die "no protein-start and end could not be determnined. Check genewise command\n"; | |
| 223 } | |
| 224 } | |
| 225 } | |
| 226 ########################## | |
| 227 sub cdna_borders { | |
| 228 my $self = shift; | |
| 229 my $gw = $self->{gw}; | |
| 230 for (my $i = 0; $i < @$gw; $i++) { | |
| 231 if ($gw->[$i] =~ /Bits.*introns$/) { | |
| 232 my ($start, $end) = $gw->[$i+1] =~ /.*$self->{dnaname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; | |
| 233 return($start, $end); | |
| 234 } | |
| 235 else { | |
| 236 die "no cdna-start and end could not be determnined. Check genewise command\n"; | |
| 237 } | |
| 238 } | |
| 239 } | |
| 240 ########################## | |
| 241 sub _GetIndels { | |
| 242 my $gw = shift; | |
| 243 my $indel; | |
| 244 for (my $i = 0; $i < @$gw; $i++) { | |
| 245 if ($gw->[$i] =~ /Bits/) { | |
| 246 $indel = $gw->[$i+1] =~ /.*([0-9]{1,})/; | |
| 247 return($indel); | |
| 248 } | |
| 249 } | |
| 250 } |
