comparison breakMAF.pl @ 2:434332033e82 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/rnacode commit b6d3800e260cdfcbe99e5f056344c3df101dad00
author rnateam
date Fri, 13 Apr 2018 07:40:08 -0400
parents
children
comparison
equal deleted inserted replaced
1:7a84c6c1c4e0 2:434332033e82
1 #!/usr/bin/perl -w
2
3 use strict;
4 use Getopt::Long;
5 use POSIX qw(ceil floor);
6
7 my $maxLength = 400;
8 my $desiredLength = 200;
9 my $help = 0;
10
11 GetOptions(
12 "maxLength:i" => \$maxLength,
13 "desiredLength:i" => \$desiredLength,
14 "help" => \$help,
15 "h" => \$help
16 );
17
18 if ($help) {
19 print STDERR "\nbreakMAF [options] < input.maf > output.maf\n\n";
20 print STDERR "Options:\n";
21 print STDERR " maxLength: Break all blocks longer than that (default: 400 columns)\n";
22 print STDERR " desiredLength: Try to create blocks of this size (default: 200 columns)\n";
23 exit(1);
24 }
25
26 $/ = 'a score=';
27
28 while ( my $block = <> ) {
29
30 $block = 'a score=' . $block;
31
32 my $currAln = readMAF($block)->[0];
33
34 next if not defined($currAln);
35
36 my $length = length( $currAln->[0]->{seq} );
37
38 if ( $length > $maxLength ) {
39
40 my $breakN = int( $length / $desiredLength );
41 my $chunkLength = ceil( $length / $breakN );
42
43 my $from = 0;
44
45 while (1) {
46 my $to = $from + $chunkLength;
47
48 if ( $to > $length ) {
49 $to = $length;
50 }
51
52 my $slice = sliceAlnByColumn( $currAln, $from, $to );
53
54 print formatAln( $slice, 'maf' ), "\n";
55
56 $from = $to;
57
58 last if $from == $length;
59 }
60
61 } else {
62 print formatAln( $currAln, 'maf' ), "\n";
63 }
64 }
65
66 ######################################################################
67 #
68 # readMAF($string)
69 #
70 # Converts the MAF in string to internal alignment format. Returns
71 # list of usual array references.
72 #
73 ######################################################################
74
75 sub readMAF {
76
77 my $string = shift;
78
79 return [] if $string eq '';
80
81 my @input = split( "\n", $string );
82
83 #open(FILE,"<$file") || die("Could not read $file ($!)");
84
85 my @outAlns = ();
86 my @aln = ();
87
88 foreach my $i ( 0 .. $#input ) {
89
90 $_ = $input[$i];
91
92 next if (/^\s?\#/);
93 next if (/^\s?a/);
94
95 if (/^\s?s/) {
96 ( my $dummy, my $name, my $start, my $length, my $strand, my $fullLength, my $seq ) = split;
97
98 my $end = $start + $length;
99
100 ( my $org, my $chrom ) = split( /\./, $name );
101
102 push @aln, {
103 name => $name,
104 org => $org,
105 chrom => $chrom,
106 start => $start,
107 end => $end,
108 seq => $seq,
109 strand => $strand
110 };
111 }
112
113 if ( /^\s?$/ and @aln ) {
114 push @outAlns, [@aln];
115 @aln = ();
116 }
117 if ( ( not defined $input[ $i + 1 ] ) and (@aln) ) {
118 push @outAlns, [@aln];
119 @aln = ();
120 }
121 }
122 return \@outAlns;
123 }
124
125 sub formatAln {
126
127 my @aln = @{ $_[0] };
128
129 my $format = $_[1];
130
131 $format = lc($format);
132
133 my @alnSeqs = ();
134 my @alnNames = ();
135
136 my $counter = 1;
137
138 foreach my $row (@aln) {
139
140 my $name = "seq$counter";
141 $counter++;
142 if ( $row->{name} ) {
143 $name = ( $row->{name} );
144 } elsif ( $row->{org} and $row->{chrom} ) {
145 $name = "$row->{org}.$row->{chrom}";
146 }
147
148 my $start = $row->{start};
149 my $end = $row->{end};
150 my $strand = $row->{strand};
151
152 my $pos = '';
153
154 if ( defined $start
155 and defined $end
156 and defined $strand
157 and ( $format ne 'phylip' ) ) {
158 ( $start, $end ) = ( $end, $start ) if ( $strand eq '-' );
159 $pos = "/$start-$end";
160 }
161
162 push @alnNames, "$name$pos";
163 push @alnSeqs, $row->{seq};
164
165 }
166
167 my $output = '';
168
169 if ( $format eq 'clustal' ) {
170
171 $output = "CLUSTAL W(1.81) multiple sequence alignment\n\n\n";
172 my $maxName = 0;
173
174 foreach my $name (@alnNames) {
175 $maxName = ( $maxName < length($name) ) ? length($name) : $maxName;
176 }
177
178 for my $i ( 0 .. $#alnNames ) {
179 my $buffer = " " x ( ( $maxName + 6 ) - length( $alnNames[$i] ) );
180 $alnNames[$i] .= $buffer;
181 }
182 my $columnWidth = 60;
183 my $currPos = 0;
184 my $length = length( $alnSeqs[0] );
185
186 while ( $currPos < $length ) {
187 for my $i ( 0 .. $#alnNames ) {
188 $output .= $alnNames[$i];
189 $output .= substr( $alnSeqs[$i], $currPos, $columnWidth );
190 $output .= "\n";
191 }
192 $output .= "\n\n";
193 $currPos += $columnWidth;
194 }
195 } elsif ( $format eq 'fasta' ) {
196 foreach my $i ( 0 .. $#alnNames ) {
197 my $name = $alnNames[$i];
198 my $seq = $alnSeqs[$i];
199 $seq =~ s/(.{60})/$1\n/g;
200 $output .= ">$name\n$seq\n";
201 }
202 } elsif ( $format eq 'phylip' ) {
203 my $length = length( $alnSeqs[0] );
204 my $N = @alnSeqs;
205 $output .= " $N $length\n";
206 foreach my $i ( 0 .. $#alnNames ) {
207 my $name = $alnNames[$i];
208 my $seq = $alnSeqs[$i];
209 $seq =~ s/(.{60})/$1\n/g;
210 $output .= "$name\n$seq\n";
211 }
212 } elsif ( $format eq 'maf' ) {
213 $output .= "a score=0\n";
214 foreach my $row (@aln) {
215 my $length = $row->{end} - $row->{start};
216 $output .= "s $row->{org}.$row->{chrom} $row->{start} $length $row->{strand} 0 $row->{seq}\n";
217 }
218 }
219 return $output;
220
221 }
222
223 ######################################################################
224 #
225 # sliceAlnByColumn(\@aln ref-to-alignment, $start int, $end int)
226 #
227 # Returns slice of alignment specified by alignment column.
228 #
229 # \@aln ... alignment in list of hash format
230 # $start, $end ... slice to cut
231 #
232 # Returns reference to alignment in list of hash format. This is a new
233 # alignment, i.e. the input is not sliced in place
234 #
235 ######################################################################
236
237 sub sliceAlnByColumn {
238
239 my @aln = @{ $_[0] };
240 shift;
241 ( my $start, my $end ) = @_;
242
243 # correct ends without warning if outside of valid range
244 $start = 0 if ( $start < 0 );
245 $end = length( $aln[0]->{seq} ) if ( $end > length( $aln[0]->{seq} ) );
246
247 #my @newAln=@aln;
248
249 # make deep copy of list of hash
250 my @newAln = ();
251 foreach (@aln) {
252 push @newAln, { %{$_} };
253 }
254
255 foreach my $i ( 0 .. $#newAln ) {
256
257 my $oldStart = $newAln[$i]->{start};
258 my $oldEnd = $newAln[$i]->{end};
259
260 $newAln[$i]->{start} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $start );
261 $newAln[$i]->{end} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $end - 1 ) + 1;
262 $newAln[$i]->{seq} = substr( $newAln[$i]->{seq}, $start, $end - $start );
263
264 }
265
266 return ( [@newAln] );
267
268 }
269
270 ######################################################################
271 #
272 # alnCol2genomePos($seq string, $start int, $col int)
273 #
274 # Calculates the genomic position corresponding to a column in an
275 # alignment.
276 #
277 # $seq ... sequence from alignment (i.e. letters with gaps)
278 # $start ... Genomic position of first letter in $seq
279 # $col ... column in the alignment that is to be mapped
280 #
281 # Returns genomic position. No error handling, so $col must be a valid
282 # column of the string $seq.
283 #
284 #######################################################################
285
286 sub alnCol2genomePos {
287
288 ( my $seq, my $start, my $col ) = @_;
289
290 $seq =~ s/\./-/g; #Convert all gaps to "-"
291
292 my $newPos = $start;
293
294 # if gap only...
295 return $start if ( $seq =~ /^-+$/ );
296
297 ( my $tmp ) = $seq =~ /(-*)[^-]/;
298
299 my $leadingGaps = length($tmp);
300
301 # if column is in gap before first letter,
302 # return position of the first letter
303 return $start if ( $col < $leadingGaps );
304
305 $newPos = $start - 1;
306
307 for my $i ( $leadingGaps .. $col ) {
308 $newPos++ if ( ( my $t = substr( $seq, $i, 1 ) ) ne '-' );
309 }
310 return $newPos;
311 }
312