Mercurial > repos > mcharles > genephys
comparison genephys/GenePhys.pl @ 3:8dfa09868059 draft
Uploaded
author | mcharles |
---|---|
date | Fri, 24 Oct 2014 05:54:20 -0400 |
parents | |
children | 3d79224aa2dc |
comparison
equal
deleted
inserted
replaced
2:c52e74b98773 | 3:8dfa09868059 |
---|---|
1 #!/usr/bin/perl | |
2 #V1.1.0 integrated gene extraction | |
3 #V1.0.2 integrated segment fasta extraction | |
4 #V1.0.1 added log and option | |
5 #V1.0.0 | |
6 use strict; | |
7 use warnings; | |
8 use Getopt::Long; | |
9 | |
10 my $input_blast_files; | |
11 my $input_genes_position_file; | |
12 my $input_assembly_file; | |
13 my $input_markers_position_file; | |
14 my $input_markers_file; | |
15 my $log_file; | |
16 my $output_fasta_file; | |
17 my $output_segment_file; | |
18 my $output_genes_list_file; | |
19 my $EXTRACT_SEQ = "NO"; | |
20 my $WINDOW = 200000; | |
21 my $OFFSET = 100000; | |
22 my $MAX_BLAST_LINES = 1; | |
23 | |
24 GetOptions ( | |
25 "input_assembly_file=s" => \$input_assembly_file, | |
26 "input_markers_position_file=s" => \$input_markers_position_file, | |
27 "input_markers_file=s" => \$input_markers_file, | |
28 "log_file=s" => \$log_file, | |
29 "output_fasta_file=s" => \$output_fasta_file, | |
30 "output_segment_file=s" => \$output_segment_file, | |
31 "extractseq=s" => \$EXTRACT_SEQ, | |
32 "window=i" => \$WINDOW, | |
33 "offset=i" =>\$OFFSET, | |
34 "input_blast_files=s" => \$input_blast_files, | |
35 "input_genes_position_file=s"=> \$input_genes_position_file, | |
36 "output_genes_list_file=s"=>\$output_genes_list_file, | |
37 "max_blast_lines=i" => \$MAX_BLAST_LINES | |
38 ) or die("Error in command line arguments\n"); | |
39 | |
40 open(LF, ">$log_file") or die("Can't open $log_file\n"); | |
41 #print LF $EXTRACT_SEQ."\n"; | |
42 | |
43 my $current_annotation=""; | |
44 my @list_marquer; | |
45 my %chr; | |
46 my %position; | |
47 | |
48 open(MP, $input_markers_position_file) or die("Can't open $input_markers_position_file\n"); | |
49 | |
50 my $compt=0; | |
51 while (my $line=<MP>){ | |
52 $compt++; | |
53 my @cols = split(/\t/,$line); | |
54 if ($#cols != 3){ | |
55 print STDERR "Error in marker position file format\n$compt : $line\n"; | |
56 exit(0); | |
57 } | |
58 my %current; | |
59 # Number#Map#Name#Chr#Position#GeneAT#FunctionAT | |
60 my $Name = $cols[0]; | |
61 my $Locus = $cols[1]; | |
62 my $Chr = $cols[2]; | |
63 my $Position = $cols[3]; | |
64 | |
65 | |
66 $chr{$Name} = $Chr; | |
67 $position{$Name} = $Position; | |
68 | |
69 ### Modification 0.9.9 | |
70 if ($Locus ne $Name){ | |
71 $chr{$Locus} = $Chr; | |
72 $position{$Locus} = $Position; | |
73 } | |
74 ### | |
75 | |
76 } | |
77 close (MP); | |
78 | |
79 open(MA, $input_markers_file) or die("Can't open $input_markers_file\n"); | |
80 while (my $line=<MA>){ | |
81 my @cols = split (/\s+/,$line); | |
82 for (my $i=0;$i<=$#cols;$i++){ | |
83 my $current = $cols[$i]; | |
84 chomp($current); | |
85 if ($current !~ /^\s+$/){ | |
86 push(@list_marquer,$current); | |
87 } | |
88 } | |
89 } | |
90 close (MA); | |
91 | |
92 my %coord_by_chr; | |
93 for (my $i=0;$i<=$#list_marquer;$i++){ | |
94 my $current_name = $list_marquer[$i]; | |
95 my $current_chr = $chr{$current_name}; | |
96 my $current_position = $position{$current_name}; | |
97 | |
98 if ($current_position =~ /^\d+$/){ | |
99 my @tbl_coord_for_current_chr; | |
100 if ($coord_by_chr{$current_chr}){ | |
101 @tbl_coord_for_current_chr = @{$coord_by_chr{$current_chr}}; | |
102 } | |
103 push(@tbl_coord_for_current_chr,$current_position); | |
104 $coord_by_chr{$current_chr}=\@tbl_coord_for_current_chr; | |
105 } | |
106 elsif (($current_position =~/\s*-\s*/)||($current_position =~/none/i)){ | |
107 | |
108 } | |
109 else { | |
110 chomp($current_position); | |
111 print STDERR "Error Parsing $current_name\tposition not recognized : $current_position \n"; | |
112 print $list_marquer[$i],"\n"; | |
113 } | |
114 } | |
115 | |
116 open(OS, ">$output_segment_file") or die ("Can't open $output_segment_file\n"); | |
117 | |
118 my @segment_chr; | |
119 my @segment_start; | |
120 my @segment_end; | |
121 | |
122 foreach my $key (sort keys %coord_by_chr){ | |
123 my @tbl_coord = @{$coord_by_chr{$key}}; | |
124 @tbl_coord = sort { $a <=> $b } @tbl_coord; | |
125 my $current_start; | |
126 my $current_stop; | |
127 my $current_start_with_offset; | |
128 my $current_stop_with_offset; | |
129 | |
130 for (my $i=0;$i<=$#tbl_coord;$i++){ | |
131 if (!$current_start){$current_start=$tbl_coord[$i];$current_stop=$tbl_coord[$i]} | |
132 | |
133 # print "$i : $current_start / $current_stop\n"; | |
134 if ($tbl_coord[$i]>$current_stop+$WINDOW){ | |
135 #OFFSET | |
136 if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;} | |
137 $current_stop_with_offset = $current_stop + $OFFSET; | |
138 ####### | |
139 print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n"; | |
140 push(@segment_chr,$key); | |
141 push(@segment_start,$current_start_with_offset); | |
142 push(@segment_end,$current_stop_with_offset); | |
143 | |
144 $current_start = $tbl_coord[$i]; | |
145 $current_stop = $tbl_coord[$i]; | |
146 | |
147 if ($i==$#tbl_coord){ | |
148 #OFFSET | |
149 if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;} | |
150 $current_stop_with_offset = $current_stop + $OFFSET; | |
151 ####### | |
152 print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n"; | |
153 push(@segment_chr,$key); | |
154 push(@segment_start,$current_start_with_offset); | |
155 push(@segment_end,$current_stop_with_offset); | |
156 } | |
157 } | |
158 else { | |
159 $current_stop=$tbl_coord[$i]; | |
160 if ($i==$#tbl_coord){ | |
161 #OFFSET | |
162 if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;} | |
163 $current_stop_with_offset = $current_stop + $OFFSET; | |
164 ####### | |
165 print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n"; | |
166 push(@segment_chr,$key); | |
167 push(@segment_start,$current_start_with_offset); | |
168 push(@segment_end,$current_stop_with_offset); | |
169 } | |
170 } | |
171 } | |
172 } | |
173 close(OS); | |
174 | |
175 ### Sequence extraction | |
176 if ($EXTRACT_SEQ eq "YES"){ | |
177 | |
178 my %genome; | |
179 my $current_header; | |
180 my $current_seq=""; | |
181 open(AF, $input_assembly_file) or die ("Can't open $input_assembly_file\n"); | |
182 | |
183 while (my $ligne = <AF>){ | |
184 if ($ligne =~ /^\>(.*?)\s*$/){ | |
185 if ($current_header){ | |
186 $genome{$current_header} = $current_seq; | |
187 } | |
188 $current_header=$1; | |
189 $current_seq = ""; | |
190 } | |
191 else { | |
192 if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){ | |
193 $current_seq .= $1; | |
194 } | |
195 else { | |
196 print STDERR "Erreur Parsing n°1\n$ligne\n"; | |
197 } | |
198 } | |
199 } | |
200 | |
201 #TRAITEMENT DU DERNIER | |
202 if ($current_header){ | |
203 $genome{$current_header} = $current_seq; | |
204 undef($current_seq); | |
205 } | |
206 close (AF); | |
207 | |
208 open(OF, ">$output_fasta_file") or die ("Can't open $output_fasta_file\n"); | |
209 for (my $i=0;$i<=$#segment_chr;$i++){ | |
210 my $compt=0; | |
211 my $current_seq=""; | |
212 print OF ">",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i]."\n"; | |
213 ### Modification 0.9.9 | |
214 if ($segment_end[$i]>length($genome{$segment_chr[$i]})){ | |
215 $segment_end[$i] = length($genome{$segment_chr[$i]}); | |
216 } | |
217 ### | |
218 | |
219 my @SEQ = split(//,$genome{$segment_chr[$i]}); | |
220 for (my $coord = $segment_start[$i]-1; $coord<=$segment_end[$i]-1;$coord++){ | |
221 $compt++; | |
222 if ($compt > 60 ){ | |
223 $current_seq .= "\n"; | |
224 $compt=1; | |
225 } | |
226 $current_seq .= $SEQ[$coord]; | |
227 | |
228 } | |
229 print OF "$current_seq\n"; | |
230 } | |
231 close (OF); | |
232 } | |
233 | |
234 ### GENE and BLAST Extraction | |
235 my @blast_by_base; | |
236 my @header; | |
237 | |
238 | |
239 my @blastfiles = split(/\,/,$input_blast_files); | |
240 for (my $i=0;$i<=$#blastfiles;$i++){ | |
241 my $current_blast_file = $blastfiles[$i]; | |
242 my $current_blast_header = "DEFAULT"; | |
243 my %current_blast; | |
244 open (B,"$current_blast_file") or die ("Can't open $current_blast_file\n"); | |
245 while (my $line =<B>){ | |
246 if ($line =~ /^\#\#(.*?)$/){ | |
247 $current_blast_header = $1; | |
248 print LF $current_blast_header."\n"; | |
249 } | |
250 elsif ($line =~ /^\#/){ | |
251 # blast file column legend | |
252 } | |
253 else { | |
254 my @fields = split(/\s+/,$line); | |
255 my $gene_id = $fields[0]; | |
256 my @blast_for_this_gene; | |
257 if ($current_blast{$gene_id}){ | |
258 @blast_for_this_gene = @{$current_blast{$gene_id}}; | |
259 } | |
260 | |
261 if ($#blast_for_this_gene<$MAX_BLAST_LINES-1){ | |
262 push(@blast_for_this_gene,$line); | |
263 print LF $gene_id,"\n"; | |
264 } | |
265 $current_blast{$gene_id}=\@blast_for_this_gene; | |
266 } | |
267 } | |
268 close(B); | |
269 push (@blast_by_base,\%current_blast); | |
270 push (@header,$current_blast_header); | |
271 } | |
272 | |
273 | |
274 open (OGL,">$output_genes_list_file") or die ("Can't open $output_genes_list_file\n"); | |
275 | |
276 for (my $i=0;$i<=$#segment_chr;$i++){ | |
277 my $segment_chr = $segment_chr[$i]; | |
278 my $segment_start = $segment_start[$i]; | |
279 my $segment_end = $segment_end[$i]; | |
280 | |
281 print OGL "#",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i],"\n"; | |
282 | |
283 open(IG, $input_genes_position_file) or die("Can't open $input_genes_position_file\n"); | |
284 while (my $gene_desc=<IG>){ | |
285 my @gene_desc = split(/\s+/,$gene_desc); | |
286 if ($#gene_desc != 4){ | |
287 print STDERR "Error in gene position file format\n$gene_desc\n"; | |
288 exit(0); | |
289 } | |
290 my $gene_id = $gene_desc[0]; | |
291 my $cds_id = $gene_desc[1]; | |
292 my $gene_chr = $gene_desc[2]; | |
293 my $gene_start = $gene_desc[3]; | |
294 my $gene_end = $gene_desc[4]; | |
295 if ($segment_chr eq $gene_chr){ | |
296 if ((($gene_start>=$segment_start)&&($gene_start<=$segment_end))||(($gene_end>=$segment_start)&&($gene_end<=$segment_end))){ | |
297 print OGL $gene_id," / ",$cds_id,"\n"; | |
298 | |
299 for (my $i=0;$i<=$#blast_by_base;$i++){ | |
300 #print LF $header[$i]."\n"; | |
301 my %current_blast = %{$blast_by_base[$i]}; | |
302 if ($current_blast{$cds_id}){ | |
303 my @blast_by_gene = @{$current_blast{$cds_id}}; | |
304 #print LF $#blast_by_gene."\n"; | |
305 for (my $j=0;$j<=$#blast_by_gene;$j++){ | |
306 my @fields = split(/\t/,$blast_by_gene[$j]); | |
307 print OGL $header[$i],"\t"; | |
308 print OGL $fields[1],"\t"; | |
309 print OGL $fields[3],"\t"; | |
310 print OGL $fields[4],"\t"; | |
311 print OGL $fields[5],"\t"; | |
312 print OGL $fields[10],"\t"; | |
313 print OGL $fields[6],"..",$fields[7],"(",$fields[11],")","\t"; | |
314 print OGL $fields[8],"..",$fields[9],"(",$fields[12],")","\t"; | |
315 print OGL $fields[13]; | |
316 } | |
317 print OGL "\n"; | |
318 } | |
319 else { | |
320 print OGL $header[$i],"\t","No BLAST results\n"; | |
321 print LF $gene_id," / ",$cds_id,"\n"; | |
322 } | |
323 } | |
324 } | |
325 } | |
326 | |
327 } | |
328 close(IG); | |
329 } | |
330 | |
331 | |
332 close (OGL); | |
333 | |
334 close (LF); | |
335 |