comparison genephys/GenePhys.pl @ 3:8dfa09868059 draft

Uploaded
author mcharles
date Fri, 24 Oct 2014 05:54:20 -0400
parents
children 3d79224aa2dc
comparison
equal deleted inserted replaced
2:c52e74b98773 3:8dfa09868059
1 #!/usr/bin/perl
2 #V1.1.0 integrated gene extraction
3 #V1.0.2 integrated segment fasta extraction
4 #V1.0.1 added log and option
5 #V1.0.0
6 use strict;
7 use warnings;
8 use Getopt::Long;
9
10 my $input_blast_files;
11 my $input_genes_position_file;
12 my $input_assembly_file;
13 my $input_markers_position_file;
14 my $input_markers_file;
15 my $log_file;
16 my $output_fasta_file;
17 my $output_segment_file;
18 my $output_genes_list_file;
19 my $EXTRACT_SEQ = "NO";
20 my $WINDOW = 200000;
21 my $OFFSET = 100000;
22 my $MAX_BLAST_LINES = 1;
23
24 GetOptions (
25 "input_assembly_file=s" => \$input_assembly_file,
26 "input_markers_position_file=s" => \$input_markers_position_file,
27 "input_markers_file=s" => \$input_markers_file,
28 "log_file=s" => \$log_file,
29 "output_fasta_file=s" => \$output_fasta_file,
30 "output_segment_file=s" => \$output_segment_file,
31 "extractseq=s" => \$EXTRACT_SEQ,
32 "window=i" => \$WINDOW,
33 "offset=i" =>\$OFFSET,
34 "input_blast_files=s" => \$input_blast_files,
35 "input_genes_position_file=s"=> \$input_genes_position_file,
36 "output_genes_list_file=s"=>\$output_genes_list_file,
37 "max_blast_lines=i" => \$MAX_BLAST_LINES
38 ) or die("Error in command line arguments\n");
39
40 open(LF, ">$log_file") or die("Can't open $log_file\n");
41 #print LF $EXTRACT_SEQ."\n";
42
43 my $current_annotation="";
44 my @list_marquer;
45 my %chr;
46 my %position;
47
48 open(MP, $input_markers_position_file) or die("Can't open $input_markers_position_file\n");
49
50 my $compt=0;
51 while (my $line=<MP>){
52 $compt++;
53 my @cols = split(/\t/,$line);
54 if ($#cols != 3){
55 print STDERR "Error in marker position file format\n$compt : $line\n";
56 exit(0);
57 }
58 my %current;
59 # Number#Map#Name#Chr#Position#GeneAT#FunctionAT
60 my $Name = $cols[0];
61 my $Locus = $cols[1];
62 my $Chr = $cols[2];
63 my $Position = $cols[3];
64
65
66 $chr{$Name} = $Chr;
67 $position{$Name} = $Position;
68
69 ### Modification 0.9.9
70 if ($Locus ne $Name){
71 $chr{$Locus} = $Chr;
72 $position{$Locus} = $Position;
73 }
74 ###
75
76 }
77 close (MP);
78
79 open(MA, $input_markers_file) or die("Can't open $input_markers_file\n");
80 while (my $line=<MA>){
81 my @cols = split (/\s+/,$line);
82 for (my $i=0;$i<=$#cols;$i++){
83 my $current = $cols[$i];
84 chomp($current);
85 if ($current !~ /^\s+$/){
86 push(@list_marquer,$current);
87 }
88 }
89 }
90 close (MA);
91
92 my %coord_by_chr;
93 for (my $i=0;$i<=$#list_marquer;$i++){
94 my $current_name = $list_marquer[$i];
95 my $current_chr = $chr{$current_name};
96 my $current_position = $position{$current_name};
97
98 if ($current_position =~ /^\d+$/){
99 my @tbl_coord_for_current_chr;
100 if ($coord_by_chr{$current_chr}){
101 @tbl_coord_for_current_chr = @{$coord_by_chr{$current_chr}};
102 }
103 push(@tbl_coord_for_current_chr,$current_position);
104 $coord_by_chr{$current_chr}=\@tbl_coord_for_current_chr;
105 }
106 elsif (($current_position =~/\s*-\s*/)||($current_position =~/none/i)){
107
108 }
109 else {
110 chomp($current_position);
111 print STDERR "Error Parsing $current_name\tposition not recognized : $current_position \n";
112 print $list_marquer[$i],"\n";
113 }
114 }
115
116 open(OS, ">$output_segment_file") or die ("Can't open $output_segment_file\n");
117
118 my @segment_chr;
119 my @segment_start;
120 my @segment_end;
121
122 foreach my $key (sort keys %coord_by_chr){
123 my @tbl_coord = @{$coord_by_chr{$key}};
124 @tbl_coord = sort { $a <=> $b } @tbl_coord;
125 my $current_start;
126 my $current_stop;
127 my $current_start_with_offset;
128 my $current_stop_with_offset;
129
130 for (my $i=0;$i<=$#tbl_coord;$i++){
131 if (!$current_start){$current_start=$tbl_coord[$i];$current_stop=$tbl_coord[$i]}
132
133 # print "$i : $current_start / $current_stop\n";
134 if ($tbl_coord[$i]>$current_stop+$WINDOW){
135 #OFFSET
136 if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;}
137 $current_stop_with_offset = $current_stop + $OFFSET;
138 #######
139 print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n";
140 push(@segment_chr,$key);
141 push(@segment_start,$current_start_with_offset);
142 push(@segment_end,$current_stop_with_offset);
143
144 $current_start = $tbl_coord[$i];
145 $current_stop = $tbl_coord[$i];
146
147 if ($i==$#tbl_coord){
148 #OFFSET
149 if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;}
150 $current_stop_with_offset = $current_stop + $OFFSET;
151 #######
152 print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n";
153 push(@segment_chr,$key);
154 push(@segment_start,$current_start_with_offset);
155 push(@segment_end,$current_stop_with_offset);
156 }
157 }
158 else {
159 $current_stop=$tbl_coord[$i];
160 if ($i==$#tbl_coord){
161 #OFFSET
162 if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;}
163 $current_stop_with_offset = $current_stop + $OFFSET;
164 #######
165 print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n";
166 push(@segment_chr,$key);
167 push(@segment_start,$current_start_with_offset);
168 push(@segment_end,$current_stop_with_offset);
169 }
170 }
171 }
172 }
173 close(OS);
174
175 ### Sequence extraction
176 if ($EXTRACT_SEQ eq "YES"){
177
178 my %genome;
179 my $current_header;
180 my $current_seq="";
181 open(AF, $input_assembly_file) or die ("Can't open $input_assembly_file\n");
182
183 while (my $ligne = <AF>){
184 if ($ligne =~ /^\>(.*?)\s*$/){
185 if ($current_header){
186 $genome{$current_header} = $current_seq;
187 }
188 $current_header=$1;
189 $current_seq = "";
190 }
191 else {
192 if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){
193 $current_seq .= $1;
194 }
195 else {
196 print STDERR "Erreur Parsing n°1\n$ligne\n";
197 }
198 }
199 }
200
201 #TRAITEMENT DU DERNIER
202 if ($current_header){
203 $genome{$current_header} = $current_seq;
204 undef($current_seq);
205 }
206 close (AF);
207
208 open(OF, ">$output_fasta_file") or die ("Can't open $output_fasta_file\n");
209 for (my $i=0;$i<=$#segment_chr;$i++){
210 my $compt=0;
211 my $current_seq="";
212 print OF ">",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i]."\n";
213 ### Modification 0.9.9
214 if ($segment_end[$i]>length($genome{$segment_chr[$i]})){
215 $segment_end[$i] = length($genome{$segment_chr[$i]});
216 }
217 ###
218
219 my @SEQ = split(//,$genome{$segment_chr[$i]});
220 for (my $coord = $segment_start[$i]-1; $coord<=$segment_end[$i]-1;$coord++){
221 $compt++;
222 if ($compt > 60 ){
223 $current_seq .= "\n";
224 $compt=1;
225 }
226 $current_seq .= $SEQ[$coord];
227
228 }
229 print OF "$current_seq\n";
230 }
231 close (OF);
232 }
233
234 ### GENE and BLAST Extraction
235 my @blast_by_base;
236 my @header;
237
238
239 my @blastfiles = split(/\,/,$input_blast_files);
240 for (my $i=0;$i<=$#blastfiles;$i++){
241 my $current_blast_file = $blastfiles[$i];
242 my $current_blast_header = "DEFAULT";
243 my %current_blast;
244 open (B,"$current_blast_file") or die ("Can't open $current_blast_file\n");
245 while (my $line =<B>){
246 if ($line =~ /^\#\#(.*?)$/){
247 $current_blast_header = $1;
248 print LF $current_blast_header."\n";
249 }
250 elsif ($line =~ /^\#/){
251 # blast file column legend
252 }
253 else {
254 my @fields = split(/\s+/,$line);
255 my $gene_id = $fields[0];
256 my @blast_for_this_gene;
257 if ($current_blast{$gene_id}){
258 @blast_for_this_gene = @{$current_blast{$gene_id}};
259 }
260
261 if ($#blast_for_this_gene<$MAX_BLAST_LINES-1){
262 push(@blast_for_this_gene,$line);
263 print LF $gene_id,"\n";
264 }
265 $current_blast{$gene_id}=\@blast_for_this_gene;
266 }
267 }
268 close(B);
269 push (@blast_by_base,\%current_blast);
270 push (@header,$current_blast_header);
271 }
272
273
274 open (OGL,">$output_genes_list_file") or die ("Can't open $output_genes_list_file\n");
275
276 for (my $i=0;$i<=$#segment_chr;$i++){
277 my $segment_chr = $segment_chr[$i];
278 my $segment_start = $segment_start[$i];
279 my $segment_end = $segment_end[$i];
280
281 print OGL "#",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i],"\n";
282
283 open(IG, $input_genes_position_file) or die("Can't open $input_genes_position_file\n");
284 while (my $gene_desc=<IG>){
285 my @gene_desc = split(/\s+/,$gene_desc);
286 if ($#gene_desc != 4){
287 print STDERR "Error in gene position file format\n$gene_desc\n";
288 exit(0);
289 }
290 my $gene_id = $gene_desc[0];
291 my $cds_id = $gene_desc[1];
292 my $gene_chr = $gene_desc[2];
293 my $gene_start = $gene_desc[3];
294 my $gene_end = $gene_desc[4];
295 if ($segment_chr eq $gene_chr){
296 if ((($gene_start>=$segment_start)&&($gene_start<=$segment_end))||(($gene_end>=$segment_start)&&($gene_end<=$segment_end))){
297 print OGL $gene_id," / ",$cds_id,"\n";
298
299 for (my $i=0;$i<=$#blast_by_base;$i++){
300 #print LF $header[$i]."\n";
301 my %current_blast = %{$blast_by_base[$i]};
302 if ($current_blast{$cds_id}){
303 my @blast_by_gene = @{$current_blast{$cds_id}};
304 #print LF $#blast_by_gene."\n";
305 for (my $j=0;$j<=$#blast_by_gene;$j++){
306 my @fields = split(/\t/,$blast_by_gene[$j]);
307 print OGL $header[$i],"\t";
308 print OGL $fields[1],"\t";
309 print OGL $fields[3],"\t";
310 print OGL $fields[4],"\t";
311 print OGL $fields[5],"\t";
312 print OGL $fields[10],"\t";
313 print OGL $fields[6],"..",$fields[7],"(",$fields[11],")","\t";
314 print OGL $fields[8],"..",$fields[9],"(",$fields[12],")","\t";
315 print OGL $fields[13];
316 }
317 print OGL "\n";
318 }
319 else {
320 print OGL $header[$i],"\t","No BLAST results\n";
321 print LF $gene_id," / ",$cds_id,"\n";
322 }
323 }
324 }
325 }
326
327 }
328 close(IG);
329 }
330
331
332 close (OGL);
333
334 close (LF);
335