1
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4
|
|
5
|
|
6
|
|
7 my $gfa = $ARGV[0];
|
|
8 my $gff3 = $ARGV[1];
|
|
9 my $out = $ARGV[2];
|
|
10
|
|
11 my %segment_is_covered_by_genes;
|
|
12 my %coordinates;
|
|
13 my %store_gene_paths;
|
|
14
|
|
15 my %allgenes;
|
|
16 my %coding_positions;
|
|
17 open(LENGTH,">$out.gene_length.txt");
|
|
18 open(F,$gff3);
|
|
19 while(my $line = <F>){
|
|
20 chomp($line);
|
|
21 my @infos = split(/\t/,$line);
|
|
22 #if ($infos[2] eq "gene" && /CITME_006g014440/){
|
|
23 if ($infos[2] eq "CDS"){
|
|
24 my $chr;
|
|
25 # case eukaryotes
|
|
26 if ($line =~/Parent=([^;]+);*/){
|
|
27 $chr = $infos[0];
|
|
28 }
|
|
29 # case prokaryotes/prokka
|
|
30 elsif ($line =~/ID=([^;]+);*/){
|
|
31 $chr = $infos[0];
|
|
32 }
|
|
33 my $start = $infos[3];
|
|
34 my $end = $infos[4];
|
|
35 my $gene = $1;
|
|
36 $gene =~s/\n//g;$gene =~s/\r//g;
|
|
37 if ($line =~/protein_id=([^;]+);/){
|
|
38 $gene = $1;
|
|
39 }
|
|
40 $coordinates{$chr}.= "$gene:$start-$end|";
|
|
41 for (my $i = $start; $i <= $end; $i++){
|
|
42 $allgenes{$gene}=1;
|
|
43 $coding_positions{$chr}{$i} = "$gene,";
|
|
44 }
|
|
45 my $genelength = $end-$start;
|
|
46 print LENGTH "$gene $genelength\n";
|
|
47 }
|
|
48 }
|
|
49 close(F);
|
|
50 close(LENGTH);
|
|
51
|
|
52 print "Start GFA parsing ($gfa)...\n";
|
|
53
|
|
54 my %segments_chr;
|
|
55 my %segments_of_gene;
|
|
56 my %segments;
|
|
57 #open(ZCAT,"zcat $gfa |");
|
|
58 open(GFA,$gfa);
|
|
59 while(my $line = <GFA>){
|
|
60 chomp($line);
|
|
61 if ($line =~/^S/){
|
|
62 my ($type,$segment,$segment_sequence,$SN,$tag) = split(/\t/,$line);
|
|
63 $segments{$segment} = $segment_sequence;
|
|
64 }
|
|
65 elsif ($line =~/^P/ or $line =~/^W/){
|
|
66
|
|
67 my $path;
|
|
68 my $chrom;
|
|
69 my @segments;
|
|
70 if ($line =~/^W/){
|
|
71 my ($type,$sample,$strand,$chromW,$start,$length,$pathW) = split(/\t/,$line);
|
|
72 $path = $pathW;
|
|
73 $chrom = $chromW;
|
|
74 @segments = split(/[\>\<]/,$path);
|
|
75 }
|
|
76 elsif ($line =~/^P/){
|
|
77 my ($type,$strain_chrom,$pathP) = split(/\t/,$line);
|
|
78 my ($stra,$chromP) = split(/#/,$strain_chrom);
|
|
79 $path = $pathP;
|
|
80 $chrom = $chromP;
|
|
81 @segments = split(/,/,$path);
|
|
82 }
|
|
83 $chrom =~s/\.\d+$//g;
|
|
84 my @genes = split(/\|/,$coordinates{$chrom});
|
|
85 if ($coordinates{$chrom}){
|
|
86 my $end_segment = 0;
|
|
87 my $start_segment = 0;
|
|
88 my $nb_gene = 0;
|
|
89
|
|
90 my %genes_located;
|
|
91 my %first_segment_of_gene;
|
|
92 print "$chrom: ". scalar @segments."\n";
|
|
93 my $cumul = 0;
|
|
94 LOOP_SEGMENT: foreach my $segment(@segments){
|
|
95 if ($segment eq ""){next;}
|
|
96 my $segment_including_strand = $segment;
|
|
97 $segment =~s/\+//g;
|
|
98 $segment =~s/\-//g;
|
|
99 my $size = length($segments{$segment});
|
|
100 $start_segment = $end_segment+1;
|
|
101 $end_segment = $start_segment+$size-1;
|
|
102 $cumul+=$size;
|
|
103 my $current_gene;
|
|
104
|
|
105 if ($segment_including_strand =~/\+/){
|
|
106 my $pos_in_segment = 0;
|
|
107 for (my $k = $start_segment; $k <= $end_segment; $k++){
|
|
108 $pos_in_segment++;
|
|
109 #print "$k\n";
|
|
110
|
|
111 if ($coding_positions{$chrom}{$k}){
|
|
112 my $info_gene = $coding_positions{$chrom}{$k};
|
|
113 chop($info_gene);
|
|
114 my @genes = split(/,/,$info_gene);
|
|
115 foreach my $gene(@genes){
|
|
116 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|";
|
|
117 }
|
|
118 }
|
|
119 }
|
|
120 }
|
|
121 elsif ($segment_including_strand =~/\-/){
|
|
122 my $pos_in_segment = $size+1;
|
|
123 for (my $k = $start_segment; $k <= $end_segment; $k++){
|
|
124 $pos_in_segment--;
|
|
125 if ($coding_positions{$chrom}{$k}){
|
|
126 my $info_gene = $coding_positions{$chrom}{$k};
|
|
127 chop($info_gene);
|
|
128 my @genes = split(/,/,$info_gene);
|
|
129 foreach my $gene(@genes){
|
|
130 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|";
|
|
131 }
|
|
132 }
|
|
133 }
|
|
134 }
|
|
135
|
|
136 }
|
|
137 print "$chrom $cumul $end_segment\n";
|
|
138 }
|
|
139 }
|
|
140 }
|
|
141 close(GFA);
|
|
142
|
|
143 open(O,">$out");
|
|
144 open(BED,">$out.bed");
|
|
145 foreach my $gene(keys(%segments_of_gene)){
|
|
146 my $segments = $segments_of_gene{$gene};
|
|
147 my @positions = split(/\|/,$segments);
|
|
148 my $previous_segment;
|
|
149
|
|
150 #print O "$gene $segments\n";
|
|
151 print O "$gene\t";
|
|
152
|
|
153 my $concat = "";
|
|
154 foreach my $pos(@positions){
|
|
155
|
|
156 my ($segment,$pos_in_segment) = split(/:/,$pos);
|
|
157 if ($segment ne $previous_segment){
|
|
158 $concat .= ">$segment:$pos_in_segment";
|
|
159 }
|
|
160 $concat .= "-$pos_in_segment";
|
|
161 $previous_segment = $segment;
|
|
162 }
|
|
163 my @segments = split(/>/,$concat);
|
|
164 foreach my $segment(@segments){
|
|
165 if (!$segment){next;}
|
|
166 my ($segmentname,$pos) = split(/:/,$segment);
|
|
167 my @positions = split(/-/,$pos);
|
|
168 my $start = $positions[0];
|
|
169 my $end = $positions[$#positions];
|
|
170 if ($start > $end){
|
|
171 $start = $positions[$#positions];
|
|
172 $end = $positions[0];
|
|
173 }
|
|
174 print O ">$segmentname:$start-$end";
|
|
175 print BED "$segmentname $start $end $gene\n";
|
|
176 }
|
|
177 print O "\n";
|
|
178 }
|
|
179 close(O);
|
|
180 close(BED);
|