Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/CreateGenePathsFromGFA.pl @ 1:032f6b3806a3 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:16:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3cbb01081cde | 1:032f6b3806a3 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 | |
5 | |
6 | |
7 my $gfa = $ARGV[0]; | |
8 my $gff3 = $ARGV[1]; | |
9 my $out = $ARGV[2]; | |
10 | |
11 my %segment_is_covered_by_genes; | |
12 my %coordinates; | |
13 my %store_gene_paths; | |
14 | |
15 my %allgenes; | |
16 my %coding_positions; | |
17 open(LENGTH,">$out.gene_length.txt"); | |
18 open(F,$gff3); | |
19 while(my $line = <F>){ | |
20 chomp($line); | |
21 my @infos = split(/\t/,$line); | |
22 #if ($infos[2] eq "gene" && /CITME_006g014440/){ | |
23 if ($infos[2] eq "CDS"){ | |
24 my $chr; | |
25 # case eukaryotes | |
26 if ($line =~/Parent=([^;]+);*/){ | |
27 $chr = $infos[0]; | |
28 } | |
29 # case prokaryotes/prokka | |
30 elsif ($line =~/ID=([^;]+);*/){ | |
31 $chr = $infos[0]; | |
32 } | |
33 my $start = $infos[3]; | |
34 my $end = $infos[4]; | |
35 my $gene = $1; | |
36 $gene =~s/\n//g;$gene =~s/\r//g; | |
37 if ($line =~/protein_id=([^;]+);/){ | |
38 $gene = $1; | |
39 } | |
40 $coordinates{$chr}.= "$gene:$start-$end|"; | |
41 for (my $i = $start; $i <= $end; $i++){ | |
42 $allgenes{$gene}=1; | |
43 $coding_positions{$chr}{$i} = "$gene,"; | |
44 } | |
45 my $genelength = $end-$start; | |
46 print LENGTH "$gene $genelength\n"; | |
47 } | |
48 } | |
49 close(F); | |
50 close(LENGTH); | |
51 | |
52 print "Start GFA parsing ($gfa)...\n"; | |
53 | |
54 my %segments_chr; | |
55 my %segments_of_gene; | |
56 my %segments; | |
57 #open(ZCAT,"zcat $gfa |"); | |
58 open(GFA,$gfa); | |
59 while(my $line = <GFA>){ | |
60 chomp($line); | |
61 if ($line =~/^S/){ | |
62 my ($type,$segment,$segment_sequence,$SN,$tag) = split(/\t/,$line); | |
63 $segments{$segment} = $segment_sequence; | |
64 } | |
65 elsif ($line =~/^P/ or $line =~/^W/){ | |
66 | |
67 my $path; | |
68 my $chrom; | |
69 my @segments; | |
70 if ($line =~/^W/){ | |
71 my ($type,$sample,$strand,$chromW,$start,$length,$pathW) = split(/\t/,$line); | |
72 $path = $pathW; | |
73 $chrom = $chromW; | |
74 @segments = split(/[\>\<]/,$path); | |
75 } | |
76 elsif ($line =~/^P/){ | |
77 my ($type,$strain_chrom,$pathP) = split(/\t/,$line); | |
78 my ($stra,$chromP) = split(/#/,$strain_chrom); | |
79 $path = $pathP; | |
80 $chrom = $chromP; | |
81 @segments = split(/,/,$path); | |
82 } | |
83 $chrom =~s/\.\d+$//g; | |
84 my @genes = split(/\|/,$coordinates{$chrom}); | |
85 if ($coordinates{$chrom}){ | |
86 my $end_segment = 0; | |
87 my $start_segment = 0; | |
88 my $nb_gene = 0; | |
89 | |
90 my %genes_located; | |
91 my %first_segment_of_gene; | |
92 print "$chrom: ". scalar @segments."\n"; | |
93 my $cumul = 0; | |
94 LOOP_SEGMENT: foreach my $segment(@segments){ | |
95 if ($segment eq ""){next;} | |
96 my $segment_including_strand = $segment; | |
97 $segment =~s/\+//g; | |
98 $segment =~s/\-//g; | |
99 my $size = length($segments{$segment}); | |
100 $start_segment = $end_segment+1; | |
101 $end_segment = $start_segment+$size-1; | |
102 $cumul+=$size; | |
103 my $current_gene; | |
104 | |
105 if ($segment_including_strand =~/\+/){ | |
106 my $pos_in_segment = 0; | |
107 for (my $k = $start_segment; $k <= $end_segment; $k++){ | |
108 $pos_in_segment++; | |
109 #print "$k\n"; | |
110 | |
111 if ($coding_positions{$chrom}{$k}){ | |
112 my $info_gene = $coding_positions{$chrom}{$k}; | |
113 chop($info_gene); | |
114 my @genes = split(/,/,$info_gene); | |
115 foreach my $gene(@genes){ | |
116 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|"; | |
117 } | |
118 } | |
119 } | |
120 } | |
121 elsif ($segment_including_strand =~/\-/){ | |
122 my $pos_in_segment = $size+1; | |
123 for (my $k = $start_segment; $k <= $end_segment; $k++){ | |
124 $pos_in_segment--; | |
125 if ($coding_positions{$chrom}{$k}){ | |
126 my $info_gene = $coding_positions{$chrom}{$k}; | |
127 chop($info_gene); | |
128 my @genes = split(/,/,$info_gene); | |
129 foreach my $gene(@genes){ | |
130 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|"; | |
131 } | |
132 } | |
133 } | |
134 } | |
135 | |
136 } | |
137 print "$chrom $cumul $end_segment\n"; | |
138 } | |
139 } | |
140 } | |
141 close(GFA); | |
142 | |
143 open(O,">$out"); | |
144 open(BED,">$out.bed"); | |
145 foreach my $gene(keys(%segments_of_gene)){ | |
146 my $segments = $segments_of_gene{$gene}; | |
147 my @positions = split(/\|/,$segments); | |
148 my $previous_segment; | |
149 | |
150 #print O "$gene $segments\n"; | |
151 print O "$gene\t"; | |
152 | |
153 my $concat = ""; | |
154 foreach my $pos(@positions){ | |
155 | |
156 my ($segment,$pos_in_segment) = split(/:/,$pos); | |
157 if ($segment ne $previous_segment){ | |
158 $concat .= ">$segment:$pos_in_segment"; | |
159 } | |
160 $concat .= "-$pos_in_segment"; | |
161 $previous_segment = $segment; | |
162 } | |
163 my @segments = split(/>/,$concat); | |
164 foreach my $segment(@segments){ | |
165 if (!$segment){next;} | |
166 my ($segmentname,$pos) = split(/:/,$segment); | |
167 my @positions = split(/-/,$pos); | |
168 my $start = $positions[0]; | |
169 my $end = $positions[$#positions]; | |
170 if ($start > $end){ | |
171 $start = $positions[$#positions]; | |
172 $end = $positions[0]; | |
173 } | |
174 print O ">$segmentname:$start-$end"; | |
175 print BED "$segmentname $start $end $gene\n"; | |
176 } | |
177 print O "\n"; | |
178 } | |
179 close(O); | |
180 close(BED); |