comparison Perl/CreateGenePathsFromGFA.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 use strict;
4
5
6
7 my $gfa = $ARGV[0];
8 my $gff3 = $ARGV[1];
9 my $out = $ARGV[2];
10
11 my %segment_is_covered_by_genes;
12 my %coordinates;
13 my %store_gene_paths;
14
15 my %allgenes;
16 my %coding_positions;
17 open(LENGTH,">$out.gene_length.txt");
18 open(F,$gff3);
19 while(my $line = <F>){
20 chomp($line);
21 my @infos = split(/\t/,$line);
22 #if ($infos[2] eq "gene" && /CITME_006g014440/){
23 if ($infos[2] eq "CDS"){
24 my $chr;
25 # case eukaryotes
26 if ($line =~/Parent=([^;]+);*/){
27 $chr = $infos[0];
28 }
29 # case prokaryotes/prokka
30 elsif ($line =~/ID=([^;]+);*/){
31 $chr = $infos[0];
32 }
33 my $start = $infos[3];
34 my $end = $infos[4];
35 my $gene = $1;
36 $gene =~s/\n//g;$gene =~s/\r//g;
37 if ($line =~/protein_id=([^;]+);/){
38 $gene = $1;
39 }
40 $coordinates{$chr}.= "$gene:$start-$end|";
41 for (my $i = $start; $i <= $end; $i++){
42 $allgenes{$gene}=1;
43 $coding_positions{$chr}{$i} = "$gene,";
44 }
45 my $genelength = $end-$start;
46 print LENGTH "$gene $genelength\n";
47 }
48 }
49 close(F);
50 close(LENGTH);
51
52 print "Start GFA parsing ($gfa)...\n";
53
54 my %segments_chr;
55 my %segments_of_gene;
56 my %segments;
57 #open(ZCAT,"zcat $gfa |");
58 open(GFA,$gfa);
59 while(my $line = <GFA>){
60 chomp($line);
61 if ($line =~/^S/){
62 my ($type,$segment,$segment_sequence,$SN,$tag) = split(/\t/,$line);
63 $segments{$segment} = $segment_sequence;
64 }
65 elsif ($line =~/^P/ or $line =~/^W/){
66
67 my $path;
68 my $chrom;
69 my @segments;
70 if ($line =~/^W/){
71 my ($type,$sample,$strand,$chromW,$start,$length,$pathW) = split(/\t/,$line);
72 $path = $pathW;
73 $chrom = $chromW;
74 @segments = split(/[\>\<]/,$path);
75 }
76 elsif ($line =~/^P/){
77 my ($type,$strain_chrom,$pathP) = split(/\t/,$line);
78 my ($stra,$chromP) = split(/#/,$strain_chrom);
79 $path = $pathP;
80 $chrom = $chromP;
81 @segments = split(/,/,$path);
82 }
83 $chrom =~s/\.\d+$//g;
84 my @genes = split(/\|/,$coordinates{$chrom});
85 if ($coordinates{$chrom}){
86 my $end_segment = 0;
87 my $start_segment = 0;
88 my $nb_gene = 0;
89
90 my %genes_located;
91 my %first_segment_of_gene;
92 print "$chrom: ". scalar @segments."\n";
93 my $cumul = 0;
94 LOOP_SEGMENT: foreach my $segment(@segments){
95 if ($segment eq ""){next;}
96 my $segment_including_strand = $segment;
97 $segment =~s/\+//g;
98 $segment =~s/\-//g;
99 my $size = length($segments{$segment});
100 $start_segment = $end_segment+1;
101 $end_segment = $start_segment+$size-1;
102 $cumul+=$size;
103 my $current_gene;
104
105 if ($segment_including_strand =~/\+/){
106 my $pos_in_segment = 0;
107 for (my $k = $start_segment; $k <= $end_segment; $k++){
108 $pos_in_segment++;
109 #print "$k\n";
110
111 if ($coding_positions{$chrom}{$k}){
112 my $info_gene = $coding_positions{$chrom}{$k};
113 chop($info_gene);
114 my @genes = split(/,/,$info_gene);
115 foreach my $gene(@genes){
116 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|";
117 }
118 }
119 }
120 }
121 elsif ($segment_including_strand =~/\-/){
122 my $pos_in_segment = $size+1;
123 for (my $k = $start_segment; $k <= $end_segment; $k++){
124 $pos_in_segment--;
125 if ($coding_positions{$chrom}{$k}){
126 my $info_gene = $coding_positions{$chrom}{$k};
127 chop($info_gene);
128 my @genes = split(/,/,$info_gene);
129 foreach my $gene(@genes){
130 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|";
131 }
132 }
133 }
134 }
135
136 }
137 print "$chrom $cumul $end_segment\n";
138 }
139 }
140 }
141 close(GFA);
142
143 open(O,">$out");
144 open(BED,">$out.bed");
145 foreach my $gene(keys(%segments_of_gene)){
146 my $segments = $segments_of_gene{$gene};
147 my @positions = split(/\|/,$segments);
148 my $previous_segment;
149
150 #print O "$gene $segments\n";
151 print O "$gene\t";
152
153 my $concat = "";
154 foreach my $pos(@positions){
155
156 my ($segment,$pos_in_segment) = split(/:/,$pos);
157 if ($segment ne $previous_segment){
158 $concat .= ">$segment:$pos_in_segment";
159 }
160 $concat .= "-$pos_in_segment";
161 $previous_segment = $segment;
162 }
163 my @segments = split(/>/,$concat);
164 foreach my $segment(@segments){
165 if (!$segment){next;}
166 my ($segmentname,$pos) = split(/:/,$segment);
167 my @positions = split(/-/,$pos);
168 my $start = $positions[0];
169 my $end = $positions[$#positions];
170 if ($start > $end){
171 $start = $positions[$#positions];
172 $end = $positions[0];
173 }
174 print O ">$segmentname:$start-$end";
175 print BED "$segmentname $start $end $gene\n";
176 }
177 print O "\n";
178 }
179 close(O);
180 close(BED);