annotate PanExplorer_workflow/Perl/CreateGenePathsFromGFA.pl @ 2:97e4e3e818b6 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:48:09 +0000
parents 032f6b3806a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
2
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
3 use strict;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
4
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
5
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
6
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
7 my $gfa = $ARGV[0];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
8 my $gff3 = $ARGV[1];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
9 my $out = $ARGV[2];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
10
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
11 my %segment_is_covered_by_genes;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
12 my %coordinates;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
13 my %store_gene_paths;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
14
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
15 my %allgenes;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
16 my %coding_positions;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
17 open(LENGTH,">$out.gene_length.txt");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
18 open(F,$gff3);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
19 while(my $line = <F>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
20 chomp($line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
21 my @infos = split(/\t/,$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
22 #if ($infos[2] eq "gene" && /CITME_006g014440/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
23 if ($infos[2] eq "CDS"){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
24 my $chr;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
25 # case eukaryotes
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
26 if ($line =~/Parent=([^;]+);*/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
27 $chr = $infos[0];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
28 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
29 # case prokaryotes/prokka
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
30 elsif ($line =~/ID=([^;]+);*/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
31 $chr = $infos[0];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
32 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
33 my $start = $infos[3];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
34 my $end = $infos[4];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
35 my $gene = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
36 $gene =~s/\n//g;$gene =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
37 if ($line =~/protein_id=([^;]+);/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
38 $gene = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
39 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
40 $coordinates{$chr}.= "$gene:$start-$end|";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
41 for (my $i = $start; $i <= $end; $i++){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
42 $allgenes{$gene}=1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
43 $coding_positions{$chr}{$i} = "$gene,";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
44 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
45 my $genelength = $end-$start;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
46 print LENGTH "$gene $genelength\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
47 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
48 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
49 close(F);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
50 close(LENGTH);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
51
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
52 print "Start GFA parsing ($gfa)...\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
53
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
54 my %segments_chr;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
55 my %segments_of_gene;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
56 my %segments;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
57 #open(ZCAT,"zcat $gfa |");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
58 open(GFA,$gfa);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
59 while(my $line = <GFA>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
60 chomp($line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
61 if ($line =~/^S/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
62 my ($type,$segment,$segment_sequence,$SN,$tag) = split(/\t/,$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
63 $segments{$segment} = $segment_sequence;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
64 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
65 elsif ($line =~/^P/ or $line =~/^W/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
66
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
67 my $path;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
68 my $chrom;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
69 my @segments;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
70 if ($line =~/^W/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
71 my ($type,$sample,$strand,$chromW,$start,$length,$pathW) = split(/\t/,$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
72 $path = $pathW;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
73 $chrom = $chromW;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
74 @segments = split(/[\>\<]/,$path);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
75 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
76 elsif ($line =~/^P/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
77 my ($type,$strain_chrom,$pathP) = split(/\t/,$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
78 my ($stra,$chromP) = split(/#/,$strain_chrom);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
79 $path = $pathP;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
80 $chrom = $chromP;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
81 @segments = split(/,/,$path);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
82 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
83 $chrom =~s/\.\d+$//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
84 my @genes = split(/\|/,$coordinates{$chrom});
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
85 if ($coordinates{$chrom}){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
86 my $end_segment = 0;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
87 my $start_segment = 0;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
88 my $nb_gene = 0;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
89
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
90 my %genes_located;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
91 my %first_segment_of_gene;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
92 print "$chrom: ". scalar @segments."\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
93 my $cumul = 0;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
94 LOOP_SEGMENT: foreach my $segment(@segments){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
95 if ($segment eq ""){next;}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
96 my $segment_including_strand = $segment;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
97 $segment =~s/\+//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
98 $segment =~s/\-//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
99 my $size = length($segments{$segment});
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
100 $start_segment = $end_segment+1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
101 $end_segment = $start_segment+$size-1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
102 $cumul+=$size;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
103 my $current_gene;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
104
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
105 if ($segment_including_strand =~/\+/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
106 my $pos_in_segment = 0;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
107 for (my $k = $start_segment; $k <= $end_segment; $k++){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
108 $pos_in_segment++;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
109 #print "$k\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
110
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
111 if ($coding_positions{$chrom}{$k}){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
112 my $info_gene = $coding_positions{$chrom}{$k};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
113 chop($info_gene);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
114 my @genes = split(/,/,$info_gene);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
115 foreach my $gene(@genes){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
116 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
117 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
118 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
119 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
120 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
121 elsif ($segment_including_strand =~/\-/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
122 my $pos_in_segment = $size+1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
123 for (my $k = $start_segment; $k <= $end_segment; $k++){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
124 $pos_in_segment--;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
125 if ($coding_positions{$chrom}{$k}){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
126 my $info_gene = $coding_positions{$chrom}{$k};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
127 chop($info_gene);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
128 my @genes = split(/,/,$info_gene);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
129 foreach my $gene(@genes){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
130 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
131 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
132 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
133 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
134 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
135
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
136 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
137 print "$chrom $cumul $end_segment\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
138 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
139 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
140 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
141 close(GFA);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
142
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
143 open(O,">$out");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
144 open(BED,">$out.bed");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
145 foreach my $gene(keys(%segments_of_gene)){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
146 my $segments = $segments_of_gene{$gene};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
147 my @positions = split(/\|/,$segments);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
148 my $previous_segment;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
149
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
150 #print O "$gene $segments\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
151 print O "$gene\t";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
152
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
153 my $concat = "";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
154 foreach my $pos(@positions){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
155
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
156 my ($segment,$pos_in_segment) = split(/:/,$pos);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
157 if ($segment ne $previous_segment){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
158 $concat .= ">$segment:$pos_in_segment";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
159 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
160 $concat .= "-$pos_in_segment";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
161 $previous_segment = $segment;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
162 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
163 my @segments = split(/>/,$concat);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
164 foreach my $segment(@segments){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
165 if (!$segment){next;}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
166 my ($segmentname,$pos) = split(/:/,$segment);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
167 my @positions = split(/-/,$pos);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
168 my $start = $positions[0];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
169 my $end = $positions[$#positions];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
170 if ($start > $end){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
171 $start = $positions[$#positions];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
172 $end = $positions[0];
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
173 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
174 print O ">$segmentname:$start-$end";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
175 print BED "$segmentname $start $end $gene\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
176 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
177 print O "\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
178 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
179 close(O);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
180 close(BED);