annotate Perl/CreateGenePathsFromGFA.pl @ 6:35d71348e81e draft

Uploaded
author dereeper
date Thu, 30 May 2024 12:12:30 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 my $gfa = $ARGV[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8 my $gff3 = $ARGV[1];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 my $out = $ARGV[2];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 my %segment_is_covered_by_genes;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 my %coordinates;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 my %store_gene_paths;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 my %allgenes;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 my %coding_positions;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 open(LENGTH,">$out.gene_length.txt");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 open(F,$gff3);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 while(my $line = <F>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 chomp($line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 my @infos = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 #if ($infos[2] eq "gene" && /CITME_006g014440/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 if ($infos[2] eq "CDS"){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 my $chr;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 # case eukaryotes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 if ($line =~/Parent=([^;]+);*/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 $chr = $infos[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 # case prokaryotes/prokka
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 elsif ($line =~/ID=([^;]+);*/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 $chr = $infos[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 my $start = $infos[3];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 my $end = $infos[4];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 my $gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 $gene =~s/\n//g;$gene =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 if ($line =~/protein_id=([^;]+);/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 $gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 $coordinates{$chr}.= "$gene:$start-$end|";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 for (my $i = $start; $i <= $end; $i++){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 $allgenes{$gene}=1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 $coding_positions{$chr}{$i} = "$gene,";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 my $genelength = $end-$start;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 print LENGTH "$gene $genelength\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 close(F);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 close(LENGTH);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 print "Start GFA parsing ($gfa)...\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 my %segments_chr;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 my %segments_of_gene;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 my %segments;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 #open(ZCAT,"zcat $gfa |");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 open(GFA,$gfa);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 while(my $line = <GFA>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 chomp($line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 if ($line =~/^S/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 my ($type,$segment,$segment_sequence,$SN,$tag) = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 $segments{$segment} = $segment_sequence;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 elsif ($line =~/^P/ or $line =~/^W/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 my $path;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 my $chrom;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 my @segments;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 if ($line =~/^W/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 my ($type,$sample,$strand,$chromW,$start,$length,$pathW) = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 $path = $pathW;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 $chrom = $chromW;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 @segments = split(/[\>\<]/,$path);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 elsif ($line =~/^P/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 my ($type,$strain_chrom,$pathP) = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 my ($stra,$chromP) = split(/#/,$strain_chrom);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 $path = $pathP;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 $chrom = $chromP;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 @segments = split(/,/,$path);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 $chrom =~s/\.\d+$//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84 my @genes = split(/\|/,$coordinates{$chrom});
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 if ($coordinates{$chrom}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 my $end_segment = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 my $start_segment = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 my $nb_gene = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 my %genes_located;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 my %first_segment_of_gene;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 print "$chrom: ". scalar @segments."\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 my $cumul = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 LOOP_SEGMENT: foreach my $segment(@segments){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 if ($segment eq ""){next;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 my $segment_including_strand = $segment;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 $segment =~s/\+//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 $segment =~s/\-//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 my $size = length($segments{$segment});
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 $start_segment = $end_segment+1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 $end_segment = $start_segment+$size-1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 $cumul+=$size;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 my $current_gene;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 if ($segment_including_strand =~/\+/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 my $pos_in_segment = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 for (my $k = $start_segment; $k <= $end_segment; $k++){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 $pos_in_segment++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 #print "$k\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 if ($coding_positions{$chrom}{$k}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 my $info_gene = $coding_positions{$chrom}{$k};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 chop($info_gene);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 my @genes = split(/,/,$info_gene);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 foreach my $gene(@genes){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 elsif ($segment_including_strand =~/\-/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122 my $pos_in_segment = $size+1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 for (my $k = $start_segment; $k <= $end_segment; $k++){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 $pos_in_segment--;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 if ($coding_positions{$chrom}{$k}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126 my $info_gene = $coding_positions{$chrom}{$k};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 chop($info_gene);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 my @genes = split(/,/,$info_gene);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 foreach my $gene(@genes){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 print "$chrom $cumul $end_segment\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 close(GFA);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 open(O,">$out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 open(BED,">$out.bed");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 foreach my $gene(keys(%segments_of_gene)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 my $segments = $segments_of_gene{$gene};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 my @positions = split(/\|/,$segments);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 my $previous_segment;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 #print O "$gene $segments\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 print O "$gene\t";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 my $concat = "";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154 foreach my $pos(@positions){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 my ($segment,$pos_in_segment) = split(/:/,$pos);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 if ($segment ne $previous_segment){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 $concat .= ">$segment:$pos_in_segment";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 $concat .= "-$pos_in_segment";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 $previous_segment = $segment;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 my @segments = split(/>/,$concat);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 foreach my $segment(@segments){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165 if (!$segment){next;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166 my ($segmentname,$pos) = split(/:/,$segment);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167 my @positions = split(/-/,$pos);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 my $start = $positions[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 my $end = $positions[$#positions];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170 if ($start > $end){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 $start = $positions[$#positions];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172 $end = $positions[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 print O ">$segmentname:$start-$end";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 print BED "$segmentname $start $end $gene\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 print O "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179 close(O);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180 close(BED);