Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/GeneratePAVfromBed.pl @ 1:032f6b3806a3 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:16:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3cbb01081cde | 1:032f6b3806a3 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 | |
5 use Graph::Undirected; | |
6 | |
7 my $strain_info = $ARGV[0]; | |
8 my $bed_directory = $ARGV[1]; | |
9 my $PAV = $ARGV[2]; | |
10 my $threshold_coverage = $ARGV[3]; | |
11 | |
12 my %strains; | |
13 open(S,$strain_info); | |
14 while (my $line = <S>) { | |
15 chomp $line; | |
16 my ($id,$name) = split(/\t/,$line); | |
17 $strains{$id} = $name; | |
18 } | |
19 close(S); | |
20 | |
21 my %gene_lengths; | |
22 foreach my $id(keys(%strains)){ | |
23 | |
24 my $gff = `ls $bed_directory/$id*.gff`; | |
25 open(GFF,$gff); | |
26 my $current_gene_length; | |
27 while(my $line = <GFF>){ | |
28 chomp($line); | |
29 my @infos = split(/\t/,$line); | |
30 if ($infos[2] eq "gene" && $line =~/ID=([^;]+);*/){ | |
31 my $chr = $infos[0]; | |
32 my $start = $infos[3]; | |
33 my $end = $infos[4]; | |
34 my $gene = $1; | |
35 my $genelength = $end-$start; | |
36 $current_gene_length = $genelength; | |
37 $gene_lengths{$id}{$gene} = $genelength; | |
38 } | |
39 if ($infos[2] eq "CDS"){ | |
40 my $gene; | |
41 if ($line =~/Parent=([^;]+);*/){ | |
42 $gene = $1; | |
43 } | |
44 elsif ($line =~/ID=([^;]+);*/){ | |
45 $gene = $1; | |
46 } | |
47 my $start = $infos[3]; | |
48 my $end = $infos[4]; | |
49 my $genelength = $end-$start; | |
50 $gene_lengths{$id}{$gene} += $genelength; | |
51 if ($line =~/protein_id=([^;]+);/){ | |
52 $gene = $1; | |
53 $gene_lengths{$id}{$gene} += $genelength; | |
54 } | |
55 } | |
56 } | |
57 close(GFF); | |
58 } | |
59 | |
60 | |
61 | |
62 | |
63 my %genes; | |
64 my $graph = Graph::Undirected->new; | |
65 my $num = 0; | |
66 foreach my $id(keys(%strains)){ | |
67 | |
68 my $file = `ls $bed_directory/$id.*.bed`; | |
69 chomp($file); | |
70 open(B,$file); | |
71 while(my $line = <B>){ | |
72 chomp($line); | |
73 my @infos = split(/\t/,$line); | |
74 my $gene1 = $infos[3]; | |
75 $genes{"$id:$gene1"} = 1; | |
76 } | |
77 close(B); | |
78 | |
79 foreach my $id2(keys(%strains)){ | |
80 my $file2 = `ls $bed_directory/$id2.*.bed`; | |
81 chomp($file2); | |
82 if (-e $file && -e $file2 && $file ne $file2){ | |
83 $num++; | |
84 system("bedtools intersect -a $file -b $file2 -wo >$PAV.$num.intersect.out"); | |
85 my %cumul_match; | |
86 my %cumul_match2; | |
87 open(INTER,"$PAV.$num.intersect.out"); | |
88 while(<INTER>){ | |
89 my $line = $_; | |
90 $line =~s/\n//g;$line =~s/\r//g; | |
91 my ($segment,$start_gene1,$end_gene1,$gene1,$segment,$start_gene2,$end_gene2,$gene2,$size_overlap) = split(/\t/,$line); | |
92 $cumul_match{"$gene1;$gene2"}+=$size_overlap; | |
93 } | |
94 close(INTER); | |
95 | |
96 #unlink("$PAV.$num.intersect.out"); | |
97 | |
98 foreach my $pair(keys(%cumul_match)){ | |
99 my $size1 = $cumul_match{$pair}; | |
100 my ($gene1,$gene2) = split(/;/,$pair); | |
101 my $gene1_length = $gene_lengths{$id}{$gene1}; | |
102 my $gene2_length = $gene_lengths{$id2}{$gene2}; | |
103 my $size_match_gene = $cumul_match{"$gene1;$gene2"}; | |
104 my $percentage_overlap_gene1 = ($size_match_gene/$gene1_length)*100; | |
105 my $percentage_overlap_gene2 = ($size_match_gene/$gene2_length)*100; | |
106 #print "$pair $size_match_gene $percentage_overlap_gene1 $percentage_overlap_gene2\n"; | |
107 if ($percentage_overlap_gene1 > $threshold_coverage && $percentage_overlap_gene2 > $threshold_coverage){ | |
108 $graph->add_edge("$id:$gene1","$id2:$gene2"); | |
109 delete($genes{"$id:$gene1"}); | |
110 delete($genes{"$id2:$gene2"}); | |
111 } | |
112 } | |
113 } | |
114 } | |
115 } | |
116 | |
117 open(OUT,">$PAV"); | |
118 print OUT "ClutserID"; | |
119 foreach my $id(sort keys(%strains)){ | |
120 my $name = $strains{$id}; | |
121 print OUT "\t".$name; | |
122 } | |
123 print OUT "\n"; | |
124 my $clnum = 0; | |
125 my @cc = $graph->connected_components(); | |
126 foreach my $component (@cc){ | |
127 $clnum++; | |
128 print OUT $clnum; | |
129 my @genes = @$component; | |
130 my %h; | |
131 foreach my $gene(@genes){ | |
132 my ($id,$genename) = split(/:/,$gene); | |
133 $h{$id}.="$genename,"; | |
134 } | |
135 foreach my $id(sort keys(%strains)){ | |
136 my $ids = "-"; | |
137 if ($h{$id}){ | |
138 $ids = $h{$id}; | |
139 chop($ids); | |
140 } | |
141 print OUT "\t".$ids; | |
142 } | |
143 print OUT "\n"; | |
144 } | |
145 ####################### | |
146 # add singletons | |
147 ####################### | |
148 foreach my $gene(keys(%genes)){ | |
149 $clnum++; | |
150 print OUT $clnum; | |
151 my ($id,$genename) = split(/:/,$gene); | |
152 my %h; | |
153 $h{$id}.="$genename,"; | |
154 foreach my $id(sort keys(%strains)){ | |
155 my $ids = "-"; | |
156 if ($h{$id}){ | |
157 $ids = $h{$id}; | |
158 chop($ids); | |
159 } | |
160 print OUT "\t".$ids; | |
161 } | |
162 print OUT "\n"; | |
163 } | |
164 close(OUT); |