comparison Perl/GeneratePAVfromBed.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 use strict;
4
5 use Graph::Undirected;
6
7 my $strain_info = $ARGV[0];
8 my $bed_directory = $ARGV[1];
9 my $PAV = $ARGV[2];
10 my $threshold_coverage = $ARGV[3];
11
12 my %strains;
13 open(S,$strain_info);
14 while (my $line = <S>) {
15 chomp $line;
16 my ($id,$name) = split(/\t/,$line);
17 $strains{$id} = $name;
18 }
19 close(S);
20
21 my %gene_lengths;
22 foreach my $id(keys(%strains)){
23
24 my $gff = `ls $bed_directory/$id*.gff`;
25 open(GFF,$gff);
26 my $current_gene_length;
27 while(my $line = <GFF>){
28 chomp($line);
29 my @infos = split(/\t/,$line);
30 if ($infos[2] eq "gene" && $line =~/ID=([^;]+);*/){
31 my $chr = $infos[0];
32 my $start = $infos[3];
33 my $end = $infos[4];
34 my $gene = $1;
35 my $genelength = $end-$start;
36 $current_gene_length = $genelength;
37 $gene_lengths{$id}{$gene} = $genelength;
38 }
39 if ($infos[2] eq "CDS"){
40 my $gene;
41 if ($line =~/Parent=([^;]+);*/){
42 $gene = $1;
43 }
44 elsif ($line =~/ID=([^;]+);*/){
45 $gene = $1;
46 }
47 my $start = $infos[3];
48 my $end = $infos[4];
49 my $genelength = $end-$start;
50 $gene_lengths{$id}{$gene} += $genelength;
51 if ($line =~/protein_id=([^;]+);/){
52 $gene = $1;
53 $gene_lengths{$id}{$gene} += $genelength;
54 }
55 }
56 }
57 close(GFF);
58 }
59
60
61
62
63 my %genes;
64 my $graph = Graph::Undirected->new;
65 my $num = 0;
66 foreach my $id(keys(%strains)){
67
68 my $file = `ls $bed_directory/$id.*.bed`;
69 chomp($file);
70 open(B,$file);
71 while(my $line = <B>){
72 chomp($line);
73 my @infos = split(/\t/,$line);
74 my $gene1 = $infos[3];
75 $genes{"$id:$gene1"} = 1;
76 }
77 close(B);
78
79 foreach my $id2(keys(%strains)){
80 my $file2 = `ls $bed_directory/$id2.*.bed`;
81 chomp($file2);
82 if (-e $file && -e $file2 && $file ne $file2){
83 $num++;
84 system("bedtools intersect -a $file -b $file2 -wo >$PAV.$num.intersect.out");
85 my %cumul_match;
86 my %cumul_match2;
87 open(INTER,"$PAV.$num.intersect.out");
88 while(<INTER>){
89 my $line = $_;
90 $line =~s/\n//g;$line =~s/\r//g;
91 my ($segment,$start_gene1,$end_gene1,$gene1,$segment,$start_gene2,$end_gene2,$gene2,$size_overlap) = split(/\t/,$line);
92 $cumul_match{"$gene1;$gene2"}+=$size_overlap;
93 }
94 close(INTER);
95
96 #unlink("$PAV.$num.intersect.out");
97
98 foreach my $pair(keys(%cumul_match)){
99 my $size1 = $cumul_match{$pair};
100 my ($gene1,$gene2) = split(/;/,$pair);
101 my $gene1_length = $gene_lengths{$id}{$gene1};
102 my $gene2_length = $gene_lengths{$id2}{$gene2};
103 my $size_match_gene = $cumul_match{"$gene1;$gene2"};
104 my $percentage_overlap_gene1 = ($size_match_gene/$gene1_length)*100;
105 my $percentage_overlap_gene2 = ($size_match_gene/$gene2_length)*100;
106 #print "$pair $size_match_gene $percentage_overlap_gene1 $percentage_overlap_gene2\n";
107 if ($percentage_overlap_gene1 > $threshold_coverage && $percentage_overlap_gene2 > $threshold_coverage){
108 $graph->add_edge("$id:$gene1","$id2:$gene2");
109 delete($genes{"$id:$gene1"});
110 delete($genes{"$id2:$gene2"});
111 }
112 }
113 }
114 }
115 }
116
117 open(OUT,">$PAV");
118 print OUT "ClutserID";
119 foreach my $id(sort keys(%strains)){
120 my $name = $strains{$id};
121 print OUT "\t".$name;
122 }
123 print OUT "\n";
124 my $clnum = 0;
125 my @cc = $graph->connected_components();
126 foreach my $component (@cc){
127 $clnum++;
128 print OUT $clnum;
129 my @genes = @$component;
130 my %h;
131 foreach my $gene(@genes){
132 my ($id,$genename) = split(/:/,$gene);
133 $h{$id}.="$genename,";
134 }
135 foreach my $id(sort keys(%strains)){
136 my $ids = "-";
137 if ($h{$id}){
138 $ids = $h{$id};
139 chop($ids);
140 }
141 print OUT "\t".$ids;
142 }
143 print OUT "\n";
144 }
145 #######################
146 # add singletons
147 #######################
148 foreach my $gene(keys(%genes)){
149 $clnum++;
150 print OUT $clnum;
151 my ($id,$genename) = split(/:/,$gene);
152 my %h;
153 $h{$id}.="$genename,";
154 foreach my $id(sort keys(%strains)){
155 my $ids = "-";
156 if ($h{$id}){
157 $ids = $h{$id};
158 chop($ids);
159 }
160 print OUT "\t".$ids;
161 }
162 print OUT "\n";
163 }
164 close(OUT);