1
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4
|
|
5 use Graph::Undirected;
|
|
6
|
|
7 my $strain_info = $ARGV[0];
|
|
8 my $bed_directory = $ARGV[1];
|
|
9 my $PAV = $ARGV[2];
|
|
10 my $threshold_coverage = $ARGV[3];
|
|
11
|
|
12 my %strains;
|
|
13 open(S,$strain_info);
|
|
14 while (my $line = <S>) {
|
|
15 chomp $line;
|
|
16 my ($id,$name) = split(/\t/,$line);
|
|
17 $strains{$id} = $name;
|
|
18 }
|
|
19 close(S);
|
|
20
|
|
21 my %gene_lengths;
|
|
22 foreach my $id(keys(%strains)){
|
|
23
|
|
24 my $gff = `ls $bed_directory/$id*.gff`;
|
|
25 open(GFF,$gff);
|
|
26 my $current_gene_length;
|
|
27 while(my $line = <GFF>){
|
|
28 chomp($line);
|
|
29 my @infos = split(/\t/,$line);
|
|
30 if ($infos[2] eq "gene" && $line =~/ID=([^;]+);*/){
|
|
31 my $chr = $infos[0];
|
|
32 my $start = $infos[3];
|
|
33 my $end = $infos[4];
|
|
34 my $gene = $1;
|
|
35 my $genelength = $end-$start;
|
|
36 $current_gene_length = $genelength;
|
|
37 $gene_lengths{$id}{$gene} = $genelength;
|
|
38 }
|
|
39 if ($infos[2] eq "CDS"){
|
|
40 my $gene;
|
|
41 if ($line =~/Parent=([^;]+);*/){
|
|
42 $gene = $1;
|
|
43 }
|
|
44 elsif ($line =~/ID=([^;]+);*/){
|
|
45 $gene = $1;
|
|
46 }
|
|
47 my $start = $infos[3];
|
|
48 my $end = $infos[4];
|
|
49 my $genelength = $end-$start;
|
|
50 $gene_lengths{$id}{$gene} += $genelength;
|
|
51 if ($line =~/protein_id=([^;]+);/){
|
|
52 $gene = $1;
|
|
53 $gene_lengths{$id}{$gene} += $genelength;
|
|
54 }
|
|
55 }
|
|
56 }
|
|
57 close(GFF);
|
|
58 }
|
|
59
|
|
60
|
|
61
|
|
62
|
|
63 my %genes;
|
|
64 my $graph = Graph::Undirected->new;
|
|
65 my $num = 0;
|
|
66 foreach my $id(keys(%strains)){
|
|
67
|
|
68 my $file = `ls $bed_directory/$id.*.bed`;
|
|
69 chomp($file);
|
|
70 open(B,$file);
|
|
71 while(my $line = <B>){
|
|
72 chomp($line);
|
|
73 my @infos = split(/\t/,$line);
|
|
74 my $gene1 = $infos[3];
|
|
75 $genes{"$id:$gene1"} = 1;
|
|
76 }
|
|
77 close(B);
|
|
78
|
|
79 foreach my $id2(keys(%strains)){
|
|
80 my $file2 = `ls $bed_directory/$id2.*.bed`;
|
|
81 chomp($file2);
|
|
82 if (-e $file && -e $file2 && $file ne $file2){
|
|
83 $num++;
|
|
84 system("bedtools intersect -a $file -b $file2 -wo >$PAV.$num.intersect.out");
|
|
85 my %cumul_match;
|
|
86 my %cumul_match2;
|
|
87 open(INTER,"$PAV.$num.intersect.out");
|
|
88 while(<INTER>){
|
|
89 my $line = $_;
|
|
90 $line =~s/\n//g;$line =~s/\r//g;
|
|
91 my ($segment,$start_gene1,$end_gene1,$gene1,$segment,$start_gene2,$end_gene2,$gene2,$size_overlap) = split(/\t/,$line);
|
|
92 $cumul_match{"$gene1;$gene2"}+=$size_overlap;
|
|
93 }
|
|
94 close(INTER);
|
|
95
|
|
96 #unlink("$PAV.$num.intersect.out");
|
|
97
|
|
98 foreach my $pair(keys(%cumul_match)){
|
|
99 my $size1 = $cumul_match{$pair};
|
|
100 my ($gene1,$gene2) = split(/;/,$pair);
|
|
101 my $gene1_length = $gene_lengths{$id}{$gene1};
|
|
102 my $gene2_length = $gene_lengths{$id2}{$gene2};
|
|
103 my $size_match_gene = $cumul_match{"$gene1;$gene2"};
|
|
104 my $percentage_overlap_gene1 = ($size_match_gene/$gene1_length)*100;
|
|
105 my $percentage_overlap_gene2 = ($size_match_gene/$gene2_length)*100;
|
|
106 #print "$pair $size_match_gene $percentage_overlap_gene1 $percentage_overlap_gene2\n";
|
|
107 if ($percentage_overlap_gene1 > $threshold_coverage && $percentage_overlap_gene2 > $threshold_coverage){
|
|
108 $graph->add_edge("$id:$gene1","$id2:$gene2");
|
|
109 delete($genes{"$id:$gene1"});
|
|
110 delete($genes{"$id2:$gene2"});
|
|
111 }
|
|
112 }
|
|
113 }
|
|
114 }
|
|
115 }
|
|
116
|
|
117 open(OUT,">$PAV");
|
|
118 print OUT "ClutserID";
|
|
119 foreach my $id(sort keys(%strains)){
|
|
120 my $name = $strains{$id};
|
|
121 print OUT "\t".$name;
|
|
122 }
|
|
123 print OUT "\n";
|
|
124 my $clnum = 0;
|
|
125 my @cc = $graph->connected_components();
|
|
126 foreach my $component (@cc){
|
|
127 $clnum++;
|
|
128 print OUT $clnum;
|
|
129 my @genes = @$component;
|
|
130 my %h;
|
|
131 foreach my $gene(@genes){
|
|
132 my ($id,$genename) = split(/:/,$gene);
|
|
133 $h{$id}.="$genename,";
|
|
134 }
|
|
135 foreach my $id(sort keys(%strains)){
|
|
136 my $ids = "-";
|
|
137 if ($h{$id}){
|
|
138 $ids = $h{$id};
|
|
139 chop($ids);
|
|
140 }
|
|
141 print OUT "\t".$ids;
|
|
142 }
|
|
143 print OUT "\n";
|
|
144 }
|
|
145 #######################
|
|
146 # add singletons
|
|
147 #######################
|
|
148 foreach my $gene(keys(%genes)){
|
|
149 $clnum++;
|
|
150 print OUT $clnum;
|
|
151 my ($id,$genename) = split(/:/,$gene);
|
|
152 my %h;
|
|
153 $h{$id}.="$genename,";
|
|
154 foreach my $id(sort keys(%strains)){
|
|
155 my $ids = "-";
|
|
156 if ($h{$id}){
|
|
157 $ids = $h{$id};
|
|
158 chop($ids);
|
|
159 }
|
|
160 print OUT "\t".$ids;
|
|
161 }
|
|
162 print OUT "\n";
|
|
163 }
|
|
164 close(OUT);
|