annotate Perl/GeneratePAVfromBed.pl @ 8:f919ffe96425 draft

Uploaded
author dereeper
date Thu, 30 May 2024 14:31:04 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 use Graph::Undirected;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 my $strain_info = $ARGV[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8 my $bed_directory = $ARGV[1];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 my $PAV = $ARGV[2];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10 my $threshold_coverage = $ARGV[3];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 my %strains;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 open(S,$strain_info);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 while (my $line = <S>) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 chomp $line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 my ($id,$name) = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 $strains{$id} = $name;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 close(S);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 my %gene_lengths;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 foreach my $id(keys(%strains)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 my $gff = `ls $bed_directory/$id*.gff`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 open(GFF,$gff);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 my $current_gene_length;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 while(my $line = <GFF>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 chomp($line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 my @infos = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 if ($infos[2] eq "gene" && $line =~/ID=([^;]+);*/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 my $chr = $infos[0];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 my $start = $infos[3];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 my $end = $infos[4];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 my $gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 my $genelength = $end-$start;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 $current_gene_length = $genelength;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 $gene_lengths{$id}{$gene} = $genelength;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 if ($infos[2] eq "CDS"){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 my $gene;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 if ($line =~/Parent=([^;]+);*/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 $gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 elsif ($line =~/ID=([^;]+);*/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 $gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 my $start = $infos[3];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 my $end = $infos[4];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 my $genelength = $end-$start;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 $gene_lengths{$id}{$gene} += $genelength;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 if ($line =~/protein_id=([^;]+);/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 $gene = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 $gene_lengths{$id}{$gene} += $genelength;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 close(GFF);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 my %genes;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 my $graph = Graph::Undirected->new;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 my $num = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 foreach my $id(keys(%strains)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 my $file = `ls $bed_directory/$id.*.bed`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 chomp($file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 open(B,$file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 while(my $line = <B>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 chomp($line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 my @infos = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 my $gene1 = $infos[3];
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 $genes{"$id:$gene1"} = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 close(B);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 foreach my $id2(keys(%strains)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 my $file2 = `ls $bed_directory/$id2.*.bed`;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 chomp($file2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 if (-e $file && -e $file2 && $file ne $file2){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 $num++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84 system("bedtools intersect -a $file -b $file2 -wo >$PAV.$num.intersect.out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 my %cumul_match;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 my %cumul_match2;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 open(INTER,"$PAV.$num.intersect.out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 while(<INTER>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 my ($segment,$start_gene1,$end_gene1,$gene1,$segment,$start_gene2,$end_gene2,$gene2,$size_overlap) = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 $cumul_match{"$gene1;$gene2"}+=$size_overlap;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 close(INTER);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 #unlink("$PAV.$num.intersect.out");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 foreach my $pair(keys(%cumul_match)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 my $size1 = $cumul_match{$pair};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 my ($gene1,$gene2) = split(/;/,$pair);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 my $gene1_length = $gene_lengths{$id}{$gene1};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 my $gene2_length = $gene_lengths{$id2}{$gene2};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 my $size_match_gene = $cumul_match{"$gene1;$gene2"};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 my $percentage_overlap_gene1 = ($size_match_gene/$gene1_length)*100;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 my $percentage_overlap_gene2 = ($size_match_gene/$gene2_length)*100;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 #print "$pair $size_match_gene $percentage_overlap_gene1 $percentage_overlap_gene2\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 if ($percentage_overlap_gene1 > $threshold_coverage && $percentage_overlap_gene2 > $threshold_coverage){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 $graph->add_edge("$id:$gene1","$id2:$gene2");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 delete($genes{"$id:$gene1"});
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 delete($genes{"$id2:$gene2"});
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 open(OUT,">$PAV");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118 print OUT "ClutserID";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 foreach my $id(sort keys(%strains)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 my $name = $strains{$id};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 print OUT "\t".$name;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 print OUT "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 my $clnum = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 my @cc = $graph->connected_components();
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126 foreach my $component (@cc){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 $clnum++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 print OUT $clnum;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 my @genes = @$component;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 my %h;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 foreach my $gene(@genes){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 my ($id,$genename) = split(/:/,$gene);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 $h{$id}.="$genename,";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 foreach my $id(sort keys(%strains)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 my $ids = "-";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 if ($h{$id}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 $ids = $h{$id};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 chop($ids);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 print OUT "\t".$ids;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 print OUT "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 #######################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 # add singletons
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 #######################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 foreach my $gene(keys(%genes)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 $clnum++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 print OUT $clnum;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 my ($id,$genename) = split(/:/,$gene);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 my %h;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 $h{$id}.="$genename,";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154 foreach my $id(sort keys(%strains)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155 my $ids = "-";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 if ($h{$id}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 $ids = $h{$id};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 chop($ids);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 print OUT "\t".$ids;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 print OUT "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 close(OUT);