diff PanExplorer_workflow/Perl/GeneratePAVfromBed.pl @ 1:032f6b3806a3 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:16:08 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PanExplorer_workflow/Perl/GeneratePAVfromBed.pl	Thu May 30 11:16:08 2024 +0000
@@ -0,0 +1,164 @@
+#!/usr/bin/perl
+
+use strict;
+
+use Graph::Undirected;
+
+my $strain_info = $ARGV[0];
+my $bed_directory = $ARGV[1];
+my $PAV = $ARGV[2];
+my $threshold_coverage = $ARGV[3];
+
+my %strains;
+open(S,$strain_info);
+while (my $line = <S>) {
+	chomp $line;
+	my ($id,$name) = split(/\t/,$line);
+	$strains{$id} = $name;
+}
+close(S);
+
+my %gene_lengths;
+foreach my $id(keys(%strains)){
+
+	my $gff = `ls $bed_directory/$id*.gff`;
+	open(GFF,$gff);
+	my $current_gene_length;
+	while(my $line = <GFF>){
+		chomp($line);
+		my @infos = split(/\t/,$line);
+		if ($infos[2] eq "gene" && $line =~/ID=([^;]+);*/){
+			my $chr = $infos[0];
+			my $start = $infos[3];
+			my $end = $infos[4];
+			my $gene = $1;
+			my $genelength = $end-$start;
+			$current_gene_length = $genelength;
+			$gene_lengths{$id}{$gene} = $genelength;
+		}
+		if ($infos[2] eq "CDS"){
+			my $gene;
+			if ($line =~/Parent=([^;]+);*/){
+				$gene = $1;
+			}
+			elsif ($line =~/ID=([^;]+);*/){
+                                $gene = $1;
+                        }
+			my $start = $infos[3];
+			my $end = $infos[4];
+			my $genelength = $end-$start;
+			$gene_lengths{$id}{$gene} += $genelength;
+			if ($line =~/protein_id=([^;]+);/){
+				$gene = $1;
+				$gene_lengths{$id}{$gene} += $genelength;
+			}
+		}
+	}
+	close(GFF);
+}
+
+
+
+
+my %genes;
+my $graph = Graph::Undirected->new;
+my $num = 0;
+foreach my $id(keys(%strains)){
+
+	my $file = `ls $bed_directory/$id.*.bed`;
+	chomp($file);
+	open(B,$file);
+	while(my $line = <B>){
+		chomp($line);
+		my @infos = split(/\t/,$line);
+		my $gene1 = $infos[3];
+		$genes{"$id:$gene1"} = 1;
+	}
+	close(B);
+
+	foreach my $id2(keys(%strains)){
+		my $file2 = `ls $bed_directory/$id2.*.bed`;
+		chomp($file2);
+		if (-e $file && -e $file2 && $file ne $file2){
+			$num++;
+			system("bedtools intersect -a $file -b $file2 -wo >$PAV.$num.intersect.out");
+			my %cumul_match;
+			my %cumul_match2;
+			open(INTER,"$PAV.$num.intersect.out");
+			while(<INTER>){
+				my $line = $_;
+				$line =~s/\n//g;$line =~s/\r//g;
+				my ($segment,$start_gene1,$end_gene1,$gene1,$segment,$start_gene2,$end_gene2,$gene2,$size_overlap) = split(/\t/,$line);
+				$cumul_match{"$gene1;$gene2"}+=$size_overlap;
+			}
+			close(INTER);
+
+			#unlink("$PAV.$num.intersect.out");
+
+			foreach my $pair(keys(%cumul_match)){
+				my $size1 = $cumul_match{$pair};
+				my ($gene1,$gene2) = split(/;/,$pair);
+				my $gene1_length = $gene_lengths{$id}{$gene1};
+				my $gene2_length = $gene_lengths{$id2}{$gene2};
+				my $size_match_gene = $cumul_match{"$gene1;$gene2"};
+				my $percentage_overlap_gene1 = ($size_match_gene/$gene1_length)*100;
+				my $percentage_overlap_gene2 = ($size_match_gene/$gene2_length)*100;
+				#print "$pair $size_match_gene $percentage_overlap_gene1 $percentage_overlap_gene2\n";
+				if ($percentage_overlap_gene1 > $threshold_coverage && $percentage_overlap_gene2 > $threshold_coverage){
+					$graph->add_edge("$id:$gene1","$id2:$gene2");
+					delete($genes{"$id:$gene1"});
+					delete($genes{"$id2:$gene2"});
+				}
+			}
+		}
+	}
+}
+
+open(OUT,">$PAV");
+print OUT "ClutserID";
+foreach my $id(sort keys(%strains)){
+	my $name = $strains{$id};
+	print OUT "\t".$name;
+}
+print OUT "\n";
+my $clnum = 0;
+my @cc = $graph->connected_components();
+foreach my $component (@cc){
+	$clnum++;
+	print OUT $clnum;
+        my @genes = @$component;
+	my %h;
+	foreach my $gene(@genes){
+		my ($id,$genename) = split(/:/,$gene);
+		$h{$id}.="$genename,";
+	}	
+	foreach my $id(sort keys(%strains)){
+		my $ids = "-";
+		if ($h{$id}){
+			$ids = $h{$id};	
+			chop($ids);
+		}
+		print OUT "\t".$ids;
+	}
+	print OUT "\n";
+}
+#######################
+# add singletons
+#######################
+foreach my $gene(keys(%genes)){
+	$clnum++;
+	print OUT $clnum;
+	my ($id,$genename) = split(/:/,$gene);
+	my %h;
+	$h{$id}.="$genename,";
+	foreach my $id(sort keys(%strains)){
+                my $ids = "-";
+                if ($h{$id}){
+                        $ids = $h{$id};
+                        chop($ids);
+                }
+                print OUT "\t".$ids;
+        }
+        print OUT "\n";	
+}
+close(OUT);