Mercurial > repos > dereeper > pangenome_explorer
view PanExplorer_workflow/Perl/GeneratePAVfromBed.pl @ 2:97e4e3e818b6 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:48:09 +0000 |
parents | 032f6b3806a3 |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use Graph::Undirected; my $strain_info = $ARGV[0]; my $bed_directory = $ARGV[1]; my $PAV = $ARGV[2]; my $threshold_coverage = $ARGV[3]; my %strains; open(S,$strain_info); while (my $line = <S>) { chomp $line; my ($id,$name) = split(/\t/,$line); $strains{$id} = $name; } close(S); my %gene_lengths; foreach my $id(keys(%strains)){ my $gff = `ls $bed_directory/$id*.gff`; open(GFF,$gff); my $current_gene_length; while(my $line = <GFF>){ chomp($line); my @infos = split(/\t/,$line); if ($infos[2] eq "gene" && $line =~/ID=([^;]+);*/){ my $chr = $infos[0]; my $start = $infos[3]; my $end = $infos[4]; my $gene = $1; my $genelength = $end-$start; $current_gene_length = $genelength; $gene_lengths{$id}{$gene} = $genelength; } if ($infos[2] eq "CDS"){ my $gene; if ($line =~/Parent=([^;]+);*/){ $gene = $1; } elsif ($line =~/ID=([^;]+);*/){ $gene = $1; } my $start = $infos[3]; my $end = $infos[4]; my $genelength = $end-$start; $gene_lengths{$id}{$gene} += $genelength; if ($line =~/protein_id=([^;]+);/){ $gene = $1; $gene_lengths{$id}{$gene} += $genelength; } } } close(GFF); } my %genes; my $graph = Graph::Undirected->new; my $num = 0; foreach my $id(keys(%strains)){ my $file = `ls $bed_directory/$id.*.bed`; chomp($file); open(B,$file); while(my $line = <B>){ chomp($line); my @infos = split(/\t/,$line); my $gene1 = $infos[3]; $genes{"$id:$gene1"} = 1; } close(B); foreach my $id2(keys(%strains)){ my $file2 = `ls $bed_directory/$id2.*.bed`; chomp($file2); if (-e $file && -e $file2 && $file ne $file2){ $num++; system("bedtools intersect -a $file -b $file2 -wo >$PAV.$num.intersect.out"); my %cumul_match; my %cumul_match2; open(INTER,"$PAV.$num.intersect.out"); while(<INTER>){ my $line = $_; $line =~s/\n//g;$line =~s/\r//g; my ($segment,$start_gene1,$end_gene1,$gene1,$segment,$start_gene2,$end_gene2,$gene2,$size_overlap) = split(/\t/,$line); $cumul_match{"$gene1;$gene2"}+=$size_overlap; } close(INTER); #unlink("$PAV.$num.intersect.out"); foreach my $pair(keys(%cumul_match)){ my $size1 = $cumul_match{$pair}; my ($gene1,$gene2) = split(/;/,$pair); my $gene1_length = $gene_lengths{$id}{$gene1}; my $gene2_length = $gene_lengths{$id2}{$gene2}; my $size_match_gene = $cumul_match{"$gene1;$gene2"}; my $percentage_overlap_gene1 = ($size_match_gene/$gene1_length)*100; my $percentage_overlap_gene2 = ($size_match_gene/$gene2_length)*100; #print "$pair $size_match_gene $percentage_overlap_gene1 $percentage_overlap_gene2\n"; if ($percentage_overlap_gene1 > $threshold_coverage && $percentage_overlap_gene2 > $threshold_coverage){ $graph->add_edge("$id:$gene1","$id2:$gene2"); delete($genes{"$id:$gene1"}); delete($genes{"$id2:$gene2"}); } } } } } open(OUT,">$PAV"); print OUT "ClutserID"; foreach my $id(sort keys(%strains)){ my $name = $strains{$id}; print OUT "\t".$name; } print OUT "\n"; my $clnum = 0; my @cc = $graph->connected_components(); foreach my $component (@cc){ $clnum++; print OUT $clnum; my @genes = @$component; my %h; foreach my $gene(@genes){ my ($id,$genename) = split(/:/,$gene); $h{$id}.="$genename,"; } foreach my $id(sort keys(%strains)){ my $ids = "-"; if ($h{$id}){ $ids = $h{$id}; chop($ids); } print OUT "\t".$ids; } print OUT "\n"; } ####################### # add singletons ####################### foreach my $gene(keys(%genes)){ $clnum++; print OUT $clnum; my ($id,$genename) = split(/:/,$gene); my %h; $h{$id}.="$genename,"; foreach my $id(sort keys(%strains)){ my $ids = "-"; if ($h{$id}){ $ids = $h{$id}; chop($ids); } print OUT "\t".$ids; } print OUT "\n"; } close(OUT);