Mercurial > repos > dereeper > pangenome_explorer
view Perl/ProjectPAVinCircos.pl @ 5:4157f435fa89 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 12:11:25 +0000 |
parents | e42d30da7a74 |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; my $pav = $ARGV[0]; my $ref = $ARGV[1]; my $strain_info = $ARGV[2]; my $gff = $ARGV[3]; my $ordering = $ARGV[4]; my $outfile = $ARGV[5]; my $circos_exe = "circos/bin/circos"; #my $circos_exe = "circos"; my %metadata; open(M,$strain_info); while(my $line = <M>){ chomp($line); my ($id,$name) = split(/\t/,$line); $metadata{$id} = $name; $metadata{$name} = $id; } close(M); my $pangenome_size = 0; my $nb_core_genes = 0; my $nb_dispensable_genes = 0; my $nb_specific_genes = 0; my %cluster_of_gene; my %genes_of_cluster; my %tagging; open(P,$pav); my $firstline = <P>; chomp($firstline); my @infos = split(/\t/,$firstline); my %samples; my %hash_presence; my $col; for (my $j = 1; $j <= $#infos; $j++){ my $val = $infos[$j]; if ($val eq $ref){$col = $j;} $samples{$j} = $val; } while(my $line = <P>){ chomp($line); my @infos = split(/\t/,$line); my $cluster_num = $infos[0]; my $clnb = $infos[0]; my $pos = $clnb; my $pos_before = $pos - 1; $pangenome_size++; if (length($clnb) == 1){$clnb = "000".$clnb;} elsif (length($clnb) == 2){$clnb = "00".$clnb;} elsif (length($clnb) == 3){$clnb = "0".$clnb;} my $name = "CLUSTER".$clnb; # exclude lines where several genes for one strains if ($line =~/,/){next;} my $gene_ref = $infos[$col]; for (my $j = 1; $j <= $#infos; $j++){ my $gene = $infos[$j]; my $sample = $samples{$j}; if ($gene =~/\w+/){ $hash_presence{$gene_ref}{$sample} = 1; } } } close(P); my $track_thin = 0.5 / scalar keys(%samples); my $r0 = 0.4; my $r1 = $r0+$track_thin; my $highlight_block = ""; my %max_position; open(O,$ordering); <O>; while(my $sample_name = <O>){ #foreach my $sample(keys(%samples)){ chomp($sample_name); #my $sample_name = $samples{$sample}; my $sample_id = $metadata{$sample_name}; print "$sample_name $sample_id\n"; my $ref_id = $metadata{$ref}; open(GFF,$gff); open(CIRCOS_TRACK,">$outfile.$sample_id.circos.heatmap.txt"); my $current_start; my $current_stop; my $current_chrom; $r0+=$track_thin; $r1+=$track_thin; #my $r1_final = $r1-($track_thin/3); my $r1_final = $r1; $r0.="r"; $r1_final.="r"; $highlight_block .= qq~ <highlight> file = $outfile.$sample_id.circos.heatmap.txt r0 = $r0 r1 = $r1_final </highlight> ~; while(my $line = <GFF>){ chomp($line); my @infos = split(/\t/,$line); if ($infos[2] eq "gene" && $line =~/ID=([^;]+);*/){ my $chr = $infos[0]; my $start = $infos[3]; my $end = $infos[4]; $current_start = $start; $current_stop = $end; $current_chrom = $chr; if ($end > $max_position{$current_chrom}){$max_position{$current_chrom} = $end;} my $gene = $1; my $genelength = $end-$start; my $gene_complete = $sample_name.":".$gene; if ($hash_presence{$gene}{$sample_name}){ print CIRCOS_TRACK "$current_chrom $current_start $current_stop fill_color=red\n"; } } if ($infos[2] eq "CDS" && $line =~/Parent=([^;]+);*/){ my $gene = $1; my $gene_complete = $sample_name.":".$gene; if ($line =~/protein_id=([^;]+);/){ $gene = $1; if ($hash_presence{$gene}{$sample_name}){ print CIRCOS_TRACK "$current_chrom $current_start $current_stop fill_color=red\n"; } } } } close(GFF); close(CIRCOS_TRACK); } open(KARYOTYPE,">$outfile.karyotype.txt"); foreach my $chr(keys(%max_position)){ my $max = $max_position{$chr}; print KARYOTYPE "chr - $chr 1 0 $max black"; } close(KARYOTYPE); open(F,"circos_templates/circos1.conf"); open(CIRCOS_CONF,">$outfile.circos1.conf"); while(<F>){ if (/HIGHLIGHT_BLOCK/){print CIRCOS_CONF $highlight_block;} elsif (/KARYOTYPE_PATH/){print CIRCOS_CONF "karyotype = $outfile.karyotype.txt\n";} elsif (/OUTPUT_PNG/){print CIRCOS_CONF "file = $outfile\n";} else{print CIRCOS_CONF $_;} } close(F); close(CIRCOS_CONF); system("$circos_exe -conf $outfile.circos1.conf");