Mercurial > repos > dereeper > pangenome_explorer
diff PanExplorer_workflow/Perl/CreateGenePathsFromGFA.pl @ 1:032f6b3806a3 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:16:08 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PanExplorer_workflow/Perl/CreateGenePathsFromGFA.pl Thu May 30 11:16:08 2024 +0000 @@ -0,0 +1,180 @@ +#!/usr/bin/perl + +use strict; + + + +my $gfa = $ARGV[0]; +my $gff3 = $ARGV[1]; +my $out = $ARGV[2]; + +my %segment_is_covered_by_genes; +my %coordinates; +my %store_gene_paths; + +my %allgenes; +my %coding_positions; +open(LENGTH,">$out.gene_length.txt"); +open(F,$gff3); +while(my $line = <F>){ + chomp($line); + my @infos = split(/\t/,$line); + #if ($infos[2] eq "gene" && /CITME_006g014440/){ + if ($infos[2] eq "CDS"){ + my $chr; + # case eukaryotes + if ($line =~/Parent=([^;]+);*/){ + $chr = $infos[0]; + } + # case prokaryotes/prokka + elsif ($line =~/ID=([^;]+);*/){ + $chr = $infos[0]; + } + my $start = $infos[3]; + my $end = $infos[4]; + my $gene = $1; + $gene =~s/\n//g;$gene =~s/\r//g; + if ($line =~/protein_id=([^;]+);/){ + $gene = $1; + } + $coordinates{$chr}.= "$gene:$start-$end|"; + for (my $i = $start; $i <= $end; $i++){ + $allgenes{$gene}=1; + $coding_positions{$chr}{$i} = "$gene,"; + } + my $genelength = $end-$start; + print LENGTH "$gene $genelength\n"; + } +} +close(F); +close(LENGTH); + +print "Start GFA parsing ($gfa)...\n"; + +my %segments_chr; +my %segments_of_gene; +my %segments; +#open(ZCAT,"zcat $gfa |"); +open(GFA,$gfa); +while(my $line = <GFA>){ + chomp($line); + if ($line =~/^S/){ + my ($type,$segment,$segment_sequence,$SN,$tag) = split(/\t/,$line); + $segments{$segment} = $segment_sequence; + } + elsif ($line =~/^P/ or $line =~/^W/){ + + my $path; + my $chrom; + my @segments; + if ($line =~/^W/){ + my ($type,$sample,$strand,$chromW,$start,$length,$pathW) = split(/\t/,$line); + $path = $pathW; + $chrom = $chromW; + @segments = split(/[\>\<]/,$path); + } + elsif ($line =~/^P/){ + my ($type,$strain_chrom,$pathP) = split(/\t/,$line); + my ($stra,$chromP) = split(/#/,$strain_chrom); + $path = $pathP; + $chrom = $chromP; + @segments = split(/,/,$path); + } + $chrom =~s/\.\d+$//g; + my @genes = split(/\|/,$coordinates{$chrom}); + if ($coordinates{$chrom}){ + my $end_segment = 0; + my $start_segment = 0; + my $nb_gene = 0; + + my %genes_located; + my %first_segment_of_gene; + print "$chrom: ". scalar @segments."\n"; + my $cumul = 0; + LOOP_SEGMENT: foreach my $segment(@segments){ + if ($segment eq ""){next;} + my $segment_including_strand = $segment; + $segment =~s/\+//g; + $segment =~s/\-//g; + my $size = length($segments{$segment}); + $start_segment = $end_segment+1; + $end_segment = $start_segment+$size-1; + $cumul+=$size; + my $current_gene; + + if ($segment_including_strand =~/\+/){ + my $pos_in_segment = 0; + for (my $k = $start_segment; $k <= $end_segment; $k++){ + $pos_in_segment++; + #print "$k\n"; + + if ($coding_positions{$chrom}{$k}){ + my $info_gene = $coding_positions{$chrom}{$k}; + chop($info_gene); + my @genes = split(/,/,$info_gene); + foreach my $gene(@genes){ + $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|"; + } + } + } + } + elsif ($segment_including_strand =~/\-/){ + my $pos_in_segment = $size+1; + for (my $k = $start_segment; $k <= $end_segment; $k++){ + $pos_in_segment--; + if ($coding_positions{$chrom}{$k}){ + my $info_gene = $coding_positions{$chrom}{$k}; + chop($info_gene); + my @genes = split(/,/,$info_gene); + foreach my $gene(@genes){ + $segments_of_gene{$gene} .= "s$segment:$pos_in_segment|"; + } + } + } + } + + } + print "$chrom $cumul $end_segment\n"; + } + } +} +close(GFA); + +open(O,">$out"); +open(BED,">$out.bed"); +foreach my $gene(keys(%segments_of_gene)){ + my $segments = $segments_of_gene{$gene}; + my @positions = split(/\|/,$segments); + my $previous_segment; + + #print O "$gene $segments\n"; + print O "$gene\t"; + + my $concat = ""; + foreach my $pos(@positions){ + + my ($segment,$pos_in_segment) = split(/:/,$pos); + if ($segment ne $previous_segment){ + $concat .= ">$segment:$pos_in_segment"; + } + $concat .= "-$pos_in_segment"; + $previous_segment = $segment; + } + my @segments = split(/>/,$concat); + foreach my $segment(@segments){ + if (!$segment){next;} + my ($segmentname,$pos) = split(/:/,$segment); + my @positions = split(/-/,$pos); + my $start = $positions[0]; + my $end = $positions[$#positions]; + if ($start > $end){ + $start = $positions[$#positions]; + $end = $positions[0]; + } + print O ">$segmentname:$start-$end"; + print BED "$segmentname $start $end $gene\n"; + } + print O "\n"; +} +close(O); +close(BED);