Mercurial > repos > dereeper > pangenome_explorer
diff PanExplorer_workflow/Perl/Naegleria/mergeSNP.pl @ 1:032f6b3806a3 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:16:08 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PanExplorer_workflow/Perl/Naegleria/mergeSNP.pl Thu May 30 11:16:08 2024 +0000 @@ -0,0 +1,153 @@ +#!/usr/bin/perl + +use strict; + +my $nb_file_found = 0; +open(LS,"ls *Nfowleri.snp |"); +while(<LS>){ + $nb_file_found++; +} +close(LS); +if ($nb_file_found < 1){ + print "No file found with extension \".Nfowleri.snp\". This script allows to merge multiple SNP files (named as follows \"strainname.Nfowleri.snp\") obtained with VarScan into a global VCF file.\n"; + exit; +} + + +print "##fileformat=VCFv4.1\n"; +print "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT "; + +my %corr; +open(F,"/media/results_datacalcul/Alexis/Naegleria/raw/SRR.info.xls"); +while(<F>){ + my $line = $_; + $line =~s/\n//g;$line =~s/\r//g; + my ($srr,$name) = split(/\t/,$line); + #$corr{$srr} = $name; +} +close(F); + +my %str; +my @strains; +my %hash; +my %genotypes; +my %alternates; +my %reference_bases; +open(LS,"ls *Nfowleri.snp |"); +#open(LS,"ls *Nlovaniensis.snp |"); +#open(LS,"ls ../../tuberculosis/VCF_comparaison_avec_reference/*vcf |"); +while(<LS>){ + my $file = $_; + $file =~s/\n//g;$file =~s/\r//g; + my $strain; + if ($file =~/^(.*).Nfowleri.snp/){ + #if ($file =~/^(.*).Nlovaniensis.snp/){ + #if ($file =~/\/([^\/]*)\.vcf/){ + $strain = $1; + $strain =~s/_/-/g; + #if ($strain =~/SRR12781212/ or $strain =~/SRR12781213/ or $strain =~/SRR12781214/ or $strain =~/SRR12781215/ or $strain =~/SRR12781216/){next;} + #if ($strain !~/NF/){next;} + push(@strains,$strain); + $str{$strain} = 1; + #open(F,"$strain.Klebsiella_pneumoniae_subsp._pneumoniae_HS11286.snp"); + open(F,$file); + <F>; + while(<F>){ + my $line = $_; + $line =~s/\n//g;$line =~s/\r//g; + if (/^#/){next;} + my ($chr,$pos,$ref,$alt,$info) = split("\t",$line); + my ($Cons,$Cov,$Reads1,$Reads2,$Freq) = split(":",$info); + $Freq =~s/%//g;$Freq =~s/,/./g; + $chr =~s/\.1//g; + $alternates{$chr}{$pos}{$alt}=1; + $reference_bases{$chr}{$pos} = $ref; + if ($Cov > 6 && $Freq > 70){ + $hash{$chr}{$pos}{$strain} = "1/1"; + } + elsif ($Cov > 6 && $Freq > 34){ + $hash{$chr}{$pos}{$strain} = "0/1"; + } + else{ + #$hash{$chr}{$pos}{$strain} = "0/0"; + } + } + close(F); + } +} +close(LS); + +my %depths; +open(LS,"ls *.depth |"); +while(<LS>){ + my $file = $_; + $file =~s/\n//g;$file =~s/\r//g; + my $strain; + if ($file =~/^(.*).Nfowleri.depth/){ + #if ($file =~/^(.*).Nlovaniensis.depth/){ + $strain = $1; + $strain =~s/_/-/g; + if (!$str{$strain}){next;} + #if ($strain !~/NF/){next;} + open(F,$file); + while(<F>){ + my $line = $_; + $line =~s/\n//g;$line =~s/\r//g; + my ($chr,$pos,$depth) = split("\t",$line); + $chr =~s/\.1//g; + if ($hash{$chr}{$pos}){ + $depths{$chr}{$pos}{$strain}=1; + } + } + close(F); + } +} +close(LS); + +foreach my $strain(@strains){ + if ($corr{$strain}){ + $strain= $corr{$strain}; + } + print "$strain\t"; +} +print "Reference\n"; +foreach my $chr(sort {$a<=>$b} keys(%hash)){ +my $refsubhash = $hash{$chr}; +my %subhash2 = %$refsubhash; +foreach my $pos(sort {$a<=>$b} keys(%subhash2)){ + my $refbase = $reference_bases{$chr}{$pos}; + my $ref_alt = $alternates{$chr}{$pos}; + my %subhash_alt = %$ref_alt; + my $alternate; + + # discard multiallelic + if (scalar keys(%subhash_alt) > 1){ + next; + } + else{ + foreach my $alt(keys(%subhash_alt)){ + $alternate = $alt; + } + } + + print "$chr $pos . $refbase $alternate 186.96 . PASS GT:AD:DP:GQ:PL"; + foreach my $strain(@strains){ + my $val = $hash{$chr}{$pos}{$strain}; + if ($val){ + print " $val"; + } + else{ + if ($depths{$chr}{$pos}{$strain}){ + print " 0/0"; + } + else{ + print " ./."; + } + } + + } + print " 0/0\n"; +} +} + +