Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/Naegleria/mergeSNP.pl @ 1:032f6b3806a3 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:16:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3cbb01081cde | 1:032f6b3806a3 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 | |
5 my $nb_file_found = 0; | |
6 open(LS,"ls *Nfowleri.snp |"); | |
7 while(<LS>){ | |
8 $nb_file_found++; | |
9 } | |
10 close(LS); | |
11 if ($nb_file_found < 1){ | |
12 print "No file found with extension \".Nfowleri.snp\". This script allows to merge multiple SNP files (named as follows \"strainname.Nfowleri.snp\") obtained with VarScan into a global VCF file.\n"; | |
13 exit; | |
14 } | |
15 | |
16 | |
17 print "##fileformat=VCFv4.1\n"; | |
18 print "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT "; | |
19 | |
20 my %corr; | |
21 open(F,"/media/results_datacalcul/Alexis/Naegleria/raw/SRR.info.xls"); | |
22 while(<F>){ | |
23 my $line = $_; | |
24 $line =~s/\n//g;$line =~s/\r//g; | |
25 my ($srr,$name) = split(/\t/,$line); | |
26 #$corr{$srr} = $name; | |
27 } | |
28 close(F); | |
29 | |
30 my %str; | |
31 my @strains; | |
32 my %hash; | |
33 my %genotypes; | |
34 my %alternates; | |
35 my %reference_bases; | |
36 open(LS,"ls *Nfowleri.snp |"); | |
37 #open(LS,"ls *Nlovaniensis.snp |"); | |
38 #open(LS,"ls ../../tuberculosis/VCF_comparaison_avec_reference/*vcf |"); | |
39 while(<LS>){ | |
40 my $file = $_; | |
41 $file =~s/\n//g;$file =~s/\r//g; | |
42 my $strain; | |
43 if ($file =~/^(.*).Nfowleri.snp/){ | |
44 #if ($file =~/^(.*).Nlovaniensis.snp/){ | |
45 #if ($file =~/\/([^\/]*)\.vcf/){ | |
46 $strain = $1; | |
47 $strain =~s/_/-/g; | |
48 #if ($strain =~/SRR12781212/ or $strain =~/SRR12781213/ or $strain =~/SRR12781214/ or $strain =~/SRR12781215/ or $strain =~/SRR12781216/){next;} | |
49 #if ($strain !~/NF/){next;} | |
50 push(@strains,$strain); | |
51 $str{$strain} = 1; | |
52 #open(F,"$strain.Klebsiella_pneumoniae_subsp._pneumoniae_HS11286.snp"); | |
53 open(F,$file); | |
54 <F>; | |
55 while(<F>){ | |
56 my $line = $_; | |
57 $line =~s/\n//g;$line =~s/\r//g; | |
58 if (/^#/){next;} | |
59 my ($chr,$pos,$ref,$alt,$info) = split("\t",$line); | |
60 my ($Cons,$Cov,$Reads1,$Reads2,$Freq) = split(":",$info); | |
61 $Freq =~s/%//g;$Freq =~s/,/./g; | |
62 $chr =~s/\.1//g; | |
63 $alternates{$chr}{$pos}{$alt}=1; | |
64 $reference_bases{$chr}{$pos} = $ref; | |
65 if ($Cov > 6 && $Freq > 70){ | |
66 $hash{$chr}{$pos}{$strain} = "1/1"; | |
67 } | |
68 elsif ($Cov > 6 && $Freq > 34){ | |
69 $hash{$chr}{$pos}{$strain} = "0/1"; | |
70 } | |
71 else{ | |
72 #$hash{$chr}{$pos}{$strain} = "0/0"; | |
73 } | |
74 } | |
75 close(F); | |
76 } | |
77 } | |
78 close(LS); | |
79 | |
80 my %depths; | |
81 open(LS,"ls *.depth |"); | |
82 while(<LS>){ | |
83 my $file = $_; | |
84 $file =~s/\n//g;$file =~s/\r//g; | |
85 my $strain; | |
86 if ($file =~/^(.*).Nfowleri.depth/){ | |
87 #if ($file =~/^(.*).Nlovaniensis.depth/){ | |
88 $strain = $1; | |
89 $strain =~s/_/-/g; | |
90 if (!$str{$strain}){next;} | |
91 #if ($strain !~/NF/){next;} | |
92 open(F,$file); | |
93 while(<F>){ | |
94 my $line = $_; | |
95 $line =~s/\n//g;$line =~s/\r//g; | |
96 my ($chr,$pos,$depth) = split("\t",$line); | |
97 $chr =~s/\.1//g; | |
98 if ($hash{$chr}{$pos}){ | |
99 $depths{$chr}{$pos}{$strain}=1; | |
100 } | |
101 } | |
102 close(F); | |
103 } | |
104 } | |
105 close(LS); | |
106 | |
107 foreach my $strain(@strains){ | |
108 if ($corr{$strain}){ | |
109 $strain= $corr{$strain}; | |
110 } | |
111 print "$strain\t"; | |
112 } | |
113 print "Reference\n"; | |
114 foreach my $chr(sort {$a<=>$b} keys(%hash)){ | |
115 my $refsubhash = $hash{$chr}; | |
116 my %subhash2 = %$refsubhash; | |
117 foreach my $pos(sort {$a<=>$b} keys(%subhash2)){ | |
118 my $refbase = $reference_bases{$chr}{$pos}; | |
119 my $ref_alt = $alternates{$chr}{$pos}; | |
120 my %subhash_alt = %$ref_alt; | |
121 my $alternate; | |
122 | |
123 # discard multiallelic | |
124 if (scalar keys(%subhash_alt) > 1){ | |
125 next; | |
126 } | |
127 else{ | |
128 foreach my $alt(keys(%subhash_alt)){ | |
129 $alternate = $alt; | |
130 } | |
131 } | |
132 | |
133 print "$chr $pos . $refbase $alternate 186.96 . PASS GT:AD:DP:GQ:PL"; | |
134 foreach my $strain(@strains){ | |
135 my $val = $hash{$chr}{$pos}{$strain}; | |
136 if ($val){ | |
137 print " $val"; | |
138 } | |
139 else{ | |
140 if ($depths{$chr}{$pos}{$strain}){ | |
141 print " 0/0"; | |
142 } | |
143 else{ | |
144 print " ./."; | |
145 } | |
146 } | |
147 | |
148 } | |
149 print " 0/0\n"; | |
150 } | |
151 } | |
152 | |
153 |