comparison PanExplorer_workflow/Perl/Naegleria/mergeSNP.pl @ 1:032f6b3806a3 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:16:08 +0000
parents
children
comparison
equal deleted inserted replaced
0:3cbb01081cde 1:032f6b3806a3
1 #!/usr/bin/perl
2
3 use strict;
4
5 my $nb_file_found = 0;
6 open(LS,"ls *Nfowleri.snp |");
7 while(<LS>){
8 $nb_file_found++;
9 }
10 close(LS);
11 if ($nb_file_found < 1){
12 print "No file found with extension \".Nfowleri.snp\". This script allows to merge multiple SNP files (named as follows \"strainname.Nfowleri.snp\") obtained with VarScan into a global VCF file.\n";
13 exit;
14 }
15
16
17 print "##fileformat=VCFv4.1\n";
18 print "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ";
19
20 my %corr;
21 open(F,"/media/results_datacalcul/Alexis/Naegleria/raw/SRR.info.xls");
22 while(<F>){
23 my $line = $_;
24 $line =~s/\n//g;$line =~s/\r//g;
25 my ($srr,$name) = split(/\t/,$line);
26 #$corr{$srr} = $name;
27 }
28 close(F);
29
30 my %str;
31 my @strains;
32 my %hash;
33 my %genotypes;
34 my %alternates;
35 my %reference_bases;
36 open(LS,"ls *Nfowleri.snp |");
37 #open(LS,"ls *Nlovaniensis.snp |");
38 #open(LS,"ls ../../tuberculosis/VCF_comparaison_avec_reference/*vcf |");
39 while(<LS>){
40 my $file = $_;
41 $file =~s/\n//g;$file =~s/\r//g;
42 my $strain;
43 if ($file =~/^(.*).Nfowleri.snp/){
44 #if ($file =~/^(.*).Nlovaniensis.snp/){
45 #if ($file =~/\/([^\/]*)\.vcf/){
46 $strain = $1;
47 $strain =~s/_/-/g;
48 #if ($strain =~/SRR12781212/ or $strain =~/SRR12781213/ or $strain =~/SRR12781214/ or $strain =~/SRR12781215/ or $strain =~/SRR12781216/){next;}
49 #if ($strain !~/NF/){next;}
50 push(@strains,$strain);
51 $str{$strain} = 1;
52 #open(F,"$strain.Klebsiella_pneumoniae_subsp._pneumoniae_HS11286.snp");
53 open(F,$file);
54 <F>;
55 while(<F>){
56 my $line = $_;
57 $line =~s/\n//g;$line =~s/\r//g;
58 if (/^#/){next;}
59 my ($chr,$pos,$ref,$alt,$info) = split("\t",$line);
60 my ($Cons,$Cov,$Reads1,$Reads2,$Freq) = split(":",$info);
61 $Freq =~s/%//g;$Freq =~s/,/./g;
62 $chr =~s/\.1//g;
63 $alternates{$chr}{$pos}{$alt}=1;
64 $reference_bases{$chr}{$pos} = $ref;
65 if ($Cov > 6 && $Freq > 70){
66 $hash{$chr}{$pos}{$strain} = "1/1";
67 }
68 elsif ($Cov > 6 && $Freq > 34){
69 $hash{$chr}{$pos}{$strain} = "0/1";
70 }
71 else{
72 #$hash{$chr}{$pos}{$strain} = "0/0";
73 }
74 }
75 close(F);
76 }
77 }
78 close(LS);
79
80 my %depths;
81 open(LS,"ls *.depth |");
82 while(<LS>){
83 my $file = $_;
84 $file =~s/\n//g;$file =~s/\r//g;
85 my $strain;
86 if ($file =~/^(.*).Nfowleri.depth/){
87 #if ($file =~/^(.*).Nlovaniensis.depth/){
88 $strain = $1;
89 $strain =~s/_/-/g;
90 if (!$str{$strain}){next;}
91 #if ($strain !~/NF/){next;}
92 open(F,$file);
93 while(<F>){
94 my $line = $_;
95 $line =~s/\n//g;$line =~s/\r//g;
96 my ($chr,$pos,$depth) = split("\t",$line);
97 $chr =~s/\.1//g;
98 if ($hash{$chr}{$pos}){
99 $depths{$chr}{$pos}{$strain}=1;
100 }
101 }
102 close(F);
103 }
104 }
105 close(LS);
106
107 foreach my $strain(@strains){
108 if ($corr{$strain}){
109 $strain= $corr{$strain};
110 }
111 print "$strain\t";
112 }
113 print "Reference\n";
114 foreach my $chr(sort {$a<=>$b} keys(%hash)){
115 my $refsubhash = $hash{$chr};
116 my %subhash2 = %$refsubhash;
117 foreach my $pos(sort {$a<=>$b} keys(%subhash2)){
118 my $refbase = $reference_bases{$chr}{$pos};
119 my $ref_alt = $alternates{$chr}{$pos};
120 my %subhash_alt = %$ref_alt;
121 my $alternate;
122
123 # discard multiallelic
124 if (scalar keys(%subhash_alt) > 1){
125 next;
126 }
127 else{
128 foreach my $alt(keys(%subhash_alt)){
129 $alternate = $alt;
130 }
131 }
132
133 print "$chr $pos . $refbase $alternate 186.96 . PASS GT:AD:DP:GQ:PL";
134 foreach my $strain(@strains){
135 my $val = $hash{$chr}{$pos}{$strain};
136 if ($val){
137 print " $val";
138 }
139 else{
140 if ($depths{$chr}{$pos}{$strain}){
141 print " 0/0";
142 }
143 else{
144 print " ./.";
145 }
146 }
147
148 }
149 print " 0/0\n";
150 }
151 }
152
153