annotate PanExplorer_workflow/Perl/Naegleria/mergeSNP.pl @ 1:032f6b3806a3 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:16:08 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
2
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
3 use strict;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
4
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
5 my $nb_file_found = 0;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
6 open(LS,"ls *Nfowleri.snp |");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
7 while(<LS>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
8 $nb_file_found++;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
9 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
10 close(LS);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
11 if ($nb_file_found < 1){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
12 print "No file found with extension \".Nfowleri.snp\". This script allows to merge multiple SNP files (named as follows \"strainname.Nfowleri.snp\") obtained with VarScan into a global VCF file.\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
13 exit;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
14 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
15
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
16
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
17 print "##fileformat=VCFv4.1\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
18 print "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
19
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
20 my %corr;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
21 open(F,"/media/results_datacalcul/Alexis/Naegleria/raw/SRR.info.xls");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
22 while(<F>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
23 my $line = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
24 $line =~s/\n//g;$line =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
25 my ($srr,$name) = split(/\t/,$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
26 #$corr{$srr} = $name;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
27 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
28 close(F);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
29
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
30 my %str;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
31 my @strains;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
32 my %hash;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
33 my %genotypes;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
34 my %alternates;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
35 my %reference_bases;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
36 open(LS,"ls *Nfowleri.snp |");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
37 #open(LS,"ls *Nlovaniensis.snp |");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
38 #open(LS,"ls ../../tuberculosis/VCF_comparaison_avec_reference/*vcf |");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
39 while(<LS>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
40 my $file = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
41 $file =~s/\n//g;$file =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
42 my $strain;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
43 if ($file =~/^(.*).Nfowleri.snp/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
44 #if ($file =~/^(.*).Nlovaniensis.snp/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
45 #if ($file =~/\/([^\/]*)\.vcf/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
46 $strain = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
47 $strain =~s/_/-/g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
48 #if ($strain =~/SRR12781212/ or $strain =~/SRR12781213/ or $strain =~/SRR12781214/ or $strain =~/SRR12781215/ or $strain =~/SRR12781216/){next;}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
49 #if ($strain !~/NF/){next;}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
50 push(@strains,$strain);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
51 $str{$strain} = 1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
52 #open(F,"$strain.Klebsiella_pneumoniae_subsp._pneumoniae_HS11286.snp");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
53 open(F,$file);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
54 <F>;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
55 while(<F>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
56 my $line = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
57 $line =~s/\n//g;$line =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
58 if (/^#/){next;}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
59 my ($chr,$pos,$ref,$alt,$info) = split("\t",$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
60 my ($Cons,$Cov,$Reads1,$Reads2,$Freq) = split(":",$info);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
61 $Freq =~s/%//g;$Freq =~s/,/./g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
62 $chr =~s/\.1//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
63 $alternates{$chr}{$pos}{$alt}=1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
64 $reference_bases{$chr}{$pos} = $ref;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
65 if ($Cov > 6 && $Freq > 70){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
66 $hash{$chr}{$pos}{$strain} = "1/1";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
67 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
68 elsif ($Cov > 6 && $Freq > 34){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
69 $hash{$chr}{$pos}{$strain} = "0/1";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
70 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
71 else{
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
72 #$hash{$chr}{$pos}{$strain} = "0/0";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
73 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
74 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
75 close(F);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
76 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
77 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
78 close(LS);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
79
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
80 my %depths;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
81 open(LS,"ls *.depth |");
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
82 while(<LS>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
83 my $file = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
84 $file =~s/\n//g;$file =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
85 my $strain;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
86 if ($file =~/^(.*).Nfowleri.depth/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
87 #if ($file =~/^(.*).Nlovaniensis.depth/){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
88 $strain = $1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
89 $strain =~s/_/-/g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
90 if (!$str{$strain}){next;}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
91 #if ($strain !~/NF/){next;}
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
92 open(F,$file);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
93 while(<F>){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
94 my $line = $_;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
95 $line =~s/\n//g;$line =~s/\r//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
96 my ($chr,$pos,$depth) = split("\t",$line);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
97 $chr =~s/\.1//g;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
98 if ($hash{$chr}{$pos}){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
99 $depths{$chr}{$pos}{$strain}=1;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
100 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
101 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
102 close(F);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
103 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
104 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
105 close(LS);
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
106
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
107 foreach my $strain(@strains){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
108 if ($corr{$strain}){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
109 $strain= $corr{$strain};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
110 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
111 print "$strain\t";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
112 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
113 print "Reference\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
114 foreach my $chr(sort {$a<=>$b} keys(%hash)){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
115 my $refsubhash = $hash{$chr};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
116 my %subhash2 = %$refsubhash;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
117 foreach my $pos(sort {$a<=>$b} keys(%subhash2)){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
118 my $refbase = $reference_bases{$chr}{$pos};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
119 my $ref_alt = $alternates{$chr}{$pos};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
120 my %subhash_alt = %$ref_alt;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
121 my $alternate;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
122
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
123 # discard multiallelic
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
124 if (scalar keys(%subhash_alt) > 1){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
125 next;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
126 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
127 else{
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
128 foreach my $alt(keys(%subhash_alt)){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
129 $alternate = $alt;
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
130 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
131 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
132
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
133 print "$chr $pos . $refbase $alternate 186.96 . PASS GT:AD:DP:GQ:PL";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
134 foreach my $strain(@strains){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
135 my $val = $hash{$chr}{$pos}{$strain};
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
136 if ($val){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
137 print " $val";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
138 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
139 else{
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
140 if ($depths{$chr}{$pos}{$strain}){
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
141 print " 0/0";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
142 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
143 else{
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
144 print " ./.";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
145 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
146 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
147
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
148 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
149 print " 0/0\n";
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
150 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
151 }
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
152
032f6b3806a3 Uploaded
dereeper
parents:
diff changeset
153