annotate Perl/Naegleria/mergeSNP.pl @ 4:53df1177ff97 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:54:27 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 my $nb_file_found = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6 open(LS,"ls *Nfowleri.snp |");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 while(<LS>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8 $nb_file_found++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10 close(LS);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 if ($nb_file_found < 1){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12 print "No file found with extension \".Nfowleri.snp\". This script allows to merge multiple SNP files (named as follows \"strainname.Nfowleri.snp\") obtained with VarScan into a global VCF file.\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 exit;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 print "##fileformat=VCFv4.1\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 print "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 my %corr;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 open(F,"/media/results_datacalcul/Alexis/Naegleria/raw/SRR.info.xls");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 while(<F>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 my ($srr,$name) = split(/\t/,$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 #$corr{$srr} = $name;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 close(F);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 my %str;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 my @strains;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 my %hash;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 my %genotypes;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 my %alternates;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 my %reference_bases;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 open(LS,"ls *Nfowleri.snp |");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 #open(LS,"ls *Nlovaniensis.snp |");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 #open(LS,"ls ../../tuberculosis/VCF_comparaison_avec_reference/*vcf |");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 while(<LS>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 my $file = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 $file =~s/\n//g;$file =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 my $strain;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 if ($file =~/^(.*).Nfowleri.snp/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 #if ($file =~/^(.*).Nlovaniensis.snp/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 #if ($file =~/\/([^\/]*)\.vcf/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 $strain = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 $strain =~s/_/-/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 #if ($strain =~/SRR12781212/ or $strain =~/SRR12781213/ or $strain =~/SRR12781214/ or $strain =~/SRR12781215/ or $strain =~/SRR12781216/){next;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 #if ($strain !~/NF/){next;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 push(@strains,$strain);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 $str{$strain} = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 #open(F,"$strain.Klebsiella_pneumoniae_subsp._pneumoniae_HS11286.snp");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 open(F,$file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 <F>;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 while(<F>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 if (/^#/){next;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 my ($chr,$pos,$ref,$alt,$info) = split("\t",$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 my ($Cons,$Cov,$Reads1,$Reads2,$Freq) = split(":",$info);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 $Freq =~s/%//g;$Freq =~s/,/./g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 $chr =~s/\.1//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 $alternates{$chr}{$pos}{$alt}=1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 $reference_bases{$chr}{$pos} = $ref;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 if ($Cov > 6 && $Freq > 70){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 $hash{$chr}{$pos}{$strain} = "1/1";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 elsif ($Cov > 6 && $Freq > 34){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 $hash{$chr}{$pos}{$strain} = "0/1";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 #$hash{$chr}{$pos}{$strain} = "0/0";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 close(F);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 close(LS);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 my %depths;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 open(LS,"ls *.depth |");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 while(<LS>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 my $file = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84 $file =~s/\n//g;$file =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 my $strain;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 if ($file =~/^(.*).Nfowleri.depth/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 #if ($file =~/^(.*).Nlovaniensis.depth/){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 $strain = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 $strain =~s/_/-/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 if (!$str{$strain}){next;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 #if ($strain !~/NF/){next;}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 open(F,$file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 while(<F>){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 my $line = $_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 $line =~s/\n//g;$line =~s/\r//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 my ($chr,$pos,$depth) = split("\t",$line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 $chr =~s/\.1//g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 if ($hash{$chr}{$pos}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 $depths{$chr}{$pos}{$strain}=1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 close(F);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 close(LS);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 foreach my $strain(@strains){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 if ($corr{$strain}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 $strain= $corr{$strain};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 print "$strain\t";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 print "Reference\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 foreach my $chr(sort {$a<=>$b} keys(%hash)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 my $refsubhash = $hash{$chr};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116 my %subhash2 = %$refsubhash;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 foreach my $pos(sort {$a<=>$b} keys(%subhash2)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118 my $refbase = $reference_bases{$chr}{$pos};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 my $ref_alt = $alternates{$chr}{$pos};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 my %subhash_alt = %$ref_alt;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 my $alternate;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 # discard multiallelic
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 if (scalar keys(%subhash_alt) > 1){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 foreach my $alt(keys(%subhash_alt)){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 $alternate = $alt;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 print "$chr $pos . $refbase $alternate 186.96 . PASS GT:AD:DP:GQ:PL";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 foreach my $strain(@strains){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 my $val = $hash{$chr}{$pos}{$strain};
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 if ($val){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 print " $val";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 if ($depths{$chr}{$pos}{$strain}){
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 print " 0/0";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 else{
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 print " ./.";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 print " 0/0\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153