annotate VCFToolFilter/VCFToolsFilter.pl @ 7:f11604d28633 draft

Uploaded
author dereeper
date Fri, 20 Feb 2015 10:57:52 -0500
parents 8d2d0b6c3521
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
1
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
2 #!/usr/bin/perl
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
3
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
4 use strict;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
5 use Switch;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
6 use Getopt::Long;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
7 use Bio::SeqIO;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
8
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
9 my $usage = qq~Usage:$0 <args> [<opts>]
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
10
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
11 where <args> are:
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
12
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
13 -i, --input <VCF input>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
14 -o, --out <Output basename>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
15
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
16 <opts> are:
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
17
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
18 -s, --samples <Samples to be analyzed. Comma separated list>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
19 -c, --chromosomes <Chromosomes to be analyzed. Comma separated list>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
20 -e, --export <Output format (VCF/freq/plink. Default: VCF>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
21 -f, --frequency <Minimum MAF. Default: 0.001>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
22 -m, --max_freq <Maximum MAF. Default: 0.5>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
23 -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 0>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
24 -n, --nb_alleles <Accepted number of alleles (min,max). Default: 2,4>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
25 -t, --type <Type of polymorphisms to keep (ALL/SNP/INDEL). Default: ALL>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
26 -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000>
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
27 ~;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
28 $usage .= "\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
29
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
30 my ($input,$out);
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
31
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
32
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
33 #my $indel_size_max = 500;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
34 #my $indel_size_min = 1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
35 my $frequency_max = 0.5;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
36 my $frequency_min = 0.001;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
37 my $pos_max = 100000000000;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
38 my $pos_min = 0;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
39 my $filter_snp_type = "all";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
40
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
41 my $missing_data = 0;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
42 my $export = "VCF";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
43 my $type = "ALL";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
44 my $nb_alleles;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
45 my $bounds;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
46 my $samples;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
47 my $chromosomes;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
48
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
49 GetOptions(
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
50 "input=s" => \$input,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
51 "out=s" => \$out,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
52 "samples=s" => \$samples,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
53 "chromosomes=s" => \$chromosomes,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
54 "frequency=s" => \$frequency_min,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
55 "max_freq=s" => \$frequency_max,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
56 "allow_missing=s"=> \$missing_data,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
57 "export=s" => \$export,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
58 "type=s" => \$type,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
59 "nb_alleles=s" => \$nb_alleles,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
60 "bounds=s" => \$bounds,
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
61 );
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
62
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
63
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
64 die $usage
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
65 if ( !$input || !$out);
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
66
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
67 if ($samples && $samples =~/^([\w\,]+)\s*$/){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
68 $samples = $1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
69 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
70 elsif ($samples){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
71 die "Error: Samples must be a comma separated list of string\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
72 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
73 if ($chromosomes && $chromosomes =~/^([\w\,]+)\s*$/){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
74 $chromosomes = $1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
75 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
76 elsif($chromosomes){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
77 die "Error: Chromosomes must be a comma separated list of string\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
78 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
79 if ($bounds && $bounds =~/^([\d\,]+)\s*$/){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
80 $bounds = $1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
81 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
82 elsif($bounds){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
83 die "Error: Bounds must be a comma separated list of integers\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
84 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
85
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
86 if ($frequency_min && $frequency_min =~/^([\d\.]+)\s*$/){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
87 $frequency_min = $1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
88 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
89 elsif ($frequency_min){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
90 die "Error: frequency must be an integer\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
91 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
92 if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
93 $frequency_max = $1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
94 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
95 elsif($frequency_max){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
96 die "Error: frequency must be an integer\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
97 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
98 if ($missing_data && $missing_data =~/^([\d\.]+)\s*$/){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
99 $missing_data = $1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
100 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
101 elsif ($missing_data){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
102 die "Error: Missing data must be an integer\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
103 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
104 if ($nb_alleles && $nb_alleles =~/^([\d\.]+)\s*$/){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
105 $nb_alleles = $1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
106 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
107 elsif($nb_alleles){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
108 die "Error: Nb alleles must be an integer\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
109 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
110 if ($export && $export =~/^([\w]+)\s*$/){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
111 $export = $1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
112 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
113 elsif($export){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
114 die "Error: Export must be a string\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
115 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
116 if ($type && $type =~/^([\w]+)\s*$/){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
117 $type = $1;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
118 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
119 elsif($type){
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
120 die "Error: Type must be a string\n";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
121 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
122
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
123
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
124 my @dnasamples;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
125 if ($samples)
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
126 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
127 @dnasamples = split(",",$samples);
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
128 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
129 my @nalleles;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
130 if ($nb_alleles)
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
131 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
132 @nalleles = split(",",$nb_alleles);
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
133 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
134 my @boundaries;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
135 if ($bounds)
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
136 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
137 @boundaries = split(",",$bounds);
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
138 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
139 my @chromosomes_list;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
140 if ($chromosomes)
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
141 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
142 @chromosomes_list = split(",",$chromosomes);
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
143 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
144
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
145
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
146 my $experiment = "chromosomes";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
147 my $table = "";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
148 my %genes;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
149 my @snp_ids;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
150 my @snp_ids_and_positions;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
151 my @snp_ids_and_positions_all;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
152 my $gene;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
153 my $snp_num = 0;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
154 my %ref_sequences;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
155 my %snps_of_gene;
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
156
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
157
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
158
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
159
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
160 my $indiv_cmd = "";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
161 if (@dnasamples)
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
162 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
163 $indiv_cmd = "--indv " . join(" --indv ",@dnasamples);
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
164 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
165
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
166 my $chrom_cmd = "";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
167 if (@chromosomes_list)
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
168 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
169 $chrom_cmd = "--chr " . join(" --chr ",@chromosomes_list);
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
170 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
171
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
172 my $export_cmd = "--recode";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
173 if ($export eq "freq")
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
174 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
175 $export_cmd = "--freq";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
176 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
177 if ($export eq "plink")
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
178 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
179 $export_cmd = "--plink";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
180 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
181
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
182
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
183
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
184 my $nb_alleles_cmd = "--min-alleles 1 --max-alleles 4";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
185 if (@nalleles)
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
186 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
187 $nb_alleles_cmd = "--min-alleles $nalleles[0] --max-alleles $nalleles[1]";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
188 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
189 my $bounds_cmd = "--from-bp 1 --to-bp 100000000";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
190 if (@boundaries)
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
191 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
192 $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
193 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
194
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
195
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
196 my $type_cmd = "";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
197 if ($type eq "INDEL")
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
198 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
199 $type_cmd = "--keep-only-indels";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
200 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
201 if ($type eq "SNP")
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
202 {
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
203 $type_cmd = "--remove-indels";
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
204 }
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
205
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
206
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
207 system("vcftools --vcf $input --out $out --keep-INFO-all --remove-filtered-all $type_cmd $export_cmd $chrom_cmd $indiv_cmd $nb_alleles_cmd --maf $frequency_min --max-maf $frequency_max --max-missing $missing_data");
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
208
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
209
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
210
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
211
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
212
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
213
8d2d0b6c3521 Uploaded
dereeper
parents:
diff changeset
214