annotate VCFToolFilter/VCFToolsFilter.pl @ 10:2b7eb79f0ba0 draft

planemo upload
author gandres
date Wed, 13 Apr 2016 06:49:39 -0400
parents 612066e3f57d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
1
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
2 #!/usr/bin/perl
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
3
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
4 use strict;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
5 use Getopt::Long;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
6 use Bio::SeqIO;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
7
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
8 my $usage = qq~Usage:$0 <args> [<opts>]
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
9
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
10 where <args> are:
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
11
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
12 -i, --input <VCF input>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
13 -o, --out <Output basename>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
14
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
15 <opts> are:
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
16
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
17 -s, --samples <Samples to be analyzed. Comma separated list>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
18 -c, --chromosomes <Chromosomes to be analyzed. Comma separated list>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
19 -e, --export <Output format (VCF/freq/plink. Default: VCF>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
20 -f, --frequency <Minimum MAF. Default: 0.001>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
21 -m, --max_freq <Maximum MAF. Default: 0.5>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
22 -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 1>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
23 -n, --nb_alleles <Accepted number of alleles (min,max). Default: 2,4>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
24 -t, --type <Type of polymorphisms to keep (ALL/SNP/INDEL). Default: ALL>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
25 -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000>
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
26 ~;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
27 $usage .= "\n";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
28
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
29 my ($input,$out);
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
30
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
31
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
32 #my $indel_size_max = 500;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
33 #my $indel_size_min = 1;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
34 my $frequency_max = 0.5;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
35 my $frequency_min = 0.001;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
36 my $pos_max = 100000000000;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
37 my $pos_min = 0;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
38 my $filter_snp_type = "all";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
39
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
40 my $missing_data = 1;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
41 my $export = "VCF";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
42 my $type = "ALL";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
43 my $nb_alleles;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
44 my $bounds;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
45 my $samples;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
46 my $chromosomes;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
47
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
48 GetOptions(
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
49 "input=s" => \$input,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
50 "out=s" => \$out,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
51 "samples=s" => \$samples,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
52 "chromosomes=s" => \$chromosomes,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
53 "frequency=s" => \$frequency_min,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
54 "max_freq=s" => \$frequency_max,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
55 "allow_missing=s"=> \$missing_data,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
56 "export=s" => \$export,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
57 "type=s" => \$type,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
58 "nb_alleles=s" => \$nb_alleles,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
59 "bounds=s" => \$bounds,
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
60 );
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
61
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
62
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
63 die $usage
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
64 if ( !$input || !$out);
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
65
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
66
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
67 my @dnasamples;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
68 if ($samples)
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
69 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
70 @dnasamples = split(",",$samples);
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
71 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
72 my @nalleles;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
73 if ($nb_alleles)
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
74 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
75 @nalleles = split(",",$nb_alleles);
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
76 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
77 my @boundaries;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
78 if ($bounds)
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
79 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
80 @boundaries = split(",",$bounds);
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
81 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
82 my @chromosomes_list;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
83 if ($chromosomes)
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
84 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
85 @chromosomes_list = split(",",$chromosomes);
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
86 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
87
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
88
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
89 my $experiment = "chromosomes";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
90 my $table = "";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
91 my %genes;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
92 my @snp_ids;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
93 my @snp_ids_and_positions;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
94 my @snp_ids_and_positions_all;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
95 my $gene;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
96 my $snp_num = 0;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
97 my %ref_sequences;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
98 my %snps_of_gene;
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
99
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
100
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
101
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
102
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
103 my $indiv_cmd = "";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
104 if (@dnasamples)
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
105 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
106 $indiv_cmd = "--indv " . join(" --indv ",@dnasamples);
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
107 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
108
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
109 my $chrom_cmd = "";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
110 if (@chromosomes_list)
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
111 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
112 $chrom_cmd = "--chr " . join(" --chr ",@chromosomes_list);
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
113 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
114
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
115 my $export_cmd = "--recode";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
116 if ($export eq "freq")
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
117 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
118 $export_cmd = "--freq";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
119 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
120 if ($export eq "plink")
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
121 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
122 $export_cmd = "--plink";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
123 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
124
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
125
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
126
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
127 my $nb_alleles_cmd = "--min-alleles 1 --max-alleles 4";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
128 if (@nalleles)
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
129 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
130 $nb_alleles_cmd = "--min-alleles $nalleles[0] --max-alleles $nalleles[1]";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
131 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
132 my $bounds_cmd = "--from-bp 1 --to-bp 100000000";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
133 if (@boundaries)
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
134 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
135 $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
136 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
137
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
138
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
139 my $type_cmd = "";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
140 if ($type eq "INDEL")
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
141 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
142 $type_cmd = "--keep-only-indels";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
143 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
144 if ($type eq "SNP")
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
145 {
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
146 $type_cmd = "--remove-indels";
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
147 }
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
148
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
149
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
150 system("vcftools --vcf $input --out $out --keep-INFO-all --remove-filtered-all $type_cmd $export_cmd $chrom_cmd $indiv_cmd $nb_alleles_cmd --maf $frequency_min --max-maf $frequency_max --max-missing $missing_data >>vcftools.log 2>&1");
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
151
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
152
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
153
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
154
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
155
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
156
ac7c9e40d601 Uploaded
gandres
parents:
diff changeset
157