comparison Plink.pl @ 4:6d1122b57344 draft

Uploaded
author dereeper
date Thu, 02 Nov 2017 05:42:47 -0400
parents fe39a4677281
children d6a7be1b5adb
comparison
equal deleted inserted replaced
3:2bc9b3ee5eea 4:6d1122b57344
1
2 #!/usr/bin/perl
3
4 use strict;
5 use Getopt::Long;
6 use Bio::SeqIO;
7
8 my $usage = qq~Usage:$0 <args> [<opts>]
9
10 where <args> are:
11
12 -i, --input <VCF input>
13 -o, --out <Output basename>
14
15 <opts> are:
16
17 -s, --samples <Samples to be analyzed. Comma separated list>
18 -c, --chromosomes <List of chromosomes to be analyzed.>
19 -e, --export <Output format (VCF/freq/plink. Default: VCF>
20 -f, --frequency <Minimum MAF. Default: 0.001>
21 -m, --max_freq <Maximum MAF. Default: 0.5>
22 -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 1>
23 -t, --type <Type of polymorphisms to keep (ALL/SNP). Default: ALL>
24 -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000>
25 -r, --remove_filt <Remove all sites with a FILTER flag other than PASS (true/false). Default: false>
26 -d, --distance <Thin sites so that no two sites are within the specified distance from one another. Default: 0>
27 ~;
28 $usage .= "\n";
29
30 my ($input,$out);
31
32 my $PLINK_EXE = "plink";
33
34
35 #my $indel_size_max = 500;
36 #my $indel_size_min = 1;
37 my $frequency_max = 0.5;
38 my $frequency_min = 0.001;
39 my $pos_max = 100000000000;
40 my $pos_min = 0;
41 my $filter_snp_type = "all";
42 my $remove_filt = "False";
43
44 my $missing_data = 1;
45 my $export = "VCF";
46 my $type = "ALL";
47 my $bounds;
48 my $samples;
49 my $chromosomes;
50 my $thin;
51
52 GetOptions(
53 "input=s" => \$input,
54 "out=s" => \$out,
55 "samples=s" => \$samples,
56 "chromosomes=s" => \$chromosomes,
57 "frequency=s" => \$frequency_min,
58 "max_freq=s" => \$frequency_max,
59 "allow_missing=s"=> \$missing_data,
60 "export=s" => \$export,
61 "type=s" => \$type,
62 "bounds=s" => \$bounds,
63 "remove_filt=s" => \$remove_filt,
64 "distance=s" => \$thin
65 );
66
67
68 die $usage
69 if ( !$input || !$out);
70
71 if ($samples && $samples =~/^([\w\,\-\.]+)\s*$/){
72 $samples = $1;
73 }
74 elsif ($samples){
75 die "Error: Samples must be a comma separated list of string\n";
76 }
77 if ($bounds && $bounds =~/^([\d\,]+)\s*$/){
78 $bounds = $1;
79 }
80 elsif($bounds){
81 die "Error: Bounds must be a comma separated list of integers\n";
82 }
83
84 my $minfreq_cmd = "";
85 if ($frequency_min && $frequency_min > 0 && $frequency_min =~/^([\d\.]+)\s*$/){
86 $frequency_min = $1;
87 $minfreq_cmd = "--maf $frequency_min";
88 }
89 elsif ($frequency_min == 0){
90 $minfreq_cmd = "";
91 }
92 elsif ($frequency_min){
93 die "Error: frequency must be an integer\n";
94 }
95 if ($thin && $thin =~/^([\d\.]+)\s*$/){
96 $thin = $1;
97 }
98 elsif ($thin){
99 die "Error: frequency must be an integer\n";
100 }
101 my $maxfreq_cmd = "";
102 if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){
103 $frequency_max = $1;
104 if ($frequency_max < 0.5){
105 $maxfreq_cmd = "--max-maf $frequency_max";
106 }
107 }
108 elsif($frequency_max){
109 die "Error: frequency must be an integer\n";
110 }
111 if ($missing_data =~/^([\d\.]+)\s*$/){
112 $missing_data = $1;
113 #$missing_data = 1 - $missing_data;
114 }
115 elsif ($missing_data){
116 die "Error: Missing data must be an integer\n";
117 }
118 if ($export && $export =~/^([\w]+)\s*$/){
119 $export = $1;
120 }
121 elsif($export){
122 die "Error: Export must be a string\n";
123 }
124 if ($type && $type =~/^([\w]+)\s*$/){
125 $type = $1;
126 }
127 elsif($type){
128 die "Error: Type must be a string\n";
129 }
130
131
132 my @dnasamples;
133 if ($samples)
134 {
135 @dnasamples = split(",",$samples);
136 }
137 my @boundaries;
138 if ($bounds)
139 {
140 @boundaries = split(",",$bounds);
141 }
142
143
144 my $experiment = "chromosomes";
145 my $table = "";
146 my %genes;
147 my @snp_ids;
148 my @snp_ids_and_positions;
149 my @snp_ids_and_positions_all;
150 my $gene;
151 my $snp_num = 0;
152 my %ref_sequences;
153 my %snps_of_gene;
154
155 my $indiv_cmd = "";
156 if (@dnasamples)
157 {
158 if (scalar @dnasamples > 1)
159 {
160 open(my $S,">$out.samples");
161 foreach my $samp(@dnasamples){
162 print $S "$samp $samp\n";
163 }
164 close($S);
165 $indiv_cmd = "--keep $out.samples ";
166 }
167 else
168 {
169 $indiv_cmd = "--indv " . join(" --indv ",@dnasamples);
170 }
171 }
172
173 my $chrom_cmd = "";
174 if ($chromosomes)
175 {
176 $chrom_cmd = "--chr ".$chromosomes
177 }
178
179 my $export_cmd = "--recode vcf-iid";
180 if ($export eq "bcf"){
181 $export_cmd = "--recode bcf";
182 }
183 if ($export eq "freq"){
184 $export_cmd = "--freq";
185 }
186 if ($export eq "plink"){
187 $export_cmd = "--make-bed";
188 }
189 if ($export eq "bed"){
190 $export_cmd = "--make-bed";
191 }
192
193
194 my $bounds_cmd = "";
195 if (@boundaries && $chrom_cmd=~/\w/ && $chrom_cmd !~/,/)
196 {
197 $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]";
198 }
199
200
201
202 my $type_cmd = "";
203 if ($type eq "SNP")
204 {
205 $type_cmd = "--snps-only";
206 }
207
208 my $filt_cmd = "";
209 if ($remove_filt eq "true")
210 {
211 $filt_cmd = "--remove-filtered-all";
212 }
213
214 my $thin_cmd = "";
215 if ($thin){
216 $thin_cmd = "--bp-space $thin";
217 }
218
219 #my $bcf_input = $input;
220 #$bcf_input =~s/vcf/bcf/g;
221 my $bcf_input;
222 my $bed_input = $input;
223 $bed_input =~s/\.bed//g;
224
225 if (-e "$bed_input.bed"){
226 system("$PLINK_EXE --bfile $bed_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr");
227 # for first 1000 SNPs
228 system("$PLINK_EXE --bfile $bed_input --out $out.recode $type_cmd --recode vcf-fid $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr --thin-count 800 1>$out.2.plink.stdout 2>$out.2.plink.stderr");
229 }
230 elsif (-e $bcf_input){
231 system("$PLINK_EXE --bcf $bcf_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr");
232 }
233 else
234 {
235 system("$PLINK_EXE --vcf $input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.3.plink.stdout 2>$out.3.plink.stderr");
236
237 }
238
239
240
241
242