Mercurial > repos > dereeper > plink
comparison Plink.pl @ 0:fe39a4677281 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 05 Aug 2016 09:46:55 -0400 |
parents | |
children | d6a7be1b5adb |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fe39a4677281 |
---|---|
1 | |
2 #!/usr/bin/perl | |
3 | |
4 use strict; | |
5 use Getopt::Long; | |
6 use Bio::SeqIO; | |
7 | |
8 my $usage = qq~Usage:$0 <args> [<opts>] | |
9 | |
10 where <args> are: | |
11 | |
12 -i, --input <VCF input> | |
13 -o, --out <Output basename> | |
14 | |
15 <opts> are: | |
16 | |
17 -s, --samples <Samples to be analyzed. Comma separated list> | |
18 -c, --chromosomes <List of chromosomes to be analyzed.> | |
19 -e, --export <Output format (VCF/freq/plink. Default: VCF> | |
20 -f, --frequency <Minimum MAF. Default: 0.001> | |
21 -m, --max_freq <Maximum MAF. Default: 0.5> | |
22 -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 1> | |
23 -t, --type <Type of polymorphisms to keep (ALL/SNP). Default: ALL> | |
24 -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000> | |
25 -r, --remove_filt <Remove all sites with a FILTER flag other than PASS (true/false). Default: false> | |
26 -d, --distance <Thin sites so that no two sites are within the specified distance from one another. Default: 0> | |
27 ~; | |
28 $usage .= "\n"; | |
29 | |
30 my ($input,$out); | |
31 | |
32 my $PLINK_EXE = "plink"; | |
33 | |
34 | |
35 #my $indel_size_max = 500; | |
36 #my $indel_size_min = 1; | |
37 my $frequency_max = 0.5; | |
38 my $frequency_min = 0.001; | |
39 my $pos_max = 100000000000; | |
40 my $pos_min = 0; | |
41 my $filter_snp_type = "all"; | |
42 my $remove_filt = "False"; | |
43 | |
44 my $missing_data = 1; | |
45 my $export = "VCF"; | |
46 my $type = "ALL"; | |
47 my $bounds; | |
48 my $samples; | |
49 my $chromosomes; | |
50 my $thin; | |
51 | |
52 GetOptions( | |
53 "input=s" => \$input, | |
54 "out=s" => \$out, | |
55 "samples=s" => \$samples, | |
56 "chromosomes=s" => \$chromosomes, | |
57 "frequency=s" => \$frequency_min, | |
58 "max_freq=s" => \$frequency_max, | |
59 "allow_missing=s"=> \$missing_data, | |
60 "export=s" => \$export, | |
61 "type=s" => \$type, | |
62 "bounds=s" => \$bounds, | |
63 "remove_filt=s" => \$remove_filt, | |
64 "distance=s" => \$thin | |
65 ); | |
66 | |
67 | |
68 die $usage | |
69 if ( !$input || !$out); | |
70 | |
71 if ($samples && $samples =~/^([\w\,\-\.]+)\s*$/){ | |
72 $samples = $1; | |
73 } | |
74 elsif ($samples){ | |
75 die "Error: Samples must be a comma separated list of string\n"; | |
76 } | |
77 if ($bounds && $bounds =~/^([\d\,]+)\s*$/){ | |
78 $bounds = $1; | |
79 } | |
80 elsif($bounds){ | |
81 die "Error: Bounds must be a comma separated list of integers\n"; | |
82 } | |
83 | |
84 my $minfreq_cmd = ""; | |
85 if ($frequency_min && $frequency_min > 0 && $frequency_min =~/^([\d\.]+)\s*$/){ | |
86 $frequency_min = $1; | |
87 $minfreq_cmd = "--maf $frequency_min"; | |
88 } | |
89 elsif ($frequency_min == 0){ | |
90 $minfreq_cmd = ""; | |
91 } | |
92 elsif ($frequency_min){ | |
93 die "Error: frequency must be an integer\n"; | |
94 } | |
95 if ($thin && $thin =~/^([\d\.]+)\s*$/){ | |
96 $thin = $1; | |
97 } | |
98 elsif ($thin){ | |
99 die "Error: frequency must be an integer\n"; | |
100 } | |
101 my $maxfreq_cmd = ""; | |
102 if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){ | |
103 $frequency_max = $1; | |
104 if ($frequency_max < 0.5){ | |
105 $maxfreq_cmd = "--max-maf $frequency_max"; | |
106 } | |
107 } | |
108 elsif($frequency_max){ | |
109 die "Error: frequency must be an integer\n"; | |
110 } | |
111 if ($missing_data =~/^([\d\.]+)\s*$/){ | |
112 $missing_data = $1; | |
113 #$missing_data = 1 - $missing_data; | |
114 } | |
115 elsif ($missing_data){ | |
116 die "Error: Missing data must be an integer\n"; | |
117 } | |
118 if ($export && $export =~/^([\w]+)\s*$/){ | |
119 $export = $1; | |
120 } | |
121 elsif($export){ | |
122 die "Error: Export must be a string\n"; | |
123 } | |
124 if ($type && $type =~/^([\w]+)\s*$/){ | |
125 $type = $1; | |
126 } | |
127 elsif($type){ | |
128 die "Error: Type must be a string\n"; | |
129 } | |
130 | |
131 | |
132 my @dnasamples; | |
133 if ($samples) | |
134 { | |
135 @dnasamples = split(",",$samples); | |
136 } | |
137 my @boundaries; | |
138 if ($bounds) | |
139 { | |
140 @boundaries = split(",",$bounds); | |
141 } | |
142 | |
143 | |
144 my $experiment = "chromosomes"; | |
145 my $table = ""; | |
146 my %genes; | |
147 my @snp_ids; | |
148 my @snp_ids_and_positions; | |
149 my @snp_ids_and_positions_all; | |
150 my $gene; | |
151 my $snp_num = 0; | |
152 my %ref_sequences; | |
153 my %snps_of_gene; | |
154 | |
155 my $indiv_cmd = ""; | |
156 if (@dnasamples) | |
157 { | |
158 if (scalar @dnasamples > 1) | |
159 { | |
160 open(my $S,">$out.samples"); | |
161 foreach my $samp(@dnasamples){ | |
162 print $S "$samp $samp\n"; | |
163 } | |
164 close($S); | |
165 $indiv_cmd = "--keep $out.samples "; | |
166 } | |
167 else | |
168 { | |
169 $indiv_cmd = "--indv " . join(" --indv ",@dnasamples); | |
170 } | |
171 } | |
172 | |
173 my $chrom_cmd = ""; | |
174 if ($chromosomes) | |
175 { | |
176 $chrom_cmd = "--chr ".$chromosomes | |
177 } | |
178 | |
179 my $export_cmd = "--recode vcf-iid"; | |
180 if ($export eq "bcf"){ | |
181 $export_cmd = "--recode bcf"; | |
182 } | |
183 if ($export eq "freq"){ | |
184 $export_cmd = "--freq"; | |
185 } | |
186 if ($export eq "plink"){ | |
187 $export_cmd = "--make-bed"; | |
188 } | |
189 if ($export eq "bed"){ | |
190 $export_cmd = "--make-bed"; | |
191 } | |
192 | |
193 | |
194 my $bounds_cmd = ""; | |
195 if (@boundaries && $chrom_cmd=~/\w/ && $chrom_cmd !~/,/) | |
196 { | |
197 $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]"; | |
198 } | |
199 | |
200 | |
201 | |
202 my $type_cmd = ""; | |
203 if ($type eq "SNP") | |
204 { | |
205 $type_cmd = "--snps-only"; | |
206 } | |
207 | |
208 my $filt_cmd = ""; | |
209 if ($remove_filt eq "true") | |
210 { | |
211 $filt_cmd = "--remove-filtered-all"; | |
212 } | |
213 | |
214 my $thin_cmd = ""; | |
215 if ($thin){ | |
216 $thin_cmd = "--bp-space $thin"; | |
217 } | |
218 | |
219 #my $bcf_input = $input; | |
220 #$bcf_input =~s/vcf/bcf/g; | |
221 my $bcf_input; | |
222 my $bed_input = $input; | |
223 $bed_input =~s/\.bed//g; | |
224 | |
225 if (-e "$bed_input.bed"){ | |
226 system("$PLINK_EXE --bfile $bed_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr"); | |
227 # for first 1000 SNPs | |
228 system("$PLINK_EXE --bfile $bed_input --out $out.recode $type_cmd --recode vcf-fid $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr --thin-count 800 1>$out.2.plink.stdout 2>$out.2.plink.stderr"); | |
229 } | |
230 elsif (-e $bcf_input){ | |
231 system("$PLINK_EXE --bcf $bcf_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr"); | |
232 } | |
233 else | |
234 { | |
235 system("$PLINK_EXE --vcf $input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.3.plink.stdout 2>$out.3.plink.stderr"); | |
236 | |
237 } | |
238 | |
239 | |
240 | |
241 | |
242 |