Mercurial > repos > dereeper > plink
changeset 0:fe39a4677281 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 05 Aug 2016 09:46:55 -0400 |
parents | |
children | a963ac8cb9cf |
files | Plink.pl find_indiv.py plink.sh plink.xml tool-data/tool_dependencies.xml |
diffstat | 5 files changed, 400 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Plink.pl Fri Aug 05 09:46:55 2016 -0400 @@ -0,0 +1,242 @@ + +#!/usr/bin/perl + +use strict; +use Getopt::Long; +use Bio::SeqIO; + +my $usage = qq~Usage:$0 <args> [<opts>] + +where <args> are: + + -i, --input <VCF input> + -o, --out <Output basename> + + <opts> are: + + -s, --samples <Samples to be analyzed. Comma separated list> + -c, --chromosomes <List of chromosomes to be analyzed.> + -e, --export <Output format (VCF/freq/plink. Default: VCF> + -f, --frequency <Minimum MAF. Default: 0.001> + -m, --max_freq <Maximum MAF. Default: 0.5> + -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 1> + -t, --type <Type of polymorphisms to keep (ALL/SNP). Default: ALL> + -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000> + -r, --remove_filt <Remove all sites with a FILTER flag other than PASS (true/false). Default: false> + -d, --distance <Thin sites so that no two sites are within the specified distance from one another. Default: 0> +~; +$usage .= "\n"; + +my ($input,$out); + +my $PLINK_EXE = "plink"; + + +#my $indel_size_max = 500; +#my $indel_size_min = 1; +my $frequency_max = 0.5; +my $frequency_min = 0.001; +my $pos_max = 100000000000; +my $pos_min = 0; +my $filter_snp_type = "all"; +my $remove_filt = "False"; + +my $missing_data = 1; +my $export = "VCF"; +my $type = "ALL"; +my $bounds; +my $samples; +my $chromosomes; +my $thin; + +GetOptions( + "input=s" => \$input, + "out=s" => \$out, + "samples=s" => \$samples, + "chromosomes=s" => \$chromosomes, + "frequency=s" => \$frequency_min, + "max_freq=s" => \$frequency_max, + "allow_missing=s"=> \$missing_data, + "export=s" => \$export, + "type=s" => \$type, + "bounds=s" => \$bounds, + "remove_filt=s" => \$remove_filt, + "distance=s" => \$thin +); + + +die $usage + if ( !$input || !$out); + +if ($samples && $samples =~/^([\w\,\-\.]+)\s*$/){ + $samples = $1; +} +elsif ($samples){ + die "Error: Samples must be a comma separated list of string\n"; +} +if ($bounds && $bounds =~/^([\d\,]+)\s*$/){ + $bounds = $1; +} +elsif($bounds){ + die "Error: Bounds must be a comma separated list of integers\n"; +} + +my $minfreq_cmd = ""; +if ($frequency_min && $frequency_min > 0 && $frequency_min =~/^([\d\.]+)\s*$/){ + $frequency_min = $1; + $minfreq_cmd = "--maf $frequency_min"; +} +elsif ($frequency_min == 0){ + $minfreq_cmd = ""; +} +elsif ($frequency_min){ + die "Error: frequency must be an integer\n"; +} +if ($thin && $thin =~/^([\d\.]+)\s*$/){ + $thin = $1; +} +elsif ($thin){ + die "Error: frequency must be an integer\n"; +} +my $maxfreq_cmd = ""; +if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){ + $frequency_max = $1; + if ($frequency_max < 0.5){ + $maxfreq_cmd = "--max-maf $frequency_max"; + } +} +elsif($frequency_max){ + die "Error: frequency must be an integer\n"; +} +if ($missing_data =~/^([\d\.]+)\s*$/){ + $missing_data = $1; + #$missing_data = 1 - $missing_data; +} +elsif ($missing_data){ + die "Error: Missing data must be an integer\n"; +} +if ($export && $export =~/^([\w]+)\s*$/){ + $export = $1; +} +elsif($export){ + die "Error: Export must be a string\n"; +} +if ($type && $type =~/^([\w]+)\s*$/){ + $type = $1; +} +elsif($type){ + die "Error: Type must be a string\n"; +} + + +my @dnasamples; +if ($samples) +{ + @dnasamples = split(",",$samples); +} +my @boundaries; +if ($bounds) +{ + @boundaries = split(",",$bounds); +} + + +my $experiment = "chromosomes"; +my $table = ""; +my %genes; +my @snp_ids; +my @snp_ids_and_positions; +my @snp_ids_and_positions_all; +my $gene; +my $snp_num = 0; +my %ref_sequences; +my %snps_of_gene; + +my $indiv_cmd = ""; +if (@dnasamples) +{ + if (scalar @dnasamples > 1) + { + open(my $S,">$out.samples"); + foreach my $samp(@dnasamples){ + print $S "$samp $samp\n"; + } + close($S); + $indiv_cmd = "--keep $out.samples "; + } + else + { + $indiv_cmd = "--indv " . join(" --indv ",@dnasamples); + } +} + +my $chrom_cmd = ""; +if ($chromosomes) +{ + $chrom_cmd = "--chr ".$chromosomes +} + +my $export_cmd = "--recode vcf-iid"; +if ($export eq "bcf"){ + $export_cmd = "--recode bcf"; +} +if ($export eq "freq"){ + $export_cmd = "--freq"; +} +if ($export eq "plink"){ + $export_cmd = "--make-bed"; +} +if ($export eq "bed"){ + $export_cmd = "--make-bed"; +} + + +my $bounds_cmd = ""; +if (@boundaries && $chrom_cmd=~/\w/ && $chrom_cmd !~/,/) +{ + $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]"; +} + + + +my $type_cmd = ""; +if ($type eq "SNP") +{ + $type_cmd = "--snps-only"; +} + +my $filt_cmd = ""; +if ($remove_filt eq "true") +{ + $filt_cmd = "--remove-filtered-all"; +} + +my $thin_cmd = ""; +if ($thin){ + $thin_cmd = "--bp-space $thin"; +} + +#my $bcf_input = $input; +#$bcf_input =~s/vcf/bcf/g; +my $bcf_input; +my $bed_input = $input; +$bed_input =~s/\.bed//g; + +if (-e "$bed_input.bed"){ + system("$PLINK_EXE --bfile $bed_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr"); + # for first 1000 SNPs + system("$PLINK_EXE --bfile $bed_input --out $out.recode $type_cmd --recode vcf-fid $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr --thin-count 800 1>$out.2.plink.stdout 2>$out.2.plink.stderr"); +} +elsif (-e $bcf_input){ + system("$PLINK_EXE --bcf $bcf_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr"); +} +else +{ + system("$PLINK_EXE --vcf $input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.3.plink.stdout 2>$out.3.plink.stderr"); + +} + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/find_indiv.py Fri Aug 05 09:46:55 2016 -0400 @@ -0,0 +1,20 @@ +import sys +import os +import re + +def get_field_samples_options(dataset): + options = [] + line=os.popen("grep '#CHROM' %s"%dataset.file_name).read()[:-1].split('\t') + index=line.index('FORMAT') + for opt in line[index+1:] : + options.append((opt,opt, True)) + return options + +def get_field_chrs_options(dataset): + options = [] + chrs=os.popen("grep '##contig' %s"%dataset.file_name).read()[:-1].split('\n') + for line in chrs: + opt=re.search('^##contig=<ID=(\w+),length=',line).group(1) + options.append((opt,opt, True)) + return options +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/plink.sh Fri Aug 05 09:46:55 2016 -0400 @@ -0,0 +1,49 @@ +#!/bin/bash + + +tool_path=$(dirname $0) + +filein=$1 +fileout_label=$(date "+%Y%m%d%H%M%S") +fileout=$2 +filelog=$3 +frequency=$4 +max_freq=$5 +allow_missing=$6 +type=${7} +bound_start=${8} +bound_end=${9} + +cp -rf $filein input$$.vcf + +if [ "${10}" != "None" ] +then samples="--samples ${10}" +fi + +if [ "${11}" != "None" ] +then chromosomes="--chromosomes ${11}" +fi + +if [ "$bound_start" -gt "$bound_end" ] +then tmp=$bound_start ; bound_start=$bound_end ; bound_end=$tmp ; echo "Warning : Lower bound must be lower than greater bound!" >&2 +fi + + +export="VCF" + +perl $tool_path/Plink.pl --input input$$.vcf --out $fileout_label --export $export --frequency $frequency --max_freq $max_freq --allow_missing $allow_missing --type $type --bounds $bound_start','$bound_end $samples $chromosomes + + +#echo ${16} >>$fileout_label.log +#echo ${15} >>$fileout_label.log +#echo ${17} >>$fileout_label.log +#echo ${18} >>$fileout_label.log + +if [ "$export" = "VCF" ] +then cp $fileout_label.vcf $fileout ; rm $fileout_label.vcf +else cp $fileout_label.bed $fileout; cp $fileout_label.bed ${15} ; cp $fileout_label.bim ${18} ;rm $fileout_label.bed $fileout_label.fam $fileout_label.bim +fi + +cp $fileout_label.log $filelog +rm $fileout_label.log +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/plink.xml Fri Aug 05 09:46:55 2016 -0400 @@ -0,0 +1,83 @@ +<tool id="sniplay_plink" name="plink" version="1.0"> + + <!-- [REQUIRED] Tool description displayed after the tool name --> + <description> - Filter large VCF file</description> + + <!-- [OPTIONAL] 3rd party tools, binaries, modules... required for the tool to work --> + <requirements> + <requirement type="binary">perl</requirement> + <requirement type="package" version="1.90">plink</requirement> + </requirements> + + + <!-- [STRONGLY RECOMMANDED] Exit code rules --> + <stdio> + <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR --> + <exit_code range="1:" level="fatal" /> + </stdio> + + <!-- [REQUIRED] The command to execute --> + <command interpreter="bash"> + ./plink.sh $vcf $fileout $filelog $frequency $max_freq $allow_missing $type_p $bound_start $bound_end + #if str( $samples ) == "": + 'None' + #else + $samples + #end if + #if str( $chromosomes ) == "": + 'None' + #else + $chromosomes + #end if + </command> + <code file="find_indiv.py"/> + <!-- [REQUIRED] Input files and tool parameters --> + <inputs> + <param name="vcf" type="data" format="vcf" optional="false" label="VCF input" /> + + <param name="samples" type="select" label="Samples" multiple="true" dynamic_options="get_field_samples_options(vcf)" help="Samples to be analyzed." /> + <!--<param name="samples" type="text" label="Samples" multiple="true" help="Samples to be analyzed." />--> + <!--<param name="chromosomes" type="select" label="Chromosomes" multiple="true" dynamic_options="get_field_chrs_options(input)" help="Chromosomes to be analyzed." />--> + <param name="frequency" type="float" value="0" label="Minimum MAF" help="Minimum Minor Allele Frequency (MAF)" /> + <param name="max_freq" type="float" value="0.5" label="Maximum MAF" help="Maximum Minor Allele Frequency (MAF)" /> + <param name="allow_missing" type="float" value="1" min="0" max="1" label="Missing data proportion" help="Allowed missing data proportion per site. Must be comprised between 0 and 1." /> + <param name="type_p" type="select" label="Polymorphisms" help="Type of polymorphisms to keep." > + <option value="ALL" selected="true">All</option> + <option value="SNP">SNPs only</option> + </param> + <param name="chromosomes" type="text" label="Chromosomes" multiple="true" help="Chromosomes to be analyzed. (Comma-separated list of reference sequences, ex: Chr1,Chr2)" /> + <param name="bound_start" type="integer" value="1" label="Lower bound" help="Lower bound for a range of sites to be processed." /> + <param name="bound_end" type="integer" value="100000000" label="Upper bound" help="Upper bound for a range of sites to be processed." /> + </inputs> + + <!-- [REQUIRED] Output files --> + <outputs> + <data name="fileout" format="vcf" label="Plink filtered VCF"/> + <data name="filelog" format="txt" label="Plink logfile" /> + </outputs> + + + <!-- [OPTIONAL] Help displayed in Galaxy --> + <help> + +.. class:: infomark + +**Authors** Shaun Purcell : plink_ + +.. _plink: https://www.cog-genomics.org/plink2 + + | **Please cite** "PLINK: a toolset for whole-genome association and population-based linkage analysis.", Purcell S, Neale B, Todd-Brown K, Thomas L, Ferreira MAR, Bender D, Maller J, Sklar P, de Bakker PIW, Daly MJ, Sham PC, **American Journal of Human Genetics**, 2007 + +.. class:: infomark + +**Galaxy integration** Dereeper Alexis (IRD), Andres Gwendoline (Institut Français de Bioinformatique). + +.. class:: infomark + +**Support** For any questions about Galaxy integration, please send an e-mail to support.abims@sb-roscoff.fr + + + + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/tool_dependencies.xml Fri Aug 05 09:46:55 2016 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="plink" version="1.90"> + <repository changeset_revision="86ff0807889e" name="package_plink_1_90" owner="dereeper" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>