Mercurial > repos > dereeper > plink
changeset 3:2bc9b3ee5eea draft
Uploaded
author | dereeper |
---|---|
date | Thu, 02 Nov 2017 05:41:34 -0400 |
parents | 013ff9bb23aa |
children | 6d1122b57344 |
files | Plink.pl find_indiv.py plink.sh plink.xml tool-data/tool_dependencies.xml |
diffstat | 5 files changed, 0 insertions(+), 400 deletions(-) [+] |
line wrap: on
line diff
--- a/Plink.pl Fri Aug 05 10:26:44 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,242 +0,0 @@ - -#!/usr/bin/perl - -use strict; -use Getopt::Long; -use Bio::SeqIO; - -my $usage = qq~Usage:$0 <args> [<opts>] - -where <args> are: - - -i, --input <VCF input> - -o, --out <Output basename> - - <opts> are: - - -s, --samples <Samples to be analyzed. Comma separated list> - -c, --chromosomes <List of chromosomes to be analyzed.> - -e, --export <Output format (VCF/freq/plink. Default: VCF> - -f, --frequency <Minimum MAF. Default: 0.001> - -m, --max_freq <Maximum MAF. Default: 0.5> - -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 1> - -t, --type <Type of polymorphisms to keep (ALL/SNP). Default: ALL> - -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000> - -r, --remove_filt <Remove all sites with a FILTER flag other than PASS (true/false). Default: false> - -d, --distance <Thin sites so that no two sites are within the specified distance from one another. Default: 0> -~; -$usage .= "\n"; - -my ($input,$out); - -my $PLINK_EXE = "plink"; - - -#my $indel_size_max = 500; -#my $indel_size_min = 1; -my $frequency_max = 0.5; -my $frequency_min = 0.001; -my $pos_max = 100000000000; -my $pos_min = 0; -my $filter_snp_type = "all"; -my $remove_filt = "False"; - -my $missing_data = 1; -my $export = "VCF"; -my $type = "ALL"; -my $bounds; -my $samples; -my $chromosomes; -my $thin; - -GetOptions( - "input=s" => \$input, - "out=s" => \$out, - "samples=s" => \$samples, - "chromosomes=s" => \$chromosomes, - "frequency=s" => \$frequency_min, - "max_freq=s" => \$frequency_max, - "allow_missing=s"=> \$missing_data, - "export=s" => \$export, - "type=s" => \$type, - "bounds=s" => \$bounds, - "remove_filt=s" => \$remove_filt, - "distance=s" => \$thin -); - - -die $usage - if ( !$input || !$out); - -if ($samples && $samples =~/^([\w\,\-\.]+)\s*$/){ - $samples = $1; -} -elsif ($samples){ - die "Error: Samples must be a comma separated list of string\n"; -} -if ($bounds && $bounds =~/^([\d\,]+)\s*$/){ - $bounds = $1; -} -elsif($bounds){ - die "Error: Bounds must be a comma separated list of integers\n"; -} - -my $minfreq_cmd = ""; -if ($frequency_min && $frequency_min > 0 && $frequency_min =~/^([\d\.]+)\s*$/){ - $frequency_min = $1; - $minfreq_cmd = "--maf $frequency_min"; -} -elsif ($frequency_min == 0){ - $minfreq_cmd = ""; -} -elsif ($frequency_min){ - die "Error: frequency must be an integer\n"; -} -if ($thin && $thin =~/^([\d\.]+)\s*$/){ - $thin = $1; -} -elsif ($thin){ - die "Error: frequency must be an integer\n"; -} -my $maxfreq_cmd = ""; -if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){ - $frequency_max = $1; - if ($frequency_max < 0.5){ - $maxfreq_cmd = "--max-maf $frequency_max"; - } -} -elsif($frequency_max){ - die "Error: frequency must be an integer\n"; -} -if ($missing_data =~/^([\d\.]+)\s*$/){ - $missing_data = $1; - #$missing_data = 1 - $missing_data; -} -elsif ($missing_data){ - die "Error: Missing data must be an integer\n"; -} -if ($export && $export =~/^([\w]+)\s*$/){ - $export = $1; -} -elsif($export){ - die "Error: Export must be a string\n"; -} -if ($type && $type =~/^([\w]+)\s*$/){ - $type = $1; -} -elsif($type){ - die "Error: Type must be a string\n"; -} - - -my @dnasamples; -if ($samples) -{ - @dnasamples = split(",",$samples); -} -my @boundaries; -if ($bounds) -{ - @boundaries = split(",",$bounds); -} - - -my $experiment = "chromosomes"; -my $table = ""; -my %genes; -my @snp_ids; -my @snp_ids_and_positions; -my @snp_ids_and_positions_all; -my $gene; -my $snp_num = 0; -my %ref_sequences; -my %snps_of_gene; - -my $indiv_cmd = ""; -if (@dnasamples) -{ - if (scalar @dnasamples > 1) - { - open(my $S,">$out.samples"); - foreach my $samp(@dnasamples){ - print $S "$samp $samp\n"; - } - close($S); - $indiv_cmd = "--keep $out.samples "; - } - else - { - $indiv_cmd = "--indv " . join(" --indv ",@dnasamples); - } -} - -my $chrom_cmd = ""; -if ($chromosomes) -{ - $chrom_cmd = "--chr ".$chromosomes -} - -my $export_cmd = "--recode vcf-iid"; -if ($export eq "bcf"){ - $export_cmd = "--recode bcf"; -} -if ($export eq "freq"){ - $export_cmd = "--freq"; -} -if ($export eq "plink"){ - $export_cmd = "--make-bed"; -} -if ($export eq "bed"){ - $export_cmd = "--make-bed"; -} - - -my $bounds_cmd = ""; -if (@boundaries && $chrom_cmd=~/\w/ && $chrom_cmd !~/,/) -{ - $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]"; -} - - - -my $type_cmd = ""; -if ($type eq "SNP") -{ - $type_cmd = "--snps-only"; -} - -my $filt_cmd = ""; -if ($remove_filt eq "true") -{ - $filt_cmd = "--remove-filtered-all"; -} - -my $thin_cmd = ""; -if ($thin){ - $thin_cmd = "--bp-space $thin"; -} - -#my $bcf_input = $input; -#$bcf_input =~s/vcf/bcf/g; -my $bcf_input; -my $bed_input = $input; -$bed_input =~s/\.bed//g; - -if (-e "$bed_input.bed"){ - system("$PLINK_EXE --bfile $bed_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr"); - # for first 1000 SNPs - system("$PLINK_EXE --bfile $bed_input --out $out.recode $type_cmd --recode vcf-fid $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr --thin-count 800 1>$out.2.plink.stdout 2>$out.2.plink.stderr"); -} -elsif (-e $bcf_input){ - system("$PLINK_EXE --bcf $bcf_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr"); -} -else -{ - system("$PLINK_EXE --vcf $input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.3.plink.stdout 2>$out.3.plink.stderr"); - -} - - - - -
--- a/find_indiv.py Fri Aug 05 10:26:44 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ -import sys -import os -import re - -def get_field_samples_options(dataset): - options = [] - line=os.popen("grep '#CHROM' %s"%dataset.file_name).read()[:-1].split('\t') - index=line.index('FORMAT') - for opt in line[index+1:] : - options.append((opt,opt, True)) - return options - -def get_field_chrs_options(dataset): - options = [] - chrs=os.popen("grep '##contig' %s"%dataset.file_name).read()[:-1].split('\n') - for line in chrs: - opt=re.search('^##contig=<ID=(\w+),length=',line).group(1) - options.append((opt,opt, True)) - return options -
--- a/plink.sh Fri Aug 05 10:26:44 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -#!/bin/bash - - -tool_path=$(dirname $0) - -filein=$1 -fileout_label=$(date "+%Y%m%d%H%M%S") -fileout=$2 -filelog=$3 -frequency=$4 -max_freq=$5 -allow_missing=$6 -type=${7} -bound_start=${8} -bound_end=${9} - -cp -rf $filein input$$.vcf - -if [ "${10}" != "None" ] -then samples="--samples ${10}" -fi - -if [ "${11}" != "None" ] -then chromosomes="--chromosomes ${11}" -fi - -if [ "$bound_start" -gt "$bound_end" ] -then tmp=$bound_start ; bound_start=$bound_end ; bound_end=$tmp ; echo "Warning : Lower bound must be lower than greater bound!" >&2 -fi - - -export="VCF" - -perl $tool_path/Plink.pl --input input$$.vcf --out $fileout_label --export $export --frequency $frequency --max_freq $max_freq --allow_missing $allow_missing --type $type --bounds $bound_start','$bound_end $samples $chromosomes - - -#echo ${16} >>$fileout_label.log -#echo ${15} >>$fileout_label.log -#echo ${17} >>$fileout_label.log -#echo ${18} >>$fileout_label.log - -if [ "$export" = "VCF" ] -then cp $fileout_label.vcf $fileout ; rm $fileout_label.vcf -else cp $fileout_label.bed $fileout; cp $fileout_label.bed ${15} ; cp $fileout_label.bim ${18} ;rm $fileout_label.bed $fileout_label.fam $fileout_label.bim -fi - -cp $fileout_label.log $filelog -rm $fileout_label.log -
--- a/plink.xml Fri Aug 05 10:26:44 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -<tool id="sniplay_plink" name="plink" version="1.0"> - - <!-- [REQUIRED] Tool description displayed after the tool name --> - <description> - Filter large VCF file</description> - - <!-- [OPTIONAL] 3rd party tools, binaries, modules... required for the tool to work --> - <requirements> - <requirement type="binary">perl</requirement> - <requirement type="package" version="1.90">plink</requirement> - </requirements> - - - <!-- [STRONGLY RECOMMANDED] Exit code rules --> - <stdio> - <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR --> - <exit_code range="1:" level="fatal" /> - </stdio> - - <!-- [REQUIRED] The command to execute --> - <command interpreter="bash"> - ./plink.sh $vcf $fileout $filelog $frequency $max_freq $allow_missing $type_p $bound_start $bound_end - #if str( $samples ) == "": - 'None' - #else - $samples - #end if - #if str( $chromosomes ) == "": - 'None' - #else - $chromosomes - #end if - </command> - <code file="find_indiv.py"/> - <!-- [REQUIRED] Input files and tool parameters --> - <inputs> - <param name="vcf" type="data" format="vcf" optional="false" label="VCF input" /> - - <param name="samples" type="select" label="Samples" multiple="true" dynamic_options="get_field_samples_options(vcf)" help="Samples to be analyzed." /> - <!--<param name="samples" type="text" label="Samples" multiple="true" help="Samples to be analyzed." />--> - <!--<param name="chromosomes" type="select" label="Chromosomes" multiple="true" dynamic_options="get_field_chrs_options(input)" help="Chromosomes to be analyzed." />--> - <param name="frequency" type="float" value="0" label="Minimum MAF" help="Minimum Minor Allele Frequency (MAF)" /> - <param name="max_freq" type="float" value="0.5" label="Maximum MAF" help="Maximum Minor Allele Frequency (MAF)" /> - <param name="allow_missing" type="float" value="1" min="0" max="1" label="Missing data proportion" help="Allowed missing data proportion per site. Must be comprised between 0 and 1." /> - <param name="type_p" type="select" label="Polymorphisms" help="Type of polymorphisms to keep." > - <option value="ALL" selected="true">All</option> - <option value="SNP">SNPs only</option> - </param> - <param name="chromosomes" type="text" label="Chromosomes" multiple="true" help="Chromosomes to be analyzed. (Comma-separated list of reference sequences, ex: Chr1,Chr2)" /> - <param name="bound_start" type="integer" value="1" label="Lower bound" help="Lower bound for a range of sites to be processed." /> - <param name="bound_end" type="integer" value="100000000" label="Upper bound" help="Upper bound for a range of sites to be processed." /> - </inputs> - - <!-- [REQUIRED] Output files --> - <outputs> - <data name="fileout" format="vcf" label="Plink filtered VCF"/> - <data name="filelog" format="txt" label="Plink logfile" /> - </outputs> - - - <!-- [OPTIONAL] Help displayed in Galaxy --> - <help> - -.. class:: infomark - -**Authors** Shaun Purcell : plink_ - -.. _plink: https://www.cog-genomics.org/plink2 - - | **Please cite** "PLINK: a toolset for whole-genome association and population-based linkage analysis.", Purcell S, Neale B, Todd-Brown K, Thomas L, Ferreira MAR, Bender D, Maller J, Sklar P, de Bakker PIW, Daly MJ, Sham PC, **American Journal of Human Genetics**, 2007 - -.. class:: infomark - -**Galaxy integration** Dereeper Alexis (IRD), Andres Gwendoline (Institut Français de Bioinformatique). - -.. class:: infomark - -**Support** For any questions about Galaxy integration, please send an e-mail to support.abims@sb-roscoff.fr - - - - </help> - -</tool>
--- a/tool-data/tool_dependencies.xml Fri Aug 05 10:26:44 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="plink" version="1.90"> - <repository changeset_revision="98e3a3cf0a35" name="package_plink_1_90" owner="dereeper" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>