changeset 0:a1034918ab9b draft

Uploaded
author fcaramia
date Thu, 20 Jun 2013 00:03:08 -0400
parents
children 7fb20f2d156c
files all_fasta.loc.sample joint_snv_mix.pl joint_snv_mix.xml jsm_to_vcf.pl jsm_to_vcf.xml tool_data_table_conf.xml.sample tool_dependencies.xml
diffstat 7 files changed, 549 insertions(+), 0 deletions(-)
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/all_fasta.loc.sample	Thu Jun 20 00:03:08 2013 -0400
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3     apiMel3   Honeybee (Apis mellifera): apiMel3     /path/to/genome/apiMel3/apiMel3.fa
+#hg19canon   hg19      Human (Homo sapiens): hg19 Canonical   /path/to/genome/hg19/hg19canon.fa
+#hg19full    hg19      Human (Homo sapiens): hg19 Full        /path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/joint_snv_mix.pl	Thu Jun 20 00:03:08 2013 -0400
@@ -0,0 +1,75 @@
+
+use strict;
+use warnings;
+use File::Basename; 
+use Cwd;
+use File::Path qw(make_path remove_tree);
+die qq(
+Bad number of inputs
+
+) if(!@ARGV);
+
+my $bam_normal;
+my $bam_tumor;
+my $player_options = "";
+my $output;
+my $action;
+my $ref_genome;
+
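+#Arguments arrive from the Galaxy wrapper (joint_snv_mix.xml) as KEY::value pairs,
+#e.g. "BAMNORMAL::/path/to/normal.bam" or "OPTION::--max_iters 1000".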
+foreach my $input (@ARGV) 
+{
+	my @tmp = split "::", $input;
+	if($tmp[0] eq "BAMNORMAL") 
+	{
+		$bam_normal = $tmp[1];
+	} 
+	elsif($tmp[0] eq "BAMTUMOR") 
+	{
+		$bam_tumor = $tmp[1];
+	} 
+	elsif($tmp[0] eq "OPTION") 
+	{
+		$player_options = "$player_options ${tmp[1]}";
+	}
+	elsif($tmp[0] eq "REFGENOME") 
+	{
+		$ref_genome = $tmp[1];
+	}  
+	elsif($tmp[0] eq "ACTION") 
+	{
+		$action = $tmp[1];
+	}  
+	elsif($tmp[0] eq "OUTPUT") 
+	{
+		$output = $tmp[1];
+	}  
+
+	else 
+	{
+		die("Unknown Input: $input\n");
+	}
+}
+
+
+#Create Symbolic links and indexes
+
+my $working_dir = cwd();
+
+system ("ln -s $bam_normal $working_dir/normal.bam");
+system ("samtools index $working_dir/normal.bam");
+ 
+system ("ln -s $bam_tumor $working_dir/tumor.bam");
+system ("samtools index $working_dir/tumor.bam");
+    	
+
+#run jsm 
+
+if($action eq "classify")
+{
+	system ("jsm.py $action $player_options $ref_genome $working_dir/normal.bam $working_dir/tumor.bam");
+}
+else
+{
+	system ("jsm.py $action $player_options $ref_genome $working_dir/normal.bam $working_dir/tumor.bam $output");
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/joint_snv_mix.xml	Thu Jun 20 00:03:08 2013 -0400
@@ -0,0 +1,277 @@
+<tool id="joint_snv_mix" name="Joint SNV Mix" version="0.7.5">
+  <description>classify germline and somatic mutations</description>
+  <requirements>
+  	<requirement type="package" version="2.7">python</requirement>
+  	<requirement type="package" version="0.19.1">cython</requirement>
+  	<requirement type="package" version="0.5">pysam</requirement>
+  	<requirement type="package" version="0.1.18">samtools</requirement>
+  	<requirement type="package" version="0.7.5">jointsnvmix</requirement>
+  </requirements>
+  <command interpreter="perl">
+  
+  	joint_snv_mix.pl
+  	
+  	"ACTION::${option.option}" 
+  	
+  	"REFGENOME::$refFile.fields.path" 
+  	"BAMNORMAL::$normal_file"
+  	"BAMTUMOR::$tumor_file"
+  	
+  	
+  	#if str($option.option) == "classify":
+  		#if ($option.parameters):
+		  	"OPTION::--parameters_file $option.parameters"  		
+	  	#end if
+  		"OPTION::--out_file $output"
+  		"OPTION::--somatic_threshold $option.somatic_threshold"
+  		                              
+  	#end if
+  	 
+  	#if str($option.option) == "train":
+  		#if ($option.priors):
+  			"OPTION::--priors_file $option.priors" 
+  		#end if
+  		"OUTPUT::$output" 
+  		"OPTION::--convergence_threshold $option.convergence_threshold"
+  		"OPTION::--max_iters $option.max_iters"
+  		
+  	#end if
+  	#if ($positions_file):
+		"OPTION::--positions_file $positions_file" 
+	#end if
+	
+	"OPTION::--min_base_qual $min_base_quality"
+	"OPTION::--min_map_qual $min_map_quality"
+	"OPTION::--model $model"
+	#if ($chromosome):
+		"OPTION::--chromosome $chromosome" 
+	#end if
+	
+	
+  	
+  </command>
+  <inputs>
+  	<param name="refFile" type="select" label="Select a reference genome" optional="false">
+		<options from_data_table="all_fasta">
+			<filter type="sort_by" column="2" />
+			<validator type="no_options" message="No indexes are available" />
+		</options>
+	</param>
+	<param name="normal_file" type="data" format="bam" label="Normal Sample " help="Bam" />
+  	<param name="tumor_file" type="data" format="bam" label="Tumor Sample" help="Bam" />	
+	<param name="model" type="select" label="Model" help="" optional="true">
+			<option value="binomial">binomial</option>
+			<option value="snvmix2" selected="true">snvmix2</option>
+			<option value="beta_binomial">beta binomial</option>
+	</param>
+	<param name="positions_file" type="data" format="txt" label="Positions file" help="Filter positions" optional="true"/>	
+	<param name="min_map_quality" type="text" label="Min map quality" help="Filter reads" value="0"/>	
+  	<param name="min_base_quality" type="text" label="Min base quality" help="Filter reads" value="0"/>
+  	<param name="chromosome" type="text" label="Chromosome" help="a chromosome to analyse, leave blank for all"/>
+
+
+  	<conditional name="option">
+		<param name="option" type="select" label="Action" help="" optional="true">
+			<option value="train" selected="true">Train</option>
+			<option value="classify">Classify</option>
+		</param>
+	
+		<when value="train">
+			
+			<param name="priors" type="data" format="txt" label="Prior Probabilities" optional="true"/>
+			<param name="initial_parameters" type="data" format="txt" label="Initial Parameters" optional="true"/>
+			<param name="convergence_threshold" type="text" label="Convergence Threshold"  value="1e-6"/>
+			<param name="max_iters" type="text" label="Max number of training iterations"  value="1000"/>
+			
+		</when>
+		<when value="classify">
+			
+			<param name="parameters" type="data" format="txt" label="Classify Parameters" help="" optional="true" />
+			<param name="somatic_threshold" type="text" label="Somatic Threshold" help="filter by probability" value="0.0"/>
+		</when>
+	
+	</conditional>
+	
+	
+  </inputs>
+  <outputs>
+	<data type="data" format="txt" name="output" label="${tool.name} result on ${on_string}"/>
+  </outputs>
+  	
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+::
+
+  JointSNVMix implements a probabilistic graphical model to analyse sequence data 
+  from tumour/normal pairs. The model draws statistical strength by analysing both 
+  genomes jointly to more accurately classify germline and somatic mutations. 
+
+
+  Train
+
+  The SnvMix models are complete generative models of the data, so the model 
+  parameters can be learned using the Expectation Maximisation (EM) algorithm. 
+  The train command allows this to be done.
+
+  All methods require that a file with the parameters for the prior densities 
+  and an initial set of parameters be passed in. Templates for these files can 
+  be found in the config/ directory which ships with the package. If you are 
+  unsure about setting the priors or parameter values, these files should suffice.
+
+  The train command will produce a parameters file suitable for use with the 
+  classification command. Training is highly recommended to achieve optimal 
+  performance when using a SnvMix based model.
+
+  To reduce memory consumption, all subcommands of train take an optional --skip-size flag. 
+  This is the number of positions to skip over before sampling a position for the training set. 
+  Smaller values will lead to larger training sets which will require more memory, 
+  but should yield better parameter estimates.
+
+  All subcommands of train also take optional parameters for the minimum depth 
+  a position must have in the tumour and normal samples to be used for training. 
+  Higher depth sites should give more robust estimates of the parameters. 
+  The default values are likely fine. 
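+
+  As a rough sketch (not the exact invocation), the Train action of this wrapper 
+  assembles a command along these lines; file names here are illustrative:
+
+    jsm.py train --convergence_threshold 1e-6 --max_iters 1000 \
+        --min_base_qual 0 --min_map_qual 0 --model snvmix2 \
+        reference.fa normal.bam tumour.bam trained_params.cfg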
+  
+  
+  Classify
+  
+  The classify command is used for analysing tumour/normal paired data and 
+  computing the posterior probability for each of the nine joint genotypes for 
+  a pair of diploid genomes.
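+
+  A corresponding sketch for the Classify action (again, file names are illustrative):
+
+    jsm.py classify --parameters_file trained_params.cfg --out_file calls.tsv \
+        --somatic_threshold 0.0 --min_base_qual 0 --min_map_qual 0 --model snvmix2 \
+        reference.fa normal.bam tumour.bam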
+
+  
+  
+**Models**
+  
+::
+
+  There are currently three models supported by both the train and classify commands. 
+  All models use the JointSNVMix mixture model which jointly analyses the normal and tumour genomes.
+  By default snvmix2 is used but other models can be specified.
+  
+  binomial
+  
+  Uses binomial densities in the mixture model; this was previously referred to as the JointSnvMix1 model. 
+  
+  snvmix2
+  
+  Uses snvmix2 densities in the mixture, as described in the original SNVMix paper; previously referred to as JointSnvMix2.
+  
+  beta_binomial 
+  
+  Uses beta-binomial densities in the mixture model; new in version 0.8. The beta-binomial is a robust (in the statistical sense) 
+  alternative to the binomial model. It can be beneficial when dealing with over-dispersed data. This is useful in cancer genomes, 
+  since allelic frequencies at somatic mutation sites may deviate significantly from those expected under a diploid model. 
+  
+  
+**Input**
+
+  Bam files containing normal and tumor reads.
+
+
+**Parameters**
+
+
+  Classify
+  
+  chromosome CHROMOSOME
+                        Chromosome to analyse. If not set all chromosomes will
+                        be analysed.
+  
+  min_base_qual MIN_BASE_QUAL
+                        Remove bases with base quality lower than this.
+                        Default is 0.
+  
+  min_map_qual MIN_MAP_QUAL
+                        Remove bases with mapping quality lower than this.
+                        Default is 0.
+  
+  positions_file POSITIONS_FILE
+                        Path to a file containing a list of positions to
+                        use for analysis. Should be space separated
+                        chrom pos. Additionally for each chromosome the
+                        positions should be sorted. The same format as
+                        samtools.
+  
+  parameters_file PARAMETERS_FILE
+                        Path to a file with custom parameters values for the
+                        model.
+  
+  somatic_threshold SOMATIC_THRESHOLD
+                        Only sites with P(Somatic) = p_AA_AB + p_AA_BB greater
+                        than or equal to this value will be printed. Default is 0.
+  
+  
+  Train
+  
+  chromosome CHROMOSOME
+                        Chromosome to analyse. If not set all chromosomes will
+                        be analysed.
+  
+  min_base_qual MIN_BASE_QUAL
+                        Remove bases with base quality lower than this.
+                        Default is 0.
+  
+  min_map_qual MIN_MAP_QUAL
+                        Remove bases with mapping quality lower than this.
+                        Default is 0.
+  
+  positions_file POSITIONS_FILE
+                        Path to a file containing a list of positions to
+                        use for analysis. Should be space separated
+                        chrom pos. Additionally for each chromosome the
+                        positions should be sorted. The same format as
+                        samtools.
+  
+  priors_file PRIORS_FILE
+                        Path to a file with priors for the model parameters.
+  
+  initial_parameters_file INITIAL_PARAMETERS_FILE
+                        Path to a file with initial parameter values for the
+                        model.
+  
+  min_normal_depth MIN_NORMAL_DEPTH
+                        Minimum depth of coverage in normal sample for a site
+                        to be eligible for use in training set. Default 10
+  
+  min_tumour_depth MIN_TUMOUR_DEPTH
+                        Minimum depth of coverage in tumour sample for a site
+                        to be eligible for use in training set. Default 10
+  
+  max_normal_depth MAX_NORMAL_DEPTH
+                        Maximum depth of coverage in normal sample for a site
+                        to be eligible for use in training set. Default 100
+  
+  max_tumour_depth MAX_TUMOUR_DEPTH
+                        Maximum depth of coverage in tumour sample for a site
+                        to be eligible for use in training set. Default 100
+  
+  max_iters MAX_ITERS
+                        Maximum number of iterations to use for training the
+                        model. Default 1000
+  
+  skip_size SKIP_SIZE
+                        When subsampling, skip over this number of
+                        positions before adding a site to the subsample. Larger
+                        values lead to smaller subsample data sets with faster
+                        training and less memory. Smaller values should lead
+                        to better parameter estimates. Default 1.
+  
+  convergence_threshold CONVERGENCE_THRESHOLD
+                        Convergence threshold for EM training. Once the change
+                        in objective function is below this value training
+                        will end. Default 1e-6
+
+  
+  
+
+  </help>
+</tool>
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jsm_to_vcf.pl	Thu Jun 20 00:03:08 2013 -0400
@@ -0,0 +1,49 @@
+use strict;
+use warnings;
+
+die qq(
+Bad number of inputs
+
+) if(!@ARGV);
+
+my $input=$ARGV[0];
+my $vcf=$ARGV[1];
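+# Invoked by jsm_to_vcf.xml as: perl jsm_to_vcf.pl <jsm_file> <output_vcf>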
+
+
+# Convert output to VCF format
+open(FH, $input) or die "Couldn't open jsm file $input!\n";
+open(OUT, ">$vcf") or die "Couldn't create vcf file $vcf!\n";
+
+# print the vcf format we are using
+print OUT "##fileformat=VCFv4.1\n";
+
+# grab header which is the first line after the comment lines which start with ##
+my $header = <FH>;
+while(grep(/^##/, $header)) 
+{
+	$header = <FH>;
+}
+my @head = split("\t", $header);
+
+print "Converting jsm output to vcf\n";
+# vcf header is
+# #CHROM POS ID REF ALT QUAL FILTER INFO
+print OUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
+# for each line in jsm transform to vcf, any columns not in vcf concatenate them
+# together and place them in the info column
+while (my $line = <FH>) 
+{
+	chomp $line;
+	my @fields = split("\t", $line);
+	# create info column
+	# tumor_name=MH208_TUMOR;normal_name=MH208_LIVER;...;n_alt_sum=702
+	my @info;
+	for(my $index = 4; $index < $#fields; $index++) 
+	{
+		push @info, "$head[$index]=$fields[$index]";
+	} 
+	my $infofield = join(";", @info);
+	$fields[-1] = "PASS";
+	
+	# print the line
+	print OUT "$fields[0]\t$fields[1]\t.\t$fields[2]\t$fields[3]\t.\t$fields[-1]\t$infofield\n";
+}
+close(FH);
+close(OUT);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jsm_to_vcf.xml	Thu Jun 20 00:03:08 2013 -0400
@@ -0,0 +1,40 @@
+<tool id="jsm_to_vcf" name="Convert JSM to VCF" version="1.0">
+
+  <command interpreter="perl">
+  	
+  	jsm_to_vcf.pl $input $output
+  	
+  </command>
+
+  <inputs>
+  	
+  	<param name="input" type="data" format="txt"  label="JSM File"  optional="false"/>
+  	
+  </inputs>
+  
+  <outputs>
+  	<data type="data" format="vcf" name="output" label="${tool.name} result on ${on_string}"/>
+  </outputs>
+  	
+  <help> 
+
+.. class:: infomark
+
+**What it does**
+
+Converts a Joint SNV Mix output file into standard VCF format, version 4.1.
+
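+A minimal sketch of the resulting layout (values are illustrative; the INFO column
+carries the remaining JSM columns as name=value pairs):
+
+::
+
+  ##fileformat=VCFv4.1
+  #CHROM  POS      ID  REF  ALT  QUAL  FILTER  INFO
+  chr1    1234567  .   A    G    .     PASS    tumor_name=TUMOUR;normal_name=NORMAL;p_AA_AB=0.98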
+
+**Input**
+
+A JSM output file (the tab-separated table produced by the Joint SNV Mix tool).
+
+
+  </help>
+</tool>
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu Jun 20 00:03:08 2013 -0400
@@ -0,0 +1,8 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<tables>
+    <!-- Locations of all fasta files under genome directory -->
+    <table name="all_fasta" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="all_fasta.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Jun 20 00:03:08 2013 -0400
@@ -0,0 +1,82 @@
+<?xml version="1.0"?>
+<tool_dependency>
+
+    <package name="samtools" version="0.1.18">
+        <install version="1.0">
+            <actions>
+                <action type="download_by_url">http://sourceforge.net/projects/samtools/files/samtools/0.1.18/samtools-0.1.18.tar.bz2</action>
+                <action type="shell_command">sed -i.bak -e 's/-lcurses/-lncurses/g' Makefile</action>
+                <action type="shell_command">make</action>
+                <action type="move_file">
+                    <source>samtools</source>
+                    <destination>$INSTALL_DIR/bin</destination>
+                </action>
+                <action type="move_file">
+                    <source>misc/maq2sam-long</source>
+                    <destination>$INSTALL_DIR/bin</destination>
+                </action>
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+            Compiling SAMtools requires the ncurses and zlib development libraries.
+        </readme>
+    </package>
+
+    <package name="python" version="2.7">
+        <readme>
+        	JointSNVMix requires Python >= 2.7
+        </readme>
+    </package>
+
+    <package name="cython" version="0.19.1">
+        <install version="1.0">
+            <actions>
+                <action type="download_by_url">http://cython.org/release/Cython-0.19.1.tar.gz</action>
+                <action type="make_directory">$INSTALL_DIR/lib/python</action>
+                <action type="shell_command">export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python &amp;&amp; python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin</action>
+                <action type="set_environment">
+                    <environment_variable name="PYTHONPATH" action="append_to">$INSTALL_DIR/lib/python</environment_variable>
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+        </readme>
+    </package>
+
+    <package name="pysam" version="0.5">
+        <install version="1.0">
+            <actions>
+                <action type="download_by_url">http://pysam.googlecode.com/files/pysam-0.5.tar.gz</action>
+                <action type="make_directory">$INSTALL_DIR/lib/python</action>
+                <action type="shell_command">export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python &amp;&amp; python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin</action>
+                <action type="set_environment">
+                    <environment_variable name="PYTHONPATH" action="append_to">$INSTALL_DIR/lib/python</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+        </readme>
+    </package>
+    <package name="jointsnvmix" version="0.7.5">
+        <install version="1.0">
+            <actions>
+                <action type="download_by_url">http://joint-snv-mix.googlecode.com/files/JointSNVMix-0.7.5.tar.gz</action>
+                <action type="make_directory">$INSTALL_DIR/lib/python</action>
+                <action type="shell_command">export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python &amp;&amp; python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin</action>
+                <action type="set_environment">
+                    <environment_variable name="PYTHONPATH" action="append_to">$INSTALL_DIR/lib/python</environment_variable>
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+        </readme>
+    </package>
+    
+
+    
+</tool_dependency>