diff make_phylip.xml @ 31:a631c2f6d913

Update to Miller Lab devshed revision 3c4110ffacc3
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:25:27 -0400
parents
children ea52b23f1141
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_phylip.xml	Fri Sep 20 13:25:27 2013 -0400
@@ -0,0 +1,178 @@
+<tool id="gd_make_phylip" name="Phylip" version="1.0.0" force_history_refresh="True">
+  <description>: prepare data for phylogenetic analysis</description>
+
+  <command interpreter="python">
+    #set $zero_based = 1
+    #set $gen_chrClmn = int($input.metadata.scaffold) - $zero_based
+    #set $gen_posClmn = int($input.metadata.pos) - $zero_based
+    #set $gen_refClmn = int($input.metadata.pos) - $zero_based + 1
+    #set $gen_altrClmn = int($input.metadata.pos) - $zero_based + 2
+    make_phylip.py '--altrClmn=$gen_altrClmn' '--chrClmn=$gen_chrClmn' '--gd_indivs=$indivs_input' '--input=$input' '--output=$output1' '--output_id=$output1.id' '--output_dir=$__new_file_path__' '--posClmn=$gen_posClmn' '--refClmn=$gen_refClmn'
+    #if $input_type.choice == '0'
+      #set $cov_chrClmn = int($input_type.coverage_input.metadata.scaffold) - $zero_based
+      #set $cov_posClmn = int($input_type.coverage_input.metadata.pos) - $zero_based
+      #set $cov_refClmn = int($input_type.coverage_input.metadata.pos) - $zero_based + 1
+      #set $cov_altrClmn = int($input_type.coverage_input.metadata.pos) - $zero_based + 2
+      '--altrClmnCvrg=$cov_altrClmn' '--chrClmnCvrg=$cov_chrClmn' '--cvrgTreshold=$input_type.coverage_threshold' '--gd_indivs_cover=$indivs_input' '--indvlsPrctTrshld=$input_type.indivs_threshold' '--inputCover=$input_type.coverage_input' '--posClmnCvrg=$cov_posClmn' '--refClmnCvrg=$cov_refClmn'
+    #else if $input_type.choice == '1'
+      #set $fchrClmn = int($input_type.annotation_input.metadata.chromCol) - $zero_based
+      #set $strandClmn = int($input_type.annotation_input.metadata.strandCol) - $zero_based
+      #set $geneNameClmn = int($input_type.annotation_input.metadata.nameCol) - $zero_based
+      #set $txStartClmn = int(str($input_type.tx_start_col)) - $zero_based
+      #set $txEndClmn = int(str($input_type.tx_end_col)) - $zero_based
+      #set $cdsStartClmn = int(str($input_type.cds_start_col)) - $zero_based
+      #set $cdsEndClmn = int(str($input_type.cds_end_col)) - $zero_based
+      #set $startExsClmn = int(str($input_type.exs_start_col)) - $zero_based
+      #set $endExsClmn = int(str($input_type.exs_end_col)) - $zero_based
+      '--cdsEndClmn=$cdsEndClmn' '--cdsStartClmn=$cdsStartClmn' '--endExsClmn=$endExsClmn' '--fchrClmn=$fchrClmn' '--geneNameClmn=$geneNameClmn' '--gene_info=$input_type.annotation_input' '--sequence=$input_type.fasta_input' '--startExsClmn=$startExsClmn' '--strandClmn=$strandClmn' '--txEndClmn=$txEndClmn' '--txStartClmn=$txStartClmn'
+    #end if
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_genotype,gd_snp" label="Genotype/SNP dataset">
+      <validator type="metadata" check="scaffold" message="scaffold missing" />
+      <validator type="metadata" check="pos" message="pos missing" />
+    </param>
+    <param name="indivs_input" type="data" format="gd_indivs" label="Individuals dataset" />
+    <conditional name="input_type">
+      <param name="choice" type="select" format="integer" label="Input type">
+        <option value="0" selected="true">Coverage</option>
+        <option value="1">Genes</option>
+      </param>
+      <when value="0">
+        <param name="coverage_input" type="data" format="gd_genotype,gd_snp" label="Coverage dataset">
+          <validator type="metadata" check="scaffold" message="scaffold missing" />
+          <validator type="metadata" check="pos" message="pos missing" />
+        </param>
+        <param name="coverage_threshold" type="integer" min="1" value="1" label="Coverage threshold" />
+        <param name="indivs_threshold" type="float" value="0.5" min="0.0" max="1.0" label="Individuals genotype percentage threshold" />
+      </when>
+      <when value="1">
+        <param name="annotation_input" type="data" format="interval" label="Genes dataset">
+          <validator type="metadata" check="chromCol" message="chromCol missing" />
+          <validator type="metadata" check="strandCol" message="strandCol missing" />
+          <validator type="metadata" check="nameCol" message="nameCol missing" />
+        </param>
+        <param name="tx_start_col" type="data_column" data_ref="input" label="Genes transcript start column" />
+        <param name="tx_end_col" type="data_column" data_ref="input" label="Genes transcript end column" />
+        <param name="cds_start_col" type="data_column" data_ref="input" label="Genes coding sequence start column" />
+        <param name="cds_end_col" type="data_column" data_ref="input" label="Genes coding sequence end column" />
+        <param name="exs_start_col" type="data_column" data_ref="input" label="Genes exon starts column" />
+        <param name="exs_end_col" type="data_column" data_ref="input" label="Genes exon ends column" />
+        <param name="fasta_input" type="data" format="fasta" label="FASTA dataset" />
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output1" format="txt" />
+  </outputs>
+
+  <help>
+**What it does**
+
+This tool creates phylip formatted files from two different input types:
+coverage and genes.
+
+If the coverage option is selected the inputs for the program are:
+
+   1. a gd_indivs table
+   2. a gd_genotype file with the coverage information for individuals in the gd_indivs table
+   3. a gd_genotype file with the genotype information for individuals in the gd_indivs table
+   4. a coverage threshold (optional)
+   5. a percentage of individuals (threshold).
+
+The program produces a phylip formatted file using the sequence in the
+genotype file as a template.  In this sequence nucleotides for each
+sequence that are below the coverage threshold, or the positions with
+a percentage of individuals below the selected value are replaced by "N".
+
+If the gene option is selected the inputs for the program are:
+
+   1. a gd_indivs table
+   2. a gene dataset table with a gene name in the first column
+   3. the column with transcript start in the gene dataset table
+   4. the column with transcript end in the gene dataset table
+   5. the column with coding start in the gene dataset table
+   6. the column with coding end in the gene dataset table
+   7. the column with exon starts (comma-separated) in the gene dataset table
+   8. the column with exon ends (comma-separated) in the gene dataset table
+   9. a FASTA formatted file for all the genes of interest with their names as headers (NOTE: these names should be the same in the input gene dataset table).
+   
+The program produces as output one phylip formatted file for each gene
+in the gene dataset table.
+
+-----
+
+**Example**
+
+In a case were the option coverage is selected, for the inputs:
+
+- gd_indivs::
+
+   7       W_Java
+   10      E_Java
+   16      Pen_Ma
+   ...
+
+- Genotype table::
+
+   chrM 15 T C -1 -1 2 -1 -1 2 -1 -1 -1 -1 -1 2 -1 -1 -1 -1 0 -1 -1
+   chrM 18 G A -1 -1 0 -1 -1 0 -1 -1 -1 -1 -1 0 -1 -1 -1 -1 0 -1 -1
+   chrM 20 C T -1 -1 0 -1 -1 2 -1 -1 -1 -1 -1 0 -1 -1 -1 -1 0 -1 -1
+   ...
+
+- Coverage table::
+
+   chrM 0 G G 0 0 0 0 0 0 0 0 0 0 0 0 0
+   chrM 1 T T 0 0 3 0 0 50 0 0 0 0 0 2 0
+   chrM 2 T T 0 0 5 0 0 50 0 0 0 0 0 2 0
+   ...
+
+- Coverage threshold = 0
+
+- Percentage of individuals = 0.0
+
+- The output is::
+
+   4 19 15428
+   W_Java  GTTCATCATGTTCATCGAAT
+   E_Java  GTTCATCATGTTCATCGAAC
+   Pen_Ma  GTTCATCATGTTCATCGAAT
+
+In a case were option genotype is selected with the inputs:
+
+- Gene dataset table input::
+
+   1 ENSLAFT00000017123 chrM + 1002 1061 1002 1061 1 1002, 1061, 0 ENSLAFG00000017122 cmpl incmpl 0, BTRC ENSLAFT00000017123 ENSLAFP00000014355
+   1 ENSLAFT00000037164 chrM - 1058 1092 1062 1073 1 1062,1068 1065,1073 0 ENSLAFG00000007680 cmpl cmpl 0, MYOF ENSLAFT00000037164 ENSLAFP00000025175 26509
+   1 ENSLAFT00000008925 chrM + 990 1000 990 1000 1 990, 1000, 0 ENSLAFG00000008924 incmpl incmpl 0, PRKG1 ENSLAFT00000008925 ENSLAFP00000007492
+   ...
+
+In this table:
+
+   column with transcript start = 5
+   column with transcript end = 6
+   column with coding start = 7
+   column with coding end = 8
+   column with exon starts = 10
+   column with exon ends = 11
+
+- gd_indivs::
+
+   7       W_Java
+   10      E_Java
+   16      Pen_Ma
+   ...
+
+- Genotype table::
+
+   chrM 1005 T C -1 -1 2 -1 -1 2 -1 -1 -1 -1 -1 2 -1 -1 -1 -1 0 -1 -1
+   chrM 1060 G A -1 -1 0 -1 -1 0 -1 -1 -1 -1 -1 0 -1 -1 -1 -1 0 -1 -1
+   chrM 991 C T -1 -1 0 -1 -1 2 -1 -1 -1 -1 -1 0 -1 -1 -1 -1 0 -1 -1
+   ...
+
+The outputs are going to one file for each sequence in the input gene
+dataset table (as long as they are included in the input FASTA file).
+  </help>
+</tool>