Mercurial > repos > pjbriggs > rnachipintegrator
diff rnachipintegrator_wrapper.xml @ 1:5f69a2c1b9c9 draft
Uploaded version 1.0.0.0.
author | pjbriggs |
---|---|
date | Wed, 24 Feb 2016 09:39:14 -0500 |
parents | d9c1f2133124 |
children | dc498b03ca9a |
line wrap: on
line diff
--- a/rnachipintegrator_wrapper.xml Tue Jun 30 06:44:06 2015 -0400 +++ b/rnachipintegrator_wrapper.xml Wed Feb 24 09:39:14 2016 -0500 @@ -1,161 +1,150 @@ <?xml version="1.0" encoding="utf-8"?> -<tool id="rnachipintegrator_wrapper" name="RnaChipIntegrator" version="@VERSION@-0"> - <description>Integrated analysis of gene expression data and ChIP data</description> +<tool id="rnachipintegrator_wrapper" name="RnaChipIntegrator" version="@VERSION@.0"> + <description>Integrated analysis of 'gene' and 'peak' data</description> <macros> <import>rnachipintegrator_macros.xml</import> </macros> <expand macro="requirements" /> <expand macro="version_command" /> - <command interpreter="bash">rnachipintegrator_wrapper.sh - #if str( $analysis_options.peak_type ) == "summits" - #if str( $analysis_options.window ) != "" - --window=$analysis_options.window - #end if - #if str( $analysis_options.cutoff ) != "" - --cutoff=$analysis_options.cutoff - #end if + <command interpreter="bash"><![CDATA[ + rnachipintegrator_wrapper.sh + #if $peaks_in.metadata.chromCol + --peak_cols=${peaks_in.metadata.chromCol},${peaks_in.metadata.startCol},${peaks_in.metadata.endCol} #end if - #if str( $analysis_options.peak_type ) == "regions" - #if str( $analysis_options.edge_cutoff ) != "" - --edge-cutoff=$analysis_options.edge_cutoff - #end if - #if str( $analysis_options.number ) != "" - --number=$analysis_options.number - #end if - #if (str( $analysis_options.promoter_start ) != "" and str( $analysis_options.promoter_end )) - --promoter_region=$analysis_options.promoter_start,$analysis_options.promoter_end - #end if - #if $analysis_options.pad_output - --pad - #end if + #if str( $cutoff ) != "" + --cutoff=$cutoff + #else + --cutoff=0 #end if - $rnaseq $chipseq - --output_xls $xls_output - #if $results_as_zip - --zip_file $zip_file - #else - #if str( $analysis_options.peak_type ) == "summits" - --summit_outputs $peaks_to_transcripts_out $tss_to_summits_out - #end if - #if str( 
$analysis_options.peak_type ) == "regions" - --peak_outputs $transcripts_to_edges_out - $transcripts_to_edges_summary - $tss_to_edges_out - $tss_to_edges_summary - #end if + #if str( $number ) != "" + --number=$number #end if - </command> + --promoter_region=$promoter_start,$promoter_end + --edge=$edge + $diff_expressed_only + --xlsx_file "$xlsx_out" + --output_files "$peaks_per_feature_out" "$features_per_peak_out" + #if $output.compact_format + --compact + #else + #if $output.summary + --summary_files "$peaks_per_feature_summary" "$features_per_peak_summary" + #end if + ${output.pad_output} + #end if + "$features_in" "$peaks_in" + ]]></command> <inputs> - <param format="tabular" name="rnaseq" type="data" label="Gene expression data file" /> - <param format="tabular" name="chipseq" type="data" label="ChIP peaks data file" /> - <conditional name="analysis_options"> - <!-- user must specify if ChIP peaks are summits or regions --> - <param name="peak_type" type="select" label="ChIP peaks are" - help="Options and outputs depend on whether ChIP data are summits or regions"> - <option value="summits">summits</option> - <option value="regions">regions</option> - </param> - <when value="summits"> - <param name="window" type="integer" value="20000" optional="true" - label="Maximum distance a peak can be from each transcript - TSS before being omitted from analysis" /> - <param name="cutoff" type="integer" value="130000" optional="true" - label="Maximum distance a transcript TSS can be from each - peak before being omitted from the analysis" /> - </when> - <when value="regions"> - <param name="edge_cutoff" type="integer" value="10000" optional="true" - label="Maximum distance a transcript edge can be from the - peak edge before being omitted from the analysis" - help="Set to zero to indicate that no cut off should be applied" /> - <param name="number" type="integer" value="4" optional="true" - label="Maximum number of transcripts per peak to report from - from the 
analysis" /> - <param name="promoter_start" type="integer" value="-10000" optional="true" - label="Start of promoter region with respect to gene TSS" /> - <param name="promoter_end" type="integer" value="2500" optional="true" - label="End of promoter region with respect to gene TSS" /> - <param name="pad_output" type="boolean" checked="false" truevalue="yes" - label="Output same number of lines for each peak" - help="Add blank lines in output for peaks with fewer than maximum number - of hits (--pad)" /> - </when> - </conditional> - <param name="results_as_zip" type="boolean" checked="false" truevalue="yes" - label="Put output tab-delimited files into a single zip archive" /> + <param format="tabular" name="features_in" type="data" + label="Genes/genomic features" /> + <param format="tabular" name="peaks_in" type="data" + label="Peaks/regions" /> + <expand macro="analysis_options" /> + <param name="diff_expressed_only" type="boolean" + truevalue="--only-DE" falsevalue="" checked="false" + label="Only consider genes which are flagged as differentially + expressed" + help="NB input feature data must include differential expression + flags (--only-DE)" /> + <expand macro="output_options" /> </inputs> <outputs> - <!-- Always produce XLS output --> - <data format="xls" name="xls_output" - label="All RnaChipIntegrator analyses for ${rnaseq.name} vs ${chipseq.name} (Excel spreadsheet)" /> - <!-- Outputs only produced for summit data --> - <data format="tabular" name="peaks_to_transcripts_out" - label="Nearest summits to transcripts for ${rnaseq.name} vs ${chipseq.name}" > - <filter>analysis_options['peak_type'] == "summits"</filter> - <filter>results_as_zip is False</filter> - </data> - <data format="tabular" name="tss_to_summits_out" - label="Nearest TSS to summits for ${rnaseq.name} vs ${chipseq.name}" > - <filter>analysis_options['peak_type'] == "summits"</filter> - <filter>results_as_zip is False</filter> + <!-- Always produce XLSX output --> + <data format="xlsx" 
name="xlsx_out" + label="All RnaChipIntegrator analyses: ${features_in.name} vs ${peaks_in.name} (Excel spreadsheet)" /> + <data format="tabular" name="peaks_per_feature_out" + label="Nearest peaks to each gene: ${features_in.name} vs ${peaks_in.name}" /> + <data format="tabular" name="features_per_peak_out" + label="Nearest genes to each peak: ${features_in.name} vs ${peaks_in.name}" /> + <data format="tabular" name="peaks_per_feature_summary" + label="Nearest peaks to each gene (summary): ${features_in.name} vs ${peaks_in.name}" > + <filter>output['compact_format'] is False</filter> + <filter>output['summary'] is True</filter> </data> - <!-- Outputs only produced for peak data --> - <data format="tabular" name="transcripts_to_edges_out" - label="Nearest transcripts to peak edges for ${rnaseq.name} vs ${chipseq.name}" > - <filter>analysis_options['peak_type'] == "regions"</filter> - <filter>results_as_zip is False</filter> - </data> - <data format="tabular" name="transcripts_to_edges_summary" - label="Nearest transcripts to peak edges (summary) for ${rnaseq.name} vs ${chipseq.name}" > - <filter>analysis_options['peak_type'] == "regions"</filter> - <filter>results_as_zip is False</filter> - </data> - <data format="tabular" name="tss_to_edges_out" - label="Nearest TSS to peak edges for ${rnaseq.name} vs ${chipseq.name}" > - <filter>analysis_options['peak_type'] == "regions"</filter> - <filter>results_as_zip is False</filter> - </data> - <data format="tabular" name="tss_to_edges_summary" - label="Nearest TSS to peak edges (summary) for ${rnaseq.name} vs ${chipseq.name}" > - <filter>analysis_options['peak_type'] == "regions"</filter> - <filter>results_as_zip is False</filter> - </data> - <data format="zip" name="zip_file" - label="All tab-delimited files for ${rnaseq.name} vs ${chipseq.name} (zip file)" > - <filter>results_as_zip is True</filter> + <data format="tabular" name="features_per_peak_summary" + label="Nearest gene to each peak (summary): ${features_in.name} 
vs ${peaks_in.name}" > + <filter>output['compact_format'] is False</filter> + <filter>output['summary'] is True</filter> </data> </outputs> <tests> + <!-- + RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt summits.txt + --> <test> - <param name="rnaseq" value="ExpressionData.txt" ftype="tabular" /> - <param name="chipseq" value="ChIP_summits.txt" ftype="tabular" /> - <param name="peak_type" value="summits" /> - <param name="window" value="20000" /> + <param name="features_in" value="features.txt" ftype="tabular" /> + <param name="peaks_in" value="summits.txt" ftype="tabular" /> <param name="cutoff" value="130000" /> - <!-- - **NB** outputs have to be specified in order that they appear in the - tool (which is the order they will be written to the history) - the - test framework seems to use the order and ignores the "name" attribute - --> - <output name="xls_output" file="summits.xls" compare="sim_size" /> - <output name="peaks_to_transcripts_out" file="peaks_to_transcripts.out" ftype="tabular" /> - <output name="tss_to_summits_out" file="tss_to_summits.out" ftype="tabular" /> + <param name="promoter_start" value="-10000" /> + <param name="promoter_end" value="2500" /> + <output name="xlsx_out" file="summits.xlsx" compare="sim_size" /> + <output name="peaks_per_feature_out" ftype="tabular" + file="summits_per_feature.out" /> + <output name="features_per_peak_out" ftype="tabular" + file="features_per_summit.out" /> + </test> + <!-- + RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt peaks.txt + --> + <test> + <param name="features_in" value="features.txt" ftype="tabular" /> + <param name="peaks_in" value="peaks.txt" ftype="tabular" /> + <param name="cutoff" value="130000" /> + <param name="promoter_start" value="-10000" /> + <param name="promoter_end" value="2500" /> + <output name="xlsx_out" file="peaks1.xlsx" compare="sim_size" /> + <output 
name="peaks_per_feature_out" ftype="tabular" + file="peaks_per_feature1.out" /> + <output name="features_per_peak_out" ftype="tabular" + file="features_per_peak1.out" /> </test> + <!-- + RnaChipIntegrator +name=test +cutoff=130000 +xlsx features.txt peaks.txt + --> <test> - <param name="rnaseq" value="ExpressionData.txt" ftype="tabular" /> - <param name="chipseq" value="ChIP_peaks.txt" ftype="tabular" /> - <param name="peak_type" value="regions" /> - <param name="edge_cutoff" value="130000" /> - <!-- - **NB** outputs have to be specified in order that they appear in the - tool (which is the order they will be written to the history) - the - test framework seems to use the order and ignores the "name" attribute - --> - <output name="xls_output" file="peaks.xls" compare="sim_size" /> - <output name="transcripts_to_edges_out" file="transcripts_to_edges.out" ftype="tabular" /> - <output name="transcripts_to_edges_summary" file="transcripts_to_edges.summary" ftype="tabular" /> - <output name="tss_to_edges_out" file="tss_to_edges.out" ftype="tabular" /> - <output name="tss_to_edges_summary" file="tss_to_edges.summary" ftype="tabular" /> + <param name="features_in" value="features.txt" ftype="tabular" /> + <param name="peaks_in" value="peaks.txt" ftype="tabular" /> + <param name="cutoff" value="130000" /> + <param name="compact_format" value="false" /> + <output name="xlsx_out" file="peaks2.xlsx" compare="sim_size" /> + <output name="peaks_per_feature_out" ftype="tabular" + file="peaks_per_feature2.out" /> + <output name="features_per_peak_out" ftype="tabular" + file="features_per_peak2.out" /> + </test> + <!-- + RnaChipIntegrator +name=test +cutoff=130000 +only-DE +xlsx +compact features.txt peaks.txt + --> + <test> + <param name="features_in" value="features.txt" ftype="tabular" /> + <param name="peaks_in" value="peaks.txt" ftype="tabular" /> + <param name="cutoff" value="130000" /> + <param name="diff_expressed_only" value="true" /> + <output name="xlsx_out" 
file="peaks3.xlsx" compare="sim_size" /> + <output name="peaks_per_feature_out" ftype="tabular" + file="peaks_per_feature3.out" /> + <output name="features_per_peak_out" ftype="tabular" + file="features_per_peak3.out" /> + </test> + <!-- + RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +summary features.txt peaks.txt + --> + <test> + <param name="features_in" value="features.txt" ftype="tabular" /> + <param name="peaks_in" value="peaks.txt" ftype="tabular" /> + <param name="cutoff" value="130000" /> + <param name="compact_format" value="false" /> + <param name="summary" value="true" /> + <param name="pad_output" value="true" /> + <output name="xlsx_out" file="peaks4.xlsx" compare="sim_size" /> + <output name="peaks_per_feature_out" ftype="tabular" + file="peaks_per_feature4.out" /> + <output name="features_per_peak_out" ftype="tabular" + file="features_per_peak4.out" /> + <output name="peaks_per_feature_summary" ftype="tabular" + file="peaks_per_feature4.summary" /> + <output name="features_per_peak_summary" ftype="tabular" + file="features_per_peak4.summary" /> </test> </tests> <help> @@ -164,33 +153,17 @@ **What it does** -Run RnaChipIntegrator to perform integrated analyses of gene expression -and ChIP data, identifying the nearest ChIP peaks to each transcript -and vice versa. - -For ChIP peaks defined as regions the following analyses are performed: - - * **TranscriptsToPeakEdges**: reports the nearest transcripts with the smallest - distance from either their TSS or TES to the nearest peak edge. - - * **TSSToPeakEdges**: reports the nearest transcripts with the smallest distance - from their TSS to the nearest peak edge. - -For ChIP peaks defined as summits: +Performs integrated analyses of genes (or other genomic feature data) +against a set of peaks (e.g. ChIP data), identifying the nearest peaks to +each feature and vice versa. 
- * **TSSToSummits**: reports the nearest transcripts with the smallest distance - from the TSS to the nearest peak summit. - - * **PeaksToTranscripts**: reports the nearest peak summits with the smallest - distance to either the TSS or TES of each transcript. +The program was originally written specifically for ChIP-Seq and RNA-Seq +data but works equally well for ChIP-chip and microarray expression data, +and can also be used to integrate any set of genomic features (e.g. +canonical genes, CpG islands) with expression data. -The program was originally written specifically for ChIP-Seq and RNA-Seq data -but works equally well for ChIP-chip and microarray expression data, and can -also be used to integrate any set of genomic features (e.g. canonical genes, -CpG islands) with expression data. - -RnaChipIntgerator can be obtained from -http://fls-bioinformatics-core.github.com/RnaChipIntegrator/ +RnaChipIntegrator can be obtained from +https://pypi.python.org/pypi/RnaChipIntegrator/ ------------- @@ -198,53 +171,81 @@ **Input** -The expression data must be in a tab-delimited file with the following columns -of data for each genomic feature (one feature per line): +The gene data must be in a tabular file with the following columns +of data for each gene or genomic feature (one gene per line): ====== ========== ====================================================================== Column Name Description ====== ========== ====================================================================== - 1 ID Name used to identify the feature in the output + 1 ID Name used to identify the gene in the output 2 chr Chromosome name - 3 start Start position of the feature - 4 end End position of the feature + 3 start Start position of the gene + 4 end End position of the gene 5 strand Must be either '+' or '-' - 6 diff_expr Optional: indicates feature is differentially expressed (1) or not (0) + 6 diff_expr Optional: indicates gene is differentially expressed (1) or not (0) ====== 
========== ====================================================================== -The ChIP-seq data must be in a tab-delimited file with 3 columns of data for each -ChIP peak (one per line): +The peak data must be in a tabular file with at least 3 columns of data +for each peak (one peak per line): -====== ========== ====================================================================== +====== ========== ================================= Column Name Description -====== ========== ====================================================================== - 1 chr Chromosome name (must match one of those in expression data file) +====== ========== ================================= + 1 chr Chromosome name 2 start Start position of the peak - 3 end End position of the peak (start + 1 for summit data) -====== ========== ====================================================================== + 3 end End position of the peak +====== ========== ================================= -The ChIP peak data can be either the summit (in which case 'end' - 'start' = 1) or the -entire extent of the binding region (with 'start' and 'end' indicating the limits). +If peak data is in ``bed`` format then the tool will automatically +assign the correct columns, otherwise the first three columns of data +will be used. ------------- .. class:: infomark -**Output** +**Outputs** + +The key outputs from the tool are two lists comprising the nearest +peaks for each gene, and the nearest gene for each peak (one dataset +for each list). + +There are two formats for reporting: "compact" and "full": -The outputs from this tool vary depending on the type of data that is input, however -generally there is one tab-delimited results file for each analysis described above -in the **What it does** section (some analyses output a second file with just the -"best" hits). 
+ * **Compact output** reports all the hits for each peak or gene on + a single line of output; + * **Full output** reports each peak/gene pair on a separate line + (i.e. a multi-line output format). + +In "full" output mode, additional options are available: + + * The output files can be "padded" with extra (empty) lines to ensure + that there are always the same number of lines for each peak or + gene, if fewer than the requested number of hits are found. + * "Summary" datasets can also be requested, which include just the + nearest peak reported for each gene (and vice versa). -A history item will be generated for each output file, unless the option to put them -into a single zip archive is selected; this archive file will have to be downloaded -and unzipped on your local machine. It is recommended that you refer to the -RnaChipIntegrator documentation for information on the contents of each output file: -https://github.com/fls-bioinformatics-core/RnaChipIntegrator/blob/master/doc/MANUAL.markdown +In either mode these data will also be output in a single MS Excel file, +which contains one sheet per result set. + +.. class:: warning + +Using "compact" output with the number of hits limited to more than 4 +peak/gene pairs (or with no limit at all) can result in a large number +of columns in the output files, which in some versions of Galaxy will +not be properly displayed. However the data files themselves should be +okay. -In addition an Excel spreadsheet (with one page for each analysis performed) is always -produced. +------------- + +.. class:: infomark + +**More information** + +It is recommended that you refer to the ``RnaChipIntegrator`` +documentation for information on the contents of each output file: + +* http://rnachipintegrator.readthedocs.org/en/latest/ ------------- @@ -259,4 +260,5 @@ Please kindly acknowledge the Bioinformatics Core Facility if you use this tool. </help> + <expand macro="citations" /> </tool>