view rnachipintegrator_wrapper.xml @ 5:087d9872fb0d draft

Updated to version 1.1.0.0.
author pjbriggs
date Tue, 07 Aug 2018 06:57:35 -0400
parents b695071de766
children 466c68008537
line wrap: on
line source

<?xml version="1.0" encoding="utf-8"?>
<tool id="rnachipintegrator_wrapper" name="RnaChipIntegrator" version="@VERSION@">
  <!-- Galaxy tool definition wrapping the RnaChipIntegrator program.
       NOTE(review): the @VERSION@ token is not defined in this file;
       presumably it comes from the imported macros file (confirm). -->
  <description>Integrated analysis of 'gene' and 'peak' data</description>
  <!-- Shared definitions are imported from rnachipintegrator_macros.xml.
       The macros expanded in this file ("requirements", "version_command",
       "analysis_options", "output_options" and "citations") are not
       defined here, so presumably they live in that file (confirm). -->
  <macros>
    <import>rnachipintegrator_macros.xml</import>
  </macros>
  <expand macro="requirements" />
  <expand macro="version_command" />
  <!-- Command template for the bundled wrapper script. The script is run
       explicitly with bash from the tool directory instead of relying on
       the deprecated "interpreter" attribute of the command tag. -->
  <command><![CDATA[
  bash '$__tool_directory__/rnachipintegrator_wrapper.sh'
  ## Pass the chromosome/start/end columns explicitly when the peaks
  ## dataset carries that column metadata
  #if $peaks_in.metadata.chromCol
    --peak_cols=${peaks_in.metadata.chromCol},${peaks_in.metadata.startCol},${peaks_in.metadata.endCol}
  #end if
  ## A blank cutoff is passed on explicitly as zero
  #if str( $cutoff ) != ""
    --cutoff=$cutoff
  #else
    --cutoff=0
  #end if
  ## The number option is omitted altogether when left blank
  #if str( $number ) != ""
    --number=$number
  #end if
  --promoter_region=$promoter_start,$promoter_end
  --edge=$edge
  $diff_expressed_only
  --xlsx_file "$xlsx_out"
  --output_files "$peaks_per_feature_out" "$features_per_peak_out"
  ## Summary files and padding are only requested in non-compact mode
  #if $output.compact_format
    --compact
  #else
    #if $output.summary
      --summary_files "$peaks_per_feature_summary" "$features_per_peak_summary"
    #end if
    ${output.pad_output}
  #end if
  "$features_in" "$peaks_in"
  ]]></command>
  <inputs>
    <!-- The two datasets that are analysed against each other -->
    <param name="features_in" type="data" format="tabular"
           label="Genes/genomic features" />
    <param name="peaks_in" type="data" format="tabular"
           label="Peaks/regions" />
    <!-- NOTE(review): presumably supplies the cutoff, number, promoter and
         edge parameters referenced by the command template; the macro is
         not defined in this file (confirm). -->
    <expand macro="analysis_options" />
    <!-- Boolean switch: contributes the only-DE option to the command line
         when checked and nothing otherwise. The label and help values are
         kept on a single line each because line breaks inside an attribute
         value are turned into runs of spaces by the XML parser. -->
    <param name="diff_expressed_only" type="boolean"
           truevalue="--only-DE" falsevalue="" checked="false"
           label="Only consider genes which are flagged as differentially expressed"
           help="NB input feature data must include differential expression flags (--only-DE)" />
    <!-- NOTE(review): presumably supplies the "output" group with the
         compact_format, summary and pad_output parameters used by the
         command template and the output filters (confirm). -->
    <expand macro="output_options" />
  </inputs>
  <outputs>
    <!-- These three datasets carry no filter: the Excel workbook and the
         two tabular hit lists are declared for every run -->
    <data name="xlsx_out" format="xlsx"
          label="All RnaChipIntegrator analyses: ${features_in.name} vs ${peaks_in.name} (Excel spreadsheet)" />
    <data name="peaks_per_feature_out" format="tabular"
          label="Nearest peaks to each gene: ${features_in.name} vs ${peaks_in.name}" />
    <data name="features_per_peak_out" format="tabular"
          label="Nearest genes to each peak: ${features_in.name} vs ${peaks_in.name}" />
    <!-- The two summary datasets are filtered on the output options: they
         are kept only when compact_format is off and summary is on -->
    <data name="peaks_per_feature_summary" format="tabular"
          label="Nearest peaks to each gene (summary): ${features_in.name} vs ${peaks_in.name}">
      <filter>output['compact_format'] is False</filter>
      <filter>output['summary'] is True</filter>
    </data>
    <data name="features_per_peak_summary" format="tabular"
          label="Nearest gene to each peak (summary): ${features_in.name} vs ${peaks_in.name}">
      <filter>output['compact_format'] is False</filter>
      <filter>output['summary'] is True</filter>
    </data>
  </outputs>
  <tests>
    <!--
        Each test is preceded by its reference command line. A leading '+'
        stands in for the double hyphen of each option, which cannot be
        written inside an XML comment. The Excel outputs are compared by
        size only (compare="sim_size").
    -->
    <!--
        Summits input, explicit cutoff and promoter region:
        RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt summits.txt
    -->
    <test>
      <param name="features_in" value="features.txt" ftype="tabular" />
      <param name="peaks_in" value="summits.txt" ftype="tabular" />
      <param name="cutoff" value="130000" />
      <param name="promoter_start" value="-10000" />
      <param name="promoter_end" value="2500" />
      <output name="xlsx_out" file="summits.xlsx" compare="sim_size" />
      <output name="peaks_per_feature_out" file="summits_per_feature.out" ftype="tabular" />
      <output name="features_per_peak_out" file="features_per_summit.out" ftype="tabular" />
    </test>
    <!--
        Peaks input, explicit cutoff and promoter region:
        RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +compact features.txt peaks.txt
    -->
    <test>
      <param name="features_in" value="features.txt" ftype="tabular" />
      <param name="peaks_in" value="peaks.txt" ftype="tabular" />
      <param name="cutoff" value="130000" />
      <param name="promoter_start" value="-10000" />
      <param name="promoter_end" value="2500" />
      <output name="xlsx_out" file="peaks1.xlsx" compare="sim_size" />
      <output name="peaks_per_feature_out" file="peaks_per_feature1.out" ftype="tabular" />
      <output name="features_per_peak_out" file="features_per_peak1.out" ftype="tabular" />
    </test>
    <!--
        Peaks input with compact_format switched off:
        RnaChipIntegrator +name=test +cutoff=130000 +xlsx features.txt peaks.txt
    -->
    <test>
      <param name="features_in" value="features.txt" ftype="tabular" />
      <param name="peaks_in" value="peaks.txt" ftype="tabular" />
      <param name="cutoff" value="130000" />
      <param name="compact_format" value="false" />
      <output name="xlsx_out" file="peaks2.xlsx" compare="sim_size" />
      <output name="peaks_per_feature_out" file="peaks_per_feature2.out" ftype="tabular" />
      <output name="features_per_peak_out" file="features_per_peak2.out" ftype="tabular" />
    </test>
    <!--
        Peaks input restricted to differentially expressed genes:
        RnaChipIntegrator +name=test +cutoff=130000 +only-DE +xlsx +compact features.txt peaks.txt
    -->
    <test>
      <param name="features_in" value="features.txt" ftype="tabular" />
      <param name="peaks_in" value="peaks.txt" ftype="tabular" />
      <param name="cutoff" value="130000" />
      <param name="diff_expressed_only" value="true" />
      <output name="xlsx_out" file="peaks3.xlsx" compare="sim_size" />
      <output name="peaks_per_feature_out" file="peaks_per_feature3.out" ftype="tabular" />
      <output name="features_per_peak_out" file="features_per_peak3.out" ftype="tabular" />
    </test>
    <!--
        Peaks input with summary datasets and padded output:
        RnaChipIntegrator +name=test +cutoff=130000 +promoter_region=-10000,2500 +xlsx +summary features.txt peaks.txt
        NOTE(review): the command above lists +promoter_region but this test
        sets neither promoter_start nor promoter_end, so it relies on the
        parameter defaults; confirm they match.
    -->
    <test>
      <param name="features_in" value="features.txt" ftype="tabular" />
      <param name="peaks_in" value="peaks.txt" ftype="tabular" />
      <param name="cutoff" value="130000" />
      <param name="compact_format" value="false" />
      <param name="summary" value="true" />
      <param name="pad_output" value="true" />
      <output name="xlsx_out" file="peaks4.xlsx" compare="sim_size" />
      <output name="peaks_per_feature_out" file="peaks_per_feature4.out" ftype="tabular" />
      <output name="features_per_peak_out" file="features_per_peak4.out" ftype="tabular" />
      <output name="peaks_per_feature_summary" file="peaks_per_feature4.summary" ftype="tabular" />
      <output name="features_per_peak_summary" file="features_per_peak4.summary" ftype="tabular" />
    </test>
    <!--
        As the previous test but with a blank cutoff, which the command
        template turns into a cutoff of zero:
        RnaChipIntegrator +name=test +cutoff=0 +promoter_region=-10000,2500 +xlsx +summary features.txt peaks.txt
        NOTE(review): as above, +promoter_region is listed but no promoter
        parameters are set in this test; confirm the defaults match.
    -->
    <test>
      <param name="features_in" value="features.txt" ftype="tabular" />
      <param name="peaks_in" value="peaks.txt" ftype="tabular" />
      <param name="cutoff" value="" />
      <param name="compact_format" value="false" />
      <param name="summary" value="true" />
      <param name="pad_output" value="true" />
      <output name="xlsx_out" file="peaks6.xlsx" compare="sim_size" />
      <output name="peaks_per_feature_out" file="peaks_per_feature6.out" ftype="tabular" />
      <output name="features_per_peak_out" file="features_per_peak6.out" ftype="tabular" />
      <output name="peaks_per_feature_summary" file="peaks_per_feature6.summary" ftype="tabular" />
      <output name="features_per_peak_summary" file="features_per_peak6.summary" ftype="tabular" />
    </test>
  </tests>
  <help>

.. class:: infomark

**What it does**

Performs integrated analyses of genes (or other genomic feature data)
against a set of peaks (e.g. ChIP data), identifying the nearest peaks to
each feature and vice versa.

The program was originally written specifically for ChIP-Seq and RNA-Seq
data but works equally well for ChIP-chip and microarray expression data,
and can also be used to integrate any set of genomic features (e.g.
canonical genes, CpG islands) with expression data.

RnaChipIntegrator can be obtained from
https://pypi.python.org/pypi/RnaChipIntegrator/

-------------

.. class:: infomark

**Input**

The gene data must be in a tabular file with the following columns
of data for each gene or genomic feature (one gene per line):

====== ========== ======================================================================
Column Name       Description
====== ========== ======================================================================
     1 ID         Name used to identify the gene in the output
     2 chr        Chromosome name
     3 start      Start position of the gene
     4 end        End position of the gene
     5 strand     Must be either '+' or '-'
     6 diff_expr  Optional: indicates gene is differentially expressed (1) or not (0)
====== ========== ======================================================================

The peak data must be in a tabular file with at least 3 columns of data
for each peak (one peak per line):

====== ========== =================================
Column Name       Description
====== ========== =================================
     1 chr        Chromosome name
     2 start      Start position of the peak 
     3 end        End position of the peak
====== ========== =================================

If peak data is in ``bed`` format then the tool will automatically
assign the correct columns, otherwise the first three columns of data
will be used.

-------------

.. class:: infomark

**Outputs**

The key outputs from the tool are two lists comprising the nearest
peaks for each gene, and the nearest gene for each peak (one dataset
for each list).

There are two formats for reporting: "compact" and "full":

 * **Compact output** reports all the hits for each peak or gene on
   a single line of output;
 * **Full output** reports each peak/gene pair on a separate line
   (i.e. a multi-line output format).

In "full" output mode, additional options are available:

 * The output files can be "padded" with extra (empty) lines to ensure
   that there are always the same number of lines for each peak or
   gene, if fewer than the requested number of hits are found.
 * "Summary" datasets can also be requested, which include just the
   nearest peak reported for each gene (and vice versa).

In either mode these data will also be output in a single MS Excel file,
which contains one sheet per result set.

.. class:: warningmark

Using "compact" output with the number of hits limited to more than 4
peak/gene pairs (or with no limit at all) can result in a large number
of columns in the output files, which in some versions of Galaxy will
not be properly displayed. However the data files themselves should be
okay.

-------------

.. class:: infomark

**More information**

It is recommended that you refer to the ``RnaChipIntegrator``
documentation for information on the contents of each output file:

* http://rnachipintegrator.readthedocs.org/en/latest/

-------------

.. class:: infomark

**Credits**

This Galaxy tool has been developed within the Bioinformatics Core Facility at the
University of Manchester. It runs the RnaChipIntegrator package which has also been
developed by this group, and is documented at
http://fls-bioinformatics-core.github.com/RnaChipIntegrator/

Please kindly acknowledge the Bioinformatics Core Facility if you use this tool.
  </help>
  <!-- Citation entries come from the "citations" macro, which is not
       defined in this file (presumably in the imported macros file) -->
  <expand macro="citations" />
</tool>