Mercurial > repos > petr-novak > profrep

<tool id="profrep" name="ProfRep" version="1.0.0">
  <stdio>
    <!--
      Job failure detection. Once a stdio section is declared, Galaxy applies
      only the rules listed here, so both the exit status and the stderr
      content are checked explicitly.
    -->
    <!-- Any non-zero exit status of the command marks the job as failed. -->
    <exit_code range="1:" level="fatal" description="ProfRep exited with a non-zero status" />
    <!-- A Python traceback on stderr marks the job as failed even when the exit status is 0. -->
    <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
  </stdio>
  <description> Tool to identify and visualize the general repetitive profile of a sequence as well as to assign repetitive regions to a class from a database of repeats </description>
<requirements>
  <!-- Similarity search back-ends -->
  <requirement type="package">blast</requirement>
  <requirement type="package">last</requirement>
  <!-- Wig to BigWig conversion of the profile tracks -->
  <requirement type="package">ucsc-wigtobigwig</requirement>
  <!-- Python libraries -->
  <requirement type="package">biopython</requirement>
  <requirement type="package">numpy</requirement>
  <requirement type="package">matplotlib</requirement>
  <!-- ProfRep itself and its bundled databases -->
  <requirement type="package">profrep</requirement>
  <requirement type="package" version="1.0">profrep_databases</requirement>
  <!-- NOTE(review): the command reads JBROWSE_SOURCE_DIR, presumably exported by this package; confirm -->
  <requirement type="package" version="1.16.4">jbrowse</requirement>
</requirements>
<command><![CDATA[
## Command template (Cheetah) that builds the profrep.py call.
## Galaxy-substituted file paths are single-quoted; arguments that contain
## shell variables (escaped with a backslash) are double-quoted so that the
## shell can still expand them.

#if not $custom_data.options_custom_data:
	## Prepared dataset: column 1 of prepared_datasets.txt is matched against
	## the selected dataset id; columns 3, 4 and 5 give the reads, cls and
	## annotation file names used further below.
	profrep_reads=\$(awk -v var="${custom_data.prepared_dataset}" 'BEGIN{FS="\t"}{if ($1 == var) print $3}' '${__tool_data_path__}/profrep/prepared_datasets.txt') &&
	profrep_cls=\$(awk -v var="${custom_data.prepared_dataset}" 'BEGIN{FS="\t"}{if ($1 == var) print $4}' '${__tool_data_path__}/profrep/prepared_datasets.txt') &&
	profrep_annotation=\$(awk -v var="${custom_data.prepared_dataset}" 'BEGIN{FS="\t"}{if ($1 == var) print $5}' '${__tool_data_path__}/profrep/prepared_datasets.txt') &&
#end if

python3 '${__tool_directory__}/profrep.py'
	--query '${input}'
	--output_gff '${ProfGff}'
	--html_file '${html_file}'
	--html_path '${html_file.extra_files_path}'
	--n_gff '${NGff}'
	--protein_domains ${dm.domains_switch}
	--jbrowse_bin "\${JBROWSE_SOURCE_DIR}/bin"
	--log_file '${log_file}'

## Protein domain search: only passed when the switch is enabled.
#if $dm.domains_switch:
	--domain_gff '${DomGff}'
	--protein_database '${__tool_data_path__}/protein_domains/${dm.db_type}_pdb'
	--classification '${__tool_data_path__}/protein_domains/${dm.db_type}_class'
#end if

## Advanced search options: only passed when the section is enabled.
#if $advanced_options.opts:
	--bit_score ${advanced_options.bit_score}
	--word_size ${advanced_options.word_size}
	--e_value '${advanced_options.e_value}'
	--threshold_repeat ${advanced_options.threshold}
	--window ${advanced_options.window}
	--overlap ${advanced_options.overlap}
	#if $advanced_options.dust_filter:
		--dust_filter "yes"
	#else
		--dust_filter "no"
	#end if
#end if

## Annotation data: user-supplied files or a prepared dataset.
#if $custom_data.options_custom_data:
	## Parameters nested in a conditional are referenced by their fully qualified name.
	--reads '${custom_data.reads}'
	--ann_tbl '${custom_data.annotations}'
	--cls '${custom_data.cls}'
	--new_db True
	#if $custom_data.cn.copy_num:
		--copy_numbers ${custom_data.cn.copy_num}
		--genome_size ${custom_data.cn.genome_size}
	#end if
#else
	--db_id ${custom_data.prepared_dataset}
	--copy_numbers ${custom_data.copy_numbers}
	--reads "${__tool_data_path__}/profrep/\$profrep_reads"
	--ann_tbl "${__tool_data_path__}/profrep/\$profrep_annotation"
	--cls "${__tool_data_path__}/profrep/\$profrep_cls"
	--new_db False
#end if
]]></command>

<inputs>
  <!-- Query sequence(s) to annotate. -->
  <param format="fasta" type="data" name="input" label="DNA sequence to annotate" help="Input sequence in multi-fasta format" />

  <!-- Annotation source: a prepared dataset (False) or user-supplied files (True). -->
  <conditional name="custom_data">
    <param name="options_custom_data" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Use custom annotation data" />
    <when value="False">
      <!-- Options come from prepared_datasets.txt: column 1 is the displayed name, column 0 the submitted value. -->
      <param name="prepared_dataset" type="select" label="Choose existing annotation dataset" help="You can find list of all available species below in the Database section">
        <options from_file="profrep/prepared_datasets.txt">
          <column name="name" index="1"/>
          <column name="value" index="0"/>
        </options>
      </param>
      <param name="copy_numbers" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Convert hits to copy numbers" />
    </when>
    <when value="True">
      <param format="fasta" type="data" name="reads" label="NGS reads" help="Input file of fasta-formatted reads sequences" />
      <param format="fasta" type="data" name="cls" label="RE list of clusters and belonging reads (hitsort.cls)" help="fasta-formatted list of all clusters reported by RE and reads identifiers that belong to them" />
      <param format="tabular" type="data" name="annotations" label="Clusters classification" help="Table containing numbers of clusters and corresponding classifications" />
      <!-- The genome size is only requested when hits are converted to copy numbers. -->
      <conditional name="cn">
        <param name="copy_num" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Convert hits to copy numbers" />
        <when value="True">
          <!-- NOTE(review): the default value 0 lies below min, so the user has to enter a real genome size before the form validates; confirm this is intended. -->
          <param name="genome_size" type="float" value="0" min="0.0000001" max="1000000" label="Enter the genome size in Mbp" />
        </when>
      </conditional>
    </when>
  </conditional>

  <!-- Optional protein domain search; the database version is selected from rexdb_versions.txt. -->
  <conditional name="dm">
    <param name="domains_switch" type="boolean" display="checkbox" truevalue="True" falsevalue="False" checked="True" label="Report protein domains"/>
    <when value="True">
      <param name="db_type" type="select" label="Select taxon and protein domain database version (REXdb)" help="">
        <options from_file="rexdb_versions.txt">
          <column name="name" index="0"/>
          <column name="value" index="1"/>
        </options>
      </param>
    </when>
  </conditional>

  <!-- Advanced search options; they are passed to the command only when the switch is enabled. -->
  <conditional name="advanced_options">
    <param name="opts" type="boolean" display="checkbox" truevalue="True" falsevalue="False" checked="False" label="Advanced searching options"/>
    <when value="True">
      <param name="bit_score" type="float" value="50" label="Bitscore" help="Blast filtering option: BITSCORE" />
      <!-- Free-text parameter that ends up on the command line: restrict it to plain or scientific number notation. -->
      <param name="e_value" type="text" value="0.1" label="e-value cut-off" help="Blast filtering option: statistical significance threshold for reporting hits">
        <validator type="regex" message="The e-value must be a non-negative number such as 0.1 or 1e-5">^([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$</validator>
      </param>
      <param name="word_size" type="integer" value="11" min="7" max="20" label="Initial word size" help="Initial word size used by Blast for alignment" />
      <param name="dust_filter" type="boolean" display="checkbox" truevalue="True" falsevalue="False" checked="True" label="Use DUST filter" help="Filters query sequence for low-complexity regions with DUST filter" />
      <param name="window" type="integer" value="5000" min="5000" label="Sliding window size" help="Use when having a long input sequence so that it can be processed in parallel" />
      <param name="overlap" type="integer" value="150" min="150" max="500" label="Windows overlap" help="Must be greater than read length" />
      <param name="threshold" type="integer" value="3" min="1" label="Repetitive threshold" help="Threshold for copy numbers/hits at certain position to be reported as repetitive in GFF format" />
    </when>
  </conditional>
</inputs>

 <outputs>
   <!-- Repetitive regions found in the query. -->
   <data name="ProfGff" format="gff3" label="GFF file of repetitive regions from dataset ${input.hid}" />
   <!-- Protein domains; created only when the domain search is switched on. -->
   <data name="DomGff" format="gff3" label="GFF file of protein domains from dataset ${input.hid}">
     <filter>dm['domains_switch']</filter>
   </data>
   <!-- HTML report; its extra files directory is passed to the command as the html_path argument. -->
   <data name="html_file" format="html" label="HTML report, JBrowse Data Directory from ${input.hid}" />
   <!-- Regions of unspecified (N) bases. -->
   <data name="NGff" format="gff3" label="GFF file of unknown bases (Ns) from dataset ${input.hid}" />
   <!-- Log file written by the run. -->
   <data name="log_file" format="txt" label="Log file" />
 </outputs>

 <help>

	**HELP**

	**Input data**

	1. list of NGS reads
		[RE archive: seqclust -> sequences -> sequences.fasta]
	2. list of all clusters and belonging reads
		[RE archive: seqclust -> clustering -> hitsort.cls]
	3. clusters classification table
		[RE archive: PROFREP_CLASSIFICATION_TEMPLATE.csv (**! automatic classification - needs to be manually adjusted**)]

		*REQUIREMENTS for custom classification table:*

		TAB-separated list of cluster numbers and their repetitive classification. The list does not have to necessarily contain all the clusters. Classification may be an arbitrary custom string, but it is highly desirable to use the standardized format, especially for downstream analysis of the output (ProfRep Refiner Tool):

	- individual classification levels are separated by a pipe character "|"
	- the first classification level is derived from the origin of the repetitive sequence, i.e. repeat, organelle.
	- mobile elements classification should follow protein domains classification
	- for the rest of repeats (e.g. satellites, MITEs) arbitrary custom classification with any number of levels is allowed

		Example::

				42      repeat|mobile_element|Class_I|LTR|Ty1/copia|SIRE
				43	repeat|mobile_element|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Ogre/Tat|TatIV/Ogre
				45      repeat|mobile_element|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila
				48      repeat|satellite|PisTR/B
				134     organelle|plastid

	All the files are available from the RE clustering archive. For Galaxy manipulation you can use the **'Extract Data for Profrep'** tool to extract them. Please keep in mind that the classification table from RepeatExplorer should serve only as a template and is supposed to be manually adjusted. For selected species these files are already available as prepared datasets (at present this option is only available for Pisum sativum Terno; Macas et al. 2015).


	**Principle**

	The main ProfRep tool runs a blastn similarity search of the given DNA against the database of all reads (low-coverage sequencing). The preliminary hits have to pass a quality filter (not too stringent, so that the hits are not too fragmented) based on the BITSCORE parameter (default **50**). These and other search parameters are all adjustable (*Advanced options* in the Galaxy form). The similarity search runs in parallel, which lowers the computing time significantly, especially when working with large input data - by default it uses all the resources available. The parallelization sliding window is set to **5kb** with a **150b** overlap; both parameters are adjustable as well. When changing them, make sure that the overlap is at least as long as the reads so that the hits on the window borders are covered. The hits are sorted into the clusters they belong to and subsequently assigned to the corresponding repetitive class based on the classification table. The numbers of hits per base are recorded for every repeat class separately in the form of a repetitive profile. Hits can be recalculated to copy numbers if the genome size of the species is provided (already included for the prepared species in the Galaxy menu). The profiles are reported in BigWig format to be visualized as graphs (log scale) in JBrowse. This format is binary, so it cannot be inspected directly, but the quantitative information is still available from the text-based Wig files in the output data structure ("data" DIR). For a quick check, the profiles including the domain regions are also shown in the summary HTML report (if the sequence length does not exceed 200kb). The summed profile **ALL** is created based on all individual profiles plus the profiles of all mapped (but unclustered or unclassified) reads, keeping track of the overall sequence representation of repeats.
	The protein domain search is accomplished by the DANTE tool (see below), which by default runs as a ProfRep module (can be switched off). The protein domain outputs are already **filtered** with default quality parameters optimized for Viridiplantae species.

	**Outputs**

		- **HTML summary report, JBrowse Data Directory** showing basic information and repetitive profile graphs as well as protein domains (optional) for individual sequences (up to 50). This output also serves as a data directory for the `JBrowse <https://jbrowse.org/>`_ genome browser. You can create a standalone JBrowse instance for further detailed visualization of the output tracks using the Galaxy-integrated tool. This output can also be downloaded as an archive containing all relevant data for visualization via a locally installed JBrowse server (see more about visualization in OUTPUT VISUALIZATION below)
		- **Ns GFF** - reports unspecified (N) bases regions in the sequence
		- **Repeats GFF** - reports repetitive regions of a certain length (by default **80**) and above the hits/copy numbers threshold (by default **5**)
		- **Domains GFF** - reports protein domains, classification of domain, chain orientation and alignment sequences
		- **Log file**


 </help>

</tool>
author	petr-novak
date	Wed, 26 Jun 2019 08:01:42 -0400
parents
children	22919ea3463c