comparison profrep.xml @ 0:a5f1638b73be draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 08:01:42 -0400
parents
children 22919ea3463c
comparison
equal deleted inserted replaced
-1:000000000000 0:a5f1638b73be
1 <tool id="profrep" name="ProfRep" version="1.0.0">
2 <stdio> <!-- error detection rules applied to the job's output streams -->
3 <regex match="Traceback" source="stderr" level="fail" description="Unknown error" /> <!-- level "fail" rule: triggers when stderr matches the text "Traceback" -->
4 </stdio>
5 <description> Tool to identify and visualize the general repetitive profile of a sequence as well as assign repetitive regions to a class from a database of repeats </description>
6 <requirements> <!-- packages declared as dependencies of this tool; only the last two entries carry a version attribute -->
7 <requirement type="package">blast</requirement>
8 <requirement type="package">last</requirement>
9 <requirement type="package">ucsc-wigtobigwig</requirement>
10 <requirement type="package">biopython</requirement>
11 <requirement type="package">numpy</requirement>
12 <requirement type="package">matplotlib</requirement>
13 <requirement type="package">profrep</requirement>
14 <requirement type="package" version="1.0">profrep_databases</requirement>
15 <requirement type="package" version="1.16.4">jbrowse</requirement> <!-- the command template reads JBROWSE_SOURCE_DIR; presumably exported by this package (confirm) -->
16 </requirements>
17 <command>
18 ## For a prepared dataset, look up its reads, cluster and annotation file names (columns 3, 4 and 5 of prepared_datasets.txt, matched on column 1) into shell variables.
19
20 #if not $custom_data.options_custom_data:
21 profrep_reads=\$(awk -v var="${custom_data.prepared_dataset}" 'BEGIN{FS="\t"}{if ($1 == var) print $3}' "${__tool_data_path__}/profrep/prepared_datasets.txt") &amp;&amp;
22 profrep_cls=\$(awk -v var="${custom_data.prepared_dataset}" 'BEGIN{FS="\t"}{if ($1 == var) print $4}' "${__tool_data_path__}/profrep/prepared_datasets.txt") &amp;&amp;
23 profrep_annotation=\$(awk -v var="${custom_data.prepared_dataset}" 'BEGIN{FS="\t"}{if ($1 == var) print $5}' "${__tool_data_path__}/profrep/prepared_datasets.txt") &amp;&amp;
24 #end if
25
26 python3 '${__tool_directory__}/profrep.py' --query '${input}' --output_gff '${ProfGff}' --html_file '${html_file}'
27 --html_path '${html_file.extra_files_path}' --n_gff '${NGff}'
28 --protein_domains ${dm.domains_switch}
29 --jbrowse_bin "\${JBROWSE_SOURCE_DIR}/bin"
30 --log_file '${log_file}'
31
32 #if $dm.domains_switch:
33 --domain_gff '${DomGff}'
34 --protein_database '${__tool_data_path__}/protein_domains/${dm.db_type}_pdb'
35 --classification '${__tool_data_path__}/protein_domains/${dm.db_type}_class'
36 #end if
37
38 #if $advanced_options.opts:
39 --bit_score ${advanced_options.bit_score}
40 --word_size ${advanced_options.word_size}
41 --e_value '${advanced_options.e_value}'
42 --threshold_repeat ${advanced_options.threshold}
43 --window ${advanced_options.window}
44 --overlap ${advanced_options.overlap}
45 #if $advanced_options.dust_filter:
46 --dust_filter "yes"
47 #else
48 --dust_filter "no"
49 #end if
50 #end if
51
52 #if $custom_data.options_custom_data:
53 --reads '${custom_data.reads}'
54 --ann_tbl '${custom_data.annotations}'
55 --cls '${custom_data.cls}'
56 --new_db True
57 #if $custom_data.cn.copy_num:
58 --copy_numbers $custom_data.cn.copy_num
59 --genome_size ${custom_data.cn.genome_size}
60 #end if
61 #else
62 --db_id '${custom_data.prepared_dataset}'
63 --copy_numbers $custom_data.copy_numbers
64 --reads "${__tool_data_path__}/profrep/\$profrep_reads"
65 --ann_tbl "${__tool_data_path__}/profrep/\$profrep_annotation"
66 --cls "${__tool_data_path__}/profrep/\$profrep_cls"
67 --new_db False
68 #end if
69 </command>
70
71 <inputs> <!-- form parameters; their names are the variables used in the command template -->
72 <param format="fasta" type="data" name="input" label="DNA sequence to annotate" help="Input sequence in multi-fasta format" />
73 <conditional name="custom_data" > <!-- False: pick a prepared dataset; True: supply reads, cluster list and classification table -->
74 <param name="options_custom_data" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Use custom annotation data" />
75 <when value="False">
76 <param name="prepared_dataset" type="select" label="Choose existing annotation dataset" help="You can find list of all available species below in the Database section">
77 <options from_file="profrep/prepared_datasets.txt" > <!-- column 1 is shown as the label, column 0 is the submitted value -->
78 <column name="name" index="1"/>
79 <column name="value" index="0"/>
80 </options>
81 </param>
82 <param name="copy_numbers" type="boolean" truevalue="True" falsevalue="False" checked="True" label="Convert hits to copy numbers" />
83 </when>
84 <when value="True">
85 <param format="fasta" type="data" name="reads" label="NGS reads" help="Input file of fasta-formatted reads sequences" />
86 <param format="fasta" type="data" name="cls" label="RE list of clusters and belonging reads (hitsort.cls)" help="fasta-formatted list of all clusters reported by RE and reads identifiers that belong to them" />
87 <param format="tabular" type="data" name="annotations" label="Clusters classification" help="Table containing numbers of clusters and corresponding classifications" />
88 <conditional name="cn"> <!-- genome size is requested only when copy-number conversion is switched on -->
89 <param name="copy_num" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Convert hits to copy numbers" />
90 <when value="True">
91 <param name="genome_size" type="float" value="0" min="0.0000001" max ="1000000" label="Enter the genome size in Mbp" /> <!-- NOTE(review): the default value 0 is below the declared min; presumably meant to force the user to enter a size (confirm) -->
92 </when>
93 </conditional>
94 </when>
95 </conditional>
96 <conditional name="dm" > <!-- the database selector is offered only when domain reporting is on (the default) -->
97 <param name="domains_switch" type="boolean" display="checkbox" truevalue="True" falsevalue="False" checked="True" label="Report protein domains"/>
98 <when value="True">
99 <param name="db_type" type="select" label="Select taxon and protein domain database version (REXdb)" help="">
100 <options from_file="rexdb_versions.txt"> <!-- column 0 is shown as the label, column 1 is the submitted value -->
101 <column name="name" index="0"/>
102 <column name="value" index="1"/>
103 </options>
104 </param>
105 </when>
106 </conditional>
107 <conditional name="advanced_options" > <!-- search tuning; these values are passed to the script only when the box is ticked -->
108 <param name="opts" type="boolean" display="checkbox" truevalue="True" falsevalue="False" checked="False" label="Advanced searching options"/>
109 <when value="True">
110 <param name="bit_score" type="float" value="50" label="Bitscore" help="Blast filtering option: BITSCORE" />
111 <param name="e_value" type="text" value="0.1" label="e-value cut-off" help="Blast filtering option: statistical significance threshold for reporting hits" /> <!-- declared as free text, not as a numeric type -->
112 <param name="word_size" type="integer" value="11" min="7" max="20" label="Initial word size" help="Initial word size used by Blast for alignment" />
113 <param name="dust_filter" type="boolean" display="checkbox" truevalue="True" falsevalue="False" checked="True" label="Use DUST filter" help="Filters query sequence for low-complexity regions with DUST filter" >
114 </param>
115 <param name="window" type="integer" value="5000" min="5000" label="Sliding window size" help="Use when having a long input sequence so that it can be processed in parallel" />
116 <param name="overlap" type="integer" value="150" min="150" max="500" label="Windows overlap" help="Must be greater than read length" />
117 <param name="threshold" type="integer" value="3" min="1" label="Repetitive threshold" help="Threshold for copy numbers/hits at certain position to be reported as repetitive in GFF format" />
118 </when>
119 </conditional>
120 </inputs>
121
122 <outputs> <!-- output datasets; each name is a variable used in the command template -->
123 <data format="gff3" name="ProfGff" label="GFF file of repetitive regions from dataset ${input.hid}" />
124 <data format="gff3" name="DomGff" label="GFF file of protein domains from dataset ${input.hid}" >
125 <filter>dm['domains_switch']</filter> <!-- output is filtered on the "Report protein domains" switch -->
126 </data>
127 <data format="html" name="html_file" label="HTML report, JBrowse Data Directory from ${input.hid}" /> <!-- its extra_files_path is passed to the script as the html_path argument -->
128 <data format="gff3" name="NGff" label="GFF file of unknown bases (Ns) from dataset ${input.hid}" />
129 <data format="txt" name="log_file" label="Log file" />
130
131 </outputs>
132
133 <help>
134
135 **HELP**
136
137 **Input data**
138
139 1. list of NGS reads
140 [RE archive: seqclust -> sequences -> sequences.fasta]
141 2. list of all clusters and belonging reads
142 [RE archive: seqclust -> clustering -> hitsort.cls]
143 3. clusters classification table
144 [RE archive: PROFREP_CLASSIFICATION_TEMPLATE.csv (**! automatic classification - needs to be manually adjusted**)]
145
146 *REQUIREMENTS for custom classification table:*
147
148 TAB-separated list of cluster numbers and their repetitive classification. The list does not have to necessarily contain all the clusters. Classification may be an arbitrary custom string, but it is highly desirable to use the standardized format, especially for downstream analysis of the output (ProfRep Refiner Tool):
149
150 - individual classification levels are separated by a pipe character "|"
151 - the first classification level is derived from the origin of the repetitive sequence, i.e. repeat, organelle.
152 - mobile elements classification should follow protein domains classification
153 - for the rest of repeats (e.g. satellites, MITEs) arbitrary custom classification with any number of levels is allowed
154
155 Example::
156
157 42 repeat|mobile_element|Class_I|LTR|Ty1/copia|SIRE
158 43 repeat|mobile_element|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Ogre/Tat|TatIV/Ogre
159 45 repeat|mobile_element|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila
160 48 repeat|satellite|PisTR/B
161 134 organelle|plastid
162
163 All the files are available from RE clustering archive. For Galaxy manipulation you can use **'Extract Data for Profrep'** tool to extract them. Please keep in mind that classification table from RepeatExplorer should serve as some kind of template and it is supposed to be manually adjusted anyway. For selected species these files will already be available as prepared datasets - at present this option is only available for Pisum sativum Terno (Macas et al 2015).
164
165
166 **Principle**
167
168 The main ProfRep tool runs blastn similarity search on given DNA against the database of all reads (low coverage sequencing). The preliminary hits have to pass quality filter (not too stringent so that the hits are not too fragmented) based on BITSCORE parameter (default **50**). These and other search parameters are all adjustable (*Advanced options* in the Galaxy form). The similarity search runs in parallel, which lowers the computing times significantly, especially when working with large input data - by default it uses all the resources available. The parallelization sliding window is set to **5kb** with **150b** overlap, both parameters are adjustable as well. When changing them, make sure that the overlap is at least the read length so that the hits on borders are covered. The hits are sorted to clusters they belong to and subsequently assigned to a corresponding repetitive class based on the classification table. The hit counts per base are recorded for every repeat class separately in the form of a repetitive profile. Hits can be recalculated to copy numbers if the genome size of the species is provided (for prepared species in the Galaxy menu already included). The profiles are reported in a BigWig format to be visualized as graphs (log scale) in JBrowse. This format is binary, so it cannot be directly checked, but the quantitative information is still available from Wig text-based files in the output data structure ("data" DIR). For a quick check the profiles including the domains regions are also shown in summary HTML report (if the sequence length does not exceed 200kb). The summed profile **ALL** is created based on all individual profiles plus profiles of all mapped (but unclustered or unclassified) reads, keeping track of the overall sequence representation of repeats.
169 Protein domains search is accomplished by DANTE tool (see below), running by default as a ProfRep module (can be switched off). The protein domains outputs are already **filtered** with default quality parameters optimized for Viridiplantae species.
170
171 **Outputs**
172
173 - **HTML summary report, JBrowse Data Directory** showing basic information and repetitive profile graphs as well as protein domains (optional) for individual sequences (up to 50). This output also serves as a data directory for [JBrowse](https://jbrowse.org/) genome browser. You can create a standalone JBrowse instance for further detailed visualization of the output tracks using Galaxy-integrated tool. This output can also be downloaded as an archive containing all relevant data for visualization via locally installed JBrowse server (see more about visualization in OUTPUT VISUALIZATION below)
174 - **Ns GFF** - reports unspecified (N) bases regions in the sequence
175 - **Repeats GFF** - reports repetitive regions of a certain length (by default **80**) and above hits/copy numbers threshold (by default **5**)
176 - **Domains GFF** - reports protein domains, classification of domain, chain orientation and alignment sequences
177 - **Log file**
178
179
180 </help>
181
182 </tool>
183