annotate dante_gff_output_filtering.xml @ 4:e27e86406f56 draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 10:23:50 -0400
parents a5f1638b73be
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1 <tool id="domains_filter" name="Protein Domains Filter" version="1.0.0">
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
2 <requirements>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
3 <requirement type="package" version="1.0.0">domains_filter</requirement>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
4 </requirements>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
5 <stdio>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
6 <regex match="Traceback" source="stderr" level="fail" description="Unknown error" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
7 <regex match="error" source="stderr" level="fail" description="Unknown error" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
8 </stdio>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
9 <description> Tool for filtering of gff3 output from DANTE. Filtering can be performed based on domain type and alignment quality. </description>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
10 <command>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
11 python3 '${__tool_directory__}/dante_gff_output_filtering.py' --dom_gff '${DomGff}' --domains_prot_seq '${dom_prot_seq}' --domains_filtered '${dom_filtered}' --selected_dom '${selected_domain}' --th_identity ${th_identity} --th_similarity ${th_similarity} --th_length ${th_length} --interruptions ${interruptions} --max_len_proportion ${th_len_ratio} --element_type '${element_type}'
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
12
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
13 </command>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
14 <inputs>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
15 <param format="gff" type="data" name="DomGff" label="Choose primary GFF3 file of all domains from Protein Domains Finder" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
16 <param name="th_identity" type="float" value="0.35" min="0" max="1" label="Minimum identity" help="Protein sequence identity threshold between input and mapped protein from db [0-1]" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
17 <param name="th_similarity" type="float" value="0.45" min="0" max="1" label="Minimum similarity" help="Protein sequence similarity threshold between input and mapped protein from db [0-1]" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
18 <param name="th_length" type="float" value="0.8" min="0" max="1" label="Minimum alignment length" help="Proportion of the hit length without gaps to the length of the database sequence [0-1]" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
19 <param name="interruptions" type="integer" value="3" min="0" label="Interruptions [frameshifts + stop codons]" help="Tolerance threshold per every starting 100 amino acids of alignment sequence" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
20 <param name="th_len_ratio" type="float" value="1.2" min="0" label="Maximal length proportion" help="Maximal proportion of alignment length to the original length of protein domain from database (including indels)" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
21 <param name="selected_domain" type="select" label="Select protein domain type" >
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
22 <options from_file="select_domain.txt" >
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
23 <column name="name" index="0"/>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
24 <column name="value" index="0"/>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
25 </options>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
26 </param>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
27 <param name="element_type" type="text" value="Ty1/copia" label="Filter based on classification" help="You can use preset options or enter an arbitrary string to filter a certain repetitive element type of any level. It must be a continuous substring in a proper format of Final_Classification attribute of GFF3 file. Classification levels are separated by | character. Filtering is case sensitive">
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
28 <option value="Ty1/copia">Ty1/copia</option>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
29 <option value="Ty3/gypsy">Ty3/gypsy</option>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
30 <option value="Class_I|">Class_I|</option>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
31 <option value="Class_II|">Class_II|</option>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
32 <sanitizer>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
33 <valid initial="string.ascii_letters,string.digits">
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
34 <add value="_" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
35 <add value="/" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
36 <add value="|" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
37 </valid>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
38 </sanitizer>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
39 </param>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
40 </inputs>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
41 <outputs>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
42 <data format="gff3" name="dom_filtered" label="Filtered GFF3 file of ${selected_domain} domains from dataset ${DomGff.hid}" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
43 <data format="fasta" name="dom_prot_seq" label="Protein sequences of ${selected_domain} domains from dataset ${DomGff.hid}" />
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
44
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
45
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
46 </outputs>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
47
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
48 <help>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
49
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
50 **WHAT IT DOES**
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
51
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
52 This tool runs filtering on either the primary GFF3 file of all domains, i.e. the output of the *Protein Domains Finder* tool, or an already filtered GFF3 file. Domains can be filtered based on:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
53
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
54 **Quality of alignment such as**:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
55 - alignment sequence identity
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
56 - alignment similarity
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
57 - alignment proportion length
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
58 - number of interruptions (frameshifts or stop codons) per 100 AA
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
59
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
60 **Protein domain type**
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
61 This filtration is based on "Name" attribute of GFF3 file
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
62
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
63 **Repetitive element classification**
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
64 In the text field you can specify a classification string you wish to filter. This filtration is based on the "Final_Classification" attribute of the GFF file, so it must be in the proper form (classification levels are separated by "|"). You can see which classifications occur in your data by looking at the Classification summary table output. If you leave the field blank, domains of all classifications will be reported.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
65
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
66 All the records containing an ambiguous domain type (e.g. RH/INT) are filtered out automatically. They do not appear in the filtered GFF file, nor are protein sequences derived from these potentially chimeric domains. Optimal results (for general usage) should be reached using the default quality filtering parameters, which are appropriate to find all types of protein domains. Keep in mind that the results should be critically assessed based on your input data anyhow.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
67
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
68
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
69 **OUTPUTS PRODUCED:**
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
70 1. Filtered GFF3 file
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
71 2. Translated protein sequences of the filtered domain regions of the original DNA sequence in FASTA format
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
72
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
73 *Translated sequences are taken from the best alignment (Best_Hit attribute) within a domain region, however this alignment does not necessarily have to cover the whole region reported as a domain in gff file*
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
74
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
75
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
76 </help>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
77 </tool>
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
78