comparison instagraal.xml @ 0:23d20e5e427d draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/instagraal commit 37f6f76292970c74374aebcf5cfd45de8a1b7595
author bgruening
date Thu, 10 Nov 2022 14:36:10 +0000
parents
children 8d5b13b571fd
comparison
equal deleted inserted replaced
-1:000000000000 0:23d20e5e427d
1 <tool id="instagraal" name="instaGRAAL" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
2 <description>Large genome reassembly based on Hi-C data</description>
3 <macros>
4 <token name="@TOOL_VERSION@">0.1.6</token>
5 <token name="@VERSION_SUFFIX@">0</token>
6 </macros>
7 <xrefs>
8 <xref type="bio.tools">instagraal</xref>
9 </xrefs>
10 <stdio>
11 <regex match="instagraal_class" level="warning" description="instaGRAAL is raising an expected error at the end."/> <!-- reported as a warning only, so this match alone does not fail the job -->
12 <regex match="Exception:"/>
13 <exit_code range="2:"/> <!-- exit code 1 is not covered by either range -->
14 <exit_code range=":-1"/>
15 </stdio>
16 <requirements>
17 <container type="docker">koszullab/instagraal</container>
18 </requirements>
19 <version_command>instagraal --version</version_command>
20 <command>
21 <![CDATA[
22
23 #set gap_string = 'N' * int($number_of_n)
24
25 mkdir -p hic_folder &&
26 mkdir -p ./outputs/ &&
27 ln -s '$abs_fragments_contacts_weighted' hic_folder/abs_fragments_contacts_weighted.txt &&
28 ln -s '$fragments_list' hic_folder/fragments_list.txt &&
29 ln -s '$info_contigs' hic_folder/info_contigs.txt &&
30
31 #if $reference_genome.source == 'history':
32 #set $ref_genome = 'reference.fasta'
33 ln -s -f '${reference_genome.history_item}' '$ref_genome' &&
34 #else:
35 #set $ref_genome = $reference_genome.index.fields.path
36 #end if
37
38 instagraal
39 ./hic_folder
40 '$ref_genome'
41 ./outputs/
42
43 --level $level
44 --cycles $cycles
45 --coverage-std $coverage_std
46 --neighborhood $neighborhood
47
48 $circular
49 $bomb
50 $pyramid_only
51 $simple
52 &&
53
54 mv ./outputs/*/test_*/genome.fasta ./outputs/genome.fasta &&
55 mv ./outputs/*/test_*/info_frags.txt ./outputs/info_frags.txt
56
57 &&
58 instagraal-polish
59 -m polishing
60 -i ./outputs/info_frags.txt
61 -f '$ref_genome'
62 -o ./outputs/curated.fasta
63 -j $gap_string
64
65 ]]>
66 </command>
67 <inputs>
68
69 <param name="abs_fragments_contacts_weighted" type="data" format="tabular" label="Abs fragments contacts weighted"/>
70 <param name="fragments_list" type="data" format="tabular" label="Fragments list"/>
71 <param name="info_contigs" type="data" format="tabular" label="info_contigs"/>
72
73 <conditional name="reference_genome">
74 <param name="source" type="select" label="Source for the reference genome" help="Built-in references were created using default options.">
75 <option value="indexed" selected="true">Use a built-in genome</option>
76 <option value="history">Use a genome from history</option>
77 </param>
78 <when value="indexed">
79 <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team.">
80 <options from_data_table="fasta_indexes">
81 <filter type="sort_by" column="2" />
82 <validator type="no_options" message="No genomes are available for the selected input dataset" />
83 </options>
84 </param>
85 </when>
86 <when value="history">
87 <param name="history_item" type="data" format="fasta" label="Reference genome" help="A reference genome in FASTA format" />
88 </when>
89 </conditional>
90
91 <param argument="--level" type="integer" value="4" min="3" max="6" label="Level (resolution) of the contact map"
92 help="Increasing level by one means a threefold smaller resolution but also a threefold faster computation time."/>
93
94 <param argument="--cycles" type="integer" value="30" min="20" max="100" label="Number of iterations to perform for each bin (row/column of the contact map)"
95 help="A high number of cycles has diminishing returns but there is a necessary minimum for assembly convergence."/>
96
97 <param argument="--coverage-std" type="integer" value="1" min="0" max="1" label="Number of standard deviations below the mean"
98 help="Coverage below which fragments should be filtered out prior to binning." />
99
100 <param argument="--neighborhood" type="integer" value="5" min="0" max="100" label="Number of neighbors to sample for potential mutations for each bin" />
101 <param argument="--circular" type="boolean" truevalue="--circular" falsevalue="" label="Indicates genome is circular" />
103 <param argument="--bomb" type="boolean" truevalue="--bomb" falsevalue="" label="Explode the genome prior to scaffolding" />
104 <param argument="--pyramid-only" type="boolean" truevalue="--pyramid-only" falsevalue="" label="Only build multi-resolution contact maps (pyramids) and don't do any scaffolding" />
105 <param argument="--simple" type="boolean" truevalue="--simple" falsevalue="" label="Only perform operations at the edge of the contigs" />
106 <param name="number_of_n" type="integer" value="10" min="1" label="Number of Ns that you want to include during the polishing step" />
107
108 </inputs>
109 <outputs>
110 <data name="genome" format="fasta" from_work_dir="./outputs/genome.fasta"/> <!-- genome.fasta, moved by the command out of the test_* run directory -->
111 <data name="frags" format="tabular" from_work_dir="./outputs/info_frags.txt"/> <!-- info_frags.txt, moved out of the same run directory and then passed to instagraal-polish as its -i input -->
112 <data name="curated" format="fasta" from_work_dir="./outputs/curated.fasta"/> <!-- file named by the -o option of instagraal-polish -->
113 </outputs>
114 <tests>
115 <test> <!-- reference genome taken from the history; every other parameter is left at its default -->
116 <param name="abs_fragments_contacts_weighted" value="abs_fragments_contacts_weighted.tabular" />
117 <param name="fragments_list" value="fragments_list.tabular" />
118 <param name="info_contigs" value="info_contigs.tabular" />
119 <conditional name="reference_genome">
120 <param name="source" value="history"/>
121 <param name="history_item" value="fake.fasta" />
122 </conditional>
123 <assert_command> <!-- only the generated command line is asserted; no output file contents are compared -->
124 <has_text text="NNNNNNNNNN"/> <!-- 10 long, the default -->
125 <not_has_text text="NNNNNNNNNNN"/> <!-- too long -->
126 <has_text text="--cycles 30" />
127 <has_text text="instagraal" />
128 <has_text text="instagraal-polish" />
129 </assert_command>
130 </test>
131 </tests>
132 <help>
133 <![CDATA[
134
135 Large genome reassembly based on Hi-C data.
136
137 -----------
138 Input files
139 -----------
140
141 instaGRAAL needs three files:
142
143 * A file called *abs_fragments_contacts_weighted*, containing the (sparse) Hi-C map itself. The first line must be id_frag_a id_frag_b n_contact. All subsequent lines must represent the map's contacts in coordinate format (id_frag_a being the row indices, id_frag_b being the column indices, n_contact being the number of contacts between each locus or index pair, e.g. if 5 contacts are found between fragments #2 and #3, there should be a line reading 2 3 5 in the file). n_contact must be an integer. The list should be sorted according to id_frag_a first, then id_frag_b. Fragment ids start at 0.
144 * A file called *fragments_list* containing information related to each fragment of the genome. The first line must be id chrom start_pos end_pos size gc_content, and subsequent lines (representing the fragments themselves) should follow that template. The fields should be self-explanatory; notably, chrom can be any string representing the chromosome's name to which the fragment at a given line belongs, and fragment ids should start over at 1 when the chromosome name changes. Aside from the chrom field and the gc field which is currently unused in this version and can be filled with any value, all fields should be integers. Note that start_pos starts at 0.
145 * A file called *info_contigs* containing information related to each contig/scaffold/chromosome in the genome. The first line must be contig length_kb n_frags cumul_length. Field names should be again self-explanatory; naturally, the contig field must contain names that are consistent with those found in fragments_list.txt. Also, length_kb should be an integer (rounded up or down if need be), and n_frags and cumul_length are supposed to be consistent with each other in that the cumulated length (in fragments) of contig N should be equal to the sum of the fields found in n_frags for the N-1 preceding lines. Note that cumul_length starts at 0.
146
147 All fields (including those in the files' headers) must be separated by tabs and are therefore `tabular` files.
148
149 ------------
150 Output files
151 ------------
152
153 After the scaffolder is done running, whatever path you specified as output will contain a test_mcmc_X directory, where X is the level (resolution) at which scaffolding was performed. This directory, in turn, will contain the following:
154
155 * genome.fasta: the scaffolded genome. Scaffolds will be ordered by increasing size in fragments, which roughly (but not always) translates into increasing size in bp.
156 * info_frags.txt: a file that contains, for each newly formed scaffold, the original coordinates of every single bin in that scaffold, in the format chromosome, id, orientation, start, end. Each bin has a unique ID that provides a convenient way of tracking consecutive stretches. Orientations are relative to one another, and when "-1" is supplied, it is understood that the reverse complement should be taken.
157
158
159 ]]>
160 </help>
161 <citations>
162 <citation type="doi">10.5281/zenodo.3753973</citation>
163 </citations>
164 </tool>