crs4 / hadoop_galaxy: diff cat_paths.xml @ 0:7698311d4466 draft
Uploaded
| author   | crs4                            |
|----------|---------------------------------|
| date     | Fri, 30 May 2014 06:48:47 -0400 |
| parents  |                                 |
| children |                                 |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cat_paths.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,62 @@
+<tool id="hadoop_galaxy_cat_paths" name="Cat paths" version="0.1.0">
+  <description>Concatenate all components of a pathset into a single file.</description>
+  <requirements>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.1">hadoop-galaxy</requirement>
+  </requirements>
+
+  <command>
+    #if $use_hadoop
+      dist_cat_paths
+    #else
+      cat_paths
+    #end if
+    #if $delete_source
+      --delete-source
+    #end if
+    $input_pathset $output_path
+  </command>
+
+  <inputs>
+    <param name="input_pathset" type="data" format="pathset" label="Input pathset">
+      <validator type="empty_field" />
+    </param>
+    <param name="delete_source" type="boolean" checked="false" label="Delete remote input data"
+           help="This option makes the tool move the data rather than copy it" />
+    <param name="use_hadoop" type="boolean" checked="false" label="Use Hadoop-based program"
+           help="The Galaxy workspace must be accessible by the Hadoop cluster (see help for details)" />
+  </inputs>
+
+  <outputs>
+    <!-- TODO: can we read the format from the input pathset and transfer it to the output? -->
+    <data name="output_path" format="data" label="Concatenated dataset $input_pathset.name" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+Datasets represented as pathsets can be split into a number of files.
+This tool takes all of them and concatenates them into a single output file.
+
+In your workflow, you'll need to explicitly set the appropriate data format on the
+output dataset with a "Change Datatype" action.
+
+"Delete remote input data" option
+=================================
+With this option, after the data has been concatenated into the new Galaxy dataset,
+the original files referenced by the pathset are deleted. This effectively makes the
+tool "move" the data instead of copying it, which helps avoid amassing intermediate
+data in your Hadoop workspace.
+
+
+"Use Hadoop-based program" option
+=================================
+
+With this option the tool uses your entire Hadoop cluster to write multiple parts of
+the final file simultaneously. For this to be possible, the Hadoop nodes must be able
+to access the Galaxy file space directly. In addition, to achieve reasonable
+performance, the Galaxy workspace should be on a parallel shared file system.
+  </help>
+</tool>
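
For context, the command template above renders to an invocation of the form
"cat_paths [--delete-source] input_pathset output_path" (or dist_cat_paths when
"Use Hadoop-based program" is checked). Below is a minimal, purely illustrative
sketch of the serial concatenation step. It is not the hadoop-galaxy
implementation: it assumes, as a hypothetical simplification, that the pathset
can be read as a plain text file listing one local file path per line, whereas
the real cat_paths and dist_cat_paths programs (built on Pydoop) also handle
Hadoop file systems.

# cat_paths_sketch.py -- illustrative only, not the hadoop-galaxy implementation.
# Assumption (hypothetical simplification): the pathset is read here as a plain
# text file with one local file path per line; '#' lines are treated as comments.
import argparse
import os
import shutil


def read_paths(pathset_file):
    """Return the component paths referenced by the (simplified) pathset."""
    with open(pathset_file) as f:
        return [line.strip() for line in f
                if line.strip() and not line.startswith('#')]


def cat_paths(pathset_file, output_path, delete_source=False):
    paths = read_paths(pathset_file)
    # Append each component, in order, to the single output file.
    with open(output_path, 'wb') as out:
        for p in paths:
            with open(p, 'rb') as src:
                shutil.copyfileobj(src, out)
    if delete_source:
        # Mirrors the --delete-source flag: turn the copy into a "move" by
        # removing the original component files once they have been written.
        for p in paths:
            os.remove(p)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Concatenate pathset components")
    parser.add_argument('--delete-source', action='store_true')
    parser.add_argument('input_pathset')
    parser.add_argument('output_path')
    args = parser.parse_args()
    cat_paths(args.input_pathset, args.output_path, args.delete_source)

The dist_cat_paths variant selected by "Use Hadoop-based program" distributes the
writing of the output across the cluster rather than looping serially as above,
which is why the Galaxy file space must be reachable from the Hadoop nodes.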