crs4 / hadoop_galaxy: diff cat_paths.xml @ 0:7698311d4466 draft
Uploaded
| author   | crs4                            |
|----------|---------------------------------|
| date     | Fri, 30 May 2014 06:48:47 -0400 |
| parents  |                                 |
| children |                                 |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cat_paths.xml	Fri May 30 06:48:47 2014 -0400
@@ -0,0 +1,62 @@
+<tool id="hadoop_galaxy_cat_paths" name="Cat paths" version="0.1.0">
+  <description>Concatenate all components of a pathset into a single file.</description>
+  <requirements>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.1">hadoop-galaxy</requirement>
+  </requirements>
+
+  <command>
+    #if $use_hadoop
+      dist_cat_paths
+    #else
+      cat_paths
+    #end if
+    #if $delete_source
+      --delete-source
+    #end if
+    $input_pathset $output_path
+  </command>
+
+  <inputs>
+    <param name="input_pathset" type="data" format="pathset" label="Input pathset">
+      <validator type="empty_field" />
+    </param>
+    <param name="delete_source" type="boolean" checked="false" label="Delete remote input data"
+           help="This option makes the tool move the data rather than copy it" />
+    <param name="use_hadoop" type="boolean" checked="false" label="Use Hadoop-based program"
+           help="The Galaxy workspace must be accessible by the Hadoop cluster (see help for details)" />
+  </inputs>
+
+  <outputs>
+    <!-- TODO: can we read the format from the input pathset and transfer it to the output? -->
+    <data name="output_path" format="data" label="Concatenated dataset $input_pathset.name" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+Datasets represented as pathsets can be split into a number of files.
+This tool takes all of them and concatenates them into a single output file.
+
+In your workflow, you'll need to explicitly set the appropriate data format on the
+output dataset with a "Change Datatype" action.
+
+"Delete remote input data" option
+=================================
+With this option, after the data has been concatenated into the new Galaxy dataset,
+the original files referenced by the pathset are deleted. This effectively makes the
+tool "move" the data instead of copying it, which helps avoid amassing intermediate
+data in your Hadoop workspace.
+
+
+"Use Hadoop-based program" option
+=================================
+
+With this option the tool uses your entire Hadoop cluster to write multiple parts of
+the final file simultaneously. For this to be possible, the Hadoop nodes must be able
+to access the Galaxy file space directly. In addition, to achieve reasonable
+performance, the Galaxy workspace should be on a parallel shared file system.
+  </help>
+</tool>
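
For context, the command template above renders to an invocation of the form
"cat_paths [--delete-source] input_pathset output_path" (or dist_cat_paths when
"Use Hadoop-based program" is checked). Below is a minimal, purely illustrative
sketch of the serial concatenation step. It is not the hadoop-galaxy
implementation: it assumes, as a hypothetical simplification, that the pathset
can be read as a plain text file listing one local file path per line, whereas
the real cat_paths and dist_cat_paths programs (built on Pydoop) also handle
Hadoop file systems.

# cat_paths_sketch.py -- illustrative only, not the hadoop-galaxy implementation.
# Assumption (hypothetical simplification): the pathset is read here as a plain
# text file with one local file path per line; '#' lines are treated as comments.
import argparse
import os
import shutil


def read_paths(pathset_file):
    """Return the component paths referenced by the (simplified) pathset."""
    with open(pathset_file) as f:
        return [line.strip() for line in f
                if line.strip() and not line.startswith('#')]


def cat_paths(pathset_file, output_path, delete_source=False):
    paths = read_paths(pathset_file)
    # Append each component, in order, to the single output file.
    with open(output_path, 'wb') as out:
        for p in paths:
            with open(p, 'rb') as src:
                shutil.copyfileobj(src, out)
    if delete_source:
        # Mirrors the --delete-source flag: turn the copy into a "move" by
        # removing the original component files once they have been written.
        for p in paths:
            os.remove(p)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Concatenate pathset components")
    parser.add_argument('--delete-source', action='store_true')
    parser.add_argument('input_pathset')
    parser.add_argument('output_path')
    args = parser.parse_args()
    cat_paths(args.input_pathset, args.output_path, args.delete_source)

The dist_cat_paths variant selected by "Use Hadoop-based program" distributes the
writing of the output across the cluster rather than looping serially as above,
which is why the Galaxy file space must be reachable from the Hadoop nodes.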