# HG changeset patch # User crs4 # Date 1413380356 14400 # Node ID 30bd2584b6a09315c6d10f526e8eb4405d789697 # Parent 7698311d4466c6a2a1e500e2663eabf3cc17eb99 Uploaded diff -r 7698311d4466 -r 30bd2584b6a0 cat_paths.xml --- a/cat_paths.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ - - Concatenate all components of a pathset into a single file. - - pydoop - hadoop-galaxy - - - - #if $use_hadoop - dist_cat_paths - #else - cat_paths - #end if - #if $delete_source - --delete-source - #end if - $input_pathset $output_path - - - - - - - - - - - - - - - - - - - - -Datasets represented as pathsets can be split in a number of files. -This tool takes all of them and concatenates them into a single output file. - -In your workflow, you'll need to explicitly set the appropriate data format on the -output dataset with an Action to "Change Datatype". - -"Delete remote input data" option -==================================== -With this option, after the data has been concated into the new Galaxy dataset, -the original files that were referenced by the pathset are deleted. This effectively -tells the action to "move" the data instead of a "copying" it and helps -avoid amassing intermediate data in your Hadoop workspace. - - -"Use Hadoop-based program" option -==================================== - -With this option you will use your entire Hadoop cluster to simultaneously write -multiple parts of the final file. For this to be possible, the Hadoop nodes -must be able to access the Galaxy file space directly. In addition, to achieve -reasonable results the Galaxy workspace should on a parallel shared file system. 
- - diff -r 7698311d4466 -r 30bd2584b6a0 datatypes_conf.xml --- a/datatypes_conf.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ - - - - - - diff -r 7698311d4466 -r 30bd2584b6a0 dist_text_zipper.xml --- a/dist_text_zipper.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Compress lots of text files on Hadoop - - pydoop - hadoop-galaxy - - - - hadoop_galaxy - --input $input_data - --output $output - --executable dist_text_zipper - - - - - - - - - - - - - - - -This is a Pydoop-based distributed text file compression program. - - diff -r 7698311d4466 -r 30bd2584b6a0 hadoop_galaxy-13348e73/cat_paths.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/cat_paths.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,62 @@ + + Concatenate all components of a pathset into a single file. + + pydoop + hadoop-galaxy + + + + #if $use_hadoop + dist_cat_paths + #else + cat_paths + #end if + #if $delete_source + --delete-source + #end if + $input_pathset $output_path + + + + + + + + + + + + + + + + + + + + +Datasets represented as pathsets can be split into a number of files. +This tool takes all of them and concatenates them into a single output file. + +In your workflow, you'll need to explicitly set the appropriate data format on the +output dataset with an Action to "Change Datatype". + +"Delete remote input data" option +==================================== +With this option, after the data has been concatenated into the new Galaxy dataset, +the original files that were referenced by the pathset are deleted. This effectively +tells the action to "move" the data instead of "copying" it and helps +avoid amassing intermediate data in your Hadoop workspace. + + +"Use Hadoop-based program" option +==================================== + +With this option you will use your entire Hadoop cluster to simultaneously write +multiple parts of the final file. 
For this to be possible, the Hadoop nodes +must be able to access the Galaxy file space directly. In addition, to achieve +reasonable results the Galaxy workspace should be on a parallel shared file system. + + diff -r 7698311d4466 -r 30bd2584b6a0 hadoop_galaxy-13348e73/datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/datatypes_conf.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,6 @@ + + + + + + diff -r 7698311d4466 -r 30bd2584b6a0 hadoop_galaxy-13348e73/dist_text_zipper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/dist_text_zipper.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,30 @@ + + Compress lots of text files on Hadoop + + pydoop + hadoop-galaxy + + + + hadoop_galaxy + --input $input_data + --output $output + --executable dist_text_zipper + + + + + + + + + + + + + + + +This is a Pydoop-based distributed text file compression program. + + diff -r 7698311d4466 -r 30bd2584b6a0 hadoop_galaxy-13348e73/make_pathset.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/make_pathset.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,58 @@ + + Create a pathset for a set of files + + pydoop + hadoop-galaxy + + + + make_pathset + #if str($paths.source) == 'tool_input' + --force-local --data-format $paths.datapaths.ext "$output_path" "$paths.datapaths" + #elif str($paths.source) == 'text_box' + #if str($paths.filesystem_select) == "local_fs" + --force-local + #end if + #if $paths.data_format + --data-format "$paths.data_format" + #end if + "$output_path" "$paths.datapaths" + #else + #raise ValueError("BUG!! unknown paths.source value") + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Create a pathset for a set of files to be used as input for Hadoop tools. 
+ + diff -r 7698311d4466 -r 30bd2584b6a0 hadoop_galaxy-13348e73/put_dataset.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/put_dataset.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,43 @@ + + Copy data from Galaxy storage to Hadoop storage. + + pydoop + hadoop-galaxy + + + + put_dataset + #if $workspace != "" + --hadoop-workspace "$workspace" + #end if + #if $use_distcp + --distcp + #end if + "$input_pathset" "$output_path" + + + + + + + + + + + + + + + + + + + + This tool copies data from Galaxy's storage to storage that is suitable for + Hadoop jobs. An example of a use case may be to copy data from the Galaxy server + to HDFS. Whether this tool is required depends on your specific local setup. + + + diff -r 7698311d4466 -r 30bd2584b6a0 hadoop_galaxy-13348e73/split_pathset.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/split_pathset.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,60 @@ + + Split a pathset according to a regular expression criterion + + pydoop + hadoop-galaxy + + + + split_pathset '$criteria_expr' + #if $anchor_end + --anchor-end + #end if + --expand-levels $expand_levels + $input_pathset $output_true $output_false + + + + + + + + + + + + + + + + + + + + + + + + + + + + Splits a pathset according to a regular expression. + + You can have the tool expand the paths in the pathset by a certain number + of levels prior to testing whether it matches the regular expression. + + + **Note**: you can't use '$' in your regular expression. To anchor the + expression to the end of the path use the checkbox. + + + **Note**: the regular expression must match the path from its beginning. 
+ + diff -r 7698311d4466 -r 30bd2584b6a0 hadoop_galaxy-13348e73/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hadoop_galaxy-13348e73/tool_dependencies.xml Wed Oct 15 09:39:16 2014 -0400 @@ -0,0 +1,27 @@ + + + + + + + + + git clone https://github.com/crs4/hadoop-galaxy/ + git reset --hard 0.1.4 + + + + + + $INSTALL_DIR/lib/python + export PYTHONPATH=$INSTALL_DIR/lib/python:$PYTHONPATH && python setup.py install --prefix=$INSTALL_DIR --install-lib=$INSTALL_DIR/lib/python + + $INSTALL_DIR/bin + $INSTALL_DIR/lib/python + + + + + + + diff -r 7698311d4466 -r 30bd2584b6a0 make_pathset.xml --- a/make_pathset.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - Create a pathset for a set of files - - pydoop - hadoop-galaxy - - - - make_pathset - #if str($paths.source) == 'tool_input' - --force-local --data-format $paths.datapaths.ext "$output_path" "$paths.datapaths" - #elif str($paths.source) == 'text_box' - #if str($paths.filesystem_select) == "local_fs" - --force-local - #end if - #if $paths.data_format - --data-format "$paths.data_format" - #end if - "$output_path" "$paths.datapaths" - #else - #raise ValueError("BUG!! unknown paths.source value") - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Create a pathset for a set of files to be used as input for Hadoop tools. - - diff -r 7698311d4466 -r 30bd2584b6a0 put_dataset.xml --- a/put_dataset.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ - - Copy data from Galaxy storage to Hadoop storage. - - pydoop - hadoop-galaxy - - - - put_dataset - #if $workspace != "" - --hadoop-workspace "$workspace" - #end if - #if $use_distcp - --distcp - #end if - "$input_pathset" "$output_path" - - - - - - - - - - - - - - - - - - - - This tools copies data from Galaxy's storage to storage that is suitable for - Hadoop jobs. 
An example of a use case may be to copy data from the Galaxy server - to HDFS. Whether this tool is required depends on your specific local setup. - - - diff -r 7698311d4466 -r 30bd2584b6a0 split_pathset.xml --- a/split_pathset.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - Split a pathset according to a regular expression criteria - - pydoop - hadoop-galaxy - - - - split_pathset '$criteria_expr' - #if $anchor_end - --anchor-end - #end if - --expand-levels $expand_levels - $input_pathset $output_true $output_false - - - - - - - - - - - - - - - - - - - - - - - - - - - - Splits a pathset according to a regular expression. - - You can have the tool expand the paths in the pathset by a certain number - of levels prior to testing whether it matches the regular expression. - - - **Note**: you can't use '$' in your regular expression. To anchor the - expression to the end of the path use the checkbox. - - - *Note*: the regular expression must match the path from its beginning. - - diff -r 7698311d4466 -r 30bd2584b6a0 tool_conf.xml --- a/tool_conf.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ - - -
- - - - - -
-
diff -r 7698311d4466 -r 30bd2584b6a0 tool_dependencies.xml --- a/tool_dependencies.xml Fri May 30 06:48:47 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - - - - - - - - git clone https://github.com/crs4/hadoop-galaxy/ - git reset --hard 0.1.1 - - - - - - $INSTALL_DIR/lib/python - export PYTHONPATH=$INSTALL_DIR/lib/python:$PYTHONPATH && python setup.py install --prefix=$INSTALL_DIR --install-lib=$INSTALL_DIR/lib/python - - $INSTALL_DIR/bin - $INSTALL_DIR/lib/python - - - - - - -