Repository 'data_manager_fetch_genome_dbkeys_all_fasta'
hg clone https://toolshed.g2.bx.psu.edu/repos/devteam/data_manager_fetch_genome_dbkeys_all_fasta

Changeset 3:86fa71e9b427 (2016-08-26)
Previous changeset 2:776bb1b478a0 (2015-10-14) Next changeset 4:60994ca04177 (2017-04-04)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/data_managers/data_manager_fetch_genome_dbkeys_all_fasta commit 9df6291d7d65c5274c4fd35775108eae58ea6c23
modified:
data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py
data_manager/data_manager_fetch_genome_all_fasta_dbkeys.xml
added:
test-data/test.tar
test-data/test.tar.bz2
test-data/test.tar.gz
test-data/test.zip
b
diff -r 776bb1b478a0 -r 86fa71e9b427 data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py
--- a/data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py Wed Oct 14 13:46:35 2015 -0400
+++ b/data_manager/data_manager_fetch_genome_all_fasta_dbkeys.py Fri Aug 26 12:46:47 2016 -0400
[
b'@@ -6,27 +6,37 @@\n import tempfile\n import shutil\n import optparse\n-import urllib2\n-#import uuid\n from ftplib import FTP\n import tarfile\n import zipfile\n import gzip\n import bz2\n-\n+try:\n+    # For Python 3.0 and later\n+    from urllib.request import urlopen\n+    from io import BytesIO as StringIO\n+    from io import UnsupportedOperation\n+except ImportError:\n+    # Fall back to Python 2\'s urllib2\n+    from urllib2 import urlopen\n+    from StringIO import StringIO\n+    UnsupportedOperation = AttributeError\n from json import loads, dumps\n \n \n-CHUNK_SIZE = 2**20 #1mb\n+CHUNK_SIZE = 2**20  # 1mb\n+\n \n def cleanup_before_exit( tmp_dir ):\n     if tmp_dir and os.path.exists( tmp_dir ):\n         shutil.rmtree( tmp_dir )\n \n+\n def stop_err(msg):\n     sys.stderr.write(msg)\n     sys.exit(1)\n-    \n+\n+\n def get_dbkey_dbname_id_name( params, dbkey_description=None ):\n     dbkey = params[\'param_dict\'][\'dbkey_source\'][\'dbkey\']\n     #TODO: ensure sequence_id is unique and does not already appear in location file\n@@ -48,28 +58,38 @@\n             sequence_name = dbkey\n     return dbkey, dbkey_name, sequence_id, sequence_name\n \n+\n def _get_files_in_ftp_path( ftp, path ):\n     path_contents = []\n     ftp.retrlines( \'MLSD %s\' % ( path ), path_contents.append )\n     return [ line.split( \';\' )[ -1 ].lstrip() for line in path_contents ]\n \n-def _get_stream_readers_for_tar( file_obj, tmp_dir ):\n-    fasta_tar = tarfile.open( fileobj=file_obj, mode=\'r:*\' )\n-    return filter( lambda x: x is not None, [ fasta_tar.extractfile( member ) for member in fasta_tar.getmembers() ] )\n+\n+def _get_stream_readers_for_tar( fh, tmp_dir ):\n+    fasta_tar = tarfile.open( fileobj=fh, mode=\'r:*\' )\n+    return [x for x in [fasta_tar.extractfile(member) for member in fasta_tar.getmembers()] if x]\n+\n \n-def _get_stream_readers_for_zip( file_obj, tmp_dir ):\n-    fasta_zip = zipfile.ZipFile( file_obj, \'r\' )\n+def _get_stream_readers_for_zip( fh, tmp_dir ):\n+    """\n+    Unpacks all archived files in a zip file.\n+    Individual files will be concatenated (in _stream_fasta_to_file)\n+    """\n+    fasta_zip = zipfile.ZipFile( fh, \'r\' )\n     rval = []\n     for member in fasta_zip.namelist():\n         fasta_zip.extract( member, tmp_dir )\n         rval.append( open( os.path.join( tmp_dir, member ), \'rb\' ) )\n     return rval\n \n-def _get_stream_readers_for_gzip( file_obj, tmp_dir ):\n-    return [ gzip.GzipFile( fileobj=file_obj, mode=\'rb\' ) ]\n+\n+def _get_stream_readers_for_gzip( fh, tmp_dir ):\n+    return [ gzip.GzipFile( fileobj=fh, mode=\'rb\') ]\n \n-def _get_stream_readers_for_bz2( file_obj, tmp_dir ):\n-    return [ bz2.BZ2File( file_obj.name, \'rb\' ) ]\n+\n+def _get_stream_readers_for_bz2( fh, tmp_dir ):\n+    return [ bz2.BZ2File( fh.name, \'rb\') ]\n+\n \n def sort_fasta( fasta_filename, sort_method, params ):\n     if sort_method is None:\n@@ -77,6 +97,7 @@\n     assert sort_method in SORTING_METHODS, ValueError( "%s is not a valid sorting option." % sort_method )\n     return SORTING_METHODS[ sort_method ]( fasta_filename, params )\n \n+\n def _move_and_index_fasta_for_sorting( fasta_filename ):\n     unsorted_filename = tempfile.NamedTemporaryFile().name\n     shutil.move( fasta_filename, unsorted_filename )\n@@ -94,6 +115,7 @@\n     current_order = map( lambda x: x[1], sorted( map( lambda x: ( x[1], x[0] ), fasta_offsets.items() ) ) )\n     return ( unsorted_filename, fasta_offsets, current_order )\n \n+\n def _write_sorted_fasta( sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename ):\n     unsorted_fh = open( unsorted_fasta_filename )\n     sorted_fh = open( sorted_fasta_filename, \'wb+\' )\n@@ -110,6 +132,7 @@\n     unsorted_fh.close()\n     sorted_fh.close()\n \n+\n def _sort_fasta_as_is( fasta_filename, params ):\n     return\n \n@@ -121,6 +144,7 @@\n     else:\n         _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename )    \n \n+\n def _sort_fasta_gatk( fasta_filename, params ):\n     #This method was added by reviewer request.\n     ( unsorted_filename, fasta_offs'..b'am[0]\n-    \n-    if isinstance( fasta_stream, list ):\n-        last_char = None\n-        for fh in fasta_stream:\n-            if last_char not in [ None, \'\\n\', \'\\r\' ]:\n-                fasta_writer.write( \'\\n\' )\n+    with open( fasta_filename, \'wb+\' ) as fasta_writer:\n+\n+        if isinstance( fasta_stream, list ) and len( fasta_stream ) == 1:\n+            fasta_stream = fasta_stream[0]\n+\n+        if isinstance( fasta_stream, list ):\n+            last_char = None\n+            for fh in fasta_stream:\n+                if last_char not in [ None, \'\\n\', \'\\r\', b\'\\n\', b\'\\r\' ]:\n+                    fasta_writer.write( b\'\\n\' )\n+                while True:\n+                    data = fh.read( CHUNK_SIZE )\n+                    if data:\n+                        fasta_writer.write( data )\n+                        last_char = data[-1]\n+                    else:\n+                        break\n+                if close_stream:\n+                    fh.close()\n+        else:\n             while True:\n-                data = fh.read( CHUNK_SIZE )\n+                data = fasta_stream.read( CHUNK_SIZE )\n                 if data:\n                     fasta_writer.write( data )\n-                    last_char = data[-1]\n                 else:\n                     break\n             if close_stream:\n-                fh.close()\n-    else:\n-        while True:\n-            data = fasta_stream.read( CHUNK_SIZE )\n-            if data:\n-                fasta_writer.write( data )\n-            else:\n-                break\n-        if close_stream:\n-            fasta_stream.close()\n-    \n-    fasta_writer.close()\n-    \n+                fasta_stream.close()\n+\n     sort_fasta( fasta_filename, params[\'param_dict\'][\'sorting\'][\'sort_selector\'], params )\n     \n     dbkey_dict = None\n@@ -335,6 +368,7 @@\n     \n     return [ ( \'__dbkeys__\', dbkey_dict ), ( \'all_fasta\', dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ]\n \n+\n def compute_fasta_length( fasta_file, out_file, keep_first_word=False ):\n \n     infile = fasta_file\n@@ -367,6 +401,7 @@\n     out.write( "%s\\t%d\\n" % ( fasta_title[ 1: ], seq_len ) )\n     out.close()\n \n+\n def _create_symlink( input_filename, target_directory, dbkey, dbkey_name, sequence_id, sequence_name ):\n     fasta_base_filename = "%s.fa" % sequence_id\n     fasta_filename = os.path.join( target_directory, fasta_base_filename )\n@@ -382,12 +417,11 @@\n     return [ ( \'__dbkeys__\', dbkey_dict ), ( \'all_fasta\', dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ]\n \n \n-\n-\n REFERENCE_SOURCE_TO_DOWNLOAD = dict( ucsc=download_from_ucsc, ncbi=download_from_ncbi, url=download_from_url, history=download_from_history, directory=copy_from_directory )\n \n SORTING_METHODS = dict( as_is=_sort_fasta_as_is, lexicographical=_sort_fasta_lexicographical, gatk=_sort_fasta_gatk, custom=_sort_fasta_custom )\n \n+\n def main():\n     #Parse Command Line\n     parser = optparse.OptionParser()\n@@ -405,11 +439,16 @@\n     \n     if dbkey in [ None, \'\', \'?\' ]:\n         raise Exception( \'"%s" is not a valid dbkey. You must specify a valid dbkey.\' % ( dbkey ) )\n-    \n+\n+    # Create a tmp_dir, in case a zip file needs to be uncompressed\n+    tmp_dir = tempfile.mkdtemp()\n     #Fetch the FASTA\n-    REFERENCE_SOURCE_TO_DOWNLOAD[ params[\'param_dict\'][\'reference_source\'][\'reference_source_selector\'] ]( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name )\n-    \n+    try:\n+        REFERENCE_SOURCE_TO_DOWNLOAD[ params[\'param_dict\'][\'reference_source\'][\'reference_source_selector\'] ]( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir )\n+    finally:\n+        cleanup_before_exit(tmp_dir)\n     #save info to json file\n-    open( filename, \'wb\' ).write( dumps( data_manager_dict ) )\n+    open( filename, \'wb\' ).write( dumps( data_manager_dict ).encode() )\n         \n-if __name__ == "__main__": main()\n+if __name__ == "__main__":\n+    main()\n'
b
diff -r 776bb1b478a0 -r 86fa71e9b427 data_manager/data_manager_fetch_genome_all_fasta_dbkeys.xml
--- a/data_manager/data_manager_fetch_genome_all_fasta_dbkeys.xml Wed Oct 14 13:46:35 2015 -0400
+++ b/data_manager/data_manager_fetch_genome_all_fasta_dbkeys.xml Fri Aug 26 12:46:47 2016 -0400
[
b'@@ -1,86 +1,77 @@\n-<tool id="data_manager_fetch_genome_all_fasta_dbkey" name="Create DBKey and Reference Genome" version="0.0.1" tool_type="manage_data">\n+<tool id="data_manager_fetch_genome_all_fasta_dbkey" name="Create DBKey and Reference Genome" version="0.0.2" tool_type="manage_data">\n     <description>fetching</description>\n-    <command interpreter="python">data_manager_fetch_genome_all_fasta_dbkeys.py "${out_file}" \n-    #if str( $dbkey_source.dbkey_source_selector ) == \'existing\':\n-    --dbkey_description ${ dbkey_source.dbkey.get_display_text() }\n-    #else\n-    --dbkey_description "${ dbkey_source.dbkey_name or $dbkey_source.dbkey }"\n-    #end if\n-    \n-    </command>\n+    <command><![CDATA[\n+       python "$__tool_directory__"/data_manager_fetch_genome_all_fasta_dbkeys.py "${out_file}"\n+       #if str( $dbkey_source.dbkey_source_selector ) == \'existing\':\n+       --dbkey_description ${ dbkey_source.dbkey.get_display_text() }\n+       #else\n+       --dbkey_description "${ dbkey_source.dbkey_name or $dbkey_source.dbkey }"\n+       #end if\n+    ]]></command>\n     <inputs>\n         <conditional name="dbkey_source">\n-          <param name="dbkey_source_selector" type="select" label="Use existing dbkey or create a new one.">\n-            <option value="existing" selected="True">Existing</option>\n-            <option value="new">New</option>\n-          </param>\n-          <when value="existing">\n-              <param name="dbkey" type="genomebuild" label="DBKEY to assign to data" />\n-          </when>\n-          <when value="new">\n-              <param type="text" name="dbkey" value="" label="dbkey" optional="False" />\n-              <param type="text" name="dbkey_name" value="" label="Display name for dbkey" />\n-          </when>\n+            <param name="dbkey_source_selector" type="select" label="Use existing dbkey or create a new one.">\n+                <option value="existing" selected="True">Existing</option>\n+                <option value="new">New</option>\n+            </param>\n+            <when value="existing">\n+                <param name="dbkey" type="genomebuild" label="DBKEY to assign to data" />\n+            </when>\n+            <when value="new">\n+                <param type="text" name="dbkey" value="" label="dbkey" optional="False" />\n+                <param type="text" name="dbkey_name" value="" label="Display name for dbkey" />\n+            </when>\n         </conditional>\n-        \n         <param type="text" name="sequence_name" value="" label="Name of sequence" />\n         <param type="text" name="sequence_id" value="" label="ID for sequence" />\n         <conditional name="reference_source">\n-          <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">\n-            <option value="ucsc" selected="True">UCSC</option>\n-            <option value="ncbi">NCBI</option>\n-            <option value="url">URL</option>\n-            <option value="history">History</option>\n-            <option value="directory">Directory on Server</option>\n-          </param>\n-          <when value="ucsc">\n-            <param type="text" name="requested_dbkey" value="" label="UCSC\'s DBKEY for source FASTA" optional="False" />\n-          </when>\n-          <when value="ncbi">\n-            <param type="text" name="requested_identifier" value="" label="NCBI identifier" optional="False" />\n-          </when>\n-          <when value="url">\n-            <param type="text" area="True" name="user_url" value="http://" label="URLs" optional="False" />\n-          </when>\n-          <when value="history">\n-            <param name="input_fasta" type="data" format="fasta" label="FASTA File" multiple="False" optional="False" />\n-          </when>\n-          <when value="directory">\n-            <param type="text" name="fasta_filename" value="" label="Full path to FASTA File on disk" optional="False" />\n-            <param type="boolean" name="create_symlink" truevalue="create_symlink" falseval'..b'value="url">URL</option>\n+                <option value="history">History</option>\n+                <option value="directory">Directory on Server</option>\n+            </param>\n+            <when value="ucsc">\n+                <param type="text" name="requested_dbkey" value="" label="UCSC\'s DBKEY for source FASTA" optional="False" />\n+            </when>\n+            <when value="ncbi">\n+                <param type="text" name="requested_identifier" value="" label="NCBI identifier/accession" help="Identifiers (e.g 667699573) or accessions (e.g AC020606.7) may be used" optional="False" />\n+            </when>\n+            <when value="url">\n+                <param type="text" area="True" name="user_url" value="http://" label="URLs" optional="False" />\n+            </when>\n+            <when value="history">\n+                <param name="input_fasta" type="data" format="fasta" label="FASTA File" multiple="False" optional="False" />\n+            </when>\n+            <when value="directory">\n+                <param type="text" name="fasta_filename" value="" label="Full path to FASTA File on disk" optional="False" />\n+                <param type="boolean" name="create_symlink" truevalue="create_symlink" falsevalue="copy_file" label="Create symlink to original data instead of copying" checked="False" />\n+            </when>\n         </conditional>\n         <conditional name="sorting">\n-          <param name="sort_selector" type="select" label="Sort by chromosome name">\n-            <option value="as_is" selected="True">As is</option>\n-            <option value="lexicographical">Lexicographical</option>\n-            <option value="gatk">GATK</option>\n-            <option value="custom">Custom</option>\n-          </param>\n-          <when value="as_is">\n-          </when>\n-          <when value="lexicographical">\n-          </when>\n-          <when value="gatk">\n-          </when>\n-          <when value="custom">\n-            <repeat name="sequence_identifiers" title="Sequence Identifiers" min="1" default="1">\n-                <param type="text" name="identifier" value="" label="Sequence Identifier" optional="False" />\n-            </repeat>\n-            <conditional name="handle_not_listed">\n+            <param name="sort_selector" type="select" label="Sort by chromosome name">\n+                <option value="as_is" selected="True">As is</option>\n+                <option value="lexicographical">Lexicographical</option>\n+                <option value="gatk">GATK</option>\n+                <option value="custom">Custom</option>\n+            </param>\n+            <when value="as_is">\n+            </when>\n+            <when value="lexicographical">\n+            </when>\n+            <when value="gatk">\n+            </when>\n+            <when value="custom">\n+                <repeat name="sequence_identifiers" title="Sequence Identifiers" min="1" default="1">\n+                    <param type="text" name="identifier" value="" label="Sequence Identifier" optional="False" />\n+                </repeat>\n                 <param name="handle_not_listed_selector" type="select" label="How to handle non-specified Identifiers">\n                     <option value="discard" selected="True">Discard</option>\n                     <option value="keep_append">Keep and Append</option>\n                     <option value="keep_prepend">Keep and Prepend</option>\n                 </param>\n-                <when value="discard">\n-                </when>\n-                <when value="keep_append">\n-                </when>\n-                <when value="keep_prepend">\n-                </when>\n-            </conditional>\n-          </when>\n+            </when>\n         </conditional>\n     </inputs>\n     <outputs>\n@@ -110,7 +101,7 @@\n \n .. class:: infomark\n \n-**Notice:** If you leave name, description, or id blank, it will be generated automatically. \n+**Notice:** If you leave name, description, or id blank, it will be generated automatically.\n \n     </help>\n </tool>\n'
b
diff -r 776bb1b478a0 -r 86fa71e9b427 test-data/test.tar
b
Binary file test-data/test.tar has changed
b
diff -r 776bb1b478a0 -r 86fa71e9b427 test-data/test.tar.bz2
b
Binary file test-data/test.tar.bz2 has changed
b
diff -r 776bb1b478a0 -r 86fa71e9b427 test-data/test.tar.gz
b
Binary file test-data/test.tar.gz has changed
b
diff -r 776bb1b478a0 -r 86fa71e9b427 test-data/test.zip
b
Binary file test-data/test.zip has changed