Repository 'data_manager_fetch_gff'
hg clone https://toolshed.g2.bx.psu.edu/repos/ieguinoa/data_manager_fetch_gff

Changeset 3:cb0fa3584aeb (2018-10-09)
Previous changeset 2:c57bd7f3fb46 (2018-07-10) Next changeset 4:4b10131cb66d (2018-10-09)
Commit message:
Uploaded
modified:
data_manager/data_manager_fetch_gff.py
data_manager/data_manager_fetch_gff.xml
data_manager_conf.xml
b
diff -r c57bd7f3fb46 -r cb0fa3584aeb data_manager/data_manager_fetch_gff.py
--- a/data_manager/data_manager_fetch_gff.py Tue Jul 10 10:55:47 2018 -0400
+++ b/data_manager/data_manager_fetch_gff.py Tue Oct 09 14:32:48 2018 -0400
@@ -93,116 +93,6 @@
     return [ bz2.BZ2File( fh.name, 'rb') ]
 
 
-def sort_fasta( fasta_filename, sort_method, params ):
-    if sort_method is None:
-        return
-    assert sort_method in SORTING_METHODS, ValueError( "%s is not a valid sorting option." % sort_method )
-    return SORTING_METHODS[ sort_method ]( fasta_filename, params )
-
-
-def _move_and_index_fasta_for_sorting( fasta_filename ):
-    unsorted_filename = tempfile.NamedTemporaryFile().name
-    shutil.move( fasta_filename, unsorted_filename )
-    fasta_offsets = {}
-    unsorted_fh = open( unsorted_filename )
-    while True:
-        offset = unsorted_fh.tell()
-        line = unsorted_fh.readline()
-        if not line:
-            break
-        if line.startswith( ">" ):
-            line = line.split( None, 1 )[0][1:]
-            fasta_offsets[ line ] = offset
-    unsorted_fh.close()
-    current_order = map( lambda x: x[1], sorted( map( lambda x: ( x[1], x[0] ), fasta_offsets.items() ) ) )
-    return ( unsorted_filename, fasta_offsets, current_order )
-
-
-def _write_sorted_fasta( sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename ):
-    unsorted_fh = open( unsorted_fasta_filename )
-    sorted_fh = open( sorted_fasta_filename, 'wb+' )
-    
-    for name in sorted_names:
-        offset = fasta_offsets[ name ]
-        unsorted_fh.seek( offset )
-        sorted_fh.write( unsorted_fh.readline() )
-        while True:
-            line = unsorted_fh.readline()
-            if not line or line.startswith( ">" ):
-                break
-            sorted_fh.write( line )
-    unsorted_fh.close()
-    sorted_fh.close()
-
-
-def _sort_fasta_as_is( fasta_filename, params ):
-    return
-
-def _sort_fasta_lexicographical( fasta_filename, params ):
-    ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename )
-    sorted_names = sorted( fasta_offsets.keys() )
-    if sorted_names == current_order:
-        shutil.move( unsorted_filename, fasta_filename )
-    else:
-        _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename )    
-
-
-def _sort_fasta_gatk( fasta_filename, params ):
-    #This method was added by reviewer request.
-    ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename )
-    sorted_names = map( str, range( 1, 23 ) ) + [ 'X', 'Y' ]
-    #detect if we have chrN, or just N
-    has_chr = False
-    for chrom in sorted_names:
-        if "chr%s" % chrom in current_order:
-            has_chr = True
-            break
-    
-    if has_chr:
-        sorted_names = map( lambda x: "chr%s" % x, sorted_names)
-        sorted_names.insert( 0, "chrM" )
-    else:
-        sorted_names.insert( 0, "MT" )
-    sorted_names.extend( map( lambda x: "%s_random" % x, sorted_names ) )
-    
-    existing_sorted_names = []
-    for name in sorted_names:
-        if name in current_order:
-            existing_sorted_names.append( name )
-    for name in current_order:
-        #TODO: confirm that non-canonical names do not need to be sorted specially
-        if name not in existing_sorted_names:
-            existing_sorted_names.append( name )
-    
-    if existing_sorted_names == current_order:
-        shutil.move( unsorted_filename, fasta_filename )
-    else:
-        _write_sorted_fasta( existing_sorted_names, fasta_offsets, fasta_filename, unsorted_filename )
-
-
-def _sort_fasta_custom( fasta_filename, params ):
-    ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename )
-    sorted_names = []
-    for id_repeat in params['param_dict']['sorting']['sequence_identifiers']:
-        sorted_names.append( id_repeat[ 'identifier' ] )
-    handle_not_listed = params['param_dict']['sorting']['handle_not_listed_selector']
-    if handle_not_listed.startswith( 'keep' ):
-        add_list = []
-        for name in current_order:
-            if [... diff content truncated in the original page dump ...]t)
-                return "ftp://%s%s" % (UCSC_FTP_SERVER, ucsc_file_name)
-
-    raise Exception('Unable to determine filename for UCSC Genome for %s: %s' % (ucsc_dbkey, path_contents))
 
 def add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params):
     for data_table_name, data_table_entry in _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ):
@@ -274,20 +135,6 @@
             _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name )
 
 
-def download_from_ucsc( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):
-    url = _get_ucsc_download_address(params, dbkey)
-    fasta_readers = get_stream_reader(urlopen(url), tmp_dir)
-    add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params)
-
-
-def download_from_ncbi( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):
-    NCBI_DOWNLOAD_URL = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' #FIXME: taken from dave's genome manager...why some japan site?
-    requested_identifier = params['param_dict']['reference_source']['requested_identifier']
-    url = NCBI_DOWNLOAD_URL % requested_identifier
-    fasta_readers = get_stream_reader(urlopen(url), tmp_dir)
-    add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params)
-
-
 def download_from_url( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):
     urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) )
     fasta_readers = [ get_stream_reader(urlopen( url ), tmp_dir) for url in urls ]
@@ -295,7 +142,6 @@
 
 
 def download_from_history( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):
-    #TODO: allow multiple FASTA input files
     input_filename = params['param_dict']['reference_source']['input_fasta']
     if isinstance( input_filename, list ):
         fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ]
@@ -405,22 +251,19 @@
     return [  ( DATA_TABLE_NAME, dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ]
 
 
-REFERENCE_SOURCE_TO_DOWNLOAD = dict( ucsc=download_from_ucsc, ncbi=download_from_ncbi, url=download_from_url, history=download_from_history, directory=copy_from_directory )
-
-SORTING_METHODS = dict( as_is=_sort_fasta_as_is, lexicographical=_sort_fasta_lexicographical, gatk=_sort_fasta_gatk, custom=_sort_fasta_custom )
+#REFERENCE_SOURCE_TO_DOWNLOAD = dict( ucsc=download_from_ucsc, ncbi=download_from_ncbi, url=download_from_url, history=download_from_history, directory=copy_from_directory )
+REFERENCE_SOURCE_TO_DOWNLOAD = dict( url=download_from_url, history=download_from_history, directory=copy_from_directory )
+#SORTING_METHODS = dict( as_is=_sort_fasta_as_is, lexicographical=_sort_fasta_lexicographical, gatk=_sort_fasta_gatk, custom=_sort_fasta_custom )
 
 
 def main():
     #Parse Command Line
     parser = optparse.OptionParser()
     parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' )
-    parser.add_option( '-t', '--type', dest='file_type', action='store', type='string', default=None, help='file_type')
     (options, args) = parser.parse_args()
     
     filename = args[0]
-    global DATA_TABLE_NAME
-    if options.file_type == 'representative':
-       DATA_TABLE_NAME= 'representative_gff'
+    #global DATA_TABLE_NAME
     params = loads( open( filename ).read() )
     target_directory = params[ 'output_data' ][0]['extra_files_path']
    os.mkdir( target_directory )
b
diff -r c57bd7f3fb46 -r cb0fa3584aeb data_manager/data_manager_fetch_gff.xml
--- a/data_manager/data_manager_fetch_gff.xml Tue Jul 10 10:55:47 2018 -0400
+++ b/data_manager/data_manager_fetch_gff.xml Tue Oct 09 14:32:48 2018 -0400
[
@@ -2,16 +2,10 @@
     <description>fetching</description>
     <command><![CDATA[
        python "$__tool_directory__"/data_manager_fetch_gff.py "${out_file}"
-       --type $file_type
        --dbkey_description ${ dbkey.get_display_text() }
         
     ]]></command>
     <inputs>
-         <param name="file_type" type="select" label="GFF file with only one representative transcript per gene (for htseq-count use) or full features file">
-                <option value="representative">Representative GFF</option>
-                <option value="full">GFF with complete features</option>
-            </param>

         <param name="dbkey" type="genomebuild" label="DBKEY to assign to data" />
         <param type="text" name="sequence_name" value="" label="Name of sequence" />
         <param type="text" name="sequence_id" value="" label="ID for sequence" />
b
diff -r c57bd7f3fb46 -r cb0fa3584aeb data_manager_conf.xml
--- a/data_manager_conf.xml Tue Jul 10 10:55:47 2018 -0400
+++ b/data_manager_conf.xml Tue Oct 09 14:32:48 2018 -0400
b
@@ -16,20 +16,5 @@
                 </column>
             </output>
         </data_table>
-     <data_table name="representative_gff">
-            <output>
-                <column name="value" />
-                <column name="dbkey" />
-                <column name="name" />
-                <column name="path" output_ref="out_file">
-                    <move type="file">
-                        <source>${path}</source>
-                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">${dbkey}/representative_gff/${path}</target>
-                    </move>
-                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/${dbkey}/representative_gff/${path}</value_translation>
-                    <value_translation type="function">abspath</value_translation>
-                </column>
-            </output>
-        </data_table>
     </data_manager>
 </data_managers>