Repository 'data_manager_fetch_tx2gene'
hg clone https://toolshed.g2.bx.psu.edu/repos/ieguinoa/data_manager_fetch_tx2gene

Changeset 3:d71f65b854de (2018-10-19)
Previous changeset 2:7d3ffe28ff3f (2018-10-10) Next changeset 4:bacd91d8b05a (2018-10-21)
Commit message:
Uploaded
modified:
data_manager/data_manager_fetch_tx2gene.py
data_manager/data_manager_fetch_tx2gene.xml
tool-data/tx2gene.loc.sample
tool_data_table_conf.xml.sample
added:
data_manager/get_tx2gene_table.R
b
diff -r 7d3ffe28ff3f -r d71f65b854de data_manager/data_manager_fetch_tx2gene.py
--- a/data_manager/data_manager_fetch_tx2gene.py Wed Oct 10 11:44:17 2018 -0400
+++ b/data_manager/data_manager_fetch_tx2gene.py Fri Oct 19 07:36:02 2018 -0400
[
b'@@ -11,6 +11,7 @@\n import zipfile\n import gzip\n import bz2\n+import subprocess\n try:\n     # For Python 3.0 and later\n     from urllib.request import urlopen\n@@ -93,20 +94,35 @@\n     return [ bz2.BZ2File( fh.name, \'rb\') ]\n \n \n-def convert_tx2gene( fasta_filename, file_type, params ):\n-    if file_type is \'tx2gene\':\n+def convert_to_tx2gene( rscript_gff_to_tx2gene, fasta_filename, file_type, params ):\n+    if file_type == \'tx2gene\':\n         return   #no need to extract tx2gene table\n+    #print file_type\n     #If the file is actually a GFF/GTF file then extract the tx2gene\n     gff_temp_filename = tempfile.NamedTemporaryFile().name\n     shutil.move(fasta_filename, gff_temp_filename)\n     args= [\'Rscript\']\n-    args.append(RSCRIPT_GFF_TO_TX2GENE)\n-    args.append(gff_temp_filename)\n-    args.append(fasta_filename)\n+    args.append(rscript_gff_to_tx2gene)\n+    args.extend([\'-x\',gff_temp_filename])\n+    args.extend([\'-o\',fasta_filename])\n+    args.extend([\'-t\',file_type])\n+    tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-stderr" )\n+    return_code = subprocess.call( args=args, shell=False, stderr=tmp_stderr.fileno() )\n+    #return_code = subprocess.call( args=args, shell=False, stderr=None)\n+    if return_code:\n+        tmp_stderr.flush()\n+        tmp_stderr.seek(0)\n+        print >> sys.stderr, "Error in process call"\n+        while True:\n+            chunk = tmp_stderr.read( CHUNK_SIZE )\n+            if not chunk:\n+                break\n+            sys.stderr.write( chunk )\n+        sys.exit( return_code )\n+    tmp_stderr.close()\n \n-    #assert sort_method in SORTING_METHODS, ValueError( "%s is not a valid sorting option." 
% sort_method )\n-    #return SORTING_METHODS[ sort_method ]( fasta_filename, params )\n-    \n+\n+\n def _download_file(start, fh):\n     tmp = tempfile.NamedTemporaryFile()\n     tmp.write(start)\n@@ -143,29 +159,29 @@\n \n \n \n-def add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params):\n-    for data_table_name, data_table_entry in _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ):\n+def add_fasta_to_table(rscript_gff_to_tx2gene, data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params):\n+    for data_table_name, data_table_entry in _stream_fasta_to_file(rscript_gff_to_tx2gene, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ):\n         if data_table_entry:\n             _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name )\n \n \n-def download_from_url( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):\n+def download_from_url(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):\n     urls = filter( bool, map( lambda x: x.strip(), params[\'param_dict\'][\'reference_source\'][\'user_url\'].split( \'\\n\' ) ) )\n     fasta_readers = [ get_stream_reader(urlopen( url ), tmp_dir) for url in urls ]\n-    add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id,sequence_name, params)\n+    add_fasta_to_table(rscript_gff_to_tx2gene,data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id,sequence_name, params)\n \n \n-def download_from_history( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):\n+def download_from_history(rscript_gff_to_tx2gene, data_manager_dict, params, 
target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):\n     #TODO: allow multiple FASTA input files\n     input_filename = params[\'param_dict\'][\'reference_source\'][\'input_fasta\']\n     if isinstance( input_filename, list ):\n         fasta_readers = [ get_stream_reader(open(filename, \'rb\'), tmp_dir) for filename in input_filename ]\n     else:\n         fasta_readers ='..b'_to_tx2gene,data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params)\n \n \n-def copy_from_directory( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):\n+def copy_from_directory(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ):\n     input_filename = params[\'param_dict\'][\'reference_source\'][\'fasta_filename\']\n     create_symlink = params[\'param_dict\'][\'reference_source\'][\'create_symlink\'] == \'create_symlink\'\n     if create_symlink:\n@@ -175,7 +191,7 @@\n             fasta_readers = [ get_stream_reader(open(filename, \'rb\'), tmp_dir) for filename in input_filename ]\n         else:\n             fasta_readers = get_stream_reader(open(input_filename), tmp_dir)\n-        data_table_entries = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params )\n+        data_table_entries = _stream_fasta_to_file(rscript_gff_to_tx2gene, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params )\n     for data_table_name, data_table_entry in data_table_entries:\n         if data_table_entry:\n             _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name )\n@@ -188,7 +204,7 @@\n     return data_manager_dict\n \n \n-def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, close_stream=True ):\n+def _stream_fasta_to_file( 
rscript_gff_to_tx2gene, fasta_stream, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, close_stream=True ):\n     fasta_base_filename = "%s_tx2gene.tab" % sequence_id\n     fasta_filename = os.path.join( target_directory, fasta_base_filename )\n     with open( fasta_filename, \'wb+\' ) as fasta_writer:\n@@ -220,7 +236,7 @@\n             if close_stream:\n                 fasta_stream.close()\n \n-    convert_to_tx2gene( fasta_filename, params[\'param_dict\'][\'file_type\'], params )\n+    convert_to_tx2gene( rscript_gff_to_tx2gene,fasta_filename, params[\'param_dict\'][\'file_type\'], params )\n     return [ ( DATA_TABLE_NAME, dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ]\n \n \n@@ -271,17 +287,17 @@\n     #Parse Command Line\n     parser = optparse.OptionParser()\n     parser.add_option( \'-d\', \'--dbkey_description\', dest=\'dbkey_description\', action=\'store\', type="string", default=None, help=\'dbkey_description\' )\n+    parser.add_option( \'-b\', \'--base_dir\', dest=\'base_dir\', action=\'store\', type=\'string\', default=None, help=\'base_dir\')\n     parser.add_option( \'-t\', \'--type\', dest=\'file_type\', action=\'store\', type=\'string\', default=None, help=\'file_type\')\n     (options, args) = parser.parse_args()\n     \n     filename = args[0]\n     #global DATA_TABLE_NAME\n-    global RSCRIPT_GFF_TO_TX2GENE= os.path.join( options.base_dir, \'tximport.r\')\n-\n+    rscript_gff_to_tx2gene=os.path.join( options.base_dir, \'get_tx2gene_table.R\')\n \n-    if options.file_type == \'gff_gtf\':\n-        #DATA_TABLE_NAME= \'representative_gff\'\n-    else:   #file_type=\'tx2gene\'\n+    #input_type=\'gff_gtf\'\n+    #if options.file_type != \'gff_gtf\':\n+    # \tfile_type=\'tx2gene\'\n         \n     params = loads( open( filename ).read() )\n     target_directory = params[ \'output_data\' ][0][\'extra_files_path\']\n@@ -297,7 +313,7 @@\n     tmp_dir = tempfile.mkdtemp()\n     #Fetch 
the input file\n     try:\n-        REFERENCE_SOURCE_TO_DOWNLOAD[ params[\'param_dict\'][\'reference_source\'][\'reference_source_selector\'] ]( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir)\n+        REFERENCE_SOURCE_TO_DOWNLOAD[ params[\'param_dict\'][\'reference_source\'][\'reference_source_selector\'] ]( rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir)\n     finally:\n         cleanup_before_exit(tmp_dir)\n     #save info to json file\n'
b
diff -r 7d3ffe28ff3f -r d71f65b854de data_manager/data_manager_fetch_tx2gene.xml
--- a/data_manager/data_manager_fetch_tx2gene.xml Wed Oct 10 11:44:17 2018 -0400
+++ b/data_manager/data_manager_fetch_tx2gene.xml Fri Oct 19 07:36:02 2018 -0400
[
@@ -1,5 +1,10 @@
 <tool id="data_manager_fetch_tx2gene" name="Create entries in tx2gene data table" version="0.0.1" tool_type="manage_data">
     <description>fetching</description>
+    <requirements>
+        <requirement type="package" version="1.26.4">bioconductor-genomicfeatures</requirement>
+        <requirement type="package">r-getopt</requirement>
+    </requirements>
+
     <command><![CDATA[
        python "$__tool_directory__"/data_manager_fetch_tx2gene.py "${out_file}"
        --type $file_type
@@ -14,9 +19,10 @@
         <param type="text" name="sequence_id" value="" label="ID for sequence" />
  
         <param name="file_type" type="select" label="Select input type: GFF/GTF file(features will be extracted to create tx2gene table) or transcript to gene table file(tab separated)">
-                <option value="gff_gtf">GFF/GTF file</option>
+                <option value="gtf">GTF file</option>
+                <option value="gff3">GFF3 file</option>
                 <option value="tx2gene">tx2gene</option>
-            </param>
+        </param>
  <conditional name="reference_source">
      <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
  <option value="url">URL</option>
b
diff -r 7d3ffe28ff3f -r d71f65b854de data_manager/get_tx2gene_table.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/get_tx2gene_table.R Fri Oct 19 07:36:02 2018 -0400
[
@@ -0,0 +1,17 @@
+library(getopt)
+# Build a transcript-to-gene (tx2gene) table from a GFF/GTF annotation file and write it to a text file.
+# Options are read from the default source, commandArgs(TRUE); all three are declared as character values.
+spec <- matrix(c(
+  "input_type", "t", 1, "character",  # -t: annotation format, forwarded to makeTxDbFromGFF(format = ...)
+  "outfile", "o", 1, "character",  # -o: path the resulting table is written to
+  "gtfFile", "x", 1, "character"),  # -x: path of the annotation file to read
+  byrow=TRUE, ncol=4)
+opt <- getopt(spec)
+# GenomicFeatures is attached with its package startup messages suppressed.
+suppressPackageStartupMessages({library("GenomicFeatures")})
+txdb <- makeTxDbFromGFF(opt$gtfFile, format=opt$input_type)
+k <- keys(txdb, keytype = "GENEID")  # the GENEID keys present in the TxDb
+df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME")  # TXNAME values looked up for those gene IDs
+tx2gene <- df[, 2:1] # reverse the two columns: transcript ID first, then gene ID
+write.table(tx2gene,file = opt$outfile, quote = FALSE, sep = " ",row.names = FALSE,col.names = FALSE)
+# NOTE(review): the table is written without quotes, row names or a header, and with sep = " " as shown here; the Python caller names this file "<sequence_id>_tx2gene.tab", so confirm whether a tab separator was intended.
b
diff -r 7d3ffe28ff3f -r d71f65b854de tool-data/tx2gene.loc.sample
--- a/tool-data/tx2gene.loc.sample Wed Oct 10 11:44:17 2018 -0400
+++ b/tool-data/tx2gene.loc.sample Fri Oct 19 07:36:02 2018 -0400
b
@@ -1,3 +1,3 @@
 #The tx2gene.loc file has this format:
 #
-#<unique_build_id> <dbkey> <display_name> <path_to_gff_file>
+#<unique_build_id> <dbkey> <display_name> <path_to_tx2gene_file>
b
diff -r 7d3ffe28ff3f -r d71f65b854de tool_data_table_conf.xml.sample
--- a/tool_data_table_conf.xml.sample Wed Oct 10 11:44:17 2018 -0400
+++ b/tool_data_table_conf.xml.sample Fri Oct 19 07:36:02 2018 -0400
b
@@ -1,4 +1,4 @@
 <?xml version="1.0"?>
 <tables>
- <table name="tx2gene_table" comment_char="#" allow_duplicate_entries="False"><columns>value, dbkey, name, path</columns><file path="tool-data/tx2gene.loc" /></table>
+ <table name="tx2gene" comment_char="#" allow_duplicate_entries="False"><columns>value, dbkey, name, path</columns><file path="tool-data/tx2gene.loc" /></table>
 </tables>