changeset 0:7b1b03c4465d draft default tip

Imported from capsule None
author devteam
date Mon, 27 Jan 2014 09:28:26 -0500
parents
children
files gi2taxonomy.py gi2taxonomy.xml test-data/taxonomy2gi-input.tabular test-data/taxonomy2gi-output.tabular tool_dependencies.xml
diffstat 5 files changed, 290 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gi2taxonomy.py	Mon Jan 27 09:28:26 2014 -0500
@@ -0,0 +1,174 @@
+import sys
+import string
+import tempfile
+import subprocess
+from os import path
+
+# -----------------------------------------------------------------------------------
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+# -----------------------------------------------------------------------------------
+def gi_name_to_sorted_list(file_name, gi_col, name_col):
+    """ Suppose input file looks like this:
+        a       2
+        b       4
+        c       5
+        d       5
+        where column 1 is gi_col and column 0 is name_col
+        output of this function will look like this:
+        [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']]
+    """
+
+    result = []
+    try:
+        F = open( file_name, 'r' )
+        try:
+            for line in F:
+                file_cols = string.split(line.rstrip(), '\t')
+                file_cols[gi_col] = int(  file_cols[gi_col] )
+                result.append( [ file_cols[gi_col], file_cols[name_col] ] )
+        except:
+            print >>sys.stderr, 'Non numeric GI field...skipping'
+            
+    except Exception, e:
+        stop_err('%s\n' % e)
+    F.close()
+    result.sort()
+    return result   
+
+# -----------------------------------------------------------------------------------
+
+def collapse_repeating_gis( L ):
+    """ Accepts 2-d array of gi-key pairs such as this
+        L = [
+                [gi1, 'key1'],
+                [gi1, 'key2'],
+                [gi2','key3']
+            ]
+
+         Returns this:
+         [      [gi1, 'key1', 'key2'],
+                [gi2, 'key3' ]
+         ]
+         
+         The first value in each sublist MUST be int
+    """
+    gi = []
+    i = 0
+    result = []
+    
+    try:
+        for item in L:
+            if i == 0:
+                prev = item[0]
+            
+            if prev != item[0]:
+                prev_L = []
+                prev_L.append( prev )
+                result.append( prev_L + gi )
+                prev = item[0]
+                gi =[]
+                
+            gi.append( item[1] )
+            i += 1
+            
+    except Exception, e:
+        stop_err('%s\n' % e)
+        
+    prev_L = []
+    prev_L.append( prev )
+    result.append( prev_L + gi )
+    del(L)
+    return result
+
+# -----------------------------------------------------------------------------------
+
+def get_taxId( gi2tax_file, gi_name_list, out_file ):
+    """ Maps GI numbers from gi_name_list to TaxId identifiers from gi2tax_file and
+        prints result to out_file
+
+        gi2tax_file MUST be sorted on GI column
+
+        gi_name_list is a list that look slike this:
+        [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']]
+        where the first element of each sublist is a GI number
+        this list MUST also be sorted on GI
+
+        This function searches through 117,000,000 rows of gi2taxId file from NCBI
+        in approximately 4 minutes. This time is not dependent on the length of
+        gi_name_list
+    """
+
+    L = gi_name_list.pop(0)
+    my_gi = L[0]
+    F = open( out_file, 'w' )
+    gi = 0
+    for line in file( gi2tax_file ):
+        line = line.rstrip()
+        gi, taxId = string.split( line, '\t' )
+        gi = int( gi )
+        
+        if gi > my_gi:
+            try:
+                while ( my_gi < gi ):
+                    L = gi_name_list.pop(0)
+                    my_gi = L[0]
+            except:
+                break
+    
+        if  gi == my_gi:
+            for i in range( 1,len( L ) ):
+                print >>F, '%s\t%s\t%d' % (L[i], taxId, gi)
+            try:
+                L = gi_name_list.pop(0)
+                my_gi = L[0]
+            except:
+                break
+
+# -----------------------------------------------------------------------------------
+
+
+try:
+    in_f          = sys.argv[1]            # input file with GIs
+    gi_col        = int( sys.argv[2] ) - 1 # column in input containing GIs
+    name_col      = int( sys.argv[3] ) - 1 # column containing sequence names
+    out_f         = sys.argv[4]            # output file
+    tool_data     = sys.argv[5]
+except:
+    stop_err('Check arguments\n')
+
+#  GI2TAX point to a file produced by concatenation of:
+#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip
+#  and
+#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip
+#  a sorting using this command:
+#  sort -n -k 1
+
+GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' )
+
+#  NAME_FILE and NODE_FILE point to names.dmg and nodes.dmg
+#  files contained within:
+#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
+
+NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' )
+NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' )
+
+g2n =  gi_name_to_sorted_list(in_f, gi_col, name_col)
+
+if len(g2n) == 0:
+    stop_err('No valid GI-containing fields. Please, check your column assignments.\n')
+
+tb_F = tempfile.NamedTemporaryFile('w')
+
+get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name )
+
+try:
+    tb_cmd = 'taxBuilder %s %s %s %s' % ( NAME_FILE, NODE_FILE, tb_F.name, out_f )
+    retcode = subprocess.call( tb_cmd, shell=True )
+    if retcode < 0:
+        print >>sys.stderr, "Execution of taxBuilder terminated by signal", -retcode
+except OSError, e:
+    print >>sys.stderr, "Execution of taxBuilder2tree failed:", e
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gi2taxonomy.xml	Mon Jan 27 09:28:26 2014 -0500
@@ -0,0 +1,102 @@
+<tool id="Fetch Taxonomic Ranks" name="Fetch taxonomic representation" version="1.1.0">
+  <description></description>
+    <requirements>
+        <requirement type="package" version="1.0.0">taxonomy</requirement>
+    </requirements>
+  <command interpreter="python">gi2taxonomy.py $input $giField $idField $out_file1 ${GALAXY_DATA_INDEX_DIR}</command>
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Show taxonomic representation for"></param>
+    <param name="giField" label="GIs column" type="data_column" data_ref="input" numerical="True" help="select column containing GI numbers"/>
+    <param name="idField" label="Name column" type="data_column" data_ref="input" help="select column containing identifiers you want to include into output"/>
+  </inputs>
+  <outputs>
+    <data format="taxonomy" name="out_file1" />
+  </outputs>
+  <requirements>
+    <requirement type="binary">taxBuilder</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input" ftype="tabular" value="taxonomy2gi-input.tabular"/>
+      <param name="giField" value="1"/>
+      <param name="idField" value="2"/>
+      <output name="out_file1" file="taxonomy2gi-output.tabular"/>
+    </test>
+  </tests>
+
+  <help>
+
+.. class:: infomark
+
+Use *Filter and Sort->Filter* to restrict output of this tool to desired taxonomic ranks. You can also use *Text Manipulation->Cut* to remove unwanted columns from the output.
+
+------
+
+**What it does**
+
+Fetches taxonomic information for a list of GI numbers (sequences identifiers used by the National Center for Biotechnology Information http://www.ncbi.nlm.nih.gov).
+
+-------
+
+**Example**
+
+Suppose you have BLAST output that looks like this::
+  
+   +-----------------------+----------+----------+-----------------+------------+------+--------+
+   | queryId               | targetGI | identity | alignmentLength | mismatches | gaps | score  |
+   +-----------------------+----------+----------+-----------------+------------+------+--------+
+   | 1L_EYKX4VC01BXWX1_265 |  1430919 |    90.09 |             212 |         15 |    6 | 252.00 | 
+   +-----------------------+----------+----------+-----------------+------------+------+--------+
+
+and you want to obtain full taxonomic representation for GIs listed in *targetGI* column. If you set parameters as shown here:
+
+.. image:: ${static_path}/images/fetchTax.png
+
+
+the tool will generate the following output (you may need to scroll sideways to see the entire line)::
+
+  1                     2    3    4         5       6 7 8        9        10            11       12 13               14       15         16          17        18  19  20 21  22  23           24 25
+  1L_EYKX4VC01BXWX1_265 9606 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n  Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n   n   n  Homo n  Homo sapiens n  1430919
+
+In other words the tool printed *Name column*, *taxonomy Id*, appended 22 columns containing taxonomic ranks from Superkingdom to Subspecies and added *GI* as the last column. Below is a formal definition of the output columns::
+
+    Column Definition
+   ------- ------------------------------------------
+         1 Name (specified by 'Name column' dropdown)
+         2 GI   (specified by 'GI column' dropdown)
+         3 root
+         4 superkingdom
+         5 kingdom
+         6 subkingdom
+         7 superphylum
+         8 phylum
+         9 subphylum
+        10 superclass
+        11 class
+        12 subclass
+        13 superorder
+        14 order
+        15 suborder
+        16 superfamily
+        17 family
+        18 subfamily
+        19 tribe
+        20 subtribe
+        21 genus
+        22 subgenus
+        23 species
+        24 subspecies
+
+------
+
+.. class:: warningmark
+
+**Why do I have these "n" things?** 
+
+Be aware that the NCBI taxonomy (ftp://ftp.ncbi.nih.gov/pub/taxonomy/) this tool relies upon is incomplete.  This means that for many species one or more ranks are absent and represented as "**n**". In the above example *subkingdom*, *superphylum* etc. are missing.
+
+
+</help>
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxonomy2gi-input.tabular	Mon Jan 27 09:28:26 2014 -0500
@@ -0,0 +1,4 @@
+2	9913
+4	9646
+15	9915
+16	9771
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxonomy2gi-output.tabular	Mon Jan 27 09:28:26 2014 -0500
@@ -0,0 +1,4 @@
+9913	9913	root	Eukaryota	Metazoa	n	n	Chordata	Craniata	Gnathostomata	Mammalia	n	Laurasiatheria	n	Ruminantia	n	Bovidae	Bovinae	n	n	Bos	n	Bos taurus	n	2
+9646	9646	root	Eukaryota	Metazoa	n	n	Chordata	Craniata	Gnathostomata	Mammalia	n	Laurasiatheria	Carnivora	Caniformia	n	Ursidae	n	n	n	Ailuropoda	n	Ailuropoda melanoleuca	n	4
+9915	9915	root	Eukaryota	Metazoa	n	n	Chordata	Craniata	Gnathostomata	Mammalia	n	Laurasiatheria	n	Ruminantia	n	Bovidae	Bovinae	n	n	Bos	n	Bos indicus	n	15
+9771	9771	root	Eukaryota	Metazoa	n	n	Chordata	Craniata	Gnathostomata	Mammalia	n	Laurasiatheria	Cetacea	Mysticeti	n	Balaenopteridae	n	n	n	Balaenoptera	n	Balaenoptera musculus	n	16
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Jan 27 09:28:26 2014 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="taxonomy" version="1.0.0">
+      <repository changeset_revision="c011d4659306" name="package_taxonomy_1_0_0" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>