# HG changeset patch # User devteam # Date 1390832906 18000 # Node ID 7b1b03c4465d2d1ebd1628e2686f91f49a382f40 Imported from capsule None diff -r 000000000000 -r 7b1b03c4465d gi2taxonomy.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gi2taxonomy.py Mon Jan 27 09:28:26 2014 -0500 @@ -0,0 +1,174 @@ +import sys +import string +import tempfile +import subprocess +from os import path + +# ----------------------------------------------------------------------------------- + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + +# ----------------------------------------------------------------------------------- +def gi_name_to_sorted_list(file_name, gi_col, name_col): + """ Suppose input file looks like this: + a 2 + b 4 + c 5 + d 5 + where column 1 is gi_col and column 0 is name_col + output of this function will look like this: + [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']] + """ + + result = [] + try: + F = open( file_name, 'r' ) + try: + for line in F: + file_cols = string.split(line.rstrip(), '\t') + file_cols[gi_col] = int( file_cols[gi_col] ) + result.append( [ file_cols[gi_col], file_cols[name_col] ] ) + except: + print >>sys.stderr, 'Non numeric GI field...skipping' + + except Exception, e: + stop_err('%s\n' % e) + F.close() + result.sort() + return result + +# ----------------------------------------------------------------------------------- + +def collapse_repeating_gis( L ): + """ Accepts 2-d array of gi-key pairs such as this + L = [ + [gi1, 'key1'], + [gi1, 'key2'], + [gi2','key3'] + ] + + Returns this: + [ [gi1, 'key1', 'key2'], + [gi2, 'key3' ] + ] + + The first value in each sublist MUST be int + """ + gi = [] + i = 0 + result = [] + + try: + for item in L: + if i == 0: + prev = item[0] + + if prev != item[0]: + prev_L = [] + prev_L.append( prev ) + result.append( prev_L + gi ) + prev = item[0] + gi =[] + + gi.append( item[1] ) + i += 1 + + except Exception, e: + stop_err('%s\n' % e) + + prev_L = [] + prev_L.append( prev ) + result.append( prev_L + gi ) + del(L) + return result + +# ----------------------------------------------------------------------------------- + +def get_taxId( gi2tax_file, gi_name_list, out_file ): + """ Maps GI numbers from gi_name_list to TaxId identifiers from gi2tax_file and + prints result to out_file + + gi2tax_file MUST be sorted on GI column + + gi_name_list is a list that look slike this: + [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']] + where the first element of each sublist is a GI number + this list MUST also be sorted on GI + + This function searches through 117,000,000 rows of gi2taxId file from NCBI + in approximately 4 minutes. This time is not dependent on the length of + gi_name_list + """ + + L = gi_name_list.pop(0) + my_gi = L[0] + F = open( out_file, 'w' ) + gi = 0 + for line in file( gi2tax_file ): + line = line.rstrip() + gi, taxId = string.split( line, '\t' ) + gi = int( gi ) + + if gi > my_gi: + try: + while ( my_gi < gi ): + L = gi_name_list.pop(0) + my_gi = L[0] + except: + break + + if gi == my_gi: + for i in range( 1,len( L ) ): + print >>F, '%s\t%s\t%d' % (L[i], taxId, gi) + try: + L = gi_name_list.pop(0) + my_gi = L[0] + except: + break + +# ----------------------------------------------------------------------------------- + + +try: + in_f = sys.argv[1] # input file with GIs + gi_col = int( sys.argv[2] ) - 1 # column in input containing GIs + name_col = int( sys.argv[3] ) - 1 # column containing sequence names + out_f = sys.argv[4] # output file + tool_data = sys.argv[5] +except: + stop_err('Check arguments\n') + +# GI2TAX point to a file produced by concatenation of: +# ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip +# and +# ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip +# a sorting using this command: +# sort -n -k 1 + +GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' ) + +# NAME_FILE and NODE_FILE point to names.dmg and nodes.dmg +# files contained within: +# ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz + +NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' ) +NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' ) + +g2n = gi_name_to_sorted_list(in_f, gi_col, name_col) + +if len(g2n) == 0: + stop_err('No valid GI-containing fields. Please, check your column assignments.\n') + +tb_F = tempfile.NamedTemporaryFile('w') + +get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name ) + +try: + tb_cmd = 'taxBuilder %s %s %s %s' % ( NAME_FILE, NODE_FILE, tb_F.name, out_f ) + retcode = subprocess.call( tb_cmd, shell=True ) + if retcode < 0: + print >>sys.stderr, "Execution of taxBuilder terminated by signal", -retcode +except OSError, e: + print >>sys.stderr, "Execution of taxBuilder2tree failed:", e diff -r 000000000000 -r 7b1b03c4465d gi2taxonomy.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gi2taxonomy.xml Mon Jan 27 09:28:26 2014 -0500 @@ -0,0 +1,102 @@ + + + + taxonomy + + gi2taxonomy.py $input $giField $idField $out_file1 ${GALAXY_DATA_INDEX_DIR} + + + + + + + + + + taxBuilder + + + + + + + + + + + + +.. class:: infomark + +Use *Filter and Sort->Filter* to restrict output of this tool to desired taxonomic ranks. You can also use *Text Manipulation->Cut* to remove unwanted columns from the output. + +------ + +**What it does** + +Fetches taxonomic information for a list of GI numbers (sequences identifiers used by the National Center for Biotechnology Information http://www.ncbi.nlm.nih.gov). + +------- + +**Example** + +Suppose you have BLAST output that looks like this:: + + +-----------------------+----------+----------+-----------------+------------+------+--------+ + | queryId | targetGI | identity | alignmentLength | mismatches | gaps | score | + +-----------------------+----------+----------+-----------------+------------+------+--------+ + | 1L_EYKX4VC01BXWX1_265 | 1430919 | 90.09 | 212 | 15 | 6 | 252.00 | + +-----------------------+----------+----------+-----------------+------------+------+--------+ + +and you want to obtain full taxonomic representation for GIs listed in *targetGI* column. If you set parameters as shown here: + +.. image:: ${static_path}/images/fetchTax.png + + +the tool will generate the following output (you may need to scroll sideways to see the entire line):: + + 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 + 1L_EYKX4VC01BXWX1_265 9606 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n n n Homo n Homo sapiens n 1430919 + +In other words the tool printed *Name column*, *taxonomy Id*, appended 22 columns containing taxonomic ranks from Superkingdom to Subspecies and added *GI* as the last column. Below is a formal definition of the output columns:: + + Column Definition + ------- ------------------------------------------ + 1 Name (specified by 'Name column' dropdown) + 2 GI (specified by 'GI column' dropdown) + 3 root + 4 superkingdom + 5 kingdom + 6 subkingdom + 7 superphylum + 8 phylum + 9 subphylum + 10 superclass + 11 class + 12 subclass + 13 superorder + 14 order + 15 suborder + 16 superfamily + 17 family + 18 subfamily + 19 tribe + 20 subtribe + 21 genus + 22 subgenus + 23 species + 24 subspecies + +------ + +.. class:: warningmark + +**Why do I have these "n" things?** + +Be aware that the NCBI taxonomy (ftp://ftp.ncbi.nih.gov/pub/taxonomy/) this tool relies upon is incomplete. This means that for many species one or more ranks are absent and represented as "**n**". In the above example *subkingdom*, *superphylum* etc. are missing. + + + + + + diff -r 000000000000 -r 7b1b03c4465d test-data/taxonomy2gi-input.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/taxonomy2gi-input.tabular Mon Jan 27 09:28:26 2014 -0500 @@ -0,0 +1,4 @@ +2 9913 +4 9646 +15 9915 +16 9771 diff -r 000000000000 -r 7b1b03c4465d test-data/taxonomy2gi-output.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/taxonomy2gi-output.tabular Mon Jan 27 09:28:26 2014 -0500 @@ -0,0 +1,4 @@ +9913 9913 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria n Ruminantia n Bovidae Bovinae n n Bos n Bos taurus n 2 +9646 9646 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria Carnivora Caniformia n Ursidae n n n Ailuropoda n Ailuropoda melanoleuca n 4 +9915 9915 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria n Ruminantia n Bovidae Bovinae n n Bos n Bos indicus n 15 +9771 9771 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria Cetacea Mysticeti n Balaenopteridae n n n Balaenoptera n Balaenoptera musculus n 16 \ No newline at end of file diff -r 000000000000 -r 7b1b03c4465d tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Jan 27 09:28:26 2014 -0500 @@ -0,0 +1,6 @@ + + + + + +