Mercurial > repos > devteam > find_diag_hits
changeset 0:acf51ff24c7d draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Mon, 27 Jan 2014 09:25:21 -0500 |
parents | |
children | |
files | find_diag_hits.py find_diag_hits.xml test-data/find_diag_hits.tabular test-data/taxonomyGI.taxonomy tool_dependencies.xml |
diffstat | 5 files changed, 282 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/find_diag_hits.py Mon Jan 27 09:25:21 2014 -0500 @@ -0,0 +1,170 @@ +#!/usr/bin/env python + +""" +tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file> + finds reads that only hit one taxonomic group. For example, consider the folliowing example: + + read1 mammalia + read1 insecta + read2 insecta + + in this case only read2 will be selected becuase it stays within insecta + + This program takes the following options: + + file in taxonomy format - dataset that complies with Galaxy's taxonomy format + id column - integer specifying the number of column containing seq id (starting with 1) + taxonomic ranks - a comma separated list of ranks from this list: + + superkingdom + kingdom + subkingdom + superphylum + phylum + subphylum + superclass + class + subclass + superorder + order + suborder + superfamily + family + subfamily + tribe + subtribe + genus + subgenus + species + subspecies + + output format - reads or counts + +""" + +from galaxy import eggs +import pkg_resources +pkg_resources.require( 'pysqlite' ) +from pysqlite2 import dbapi2 as sqlite +import string, sys, tempfile + +# This dictionary maps taxonomic ranks to fields of Taxonomy file +taxRank = { + 'root' :2, + 'superkingdom':3, + 'kingdom' :4, + 'subkingdom' :5, + 'superphylum' :6, + 'phylum' :7, + 'subphylum' :8, + 'superclass' :9, + 'class' :10, + 'subclass' :11, + 'superorder' :12, + 'ord' :13, + 'suborder' :14, + 'superfamily' :15, + 'family' :16, + 'subfamily' :17, + 'tribe' :18, + 'subtribe' :19, + 'genus' :20, + 'subgenus' :21, + 'species' :22, + 'subspecies' :23, + 'order' :13 + } + + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + + +db = tempfile.NamedTemporaryFile('w') + +try: + con = sqlite.connect(db.name) + cur = con.cursor() +except: + stop_err('Cannot connect to %s\n') % db.name + +try: + tax_file = open(sys.argv[1], 'r') + id_col = int(sys.argv[2]) - 1 + taxa = string.split(sys.argv[3].rstrip(),',') + + if sys.argv[4] == 'reads': + out_format = True + elif sys.argv[4] == 'counts': + out_format = False + else: + stop_err('Please specify "reads" or "counts" for output format\n') + out_file = open(sys.argv[5], 'w') + +except: + stop_err('Check arguments\n') + +if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n') + +sql = "" +for i in range(len(taxa)): + if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order' + sql += '%s text, ' % taxa[i] + +sql = sql.strip(', ') +sql = 'create table tax (name varchar(50) not null, ' + sql + ')' + + +cur.execute(sql) + +invalid_line_number = 0 + +try: + for line in tax_file: + fields = string.split(line.rstrip(), '\t') + if len(fields) < 24: + invalid_line_number += 1 + continue # Skipping malformed taxonomy lines + + val_string = '"' + fields[id_col] + '", ' + + for rank in taxa: + taxon = fields[taxRank[rank]] + val_string += '"%s", ' % taxon + + val_string = val_string.strip(', ') + val_string = "insert into tax values(" + val_string + ")" + cur.execute(val_string) +except Exception, e: + stop_err('%s\n' % e) + +tax_file.close() + +try: + for rank in taxa: + cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank ) + cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) ) + cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank) + cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) ) + + if rank == 'ord': + rankName = 'order' + else: + rankName = rank + + if out_format: + cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank) + for item in cur.fetchall(): + out_string = '%s\t%s\t' % ( item[0], item[1] ) + out_string += rankName + print >>out_file, out_string + else: + cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank) + for item in cur.fetchall(): + out_string = '%s\t%s\t' % ( item[0], item[1] ) + out_string += rankName + print >>out_file, out_string +except Exception, e: + stop_err("%s\n" % e) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/find_diag_hits.xml Mon Jan 27 09:25:21 2014 -0500 @@ -0,0 +1,99 @@ +<tool id="find_diag_hits" name="Find diagnostic hits" version="1.0.0"> + <description></description> + <requirements> + <requirement type="package" version="1.0.0">taxonomy</requirement> + </requirements> + <command interpreter="python">find_diag_hits.py $input1 $id_col $rank_list $out_format $out_file1</command> + <inputs> + <param format="taxonomy" name="input1" type="data" label="Find diagnostic hits in"/> + <param name="id_col" type="data_column" data_ref="input1" numerical="False" label="Select column with sequence id" /> + <param name="rank_list" type="select" display="checkboxes" multiple="true" label="select taxonomic ranks"> + <option value="superkingdom">Superkingdom</option> + <option value="kingdom">Kingdom</option> + <option value="subkingdom">Subkingdom</option> + <option value="superphylum">Superphylum</option> + <option value="phylum">Phylum</option> + <option value="subphylum">Subphylum</option> + <option value="superclass">Superclass</option> + <option value="class">Class</option> + <option value="subclass">Subclass</option> + <option value="superorder">Superorder</option> + <option value="order">Order</option> + <option value="suborder">Suborder</option> + <option value="superfamily">Superfamily</option> + <option value="family">Family</option> + <option value="subfamily">Subfamily</option> + <option value="tribe">Tribe</option> + <option value="subtribe">Subtribe</option> + <option value="genus">Genus</option> + <option value="subgenus">Subgenus</option> + <option selected="true" value="species">Species</option> + <option value="subspecies">Subspecies</option> + </param> + <param name="out_format" type="select" label="Select output format"> + <option value="reads">Diagnostic read list</option> + <option value="counts">Number of diagnostic reads per taxonomic rank</option> + </param> + </inputs> + <outputs> + <data format="tabular" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="taxonomyGI.taxonomy" ftype="taxonomy"/> + <param name="id_col" value="1" /> + <param name="rank_list" value="order,genus" /> + <param name="out_format" value="counts" /> + <output name="out_file1" file="find_diag_hits.tabular" /> + </test> + </tests> + + +<help> + +**What it does** + +When performing metagenomic analyses it is often necessary to identify sequence reads corresponding to a particular taxonomic group, or, in other words, diagnostic of a particular taxonomic rank. This utility performs this analysis. It takes data generated by *Taxonomy manipulation->Fetch Taxonomic Ranks* as input and outputs either a list of sequence reads unique to a particular taxonomic rank, or a list of taxonomic ranks and the count of unique reads corresponding to each rank. + +------ + +**Example** + +Suppose the *Taxonomy manipulation->Fetch Taxonomic Ranks* generated the following taxonomy representation:: + + read1 2 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Laurasiatheria n Ruminantia n Bovidae Bovinae n n Bos n Bos taurus n + read2 12585 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n n n Homo n Homo sapiens n + read1 58615 root Eukaryota Metazoa n n Arthropoda n Hexapoda Insecta Neoptera Amphiesmenoptera Lepidoptera Glossata Papilionoidea Nymphalidae Nymphalinae Melitaeini Phyciodina Anthanassa n Anthanassa otanes n + read3 56785 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n n n Homo n Homo sapiens n + +Running this tool with the following parameters: + + * *Select column with sequence id* set to **c1** + * *Select taxonomic ranks* with **order**, and **genus** checked + * *Output format* set to **Diagnostic read list** + +will return:: + + read2 Primates order + read3 Primates order + read2 Homo genus + read3 Homo genus + +Changing *Output format* set to **Number of diagnostic reads per taxonomic rank** will produce:: + + Primates 2 order + Homo 2 genus + +.. class:: infomark + +Note that **read1** is omitted because it is non-unique: it hits Mammals and Insects at the same time. + +-------- + +.. class:: warningmark + +This tool omits "**n**" corresponding to ranks missing from NCBI taxonomy. In the above example *Home sapiens* contains the order name (Primates) while *Bos taurus* does not. + + +</help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/find_diag_hits.tabular Mon Jan 27 09:25:21 2014 -0500 @@ -0,0 +1,2 @@ +Primates 4 order +Homo 2 genus
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/taxonomyGI.taxonomy Mon Jan 27 09:25:21 2014 -0500 @@ -0,0 +1,5 @@ +9606 9606 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n n n Homo n Homo sapiens n 12583 +40674 40674 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n n n n n n n n n n n n n 410771 +63221 63221 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n n n Homo n Homo sapiens Homo sapiens neanderthalensis 2286205 +9604 9604 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Euarchontoglires Primates Haplorrhini Hominoidea Hominidae n n n n n n n 23236241 +9443 9443 root Eukaryota Metazoa n n Chordata Craniata Gnathostomata Mammalia n Euarchontoglires Primates n n n n n n n n n n 33001686 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Jan 27 09:25:21 2014 -0500 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="taxonomy" version="1.0.0"> + <repository changeset_revision="c011d4659306" name="package_taxonomy_1_0_0" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>