diff find_diag_hits.py @ 0:acf51ff24c7d draft default tip

Imported from capsule None
author devteam
date Mon, 27 Jan 2014 09:25:21 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find_diag_hits.py	Mon Jan 27 09:25:21 2014 -0500
@@ -0,0 +1,170 @@
+#!/usr/bin/env python
+
+"""
+tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
+    finds reads that only hit one taxonomic group. For example, consider the folliowing example:
+    
+    read1   mammalia
+    read1   insecta
+    read2   insecta
+    
+    in this case only read2 will be selected becuase it stays within insecta
+    
+    This program takes the following options:
+    
+    file in taxonomy format - dataset that complies with Galaxy's taxonomy format
+    id column               - integer specifying the number of column containing seq id (starting with 1)
+    taxonomic ranks         - a comma separated list of ranks from this list:
+    
+         superkingdom
+         kingdom
+         subkingdom
+         superphylum
+         phylum
+         subphylum
+         superclass
+         class
+         subclass
+         superorder
+         order
+         suborder
+         superfamily
+         family
+         subfamily
+         tribe
+         subtribe
+         genus
+         subgenus
+         species
+         subspecies
+    
+    output format           - reads or counts
+
+"""
+
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( 'pysqlite' )
+from pysqlite2 import dbapi2 as sqlite
+import string, sys, tempfile
+
+# This dictionary maps taxonomic ranks to fields of Taxonomy file
+taxRank = {
+        'root'        :2, 
+        'superkingdom':3, 
+        'kingdom'     :4, 
+        'subkingdom'  :5, 
+        'superphylum' :6, 
+        'phylum'      :7, 
+        'subphylum'   :8, 
+        'superclass'  :9, 
+        'class'       :10, 
+        'subclass'    :11, 
+        'superorder'  :12, 
+        'ord'         :13, 
+        'suborder'    :14, 
+        'superfamily' :15,
+        'family'      :16,
+        'subfamily'   :17,
+        'tribe'       :18,
+        'subtribe'    :19,
+        'genus'       :20,
+        'subgenus'    :21,
+        'species'     :22,
+        'subspecies'  :23,
+        'order'       :13
+    }
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+db = tempfile.NamedTemporaryFile('w')
+
+try:
+    con = sqlite.connect(db.name)
+    cur = con.cursor()
+except:
+    stop_err('Cannot connect to %s\n') % db.name
+    
+try:
+    tax_file   = open(sys.argv[1], 'r')
+    id_col     = int(sys.argv[2]) - 1
+    taxa       = string.split(sys.argv[3].rstrip(),',')
+    
+    if sys.argv[4] == 'reads':
+        out_format = True
+    elif sys.argv[4] == 'counts':
+        out_format = False
+    else:
+        stop_err('Please specify "reads" or "counts" for output format\n')
+    out_file = open(sys.argv[5], 'w')
+    
+except:
+    stop_err('Check arguments\n')
+    
+if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')
+
+sql = ""
+for i in range(len(taxa)):
+        if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
+        sql += '%s text, ' % taxa[i]
+
+sql = sql.strip(', ')
+sql = 'create table tax (name varchar(50) not null, ' + sql + ')'
+
+    
+cur.execute(sql)
+
+invalid_line_number = 0
+
+try:
+    for line in tax_file:
+        fields = string.split(line.rstrip(), '\t')
+        if len(fields) < 24: 
+            invalid_line_number += 1
+            continue # Skipping malformed taxonomy lines
+        
+        val_string = '"' + fields[id_col] + '", '
+        
+        for rank in taxa:
+            taxon = fields[taxRank[rank]]
+            val_string += '"%s", ' % taxon
+                
+        val_string = val_string.strip(', ')
+        val_string = "insert into tax values(" + val_string + ")"
+        cur.execute(val_string)
+except Exception, e:
+    stop_err('%s\n' % e)
+
+tax_file.close()    
+
+try:    
+    for rank in taxa:
+        cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank  )
+        cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) )
+        cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
+        cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
+        
+        if rank == 'ord':
+            rankName = 'order'
+        else:
+            rankName = rank
+    
+        if out_format:
+            cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
+            for item in cur.fetchall():
+                out_string = '%s\t%s\t' % ( item[0], item[1] )
+                out_string += rankName
+                print >>out_file, out_string
+        else:
+            cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
+            for item in cur.fetchall():
+                out_string = '%s\t%s\t' % ( item[0], item[1] )
+                out_string += rankName
+                print >>out_file, out_string
+except Exception, e:
+    stop_err("%s\n" % e)
+