diff dgidb_annotator.py @ 0:8c6dc9da6c89 draft

Uploaded
author devteam
date Wed, 27 Nov 2013 23:51:48 -0500
parents
children 8cc7cf4bd833
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dgidb_annotator.py	Wed Nov 27 23:51:48 2013 -0500
@@ -0,0 +1,64 @@
+'''
+Annotates a tabular file with information from the Drug-Gene Interaction (DGI) database.
+'''
+
+import optparse, json, urllib2, sys
+
+def __main__():
+    '''
+    Annotate rows read from args[0] (or stdin) with interactions fetched from DGIdb.
+
+    Prints one row per interaction: the input row followed by gene name, long name,
+    categories, interaction type, drug name and source. Rows without a result are
+    printed unchanged only when --print-all is given.
+    '''
+    import urllib  # Local import; only used here to percent-encode gene names.
+
+    # -- Parse command line. --
+    parser = optparse.OptionParser()
+    parser.add_option('-g', '--gene-name-col', dest='gene_name_col', help='column of gene names')
+    parser.add_option('-a', '--print-all', dest='print_all', action='store_true', help='print all lines, even though without a result')
+    parser.add_option('-e', '--expert-curated', dest='expert_curated', action='store_true', help='use only expert curated results')
+    (options, args) = parser.parse_args()
+    if options.gene_name_col is None:
+        parser.error('--gene-name-col is required')  # Usage error instead of a TypeError from int(None).
+    gene_name_col = int(options.gene_name_col) - 1
+
+    # -- Read input as (row, gene) pairs; a file opened here is always closed. --
+    input_file = open(args[0], 'r') if len(args) > 0 else sys.stdin
+    try:
+        rows = []
+        for line in input_file:
+            row = line.rstrip('\r\n')  # Only the line ending: empty leading/trailing columns keep their position.
+            try:
+                gene = row.split('\t')[gene_name_col].strip()
+            except IndexError:  # Row too short to hold the gene column: it never matches.
+                gene = ''
+            rows.append( (row, gene) )
+    finally:
+        if input_file is not sys.stdin:
+            input_file.close()
+
+    # -- Make connection and get results. --
+    annotations = {}  # Kept apart from the response dict so gene names cannot clash with its keys.
+    genes = set( gene for (row, gene) in rows if gene )
+    if genes:  # Nothing to ask the service when the input holds no gene names.
+        # Percent-encode names so characters such as spaces or '&' cannot corrupt the query.
+        query_str = 'http://dgidb.genome.wustl.edu/api/v1/interactions.json?genes=%s' % ','.join( urllib.quote(gene, '') for gene in genes )
+        if options.expert_curated:
+            query_str += '&source_trust_levels=Expert%20curated'
+        results_dict = json.loads( urllib2.urlopen(query_str).read() )
+        # One tab-separated annotation per interaction, keyed by the term that was searched.
+        for result in results_dict['matchedTerms']:
+            gene_fields = [ result['geneName'], result['geneLongName'], ','.join( result['geneCategories'] ) ]
+            interactions = [ [ interaction['interactionType'], interaction['drugName'], interaction['source'] ] for interaction in result['interactions'] ]
+            annotations[ result['searchTerm'] ] = [ '\t'.join( gene_fields + fields ) for fields in interactions ]
+
+    # -- Annotate input file and produce output. --
+    for (row, gene) in rows:
+        if gene in annotations:
+            for annotation in annotations[gene]:
+                print row + '\t' + annotation
+        elif options.print_all:
+            print row
+if __name__=="__main__": __main__()  # Run the annotator only when executed as a script, not on import.
\ No newline at end of file