changeset 3:c5bb987015c5 draft default tip

Uploaded
author devteam
date Fri, 07 Mar 2014 16:34:26 -0500
parents 792f3cb0eff4
children
files dgidb_annotator.py dgidb_annotator.xml test-data/out1.tabular
diffstat 3 files changed, 53 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/dgidb_annotator.py	Tue Feb 25 14:19:17 2014 -0500
+++ b/dgidb_annotator.py	Fri Mar 07 16:34:26 2014 -0500
@@ -2,7 +2,7 @@
 Annotates a tabular file with information from the Drug-Gene Interaction (DGI) database.
 '''
 
-import optparse, json, urllib2, sys, re
+import optparse, json, urllib2, sys
 
 def __main__():
     # -- Parse command line. --
@@ -19,7 +19,7 @@
     else:
         input_file = sys.stdin
 
-    # -- Make connection and get results. --
+    # -- Set up gene list queries. --
 
     # Get gene list.
     gene_list = []
@@ -31,16 +31,36 @@
         gene_list.append(entry.split(';')[0].split('(')[0])
         lines.append(line.strip())
     
+    # Split the joined gene list into queries of ~8K characters each because this is near the max HTTP request length.
+    gene_list = ','.join(set(gene_list))
+    queries = []
+    MAX_QUERY_SIZE = 8000
+    if len(gene_list) > MAX_QUERY_SIZE:
+        # Break queries.
+        queries = [ gene_list[i:i + MAX_QUERY_SIZE] for i in range(0, len(gene_list), MAX_QUERY_SIZE) ]
+
+        # Move the partial gene at the start of each chunk onto the end of the previous chunk so every query holds only whole gene names.
+        for i, query in enumerate( queries[1:] ):
+            part_gene, comma, remainder = query.partition(',')
+            queries[i] += part_gene
+            queries[i+1] = remainder
+    else:
+        queries = [ gene_list ]
+
+    # -- Query and process results. --
+
     # Query for results.
-    query_str = 'http://dgidb.genome.wustl.edu/api/v1/interactions.json?genes=%s' % ','.join(set(gene_list))
-    if options.expert_curated:
-        query_str += '&source_trust_levels=Expert%20curated'
-    results = urllib2.urlopen(query_str).read()
-    results_dict = json.loads(results)
-    
+    results = []
+    for genes in queries:
+        query_str = 'http://dgidb.genome.wustl.edu/api/v1/interactions.json?genes=%s' % genes
+        if options.expert_curated:
+            query_str += '&source_trust_levels=Expert%20curated'
+        raw_results = urllib2.urlopen(query_str).read()
+        results_dict = json.loads(raw_results)
+        results.extend(results_dict['matchedTerms'])
+        
     # Process results.
-    matched_results = results_dict['matchedTerms']
-    for result in matched_results:
+    for result in results:
         # Process result.
         processed_results = []
         result_fields = [ result['geneName'], result['geneLongName'], ','.join( result['geneCategories'] ) ]
--- a/dgidb_annotator.xml	Tue Feb 25 14:19:17 2014 -0500
+++ b/dgidb_annotator.xml	Fri Mar 07 16:34:26 2014 -0500
@@ -1,4 +1,4 @@
-<tool id="dgidb_annotator" name="Annotate with DGI" version="0.1">
+<tool id="dgidb_annotate" name="Annotate with DGI" version="0.1">
     <description>database info</description>
     
     <command interpreter="python">
--- a/test-data/out1.tabular	Tue Feb 25 14:19:17 2014 -0500
+++ b/test-data/out1.tabular	Fri Mar 07 16:34:26 2014 -0500
@@ -15,30 +15,32 @@
 RET	gene1	RET	ret proto-oncogene	TYROSINE KINASE,DRUGGABLE GENOME,KINASE,CLINICALLY ACTIONABLE	inhibitor	LENVATINIB	TALC
 RET	gene1	RET	ret proto-oncogene	TYROSINE KINASE,DRUGGABLE GENOME,KINASE,CLINICALLY ACTIONABLE	inhibitor	AMUVATINIB	TALC
 RET	gene1	RET	ret proto-oncogene	TYROSINE KINASE,DRUGGABLE GENOME,KINASE,CLINICALLY ACTIONABLE	inhibitor	AT9283	TALC
+RET	gene1	RET	ret proto-oncogene	TYROSINE KINASE,DRUGGABLE GENOME,KINASE,CLINICALLY ACTIONABLE	n/a	VANDETANIB	ClearityFoundationClinicalTrial
+RET	gene1	RET	ret proto-oncogene	TYROSINE KINASE,DRUGGABLE GENOME,KINASE,CLINICALLY ACTIONABLE	n/a	DOVITINIB	ClearityFoundationClinicalTrial
 BIRC3	gene2	BIRC3	baculoviral IAP repeat containing 3		antagonist	AEG40826	TALC
 BIRC3	gene2	BIRC3	baculoviral IAP repeat containing 3		antagonist	TL 32711	TALC
 BIRC3	gene2	BIRC3	baculoviral IAP repeat containing 3		antagonist	AT-406	TALC
 BIRC3	gene2	BIRC3	baculoviral IAP repeat containing 3		antagonist	GDC0917	TALC
 BIRC3	gene2	BIRC3	baculoviral IAP repeat containing 3		antagonist	LCL161	TALC
-ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	E7449	ClearityFoundation
-ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	NIRAPARIB	ClearityFoundation
-ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	OLAPARIB	ClearityFoundation
-ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	BMN673	ClearityFoundation
-ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	RUCAPARIB	ClearityFoundation
-ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	VELIPARIB	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	VANDETANIB	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	EVEROLIMUS	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	TEMSIROLIMUS	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	GDC-0973	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	MEK162	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	PD-325901	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	RAFAMETINIB	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	SELUMETINIB	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	TRAMETINIB	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	CETUXIMAB	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	ERLOTINIB	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	GEFITINIB	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	PANITUMUMAB	ClearityFoundation
-KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	PIMASERTIB	ClearityFoundation
+ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	E7449	ClearityFoundationBiomarkers
+ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	NIRAPARIB	ClearityFoundationBiomarkers
+ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	OLAPARIB	ClearityFoundationBiomarkers
+ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	BMN673	ClearityFoundationBiomarkers
+ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	RUCAPARIB	ClearityFoundationBiomarkers
+ATM	gene3	ATM	ataxia telangiectasia mutated	PHOSPHATIDYLINOSITOL 3 KINASE,DRUGGABLE GENOME,CLINICALLY ACTIONABLE,SERINE THREONINE KINASE,KINASE,TUMOR SUPPRESSOR,DNA REPAIR	n/a	VELIPARIB	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	VANDETANIB	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	EVEROLIMUS	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	TEMSIROLIMUS	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	GDC-0973	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	MEK162	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	PD-325901	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	RAFAMETINIB	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	SELUMETINIB	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	TRAMETINIB	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	CETUXIMAB	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	ERLOTINIB	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	GEFITINIB	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	PANITUMUMAB	ClearityFoundationBiomarkers
+KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	n/a	PIMASERTIB	ClearityFoundationBiomarkers
 KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	inhibitor	REOLYSIN	CancerCommons
 KRAS	gene4	KRAS	v-Ki-ras2 Kirsten rat sarcoma viral oncogene homolog	CLINICALLY ACTIONABLE	vaccine	RAS PEPTIDE CANCER VACCINE	TALC