Mercurial > repos > peterjc > venn_list
comparison tools/venn_list/venn_list.py @ 4:991342eca214 draft
Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
author | peterjc |
---|---|
date | Wed, 29 Apr 2015 11:00:41 -0400 |
parents | |
children | ea68a1a4c1d9 |
comparison
equal
deleted
inserted
replaced
3:6aae6bc0802d | 4:991342eca214 |
---|---|
1 #!/usr/bin/env python | |
2 """Plot up to 3-way Venn Diagram using R limma vennDiagram (via rpy) | |
3 | |
4 This script is copyright 2010 by Peter Cock, The James Hutton Institute | |
5 (formerly SCRI), UK. All rights reserved. | |
6 See accompanying text file for licence details (MIT/BSD style). | |
7 | |
8 This is version 0.0.8 of the script. | |
9 """ | |
10 | |
11 | |
12 import sys | |
13 | |
14 def sys_exit(msg, error_level=1): | |
15 """Print error message to stdout and quit with given error level.""" | |
16 sys.stderr.write("%s\n" % msg) | |
17 sys.exit(error_level) | |
18 | |
19 try: | |
20 import rpy | |
21 except ImportError: | |
22 sys_exit("Requires the Python library rpy (to call R)") | |
23 except RuntimeError, e: | |
24 sys_exit("The Python library rpy is not availble for the current R version\n\n%s" % e) | |
25 | |
26 try: | |
27 rpy.r.library("limma") | |
28 except: | |
29 sys_exit("Requires the R library limma (for vennDiagram function)") | |
30 | |
31 | |
32 if len(sys.argv)-1 not in [7, 10, 13]: | |
33 sys_exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv)-1)) | |
34 | |
35 all_file, all_type, all_label = sys.argv[1:4] | |
36 set_data = [] | |
37 if len(sys.argv)-1 >= 7: | |
38 set_data.append(tuple(sys.argv[4:7])) | |
39 if len(sys.argv)-1 >= 10: | |
40 set_data.append(tuple(sys.argv[7:10])) | |
41 if len(sys.argv)-1 >= 13: | |
42 set_data.append(tuple(sys.argv[10:13])) | |
43 pdf_file = sys.argv[-1] | |
44 n = len(set_data) | |
45 print "Doing %i-way Venn Diagram" % n | |
46 | |
47 def load_ids(filename, filetype): | |
48 if filetype=="tabular": | |
49 for line in open(filename): | |
50 line = line.rstrip("\n") | |
51 if line and not line.startswith("#"): | |
52 yield line.split("\t",1)[0] | |
53 elif filetype=="fasta": | |
54 for line in open(filename): | |
55 if line.startswith(">"): | |
56 yield line[1:].rstrip("\n").split(None,1)[0] | |
57 elif filetype.startswith("fastq"): | |
58 #Use the Galaxy library not Biopython to cope with CS | |
59 from galaxy_utils.sequence.fastq import fastqReader | |
60 handle = open(filename, "rU") | |
61 for record in fastqReader(handle): | |
62 #The [1:] is because the fastaReader leaves the @ on the identifer. | |
63 yield record.identifier.split()[0][1:] | |
64 handle.close() | |
65 elif filetype=="sff": | |
66 try: | |
67 from Bio.SeqIO import index | |
68 except ImportError: | |
69 sys_exit("Require Biopython 1.54 or later (to read SFF files)") | |
70 #This will read the SFF index block if present (very fast) | |
71 for name in index(filename, "sff"): | |
72 yield name | |
73 else: | |
74 sys_exit("Unexpected file type %s" % filetype) | |
75 | |
76 def load_ids_whitelist(filename, filetype, whitelist): | |
77 for name in load_ids(filename, filetype): | |
78 if name in whitelist: | |
79 yield name | |
80 else: | |
81 sys_exit("Unexpected ID %s in %s file %s" % (name, filetype, filename)) | |
82 | |
83 if all_file in ["", "-", '""', '"-"']: | |
84 #Load without white list | |
85 sets = [set(load_ids(f,t)) for (f,t,c) in set_data] | |
86 #Take union | |
87 all = set() | |
88 for s in sets: | |
89 all.update(s) | |
90 print "Inferred total of %i IDs" % len(all) | |
91 else: | |
92 all = set(load_ids(all_file, all_type)) | |
93 print "Total of %i IDs" % len(all) | |
94 sets = [set(load_ids_whitelist(f,t,all)) for (f,t,c) in set_data] | |
95 | |
96 for s, (f,t,c) in zip(sets, set_data): | |
97 print "%i in %s" % (len(s), c) | |
98 | |
99 #Now call R library to draw simple Venn diagram | |
100 try: | |
101 #Create dummy Venn diagram counts object for three groups | |
102 cols = 'c("%s")' % '","'.join("Set%i" % (i+1) for i in range(n)) | |
103 rpy.r('groups <- cbind(%s)' % ','.join(['1']*n)) | |
104 rpy.r('colnames(groups) <- %s' % cols) | |
105 rpy.r('vc <- vennCounts(groups)') | |
106 #Populate the 2^n classes with real counts | |
107 #Don't make any assumptions about the class order | |
108 #print rpy.r('vc') | |
109 for index, row in enumerate(rpy.r('vc[,%s]' % cols)): | |
110 if isinstance(row, int) or isinstance(row, float): | |
111 #Hack for rpy being too clever for single element row | |
112 row = [row] | |
113 names = all | |
114 for wanted, s in zip(row, sets): | |
115 if wanted: | |
116 names = names.intersection(s) | |
117 else: | |
118 names = names.difference(s) | |
119 rpy.r('vc[%i,"Counts"] <- %i' % (index+1, len(names))) | |
120 #print rpy.r('vc') | |
121 if n == 1: | |
122 #Single circle, don't need to add (Total XXX) line | |
123 names = [c for (t,f,c) in set_data] | |
124 else: | |
125 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f,t,c) in zip(sets, set_data)] | |
126 rpy.r.assign("names", names) | |
127 rpy.r.assign("colors", ["red","green","blue"][:n]) | |
128 rpy.r.pdf(pdf_file, 8, 8) | |
129 rpy.r("""vennDiagram(vc, include="both", names=names, | |
130 main="%s", sub="(Total %i)", | |
131 circle.col=colors) | |
132 """ % (all_label, len(all))) | |
133 rpy.r.dev_off() | |
134 except Exception, exc: | |
135 sys_exit( "%s" %str( exc ) ) | |
136 rpy.r.quit( save="no" ) | |
137 print "Done" |