comparison tools/venn_list/venn_list.py @ 11:679b6323db03 draft

v0.1.0 now using matplotlib_venn instead of limma R package via rpy. Contribution from Frederic Sapet.
author peterjc
date Wed, 29 May 2019 04:45:19 -0400
parents 20d347feb882
children
comparison
equal deleted inserted replaced
10:20d347feb882 11:679b6323db03
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """Plot up to 3-way Venn Diagram using R limma vennDiagram (via rpy) 2
3 """
4 Plot up to 3-way Venn Diagram.
5
6 It uses the Python libraries matplotlib and matplotlib_venn.
3 7
4 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute 8 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute
5 (formerly SCRI), UK. All rights reserved. 9 (formerly SCRI), UK. All rights reserved.
6 10
7 See accompanying text file for licence details (MIT License). 11 See accompanying text file for licence details (MIT License).
8 """ 12 """
9 13
10 from __future__ import print_function 14 from __future__ import print_function
11 15
12 import sys 16 import sys
17 from shutil import move
13 18
14 if "-v" in sys.argv or "--version" in sys.argv: 19 if "-v" in sys.argv or "--version" in sys.argv:
15 print("v0.0.11") 20 print("v0.1.0")
16 sys.exit(0) 21 sys.exit(0)
17 22
18 try: 23 try:
19 import rpy 24 import matplotlib
20 except ImportError: 25 except ImportError:
21 sys.exit("Requires the Python library rpy (to call R)") 26 sys.exit("Requires the Python library matplotlib")
22 except RuntimeError, e:
23 sys.exit("The Python library rpy is not availble for the current R version\n\n%s" % e)
24 27
25 try: 28 matplotlib.use("Agg")
26 rpy.r.library("limma") 29 from matplotlib import pyplot as plt
27 except Exception:
28 sys.exit("Requires the R library limma (for vennDiagram function)")
29
30 30
31 if len(sys.argv) - 1 not in [7, 10, 13]: 31 if len(sys.argv) - 1 not in [7, 10, 13]:
32 sys.exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv) - 1)) 32 sys.exit(
33 "Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i"
34 % (len(sys.argv) - 1)
35 )
33 36
34 all_file, all_type, all_label = sys.argv[1:4] 37 all_file, all_type, all_label = sys.argv[1:4]
35 set_data = [] 38 set_data = []
36 if len(sys.argv) - 1 >= 7: 39 if len(sys.argv) - 1 >= 7:
37 set_data.append(tuple(sys.argv[4:7])) 40 set_data.append(tuple(sys.argv[4:7]))
43 n = len(set_data) 46 n = len(set_data)
44 print("Doing %i-way Venn Diagram" % n) 47 print("Doing %i-way Venn Diagram" % n)
45 48
46 49
47 def load_ids(filename, filetype): 50 def load_ids(filename, filetype):
51 """Load ids from files."""
48 if filetype == "tabular": 52 if filetype == "tabular":
49 for line in open(filename): 53 for line in open(filename):
50 line = line.rstrip("\n") 54 line = line.rstrip("\n")
51 if line and not line.startswith("#"): 55 if line and not line.startswith("#"):
52 yield line.split("\t", 1)[0] 56 yield line.split("\t", 1)[0]
55 if line.startswith(">"): 59 if line.startswith(">"):
56 yield line[1:].rstrip("\n").split(None, 1)[0] 60 yield line[1:].rstrip("\n").split(None, 1)[0]
57 elif filetype.startswith("fastq"): 61 elif filetype.startswith("fastq"):
58 # Use the Galaxy library not Biopython to cope with CS 62 # Use the Galaxy library not Biopython to cope with CS
59 from galaxy_utils.sequence.fastq import fastqReader 63 from galaxy_utils.sequence.fastq import fastqReader
64
60 handle = open(filename, "rU") 65 handle = open(filename, "rU")
61 for record in fastqReader(handle): 66 for record in fastqReader(handle):
62 # The [1:] is because the fastaReader leaves the @ on the identifer. 67 # The [1:] is because the fastaReader leaves the @ on the identifer.
63 yield record.identifier.split()[0][1:] 68 yield record.identifier.split()[0][1:]
64 handle.close() 69 handle.close()
66 try: 71 try:
67 from Bio.SeqIO import index 72 from Bio.SeqIO import index
68 except ImportError: 73 except ImportError:
69 sys.exit("Require Biopython 1.54 or later (to read SFF files)") 74 sys.exit("Require Biopython 1.54 or later (to read SFF files)")
70 # This will read the SFF index block if present (very fast) 75 # This will read the SFF index block if present (very fast)
71 for name in index(filename, "sff"): 76 for this_name in index(filename, "sff"):
72 yield name 77 yield this_name
73 else: 78 else:
74 sys.exit("Unexpected file type %s" % filetype) 79 sys.exit("Unexpected file type %s" % filetype)
75 80
76 81
77 def load_ids_whitelist(filename, filetype, whitelist): 82 def load_ids_whitelist(filename, filetype, whitelist):
78 for name in load_ids(filename, filetype): 83 """Check if ids are in whitelist."""
79 if name in whitelist: 84 for single_name in load_ids(filename, filetype):
80 yield name 85 if single_name in whitelist:
86 yield single_name
81 else: 87 else:
82 sys.exit("Unexpected ID %s in %s file %s" % (name, filetype, filename)) 88 sys.exit(
89 "Unexpected ID %s in %s file %s" % (single_name, filetype, filename)
90 )
83 91
84 92
85 if all_file in ["", "-", '""', '"-"']: 93 if all_file in ["", "-", '""', '"-"']:
86 # Load without white list 94 # Load without white list
87 sets = [set(load_ids(f, t)) for (f, t, c) in set_data] 95 sets = [set(load_ids(f, t)) for (f, t, c) in set_data]
96 sets = [set(load_ids_whitelist(f, t, all_ids)) for (f, t, c) in set_data] 104 sets = [set(load_ids_whitelist(f, t, all_ids)) for (f, t, c) in set_data]
97 105
98 for s, (f, t, c) in zip(sets, set_data): 106 for s, (f, t, c) in zip(sets, set_data):
99 print("%i in %s" % (len(s), c)) 107 print("%i in %s" % (len(s), c))
100 108
101 # Now call R library to draw simple Venn diagram 109 names = [one_name for one_file, one_type, one_name in set_data]
110 lengths = [len(one_set) for one_set in sets]
111
112 if len(sets) == 3:
113 try:
114 from matplotlib_venn import venn3_unweighted
115 except ImportError:
116 sys.exit("Requires the Python library matplotlib_venn")
117 venn3_unweighted(
118 sets,
119 [
120 "{} (Total {})".format(name, length)
121 for (name, length) in zip(names, lengths)
122 ],
123 )
124
125 if len(sets) == 2:
126 try:
127 from matplotlib_venn import venn2_unweighted
128 except ImportError:
129 sys.exit("Requires the Python library matplotlib_venn")
130 venn2_unweighted(
131 sets,
132 [
133 "{} (Total {})".format(name, length)
134 for (name, length) in zip(names, lengths)
135 ],
136 )
137
138 # not sure what I am doing here,
139 # matplotlib_venn does not want to create a single Venn circle
140 # stick to the old behavior (rpy and Limma) as much as possible
141 if len(sets) == 1:
142 try:
143 from matplotlib_venn import venn2
144 except ImportError:
145 sys.exit("Requires the Python library matplotlib_venn")
146 venn2((sets[0], set()), [set_data[0][2], ""])
147
148 plt.title(all_label)
149 plt.savefig(pdf_file + ".pdf")
150
151 # Format "dat" is not supported.
102 try: 152 try:
103 # Create dummy Venn diagram counts object for three groups 153 move(pdf_file + ".pdf", pdf_file)
104 cols = 'c("%s")' % '","'.join("Set%i" % (i + 1) for i in range(n)) 154 except (OSError, IOError) as error:
105 rpy.r('groups <- cbind(%s)' % ','.join(['1'] * n)) 155 sys.exit("Fail to rename file {}".format(str(error)))
106 rpy.r('colnames(groups) <- %s' % cols) 156
107 rpy.r('vc <- vennCounts(groups)') 157 plt.close()
108 # Populate the 2^n classes with real counts 158
109 # Don't make any assumptions about the class order
110 # print(rpy.r('vc'))
111 for index, row in enumerate(rpy.r('vc[,%s]' % cols)):
112 if isinstance(row, int) or isinstance(row, float):
113 # Hack for rpy being too clever for single element row
114 row = [row]
115 names = all_ids
116 for wanted, s in zip(row, sets):
117 if wanted:
118 names = names.intersection(s)
119 else:
120 names = names.difference(s)
121 rpy.r('vc[%i,"Counts"] <- %i' % (index + 1, len(names)))
122 # print(rpy.r('vc'))
123 if n == 1:
124 # Single circle, don't need to add (Total XXX) line
125 names = [c for (t, f, c) in set_data]
126 else:
127 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f, t, c) in zip(sets, set_data)]
128 rpy.r.assign("names", names)
129 rpy.r.assign("colors", ["red", "green", "blue"][:n])
130 rpy.r.pdf(pdf_file, 8, 8)
131 rpy.r("""vennDiagram(vc, include="both", names=names,
132 main="%s", sub="(Total %i)",
133 circle.col=colors)
134 """ % (all_label, len(all_ids)))
135 rpy.r.dev_off()
136 except Exception, exc:
137 sys.exit("%s" % str(exc))
138 rpy.r.quit(save="no")
139 print("Done") 159 print("Done")