Mercurial > repos > peterjc > venn_list
comparison tools/venn_list/venn_list.py @ 11:679b6323db03 draft
v0.1.0 now using matplotlib_venn instead of limma R package via rpy. Contribution from Frederic Sapet.
author | peterjc |
---|---|
date | Wed, 29 May 2019 04:45:19 -0400 |
parents | 20d347feb882 |
children |
comparison
equal
deleted
inserted
replaced
10:20d347feb882 | 11:679b6323db03 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """Plot up to 3-way Venn Diagram using R limma vennDiagram (via rpy) | 2 |
3 """ | |
4 Plot up to 3-way Venn Diagram. | |
5 | |
6 It uses the Python libraries matplotlib and matplotlib_venn. | |
3 | 7 |
4 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute | 8 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute |
5 (formerly SCRI), UK. All rights reserved. | 9 (formerly SCRI), UK. All rights reserved. |
6 | 10 |
7 See accompanying text file for licence details (MIT License). | 11 See accompanying text file for licence details (MIT License). |
8 """ | 12 """ |
9 | 13 |
10 from __future__ import print_function | 14 from __future__ import print_function |
11 | 15 |
12 import sys | 16 import sys |
17 from shutil import move | |
13 | 18 |
14 if "-v" in sys.argv or "--version" in sys.argv: | 19 if "-v" in sys.argv or "--version" in sys.argv: |
15 print("v0.0.11") | 20 print("v0.1.0") |
16 sys.exit(0) | 21 sys.exit(0) |
17 | 22 |
18 try: | 23 try: |
19 import rpy | 24 import matplotlib |
20 except ImportError: | 25 except ImportError: |
21 sys.exit("Requires the Python library rpy (to call R)") | 26 sys.exit("Requires the Python library matplotlib") |
22 except RuntimeError, e: | |
23 sys.exit("The Python library rpy is not availble for the current R version\n\n%s" % e) | |
24 | 27 |
25 try: | 28 matplotlib.use("Agg") |
26 rpy.r.library("limma") | 29 from matplotlib import pyplot as plt |
27 except Exception: | |
28 sys.exit("Requires the R library limma (for vennDiagram function)") | |
29 | |
30 | 30 |
31 if len(sys.argv) - 1 not in [7, 10, 13]: | 31 if len(sys.argv) - 1 not in [7, 10, 13]: |
32 sys.exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv) - 1)) | 32 sys.exit( |
33 "Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" | |
34 % (len(sys.argv) - 1) | |
35 ) | |
33 | 36 |
34 all_file, all_type, all_label = sys.argv[1:4] | 37 all_file, all_type, all_label = sys.argv[1:4] |
35 set_data = [] | 38 set_data = [] |
36 if len(sys.argv) - 1 >= 7: | 39 if len(sys.argv) - 1 >= 7: |
37 set_data.append(tuple(sys.argv[4:7])) | 40 set_data.append(tuple(sys.argv[4:7])) |
43 n = len(set_data) | 46 n = len(set_data) |
44 print("Doing %i-way Venn Diagram" % n) | 47 print("Doing %i-way Venn Diagram" % n) |
45 | 48 |
46 | 49 |
47 def load_ids(filename, filetype): | 50 def load_ids(filename, filetype): |
51 """Load ids from files.""" | |
48 if filetype == "tabular": | 52 if filetype == "tabular": |
49 for line in open(filename): | 53 for line in open(filename): |
50 line = line.rstrip("\n") | 54 line = line.rstrip("\n") |
51 if line and not line.startswith("#"): | 55 if line and not line.startswith("#"): |
52 yield line.split("\t", 1)[0] | 56 yield line.split("\t", 1)[0] |
55 if line.startswith(">"): | 59 if line.startswith(">"): |
56 yield line[1:].rstrip("\n").split(None, 1)[0] | 60 yield line[1:].rstrip("\n").split(None, 1)[0] |
57 elif filetype.startswith("fastq"): | 61 elif filetype.startswith("fastq"): |
58 # Use the Galaxy library not Biopython to cope with CS | 62 # Use the Galaxy library not Biopython to cope with CS |
59 from galaxy_utils.sequence.fastq import fastqReader | 63 from galaxy_utils.sequence.fastq import fastqReader |
64 | |
60 handle = open(filename, "rU") | 65 handle = open(filename, "rU") |
61 for record in fastqReader(handle): | 66 for record in fastqReader(handle): |
62 # The [1:] is because the fastaReader leaves the @ on the identifer. | 67 # The [1:] is because the fastaReader leaves the @ on the identifer. |
63 yield record.identifier.split()[0][1:] | 68 yield record.identifier.split()[0][1:] |
64 handle.close() | 69 handle.close() |
66 try: | 71 try: |
67 from Bio.SeqIO import index | 72 from Bio.SeqIO import index |
68 except ImportError: | 73 except ImportError: |
69 sys.exit("Require Biopython 1.54 or later (to read SFF files)") | 74 sys.exit("Require Biopython 1.54 or later (to read SFF files)") |
70 # This will read the SFF index block if present (very fast) | 75 # This will read the SFF index block if present (very fast) |
71 for name in index(filename, "sff"): | 76 for this_name in index(filename, "sff"): |
72 yield name | 77 yield this_name |
73 else: | 78 else: |
74 sys.exit("Unexpected file type %s" % filetype) | 79 sys.exit("Unexpected file type %s" % filetype) |
75 | 80 |
76 | 81 |
77 def load_ids_whitelist(filename, filetype, whitelist): | 82 def load_ids_whitelist(filename, filetype, whitelist): |
78 for name in load_ids(filename, filetype): | 83 """Check if ids are in whitelist.""" |
79 if name in whitelist: | 84 for single_name in load_ids(filename, filetype): |
80 yield name | 85 if single_name in whitelist: |
86 yield single_name | |
81 else: | 87 else: |
82 sys.exit("Unexpected ID %s in %s file %s" % (name, filetype, filename)) | 88 sys.exit( |
89 "Unexpected ID %s in %s file %s" % (single_name, filetype, filename) | |
90 ) | |
83 | 91 |
84 | 92 |
85 if all_file in ["", "-", '""', '"-"']: | 93 if all_file in ["", "-", '""', '"-"']: |
86 # Load without white list | 94 # Load without white list |
87 sets = [set(load_ids(f, t)) for (f, t, c) in set_data] | 95 sets = [set(load_ids(f, t)) for (f, t, c) in set_data] |
96 sets = [set(load_ids_whitelist(f, t, all_ids)) for (f, t, c) in set_data] | 104 sets = [set(load_ids_whitelist(f, t, all_ids)) for (f, t, c) in set_data] |
97 | 105 |
98 for s, (f, t, c) in zip(sets, set_data): | 106 for s, (f, t, c) in zip(sets, set_data): |
99 print("%i in %s" % (len(s), c)) | 107 print("%i in %s" % (len(s), c)) |
100 | 108 |
101 # Now call R library to draw simple Venn diagram | 109 names = [one_name for one_file, one_type, one_name in set_data] |
110 lengths = [len(one_set) for one_set in sets] | |
111 | |
112 if len(sets) == 3: | |
113 try: | |
114 from matplotlib_venn import venn3_unweighted | |
115 except ImportError: | |
116 sys.exit("Requires the Python library matplotlib_venn") | |
117 venn3_unweighted( | |
118 sets, | |
119 [ | |
120 "{} (Total {})".format(name, length) | |
121 for (name, length) in zip(names, lengths) | |
122 ], | |
123 ) | |
124 | |
125 if len(sets) == 2: | |
126 try: | |
127 from matplotlib_venn import venn2_unweighted | |
128 except ImportError: | |
129 sys.exit("Requires the Python library matplotlib_venn") | |
130 venn2_unweighted( | |
131 sets, | |
132 [ | |
133 "{} (Total {})".format(name, length) | |
134 for (name, length) in zip(names, lengths) | |
135 ], | |
136 ) | |
137 | |
138 # not sure what I am doing here, | |
139 # matplotlib_venn does not want to create a single Venn circle | |
140 # stick to the old behavior (rpy and Limma) as much as possible | |
141 if len(sets) == 1: | |
142 try: | |
143 from matplotlib_venn import venn2 | |
144 except ImportError: | |
145 sys.exit("Requires the Python library matplotlib_venn") | |
146 venn2((sets[0], set()), [set_data[0][2], ""]) | |
147 | |
148 plt.title(all_label) | |
149 plt.savefig(pdf_file + ".pdf") | |
150 | |
151 # Format "dat" is not supported. | |
102 try: | 152 try: |
103 # Create dummy Venn diagram counts object for three groups | 153 move(pdf_file + ".pdf", pdf_file) |
104 cols = 'c("%s")' % '","'.join("Set%i" % (i + 1) for i in range(n)) | 154 except (OSError, IOError) as error: |
105 rpy.r('groups <- cbind(%s)' % ','.join(['1'] * n)) | 155 sys.exit("Fail to rename file {}".format(str(error))) |
106 rpy.r('colnames(groups) <- %s' % cols) | 156 |
107 rpy.r('vc <- vennCounts(groups)') | 157 plt.close() |
108 # Populate the 2^n classes with real counts | 158 |
109 # Don't make any assumptions about the class order | |
110 # print(rpy.r('vc')) | |
111 for index, row in enumerate(rpy.r('vc[,%s]' % cols)): | |
112 if isinstance(row, int) or isinstance(row, float): | |
113 # Hack for rpy being too clever for single element row | |
114 row = [row] | |
115 names = all_ids | |
116 for wanted, s in zip(row, sets): | |
117 if wanted: | |
118 names = names.intersection(s) | |
119 else: | |
120 names = names.difference(s) | |
121 rpy.r('vc[%i,"Counts"] <- %i' % (index + 1, len(names))) | |
122 # print(rpy.r('vc')) | |
123 if n == 1: | |
124 # Single circle, don't need to add (Total XXX) line | |
125 names = [c for (t, f, c) in set_data] | |
126 else: | |
127 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f, t, c) in zip(sets, set_data)] | |
128 rpy.r.assign("names", names) | |
129 rpy.r.assign("colors", ["red", "green", "blue"][:n]) | |
130 rpy.r.pdf(pdf_file, 8, 8) | |
131 rpy.r("""vennDiagram(vc, include="both", names=names, | |
132 main="%s", sub="(Total %i)", | |
133 circle.col=colors) | |
134 """ % (all_label, len(all_ids))) | |
135 rpy.r.dev_off() | |
136 except Exception, exc: | |
137 sys.exit("%s" % str(exc)) | |
138 rpy.r.quit(save="no") | |
139 print("Done") | 159 print("Done") |