annotate tools/venn_list/venn_list.py @ 8:ee50d9ef9d69 draft

v0.0.11 Python 3 compatible print; capture script version
author peterjc
date Thu, 11 May 2017 06:21:20 -0400
parents ea68a1a4c1d9
children 20d347feb882
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
2 """Plot up to 3-way Venn Diagram using R limma vennDiagram (via rpy)
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
3
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
4 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
5 (formerly SCRI), UK. All rights reserved.
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
6
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
7 See accompanying text file for licence details (MIT License).
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
8 """
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
9
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
10 from __futute__ import print_function
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
11
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
12 import sys
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
13
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
14 if "-v" in sys.argv or "--version" in sys.argv:
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
15 print("v0.0.11")
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
16 sys.exit(0)
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
17
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
18 try:
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
19 import rpy
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
20 except ImportError:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
21 sys.exit("Requires the Python library rpy (to call R)")
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
22 except RuntimeError, e:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
23 sys.exit("The Python library rpy is not availble for the current R version\n\n%s" % e)
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
24
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
25 try:
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
26 rpy.r.library("limma")
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
27 except Exception:
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
28 sys.exit("Requires the R library limma (for vennDiagram function)")
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
29
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
30
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
31 if len(sys.argv) - 1 not in [7, 10, 13]:
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
32 sys.exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv) - 1))
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
33
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
34 all_file, all_type, all_label = sys.argv[1:4]
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
35 set_data = []
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
36 if len(sys.argv) - 1 >= 7:
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
37 set_data.append(tuple(sys.argv[4:7]))
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
38 if len(sys.argv) - 1 >= 10:
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
39 set_data.append(tuple(sys.argv[7:10]))
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
40 if len(sys.argv) - 1 >= 13:
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
41 set_data.append(tuple(sys.argv[10:13]))
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
42 pdf_file = sys.argv[-1]
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
43 n = len(set_data)
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
44 print("Doing %i-way Venn Diagram" % n)
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
45
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
46
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
47 def load_ids(filename, filetype):
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
48 if filetype == "tabular":
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
49 for line in open(filename):
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
50 line = line.rstrip("\n")
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
51 if line and not line.startswith("#"):
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
52 yield line.split("\t", 1)[0]
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
53 elif filetype == "fasta":
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
54 for line in open(filename):
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
55 if line.startswith(">"):
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
56 yield line[1:].rstrip("\n").split(None, 1)[0]
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
57 elif filetype.startswith("fastq"):
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
58 # Use the Galaxy library not Biopython to cope with CS
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
59 from galaxy_utils.sequence.fastq import fastqReader
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
60 handle = open(filename, "rU")
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
61 for record in fastqReader(handle):
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
62 # The [1:] is because the fastaReader leaves the @ on the identifer.
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
63 yield record.identifier.split()[0][1:]
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
64 handle.close()
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
65 elif filetype == "sff":
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
66 try:
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
67 from Bio.SeqIO import index
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
68 except ImportError:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
69 sys.exit("Require Biopython 1.54 or later (to read SFF files)")
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
70 # This will read the SFF index block if present (very fast)
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
71 for name in index(filename, "sff"):
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
72 yield name
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
73 else:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
74 sys.exit("Unexpected file type %s" % filetype)
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
75
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
76
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
77 def load_ids_whitelist(filename, filetype, whitelist):
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
78 for name in load_ids(filename, filetype):
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
79 if name in whitelist:
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
80 yield name
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
81 else:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
82 sys.exit("Unexpected ID %s in %s file %s" % (name, filetype, filename))
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
83
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
84
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
85 if all_file in ["", "-", '""', '"-"']:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
86 # Load without white list
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
87 sets = [set(load_ids(f, t)) for (f, t, c) in set_data]
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
88 # Take union
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
89 all_ids = set()
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
90 for s in sets:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
91 all_ids.update(s)
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
92 print("Inferred total of %i IDs" % len(all_ids))
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
93 else:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
94 all_ids = set(load_ids(all_file, all_type))
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
95 print("Total of %i IDs" % len(all_ids))
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
96 sets = [set(load_ids_whitelist(f, t, all_ids)) for (f, t, c) in set_data]
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
97
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
98 for s, (f, t, c) in zip(sets, set_data):
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
99 print("%i in %s" % (len(s), c))
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
100
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
101 # Now call R library to draw simple Venn diagram
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
102 try:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
103 # Create dummy Venn diagram counts object for three groups
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
104 cols = 'c("%s")' % '","'.join("Set%i" % (i + 1) for i in range(n))
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
105 rpy.r('groups <- cbind(%s)' % ','.join(['1'] * n))
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
106 rpy.r('colnames(groups) <- %s' % cols)
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
107 rpy.r('vc <- vennCounts(groups)')
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
108 # Populate the 2^n classes with real counts
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
109 # Don't make any assumptions about the class order
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
110 # print(rpy.r('vc'))
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
111 for index, row in enumerate(rpy.r('vc[,%s]' % cols)):
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
112 if isinstance(row, int) or isinstance(row, float):
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
113 # Hack for rpy being too clever for single element row
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
114 row = [row]
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
115 names = all_ids
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
116 for wanted, s in zip(row, sets):
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
117 if wanted:
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
118 names = names.intersection(s)
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
119 else:
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
120 names = names.difference(s)
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
121 rpy.r('vc[%i,"Counts"] <- %i' % (index + 1, len(names)))
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
122 # print(rpy.r('vc'))
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
123 if n == 1:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
124 # Single circle, don't need to add (Total XXX) line
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
125 names = [c for (t, f, c) in set_data]
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
126 else:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
127 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f, t, c) in zip(sets, set_data)]
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
128 rpy.r.assign("names", names)
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
129 rpy.r.assign("colors", ["red", "green", "blue"][:n])
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
130 rpy.r.pdf(pdf_file, 8, 8)
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
131 rpy.r("""vennDiagram(vc, include="both", names=names,
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
132 main="%s", sub="(Total %i)",
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
133 circle.col=colors)
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
134 """ % (all_label, len(all_ids)))
4
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
135 rpy.r.dev_off()
991342eca214 Uploaded v0.0.8a, declare Biopython dependency via Tool Shed
peterjc
parents:
diff changeset
136 except Exception, exc:
6
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
137 sys.exit("%s" % str(exc))
ea68a1a4c1d9 v0.0.10 explicit galaxy_sequence_utils dependency etc
peterjc
parents: 4
diff changeset
138 rpy.r.quit(save="no")
8
ee50d9ef9d69 v0.0.11 Python 3 compatible print; capture script version
peterjc
parents: 6
diff changeset
139 print("Done")