annotate tools/protein_analysis/seq_analysis_utils.py @ 5:0f1c61998b22

Migrated tool version 0.0.8 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 18:06:27 -0400
parents f3b373a41f81
children a290c6d4e658
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
1 """A few useful functions for working with FASTA files and running jobs.
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
2
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
3 This module was originally written to hold common code used in both the TMHMM
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
4 and SignalP wrappers in Galaxy.
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
5
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
6 Given Galaxy currently supports Python 2.4+ this cannot use the Python module
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
7 multiprocessing so the function run_jobs instead is a simple pool approach
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
8 using just the subprocess library.
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
9 """
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
10 import sys
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
11 import os
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
12 import subprocess
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
13 from time import sleep
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
14
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
15 __version__ = "0.0.1"
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
16
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
def stop_err(msg, error_level=1):
    """Write an error message to stderr and quit with the given error level.

    The message is written to sys.stderr followed by a newline, then
    sys.exit() is called with error_level (which raises SystemExit).
    """
    sys.stderr.write("%s\n" % msg)
    sys.exit(error_level)
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
21
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
def fasta_iterator(filename, max_len=None, truncate=None):
    """Simple FASTA parser yielding tuples of (title, sequence) strings.

    The title is the text after the ">" with trailing whitespace removed;
    the sequence is the record's lines joined together with surrounding
    whitespace stripped from each line.

    If truncate is given (and non-zero) each sequence is cut down to at
    most that many letters. If max_len is given (and non-zero) a
    ValueError is raised for any sequence still longer than that after
    truncation.

    Blank lines and lines starting with "#" are ignored before the first
    record. Any other text before the first record (or after a ">" line
    with an empty title) raises a ValueError.
    """
    title = ""
    chunks = []
    handle = open(filename)
    #The try/finally ensures the file is closed even if a bad line or an
    #over-long sequence raises, or the caller abandons the generator early.
    try:
        for line in handle:
            if line.startswith(">"):
                if title:
                    yield _finish_fasta_record(title, "".join(chunks),
                                               max_len, truncate)
                title = line[1:].rstrip()
                chunks = []
            elif title:
                #Collect the pieces and join once per record, rather
                #than repeatedly extending one ever growing string.
                chunks.append(line.strip())
            elif not line.strip() or line.startswith("#"):
                #Ignore blank lines, and any comment lines
                #between records (starting with hash).
                pass
            else:
                raise ValueError("Bad FASTA line %r" % line)
    finally:
        handle.close()
    if title:
        yield _finish_fasta_record(title, "".join(chunks), max_len, truncate)
    #Just fall off the end here. An explicit "raise StopIteration" inside
    #a generator is turned into a RuntimeError by newer Python versions.


def _finish_fasta_record(title, seq, max_len, truncate):
    """Truncate one record's sequence, check its length, return the tuple."""
    if truncate:
        seq = seq[:truncate]
    if max_len and len(seq) > max_len:
        raise ValueError("Sequence %s is length %i, max length %i" \
                         % (title.split()[0], len(seq), max_len))
    return title, seq
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
54
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None):
    """Split FASTA file into sub-files each of at most n sequences.

    Returns a list of the filenames used, which are named
    "<output_filename_base>.<index>.tmp" with the index counting from zero.

    Each sequence can also be truncated (since we only need the start for
    SignalP), and have its description discarded (since we don't usually
    care about it and some tools don't like very long title lines). With
    keep_descr false only the first word of each title is written.
    Sequences are written wrapped at 60 letters per line.

    If the parser raises a ValueError (a sequence exceeding max_len, or
    a bad line in the input file), any sub-files already written are
    removed and the exception is re-raised.
    """
    #Local import so this function does not depend on the module's
    #top level import list.
    from itertools import islice

    iterator = fasta_iterator(input_filename, max_len, truncate)
    files = []
    try:
        while True:
            #Take the next batch of up to n records; the max() keeps a
            #zero or negative n meaning "no records" rather than an error.
            records = list(islice(iterator, max(n, 0)))
            if not records:
                break
            new_filename = "%s.%i.tmp" % (output_filename_base, len(files))
            handle = open(new_filename, "w")
            try:
                for title, seq in records:
                    if not keep_descr:
                        title = title.split()[0]
                    handle.write(">%s\n" % title)
                    for i in range(0, len(seq), 60):
                        handle.write(seq[i:i+60] + "\n")
            finally:
                #Always release the file, even if a write fails
                handle.close()
            files.append(new_filename)
    except ValueError:
        #Max length (or bad line) failure from parser - clean up
        for f in files:
            if os.path.isfile(f):
                os.remove(f)
        #Bare raise keeps the original traceback
        raise
    return files
bca9bc7fdaef Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
104
3
f3b373a41f81 Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents: 0
diff changeset
def run_jobs(jobs, threads, verbose=False, poll_interval=10):
    """Takes list of cmd strings, returns dict with error levels.

    Each command is started with subprocess.Popen using shell=True, with
    at most threads of them running at once. The returned dict maps each
    command string to its return code. If the same command string is
    given more than once, each copy is run but only one return code is
    kept for it.

    The running jobs are polled every poll_interval seconds (default 10)
    until all have finished. With verbose true, progress messages and the
    commands themselves are written to stdout.

    Raises ValueError if there are jobs to run but threads is less than
    one, since in that case nothing could ever be started.

    NOTE: The commands are handed to the shell as given, so they must
    come from a trusted source with any filenames already quoted.
    """
    pending = list(jobs)
    if pending and threads < 1:
        raise ValueError("Need at least one thread to run jobs, got %r" \
                         % (threads,))
    expected = set(pending)
    running = []
    results = {}
    while pending or running:
        #See if any have finished. Filter on each process's own state
        #(not on the command text) so that a duplicated command which is
        #still running is not dropped when its twin finishes first.
        still_running = []
        for (cmd, process) in running:
            return_code = process.poll() #non-blocking
            if return_code is None:
                still_running.append((cmd, process))
            else:
                results[cmd] = return_code
        running = still_running
        if verbose:
            sys.stdout.write("%i jobs pending, %i running, %i completed\n" \
                             % (len(pending), len(running), len(results)))
        #See if we can start any new threads
        while pending and len(running) < threads:
            cmd = pending.pop(0)
            if verbose:
                sys.stdout.write("%s\n" % (cmd,))
            process = subprocess.Popen(cmd, shell=True)
            running.append((cmd, process))
        #Loop... but only wait if there is still something to wait for,
        #otherwise we would sleep once more after the last job finished.
        if running:
            sleep(poll_interval)
    if verbose:
        sys.stdout.write("%i jobs completed\n" % len(results))
    assert expected == set(results)
    return results