comparison tools/protein_analysis/predictnls.py @ 0:6e26c5a48e9a draft

Uploaded v0.0.4, first public release.
author peterjc
date Wed, 20 Feb 2013 11:39:06 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e26c5a48e9a
1 #!/usr/bin/env python
2
3 #Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK
4 #
5 #Licenced under the GPL (GNU General Public Licence) version 3.
6 #
7 #Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later
8 #versions up to predictnls v1.0.20 (copright 2012), by Rajesh Nair
9 #(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab,
10 #Columbia University http://rostlab.org/
11
12 """Batch mode predictNLS, for finding nuclear localization signals
13
14 This is a Python script re-implementing the predictNLS method, originally
15 written in Perl, described here:
16
17 Murat Cokol, Rajesh Nair, and Burkhard Rost.
18 Finding nuclear localization signals.
19 EMBO reports 1(5), 411-415, 2000
20
21 http://dx.doi.org/10.1093/embo-reports/kvd092
22
23 The original Perl script was designed to work on a single sequence at a time,
24 but offers quite detailed output, including HTML (webpage).
25
26 This Python version is designed to work on a single FASTA file containing
27 multiple sequences, and produces a single tabular output file, with one line
28 per NLS found (i.e. zero or more rows per query sequence).
29
30 It takes either two or three command line arguments:
31
32 predictNLS_batch input_file output_file [nls_motif_file]
33
34 The input file should be protein sequences in FASTA format, the output file
35 is tab separated plain text, and the NLS motif file defaults to using the
36 plain text My_NLS_list file located next to the script file, or in a data
37 subdirectory.
38
39 For convience if using this outside Galaxy, the input filename can be '-'
40 to mean stdin, and likewise the output filename can be '-' to mean stdout.
41
42 Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to
43 predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in
44 August 2010, see the change log included in those tar-balls).
45
46 The Rost Lab provide source code tar balls for predictNLS on the FTP site
47 ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they
48 recommend their package repository instead,
49 https://rostlab.org/owiki/index.php/Packages
50 """
51
52 import os
53 import sys
54 import re
55
56 def stop_err(msg, return_code=1):
57 sys.stderr.write(msg.rstrip() + "\n")
58 sys.exit(return_code)
59
60 if len(sys.argv) == 4:
61 fasta_filename, tabular_filename, re_filename = sys.argv[1:]
62 elif len(sys.argv) == 3:
63 fasta_filename, tabular_filename = sys.argv[1:]
64 #Use os.path.realpath(...) to handle being called via a symlink
65 #Try under subdirectory data:
66 re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
67 "data", "My_NLS_list")
68 if not os.path.isfile(re_filename):
69 #Try in same directory as this script:
70 re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
71 "My_NLS_list")
72 else:
73 stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file")
74
75 if not os.path.isfile(fasta_filename):
76 stop_err("Could not find FASTA input file: %s" % fasta_filename)
77
78 if not os.path.isfile(re_filename):
79 stop_err("Could not find NLS motif file: %s" % re_filename)
80
81 def load_re(filename):
82 """Parse the 5+ column tabular NLS motif file."""
83 handle = open(filename, "rU")
84 for line in handle:
85 line = line.rstrip("\n")
86 if not line:
87 continue
88 parts = line.split("\t")
89 assert 5 <= len(parts), parts
90 regex, evidence, p_count, percent_nuc, precent_non_nuc = parts[0:5]
91 try:
92 regex = re.compile(regex)
93 p_count = int(p_count)
94 except ValueError:
95 stop_err("Bad data in line: %s" % line)
96 if 6 <= len(parts):
97 proteins = parts[5]
98 assert p_count == len(proteins.split(",")), line
99 else:
100 proteins = ""
101 assert p_count == 0
102 if 7 <= len(parts):
103 domains = parts[6]
104 assert int(p_count) == len(domains.split(",")), line
105 else:
106 domains = ""
107 assert p_count == 0
108 #There can be further columns (DNA binding?), but we don't use them.
109 yield regex, evidence, p_count, percent_nuc, proteins, domains
110 handle.close()
111
112 def fasta_iterator(filename):
113 """Simple FASTA parser yielding tuples of (name, upper case sequence)."""
114 if filename == "-":
115 handle = sys.stdin
116 else:
117 handle = open(filename)
118 name, seq = "", ""
119 for line in handle:
120 if line.startswith(">"):
121 if name:
122 yield name, seq
123 #Take the first word only as the name:
124 name = line[1:].rstrip().split(None,1)[0]
125 seq = ""
126 elif name:
127 #Simple way would leave in any internal white space,
128 #seq += line.strip().upper()
129 seq += "".join(line.strip().upper().split())
130 elif not line.strip():
131 #Ignore blank lines before first record
132 pass
133 else:
134 raise ValueError("Bad FASTA line %r" % line)
135 if filename != "-":
136 handle.close()
137 if name:
138 yield name, seq
139 raise StopIteration
140
141 motifs = list(load_re(re_filename))
142 print "Looking for %i NLS motifs" % len(motifs)
143
144 if tabular_filename == "-":
145 out_handle = sys.stdout
146 else:
147 out_handle = open(tabular_filename, "w")
148 out_handle.write("#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n")
149 count = 0
150 nls = 0
151 for idn, seq in fasta_iterator(fasta_filename):
152 for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs:
153 #Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40
154 #This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches
155 for match in regex.finditer(seq):
156 #Perl predictnls v1.0.17 (and older) return NLS start position with zero
157 #but changed to one based counting in v1.0.18 (June 2011) onwards, Bug #38
158 #We therefore also use one based couting, hence the start+1 here:
159 out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n" \
160 % (idn, match.start()+1, match.group(),
161 regex.pattern, evidence, p_count,
162 percent_nuc_prot, proteins, domains))
163 nls += 1
164 count += 1
165 if tabular_filename != "-":
166 out_handle.close()
167 print "Found %i NLS motifs in %i sequences" % (nls, count)