predictnls: tools/protein_analysis/predictnls.py comparison

comparison tools/protein_analysis/predictnls.py @ 0:6e26c5a48e9a draft

Uploaded v0.0.4, first public release.

author	peterjc
date	Wed, 20 Feb 2013 11:39:06 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:6e26c5a48e9a
+#!/usr/bin/env python
+#Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK
+#
+#Licenced under the GPL (GNU General Public Licence) version 3.
+#
+#Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later
+#versions up to predictnls v1.0.20 (copright 2012), by Rajesh Nair
+#(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab,
+#Columbia University http://rostlab.org/
+"""Batch mode predictNLS, for finding nuclear localization signals
+This is a Python script re-implementing the predictNLS method, originally
+written in Perl, described here:
+Murat Cokol, Rajesh Nair, and Burkhard Rost.
+Finding nuclear localization signals.
+EMBO reports 1(5), 411-415, 2000
+http://dx.doi.org/10.1093/embo-reports/kvd092
+The original Perl script was designed to work on a single sequence at a time,
+but offers quite detailed output, including HTML (webpage).
+This Python version is designed to work on a single FASTA file containing
+multiple sequences, and produces a single tabular output file, with one line
+per NLS found (i.e. zero or more rows per query sequence).
+It takes either two or three command line arguments:
+predictNLS_batch input_file output_file [nls_motif_file]
+The input file should be protein sequences in FASTA format, the output file
+is tab separated plain text, and the NLS motif file defaults to using the
+plain text My_NLS_list file located next to the script file, or in a data
+subdirectory.
+For convience if using this outside Galaxy, the input filename can be '-'
+to mean stdin, and likewise the output filename can be '-' to mean stdout.
+Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to
+predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in
+August 2010, see the change log included in those tar-balls).
+The Rost Lab provide source code tar balls for predictNLS on the FTP site
+ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they
+recommend their package repository instead,
+https://rostlab.org/owiki/index.php/Packages
+"""
+import os
+import sys
+import re
+def stop_err(msg, return_code=1):
+sys.stderr.write(msg.rstrip() + "\n")
+sys.exit(return_code)
+if len(sys.argv) == 4:
+fasta_filename, tabular_filename, re_filename = sys.argv[1:]
+elif len(sys.argv) == 3:
+fasta_filename, tabular_filename = sys.argv[1:]
+#Use os.path.realpath(...) to handle being called via a symlink
+#Try under subdirectory data:
+re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
+"data", "My_NLS_list")
+if not os.path.isfile(re_filename):
+#Try in same directory as this script:
+re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
+"My_NLS_list")
+else:
+stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file")
+if not os.path.isfile(fasta_filename):
+stop_err("Could not find FASTA input file: %s" % fasta_filename)
+if not os.path.isfile(re_filename):
+stop_err("Could not find NLS motif file: %s" % re_filename)
+def load_re(filename):
+"""Parse the 5+ column tabular NLS motif file."""
+handle = open(filename, "rU")
+for line in handle:
+line = line.rstrip("\n")
+if not line:
+continue
+parts = line.split("\t")
+assert 5 <= len(parts), parts
+regex, evidence, p_count, percent_nuc, precent_non_nuc = parts[0:5]
+try:
+regex = re.compile(regex)
+p_count = int(p_count)
+except ValueError:
+stop_err("Bad data in line: %s" % line)
+if 6 <= len(parts):
+proteins = parts[5]
+assert p_count == len(proteins.split(",")), line
+else:
+proteins = ""
+assert p_count == 0
+if 7 <= len(parts):
+domains = parts[6]
+assert int(p_count) == len(domains.split(",")), line
+else:
+domains = ""
+assert p_count == 0
+#There can be further columns (DNA binding?), but we don't use them.
+yield regex, evidence, p_count, percent_nuc, proteins, domains
+handle.close()
+def fasta_iterator(filename):
+"""Simple FASTA parser yielding tuples of (name, upper case sequence)."""
+if filename == "-":
+handle = sys.stdin
+else:
+handle = open(filename)
+name, seq = "", ""
+for line in handle:
+if line.startswith(">"):
+if name:
+yield name, seq
+#Take the first word only as the name:
+name = line[1:].rstrip().split(None,1)[0]
+seq = ""
+elif name:
+#Simple way would leave in any internal white space,
+#seq += line.strip().upper()
+seq += "".join(line.strip().upper().split())
+elif not line.strip():
+#Ignore blank lines before first record
+pass
+else:
+raise ValueError("Bad FASTA line %r" % line)
+if filename != "-":
+handle.close()
+if name:
+yield name, seq
+raise StopIteration
+motifs = list(load_re(re_filename))
+print "Looking for %i NLS motifs" % len(motifs)
+if tabular_filename == "-":
+out_handle = sys.stdout
+else:
+out_handle = open(tabular_filename, "w")
+out_handle.write("#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n")
+count = 0
+nls = 0
+for idn, seq in fasta_iterator(fasta_filename):
+for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs:
+#Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40
+#This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches
+for match in regex.finditer(seq):
+#Perl predictnls v1.0.17 (and older) return NLS start position with zero
+#but changed to one based counting in v1.0.18 (June 2011) onwards, Bug #38
+#We therefore also use one based couting, hence the start+1 here:
+out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n" \
+% (idn, match.start()+1, match.group(),
+regex.pattern, evidence, p_count,
+percent_nuc_prot, proteins, domains))
+nls += 1
+count += 1
+if tabular_filename != "-":
+out_handle.close()
+print "Found %i NLS motifs in %i sequences" % (nls, count)

Mercurial > repos > peterjc > predictnls

comparison tools/protein_analysis/predictnls.py @ 0:6e26c5a48e9a draft