# HG changeset patch # User galaxyp # Date 1371740964 14400 # Node ID 9156a440afed738d4e4197277aa384f1db5941f7 Improved some datatype handling diff -r 000000000000 -r 9156a440afed README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Thu Jun 20 11:09:24 2013 -0400 @@ -0,0 +1,1 @@ +Uncategories Utility-style Tools for Galaxy-P \ No newline at end of file diff -r 000000000000 -r 9156a440afed datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Thu Jun 20 11:09:24 2013 -0400 @@ -0,0 +1,9 @@ + + + + + + + + + diff -r 000000000000 -r 9156a440afed filter_by_an_id.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_an_id.py Thu Jun 20 11:09:24 2013 -0400 @@ -0,0 +1,108 @@ +""" A script to build specific fasta databases """ +import sys +import logging + +#===================================== Iterator =============================== +class Sequence: + ''' Holds protein sequence information ''' + def __init__(self): + self.header = "" + self.sequence_parts = [] + + def get_sequence(self): + return "".join([line.rstrip().replace('\n','').replace('\r','') for line in self.sequence_parts]) + +class FASTAReader: + """ + FASTA db iterator. Returns a single FASTA sequence object. + """ + def __init__(self, fasta_name): + self.fasta_file = open(fasta_name) + self.next_line = self.fasta_file.readline() + + def __iter__(self): + return self + + def next(self): + ''' Iteration ''' + #while True: + # line = self.fasta_file.readline() + # if not line: + # raise StopIteration + # if line[0] == '>': + # break + next_line = self.next_line + if not next_line: + raise StopIteration + + seq = Sequence() + seq.header = next_line.rstrip().replace('\n','').replace('\r','') + + next_line = self.fasta_file.readline() + while next_line and next_line[0] != '>': + #tail = self.fasta_file.tell() + #line = self.fasta_file.readline() + #if not line: + # break + #if line[0] == '>': + # self.fasta_file.seek(tail) + # break + seq.sequence_parts.append(next_line) + next_line = self.fasta_file.readline() + self.next_line = next_line + return seq +#============================================================================== + +def target_match(target, search_entry): + ''' Matches ''' + search_entry = search_entry.upper() + for atarget in target: + if search_entry.find(atarget) > -1: + return atarget + return None + + +def main(): + ''' the main function''' + logging.basicConfig(filename='filter_fasta_log', + level=logging.INFO, + format='%(asctime)s :: %(levelname)s :: %(message)s',) + + used_sequences = set() + work_summary = {'wanted': 0, 'found':0, 'duplicates':0} + targets = [] + + f_target = open(sys.argv[1]) + for line in f_target.readlines(): + targets.append(">%s" % line.strip().upper()) + f_target.close() + + logging.info('Read target file and am now looking for %d %s', len(targets), 'sequences.') + + work_summary['wanted'] = len(targets) + homd_db = FASTAReader(sys.argv[2]) + + i = 0 + output = open(sys.argv[3], "w") + try: + for entry in homd_db: + target_matched_results = target_match(targets, entry.header) + if target_matched_results: + work_summary['found'] += 1 + targets.remove(target_matched_results) + sequence = entry.get_sequence() + if sequence in used_sequences: + work_summary['duplicates'] += 1 + else: + used_sequences.add(sequence) + print >>output, entry.header + print >>output, sequence + finally: + output.close() + + logging.info('Completed filtering') + for parm, count in work_summary.iteritems(): + logging.info('%s ==> %d', parm, count) + +if __name__ == "__main__": + main() diff -r 000000000000 -r 9156a440afed filter_by_an_id.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_by_an_id.xml Thu Jun 20 11:09:24 2013 -0400 @@ -0,0 +1,13 @@ + + Extract sequences from a FASTA file based on a list of IDs + filter_by_an_id.py $identifiers $input $output + + + + + + + + + + diff -r 000000000000 -r 9156a440afed galaxyp_util.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxyp_util.py Thu Jun 20 11:09:24 2013 -0400 @@ -0,0 +1,37 @@ +from galaxy.datatypes.tabular import Tabular +import logging + +log = logging.getLogger(__name__) + + +class PepXmlReport(Tabular): + """pepxml converted to tabular report""" + file_ext = "tsv" + + def __init__(self, **kwd): + Tabular.__init__( self, **kwd ) + self.column_names = ['Protein', 'Peptide', 'Assumed Charge', 'Neutral Pep Mass (calculated)', 'Neutral Mass', 'Retention Time', 'Start Scan', 'End Scan', 'Search Engine', 'PeptideProphet Probability', 'Interprophet Probabaility'] + + def set_meta( self, dataset, **kwd ): + Tabular.set_meta( self, dataset, **kwd ) + + #def display_peek( self, dataset ): + # """Returns formated html of peek""" + # return Tabular.make_html_table( self, dataset, column_names=self.column_names ) + + +class ProtXmlReport(Tabular): + """protxml converted to tabular report""" + file_ext = "tsv" + comment_lines = 1 + + def __init__(self, **kwd): + Tabular.__init__( self, **kwd ) + self.column_names = ["Entry Number", "Group Probability", "Protein", "Protein Link", "Protein Probability", "Percent Coverage", "Number of Unique Peptides", "Total Independent Spectra", "Percent Share of Spectrum ID's", "Description", "Protein Molecular Weight", "Protein Length", "Is Nondegenerate Evidence", "Weight", "Precursor Ion Charge", "Peptide sequence", "Peptide Link", "NSP Adjusted Probability", "Initial Probability", "Number of Total Termini", "Number of Sibling Peptides Bin", "Number of Instances", "Peptide Group Designator", "Is Evidence?"] + + def set_meta( self, dataset, **kwd ): + Tabular.set_meta( self, dataset, **kwd ) + + #def display_peek( self, dataset ): + # """Returns formated html of peek""" + # return Tabular.make_html_table( self, dataset, column_names=self.column_names ) diff -r 000000000000 -r 9156a440afed pepxml_to_xls.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pepxml_to_xls.xml Thu Jun 20 11:09:24 2013 -0400 @@ -0,0 +1,24 @@ + + + + + transproteomic_pipeline + + + + + pepxml_viewer_wrapper.py --input=${input} --export_spreadsheet + + + + + + + + + + + + + + diff -r 000000000000 -r 9156a440afed pepxml_viewer_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pepxml_viewer_wrapper.py Thu Jun 20 11:09:24 2013 -0400 @@ -0,0 +1,95 @@ +#!/usr/bin/env python +import optparse +import os +import sys +import tempfile +import shutil +import subprocess +import re +from os.path import basename +import logging + +assert sys.version_info[:2] >= ( 2, 6 ) + +log = logging.getLogger(__name__) +working_directory = os.getcwd() +tmp_stderr_name = tempfile.NamedTemporaryFile(dir = working_directory, suffix = '.stderr').name +tmp_stdout_name = tempfile.NamedTemporaryFile(dir = working_directory, suffix = '.stdout').name + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def read_stderr(): + stderr = '' + if(os.path.exists(tmp_stderr_name)): + with open(tmp_stderr_name, 'rb') as tmp_stderr: + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read(buffsize) + if not stderr or len(stderr) % buffsize != 0: + break + except OverflowError: + pass + return stderr + +def execute(command, stdin=None): + with open(tmp_stderr_name, 'wb') as tmp_stderr: + with open(tmp_stdout_name, 'wb') as tmp_stdout: + proc = subprocess.Popen(args=command, shell=True, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno(), stdin=stdin, env=os.environ) + returncode = proc.wait() + if returncode != 0: + raise Exception, "Program returned with non-zero exit code %d. stderr: %s" % (returncode, read_stderr()) + +def delete_file(path): + if os.path.exists(path): + try: + os.remove(path) + except: + pass + +def delete_directory(directory): + if os.path.exists(directory): + try: + shutil.rmtree(directory) + except: + pass + +def symlink(source, link_name): + import platform + if platform.system() == 'Windows': + import win32file + win32file.CreateSymbolicLink(source, link_name, 1) + else: + os.symlink(source, link_name) + + +def copy_to_working_directory(data_file, relative_path): + if os.path.abspath(data_file) != os.path.abspath(relative_path): + shutil.copy(data_file, relative_path) + return relative_path + +def __main__(): + run_script() + +#ENDTEMPLATE + + +def run_script(): + parser = optparse.OptionParser() + parser.add_option("--input") + parser.add_option("--export_spreadsheet", action="store_true", dest="export_spreadsheet") + parser.set_defaults(export_spreadsheet=False) + (options, args) = parser.parse_args() + + copy_to_working_directory(options.input, "input.pep.xml") + cmd = "PepXMLViewer.cgi -I input.pep.xml" + cmd = "%s %s" % (cmd, "-B exportSpreadsheet") + if options.export_spreadsheet: + cmd = "%s %s" % (cmd, "1") + else: + cmd = "%s %s" % (cmd, "0") + execute(cmd) + +if __name__ == '__main__': __main__() diff -r 000000000000 -r 9156a440afed protxml2html_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/protxml2html_wrapper.py Thu Jun 20 11:09:24 2013 -0400 @@ -0,0 +1,94 @@ +#!/usr/bin/env python +import optparse +import os +import sys +import tempfile +import shutil +import subprocess +import re +from os.path import basename +import logging + +assert sys.version_info[:2] >= ( 2, 6 ) + +log = logging.getLogger(__name__) +working_directory = os.getcwd() +tmp_stderr_name = tempfile.NamedTemporaryFile(dir = working_directory, suffix = '.stderr').name +tmp_stdout_name = tempfile.NamedTemporaryFile(dir = working_directory, suffix = '.stdout').name + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def read_stderr(): + stderr = '' + if(os.path.exists(tmp_stderr_name)): + with open(tmp_stderr_name, 'rb') as tmp_stderr: + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read(buffsize) + if not stderr or len(stderr) % buffsize != 0: + break + except OverflowError: + pass + return stderr + +def execute(command, stdin=None): + with open(tmp_stderr_name, 'wb') as tmp_stderr: + with open(tmp_stdout_name, 'wb') as tmp_stdout: + proc = subprocess.Popen(args=command, shell=True, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno(), stdin=stdin, env=os.environ) + returncode = proc.wait() + if returncode != 0: + raise Exception, "Program returned with non-zero exit code %d. stderr: %s" % (returncode, read_stderr()) + +def delete_file(path): + if os.path.exists(path): + try: + os.remove(path) + except: + pass + +def delete_directory(directory): + if os.path.exists(directory): + try: + shutil.rmtree(directory) + except: + pass + +def symlink(source, link_name): + import platform + if platform.system() == 'Windows': + import win32file + win32file.CreateSymbolicLink(source, link_name, 1) + else: + os.symlink(source, link_name) + + +def copy_to_working_directory(data_file, relative_path): + if os.path.abspath(data_file) != os.path.abspath(relative_path): + shutil.copy(data_file, relative_path) + return relative_path + +def __main__(): + run_script() + +#ENDTEMPLATE + + +def run_script(): + parser = optparse.OptionParser() + parser.add_option("--input") + parser.add_option("--export_spreadsheet", action="store_true", dest="export_spreadsheet") + parser.set_defaults(export_spreadsheet=False) + (options, args) = parser.parse_args() + + copy_to_working_directory(options.input, "input.prot.xml") + cmd = "protxml2html.pl -file ./input.prot.xml" + if options.export_spreadsheet: + cmd = "%s FORMAT EXCEL" % cmd + else: + cmd = "%s FORMAT HTML" % cmd + execute(cmd) + +if __name__ == '__main__': __main__() diff -r 000000000000 -r 9156a440afed protxml_to_xls.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/protxml_to_xls.xml Thu Jun 20 11:09:24 2013 -0400 @@ -0,0 +1,22 @@ + + + + + protxml2html_wrapper.py --input=${input} --export_spreadsheet + + + + + + + + + + + + transproteomic_pipeline + + + + +