changeset 0:9156a440afed draft default tip

Improved some datatype handling
author galaxyp
date Thu, 20 Jun 2013 11:09:24 -0400
parents
children
files README datatypes_conf.xml filter_by_an_id.py filter_by_an_id.xml galaxyp_util.py pepxml_to_xls.xml pepxml_viewer_wrapper.py protxml2html_wrapper.py protxml_to_xls.xml
diffstat 9 files changed, 403 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Thu Jun 20 11:09:24 2013 -0400
@@ -0,0 +1,1 @@
+Uncategorized Utility-style Tools for Galaxy-P
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Thu Jun 20 11:09:24 2013 -0400
@@ -0,0 +1,9 @@
<?xml version="1.0"?>
<datatypes>
  <datatype_files>
    <datatype_file name="galaxyp_util.py"/>
  </datatype_files>
  <registration>
    <datatype extension="pepxml.tsv" type="galaxy.datatypes.galaxyp_util:PepXmlReport" display_in_upload="true" />
    <!-- protxml_to_xls.xml produces datasets of format "protxml.tsv";
         register the matching ProtXmlReport datatype (defined in
         galaxyp_util.py) so that format resolves instead of being an
         unknown extension. -->
    <datatype extension="protxml.tsv" type="galaxy.datatypes.galaxyp_util:ProtXmlReport" display_in_upload="true" />
  </registration>
</datatypes>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_an_id.py	Thu Jun 20 11:09:24 2013 -0400
@@ -0,0 +1,108 @@
+""" A script to build specific fasta databases """
+import sys
+import logging
+
+#===================================== Iterator ===============================
class Sequence:
    """Holds one FASTA record: a header line plus its raw sequence lines."""

    def __init__(self):
        self.header = ""          # '>' header line, line endings stripped
        self.sequence_parts = []  # raw sequence lines as read, endings included

    def get_sequence(self):
        """Return the full sequence with all line endings removed."""
        return "".join([line.rstrip().replace('\n', '').replace('\r', '') for line in self.sequence_parts])


class FASTAReader:
    """
    FASTA db iterator. Returns a single FASTA sequence object per record.
    """

    def __init__(self, fasta_name):
        self.fasta_file = open(fasta_name)
        # One line of read-ahead so next() can detect record boundaries and
        # EOF without seeking backwards in the file.
        self.next_line = self.fasta_file.readline()

    def __iter__(self):
        return self

    def next(self):
        """Return the next Sequence; raise StopIteration at end of file."""
        header_line = self.next_line
        if not header_line:
            raise StopIteration

        seq = Sequence()
        seq.header = header_line.rstrip().replace('\n', '').replace('\r', '')

        # Collect sequence lines until the next '>' header or EOF.
        line = self.fasta_file.readline()
        while line and line[0] != '>':
            seq.sequence_parts.append(line)
            line = self.fasta_file.readline()
        self.next_line = line
        return seq

    # Python 3 spells the iterator protocol __next__; this alias keeps the
    # reader usable under both Python 2 and Python 3.
    __next__ = next
+#==============================================================================
+
def target_match(target, search_entry):
    """Return the first ID in *target* found inside *search_entry*, else None.

    *search_entry* is upper-cased before comparison; the IDs in *target*
    are expected to be upper-cased already (see main()).
    """
    search_entry = search_entry.upper()
    for candidate in target:
        # Substring containment, not equality: headers carry descriptions
        # after the ID.
        if candidate in search_entry:
            return candidate
    return None
+       
+
def main():
    """Filter a FASTA database down to the records named in an ID list.

    argv[1] -- text file with one identifier per line
    argv[2] -- FASTA database to filter
    argv[3] -- output FASTA file of matching, de-duplicated records

    Progress and summary counts are logged to 'filter_fasta_log'.
    """
    logging.basicConfig(filename='filter_fasta_log',
        level=logging.INFO,
        format='%(asctime)s :: %(levelname)s :: %(message)s',)

    used_sequences = set()
    work_summary = {'wanted': 0, 'found': 0, 'duplicates': 0}
    targets = []

    # Upper-case IDs and prefix '>' so they compare directly against FASTA
    # header lines. Blank lines are skipped: a bare '>' target would
    # otherwise match every record in the database.
    with open(sys.argv[1]) as f_target:
        for line in f_target:
            identifier = line.strip()
            if identifier:
                targets.append(">%s" % identifier.upper())

    logging.info('Read target file and am now looking for %d sequences.', len(targets))

    work_summary['wanted'] = len(targets)
    fasta_db = FASTAReader(sys.argv[2])

    output = open(sys.argv[3], "w")
    try:
        for entry in fasta_db:
            matched_target = target_match(targets, entry.header)
            if matched_target:
                work_summary['found'] += 1
                # Each ID is expected at most once; removing it shrinks the
                # search space for subsequent records.
                targets.remove(matched_target)
                sequence = entry.get_sequence()
                if sequence in used_sequences:
                    work_summary['duplicates'] += 1
                else:
                    used_sequences.add(sequence)
                    # write() instead of the Python-2-only "print >>" form
                    # keeps the script runnable under Python 2 and 3.
                    output.write(entry.header + "\n")
                    output.write(sequence + "\n")
    finally:
        output.close()

    logging.info('Completed filtering')
    for parm, count in work_summary.items():  # items() works on Py2 and Py3
        logging.info('%s ==> %d', parm, count)

if __name__ == "__main__":
    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_an_id.xml	Thu Jun 20 11:09:24 2013 -0400
@@ -0,0 +1,13 @@
<tool id="filter_by_an_id" version="0.1" name="Filter FASTA by IDs">
  <description>Extract sequences from a FASTA file based on a list of IDs</description>
  <command interpreter="python">filter_by_an_id.py $identifiers $input $output</command>
  <inputs>
    <param format="fasta" name="input" type="data" label="FASTA sequences"/>
    <param format="txt" name="identifiers" type="data" label="List of IDs to extract sequences for"/>
  </inputs>
  <outputs>
    <data format="fasta" name="output" label="FASTA sequences for ${identifiers.name}"/>
  </outputs>
  <help>
    Reads one identifier per line from the ID dataset and writes every FASTA
    record whose header contains one of the identifiers (matching is
    case-insensitive). Duplicate sequences are written only once.
  </help>
</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxyp_util.py	Thu Jun 20 11:09:24 2013 -0400
@@ -0,0 +1,37 @@
+from galaxy.datatypes.tabular import Tabular
+import logging
+
+log = logging.getLogger(__name__)
+
+
class PepXmlReport(Tabular):
    """Tabular datatype for a pepxml file converted to a spreadsheet report."""
    file_ext = "tsv"

    def __init__(self, **kwd):
        Tabular.__init__( self, **kwd )
        # Fixed column layout of the converted report. Typo "Probabaility"
        # corrected to "Probability" in the last column header.
        self.column_names = ['Protein', 'Peptide', 'Assumed Charge', 'Neutral Pep Mass (calculated)', 'Neutral Mass', 'Retention Time', 'Start Scan', 'End Scan', 'Search Engine', 'PeptideProphet Probability', 'Interprophet Probability']

    def set_meta( self, dataset, **kwd ):
        # No metadata beyond the Tabular defaults.
        Tabular.set_meta( self, dataset, **kwd )
+
+
class ProtXmlReport(Tabular):
    """Tabular datatype for a protxml file converted to a tabular report."""
    file_ext = "tsv"
    # The converted report starts with one header/comment line.
    comment_lines = 1

    def __init__(self, **kwd):
        Tabular.__init__(self, **kwd)
        # Fixed column layout of the converted report.
        self.column_names = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]

    def set_meta(self, dataset, **kwd):
        # Delegate metadata detection to the Tabular base class.
        Tabular.set_meta(self, dataset, **kwd)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pepxml_to_xls.xml	Thu Jun 20 11:09:24 2013 -0400
@@ -0,0 +1,24 @@
<tool id="pepxml_to_xls" name="Convert PepXML to Tabular" version="0.1.0">
  <description></description>

  <requirements>
    <requirement type="package">transproteomic_pipeline</requirement>
  </requirements>


  <!-- NOTE(review): the output is collected via from_work_dir="input.pep.xls",
       presumably the file PepXMLViewer.cgi writes beside the staged
       input.pep.xml when exportSpreadsheet is enabled - confirm against the
       TPP documentation. -->
  <command interpreter="python">
    pepxml_viewer_wrapper.py --input=${input} --export_spreadsheet
  </command>

  <inputs>
    <param format="pepxml" name="input" type="data" label="Pep XML Input"/>
  </inputs>

  <outputs>
    <data format="pepxml.tsv" name="output" from_work_dir="input.pep.xls" />
  </outputs>

  <help>
    
  </help>
</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pepxml_viewer_wrapper.py	Thu Jun 20 11:09:24 2013 -0400
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+import optparse
+import os
+import sys
+import tempfile
+import shutil 
+import subprocess
+import re
+from os.path import basename
+import logging
+
# Wrapper relies on features available from Python 2.6 onwards.
assert sys.version_info[:2] >= ( 2, 6 )

log = logging.getLogger(__name__)
working_directory = os.getcwd()
# NOTE(review): only the .name of each NamedTemporaryFile is kept; the file
# object may be garbage-collected (deleting the file), so this is effectively
# a mktemp()-style name reservation and is mildly race-prone - confirm.
tmp_stderr_name = tempfile.NamedTemporaryFile(dir = working_directory, suffix = '.stderr').name
tmp_stdout_name = tempfile.NamedTemporaryFile(dir = working_directory, suffix = '.stdout').name
+
def stop_err( msg ):
    """Write *msg* to stderr and abort with a non-zero exit status."""
    sys.stderr.write( "%s\n" % msg )
    # sys.exit() with no argument exits with status 0, which would make the
    # failed job look successful to Galaxy; exit 1 to signal the error.
    sys.exit( 1 )
+
def read_stderr():
    """Return whatever the wrapped command wrote to stderr, or ''."""
    captured = ''
    if not os.path.exists(tmp_stderr_name):
        return captured
    chunk_size = 1048576
    with open(tmp_stderr_name, 'rb') as handle:
        try:
            while True:
                captured += handle.read(chunk_size)
                # An empty or partial chunk means the file is drained.
                if not captured or len(captured) % chunk_size != 0:
                    break
        except OverflowError:
            pass
    return captured
+    
def execute(command, stdin=None):
    # Run *command* through the shell, redirecting stdout/stderr to the
    # module-level temp files so stderr can be reported on failure.
    # NOTE(review): shell=True means *command* is interpreted by the shell;
    # callers must only pass trusted, wrapper-built command strings.
    with open(tmp_stderr_name, 'wb') as tmp_stderr:
        with open(tmp_stdout_name, 'wb') as tmp_stdout:
            proc = subprocess.Popen(args=command, shell=True, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno(), stdin=stdin, env=os.environ)
            returncode = proc.wait()
            if returncode != 0:
                # Python 2 raise syntax; this module is Python 2 throughout.
                raise Exception, "Program returned with non-zero exit code %d. stderr: %s" % (returncode, read_stderr())
+
def delete_file(path):
    """Best-effort removal of *path*; failures are ignored."""
    if os.path.exists(path):
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup only - but catch just OS failures; a bare
            # except would also swallow KeyboardInterrupt/SystemExit.
            pass
+
def delete_directory(directory):
    """Best-effort recursive removal of *directory*; failures are ignored."""
    if os.path.exists(directory):
        try:
            shutil.rmtree(directory)
        except OSError:
            # Best-effort cleanup only - but catch just OS failures; a bare
            # except would also swallow KeyboardInterrupt/SystemExit.
            pass
+
def symlink(source, link_name):
    """Create *link_name* pointing at *source*, on Windows or POSIX."""
    import platform
    if platform.system() == 'Windows':
        import win32file
        # pywin32's CreateSymbolicLink takes (new_link_name, target, flags);
        # the original call passed them reversed.
        win32file.CreateSymbolicLink(link_name, source, 1)
    else:
        os.symlink(source, link_name)
+
+
def copy_to_working_directory(data_file, relative_path):
    """Copy *data_file* to *relative_path* unless both are the same file."""
    source = os.path.abspath(data_file)
    destination = os.path.abspath(relative_path)
    if source != destination:
        shutil.copy(data_file, relative_path)
    return relative_path
+
def __main__():
    run_script()

#ENDTEMPLATE


def run_script():
    """Stage the pepxml input and run TPP's PepXMLViewer.cgi over it."""
    parser = optparse.OptionParser()
    parser.add_option("--input")
    parser.add_option("--export_spreadsheet", action="store_true", dest="export_spreadsheet")
    parser.set_defaults(export_spreadsheet=False)
    (options, args) = parser.parse_args()

    copy_to_working_directory(options.input, "input.pep.xml")
    # "-B exportSpreadsheet 1/0" toggles the tab-separated export.
    spreadsheet_flag = "1" if options.export_spreadsheet else "0"
    command = "PepXMLViewer.cgi -I input.pep.xml -B exportSpreadsheet %s" % spreadsheet_flag
    execute(command)

if __name__ == '__main__': __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/protxml2html_wrapper.py	Thu Jun 20 11:09:24 2013 -0400
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+import optparse
+import os
+import sys
+import tempfile
+import shutil 
+import subprocess
+import re
+from os.path import basename
+import logging
+
# Wrapper relies on features available from Python 2.6 onwards.
assert sys.version_info[:2] >= ( 2, 6 )

log = logging.getLogger(__name__)
working_directory = os.getcwd()
# NOTE(review): only the .name of each NamedTemporaryFile is kept; the file
# object may be garbage-collected (deleting the file), so this is effectively
# a mktemp()-style name reservation and is mildly race-prone - confirm.
tmp_stderr_name = tempfile.NamedTemporaryFile(dir = working_directory, suffix = '.stderr').name
tmp_stdout_name = tempfile.NamedTemporaryFile(dir = working_directory, suffix = '.stdout').name
+
def stop_err( msg ):
    """Write *msg* to stderr and abort with a non-zero exit status."""
    sys.stderr.write( "%s\n" % msg )
    # sys.exit() with no argument exits with status 0, which would make the
    # failed job look successful to Galaxy; exit 1 to signal the error.
    sys.exit( 1 )
+
def read_stderr():
    """Return whatever the wrapped command wrote to stderr, or ''."""
    captured = ''
    if not os.path.exists(tmp_stderr_name):
        return captured
    chunk_size = 1048576
    with open(tmp_stderr_name, 'rb') as handle:
        try:
            while True:
                captured += handle.read(chunk_size)
                # An empty or partial chunk means the file is drained.
                if not captured or len(captured) % chunk_size != 0:
                    break
        except OverflowError:
            pass
    return captured
+    
def execute(command, stdin=None):
    # Run *command* through the shell, redirecting stdout/stderr to the
    # module-level temp files so stderr can be reported on failure.
    # NOTE(review): shell=True means *command* is interpreted by the shell;
    # callers must only pass trusted, wrapper-built command strings.
    with open(tmp_stderr_name, 'wb') as tmp_stderr:
        with open(tmp_stdout_name, 'wb') as tmp_stdout:
            proc = subprocess.Popen(args=command, shell=True, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno(), stdin=stdin, env=os.environ)
            returncode = proc.wait()
            if returncode != 0:
                # Python 2 raise syntax; this module is Python 2 throughout.
                raise Exception, "Program returned with non-zero exit code %d. stderr: %s" % (returncode, read_stderr())
+
def delete_file(path):
    """Best-effort removal of *path*; failures are ignored."""
    if os.path.exists(path):
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup only - but catch just OS failures; a bare
            # except would also swallow KeyboardInterrupt/SystemExit.
            pass
+
def delete_directory(directory):
    """Best-effort recursive removal of *directory*; failures are ignored."""
    if os.path.exists(directory):
        try:
            shutil.rmtree(directory)
        except OSError:
            # Best-effort cleanup only - but catch just OS failures; a bare
            # except would also swallow KeyboardInterrupt/SystemExit.
            pass
+
def symlink(source, link_name):
    """Create *link_name* pointing at *source*, on Windows or POSIX."""
    import platform
    if platform.system() == 'Windows':
        import win32file
        # pywin32's CreateSymbolicLink takes (new_link_name, target, flags);
        # the original call passed them reversed.
        win32file.CreateSymbolicLink(link_name, source, 1)
    else:
        os.symlink(source, link_name)
+
+
def copy_to_working_directory(data_file, relative_path):
    """Copy *data_file* to *relative_path* unless both are the same file."""
    source = os.path.abspath(data_file)
    destination = os.path.abspath(relative_path)
    if source != destination:
        shutil.copy(data_file, relative_path)
    return relative_path
+
def __main__():
    run_script()

#ENDTEMPLATE


def run_script():
    """Stage the protxml input and convert it with TPP's protxml2html.pl."""
    parser = optparse.OptionParser()
    parser.add_option("--input")
    parser.add_option("--export_spreadsheet", action="store_true", dest="export_spreadsheet")
    parser.set_defaults(export_spreadsheet=False)
    (options, args) = parser.parse_args()

    copy_to_working_directory(options.input, "input.prot.xml")
    # FORMAT EXCEL produces the spreadsheet export; FORMAT HTML the default.
    output_format = "EXCEL" if options.export_spreadsheet else "HTML"
    command = "protxml2html.pl -file ./input.prot.xml FORMAT %s" % output_format
    execute(command)

if __name__ == '__main__': __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/protxml_to_xls.xml	Thu Jun 20 11:09:24 2013 -0400
@@ -0,0 +1,22 @@
<tool id="protxml_to_xls" name="Convert ProtXML to Tabular" version="0.1.0">
  <description></description>

  <!-- NOTE(review): the output is collected via from_work_dir="input.prot.xls",
       presumably the file protxml2html.pl writes beside the staged
       input.prot.xml in FORMAT EXCEL mode - confirm against the TPP docs.
       Also note the format "protxml.tsv" is not registered in this repo's
       datatypes_conf.xml (only pepxml.tsv is) - verify. -->
  <command interpreter="python">
    protxml2html_wrapper.py --input=${input} --export_spreadsheet
  </command>

  <inputs>
    <param format="prot.xml" name="input" type="data" label="Prot XML Input"/>
  </inputs>

  <outputs>
    <data format="protxml.tsv" name="output" from_work_dir="input.prot.xls" />
  </outputs>

  <requirements>
    <requirement type="package">transproteomic_pipeline</requirement>
  </requirements>

  <help>
  </help>
</tool>