Repository 'proteomics_datatypes'
hg clone https://toolshed.g2.bx.psu.edu/repos/iracooke/proteomics_datatypes

Changeset 0:c10a62c886b8 (2013-01-06)
Next changeset 1:84c6c70a4e5a (2013-03-04)
Commit message:
Uploaded
added:
datatypes_conf.xml
display_applications/proteomics/PepXml.xml
display_applications/proteomics/ProtXml.xml
display_applications/proteomics/mzML.xml
pepxml_to_table.xml
proteomics.py
tool-data/protk_display_site.txt.sample
b
diff -r 000000000000 -r c10a62c886b8 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Sun Jan 06 19:07:22 2013 -0500
b
@@ -0,0 +1,48 @@
+<?xml version="1.0"?>
+<datatypes>
+  <datatype_files>
+    <datatype_file name="proteomics.py"/>
+  </datatype_files>
+  <registration display_path="display_applications">
+    <datatype extension="pepxml" type="galaxy.datatypes.proteomics:PepXml" mimetype="application/xml" display_in_upload="true">
+      <display file="proteomics/PepXml.xml" />
+    </datatype>
+    <datatype extension="raw_pepxml" type="galaxy.datatypes.proteomics:PepXml" subclass="true">
+      <display file="proteomics/PepXml.xml" />
+    </datatype>
+    <datatype extension="peptideprophet_pepxml" type="galaxy.datatypes.proteomics:PepXml" subclass="true">
+      <display file="proteomics/PepXml.xml" />
+    </datatype>
+    <datatype extension="interprophet_pepxml" type="galaxy.datatypes.proteomics:PepXml" subclass="true">
+      <display file="proteomics/PepXml.xml" />
+    </datatype>
+    <datatype extension="protxml" type="galaxy.datatypes.proteomics:ProtXML" display_in_upload="true" >
+      <display file="proteomics/ProtXml.xml"/>
+    </datatype>
+    <datatype extension="mascotdat" type="galaxy.datatypes.proteomics:MascotDat" display_in_upload="false" />
+    <datatype extension="mzml" type="galaxy.datatypes.proteomics:MzML" mimetype="application/xml" display_in_upload="true">
+      <display file="proteomics/mzML.xml"/>
+    </datatype>
+    <datatype extension="mgf" type="galaxy.datatypes.proteomics:Mgf" display_in_upload="true" />
+    <datatype extension="xls" type="galaxy.datatypes.proteomics:Xls" display_in_upload="true" />
+    <datatype extension="mzxml" type="galaxy.datatypes.proteomics:MzXML" mimetype="application/xml" display_in_upload="true" />
+    <datatype extension="mzq" type="galaxy.datatypes.proteomics:MzQuantML" mimetype="application/xml" display_in_upload="true" />
+    <datatype extension="mzid" type="galaxy.datatypes.proteomics:MzIdentML" mimetype="application/xml" display_in_upload="true" />
+    <datatype extension="traML" type="galaxy.datatypes.proteomics:TraML" mimetype="application/xml" display_in_upload="true" />
+    <datatype extension="raw" type="galaxy.datatypes.proteomics:RAW" display_in_upload="true" />
+    <datatype extension="msp" type="galaxy.datatypes.proteomics:Msp" display_in_upload="true" />
+    <datatype extension="ms2" type="galaxy.datatypes.proteomics:Ms2" display_in_upload="true" />
+    <datatype extension="hlf" type="galaxy.datatypes.proteomics:XHunterAslFormat" display_in_upload="true" />
+  </registration>
+  <sniffers>
+    <sniffer type="galaxy.datatypes.proteomics:MzML"/>        
+    <sniffer type="galaxy.datatypes.proteomics:PepXml"/>
+    <sniffer type="galaxy.datatypes.proteomics:Mgf"/>
+    <sniffer type="galaxy.datatypes.proteomics:ProtXML"/>
+    <sniffer type="galaxy.datatypes.proteomics:MzXML"/>
+    <sniffer type="galaxy.datatypes.proteomics:TraML"/>
+    <sniffer type="galaxy.datatypes.proteomics:MzIdentML"/>
+    <sniffer type="galaxy.datatypes.proteomics:MzQuantML"/>
+    <sniffer type="galaxy.datatypes.proteomics:Xls"/>
+  </sniffers>
+</datatypes>
b
diff -r 000000000000 -r c10a62c886b8 display_applications/proteomics/PepXml.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/display_applications/proteomics/PepXml.xml Sun Jan 06 19:07:22 2013 -0500
[
@@ -0,0 +1,18 @@
+<display id="proteomics_pepxml" version="1.0.0" name="view pepXML in">
+ <dynamic_links from_file="tool-data/protk_display_site.txt" skip_startswith="#" id="0" name="0">
+        <!-- Define parameters by column from file -->
+        <dynamic_param name="site_id" value="0"/>
+        <dynamic_param name="site_url" value="1"/>
+        <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name -->
+        <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&amp;type=pepxml</url>
+        <param type="data" name="pep_file" viewable="False" format="pepXML"/>
+        <param type="data" dataset="pep_file" name="pepxml_file" format="pepXML" viewable="False" />
+        <param type="template" name="encoded_filename" strip="True" >
+            #import binascii
+            ${binascii.hexlify( $pepxml_file.file_name )}
+        </param>
+        <param type="template" name="galaxy_url" strip="True" >
+                ${BASE_URL.split(":")[1][2:]}
+        </param>
+    </dynamic_links>
+</display>
b
diff -r 000000000000 -r c10a62c886b8 display_applications/proteomics/ProtXml.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/display_applications/proteomics/ProtXml.xml Sun Jan 06 19:07:22 2013 -0500
[
@@ -0,0 +1,18 @@
+<display id="proteomics_protxml" version="1.0.0" name="view protXML in">
+ <dynamic_links from_file="tool-data/protk_display_site.txt" skip_startswith="#" id="0" name="0">
+        <!-- Define parameters by column from file -->
+        <dynamic_param name="site_id" value="0"/>
+        <dynamic_param name="site_url" value="1"/>
+        <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name -->
+        <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&amp;type=protxml</url>
+        <param type="data" name="prot_file" viewable="False" format="protXML"/>
+        <param type="data" dataset="prot_file" name="protxml_file" format="protXML" viewable="False" />
+        <param type="template" name="encoded_filename" strip="True" >
+            #import binascii
+            ${binascii.hexlify( $protxml_file.file_name )}
+        </param>
+        <param type="template" name="galaxy_url" strip="True" >
+                ${BASE_URL.split(":")[1][2:]}
+        </param>
+    </dynamic_links>
+</display>
\ No newline at end of file
b
diff -r 000000000000 -r c10a62c886b8 display_applications/proteomics/mzML.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/display_applications/proteomics/mzML.xml Sun Jan 06 19:07:22 2013 -0500
[
@@ -0,0 +1,18 @@
+<display id="proteomics_mzml" version="1.0.2" name="view mzML data">
+ <dynamic_links from_file="tool-data/protk_display_site.txt" skip_startswith="#" id="0" name="0">
+        <!-- Define parameters by column from file -->
+        <dynamic_param name="site_id" value="0"/>
+        <dynamic_param name="site_url" value="1"/>
+        <!-- We define url and params as normal, but values defined in dynamic_param are available by specified name -->
+        <url target_frame="galaxy_main">${site_url}/init_local?file=${encoded_filename.qp}&amp;type=mzml</url>
+        <param type="data" name="raw_file" viewable="False" format="mzML"/>
+        <param type="data" dataset="raw_file" name="mzml_file" format="mzML" viewable="False" />
+        <param type="template" name="encoded_filename" strip="True" >
+            #import binascii
+            ${binascii.hexlify( $mzml_file.file_name )}
+        </param>
+        <param type="template" name="galaxy_url" strip="True" >
+                ${BASE_URL.split(":")[1][2:]}
+        </param>
+    </dynamic_links>
+</display>
\ No newline at end of file
b
diff -r 000000000000 -r c10a62c886b8 pepxml_to_table.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pepxml_to_table.xml Sun Jan 06 19:07:22 2013 -0500
b
@@ -0,0 +1,23 @@
+<tool id="pepxml_to_table_1" name="PepXML to Table" version="1.0.0">
+ <requirements><requirement type="package">protkgem</requirement></requirements>
+  <description>Converts a pepXML file to a tab delimited text file</description>
+
+
+<!-- Note: the input file is assumed to be the first argument to pepxml_to_table.rb -->
+<command>pepxml_to_table.rb $input_file -o $output</command>
+
+
+<inputs>
+
+ <param name="input_file" type="data" format="pepxml,peptideprophet_pepxml,interprophet_pepxml"  multiple="false" label="Input File" help="A pepXML file"/>
+
+</inputs>
+<outputs>
+ <data format="csv" name="output" metadata_source="input_file" label="${input_file.display_name}.csv" />
+</outputs>
+
+<help>
+ Convert a pepXML file to tab-delimited text
+</help>
+
+</tool>
b
diff -r 000000000000 -r c10a62c886b8 proteomics.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/proteomics.py Sun Jan 06 19:07:22 2013 -0500
[
b'@@ -0,0 +1,251 @@\n+"""\n+Proteomics format classes\n+"""\n+import logging\n+import re\n+from galaxy.datatypes.data import *\n+from galaxy.datatypes.xml import *\n+from galaxy.datatypes.sniff import *\n+from galaxy.datatypes.binary import *\n+\n+log = logging.getLogger(__name__)\n+\n+\n+class Xls( Binary ):\n+    """Class describing a binary excel spreadsheet file"""\n+    file_ext = "xls"\n+\n+    def set_peek( self, dataset, is_multi_byte=False ):\n+        if not dataset.dataset.purged:\n+            dataset.peek  = "Excel Spreadsheet file"\n+            dataset.blurb = data.nice_size( dataset.get_size() )\n+        else:\n+            dataset.peek = \'file does not exist\'\n+            dataset.blurb = \'file purged from disk\'\n+    def display_peek( self, dataset ):\n+        try:\n+            return dataset.peek\n+        except:\n+            return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) )\n+\n+class ProteomicsXml(GenericXml):\n+    """ An enhanced XML datatype used to reuse code across several\n+    proteomic/mass-spec datatypes. """\n+\n+    def sniff(self, filename):\n+        """ Determines whether the file is the correct XML type. 
"""\n+        with open(filename, \'r\') as contents:            \n+            while True:\n+                line = contents.readline()\n+                if line == None or not line.startswith(\'<?\'):\n+                    break\n+            pattern = \'^<(\\w*:)?%s\' % self.root # pattern match <root or <ns:root for any ns string\n+            return line != None and re.match(pattern, line) != None\n+\n+    def set_peek( self, dataset, is_multi_byte=False ):\n+        """Set the peek and blurb text"""\n+        if not dataset.dataset.purged:\n+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )\n+            dataset.blurb = self.blurb\n+        else:\n+            dataset.peek = \'file does not exist\'\n+            dataset.blurb = \'file purged from disk\'\n+\n+class PepXml(ProteomicsXml):\n+    """pepXML data"""\n+    file_ext = "pepxml"\n+    blurb = \'pepXML data\'\n+    root = "msms_pipeline_analysis"\n+    \n+\n+class MzML(ProteomicsXml):\n+    """mzML data"""\n+    file_ext = "mzml"\n+    blurb = \'mzML Mass Spectrometry data\'\n+    root = "(mzML|indexedmzML)"\n+\n+\n+class ProtXML(ProteomicsXml):\n+    """protXML data"""\n+    file_ext = "protxml"\n+    blurb = \'prot XML Search Results\'\n+    root = "protein_summary"\n+\n+\n+class MzXML(ProteomicsXml):\n+    """mzXML data"""\n+    file_ext = "mzXML"\n+    blurb = "mzXML Mass Spectrometry data"\n+    root = "mzXML"\n+\n+## PSI datatypes\n+class MzIdentML(ProteomicsXml):\n+    file_ext = "mzid"\n+    blurb = "XML identified peptides and proteins."\n+    root = "MzIdentML"\n+    \n+\n+class TraML(ProteomicsXml):\n+    file_ext = "traML"\n+    blurb = "TraML transition list"\n+    root = "TraML"\n+\n+\n+class MzQuantML(ProteomicsXml):\n+    file_ext = "mzq"\n+    blurb = "XML quantification data"\n+    root = "MzQuantML"\n+\n+ \n+class Mgf( Text ):\n+    """Mascot Generic Format data"""\n+    file_ext = "mgf"\n+\n+    def set_peek( self, dataset, 
is_multi_byte=False ):\n+        """Set the peek and blurb text"""\n+        if not dataset.dataset.purged:\n+            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )\n+            dataset.blurb = \'mgf Mascot Generic Format\'\n+        else:\n+            dataset.peek = \'file does not exist\'\n+            dataset.blurb = \'file purged from disk\'\n+\n+\n+    def sniff( self, filename ):\n+        mgf_begin_ions = "BEGIN IONS"\n+        max_lines=100\n+\n+        for i, line in enumerate( file( filename ) ):\n+            line = line.rstrip( \'\\n\\r\' )\n+            if line==mgf_begin_ions:\n+                return True\n+            if i>max_lines:\n+                return False\n+            \n+                \n+class MascotDat( Text ):\n+    """Mascot search results """\n+    file_ext = "mascotdat"\n+\n+    def set_peek( self, dataset, is_multi_byte=False ):\n+        """Set the peek and blurb text"""\n+        if not dataset.da'..b'    mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"\n+        max_lines=10\n+\n+        for i, line in enumerate( file( filename ) ):\n+            line = line.rstrip( \'\\n\\r\' )\n+            if line==mime_version:\n+                return True\n+            if i>max_lines:\n+                return False\n+\n+\n+class RAW( Binary ):\n+    """Class describing a Thermo Finnigan binary RAW file"""\n+    file_ext = "raw"\n+    def sniff( self, filename ):\n+        # Thermo Finnigan RAW format is proprietary and hence not well documented.\n+        # Files start with 2 bytes that seem to differ followed by F\\0i\\0n\\0n\\0i\\0g\\0a\\0n\n+        # This combination represents 17 bytes, but to play safe we read 20 bytes from \n+        # the start of the file.\n+        try:\n+            header = open( filename ).read(20)\n+            hexheader = binascii.b2a_hex( header )\n+            finnigan  = binascii.hexlify( \'F\\0i\\0n\\0n\\0i\\0g\\0a\\0n\' )\n+            if 
hexheader.find(finnigan) != -1:\n+                return True\n+            return False\n+        except:\n+            return False\n+    def set_peek( self, dataset, is_multi_byte=False ):\n+        if not dataset.dataset.purged:\n+            dataset.peek  = "Thermo Finnigan RAW file"\n+            dataset.blurb = data.nice_size( dataset.get_size() )\n+        else:\n+            dataset.peek = \'file does not exist\'\n+            dataset.blurb = \'file purged from disk\'\n+    def display_peek( self, dataset ):\n+        try:\n+            return dataset.peek\n+        except:\n+            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )\n+\n+\n+if hasattr(Binary, \'register_sniffable_binary_format\'):\n+    Binary.register_sniffable_binary_format(\'RAW\', \'RAW\', RAW)\n+\n+\n+class Msp(Text):\n+    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """\n+    file_ext = "msp"\n+    \n+    @staticmethod\n+    def next_line_starts_with(contents, prefix):\n+        next_line = contents.readline()\n+        return next_line != None and next_line.startswith(prefix)\n+\n+    def sniff(self, filename):\n+        """ Determines whether the file is a NIST MSP output file. \n+\n+        >>> fname = get_test_fname(\'test.msp\')  \n+        >>> Msp().sniff(fname)\n+        True\n+        >>> fname = get_test_fname(\'test.mzXML\')\n+        >>> Msp().sniff(fname)\n+        False\n+        """\n+        with open(filename, \'r\') as contents:\n+            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "MW:")\n+\n+class Ms2(Text):\n+    file_ext = "ms2"\n+    \n+    def sniff(self, filename):\n+        """ Determines whether the file is a valid ms2 file. 
\n+\n+        >>> fname = get_test_fname(\'test.msp\')  \n+        >>> Ms2().sniff(fname)\n+        False\n+        >>> fname = get_test_fname(\'test.ms2\')\n+        >>> Ms2().sniff(fname)\n+        True\n+        """\n+\n+        with open(filename, \'r\') as contents:\n+            header_lines = []\n+            while True:\n+                line = contents.readline()\n+                if line == None or len(line) == 0:\n+                    pass\n+                elif line.startswith(\'H\\t\'):\n+                    header_lines.append(line)\n+                else:\n+                    break\n+        for header_field in [\'CreationDate\', \'Extractor\', \'ExtractorVersion\', \'ExtractorOptions\']:\n+            found_header = False\n+            for header_line in header_lines:\n+                if header_line.startswith(\'H\\t%s\' % (header_field)):\n+                    found_header = True\n+                    break\n+            if not found_header:\n+                return False\n+\n+        return True\n+\n+# unsniffable binary format, should do something about this\n+class XHunterAslFormat(Binary):\n+    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """\n+    file_ext = "hlf"\n+\n+\n+if hasattr(Binary, \'register_unsniffable_binary_ext\'):\n+    Binary.register_unsniffable_binary_ext(\'hlf\')\n'
b
diff -r 000000000000 -r c10a62c886b8 tool-data/protk_display_site.txt.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/protk_display_site.txt.sample Sun Jan 06 19:07:22 2013 -0500
b
@@ -0,0 +1,3 @@
+#The proteomics visualization application should be hosted on the same server as Galaxy
+#Entries in this file are of the format "site_id" site_url
+Proteomics Visualize http://127.0.0.1:8500