Repository 'encyclopedia_searchtolib'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/encyclopedia_searchtolib

Changeset 0:62a718b76f62 (2020-09-14)
Next changeset 1:36880dfd9fa7 (2021-04-08)
Commit message:
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/encyclopedia/tools/encyclopedia commit d94002fc79f552c8a64ffca86298396b1568df97"
added:
encyclopedia_searchtolib.xml
macros.xml
static/images/SearchToLib_Workflow.png
b
diff -r 000000000000 -r 62a718b76f62 encyclopedia_searchtolib.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/encyclopedia_searchtolib.xml Mon Sep 14 17:06:51 2020 +0000
[
b'@@ -0,0 +1,152 @@\n+<tool id="encyclopedia_searchtolib" name="SearchToLib" version="@VERSION@.0">\n+    <description>Build a Chromatogram Library from Data-Independent Acquisition (DIA) MS/MS Data</description>\n+    <macros>\n+        <import>macros.xml</import>\n+    </macros>\n+    <expand macro="requirements" />\n+    <command detect_errors="aggressive"><![CDATA[\n+        @SEARCH2LIB_CMDS@\n+    ]]></command>\n+    <inputs>\n+        <expand macro="scan_inputs"/>\n+        <expand macro="lib_input" optional="true" libhelp="Use a Prosit dlib spectral library to make a chromatogram elib using EncyclopeDIA, or else leave blank to make a Chromatogram library from just the fasta using Walnut"/>\n+        <expand macro="fasta_input"/>\n+        <expand macro="target_fasta"/>\n+        <expand macro="options_section"/>\n+        <param argument="-a" type="boolean" truevalue="true" falsevalue="false" checked="false" label="align between files" help="retention-time alignment of peptides is generally not needed when when building a library from narrow-window spectrums"/>\n+        <param name="select_outputs" type="select" label="Select outputs" multiple="true">\n+            <option value="log" selected="true">log</option>\n+            <option value="elib" selected="true">elib</option>\n+            <option value="features" selected="false">concatenated_features.txt</option>\n+            <option value="results" selected="false">concatenated_results.txt</option>\n+            <option value="decoy" selected="false">concatenated_decoy.txt</option>\n+            <option value="rt_plots" selected="false">Retention Time Plots (requires library)</option>\n+            <option value="rt_tables" selected="false">Retention Time Tables (requires library)</option>\n+            <option value="peptides" selected="false">peptides.txt (requires align between files)</option>\n+            <option value="proteins" selected="false">proteins.txt (requires align between 
files)</option>\n+        </param>\n+    </inputs>\n+    <outputs>\n+        <data name="log" format="txt" label="${tool.name} ${on_string} log" from_work_dir="search2lib.log">\n+            <filter>\'log\' in select_outputs</filter>\n+        </data>\n+        <data name="elib" format="elib" label="${tool.name} ${on_string} elib" from_work_dir="chromatogram_library.elib">\n+            <filter>\'elib\' in select_outputs</filter>\n+        </data>\n+        <data name="features" format="tabular" label="${tool.name} ${on_string} concatenated_features.txt" from_work_dir="inputs/chromatogram_library_concatenated_features.txt">\n+            <filter>\'features\' in select_outputs</filter>\n+            <actions>\n+                <action name="column_names" type="metadata" default="id,TD,ScanNr,topN,rank,peakZScore,peakCalibratedScore,deltaSn,avgIdotp,midIdotp,peakScore,peakWeightedScore,NCI,CIMassErrMean,CIMassErrVar,precursorMassErrMean,precursorMassErrVar,peakSimilarity,sampledTimes,midTime,spectraNorm,pepLength,charge2,charge3,precursorMz,sequence,protein" />\n+            </actions>\n+        </data>\n+        <data name="results" format="tabular" label="${tool.name} ${on_string} concatenated_results.txt" from_work_dir="inputs/chromatogram_library_concatenated_results.txt">\n+            <filter>\'results\' in select_outputs</filter>\n+            <actions>\n+                <action name="column_names" type="metadata" default="PSMId,score,q-value,posterior_error_prob,peptide,proteinIds" />\n+            </actions>\n+        </data>\n+        <data name="decoy" format="tabular" label="${tool.name} ${on_string} concatenated_decoy.txt" from_work_dir="inputs/chromatogram_library_concatenated_decoy.txt">\n+            <filter>\'decoy\' in select_outputs</filter>\n+            <actions>\n+                <action name="column_names" type="metadata" default="PSMId,score,q-value,posterior_error_prob,peptide,proteinIds" />\n+            </actions>\n+        </data>\n+       
 <collection name="rt_plots" type="list" label="${tool.name} - ${on_string}: Ret'..b'ram_library.elib.proteins.txt">\n+            <filter>a and \'proteins\' in select_outputs</filter>\n+            <actions>\n+                <action name="column_names" type="metadata" default="Protein,NumPeptides,PeptideSequences" />\n+            </actions>\n+        </data>\n+    </outputs>\n+    <tests>\n+        <test>\n+            <param name="scan_inputs" ftype="mzml" value="BCS_hela_narrow_3_1.mzML,BCS_hela_narrow_3_2.mzML"/>\n+            <param name="library" ftype="dlib" value="small_pan_human_library.dlib"/>\n+            <param name="fasta" ftype="fasta" value="uniprot_human.fasta"/>\n+            <param name="select_outputs" value="log,elib,features,results"/>\n+            <output name="results" ftype="tabular">\n+                <assert_contents>\n+                    <has_text text="QDSAAVGFDYK"/>\n+                </assert_contents>\n+            </output>\n+        </test>\n+        <test>\n+            <param name="scan_inputs" ftype="mzml" value="BCS_hela_narrow_3_1.mzML,BCS_hela_narrow_3_2.mzML"/>\n+            <param name="fasta" ftype="fasta" value="uniprot_human.fasta"/>\n+            <param name="select_outputs" value="log,elib,features,results"/>\n+            <output name="results" ftype="tabular">\n+                <assert_contents>\n+                    <has_text text="QDSAAVGFDYK"/>\n+                </assert_contents>\n+            </output>\n+        </test>\n+    </tests>\n+    <help><![CDATA[\n+**SearchToLIB**\n+\n+@ENCYCLOPEDIA_WIKI@\n+\n+SearchToLIB uses the EncyclopeDIA algorithm, or the Walnut (Pecan) algorithm, to search Data-Independent Acquisition (DIA) MS/MS spectrum files and creates a DIA elib chromatogram library for EncyclopeDIA DIA quantitation search. 
\n+\n+\n+**Inputs**\n+\n+  - Spectrum files in mzML format\n+  - A protein database in fasta format\n+  - An optional DDA Spectral library (.dlib) that can be generated by Prosit\n+      - *SearchToLIB uses EncyclopeDIA if the Prosit dlib is provided, otherwise it uses Walnut with just a fasta.*\n+\n+@MSCONVERT_HELP@\n+\n+**Outputs**\n+\n+  - A log file\n+  - A Chromatogram Library (.elib)\n+  - The identified features in tabular format\n+    Feature values of scans that are used by percolator to determine matches.\n+  - The identified Peptide Spectral Match results in tabular format\n+    Columns: PSMId, score, q-value, posterior_error_prob, peptide, proteinIds\n+  - The identified peptides in tabular format\n+    Per peptide: the normalized intensity for each scan file.\n+    Columns: Peptide, Protein, numFragments, intensity_in_file1, intensity_in_file2, ...\n+  - The identified proteins in tabular format\n+    Per protein: the normalized intensity for each scan file.\n+    Columns: Protein, NumPeptides, PeptideSequences, intensity_in_file1, intensity_in_file2, ...\n+\n+**Typical DIA Workflow**\n+\n+Two sets of Mass Spec MS/MS DIA data are collected for the experiment.  In addition to collecting wide-window DIA experiments on each quantitative replicate, a pool containing peptides from every condition is measured using several staggered narrow-window DIA experiments.\n+\n+  1. SearchToLib is first run with the pooled narrow-window mzML files to create a combined DIA elib chromatogram library.   \n+     If a Spectral library argument is provided, for example from **Prosit**, SearchToLIB uses EncyclopeDIA to search each input spectrum mzML file.  
\n+     Otherwise, SearchToLIB uses Walnut, a FASTA database search engine for DIA data that uses PECAN-style scoring.\n+\n+\n+       * Prosit_ generates a predicted spectrum library of fragmentation patterns and retention times for every +2H and +3H tryptic peptide in a FASTA database, with up to one missed cleavage.\n+\n+\n+  2. EncyclopeDIA Quantify is then run on the wide-window quantitative replicate mzML files using that chromatogram library to produce quantification results.\n+\n+.. image:: SearchToLib_Workflow.png\n+  :width: 810\n+  :height: 580\n+\n+.. _Prosit: https://www.proteomicsdb.org/prosit\n+\n+    ]]></help>\n+    <expand macro="citations" />\n+</tool>\n'
b
diff -r 000000000000 -r 62a718b76f62 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Sep 14 17:06:51 2020 +0000
[
b'@@ -0,0 +1,551 @@\n+<macros>\n+    <token name="@VERSION@">0.9.5</token>\n+    <xml name="requirements">\n+        <requirements>\n+            <requirement type="package" version="@VERSION@">encyclopedia</requirement>\n+            <yield/>\n+        </requirements>\n+    </xml>\n+\n+    <token name="@ENCYCLOPEDIA_WIKI@">\n+EncyclopeDIA_ is library search engine comprised of several algorithms for DIA data analysis and can search for peptides using either DDA-based spectrum libraries or DIA-based chromatogram libraries. See: https://bitbucket.org/searleb/encyclopedia/wiki/Home\n+\n+.. _EncyclopeDIA: https://bitbucket.org/searleb/encyclopedia/wiki/Home\n+    </token>\n+    <xml name="citations">\n+        <citations>\n+            <citation type="doi">10.1038/s41467-018-07454-w</citation>\n+            <citation type="doi">10.1038/s41467-020-15346-1</citation>\n+            <citation type="doi">10.1074/mcp.P119.001913</citation>\n+            <yield/>\n+        </citations>\n+    </xml>\n+\n+    <token name="@CMD_IMPORTS@">\n+#import re\n+#def identifier_or_name($input1)\n+    #if hasattr($input1, \'element_identifier\')\n+        #return $input1.element_identifier\n+    #else\n+        #return $input1.name\n+    #end if\n+#end def\n+#def clean($name1)\n+    #set $name_clean = $re.sub(\'[^\\w\\-_]\', \'_\', $re.sub(\'(?i)[.](fa|fasta|imzml|mzml)$\',\'\', $re.sub(\'.*/\',\'\', $name1.rstrip(\'.gz\'))))\n+    #return $name_clean\n+#end def\n+#def ln_name($ds) \n+    #set $ext = \'\'\n+    #if $ds.is_of_type(\'mzml\') or $ds.is_of_type(\'imzml\')\n+        #set $ext = ".mzML"\n+    #else if $ds.is_of_type(\'elib\')\n+        #set $ext = ".elib"\n+    #else if $ds.is_of_type(\'dlib\')\n+        #set $ext = ".dlib"\n+    #else if $ds.is_of_type(\'blib\')\n+        #set $ext = ".blib"\n+    #else if $ds.is_of_type(\'fasta\')\n+        #set $ext = ".fasta"\n+    #else if $ds.is_of_type(\'fasta.gz\')\n+        #set $ext = ".fasta.gz"\n+    #end if\n+    #set $name = 
"%s%s" % ($clean($identifier_or_name($ds)),$ext) \n+    #return $name\n+#end def\n+#set $i_name = None\n+#set $f_name = None\n+#set $l_name = None\n+#set $t_name = None\n+    </token>\n+\n+    <xml name="scan_input">\n+        <param name="scan_input" argument="-i" type="data" format="imzml,mzml" label="Spectrum file in mzML format"> \n+            <help>@MSCONVERT_RAW@</help>\n+        </param>\n+    </xml>\n+    <token name="@LINK_SCAN_INPUT@"><![CDATA[\n+    #set $i_name = $ln_name($scan_input)\n+    ln -s \'$scan_input\' \'$i_name\' &&\n+    ]]></token>\n+    <token name="@SCAN_INPUT@">\n+    -i \'$i_name\'\n+    </token>\n+\n+    <xml name="scan_inputs">\n+        <param name="scan_inputs" argument="-i" type="data" format="imzml,mzml" multiple="true" label="Spectrum files in  mzML format">\n+            <help>@MSCONVERT_RAW@</help>\n+        </param>\n+    </xml>\n+    <token name="@LINK_SCAN_INPUTS@"><![CDATA[\n+    #set $inputs_dir = \'inputs\'\n+    mkdir -p $inputs_dir &&\n+    #for $sf in $scan_inputs\n+      #set $i_name = $ln_name($sf)\n+      ln -s \'$sf\' \'${inputs_dir}/${i_name}\' &&\n+    #end for\n+    ]]></token>\n+    <token name="@SCAN_INPUTS@">\n+    -i \'$inputs_dir\'\n+    </token>\n+\n+    <xml name="fasta_input">\n+        <param name="fasta" argument="-f" type="data" format="fasta" label="Background proteome protein fasta database"> \n+            <help>provides the necessary peptide-to-protein links not specified in the spectrum library</help>\n+        </param>\n+    </xml>\n+    <token name="@LINK_FASTA_INPUT@"><![CDATA[\n+    #set $f_name = $ln_name($fasta)\n+    ln -s \'$fasta\' \'$f_name\' &&\n+    ]]></token>\n+    <token name="@FASTA_INPUT@">\n+    -f \'$f_name\'\n+    </token>\n+\n+    <xml name="target_fasta">\n+        <param name="target_fasta" argument="-t" type="data" format="fasta" label="Target fasta database" optional="true"> \n+            <help>Optional - Only analyze this subset of the background fasta 
proteome</help>\n+        </param>\n+        <param argument="-tp" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Target FASTA file contains peptides">\n+     '..b'  <param argument="-numberOfQuantitativePeaks" type="integer" value="3" min="1" max="10" label="numberOfQuantitativePeaks" optional="true"/>\n+<!--\n+                <param argument="-addDecoysToBackground" type="boolean" truevalue="true" falsevalue="false" checked="false" label="addDecoysToBackground"/>\n+                <param argument="-dontRunDecoys" type="boolean" truevalue="true" falsevalue="false" checked="false" label="dontRunDecoys"/>\n+-->\n+            </when>\n+        </conditional>\n+    </xml>\n+    <token name="@SEARCH_OPTIONS@">\n+    #if $options.search.set_search == \'yes\'\n+        -minCharge $options.search.minCharge\n+        -maxCharge $options.search.maxCharge\n+        -minLength $options.search.minLength\n+        -maxLength $options.search.maxLength\n+        -minEluteTime $options.search.minEluteTime\n+        -maxMissedCleavage $options.search.maxMissedCleavage\n+        -minQuantitativeIonNumber $options.search.minQuantitativeIonNumber\n+        -minNumOfQuantitativePeaks $options.search.minNumOfQuantitativePeaks\n+        -numberOfQuantitativePeaks $options.search.numberOfQuantitativePeaks\n+        ## -addDecoysToBackground $options.search.addDecoysToBackground\n+        ## -dontRunDecoys $options.search.dontRunDecoys\n+    #end if\n+    </token>\n+\n+    <xml name="options_section">\n+        <section name="options" title="Parameter Settings" expanded="false">\n+            <expand macro="common_options"/>\n+            <expand macro="mass_library_tolerance"/>\n+            <expand macro="percolator_options"/>\n+            <expand macro="peak_options"/>\n+            <expand macro="window_options"/>\n+            <expand macro="modification_options"/>\n+            <expand macro="search_options"/>\n+        </section>\n+    </xml>\n+\n+    
<xml name="libexport">\n+        <param argument="-a" type="boolean" truevalue="true" falsevalue="false" checked="false" label="align between files"/>\n+    </xml>\n+\n+    <token name="@SEARCH2LIB_CMDS@"><![CDATA[\n+        @CMD_IMPORTS@\n+        @LINK_SCAN_INPUTS@\n+        @LINK_FASTA_INPUT@\n+        @LINK_TARGET_FASTA@\n+        @LINK_LIB_INPUT@\n+        for SCAN_FILE in `ls -1 inputs/*`; do\n+            echo "\\$SCAN_FILE" &&\n+            EncyclopeDIA -Djava.awt.headless=true -Duser.language=en-US -Duser.region=US\n+                -Xmx\\$[ \\${GALAXY_MEMORY_MB:-20480} / 1024 ]g\n+                -numberOfThreadsUsed "\\${GALAXY_SLOTS:-4}"\n+            #if not $library\n+                -walnut\n+            #end if\n+            -i \\$SCAN_FILE\n+            @FASTA_INPUT@\n+            @TARGET_FASTA@\n+            @LIB_INPUT@\n+            @COMMON_OPTIONS@\n+            @MASS_LIBRARY_TOLERANCE@\n+            @PERCOLATOR_OPTIONS@\n+            @PEAK_OPTIONS@\n+            @WINDOW_OPTIONS@\n+            @MODIFICATION_OPTIONS@\n+            @SEARCH_OPTIONS@ | tee -a search2lib.log\n+        ; done &&\n+        for TXT in `find inputs/*.mzML.[efw]*[ast].txt`; do TRGT=`echo \\$TXT | sed \'s/mzML/dia/\'`; ln -s \\$TXT \\$TRGT; done &&\n+        EncyclopeDIA -Djava.awt.headless=true -Duser.language=en-US -Duser.region=US -Xmx\\$[ \\${GALAXY_MEMORY_MB:-20480} / 1024 ]g -libexport\n+        #if not $library\n+            -pecan\n+        #end if\n+        @SCAN_INPUTS@\n+        @FASTA_INPUT@\n+        @TARGET_FASTA@\n+        @LIB_INPUT@\n+        -a $a\n+        -o chromatogram_library.elib\n+        && ls -l ./*.* inputs/*\n+        | tee -a search2lib.log\n+]]>\n+    </token>\n+    <token name="@MSCONVERT_CMD@"><![CDATA[\n+      msconvert  --zlib --64 --mzML --simAsSpectra --filter "peakPicking true 1-" --filter "demultiplex optimization=overlap_only" *.raw\n+]]>\n+    </token>\n+    <token name="@MSCONVERT_RAW@"><![CDATA[\n+mzML conversion from RAW requires 
special options: @MSCONVERT_CMD@\n+]]>\n+    </token>\n+    <token name="@MSCONVERT_HELP@"><![CDATA[\n+\n+    The MSConvert command can be used to convert and deconvolute DIA raw files to mzML format. You need to use these options:\n+\n+    ::\n+\n+      @MSCONVERT_CMD@\n+\n+]]>\n+    </token>\n+</macros>\n'
b
diff -r 000000000000 -r 62a718b76f62 static/images/SearchToLib_Workflow.png
b
Binary file static/images/SearchToLib_Workflow.png has changed