Repository 'fastg2protlib'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/fastg2protlib

Changeset 0:6b226c5907a1 (2020-08-07)
Commit message:
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fastg2protlib commit e777bdb1d28b1ffee75cb1a8ad782a50c10a5358"
added:
app_validate.py
application.py
fastg2protlib-peptides.xml
fastg2protlib-validate.xml
macros.xml
test-data/mgf_tst.tab
test-data/tst_valid.db
test-data/two.fastg
b
diff -r 000000000000 -r 6b226c5907a1 app_validate.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/app_validate.py Fri Aug 07 06:17:31 2020 -0400
b
@@ -0,0 +1,44 @@
+"""Command-line entry point for validating a candidate protein library.
+
+Parses the MSGF+ tabular results path and the validation options, then passes
+them to ``fg.verified_proteins``.
+"""
+import argparse
+
+import fastg2protlib.fastg2protlib as fg
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Validate a candidate protein library with MSGF+ results"
+    )
+    parser.add_argument("msgf", help="Path MSGF+ tabular results.")
+    parser.add_argument(
+        "-d",
+        "--dbname",
+        default="results.db",
+        help="Name for the results database. Defaults to results.db",
+    )
+    parser.add_argument(
+        "-f",
+        "--fdr",
+        default=0.10,
+        type=float,
+        help="FDR cutoff for accepting PSM validation.",
+    )
+    parser.add_argument(
+        "-x",
+        "--decoy_header",
+        default="XXX_",
+        help="String used for marking decoy proteins.",
+    )
+
+    args = parser.parse_args()
+    # Forward the parsed options instead of repeating their defaults, so that
+    # -f/--fdr and -x/--decoy_header given on the command line take effect.
+    fg.verified_proteins(
+        args.msgf,
+        fdr_level=args.fdr,
+        decoy_header=args.decoy_header,
+        db_name=args.dbname,
+    )
b
diff -r 000000000000 -r 6b226c5907a1 application.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/application.py Fri Aug 07 06:17:31 2020 -0400
[
@@ -0,0 +1,120 @@
+"""Command-line entry point for generating peptides from a FASTG file.
+
+Parses the FASTG path and the digestion options, then passes them to
+``fg.peptides_for_fastg``.
+"""
+import argparse
+
+import fastg2protlib.fastg2protlib as fg
+
+# NOTE(review): nothing in this script reads this list. Presumably it records
+# the cleavage rule names accepted for --cleavage -- confirm against the
+# library before using it for validation.
+expasy_rules = [
+    "arg-c",
+    "asp-n",
+    "bnps-skatole",
+    "caspase 1",
+    "caspase 2",
+    "caspase 3",
+    "caspase 4",
+    "caspase 5",
+    "caspase 6",
+    "caspase 7",
+    "caspase 8",
+    "caspase 9",
+    "caspase 10",
+    "chymotrypsin high specificity",
+    "chymotrypsin low specificity",
+    "clostripain",
+    "cnbr",
+    "enterokinase",
+    "factor xa",
+    "formic acid",
+    "glutamyl endopeptidase",
+    "granzyme b",
+    "hydroxylamine",
+    "iodosobenzoic acid",
+    "lysc",
+    "ntcb",
+    "pepsin ph1.3",
+    "pepsin ph2.0",
+    "proline endopeptidase",
+    "proteinase k",
+    "staphylococcal peptidase i",
+    "thermolysin",
+    "thrombin",
+    "trypsin",
+    "trypsin_exception",
+]
+
+
+def _str2bool(value):
+    """Convert a command-line string such as "true" or "false" to a bool.
+
+    ``type=bool`` is unsuitable for argparse: ``bool("false")`` is ``True``
+    because every non-empty string is truthy.
+
+    Raises:
+        argparse.ArgumentTypeError: if ``value`` is not a recognised spelling.
+    """
+    text = str(value).strip().lower()
+    if text in ("true", "t", "yes", "y", "1"):
+        return True
+    # The empty string stays falsy, as it was under ``type=bool``.
+    if text in ("false", "f", "no", "n", "0", ""):
+        return False
+    raise argparse.ArgumentTypeError("Expected a boolean value, got %r." % value)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run peptides for fastg")
+    parser.add_argument("fastg", help="Path to Spades formatted FASTG.")
+    parser.add_argument(
+        "-d",
+        "--dbname",
+        default="results.db",
+        help="Name for the results database. Defaults to results.db",
+    )
+    parser.add_argument(
+        "-c",
+        "--cleavage",
+        default="trypsin",
+        help="Cleavage rule from ExPASy cleavage rules. Defaults to trypsin.",
+    )
+    parser.add_argument(
+        "-p",
+        "--min_protein_length",
+        default=55,
+        type=int,
+        help="Minimum protein length in number of amino acids. Defaults to 55.",
+    )
+    parser.add_argument(
+        "-m",
+        "--min_peptide_length",
+        default=8,
+        type=int,
+        help="Minimum peptide length in amino acids. Defaults to eight.",
+    )
+    parser.add_argument(
+        "-l",
+        "--plots",
+        default=True,
+        type=_str2bool,
+        help="Generate diagnostic plots.",
+    )
+
+    args = parser.parse_args()
+
+    print(args)
+
+    fg.peptides_for_fastg(
+        fastg_filename=args.fastg,
+        db_name=args.dbname,
+        cleavage=args.cleavage,
+        # The option is given in amino acids; it is multiplied by 3 here,
+        # presumably to express the length in nucleotides -- TODO confirm.
+        min_protein_length=(args.min_protein_length * 3),
+        min_peptide_length=args.min_peptide_length,
+        create_plots=args.plots,
+    )
b
diff -r 000000000000 -r 6b226c5907a1 fastg2protlib-peptides.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastg2protlib-peptides.xml Fri Aug 07 06:17:31 2020 -0400
[
@@ -0,0 +1,66 @@
+<!-- Galaxy wrapper around application.py: takes a FASTG dataset and collects
+     peptide.fasta, results.db and, when requested, four diagnostic plots. -->
+<tool id="fastg2protlib-peptides" name="FASTG2Protlib-Peptides" version="@VERSION@">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <description>Generate FASTA from FASTG</description>
+    <expand macro="pkg_requirement" />
+    <!-- $cleavage is quoted so that a rule name containing whitespace would
+         still reach the script as a single argument. -->
+    <command detect_errors="exit_code">
+        <![CDATA[
+        python '$__tool_directory__/application.py'
+        -m $min_peptide_length
+        -p $min_protein_length
+        -c '$cleavage'
+        -d 'results.db'
+        -l $show_plots
+        '$fastg_file'
+        ]]>
+    </command>
+    <inputs>
+        <param name="fastg_file" type="data" format="fastg" label="FASTG file" />
+        <param name="cleavage" type="select" label="Peptide Cleavage">
+            <option value="trypsin" selected="true">Trypsin</option>
+            <expand macro="cleavages" />
+        </param>
+        <param name="min_protein_length" type="integer" value="55" label="Minimum Protein Length in Amino Acids" />
+        <param name="min_peptide_length" type="integer" value="8" label="Minimum Peptide Length in Amino Acids" />
+        <param name="show_plots" type="boolean" checked="true" label="Create Diagnostic Plots" />
+    </inputs>
+    <outputs>
+        <!-- Typed as fasta, matching the FASTA output of the validate tool. -->
+        <data name="peptide_fasta" format="fasta" from_work_dir="peptide.fasta" label="${on_string} Peptides from FASTG" />
+        <data name="results_db" format="sqlite" from_work_dir="results.db" label="${on_string} Results DB" />
+        <!-- The plots below are only collected when show_plots is checked. -->
+        <data name="aa_count_plot" format="png" from_work_dir="aa_count_chart.png" label="${on_string} AA Count Plot">
+            <filter>show_plots == True</filter>
+        </data>
+        <data name="fastg_length_plot" format="png" from_work_dir="fastg_seq_lengths.png" label="${on_string} FASTG Sequence Length Plot">
+            <filter>show_plots == True</filter>
+        </data>
+        <data name="protein_length_plot" format="png" from_work_dir="protein_seq_lengths.png" label="${on_string} Protein Sequence Length Plot">
+            <filter>show_plots == True</filter>
+        </data>
+        <data name="gc_pct_plot" format="png" from_work_dir="gc_pct.png" label="${on_string} GC Percent Plot">
+            <filter>show_plots == True</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Digests two.fastg with trypsin, plots off, and checks for one peptide. -->
+        <test>
+            <param name="fastg_file" value="two.fastg" />
+            <param name="cleavage" value="trypsin" />
+            <param name="min_protein_length" value="20" />
+            <param name="min_peptide_length" value="8" />
+            <param name="show_plots" value="false" />
+            <output name="peptide_fasta">
+                <assert_contents>
+                    <has_text text="IFLPFSTHSR" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <expand macro="help-text" />
+</tool>
b
diff -r 000000000000 -r 6b226c5907a1 fastg2protlib-validate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastg2protlib-validate.xml Fri Aug 07 06:17:31 2020 -0400
[
@@ -0,0 +1,45 @@
+<!-- Galaxy wrapper around app_validate.py: takes MSGF+ tabular results and a
+     results database and collects protein.fasta and protein_scores.csv. -->
+<tool id="fastg2protlib-validate" name="FASTG2Protlib-Validate" version="@VERSION@">
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <description>Validate a candidate protein library</description>
+    <expand macro="pkg_requirement" />
+
+    <command detect_errors="exit_code">
+        <![CDATA[
+        python '$__tool_directory__/app_validate.py'
+        -d '$database_file'
+        -f $fdr_level
+        -x '$decoy_header'
+        '$tabular_file'
+        ]]>
+    </command>
+    <inputs>
+        <param name="tabular_file" type="data" format="txt" label="MSGF+ tabular file" />
+        <param name="database_file" type="data" format="sqlite" label="Database Name"/>
+        <!-- An FDR is a proportion, so the cutoff is limited to the range 0 to 1. -->
+        <param name="fdr_level" type="float" value="0.10" min="0" max="1" label="FDR value for validation."/>
+        <param name="decoy_header" type="text" value="XXX_" label="Decoy protein header"/>
+    </inputs>
+    <outputs>
+        <data name="protein_fasta" format="fasta" from_work_dir="protein.fasta" label="Validated protein library"/>
+        <data name="protein_score" format="csv" from_work_dir="protein_scores.csv" label="Protein scores"/>
+    </outputs>
+    <tests>
+        <!-- Validates mgf_tst.tab against tst_valid.db and checks for one sequence. -->
+        <test>
+            <param name="tabular_file" value="mgf_tst.tab" />
+            <param name="database_file" value="tst_valid.db" />
+            <param name="fdr_level" value="0.10" />
+            <param name="decoy_header" value="XXX_" />
+            <output name="protein_fasta">
+                <assert_contents>
+                    <has_text text="RYSRPLSHL" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <expand macro="help-text" />
+</tool>
b
diff -r 000000000000 -r 6b226c5907a1 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Fri Aug 07 06:17:31 2020 -0400
[
@@ -0,0 +1,29 @@
+<macros> <!-- Imported by fastg2protlib-peptides.xml and fastg2protlib-validate.xml. -->
+    <token name="@VERSION@">1.0.2</token> <!-- Used as the tool version and as the required package version. -->
+    <xml name="cleavages">
+        <option value="arg-c">Arg-c</option>
+        <option value="asp-n">Asp-n</option>
+        <option value="thrombin">Thrombin</option></xml> <!-- cleavages: extra choices; the peptides tool lists trypsin itself. -->
+    <xml name="help-text">
+        <help>
+            <![CDATA[
+
+                FASTG2Protlib creates a validated protein FASTA library starting from FASTG output. The tool
+                operates in two steps.
+
+                **Generate Peptides from FASTG**
+                
+                Use the tool to generate peptides cleaved from putative proteins generated from a FASTG file.
+
+                **Generate Verified Protein Library**
+               
+                Use the tool to generate a verified protein library with MSGF+ verified peptides as input
+            ]]>
+        </help>
+    </xml> <!-- help-text: the same help section is expanded into both tools. -->
+    <xml name="pkg_requirement">
+        <requirements>
+            <requirement type="package" version="@VERSION@">fastg2protlib</requirement>
+        </requirements>
+    </xml> <!-- pkg_requirement: the fastg2protlib package, pinned via the version token. -->
+</macros>
\ No newline at end of file
b
diff -r 000000000000 -r 6b226c5907a1 test-data/mgf_tst.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mgf_tst.tab Fri Aug 07 06:17:31 2020 -0400
b
@@ -0,0 +1,25 @@
+#SpecFile SpecID ScanNum FragMethod Precursor IsotopeError PrecursorError(ppm) Charge Peptide Protein DeNovoScore MSGFScore SpecEValue EValue QValue PepQValue
+wendt005_mickela_20200214_17647_12_V.mzML index=8575 -1 CID 501.26144 1 14.785407 3 +42.011IFLPFSTHSR+0.984 Pep_1|Protein_1(pre=-,post=-) 74 36 2.7071892E-10 2.855757E-4 0.0 0.0
+wendt005_mickela_20200214_17647_12_V.mzML index=10628 -1 CID 631.3283 0 16.628782 2 RTVWSN+0.984GTSPR Pep_2|Protein_1_29(pre=-,post=P) 61 36 6.954425E-10 7.203602E-4 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=6020 -1 CID 607.79266 1 -11.606342 2 +42.011AQ+0.984YWLSQFK Pep_3|Protein_1_28(pre=-,post=-) 23 10 9.667708E-10 9.6076715E-4 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=11469 -1 CID 470.6093 1 -7.4827867 3 RLLLQ+0.984C+57.021PRVPR Pep_4|Protein_2(pre=-,post=L) 68 35 1.2193706E-9 0.0012630607 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=15017 -1 CID 799.7483 0 2.747454 3 YFM+15.995YSIQYILIFYVQYVK Pep_5|Protein_2_29(pre=-,post=-) 2 -17 2.587433E-9 0.0029598887 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=16951 -1 CID 424.5418 1 -19.301939 3 RC+57.021GPLQASEPR Pep_6|Protein_4_16_31_32(pre=-,post=E) 69 41 3.636947E-9 0.0037672587 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=34154 -1 CID 768.88916 1 13.068233 2 +42.011STPVELEFSQ+0.984VEK Pep_7|Protein_5_34(pre=-,post=-) 77 33 4.083382E-9 0.0043801093 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=12592 -1 CID 701.0163 0 -5.6593018 3 YQSTPNIYYILYMYIR Pep_8|Protein_5_6_34_40(pre=-,post=-) 77 19 6.9333055E-9 0.0077557205 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=12620 -1 CID 485.91306 0 -10.551063 3 M+15.995SGIITN+0.984EISVFK Pep_9|Protein_7_9_22(pre=-,post=-) 55 28 7.184936E-9 0.007707044 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=5150 -1 CID 479.9201 0 -11.509454 3 YFEGKPVIEEVK Pep_10|Protein_7_22(pre=-,post=-) 87 44 7.371949E-9 0.007776514 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=26411 -1 CID 708.3275 0 -15.423832 2 PAQ+0.984PTGTRPC+57.021SSR Pep_11|Protein_8_21(pre=R,post=-) 41 15 7.913002E-9 0.008488016 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=22717 -1 CID 783.3668 0 11.843052 2 +42.011EEQDTFAVNSQQK XXX_Pep_22060|Protein_2878(pre=-,post=-) 135 39 7.923481E-9 0.008499257 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=8020 -1 CID 629.31287 0 13.772342 2 +42.011FQEPQQPWR XXX_Pep_16062|Protein_2307(pre=-,post=-) 31 14 8.15928E-9 0.008108611 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=20351 -1 CID 497.26334 0 0.79782444 2 LVPASGMYR XXX_Pep_9843|Protein_1629(pre=-,post=-) 28 16 8.401295E-9 0.0083491225 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=18867 -1 CID 559.29803 1 19.583662 3 +42.011LIGTATSVDEAIAN+0.984EK XXX_Pep_14112|Protein_2085(pre=-,post=-) 57 21 8.928303E-9 0.009987362 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=43084 -1 CID 653.8416 1 0.048319984 2 +42.011YSNYILYTVK XXX_Pep_3426|Protein_702(pre=-,post=-) 13 0 9.61208E-9 0.009760836 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=43084 -1 CID 653.8416 0 14.842637 2 +42.011YSN+0.984YILYTVK XXX_Pep_3426|Protein_702(pre=-,post=-) 13 0 9.61208E-9 0.009760836 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=32362 -1 CID 1036.4833 0 5.6531625 3 AWIGMQ+0.984WNGIEWNAM+15.995EWIQLEWNGK XXX_Pep_14712|Protein_2168(pre=-,post=-) 13 -31 1.0415514E-8 0.012581826 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=10370 -1 CID 461.22098 0 0.13233389 2 N+0.984DTQMLAK XXX_Pep_7493|Protein_1335_1348(pre=-,post=-) 83 51 1.0437349E-8 0.0101322755 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=21871 -1 CID 523.7984 0 1.3982916 2 +42.011AYVLNISPK XXX_Pep_31555|Protein_3895(pre=-,post=-) 86 43 1.0667454E-8 0.010601209 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=15561 -1 CID 581.9749 0 17.304827 3 GLDWDLAADLEGN+0.984IIK XXX_Pep_17726|Protein_2472(pre=-,post=-) 107 48 1.0803276E-8 0.012084738 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=22491 -1 CID 522.26483 0 -13.790032 2 +42.011Q+0.984LEAVQ+0.984VGR XXX_Pep_12397|Protein_1871(pre=-,post=-) 46 31 1.09720055E-8 0.01090387 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=9573 -1 CID 623.81696 0 -13.893293 2 +42.011RHALDGPWPR XXX_Pep_17806|Protein_2476(pre=-,post=Q) 28 13 1.1109479E-8 0.011281409 0.088495575 0.0952381
+wendt005_mickela_20200214_17647_12_V.mzML index=16000 -1 CID 381.54922 1 13.824602 3 +42.011N+0.984TYLSFLIK XXX_Pep_29178|Protein_3650(pre=-,post=-) 66 42 1.1131118E-8 0.011061994 0.088495575 0.0952381
b
diff -r 000000000000 -r 6b226c5907a1 test-data/tst_valid.db
b
Binary file test-data/tst_valid.db has changed
b
diff -r 000000000000 -r 6b226c5907a1 test-data/two.fastg
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/two.fastg Fri Aug 07 06:17:31 2020 -0400
b
@@ -0,0 +1,40 @@
+>EDGE_1_length_84_cov_1.0:EDGE_3_length_84_cov_1.0;
+CGTTATTCGCGCCCACTCTCCCATTTATCCGCGCAAGCGGATGCGATGCGATTGCCCGCTAAGATATTCTTACCATTCTCGACA
+>EDGE_1_length_84_cov_1.0';
+TGTCGAGAATGGTAAGAATATCTTAGCGGGCAATCGCATCGCATCCGCTTGCGCGGATAAATGGGAGAGTGGGCGCGAATAACG
+>EDGE_2_length_84_cov_1.0:EDGE_3_length_84_cov_1.0;
+CTGGTCCTGTTGACTACAATGGGCCCAACTCAATCACAGCTCGAGCGCCTTGAATAACATACTCATCTCTATACATTCTCGACA
+>EDGE_2_length_84_cov_1.0':EDGE_3_length_84_cov_1.0';
+TGTCGAGAATGTATAGAGATGAGTATGTTATTCAAGGCGCTCGAGCTGTGATTGAGTTGGGCCCATTGTAGTCAACAGGACCAG
+>EDGE_3_length_84_cov_1.0:EDGE_2_length_84_cov_1.0,EDGE_4_length_84_cov_1.0;
+CATTCTCGACATGCTGAGCTGAGACGGCGTCGATGCATAGCGGACTTTCGGTCAGTCGCAATTCCTCACGAGACTGGTCCTGTT
+>EDGE_3_length_84_cov_1.0':EDGE_2_length_84_cov_1.0',EDGE_1_length_84_cov_1.0';
+AACAGGACCAGTCTCGTGAGGAATTGCGACTGACCGAAAGTCCGCTATGCATCGACGCCGTCTCAGCTCAGCATGTCGAGAATG
+>EDGE_4_length_84_cov_1.0:EDGE_5_length_84_cov_1.0;
+CTGGTCCTGTTACAGAGCTGGCGTACGCGTTGAACACTTCACAGATGATAGGGATTCGGGTAAAGAGCGTGTCATTGGGGGCTT
+>EDGE_4_length_84_cov_1.0':EDGE_3_length_84_cov_1.0';
+AAGCCCCCAATGACACGCTCTTTACCCGAATCCCTATCATCTGTGAAGTGTTCAACGCGTACGCCAGCTCTGTAACAGGACCAG
+>EDGE_5_length_84_cov_1.0;
+ATTGGGGGCTTCATACATAGAGCAAGGGCGTCGAACGGTCGTGAAAGTCTTAGTACCGCACGTACCAACTTACTGAGGATATTG
+>EDGE_5_length_84_cov_1.0':EDGE_4_length_84_cov_1.0',EDGE_6_length_84_cov_1.0';
+CAATATCCTCAGTAAGTTGGTACGTGCGGTACTAAGACTTTCACGACCGTTCGACGCCCTTGCTCTATGTATGAAGCCCCCAAT
+>EDGE_6_length_84_cov_1.0:EDGE_5_length_84_cov_1.0;
+AAGAGGCCGCCACCGTTTTAGGGGGGGAAGGTTGAAGATCTCCTCTTCTCATGACTGAACTCGCGAGGGCCGTATTGGGGGCTT
+>EDGE_6_length_84_cov_1.0':EDGE_8_length_84_cov_1.0';
+AAGCCCCCAATACGGCCCTCGCGAGTTCAGTCATGAGAAGAGGAGATCTTCAACCTTCCCCCCCTAAAACGGTGGCGGCCTCTT
+>EDGE_7_length_84_cov_1.0:EDGE_8_length_84_cov_1.0;
+AAGAGGCCGCCAAAGAACAAAGGCTTACTGTGCGCAGAGGAACGCCCATTTAGCGGCTGGCGTTTTGAATCCTTTTAATATTGT
+>EDGE_7_length_84_cov_1.0':EDGE_8_length_84_cov_1.0';
+ACAATATTAAAAGGATTCAAAACGCCAGCCGCTAAATGGGCGTTCCTCTGCGCACAGTAAGCCTTTGTTCTTTGGCGGCCTCTT
+>EDGE_8_length_84_cov_1.0:EDGE_7_length_84_cov_1.0,EDGE_6_length_84_cov_1.0;
+TTTAATATTGTTTAATCCAATTCCCTCATTTAGGACCCTACCAAGTCAACATTGGTATATGAATGCGACCTCGAAGAGGCCGCC
+>EDGE_8_length_84_cov_1.0':EDGE_7_length_84_cov_1.0',EDGE_9_length_84_cov_1.0';
+GGCGGCCTCTTCGAGGTCGCATTCATATACCAATGTTGACTTGGTAGGGTCCTAAATGAGGGAATTGGATTAAACAATATTAAA
+>EDGE_9_length_84_cov_1.0:EDGE_8_length_84_cov_1.0;
+TAAAAATGACAGTGGTTGGTGCTCTAAACTTCATTTGGTTAACTCGTGTATCAGCGCGATAGGCTGTTAGAGGTTTAATATTGT
+>EDGE_9_length_84_cov_1.0';
+ACAATATTAAACCTCTAACAGCCTATCGCGCTGATACACGAGTTAACCAAATGAAGTTTAGAGCACCAACCACTGTCATTTTTA
+>EDGE_10_length_84_cov_1.0;
+ATGGCAAGGTACTTCCGGTCTTAATGAATGGCCGGGAAAGGTACGCACGCGGTATGGGGGGGTGAAGGGGCGAATAGACAGGCT
+>EDGE_10_length_84_cov_1.0':EDGE_10_length_84_cov_1.0;
+AGCCTGTCTATTCGCCCCTTCACCCCCCCATACCGCGTGCGTACCTTTCCCGGCCATTCATTAAGACCGGAAGTACCTTGCCAT