changeset 2:77ddaee887a8 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/pi_db_tools commit 71a4265d11aef48342142b8cf2caa86f79f9a554
author galaxyp
date Fri, 01 Sep 2017 03:14:54 -0400
parents 8a30d6e5b97d
children 78afc81ab244
files __pycache__/peptide_pi_annotator.cpython-36.pyc align_dbspec.py delta_pi_calc.xml pi_db_split.xml pi_dbspec_align.xml test-data/merged_twice_decoy_fr1-3.fasta test-data/specnames.txt
diffstat 7 files changed, 225 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
Binary file __pycache__/peptide_pi_annotator.cpython-36.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/align_dbspec.py	Fri Sep 01 03:14:54 2017 -0400
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+import sys
+import os
+import argparse
+import re
+from Bio import SeqIO
+
+
+def create_spectra_maps(specfiles, dbfiles, frregex, firstfr):
+    """Output something like
+    {'fr01', 'fr04'} # Normal filename set
+    and
+    {'fr03': ['fr02', 'fr03']}  # pool definition
+    and
+    {'fr04': 'fr04', 'fr04b': 'fr04'}  # rerun fraction, rerun may also be pool
+    """
+    specrange = get_fn_fractionmap(specfiles, frregex)
+    to_pool = []
+    poolmap, rerun_map, normal_fns = {}, [], set()
+    for i in range(0, len(dbfiles)):
+        num = i + firstfr
+        if num not in specrange:
+            to_pool.append(i)
+        elif to_pool and num in specrange:
+            to_pool.append(i)
+            poolmap[specrange[num][0]] = to_pool
+            to_pool = []
+        if not to_pool and specrange[num][0] in poolmap:
+            if poolmap[specrange[num][0]][-1] != i:
+                normal_fns.add((dbfiles[num - 1],
+                                specfiles[specrange[num][0]]))
+        elif not to_pool:
+            normal_fns.add((dbfiles[num - 1], specfiles[specrange[num][0]]))
+    for num in sorted(specrange.keys()):
+        if len(specrange[num]) > 1:
+            rerun_map.append(specrange[num])
+    return normal_fns, rerun_map, poolmap
+
+
+def get_fn_fractionmap(files, frregex):
+    fnfrmap = {}
+    for f_ix, fn in enumerate(files):
+        fnum = int(re.sub(frregex, '\\1', fn))
+        try:
+            fnfrmap[fnum].append(f_ix)
+        except KeyError:
+            fnfrmap[fnum] = [f_ix]
+    return fnfrmap
+
+
+def pool_fasta_files(poolfiles):
+    acc_seq = {}
+    for fr in poolfiles:
+        for seq in SeqIO.parse(fr, 'fasta'):
+            sequence = str(seq.seq.upper())
+            try:
+                if sequence in acc_seq[seq.id]:
+                    continue
+            except KeyError:
+                acc_seq[seq.id] = {sequence: 1}
+                yield seq
+            else:
+                acc_seq[seq.id][sequence] = 1
+                yield seq
+
+
+def write_pooled_fasta(poolmap, specnames, dbfiles):
+    """Runs through poolmap and pools db files into one output file per
+    pool, filtering out duplicate entries"""
+    for outfr, infrs in poolmap.items():
+        outfn = os.path.join('aligned_out', os.path.basename(specnames[outfr]))
+        print('Pooling FASTA files {} - {} into: {}'.format(
+            dbfiles[infrs[0]], dbfiles[infrs[-1]], outfn))
+        with open(outfn, 'w') as fp:
+            SeqIO.write(pool_fasta_files([dbfiles[x] for x in infrs]), fp,
+                        'fasta')
+
+
+def write_nonpooled_fasta(fractions):
+    """Symlinks nonpooled db files"""
+    print('Symlinking non-pooled non-rerun files',
+          [(fr[0], os.path.join('aligned_out', os.path.basename(fr[1])))
+           for fr in fractions])
+    [os.symlink(fr[0], os.path.join('aligned_out', os.path.basename(fr[1])))
+     for fr in fractions]
+
+
+def copy_rerun_fasta(rerun_map, specnames):
+    for dst_indices in rerun_map:
+        src = os.path.join(specnames[dst_indices[0]])
+        for outfn in [specnames[x] for x in dst_indices[1:]]:
+            print('Symlinking {} to {}'.format(src, outfn))
+            os.symlink(src, os.path.join('aligned_out', outfn))
+
+
+def main():
+    args = parse_commandline()
+    with open(args.spectranames) as fp:
+        spectranames = [x.strip() for x in fp.read().strip().split('\n')]
+    vanilla_fr, rerun_map, poolmap = create_spectra_maps(spectranames,
+                                                         args.dbfiles,
+                                                         args.frspecregex,
+                                                         args.firstfr)
+    write_pooled_fasta(poolmap, spectranames, args.dbfiles)
+    write_nonpooled_fasta(vanilla_fr)
+    copy_rerun_fasta(rerun_map, spectranames)
+
+
+def parse_commandline():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('--specnames', dest='spectranames', help='File '
+                        'containing spectra filenames with fractions. '
+                        'Test data example illustrates reruns (fr03b, 09b) and'
+                        ' pooled samples (fr05-09 are inside fr09 and fr09b).',
+                        required=True)
+    parser.add_argument('--dbfiles', dest='dbfiles', help='FASTA db files',
+                        nargs='+', required=True)
+    parser.add_argument('--frspec', dest='frspecregex', help='Fraction regex '
+                        'to detect spectra fraction numbers', required=True)
+    parser.add_argument('--firstfr', dest='firstfr', help='First fraction nr',
+                        type=int, required=True)
+    return parser.parse_args(sys.argv[1:])
+
+
+if __name__ == '__main__':
+    main()
--- a/delta_pi_calc.xml	Mon Jul 24 05:25:22 2017 -0400
+++ b/delta_pi_calc.xml	Fri Sep 01 03:14:54 2017 -0400
@@ -1,9 +1,9 @@
-<tool id="calc_delta_pi" name="Add delta pI" version="1.1">
+<tool id="calc_delta_pi" name="Add delta pI" version="1.2">
+    <description>to peptide table</description>
     <requirements>
         <requirement type="package" version="3.6">python</requirement>
     </requirements>
-    <description>to peptide table</description>
-    <command>
+    <command detect_errors="exit_code">
 	    python '$__tool_directory__/peptide_pi_annotator.py' -i '$trainingpi' -p '$peptable' --out '$output'
 	    #if $stripcol
 	        --stripcol $stripcol
--- a/pi_db_split.xml	Mon Jul 24 05:25:22 2017 -0400
+++ b/pi_db_split.xml	Fri Sep 01 03:14:54 2017 -0400
@@ -1,10 +1,10 @@
-<tool id="pi_db_split" name="Split peptide database" version="1.1">
+<tool id="pi_db_split" name="Split peptide database" version="1.2">
     <description>into pI separated fractions</description>
     <requirements>
         <requirement type="package">numpy</requirement>
         <requirement type="package" version="3.6">python</requirement>
     </requirements>
-    <command>
+    <command detect_errors="exit_code">
 	    <![CDATA[
 	    mkdir pi_fr_out && cd pi_fr_out &&
 	    python '$__tool_directory__/pi_database_splitter.py' -i '$pipeptides' -p '$peptable'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pi_dbspec_align.xml	Fri Sep 01 03:14:54 2017 -0400
@@ -0,0 +1,77 @@
+<tool id="pi_dbspec_align" name="Align DB fractions" version="0.3">
+    <description>to resemble spectra fraction scheme</description>
+    <requirements>
+        <requirement type="package" version="3.6">python</requirement>
+	<requirement type="package" version="1.62">biopython</requirement>
+    </requirements>
+    <command detect_errors="exit_code">
+	    <![CDATA[
+	    mkdir aligned_out && 
+	    python '$__tool_directory__/align_dbspec.py' 
+	    --specnames $specnames
+	    --dbfiles 
+		#for $key in $databases.keys()
+		'$databases[$key]'
+ 		#end for
+	    --frspec '$frspec'
+	    --firstfr $firstfr
+
+	    ]]>
+    </command>
+    
+    <inputs>
+	    <param name="specnames" type="data" format="text,tabular" label="Spectra files" />
+	    <param name="databases" type="data_collection" collection_type="list" format="fasta" label="Fractionated databases" />
+      	    <param name="frspec" type="text" label="Regex to find fraction numbers in spectra file names" help="If spectra file is called myspectra_fr01b.mzML, use .*fr([0-9]+).*" >
+            <sanitizer>
+                <valid initial="string.printable">
+                    <remove value="&apos;"/>
+                </valid>
+            </sanitizer>
+	</param>
+      	    <param name="firstfr" type="integer" value="1" label="First fraction number in series" />
+    </inputs>
+    
+    <outputs>
+	<collection name="aligned_db" type="list" label="spectra-fraction-aligned DB">
+            <discover_datasets pattern="__designation__" ext="fasta" directory="aligned_out" />
+	</collection>
+    </outputs>
+    <tests>
+	    <test>
+		    <param name="specnames" value="specnames.txt" />
+		    <param name="databases">
+		    	<collection type="list">
+			    <element name="fr1" value="target_splitdb_fr1.fasta" />
+			    <element name="fr2" value="target_splitdb_fr2.fasta" />
+			    <element name="fr3" value="target_splitdb_fr3.fasta" />
+			    <element name="fr4" value="decoy_splitdb_fr1.fasta" />
+			    <element name="fr5" value="decoy_splitdb_fr2.fasta" />
+			    <element name="fr6" value="decoy_splitdb_fr3.fasta" />
+			    <element name="fr7" value="decoy_splitdb_fr1.fasta" />
+			    <element name="fr8" value="decoy_splitdb_fr2.fasta" />
+			    <element name="fr9" value="decoy_splitdb_fr3.fasta" />
+			</collection>
+			</param>
+		    <param name="frspec" value=".*c_f([0-9]+).*" />
+		    <param name="firstfr" value="1" />
+		    <output_collection name="aligned_db" type="list">
+			    <element name="spec_f01.mzML" value="target_splitdb_fr1.fasta" />
+			    <element name="spec_f02.mzML" value="target_splitdb_fr2.fasta" />
+			    <element name="spec_f03.mzML" value="target_splitdb_fr3.fasta" />
+			    <element name="spec_f03b.mzML" value="target_splitdb_fr3.fasta" />
+			    <element name="spec_f09.mzML" value="merged_twice_decoy_fr1-3.fasta" compare="sim_size" />
+			    <element name="spec_f09b.mzML" value="merged_twice_decoy_fr1-3.fasta" compare="sim_size" />
+		    </output_collection>
+	    </test>
+    </tests>
+
+    <help>
+	    Filters, pools and duplicates fractionated databases to match a set of identically fractionated spectra files which have been
+	    subjected to pooling and contain reruns.
+	    For example, you may have fractions 1-10 in databases, but spectra file fractions 4-7 have been pooled before loading to the MS,
+	    and spectra fractions 2 and 8 have been rerun, creating fractions 2 and 2a, and 8, 8a and 8b.
+	    This tool pools FASTA databases and duplicates them where needed to line up the databases to your spectra collections.
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/merged_twice_decoy_fr1-3.fasta	Fri Sep 01 03:14:54 2017 -0400
@@ -0,0 +1,10 @@
+>decoy_protein1
+TFSLFGCSIPNTNVEFSIKLFDVCLLLCNCLFSLIIMIYVII
+>decoy_protein2
+TFSLFGCSIPNTNVEFSI
+>decoy_protein1
+LNLSKPILSEST
+>decoy_protein3
+LFDVCLLLCNCLFSLIIMIYVIIK
+>decoy_protein2
+LFDVCLLLCNCLFSLIIMIYVIIKLWLFK
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/specnames.txt	Fri Sep 01 03:14:54 2017 -0400
@@ -0,0 +1,6 @@
+spec_f01.mzML
+spec_f02.mzML
+spec_f03.mzML
+spec_f03b.mzML
+spec_f09.mzML
+spec_f09b.mzML