diff pi_database_splitter.py @ 0:34c5c95740a1 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/pi_db_tools commit a58e2a324724f344a07d4499c860a5b2da06927d
author galaxyp
date Mon, 22 May 2017 05:08:23 -0400
parents
children 8a30d6e5b97d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pi_database_splitter.py	Mon May 22 05:08:23 2017 -0400
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+import sys
+import argparse
+from numpy import median
+from contextlib import ExitStack
+
+
+def main():
+    if sys.argv[1:] == []:
+        sys.argv.append('-h')
+    args = parse_commandline()
+    locfun = {False: locatefraction,
+              True: reverse_locatefraction}[args.reverse]
+    # Column nrs should start from 0
+    # If negative, -1 is last item in list, etc
+    if args.fdrcol > 0:
+        args.fdrcol -= 1
+    if args.deltapicol > 0:
+        args.deltapicol -= 1
+    pishift = get_pishift(args.train_peptable, args.fdrcol, args.deltapicol,
+                          args.fdrcutoff, args.picutoff)
+    binarray = get_bin_array(args.fr_amount, args.fr_width, args.intercept,
+                             args.tolerance, pishift)
+    write_fractions(args.pipeps, args.fr_amount, args.prefix,
+                    binarray, locfun, args.minlen, args.maxlen)
+
+
+def locatefraction(pep_pi, bins):
+    index = []
+    for pibin in bins:
+        if pep_pi > pibin[2]:
+            continue
+        elif pep_pi >= pibin[1]:
+            index.append(pibin[0])
+        else:
+            return index
+    return index
+
+
+def reverse_locatefraction(pep_pi, bins):
+    index = []
+    for pibin in bins:
+        if pep_pi < pibin[1]:
+            continue
+        elif pep_pi < pibin[2]:
+            index.append(pibin[0])
+        else:
+            return index
+    return index
+
+
+def parse_commandline():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('-p', dest='train_peptable', help='Peptide table with '
+                        'peptides, FDR, and fraction numbers. Used to '
+                        'calculate pI shift. Leave emtpy for no shift. '
+                        'Tab separated file.')
+    parser.add_argument('--deltacol', dest='deltapicol', help='Delta pI column'
+                        ' number in peptide table. First column is nr. 1. '
+                        'Negative number for counting from last col '
+                        '(-1 is last).', default=False, type=int)
+    parser.add_argument('--picutoff', dest='picutoff',
+                        help='delta pI value to filter experimental peptides'
+                        ' when calculating pi shift.', default=0.2, type=float)
+    parser.add_argument('--fdrcol', dest='fdrcol', help='FDR column number in '
+                        'peptide table. First column is nr. 1. Empty includes '
+                        'all peptides', default=False, type=int)
+    parser.add_argument('--fdrcutoff', dest='fdrcutoff',
+                        help='FDR cutoff value to filter experimental peptides'
+                        ' when calculating pi shift.', default=0, type=float)
+    parser.add_argument('-i', dest='pipeps', help='A tab-separated txt file '
+                        'with accession, peptide seq, pI value')
+    parser.add_argument('--prefix', dest='prefix', default='pisep',
+                        help='Prefix for target/decoy output files')
+    parser.add_argument('--tolerance', dest='tolerance',
+                        help='Strip fraction tolerance pi tolerance represents'
+                        ' 2.5/97.5 percentile', type=float)
+    parser.add_argument('--amount', dest='fr_amount',
+                        help='Strip fraction amount', type=int)
+    parser.add_argument('--reverse', dest='reverse', help='Strip is reversed',
+                        action='store_const', const=True, default=False)
+    parser.add_argument('--intercept', dest='intercept',
+                        help='pI Intercept of strip', type=float)
+    parser.add_argument('--width', dest='fr_width',
+                        help='Strip fraction width in pI', type=float)
+    parser.add_argument('--minlen', dest='minlen', help='Minimal peptide length', 
+                        type=int)
+    parser.add_argument('--maxlen', dest='maxlen', help='Maximal peptide length',
+                        type=int, default=False)
+    return parser.parse_args(sys.argv[1:])
+
+
+def get_pishift(peptable, fdrcol, deltapicol, fdrcutoff, delta_pi_cutoff):
+    delta_pis = []
+    with open(peptable) as fp:
+        next(fp)  # skip header
+        for line in fp:
+            line = line.strip('\n').split('\t')
+            if fdrcol:
+                try:
+                    fdr = float(line[fdrcol])
+                except ValueError:
+                    continue
+                if fdr > fdrcutoff:
+                    continue
+            try:
+                delta_pi = float(line[deltapicol])
+            except ValueError:
+                continue
+            if delta_pi < delta_pi_cutoff:
+                delta_pis.append(delta_pi)
+    shift = median(delta_pis)
+    print('pI shift (median of delta pIs): {}'.format(shift))
+    return shift
+
+
+def get_bin_array(amount_fractions, fr_width, intercept, tolerance, pi_shift):
+    frnr = 1
+    bin_array = []
+    while frnr <= amount_fractions:
+        pi_center = fr_width * frnr + intercept
+        bin_left = pi_center - fr_width / 2 - tolerance - pi_shift
+        bin_right = pi_center + fr_width / 2 + tolerance - pi_shift
+        print('Bins in fraction', frnr, bin_left, bin_right)
+        bin_array.append((frnr, bin_left, bin_right))
+        frnr += 1
+    return bin_array
+
+
+def write_fractions(pi_peptides_fn, amount_fractions, out_prefix,
+                    bin_array, locate_function, minlen, maxlen):
+    amountpad = len(str(amount_fractions))
+    with ExitStack() as stack:
+        target_out_fp = {frnr: ([], stack.enter_context(
+            open('{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr,
+                                                 pad=amountpad), 'w')))
+            for frnr in range(1, amount_fractions + 1)}
+        decoy_out_fp = {frnr: ([], stack.enter_context(
+            open('decoy_{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr,
+                                                       pad=amountpad), 'w')))
+            for frnr in range(1, amount_fractions + 1)}
+        input_fp = stack.enter_context(open(pi_peptides_fn))
+        pepcount = 0
+        for line in input_fp:
+            accs, pep, pi = line.strip().split("\t")
+            pi = float(pi)
+            if maxlen and len(pep) > maxlen:
+                continue 
+            elif len(pep) >= minlen:
+                pepcount += 1
+                if pep[-1] in {'K', 'R'}:
+                    rev_pep = pep[::-1][1:] + pep[-1]
+                else:
+                    rev_pep = pep[::-1]
+                for i in locate_function(pi, bin_array):
+                    target_out_fp[i][0].append('>{}\n{}\n'.format(accs, pep))
+                    # write pseudoReversed decoy peptide at the same time
+                    decoy_out_fp[i][0].append('>decoy_{}\n{}\n'.format(
+                        accs, rev_pep))
+            if pepcount > 1000000:
+                # write in chunks to make it go faster
+                pepcount = 0
+                [fp.write(''.join(peps)) for peps, fp in
+                 target_out_fp.values()]
+                [fp.write(''.join(peps)) for peps, fp in decoy_out_fp.values()]
+                target_out_fp = {fr: ([], pep_fp[1])
+                                 for fr, pep_fp in target_out_fp.items()}
+                decoy_out_fp = {fr: ([], pep_fp[1])
+                                for fr, pep_fp in decoy_out_fp.items()}
+        [fp.write(''.join(peps)) for peps, fp in target_out_fp.values()]
+        [fp.write(''.join(peps)) for peps, fp in decoy_out_fp.values()]
+
+
+if __name__ == '__main__':
+    main()