# HG changeset patch # User galaxyp # Date 1495444103 14400 # Node ID 34c5c95740a17cbc3835ca179042924b78a72a18 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/pi_db_tools commit a58e2a324724f344a07d4499c860a5b2da06927d diff -r 000000000000 -r 34c5c95740a1 README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,41 @@ +GalaxyP - Percolator +======================= + +- Home: +- Galaxy Tool Shed: + +.. _GalaxyP: https://github.com/galaxyproteomics/ + + +Contributing +------------ + +Contributions to this repository are reviewed through pull requests. If you would like your work acknowledged, please also add yourself to the Authors section. If your pull request is accepted, you will also be acknowledged in + + +Authors +------- + +Authors and contributors: + +* Jorrit Boekel diff -r 000000000000 -r 34c5c95740a1 delta_pi_calc.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/delta_pi_calc.xml Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,146 @@ + + + python + + to peptide table + + python '$__tool_directory__/peptide_pi_annotator.py' -i '$trainingpi' -p '$peptable' + --stripcol $stripcol --pepcol $pepcol --fraccol $fraccol --out '$output' + + --strippatterns + #for $strip in $strips + '$strip.pattern' + #end for + + --intercepts + #for $strip in $strips + $strip.intercept + #end for + + --widths + #for $strip in $strips + $strip.fr_width + #end for + + #if len($ignoremods) > 0 + --ignoremods + #for $mod in $ignoremods + '$mod.regex' + #end for + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In case you have no pI calculation method but a large table with + peptides and their predicted pIs available. 
This tool adds a column
+ with delta-pI values to a peptide or PSM table for each peptide it
+ can find in the predicted collection. Needs a tab-separated file
+ with peptide-sequences and their predicted pI, and a PSM/peptide table
+ with at least peptide sequences.
+
+ Regexes, or regular expressions are sequences of characters that
+ are used to find a certain pattern in a string of text. For example
+ the regex "peptide" will find the word "peptide" in the text
+ "thisisa peptide in my sample". More advanced regexes can allow for
+ finding for example specific but variable pieces of text, e.g.
+ "[a-c]+" will match a string "acbcba" in "yxyzyxacbcbayxzyxyzxy".
+ Much more elaborate regexes exist. Since this is a python script,
+ python regexes are described here:
+ https://docs.python.org/3/library/re.html
+
+
+
diff -r 000000000000 -r 34c5c95740a1 peptide_pi_annotator.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/peptide_pi_annotator.py Mon May 22 05:08:23 2017 -0400
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+"""Add experimental, predicted and delta pI columns to a peptide table."""
+
+import re
+import sys
+import argparse
+from collections import OrderedDict
+
+
+def main():
+    """Parse the command line and write the annotated peptide table."""
+    if sys.argv[1:] == []:
+        sys.argv.append('-h')
+    args = parse_commandline()
+    # Each strip pattern needs exactly one intercept and one width. Without
+    # this check a short list raised a bare IndexError and a long list was
+    # silently truncated.
+    if not (len(args.pipatterns) == len(args.intercepts)
+            == len(args.fr_width)):
+        sys.exit('--strippatterns, --intercepts and --widths need the same '
+                 'number of values')
+    # OrderedDict keeps the command line order, so "first matching pattern"
+    # is deterministic on every Python version.
+    strips = OrderedDict()
+    for pattern, intercept, fr_width in zip(args.pipatterns, args.intercepts,
+                                            args.fr_width):
+        strips[pattern] = {'intercept': intercept, 'fr_width': fr_width}
+    with open(args.outpeptable, 'w') as fp:
+        for outline in annotate_peptable(args.pipeps, args.peptable,
+                                         args.pepcol, args.frac_col,
+                                         args.stripcol, strips,
+                                         args.ignoremods):
+            fp.write('\t'.join([str(x) for x in outline]))
+            fp.write('\n')
+
+
+def get_first_matching_pattern(patterns, string):
+    """Return the first of patterns that re.search finds in string.
+
+    Returns False when none of the patterns match.
+    """
+    for pattern in patterns:
+        if re.search(pattern, string):
+            return pattern
+    return False
+
+
+def annotate_peptable(predicted_peps_fn, peptable, seqcol, frac_col, stripcol,
+                      strips, ignoremods):
+    """Yield peptable rows extended with experimental/predicted/delta pI.
+
+    predicted_peps_fn -- tab-separated file: peptide sequence, predicted pI
+    peptable -- tab-separated peptide/PSM table with a header line
+    seqcol, frac_col, stripcol -- 1-based column numbers in peptable
+    strips -- maps a regex pattern to {'intercept': .., 'fr_width': ..}
+    ignoremods -- modification weights (regexes) to strip from sequences;
+                  '*' strips every signed decimal weight
+
+    The first item yielded is the extended header. Peptides without a
+    prediction get 'NA' for predicted and delta pI. Raises ValueError when
+    no strip pattern matches a row's strip column.
+    """
+    if frac_col > 0:
+        frac_col -= 1
+    predicted_peps = {}
+    with open(predicted_peps_fn) as fp:
+        for line in fp:
+            line = line.strip('\n').split('\t')
+            if len(line) < 2:
+                # blank or malformed line, nothing to look up
+                continue
+            predicted_peps[line[0]] = line[1]
+    # Compile the modification regexes once instead of once per row. The
+    # wildcard pattern is a raw string: '\d' in a plain literal is an
+    # invalid escape sequence.
+    mod_regexes = [re.compile(r'[+-]\d*\.\d*' if weight == '*'
+                              else '[+-]{}'.format(weight))
+                   for weight in ignoremods]
+    not_predicted_count, predicted_count = 0, 0
+    with open(peptable) as fp:
+        header = next(fp).strip('\n').split('\t')
+        yield header + ['Experimental pI', 'Predicted pI', 'Delta pI']
+        for line in fp:
+            line = line.strip('\n').split('\t')
+            pattern = get_first_matching_pattern(strips.keys(),
+                                                 line[stripcol - 1])
+            if pattern is False:
+                # strips[False] used to raise an uninformative KeyError
+                raise ValueError('No strip pattern matches "{}"'.format(
+                    line[stripcol - 1]))
+            strip = strips[pattern]
+            exp_pi = (strip['fr_width'] * int(line[frac_col]) +
+                      strip['intercept'])
+
+            sequence = line[seqcol - 1]
+            for regex in mod_regexes:
+                sequence = regex.sub('', sequence)
+            try:
+                pred_pi = float(predicted_peps[sequence])
+            except KeyError:
+                print('CANNOT PREDICT', sequence)
+                not_predicted_count += 1
+                pred_pi, delta_pi = 'NA', 'NA'
+            else:
+                delta_pi = exp_pi - pred_pi
+                predicted_count += 1
+            yield line + [exp_pi, pred_pi, delta_pi]
+    print('Number of peptides without pI prediction: {}\n'
+          'Number of peptides with prediction: {}\n'.format(
+              not_predicted_count, predicted_count))
+
+
+def parse_commandline():
+    """Build the argument parser and parse sys.argv."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('--out', dest='outpeptable', help='Output peptide '
+                        'table')
+    parser.add_argument('-p', dest='peptable', help='Peptide/PSM table with '
+                        'peptides, FDR, fraction numbers. Used to calculate '
+                        'pI shift.')
+    parser.add_argument('-i', dest='pipeps', help='A tab-separated txt file '
+                        'with peptide seq, pI value')
+    parser.add_argument('--pepcol', dest='pepcol', help='Peptide sequence '
+                        'column number in peptide table. First column is 1.',
+                        default=False, type=int)
+    parser.add_argument('--fraccol', dest='frac_col', help='Fraction number '
+                        'column number in peptide table. First column is 1.',
+                        type=int)
+    parser.add_argument('--ignoremods', dest='ignoremods', help='Regex to '
+                        'identify modification weights to be ignored.',
+                        default=[], nargs='+', type=str)
+    parser.add_argument('--stripcol', dest='stripcol', help='Strip name '
+                        'column number in peptide table. Will be used to '
+                        'detect strips if multiple are present using pattern '
+                        'passed with --strippatterns. First column is nr. 1.',
+                        default=False, type=int)
+    parser.add_argument('--strippatterns', dest='pipatterns',
+                        help='Patterns to detect different pI ranges from e.g.'
+                        ' file name in peptide table', nargs='+')
+    parser.add_argument('--intercepts', dest='intercepts',
+                        help='pI Intercept of strips', nargs='+', type=float)
+    parser.add_argument('--widths', dest='fr_width', nargs='+',
+                        help='Strip fraction widths in pI', type=float)
+    return parser.parse_args(sys.argv[1:])
+
+
+if __name__ == '__main__':
+    main()
diff -r 000000000000 -r 34c5c95740a1 pi_database_splitter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pi_database_splitter.py Mon May 22 05:08:23 2017 -0400
@@ -0,0 +1,242 @@
+#!/usr/bin/env python
+"""Split a pI-annotated peptide list into per-fraction FASTA databases."""
+import sys
+import argparse
+from numpy import median
+from contextlib import ExitStack
+
+
+def main():
+    """Parse the command line and write target/decoy fraction databases."""
+    if sys.argv[1:] == []:
+        sys.argv.append('-h')
+    args = parse_commandline()
+    locfun = reverse_locatefraction if args.reverse else locatefraction
+    # Column nrs should start from 0
+    # If negative, -1 is last item in list, etc
+    if not args.fdrcol:
+        # Not passed (False) or 0: no FDR filtering. None rather than a
+        # falsy number, so that column 1 (index 0) is still filtered on.
+        fdrcol = None
+    elif args.fdrcol > 0:
+        fdrcol = args.fdrcol - 1
+    else:
+        fdrcol = args.fdrcol
+    if args.deltapicol > 0:
+        args.deltapicol -= 1
+    if args.train_peptable:
+        pishift = get_pishift(args.train_peptable, fdrcol, args.deltapicol,
+                              args.fdrcutoff, args.picutoff)
+    else:
+        # -p is documented as "Leave empty for no shift"
+        pishift = 0
+    binarray = get_bin_array(args.fr_amount, args.fr_width, args.intercept,
+                             args.tolerance, pishift)
+    write_fractions(args.pipeps, args.fr_amount, args.prefix,
+                    binarray, locfun, args.minlen, args.maxlen)
+
+
+def locatefraction(pep_pi, bins):
+    """Return the fraction numbers of all bins that contain pep_pi.
+
+    bins -- (fraction nr, left pI, right pI) tuples. A bin matches when
+    left <= pep_pi <= right; scanning stops at the first bin whose left
+    edge lies above pep_pi.
+    """
+    index = []
+    for pibin in bins:
+        if pep_pi > pibin[2]:
+            continue
+        elif pep_pi >= pibin[1]:
+            index.append(pibin[0])
+        else:
+            return index
+    return index
+
+
+def reverse_locatefraction(pep_pi, bins):
+    """Return the fraction numbers of matching bins for a reversed strip.
+
+    A bin matches when pibin[1] <= pep_pi < pibin[2]; scanning stops at the
+    first bin with pibin[2] <= pep_pi.
+    NOTE(review): confirm which bin edge is the higher one for reversed
+    strips, the bins from get_bin_array are not reordered for them.
+    """
+    index = []
+    for pibin in bins:
+        if pep_pi < pibin[1]:
+            continue
+        elif pep_pi < pibin[2]:
+            index.append(pibin[0])
+        else:
+            return index
+    return index
+
+
+def parse_commandline():
+    """Build the argument parser and parse sys.argv."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('-p', dest='train_peptable', help='Peptide table with '
+                        'peptides, FDR, and fraction numbers. Used to '
+                        'calculate pI shift. Leave empty for no shift. '
+                        'Tab separated file.')
+    parser.add_argument('--deltacol', dest='deltapicol', help='Delta pI column'
+                        ' number in peptide table. First column is nr. 1. '
+                        'Negative number for counting from last col '
+                        '(-1 is last).', default=False, type=int)
+    parser.add_argument('--picutoff', dest='picutoff',
+                        help='delta pI value to filter experimental peptides'
+                        ' when calculating pi shift.', default=0.2, type=float)
+    parser.add_argument('--fdrcol', dest='fdrcol', help='FDR column number in '
+                        'peptide table. First column is nr. 1. Empty includes '
+                        'all peptides', default=False, type=int)
+    parser.add_argument('--fdrcutoff', dest='fdrcutoff',
+                        help='FDR cutoff value to filter experimental peptides'
+                        ' when calculating pi shift.', default=0, type=float)
+    parser.add_argument('-i', dest='pipeps', help='A tab-separated txt file '
+                        'with accession, peptide seq, pI value')
+    parser.add_argument('--prefix', dest='prefix', default='pisep',
+                        help='Prefix for target/decoy output files')
+    parser.add_argument('--tolerance', dest='tolerance',
+                        help='Strip fraction tolerance pi tolerance represents'
+                        ' 2.5/97.5 percentile', type=float)
+    parser.add_argument('--amount', dest='fr_amount',
+                        help='Strip fraction amount', type=int)
+    parser.add_argument('--reverse', dest='reverse', help='Strip is reversed',
+                        action='store_const', const=True, default=False)
+    parser.add_argument('--intercept', dest='intercept',
+                        help='pI Intercept of strip', type=float)
+    parser.add_argument('--width', dest='fr_width',
+                        help='Strip fraction width in pI', type=float)
+    parser.add_argument('--minlen', dest='minlen', help='Minimal peptide length',
+                        type=int)
+    parser.add_argument('--maxlen', dest='maxlen', help='Maximal peptide length',
+                        type=int, default=False)
+    return parser.parse_args(sys.argv[1:])
+
+
+def get_pishift(peptable, fdrcol, deltapicol, fdrcutoff, delta_pi_cutoff):
+    """Return the median delta pI of the filtered peptides in peptable.
+
+    peptable -- tab-separated peptide table with a header line
+    fdrcol -- 0-based FDR column index; None or False disables FDR filtering
+    deltapicol -- 0-based (or negative) delta pI column index
+    fdrcutoff -- rows with an FDR above this value are skipped
+    delta_pi_cutoff -- only delta pIs below this value are used
+
+    Rows whose FDR or delta pI is not a number are skipped. Raises
+    ValueError when no row passes the filters.
+    """
+    delta_pis = []
+    with open(peptable) as fp:
+        next(fp)  # skip header
+        for line in fp:
+            line = line.strip('\n').split('\t')
+            # Identity test instead of truthiness: index 0 (the first
+            # column) is a valid FDR column.
+            if fdrcol is not None and fdrcol is not False:
+                try:
+                    fdr = float(line[fdrcol])
+                except ValueError:
+                    continue
+                if fdr > fdrcutoff:
+                    continue
+            try:
+                delta_pi = float(line[deltapicol])
+            except ValueError:
+                continue
+            # NOTE(review): signed comparison, large negative deltas pass
+            # the cutoff - confirm that abs() is not intended here.
+            if delta_pi < delta_pi_cutoff:
+                delta_pis.append(delta_pi)
+    if not delta_pis:
+        # median([]) is nan, which silently produced empty databases
+        raise ValueError('No peptides in {} pass the FDR/delta pI filters, '
+                         'cannot calculate a pI shift'.format(peptable))
+    shift = median(delta_pis)
+    print('pI shift (median of delta pIs): {}'.format(shift))
+    return shift
+
+
+def get_bin_array(amount_fractions, fr_width, intercept, tolerance, pi_shift):
+    """Return a (fraction nr, left pI, right pI) tuple for each fraction.
+
+    Fraction n is centered on fr_width * n + intercept. Its edges lie half
+    a width plus tolerance from the center and are moved by -pi_shift.
+    """
+    bin_array = []
+    for frnr in range(1, amount_fractions + 1):
+        pi_center = fr_width * frnr + intercept
+        bin_left = pi_center - fr_width / 2 - tolerance - pi_shift
+        bin_right = pi_center + fr_width / 2 + tolerance - pi_shift
+        print('Bins in fraction', frnr, bin_left, bin_right)
+        bin_array.append((frnr, bin_left, bin_right))
+    return bin_array
+
+
+def _flush_fractions(fractions):
+    """Write the buffered entries of every fraction file and empty them."""
+    for entries, fp in fractions.values():
+        fp.write(''.join(entries))
+        entries.clear()
+
+
+def write_fractions(pi_peptides_fn, amount_fractions, out_prefix,
+                    bin_array, locate_function, minlen, maxlen):
+    """Write target and pseudo-reversed decoy FASTA files per fraction.
+
+    pi_peptides_fn -- tab-separated file: accession, peptide sequence, pI
+    amount_fractions -- number of fractions, each gets one target and one
+                        decoy file
+    out_prefix -- prefix of the output file names
+    bin_array -- fraction bins as returned by get_bin_array
+    locate_function -- maps (pI, bin_array) to matching fraction numbers
+    minlen -- minimal peptide length; None means no minimum
+    maxlen -- maximal peptide length; a falsy value means no maximum
+    """
+    amountpad = len(str(amount_fractions))
+    # --minlen has no default; comparing a length to None is a TypeError
+    minlen = minlen or 0
+    with ExitStack() as stack:
+        target_out_fp = {frnr: ([], stack.enter_context(
+            open('{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr,
+                                                  pad=amountpad), 'w')))
+            for frnr in range(1, amount_fractions + 1)}
+        decoy_out_fp = {frnr: ([], stack.enter_context(
+            open('decoy_{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr,
+                                                        pad=amountpad), 'w')))
+            for frnr in range(1, amount_fractions + 1)}
+        input_fp = stack.enter_context(open(pi_peptides_fn))
+        pepcount = 0
+        for line in input_fp:
+            if not line.strip():
+                # tolerate blank lines, e.g. at the end of the file
+                continue
+            accs, pep, pi = line.strip().split("\t")
+            pi = float(pi)
+            if maxlen and len(pep) > maxlen:
+                continue
+            elif len(pep) >= minlen:
+                pepcount += 1
+                # a C-terminal K or R stays at the end of the decoy
+                if pep[-1:] in ('K', 'R'):
+                    rev_pep = pep[::-1][1:] + pep[-1]
+                else:
+                    rev_pep = pep[::-1]
+                for i in locate_function(pi, bin_array):
+                    target_out_fp[i][0].append('>{}\n{}\n'.format(accs, pep))
+                    # write pseudoReversed decoy peptide at the same time
+                    decoy_out_fp[i][0].append('>decoy_{}\n{}\n'.format(
+                        accs, rev_pep))
+            if pepcount > 1000000:
+                # write in chunks to make it go faster
+                pepcount = 0
+                _flush_fractions(target_out_fp)
+                _flush_fractions(decoy_out_fp)
+        _flush_fractions(target_out_fp)
+        _flush_fractions(decoy_out_fp)
+
+
+if __name__ == '__main__':
+    main()
diff -r 000000000000 -r
34c5c95740a1 pi_db_split.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pi_db_split.xml Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,83 @@ + + into pI separated fractions + + numpy + python + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Creates a pI separated database collection from a pI-determined input + file of peptide/protein mappings. Outputs one db for target, one + for decoy. + + + diff -r 000000000000 -r 34c5c95740a1 test-data/decoy_splitdb_fr1.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/decoy_splitdb_fr1.fasta Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,4 @@ +>decoy_protein1 +TFSLFGCSIPNTNVEFSIKLFDVCLLLCNCLFSLIIMIYVII +>decoy_protein2 +TFSLFGCSIPNTNVEFSI diff -r 000000000000 -r 34c5c95740a1 test-data/decoy_splitdb_fr2.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/decoy_splitdb_fr2.fasta Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,4 @@ +>decoy_protein1 +LNLSKPILSEST +>decoy_protein3 +LFDVCLLLCNCLFSLIIMIYVIIK diff -r 000000000000 -r 34c5c95740a1 test-data/decoy_splitdb_fr3.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/decoy_splitdb_fr3.fasta Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,2 @@ +>decoy_protein2 +LFDVCLLLCNCLFSLIIMIYVIIKLWLFK diff -r 000000000000 -r 34c5c95740a1 test-data/peptable.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/peptable.txt Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,5 @@ +Sequence Filename FDR Fraction +TSESLIPKSLNL strip1_fr20 0.1 20 +FLWLKIIVYIM+15.994915IILSFLCNCLLLCVDFLK strip1_fr50 0.3 50 +IIVYIMIILSFLCNCLLLCVDFLK strip2_fr50 0 50 +IIVYIMIILSFLCNCLLLCVDFLKISFEVNTNPISCGFLSFT strip2_fr43 0.01 43 diff -r 000000000000 -r 34c5c95740a1 test-data/peptable_deltapi.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/peptable_deltapi.txt Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,5 @@ +Sequence Filename FDR Fraction Experimental pI Predicted pI Delta pI +TSESLIPKSLNL 
strip1_fr20 0.1 20 8.47 6.13955 2.330450000000001 +FLWLKIIVYIM+15.994915IILSFLCNCLLLCVDFLK strip1_fr50 0.3 50 8.860000000000001 7.6171 1.2429000000000014 +IIVYIMIILSFLCNCLLLCVDFLK strip2_fr50 0 50 8.11 5.99038 2.1196199999999994 +IIVYIMIILSFLCNCLLLCVDFLKISFEVNTNPISCGFLSFT strip2_fr43 0.01 43 7.83 4.55361 3.27639 diff -r 000000000000 -r 34c5c95740a1 test-data/peptable_missed_ox.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/peptable_missed_ox.txt Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,5 @@ +Sequence Filename FDR Fraction Experimental pI Predicted pI Delta pI +TSESLIPKSLNL strip1_fr20 0.1 20 8.47 6.13955 2.330450000000001 +FLWLKIIVYIM+15.994915IILSFLCNCLLLCVDFLK strip1_fr50 0.3 50 8.860000000000001 NA NA +IIVYIMIILSFLCNCLLLCVDFLK strip2_fr50 0 50 8.11 5.99038 2.1196199999999994 +IIVYIMIILSFLCNCLLLCVDFLKISFEVNTNPISCGFLSFT strip2_fr43 0.01 43 7.83 4.55361 3.27639 diff -r 000000000000 -r 34c5c95740a1 test-data/predicted_peptides.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/predicted_peptides.txt Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,5 @@ +TSESLIPKSLNL 6.13955 +FLWLKIIVYIMIILSFLCNCLLLCVDFLK 7.6171 +IIVYIMIILSFLCNCLLLCVDFLK 5.99038 +IIVYIMIILSFLCNCLLLCVDFLKISFEVNTNPISCGFLSFT 4.55361 +ISFEVNTNPISCGFLSFT 4.08563 diff -r 000000000000 -r 34c5c95740a1 test-data/predicted_peptides_to_split.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/predicted_peptides_to_split.txt Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,5 @@ +protein1 TSESLIPKSLNL 6.13955 +protein2 FLWLKIIVYIMIILSFLCNCLLLCVDFLK 7.6171 +protein3 IIVYIMIILSFLCNCLLLCVDFLK 5.99038 +protein1 IIVYIMIILSFLCNCLLLCVDFLKISFEVNTNPISCGFLSFT 4.55361 +protein2 ISFEVNTNPISCGFLSFT 4.08563 diff -r 000000000000 -r 34c5c95740a1 test-data/target_splitdb_fr1.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/target_splitdb_fr1.fasta Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,4 @@ +>protein1 +IIVYIMIILSFLCNCLLLCVDFLKISFEVNTNPISCGFLSFT +>protein2 +ISFEVNTNPISCGFLSFT 
diff -r 000000000000 -r 34c5c95740a1 test-data/target_splitdb_fr2.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/target_splitdb_fr2.fasta Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,4 @@ +>protein1 +TSESLIPKSLNL +>protein3 +IIVYIMIILSFLCNCLLLCVDFLK diff -r 000000000000 -r 34c5c95740a1 test-data/target_splitdb_fr3.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/target_splitdb_fr3.fasta Mon May 22 05:08:23 2017 -0400 @@ -0,0 +1,2 @@ +>protein2 +FLWLKIIVYIMIILSFLCNCLLLCVDFLK