Mercurial > repos > galaxyp > hirieftools
comparison pi_database_splitter.py @ 0:34c5c95740a1 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/pi_db_tools commit a58e2a324724f344a07d4499c860a5b2da06927d
author | galaxyp |
---|---|
date | Mon, 22 May 2017 05:08:23 -0400 |
parents | |
children | 8a30d6e5b97d |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:34c5c95740a1 |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 import argparse | |
4 from numpy import median | |
5 from contextlib import ExitStack | |
6 | |
7 | |
8 def main(): | |
9 if sys.argv[1:] == []: | |
10 sys.argv.append('-h') | |
11 args = parse_commandline() | |
12 locfun = {False: locatefraction, | |
13 True: reverse_locatefraction}[args.reverse] | |
14 # Column nrs should start from 0 | |
15 # If negative, -1 is last item in list, etc | |
16 if args.fdrcol > 0: | |
17 args.fdrcol -= 1 | |
18 if args.deltapicol > 0: | |
19 args.deltapicol -= 1 | |
20 pishift = get_pishift(args.train_peptable, args.fdrcol, args.deltapicol, | |
21 args.fdrcutoff, args.picutoff) | |
22 binarray = get_bin_array(args.fr_amount, args.fr_width, args.intercept, | |
23 args.tolerance, pishift) | |
24 write_fractions(args.pipeps, args.fr_amount, args.prefix, | |
25 binarray, locfun, args.minlen, args.maxlen) | |
26 | |
27 | |
28 def locatefraction(pep_pi, bins): | |
29 index = [] | |
30 for pibin in bins: | |
31 if pep_pi > pibin[2]: | |
32 continue | |
33 elif pep_pi >= pibin[1]: | |
34 index.append(pibin[0]) | |
35 else: | |
36 return index | |
37 return index | |
38 | |
39 | |
40 def reverse_locatefraction(pep_pi, bins): | |
41 index = [] | |
42 for pibin in bins: | |
43 if pep_pi < pibin[1]: | |
44 continue | |
45 elif pep_pi < pibin[2]: | |
46 index.append(pibin[0]) | |
47 else: | |
48 return index | |
49 return index | |
50 | |
51 | |
52 def parse_commandline(): | |
53 parser = argparse.ArgumentParser( | |
54 formatter_class=argparse.RawTextHelpFormatter) | |
55 parser.add_argument('-p', dest='train_peptable', help='Peptide table with ' | |
56 'peptides, FDR, and fraction numbers. Used to ' | |
57 'calculate pI shift. Leave emtpy for no shift. ' | |
58 'Tab separated file.') | |
59 parser.add_argument('--deltacol', dest='deltapicol', help='Delta pI column' | |
60 ' number in peptide table. First column is nr. 1. ' | |
61 'Negative number for counting from last col ' | |
62 '(-1 is last).', default=False, type=int) | |
63 parser.add_argument('--picutoff', dest='picutoff', | |
64 help='delta pI value to filter experimental peptides' | |
65 ' when calculating pi shift.', default=0.2, type=float) | |
66 parser.add_argument('--fdrcol', dest='fdrcol', help='FDR column number in ' | |
67 'peptide table. First column is nr. 1. Empty includes ' | |
68 'all peptides', default=False, type=int) | |
69 parser.add_argument('--fdrcutoff', dest='fdrcutoff', | |
70 help='FDR cutoff value to filter experimental peptides' | |
71 ' when calculating pi shift.', default=0, type=float) | |
72 parser.add_argument('-i', dest='pipeps', help='A tab-separated txt file ' | |
73 'with accession, peptide seq, pI value') | |
74 parser.add_argument('--prefix', dest='prefix', default='pisep', | |
75 help='Prefix for target/decoy output files') | |
76 parser.add_argument('--tolerance', dest='tolerance', | |
77 help='Strip fraction tolerance pi tolerance represents' | |
78 ' 2.5/97.5 percentile', type=float) | |
79 parser.add_argument('--amount', dest='fr_amount', | |
80 help='Strip fraction amount', type=int) | |
81 parser.add_argument('--reverse', dest='reverse', help='Strip is reversed', | |
82 action='store_const', const=True, default=False) | |
83 parser.add_argument('--intercept', dest='intercept', | |
84 help='pI Intercept of strip', type=float) | |
85 parser.add_argument('--width', dest='fr_width', | |
86 help='Strip fraction width in pI', type=float) | |
87 parser.add_argument('--minlen', dest='minlen', help='Minimal peptide length', | |
88 type=int) | |
89 parser.add_argument('--maxlen', dest='maxlen', help='Maximal peptide length', | |
90 type=int, default=False) | |
91 return parser.parse_args(sys.argv[1:]) | |
92 | |
93 | |
94 def get_pishift(peptable, fdrcol, deltapicol, fdrcutoff, delta_pi_cutoff): | |
95 delta_pis = [] | |
96 with open(peptable) as fp: | |
97 next(fp) # skip header | |
98 for line in fp: | |
99 line = line.strip('\n').split('\t') | |
100 if fdrcol: | |
101 try: | |
102 fdr = float(line[fdrcol]) | |
103 except ValueError: | |
104 continue | |
105 if fdr > fdrcutoff: | |
106 continue | |
107 try: | |
108 delta_pi = float(line[deltapicol]) | |
109 except ValueError: | |
110 continue | |
111 if delta_pi < delta_pi_cutoff: | |
112 delta_pis.append(delta_pi) | |
113 shift = median(delta_pis) | |
114 print('pI shift (median of delta pIs): {}'.format(shift)) | |
115 return shift | |
116 | |
117 | |
118 def get_bin_array(amount_fractions, fr_width, intercept, tolerance, pi_shift): | |
119 frnr = 1 | |
120 bin_array = [] | |
121 while frnr <= amount_fractions: | |
122 pi_center = fr_width * frnr + intercept | |
123 bin_left = pi_center - fr_width / 2 - tolerance - pi_shift | |
124 bin_right = pi_center + fr_width / 2 + tolerance - pi_shift | |
125 print('Bins in fraction', frnr, bin_left, bin_right) | |
126 bin_array.append((frnr, bin_left, bin_right)) | |
127 frnr += 1 | |
128 return bin_array | |
129 | |
130 | |
131 def write_fractions(pi_peptides_fn, amount_fractions, out_prefix, | |
132 bin_array, locate_function, minlen, maxlen): | |
133 amountpad = len(str(amount_fractions)) | |
134 with ExitStack() as stack: | |
135 target_out_fp = {frnr: ([], stack.enter_context( | |
136 open('{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr, | |
137 pad=amountpad), 'w'))) | |
138 for frnr in range(1, amount_fractions + 1)} | |
139 decoy_out_fp = {frnr: ([], stack.enter_context( | |
140 open('decoy_{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr, | |
141 pad=amountpad), 'w'))) | |
142 for frnr in range(1, amount_fractions + 1)} | |
143 input_fp = stack.enter_context(open(pi_peptides_fn)) | |
144 pepcount = 0 | |
145 for line in input_fp: | |
146 accs, pep, pi = line.strip().split("\t") | |
147 pi = float(pi) | |
148 if maxlen and len(pep) > maxlen: | |
149 continue | |
150 elif len(pep) >= minlen: | |
151 pepcount += 1 | |
152 if pep[-1] in {'K', 'R'}: | |
153 rev_pep = pep[::-1][1:] + pep[-1] | |
154 else: | |
155 rev_pep = pep[::-1] | |
156 for i in locate_function(pi, bin_array): | |
157 target_out_fp[i][0].append('>{}\n{}\n'.format(accs, pep)) | |
158 # write pseudoReversed decoy peptide at the same time | |
159 decoy_out_fp[i][0].append('>decoy_{}\n{}\n'.format( | |
160 accs, rev_pep)) | |
161 if pepcount > 1000000: | |
162 # write in chunks to make it go faster | |
163 pepcount = 0 | |
164 [fp.write(''.join(peps)) for peps, fp in | |
165 target_out_fp.values()] | |
166 [fp.write(''.join(peps)) for peps, fp in decoy_out_fp.values()] | |
167 target_out_fp = {fr: ([], pep_fp[1]) | |
168 for fr, pep_fp in target_out_fp.items()} | |
169 decoy_out_fp = {fr: ([], pep_fp[1]) | |
170 for fr, pep_fp in decoy_out_fp.items()} | |
171 [fp.write(''.join(peps)) for peps, fp in target_out_fp.values()] | |
172 [fp.write(''.join(peps)) for peps, fp in decoy_out_fp.values()] | |
173 | |
174 | |
175 if __name__ == '__main__': | |
176 main() |