comparison fastq_seq_count.py @ 0:27c39155d53b draft default tip

"planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/fastq_seq_count commit 7fcdca778df4012c93cb4aec26c2ff056817afee-dirty"
author jjohnson
date Tue, 26 Oct 2021 14:39:00 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:27c39155d53b
1 #!/usr/bin/env python
2 from __future__ import print_function
3
4 import argparse
5 import multiprocessing as mp
6 import subprocess
7 from operator import itemgetter
8
9
10 gresults = []
11
12
13 def revcomp(seq):
14 return seq.translate(str.maketrans('ACGTacgtRYMKrymkVBHDvbhd',
15 'TGCAtgcaYRKMyrkmBVDHbvdh'))[::-1]
16
17
18 def get_count(pat, fastq):
19 cmd = 'HITS=`grep -E \'' + pat + '\' ' + fastq + ' | wc -l` && echo $HITS'
20 process = subprocess.Popen(['bash', '-c', cmd],
21 stdout=subprocess.PIPE,
22 universal_newlines=True)
23 output = process.stdout.read().strip()
24 return int(output) if output else 0
25
26
27 def get_seq_count(path, name, ln, seq, label, strand):
28 # print('%s\t%d\t%s\t%s\t' % (path, ln, label, strand), file=sys.stderr)
29 cmd = 'HITS=`grep -E \'' + seq + '\' ' + path + ' | wc -l` && echo $HITS'
30 process = subprocess.Popen(['bash', '-c', cmd],
31 stdout=subprocess.PIPE,
32 universal_newlines=True)
33 output = process.stdout.read().strip()
34 n = int(output) if output else 0
35 return (path, name, ln, seq, label, strand, n)
36
37
38 def process_seq_count(result):
39 global results
40 global csh
41 gresults.append(result)
42 """
43 if csh:
44 (path, name, ln, seq, label, strand, n) = result
45 print('%s\t%d\t%s\t%s\t%d' % (path, ln, label, strand, n), file=cfh)
46 """
47
48
49 def main():
50 parser = argparse.ArgumentParser()
51 parser.add_argument('-p', '--path_file', required=True,
52 help='file files paths and names')
53 parser.add_argument('-i', '--query_file', required=True,
54 help='queries file')
55 parser.add_argument('-s', '--summary', required=True,
56 help='summary report file')
57 parser.add_argument('-n', '--counts', required=False,
58 help='counts')
59 parser.add_argument('-I', '--id_col', required=False, default=None,
60 help='identifier column indices')
61 parser.add_argument('-q', '--q_col', type=int, required=True,
62 help='column in queries for search sequence')
63 parser.add_argument('-Q', '--q_label', required=False, default='mutant',
64 help='label for search sequence column')
65 parser.add_argument('-c', '--c_col', type=int,
66 default=None, required=False,
67 help='column in queries for comparison sequence')
68 parser.add_argument('-C', '--c_label', required=False, default='normal',
69 help='label for comparison sequence column')
70 parser.add_argument('-r', '--reverse_complement',
71 action='store_true', default=False,
72 help='Also search for reverse complements')
73 parser.add_argument('-P', '--per_file', action='store_true', default=False,
74 help='report per file')
75 parser.add_argument('-T', '--threads', type=int, default=4, required=False,
76 help='threads')
77 args = parser.parse_args()
78
79 id_col_list = []
80 if args.id_col:
81 id_col_list = [int(x) for x in str(args.id_col).split(',')]
82 strands = ['+', '-'] if args.reverse_complement else ['+']
83 labels = [args.q_label]
84 if args.c_col is not None:
85 labels.append(args.c_label)
86 ids = dict()
87 qseqs = dict()
88 cseqs = dict()
89 files = dict()
90 with open(args.query_file, 'r') as fh:
91 for ln, line in enumerate(fh):
92 qnum = ln + 1
93 fields = str(line).rstrip().split('\t')
94 id = '\t'.join([str(qnum)] +
95 [fields[i] for i in id_col_list])
96 ids[ln] = id
97 qseqs[ln] = fields[args.q_col]
98 if args.c_col is not None:
99 cseqs[ln] = fields[args.c_col]
100
101 with open(str(args.path_file), 'r') as fh:
102 for ln, line in enumerate(fh):
103 path, name = str(line).rstrip().split('\t')
104 files[name] = path
105
106 queries = []
107 for name, path in files.items():
108 for i in range(len(qseqs)):
109 ln = i + 1
110 seq = qseqs[i]
111 label = args.q_label
112 strand = '+'
113 queries.append((path, name, ln, seq, label, strand))
114 if args.reverse_complement:
115 strand = '-'
116 queries.append((path, name, ln, revcomp(seq), label, strand))
117 if i in cseqs:
118 seq = cseqs[i]
119 label = args.c_label
120 strand = '+'
121 queries.append((path, name, ln, seq, label, strand))
122 if args.reverse_complement:
123 strand = '-'
124 queries.append((path, name, ln, revcomp(seq),
125 label, strand))
126
127 pool = mp.Pool(args.threads)
128 for i, query in enumerate(queries):
129 pool.apply_async(get_seq_count,
130 args=(query),
131 callback=process_seq_count)
132 pool.close()
133 pool.join()
134
135 if args.counts:
136 with open(args.counts, 'w') as cfh:
137 for result in gresults:
138 print('\t'.join([str(x) for x in result[1:]]), file=cfh)
139
140 # count ln name label
141 counts = dict()
142 for i in range(len(ids)):
143 counts[i] = dict()
144 for name, path in files.items():
145 counts[i][name] = dict()
146 for j, label in enumerate(labels):
147 counts[i][name][label] = dict()
148 for strand in strands:
149 counts[i][name][label][strand] = 0
150
151 results = sorted(gresults, key=itemgetter(2, 1, 4, 5))
152 for i, result in enumerate(results):
153 (path, name, ln, seq, label, strand, n) = result
154 counts[ln-1][name][label][strand] = n
155
156 with open(args.summary, 'w') as ofh:
157 labels = [args.q_label]
158 if args.c_col is not None:
159 labels.append(args.c_label)
160 for i in range(len(ids)):
161 tcnts = [0, 0]
162 print(ids[i], end='\t', file=ofh)
163 for name, path in files.items():
164 frac = 1.
165 cnts = [0, 0]
166 for j, label in enumerate(labels):
167 for strand in strands:
168 cnts[j] += counts[i][name][label][strand]
169 if args.per_file:
170 print(cnts[j], end='\t', file=ofh)
171 if args.per_file:
172 cnt = sum(cnts)
173 frac = float(cnts[0])/cnt if cnt else 1.0
174 print(frac, end='\t', file=ofh)
175 tcnts[0] += cnts[0]
176 tcnts[1] += cnts[1]
177 for k, label in enumerate(labels):
178 print(tcnts[k], end='\t', file=ofh)
179 tcnt = sum(tcnts)
180 frac = float(tcnts[0])/tcnt if tcnt else 1.0
181 print(frac, end='\n', file=ofh)
182
183
184 if __name__ == "__main__":
185 main()