Mercurial > repos > petr-novak > re_utils
comparison ChipSeqRatioAnalysis.py @ 3:e320ef2d105a draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 05 Sep 2019 09:04:56 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:ff658cf87f16 | 3:e320ef2d105a |
---|---|
1 #!/usr/bin/env python3 | |
2 import re | |
3 import argparse | |
4 import csv | |
5 import os | |
6 import sys | |
7 import os.path | |
8 import subprocess | |
9 import shlex | |
10 import multiprocessing as mp | |
11 import tempfile | |
12 import itertools as it | |
13 | |
14 | |
15 def get_arguments(): | |
16 | |
17 parser = argparse.ArgumentParser() | |
18 parser.add_argument('-m', | |
19 '--max_cl', | |
20 default=300, | |
21 type=int, | |
22 help='Sets the maximum cluster number. Default = 300') | |
23 parser.add_argument('-b', | |
24 '--bitscore', | |
25 default=30, | |
26 type=int, | |
27 help='minimal bitscore to report') | |
28 parser.add_argument( | |
29 '-n', | |
30 '--nproc', | |
31 default=mp.cpu_count(), | |
32 type=int, | |
33 help='Sets the number of cpus to be used. Default = all available') | |
34 parser.add_argument('-c', | |
35 '--ChipSeq', | |
36 required=True, | |
37 help='Fasta file containing the Chip Sequences') | |
38 parser.add_argument('-i', | |
39 '--InputSeq', | |
40 required=True, | |
41 help='Fasta file containing the Input Sequences') | |
42 parser.add_argument( | |
43 '-o', | |
44 '--output', | |
45 required=False, | |
46 default='ChipSeqRatio.csv', | |
47 help=('Specify a name for the CSV file to which the' | |
48 ' output will be save to. Default: ChipSeqRatio.csv')) | |
49 parser.add_argument( | |
50 '-ht', | |
51 '--html', | |
52 required=False, | |
53 default='ChipSeqRatioReport', | |
54 help='Specify a name for the html report. Default : ChipSeqRatioReport') | |
55 parser.add_argument('-k', | |
56 '--Contigs', | |
57 required=True, | |
58 help='Contig file for blast') | |
59 args = parser.parse_args() | |
60 return args | |
61 | |
62 | |
63 def split(filename, num_chunks): | |
64 # splits a files into nproc files | |
65 files = [] | |
66 temp_files = [] | |
67 | |
68 # creating a list with nproc temporary files | |
69 for f in range(num_chunks): | |
70 temp_files.append(tempfile.NamedTemporaryFile('w', delete=False)) | |
71 with open(filename, 'r') as f: | |
72 end = os.path.getsize(filename) | |
73 for temp in it.cycle(temp_files): | |
74 # cycling indefenitly through temp files | |
75 ID = f.readline() # get sequence id | |
76 if ID[0] == '>': | |
77 temp.write(ID) | |
78 SEQP = f.tell() # get file pointer location | |
79 while f.readline()[0] is not '>': | |
80 f.seek(SEQP) # jump to last saved file pointer | |
81 temp.write(f.readline()) # write sequen | |
82 SEQP = f.tell() # overwrite last file pointer location | |
83 # break loop if file pointer reached the EOF | |
84 if (SEQP == end): | |
85 break | |
86 if (SEQP == end): # break loop if file pointer reached the EOF | |
87 break | |
88 f.seek(SEQP) | |
89 | |
90 for f in range(num_chunks): | |
91 # save temp files names into list for further use | |
92 files.append(temp_files[f].name) | |
93 temp_files[f].close() # close temp files | |
94 return files | |
95 | |
96 | |
97 def blast(query, database, bitscore): | |
98 # blast a file for given arguments and save result to a list | |
99 print(query) | |
100 arguments = ( | |
101 "blastn -task blastn -db {} -query {} " | |
102 "-evalue 1e-2 -gapopen 5 -gapextend 2 -word_size 11 -num_alignments 1" | |
103 " -penalty -3 -reward 2 -outfmt 6 -dust no").format(database, query) | |
104 cmd = shlex.split(arguments) | |
105 Blast_Output = [0 for x in range(max_cl + 1)] | |
106 ma = re.compile('(\S+)\tCL(\d+)Contig') # expression to check for | |
107 with subprocess.Popen(cmd, | |
108 stdout=subprocess.PIPE, | |
109 universal_newlines=True) as p: | |
110 for line in p.stdout: | |
111 if float(line.split()[11]) > bitscore: | |
112 gr = ma.match(line) | |
113 previous_query = '' | |
114 if gr: | |
115 if (gr.group(1) != previous_query): | |
116 if (int(gr.group(2)) > max_cl): | |
117 Blast_Output[0] = Blast_Output[0] + 1 | |
118 else: | |
119 Blast_Output[int(gr.group(2))] = Blast_Output[ | |
120 int(gr.group(2))] + 1 | |
121 previous_query = gr.group(1) | |
122 return Blast_Output | |
123 | |
124 | |
125 def ReduceLists(x, y): | |
126 ''' reduces two lists into a 2-dim matrix ''' | |
127 Matrix = [[0 for i in range(max_cl + 1)] for i in range(2)] | |
128 for i in range(len(x)): | |
129 for j in range(len(x[i])): | |
130 Matrix[0][j] = Matrix[0][j] + x[i][j] | |
131 for i in range(len(y)): | |
132 for j in range(len(y[i])): | |
133 Matrix[1][j] = Matrix[1][j] + y[i][j] | |
134 return Matrix | |
135 | |
136 | |
137 def fasta_size(fastafile): | |
138 with open(fastafile, 'r') as f: | |
139 s = 0 | |
140 for i in f: | |
141 if i[0] == ">": | |
142 s += 1 | |
143 return s | |
144 | |
145 | |
146 def makeblastdb(filename): | |
147 dbtmp = tempfile.NamedTemporaryFile() | |
148 cmd = [ | |
149 'makeblastdb', '-in', filename, '-input_type', 'fasta', '-dbtype', | |
150 'nucl', '-out', dbtmp.name | |
151 ] | |
152 subprocess.call(cmd) | |
153 return dbtmp | |
154 | |
155 | |
156 if __name__ == "__main__": | |
157 args = get_arguments() | |
158 max_cl = args.max_cl | |
159 output = args.output | |
160 HTMLreport = args.html | |
161 contigs = args.Contigs | |
162 | |
163 # Creation of database | |
164 db = makeblastdb(contigs) | |
165 | |
166 inputN = fasta_size(args.InputSeq) | |
167 chipN = fasta_size(args.ChipSeq) | |
168 | |
169 # Reading and distribution of data to temp files for multiprocessing | |
170 filesC = split(args.ChipSeq, args.nproc) | |
171 filesI = split(args.InputSeq, args.nproc) | |
172 | |
173 # start of parallized blast | |
174 pool = mp.Pool(processes=args.nproc) | |
175 results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesC] | |
176 Cout = [p.get() for p in results] | |
177 results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesI] | |
178 Iout = [p.get() for p in results] | |
179 | |
180 # Merging of blast output into a 2-dim matrix | |
181 Matrix = ReduceLists(Cout, Iout) | |
182 | |
183 with open(args.output, 'w') as f: | |
184 print("Cluster", "Chip_Hits", "Input_Hits", sep='\t', file=f) | |
185 for hit in range(1, args.max_cl + 1): | |
186 print(hit, Matrix[0][hit], Matrix[1][hit], sep='\t', file=f) | |
187 Rarguments = "Rscript " + \ | |
188 os.path.dirname(__file__) + "/ChipSeqRatioAnalysis.R" | |
189 # order is important - programmed by georg - this it realy ugly! | |
190 args = shlex.split(Rarguments) | |
191 args.append(output) | |
192 args.append(HTMLreport) | |
193 args.append(str(inputN)) | |
194 args.append(str(chipN)) | |
195 with subprocess.Popen(args, stderr=subprocess.PIPE) as p: | |
196 print("Creating HTML report") | |
197 stdout, stderr = p.communicate() | |
198 if (len(stderr) > 0): | |
199 print(stderr) | |
200 # cleanup | |
201 for i in filesC + filesI: | |
202 os.unlink(i) |