3
|
1 #!/usr/bin/env python3
|
|
2 import re
|
|
3 import argparse
|
|
4 import csv
|
|
5 import os
|
|
6 import sys
|
|
7 import os.path
|
|
8 import subprocess
|
|
9 import shlex
|
|
10 import multiprocessing as mp
|
|
11 import tempfile
|
|
12 import itertools as it
|
|
13
|
|
14
|
|
15 def get_arguments():
|
|
16
|
|
17 parser = argparse.ArgumentParser()
|
|
18 parser.add_argument('-m',
|
|
19 '--max_cl',
|
|
20 default=300,
|
|
21 type=int,
|
|
22 help='Sets the maximum cluster number. Default = 300')
|
|
23 parser.add_argument('-b',
|
|
24 '--bitscore',
|
|
25 default=30,
|
|
26 type=int,
|
|
27 help='minimal bitscore to report')
|
|
28 parser.add_argument(
|
|
29 '-n',
|
|
30 '--nproc',
|
|
31 default=mp.cpu_count(),
|
|
32 type=int,
|
|
33 help='Sets the number of cpus to be used. Default = all available')
|
|
34 parser.add_argument('-c',
|
|
35 '--ChipSeq',
|
|
36 required=True,
|
|
37 help='Fasta file containing the Chip Sequences')
|
|
38 parser.add_argument('-i',
|
|
39 '--InputSeq',
|
|
40 required=True,
|
|
41 help='Fasta file containing the Input Sequences')
|
|
42 parser.add_argument(
|
|
43 '-o',
|
|
44 '--output',
|
|
45 required=False,
|
|
46 default='ChipSeqRatio.csv',
|
|
47 help=('Specify a name for the CSV file to which the'
|
|
48 ' output will be save to. Default: ChipSeqRatio.csv'))
|
|
49 parser.add_argument(
|
|
50 '-ht',
|
|
51 '--html',
|
|
52 required=False,
|
|
53 default='ChipSeqRatioReport',
|
|
54 help='Specify a name for the html report. Default : ChipSeqRatioReport')
|
|
55 parser.add_argument('-k',
|
|
56 '--Contigs',
|
|
57 required=True,
|
|
58 help='Contig file for blast')
|
|
59 args = parser.parse_args()
|
|
60 return args
|
|
61
|
|
62
|
|
63 def split(filename, num_chunks):
|
|
64 # splits a files into nproc files
|
|
65 files = []
|
|
66 temp_files = []
|
|
67
|
|
68 # creating a list with nproc temporary files
|
|
69 for f in range(num_chunks):
|
|
70 temp_files.append(tempfile.NamedTemporaryFile('w', delete=False))
|
|
71 with open(filename, 'r') as f:
|
|
72 end = os.path.getsize(filename)
|
|
73 for temp in it.cycle(temp_files):
|
|
74 # cycling indefenitly through temp files
|
|
75 ID = f.readline() # get sequence id
|
|
76 if ID[0] == '>':
|
|
77 temp.write(ID)
|
|
78 SEQP = f.tell() # get file pointer location
|
|
79 while f.readline()[0] is not '>':
|
|
80 f.seek(SEQP) # jump to last saved file pointer
|
|
81 temp.write(f.readline()) # write sequen
|
|
82 SEQP = f.tell() # overwrite last file pointer location
|
|
83 # break loop if file pointer reached the EOF
|
|
84 if (SEQP == end):
|
|
85 break
|
|
86 if (SEQP == end): # break loop if file pointer reached the EOF
|
|
87 break
|
|
88 f.seek(SEQP)
|
|
89
|
|
90 for f in range(num_chunks):
|
|
91 # save temp files names into list for further use
|
|
92 files.append(temp_files[f].name)
|
|
93 temp_files[f].close() # close temp files
|
|
94 return files
|
|
95
|
|
96
|
|
97 def blast(query, database, bitscore):
|
|
98 # blast a file for given arguments and save result to a list
|
|
99 print(query)
|
|
100 arguments = (
|
|
101 "blastn -task blastn -db {} -query {} "
|
|
102 "-evalue 1e-2 -gapopen 5 -gapextend 2 -word_size 11 -num_alignments 1"
|
|
103 " -penalty -3 -reward 2 -outfmt 6 -dust no").format(database, query)
|
|
104 cmd = shlex.split(arguments)
|
|
105 Blast_Output = [0 for x in range(max_cl + 1)]
|
|
106 ma = re.compile('(\S+)\tCL(\d+)Contig') # expression to check for
|
|
107 with subprocess.Popen(cmd,
|
|
108 stdout=subprocess.PIPE,
|
|
109 universal_newlines=True) as p:
|
|
110 for line in p.stdout:
|
|
111 if float(line.split()[11]) > bitscore:
|
|
112 gr = ma.match(line)
|
|
113 previous_query = ''
|
|
114 if gr:
|
|
115 if (gr.group(1) != previous_query):
|
|
116 if (int(gr.group(2)) > max_cl):
|
|
117 Blast_Output[0] = Blast_Output[0] + 1
|
|
118 else:
|
|
119 Blast_Output[int(gr.group(2))] = Blast_Output[
|
|
120 int(gr.group(2))] + 1
|
|
121 previous_query = gr.group(1)
|
|
122 return Blast_Output
|
|
123
|
|
124
|
|
125 def ReduceLists(x, y):
|
|
126 ''' reduces two lists into a 2-dim matrix '''
|
|
127 Matrix = [[0 for i in range(max_cl + 1)] for i in range(2)]
|
|
128 for i in range(len(x)):
|
|
129 for j in range(len(x[i])):
|
|
130 Matrix[0][j] = Matrix[0][j] + x[i][j]
|
|
131 for i in range(len(y)):
|
|
132 for j in range(len(y[i])):
|
|
133 Matrix[1][j] = Matrix[1][j] + y[i][j]
|
|
134 return Matrix
|
|
135
|
|
136
|
|
137 def fasta_size(fastafile):
|
|
138 with open(fastafile, 'r') as f:
|
|
139 s = 0
|
|
140 for i in f:
|
|
141 if i[0] == ">":
|
|
142 s += 1
|
|
143 return s
|
|
144
|
|
145
|
|
146 def makeblastdb(filename):
|
|
147 dbtmp = tempfile.NamedTemporaryFile()
|
|
148 cmd = [
|
|
149 'makeblastdb', '-in', filename, '-input_type', 'fasta', '-dbtype',
|
|
150 'nucl', '-out', dbtmp.name
|
|
151 ]
|
|
152 subprocess.call(cmd)
|
|
153 return dbtmp
|
|
154
|
|
155
|
|
156 if __name__ == "__main__":
|
|
157 args = get_arguments()
|
|
158 max_cl = args.max_cl
|
|
159 output = args.output
|
|
160 HTMLreport = args.html
|
|
161 contigs = args.Contigs
|
|
162
|
|
163 # Creation of database
|
|
164 db = makeblastdb(contigs)
|
|
165
|
|
166 inputN = fasta_size(args.InputSeq)
|
|
167 chipN = fasta_size(args.ChipSeq)
|
|
168
|
|
169 # Reading and distribution of data to temp files for multiprocessing
|
|
170 filesC = split(args.ChipSeq, args.nproc)
|
|
171 filesI = split(args.InputSeq, args.nproc)
|
|
172
|
|
173 # start of parallized blast
|
|
174 pool = mp.Pool(processes=args.nproc)
|
|
175 results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesC]
|
|
176 Cout = [p.get() for p in results]
|
|
177 results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesI]
|
|
178 Iout = [p.get() for p in results]
|
|
179
|
|
180 # Merging of blast output into a 2-dim matrix
|
|
181 Matrix = ReduceLists(Cout, Iout)
|
|
182
|
|
183 with open(args.output, 'w') as f:
|
|
184 print("Cluster", "Chip_Hits", "Input_Hits", sep='\t', file=f)
|
|
185 for hit in range(1, args.max_cl + 1):
|
|
186 print(hit, Matrix[0][hit], Matrix[1][hit], sep='\t', file=f)
|
|
187 Rarguments = "Rscript " + \
|
|
188 os.path.dirname(__file__) + "/ChipSeqRatioAnalysis.R"
|
|
189 # order is important - programmed by georg - this it realy ugly!
|
|
190 args = shlex.split(Rarguments)
|
|
191 args.append(output)
|
|
192 args.append(HTMLreport)
|
|
193 args.append(str(inputN))
|
|
194 args.append(str(chipN))
|
|
195 with subprocess.Popen(args, stderr=subprocess.PIPE) as p:
|
|
196 print("Creating HTML report")
|
|
197 stdout, stderr = p.communicate()
|
|
198 if (len(stderr) > 0):
|
|
199 print(stderr)
|
|
200 # cleanup
|
|
201 for i in filesC + filesI:
|
|
202 os.unlink(i)
|