comparison ChipSeqRatioAnalysis.py @ 3:e320ef2d105a draft

Uploaded
author petr-novak
date Thu, 05 Sep 2019 09:04:56 -0400
parents
children
comparison
equal deleted inserted replaced
2:ff658cf87f16 3:e320ef2d105a
1 #!/usr/bin/env python3
2 import re
3 import argparse
4 import csv
5 import os
6 import sys
7 import os.path
8 import subprocess
9 import shlex
10 import multiprocessing as mp
11 import tempfile
12 import itertools as it
13
14
15 def get_arguments():
16
17 parser = argparse.ArgumentParser()
18 parser.add_argument('-m',
19 '--max_cl',
20 default=300,
21 type=int,
22 help='Sets the maximum cluster number. Default = 300')
23 parser.add_argument('-b',
24 '--bitscore',
25 default=30,
26 type=int,
27 help='minimal bitscore to report')
28 parser.add_argument(
29 '-n',
30 '--nproc',
31 default=mp.cpu_count(),
32 type=int,
33 help='Sets the number of cpus to be used. Default = all available')
34 parser.add_argument('-c',
35 '--ChipSeq',
36 required=True,
37 help='Fasta file containing the Chip Sequences')
38 parser.add_argument('-i',
39 '--InputSeq',
40 required=True,
41 help='Fasta file containing the Input Sequences')
42 parser.add_argument(
43 '-o',
44 '--output',
45 required=False,
46 default='ChipSeqRatio.csv',
47 help=('Specify a name for the CSV file to which the'
48 ' output will be save to. Default: ChipSeqRatio.csv'))
49 parser.add_argument(
50 '-ht',
51 '--html',
52 required=False,
53 default='ChipSeqRatioReport',
54 help='Specify a name for the html report. Default : ChipSeqRatioReport')
55 parser.add_argument('-k',
56 '--Contigs',
57 required=True,
58 help='Contig file for blast')
59 args = parser.parse_args()
60 return args
61
62
63 def split(filename, num_chunks):
64 # splits a files into nproc files
65 files = []
66 temp_files = []
67
68 # creating a list with nproc temporary files
69 for f in range(num_chunks):
70 temp_files.append(tempfile.NamedTemporaryFile('w', delete=False))
71 with open(filename, 'r') as f:
72 end = os.path.getsize(filename)
73 for temp in it.cycle(temp_files):
74 # cycling indefenitly through temp files
75 ID = f.readline() # get sequence id
76 if ID[0] == '>':
77 temp.write(ID)
78 SEQP = f.tell() # get file pointer location
79 while f.readline()[0] is not '>':
80 f.seek(SEQP) # jump to last saved file pointer
81 temp.write(f.readline()) # write sequen
82 SEQP = f.tell() # overwrite last file pointer location
83 # break loop if file pointer reached the EOF
84 if (SEQP == end):
85 break
86 if (SEQP == end): # break loop if file pointer reached the EOF
87 break
88 f.seek(SEQP)
89
90 for f in range(num_chunks):
91 # save temp files names into list for further use
92 files.append(temp_files[f].name)
93 temp_files[f].close() # close temp files
94 return files
95
96
97 def blast(query, database, bitscore):
98 # blast a file for given arguments and save result to a list
99 print(query)
100 arguments = (
101 "blastn -task blastn -db {} -query {} "
102 "-evalue 1e-2 -gapopen 5 -gapextend 2 -word_size 11 -num_alignments 1"
103 " -penalty -3 -reward 2 -outfmt 6 -dust no").format(database, query)
104 cmd = shlex.split(arguments)
105 Blast_Output = [0 for x in range(max_cl + 1)]
106 ma = re.compile('(\S+)\tCL(\d+)Contig') # expression to check for
107 with subprocess.Popen(cmd,
108 stdout=subprocess.PIPE,
109 universal_newlines=True) as p:
110 for line in p.stdout:
111 if float(line.split()[11]) > bitscore:
112 gr = ma.match(line)
113 previous_query = ''
114 if gr:
115 if (gr.group(1) != previous_query):
116 if (int(gr.group(2)) > max_cl):
117 Blast_Output[0] = Blast_Output[0] + 1
118 else:
119 Blast_Output[int(gr.group(2))] = Blast_Output[
120 int(gr.group(2))] + 1
121 previous_query = gr.group(1)
122 return Blast_Output
123
124
125 def ReduceLists(x, y):
126 ''' reduces two lists into a 2-dim matrix '''
127 Matrix = [[0 for i in range(max_cl + 1)] for i in range(2)]
128 for i in range(len(x)):
129 for j in range(len(x[i])):
130 Matrix[0][j] = Matrix[0][j] + x[i][j]
131 for i in range(len(y)):
132 for j in range(len(y[i])):
133 Matrix[1][j] = Matrix[1][j] + y[i][j]
134 return Matrix
135
136
137 def fasta_size(fastafile):
138 with open(fastafile, 'r') as f:
139 s = 0
140 for i in f:
141 if i[0] == ">":
142 s += 1
143 return s
144
145
146 def makeblastdb(filename):
147 dbtmp = tempfile.NamedTemporaryFile()
148 cmd = [
149 'makeblastdb', '-in', filename, '-input_type', 'fasta', '-dbtype',
150 'nucl', '-out', dbtmp.name
151 ]
152 subprocess.call(cmd)
153 return dbtmp
154
155
156 if __name__ == "__main__":
157 args = get_arguments()
158 max_cl = args.max_cl
159 output = args.output
160 HTMLreport = args.html
161 contigs = args.Contigs
162
163 # Creation of database
164 db = makeblastdb(contigs)
165
166 inputN = fasta_size(args.InputSeq)
167 chipN = fasta_size(args.ChipSeq)
168
169 # Reading and distribution of data to temp files for multiprocessing
170 filesC = split(args.ChipSeq, args.nproc)
171 filesI = split(args.InputSeq, args.nproc)
172
173 # start of parallized blast
174 pool = mp.Pool(processes=args.nproc)
175 results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesC]
176 Cout = [p.get() for p in results]
177 results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesI]
178 Iout = [p.get() for p in results]
179
180 # Merging of blast output into a 2-dim matrix
181 Matrix = ReduceLists(Cout, Iout)
182
183 with open(args.output, 'w') as f:
184 print("Cluster", "Chip_Hits", "Input_Hits", sep='\t', file=f)
185 for hit in range(1, args.max_cl + 1):
186 print(hit, Matrix[0][hit], Matrix[1][hit], sep='\t', file=f)
187 Rarguments = "Rscript " + \
188 os.path.dirname(__file__) + "/ChipSeqRatioAnalysis.R"
189 # order is important - programmed by georg - this it realy ugly!
190 args = shlex.split(Rarguments)
191 args.append(output)
192 args.append(HTMLreport)
193 args.append(str(inputN))
194 args.append(str(chipN))
195 with subprocess.Popen(args, stderr=subprocess.PIPE) as p:
196 print("Creating HTML report")
197 stdout, stderr = p.communicate()
198 if (len(stderr) > 0):
199 print(stderr)
200 # cleanup
201 for i in filesC + filesI:
202 os.unlink(i)