annotate ChipSeqRatioAnalysis.py @ 7:89c5ba120b21 draft

Uploaded
author petr-novak
date Mon, 02 Dec 2019 08:41:43 -0500
parents e320ef2d105a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
2 import re
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
3 import argparse
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
4 import csv
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
5 import os
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
6 import sys
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
7 import os.path
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
8 import subprocess
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
9 import shlex
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
10 import multiprocessing as mp
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
11 import tempfile
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
12 import itertools as it
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
13
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
14
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
15 def get_arguments():
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
16
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
17 parser = argparse.ArgumentParser()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
18 parser.add_argument('-m',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
19 '--max_cl',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
20 default=300,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
21 type=int,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
22 help='Sets the maximum cluster number. Default = 300')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
23 parser.add_argument('-b',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
24 '--bitscore',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
25 default=30,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
26 type=int,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
27 help='minimal bitscore to report')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
28 parser.add_argument(
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
29 '-n',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
30 '--nproc',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
31 default=mp.cpu_count(),
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
32 type=int,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
33 help='Sets the number of cpus to be used. Default = all available')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
34 parser.add_argument('-c',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
35 '--ChipSeq',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
36 required=True,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
37 help='Fasta file containing the Chip Sequences')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
38 parser.add_argument('-i',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
39 '--InputSeq',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
40 required=True,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
41 help='Fasta file containing the Input Sequences')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
42 parser.add_argument(
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
43 '-o',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
44 '--output',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
45 required=False,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
46 default='ChipSeqRatio.csv',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
47 help=('Specify a name for the CSV file to which the'
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
48 ' output will be save to. Default: ChipSeqRatio.csv'))
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
49 parser.add_argument(
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
50 '-ht',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
51 '--html',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
52 required=False,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
53 default='ChipSeqRatioReport',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
54 help='Specify a name for the html report. Default : ChipSeqRatioReport')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
55 parser.add_argument('-k',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
56 '--Contigs',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
57 required=True,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
58 help='Contig file for blast')
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
59 args = parser.parse_args()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
60 return args
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
61
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
62
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
63 def split(filename, num_chunks):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
64 # splits a files into nproc files
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
65 files = []
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
66 temp_files = []
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
67
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
68 # creating a list with nproc temporary files
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
69 for f in range(num_chunks):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
70 temp_files.append(tempfile.NamedTemporaryFile('w', delete=False))
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
71 with open(filename, 'r') as f:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
72 end = os.path.getsize(filename)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
73 for temp in it.cycle(temp_files):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
74 # cycling indefenitly through temp files
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
75 ID = f.readline() # get sequence id
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
76 if ID[0] == '>':
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
77 temp.write(ID)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
78 SEQP = f.tell() # get file pointer location
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
79 while f.readline()[0] is not '>':
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
80 f.seek(SEQP) # jump to last saved file pointer
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
81 temp.write(f.readline()) # write sequen
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
82 SEQP = f.tell() # overwrite last file pointer location
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
83 # break loop if file pointer reached the EOF
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
84 if (SEQP == end):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
85 break
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
86 if (SEQP == end): # break loop if file pointer reached the EOF
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
87 break
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
88 f.seek(SEQP)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
89
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
90 for f in range(num_chunks):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
91 # save temp files names into list for further use
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
92 files.append(temp_files[f].name)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
93 temp_files[f].close() # close temp files
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
94 return files
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
95
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
96
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
97 def blast(query, database, bitscore):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
98 # blast a file for given arguments and save result to a list
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
99 print(query)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
100 arguments = (
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
101 "blastn -task blastn -db {} -query {} "
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
102 "-evalue 1e-2 -gapopen 5 -gapextend 2 -word_size 11 -num_alignments 1"
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
103 " -penalty -3 -reward 2 -outfmt 6 -dust no").format(database, query)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
104 cmd = shlex.split(arguments)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
105 Blast_Output = [0 for x in range(max_cl + 1)]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
106 ma = re.compile('(\S+)\tCL(\d+)Contig') # expression to check for
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
107 with subprocess.Popen(cmd,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
108 stdout=subprocess.PIPE,
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
109 universal_newlines=True) as p:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
110 for line in p.stdout:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
111 if float(line.split()[11]) > bitscore:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
112 gr = ma.match(line)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
113 previous_query = ''
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
114 if gr:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
115 if (gr.group(1) != previous_query):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
116 if (int(gr.group(2)) > max_cl):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
117 Blast_Output[0] = Blast_Output[0] + 1
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
118 else:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
119 Blast_Output[int(gr.group(2))] = Blast_Output[
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
120 int(gr.group(2))] + 1
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
121 previous_query = gr.group(1)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
122 return Blast_Output
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
123
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
124
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
125 def ReduceLists(x, y):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
126 ''' reduces two lists into a 2-dim matrix '''
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
127 Matrix = [[0 for i in range(max_cl + 1)] for i in range(2)]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
128 for i in range(len(x)):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
129 for j in range(len(x[i])):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
130 Matrix[0][j] = Matrix[0][j] + x[i][j]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
131 for i in range(len(y)):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
132 for j in range(len(y[i])):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
133 Matrix[1][j] = Matrix[1][j] + y[i][j]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
134 return Matrix
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
135
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
136
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
137 def fasta_size(fastafile):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
138 with open(fastafile, 'r') as f:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
139 s = 0
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
140 for i in f:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
141 if i[0] == ">":
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
142 s += 1
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
143 return s
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
144
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
145
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
146 def makeblastdb(filename):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
147 dbtmp = tempfile.NamedTemporaryFile()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
148 cmd = [
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
149 'makeblastdb', '-in', filename, '-input_type', 'fasta', '-dbtype',
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
150 'nucl', '-out', dbtmp.name
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
151 ]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
152 subprocess.call(cmd)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
153 return dbtmp
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
154
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
155
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
156 if __name__ == "__main__":
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
157 args = get_arguments()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
158 max_cl = args.max_cl
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
159 output = args.output
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
160 HTMLreport = args.html
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
161 contigs = args.Contigs
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
162
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
163 # Creation of database
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
164 db = makeblastdb(contigs)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
165
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
166 inputN = fasta_size(args.InputSeq)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
167 chipN = fasta_size(args.ChipSeq)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
168
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
169 # Reading and distribution of data to temp files for multiprocessing
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
170 filesC = split(args.ChipSeq, args.nproc)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
171 filesI = split(args.InputSeq, args.nproc)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
172
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
173 # start of parallized blast
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
174 pool = mp.Pool(processes=args.nproc)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
175 results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesC]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
176 Cout = [p.get() for p in results]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
177 results = [pool.apply_async(blast, args=(f, db.name, args.bitscore)) for f in filesI]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
178 Iout = [p.get() for p in results]
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
179
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
180 # Merging of blast output into a 2-dim matrix
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
181 Matrix = ReduceLists(Cout, Iout)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
182
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
183 with open(args.output, 'w') as f:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
184 print("Cluster", "Chip_Hits", "Input_Hits", sep='\t', file=f)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
185 for hit in range(1, args.max_cl + 1):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
186 print(hit, Matrix[0][hit], Matrix[1][hit], sep='\t', file=f)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
187 Rarguments = "Rscript " + \
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
188 os.path.dirname(__file__) + "/ChipSeqRatioAnalysis.R"
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
189 # order is important - programmed by georg - this it realy ugly!
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
190 args = shlex.split(Rarguments)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
191 args.append(output)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
192 args.append(HTMLreport)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
193 args.append(str(inputN))
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
194 args.append(str(chipN))
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
195 with subprocess.Popen(args, stderr=subprocess.PIPE) as p:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
196 print("Creating HTML report")
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
197 stdout, stderr = p.communicate()
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
198 if (len(stderr) > 0):
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
199 print(stderr)
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
200 # cleanup
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
201 for i in filesC + filesI:
e320ef2d105a Uploaded
petr-novak
parents:
diff changeset
202 os.unlink(i)