comparison RepEnrich.py @ 0:f6f0f1e5e940 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
author artbio
date Wed, 02 Aug 2017 05:17:29 -0400
parents
children 15e3e29f310e
1 #!/usr/bin/env python
2
3 import argparse
4 import csv
5 import os
6 import shlex
7
8 import shutil
9 import subprocess
10 import sys
11
12 import numpy
13
14
15 parser = argparse.ArgumentParser(description='''
16 Part II: Conducting the alignments to the pseudogenomes. Before\
17 doing this step you will require 1) a bamfile of the unique\
18 alignments with index 2) a fastq file of the reads mapping to\
19 more than one location. These files can be obtained using the\
20 following bowtie options [EXAMPLE: bowtie -S -m 1\
21 --max multimap.fastq mm9 mate1_reads.fastq] Once you have the\
22 unique alignment bamfile and the reads mapping to more than one\
23 location in a fastq file you can run this step. EXAMPLE: python\
24 RepEnrich.py\
25 /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\
26 /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/\
27 HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\
28 /users/nneretti/data/annotation/hg19/setup_folder\
29 HeLa_InputChIPseq_Rep1_multimap.fastq\
30 HeLa_InputChIPseq_Rep1.bam''')
31 parser.add_argument('--version', action='version', version='%(prog)s 0.1')
32 parser.add_argument('annotation_file', action='store',
33 metavar='annotation_file',
34 help='List RepeatMasker.org annotation file for your\
35 organism. The file may be downloaded from the\
36 RepeatMasker.org website. Example:\
37 /data/annotation/hg19/hg19_repeatmasker.txt')
38 parser.add_argument('outputfolder', action='store', metavar='outputfolder',
39 help='List folder to contain results.\
40 Example: /outputfolder')
41 parser.add_argument('outputprefix', action='store', metavar='outputprefix',
42 help='Enter prefix name for data.\
43 Example: HeLa_InputChIPseq_Rep1')
44 parser.add_argument('setup_folder', action='store', metavar='setup_folder',
45 help='List folder that contains the repeat element\
46 pseudogenomes.\
47 Example: /data/annotation/hg19/setup_folder')
48 parser.add_argument('fastqfile', action='store', metavar='fastqfile',
49 help='Enter file for the fastq reads that map to multiple\
50 locations. Example: /data/multimap.fastq')
51 parser.add_argument('alignment_bam', action='store', metavar='alignment_bam',
52 help='Enter bamfile output for reads that map uniquely.\
53 Example /bamfiles/old.bam')
54 parser.add_argument('--pairedend', action='store', dest='pairedend',
55 default='FALSE',
56 help='Designate this option for paired-end sequencing.\
57 Default FALSE change to TRUE')
58 parser.add_argument('--collapserepeat', action='store', dest='collapserepeat',
59 metavar='collapserepeat', default='Simple_repeat',
60 help='Designate this option to generate a collapsed repeat\
61 type. Uncollapsed output is generated in addition to\
62 collapsed repeat type. Simple_repeat is the default to\
63 simplify downstream analysis. You can change the\
64 default to another repeat name to collapse a\
65 separate specific repeat instead or if the name of\
66 Simple_repeat is different for your organism.\
67 Default Simple_repeat')
68 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
69 metavar='fastqfile2', default='none',
70 help='Enter fastqfile2 when using paired-end option.\
71 Default none')
72 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
73 default="1", type=int,
74 help='Enter available cpus per node. The more cpus the\
75 faster RepEnrich performs. RepEnrich is designed to\
76 only work on one node. Default: "1"')
77 parser.add_argument('--allcountmethod', action='store', dest='allcountmethod',
78 metavar='allcountmethod', default="FALSE",
79 help='By default the pipeline only outputs the fraction\
80 count method, considered to be the best way to\
81 count multimapped reads. Changing this option will\
82 include the unique count method, a conservative\
83 count, and the total count method, a liberal\
84 counting strategy. Our evaluation of simulated data\
85 indicated fraction counting is best.\
86 Default = FALSE, change to TRUE')
87 parser.add_argument('--is_bed', action='store', dest='is_bed',
88 metavar='is_bed', default='FALSE',
89 help='Is the annotation file a bed file?\
90 This is also a compatible format. The file needs to\
91 be a tab-separated bed with optional fields.\
92 Ex. format: chr\tstart\tend\tName_element\tclass\
93 \tfamily. The class and family should be identical to\
94 name_element if not applicable.\
95 Default FALSE change to TRUE')
96 args = parser.parse_args()
97
98 # parameters
99 annotation_file = args.annotation_file
100 outputfolder = args.outputfolder
101 outputfile_prefix = args.outputprefix
102 setup_folder = args.setup_folder
103 repeat_bed = setup_folder + os.path.sep + 'repnames.bed'
104 unique_mapper_bam = args.alignment_bam
105 fastqfile_1 = args.fastqfile
106 fastqfile_2 = args.fastqfile2
107 cpus = args.cpus
108 b_opt = "-k 1 -p 1 --quiet"
109 simple_repeat = args.collapserepeat
110 paired_end = args.pairedend
111 allcountmethod = args.allcountmethod
112 is_bed = args.is_bed
113
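The help text above assumes the unique-alignment BAM and the multimap FASTQ were produced beforehand with bowtie. A minimal sketch of that preparatory call, mirroring the options quoted in the description; the index name hg19 and the read/output file names are placeholders, not values used by this script:

import shlex
import subprocess

# Illustrative only: suppress multi-mapping reads (-m 1), writing them to
# multimap.fastq (--max) for this script, and keep unique alignments in SAM.
prep = ("bowtie -S -m 1 --max multimap.fastq "
        "hg19 mate1_reads.fastq unique.sam")
subprocess.check_call(shlex.split(prep))
# unique.sam would then be converted to a sorted, indexed BAM (for example
# with samtools) and passed as alignment_bam.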
114 ##############################################################################
115 # check that the programs we need are available
116 try:
117 subprocess.call(shlex.split("coverageBed -h"),
118 stdout=open(os.devnull, 'wb'),
119 stderr=open(os.devnull, 'wb'))
120 subprocess.call(shlex.split("bowtie --version"),
121 stdout=open(os.devnull, 'wb'),
122 stderr=open(os.devnull, 'wb'))
123 except OSError:
124 print("Error: Bowtie or BEDTools not loaded")
125 raise
126
127 ##############################################################################
128 # define a csv reader that reads delimiter-separated files
129 print('Preparing for analysis using RepEnrich...')
130 csv.field_size_limit(sys.maxsize)
131
132
133 def import_text(filename, separator):
134 for line in csv.reader(open(filename), delimiter=separator,
135 skipinitialspace=True):
136 if line:
137 yield line
138
139
140 ##############################################################################
141 # build dictionaries mapping repeat names to their class and family
142 if is_bed == "FALSE":
143 repeatclass = {}
144 repeatfamily = {}
145 fin = import_text(annotation_file, ' ')
146 x = 0
147 for line in fin:
148 if x > 2:
149 classfamily = []
150 classfamily = line[10].split(os.path.sep)
151 line9 = line[9].replace("(", "_").replace(
152 ")", "_").replace("/", "_")
153 repeatclass[line9] = classfamily[0]
154 if len(classfamily) == 2:
155 repeatfamily[line9] = classfamily[1]
156 else:
157 repeatfamily[line9] = classfamily[0]
158 x += 1
159 if is_bed == "TRUE":
160 repeatclass = {}
161 repeatfamily = {}
162 fin = open(annotation_file, 'r')
163 for line in fin:
164 line = line.strip('\n')
165 line = line.split('\t')
166 theclass = line[4]
167 thefamily = line[5]
168 line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
169 repeatclass[line3] = theclass
170 repeatfamily[line3] = thefamily
171 fin.close()
172
173 ##############################################################################
174 # build the list of repeats, initializing dictionaries for downstream analysis
175 fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t')
176 repeat_key = {}
177 rev_repeat_key = {}
178 repeat_list = []
179 reptotalcounts = {}
180 classfractionalcounts = {}
181 familyfractionalcounts = {}
182 classtotalcounts = {}
183 familytotalcounts = {}
184 reptotalcounts_simple = {}
185 fractionalcounts = {}
186 i = 0
187 for line in fin:
188 reptotalcounts[line[0]] = 0
189 fractionalcounts[line[0]] = 0
190 if line[0] in repeatclass:
191 classtotalcounts[repeatclass[line[0]]] = 0
192 classfractionalcounts[repeatclass[line[0]]] = 0
193 if line[0] in repeatfamily:
194 familytotalcounts[repeatfamily[line[0]]] = 0
195 familyfractionalcounts[repeatfamily[line[0]]] = 0
196 if line[0] in repeatfamily:
197 if repeatfamily[line[0]] == simple_repeat:
198 reptotalcounts_simple[simple_repeat] = 0
199 else:
200 reptotalcounts_simple[line[0]] = 0
201 repeat_list.append(line[0])
202 repeat_key[line[0]] = int(line[1])
203 rev_repeat_key[int(line[1])] = line[0]
204 fin.close()
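From the way it is parsed above, repgenomes_key.txt appears to be a two-column, tab-separated table mapping each repeat pseudogenome name to an integer index; repeat_key and rev_repeat_key hold the two directions of that mapping. A minimal sketch with invented repeat names:

# Sketch (invented names): the two key dictionaries built above from a
# hypothetical repgenomes_key.txt containing lines such as 'Alu<TAB>0'.
repeat_key_example = {'Alu': 0, 'L1': 1, 'MIR': 2}
rev_repeat_key_example = {v: k for k, v in repeat_key_example.items()}
assert rev_repeat_key_example[1] == 'L1'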
205 ##############################################################################
206 # map the repeats to the pseudogenomes:
207 if not os.path.exists(outputfolder):
208 os.mkdir(outputfolder)
209 ##############################################################################
210 # Conduct the region sorting
211 print('Conducting region sorting on unique mapping reads....')
212 fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt'
213 with open(fileout, 'w') as stdout:
214 command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " +
215 setup_folder + os.path.sep + 'repnames.bed')
216 p = subprocess.Popen(command, stdout=stdout)
217 p.communicate()
218 stdout.close()
219 filein = open(outputfolder + os.path.sep + outputfile_prefix
220 + '_regionsorter.txt', 'r')
221 counts = {}
222 sumofrepeatreads = 0
223 for line in filein:
224 line = line.split('\t')
225 if not str(repeat_key[line[3]]) in counts:
226 counts[str(repeat_key[line[3]])] = 0
227 counts[str(repeat_key[line[3]])] += int(line[4])
228 sumofrepeatreads += int(line[4])
229 print('Identified ' + str(sumofrepeatreads) +
230 ' unique reads that mapped to repeats.')
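coverageBed reports each interval of repnames.bed together with the number of overlapping reads from the unique-mapper BAM; as the loop above assumes, field 4 holds the repeat name and field 5 the read count. A minimal sketch of that accumulation on an invented output line:

# Sketch (invented values): one coverageBed output line reduced the same way
# as the loop above.
example = 'chr1\t100\t600\tAlu\t12\t450\t500\t0.9'.split('\t')
example_key = {'Alu': 0}                      # stands in for repeat_key
example_counts = {str(example_key[example[3]]): int(example[4])}
# example_counts == {'0': 12}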
231 ##############################################################################
232 if paired_end == 'TRUE':
233 if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
234 os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
235 if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
236 os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie')
237 folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
238 folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie'
239 ##############################################################################
240 print("Processing repeat pseudogenomes...")
241 ps = []
242 psb = []
243 ticker = 0
244 for metagenome in repeat_list:
245 metagenomepath = setup_folder + os.path.sep + metagenome
246 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
247 file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
248 with open(file1, 'w') as stdout:
249 command = shlex.split("bowtie " + b_opt + " " +
250 metagenomepath + " " + fastqfile_1)
251 p = subprocess.Popen(command, stdout=stdout)
252 with open(file2, 'w') as stdout:
253 command = shlex.split("bowtie " + b_opt + " " +
254 metagenomepath + " " + fastqfile_2)
255 pp = subprocess.Popen(command, stdout=stdout)
256 ps.append(p)
257 ticker += 1
258 psb.append(pp)
259 ticker += 1
260 if ticker >= cpus:
261 for p in ps:
262 p.communicate()
263 for p in psb:
264 p.communicate()
265 ticker = 0
266 psb = []
267 ps = []
268 if len(ps) > 0:
269 for p in ps + psb:
270 p.communicate()
271 stdout.close()
272
273 ##############################################################################
274 # combine the output from both read pairs:
275 print('Sorting and combining the output for both read pairs...')
276 if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
277 os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
278 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
279 for metagenome in repeat_list:
280 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
281 file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
282 fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
283 with open(fileout, 'w') as stdout:
284 p1 = subprocess.Popen(['cat', file1, file2],
285 stdout=subprocess.PIPE)
286 p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout,
287 stdout=subprocess.PIPE)
288 p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
289 stdout=subprocess.PIPE)
290 p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
291 stdout=subprocess.PIPE)
292 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
293 p5.communicate()
294 stdout.close()
295 print('completed ...')
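The cat | cut | cut | sort | uniq pipeline above reduces each bowtie hit to a bare read identifier, dropping anything after the first space and any /1 or /2 mate suffix, then deduplicates so a read pair contributes at most one entry per pseudogenome. A rough pure-Python equivalent, for illustration only (the script itself relies on the shell tools):

def collapse_read_ids(bowtie_files):
    # Gather the deduplicated read identifiers from one or two bowtie
    # output files, mimicking cat | cut -f1 -d' ' | cut -f1 -d/ | sort | uniq.
    ids = set()
    for path in bowtie_files:
        with open(path) as handle:
            for hit in handle:
                name = hit.split('\t')[0]          # bowtie column 1: read name
                ids.add(name.split(' ')[0].split('/')[0])
    return sorted(ids)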
296 ##############################################################################
297 if paired_end == 'FALSE':
298 if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
299 os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
300 folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
301 ##############################################################################
302 ps = []
303 ticker = 0
304 print("Processing repeat pseudogenomes...")
305 for metagenome in repeat_list:
306 metagenomepath = setup_folder + os.path.sep + metagenome
307 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
308 with open(file1, 'w') as stdout:
309 command = shlex.split("bowtie " + b_opt + " " +
310 metagenomepath + " " + fastqfile_1)
311 p = subprocess.Popen(command, stdout=stdout)
312 ps.append(p)
313 ticker += 1
314 if ticker == cpus:
315 for p in ps:
316 p.communicate()
317 ticker = 0
318 ps = []
319 if len(ps) > 0:
320 for p in ps:
321 p.communicate()
322 stdout.close()
323
324 ##############################################################################
325 # sort the bowtie output for the single-end reads:
326 print('Sorting the bowtie output....')
327 if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
328 os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
329 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
330 for metagenome in repeat_list:
331 file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
332 fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
333 with open(fileout, 'w') as stdout:
334 p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
335 p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
336 stdout=subprocess.PIPE)
337 p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
338 stdout=subprocess.PIPE)
339 p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
340 stdout=subprocess.PIPE)
341 p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
342 p5.communicate()
343 stdout.close()
344 print('completed ...')
345
346 ##############################################################################
347 # build a dictionary of repeat keys for all reads
348 print('Writing and processing intermediate files...')
349 sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
350 readid = {}
351 sumofrepeatreads = 0
352 for rep in repeat_list:
353 for data in import_text(sorted_bowtie + os.path.sep +
354 rep + '.bowtie', '\t'):
355 readid[data[0]] = ''
356 for rep in repeat_list:
357 for data in import_text(sorted_bowtie + os.path.sep
358 + rep + '.bowtie', '\t'):
359 readid[data[0]] += str(repeat_key[rep]) + str(',')
360 for subfamilies in readid.values():
361 if subfamilies not in counts:
362 counts[subfamilies] = 0
363 counts[subfamilies] += 1
364 sumofrepeatreads += 1
365 del readid
366 print('Identified ' + str(sumofrepeatreads) +
367 ' reads that mapped to repeats (unique and multi-mapping reads).')
368
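At this point counts maps each distinct combination of pseudogenomes hit by a read (a comma-separated string of repeat indices such as '3,7,') to the number of reads sharing that combination; the fractional counting below gives each of the N repeats in a combination 1/N of those reads. A small worked sketch with invented indices:

# Sketch (invented indices): a read aligning to repeats 3 and 7 is recorded
# under the key '3,7,' and later contributes 0.5 to each of the two repeats.
example_counts = {'3,': 10, '3,7,': 4}
fractions = {}
for combo, n in example_counts.items():
    members = combo.strip(',').split(',')
    for idx in members:
        fractions[idx] = fractions.get(idx, 0) + n / float(len(members))
# fractions == {'3': 12.0, '7': 2.0}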
369 ##############################################################################
370 print("Conducting final calculations...")
371
372
373 def convert(x):
374 '''
375 convert a comma-separated string of numeric repeat indices into a
376 combined string of repeat names separated by the path separator
377 '''
378 x = x.strip(',')
379 x = x.split(',')
380 global repname
381 repname = ""
382 for i in x:
383 repname = repname + os.path.sep + rev_repeat_key[int(i)]
384
385
386 # building the total counts for repeat element enrichment...
387 for x in counts.keys():
388 count = counts[x]
389 x = x.strip(',')
390 x = x.split(',')
391 for i in x:
392 reptotalcounts[rev_repeat_key[int(i)]] += int(count)
393 # building the fractional counts for repeat element enrichment...
394 for x in counts.keys():
395 count = counts[x]
396 x = x.strip(',')
397 x = x.split(',')
398 splits = len(x)
399 for i in x:
400 fractionalcounts[rev_repeat_key[int(i)]] += float(
401 numpy.divide(float(count), float(splits)))
402 # building categorized table of repeat element enrichment...
403 repcounts = {}
404 repcounts['other'] = 0
405 for key in counts.keys():
406 convert(key)
407 repcounts[repname] = counts[key]
408 # building the total counts for class enrichment...
409 for key in reptotalcounts.keys():
410 classtotalcounts[repeatclass[key]] += reptotalcounts[key]
411 # building total counts for family enrichment...
412 for key in reptotalcounts.keys():
413 familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
414 # building unique counts table
415 repcounts2 = {}
416 for rep in repeat_list:
417 if "/" + rep in repcounts:
418 repcounts2[rep] = repcounts["/" + rep]
419 else:
420 repcounts2[rep] = 0
421 # building the fractional counts for class enrichment...
422 for key in fractionalcounts.keys():
423 classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
424 # building fractional counts for family enrichment...
425 for key in fractionalcounts.keys():
426 familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]
427
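The loops above roll the per-repeat totals and fractional counts up to RepeatMasker class and family level by summing over every repeat assigned to the same class or family. A minimal sketch of that roll-up with invented repeat names:

# Sketch (invented names): class-level roll-up of fractional counts, as done
# above; family totals are built the same way.
fractional = {'AluY': 12.0, 'AluSx': 3.0, 'L1PA2': 5.0}
rep_class = {'AluY': 'SINE', 'AluSx': 'SINE', 'L1PA2': 'LINE'}
class_totals = {}
for name, value in fractional.items():
    class_totals[rep_class[name]] = class_totals.get(rep_class[name], 0) + value
# class_totals == {'SINE': 15.0, 'LINE': 5.0}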
428 ##############################################################################
429 print('Writing final output and removing intermediate files...')
430 # print output to file of the categorized counts and total overlapping counts:
431 if allcountmethod == "TRUE":
432 fout1 = open(outputfolder + os.path.sep + outputfile_prefix
433 + '_total_counts.txt', 'w')
434 for key in reptotalcounts.keys():
435 fout1.write(str(key) + '\t' + repeatclass[key] + '\t' +
436 repeatfamily[key] + '\t' + str(reptotalcounts[key])
437 + '\n')
438 fout2 = open(outputfolder + os.path.sep + outputfile_prefix
439 + '_class_total_counts.txt', 'w')
440 for key in classtotalcounts.keys():
441 fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n')
442 fout3 = open(outputfolder + os.path.sep + outputfile_prefix
443 + '_family_total_counts.txt', 'w')
444 for key in familytotalcounts.keys():
445 fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n')
446 fout4 = open(outputfolder + os.path.sep + outputfile_prefix +
447 '_unique_counts.txt', 'w')
448 for key in repcounts2.keys():
449 fout4.write(str(key) + '\t' + repeatclass[key] + '\t' +
450 repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n')
451 fout5 = open(outputfolder + os.path.sep + outputfile_prefix
452 + '_class_fraction_counts.txt', 'w')
453 for key in classfractionalcounts.keys():
454 fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
455 fout6 = open(outputfolder + os.path.sep + outputfile_prefix +
456 '_family_fraction_counts.txt', 'w')
457 for key in familyfractionalcounts.keys():
458 fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
459 fout7 = open(outputfolder + os.path.sep + outputfile_prefix
460 + '_fraction_counts.txt', 'w')
461 for key in fractionalcounts.keys():
462 fout7.write(str(key) + '\t' + repeatclass[key] + '\t' +
463 repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
464 + '\n')
465 fout1.close()
466 fout2.close()
467 fout3.close()
468 fout4.close()
469 fout5.close()
470 fout6.close()
471 fout7.close()
472 else:
473 fout1 = open(outputfolder + os.path.sep + outputfile_prefix +
474 '_class_fraction_counts.txt', 'w')
475 for key in classfractionalcounts.keys():
476 fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
477 fout2 = open(outputfolder + os.path.sep + outputfile_prefix +
478 '_family_fraction_counts.txt', 'w')
479 for key in familyfractionalcounts.keys():
480 fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
481 fout3 = open(outputfolder + os.path.sep + outputfile_prefix +
482 '_fraction_counts.txt', 'w')
483 for key in fractionalcounts.keys():
484 fout3.write(str(key) + '\t' + repeatclass[key] + '\t' +
485 repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
486 + '\n')
487 fout1.close()
488 fout2.close()
489 fout3.close()
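Whichever branch runs, the *_fraction_counts.txt table written above has four tab-separated columns: repeat name, class, family, and the fractional count truncated to an integer. A minimal sketch of reading it back; the file name is a placeholder:

import csv

# Sketch: read the fraction-count table written above.
with open('sample_fraction_counts.txt') as handle:   # placeholder file name
    for name, rep_class, family, count in csv.reader(handle, delimiter='\t'):
        print(name, rep_class, family, int(count))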
490 ##############################################################################
491 # Remove Large intermediate files
492 if os.path.exists(outputfolder + os.path.sep + outputfile_prefix +
493 '_regionsorter.txt'):
494 os.remove(outputfolder + os.path.sep + outputfile_prefix +
495 '_regionsorter.txt')
496 if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
497 shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie')
498 if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
499 shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie')
500 if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
501 shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie')
502 print("... Done")