comparison RepEnrich.py @ 13:530626b0757c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
author artbio
date Tue, 02 Apr 2024 21:16:37 +0000
parents 89e05f831259
children bf866bedd4b4
--- a/RepEnrich.py    12:89e05f831259
+++ b/RepEnrich.py    13:530626b0757c
@@ -2,12 +2,12 @@
 import csv
 import os
 import shlex
 import subprocess
 import sys
-
-import numpy
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor
 
 
 parser = argparse.ArgumentParser(description='''
 Repenrich aligns reads to Repeat Elements pseudogenomes\
 and counts aligned reads. RepEnrich_setup must be run\
@@ -15,30 +15,17 @@
 parser.add_argument('--annotation_file', action='store',
                     metavar='annotation_file',
                     help='RepeatMasker.org annotation file for your\
                     organism. The file may be downloaded from\
                     RepeatMasker.org. E.g. hg19_repeatmasker.txt')
-parser.add_argument('--outputfolder', action='store', metavar='outputfolder',
-                    help='Folder that will contain results. Should be the\
-                    same as the one used for RepEnrich_setup.\
-                    Example: ./outputfolder')
-parser.add_argument('--outputprefix', action='store', metavar='outputprefix',
-                    help='Prefix name for Repenrich output files.')
-parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
-                    help='Folder produced by RepEnrich_setup which contains\
-                    repeat element pseudogenomes.')
+parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
+                    help='Bam alignments of unique mapper reads.')
 parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
                     help='File of fastq reads mapping to multiple\
                     locations. Example: /data/multimap.fastq')
-parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
-                    help='Bam alignments of unique mapper reads.')
-parser.add_argument('--pairedend', action='store', dest='pairedend',
-                    default='FALSE',
-                    help='Change to TRUE for paired-end fastq files.\
-                    Default FALSE')
 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
-                    metavar='fastqfile2', default='none',
+                    metavar='fastqfile2', default='',
                     help='fastqfile #2 when using paired-end option.\
                     Default none')
 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
                     default="1", type=int,
                     help='Number of CPUs. The more cpus the\
@@ -46,22 +33,20 @@
 
 args = parser.parse_args()
 
 # parameters
 annotation_file = args.annotation_file
-outputfolder = args.outputfolder
-outputfile_prefix = args.outputprefix
-setup_folder = args.setup_folder
-repeat_bed = os.path.join(setup_folder, 'repnames.bed')
 unique_mapper_bam = args.alignment_bam
 fastqfile_1 = args.fastqfile
 fastqfile_2 = args.fastqfile2
 cpus = args.cpus
-b_opt = "-k1 -p 1 --quiet"
 # Change if simple repeats are differently annotated in your organism
 simple_repeat = "Simple_repeat"
-paired_end = args.pairedend
+if args.fastqfile2:
+    paired_end = True
+else:
+    paired_end = False
 
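Paired-end mode is now inferred from the presence of --fastqfile2 rather
than from the removed --pairedend flag. A minimal sketch of the two
invocation modes (file names invented):

    # single-end: --fastqfile2 keeps its '' default, so paired_end is False
    #   python RepEnrich.py --annotation_file hg19_repeatmasker.txt \
    #       --alignment_bam unique.bam --fastqfile multimap_1.fastq
    # paired-end: simply pass the second fastq file as well
    #   ... --fastqfile multimap_1.fastq --fastqfile2 multimap_2.fastq
    args = parser.parse_args(['--annotation_file', 'hg19_repeatmasker.txt',
                              '--alignment_bam', 'unique.bam',
                              '--fastqfile', 'multimap_1.fastq'])
    assert args.fastqfile2 == ''    # falsy, so paired_end stays False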
 # check that the programs we need are available
 try:
     subprocess.call(shlex.split("coverageBed -h"),
                     stdout=open(os.devnull, 'wb'),
@@ -71,262 +56,149 @@
                     stderr=open(os.devnull, 'wb'))
 except OSError:
     print("Error: Bowtie or bedtools not loaded")
     raise
 
-# define a csv reader that reads space deliminated files
-print('Preparing for analysis using RepEnrich...')
-csv.field_size_limit(sys.maxsize)
-
-
+
+def starts_with_numerical(list):
+    try:
+        if len(list) == 0:
+            return False
+        int(list[0])
+        return True
+    except ValueError:
+        return False
+
+
+# define a text importer for .out/.txt format of repbase
 def import_text(filename, separator):
-    for line in csv.reader(open(filename), delimiter=separator,
-                           skipinitialspace=True):
-        if line:
-            yield line
+    csv.field_size_limit(sys.maxsize)
+    file = csv.reader(open(filename), delimiter=separator,
+                      skipinitialspace=True)
+    return [line for line in file if starts_with_numerical(line)]
 
 
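The rewritten import_text no longer yields rows lazily; it returns a list
and uses starts_with_numerical to drop RepeatMasker header lines, whose
first field is not an integer. A toy illustration (values invented):

    # Header rows and empty lines are filtered out; data rows, whose first
    # field is the numeric Smith-Waterman score, are kept.
    rows = [['SW', 'perc', 'perc', 'perc'],      # header -> dropped
            [],                                  # empty line -> dropped
            ['463', '1.3', '0.6', '1.7', 'chr1', '10001', '10468',
             '(249240153)', '+', '(TAACCC)n', 'Simple_repeat']]
    kept = [row for row in rows if starts_with_numerical(row)]
    assert len(kept) == 1 and kept[0][9] == '(TAACCC)n'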
-# build dictionaries to convert repclass and rep families
-repeatclass, repeatfamily = {}, {}
+def run_bowtie(args):
+    metagenome, fastqfile = args
+    b_opt = "-k 1 -p 1 --quiet"
+    command = shlex.split(f"bowtie {b_opt} -x {metagenome} {fastqfile}")
+    bowtie_align = subprocess.run(command, check=True,
+                                  capture_output=True, text=True).stdout
+    bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
+    readlist = [metagenome]
+    if paired_end:
+        for line in bowtie_align:
+            readlist.append(line.split("/")[0])
+    else:
+        for line in bowtie_align:
+            readlist.append(line.split("\t")[0])
+    return readlist
+
+
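run_bowtie now returns read identifiers in memory instead of writing
.bowtie files to disk. Bowtie's default output is tab-separated with the
read name first; paired-end names carry a /1 or /2 suffix, which the
split("/") strips so that both mates collapse to one identifier. For
example (invented read names):

    se_line = "read42\t+\tL1PA4\t1830\tACGT"      # single-end record
    pe_line = "read42/1\t+\tL1PA4\t1830\tACGT"    # paired-end record
    assert se_line.split("\t")[0] == "read42"
    assert pe_line.split("/")[0] == "read42"      # /1 and /2 collapse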
+# set a reference repeat list for the script
+repeat_list = [listline[9].translate(
+    str.maketrans(
+        '()/', '___')) for listline in import_text(annotation_file, ' ')]
+repeat_list = sorted(list(set(repeat_list)))
+
+# unique mapper counting
+cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \
+bedtools coverage -b stdin -a repnames.bed"
+p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n')
+
+# parse bedtools output
+counts = defaultdict(int)  # key: repeat names, value: unique mapper counts
+sumofrepeatreads = 0
+for line in bedtools_counts:
+    line = line.split('\t')
+    counts[line[3]] += int(line[4])
+    sumofrepeatreads += int(line[4])
+print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.")
+
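bedtools coverage reports one row per interval of repnames.bed, appending
the number of overlapping reads after the interval's chrom, start, end and
name columns, hence line[3] and line[4] above. A made-up row:

    # chrom, start, end, repeat name, overlapping reads, covered bases,
    # interval length, fraction covered
    row = "chr1\t10001\t10468\tAluY\t12\t467\t467\t1.0000000"
    fields = row.split('\t')
    name, n_reads = fields[3], int(fields[4])     # ('AluY', 12)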
+# multimapper parsing
+if not paired_end:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+else:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    args_list.extend([(metagenome, fastqfile_2) for
+                      metagenome in repeat_list])
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+
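executor.map hands one (pseudogenome, fastq) tuple to run_bowtie per worker
process and returns the result lists to the parent in submission order, so
the shared dictionaries in the aggregation step that follows are only ever
written by the parent process. The same pattern in a self-contained sketch:

    from concurrent.futures import ProcessPoolExecutor

    def square(task):                  # stand-in for run_bowtie
        return task * task

    if __name__ == '__main__':         # guard required for process pools
        with ProcessPoolExecutor(max_workers=2) as executor:
            print(list(executor.map(square, [1, 2, 3])))   # [1, 4, 9]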
+# Aggregate results (avoiding race conditions)
+metagenome_reads = defaultdict(list)  # repeat_name: list of multimap reads
+for result in results:
+    metagenome_reads[result[0]] += result[1:]
+
+for name in metagenome_reads:
+    # reads appear only once in the list
+    metagenome_reads[name] = list(set(metagenome_reads[name]))
+    # remove "no read" instances
+    metagenome_reads[name] = [read for read in metagenome_reads[name]
+                              if read != ""]
+
+# implement repeats_by_reads from the inverse dictionary metagenome_reads
+repeats_by_reads = defaultdict(list)  # readids: list of repeats names
+for repname in metagenome_reads:
+    for read in metagenome_reads[repname]:
+        repeats_by_reads[read].append(repname)
+for repname in repeats_by_reads:
+    repeats_by_reads[repname] = list(set(repeats_by_reads[repname]))
+
+# 3 dictionaries and 1 pointer variable to be populated
+fractionalcounts = defaultdict(float)
+familyfractionalcounts = defaultdict(float)
+classfractionalcounts = defaultdict(float)
+sumofrepeatreads = 0
+
+# Update counts dictionary with sets of repeats (was "subfamilies")
+# matched by multimappers
+for repeat_set in repeats_by_reads.values():
+    repeat_set_string = ','.join(repeat_set)
+    counts[repeat_set_string] += 1
+    sumofrepeatreads += 1
+
+print(f'Identified {sumofrepeatreads} additional multimapper repeat reads')
+
+# Populate fractionalcounts
+for key, count in counts.items():
+    key_list = key.split(',')
+    for i in key_list:
+        fractionalcounts[i] += count / len(key_list)
+
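The fractional counting rule: a read matched by a set of n repeats
contributes 1/n to each member, so ambiguous multimappers are shared rather
than double-counted. A worked toy example (repeat names illustrative):

    from collections import defaultdict

    counts = {'L1PA4': 10,         # reads matching a single repeat
              'L1PA4,AluY': 3}     # reads shared by two repeats
    fractionalcounts = defaultdict(float)
    for key, count in counts.items():
        key_list = key.split(',')
        for i in key_list:
            fractionalcounts[i] += count / len(key_list)
    assert fractionalcounts == {'L1PA4': 11.5, 'AluY': 1.5}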
+# build repeat_ref for easy access to rep class and rep families
+repeat_ref = defaultdict(dict)
 repeats = import_text(annotation_file, ' ')
-# skip three first lines of the iterator
-for line in range(3):
-    next(repeats)
 for repeat in repeats:
-    classfamily = []
-    classfamily = repeat[10].split('/')
-    matching_repeat = repeat[9].translate(str.maketrans('()/', '___'))
-    repeatclass[matching_repeat] = classfamily[0]
-    if len(classfamily) == 2:
-        repeatfamily[matching_repeat] = classfamily[1]
-    else:
-        repeatfamily[matching_repeat] = classfamily[0]
+    repeat_name = repeat[9].translate(str.maketrans('()/', '___'))
+    try:
+        repclass = repeat[10].split('/')[0]
+        repfamily = repeat[10].split('/')[1]
+    except IndexError:
+        repclass, repfamily = repeat[10], repeat[10]
+    repeat_ref[repeat_name]['class'] = repclass
+    repeat_ref[repeat_name]['family'] = repfamily
 
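RepeatMasker stores class and family as one 'class/family' token; names
without a slash (e.g. Simple_repeat) fall back to the same string for both,
which is what the IndexError branch handles:

    for token in ('LINE/L1', 'Simple_repeat'):
        try:
            repclass, repfamily = token.split('/')[0], token.split('/')[1]
        except IndexError:
            repclass, repfamily = token, token
        print(token, '->', repclass, repfamily)
    # LINE/L1 -> LINE L1
    # Simple_repeat -> Simple_repeat Simple_repeat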
-# build list of repeats initializing dictionaries for downstream analysis'
-repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt')
-reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatclass}
-classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatclass}
+# Populate classfractionalcounts and familyfractionalcounts
+for key, value in fractionalcounts.items():
+    classfractionalcounts[repeat_ref[key]['class']] += value
+    familyfractionalcounts[repeat_ref[key]['family']] += value
+
+# print class-, family- and fraction-repeats counts to files
+with open("class_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(classfractionalcounts):
-familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatfamily}
-familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatfamily}
-reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and
-                          repeatfamily[line[0]] == simple_repeat else
-                          line[0]): 0 for line in import_text(
-                              repgenome_path, '\t')}
-repeat_key = {line[0]: int(line[1]) for line in import_text(
-    repgenome_path, '\t')}
-rev_repeat_key = {int(line[1]): line[0] for line in import_text(
-    repgenome_path, '\t')}
-repeat_list = [line[0] for line in import_text(repgenome_path, '\t')]
-
-# map the repeats to the psuedogenomes:
-if not os.path.exists(outputfolder):
-    os.mkdir(outputfolder)
-
-# Conduct the regions sorting
-fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt")
-command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \
-{os.path.join(setup_folder, 'repnames.bed')}")
-with open(fileout, 'w') as stdout:
-    subprocess.run(command, stdout=stdout, check=True)
-
-counts = {}
-sumofrepeatreads = 0
-with open(fileout) as filein:
-    for line in filein:
-        line = line.split('\t')
-        if not str(repeat_key[line[3]]) in counts:
-            counts[str(repeat_key[line[3]])] = 0
-        counts[str(repeat_key[line[3]])] += int(line[4])
-        sumofrepeatreads += int(line[4])
-print(f"Identified {sumofrepeatreads} unique reads that \
-mapped to repeats.")
-
-
-def run_bowtie(metagenome, fastqfile, folder):
-    metagenomepath = os.path.join(setup_folder, metagenome)
-    output_file = os.path.join(folder, f"{metagenome}.bowtie")
-    command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}")
-    with open(output_file, 'w') as stdout:
-        return subprocess.Popen(command, stdout=stdout)
-
-
-if paired_end == 'FALSE':
-    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
-    os.makedirs(folder_pair1, exist_ok=True)
-
-    print("Processing repeat pseudogenomes...")
-    processes = []
-    ticker = 0
-
-    for metagenome in repeat_list:
-        processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
-        ticker += 1
-        if ticker == cpus:
-            for p in processes:
-                p.communicate()
-            ticker = 0
-            processes = []
-
-    for p in processes:
-        p.communicate()
-    # Combine the output from both read pairs:
-    print('Sorting and combining the output for both read pairs....')
-    sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
-    os.makedirs(sorted_bowtie, exist_ok=True)
-    for metagenome in repeat_list:
-        file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie")
-        fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie")
-        with open(fileout, 'w') as stdout:
-            p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
-            p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
-                                  stdout=subprocess.PIPE)
-            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-                                  stdout=subprocess.PIPE)
-            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-                                  stdout=subprocess.PIPE)
-            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-            p5.communicate()
-        stdout.close()
-    print('completed ...')
-else:
-    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
-    folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie')
-    os.makedirs(folder_pair1, exist_ok=True)
-    os.makedirs(folder_pair2, exist_ok=True)
-
-    print("Processing repeat pseudogenomes...")
-    ps, psb, ticker = [], [], 0
-
-    for metagenome in repeat_list:
-        ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
-        ticker += 1
-        if fastqfile_2 != 'none':
-            psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2))
-            ticker += 1
-        if ticker >= cpus:
-            for p in ps:
-                p.communicate()
-            for p in psb:
-                p.communicate()
-            ticker = 0
-            ps = []
-            psb = []
-
-    for p in ps:
-        p.communicate()
-    for p in psb:
-        p.communicate()
-    # combine the output from both read pairs:
-    print('Sorting and combining the output for both read pairs...')
-    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
-        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
-    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
-    for metagenome in repeat_list:
-        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
-        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
-        with open(fileout, 'w') as stdout:
-            p1 = subprocess.Popen(['cat', file1, file2],
-                                  stdout=subprocess.PIPE)
-            p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout,
-                                  stdout=subprocess.PIPE)
-            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-                                  stdout=subprocess.PIPE)
-            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-                                  stdout=subprocess.PIPE)
-            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-            p5.communicate()
-        stdout.close()
-    print('completed ...')
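The five-process chain deleted above was a shell pipeline spelled out with
subprocess; for reference, an equivalent compact form (hypothetical input
file), which the in-memory run_bowtie/set() logic now replaces:

    import subprocess

    # cat .bowtie output | keep first field | strip /1,/2 | sort | uniq
    pipeline = ("cat pair1_bowtie/AluY.bowtie | cut -f1 | cut -f1 -d/ "
                "| sort | uniq")
    read_ids = subprocess.run(pipeline, shell=True, capture_output=True,
                              text=True).stdout.splitlines()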
-
-# build a file of repeat keys for all reads
-print('Writing and processing intermediate files...')
-sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
-sumofrepeatreads = 0
-readid = {}
-
-for rep in repeat_list:
-    for data in import_text(
-            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
-        readid[data[0]] = ''
-
-for rep in repeat_list:
-    for data in import_text(
-            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
-        readid[data[0]] += f"{repeat_key[rep]},"
-
-for subfamilies in readid.values():
-    if subfamilies not in counts:
-        counts[subfamilies] = 0
-    counts[subfamilies] += 1
-    sumofrepeatreads += 1
-
-print(f'Identified {sumofrepeatreads} reads that mapped to \
-repeats for unique and multimappers.')
-print("Conducting final calculations...")
-
-# building the total counts for repeat element enrichment...
-for x in counts.keys():
-    count = counts[x]
-    x = x.strip(',').split(',')
-    for i in x:
-        reptotalcounts[rev_repeat_key[int(i)]] += int(count)
-# building the fractional counts for repeat element enrichment...
-for x in counts.keys():
-    count = counts[x]
-    x = x.strip(',').split(',')
-    splits = len(x)
-    for i in x:
-        fractionalcounts[rev_repeat_key[int(i)]] += float(
-            numpy.divide(float(count), float(splits)))
-# building categorized table of repeat element enrichment...
-repcounts = {}
-repcounts['other'] = 0
-for key in counts.keys():
-    key_list = key.strip(',').split(',')
-    repname = ''
-    for i in key_list:
-        repname = os.path.join(repname, rev_repeat_key[int(i)])
-    repcounts[repname] = counts[key]
-# building the total counts for class enrichment...
-for key in reptotalcounts.keys():
-    classtotalcounts[repeatclass[key]] += reptotalcounts[key]
-# building total counts for family enrichment...
-for key in reptotalcounts.keys():
-    familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
-# building unique counts table
-repcounts2 = {}
-for rep in repeat_list:
-    if "/" + rep in repcounts:
-        repcounts2[rep] = repcounts["/" + rep]
-    else:
-        repcounts2[rep] = 0
-# building the fractionalcounts counts for class enrichment...
-for key in fractionalcounts.keys():
-    classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
-# building fractional counts for family enrichment...
-for key in fractionalcounts.keys():
-    familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]
-
-# print output to file of the categorized counts and total overlapping counts:
-print('Writing final output...')
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"class_fraction_counts.txt", 'w') as fout:
-    for key in sorted(classfractionalcounts.keys()):
         fout.write(f"{key}\t{classfractionalcounts[key]}\n")
 
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"family_fraction_counts.txt", 'w') as fout:
-    for key in sorted(familyfractionalcounts.keys()):
+with open("family_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(familyfractionalcounts):
         fout.write(f"{key}\t{familyfractionalcounts[key]}\n")
 
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"fraction_counts.txt", 'w') as fout:
-    for key in sorted(fractionalcounts.keys()):
-        fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t"
-                   f"{int(fractionalcounts[key])}\n")
+with open("fraction_counts.tsv", 'w') as fout:
+    for key in sorted(fractionalcounts):
+        fout.write(f"{key}\t{repeat_ref[key]['class']}\t"
+                   f"{repeat_ref[key]['family']}\t"
+                   f"{fractionalcounts[key]}\n")
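The commit thus writes fixed-name TSVs into the working directory instead
of prefixed .txt files under --outputfolder. A minimal reader sketch for
the four-column fraction_counts.tsv (name, class, family, fractional
count) that downstream steps could use:

    import csv

    with open("fraction_counts.tsv") as f:
        fraction = {name: float(count) for name, repclass, family, count
                    in csv.reader(f, delimiter='\t')}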