comparison RepEnrich2.py @ 0:4905a332a094 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
author artbio
date Sat, 20 Apr 2024 11:56:53 +0000
parents
children 08e50af788f7
comparison
equal deleted inserted replaced
-1:000000000000 0:4905a332a094
1 import argparse
2 import csv
3 import os
4 import shlex
5 import subprocess
6 import sys
7 from collections import defaultdict
8 from concurrent.futures import ProcessPoolExecutor
9
10
# ---------------------------------------------------------------------------
# command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='''
             Repenrich aligns reads to Repeat Elements pseudogenomes\
             and counts aligned reads. RepEnrich_setup must be run\
             before its use''')
parser.add_argument('--annotation_file', action='store',
                    metavar='annotation_file',
                    help='RepeatMasker.org annotation file for your\
                          organism. The file may be downloaded from\
                          RepeatMasker.org. E.g. hg19_repeatmasker.txt')
parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
                    help='Bam alignments of unique mapper reads.')
parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
                    help='File of fastq reads mapping to multiple\
                          locations. Example: /data/multimap.fastq')
parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
                    metavar='fastqfile2', default='',
                    help='fastqfile #2 when using paired-end option.\
                          Default none')
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
                    default="1", type=int,
                    help='Number of CPUs. The more cpus the\
                          faster RepEnrich performs. Default: "1"')

args = parser.parse_args()

# parameters
annotation_file = args.annotation_file
unique_mapper_bam = args.alignment_bam
fastqfile_1 = args.fastqfile
fastqfile_2 = args.fastqfile2
cpus = args.cpus
# Change if simple repeats are differently annotated in your organism
simple_repeat = "Simple_repeat"
# A non-empty second fastq file signals a paired-end run.
if args.fastqfile2:
    paired_end = True
else:
    paired_end = False
48
# Check that the external programs we need (bedtools' coverageBed and
# bowtie2) are on PATH; a missing binary raises OSError/FileNotFoundError.
# Uses subprocess.DEVNULL instead of open(os.devnull, 'wb') so no file
# handles are leaked (the original opened four handles and never closed
# them).
try:
    subprocess.call(shlex.split("coverageBed -h"),
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL)
    subprocess.call(shlex.split("bowtie2 --version"),
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL)
except OSError:
    print("Error: Bowtie2 or bedtools not loaded")
    raise
60
61
def starts_with_numerical(fields):
    """Return True if *fields* is non-empty and its first item parses as
    an integer.

    Used to keep only the data lines of a RepeatMasker annotation file
    (header and blank lines do not start with a number).

    :param fields: list of strings (one parsed annotation line)
    """
    # renamed parameter: the original shadowed the builtin `list`
    try:
        if len(fields) == 0:
            return False
        int(fields[0])
        return True
    except ValueError:
        # first field is not an integer -> not a data line
        return False
70
71
# define a text importer for .out/.txt format of repbase
def import_text(filename, separator):
    """Parse a RepeatMasker .out/.txt annotation file.

    Returns the rows (as lists of fields) whose first field is numerical,
    i.e. the data lines, skipping headers and blank lines.

    :param filename: path of the annotation file
    :param separator: field delimiter handed to csv.reader (here ' ')
    """
    # RepeatMasker lines can be very long; lift the csv field-size cap.
    csv.field_size_limit(sys.maxsize)
    # Use a context manager so the file handle is closed deterministically
    # (the original left it open until garbage collection).
    with open(filename) as handle:
        reader = csv.reader(handle, delimiter=separator,
                            skipinitialspace=True)
        return [line for line in reader if starts_with_numerical(line)]
78
79
def run_bowtie(args):
    """Align one fastq file against one repeat pseudogenome.

    *args* is a (metagenome, fastqfile) tuple so the function can be
    dispatched through ProcessPoolExecutor.map.  Returns a list whose
    first element is the pseudogenome name, followed by the id of every
    alignment line bowtie2 emitted (SAM header suppressed by --no-hd).
    """
    metagenome, fastqfile = args
    cmd = shlex.split(
        f"bowtie2 -k 1 -p 1 --quiet --no-hd -x {metagenome} {fastqfile}")
    proc = subprocess.run(cmd, check=True, capture_output=True, text=True)
    sam_lines = proc.stdout.rstrip('\r\n').split('\n')
    # Paired-end read ids carry a /1 or /2 mate suffix: cut at '/' so both
    # mates collapse to the same id.  Single-end ids are the first
    # tab-separated SAM field.
    separator = "/" if paired_end else "\t"
    return [metagenome] + [line.split(separator)[0] for line in sam_lines]
95
96
# set a reference repeat list for the script
# Column 9 of the RepeatMasker table is the repeat name; '(', ')' and '/'
# are translated to '_' to match the pseudogenome names produced by
# RepEnrich_setup (presumably — verify against the setup script).
repeat_list = [listline[9].translate(
    str.maketrans(
        '()/', '___')) for listline in import_text(annotation_file, ' ')]
repeat_list = sorted(list(set(repeat_list)))

# unique mapper counting
# Pipe the bam through bamtobed into coverageBed against repnames.bed.
# NOTE(review): shell=True with an interpolated path is shell-injectable;
# acceptable only because unique_mapper_bam comes from a trusted caller.
cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \
bedtools coverage -b stdin -a repnames.bed"
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n')

# parse bedtools output
counts = defaultdict(int)  # key: repeat names, value: unique mapper counts
sumofrepeatreads = 0
for line in bedtools_counts:
    line = line.split('\t')
    # field 3 is the repeat name (from repnames.bed), field 4 the coverage
    # count reported by bedtools
    counts[line[3]] += int(line[4])
    sumofrepeatreads += int(line[4])
print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.")
117
# multimapper parsing
# One bowtie2 job per (pseudogenome, fastq) pair; with paired-end data the
# second mate file is aligned against every pseudogenome as well.  The
# original duplicated the args-list construction and the executor call in
# both branches; build the fastq list once instead.
fastq_files = [fastqfile_1]
if paired_end:
    fastq_files.append(fastqfile_2)
# Keep the original job ordering: all mate-1 jobs first, then all mate-2.
args_list = [(metagenome, fastq) for fastq in fastq_files
             for metagenome in repeat_list]
with ProcessPoolExecutor(max_workers=cpus) as executor:
    results = executor.map(run_bowtie, args_list)

# Aggregate results (avoiding race conditions)
metagenome_reads = defaultdict(list)  # repeat_name: list of multimap reads
for result in results:
    metagenome_reads[result[0]] += result[1:]
136
# Deduplicate read ids per pseudogenome and drop the empty ids produced by
# blank alignment lines.
for name, reads in metagenome_reads.items():
    unique_reads = set(reads)
    unique_reads.discard("")
    metagenome_reads[name] = list(unique_reads)

# Invert the mapping: for every read id, the list of repeat names (was
# "subfamilies") the read aligned to.
repeats_by_reads = defaultdict(list)  # read id: list of repeat names
for repname, reads in metagenome_reads.items():
    for read in reads:
        repeats_by_reads[read].append(repname)
for read_id in repeats_by_reads:
    repeats_by_reads[read_id] = list(set(repeats_by_reads[read_id]))
151
# 3 dictionaries and 1 pointer variable to be populated
fractionalcounts = defaultdict(float)
familyfractionalcounts = defaultdict(float)
classfractionalcounts = defaultdict(float)
sumofrepeatreads = 0

# Update counts dictionary with sets of repeats (was "subfamilies")
# matched by multimappers: each multimapper contributes one count to the
# comma-joined set of repeat names it aligned to.
for repeat_set in repeats_by_reads.values():
    repeat_set_string = ','.join(repeat_set)
    counts[repeat_set_string] += 1
    sumofrepeatreads += 1

# message fixed: "Identified more N mutimapper" -> "Identified N more
# multimapper" (typo and word order)
print(f'Identified {sumofrepeatreads} more multimapper repeat reads')

# Populate fractionalcounts: a read that aligned to N repeats contributes
# 1/N to each of them.
for key, count in counts.items():
    key_list = key.split(',')
    for i in key_list:
        fractionalcounts[i] += count / len(key_list)
172
# build repeat_ref for easy access to rep class and rep families
# Column 10 of the annotation is "class/family"; when no slash is present
# the same value serves as both class and family.
repeat_ref = defaultdict(dict)
name_map = str.maketrans('()/', '___')
for fields in import_text(annotation_file, ' '):
    repeat_name = fields[9].translate(name_map)
    class_family = fields[10].split('/')
    if len(class_family) > 1:
        repclass, repfamily = class_family[0], class_family[1]
    else:
        repclass = repfamily = fields[10]
    repeat_ref[repeat_name]['class'] = repclass
    repeat_ref[repeat_name]['family'] = repfamily
185
# Populate classfractionalcounts and familyfractionalcounts by summing the
# per-repeat fractional counts at class and family level.
for repname, fraction in fractionalcounts.items():
    ref = repeat_ref[repname]
    classfractionalcounts[ref['class']] += fraction
    familyfractionalcounts[ref['family']] += fraction

# print class-, family- and fraction-repeats counts to files
with open("class_fraction_counts.tsv", 'w') as fout:
    fout.writelines(f"{name}\t{classfractionalcounts[name]}\n"
                    for name in sorted(classfractionalcounts))

with open("family_fraction_counts.tsv", 'w') as fout:
    fout.writelines(f"{name}\t{familyfractionalcounts[name]}\n"
                    for name in sorted(familyfractionalcounts))

with open("fraction_counts.tsv", 'w') as fout:
    for name in sorted(fractionalcounts):
        fout.write(f"{name}\t{repeat_ref[name]['class']}\t"
                   f"{repeat_ref[name]['family']}\t"
                   f"{fractionalcounts[name]}\n")