comparison RepEnrich2.py @ 0:4905a332a094 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
| field | value |
|---|---|
| author | artbio |
| date | Sat, 20 Apr 2024 11:56:53 +0000 |
| parents | |
| children | 08e50af788f7 |
comparison: -1:000000000000 → 0:4905a332a094 (file added in this revision)
```python
import argparse
import csv
import os
import shlex
import subprocess
import sys
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor


parser = argparse.ArgumentParser(description='''
             Repenrich aligns reads to Repeat Elements pseudogenomes\
             and counts aligned reads. RepEnrich_setup must be run\
             before its use''')
parser.add_argument('--annotation_file', action='store',
                    metavar='annotation_file',
                    help='RepeatMasker.org annotation file for your\
                          organism. The file may be downloaded from\
                          RepeatMasker.org. E.g. hg19_repeatmasker.txt')
parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
                    help='Bam alignments of unique mapper reads.')
parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
                    help='File of fastq reads mapping to multiple\
                          locations. Example: /data/multimap.fastq')
parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
                    metavar='fastqfile2', default='',
                    help='fastqfile #2 when using paired-end option.\
                          Default none')
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
                    default="1", type=int,
                    help='Number of CPUs. The more cpus the\
                          faster RepEnrich performs. Default: "1"')

args = parser.parse_args()
# parameters
annotation_file = args.annotation_file
unique_mapper_bam = args.alignment_bam
fastqfile_1 = args.fastqfile
fastqfile_2 = args.fastqfile2
cpus = args.cpus
# Change if simple repeats are differently annotated in your organism
simple_repeat = "Simple_repeat"
if args.fastqfile2:
    paired_end = True
else:
    paired_end = False

# check that the programs we need are available
try:
    subprocess.call(shlex.split("coverageBed -h"),
                    stdout=open(os.devnull, 'wb'),
                    stderr=open(os.devnull, 'wb'))
    subprocess.call(shlex.split("bowtie2 --version"),
                    stdout=open(os.devnull, 'wb'),
                    stderr=open(os.devnull, 'wb'))
except OSError:
    print("Error: Bowtie2 or bedtools not loaded")
    raise

def starts_with_numerical(list):
    try:
        if len(list) == 0:
            return False
        int(list[0])
        return True
    except ValueError:
        return False


# define a text importer for .out/.txt format of repbase
def import_text(filename, separator):
    csv.field_size_limit(sys.maxsize)
    file = csv.reader(open(filename), delimiter=separator,
                      skipinitialspace=True)
    return [line for line in file if starts_with_numerical(line)]

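# run_bowtie aligns one multimapper fastq file to one repeat pseudogenome
# index ("metagenome") with bowtie2 (-k 1: report at most one alignment,
# -p 1: single thread, --quiet: no progress messages, --no-hd: no SAM
# header) and returns the pseudogenome name followed by the read
# identifiers found in the alignment output; for paired-end data the read
# name is truncated at the first "/", presumably to strip /1 and /2 mate
# suffixes.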
def run_bowtie(args):
    metagenome, fastqfile = args
    b_opt = "-k 1 -p 1 --quiet --no-hd"
    command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}")
    bowtie_align = subprocess.run(command, check=True,
                                  capture_output=True, text=True).stdout
    bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
    readlist = [metagenome]
    if paired_end:
        for line in bowtie_align:
            readlist.append(line.split("/")[0])
    else:
        for line in bowtie_align:
            readlist.append(line.split("\t")[0])
    return readlist


# set a reference repeat list for the script
repeat_list = [listline[9].translate(
    str.maketrans(
        '()/', '___')) for listline in import_text(annotation_file, ' ')]
repeat_list = sorted(list(set(repeat_list)))

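# Count unique mappers: convert the BAM of uniquely mapping reads to BED and
# let bedtools coverage count, for each repeat interval of repnames.bed (a
# file expected to have been generated beforehand by the RepEnrich2 setup
# step), the number of overlapping reads.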
# unique mapper counting
cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \
bedtools coverage -b stdin -a repnames.bed"
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n')

# parse bedtools output
counts = defaultdict(int)  # key: repeat names, value: unique mapper counts
sumofrepeatreads = 0
for line in bedtools_counts:
    line = line.split('\t')
    counts[line[3]] += int(line[4])
    sumofrepeatreads += int(line[4])
print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.")

# multimapper parsing
if not paired_end:
    args_list = [(metagenome, fastqfile_1) for
                 metagenome in repeat_list]
    with ProcessPoolExecutor(max_workers=cpus) as executor:
        results = executor.map(run_bowtie, args_list)
else:
    args_list = [(metagenome, fastqfile_1) for
                 metagenome in repeat_list]
    args_list.extend([(metagenome, fastqfile_2) for
                      metagenome in repeat_list])
    with ProcessPoolExecutor(max_workers=cpus) as executor:
        results = executor.map(run_bowtie, args_list)

# Aggregate results (avoiding race conditions)
metagenome_reads = defaultdict(list)  # repeat_name: list of multimap reads
for result in results:
    metagenome_reads[result[0]] += result[1:]

for name in metagenome_reads:
    # keep each read only once in the list
    metagenome_reads[name] = list(set(metagenome_reads[name]))
    # remove "no read" instances
    metagenome_reads[name] = [read for read in metagenome_reads[name]
                              if read != ""]

# build repeats_by_reads by inverting the dictionary metagenome_reads
repeats_by_reads = defaultdict(list)  # readids: list of repeats names
for repname in metagenome_reads:
    for read in metagenome_reads[repname]:
        repeats_by_reads[read].append(repname)
for repname in repeats_by_reads:
    repeats_by_reads[repname] = list(set(repeats_by_reads[repname]))

# 3 dictionaries and 1 counter variable to be populated
fractionalcounts = defaultdict(float)
familyfractionalcounts = defaultdict(float)
classfractionalcounts = defaultdict(float)
sumofrepeatreads = 0

# Update counts dictionary with sets of repeats (was "subfamilies")
# matched by multimappers
for repeat_set in repeats_by_reads.values():
    repeat_set_string = ','.join(repeat_set)
    counts[repeat_set_string] += 1
    sumofrepeatreads += 1

print(f'Identified {sumofrepeatreads} multimapper repeat reads')

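# Fractional counting: each multimapper read contributes 1/n to each of the
# n repeat names it aligned to (the keys added to `counts` above are
# comma-joined sets of repeat names, one set per read).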
# Populate fractionalcounts
for key, count in counts.items():
    key_list = key.split(',')
    for i in key_list:
        fractionalcounts[i] += count / len(key_list)

# build repeat_ref for easy access to rep class and rep families
repeat_ref = defaultdict(dict)
repeats = import_text(annotation_file, ' ')
for repeat in repeats:
    repeat_name = repeat[9].translate(str.maketrans('()/', '___'))
    try:
        repclass = repeat[10].split('/')[0]
        repfamily = repeat[10].split('/')[1]
    except IndexError:
        repclass, repfamily = repeat[10], repeat[10]
    repeat_ref[repeat_name]['class'] = repclass
    repeat_ref[repeat_name]['family'] = repfamily

# write class-, family- and repeat-level fractional counts to files
with open("class_fraction_counts.tsv", 'w') as fout:
    for key in sorted(classfractionalcounts):
        fout.write(f"{key}\t{classfractionalcounts[key]}\n")

with open("family_fraction_counts.tsv", 'w') as fout:
    for key in sorted(familyfractionalcounts):
        fout.write(f"{key}\t{familyfractionalcounts[key]}\n")

with open("fraction_counts.tsv", 'w') as fout:
    for key in sorted(fractionalcounts):
        fout.write(f"{key}\t{repeat_ref[key]['class']}\t"
                   f"{repeat_ref[key]['family']}\t"
                   f"{fractionalcounts[key]}\n")
```
