comparison RepEnrich.py @ 13:530626b0757c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
author artbio
date Tue, 02 Apr 2024 21:16:37 +0000
parents 89e05f831259
children bf866bedd4b4
--- a/RepEnrich.py    12:89e05f831259
+++ b/RepEnrich.py    13:530626b0757c
@@ -2,12 +2,12 @@
 import csv
 import os
 import shlex
 import subprocess
 import sys
-
-import numpy
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor
 
 
 parser = argparse.ArgumentParser(description='''
 Repenrich aligns reads to Repeat Elements pseudogenomes\
 and counts aligned reads. RepEnrich_setup must be run\
@@ -15,30 +15,17 @@
 parser.add_argument('--annotation_file', action='store',
                     metavar='annotation_file',
                     help='RepeatMasker.org annotation file for your\
                     organism. The file may be downloaded from\
                     RepeatMasker.org. E.g. hg19_repeatmasker.txt')
-parser.add_argument('--outputfolder', action='store', metavar='outputfolder',
-                    help='Folder that will contain results. Should be the\
-                    same as the one used for RepEnrich_setup.\
-                    Example: ./outputfolder')
-parser.add_argument('--outputprefix', action='store', metavar='outputprefix',
-                    help='Prefix name for Repenrich output files.')
-parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
-                    help='Folder produced by RepEnrich_setup which contains\
-                    repeat element pseudogenomes.')
+parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
+                    help='Bam alignments of unique mapper reads.')
 parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
                     help='File of fastq reads mapping to multiple\
                     locations. Example: /data/multimap.fastq')
-parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
-                    help='Bam alignments of unique mapper reads.')
-parser.add_argument('--pairedend', action='store', dest='pairedend',
-                    default='FALSE',
-                    help='Change to TRUE for paired-end fastq files.\
-                    Default FALSE')
 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
-                    metavar='fastqfile2', default='none',
+                    metavar='fastqfile2', default='',
                     help='fastqfile #2 when using paired-end option.\
                     Default none')
 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
                     default="1", type=int,
                     help='Number of CPUs. The more cpus the\
@@ -46,22 +33,20 @@
 
 args = parser.parse_args()
 
 # parameters
 annotation_file = args.annotation_file
-outputfolder = args.outputfolder
-outputfile_prefix = args.outputprefix
-setup_folder = args.setup_folder
-repeat_bed = os.path.join(setup_folder, 'repnames.bed')
 unique_mapper_bam = args.alignment_bam
 fastqfile_1 = args.fastqfile
 fastqfile_2 = args.fastqfile2
 cpus = args.cpus
-b_opt = "-k1 -p 1 --quiet"
 # Change if simple repeats are differently annotated in your organism
 simple_repeat = "Simple_repeat"
-paired_end = args.pairedend
+if args.fastqfile2:
+    paired_end = True
+else:
+    paired_end = False
 
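Paired-end mode is now inferred from the presence of --fastqfile2 rather
than from the removed --pairedend flag. A minimal sketch of the two
invocation modes (file names invented):

    # single-end: --fastqfile2 keeps its '' default, so paired_end is False
    #   python RepEnrich.py --annotation_file hg19_repeatmasker.txt \
    #       --alignment_bam unique.bam --fastqfile multimap_1.fastq
    # paired-end: simply pass the second fastq file as well
    #   ... --fastqfile multimap_1.fastq --fastqfile2 multimap_2.fastq
    args = parser.parse_args(['--annotation_file', 'hg19_repeatmasker.txt',
                              '--alignment_bam', 'unique.bam',
                              '--fastqfile', 'multimap_1.fastq'])
    assert args.fastqfile2 == ''    # falsy, so paired_end stays False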
 # check that the programs we need are available
 try:
     subprocess.call(shlex.split("coverageBed -h"),
                     stdout=open(os.devnull, 'wb'),
@@ -71,262 +56,149 @@
                     stderr=open(os.devnull, 'wb'))
 except OSError:
     print("Error: Bowtie or bedtools not loaded")
     raise
 
-# define a csv reader that reads space deliminated files
-print('Preparing for analysis using RepEnrich...')
-csv.field_size_limit(sys.maxsize)
-
-
+
+def starts_with_numerical(list):
+    try:
+        if len(list) == 0:
+            return False
+        int(list[0])
+        return True
+    except ValueError:
+        return False
+
+
+# define a text importer for .out/.txt format of repbase
 def import_text(filename, separator):
-    for line in csv.reader(open(filename), delimiter=separator,
-                           skipinitialspace=True):
-        if line:
-            yield line
+    csv.field_size_limit(sys.maxsize)
+    file = csv.reader(open(filename), delimiter=separator,
+                      skipinitialspace=True)
+    return [line for line in file if starts_with_numerical(line)]
 
 
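The rewritten import_text no longer yields rows lazily; it returns a list
and uses starts_with_numerical to drop RepeatMasker header lines, whose
first field is not an integer. A toy illustration (values invented):

    # Header rows and empty lines are filtered out; data rows, whose first
    # field is the numeric Smith-Waterman score, are kept.
    rows = [['SW', 'perc', 'perc', 'perc'],      # header -> dropped
            [],                                  # empty line -> dropped
            ['463', '1.3', '0.6', '1.7', 'chr1', '10001', '10468',
             '(249240153)', '+', '(TAACCC)n', 'Simple_repeat']]
    kept = [row for row in rows if starts_with_numerical(row)]
    assert len(kept) == 1 and kept[0][9] == '(TAACCC)n'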
-# build dictionaries to convert repclass and rep families
-repeatclass, repeatfamily = {}, {}
+def run_bowtie(args):
+    metagenome, fastqfile = args
+    b_opt = "-k 1 -p 1 --quiet"
+    command = shlex.split(f"bowtie {b_opt} -x {metagenome} {fastqfile}")
+    bowtie_align = subprocess.run(command, check=True,
+                                  capture_output=True, text=True).stdout
+    bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
+    readlist = [metagenome]
+    if paired_end:
+        for line in bowtie_align:
+            readlist.append(line.split("/")[0])
+    else:
+        for line in bowtie_align:
+            readlist.append(line.split("\t")[0])
+    return readlist
+
+
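run_bowtie now returns read identifiers in memory instead of writing
.bowtie files to disk. Bowtie's default output is tab-separated with the
read name first; paired-end names carry a /1 or /2 suffix, which the
split("/") strips so that both mates collapse to one identifier. For
example (invented read names):

    se_line = "read42\t+\tL1PA4\t1830\tACGT"      # single-end record
    pe_line = "read42/1\t+\tL1PA4\t1830\tACGT"    # paired-end record
    assert se_line.split("\t")[0] == "read42"
    assert pe_line.split("/")[0] == "read42"      # /1 and /2 collapse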
+# set a reference repeat list for the script
+repeat_list = [listline[9].translate(
+    str.maketrans(
+        '()/', '___')) for listline in import_text(annotation_file, ' ')]
+repeat_list = sorted(list(set(repeat_list)))
+
+# unique mapper counting
+cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \
+bedtools coverage -b stdin -a repnames.bed"
+p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n')
+
+# parse bedtools output
+counts = defaultdict(int)  # key: repeat names, value: unique mapper counts
+sumofrepeatreads = 0
+for line in bedtools_counts:
+    line = line.split('\t')
+    counts[line[3]] += int(line[4])
+    sumofrepeatreads += int(line[4])
+print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.")
+
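bedtools coverage reports one row per interval of repnames.bed, appending
the number of overlapping reads after the interval's chrom, start, end and
name columns, hence line[3] and line[4] above. A made-up row:

    # chrom, start, end, repeat name, overlapping reads, covered bases,
    # interval length, fraction covered
    row = "chr1\t10001\t10468\tAluY\t12\t467\t467\t1.0000000"
    fields = row.split('\t')
    name, n_reads = fields[3], int(fields[4])     # ('AluY', 12)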
+# multimapper parsing
+if not paired_end:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+else:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    args_list.extend([(metagenome, fastqfile_2) for
+                      metagenome in repeat_list])
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+
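executor.map hands one (pseudogenome, fastq) tuple to run_bowtie per worker
process and returns the result lists to the parent in submission order, so
the shared dictionaries in the aggregation step that follows are only ever
written by the parent process. The same pattern in a self-contained sketch:

    from concurrent.futures import ProcessPoolExecutor

    def square(task):                  # stand-in for run_bowtie
        return task * task

    if __name__ == '__main__':         # guard required for process pools
        with ProcessPoolExecutor(max_workers=2) as executor:
            print(list(executor.map(square, [1, 2, 3])))   # [1, 4, 9]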
+# Aggregate results (avoiding race conditions)
+metagenome_reads = defaultdict(list)  # repeat_name: list of multimap reads
+for result in results:
+    metagenome_reads[result[0]] += result[1:]
+
+for name in metagenome_reads:
+    # reads appear only once in the list
+    metagenome_reads[name] = list(set(metagenome_reads[name]))
+    # remove "no read" instances
+    metagenome_reads[name] = [read for read in metagenome_reads[name]
+                              if read != ""]
+
+# implement repeats_by_reads from the inverse dictionary metagenome_reads
+repeats_by_reads = defaultdict(list)  # readids: list of repeats names
+for repname in metagenome_reads:
+    for read in metagenome_reads[repname]:
+        repeats_by_reads[read].append(repname)
+for repname in repeats_by_reads:
+    repeats_by_reads[repname] = list(set(repeats_by_reads[repname]))
+
+# 3 dictionaries and 1 pointer variable to be populated
+fractionalcounts = defaultdict(float)
+familyfractionalcounts = defaultdict(float)
+classfractionalcounts = defaultdict(float)
+sumofrepeatreads = 0
+
+# Update counts dictionary with sets of repeats (was "subfamilies")
+# matched by multimappers
+for repeat_set in repeats_by_reads.values():
+    repeat_set_string = ','.join(repeat_set)
+    counts[repeat_set_string] += 1
+    sumofrepeatreads += 1
+
+print(f'Identified {sumofrepeatreads} additional multimapper repeat reads')
+
+# Populate fractionalcounts
+for key, count in counts.items():
+    key_list = key.split(',')
+    for i in key_list:
+        fractionalcounts[i] += count / len(key_list)
+
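The fractional counting rule: a read matched by a set of n repeats
contributes 1/n to each member, so ambiguous multimappers are shared rather
than double-counted. A worked toy example (repeat names illustrative):

    from collections import defaultdict

    counts = {'L1PA4': 10,         # reads matching a single repeat
              'L1PA4,AluY': 3}     # reads shared by two repeats
    fractionalcounts = defaultdict(float)
    for key, count in counts.items():
        key_list = key.split(',')
        for i in key_list:
            fractionalcounts[i] += count / len(key_list)
    assert fractionalcounts == {'L1PA4': 11.5, 'AluY': 1.5}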
+# build repeat_ref for easy access to rep class and rep families
+repeat_ref = defaultdict(dict)
 repeats = import_text(annotation_file, ' ')
-# skip three first lines of the iterator
-for line in range(3):
-    next(repeats)
 for repeat in repeats:
-    classfamily = []
-    classfamily = repeat[10].split('/')
-    matching_repeat = repeat[9].translate(str.maketrans('()/', '___'))
-    repeatclass[matching_repeat] = classfamily[0]
-    if len(classfamily) == 2:
-        repeatfamily[matching_repeat] = classfamily[1]
-    else:
-        repeatfamily[matching_repeat] = classfamily[0]
+    repeat_name = repeat[9].translate(str.maketrans('()/', '___'))
+    try:
+        repclass = repeat[10].split('/')[0]
+        repfamily = repeat[10].split('/')[1]
+    except IndexError:
+        repclass, repfamily = repeat[10], repeat[10]
+    repeat_ref[repeat_name]['class'] = repclass
+    repeat_ref[repeat_name]['family'] = repfamily
 
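RepeatMasker stores class and family as one 'class/family' token; names
without a slash (e.g. Simple_repeat) fall back to the same string for both,
which is what the IndexError branch handles:

    for token in ('LINE/L1', 'Simple_repeat'):
        try:
            repclass, repfamily = token.split('/')[0], token.split('/')[1]
        except IndexError:
            repclass, repfamily = token, token
        print(token, '->', repclass, repfamily)
    # LINE/L1 -> LINE L1
    # Simple_repeat -> Simple_repeat Simple_repeat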
-# build list of repeats initializing dictionaries for downstream analysis'
-repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt')
-reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
-classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatclass}
-classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatclass}
+# Populate classfractionalcounts and familyfractionalcounts
+for key, value in fractionalcounts.items():
+    classfractionalcounts[repeat_ref[key]['class']] += value
+    familyfractionalcounts[repeat_ref[key]['family']] += value
+
+# print class-, family- and fraction-repeats counts to files
+with open("class_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(classfractionalcounts):
-familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatfamily}
-familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
-    repgenome_path, '\t') if line[0] in repeatfamily}
-reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and
-                          repeatfamily[line[0]] == simple_repeat else
-                          line[0]): 0 for line in import_text(
-                              repgenome_path, '\t')}
-repeat_key = {line[0]: int(line[1]) for line in import_text(
-    repgenome_path, '\t')}
-rev_repeat_key = {int(line[1]): line[0] for line in import_text(
-    repgenome_path, '\t')}
-repeat_list = [line[0] for line in import_text(repgenome_path, '\t')]
-
-# map the repeats to the psuedogenomes:
-if not os.path.exists(outputfolder):
-    os.mkdir(outputfolder)
-
-# Conduct the regions sorting
-fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt")
-command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \
-{os.path.join(setup_folder, 'repnames.bed')}")
-with open(fileout, 'w') as stdout:
-    subprocess.run(command, stdout=stdout, check=True)
-
-counts = {}
-sumofrepeatreads = 0
-with open(fileout) as filein:
-    for line in filein:
-        line = line.split('\t')
-        if not str(repeat_key[line[3]]) in counts:
-            counts[str(repeat_key[line[3]])] = 0
-        counts[str(repeat_key[line[3]])] += int(line[4])
-        sumofrepeatreads += int(line[4])
-print(f"Identified {sumofrepeatreads} unique reads that \
-mapped to repeats.")
-
-
-def run_bowtie(metagenome, fastqfile, folder):
-    metagenomepath = os.path.join(setup_folder, metagenome)
-    output_file = os.path.join(folder, f"{metagenome}.bowtie")
-    command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}")
-    with open(output_file, 'w') as stdout:
-        return subprocess.Popen(command, stdout=stdout)
-
-
-if paired_end == 'FALSE':
-    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
-    os.makedirs(folder_pair1, exist_ok=True)
-
-    print("Processing repeat pseudogenomes...")
-    processes = []
-    ticker = 0
-
-    for metagenome in repeat_list:
-        processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
-        ticker += 1
-        if ticker == cpus:
-            for p in processes:
-                p.communicate()
-            ticker = 0
-            processes = []
-
-    for p in processes:
-        p.communicate()
-    # Combine the output from both read pairs:
-    print('Sorting and combining the output for both read pairs....')
-    sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
-    os.makedirs(sorted_bowtie, exist_ok=True)
-    for metagenome in repeat_list:
-        file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie")
-        fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie")
-        with open(fileout, 'w') as stdout:
-            p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
-            p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
-                                  stdout=subprocess.PIPE)
-            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-                                  stdout=subprocess.PIPE)
-            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-                                  stdout=subprocess.PIPE)
-            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-            p5.communicate()
-        stdout.close()
-    print('completed ...')
-else:
-    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
-    folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie')
-    os.makedirs(folder_pair1, exist_ok=True)
-    os.makedirs(folder_pair2, exist_ok=True)
-
-    print("Processing repeat pseudogenomes...")
-    ps, psb, ticker = [], [], 0
-
-    for metagenome in repeat_list:
-        ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
-        ticker += 1
-        if fastqfile_2 != 'none':
-            psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2))
-            ticker += 1
-        if ticker >= cpus:
-            for p in ps:
-                p.communicate()
-            for p in psb:
-                p.communicate()
-            ticker = 0
-            ps = []
-            psb = []
-
-    for p in ps:
-        p.communicate()
-    for p in psb:
-        p.communicate()
-    # combine the output from both read pairs:
-    print('Sorting and combining the output for both read pairs...')
-    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
-        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
-    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
-    for metagenome in repeat_list:
-        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
-        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
-        with open(fileout, 'w') as stdout:
-            p1 = subprocess.Popen(['cat', file1, file2],
-                                  stdout=subprocess.PIPE)
-            p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout,
-                                  stdout=subprocess.PIPE)
-            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-                                  stdout=subprocess.PIPE)
-            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-                                  stdout=subprocess.PIPE)
-            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-            p5.communicate()
-        stdout.close()
-    print('completed ...')
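The five-process chain deleted above was a shell pipeline spelled out with
subprocess; for reference, an equivalent compact form (hypothetical input
file), which the in-memory run_bowtie/set() logic now replaces:

    import subprocess

    # cat .bowtie output | keep first field | strip /1,/2 | sort | uniq
    pipeline = ("cat pair1_bowtie/AluY.bowtie | cut -f1 | cut -f1 -d/ "
                "| sort | uniq")
    read_ids = subprocess.run(pipeline, shell=True, capture_output=True,
                              text=True).stdout.splitlines()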
-
-# build a file of repeat keys for all reads
-print('Writing and processing intermediate files...')
-sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
-sumofrepeatreads = 0
-readid = {}
-
-for rep in repeat_list:
-    for data in import_text(
-            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
-        readid[data[0]] = ''
-
-for rep in repeat_list:
-    for data in import_text(
-            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
-        readid[data[0]] += f"{repeat_key[rep]},"
-
-for subfamilies in readid.values():
-    if subfamilies not in counts:
-        counts[subfamilies] = 0
-    counts[subfamilies] += 1
-    sumofrepeatreads += 1
-
-print(f'Identified {sumofrepeatreads} reads that mapped to \
-repeats for unique and multimappers.')
-print("Conducting final calculations...")
-
-# building the total counts for repeat element enrichment...
-for x in counts.keys():
-    count = counts[x]
-    x = x.strip(',').split(',')
-    for i in x:
-        reptotalcounts[rev_repeat_key[int(i)]] += int(count)
-# building the fractional counts for repeat element enrichment...
-for x in counts.keys():
-    count = counts[x]
-    x = x.strip(',').split(',')
-    splits = len(x)
-    for i in x:
-        fractionalcounts[rev_repeat_key[int(i)]] += float(
-            numpy.divide(float(count), float(splits)))
-# building categorized table of repeat element enrichment...
-repcounts = {}
-repcounts['other'] = 0
-for key in counts.keys():
-    key_list = key.strip(',').split(',')
-    repname = ''
-    for i in key_list:
-        repname = os.path.join(repname, rev_repeat_key[int(i)])
-    repcounts[repname] = counts[key]
-# building the total counts for class enrichment...
-for key in reptotalcounts.keys():
-    classtotalcounts[repeatclass[key]] += reptotalcounts[key]
-# building total counts for family enrichment...
-for key in reptotalcounts.keys():
-    familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
-# building unique counts table
-repcounts2 = {}
-for rep in repeat_list:
-    if "/" + rep in repcounts:
-        repcounts2[rep] = repcounts["/" + rep]
-    else:
-        repcounts2[rep] = 0
-# building the fractionalcounts counts for class enrichment...
-for key in fractionalcounts.keys():
-    classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
-# building fractional counts for family enrichment...
-for key in fractionalcounts.keys():
-    familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]
-
-# print output to file of the categorized counts and total overlapping counts:
-print('Writing final output...')
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"class_fraction_counts.txt", 'w') as fout:
-    for key in sorted(classfractionalcounts.keys()):
         fout.write(f"{key}\t{classfractionalcounts[key]}\n")
 
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"family_fraction_counts.txt", 'w') as fout:
-    for key in sorted(familyfractionalcounts.keys()):
+with open("family_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(familyfractionalcounts):
         fout.write(f"{key}\t{familyfractionalcounts[key]}\n")
 
-with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
-          f"fraction_counts.txt", 'w') as fout:
-    for key in sorted(fractionalcounts.keys()):
-        fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t"
-                   f"{int(fractionalcounts[key])}\n")
+with open("fraction_counts.tsv", 'w') as fout:
+    for key in sorted(fractionalcounts):
+        fout.write(f"{key}\t{repeat_ref[key]['class']}\t"
+                   f"{repeat_ref[key]['family']}\t"
+                   f"{fractionalcounts[key]}\n")
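The commit thus writes fixed-name TSVs into the working directory instead
of prefixed .txt files under --outputfolder. A minimal reader sketch for
the four-column fraction_counts.tsv (name, class, family, fractional
count) that downstream steps could use:

    import csv

    with open("fraction_counts.tsv") as f:
        fraction = {name: float(count) for name, repclass, family, count
                    in csv.reader(f, delimiter='\t')}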