Mercurial > repos > artbio > repenrich
view RepEnrich.py @ 12:89e05f831259 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
author | artbio |
---|---|
date | Mon, 18 Mar 2024 09:39:44 +0000 |
parents | 6f4143893463 |
children | 530626b0757c |
line wrap: on
line source
import argparse import csv import os import shlex import subprocess import sys import numpy parser = argparse.ArgumentParser(description=''' Repenrich aligns reads to Repeat Elements pseudogenomes\ and counts aligned reads. RepEnrich_setup must be run\ before its use''') parser.add_argument('--annotation_file', action='store', metavar='annotation_file', help='RepeatMasker.org annotation file for your\ organism. The file may be downloaded from\ RepeatMasker.org. E.g. hg19_repeatmasker.txt') parser.add_argument('--outputfolder', action='store', metavar='outputfolder', help='Folder that will contain results. Should be the\ same as the one used for RepEnrich_setup.\ Example: ./outputfolder') parser.add_argument('--outputprefix', action='store', metavar='outputprefix', help='Prefix name for Repenrich output files.') parser.add_argument('--setup_folder', action='store', metavar='setup_folder', help='Folder produced by RepEnrich_setup which contains\ repeat element pseudogenomes.') parser.add_argument('--fastqfile', action='store', metavar='fastqfile', help='File of fastq reads mapping to multiple\ locations. Example: /data/multimap.fastq') parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam', help='Bam alignments of unique mapper reads.') parser.add_argument('--pairedend', action='store', dest='pairedend', default='FALSE', help='Change to TRUE for paired-end fastq files.\ Default FALSE') parser.add_argument('--fastqfile2', action='store', dest='fastqfile2', metavar='fastqfile2', default='none', help='fastqfile #2 when using paired-end option.\ Default none') parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', default="1", type=int, help='Number of CPUs. The more cpus the\ faster RepEnrich performs. Default: "1"') args = parser.parse_args() # parameters annotation_file = args.annotation_file outputfolder = args.outputfolder outputfile_prefix = args.outputprefix setup_folder = args.setup_folder repeat_bed = os.path.join(setup_folder, 'repnames.bed') unique_mapper_bam = args.alignment_bam fastqfile_1 = args.fastqfile fastqfile_2 = args.fastqfile2 cpus = args.cpus b_opt = "-k1 -p 1 --quiet" # Change if simple repeats are differently annotated in your organism simple_repeat = "Simple_repeat" paired_end = args.pairedend # check that the programs we need are available try: subprocess.call(shlex.split("coverageBed -h"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) except OSError: print("Error: Bowtie or bedtools not loaded") raise # define a csv reader that reads space deliminated files print('Preparing for analysis using RepEnrich...') csv.field_size_limit(sys.maxsize) def import_text(filename, separator): for line in csv.reader(open(filename), delimiter=separator, skipinitialspace=True): if line: yield line # build dictionaries to convert repclass and rep families repeatclass, repeatfamily = {}, {} repeats = import_text(annotation_file, ' ') # skip three first lines of the iterator for line in range(3): next(repeats) for repeat in repeats: classfamily = [] classfamily = repeat[10].split('/') matching_repeat = repeat[9].translate(str.maketrans('()/', '___')) repeatclass[matching_repeat] = classfamily[0] if len(classfamily) == 2: repeatfamily[matching_repeat] = classfamily[1] else: repeatfamily[matching_repeat] = classfamily[0] # build list of repeats initializing dictionaries for downstream analysis' repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt') reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')} fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')} classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text( repgenome_path, '\t') if line[0] in repeatclass} classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text( repgenome_path, '\t') if line[0] in repeatclass} familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text( repgenome_path, '\t') if line[0] in repeatfamily} familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text( repgenome_path, '\t') if line[0] in repeatfamily} reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and repeatfamily[line[0]] == simple_repeat else line[0]): 0 for line in import_text( repgenome_path, '\t')} repeat_key = {line[0]: int(line[1]) for line in import_text( repgenome_path, '\t')} rev_repeat_key = {int(line[1]): line[0] for line in import_text( repgenome_path, '\t')} repeat_list = [line[0] for line in import_text(repgenome_path, '\t')] # map the repeats to the psuedogenomes: if not os.path.exists(outputfolder): os.mkdir(outputfolder) # Conduct the regions sorting fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt") command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \ {os.path.join(setup_folder, 'repnames.bed')}") with open(fileout, 'w') as stdout: subprocess.run(command, stdout=stdout, check=True) counts = {} sumofrepeatreads = 0 with open(fileout) as filein: for line in filein: line = line.split('\t') if not str(repeat_key[line[3]]) in counts: counts[str(repeat_key[line[3]])] = 0 counts[str(repeat_key[line[3]])] += int(line[4]) sumofrepeatreads += int(line[4]) print(f"Identified {sumofrepeatreads} unique reads that \ mapped to repeats.") def run_bowtie(metagenome, fastqfile, folder): metagenomepath = os.path.join(setup_folder, metagenome) output_file = os.path.join(folder, f"{metagenome}.bowtie") command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}") with open(output_file, 'w') as stdout: return subprocess.Popen(command, stdout=stdout) if paired_end == 'FALSE': folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie') os.makedirs(folder_pair1, exist_ok=True) print("Processing repeat pseudogenomes...") processes = [] ticker = 0 for metagenome in repeat_list: processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1)) ticker += 1 if ticker == cpus: for p in processes: p.communicate() ticker = 0 processes = [] for p in processes: p.communicate() # Combine the output from both read pairs: print('Sorting and combining the output for both read pairs....') sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie') os.makedirs(sorted_bowtie, exist_ok=True) for metagenome in repeat_list: file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie") fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie") with open(fileout, 'w') as stdout: p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE) p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, stdout=subprocess.PIPE) p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, stdout=subprocess.PIPE) p4 = subprocess.Popen(['sort'], stdin=p3.stdout, stdout=subprocess.PIPE) p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) p5.communicate() stdout.close() print('completed ...') else: folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie') folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie') os.makedirs(folder_pair1, exist_ok=True) os.makedirs(folder_pair2, exist_ok=True) print("Processing repeat pseudogenomes...") ps, psb, ticker = [], [], 0 for metagenome in repeat_list: ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1)) ticker += 1 if fastqfile_2 != 'none': psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2)) ticker += 1 if ticker >= cpus: for p in ps: p.communicate() for p in psb: p.communicate() ticker = 0 ps = [] psb = [] for p in ps: p.communicate() for p in psb: p.communicate() # combine the output from both read pairs: print('Sorting and combining the output for both read pairs...') if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' for metagenome in repeat_list: file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie' with open(fileout, 'w') as stdout: p1 = subprocess.Popen(['cat', file1, file2], stdout=subprocess.PIPE) p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout, stdout=subprocess.PIPE) p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, stdout=subprocess.PIPE) p4 = subprocess.Popen(['sort'], stdin=p3.stdout, stdout=subprocess.PIPE) p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) p5.communicate() stdout.close() print('completed ...') # build a file of repeat keys for all reads print('Writing and processing intermediate files...') sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie') sumofrepeatreads = 0 readid = {} for rep in repeat_list: for data in import_text( f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'): readid[data[0]] = '' for rep in repeat_list: for data in import_text( f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'): readid[data[0]] += f"{repeat_key[rep]}," for subfamilies in readid.values(): if subfamilies not in counts: counts[subfamilies] = 0 counts[subfamilies] += 1 sumofrepeatreads += 1 print(f'Identified {sumofrepeatreads} reads that mapped to \ repeats for unique and multimappers.') print("Conducting final calculations...") # building the total counts for repeat element enrichment... for x in counts.keys(): count = counts[x] x = x.strip(',').split(',') for i in x: reptotalcounts[rev_repeat_key[int(i)]] += int(count) # building the fractional counts for repeat element enrichment... for x in counts.keys(): count = counts[x] x = x.strip(',') .split(',') splits = len(x) for i in x: fractionalcounts[rev_repeat_key[int(i)]] += float( numpy.divide(float(count), float(splits))) # building categorized table of repeat element enrichment... repcounts = {} repcounts['other'] = 0 for key in counts.keys(): key_list = key.strip(',').split(',') repname = '' for i in key_list: repname = os.path.join(repname, rev_repeat_key[int(i)]) repcounts[repname] = counts[key] # building the total counts for class enrichment... for key in reptotalcounts.keys(): classtotalcounts[repeatclass[key]] += reptotalcounts[key] # building total counts for family enrichment... for key in reptotalcounts.keys(): familytotalcounts[repeatfamily[key]] += reptotalcounts[key] # building unique counts table repcounts2 = {} for rep in repeat_list: if "/" + rep in repcounts: repcounts2[rep] = repcounts["/" + rep] else: repcounts2[rep] = 0 # building the fractionalcounts counts for class enrichment... for key in fractionalcounts.keys(): classfractionalcounts[repeatclass[key]] += fractionalcounts[key] # building fractional counts for family enrichment... for key in fractionalcounts.keys(): familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key] # print output to file of the categorized counts and total overlapping counts: print('Writing final output...') with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" f"class_fraction_counts.txt", 'w') as fout: for key in sorted(classfractionalcounts.keys()): fout.write(f"{key}\t{classfractionalcounts[key]}\n") with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" f"family_fraction_counts.txt", 'w') as fout: for key in sorted(familyfractionalcounts.keys()): fout.write(f"{key}\t{familyfractionalcounts[key]}\n") with open(f"{os.path.join(outputfolder, outputfile_prefix)}_" f"fraction_counts.txt", 'w') as fout: for key in sorted(fractionalcounts.keys()): fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t" f"{int(fractionalcounts[key])}\n")