changeset 12:89e05f831259 draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
author artbio
date Mon, 18 Mar 2024 09:39:44 +0000
parents 6bba3e33c2e7
children
files RepEnrich.py RepEnrich_setup.py macros.xml repenrich.xml
diffstat 4 files changed, 270 insertions(+), 548 deletions(-) [+]
line wrap: on
line diff
--- a/RepEnrich.py	Sat Mar 09 22:32:46 2024 +0000
+++ b/RepEnrich.py	Mon Mar 18 09:39:44 2024 +0000
@@ -2,7 +2,6 @@
 import csv
 import os
 import shlex
-import shutil
 import subprocess
 import sys
 
@@ -10,86 +9,41 @@
 
 
 parser = argparse.ArgumentParser(description='''
-             Part II: Conducting the alignments to the psuedogenomes. Before\
-             doing this step you will require 1) a bamfile of the unique\
-             alignments with index 2) a fastq file of the reads mapping to\
-             more than one location. These files can be obtained using the\
-             following bowtie options [EXAMPLE: bowtie -S -m 1\
-             --max multimap.fastq mm9 mate1_reads.fastq]  Once you have the\
-             unique alignment bamfile and the reads mapping to more than one\
-             location in a fastq file you can run this step.  EXAMPLE: python\
-             master_output.py\
-             /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\
-             /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/
-             HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\
-             /users/nneretti/data/annotation/hg19/setup_folder\
-             HeLa_InputChIPseq_Rep1_multimap.fastq\
-             HeLa_InputChIPseq_Rep1.bam''')
-parser.add_argument('--version', action='version', version='%(prog)s 0.1')
-parser.add_argument('annotation_file', action='store',
+             RepEnrich aligns reads to Repeat Element pseudogenomes\
+             and counts aligned reads. RepEnrich_setup must be run\
+             before its use''')
+parser.add_argument('--annotation_file', action='store',
                     metavar='annotation_file',
-                    help='List RepeatMasker.org annotation file for your\
-                          organism.  The file may be downloaded from the\
-                          RepeatMasker.org website. Example:\
-                          /data/annotation/hg19/hg19_repeatmasker.txt')
-parser.add_argument('outputfolder', action='store', metavar='outputfolder',
-                    help='List folder to contain results.\
-                          Example: /outputfolder')
-parser.add_argument('outputprefix', action='store', metavar='outputprefix',
-                    help='Enter prefix name for data.\
-                           Example: HeLa_InputChIPseq_Rep1')
-parser.add_argument('setup_folder', action='store', metavar='setup_folder',
-                    help='List folder that contains the repeat element\
-                          pseudogenomes.\
-                          Example: /data/annotation/hg19/setup_folder')
-parser.add_argument('fastqfile', action='store', metavar='fastqfile',
-                    help='Enter file for the fastq reads that map to multiple\
+                    help='RepeatMasker.org annotation file for your\
+                          organism. The file may be downloaded from\
+                          RepeatMasker.org. E.g. hg19_repeatmasker.txt')
+parser.add_argument('--outputfolder', action='store', metavar='outputfolder',
+                    help='Folder that will contain results. Should be the\
+                          same as the one used for RepEnrich_setup.\
+                          Example: ./outputfolder')
+parser.add_argument('--outputprefix', action='store', metavar='outputprefix',
+                    help='Prefix name for Repenrich output files.')
+parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
+                    help='Folder produced by RepEnrich_setup which contains\
+                    repeat element pseudogenomes.')
+parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
+                    help='File of fastq reads mapping to multiple\
                           locations. Example: /data/multimap.fastq')
-parser.add_argument('alignment_bam', action='store', metavar='alignment_bam',
-                    help='Enter bamfile output for reads that map uniquely.\
-                    Example /bamfiles/old.bam')
+parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
+                    help='Bam alignments of unique mapper reads.')
 parser.add_argument('--pairedend', action='store', dest='pairedend',
                     default='FALSE',
-                    help='Designate this option for paired-end sequencing.\
-                          Default FALSE change to TRUE')
-parser.add_argument('--collapserepeat', action='store', dest='collapserepeat',
-                    metavar='collapserepeat', default='Simple_repeat',
-                    help='Designate this option to generate a collapsed repeat\
-                          type. Uncollapsed output is generated in addition to\
-                          collapsed repeat type. Simple_repeat is default to\
-                          simplify downstream analysis. You can change the\
-                          default to another repeat name to collapse a\
-                          seperate specific repeat instead or if the name of\
-                          Simple_repeat is different for your organism.\
-                          Default Simple_repeat')
+                    help='Change to TRUE for paired-end fastq files.\
+                          Default FALSE')
 parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
                     metavar='fastqfile2', default='none',
-                    help='Enter fastqfile2 when using paired-end option.\
+                    help='fastqfile #2 when using paired-end option.\
                           Default none')
 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
                     default="1", type=int,
-                    help='Enter available cpus per node.  The more cpus the\
-                          faster RepEnrich performs. RepEnrich is designed to\
-                          only work on one node. Default: "1"')
-parser.add_argument('--allcountmethod', action='store', dest='allcountmethod',
-                    metavar='allcountmethod', default="FALSE",
-                    help='By default the pipeline only outputs the fraction\
-                          count method.  Consdidered to be the best way to\
-                          count multimapped reads. Changing this option will\
-                          include the unique count method, a conservative\
-                          count, and the total count method, a liberal\
-                          counting strategy. Our evaluation of simulated data\
-                          indicated fraction counting is best.\
-                          Default = FALSE, change to TRUE')
-parser.add_argument('--is_bed', action='store', dest='is_bed',
-                    metavar='is_bed', default='FALSE',
-                    help='Is the annotation file a bed file.\
-                         This is also a compatible format. The file needs to\
-                         be a tab seperated bed with optional fields.\
-                         Ex. format: chr\tstart\tend\tName_element\tclass\
-                         \tfamily. The class and family should identical to\
-                         name_element if not applicable.\
-                         Default FALSE change to TRUE')
+                    help='Number of CPUs. The more cpus the\
+                          faster RepEnrich performs. Default: "1"')
+
 args = parser.parse_args()
 
 # parameters
@@ -97,18 +51,16 @@
 outputfolder = args.outputfolder
 outputfile_prefix = args.outputprefix
 setup_folder = args.setup_folder
-repeat_bed = setup_folder + os.path.sep + 'repnames.bed'
+repeat_bed = os.path.join(setup_folder, 'repnames.bed')
 unique_mapper_bam = args.alignment_bam
 fastqfile_1 = args.fastqfile
 fastqfile_2 = args.fastqfile2
 cpus = args.cpus
-b_opt = "-k1 -p " + str(1) + " --quiet"
-simple_repeat = args.collapserepeat
+b_opt = "-k1 -p 1 --quiet"
+# Change if simple repeats are differently annotated in your organism
+simple_repeat = "Simple_repeat"
 paired_end = args.pairedend
-allcountmethod = args.allcountmethod
-is_bed = args.is_bed
 
-##############################################################################
 # check that the programs we need are available
 try:
     subprocess.call(shlex.split("coverageBed -h"),
@@ -118,10 +70,9 @@
                     stdout=open(os.devnull, 'wb'),
                     stderr=open(os.devnull, 'wb'))
 except OSError:
-    print("Error: Bowtie or BEDTools not loaded")
+    print("Error: Bowtie or bedtools not loaded")
     raise
 
-##############################################################################
 # define a csv reader that reads space deliminated files
 print('Preparing for analysis using RepEnrich...')
 csv.field_size_limit(sys.maxsize)
@@ -134,142 +85,144 @@
             yield line
 
 
-##############################################################################
-# build dictionaries to convert repclass and rep families'
-if is_bed == "FALSE":
-    repeatclass = {}
-    repeatfamily = {}
-    fin = import_text(annotation_file, ' ')
-    x = 0
-    for line in fin:
-        if x > 2:
-            classfamily = []
-            classfamily = line[10].split(os.path.sep)
-            line9 = line[9].replace("(", "_").replace(
-                                    ")", "_").replace("/", "_")
-            repeatclass[line9] = classfamily[0]
-            if len(classfamily) == 2:
-                repeatfamily[line9] = classfamily[1]
-            else:
-                repeatfamily[line9] = classfamily[0]
-        x += 1
-if is_bed == "TRUE":
-    repeatclass = {}
-    repeatfamily = {}
-    fin = open(annotation_file, 'r')
-    for line in fin:
-        line = line.strip('\n')
-        line = line.split('\t')
-        theclass = line[4]
-        thefamily = line[5]
-        line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
-        repeatclass[line3] = theclass
-        repeatfamily[line3] = thefamily
-fin.close()
+# build dictionaries to convert repclass and rep families
+repeatclass, repeatfamily = {}, {}
+repeats = import_text(annotation_file, ' ')
+# skip three first lines of the iterator
+for line in range(3):
+    next(repeats)
+for repeat in repeats:
+    classfamily = []
+    classfamily = repeat[10].split('/')
+    matching_repeat = repeat[9].translate(str.maketrans('()/', '___'))
+    repeatclass[matching_repeat] = classfamily[0]
+    if len(classfamily) == 2:
+        repeatfamily[matching_repeat] = classfamily[1]
+    else:
+        repeatfamily[matching_repeat] = classfamily[0]
 
-##############################################################################
 # build list of repeats initializing dictionaries for downstream analysis'
-fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t')
-repeat_key = {}
-rev_repeat_key = {}
-repeat_list = []
-reptotalcounts = {}
-classfractionalcounts = {}
-familyfractionalcounts = {}
-classtotalcounts = {}
-familytotalcounts = {}
-reptotalcounts_simple = {}
-fractionalcounts = {}
-i = 0
-for line in fin:
-    reptotalcounts[line[0]] = 0
-    fractionalcounts[line[0]] = 0
-    if line[0] in repeatclass:
-        classtotalcounts[repeatclass[line[0]]] = 0
-        classfractionalcounts[repeatclass[line[0]]] = 0
-    if line[0] in repeatfamily:
-        familytotalcounts[repeatfamily[line[0]]] = 0
-        familyfractionalcounts[repeatfamily[line[0]]] = 0
-    if line[0] in repeatfamily:
-        if repeatfamily[line[0]] == simple_repeat:
-            reptotalcounts_simple[simple_repeat] = 0
-    else:
-        reptotalcounts_simple[line[0]] = 0
-    repeat_list.append(line[0])
-    repeat_key[line[0]] = int(line[1])
-    rev_repeat_key[int(line[1])] = line[0]
-fin.close()
-##############################################################################
+repgenome_path = os.path.join(setup_folder, 'repgenomes_key.txt')
+reptotalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
+fractionalcounts = {line[0]: 0 for line in import_text(repgenome_path, '\t')}
+classtotalcounts = {repeatclass[line[0]]: 0 for line in import_text(
+    repgenome_path, '\t') if line[0] in repeatclass}
+classfractionalcounts = {repeatclass[line[0]]: 0 for line in import_text(
+    repgenome_path, '\t') if line[0] in repeatclass}
+familytotalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
+    repgenome_path, '\t') if line[0] in repeatfamily}
+familyfractionalcounts = {repeatfamily[line[0]]: 0 for line in import_text(
+    repgenome_path, '\t') if line[0] in repeatfamily}
+reptotalcounts_simple = {(simple_repeat if line[0] in repeatfamily and
+                          repeatfamily[line[0]] == simple_repeat else
+                          line[0]): 0 for line in import_text(
+                              repgenome_path, '\t')}
+repeat_key = {line[0]: int(line[1]) for line in import_text(
+    repgenome_path, '\t')}
+rev_repeat_key = {int(line[1]): line[0] for line in import_text(
+    repgenome_path, '\t')}
+repeat_list = [line[0] for line in import_text(repgenome_path, '\t')]
+
 # map the repeats to the psuedogenomes:
 if not os.path.exists(outputfolder):
     os.mkdir(outputfolder)
-##############################################################################
+
 # Conduct the regions sorting
-print('Conducting region sorting on unique mapping reads....')
-fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt'
+fileout = os.path.join(outputfolder, f"{outputfile_prefix}_regionsorter.txt")
+command = shlex.split(f"coverageBed -abam {unique_mapper_bam} -b \
+                        {os.path.join(setup_folder, 'repnames.bed')}")
 with open(fileout, 'w') as stdout:
-    command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " +
-                          setup_folder + os.path.sep + 'repnames.bed')
-    p = subprocess.Popen(command, stdout=stdout)
-p.communicate()
-stdout.close()
-filein = open(outputfolder + os.path.sep + outputfile_prefix
-              + '_regionsorter.txt', 'r')
+    subprocess.run(command, stdout=stdout, check=True)
+
 counts = {}
 sumofrepeatreads = 0
-for line in filein:
-    line = line.split('\t')
-    if not str(repeat_key[line[3]]) in counts:
-        counts[str(repeat_key[line[3]])] = 0
-    counts[str(repeat_key[line[3]])] += int(line[4])
-    sumofrepeatreads += int(line[4])
-print('Identified ' + str(sumofrepeatreads) +
-      'unique reads that mapped to repeats.')
-##############################################################################
-if paired_end == 'TRUE':
-    if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
-        os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
-    if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
-        os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie')
-    folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
-    folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie'
-##############################################################################
-    print("Processing repeat psuedogenomes...")
-    ps = []
-    psb = []
+with open(fileout) as filein:
+    for line in filein:
+        line = line.split('\t')
+        if not str(repeat_key[line[3]]) in counts:
+            counts[str(repeat_key[line[3]])] = 0
+        counts[str(repeat_key[line[3]])] += int(line[4])
+        sumofrepeatreads += int(line[4])
+    print(f"Identified {sumofrepeatreads} unique reads "
+          "that mapped to repeats.")
+
+
+def run_bowtie(metagenome, fastqfile, folder):
+    metagenomepath = os.path.join(setup_folder, metagenome)
+    output_file = os.path.join(folder, f"{metagenome}.bowtie")
+    command = shlex.split(f"bowtie {b_opt} {metagenomepath} {fastqfile}")
+    with open(output_file, 'w') as stdout:
+        return subprocess.Popen(command, stdout=stdout)
+
+
+if paired_end == 'FALSE':
+    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
+    os.makedirs(folder_pair1, exist_ok=True)
+
+    print("Processing repeat pseudogenomes...")
+    processes = []
     ticker = 0
+
     for metagenome in repeat_list:
-        metagenomepath = setup_folder + os.path.sep + metagenome
-        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
-        with open(file1, 'w') as stdout:
-            command = shlex.split("bowtie " + b_opt + " " +
-                                  metagenomepath + " " + fastqfile_1)
-            p = subprocess.Popen(command, stdout=stdout)
-        with open(file2, 'w') as stdout:
-            command = shlex.split("bowtie " + b_opt + " " +
-                                  metagenomepath + " " + fastqfile_2)
-            pp = subprocess.Popen(command, stdout=stdout)
-        ps.append(p)
-        ticker += 1
-        psb.append(pp)
+        processes.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
         ticker += 1
         if ticker == cpus:
+            for p in processes:
+                p.communicate()
+            ticker = 0
+            processes = []
+
+    for p in processes:
+        p.communicate()
+    # Combine the output from both read pairs:
+    print('Sorting and combining the output for both read pairs....')
+    sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
+    os.makedirs(sorted_bowtie, exist_ok=True)
+    for metagenome in repeat_list:
+        file1 = os.path.join(folder_pair1, f"{metagenome}.bowtie")
+        fileout = os.path.join(sorted_bowtie, f"{metagenome}.bowtie")
+        with open(fileout, 'w') as stdout:
+            p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
+            p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
+                                  stdout=subprocess.PIPE)
+            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
+                                  stdout=subprocess.PIPE)
+            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
+                                  stdout=subprocess.PIPE)
+            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
+            p5.communicate()
+        stdout.close()
+    print('completed ...')
+else:
+    folder_pair1 = os.path.join(outputfolder, 'pair1_bowtie')
+    folder_pair2 = os.path.join(outputfolder, 'pair2_bowtie')
+    os.makedirs(folder_pair1, exist_ok=True)
+    os.makedirs(folder_pair2, exist_ok=True)
+
+    print("Processing repeat pseudogenomes...")
+    ps, psb, ticker = [], [], 0
+
+    for metagenome in repeat_list:
+        ps.append(run_bowtie(metagenome, fastqfile_1, folder_pair1))
+        ticker += 1
+        if fastqfile_2 != 'none':
+            psb.append(run_bowtie(metagenome, fastqfile_2, folder_pair2))
+            ticker += 1
+        if ticker >= cpus:
             for p in ps:
                 p.communicate()
             for p in psb:
                 p.communicate()
             ticker = 0
+            ps = []
             psb = []
-            ps = []
-    if len(ps) > 0:
-        for p in ps:
-            p.communicate()
-    stdout.close()
 
-##############################################################################
-# combine the output from both read pairs:
-    print('sorting and combining the output for both read pairs...')
+    for p in ps:
+        p.communicate()
+    for p in psb:
+        p.communicate()
+    # combine the output from both read pairs:
+    print('Sorting and combining the output for both read pairs...')
     if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
         os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
     sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
@@ -290,108 +243,43 @@
             p5.communicate()
         stdout.close()
     print('completed ...')
-##############################################################################
-if paired_end == 'FALSE':
-    if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
-        os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
-    folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
-##############################################################################
-    ps = []
-    ticker = 0
-    print("Processing repeat psuedogenomes...")
-    for metagenome in repeat_list:
-        metagenomepath = setup_folder + os.path.sep + metagenome
-        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-        with open(file1, 'w') as stdout:
-            command = shlex.split("bowtie " + b_opt + " " +
-                                  metagenomepath + " " + fastqfile_1)
-            p = subprocess.Popen(command, stdout=stdout)
-        ps.append(p)
-        ticker += 1
-        if ticker == cpus:
-            for p in ps:
-                p.communicate()
-            ticker = 0
-            ps = []
-    if len(ps) > 0:
-        for p in ps:
-            p.communicate()
-    stdout.close()
 
-##############################################################################
-# combine the output from both read pairs:
-    print('Sorting and combining the output for both read pairs....')
-    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
-        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
-    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
-    for metagenome in repeat_list:
-        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
-        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
-        with open(fileout, 'w') as stdout:
-            p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
-            p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
-                                  stdout=subprocess.PIPE)
-            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
-                                  stdout=subprocess.PIPE)
-            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
-                                  stdout=subprocess.PIPE)
-            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
-            p5.communicate()
-        stdout.close()
-    print('completed ...')
-
-##############################################################################
 # build a file of repeat keys for all reads
 print('Writing and processing intermediate files...')
-sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
+sorted_bowtie = os.path.join(outputfolder, 'sorted_bowtie')
+sumofrepeatreads = 0
 readid = {}
-sumofrepeatreads = 0
+
 for rep in repeat_list:
-    for data in import_text(sorted_bowtie + os.path.sep +
-                            rep + '.bowtie', '\t'):
+    for data in import_text(
+            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
         readid[data[0]] = ''
+
 for rep in repeat_list:
-    for data in import_text(sorted_bowtie + os.path.sep
-                            + rep + '.bowtie', '\t'):
-        readid[data[0]] += str(repeat_key[rep]) + str(',')
+    for data in import_text(
+            f"{os.path.join(sorted_bowtie, rep)}.bowtie", '\t'):
+        readid[data[0]] += f"{repeat_key[rep]},"
+
 for subfamilies in readid.values():
     if subfamilies not in counts:
         counts[subfamilies] = 0
     counts[subfamilies] += 1
     sumofrepeatreads += 1
-del readid
-print('Identified ' + str(sumofrepeatreads) +
-      ' reads that mapped to repeats for unique and multimappers.')
 
-##############################################################################
+print(f'Identified {sumofrepeatreads} reads that mapped to '
+      'repeats for unique and multimappers.')
 print("Conducting final calculations...")
 
-
-def convert(x):
-    '''
-    build a converter to numeric label for repeat and yield a combined list
-    of repnames seperated by backslash
-    '''
-    x = x.strip(',')
-    x = x.split(',')
-    global repname
-    repname = ""
-    for i in x:
-        repname = repname + os.path.sep + rev_repeat_key[int(i)]
-
-
 # building the total counts for repeat element enrichment...
 for x in counts.keys():
     count = counts[x]
-    x = x.strip(',')
-    x = x.split(',')
+    x = x.strip(',').split(',')
     for i in x:
         reptotalcounts[rev_repeat_key[int(i)]] += int(count)
 # building the fractional counts for repeat element enrichment...
 for x in counts.keys():
     count = counts[x]
-    x = x.strip(',')
-    x = x.split(',')
+    x = x.strip(',').split(',')
     splits = len(x)
     for i in x:
         fractionalcounts[rev_repeat_key[int(i)]] += float(
@@ -400,7 +288,10 @@
 repcounts = {}
 repcounts['other'] = 0
 for key in counts.keys():
-    convert(key)
+    key_list = key.strip(',').split(',')
+    repname = ''
+    for i in key_list:
+        repname = os.path.join(repname, rev_repeat_key[int(i)])
     repcounts[repname] = counts[key]
 # building the total counts for class enrichment...
 for key in reptotalcounts.keys():
@@ -408,7 +299,7 @@
 # building total counts for family enrichment...
 for key in reptotalcounts.keys():
     familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
-# building unique counts table'
+# building unique counts table
 repcounts2 = {}
 for rep in repeat_list:
     if "/" + rep in repcounts:
@@ -422,78 +313,20 @@
 for key in fractionalcounts.keys():
     familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]
 
-##############################################################################
-print('Writing final output and removing intermediate files...')
 # print output to file of the categorized counts and total overlapping counts:
-if allcountmethod == "TRUE":
-    fout1 = open(outputfolder + os.path.sep + outputfile_prefix
-                 + '_total_counts.txt', 'w')
-    for key in sorted(reptotalcounts.keys()):
-        fout1.write(str(key) + '\t' + repeatclass[key] + '\t' +
-                    repeatfamily[key] + '\t' + str(reptotalcounts[key])
-                    + '\n')
-    fout2 = open(outputfolder + os.path.sep + outputfile_prefix
-                 + '_class_total_counts.txt', 'w')
-    for key in sorted(classtotalcounts.keys()):
-        fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n')
-    fout3 = open(outputfolder + os.path.sep + outputfile_prefix
-                 + '_family_total_counts.txt', 'w')
-    for key in sorted(familytotalcounts.keys()):
-        fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n')
-    fout4 = open(outputfolder + os.path.sep + outputfile_prefix +
-                 '_unique_counts.txt', 'w')
-    for key in sorted(repcounts2.keys()):
-        fout4.write(str(key) + '\t' + repeatclass[key] + '\t' +
-                    repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n')
-        fout5 = open(outputfolder + os.path.sep + outputfile_prefix
-                     + '_class_fraction_counts.txt', 'w')
+print('Writing final output...')
+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
+          f"class_fraction_counts.txt", 'w') as fout:
     for key in sorted(classfractionalcounts.keys()):
-        fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
-    fout6 = open(outputfolder + os.path.sep + outputfile_prefix +
-                 '_family_fraction_counts.txt', 'w')
+        fout.write(f"{key}\t{classfractionalcounts[key]}\n")
+
+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
+          f"family_fraction_counts.txt", 'w') as fout:
     for key in sorted(familyfractionalcounts.keys()):
-        fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
-    fout7 = open(outputfolder + os.path.sep + outputfile_prefix
-                 + '_fraction_counts.txt', 'w')
+        fout.write(f"{key}\t{familyfractionalcounts[key]}\n")
+
+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"
+          f"fraction_counts.txt", 'w') as fout:
     for key in sorted(fractionalcounts.keys()):
-        fout7.write(str(key) + '\t' + repeatclass[key] + '\t' +
-                    repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
-                    + '\n')
-        fout1.close()
-    fout2.close()
-    fout3.close()
-    fout4.close()
-    fout5.close()
-    fout6.close()
-    fout7.close()
-else:
-    fout1 = open(outputfolder + os.path.sep + outputfile_prefix +
-                 '_class_fraction_counts.txt', 'w')
-    for key in sorted(classfractionalcounts.keys()):
-        fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
-    fout2 = open(outputfolder + os.path.sep + outputfile_prefix +
-                 '_family_fraction_counts.txt', 'w')
-    for key in sorted(familyfractionalcounts.keys()):
-        fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
-    fout3 = open(outputfolder + os.path.sep + outputfile_prefix +
-                 '_fraction_counts.txt', 'w')
-    for key in sorted(fractionalcounts.keys()):
-        fout3.write(str(key) + '\t' + repeatclass[key] + '\t' +
-                    repeatfamily[key] + '\t' + str(int(fractionalcounts[key]))
-                    + '\n')
-    fout1.close()
-    fout2.close()
-    fout3.close()
-##############################################################################
-#  Remove Large intermediate files
-if os.path.exists(outputfolder + os.path.sep + outputfile_prefix +
-                  '_regionsorter.txt'):
-    os.remove(outputfolder + os.path.sep + outputfile_prefix +
-              '_regionsorter.txt')
-if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
-    shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie')
-if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
-    shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie')
-if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
-    shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie')
-print("... Done")
+        fout.write(f"{key}\t{repeatclass[key]}\t{repeatfamily[key]}\t"
+                   f"{int(fractionalcounts[key])}\n")
--- a/RepEnrich_setup.py	Sat Mar 09 22:32:46 2024 +0000
+++ b/RepEnrich_setup.py	Mon Mar 18 09:39:44 2024 +0000
@@ -11,86 +11,49 @@
 from Bio.SeqRecord import SeqRecord
 
 parser = argparse.ArgumentParser(description='''
-             Part I: Prepartion of repetive element psuedogenomes and repetive\
-             element bamfiles.  This script prepares the annotation used by\
-             downstream applications to analyze for repetitive element\
-             enrichment. For this script to run properly bowtie must be\
-             loaded.  The repeat element psuedogenomes are prepared in order\
-             to analyze reads that map to multiple locations of the genome.\
-             The repeat element bamfiles are prepared in order to use a\
-             region sorter to analyze reads that map to a single location\
-             of the genome. You will 1) annotation_file:\
-             The repetitive element annotation file downloaded from\
-             RepeatMasker.org database for your organism of interest.\
-             2) genomefasta: Your genome of interest in fasta format,\
-             3)setup_folder: a folder to contain repeat element setup files\
-             command-line usage
-             EXAMPLE: python master_setup.py\
-             /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\
-             /users/nneretti/data/annotation/mm9/mm9.fa\
-             /users/nneretti/data/annotation/mm9/setup_folder''',
+             Preparation of repetitive element pseudogenome bowtie\
+             indexes and annotation files used by RepEnrich.py enrichment.''',
                                  prog='getargs_genome_maker.py')
-parser.add_argument('--version', action='version', version='%(prog)s 0.1')
-parser.add_argument('annotation_file', action='store',
+parser.add_argument('--annotation_file', action='store',
                     metavar='annotation_file',
-                    help='''List annotation file. The annotation file contains\
-                         the repeat masker annotation for the genome of\
-                         interest and may be downloaded at RepeatMasker.org\
-                         Example /data/annotation/mm9/mm9.fa.out''')
-parser.add_argument('genomefasta', action='store', metavar='genomefasta',
-                    help='''File name and path for genome of interest in fasta\
-                         format.  Example /data/annotation/mm9/mm9.fa''')
-parser.add_argument('setup_folder', action='store', metavar='setup_folder',
-                    help='''List folder to contain bamfiles for repeats and\
+                    help='''Repeat masker annotation of the genome of\
+                         interest. Download from RepeatMasker.org\
+                         Example: mm9.fa.out''')
+parser.add_argument('--genomefasta', action='store', metavar='genomefasta',
+                    help='''Genome of interest in fasta format.\
+                         Example: mm9.fa''')
+parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
+                    help='''Folder that contains bowtie indexes of repeats and\
                          repeat element psuedogenomes.\
-                         Example /data/annotation/mm9/setup''')
-parser.add_argument('--nfragmentsfile1', action='store',
-                    dest='nfragmentsfile1', metavar='nfragmentsfile1',
-                    default='./repnames_nfragments.txt',
-                    help='''Output location of a description file that saves\
-                         the number of fragments processed per repname.
-                         Default ./repnames_nfragments.txt''')
+                         Example working/setup''')
 parser.add_argument('--gaplength', action='store', dest='gaplength',
                     metavar='gaplength', default='200', type=int,
-                    help='Length of the spacer used to build\
-                         repeat psuedogeneomes.  Default 200')
+                    help='''Length of the N-spacer in the\
+                         repeat pseudogenomes.  Default 200''')
 parser.add_argument('--flankinglength', action='store', dest='flankinglength',
                     metavar='flankinglength', default='25', type=int,
-                    help='Length of the flanking region adjacent to the repeat\
-                         element that is used to build repeat psuedogeneomes.\
-                         The flanking length should be set according to the\
-                         length of your reads.  Default 25')
-parser.add_argument('--is_bed', action='store', dest='is_bed',
-                    metavar='is_bed', default='FALSE',
-                    help='''Is the annotation file a bed file. This is also a\
-                         compatible format. The file needs to be a tab\
-                         separated bed with optional fields.
-                         Ex. format:
-                         chr\tstart\tend\tName_element\tclass\tfamily.
-                         The class and family should identical to name_element\
-                         if not applicable.  Default FALSE change to TRUE''')
+                    help='''Length of the flanking regions used to build\
+                         repeat pseudogenomes. Flanking length should be set\
+                         according to the length of your reads.\
+                         Default 25, for 50 nt reads''')
 args = parser.parse_args()
 
-# parameters and paths specified in args_parse
+# parameters from argparse
 gapl = args.gaplength
 flankingl = args.flankinglength
 annotation_file = args.annotation_file
 genomefasta = args.genomefasta
 setup_folder = args.setup_folder
-nfragmentsfile1 = args.nfragmentsfile1
-is_bed = args.is_bed
 
-##############################################################################
 # check that the programs we need are available
 try:
     subprocess.call(shlex.split("bowtie --version"),
                     stdout=open(os.devnull, 'wb'),
                     stderr=open(os.devnull, 'wb'))
 except OSError:
-    print("Error: Bowtie or BEDTools not loaded")
+    print("Error: Bowtie not available in the path")
     raise
 
-##############################################################################
 # Define a text importer
 csv.field_size_limit(sys.maxsize)
 
@@ -105,154 +68,67 @@
 # Make a setup folder
 if not os.path.exists(setup_folder):
     os.makedirs(setup_folder)
-##############################################################################
-# load genome into dictionary
-print("loading genome...")
+# load genome into dictionary and compute length
 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
+idxgenome, lgenome, genome = {}, {}, {}
 
-print("Precomputing length of all chromosomes...")
-idxgenome = {}
-lgenome = {}
-genome = {}
-allchrs = g.keys()
-k = 0
-for chr in allchrs:
-    genome[chr] = str(g[chr].seq)
-#    del g[chr]
+for k, chr in enumerate(g.keys()):
+    genome[chr] = g[chr].seq
     lgenome[chr] = len(genome[chr])
     idxgenome[chr] = k
-    k = k + 1
-del g
 
-##############################################################################
 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
-if is_bed == "FALSE":
-    repeat_elements = []
-    fout = open(os.path.realpath(setup_folder + os.path.sep
-                                 + 'repnames.bed'), 'w')
-    fin = import_text(annotation_file, ' ')
-    x = 0
-    rep_chr = {}
-    rep_start = {}
-    rep_end = {}
-    x = 0
+repeat_elements = []
+# these dictionaries will contain lists
+rep_chr, rep_start, rep_end = {}, {}, {}
+fin = import_text(annotation_file, ' ')
+with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout:
+    for i in range(3):
+        next(fin)
     for line in fin:
-        if x > 2:
-            line9 = line[9].replace("(", "_").replace(")",
-                                                      "_").replace("/", "_")
-            repname = line9
-            if repname not in repeat_elements:
-                repeat_elements.append(repname)
-            repchr = line[4]
-            repstart = int(line[5])
-            repend = int(line[6])
-            fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend)
-                       + '\t' + str(repname) + '\n')
-            if repname in rep_chr:
-                rep_chr[repname].append(repchr)
-                rep_start[repname].append(int(repstart))
-                rep_end[repname].append(int(repend))
-            else:
-                rep_chr[repname] = [repchr]
-                rep_start[repname] = [int(repstart)]
-                rep_end[repname] = [int(repend)]
-        x += 1
-if is_bed == "TRUE":
-    repeat_elements = []
-    fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'),
-                'w')
-    fin = open(os.path.realpath(annotation_file), 'r')
-    x = 0
-    rep_chr = {}
-    rep_start = {}
-    rep_end = {}
-    x = 0
-    for line in fin:
-        line = line.strip('\n')
-        line = line.split('\t')
-        line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
-        repname = line3
+        repname = line[9].translate(str.maketrans('()/', '___'))
         if repname not in repeat_elements:
             repeat_elements.append(repname)
-        repchr = line[0]
-        repstart = int(line[1])
-        repend = int(line[2])
-        fout.write(str(repchr) + '\t' + str(repstart) + '\t' +
-                   str(repend) + '\t' + str(repname) + '\n')
-        # if rep_chr.has_key(repname):
+        repchr = line[4]
+        repstart = line[5]
+        repend = line[6]
+        fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n')
         if repname in rep_chr:
             rep_chr[repname].append(repchr)
-            rep_start[repname].append(int(repstart))
-            rep_end[repname].append(int(repend))
+            rep_start[repname].append(repstart)
+            rep_end[repname].append(repend)
         else:
             rep_chr[repname] = [repchr]
-            rep_start[repname] = [int(repstart)]
-            rep_end[repname] = [int(repend)]
+            rep_start[repname] = [repstart]
+            rep_end[repname] = [repend]
 
-fin.close()
-fout.close()
-repeat_elements = sorted(repeat_elements)
-print("Writing a key for all repeats...")
-# print to fout the binary key that contains each repeat type with the
-# associated binary number; sort the binary key:
-fout = open(os.path.realpath(setup_folder + os.path.sep +
-                             'repgenomes_key.txt'), 'w')
-x = 0
-for repeat in repeat_elements:
-    # print >> fout, str(repeat) + '\t' + str(x)
-    fout.write(str(repeat) + '\t' + str(x) + '\n')
-    x += 1
-fout.close()
-##############################################################################
-# generate spacer for psuedogenomes
-spacer = ""
-for i in range(gapl):
-    spacer = spacer + "N"
+# sort repeat_elements and print them in repgenomes_key.txt
+with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout:
+    for i, repeat in enumerate(sorted(repeat_elements)):
+        fout.write('\t'.join([repeat, str(i)]) + '\n')
 
-# save file with number of fragments processed per repname
-print("Saving number of fragments processed per repname to "
-      + nfragmentsfile1)
-fout1 = open(os.path.realpath(nfragmentsfile1), "w")
-for repname in rep_chr.keys():
-    rep_chr_current = rep_chr[repname]
-#    print >>fout1, str(len(rep_chr[repname])) + "\t" + repname
-    fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n')
-fout1.close()
+# generate spacer for pseudogenomes
+spacer = ''.join(['N' for i in range(gapl)])
 
-# generate metagenomes and save them to FASTA files
-k = 1
-nrepgenomes = len(rep_chr.keys())
-for repname in rep_chr.keys():
-    metagenome = ""
-    newname = repname.replace("(", "_").replace(")", "_").replace("/", "_")
-    print("processing repgenome " + newname + ".fa" + " (" + str(k)
-          + " of " + str(nrepgenomes) + ")")
-    rep_chr_current = rep_chr[repname]
-    rep_start_current = rep_start[repname]
-    rep_end_current = rep_end[repname]
-    print("-------> " + str(len(rep_chr[repname])) + " fragments")
-    for i in range(len(rep_chr[repname])):
+# generate metagenomes and save them to FASTA files for bowtie build
+for repname in rep_chr:
+    metagenome = ''
+    for i, repeat in enumerate(rep_chr[repname]):
         try:
-            chr = rep_chr_current[i]
-            rstart = max(rep_start_current[i] - flankingl, 0)
-            rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1)
-            metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)]
+            chromosome = rep_chr[repname][i]
+            start = max(int(rep_start[repname][i]) - flankingl, 0)
+            end = min(int(rep_end[repname][i]) + flankingl,
+                      int(lgenome[chromosome])-1) + 1
+            metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}"
         except KeyError:
-            print("Unrecognised Chromosome: "+chr)
-            pass
-    # Convert metagenome to SeqRecord object (required by SeqIO.write)
-    record = SeqRecord(Seq(metagenome), id="repname",
-                       name="", description="")
-    print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of "
-          + str(nrepgenomes) + ")")
-    fastafilename = os.path.realpath(setup_folder + os.path.sep
-                                     + newname + ".fa")
+            print("Unrecognised Chromosome: " + rep_chr[repname][i])
+
+    # Create Fasta of repeat pseudogenome
+    fastafilename = f"{os.path.join(setup_folder, repname)}.fa"
+    record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
     SeqIO.write(record, fastafilename, "fasta")
-    print("indexing repgenome " + newname + ".fa" + " (" +
-          str(k) + " of " + str(nrepgenomes) + ")")
-    command = shlex.split('bowtie-build -f ' + fastafilename + ' ' +
-                          setup_folder + os.path.sep + newname)
-    p = subprocess.Popen(command).communicate()
-    k += 1
 
-print("... Done")
+    # Generate repeat pseudogenome bowtie index
+    bowtie_build_cmd = ["bowtie-build", "-f", fastafilename,
+                        os.path.join(setup_folder, repname)]
+    subprocess.run(bowtie_build_cmd, check=True)
--- a/macros.xml	Sat Mar 09 22:32:46 2024 +0000
+++ b/macros.xml	Mon Mar 18 09:39:44 2024 +0000
@@ -1,6 +1,6 @@
 <macros>
     <token name="@TOOL_VERSION@">1.83</token>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">23.0</token>
 
     <xml name="requirements">
--- a/repenrich.xml	Sat Mar 09 22:32:46 2024 +0000
+++ b/repenrich.xml	Mon Mar 18 09:39:44 2024 +0000
@@ -30,16 +30,18 @@
         #else:
             #if $seq_method.input2_fastq.is_of_type("fastq.gz", "fastqsanger.gz"):
                 gunzip < '$seq_method.input_fastq' > '${input_base}.fastq' &&
-                gunzip < '$seq_method.input2_fastq' > '${input_base}_2.fastq' &&                
+                gunzip < '$seq_method.input2_fastq' > '${input_base}_2.fastq' &&
             #else:
                 ln -f -s '$seq_method.input_fastq' '${input_base}.fastq' &&
                 ln -f -s '$seq_method.input2_fastq' '${input_base}_2.fastq' &&
             #end if
         #end if
-        
         ln -f -s '$genome' '${baseReference}.fa' &&
         bowtie-build '$genome' ${baseReference} &&
-        python $__tool_directory__/RepEnrich_setup.py $repeatmasker ${baseReference}.fa setup_folder_${baseReference} &&
+        python $__tool_directory__/RepEnrich_setup.py
+            --annotation_file $repeatmasker
+            --genomefasta ${baseReference}.fa
+            --setup_folder setup_folder_${baseReference} &&
         #if $seq_method.seq_method_list == "single-read":
             bowtie $baseReference -p \${GALAXY_SLOTS:-4} -t -m 1 -S --max ${input_base}_multimap.fastq ${input_base}.fastq ${input_base}_unique.sam 2>bowtie_alignments.txt &&
             TOTAL=\$(grep 'reads processed:' bowtie_alignments.txt | cut -d ' ' -f 4) &&
@@ -56,9 +58,25 @@
         samtools view -@ \${GALAXY_SLOTS:-4} -bS '${input_base}_unique.sam' | samtools sort -@ \${GALAXY_SLOTS:-4} -O bam -o '${input_base}_unique.bam' &&
         samtools index ${input_base}_unique.bam &&
         #if $seq_method.seq_method_list == "single-read":
-            python $__tool_directory__/RepEnrich.py $repeatmasker ${input_base} ${input_base} setup_folder_${baseReference} ${input_base}_multimap.fastq ${input_base}_unique.bam --cpus "\${GALAXY_SLOTS:-4}" &&
+            python $__tool_directory__/RepEnrich.py
+                --annotation_file $repeatmasker
+                --outputfolder ${input_base}
+                --outputprefix ${input_base}
+                --setup_folder setup_folder_${baseReference}
+                --fastqfile ${input_base}_multimap.fastq
+                --alignment_bam ${input_base}_unique.bam
+                --cpus "\${GALAXY_SLOTS:-4}" &&
         #else:
-            python $__tool_directory__/RepEnrich.py $repeatmasker ${input_base} ${input_base} setup_folder_${baseReference} ${input_base}_multimap_1.fastq --fastqfile2 ${input_base}_multimap_2.fastq ${input_base}_unique.bam --cpus "\${GALAXY_SLOTS:-4}" --pairedend TRUE &&
+            python $__tool_directory__/RepEnrich.py
+                --annotation_file $repeatmasker
+                --outputfolder ${input_base}
+                --outputprefix ${input_base}
+                --setup_folder setup_folder_${baseReference}
+                --fastqfile ${input_base}_multimap_1.fastq
+                --fastqfile2 ${input_base}_multimap_2.fastq
+                --alignment_bam ${input_base}_unique.bam
+                --cpus "\${GALAXY_SLOTS:-4}"
+                --pairedend TRUE &&
         #end if
         cp $input_base/${input_base}_class_fraction_counts.txt class_fraction_counts.tabular &&
         cp $input_base/${input_base}_family_fraction_counts.txt family_fraction_counts.tabular &&
@@ -227,15 +245,10 @@
 
 .. class:: infomark
 
-For more information on the tools, please visit our `code repository`_.
-
-If you would like to give us feedback or you run into any trouble, please send an email to artbio.ibps@gmail.com 
-
-This tool wrapper is developed by the `ARTbio team`_ at the `Institut de Biologie Paris Seine (IBPS)`_.
+For more information on the tools, or giving us feedback, please visit our `code repository`_.
 
 .. _code repository: https://github.com/ARTbio/tools-artbio/tree/master/tools/
 .. _ARTbio team: http://artbio.fr
-.. _Institut de Biologie Paris Seine (IBPS): http://www.ibps.upmc.fr/en/core-facilities/bioinformatics
 
     </help>