Repository 'repenrich'
hg clone https://toolshed.g2.bx.psu.edu/repos/artbio/repenrich

Changeset 12:89e05f831259 (2024-03-18)
Previous changeset 11:6bba3e33c2e7 (2024-03-09) Next changeset 13:530626b0757c (2024-04-02)
Commit message:
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
modified:
RepEnrich.py
RepEnrich_setup.py
macros.xml
repenrich.xml
b
diff -r 6bba3e33c2e7 -r 89e05f831259 RepEnrich.py
--- a/RepEnrich.py Sat Mar 09 22:32:46 2024 +0000
+++ b/RepEnrich.py Mon Mar 18 09:39:44 2024 +0000
[
b"@@ -2,7 +2,6 @@\n import csv\n import os\n import shlex\n-import shutil\n import subprocess\n import sys\n \n@@ -10,86 +9,41 @@\n \n \n parser = argparse.ArgumentParser(description='''\n-             Part II: Conducting the alignments to the psuedogenomes. Before\\\n-             doing this step you will require 1) a bamfile of the unique\\\n-             alignments with index 2) a fastq file of the reads mapping to\\\n-             more than one location. These files can be obtained using the\\\n-             following bowtie options [EXAMPLE: bowtie -S -m 1\\\n-             --max multimap.fastq mm9 mate1_reads.fastq]  Once you have the\\\n-             unique alignment bamfile and the reads mapping to more than one\\\n-             location in a fastq file you can run this step.  EXAMPLE: python\\\n-             master_output.py\\\n-             /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\\\n-             /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/\n-             HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\\\n-             /users/nneretti/data/annotation/hg19/setup_folder\\\n-             HeLa_InputChIPseq_Rep1_multimap.fastq\\\n-             HeLa_InputChIPseq_Rep1.bam''')\n-parser.add_argument('--version', action='version', version='%(prog)s 0.1')\n-parser.add_argument('annotation_file', action='store',\n+             Repenrich aligns reads to Repeat Elements pseudogenomes\\\n+             and counts aligned reads. RepEnrich_setup must be run\\\n+             before its use''')\n+parser.add_argument('--annotation_file', action='store',\n                     metavar='annotation_file',\n-                    help='List RepeatMasker.org annotation file for your\\\n-                          organism.  The file may be downloaded from the\\\n-                          RepeatMasker.org website. Example:\\\n-                          /data/annotation/hg19/hg19_repeatmasker.txt')\n-parser.add_argument('outputfolder', action='store', metavar='outputfolder',\n-                    help='List folder to contain results.\\\n-                          Example: /outputfolder')\n-parser.add_argument('outputprefix', action='store', metavar='outputprefix',\n-                    help='Enter prefix name for data.\\\n-                           Example: HeLa_InputChIPseq_Rep1')\n-parser.add_argument('setup_folder', action='store', metavar='setup_folder',\n-                    help='List folder that contains the repeat element\\\n-                          pseudogenomes.\\\n-                          Example: /data/annotation/hg19/setup_folder')\n-parser.add_argument('fastqfile', action='store', metavar='fastqfile',\n-                    help='Enter file for the fastq reads that map to multiple\\\n+                    help='RepeatMasker.org annotation file for your\\\n+                          organism. The file may be downloaded from\\\n+                          RepeatMasker.org. E.g. hg19_repeatmasker.txt')\n+parser.add_argument('--outputfolder', action='store', metavar='outputfolder',\n+                    help='Folder that will contain results. Should be the\\\n+                          same as the one used for RepEnrich_setup.\\\n+                          Example: ./outputfolder')\n+parser.add_argument('--outputprefix', action='store', metavar='outputprefix',\n+                    help='Prefix name for Repenrich output files.')\n+parser.add_argument('--setup_folder', action='store', metavar='setup_folder',\n+                    help='Folder produced by RepEnrich_setup which contains\\\n+                    repeat element pseudogenomes.')\n+parser.add_argument('--fastqfile', action='store', metavar='fastqfile',\n+                    help='File of fastq reads mapping to multiple\\\n                           locations. Example: /data/multimap.fastq')\n-parser.add_argument('alignment_bam', action='store', metavar='alignment_bam',\n-                    help='Enter bamfile output for reads that map uniquely.\\\n-                    Example /bamfiles/old.bam')\n+parser.add_argument('--alignment_bam', action='st"..b'ile_prefix\n-                 + \'_class_total_counts.txt\', \'w\')\n-    for key in sorted(classtotalcounts.keys()):\n-        fout2.write(str(key) + \'\\t\' + str(classtotalcounts[key]) + \'\\n\')\n-    fout3 = open(outputfolder + os.path.sep + outputfile_prefix\n-                 + \'_family_total_counts.txt\', \'w\')\n-    for key in sorted(familytotalcounts.keys()):\n-        fout3.write(str(key) + \'\\t\' + str(familytotalcounts[key]) + \'\\n\')\n-    fout4 = open(outputfolder + os.path.sep + outputfile_prefix +\n-                 \'_unique_counts.txt\', \'w\')\n-    for key in sorted(repcounts2.keys()):\n-        fout4.write(str(key) + \'\\t\' + repeatclass[key] + \'\\t\' +\n-                    repeatfamily[key] + \'\\t\' + str(repcounts2[key]) + \'\\n\')\n-        fout5 = open(outputfolder + os.path.sep + outputfile_prefix\n-                     + \'_class_fraction_counts.txt\', \'w\')\n+print(\'Writing final output...\')\n+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"\n+          f"class_fraction_counts.txt", \'w\') as fout:\n     for key in sorted(classfractionalcounts.keys()):\n-        fout5.write(str(key) + \'\\t\' + str(classfractionalcounts[key]) + \'\\n\')\n-    fout6 = open(outputfolder + os.path.sep + outputfile_prefix +\n-                 \'_family_fraction_counts.txt\', \'w\')\n+        fout.write(f"{key}\\t{classfractionalcounts[key]}\\n")\n+\n+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"\n+          f"family_fraction_counts.txt", \'w\') as fout:\n     for key in sorted(familyfractionalcounts.keys()):\n-        fout6.write(str(key) + \'\\t\' + str(familyfractionalcounts[key]) + \'\\n\')\n-    fout7 = open(outputfolder + os.path.sep + outputfile_prefix\n-                 + \'_fraction_counts.txt\', \'w\')\n+        fout.write(f"{key}\\t{familyfractionalcounts[key]}\\n")\n+\n+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"\n+          f"fraction_counts.txt", \'w\') as fout:\n     for key in sorted(fractionalcounts.keys()):\n-        fout7.write(str(key) + \'\\t\' + repeatclass[key] + \'\\t\' +\n-                    repeatfamily[key] + \'\\t\' + str(int(fractionalcounts[key]))\n-                    + \'\\n\')\n-        fout1.close()\n-    fout2.close()\n-    fout3.close()\n-    fout4.close()\n-    fout5.close()\n-    fout6.close()\n-    fout7.close()\n-else:\n-    fout1 = open(outputfolder + os.path.sep + outputfile_prefix +\n-                 \'_class_fraction_counts.txt\', \'w\')\n-    for key in sorted(classfractionalcounts.keys()):\n-        fout1.write(str(key) + \'\\t\' + str(classfractionalcounts[key]) + \'\\n\')\n-    fout2 = open(outputfolder + os.path.sep + outputfile_prefix +\n-                 \'_family_fraction_counts.txt\', \'w\')\n-    for key in sorted(familyfractionalcounts.keys()):\n-        fout2.write(str(key) + \'\\t\' + str(familyfractionalcounts[key]) + \'\\n\')\n-    fout3 = open(outputfolder + os.path.sep + outputfile_prefix +\n-                 \'_fraction_counts.txt\', \'w\')\n-    for key in sorted(fractionalcounts.keys()):\n-        fout3.write(str(key) + \'\\t\' + repeatclass[key] + \'\\t\' +\n-                    repeatfamily[key] + \'\\t\' + str(int(fractionalcounts[key]))\n-                    + \'\\n\')\n-    fout1.close()\n-    fout2.close()\n-    fout3.close()\n-##############################################################################\n-#  Remove Large intermediate files\n-if os.path.exists(outputfolder + os.path.sep + outputfile_prefix +\n-                  \'_regionsorter.txt\'):\n-    os.remove(outputfolder + os.path.sep + outputfile_prefix +\n-              \'_regionsorter.txt\')\n-if os.path.exists(outputfolder + os.path.sep + \'pair1_bowtie\'):\n-    shutil.rmtree(outputfolder + os.path.sep + \'pair1_bowtie\')\n-if os.path.exists(outputfolder + os.path.sep + \'pair2_bowtie\'):\n-    shutil.rmtree(outputfolder + os.path.sep + \'pair2_bowtie\')\n-if os.path.exists(outputfolder + os.path.sep + \'sorted_bowtie\'):\n-    shutil.rmtree(outputfolder + os.path.sep + \'sorted_bowtie\')\n-print("... Done")\n+        fout.write(f"{key}\\t{repeatclass[key]}\\t{repeatfamily[key]}\\t"\n+                   f"{int(fractionalcounts[key])}\\n")\n'
b
diff -r 6bba3e33c2e7 -r 89e05f831259 RepEnrich_setup.py
--- a/RepEnrich_setup.py Sat Mar 09 22:32:46 2024 +0000
+++ b/RepEnrich_setup.py Mon Mar 18 09:39:44 2024 +0000
[
b"@@ -11,86 +11,49 @@\n from Bio.SeqRecord import SeqRecord\n \n parser = argparse.ArgumentParser(description='''\n-             Part I: Prepartion of repetive element psuedogenomes and repetive\\\n-             element bamfiles.  This script prepares the annotation used by\\\n-             downstream applications to analyze for repetitive element\\\n-             enrichment. For this script to run properly bowtie must be\\\n-             loaded.  The repeat element psuedogenomes are prepared in order\\\n-             to analyze reads that map to multiple locations of the genome.\\\n-             The repeat element bamfiles are prepared in order to use a\\\n-             region sorter to analyze reads that map to a single location\\\n-             of the genome. You will 1) annotation_file:\\\n-             The repetitive element annotation file downloaded from\\\n-             RepeatMasker.org database for your organism of interest.\\\n-             2) genomefasta: Your genome of interest in fasta format,\\\n-             3)setup_folder: a folder to contain repeat element setup files\\\n-             command-line usage\n-             EXAMPLE: python master_setup.py\\\n-             /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\\\n-             /users/nneretti/data/annotation/mm9/mm9.fa\\\n-             /users/nneretti/data/annotation/mm9/setup_folder''',\n+             Prepartion of repetive element pseudogenomes bowtie\\\n+             indexes and annotation files used by RepEnrich.py enrichment.''',\n                                  prog='getargs_genome_maker.py')\n-parser.add_argument('--version', action='version', version='%(prog)s 0.1')\n-parser.add_argument('annotation_file', action='store',\n+parser.add_argument('--annotation_file', action='store',\n                     metavar='annotation_file',\n-                    help='''List annotation file. The annotation file contains\\\n-                         the repeat masker annotation for the genome of\\\n-                         interest and may be downloaded at RepeatMasker.org\\\n-                         Example /data/annotation/mm9/mm9.fa.out''')\n-parser.add_argument('genomefasta', action='store', metavar='genomefasta',\n-                    help='''File name and path for genome of interest in fasta\\\n-                         format.  Example /data/annotation/mm9/mm9.fa''')\n-parser.add_argument('setup_folder', action='store', metavar='setup_folder',\n-                    help='''List folder to contain bamfiles for repeats and\\\n+                    help='''Repeat masker annotation of the genome of\\\n+                         interest. Download from RepeatMasker.org\\\n+                         Example: mm9.fa.out''')\n+parser.add_argument('--genomefasta', action='store', metavar='genomefasta',\n+                    help='''Genome of interest in fasta format.\\\n+                         Example: mm9.fa''')\n+parser.add_argument('--setup_folder', action='store', metavar='setup_folder',\n+                    help='''Folder that contains bowtie indexes of repeats and\\\n                          repeat element psuedogenomes.\\\n-                         Example /data/annotation/mm9/setup''')\n-parser.add_argument('--nfragmentsfile1', action='store',\n-                    dest='nfragmentsfile1', metavar='nfragmentsfile1',\n-                    default='./repnames_nfragments.txt',\n-                    help='''Output location of a description file that saves\\\n-                         the number of fragments processed per repname.\n-                         Default ./repnames_nfragments.txt''')\n+                         Example working/setup''')\n parser.add_argument('--gaplength', action='store', dest='gaplength',\n                     metavar='gaplength', default='200', type=int,\n-                    help='Length of the spacer used to build\\\n-                         repeat psuedogeneomes.  Default 200')\n+                    help='''Length of the N-spacer in the\\\n+                         repeat pseudogenomes.  Default 200''')\n pa"..b'nary key that contains each repeat type with the\n-# associated binary number; sort the binary key:\n-fout = open(os.path.realpath(setup_folder + os.path.sep +\n-                             \'repgenomes_key.txt\'), \'w\')\n-x = 0\n-for repeat in repeat_elements:\n-    # print >> fout, str(repeat) + \'\\t\' + str(x)\n-    fout.write(str(repeat) + \'\\t\' + str(x) + \'\\n\')\n-    x += 1\n-fout.close()\n-##############################################################################\n-# generate spacer for psuedogenomes\n-spacer = ""\n-for i in range(gapl):\n-    spacer = spacer + "N"\n+# sort repeat_elements and print them in repgenomes_key.txt\n+with open(os.path.join(setup_folder, \'repgenomes_key.txt\'), \'w\') as fout:\n+    for i, repeat in enumerate(sorted(repeat_elements)):\n+        fout.write(\'\\t\'.join([repeat, str(i)]) + \'\\n\')\n \n-# save file with number of fragments processed per repname\n-print("Saving number of fragments processed per repname to "\n-      + nfragmentsfile1)\n-fout1 = open(os.path.realpath(nfragmentsfile1), "w")\n-for repname in rep_chr.keys():\n-    rep_chr_current = rep_chr[repname]\n-#    print >>fout1, str(len(rep_chr[repname])) + "\\t" + repname\n-    fout1.write(str(len(rep_chr[repname])) + "\\t" + repname + \'\\n\')\n-fout1.close()\n+# generate spacer for pseudogenomes\n+spacer = \'\'.join([\'N\' for i in range(gapl)])\n \n-# generate metagenomes and save them to FASTA files\n-k = 1\n-nrepgenomes = len(rep_chr.keys())\n-for repname in rep_chr.keys():\n-    metagenome = ""\n-    newname = repname.replace("(", "_").replace(")", "_").replace("/", "_")\n-    print("processing repgenome " + newname + ".fa" + " (" + str(k)\n-          + " of " + str(nrepgenomes) + ")")\n-    rep_chr_current = rep_chr[repname]\n-    rep_start_current = rep_start[repname]\n-    rep_end_current = rep_end[repname]\n-    print("-------> " + str(len(rep_chr[repname])) + " fragments")\n-    for i in range(len(rep_chr[repname])):\n+# generate metagenomes and save them to FASTA files for bowtie build\n+for repname in rep_chr:\n+    metagenome = \'\'\n+    for i, repeat in enumerate(rep_chr[repname]):\n         try:\n-            chr = rep_chr_current[i]\n-            rstart = max(rep_start_current[i] - flankingl, 0)\n-            rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1)\n-            metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)]\n+            chromosome = rep_chr[repname][i]\n+            start = max(int(rep_start[repname][i]) - flankingl, 0)\n+            end = min(int(rep_end[repname][i]) + flankingl,\n+                      int(lgenome[chr])-1) + 1\n+            metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}"\n         except KeyError:\n-            print("Unrecognised Chromosome: "+chr)\n-            pass\n-    # Convert metagenome to SeqRecord object (required by SeqIO.write)\n-    record = SeqRecord(Seq(metagenome), id="repname",\n-                       name="", description="")\n-    print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of "\n-          + str(nrepgenomes) + ")")\n-    fastafilename = os.path.realpath(setup_folder + os.path.sep\n-                                     + newname + ".fa")\n+            print("Unrecognised Chromosome: " + rep_chr[repname][i])\n+\n+    # Create Fasta of repeat pseudogenome\n+    fastafilename = f"{os.path.join(setup_folder, repname)}.fa"\n+    record = SeqRecord(Seq(metagenome), id=repname, name=\'\', description=\'\')\n     SeqIO.write(record, fastafilename, "fasta")\n-    print("indexing repgenome " + newname + ".fa" + " (" +\n-          str(k) + " of " + str(nrepgenomes) + ")")\n-    command = shlex.split(\'bowtie-build -f \' + fastafilename + \' \' +\n-                          setup_folder + os.path.sep + newname)\n-    p = subprocess.Popen(command).communicate()\n-    k += 1\n \n-print("... Done")\n+    # Generate repeat pseudogenome bowtie index\n+    bowtie_build_cmd = ["bowtie-build", "-f", fastafilename,\n+                        os.path.join(setup_folder, repname)]\n+    subprocess.run(bowtie_build_cmd, check=True)\n'
b
diff -r 6bba3e33c2e7 -r 89e05f831259 macros.xml
--- a/macros.xml Sat Mar 09 22:32:46 2024 +0000
+++ b/macros.xml Mon Mar 18 09:39:44 2024 +0000
b
@@ -1,6 +1,6 @@
 <macros>
     <token name="@TOOL_VERSION@">1.83</token>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">23.0</token>
 
     <xml name="requirements">
b
diff -r 6bba3e33c2e7 -r 89e05f831259 repenrich.xml
--- a/repenrich.xml Sat Mar 09 22:32:46 2024 +0000
+++ b/repenrich.xml Mon Mar 18 09:39:44 2024 +0000
b
@@ -30,16 +30,18 @@
         #else:
             #if $seq_method.input2_fastq.is_of_type("fastq.gz", "fastqsanger.gz"):
                 gunzip < '$seq_method.input_fastq' > '${input_base}.fastq' &&
-                gunzip < '$seq_method.input2_fastq' > '${input_base}_2.fastq' &&                
+                gunzip < '$seq_method.input2_fastq' > '${input_base}_2.fastq' &&
             #else:
                 ln -f -s '$seq_method.input_fastq' '${input_base}.fastq' &&
                 ln -f -s '$seq_method.input2_fastq' '${input_base}_2.fastq' &&
             #end if
         #end if
-        
         ln -f -s '$genome' '${baseReference}.fa' &&
         bowtie-build '$genome' ${baseReference} &&
-        python $__tool_directory__/RepEnrich_setup.py $repeatmasker ${baseReference}.fa setup_folder_${baseReference} &&
+        python $__tool_directory__/RepEnrich_setup.py
+            --annotation_file $repeatmasker
+            --genomefasta ${baseReference}.fa
+            --setup_folder setup_folder_${baseReference} &&
         #if $seq_method.seq_method_list == "single-read":
             bowtie $baseReference -p \${GALAXY_SLOTS:-4} -t -m 1 -S --max ${input_base}_multimap.fastq ${input_base}.fastq ${input_base}_unique.sam 2>bowtie_alignments.txt &&
             TOTAL=\$(grep 'reads processed:' bowtie_alignments.txt | cut -d ' ' -f 4) &&
@@ -56,9 +58,25 @@
         samtools view -@ \${GALAXY_SLOTS:-4} -bS '${input_base}_unique.sam' | samtools sort -@ \${GALAXY_SLOTS:-4} -O bam -o '${input_base}_unique.bam' &&
         samtools index ${input_base}_unique.bam &&
         #if $seq_method.seq_method_list == "single-read":
-            python $__tool_directory__/RepEnrich.py $repeatmasker ${input_base} ${input_base} setup_folder_${baseReference} ${input_base}_multimap.fastq ${input_base}_unique.bam --cpus "\${GALAXY_SLOTS:-4}" &&
+            python $__tool_directory__/RepEnrich.py
+                --annotation_file $repeatmasker
+                --outputfolder ${input_base}
+                --outputprefix ${input_base}
+                --setup_folder setup_folder_${baseReference}
+                --fastqfile ${input_base}_multimap.fastq
+                --alignment_bam ${input_base}_unique.bam
+                --cpus "\${GALAXY_SLOTS:-4}" &&
         #else:
-            python $__tool_directory__/RepEnrich.py $repeatmasker ${input_base} ${input_base} setup_folder_${baseReference} ${input_base}_multimap_1.fastq --fastqfile2 ${input_base}_multimap_2.fastq ${input_base}_unique.bam --cpus "\${GALAXY_SLOTS:-4}" --pairedend TRUE &&
+            python $__tool_directory__/RepEnrich.py
+                --annotation_file $repeatmasker
+                --outputfolder ${input_base}
+                --outputprefix ${input_base}
+                --setup_folder setup_folder_${baseReference}
+                --fastqfile ${input_base}_multimap_1.fastq
+                --fastqfile2 ${input_base}_multimap_2.fastq
+                --alignment_bam ${input_base}_unique.bam
+                --cpus "\${GALAXY_SLOTS:-4}"
+                --pairedend TRUE &&
         #end if
         cp $input_base/${input_base}_class_fraction_counts.txt class_fraction_counts.tabular &&
         cp $input_base/${input_base}_family_fraction_counts.txt family_fraction_counts.tabular &&
@@ -227,15 +245,10 @@
 
 .. class:: infomark
 
-For more information on the tools, please visit our `code repository`_.
-
-If you would like to give us feedback or you run into any trouble, please send an email to artbio.ibps@gmail.com 
-
-This tool wrapper is developed by the `ARTbio team`_ at the `Institut de Biologie Paris Seine (IBPS)`_.
+For more information on the tools, or giving us feedback, please visit our `code repository`_.
 
 .. _code repository: https://github.com/ARTbio/tools-artbio/tree/master/tools/
 .. _ARTbio team: http://artbio.fr
-.. _Institut de Biologie Paris Seine (IBPS): http://www.ibps.upmc.fr/en/core-facilities/bioinformatics
 
     </help>