Previous changeset 11:6bba3e33c2e7 (2024-03-09) Next changeset 13:530626b0757c (2024-04-02) |
Commit message:
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df |
modified:
RepEnrich.py RepEnrich_setup.py macros.xml repenrich.xml |
b |
diff -r 6bba3e33c2e7 -r 89e05f831259 RepEnrich.py --- a/RepEnrich.py Sat Mar 09 22:32:46 2024 +0000 +++ b/RepEnrich.py Mon Mar 18 09:39:44 2024 +0000 |
[ |
b"@@ -2,7 +2,6 @@\n import csv\n import os\n import shlex\n-import shutil\n import subprocess\n import sys\n \n@@ -10,86 +9,41 @@\n \n \n parser = argparse.ArgumentParser(description='''\n- Part II: Conducting the alignments to the psuedogenomes. Before\\\n- doing this step you will require 1) a bamfile of the unique\\\n- alignments with index 2) a fastq file of the reads mapping to\\\n- more than one location. These files can be obtained using the\\\n- following bowtie options [EXAMPLE: bowtie -S -m 1\\\n- --max multimap.fastq mm9 mate1_reads.fastq] Once you have the\\\n- unique alignment bamfile and the reads mapping to more than one\\\n- location in a fastq file you can run this step. EXAMPLE: python\\\n- master_output.py\\\n- /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\\\n- /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/\n- HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\\\n- /users/nneretti/data/annotation/hg19/setup_folder\\\n- HeLa_InputChIPseq_Rep1_multimap.fastq\\\n- HeLa_InputChIPseq_Rep1.bam''')\n-parser.add_argument('--version', action='version', version='%(prog)s 0.1')\n-parser.add_argument('annotation_file', action='store',\n+ Repenrich aligns reads to Repeat Elements pseudogenomes\\\n+ and counts aligned reads. RepEnrich_setup must be run\\\n+ before its use''')\n+parser.add_argument('--annotation_file', action='store',\n metavar='annotation_file',\n- help='List RepeatMasker.org annotation file for your\\\n- organism. The file may be downloaded from the\\\n- RepeatMasker.org website. Example:\\\n- /data/annotation/hg19/hg19_repeatmasker.txt')\n-parser.add_argument('outputfolder', action='store', metavar='outputfolder',\n- help='List folder to contain results.\\\n- Example: /outputfolder')\n-parser.add_argument('outputprefix', action='store', metavar='outputprefix',\n- help='Enter prefix name for data.\\\n- Example: HeLa_InputChIPseq_Rep1')\n-parser.add_argument('setup_folder', action='store', metavar='setup_folder',\n- help='List folder that contains the repeat element\\\n- pseudogenomes.\\\n- Example: /data/annotation/hg19/setup_folder')\n-parser.add_argument('fastqfile', action='store', metavar='fastqfile',\n- help='Enter file for the fastq reads that map to multiple\\\n+ help='RepeatMasker.org annotation file for your\\\n+ organism. The file may be downloaded from\\\n+ RepeatMasker.org. E.g. hg19_repeatmasker.txt')\n+parser.add_argument('--outputfolder', action='store', metavar='outputfolder',\n+ help='Folder that will contain results. Should be the\\\n+ same as the one used for RepEnrich_setup.\\\n+ Example: ./outputfolder')\n+parser.add_argument('--outputprefix', action='store', metavar='outputprefix',\n+ help='Prefix name for Repenrich output files.')\n+parser.add_argument('--setup_folder', action='store', metavar='setup_folder',\n+ help='Folder produced by RepEnrich_setup which contains\\\n+ repeat element pseudogenomes.')\n+parser.add_argument('--fastqfile', action='store', metavar='fastqfile',\n+ help='File of fastq reads mapping to multiple\\\n locations. Example: /data/multimap.fastq')\n-parser.add_argument('alignment_bam', action='store', metavar='alignment_bam',\n- help='Enter bamfile output for reads that map uniquely.\\\n- Example /bamfiles/old.bam')\n+parser.add_argument('--alignment_bam', action='st"..b'ile_prefix\n- + \'_class_total_counts.txt\', \'w\')\n- for key in sorted(classtotalcounts.keys()):\n- fout2.write(str(key) + \'\\t\' + str(classtotalcounts[key]) + \'\\n\')\n- fout3 = open(outputfolder + os.path.sep + outputfile_prefix\n- + \'_family_total_counts.txt\', \'w\')\n- for key in sorted(familytotalcounts.keys()):\n- fout3.write(str(key) + \'\\t\' + str(familytotalcounts[key]) + \'\\n\')\n- fout4 = open(outputfolder + os.path.sep + outputfile_prefix +\n- \'_unique_counts.txt\', \'w\')\n- for key in sorted(repcounts2.keys()):\n- fout4.write(str(key) + \'\\t\' + repeatclass[key] + \'\\t\' +\n- repeatfamily[key] + \'\\t\' + str(repcounts2[key]) + \'\\n\')\n- fout5 = open(outputfolder + os.path.sep + outputfile_prefix\n- + \'_class_fraction_counts.txt\', \'w\')\n+print(\'Writing final output...\')\n+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"\n+ f"class_fraction_counts.txt", \'w\') as fout:\n for key in sorted(classfractionalcounts.keys()):\n- fout5.write(str(key) + \'\\t\' + str(classfractionalcounts[key]) + \'\\n\')\n- fout6 = open(outputfolder + os.path.sep + outputfile_prefix +\n- \'_family_fraction_counts.txt\', \'w\')\n+ fout.write(f"{key}\\t{classfractionalcounts[key]}\\n")\n+\n+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"\n+ f"family_fraction_counts.txt", \'w\') as fout:\n for key in sorted(familyfractionalcounts.keys()):\n- fout6.write(str(key) + \'\\t\' + str(familyfractionalcounts[key]) + \'\\n\')\n- fout7 = open(outputfolder + os.path.sep + outputfile_prefix\n- + \'_fraction_counts.txt\', \'w\')\n+ fout.write(f"{key}\\t{familyfractionalcounts[key]}\\n")\n+\n+with open(f"{os.path.join(outputfolder, outputfile_prefix)}_"\n+ f"fraction_counts.txt", \'w\') as fout:\n for key in sorted(fractionalcounts.keys()):\n- fout7.write(str(key) + \'\\t\' + repeatclass[key] + \'\\t\' +\n- repeatfamily[key] + \'\\t\' + str(int(fractionalcounts[key]))\n- + \'\\n\')\n- fout1.close()\n- fout2.close()\n- fout3.close()\n- fout4.close()\n- fout5.close()\n- fout6.close()\n- fout7.close()\n-else:\n- fout1 = open(outputfolder + os.path.sep + outputfile_prefix +\n- \'_class_fraction_counts.txt\', \'w\')\n- for key in sorted(classfractionalcounts.keys()):\n- fout1.write(str(key) + \'\\t\' + str(classfractionalcounts[key]) + \'\\n\')\n- fout2 = open(outputfolder + os.path.sep + outputfile_prefix +\n- \'_family_fraction_counts.txt\', \'w\')\n- for key in sorted(familyfractionalcounts.keys()):\n- fout2.write(str(key) + \'\\t\' + str(familyfractionalcounts[key]) + \'\\n\')\n- fout3 = open(outputfolder + os.path.sep + outputfile_prefix +\n- \'_fraction_counts.txt\', \'w\')\n- for key in sorted(fractionalcounts.keys()):\n- fout3.write(str(key) + \'\\t\' + repeatclass[key] + \'\\t\' +\n- repeatfamily[key] + \'\\t\' + str(int(fractionalcounts[key]))\n- + \'\\n\')\n- fout1.close()\n- fout2.close()\n- fout3.close()\n-##############################################################################\n-# Remove Large intermediate files\n-if os.path.exists(outputfolder + os.path.sep + outputfile_prefix +\n- \'_regionsorter.txt\'):\n- os.remove(outputfolder + os.path.sep + outputfile_prefix +\n- \'_regionsorter.txt\')\n-if os.path.exists(outputfolder + os.path.sep + \'pair1_bowtie\'):\n- shutil.rmtree(outputfolder + os.path.sep + \'pair1_bowtie\')\n-if os.path.exists(outputfolder + os.path.sep + \'pair2_bowtie\'):\n- shutil.rmtree(outputfolder + os.path.sep + \'pair2_bowtie\')\n-if os.path.exists(outputfolder + os.path.sep + \'sorted_bowtie\'):\n- shutil.rmtree(outputfolder + os.path.sep + \'sorted_bowtie\')\n-print("... Done")\n+ fout.write(f"{key}\\t{repeatclass[key]}\\t{repeatfamily[key]}\\t"\n+ f"{int(fractionalcounts[key])}\\n")\n' |
b |
diff -r 6bba3e33c2e7 -r 89e05f831259 RepEnrich_setup.py --- a/RepEnrich_setup.py Sat Mar 09 22:32:46 2024 +0000 +++ b/RepEnrich_setup.py Mon Mar 18 09:39:44 2024 +0000 |
[ |
b"@@ -11,86 +11,49 @@\n from Bio.SeqRecord import SeqRecord\n \n parser = argparse.ArgumentParser(description='''\n- Part I: Prepartion of repetive element psuedogenomes and repetive\\\n- element bamfiles. This script prepares the annotation used by\\\n- downstream applications to analyze for repetitive element\\\n- enrichment. For this script to run properly bowtie must be\\\n- loaded. The repeat element psuedogenomes are prepared in order\\\n- to analyze reads that map to multiple locations of the genome.\\\n- The repeat element bamfiles are prepared in order to use a\\\n- region sorter to analyze reads that map to a single location\\\n- of the genome. You will 1) annotation_file:\\\n- The repetitive element annotation file downloaded from\\\n- RepeatMasker.org database for your organism of interest.\\\n- 2) genomefasta: Your genome of interest in fasta format,\\\n- 3)setup_folder: a folder to contain repeat element setup files\\\n- command-line usage\n- EXAMPLE: python master_setup.py\\\n- /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\\\n- /users/nneretti/data/annotation/mm9/mm9.fa\\\n- /users/nneretti/data/annotation/mm9/setup_folder''',\n+ Prepartion of repetive element pseudogenomes bowtie\\\n+ indexes and annotation files used by RepEnrich.py enrichment.''',\n prog='getargs_genome_maker.py')\n-parser.add_argument('--version', action='version', version='%(prog)s 0.1')\n-parser.add_argument('annotation_file', action='store',\n+parser.add_argument('--annotation_file', action='store',\n metavar='annotation_file',\n- help='''List annotation file. The annotation file contains\\\n- the repeat masker annotation for the genome of\\\n- interest and may be downloaded at RepeatMasker.org\\\n- Example /data/annotation/mm9/mm9.fa.out''')\n-parser.add_argument('genomefasta', action='store', metavar='genomefasta',\n- help='''File name and path for genome of interest in fasta\\\n- format. Example /data/annotation/mm9/mm9.fa''')\n-parser.add_argument('setup_folder', action='store', metavar='setup_folder',\n- help='''List folder to contain bamfiles for repeats and\\\n+ help='''Repeat masker annotation of the genome of\\\n+ interest. Download from RepeatMasker.org\\\n+ Example: mm9.fa.out''')\n+parser.add_argument('--genomefasta', action='store', metavar='genomefasta',\n+ help='''Genome of interest in fasta format.\\\n+ Example: mm9.fa''')\n+parser.add_argument('--setup_folder', action='store', metavar='setup_folder',\n+ help='''Folder that contains bowtie indexes of repeats and\\\n repeat element psuedogenomes.\\\n- Example /data/annotation/mm9/setup''')\n-parser.add_argument('--nfragmentsfile1', action='store',\n- dest='nfragmentsfile1', metavar='nfragmentsfile1',\n- default='./repnames_nfragments.txt',\n- help='''Output location of a description file that saves\\\n- the number of fragments processed per repname.\n- Default ./repnames_nfragments.txt''')\n+ Example working/setup''')\n parser.add_argument('--gaplength', action='store', dest='gaplength',\n metavar='gaplength', default='200', type=int,\n- help='Length of the spacer used to build\\\n- repeat psuedogeneomes. Default 200')\n+ help='''Length of the N-spacer in the\\\n+ repeat pseudogenomes. Default 200''')\n pa"..b'nary key that contains each repeat type with the\n-# associated binary number; sort the binary key:\n-fout = open(os.path.realpath(setup_folder + os.path.sep +\n- \'repgenomes_key.txt\'), \'w\')\n-x = 0\n-for repeat in repeat_elements:\n- # print >> fout, str(repeat) + \'\\t\' + str(x)\n- fout.write(str(repeat) + \'\\t\' + str(x) + \'\\n\')\n- x += 1\n-fout.close()\n-##############################################################################\n-# generate spacer for psuedogenomes\n-spacer = ""\n-for i in range(gapl):\n- spacer = spacer + "N"\n+# sort repeat_elements and print them in repgenomes_key.txt\n+with open(os.path.join(setup_folder, \'repgenomes_key.txt\'), \'w\') as fout:\n+ for i, repeat in enumerate(sorted(repeat_elements)):\n+ fout.write(\'\\t\'.join([repeat, str(i)]) + \'\\n\')\n \n-# save file with number of fragments processed per repname\n-print("Saving number of fragments processed per repname to "\n- + nfragmentsfile1)\n-fout1 = open(os.path.realpath(nfragmentsfile1), "w")\n-for repname in rep_chr.keys():\n- rep_chr_current = rep_chr[repname]\n-# print >>fout1, str(len(rep_chr[repname])) + "\\t" + repname\n- fout1.write(str(len(rep_chr[repname])) + "\\t" + repname + \'\\n\')\n-fout1.close()\n+# generate spacer for pseudogenomes\n+spacer = \'\'.join([\'N\' for i in range(gapl)])\n \n-# generate metagenomes and save them to FASTA files\n-k = 1\n-nrepgenomes = len(rep_chr.keys())\n-for repname in rep_chr.keys():\n- metagenome = ""\n- newname = repname.replace("(", "_").replace(")", "_").replace("/", "_")\n- print("processing repgenome " + newname + ".fa" + " (" + str(k)\n- + " of " + str(nrepgenomes) + ")")\n- rep_chr_current = rep_chr[repname]\n- rep_start_current = rep_start[repname]\n- rep_end_current = rep_end[repname]\n- print("-------> " + str(len(rep_chr[repname])) + " fragments")\n- for i in range(len(rep_chr[repname])):\n+# generate metagenomes and save them to FASTA files for bowtie build\n+for repname in rep_chr:\n+ metagenome = \'\'\n+ for i, repeat in enumerate(rep_chr[repname]):\n try:\n- chr = rep_chr_current[i]\n- rstart = max(rep_start_current[i] - flankingl, 0)\n- rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1)\n- metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)]\n+ chromosome = rep_chr[repname][i]\n+ start = max(int(rep_start[repname][i]) - flankingl, 0)\n+ end = min(int(rep_end[repname][i]) + flankingl,\n+ int(lgenome[chr])-1) + 1\n+ metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}"\n except KeyError:\n- print("Unrecognised Chromosome: "+chr)\n- pass\n- # Convert metagenome to SeqRecord object (required by SeqIO.write)\n- record = SeqRecord(Seq(metagenome), id="repname",\n- name="", description="")\n- print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of "\n- + str(nrepgenomes) + ")")\n- fastafilename = os.path.realpath(setup_folder + os.path.sep\n- + newname + ".fa")\n+ print("Unrecognised Chromosome: " + rep_chr[repname][i])\n+\n+ # Create Fasta of repeat pseudogenome\n+ fastafilename = f"{os.path.join(setup_folder, repname)}.fa"\n+ record = SeqRecord(Seq(metagenome), id=repname, name=\'\', description=\'\')\n SeqIO.write(record, fastafilename, "fasta")\n- print("indexing repgenome " + newname + ".fa" + " (" +\n- str(k) + " of " + str(nrepgenomes) + ")")\n- command = shlex.split(\'bowtie-build -f \' + fastafilename + \' \' +\n- setup_folder + os.path.sep + newname)\n- p = subprocess.Popen(command).communicate()\n- k += 1\n \n-print("... Done")\n+ # Generate repeat pseudogenome bowtie index\n+ bowtie_build_cmd = ["bowtie-build", "-f", fastafilename,\n+ os.path.join(setup_folder, repname)]\n+ subprocess.run(bowtie_build_cmd, check=True)\n' |
b |
diff -r 6bba3e33c2e7 -r 89e05f831259 macros.xml --- a/macros.xml Sat Mar 09 22:32:46 2024 +0000 +++ b/macros.xml Mon Mar 18 09:39:44 2024 +0000 |
b |
@@ -1,6 +1,6 @@ <macros> <token name="@TOOL_VERSION@">1.83</token> - <token name="@VERSION_SUFFIX@">0</token> + <token name="@VERSION_SUFFIX@">1</token> <token name="@PROFILE@">23.0</token> <xml name="requirements"> |
b |
diff -r 6bba3e33c2e7 -r 89e05f831259 repenrich.xml --- a/repenrich.xml Sat Mar 09 22:32:46 2024 +0000 +++ b/repenrich.xml Mon Mar 18 09:39:44 2024 +0000 |
b |
@@ -30,16 +30,18 @@ #else: #if $seq_method.input2_fastq.is_of_type("fastq.gz", "fastqsanger.gz"): gunzip < '$seq_method.input_fastq' > '${input_base}.fastq' && - gunzip < '$seq_method.input2_fastq' > '${input_base}_2.fastq' && + gunzip < '$seq_method.input2_fastq' > '${input_base}_2.fastq' && #else: ln -f -s '$seq_method.input_fastq' '${input_base}.fastq' && ln -f -s '$seq_method.input2_fastq' '${input_base}_2.fastq' && #end if #end if - ln -f -s '$genome' '${baseReference}.fa' && bowtie-build '$genome' ${baseReference} && - python $__tool_directory__/RepEnrich_setup.py $repeatmasker ${baseReference}.fa setup_folder_${baseReference} && + python $__tool_directory__/RepEnrich_setup.py + --annotation_file $repeatmasker + --genomefasta ${baseReference}.fa + --setup_folder setup_folder_${baseReference} && #if $seq_method.seq_method_list == "single-read": bowtie $baseReference -p \${GALAXY_SLOTS:-4} -t -m 1 -S --max ${input_base}_multimap.fastq ${input_base}.fastq ${input_base}_unique.sam 2>bowtie_alignments.txt && TOTAL=\$(grep 'reads processed:' bowtie_alignments.txt | cut -d ' ' -f 4) && @@ -56,9 +58,25 @@ samtools view -@ \${GALAXY_SLOTS:-4} -bS '${input_base}_unique.sam' | samtools sort -@ \${GALAXY_SLOTS:-4} -O bam -o '${input_base}_unique.bam' && samtools index ${input_base}_unique.bam && #if $seq_method.seq_method_list == "single-read": - python $__tool_directory__/RepEnrich.py $repeatmasker ${input_base} ${input_base} setup_folder_${baseReference} ${input_base}_multimap.fastq ${input_base}_unique.bam --cpus "\${GALAXY_SLOTS:-4}" && + python $__tool_directory__/RepEnrich.py + --annotation_file $repeatmasker + --outputfolder ${input_base} + --outputprefix ${input_base} + --setup_folder setup_folder_${baseReference} + --fastqfile ${input_base}_multimap.fastq + --alignment_bam ${input_base}_unique.bam + --cpus "\${GALAXY_SLOTS:-4}" && #else: - python $__tool_directory__/RepEnrich.py $repeatmasker ${input_base} ${input_base} setup_folder_${baseReference} ${input_base}_multimap_1.fastq --fastqfile2 ${input_base}_multimap_2.fastq ${input_base}_unique.bam --cpus "\${GALAXY_SLOTS:-4}" --pairedend TRUE && + python $__tool_directory__/RepEnrich.py + --annotation_file $repeatmasker + --outputfolder ${input_base} + --outputprefix ${input_base} + --setup_folder setup_folder_${baseReference} + --fastqfile ${input_base}_multimap_1.fastq + --fastqfile2 ${input_base}_multimap_2.fastq + --alignment_bam ${input_base}_unique.bam + --cpus "\${GALAXY_SLOTS:-4}" + --pairedend TRUE && #end if cp $input_base/${input_base}_class_fraction_counts.txt class_fraction_counts.tabular && cp $input_base/${input_base}_family_fraction_counts.txt family_fraction_counts.tabular && @@ -227,15 +245,10 @@ .. class:: infomark -For more information on the tools, please visit our `code repository`_. - -If you would like to give us feedback or you run into any trouble, please send an email to artbio.ibps@gmail.com - -This tool wrapper is developed by the `ARTbio team`_ at the `Institut de Biologie Paris Seine (IBPS)`_. +For more information on the tools, or giving us feedback, please visit our `code repository`_. .. _code repository: https://github.com/ARTbio/tools-artbio/tree/master/tools/ .. _ARTbio team: http://artbio.fr -.. _Institut de Biologie Paris Seine (IBPS): http://www.ibps.upmc.fr/en/core-facilities/bioinformatics </help> |