annotate RepEnrich2_setup.py @ 2:cfb06f8e8f52 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 4ac07201d6267f5efd5c2af20db7f53fce5af8af
author artbio
date Sat, 20 Apr 2024 15:17:04 +0000
parents 4905a332a094
children c5bb2f9af708
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
1 #!/usr/bin/env python
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
2 import argparse
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
3 import csv
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
4 import os
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
5 import shlex
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
6 import subprocess
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
7 import sys
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
8 from collections import defaultdict
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
9 from concurrent.futures import ProcessPoolExecutor
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
10
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
11
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
12 from Bio import SeqIO
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
13 from Bio.Seq import Seq
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
14 from Bio.SeqRecord import SeqRecord
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
15
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
# Command-line interface of the setup script.
# Help texts are assembled from adjacent string literals with explicit
# spaces, so words are never glued together at line breaks.
parser = argparse.ArgumentParser(
    description='Preparation of repetitive element pseudogenomes, bowtie '
                'indexes and annotation files used by RepEnrich.py '
                'enrichment.',
    prog='getargs_genome_maker.py')
# The two input files are mandatory: the rest of the script cannot run
# without them, so argparse reports a missing one with a usage error.
parser.add_argument('--annotation_file', action='store',
                    metavar='annotation_file', required=True,
                    help='Repeat masker annotation of the genome of '
                         'interest. Download from RepeatMasker.org. '
                         'Example: mm9.fa.out')
parser.add_argument('--genomefasta', action='store', metavar='genomefasta',
                    required=True,
                    help='Genome of interest in fasta format. '
                         'Example: mm9.fa')
# Integer options use integer defaults (type=int is applied to values
# given on the command line).
parser.add_argument('--gaplength', action='store', dest='gaplength',
                    metavar='gaplength', default=200, type=int,
                    help='Length of the N-spacer in the '
                         'repeat pseudogenomes. Default 200')
parser.add_argument('--flankinglength', action='store', dest='flankinglength',
                    metavar='flankinglength', default=25, type=int,
                    help='Length of the flanking regions used to build '
                         'repeat pseudogenomes. Flanking length should be '
                         'set according to the length of your reads. '
                         'Default 25, for 50 nt reads')
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
                    default=1, type=int,
                    help='Number of CPUs. The more cpus the '
                         'faster RepEnrich performs. Default: "1"')
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
args = parser.parse_args()

# parameters from argparse, unpacked into the module-level names used by
# the rest of the script
annotation_file, genomefasta = args.annotation_file, args.genomefasta
gapl, flankingl = args.gaplength, args.flankinglength
cpus = args.cpus
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
50
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
# check that the programs we need are available
try:
    # subprocess.DEVNULL discards the output without opening two file
    # objects on os.devnull that would never be closed.
    subprocess.call(["bowtie2", "--version"],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL)
except OSError:
    # OSError is raised when the executable cannot be started
    print("Error: Bowtie2 not available in the path")
    raise
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
59
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
60
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
def starts_with_numerical(list):
    """
    Tell whether the first item of a sequence can be converted by int().

    Returns False for an empty sequence and when int() rejects the first
    item with a ValueError; returns True otherwise.
    """
    try:
        if not len(list):
            return False
        int(list[0])
    except ValueError:
        return False
    else:
        return True
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
69
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
70
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
71 # define a text importer for .out/.txt format of repbase
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
def import_text(filename, separator):
    """
    Read a delimited text file and return its integer-led rows.

    filename: path of the file to read.
    separator: field delimiter handed to csv.reader; spaces following a
        delimiter are skipped (skipinitialspace=True).

    Returns a list of rows (lists of strings), keeping only the rows
    accepted by starts_with_numerical.

    Side effect: raises the csv module's field size limit to sys.maxsize.
    """
    csv.field_size_limit(sys.maxsize)
    # the context manager closes the input file once all rows are read
    with open(filename) as handle:
        reader = csv.reader(handle, delimiter=separator,
                            skipinitialspace=True)
        return [row for row in reader if starts_with_numerical(row)]
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
77
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
78
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
# load genome into dictionary and compute length
# g maps each FASTA record id to its SeqRecord
g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
genome = defaultdict(dict)

# the loop variable is not called `chr`, so the builtin chr() stays
# usable in the rest of the module
for chrom, chrom_record in g.items():
    genome[chrom]['sequence'] = chrom_record.seq
    genome[chrom]['length'] = len(chrom_record.seq)
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
86
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
# Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
repeat_elements = set()
rep_coords = defaultdict(list)  # Merged dictionary for coordinates

with open('repnames.bed', 'w') as fout:
    for line in import_text(annotation_file, ' '):
        # '(', ')' and '/' become '_' in the repeat name taken from field 9
        repname = line[9].translate(str.maketrans('()/', '___'))
        # fields 4, 5 and 6 hold chromosome, start and end
        repchr, repstart, repend = line[4:7]
        repeat_elements.add(repname)
        rep_coords[repname] += [repchr, repstart, repend]
        fout.write('\t'.join((repchr, repstart, repend, repname)) + '\n')
# repeat_elements: the set of unique (sanitized) repeat names
# rep_coords: maps each repeat name to one flat list in which every repeat
# instance contributes three consecutive items: chromosome, start, end
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
102
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
# write the sorted repeat names to repeatIDs.txt, one per line, each
# followed by a tab and its 0-based rank in the sorted order
with open('repeatIDs.txt', 'w') as fout:
    fout.writelines(
        f"{repeat}\t{rank}\n"
        for rank, repeat in enumerate(sorted(repeat_elements))
    )
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
107
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
# generate spacer for pseudogenomes: a run of gapl 'N' characters
spacer = 'N' * gapl

# generate metagenomes and save them to FASTA files for bowtie build
# coordinates are stored flat, so they are read by block of 3
# (chr, start, end)
block = 3
for repname, coords in rep_coords.items():
    # Pieces are collected and joined once per repeat; growing the string
    # inside the loop would copy the whole pseudogenome at every instance.
    pieces = []
    for i in range(0, len(coords) - block + 1, block):
        chromosome, rep_start, rep_end = coords[i:i + block]
        # widen the interval by the flanking length, clamped to the
        # chromosome boundaries
        start = max(int(rep_start) - flankingl, 0)
        end = min(int(rep_end) + flankingl,
                  int(genome[chromosome]['length']) - 1) + 1
        # every instance is preceded by a spacer
        pieces.append(spacer)
        pieces.append(str(genome[chromosome]['sequence'][start:end]))
    metagenome = ''.join(pieces)

    # Create Fasta of repeat pseudogenome
    fastafilename = f"{repname}.fa"
    record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
    SeqIO.write(record, fastafilename, "fasta")
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
131
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
132
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
def bowtie_build(args):
    """
    Function to be executed in parallel by ProcessPoolExecutor.

    Runs ``bowtie2-build -f <fasta> <bowtie_base>`` for one pseudogenome.

    args: a (bowtie_base, fasta) pair.

    Returns a string and never raises for ordinary failures:
    - the captured standard output when the command exits with status 0;
    - the captured standard error (or standard output if that is empty)
      when the command exits with a non-zero status;
    - the exception message when the arguments cannot be unpacked or the
      command cannot be started.
    """
    try:
        bowtie_base, fasta = args
        # An argument list is passed as is to the program, so names that
        # contain quotes or other shell-special characters need no escaping.
        command = ["bowtie2-build", "-f", fasta, bowtie_base]
        squash = subprocess.run(command, capture_output=True, text=True)
    except Exception as e:
        # worker boundary: report the problem as text instead of raising
        return str(e)
    if squash.returncode != 0:
        return squash.stderr or squash.stdout
    return squash.stdout
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
144
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
145
4905a332a094 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
artbio
parents:
diff changeset
# one (index base name, FASTA file name) pair per repeat
args_list = [(name, f"{name}.fa") for name in rep_coords]
# The pool is only started when the file runs as a script. With the
# 'spawn' start method, worker processes import the main module again,
# and without this guard each of them would try to start its own pool.
if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=cpus) as executor:
        # leaving the with-block waits for all submitted builds to finish
        executor.map(bowtie_build, args_list)