Mercurial > repos > artbio > repenrich
annotate RepEnrich_setup.py @ 15:2e3d976e7d5d draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 03183e29f807ec33548016a7c4144f52720b7b9e
author | artbio |
---|---|
date | Sun, 21 Apr 2024 09:44:51 +0000 |
parents | bf866bedd4b4 |
children |
rev | line source |
---|---|
0
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
1 #!/usr/bin/env python |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
2 import argparse |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
3 import csv |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
4 import shlex |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
5 import subprocess |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
6 import sys |
13
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
7 from collections import defaultdict |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
8 from concurrent.futures import ProcessPoolExecutor |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
9 |
0
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
10 |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
11 from Bio import SeqIO |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
12 from Bio.Seq import Seq |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
13 from Bio.SeqRecord import SeqRecord |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
14 |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
# Command-line interface of the setup script. The two input files are
# mandatory: without them the script can only fail later on, so argparse is
# asked to report the missing option itself.
parser = argparse.ArgumentParser(
    description='Preparation of repetitive element pseudogenomes bowtie '
                'indexes and annotation files used by RepEnrich.py '
                'enrichment.',
    prog='getargs_genome_maker.py')
parser.add_argument('--annotation_file', action='store', required=True,
                    metavar='annotation_file',
                    help='Repeat masker annotation of the genome of '
                         'interest. Download from RepeatMasker.org. '
                         'Example: mm9.fa.out')
parser.add_argument('--genomefasta', action='store', required=True,
                    metavar='genomefasta',
                    help='Genome of interest in fasta format. '
                         'Example: mm9.fa')
parser.add_argument('--gaplength', action='store', dest='gaplength',
                    metavar='gaplength', default=200, type=int,
                    help='Length of the N-spacer in the '
                         'repeat pseudogenomes. Default 200')
parser.add_argument('--flankinglength', action='store', dest='flankinglength',
                    metavar='flankinglength', default=25, type=int,
                    help='Length of the flanking regions used to build '
                         'repeat pseudogenomes. Flanking length should be '
                         'set according to the length of your reads. '
                         'Default 25, for 50 nt reads')
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
                    default=1, type=int,
                    help='Number of CPUs. The more cpus the '
                         'faster RepEnrich performs. Default: "1"')
args = parser.parse_args()

# parameters from argparse
gapl = args.gaplength
flankingl = args.flankinglength
annotation_file = args.annotation_file
genomefasta = args.genomefasta
cpus = args.cpus
0
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
49 |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
50 |
13
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
def starts_with_numerical(list):
    """Tell whether the first item of a sequence parses as an integer.

    Returns False for an empty sequence, or when int() rejects the first
    item with a ValueError; returns True otherwise.
    """
    if len(list) == 0:
        return False
    try:
        int(list[0])
    except ValueError:
        return False
    else:
        return True
0
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
59 |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
60 |
13
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
61 # define a text importer for .out/.txt format of repbase |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
def import_text(filename, separator):
    """Read a delimited text file and return its numeric-leading rows.

    The file is parsed with csv.reader using `separator` as the delimiter
    and skipinitialspace=True. Only the rows accepted by
    starts_with_numerical() are kept, each as a list of strings.
    """
    # lift the csv field size cap before parsing
    csv.field_size_limit(sys.maxsize)
    # The reader is lazy, so the rows are collected while the file is still
    # open; the with-block then closes the handle (it used to be leaked).
    with open(filename) as handle:
        reader = csv.reader(handle, delimiter=separator,
                            skipinitialspace=True)
        return [row for row in reader if starts_with_numerical(row)]
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
67 |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
68 |
12
89e05f831259
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
artbio
parents:
11
diff
changeset
|
69 # load genome into dictionary and compute length |
0
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
# index every FASTA record by its id, then keep, per record, the sequence
# as a plain string together with its length
g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
genome = defaultdict(dict)

for seq_id, seq_record in g.items():
    genome[seq_id].update(sequence=str(seq_record.seq),
                          length=len(seq_record.seq))
0
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
76 |
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
77 # Build a BED file of repeat coordinates for use by RepEnrich region_sorter |
13
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
repeat_elements = set()
rep_coords = defaultdict(list)  # Merged dictionary for coordinates
# maps '(', ')' and '/' to '_' in repeat names
name_table = str.maketrans('()/', '___')

with open('repnames.bed', 'w') as fout:
    for fields in import_text(annotation_file, ' '):
        # column 10 is the repeat name, columns 5-7 are chr, start and end
        repname = fields[9].translate(name_table)
        repeat_elements.add(repname)
        repchr, repstart, repend = fields[4:7]
        fout.write('\t'.join((repchr, repstart, repend, repname)) + '\n')
        # coordinates are stored flat: chr, start, end, chr, start, end, ...
        rep_coords[repname] += [repchr, repstart, repend]
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
89 # repeat_elements now contains the unique repeat names |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
90 # rep_coords is a dictionary where keys are repeat names and values are lists |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
91 # containing chromosome, start, and end coordinates for each repeat instance |
0
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
92 |
13
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
93 # sort repeat_elements and print them in repeatIDs.txt |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
# one "<repeat name><TAB><rank>" line per unique repeat, in sorted order
with open('repeatIDs.txt', 'w') as fout:
    fout.writelines(f"{name}\t{rank}\n"
                    for rank, name in enumerate(sorted(repeat_elements)))
0
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
97 |
12
89e05f831259
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
artbio
parents:
11
diff
changeset
|
98 # generate spacer for pseudogenomes |
89e05f831259
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
artbio
parents:
11
diff
changeset
|
# run of gapl 'N' characters placed between repeat instances
spacer = 'N' * gapl
0
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
100 |
12
89e05f831259
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 212b838f614f1f7b8e770473c026d9c1180722df
artbio
parents:
11
diff
changeset
|
101 # generate metagenomes and save them to FASTA files for bowtie build |
13
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
for repname in rep_coords:
    # the coordinate list is flat, so walk it three items at a time:
    # (chr, start, end)
    coords = rep_coords[repname]
    genomes_list = []
    for chromosome, rep_start, rep_end in zip(coords[0::3],
                                              coords[1::3],
                                              coords[2::3]):
        # widen the repeat by the flanking length, clamped to the sequence
        start = max(int(rep_start) - flankingl, 0)
        last_index = int(genome[chromosome]['length']) - 1
        end = min(int(rep_end) + flankingl, last_index) + 1
        genomes_list.append(genome[chromosome]['sequence'][start:end])
    # all instances of one repeat, separated by the N-spacer
    metagenome = spacer.join(genomes_list)
    # Create Fasta of repeat pseudogenome
    record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
    SeqIO.write(record, f"{repname}.fa", "fasta")
f6f0f1e5e940
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
artbio
parents:
diff
changeset
|
118 |
13
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
119 |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
def bowtie_build(args):
    """
    Function to be executed in parallel by ProcessPoolExecutor.

    `args` is a (bowtie_base, fasta) pair: the index base name and the FASTA
    file handed to `bowtie-build -f`.

    Returns the captured standard output of bowtie-build. If anything
    raises (bad argument tuple, executable not found, ...), the text of the
    exception is returned instead of being propagated.
    """
    try:
        bowtie_base, fasta = args
        # Hand the arguments over as a list rather than formatting a command
        # string and re-splitting it: names containing whitespace or quote
        # characters then reach bowtie-build unchanged.
        command = ["bowtie-build", "-f", str(fasta), str(bowtie_base)]
        # NOTE(review): the exit status of bowtie-build is not inspected
        # here; a failed build still returns its (possibly empty) stdout.
        squash = subprocess.run(command, capture_output=True, text=True)
        return squash.stdout
    except Exception as e:
        return str(e)
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
131 |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
132 |
530626b0757c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
artbio
parents:
12
diff
changeset
|
# one (index base name, FASTA file) job per repeat pseudogenome
args_list = [(name, f"{name}.fa") for name in rep_coords]

# Worker processes started with the 'spawn' or 'forkserver' methods import
# this module again; the guard keeps them from starting pools of their own.
if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=cpus) as executor:
        # Drain the result iterator: an error raised by the pool itself
        # (for instance a worker process that died) is re-raised here
        # instead of being silently discarded with the unused iterator.
        for _ in executor.map(bowtie_build, args_list):
            pass
135 executor.map(bowtie_build, args_list) |