comparison RepEnrich_setup.py @ 13:530626b0757c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit df6b9491ad06e8a85e67c663b68db3cce3eb0115
author artbio
date Tue, 02 Apr 2024 21:16:37 +0000
parents 89e05f831259
children bf866bedd4b4
comparison
equal deleted inserted replaced
12:89e05f831259 13:530626b0757c
3 import csv 3 import csv
4 import os 4 import os
5 import shlex 5 import shlex
6 import subprocess 6 import subprocess
7 import sys 7 import sys
8 from collections import defaultdict
9 from concurrent.futures import ProcessPoolExecutor
10
8 11
9 from Bio import SeqIO 12 from Bio import SeqIO
10 from Bio.Seq import Seq 13 from Bio.Seq import Seq
11 from Bio.SeqRecord import SeqRecord 14 from Bio.SeqRecord import SeqRecord
12 15
20 interest. Download from RepeatMasker.org\ 23 interest. Download from RepeatMasker.org\
21 Example: mm9.fa.out''') 24 Example: mm9.fa.out''')
22 parser.add_argument('--genomefasta', action='store', metavar='genomefasta', 25 parser.add_argument('--genomefasta', action='store', metavar='genomefasta',
23 help='''Genome of interest in fasta format.\ 26 help='''Genome of interest in fasta format.\
24 Example: mm9.fa''') 27 Example: mm9.fa''')
25 parser.add_argument('--setup_folder', action='store', metavar='setup_folder',
26 help='''Folder that contains bowtie indexes of repeats and\
27 repeat element psuedogenomes.\
28 Example working/setup''')
29 parser.add_argument('--gaplength', action='store', dest='gaplength', 28 parser.add_argument('--gaplength', action='store', dest='gaplength',
30 metavar='gaplength', default='200', type=int, 29 metavar='gaplength', default='200', type=int,
31 help='''Length of the N-spacer in the\ 30 help='''Length of the N-spacer in the\
32 repeat pseudogenomes. Default 200''') 31 repeat pseudogenomes. Default 200''')
33 parser.add_argument('--flankinglength', action='store', dest='flankinglength', 32 parser.add_argument('--flankinglength', action='store', dest='flankinglength',
34 metavar='flankinglength', default='25', type=int, 33 metavar='flankinglength', default='25', type=int,
35 help='''Length of the flanking regions used to build\ 34 help='''Length of the flanking regions used to build\
36 repeat pseudogenomes. Flanking length should be set\ 35 repeat pseudogenomes. Flanking length should be set\
37 according to the length of your reads.\ 36 according to the length of your reads.\
38 Default 25, for 50 nt reads''') 37 Default 25, for 50 nt reads''')
38 parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
39 default="1", type=int,
40 help='Number of CPUs. The more cpus the\
41 faster RepEnrich performs. Default: "1"')
39 args = parser.parse_args() 42 args = parser.parse_args()
40 43
41 # parameters from argsparse 44 # parameters from argsparse
42 gapl = args.gaplength 45 gapl = args.gaplength
43 flankingl = args.flankinglength 46 flankingl = args.flankinglength
44 annotation_file = args.annotation_file 47 annotation_file = args.annotation_file
45 genomefasta = args.genomefasta 48 genomefasta = args.genomefasta
46 setup_folder = args.setup_folder 49 cpus = args.cpus
47 50
48 # check that the programs we need are available 51 # check that the programs we need are available
49 try: 52 try:
50 subprocess.call(shlex.split("bowtie --version"), 53 subprocess.call(shlex.split("bowtie --version"),
51 stdout=open(os.devnull, 'wb'), 54 stdout=open(os.devnull, 'wb'),
52 stderr=open(os.devnull, 'wb')) 55 stderr=open(os.devnull, 'wb'))
53 except OSError: 56 except OSError:
54 print("Error: Bowtie not available in the path") 57 print("Error: Bowtie not available in the path")
55 raise 58 raise
56 59
57 # Define a text importer 60
58 csv.field_size_limit(sys.maxsize) 61 def starts_with_numerical(list):
62 try:
63 if len(list) == 0:
64 return False
65 int(list[0])
66 return True
67 except ValueError:
68 return False
59 69
60 70
71 # define a text importer for .out/.txt format of repbase
61 def import_text(filename, separator): 72 def import_text(filename, separator):
62 for line in csv.reader(open(os.path.realpath(filename)), 73 csv.field_size_limit(sys.maxsize)
63 delimiter=separator, skipinitialspace=True): 74 file = csv.reader(open(filename), delimiter=separator,
64 if line: 75 skipinitialspace=True)
65 yield line 76 return [line for line in file if starts_with_numerical(line)]
66 77
67 78
68 # Make a setup folder
69 if not os.path.exists(setup_folder):
70 os.makedirs(setup_folder)
71 # load genome into dictionary and compute length 79 # load genome into dictionary and compute length
72 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) 80 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
73 idxgenome, lgenome, genome = {}, {}, {} 81 genome = defaultdict(dict)
74 82
75 for k, chr in enumerate(g.keys()): 83 for chr in g.keys():
76 genome[chr] = g[chr].seq 84 genome[chr]['sequence'] = g[chr].seq
77 lgenome[chr] = len(genome[chr]) 85 genome[chr]['length'] = len(g[chr].seq)
78 idxgenome[chr] = k
79 86
80 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter 87 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
81 repeat_elements = [] 88 repeat_elements = set()
82 # these dictionaries will contain lists 89 rep_coords = defaultdict(list) # Merged dictionary for coordinates
83 rep_chr, rep_start, rep_end = {}, {}, {} 90
84 fin = import_text(annotation_file, ' ') 91 with open('repnames.bed', 'w') as fout:
85 with open(os.path.join(setup_folder, 'repnames.bed'), 'w') as fout: 92 f_in = import_text(annotation_file, ' ')
86 for i in range(3): 93 for line in f_in:
87 next(fin)
88 for line in fin:
89 repname = line[9].translate(str.maketrans('()/', '___')) 94 repname = line[9].translate(str.maketrans('()/', '___'))
90 if repname not in repeat_elements: 95 repeat_elements.add(repname)
91 repeat_elements.append(repname) 96 repchr, repstart, repend = line[4], line[5], line[6]
92 repchr = line[4] 97 fout.write(f"{repchr}\t{repstart}\t{repend}\t{repname}\n")
93 repstart = line[5] 98 rep_coords[repname].extend([repchr, repstart, repend])
94 repend = line[6] 99 # repeat_elements now contains the unique repeat names
95 fout.write('\t'.join([repchr, repstart, repend, repname]) + '\n') 100 # rep_coords is a dictionary where keys are repeat names and values are lists
96 if repname in rep_chr: 101 # containing chromosome, start, and end coordinates for each repeat instance
97 rep_chr[repname].append(repchr)
98 rep_start[repname].append(repstart)
99 rep_end[repname].append(repend)
100 else:
101 rep_chr[repname] = [repchr]
102 rep_start[repname] = [repstart]
103 rep_end[repname] = [repend]
104 102
105 # sort repeat_elements and print them in repgenomes_key.txt 103 # sort repeat_elements and print them in repeatIDs.txt
106 with open(os.path.join(setup_folder, 'repgenomes_key.txt'), 'w') as fout: 104 with open('repeatIDs.txt', 'w') as fout:
107 for i, repeat in enumerate(sorted(repeat_elements)): 105 for i, repeat in enumerate(sorted(repeat_elements)):
108 fout.write('\t'.join([repeat, str(i)]) + '\n') 106 fout.write('\t'.join([repeat, str(i)]) + '\n')
109 107
110 # generate spacer for pseudogenomes 108 # generate spacer for pseudogenomes
111 spacer = ''.join(['N' for i in range(gapl)]) 109 spacer = ''.join(['N' for i in range(gapl)])
112 110
113 # generate metagenomes and save them to FASTA files for bowtie build 111 # generate metagenomes and save them to FASTA files for bowtie build
114 for repname in rep_chr: 112 for repname in rep_coords:
115 metagenome = '' 113 metagenome = ''
116 for i, repeat in enumerate(rep_chr[repname]): 114 # iterating coordinate list by block of 3 (chr, start, end)
117 try: 115 block = 3
118 chromosome = rep_chr[repname][i] 116 for i in range(0, len(rep_coords[repname]) - block + 1, block):
119 start = max(int(rep_start[repname][i]) - flankingl, 0) 117 batch = rep_coords[repname][i:i+block]
120 end = min(int(rep_end[repname][i]) + flankingl, 118 print(batch)
121 int(lgenome[chr])-1) + 1 119 chromosome = batch[0]
122 metagenome = f"{metagenome}{spacer}{genome[chromosome][start:end]}" 120 start = max(int(batch[1]) - flankingl, 0)
123 except KeyError: 121 end = min(int(batch[2]) + flankingl,
124 print("Unrecognised Chromosome: " + rep_chr[repname][i]) 122 int(genome[chromosome]['length'])-1) + 1
123 metagenome = (
124 f"{metagenome}{spacer}"
125 f"{genome[chromosome]['sequence'][start:end]}"
126 )
125 127
126 # Create Fasta of repeat pseudogenome 128 # Create Fasta of repeat pseudogenome
127 fastafilename = f"{os.path.join(setup_folder, repname)}.fa" 129 fastafilename = f"{repname}.fa"
128 record = SeqRecord(Seq(metagenome), id=repname, name='', description='') 130 record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
129 SeqIO.write(record, fastafilename, "fasta") 131 SeqIO.write(record, fastafilename, "fasta")
130 132
131 # Generate repeat pseudogenome bowtie index 133
132 bowtie_build_cmd = ["bowtie-build", "-f", fastafilename, 134 def bowtie_build(args):
133 os.path.join(setup_folder, repname)] 135 """
134 subprocess.run(bowtie_build_cmd, check=True) 136 Function to be executed in parallel by ProcessPoolExecutor.
137 """
138 try:
139 bowtie_base, fasta = args
140 command = shlex.split(f"bowtie-build -f {fasta} {bowtie_base}")
141 squash = subprocess.run(command, capture_output=True, text=True)
142 return squash.stdout
143 except Exception as e:
144 return str(e)
145
146
147 args_list = [(name, f"{name}.fa") for name in rep_coords]
148 with ProcessPoolExecutor(max_workers=cpus) as executor:
149 executor.map(bowtie_build, args_list)