comparison RepEnrich_setup.py @ 0:f6f0f1e5e940 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
author artbio
date Wed, 02 Aug 2017 05:17:29 -0400
parents
children 6bba3e33c2e7
comparison
equal deleted inserted replaced
-1:000000000000 0:f6f0f1e5e940
1 #!/usr/bin/env python
2 import argparse
3 import csv
4 import os
5 import shlex
6 import subprocess
7 import sys
8
9 from Bio import SeqIO
10 from Bio.Alphabet import IUPAC
11 from Bio.Seq import Seq
12 from Bio.SeqRecord import SeqRecord
13
14 parser = argparse.ArgumentParser(description='''
15 Part I: Prepartion of repetive element psuedogenomes and repetive\
16 element bamfiles. This script prepares the annotation used by\
17 downstream applications to analyze for repetitive element\
18 enrichment. For this script to run properly bowtie must be\
19 loaded. The repeat element psuedogenomes are prepared in order\
20 to analyze reads that map to multiple locations of the genome.\
21 The repeat element bamfiles are prepared in order to use a\
22 region sorter to analyze reads that map to a single location\
23 of the genome. You will 1) annotation_file:\
24 The repetitive element annotation file downloaded from\
25 RepeatMasker.org database for your organism of interest.\
26 2) genomefasta: Your genome of interest in fasta format,\
27 3)setup_folder: a folder to contain repeat element setup files\
28 command-line usage
29 EXAMPLE: python master_setup.py\
30 /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt\
31 /users/nneretti/data/annotation/mm9/mm9.fa\
32 /users/nneretti/data/annotation/mm9/setup_folder''',
33 prog='getargs_genome_maker.py')
34 parser.add_argument('--version', action='version', version='%(prog)s 0.1')
35 parser.add_argument('annotation_file', action='store',
36 metavar='annotation_file',
37 help='''List annotation file. The annotation file contains\
38 the repeat masker annotation for the genome of\
39 interest and may be downloaded at RepeatMasker.org\
40 Example /data/annotation/mm9/mm9.fa.out''')
41 parser.add_argument('genomefasta', action='store', metavar='genomefasta',
42 help='''File name and path for genome of interest in fasta\
43 format. Example /data/annotation/mm9/mm9.fa''')
44 parser.add_argument('setup_folder', action='store', metavar='setup_folder',
45 help='''List folder to contain bamfiles for repeats and\
46 repeat element psuedogenomes.\
47 Example /data/annotation/mm9/setup''')
48 parser.add_argument('--nfragmentsfile1', action='store',
49 dest='nfragmentsfile1', metavar='nfragmentsfile1',
50 default='./repnames_nfragments.txt',
51 help='''Output location of a description file that saves\
52 the number of fragments processed per repname.
53 Default ./repnames_nfragments.txt''')
54 parser.add_argument('--gaplength', action='store', dest='gaplength',
55 metavar='gaplength', default='200', type=int,
56 help='Length of the spacer used to build\
57 repeat psuedogeneomes. Default 200')
58 parser.add_argument('--flankinglength', action='store', dest='flankinglength',
59 metavar='flankinglength', default='25', type=int,
60 help='Length of the flanking region adjacent to the repeat\
61 element that is used to build repeat psuedogeneomes.\
62 The flanking length should be set according to the\
63 length of your reads. Default 25')
64 parser.add_argument('--is_bed', action='store', dest='is_bed',
65 metavar='is_bed', default='FALSE',
66 help='''Is the annotation file a bed file. This is also a\
67 compatible format. The file needs to be a tab\
68 separated bed with optional fields.
69 Ex. format:
70 chr\tstart\tend\tName_element\tclass\tfamily.
71 The class and family should identical to name_element\
72 if not applicable. Default FALSE change to TRUE''')
73 args = parser.parse_args()
74
75 # parameters and paths specified in args_parse
76 gapl = args.gaplength
77 flankingl = args.flankinglength
78 annotation_file = args.annotation_file
79 genomefasta = args.genomefasta
80 setup_folder = args.setup_folder
81 nfragmentsfile1 = args.nfragmentsfile1
82 is_bed = args.is_bed
83
84 ##############################################################################
85 # check that the programs we need are available
86 try:
87 subprocess.call(shlex.split("bowtie --version"),
88 stdout=open(os.devnull, 'wb'),
89 stderr=open(os.devnull, 'wb'))
90 except OSError:
91 print("Error: Bowtie or BEDTools not loaded")
92 raise
93
94 ##############################################################################
95 # Define a text importer
96 csv.field_size_limit(sys.maxsize)
97
98
99 def import_text(filename, separator):
100 for line in csv.reader(open(os.path.realpath(filename)),
101 delimiter=separator, skipinitialspace=True):
102 if line:
103 yield line
104
105
106 # Make a setup folder
107 if not os.path.exists(setup_folder):
108 os.makedirs(setup_folder)
109 ##############################################################################
110 # load genome into dictionary
111 print("loading genome...")
112 g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
113
114 print("Precomputing length of all chromosomes...")
115 idxgenome = {}
116 lgenome = {}
117 genome = {}
118 allchrs = g.keys()
119 k = 0
120 for chr in allchrs:
121 genome[chr] = str(g[chr].seq)
122 # del g[chr]
123 lgenome[chr] = len(genome[chr])
124 idxgenome[chr] = k
125 k = k + 1
126 del g
127
128 ##############################################################################
129 # Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
130 if is_bed == "FALSE":
131 repeat_elements = []
132 fout = open(os.path.realpath(setup_folder + os.path.sep
133 + 'repnames.bed'), 'w')
134 fin = import_text(annotation_file, ' ')
135 x = 0
136 rep_chr = {}
137 rep_start = {}
138 rep_end = {}
139 x = 0
140 for line in fin:
141 if x > 2:
142 line9 = line[9].replace("(", "_").replace(")",
143 "_").replace("/", "_")
144 repname = line9
145 if repname not in repeat_elements:
146 repeat_elements.append(repname)
147 repchr = line[4]
148 repstart = int(line[5])
149 repend = int(line[6])
150 fout.write(str(repchr) + '\t' + str(repstart) + '\t' + str(repend)
151 + '\t' + str(repname) + '\n')
152 if repname in rep_chr:
153 rep_chr[repname].append(repchr)
154 rep_start[repname].append(int(repstart))
155 rep_end[repname].append(int(repend))
156 else:
157 rep_chr[repname] = [repchr]
158 rep_start[repname] = [int(repstart)]
159 rep_end[repname] = [int(repend)]
160 x += 1
161 if is_bed == "TRUE":
162 repeat_elements = []
163 fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'),
164 'w')
165 fin = open(os.path.realpath(annotation_file), 'r')
166 x = 0
167 rep_chr = {}
168 rep_start = {}
169 rep_end = {}
170 x = 0
171 for line in fin:
172 line = line.strip('\n')
173 line = line.split('\t')
174 line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
175 repname = line3
176 if repname not in repeat_elements:
177 repeat_elements.append(repname)
178 repchr = line[0]
179 repstart = int(line[1])
180 repend = int(line[2])
181 fout.write(str(repchr) + '\t' + str(repstart) + '\t' +
182 str(repend) + '\t' + str(repname) + '\n')
183 # if rep_chr.has_key(repname):
184 if repname in rep_chr:
185 rep_chr[repname].append(repchr)
186 rep_start[repname].append(int(repstart))
187 rep_end[repname].append(int(repend))
188 else:
189 rep_chr[repname] = [repchr]
190 rep_start[repname] = [int(repstart)]
191 rep_end[repname] = [int(repend)]
192
193 fin.close()
194 fout.close()
195 repeat_elements = sorted(repeat_elements)
196 print("Writing a key for all repeats...")
197 # print to fout the binary key that contains each repeat type with the
198 # associated binary number; sort the binary key:
199 fout = open(os.path.realpath(setup_folder + os.path.sep +
200 'repgenomes_key.txt'), 'w')
201 x = 0
202 for repeat in repeat_elements:
203 # print >> fout, str(repeat) + '\t' + str(x)
204 fout.write(str(repeat) + '\t' + str(x) + '\n')
205 x += 1
206 fout.close()
207 ##############################################################################
208 # generate spacer for psuedogenomes
209 spacer = ""
210 for i in range(gapl):
211 spacer = spacer + "N"
212
213 # save file with number of fragments processed per repname
214 print("Saving number of fragments processed per repname to "
215 + nfragmentsfile1)
216 fout1 = open(os.path.realpath(nfragmentsfile1), "w")
217 for repname in rep_chr.keys():
218 rep_chr_current = rep_chr[repname]
219 # print >>fout1, str(len(rep_chr[repname])) + "\t" + repname
220 fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n')
221 fout1.close()
222
223 # generate metagenomes and save them to FASTA files
224 k = 1
225 nrepgenomes = len(rep_chr.keys())
226 for repname in rep_chr.keys():
227 metagenome = ""
228 newname = repname.replace("(", "_").replace(")", "_").replace("/", "_")
229 print("processing repgenome " + newname + ".fa" + " (" + str(k)
230 + " of " + str(nrepgenomes) + ")")
231 rep_chr_current = rep_chr[repname]
232 rep_start_current = rep_start[repname]
233 rep_end_current = rep_end[repname]
234 print("-------> " + str(len(rep_chr[repname])) + " fragments")
235 for i in range(len(rep_chr[repname])):
236 try:
237 chr = rep_chr_current[i]
238 rstart = max(rep_start_current[i] - flankingl, 0)
239 rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1)
240 metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)]
241 except KeyError:
242 print("Unrecognised Chromosome: "+chr)
243 pass
244 # Convert metagenome to SeqRecord object (required by SeqIO.write)
245 record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id="repname",
246 name="", description="")
247 print("saving repgenome " + newname + ".fa" + " (" + str(k) + " of "
248 + str(nrepgenomes) + ")")
249 fastafilename = os.path.realpath(setup_folder + os.path.sep
250 + newname + ".fa")
251 SeqIO.write(record, fastafilename, "fasta")
252 print("indexing repgenome " + newname + ".fa" + " (" +
253 str(k) + " of " + str(nrepgenomes) + ")")
254 command = shlex.split('bowtie-build -f ' + fastafilename + ' ' +
255 setup_folder + os.path.sep + newname)
256 p = subprocess.Popen(command).communicate()
257 k += 1
258
259 print("... Done")